Repository: aiming-lab/AutoResearchClaw Branch: main Commit: 258dae2bb28f Files: 422 Total size: 4.1 MB Directory structure: gitextract_tp1xyq09/ ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── config.researchclaw.example.yaml ├── docs/ │ ├── BUG_FIX_DOCUMENT_20260316.md │ ├── BUG_TRACKER.md │ ├── CHANGELOG_ANTHROPIC_ADAPTER.md │ ├── PIPELINE_TEST_LOG_R5.md │ ├── README_AR.md │ ├── README_CN.md │ ├── README_DE.md │ ├── README_ES.md │ ├── README_FR.md │ ├── README_JA.md │ ├── README_KO.md │ ├── README_PT.md │ ├── README_RU.md │ ├── TESTER_GUIDE.md │ ├── TESTER_GUIDE_CN.md │ ├── TESTER_GUIDE_JA.md │ ├── agent_figure_and_benchmark_plan.md │ ├── figure_prompts/ │ │ ├── case_a_meta_learning.md │ │ └── case_b_rlhf_alignment.md │ ├── integration-guide.md │ ├── issue_tracker_v9.md │ ├── iteration_plan_v8.md │ ├── iteration_showcase_narrative.md │ ├── metaclaw-integration-plan.md │ ├── next_phase_showcase_plan.md │ ├── pipeline_critical_fixes_v8.md │ ├── rate_limit_fix_plan.md │ ├── sandbox_environment_fix_plan.md │ └── showcase/ │ └── SHOWCASE.md ├── prompts.default.yaml ├── pyproject.toml ├── researchclaw/ │ ├── __init__.py │ ├── __main__.py │ ├── adapters.py │ ├── agents/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── benchmark_agent/ │ │ │ ├── __init__.py │ │ │ ├── acquirer.py │ │ │ ├── orchestrator.py │ │ │ ├── selector.py │ │ │ ├── surveyor.py │ │ │ └── validator.py │ │ ├── code_searcher/ │ │ │ ├── __init__.py │ │ │ ├── agent.py │ │ │ ├── cache.py │ │ │ ├── github_client.py │ │ │ ├── pattern_extractor.py │ │ │ └── query_gen.py │ │ └── figure_agent/ │ │ ├── __init__.py │ │ ├── codegen.py │ │ ├── critic.py │ │ ├── decision.py │ │ ├── integrator.py │ │ ├── nano_banana.py │ │ ├── orchestrator.py │ │ ├── planner.py │ │ ├── renderer.py │ │ └── style_config.py │ ├── assessor/ │ │ ├── __init__.py │ │ ├── comparator.py │ │ ├── rubrics.py │ │ ├── scorer.py │ │ └── venue_recommender.py │ ├── calendar/ │ │ ├── __init__.py │ │ ├── data/ │ │ │ └── conferences.yaml │ │ 
├── deadlines.py │ │ ├── planner.py │ │ └── reminder.py │ ├── cli.py │ ├── collaboration/ │ │ ├── __init__.py │ │ ├── dedup.py │ │ ├── publisher.py │ │ ├── repository.py │ │ └── subscriber.py │ ├── config.py │ ├── copilot/ │ │ ├── __init__.py │ │ ├── branching.py │ │ ├── controller.py │ │ ├── feedback.py │ │ └── modes.py │ ├── dashboard/ │ │ ├── __init__.py │ │ ├── broadcaster.py │ │ ├── collector.py │ │ └── metrics.py │ ├── data/ │ │ ├── __init__.py │ │ ├── benchmark_knowledge.yaml │ │ ├── dataset_registry.yaml │ │ ├── docker_profiles.yaml │ │ ├── framework_docs/ │ │ │ ├── axolotl.md │ │ │ ├── llamafactory.md │ │ │ ├── peft.md │ │ │ ├── transformers_training.md │ │ │ └── trl.md │ │ └── seminal_papers.yaml │ ├── docker/ │ │ ├── Dockerfile │ │ ├── Dockerfile.biology │ │ ├── Dockerfile.chemistry │ │ ├── Dockerfile.economics │ │ ├── Dockerfile.generic │ │ ├── Dockerfile.math │ │ ├── Dockerfile.physics │ │ └── entrypoint.sh │ ├── domains/ │ │ ├── __init__.py │ │ ├── adapters/ │ │ │ ├── __init__.py │ │ │ ├── biology.py │ │ │ ├── chemistry.py │ │ │ ├── economics.py │ │ │ ├── generic.py │ │ │ ├── math.py │ │ │ ├── ml.py │ │ │ ├── neuroscience.py │ │ │ ├── physics.py │ │ │ ├── robotics.py │ │ │ └── security.py │ │ ├── detector.py │ │ ├── experiment_schema.py │ │ ├── profiles/ │ │ │ ├── _generic.yaml │ │ │ ├── biology_genomics.yaml │ │ │ ├── biology_protein.yaml │ │ │ ├── biology_singlecell.yaml │ │ │ ├── chemistry_molprop.yaml │ │ │ ├── chemistry_qm.yaml │ │ │ ├── economics_empirical.yaml │ │ │ ├── mathematics_numerical.yaml │ │ │ ├── mathematics_optimization.yaml │ │ │ ├── ml_compression.yaml │ │ │ ├── ml_generative.yaml │ │ │ ├── ml_generic.yaml │ │ │ ├── ml_graph.yaml │ │ │ ├── ml_nlp.yaml │ │ │ ├── ml_rl.yaml │ │ │ ├── ml_tabular.yaml │ │ │ ├── ml_vision.yaml │ │ │ ├── neuroscience_computational.yaml │ │ │ ├── neuroscience_imaging.yaml │ │ │ ├── physics_pde.yaml │ │ │ ├── physics_quantum.yaml │ │ │ ├── physics_simulation.yaml │ │ │ ├── robotics_control.yaml │ │ │ └── 
security_detection.yaml │ │ └── prompt_adapter.py │ ├── evolution.py │ ├── experiment/ │ │ ├── __init__.py │ │ ├── agentic_sandbox.py │ │ ├── code_agent.py │ │ ├── colab_sandbox.py │ │ ├── docker_sandbox.py │ │ ├── evaluators/ │ │ │ ├── __init__.py │ │ │ └── convergence.py │ │ ├── factory.py │ │ ├── git_manager.py │ │ ├── harness_template.py │ │ ├── metrics.py │ │ ├── runner.py │ │ ├── sandbox.py │ │ ├── ssh_sandbox.py │ │ ├── validator.py │ │ └── visualize.py │ ├── feedback/ │ │ └── FEEDBACK_ANALYSIS_PROMPT.md │ ├── hardware.py │ ├── health.py │ ├── knowledge/ │ │ ├── __init__.py │ │ ├── base.py │ │ └── graph/ │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── entities.py │ │ ├── query.py │ │ ├── relations.py │ │ └── visualizer.py │ ├── literature/ │ │ ├── __init__.py │ │ ├── arxiv_client.py │ │ ├── cache.py │ │ ├── models.py │ │ ├── novelty.py │ │ ├── openalex_client.py │ │ ├── search.py │ │ ├── semantic_scholar.py │ │ ├── trends.py │ │ └── verify.py │ ├── llm/ │ │ ├── __init__.py │ │ ├── acp_client.py │ │ ├── anthropic_adapter.py │ │ └── client.py │ ├── mcp/ │ │ ├── __init__.py │ │ ├── client.py │ │ ├── registry.py │ │ ├── server.py │ │ ├── tools.py │ │ └── transport.py │ ├── memory/ │ │ ├── __init__.py │ │ ├── decay.py │ │ ├── embeddings.py │ │ ├── experiment_memory.py │ │ ├── ideation_memory.py │ │ ├── retriever.py │ │ ├── store.py │ │ └── writing_memory.py │ ├── metaclaw_bridge/ │ │ ├── __init__.py │ │ ├── config.py │ │ ├── lesson_to_skill.py │ │ ├── prm_gate.py │ │ ├── session.py │ │ ├── skill_feedback.py │ │ └── stage_skill_map.py │ ├── overleaf/ │ │ ├── __init__.py │ │ ├── conflict.py │ │ ├── formatter.py │ │ ├── sync.py │ │ └── watcher.py │ ├── pipeline/ │ │ ├── __init__.py │ │ ├── _domain.py │ │ ├── _helpers.py │ │ ├── code_agent.py │ │ ├── contracts.py │ │ ├── executor.py │ │ ├── experiment_diagnosis.py │ │ ├── experiment_repair.py │ │ ├── opencode_bridge.py │ │ ├── paper_verifier.py │ │ ├── runner.py │ │ ├── stage_impls/ │ │ │ ├── __init__.py │ │ │ ├── 
_analysis.py │ │ │ ├── _code_generation.py │ │ │ ├── _execution.py │ │ │ ├── _experiment_design.py │ │ │ ├── _literature.py │ │ │ ├── _paper_writing.py │ │ │ ├── _review_publish.py │ │ │ ├── _synthesis.py │ │ │ └── _topic.py │ │ ├── stages.py │ │ └── verified_registry.py │ ├── project/ │ │ ├── __init__.py │ │ ├── idea_pool.py │ │ ├── manager.py │ │ ├── models.py │ │ └── scheduler.py │ ├── prompts.py │ ├── quality.py │ ├── report.py │ ├── server/ │ │ ├── __init__.py │ │ ├── app.py │ │ ├── dialog/ │ │ │ ├── __init__.py │ │ │ ├── intents.py │ │ │ ├── router.py │ │ │ └── session.py │ │ ├── middleware/ │ │ │ ├── __init__.py │ │ │ └── auth.py │ │ ├── routes/ │ │ │ ├── __init__.py │ │ │ ├── chat.py │ │ │ ├── pipeline.py │ │ │ ├── projects.py │ │ │ └── voice.py │ │ └── websocket/ │ │ ├── __init__.py │ │ ├── events.py │ │ └── manager.py │ ├── servers/ │ │ ├── __init__.py │ │ ├── cloud_executor.py │ │ ├── dispatcher.py │ │ ├── monitor.py │ │ ├── registry.py │ │ ├── slurm_executor.py │ │ └── ssh_executor.py │ ├── skills/ │ │ ├── __init__.py │ │ ├── builtin/ │ │ │ ├── __init__.py │ │ │ ├── domain/ │ │ │ │ ├── cv-classification/ │ │ │ │ │ └── SKILL.md │ │ │ │ ├── cv-detection/ │ │ │ │ │ └── SKILL.md │ │ │ │ ├── nlp-alignment/ │ │ │ │ │ └── SKILL.md │ │ │ │ ├── nlp-pretraining/ │ │ │ │ │ └── SKILL.md │ │ │ │ └── rl-policy-optimization/ │ │ │ │ └── SKILL.md │ │ │ ├── experiment/ │ │ │ │ ├── experimental-design/ │ │ │ │ │ └── SKILL.md │ │ │ │ ├── meta-analysis/ │ │ │ │ │ └── SKILL.md │ │ │ │ └── systematic-review/ │ │ │ │ └── SKILL.md │ │ │ └── tooling/ │ │ │ ├── data-loading/ │ │ │ │ └── SKILL.md │ │ │ ├── distributed-training/ │ │ │ │ └── SKILL.md │ │ │ ├── mixed-precision/ │ │ │ │ └── SKILL.md │ │ │ └── pytorch-training/ │ │ │ └── SKILL.md │ │ ├── loader.py │ │ ├── matcher.py │ │ ├── registry.py │ │ └── schema.py │ ├── templates/ │ │ ├── __init__.py │ │ ├── compiler.py │ │ ├── conference.py │ │ ├── converter.py │ │ ├── results_table_builder.py │ │ └── styles/ │ │ ├── iclr_2025/ 
│ │ │ ├── iclr2025_conference.bst │ │ │ └── iclr2025_conference.sty │ │ ├── iclr_2026/ │ │ │ ├── iclr2026_conference.bst │ │ │ └── iclr2026_conference.sty │ │ ├── icml_2025/ │ │ │ ├── icml2025.bst │ │ │ └── icml2025.sty │ │ ├── icml_2026/ │ │ │ ├── icml2026.bst │ │ │ └── icml2026.sty │ │ ├── neurips_2024/ │ │ │ └── neurips_2024.sty │ │ └── neurips_2025/ │ │ └── neurips_2025.sty │ ├── trends/ │ │ ├── __init__.py │ │ ├── auto_topic.py │ │ ├── daily_digest.py │ │ ├── feeds.py │ │ ├── opportunity_finder.py │ │ └── trend_analyzer.py │ ├── utils/ │ │ ├── __init__.py │ │ ├── sanitize.py │ │ └── thinking_tags.py │ ├── voice/ │ │ ├── __init__.py │ │ ├── commands.py │ │ ├── synthesizer.py │ │ └── transcriber.py │ ├── web/ │ │ ├── __init__.py │ │ ├── _ssrf.py │ │ ├── agent.py │ │ ├── crawler.py │ │ ├── pdf_extractor.py │ │ ├── scholar.py │ │ └── search.py │ ├── wizard/ │ │ ├── __init__.py │ │ ├── quickstart.py │ │ ├── templates.py │ │ └── validator.py │ └── writing_guide.py ├── scripts/ │ ├── metaclaw_start.sh │ ├── plot_iteration_showcase.py │ ├── test_beast_mode_e2e.py │ ├── test_code_agent_live.py │ ├── test_code_agent_sandbox.py │ └── test_codegen_v2.py ├── sentinel.sh ├── tests/ │ ├── __init__.py │ ├── conftest.py │ ├── e2e_docker_sandbox.py │ ├── e2e_real_llm.py │ ├── test_anthropic.py │ ├── test_assessor.py │ ├── test_benchmark_agent.py │ ├── test_calendar.py │ ├── test_cli.py │ ├── test_code_agent.py │ ├── test_code_searcher.py │ ├── test_collaboration.py │ ├── test_compiler.py │ ├── test_convergence_evaluator.py │ ├── test_copilot.py │ ├── test_decision_agent.py │ ├── test_domain_detector.py │ ├── test_entry_point_validation.py │ ├── test_experiment_diagnosis.py │ ├── test_experiment_repair.py │ ├── test_experiment_schema.py │ ├── test_figure_agent.py │ ├── test_knowledge_graph.py │ ├── test_mcp.py │ ├── test_memory_system.py │ ├── test_metaclaw_bridge/ │ │ ├── __init__.py │ │ ├── test_config.py │ │ ├── test_lesson_to_skill.py │ │ ├── test_prm_gate.py │ │ ├── 
test_session.py │ │ ├── test_skill_feedback.py │ │ └── test_stage_skill_map.py │ ├── test_metric_parser.py │ ├── test_minimax_provider.py │ ├── test_neuroscience_domain.py │ ├── test_opencode_bridge.py │ ├── test_overleaf.py │ ├── test_paper_verifier.py │ ├── test_project_manager.py │ ├── test_prompt_adapter.py │ ├── test_rc_adapters.py │ ├── test_rc_cache.py │ ├── test_rc_checkpoint.py │ ├── test_rc_citation_resolve.py │ ├── test_rc_citation_verify.py │ ├── test_rc_cli.py │ ├── test_rc_config.py │ ├── test_rc_contracts.py │ ├── test_rc_docker_sandbox.py │ ├── test_rc_e2e_regression.py │ ├── test_rc_evolution.py │ ├── test_rc_executor.py │ ├── test_rc_hardware.py │ ├── test_rc_health.py │ ├── test_rc_kb.py │ ├── test_rc_literature.py │ ├── test_rc_llm.py │ ├── test_rc_novelty.py │ ├── test_rc_preflight.py │ ├── test_rc_prompts.py │ ├── test_rc_quality.py │ ├── test_rc_report.py │ ├── test_rc_runner.py │ ├── test_rc_sanitization.py │ ├── test_rc_sentinel.py │ ├── test_rc_stages.py │ ├── test_rc_templates.py │ ├── test_rc_validator.py │ ├── test_results_table_builder.py │ ├── test_robotics_adapter.py │ ├── test_servers.py │ ├── test_skills_library.py │ ├── test_ssh_and_colab_sandbox.py │ ├── test_trends.py │ ├── test_universal_codegen_integration.py │ ├── test_v6_improvements.py │ ├── test_verified_registry.py │ ├── test_web_crawler.py │ ├── test_web_integration.py │ ├── test_web_pdf_extractor.py │ ├── test_web_platform.py │ ├── test_web_scholar.py │ └── test_web_search.py └── website/ ├── features.html ├── getting-started.html ├── index.html ├── papers.html ├── pipeline.html └── style.css ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ HANDOFF_METACLAW_SKILL_LOOP.md .venv/ __pycache__/ *.pyc *.egg-info/ dist/ build/ workspaces/ .claude/* !.claude/agents/ !.claude/agents/*.md 
!.claude/skills/ !.claude/skills/**/SKILL.md .claude/settings.local.json # Experiment run artifacts (local only) artifacts/ output/ experiment_metaclaw/ promotional/ # Legacy experiment artifacts (pre-v5) exp/ logs/ writing/ # Root-level config (local overrides, not committed) /config.yaml # Sensitive / credentials user_token_cache.json *.secret .env .env.* config_run*.yaml # Literature search cache .researchclaw_cache/ # Playwright MCP logs .playwright-mcp/ # Internal dev/debug docs (not for public) docs/internal/ docs/kb/ docs/plans/ docs/BUGFIX_TRACKER*.md docs/IMPROVEMENT_PLAN*.md docs/IMPROVEMENT_*_EXECUTION.md docs/OPTIMIZATION_PLAN*.md docs/MULTI_CASE_EVALUATION*.md docs/pipeline_quality_issues*.md docs/autobench-loop.md RESEARCHCLAW_AGENTS.md RESEARCHCLAW_CLAUDE.md # Task-specific config files (keep example template only) config_agent_*.yaml config_case*.yaml config_v8_case*.yaml pipeline_run_*.log benchmarks/ # Logo generation prompts image/logo_prompt.md # macOS .DS_Store run.log # Misc temp files .history/ .serena/ cli_pause 暂停 进入 连续失败 重试一次 .venv_arc/ /config.arc.yaml config_*.yaml # Frontend (local dev only) frontend/ # Test outputs and run logs (local only) test_outputs*/ records/ run*_full_log.txt mdpdf.log scripts/md2pdf.py # Local docs (not for public) docs/tasks/ docs/feature_expansion_analysis.* docs/tester_guide_cn.* ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to AutoResearchClaw ## Setup 1. Fork and clone the repo 2. Create a venv and install with dev extras: ``` python3 -m venv .venv && source .venv/bin/activate pip install -e ".[dev]" ``` 3. Generate your local config: ``` researchclaw init ``` 4. 
Edit `config.arc.yaml` with your LLM settings ## Config Convention - `config.researchclaw.example.yaml` — tracked template (do not add secrets) - `config.arc.yaml` — your local config (gitignored, created by `researchclaw init`) - `config.yaml` — also gitignored, supported as fallback ## Running Tests ``` pytest tests/ ``` ## Checking Your Environment ``` researchclaw doctor ``` ## PR Guidelines - Branch from main - One concern per PR - Ensure `pytest tests/` passes - Include tests for new functionality ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2026 Aiming Lab Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================

AutoResearchClaw Logo

Chat an Idea. Get a Paper. Fully Autonomous & Self-Evolving.

Just chat with OpenClaw: "Research X" → done.

AutoResearchClaw Framework

MIT License Python 3.11+ 1,823 Tests Passed GitHub OpenClaw Compatible Discord

🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية

🏆 Paper Showcase · 📖 Integration Guide · 💬 Discord Community

---
Sample Paper 🏆 Generated Paper Showcase

8 papers across 8 domains — math, statistics, biology, computing, NLP, RL, vision, robustness — generated fully autonomously with zero human intervention.

View Showcase
--- > **🧪 We're looking for testers!** Try the pipeline with your own research idea — from any field — and [tell us what you think](docs/TESTER_GUIDE.md). Your feedback directly shapes the next version. **[→ Testing Guide](docs/TESTER_GUIDE.md)** | **[→ 中文测试指南](docs/TESTER_GUIDE_CN.md)** | **[→ 日本語テストガイド](docs/TESTER_GUIDE_JA.md)** --- ## 🔥 News - **[03/22/2026]** [v0.3.2](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.2) — **Cross-Platform Support + Major Stability** — AutoResearchClaw now runs on any ACP-compatible agent backend (Claude Code, Codex CLI, Copilot CLI, Gemini CLI, Kimi CLI) and supports messaging platforms (Discord, Telegram, Lark, WeChat) via OpenClaw bridge. New CLI-agent code generation backend delegates Stages 10 & 13 to external CLI agents with budget control and timeout management. Also includes anti-fabrication system (VerifiedRegistry + experiment diagnosis & repair loop), 100+ bug fixes, modular executor refactoring, `--resume` auto-detection, LLM retry hardening, and community-reported fixes. - **[03/18/2026]** [v0.3.1](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.1) — **OpenCode Beast Mode + Community Contributions** — New "Beast Mode" routes complex code generation to [OpenCode](https://github.com/anomalyco/opencode) with automatic complexity scoring and graceful fallback. Added Novita AI provider support, thread-safety hardening, improved LLM output parsing robustness, and 20+ bug fixes from community PRs and internal audit. - **[03/17/2026]** [v0.3.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.0) — **MetaClaw Integration** — AutoResearchClaw now supports [MetaClaw](https://github.com/aiming-lab/MetaClaw) cross-run learning: pipeline failures → structured lessons → reusable skills, injected into all 23 stages. **+18.3%** robustness in controlled experiments. Opt-in (`metaclaw_bridge.enabled: true`), fully backward-compatible. See [Integration Guide](#-metaclaw-integration). 
- **[03/16/2026]** [v0.2.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.2.0) — Three multi-agent subsystems (CodeAgent, BenchmarkAgent, FigureAgent), hardened Docker sandbox with network-policy-aware execution, 4-round paper quality audit (AI-slop detection, 7-dim review scoring, NeurIPS checklist), and 15+ bug fixes from production runs. - **[03/15/2026]** [v0.1.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.1.0) — We release AutoResearchClaw: a fully autonomous 23-stage research pipeline that turns a single research idea into a conference-ready paper. No human intervention required. --- ## ⚡ One Command. One Paper. ```bash pip install -e . && researchclaw setup && researchclaw init && researchclaw run --topic "Your research idea here" --auto-approve ``` --- ## 🤔 What Is This? **You think it. AutoResearchClaw writes it.** Drop a research topic — get back a full academic paper with real literature from OpenAlex, Semantic Scholar & arXiv, hardware-aware sandbox experiments (GPU/MPS/CPU auto-detected), statistical analysis, multi-agent peer review, and conference-ready LaTeX targeting NeurIPS/ICML/ICLR. No babysitting. No copy-pasting. No hallucinated references.
📄paper_draft.mdFull academic paper (Introduction, Related Work, Method, Experiments, Results, Conclusion)
📐paper.texConference-ready LaTeX (NeurIPS / ICLR / ICML templates)
📚references.bibReal BibTeX references from OpenAlex, Semantic Scholar and arXiv — auto-pruned to match inline citations
🔍verification_report.json4-layer citation integrity + relevance verification (arXiv, CrossRef, DataCite, LLM)
🧪experiment runs/Generated code + sandbox results + structured JSON metrics
📊charts/Auto-generated condition comparison charts with error bars and confidence intervals
📝reviews.mdMulti-agent peer review with methodology-evidence consistency checks
🧬evolution/Self-learning lessons extracted from each run
📦deliverables/All final outputs in one folder — compile-ready for Overleaf
The pipeline runs **end-to-end without human intervention**. When experiments fail, it self-heals. When hypotheses don't hold, it pivots. When citations are fake, it kills them. 🌍 **Run it anywhere.** AutoResearchClaw isn't locked to a single platform. Use it standalone via CLI, plug it into [OpenClaw](https://github.com/openclaw/openclaw), or wire it up through any ACP-compatible agent — 🤖 Claude Code, 💻 Codex CLI, 🐙 Copilot CLI, ♊ Gemini CLI, 🌙 Kimi CLI, you name it. And because OpenClaw bridges to messaging platforms, you can kick off a full research run from 💬 Discord, ✈️ Telegram, 🐦 Lark (飞书), 💚 WeChat, or wherever your team already hangs out. One topic in, one paper out — no matter where you type it. --- ## 🚀 Quick Start ```bash # 1. Clone & install git clone https://github.com/aiming-lab/AutoResearchClaw.git cd AutoResearchClaw python3 -m venv .venv && source .venv/bin/activate pip install -e . # 2. Setup (interactive — installs OpenCode beast mode, checks Docker/LaTeX) researchclaw setup # 3. Configure researchclaw init # Interactive: choose LLM provider, creates config.arc.yaml # Or manually: cp config.researchclaw.example.yaml config.arc.yaml # 4. Run export OPENAI_API_KEY="sk-..." researchclaw run --config config.arc.yaml --topic "Your research idea" --auto-approve ``` Output → `artifacts/rc-YYYYMMDD-HHMMSS-<topic-slug>/deliverables/` — compile-ready LaTeX, BibTeX, experiment code, charts.
📝 Minimum required config ```yaml project: name: "my-research" research: topic: "Your research topic here" llm: base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" primary_model: "gpt-4o" fallback_models: ["gpt-4o-mini"] experiment: mode: "sandbox" sandbox: python_path: ".venv/bin/python" ```
--- ## 🧠 What Makes It Different | Capability | How It Works | |-----------|-------------| | **🔄 PIVOT / REFINE Loop** | Stage 15 autonomously decides: PROCEED, REFINE (tweak params), or PIVOT (new direction). Artifacts auto-versioned. | | **🤖 Multi-Agent Debate** | Hypothesis generation, result analysis, and peer review each use structured multi-perspective debate. | | **🧬 Self-Learning** | Lessons extracted per run (decision rationale, runtime warnings, metric anomalies) with 30-day time-decay. Future runs learn from past mistakes. | | **📚 Knowledge Base** | Every run builds structured KB across 6 categories (decisions, experiments, findings, literature, questions, reviews). | | **🛡️ Sentinel Watchdog** | Background quality monitor: NaN/Inf detection, paper-evidence consistency, citation relevance scoring, anti-fabrication guard. | --- ## 🦞 OpenClaw Integration **AutoResearchClaw is an [OpenClaw](https://github.com/openclaw/openclaw)-compatible service.** Install it in OpenClaw and launch autonomous research with a single message — or use it standalone via CLI, Claude Code, or any AI coding assistant.
### 🚀 Use with OpenClaw (Recommended) If you already use [OpenClaw](https://github.com/openclaw/openclaw) as your AI assistant: ``` 1️⃣ Share the GitHub repo URL with OpenClaw 2️⃣ OpenClaw auto-reads RESEARCHCLAW_AGENTS.md → understands the pipeline 3️⃣ Say: "Research [your topic]" 4️⃣ Done — OpenClaw clones, installs, configures, runs, and returns results ``` **That's it.** OpenClaw handles `git clone`, `pip install`, config setup, and pipeline execution automatically. You just chat.
💡 What happens under the hood 1. OpenClaw reads `RESEARCHCLAW_AGENTS.md` → learns the research orchestrator role 2. OpenClaw reads `README.md` → understands installation and pipeline structure 3. OpenClaw copies `config.researchclaw.example.yaml` → `config.yaml` 4. Asks for your LLM API key (or uses your environment variable) 5. Runs `pip install -e .` + `researchclaw run --topic "..." --auto-approve` 6. Returns the paper, LaTeX, experiments, and citations
### 🔌 OpenClaw Bridge (Advanced) For deeper integration, AutoResearchClaw includes a **bridge adapter system** with 6 optional capabilities: ```yaml # config.arc.yaml openclaw_bridge: use_cron: true # ⏰ Scheduled research runs use_message: true # 💬 Progress notifications (Discord/Slack/Telegram) use_memory: true # 🧠 Cross-session knowledge persistence use_sessions_spawn: true # 🔀 Spawn parallel sub-sessions for concurrent stages use_web_fetch: true # 🌐 Live web search during literature review use_browser: false # 🖥️ Browser-based paper collection ``` Each flag activates a typed adapter protocol. When OpenClaw provides these capabilities, the adapters consume them without code changes. See [`docs/integration-guide.md`](docs/integration-guide.md) for full details. ### ACP (Agent Client Protocol) AutoResearchClaw can use **any ACP-compatible coding agent** as its LLM backend — no API keys required. The agent communicates via [acpx](https://github.com/openclaw/acpx), maintaining a single persistent session across all 23 pipeline stages. | Agent | Command | Notes | |-------|---------|-------| | Claude Code | `claude` | Anthropic | | Codex CLI | `codex` | OpenAI | | Copilot CLI | `gh` | GitHub | | Gemini CLI | `gemini` | Google | | OpenCode | `opencode` | SST | | Kimi CLI | `kimi` | Moonshot | ```yaml # config.yaml — ACP example llm: provider: "acp" acp: agent: "claude" # Any ACP-compatible agent CLI command cwd: "." # Working directory for the agent # No base_url or api_key needed — the agent handles its own auth. ``` ```bash # Just run — the agent uses its own credentials researchclaw run --config config.yaml --topic "Your research idea" --auto-approve ``` ### 🛠️ Other Ways to Run | Method | How | |--------|-----| | **Standalone CLI** | `researchclaw setup` → `researchclaw init` → `researchclaw run --topic "..." 
--auto-approve` | | **Python API** | `from researchclaw.pipeline import Runner; Runner(config).run()` | | **Claude Code** | Reads `RESEARCHCLAW_CLAUDE.md` — just say *"Run research on [topic]"* | | **Copilot CLI** | `researchclaw run --topic "..."` with `llm.acp.agent: "gh"` | | **OpenCode** | Reads `.claude/skills/` — same natural language interface | | **Any AI CLI** | Provide `RESEARCHCLAW_AGENTS.md` as context → agent auto-bootstraps | --- ## 🔬 Pipeline: 23 Stages, 8 Phases ``` Phase A: Research Scoping Phase E: Experiment Execution 1. TOPIC_INIT 12. EXPERIMENT_RUN 2. PROBLEM_DECOMPOSE 13. ITERATIVE_REFINE ← self-healing Phase B: Literature Discovery Phase F: Analysis & Decision 3. SEARCH_STRATEGY 14. RESULT_ANALYSIS ← multi-agent 4. LITERATURE_COLLECT ← real API 15. RESEARCH_DECISION ← PIVOT/REFINE 5. LITERATURE_SCREEN [gate] 6. KNOWLEDGE_EXTRACT Phase G: Paper Writing 16. PAPER_OUTLINE Phase C: Knowledge Synthesis 17. PAPER_DRAFT 7. SYNTHESIS 18. PEER_REVIEW ← evidence check 8. HYPOTHESIS_GEN ← debate 19. PAPER_REVISION Phase D: Experiment Design Phase H: Finalization 9. EXPERIMENT_DESIGN [gate] 20. QUALITY_GATE [gate] 10. CODE_GENERATION 21. KNOWLEDGE_ARCHIVE 11. RESOURCE_PLANNING 22. EXPORT_PUBLISH ← LaTeX 23. CITATION_VERIFY ← relevance check ``` > **Gate stages** (5, 9, 20) pause for human approval or auto-approve with `--auto-approve`. On rejection, the pipeline rolls back. > **Decision loops**: Stage 15 can trigger REFINE (→ Stage 13) or PIVOT (→ Stage 8), with automatic artifact versioning.
📋 What Each Phase Does | Phase | What Happens | |-------|-------------| | **A: Scoping** | LLM decomposes the topic into a structured problem tree with research questions | | **A+: Hardware** | Auto-detects GPU (NVIDIA CUDA / Apple MPS / CPU-only), warns if local hardware is limited, adapts code generation accordingly | | **B: Literature** | Multi-source search (OpenAlex → Semantic Scholar → arXiv) for real papers, screens by relevance, extracts knowledge cards | | **C: Synthesis** | Clusters findings, identifies research gaps, generates testable hypotheses via multi-agent debate | | **D: Design** | Designs experiment plan, generates hardware-aware runnable Python (GPU tier → package selection), estimates resource needs | | **E: Execution** | Runs experiments in sandbox, detects NaN/Inf and runtime bugs, self-heals code via targeted LLM repair | | **F: Analysis** | Multi-agent analysis of results; autonomous PROCEED / REFINE / PIVOT decision with rationale | | **G: Writing** | Outlines → section-by-section drafting (5,000-6,500 words) → peer reviews (with methodology-evidence consistency) → revises with length guard | | **H: Finalization** | Quality gate, knowledge archival, LaTeX export with conference template, citation integrity + relevance verification |
--- ## ✨ Key Features | Feature | Description | |---------|------------| | **📚 Multi-Source Literature** | Real papers from OpenAlex, Semantic Scholar & arXiv — query expansion, deduplication, circuit breaker with graceful degradation | | **🔍 4-Layer Citation Verification** | arXiv ID check → CrossRef/DataCite DOI → Semantic Scholar title match → LLM relevance scoring. Hallucinated refs auto-removed. | | **🖥️ Hardware-Aware Execution** | Auto-detects GPU (NVIDIA CUDA / Apple MPS / CPU-only) and adapts code generation, imports, and experiment scale accordingly | | **🦾 OpenCode Beast Mode** | Complex experiments auto-routed to [OpenCode](https://github.com/anomalyco/opencode) — generates multi-file projects with custom architectures, training loops, and ablation studies. Install via `researchclaw setup`. | | **🧪 Sandbox Experiments** | AST-validated code, immutable harness, NaN/Inf fast-fail, self-healing repair, iterative refinement (up to 10 rounds), partial result capture | | **📝 Conference-Grade Writing** | NeurIPS/ICML/ICLR templates, section-by-section drafting (5,000-6,500 words), anti-fabrication guard, revision length guard, anti-disclaimer enforcement | | **📐 Template Switching** | `neurips_2025`, `iclr_2026`, `icml_2026` — Markdown → LaTeX with math, tables, figures, cross-refs, `\cite{}` | | **🛡️ Anti-Fabrication** | VerifiedRegistry enforces ground-truth experiment data in papers. Auto-diagnoses failed experiments and repairs them before writing. Unverified numbers sanitized. | | **🚦 Quality Gates** | 3 human-in-the-loop gates (Stages 5, 9, 20) with rollback. Skip with `--auto-approve`. | --- ## 🧠 MetaClaw Integration **AutoResearchClaw + [MetaClaw](https://github.com/aiming-lab/MetaClaw) = A pipeline that learns from every run.** MetaClaw adds **cross-run knowledge transfer** to AutoResearchClaw. 
When enabled, the pipeline automatically captures lessons from failures and warnings, converts them into reusable skills, and injects those skills into all 23 pipeline stages on subsequent runs — so the same mistakes are never repeated. ### How It Works ``` Run N executes → failures/warnings captured as Lessons ↓ MetaClaw Lesson → Skill conversion ↓ arc-* Skill files stored in ~/.metaclaw/skills/ ↓ Run N+1 → build_overlay() injects skills into every LLM prompt ↓ LLM avoids known pitfalls → higher quality, fewer retries ``` ### Quick Setup ```bash # 1. Install MetaClaw (if not already) pip install metaclaw # 2. Enable in your config ``` ```yaml # config.arc.yaml metaclaw_bridge: enabled: true proxy_url: "http://localhost:30000" # MetaClaw proxy (optional) skills_dir: "~/.metaclaw/skills" # Where skills are stored fallback_url: "https://api.openai.com/v1" # Direct LLM fallback fallback_api_key: "" # API key for fallback URL lesson_to_skill: enabled: true min_severity: "warning" # Convert warnings + errors max_skills_per_run: 3 ``` ```bash # 3. Run as usual — MetaClaw works transparently researchclaw run --config config.arc.yaml --topic "Your idea" --auto-approve ``` After each run, check `~/.metaclaw/skills/arc-*/SKILL.md` to see the skills your pipeline has learned. ### Experiment Results In controlled A/B experiments (same topic, same LLM, same configuration): | Metric | Baseline | With MetaClaw | Improvement | |--------|----------|---------------|-------------| | Stage retry rate | 10.5% | 7.9% | **-24.8%** | | Refine cycle count | 2.0 | 1.2 | **-40.0%** | | Pipeline stage completion | 18/19 | 19/19 | **+5.3%** | | Overall robustness score (composite) | 0.714 | 0.845 | **+18.3%** | > Composite robustness score is a weighted average of stage completion rate (40%), retry reduction (30%), and refine cycle efficiency (30%). ### Backward Compatibility - **Default: OFF.** If `metaclaw_bridge` is absent or `enabled: false`, the pipeline behaves exactly as before. 
- **No new dependencies.** MetaClaw is optional — the core pipeline works without it. - **All 1,823 existing tests pass** with the integration code present. --- ## ⚙️ Configuration Reference
Click to expand full configuration reference ```yaml # === Project === project: name: "my-research" # Project identifier mode: "docs-first" # docs-first | semi-auto | full-auto # === Research === research: topic: "..." # Research topic (required) domains: ["ml", "nlp"] # Research domains for literature search daily_paper_count: 8 # Target papers per search query quality_threshold: 4.0 # Minimum quality score for papers # === Runtime === runtime: timezone: "America/New_York" # For timestamps max_parallel_tasks: 3 # Concurrent experiment limit approval_timeout_hours: 12 # Gate stage timeout retry_limit: 2 # Retry count on stage failure # === LLM === llm: provider: "openai-compatible" # openai | openrouter | deepseek | minimax | acp | openai-compatible base_url: "https://..." # API endpoint (required for openai-compatible) api_key_env: "OPENAI_API_KEY" # Env var for API key (required for openai-compatible) api_key: "" # Or hardcode key here primary_model: "gpt-4o" # Primary model fallback_models: ["gpt-4o-mini"] # Fallback chain s2_api_key: "" # Semantic Scholar API key (optional, higher rate limits) acp: # Only used when provider: "acp" agent: "claude" # ACP agent CLI command (claude, codex, gemini, etc.) cwd: "." 
# Working directory for the agent # === Experiment === experiment: mode: "sandbox" # simulated | sandbox | docker | ssh_remote time_budget_sec: 300 # Max execution time per run (default: 300s) max_iterations: 10 # Max optimization iterations metric_key: "val_loss" # Primary metric name metric_direction: "minimize" # minimize | maximize sandbox: python_path: ".venv/bin/python" gpu_required: false allowed_imports: [math, random, json, csv, numpy, torch, sklearn] max_memory_mb: 4096 docker: image: "researchclaw/experiment:latest" network_policy: "setup_only" # none | setup_only | pip_only | full gpu_enabled: true memory_limit_mb: 8192 auto_install_deps: true # Auto-detect imports → requirements.txt ssh_remote: host: "" # GPU server hostname gpu_ids: [] # Available GPU IDs remote_workdir: "/tmp/researchclaw_experiments" opencode: # OpenCode Beast Mode (auto-installed via `researchclaw setup`) enabled: true # Master switch (default: true) auto: true # Auto-trigger without confirmation (default: true) complexity_threshold: 0.2 # 0.0-1.0 — higher = only trigger on complex experiments model: "" # Override model (empty = use llm.primary_model) timeout_sec: 600 # Max seconds for OpenCode generation max_retries: 1 # Retry count on failure workspace_cleanup: true # Remove temp workspace after collection code_agent: # CodeAgent v2 — multi-phase code generation enabled: true # Use CodeAgent instead of legacy single-prompt codegen architecture_planning: true # Generate deep implementation blueprint before coding sequential_generation: true # Generate files one-by-one following dependency DAG hard_validation: true # AST-based validation gates (blocks identical ablations, hardcoded metrics) hard_validation_max_repairs: 2 # Max repair attempts when validation fails exec_fix_max_iterations: 3 # Execution-in-the-loop fix attempts exec_fix_timeout_sec: 60 # Timeout per exec-fix attempt benchmark_agent: # BenchmarkAgent — automated dataset & baseline selection enabled: true # Enable 
4-agent benchmark pipeline (Surveyor→Selector→Acquirer→Validator) enable_hf_search: true # Search HuggingFace Datasets enable_web_search: true # Search Google Scholar for benchmarks tier_limit: 2 # Dataset tier filtering (1=small/cached, 2=medium, 3=large) min_benchmarks: 1 # Minimum datasets required min_baselines: 2 # Minimum baseline methods required figure_agent: # FigureAgent — academic figure generation enabled: true # Enable 5-agent figure pipeline (Planner→CodeGen→Renderer→Critic→Integrator) min_figures: 3 # Minimum figures to generate max_figures: 8 # Maximum figures max_iterations: 3 # Critic-driven refinement iterations dpi: 300 # Output resolution strict_mode: false # Fail pipeline if figure generation fails repair: # Anti-fabrication experiment repair enabled: true # Auto-diagnose and repair failed experiments max_cycles: 3 # Repair retry loops min_completion_rate: 0.5 # >=50% conditions must complete to proceed min_conditions: 2 # At least 2 conditions for valid experiment use_opencode: true # Route repairs through OpenCode Beast Mode # === Web Search (Optional) === web_search: enabled: true # Enable web-augmented literature search tavily_api_key_env: "TAVILY_API_KEY" # Tavily API key env var (optional) enable_scholar: true # Google Scholar search enable_pdf_extraction: true # Extract text from PDFs max_web_results: 10 # Max web results per query # === Export === export: target_conference: "neurips_2025" # neurips_2025 | iclr_2026 | icml_2026 authors: "Anonymous" bib_file: "references" # === Prompts === prompts: custom_file: "" # Path to custom prompts YAML (empty = defaults) # === Security === security: hitl_required_stages: [5, 9, 20] # Stages requiring human approval allow_publish_without_approval: false redact_sensitive_logs: true # === Knowledge Base === knowledge_base: backend: "markdown" # markdown | obsidian root: "docs/kb" # === Notifications === notifications: channel: "console" # console | discord | slack target: "" # === MetaClaw Bridge 
(Optional) === metaclaw_bridge: enabled: false # Set to true to enable cross-run learning proxy_url: "http://localhost:30000" # MetaClaw proxy URL skills_dir: "~/.metaclaw/skills" # Where arc-* skills are stored fallback_url: "" # Direct LLM fallback when proxy is down fallback_api_key: "" # API key for fallback endpoint lesson_to_skill: enabled: true # Auto-convert lessons to skills min_severity: "warning" # Minimum severity to convert max_skills_per_run: 3 # Max new skills per pipeline run prm: # Process Reward Model quality gate (optional) enabled: false # Use LLM-as-judge to score stage outputs model: "gpt-5.4" # PRM judge model votes: 3 # Majority vote count gate_stages: [5, 9, 15, 20] # Stages to apply PRM gates # === OpenClaw Bridge === openclaw_bridge: use_cron: false # Scheduled research runs use_message: false # Progress notifications use_memory: false # Cross-session knowledge persistence use_sessions_spawn: false # Spawn parallel sub-sessions use_web_fetch: false # Live web search use_browser: false # Browser-based paper collection ```
--- ## 🙏 Acknowledgments Inspired by: - 🔬 [AI Scientist](https://github.com/SakanaAI/AI-Scientist) (Sakana AI) — Automated research pioneer - 🧠 [AutoResearch](https://github.com/karpathy/autoresearch) (Andrej Karpathy) — End-to-end research automation - 🌐 [FARS](https://analemma.ai/blog/introducing-fars/) (Analemma) — Fully Automated Research System --- ## 📄 License MIT — see [LICENSE](LICENSE) for details. --- ## 📌 Citation If you find AutoResearchClaw useful, please cite: ```bibtex @misc{liu2026autoresearchclaw, author = {Liu, Jiaqi and Xia, Peng and Han, Siwei and Qiu, Shi and Zhang, Letian and Chen, Guiming and Tu, Haoqin and Yang, Xinyu and Zhou, Jiawei and Zhu, Hongtu and Li, Yun and Zhou, Yuyin and Zheng, Zeyu and Xie, Cihang and Ding, Mingyu and Yao, Huaxiu}, title = {AutoResearchClaw: Fully Autonomous Research from Idea to Paper}, year = {2026}, organization = {GitHub}, url = {https://github.com/aiming-lab/AutoResearchClaw}, } ```

Built with 🦞 by the AutoResearchClaw team

================================================ FILE: config.researchclaw.example.yaml ================================================ project: name: "my-research" mode: "full-auto" research: topic: "Your research topic here" domains: - "machine-learning" daily_paper_count: 10 quality_threshold: 4.0 runtime: timezone: "America/New_York" max_parallel_tasks: 3 approval_timeout_hours: 12 retry_limit: 2 notifications: channel: "console" target: "" on_stage_start: true on_stage_fail: true on_gate_required: true knowledge_base: backend: "markdown" root: "docs/kb" openclaw_bridge: use_cron: false use_message: false use_memory: false use_sessions_spawn: false use_web_fetch: false use_browser: false llm: provider: "openai-compatible" base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" api_key: "" primary_model: "gpt-4o" fallback_models: - "gpt-4.1" - "gpt-4o-mini" # --- MiniMax provider example --- # provider: "minimax" # api_key_env: "MINIMAX_API_KEY" # primary_model: "MiniMax-M2.5" # fallback_models: # - "MiniMax-M2.5-highspeed" security: hitl_required_stages: [5, 9, 20] allow_publish_without_approval: false redact_sensitive_logs: true experiment: # ★ mode 决定实验结果的真实性 # "sandbox" — 在本地沙盒中实际执行生成的 Python 代码,产出真实实验数据 # "docker" — 在 Docker 容器中执行,支持 GPU 直通、依赖自动安装、内存隔离 # "simulated" — 不执行代码,使用公式生成假数据(仅用于框架开发调试,不应用于论文生成) mode: "sandbox" time_budget_sec: 300 max_iterations: 10 metric_key: "primary_metric" metric_direction: "minimize" sandbox: # Use ".venv/Scripts/python.exe" on Windows python_path: ".venv/bin/python3" gpu_required: false max_memory_mb: 4096 # Docker sandbox settings (only used when mode: "docker") # Build image first: docker build -t researchclaw/experiment:latest researchclaw/docker/ docker: image: "researchclaw/experiment:latest" gpu_enabled: true # gpu_device_ids: [0] # empty = all GPUs memory_limit_mb: 8192 network_policy: "setup_only" # none | setup_only | pip_only | full # pip_pre_install: ["torchdiffeq", "einops"] auto_install_deps: true 
shm_size_mb: 2048 keep_containers: false ssh_remote: host: "" # SSH hostname or IP user: "" # SSH username (default: current user) port: 22 # SSH port key_path: "" # Path to private key (default: ~/.ssh/id_rsa) gpu_ids: [] # e.g. [0, 1] for CUDA_VISIBLE_DEVICES remote_workdir: "/tmp/researchclaw_experiments" remote_python: "python3" setup_commands: [] # e.g. ["source ~/venv/bin/activate", "pip install torch"] # Docker-over-SSH (most secure remote execution) use_docker: false # Set true to run experiments inside Docker on remote host docker_image: "researchclaw/experiment:latest" docker_network_policy: "none" # none | full docker_memory_limit_mb: 8192 docker_shm_size_mb: 2048 # OpenCode Beast Mode — external AI coding agent for complex experiments # Install: npm i -g opencode-ai@latest (or use `researchclaw setup`) opencode: enabled: true # Master switch (default: true) auto: true # Auto-trigger without confirmation (default: true) complexity_threshold: 0.2 # 0.0-1.0 — higher = only trigger on complex experiments model: "" # Override model (empty = use llm.primary_model) timeout_sec: 600 # Max seconds for OpenCode generation max_retries: 1 # Retry count on failure workspace_cleanup: true # Remove temp workspace after collection # ============================================================================ # SSH Remote Examples # ============================================================================ # # 1. Lab server (bare Python, basic sandboxing): # experiment: # mode: "ssh_remote" # ssh_remote: # host: "gpu-server.lab.edu" # user: "researcher" # key_path: "~/.ssh/id_rsa" # gpu_ids: [0] # remote_python: "python3" # # 2. Lab server (Docker — most secure): # experiment: # mode: "ssh_remote" # ssh_remote: # host: "gpu-server.lab.edu" # user: "researcher" # key_path: "~/.ssh/id_rsa" # gpu_ids: [0] # use_docker: true # docker_image: "researchclaw/experiment:latest" # docker_network_policy: "none" # # 3. 
Colab via SSH tunnel: # experiment: # mode: "ssh_remote" # ssh_remote: # host: "localhost" # port: 12345 # user: "root" # remote_python: "python3" # setup_commands: # - "pip install torch torchvision -q" # # 4. Colab via Google Drive (most robust, no SSH needed): # experiment: # mode: "colab_drive" # colab_drive: # drive_root: "~/Library/CloudStorage/GoogleDrive-you@gmail.com/My Drive/researchclaw" # poll_interval_sec: 30 # timeout_sec: 3600 # setup_script: "pip install torch torchvision -q" # # Then in Colab: run the colab_worker.py that appears in your Drive colab_drive: drive_root: "" # Local path to Google Drive mount poll_interval_sec: 30 # How often to check for results timeout_sec: 3600 # Max wait per experiment (1 hour) setup_script: "" # Shell commands to run before each experiment # Scientific Visualization Agent (Code-to-Viz + Nano Banana) # Uses a Decision Agent to analyze paper content and determine: # - Code figures (bar charts, line plots) → Matplotlib/TikZ # - Image figures (architecture, flowcharts) → Gemini Nano Banana figure_agent: enabled: true min_figures: 3 max_figures: 10 max_iterations: 3 render_timeout_sec: 30 # Security: Docker sandbox for visualization code execution # use_docker: null # null = auto-detect, true = force, false = disable docker_image: "researchclaw/experiment:latest" # Output format: "python" (Matplotlib/Seaborn) or "latex" (TikZ/PGFPlots) output_format: "python" # Nano Banana (Gemini native image generation) nano_banana_enabled: true # gemini_api_key: "" # or set GEMINI_API_KEY env var gemini_model: "gemini-2.5-flash-image" strict_mode: false dpi: 300 # === Prompts === # Customize LLM prompts by pointing to your own YAML file. # Copy prompts.default.yaml, edit the prompts you want, and set the path here. prompts: custom_file: "" # e.g. 
"my_prompts.yaml" (empty = use built-in defaults) # === MetaClaw Integration === # Enable the MetaClaw bridge to get skill injection, PRM quality gates, # and continuous learning from research pipeline failures. # Requires MetaClaw to be running: metaclaw start --mode skills_only metaclaw_bridge: enabled: false proxy_url: "http://localhost:30000" # MetaClaw proxy endpoint skills_dir: "~/.metaclaw/skills" # MetaClaw skills directory fallback_url: "" # Direct LLM URL if proxy is down fallback_api_key: "" # PRM quality gate: LLM-as-judge scoring at gate stages prm: enabled: false api_base: "" # OpenAI-compatible API for PRM judge api_key_env: "PRM_API_KEY" api_key: "" model: "gpt-5.4" votes: 3 # Majority vote count gate_stages: [5, 9, 15, 20] # Stages to apply PRM gating # Lesson-to-skill: auto-convert pipeline failures into MetaClaw skills lesson_to_skill: enabled: true min_severity: "error" # Only convert error-level lessons max_skills_per_run: 3 ================================================ FILE: docs/BUG_FIX_DOCUMENT_20260316.md ================================================ # Bug Fix Document — AutoResearchClaw Pipeline > 生成日期:2026-03-16 > 反馈来源:2 位测试者(user1: CV 方向 / GPU 环境, user2: Windows 环境) > 总计问题:9 个 ## 📊 总览 | 分类 | 数量 | |------|------| | 🔴 确认的 Bug(需修复) | **4** | | 🟠 架构改进(强烈建议) | **2** | | 🔵 功能需求 | **3** | ## 🔥 修复优先级 | 优先级 | ID | 问题 | 阶段 | 涉及文件 | |--------|----|------|------|----------| | 🔴 CRITICAL | BUG-001 | 论文硬件信息与实际不一致 | PAPER_DRAFT (17) | `executor.py`, `prompts.py` | | 🔴 CRITICAL | BUG-002 | Windows 环境 Docker 不可用导致实验链式失败 | EXPERIMENT_RUN (12) | `factory.py`, `docker_sandbox.py` | | 🔴 HIGH | BUG-003 | 论文内容自相矛盾(承诺评测但未执行) | PAPER_DRAFT (17), PEER_REVIEW (18) | `executor.py`, `prompts.py` | | 🔴 HIGH | BUG-004 | 生成代码缺少数值稳定性防护(NaN/Inf) | CODE_GENERATION (10) | `code_agent.py`, `prompts.py` | | 🟠 HIGH | ARCH-001 | Stage 17 过于严格的 hard block 策略 | PAPER_DRAFT (17) | `executor.py` | | 🟠 HIGH | ARCH-002 | Idea 降级时不询问用户确认 | EXPERIMENT_DESIGN (9), 
RESEARCH_DECISION (15) | `executor.py`, `stages.py` | --- ## 确认的 Bug — 详细修复方案 ### 🔴 `BUG-001` — 论文硬件信息与实际机器不一致 | 字段 | 内容 | |------|------| | **严重程度** | CRITICAL | | **所属阶段** | PAPER_DRAFT (Stage 17) | | **报告者** | user1 | **问题描述:** 论文中声称使用 A100 GPU 训练,但测试者实际机器上是 A5000。Pipeline 在 Stage 1 检测了硬件并保存到 `hardware_profile.json`,但在论文生成阶段完全没有利用这个信息来约束 LLM 输出。 **根因分析:** - `executor.py` 第 1226-1233 行:Stage 1 (TOPIC_INIT) 检测硬件,保存 `hardware_profile.json`,包含 `gpu_name`、`vram_gb` 等 - `executor.py` 第 2352-2391 行:硬件信息 **仅** 用于 CODE_GENERATION 阶段的代码生成 hints - `executor.py` 第 5776-5848 行:PAPER_DRAFT 阶段构建 prompt 时,**没有注入硬件 profile 信息** - LLM 在缺少约束的情况下会「幻觉」出常见的高端硬件名称(如 A100) **涉及文件:** - `researchclaw/pipeline/executor.py`(PAPER_DRAFT 阶段的 prompt 构建部分,约第 5776-5960 行) - `researchclaw/prompts.py`(paper writing prompt 模板) **修复方案:** 1. 在 PAPER_DRAFT 阶段的 prompt 构建中,读取 `stage-01/hardware_profile.json` 2. 将实际硬件信息(GPU 型号、VRAM、CPU 等)作为 **硬性约束** 注入 prompt,例如: ``` HARDWARE CONSTRAINT: The experiments were run on the following hardware: - GPU: {gpu_name} ({vram_gb} GB VRAM) - CPU: {cpu_info} You MUST use this exact hardware specification in the paper. Do NOT substitute with other GPU models. ``` 3. 在 PEER_REVIEW (Stage 18) 的 prompt 中增加一条审核规则:验证 paper 中提到的硬件是否与 `hardware_profile.json` 一致 **修复后预期行为:** 论文中的硬件描述必须与实际运行环境一致。
原始反馈证据 > 然后就是paper和实验中有一些misalign的地方,比如paper里写说用的A100,实际上机器里的是A5000
--- ### 🔴 `BUG-002` — Windows 环境下 Docker 不可用导致实验链式失败 | 字段 | 内容 | |------|------| | **严重程度** | CRITICAL | | **所属阶段** | EXPERIMENT_RUN (Stage 12) → 链式影响到 Stage 13, 14, 17 | | **报告者** | user2 | **问题描述:** 在 Windows 环境下,Docker 不可用时 Pipeline 直接崩溃(`[WinError 2] The system cannot find the file specified`),导致所有后续阶段连锁失败。用户最终看到的是 Stage 17 的误导性错误「没有实验数据无法写论文」,完全看不到真正的根因。 **根因分析:** - `experiment/factory.py` 第 25-29 行:当 `config.experiment.mode == "docker"` 时调用 `DockerSandbox.check_docker_available()`,如果 Docker 不可用直接 raise `RuntimeError`,**没有自动 fallback 到 subprocess sandbox** - `docker_sandbox.py` 第 337、366 行:Docker volume mount 使用 POSIX 风格路径(如 `{staging_dir}:/workspace`),在 Windows 上可能导致挂载失败 - **链式失败:** Stage 12 crash → 无 metrics → Stage 13 空跑(`refine_sandbox_v1` 到 `v9` 都失败) → Stage 14 空 `experiment_summary.json` → Stage 17 hard block - 用户看到的错误完全不提 Docker,只说「no metrics」,非常误导 **涉及文件:** - `researchclaw/experiment/factory.py`(第 25-29 行,sandbox 创建逻辑) - `researchclaw/experiment/docker_sandbox.py`(第 337、366、384 行,路径和命令构建) - `researchclaw/pipeline/executor.py`(第 6000-6020 行,Stage 17 hard block) **修复方案:** 1. `factory.py`:当 Docker 不可用时,自动 fallback 到 subprocess sandbox 模式,而不是 raise RuntimeError。增加日志 warning 告知用户: ```python if not DockerSandbox.check_docker_available(): logger.warning("Docker not available, falling back to subprocess sandbox mode") return SubprocessSandbox(...) ``` 2. `docker_sandbox.py`:修复 Windows 路径兼容性问题,使用 `pathlib.PureWindowsPath` 或 `os.path` 正确处理跨平台路径 3. 在 Stage 12 的错误信息中明确指出是 Docker 问题,而不是让错误沿链传播变成「no metrics」 **修复后预期行为:** Windows 用户即使没有 Docker,Pipeline 也能通过 subprocess sandbox 完成实验。即使实验部分失败,错误信息应清晰指向根因。
原始反馈证据 > 我跑了两次 两次都有stage fail 最后没有生成报告 压缩包中 `experiment_summary.json` stderr: `[WinError 2] The system cannot find the file specified` `pipeline_summary.json`: `"final_status": "failed"`, `"stages_failed": 1` `stage-17/paper_draft.md`: `Experiment stage produced no metrics (status: failed/timeout). Cannot write a paper without real experimental data.`
--- ### 🔴 `BUG-003` — 论文内容自相矛盾(承诺评测数据集但未实际执行) | 字段 | 内容 | |------|------| | **严重程度** | HIGH | | **所属阶段** | PAPER_DRAFT (Stage 17), PEER_REVIEW (Stage 18) | | **报告者** | user1 | **问题描述:** 论文前半部分按照用户的 topic 描述声称会在 MME、DocVQA、TextVQA 等数据集上评测,但实际实验阶段因为环境原因未能完成这些评测。论文后半部分在 Limitation 中又说「没有在这些数据集上评估」,形成自相矛盾。 **根因分析:** - `prompts.py` 第 2006-2018 行:有 EVIDENCE-BOUNDING RULES(Rule 7-9),但这些只是 prompt 中的 **建议**,LLM 可以忽略 - `executor.py` 第 5647-5715 行:`_detect_result_contradictions()` 函数检测 null/negative results,但只生成 advisory text 注入 prompt,**不做硬性阻断** - `executor.py` 第 6432-6443 行:PEER_REVIEW 阶段收集 `actual_run_count` 作为 evidence,但 **没有自动扫描 paper 文本提取声称的数据集列表并与实际评测记录对比** - 核心问题:**缺少 claim-evidence 的自动对齐验证** **涉及文件:** - `researchclaw/pipeline/executor.py`(第 5647-5715 行、5944-5956 行、6432-6443 行) - `researchclaw/prompts.py`(第 2006-2049 行、2124-2138 行) **修复方案:** 1. 在 PAPER_DRAFT 阶段的 prompt 中,**明确列出** 实际完成评测的数据集和指标(从 `experiment_summary.json` 提取),硬性要求 LLM **只能**声称在这些数据集上进行了评测: ``` ACTUAL EVALUATED DATASETS: [ImageNet-val (reconstruction)] You MUST NOT claim evaluation on any dataset not listed above. If the original research plan included additional datasets that were not evaluated, explain this honestly in the Limitations section WITHOUT first claiming you did evaluate them. ``` 2. 在 PEER_REVIEW (Stage 18) 增加一个专项检查:自动提取 paper 中所有提到的 benchmark/dataset 名称,与 `experiment_summary.json` 中的实际 metrics keys 对比,不一致则标记为 CRITICAL discrepancy 3. 在 PAPER_REVISION (Stage 19) 中把这些 discrepancy 作为必须修改的 reviewer comment **修复后预期行为:** 论文中不会出现「前面说评测了 X,后面说没评测 X」的自相矛盾。所有评测声明必须有实验数据支撑。
原始反馈证据 > 以及就是paper中有一些自相矛盾的地方,比如前面按照我的要求,说会在哪几个数据集上面进行评估,后面又没有测,然后在limitation说我们没有在这几个数据集上评估
--- ### 🔴 `BUG-004` — 生成代码缺少数值稳定性防护(NaN/Inf 导致实验提前终止) | 字段 | 内容 | |------|------| | **严重程度** | HIGH | | **所属阶段** | CODE_GENERATION (Stage 10), ITERATIVE_REFINE (Stage 13) | | **报告者** | user1 | **问题描述:** 实验训练过程中出现 `loss = inf` → `loss = nan` 的数值爆炸,触发 harness 的 NaN 检测后实验提前终止。代码生成阶段没有在生成的训练代码中加入数值稳定性保护。 **根因分析:** - `code_agent.py`:**完全没有** 关于数值稳定性的 prompt 指令。4 个阶段(Planning → Code Generation → Execution-in-the-Loop → Multi-Agent Review)都不检查 NaN guard - `experiment/harness_template.py` 第 45-62 行:有 `check_value()` 做 NaN/Inf 检测,但这是 **opt-in 机制**——只有生成代码主动调用 `self.check_value(loss, "loss")` 才有效 - `executor.py` 第 779-900 行:`_detect_runtime_issues()` 在运行 **之后** 检测 NaN,但此时实验已经失败了 - `executor.py` 第 3915-3956 行:Stage 13 检测到 NaN 后调用 LLM 做 `iterative_repair`,但修复质量不稳定 **涉及文件:** - `researchclaw/pipeline/code_agent.py`(prompt 构建,所有阶段) - `researchclaw/prompts.py`(代码生成相关 prompt) - `researchclaw/experiment/harness_template.py`(第 45-62 行) **修复方案:** 1. 在 `code_agent.py` 的代码生成 prompt 中,增加 **强制性** 数值稳定性要求: ``` NUMERICAL STABILITY REQUIREMENTS (MANDATORY): - Add gradient clipping (max_norm=1.0) to all optimizer steps - Check loss for NaN/Inf before backward pass: if not math.isfinite(loss): skip this batch - Use torch.amp.GradScaler for mixed precision training if applicable - Add learning rate warmup for the first 5-10% of training steps - Use self.check_value(loss, "loss") from experiment harness for NaN tracking ``` 2. 在 `harness_template.py` 中,将 `check_value()` 改为 **自动 hook** 而非 opt-in——在 `finalize()` 中自动检查 metrics 是否为 finite 3. 在 Multi-Agent Review 阶段(`code_agent.py` Phase 4)增加数值稳定性作为必审项 **修复后预期行为:** 生成的训练代码默认包含 gradient clipping 和 NaN guard,训练过程中数值爆炸能被及时 catch 并恢复,而不是直接终止。
原始反馈证据 > 好像是他的代码写错了之类的 压缩包中 `experiment_summary.json` stderr: ``` WARNING: loss = inf (non-finite, skipped) WARNING: loss = nan (non-finite, skipped) WARNING: loss = nan (non-finite, skipped) WARNING: loss = nan (non-finite, skipped) WARNING: loss = nan (non-finite, skipped) FAIL: Too many NaN/Inf values detected. Stopping experiment early. ```
--- ## 架构改进 — 强烈建议 ### 🟠 `ARCH-001` — Stage 17 (PAPER_DRAFT) 过于严格的 hard block 策略 | 字段 | 内容 | |------|------| | **严重程度** | HIGH | | **所属阶段** | PAPER_DRAFT (Stage 17) | | **报告者** | user2(链式影响) | **问题描述:** 当实验阶段没有产出完整 metrics 时,Stage 17 直接 FAILED,不尝试用已有数据写论文。这导致前面 1-16 阶段的全部成果被浪费。 **根因分析:** - `executor.py` 第 6000-6020 行:当 `has_real_metrics == False` 且 domain 为 empirical 时,直接返回 `StageStatus.FAILED` - Stage 13 (ITERATIVE_REFINE) 的中间迭代可能产出了部分有效 metrics,但 Stage 17 只看 `experiment_summary.json` 的 final best_run **涉及文件:** - `researchclaw/pipeline/executor.py`(第 6000-6020 行) **修复方案:** 将 hard block 改为 soft degradation: 1. 如果有部分 metrics(即使不完整),用已有数据写论文 2. 在 prompt 中明确告知 LLM 数据不完整,要求在 Abstract 和 Limitations 中如实说明 3. 只有在 **完全没有任何数据**(甚至没有 stage-07 synthesis 和 stage-08 hypotheses)的极端情况下才 hard block 4. 在输出的 `paper_draft.md` 头部加 warning 标记,方便后续阶段识别 **修复后预期行为:** 实验部分失败时,Pipeline 仍能生成一篇带有诚实 Limitations 的论文,用户至少得到有价值的输出。 --- ### 🟠 `ARCH-002` — Idea 被降级到弱版本时不询问用户 | 字段 | 内容 | |------|------| | **严重程度** | HIGH | | **所属阶段** | EXPERIMENT_DESIGN (Stage 9), RESEARCH_DECISION (Stage 15) | | **报告者** | user1 | **问题描述:** 用户给了一个复杂的 strong idea(如 VAE+ViT 统一编码器 + 多数据集评测),Pipeline 因资源限制(数据集不可用、GPU 不够、环境配不好)自动降级到 weaker 版本,但不通知或征求用户意见。用户认为降级后的研究「变得没啥意义」。 **根因分析:** - `executor.py` 第 2220-2236 行:LLM 生成的实验计划无效时,使用 topic-derived fallback,**不询问用户** - `executor.py` 第 4618-4640 行:RESEARCH_DECISION 检测 degenerate cycle 时只给 LLM advisory,**不暂停** - `stages.py` 第 109-115 行:GATE_STAGES 只包含 Stage 5、9、20,不包含 Stage 15 - `agents/benchmark_agent/orchestrator.py` 第 314-322 行:BenchmarkAgent 验证失败时 silent retry,最终 silent proceed **涉及文件:** - `researchclaw/pipeline/executor.py`(第 2220-2236 行、4618-4640 行) - `researchclaw/pipeline/stages.py`(GATE_STAGES 定义) - `researchclaw/agents/benchmark_agent/orchestrator.py`(第 314-322 行) **修复方案:** 1. 在 EXPERIMENT_DESIGN (Stage 9) 中,当检测到 significant downgrade(如:用户要求的数据集不可用、GPU 不满足要求、关键组件被简化)时,生成一个 **downgrade summary** 并暂停等待用户确认 2. 
在 RESEARCH_DECISION (Stage 15) 中,将 REFINE → weaker idea 的决策标记为 GATE,需要用户 approve 3. 可以通过 `auto_approve` 参数让用户选择是否跳过这些确认(保持向后兼容) **修复后预期行为:** Pipeline 在降级研究方案前通知用户,用户可以选择:接受降级、提供更多资源(如更大的 GPU)、或终止当前 run。
原始反馈证据 > 对,还有就是比如我提出了一个相对strong的idea,而他因为各种原因(比如数据集找不到,环境配不好,gpu不够)之类的,给我fallback到weaker的idea之后,我感觉这个时候应该询问一下用户要不要继续跑 > > 因为很多时候他继续跑的内容就会变得没啥意义
--- ## 功能需求 ### 🔵 `FEAT-001` — 论文生成后增加一致性反馈循环 - **报告者:** user1 - **描述:** 在论文生成之后,增加专门的 consistency check,检查 paper 中的声明与实际实验结果是否一致 - **建议:** 可以在 PEER_REVIEW (Stage 18) 的 prompt 中增加 claim-evidence alignment 专项检查。或者在 Stage 17 和 18 之间加一个轻量级的自动验证步骤
原始反馈 > 感觉这个可以在paper生成之后,加一些相关的consistence feedback之类的?
### 🔵 `FEAT-002` — 从 Related Works 的 GitHub 学习 Common Practice - **报告者:** user1 - **描述:** 当前 Pipeline 的 literature 阶段只读论文,不看对应的开源代码。用户建议访问 related works 的 GitHub repo,学习 paper 中不会写的实现细节(tricks、common practice),缓解论文内容过于古老的问题 - **建议:** 在 KNOWLEDGE_EXTRACT (Stage 6) 或 EXPERIMENT_DESIGN (Stage 9) 增加 GitHub repo 分析能力。可以用 GitHub API 搜索 related works 的 repo,提取 README、主要代码结构、训练配置等信息
原始反馈 > 对就是我觉得即使不拿来用,visit related works的github也是有必要的,这样可以看到其他工作的common practice(一些不会在paper中出现的细节),应该会挺有用的。感觉可以缓解一下paper内容过于古老的问题
### 🔵 `FEAT-003` — 代码应该复用 Related Works 的框架 - **报告者:** user1 - **描述:** 当前代码都是 LLM 从零写的简单文件,用户建议从 most related works 中选一个合适的框架来用,就像真实研究中的做法 - **建议:** 可以在 BenchmarkAgent 或 CODE_GENERATION 阶段增加框架选择逻辑——从相关论文的开源实现中挑选合适的 codebase 作为起点,而不是从零生成。这是一个较大的改动,可以作为长期目标
原始反馈 > 以及他现在写的代码都比较简单,都是自己写几个文件对吧。我在想或许可以从most related works里面选一个合适的框架来用?我们平时也是这样的对吧。当然这个比较复杂,可以先不考虑
--- ## 附录:按测试者分组 ### 测试者:`user1` - **学科/领域:** 计算机视觉(CV),统一图像编解码器 - **运行环境:** GPU 服务器(A5000),使用 Codex 监控 - **总计问题:** 6 - **确认 Bug:** 3(BUG-001, BUG-003, BUG-004) - **架构改进:** 1(ARCH-002) - **功能需求:** 3(FEAT-001, FEAT-002, FEAT-003) | ID | 问题 | 状态 | 严重程度 | |----|------|------|---------| | BUG-001 | 论文硬件信息与实际不一致 | confirmed | CRITICAL | | BUG-003 | 论文内容自相矛盾 | confirmed | HIGH | | BUG-004 | 代码缺少数值稳定性防护 | confirmed | HIGH | | ARCH-002 | Idea 降级不询问用户 | confirmed | HIGH | | FEAT-001 | 一致性反馈循环 | feature_request | — | | FEAT-002 | 从 GitHub 学习 common practice | feature_request | — | | FEAT-003 | 复用 related works 框架 | feature_request | — | ### 测试者:`user2` - **学科/领域:** 未知(topic 与纳米药物递送相关) - **运行环境:** Windows - **总计问题:** 2 - **确认 Bug:** 1(BUG-002) - **架构改进:** 1(ARCH-001) | ID | 问题 | 状态 | 严重程度 | |----|------|------|---------| | BUG-002 | Windows Docker 链式失败 | confirmed | CRITICAL | | ARCH-001 | Stage 17 过于严格的 hard block | confirmed | HIGH | --- ## 修复执行指引 > 本文档设计为可由另一台机器上的 Claude Code agent 直接读取并执行修复。 > 建议按优先级从上到下依次修复,每修复一个 Bug 运行相关测试验证。 **修复顺序建议:** 1. BUG-002(Docker fallback)→ 解除 Windows 用户的完全阻塞 2. BUG-001(硬件一致性)→ 简单修复,prompt 注入即可 3. BUG-004(NaN guard)→ prompt 层面修复,影响面大 4. BUG-003(claim-evidence 对齐)→ 需要新增验证逻辑 5. ARCH-001(soft degradation)→ 改变 Stage 17 策略 6. 
ARCH-002(用户确认 Gate)→ 需要状态机和 Gate 逻辑调整 ================================================ FILE: docs/BUG_TRACKER.md ================================================ # Bug Tracker & TODO > 实验运行期间发现的 bug 和待修复事项。实验结束后统一修复。 ## 已发现的 Bug ### BUG-01: Stage 2 合约缺少 queries.json 输出 (已修复) - **状态**: ✅ 已修复 (commit `19c74a0`) - **描述**: `contracts.py` 中 Stage 2 (PROBLEM_DECOMPOSE) 的 `output_files` 包含 `queries.json`,但实际实现只生成 `problem_tree.md`。`queries.json` 实际在 Stage 3 生成。 - **影响**: Pipeline 在 Stage 2 直接失败 - **修复**: 从 Stage 2 output_files 移除 `queries.json`,从 Stage 3 input_files 移除 `queries.json` ### BUG-02: gpt-5.4 持续 429 限流 - **状态**: ⏳ 待观察 - **描述**: 同时运行多个 pipeline 时,gpt-5.4 频繁返回 429。fallback 机制可以兜底但速度大幅下降。 - **影响**: 运行时间显著增加(Case 2 上轮从 ~2.5h 增至 ~6h) - **建议**: 考虑增加 pipeline 间的启动间隔,或实现全局 API 调用速率协调 ### BUG-03: S2/arXiv 文献搜索 429 限流 - **状态**: ✅ 已缓解 (commit `63c5a7d` circuit breaker) - **描述**: Semantic Scholar 和 arXiv API 在并发请求时频繁 429 - **影响**: 文献收集阶段延迟,但 circuit breaker 保证最终完成 ### BUG-04: Stage 10 深度质量检查 — 类方法不足 - **状态**: ✅ 已加强 (远程 commit `855c201`) - **描述**: 生成的代码中多个类只有 1 个非 dunder 方法,质量检查报告 "algorithm classes should have at least __init__ + one core method" - **影响**: 代码质量评分降低,但不阻塞 pipeline - **远程修复**: 新增 Check 6 — ablation 子类必须 override 父类至少一个非 dunder 方法,否则报警告。修复写入 `validator.py` 和 `executor.py` 的 repair prompt。 ### BUG-05: Stage 10 深度质量检查 — UnboundLocalError 风险 - **状态**: ✅ 已修复 (远程 commit `855c201`) - **描述**: 生成代码中变量只在 if 分支内赋值,但在分支外使用(如 main.py:289 `mask`, main.py:300 `out` 等) - **影响**: 生成的实验代码可能在运行时崩溃 - **远程修复**: 新增 `auto_fix_unbound_locals()` 函数(`validator.py`),在 Stage 10 代码生成后自动检测 if-only 变量并插入 `var = None` 初始化。`executor.py` 在深度检查前调用。 ### BUG-05 更新: UnboundLocalError 问题在 v8r3 中大幅恶化 - **状态**: ✅ 已修复 (被 `auto_fix_unbound_locals()` 覆盖) - **描述**: v8r3 中 Case 3 (PEFT) 生成的代码有 **47 处** UnboundLocalError 风险(data.py 27 处, methods.py 20 处, main.py 2 处),远超 v8r2 的 8 处。Case 2 也有 8 处。 - **根因**: LLM 生成的代码模式为 `if cond: x = val` 后直接 `use(x)`,缺少 else 分支或默认值初始化 - **远程修复**: 程序化自动修复已集成到 Stage 10 
pipeline 中 ### BUG-06: P9 Metric direction mismatch - **状态**: ✅ 已修复 - **描述**: 配置写 `minimize` 但实验代码声明 `direction=higher`,自动纠正为 `maximize` - **影响**: 可能影响实验结果的正确性 - **修复**: (1) Stage 9 prompt 中注入 `metric_direction` 约束; (2) Stage 12 code_generation prompt 中强制 METRIC_DEF direction 与 config 一致; (3) 取消 auto-correction,改为仅 warn 并保持 config 值 ### BUG-07: Stage 23 CITATION_VERIFY 失败率高 - **状态**: ✅ 已修复 - **描述**: 上轮 Case 1 和 Case 3 都在 Stage 23 失败(28/29),仅 Case 2 通过 - **影响**: 最终 pipeline 状态标记为 failed - **根因**: (1) `_check_citation_relevance()` 最多只处理 30 个 citation,超出的无评分; (2) 无评分的 citation 在 hard cap 排序时被当作 0.0 分全部删除 - **修复**: (1) 改为分批处理所有 citation (batch=30); (2) 无评分 citation 默认 0.7(已验证=大概率相关) ### BUG-08: CodeGen `'str' object has no attribute 'get'` (v8r3 新发现) - **状态**: ✅ 已修复 - **严重度**: 中 — 不阻塞 pipeline(有 fallback),但连续失败 6 次 - **描述**: Case 1 在 Stage 14 (RESULT_ANALYSIS) 触发 CodeGen 时连续报 `'str' object has no attribute 'get'`。疑似 LLM 返回了纯字符串而非 dict,代码对返回值调 `.get()` 导致 AttributeError。 - **远程修复**: executor.py 中 `_check_ablation_effectiveness` 等函数已加 `isinstance` 保护 - **本地修复**: `code_agent.py` 中 `_parse_json` 结果增加 `isinstance(review, dict)` 检查 ### BUG-09: FigureAgent 无法生成图表 (v8r3 新发现) - **状态**: ✅ 已修复 - **描述**: Case 1 Stage 14 中 `FigureAgent produced no charts, falling back`。FigureAgent 可能因上游 CodeGen 失败或数据格式问题无法生成图表。 - **影响**: 论文缺少可视化图表,影响质量分数 - **根因**: `_condition_summaries` 在 metrics 不含 `/` 分隔符时为空,导致 Planner 没有数据 - **修复**: (1) 从 `metrics_summary` fallback 构建 condition_summaries; (2) 从 `structured_results` 二次 fallback; (3) 向 FigureAgent 传入 `best_run_metrics` 作为数据源兜底 ### BUG-10: Degenerate refine cycle (v8r3 新发现) - **状态**: ✅ 已修复 (远程 commit `e30443e`) - **描述**: Case 1 出现 `P6: Degenerate refine cycle detected, injecting PROCEED hint`。Pipeline 检测到实验迭代循环没有实质进展,自动注入 PROCEED 跳出。 - **远程修复**: 根因是 LLM 在迭代 refine 时重命名/替换 condition 名称导致漂移。修复方案:在 `iterative_improve` prompt 中注入 `exp_plan.yaml` 锚定,并禁止改名条件。 ## 远程额外修复(BUG_TRACKER 未记录的问题) ### RFix-01: Baselines dict→list 转换 (commit `855c201`) - 若 LLM 输出 
baselines 为 dict 而非 list,`executor.py` 现在自动转换为 `list(dict.keys())` ### RFix-02: Gymnasium 环境版本 v4→v5 (commit `855c201`) - `benchmark_knowledge.yaml` 中 HalfCheetah-v4→v5, Hopper-v4→v5 ### RFix-03: Time budget 注入到 Stage 9 (commit `855c201`) - 实验设计 prompt 中增加 `time_budget_sec` 约束,防止生成超时的实验方案 ### RFix-04: 代码模板 optimizers.py→models.py (commit `855c201`) - 代码生成模板从 `optimizers.py` 改为 `models.py`,并禁止生成只有 import/pass 的 stub 文件 ### RFix-05: RL 稳定性修复提示 (commit `e30443e`) - `iterative_repair` prompt 中增加 gradient clipping、LR cap、reward normalization、NaN guard 等常见 RL 修复建议 ## 待修复汇总 | Bug | 优先级 | 状态 | |-----|--------|------| | BUG-02 gpt-5.4 限流 | 低 | ⏳ 待观察 (外部限制) | 所有代码层面的 bug 已修复。 ## 待办事项 (TODO) - [x] 拉取远程更新,对比 bug 修复状态 - [x] 更新 BUG_TRACKER 标注远程已修复项 - [x] 修复 BUG-06: 在 experiment design 阶段校验 metric direction 一致性 - [x] 修复 BUG-07: 分析 Stage 23 引用验证高失败率原因 - [x] 完善 BUG-08: CodeGen 调用处增加 str 类型保护 - [x] 修复 BUG-09: FigureAgent 输入数据格式检查 - [ ] 分析本轮 (v8r3) 三个 case 的质量分数,对比上轮 (v8r2) - [ ] 考虑增加 pipeline 间的 API 调用协调机制 ## 历史质量分数对比 | 版本 | Case 1 (Graph-RAG) | Case 2 (Diffusion) | Case 3 (PEFT) | 平均 | |------|--------------------|--------------------|---------------|------| | v8r2 | 5.2/10 | 8.0/10 | 5.8/10 | 6.3 | | v8r3 | 待定 | 待定 | 待定 | 待定 | --- *最后更新: 2026-03-16* ================================================ FILE: docs/CHANGELOG_ANTHROPIC_ADAPTER.md ================================================ # Anthropic Messages API Adapter — 改动说明 > 本文档详细描述了为 ResearchClaw LLM 模块引入 Anthropic Messages API 原生支持的改动内容, > 并通过架构图说明本次改动 **不影响现有 OpenAI / OpenRouter / DeepSeek 等 provider 的任何行为**。 --- ## 目录 1. [改动背景](#1-改动背景) 2. [架构总览 — 改动前后对比](#2-架构总览--改动前后对比) 3. [核心设计:适配器模式](#3-核心设计适配器模式) 4. [调用流程详解](#4-调用流程详解) 5. [对现有 Provider 零影响的保证](#5-对现有-provider-零影响的保证) 6. [变更文件清单](#6-变更文件清单) 7. [异常处理与重试机制](#7-异常处理与重试机制) 8. [配置示例](#8-配置示例) 9. [新增依赖](#9-新增依赖) --- ## 1. 
改动背景 ResearchClaw 的 LLM 模块原先仅支持 **OpenAI Chat Completions API 格式**(含兼容此格式的 OpenRouter、DeepSeek 等)。 Anthropic 的 Claude 系列模型使用独立的 **Messages API**,其请求/响应结构与 OpenAI 格式存在显著差异: | 差异点 | OpenAI 格式 | Anthropic 格式 | |---|---|---| | 认证方式 | `Authorization: Bearer ` | `x-api-key: ` | | System 消息 | 放在 `messages` 数组中 | 独立的 `system` 字段 | | 端点路径 | `/v1/chat/completions` | `/v1/messages` | | 响应结构 | `choices[0].message.content` | `content[0].text` | | Token 统计 | `prompt_tokens` / `completion_tokens` | `input_tokens` / `output_tokens` | 为了原生支持 Anthropic API 而不影响现有功能,我们采用了 **适配器模式(Adapter Pattern)**。 --- ## 2. 架构总览 — 改动前后对比 ### 改动前 ```mermaid graph TB subgraph "create_llm_client (工厂函数)" A[config.llm.provider] -->|"acp"| B[ACPClient] A -->|"其他所有"| C["内联构造 LLMClient
使用 PROVIDER_PRESETS 填充 base_url"] end C --> D["_raw_call()
urllib → OpenAI /chat/completions"] D --> E[LLMResponse] style B fill:#e1f5fe style C fill:#e8f5e9 style D fill:#e8f5e9 ``` ### 改动后 ```mermaid graph TB subgraph "create_llm_client (工厂函数)" A[config.llm.provider] -->|"acp"| B[ACPClient] A -->|"其他所有"| C["LLMClient.from_rc_config()
使用 PROVIDER_PRESETS 填充 base_url"] end C -->|"provider == anthropic"| F["挂载 AnthropicAdapter"] C -->|"其他 provider"| G["_anthropic = None"] subgraph "_raw_call() 内部分支" H{"self._anthropic
是否存在?"} H -->|"是 (Anthropic)"| I["AnthropicAdapter.chat_completion()
httpx → Anthropic /v1/messages"] H -->|"否 (OpenAI 等)"| J["原有逻辑不变
urllib → OpenAI /chat/completions"] end F --> H G --> H I --> K["返回 OpenAI 兼容格式 dict"] J --> K K --> L["统一解析 → LLMResponse"] style B fill:#e1f5fe style F fill:#fff3e0 style I fill:#fff3e0 style G fill:#e8f5e9 style J fill:#e8f5e9 style L fill:#f3e5f5 ``` > 绿色 = 原有逻辑(未修改),橙色 = 新增 Anthropic 路径,紫色 = 共享的统一出口。 --- ## 3. 核心设计:适配器模式 ```mermaid classDiagram class LLMClient { -LLMConfig config -AnthropicAdapter _anthropic +chat(messages, ...) LLMResponse +preflight() tuple -_call_with_retry(model, ...) LLMResponse -_raw_call(model, ...) LLMResponse } class AnthropicAdapter { -str base_url -str api_key -int timeout_sec +chat_completion(model, messages, ...) dict } class LLMResponse { +str content +str model +int prompt_tokens +int completion_tokens } LLMClient "1" *-- "0..1" AnthropicAdapter : _anthropic LLMClient ..> LLMResponse : returns AnthropicAdapter ..> LLMResponse : "返回 OpenAI 兼容 dict\n由 LLMClient 统一解析" note for AnthropicAdapter "仅当 provider=='anthropic' 时实例化\n其他 provider 时 _anthropic = None" ``` **关键设计决策:** - `AnthropicAdapter` 是 `LLMClient` 的一个 **可选内部组件**,不是独立的客户端类 - 适配器返回 **OpenAI 兼容格式的 dict**,由 `_raw_call()` 的统一出口解析为 `LLMResponse` - 当 `_anthropic is None` 时,`_raw_call()` 走 **完全不变的原有 OpenAI 路径** --- ## 4. 调用流程详解 以下时序图展示了两种 provider 各自的完整调用链路: ### OpenAI / OpenRouter / DeepSeek(原有流程,零改动) ```mermaid sequenceDiagram participant Caller as 调用方 participant Client as LLMClient participant Raw as _raw_call() participant API as OpenAI API Caller->>Client: chat(messages) Client->>Client: _call_with_retry(model, ...) Client->>Raw: _raw_call(model, ...) Note over Raw: self._anthropic is None
→ 走 else 分支 (原有逻辑) Raw->>API: urllib POST /chat/completions API-->>Raw: {"choices": [...], "usage": {...}} Raw-->>Client: LLMResponse Client-->>Caller: LLMResponse ``` ### Anthropic(新增流程) ```mermaid sequenceDiagram participant Caller as 调用方 participant Client as LLMClient participant Raw as _raw_call() participant Adapter as AnthropicAdapter participant API as Anthropic API Caller->>Client: chat(messages) Client->>Client: _call_with_retry(model, ...) Client->>Raw: _raw_call(model, ...) Note over Raw: self._anthropic 存在
→ 走 if 分支 Raw->>Adapter: chat_completion(model, messages, ...) Note over Adapter: 1. 提取 system 消息
2. 构建 Anthropic 请求体
3. httpx POST /v1/messages Adapter->>API: httpx POST /v1/messages API-->>Adapter: {"content": [...], "usage": {...}} Note over Adapter: 转换为 OpenAI 兼容格式 Adapter-->>Raw: {"choices": [...], "usage": {...}} Note over Raw: 统一解析(与 OpenAI 路径完全相同) Raw-->>Client: LLMResponse Client-->>Caller: LLMResponse ``` --- ## 5. 对现有 Provider 零影响的保证 ```mermaid graph LR subgraph "provider != 'anthropic' 时的代码路径" A["from_rc_config()"] --> B["PROVIDER_PRESETS 填充 base_url ✅"] B --> C["LLMClient.__init__()"] C --> D["self._anthropic = None"] D --> E["_raw_call()"] E --> F{"self._anthropic?"} F -->|"None → False"| G["else 分支
原有 OpenAI 逻辑
(代码未修改)"] end style G fill:#e8f5e9,stroke:#4caf50,stroke-width:3px style F fill:#fff9c4 ``` **零影响的 5 重保证:** | # | 保证机制 | 说明 | |---|---|---| | 1 | **条件初始化** | `AnthropicAdapter` 仅在 `provider == "anthropic"` 时实例化,其他 provider 不触发任何新代码 | | 2 | **`_anthropic = None`** | `__init__` 中默认设为 `None`,非 Anthropic provider 永远不会进入适配器分支 | | 3 | **else 分支 = 原代码** | `_raw_call()` 的 else 分支包含的是 **未修改的** OpenAI urllib 调用逻辑 | | 4 | **PROVIDER_PRESETS 保留** | 恢复了 preset base_url 回退逻辑,`openai` / `openrouter` / `deepseek` 的自动 URL 填充行为与之前一致 | | 5 | **统一出口** | 两条路径最终都产出相同结构的 dict,由同一段代码解析为 `LLMResponse` | ### PROVIDER_PRESETS 对照表 ```mermaid graph TD subgraph "PROVIDER_PRESETS(base_url 自动填充)" P1["openai → https://api.openai.com/v1"] P2["openrouter → https://openrouter.ai/api/v1"] P3["deepseek → https://api.deepseek.com/v1"] P4["anthropic → https://api.anthropic.com"] P5["openai-compatible → 用户自定义 base_url"] end P1 --> |"不变 ✅"| OK1[" "] P2 --> |"不变 ✅"| OK2[" "] P3 --> |"不变 ✅"| OK3[" "] P4 --> |"新增"| OK4[" "] P5 --> |"不变 ✅"| OK5[" "] style P1 fill:#e8f5e9 style P2 fill:#e8f5e9 style P3 fill:#e8f5e9 style P4 fill:#fff3e0 style P5 fill:#e8f5e9 ``` --- ## 6. 变更文件清单 | 文件路径 | 变更类型 | 改动说明 | |---|---|---| | `researchclaw/llm/__init__.py` | 修改 | 添加 `"anthropic"` preset;简化工厂函数委托给 `from_rc_config()` | | `researchclaw/llm/client.py` | 修改 | `from_rc_config()` 恢复 PRESETS 逻辑 + 条件挂载适配器;`_raw_call()` 添加 if/else 分支 | | `researchclaw/llm/anthropic_adapter.py` | **新增** | `AnthropicAdapter` 类 — Anthropic Messages API → OpenAI 兼容格式转换 | | `tests/test_anthropic.py` | **新增** | Anthropic API 连通性测试脚本 | | `pyproject.toml` | 修改 | 添加 `httpx` 为 optional dependency (`[anthropic]` extra) | | `.gitignore` | 修改 | 添加 `run.log` | --- ## 7. 异常处理与重试机制 Anthropic 适配器内部将 httpx 异常 **转换为 urllib 标准异常**,确保上层重试逻辑无需修改: ```mermaid graph TD subgraph "AnthropicAdapter 内部" A["httpx.HTTPStatusError
(4xx/5xx)"] -->|转换| B["urllib.error.HTTPError
(保留 status_code)"] C["httpx.ConnectError
httpx.TimeoutException"] -->|转换| D["urllib.error.URLError"] end subgraph "_call_with_retry() — 不变" B --> E{"status code?"} E -->|"429/500/502/503/504"| F["指数退避重试 ✅"] E -->|"400"| G["立即抛出(Bad Request)"] E -->|"403 + model forbidden"| H["跳到下一个 fallback model"] D --> I["重试直到耗尽 ✅"] end style A fill:#fff3e0 style C fill:#fff3e0 style B fill:#e8f5e9 style D fill:#e8f5e9 ``` 这意味着 Anthropic 路径享有与 OpenAI 路径 **完全相同的重试策略**:指数退避 + jitter + model fallback chain。 --- ## 8. 配置示例 ### 使用 Anthropic(新增) ```yaml llm: provider: anthropic # base_url 可省略,自动使用 https://api.anthropic.com api_key_env: ANTHROPIC_API_KEY primary_model: claude-sonnet-4-20250514 fallback_models: - claude-haiku-4-5-20251001 ``` ### 使用 OpenAI(不变) ```yaml llm: provider: openai # base_url 可省略,自动使用 https://api.openai.com/v1 api_key_env: OPENAI_API_KEY primary_model: gpt-4o fallback_models: - gpt-4.1 - gpt-4o-mini ``` ### 使用 OpenRouter(不变) ```yaml llm: provider: openrouter api_key_env: OPENROUTER_API_KEY primary_model: anthropic/claude-sonnet-4-20250514 ``` --- ## 9. 
新增依赖 | 依赖 | 版本要求 | 安装方式 | 说明 | |---|---|---|---| | `httpx` | `>=0.24` | `pip install researchclaw[anthropic]` | **可选依赖**,仅 Anthropic provider 需要 | 不使用 Anthropic provider 的用户 **无需安装 httpx**,`pip install researchclaw` 的行为完全不变。 --- > **总结**: 本次改动通过适配器模式在 `_raw_call()` 内部添加了一条 Anthropic 专用路径。 > 当 provider 不是 `"anthropic"` 时,`self._anthropic` 为 `None`,代码执行路径与改动前 **完全一致**, > 不触及任何新增代码,不引入任何新依赖。 ================================================ FILE: docs/PIPELINE_TEST_LOG_R5.md ================================================ # Pipeline Test Log — Round 5 (main branch) > **分支**: `main` @ `e95527f` > **日期**: 2026-03-18 > **目的**: 全面测试 main 分支 Pipeline 端到端流程,覆盖多领域、纯计算实验 > **环境**: Python 3.x, numpy 2.4.3, scipy 1.17.1, sklearn 1.8.0, matplotlib 3.10.8 > **LLM**: gpt-5.4 (fallback: gpt-5.1, gpt-4o) via Azure OpenAI --- ## 测试选题 | ID | 领域 | 主题 | metric_direction | 关键依赖 | |----|------|------|-----------------|---------| | N | 计算物理 | 随机矩阵理论:Marchenko-Pastur 分布的有限维修正分析 | minimize | numpy, scipy | | O | 计算经济学 | 弱工具变量下 IV 估计量的 Monte Carlo 偏差-方差权衡 | minimize | numpy, scipy, sklearn | | P | 计算流行病学 | SIR/SEIR 模型参数可辨识性:合成数据下的结构化似然分析 | maximize | numpy, scipy | | Q | 数学/数值分析 | Krylov 子空间方法求解稀疏线性系统:预条件策略对比 | minimize | numpy, scipy | ### 选题原则 - 所有实验纯计算/模拟,无需外部数据集或 GPU - 核心依赖仅 numpy/scipy/sklearn,sandbox 即可执行 - 覆盖 4 个不同领域:物理、经济学、流行病学、数值分析 - 避免 R4 中被 topic refinement 强行引向 ML 的问题——本轮 topic 描述更具体 ### 备选 Topic(未选用) 1. **Agent/RL**: 网格世界中多智能体 emergent communication 的涌现 — 需要 gymnasium,sandbox 兼容性不确定 2. **信号处理**: 压缩感知中 RIP 条件的经验验证 — 可行但领域覆盖与 Q 重叠 3. **统计学**: Bayesian 变点检测的 MCMC 采样效率对比 — 可行,备用 4. 
**图论**: 随机图上 Erdos-Renyi 相变阈值的数值验证 — 可行,备用 --- ## 运行状态 | Pipeline | Config | Run ID | PID | 启动时间 (UTC) | 最终阶段 | 状态 | 总耗时 | |----------|--------|--------|-----|---------------|---------|------|--------| | N | config_test_N.yaml | `rc-20260318-174754-fc94f2` | 2036352 | 17:47 | 28/29 (S23 fail) | ⚠️ 近完美 | ~2.5h | | O | config_test_O.yaml | `rc-20260318-174826-01c0f3` | 2037261 | 17:48 | 28/29 (S23 fail) | ⚠️ 近完美 | ~3.0h | | P | config_test_P.yaml | `rc-20260318-174900-d5371f` | 2037826 | 17:49 | 29/29 ✅ | ✅ 完美通过 | ~2.4h | | Q | config_test_Q.yaml | `rc-20260318-174935-d0a717` | 2038664 | 17:49 | 28/29 (S23 fail) | ⚠️ 近完美 | ~2.5h | --- ## 观测记录 ### OBS-R5-01: S2 + arXiv 429 限流(预期行为)(17:48 UTC) - **严重度**: 🟢 预期行为 - **描述**: 4个并行 Pipeline 同时触发 S2/arXiv 429 限流 - S2 circuit breaker: 120s cooldown (trip #1) - arXiv circuit breaker: 180s cooldown (trip #1) - **关联**: R4-OBS-02 同类问题 - **影响**: 文献搜索阶段延迟增加,不阻塞 ### OBS-R5-02: Pipeline Q 触发 IMP-35 Topic Refinement (17:49 UTC) - **严重度**: 🟡 值得关注 - **描述**: Krylov 子空间方法的 topic 被评为 4/10,系统建议 refine 为 ML 相关方向 - 原始: "Comparative Analysis of Preconditioning Strategies for Krylov Subspace Methods..." - 建议: "Learned preconditioner selection for Krylov solvers on sparse linear systems..." 
- **评估**: IMP-35 倾向于把所有 topic 往 ML 方向引导(R4-OBS-03 同类问题) - **影响**: 纯数值分析 topic 可能被扭曲为 ML topic,但实验代码仍应聚焦原始问题 ### OBS-R5-03: 初始进度检查 (~17:55 UTC) - N: Stage 7/SYNTHESIS ✅ 快速推进 - O: Stage 6/KNOWLEDGE_EXTRACT ✅ 正常 - P: Stage 4/LITERATURE_COLLECT — 稍慢(429 影响) - Q: Stage 5/LITERATURE_SCREEN ✅ 正常 ### OBS-R5-04: CodeSearcher query_gen.py TypeError (18:20 UTC) - **严重度**: 🟡 中 — 不阻塞但影响代码质量 - **描述**: `researchclaw/agents/code_searcher/query_gen.py:149` 调用 `llm.chat()` 时传入不支持的 `user` 关键字参数 ``` TypeError: LLMClient.chat() got an unexpected keyword argument 'user' ``` - **影响**: CodeSearcher 无法使用 LLM 生成 GitHub 搜索 query,退化到基于规则的 query - **关联**: R4-BUG-02 (GitHub 401) — 401 问题仍在(无 GITHUB_TOKEN),加上此 TypeError 意味着 CodeSearcher 基本失效 - **需要修复**: ✅ 是 — query_gen.py 中 `llm.chat()` 调用签名与 LLMClient 接口不匹配 ### OBS-R5-05: gpt-5.4 Read Timeout 导致 fallback (18:30 UTC) - **严重度**: 🟡 中 — 自动 fallback 工作正常 - **描述**: Pipeline N 在代码生成阶段遭遇多次 gpt-5.4 read timeout - 触发 fallback 到 gpt-5.1 或 gpt-4o - 代码生成请求因 token 量大,更容易超时 - **影响**: 代码生成速度下降,但不阻塞 ### OBS-R5-06: Sandbox execution timeout 60s (18:35 UTC) - **严重度**: 🟡 中 — 影响代码验证 - **描述**: Pipeline O 代码生成阶段的 sandbox 验证执行超时(60s) - 可能是验证生成的实验代码能否运行 - 代码生成后的 AST 验证 + 试运行超时 - **影响**: 代码可能未经充分验证就进入下一阶段 ### OBS-R5-07: Stage 10 Deep Quality — Copy-paste Detection (18:35 UTC) - **严重度**: 🟡 中 — 代码质量问题 - **描述**: Pipeline O 的 models.py 中检测到多组 copy-paste 类: 1. `FixedFullerOneBiasReducedBaseline` vs `FixedFullerFourAggressiveShrinkageBaseline` (16 vs 16 lines) 2. `FirstStageStrengthOnlyRiskSurfaceBaseline` vs `NoLeverageGeometryRiskSurfaceAblation` (9 vs 9 lines) 3. 
多个 ablation 类仅 0-1 个非 dunder 方法 - **评估**: 这是 R4-BUG-13 的同类问题 — ablation 类之间差异不足 - **关联**: BUG-13 (copy-paste ablation) ### OBS-R5-08: 所有 Pipeline 在 Stage 10 停留超 25 分钟 (18:41 UTC) - **严重度**: 🟢 预期行为 - **描述**: 代码生成是最重的 LLM 调用阶段,N=1 attempt, O/P=3 attempts, Q=3 attempts - **评估**: 多次 attempt 表明 code validation loop 在工作,自动修复代码中的问题 - **耗时**: N=2441s (~41min), O=2485s (~41min), P=2796s (~47min), Q=2976s (~50min) ### OBS-R5-09: 所有已执行实验在 Stage 12 首次运行均失败 (18:55 UTC) - **严重度**: 🔴 高 — 系统性 numpy 2.x API 不兼容 - **描述**: 3个已完成 Stage 12 的 Pipeline 均在首次实验运行失败: - **N**: `AttributeError: module 'numpy' has no attribute 'trapz'` - numpy 2.0 移除了 `np.trapz`,应使用 `np.trapezoid` - **O**: `numpy.linalg.LinAlgError: 1-dimensional array given. Array must be two-dimensional` - 代码向 linalg 函数传入了 1D 数组 - **P**: `AttributeError: module 'numpy' has no attribute 'erfinv'` - `erfinv` 从未存在于 numpy 中,应使用 `scipy.special.erfinv` - **根因**: gpt-5.4 生成的代码使用了已在 numpy 2.x 中移除或不存在的 API - **关联**: R5-BUG-01 (见下方) ### OBS-R5-10: Stage 13 自动修复正确修复 numpy.trapz → numpy.trapezoid (18:55 UTC) - **严重度**: 🟢 正面发现 - **描述**: Pipeline N 的 Stage 13 (ITERATIVE_REFINE) 成功检测到 `np.trapz` 错误并: 1. 创建了 `_trapz()` 包装函数 2. 内部使用 `np.trapezoid(y, x)` 替代 3. 
同时创建了 `_cumulative_trapezoid_1d()` 辅助函数 - **评估**: 自我修复机制在 numpy API 变更场景中工作良好 ### OBS-R5-11: Pipeline Q Stage 09 YAML 解析警告 (18:40 UTC) - **严重度**: 🟢 低 — 自动恢复 - **描述**: Pipeline Q 的 Stage 09 LLM 返回内容无法直接解析为 YAML - 返回了 38089 字符的响应,远超预期 - content extraction fallback 正常工作 - **影响**: 无实际影响,pipeline 继续正常运行 ### OBS-R5-12: Stage 13 自动修复成功修复所有 numpy 2.x 不兼容 (19:10 UTC) - **严重度**: 🟢 正面发现 - **描述**: 所有 4 个 Pipeline 的 Stage 13 成功修复了 Stage 12 首次运行失败: - N: `np.trapz` → `np.trapezoid` (wrapper function) ✅ - O: 1D→2D array reshape 修复 ✅ - P: `np.erfinv` → `scipy.special.erfinv` ✅ - Q: 修复后成功运行 ✅ - **评估**: 自我修复机制可靠,但首次成功率仍可改善 ### OBS-R5-13: 所有 4 个 Pipeline 首次 Research Decision 均为 REFINE (19:23-19:49 UTC) - **严重度**: 🟡 值得关注 - **描述**: 所有 Pipeline 在第一轮实验后都被判定需要 refine - 这可能意味着:(a) 实验结果不够convincing (b) 系统对首轮结果过于严格 - N、P、Q 在第二轮后仍被 refine → 达到 max refine (2次) → 下次将 forced PROCEED - O 在第一轮 refine 中 - **影响**: Pipeline 总耗时增加(每次 refine 约增加 15-30 分钟实验时间) ### OBS-R5-14: Pipeline N 首先进入纸写作阶段 (~19:57 UTC) - **严重度**: 🟢 正面进展 - **描述**: Pipeline N (Marchenko-Pastur) 完成 2 轮 refine,被 forced PROCEED 到 Stage 16 - Stage 14 (RESULT_ANALYSIS) 耗时 553s (~9min) - Stage 15 decision 耗时 15s ### OBS-R5-15: Pipeline P 完美完成 29/29 stages! 
(20:13 UTC) - **严重度**: 🟢🟢🟢 重大正面发现 - **描述**: Pipeline P (SIR/SEIR 流行病学) 是 R5 第一个(也是唯一一个)完美完成的 Pipeline - 所有 29 个 stage 成功,0 失败 - 完整交付物:paper.tex (539行), references.bib (405行), 5 张图表, code package - Stage 23 citation verify 成功验证 44 条引用 - LaTeX 编译成功(paper.aux, paper.log 生成) - 总耗时约 2.4 小时 - **评估**: 这是本项目自 R0 以来第一次有 Pipeline 完整通过所有 29 个 stage - R0: Pipeline A 29/29 但那是在较旧版本上 - R4: 所有 4 个 Pipeline 在 Stage 20 被拒(2/10 质量分) - R5: Pipeline P 通过了 Stage 20(degraded 但非 rejected) ### OBS-R5-16: N 和 Q 在 Stage 23 (Citation Verify) 失败 (20:14-20:21 UTC) - **严重度**: 🟡 中 — 不影响论文本身 - **描述**: N 和 Q 的 Stage 23 因 `references_verified.bib` 缺失而失败 - 错误信息: `Missing or empty output: references_verified.bib` - Stage 23 耗时 0s — 意味着在验证前就失败了 - Pipeline P 的 Stage 23 成功(11s),说明这不是系统性问题 - **关联**: R5-BUG-04 (见下方) ### OBS-R5-17: Pipeline O 大量 ablation failure (20:20 UTC) - **严重度**: 🟡 中 — 代码质量问题 - **描述**: Pipeline O (IV estimators) 的 Stage 13 v2 检测到大量 copy-paste ablation 问题 - 8+ 对 conditions 产生完全相同的输出 - 例: `mean_bias_only_jive_evaluation_ablation` ≡ `two_stage_least_squares_wald_baseline` - 例: `no_instrument_density_geometry_risk_surface_ablation` ≡ `no_leverage_geometry_risk_surface_ablation` - **关联**: R5-BUG-03, R4-BUG-13 — copy-paste ablation 问题持续存在 ### OBS-R5-18: 纸面写作阶段高效 (Stage 16-22) - **严重度**: 🟢 正面 - **描述**: 所有完成的 Pipeline 在纸面写作阶段均高效运行: - Stage 16 (PAPER_OUTLINE): 99-119s - Stage 17 (PAPER_DRAFT): 374-406s (~6-7min) - Stage 18 (PEER_REVIEW): 72s - Stage 19 (PAPER_REVISION): 242-277s (~4min) - Stage 20 (QUALITY_GATE): 9-12s - Stage 21 (KNOWLEDGE_ARCHIVE): 42-51s - Stage 22 (EXPORT_PUBLISH): 122-130s (~2min) - **总计**: 纸面写作 + 导出约 15 分钟 ### OBS-R5-19: Pipeline N 论文承认实验失败 (20:14 UTC) - **严重度**: 🟡 中 — 影响论文质量 - **描述**: Pipeline N 的 paper_draft.md 中写道: > "the current execution failed before producing any analyzable spectral metrics" - **分析**: 虽然 Stage 13 成功修复了 numpy 2.x 错误并重新运行了实验,但论文写作阶段可能 没有从修复后的实验结果中获取数据,而是检测到了第一次失败的状态 - **关联**: 可能是 Stage 14 (RESULT_ANALYSIS) 没有正确读取 Stage 13 v2/v3 的结果 --- ## 新发现 Bug 
### R5-BUG-01: CodeSearcher query_gen.py — LLMClient.chat() 签名不匹配 ✅ 已修复 - **严重度**: 🟡 中 — 不阻塞 pipeline 但降低代码质量 - **文件**: `researchclaw/agents/code_searcher/query_gen.py:149` - **描述**: - `llm.chat()` 被调用为 `llm.chat(system=..., user=..., max_tokens=...)` - 实际签名是 `chat(messages: list[dict], *, system=, max_tokens=)` - `user` 不是有效参数 → `TypeError` - 另外代码错误地用 `asyncio.run()` 包装同步方法 - **修复**: - 改为 `llm.chat([{"role": "user", "content": prompt}], system=..., max_tokens=...)` - 移除不必要的 `asyncio.run()` 和 `chat_sync` 分支 - **影响**: 修复后 CodeSearcher 可正常使用 LLM 生成搜索查询(仍需 GITHUB_TOKEN) ### R5-BUG-02: 代码生成使用已弃用/不存在的 numpy 2.x API(系统性) - **严重度**: 🔴 高 — 导致所有实验首次运行失败 - **描述**: gpt-5.4 生成的代码使用了已在 numpy 2.0 中移除的 API: - `np.trapz` → 应使用 `np.trapezoid` (numpy 2.0 breaking change) - `np.erfinv` → 从未存在于 numpy,应使用 `scipy.special.erfinv` - `np.bool` / `np.int` 等 → 已在 numpy 1.24+ 移除 - **根因**: LLM 训练数据包含大量 numpy 1.x 代码,未适应 2.x 变化 - **自动修复**: Stage 13 (ITERATIVE_REFINE) 成功修复了这些问题 ✅ - **建议**: 在代码生成 prompt 中添加 numpy 2.x 兼容性提示,减少首次失败 ### R5-BUG-03: Pipeline O copy-paste ablation 检测(已知问题复现) - **严重度**: 🟡 中 - **描述**: Stage 10 deep quality check 检测到多组近似相同的 ablation 类 - Fuller1 vs Fuller4: 仅超参数不同,方法体相同 - Risk surface baseline vs ablation: 方法签名和体积完全相同 - **关联**: R4-BUG-13 (BUG-13 copy-paste ablation) — 该问题跨轮次持续存在 - **建议**: 需要在代码生成阶段强化 ablation 差异性检查 ### R5-BUG-04: Stage 23 Citation Verify — references_verified.bib 缺失 ✅ FIXED - **严重度**: 🔴 高 — 3/4 Pipeline 受影响 - **描述**: N、O 和 Q 在 Stage 23 因 `references_verified.bib` 未生成而失败 - 错误: `Missing or empty output: references_verified.bib` - Stage 23 耗时 0s,说明在输出验证前就失败了 - Pipeline P 的 Stage 23 成功(11s),同一引用验证逻辑正常工作 - **根因分析**: - Stage 23 在无引用时正确写入空的 `references_verified.bib`(executor.py L9082) - 但 contract validation(executor.py L9351)拒绝 `st_size == 0` 的文件 - Pipeline P 有 19KB 的 references.bib → 验证后非空 → 通过 - N/O/Q 无引用 → Stage 23 写空文件 → 被 contract validation 拒绝 - **修复**: 将空文件改为写入 BibTeX 注释 `% No references to verify\n`(executor.py L9085-9086) - 文件非空,通过 contract 
validation,同时语义上表示"无引用" ### R5-BUG-05: 论文未使用修复后的实验结果 ✅ FIXED - **严重度**: 🔴 高 — 影响论文科学价值 - **描述**: Pipeline N/Q 的论文包含 "quality 2/10" 警告,声称实验失败 但 Stage 13 成功修复了 numpy 错误并产生了完整的实验结果(论文表格中实际包含真实数据) - **根因分析**: Stage 14 LLM analysis 在所有三次 refine 迭代中均给出 2/10(包括最新的非版本化 stage-14), 而 BUG-23 guard(executor.py L7184)在 `_analysis_rating <= 2` 时强制 `has_real_metrics = False`, 即使 `_collect_raw_experiment_metrics()` 已成功从 Stage 13 stdout 解析出真实指标 - **注**: `_read_prior_artifact` 排序是正确的 — 非版本化目录确实是最新的(rollback 时旧目录会被重命名为 `_vN`) - **修复**: 在 BUG-23 guard 中增加 `not _has_parsed_metrics` 条件(executor.py L7187) - 当 Stage 13 refinement 产生了可解析的真实指标时,不再被 analysis rating 覆盖 - 同时保留了原始 BUG-23 防护:在确实没有真实指标时仍会触发 --- ## 总结 ### 整体评价 R5 是目前最成功的测试轮次: | 指标 | R4 (feat/universal-codegen) | R5 (main) | |------|---------------------------|-----------| | 完美通过 (29/29) | 0/4 | **1/4 (Pipeline P)** | | 近完美 (28/29) | 0/4 | **3/4 (N, O, Q)** | | Stage 20 通过 | 0/4 (all rejected 2/10) | **4/4 (all degraded/pass)** | | 崩溃/严重失败 | 1/4 (Pipeline K crash) | **0/4** | | 平均完成阶段 | ~25/29 | **28.75/29** | | 平均耗时 | ~3.5h | **~2.6h** | ### 关键改进 1. **Stage 20 Quality Gate 不再阻塞**: R4 中所有 Pipeline 被 2/10 拒绝,R5 全部通过 2. **自我修复能力可靠**: Stage 13 成功修复了所有 numpy 2.x API 不兼容问题 3. **跨领域能力验证**: 物理、经济学、流行病学、数值分析 4 个不同领域均可完成 4. **无崩溃**: 4/4 Pipeline 全部正常完成,无任何进程级崩溃 ### 关键问题(全部已修复) 1. ✅ **R5-BUG-05**: BUG-23 guard 过度激进 → 论文声称实验失败 2. ✅ **R5-BUG-04**: Stage 23 写入空 bib 文件被 contract validation 拒绝 → 3/4 失败 3. ✅ **R5-BUG-01**: CodeSearcher query_gen.py 签名不匹配 4. ✅ **R5-BUG-02**: 代码生成使用已弃用 numpy 2.x API — 已在 7 个 prompt 中添加兼容性警告 5. ✅ **R5-BUG-03**: copy-paste ablation — 新增 <1% 近似检测 + prompt 强化 6. ✅ **R5-BUG-06**: LaTeX microtype 字体错误 — 已添加 `\usepackage{lmodern}` ### R5-BUG-06: LaTeX 编译失败 — pdfTeX font expansion 错误 ✅ FIXED - **严重度**: 🟡 中 - **描述**: Pipeline Q 的 paper.tex 编译失败 ``` pdfTeX error (font expansion): auto expansion is only possible with scalable Fatal error occurred, no output PDF file produced! 
``` - **根因**: `\usepackage[T1]{fontenc}` 激活了 T1 编码,但未加载可缩放字体(lmodern) - **修复**: 在 `researchclaw/templates/conference.py` 的 NEURIPS_2024、NEURIPS_2025、GENERIC 三个模板中 在 `fontenc` 之后添加 `\usepackage{lmodern}` ### R5-BUG-02: numpy 2.x API 不兼容 ✅ FIXED - **修复范围**: 在以下 7 个 prompt 位置添加了 numpy 2.x 兼容性警告 - `prompts.default.yaml` (legacy code_generation) - `prompts.py`: architecture_planning, generate_single_file, code_repair, iterative_improve, iterative_repair, code_exec_fix ### R5-BUG-03: copy-paste ablation ✅ IMPROVED - **修复**: executor.py 新增 P8 近似检测(<1% relative diff → warning),补充了原有的精确匹配检测 - **注**: prompt 中已有 Rule 9 (ABLATION DIFFERENTIATION) 和 Rule 8 (METHOD RICHNESS) 的引导 ### 后续排查结论 - **`_read_prior_artifact` 排序**: ✅ 确认正确 — 非版本化目录确实是最新的(rollback 重命名旧目录为 `_vN`) - **Stage 14 quality rating 问题**: 所有 3 次 refine 迭代的 Stage 14 均给出 2/10 → 这是 LLM 分析偏保守的问题, 但 BUG-05 的修复已绕过该问题(信任实际解析出的指标) ### 交付物检查 | Pipeline | paper.tex | references.bib | charts | code | LaTeX编译 | |----------|-----------|---------------|--------|------|----------| | N | ✅ | ❌ (S23 fail) | ✅ | ✅ | 未检查 | | O | ✅ | ❌ (S23 fail) | ✅ | ✅ | 未检查 | | P | ✅ (539行) | ✅ (405行) | ✅ (5张) | ✅ | ✅ | | Q | ✅ | ❌ (S23 fail) | ✅ | ✅ | 未检查 | ### Pipeline 时间分布(以 Pipeline P 为例) | 阶段 | 耗时 | 说明 | |------|------|------| | S1-S9 (研究+设计) | ~20min | 含 429 限流延迟 | | S10 (代码生成) | ~47min | 最重的 LLM 阶段,3 次 attempt | | S11 (资源规划) | ~14s | | | S12-S13 (实验+修复) | ~15min | 首次失败 + 自动修复 + 重运行 × 2轮 refine | | S14-S15 (分析+决策) | ~10min | 含 2 轮 refine 循环 | | S16-S22 (论文写作+导出) | ~15min | | | S23 (引用验证) | ~11s | | | **总计** | **~2.4h** | | ================================================ FILE: docs/README_AR.md ================================================

AutoResearchClaw Logo

شارك فكرة. احصل على ورقة بحثية. مؤتمت بالكامل وذاتي التطور.

تحدث مع OpenClaw: «ابحث عن X» → تمّ.

AutoResearchClaw Framework

MIT License Python 3.11+ 1823 Tests Passed GitHub OpenClaw Compatible Discord

🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية

🏆 معرض الأوراق · 📖 دليل التكامل · 💬 مجتمع Discord

---
ورقة نموذجية 🏆 معرض الأوراق المُولّدة

8 أوراق في 8 مجالات — الرياضيات، الإحصاء، الأحياء، الحوسبة، NLP، RL، الرؤية الحاسوبية، المتانة — مُولّدة بشكل مستقل تماماً بدون تدخل بشري.

عرض المعرض
--- > **🧪 نبحث عن مختبرين!** جرّب خط الأنابيب بفكرتك البحثية الخاصة — من أي مجال — و[أخبرنا برأيك](TESTER_GUIDE.md). ملاحظاتك تشكّل الإصدار القادم مباشرة. **[→ Testing Guide](TESTER_GUIDE.md)** | **[→ 中文测试指南](TESTER_GUIDE_CN.md)** | **[→ 日本語テストガイド](TESTER_GUIDE_JA.md)** --- ## 🔥 News - **[03/22/2026]** [v0.3.2](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.2) — **دعم متعدد المنصات + استقرار كبير** — يعمل AutoResearchClaw الآن مع أي وكيل متوافق مع ACP (Claude Code، Codex CLI، Copilot CLI، Gemini CLI، Kimi CLI) ويدعم منصات المراسلة (Discord، Telegram، Lark، WeChat) عبر جسر OpenClaw. واجهة خلفية جديدة لتوليد الكود عبر CLI-agent تفوّض المرحلتين 10 و13 لوكلاء CLI خارجيين مع التحكم في الميزانية وإدارة المهلة الزمنية. يتضمن نظام مكافحة التلفيق (VerifiedRegistry + حلقة تشخيص وإصلاح التجارب)، 100+ إصلاح أخطاء، إعادة هيكلة modular executor، كشف تلقائي لـ `--resume`، تعزيز إعادة محاولات LLM، وإصلاحات المجتمع. - **[03/18/2026]** [v0.3.1](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.1) — **OpenCode Beast Mode + Community Contributions** — New "Beast Mode" routes complex code generation to [OpenCode](https://github.com/anomalyco/opencode) with automatic complexity scoring and graceful fallback. Added Novita AI provider support, thread-safety hardening, improved LLM output parsing robustness, and 20+ bug fixes from community PRs and internal audit. - **[03/17/2026]** [v0.3.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.0) — **MetaClaw Integration** — AutoResearchClaw now supports [MetaClaw](https://github.com/aiming-lab/MetaClaw) cross-run learning: pipeline failures → structured lessons → reusable skills, injected into all 23 stages. **+18.3%** robustness in controlled experiments. Opt-in (`metaclaw_bridge.enabled: true`), fully backward-compatible. See [Integration Guide](#-metaclaw-integration). 
- **[03/16/2026]** [v0.2.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.2.0) — Three multi-agent subsystems (CodeAgent, BenchmarkAgent, FigureAgent), hardened Docker sandbox with network-policy-aware execution, 4-round paper quality audit (AI-slop detection, 7-dim review scoring, NeurIPS checklist), and 15+ bug fixes from production runs. - **[03/15/2026]** [v0.1.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.1.0) — We release AutoResearchClaw: a fully autonomous 23-stage research pipeline that turns a single research idea into a conference-ready paper. No human intervention required. --- ## ⚡ أمر واحد. ورقة واحدة. ```bash pip install -e . && researchclaw setup && researchclaw init && researchclaw run --topic "Your research idea here" --auto-approve ``` --- ## 🤔 ما هذا؟ **أنت تفكر. AutoResearchClaw يكتب.** أعطِ موضوعاً بحثياً — احصل على ورقة أكاديمية كاملة مع أدبيات حقيقية من OpenAlex و Semantic Scholar و arXiv، وتجارب في بيئة معزولة واعية بالعتاد (كشف تلقائي لـ GPU/MPS/CPU)، وتحليل إحصائي، ومراجعة أقران متعددة الوكلاء، و LaTeX جاهز للمؤتمرات يستهدف NeurIPS/ICML/ICLR. بدون مراقبة. بدون نسخ ولصق. بدون مراجع مُلفّقة.
📄 `paper_draft.md` — ورقة أكاديمية كاملة (مقدمة، أعمال سابقة، المنهجية، التجارب، النتائج، الخاتمة)
📐 `paper.tex` — LaTeX جاهز للمؤتمرات (قوالب NeurIPS / ICLR / ICML)
📚 `references.bib` — مراجع BibTeX حقيقية من OpenAlex و Semantic Scholar و arXiv — مُنقّحة تلقائياً لمطابقة الاستشهادات المضمّنة
🔍 `verification_report.json` — تحقق من سلامة الاستشهادات على 4 طبقات + التحقق من الصلة (arXiv، CrossRef، DataCite، LLM)
🧪 `experiment runs/` — كود مُولّد + نتائج البيئة المعزولة + مقاييس JSON منظمة
📊 `charts/` — رسوم بيانية مُولّدة تلقائياً لمقارنة الظروف مع أشرطة الخطأ وفترات الثقة
📝 `reviews.md` — مراجعة أقران متعددة الوكلاء مع فحص اتساق المنهجية والأدلة
🧬 `evolution/` — دروس تعلّم ذاتي مستخلصة من كل تشغيل
📦 `deliverables/` — جميع المخرجات النهائية في مجلد واحد — جاهزة للترجمة على Overleaf
يعمل خط الأنابيب **من البداية إلى النهاية بدون تدخل بشري**. عندما تفشل التجارب، يصلح نفسه. عندما لا تصمد الفرضيات، يغيّر المسار. عندما تكون الاستشهادات مُلفّقة، يزيلها. 🌍 **شغّله من أي مكان.** AutoResearchClaw ليس مقيّدًا بمنصة واحدة. استخدمه مستقلاً عبر CLI، أو وصّله بـ [OpenClaw](https://github.com/openclaw/openclaw)، أو ادمجه مع أي وكيل متوافق مع ACP — 🤖 Claude Code، 💻 Codex CLI، 🐙 Copilot CLI، ♊ Gemini CLI، 🌙 Kimi CLI، وغيرها. بفضل جسر الرسائل في OpenClaw، يمكنك إطلاق بحث كامل من 💬 Discord، ✈️ Telegram، 🐦 Lark (飞书)، 💚 WeChat، أو أي منصة يستخدمها فريقك بالفعل. موضوع واحد كمُدخل، ورقة بحثية كمُخرج — بغض النظر عن المكان الذي تكتب منه. --- ## 🚀 البداية السريعة ```bash # 1. استنساخ وتثبيت git clone https://github.com/aiming-lab/AutoResearchClaw.git cd AutoResearchClaw python3 -m venv .venv && source .venv/bin/activate pip install -e . # 2. الإعداد (تفاعلي — يثبّت OpenCode beast mode، يتحقق من Docker/LaTeX) researchclaw setup # 3. التهيئة researchclaw init # تفاعلي: اختر مزوّد LLM، ينشئ config.arc.yaml # أو يدوياً: cp config.researchclaw.example.yaml config.arc.yaml # 4. التشغيل export OPENAI_API_KEY="sk-..." researchclaw run --config config.arc.yaml --topic "Your research idea" --auto-approve ``` المخرجات → `artifacts/rc-YYYYMMDD-HHMMSS-/deliverables/` — LaTeX و BibTeX وكود التجارب والرسوم البيانية جاهزة للترجمة.
📝 الحد الأدنى من التهيئة المطلوبة ```yaml project: name: "my-research" research: topic: "Your research topic here" llm: base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" primary_model: "gpt-4o" fallback_models: ["gpt-4o-mini"] experiment: mode: "sandbox" sandbox: python_path: ".venv/bin/python" ```
--- ## 🧠 ما الذي يميّزه | القدرة | كيف يعمل | |-----------|-------------| | **🔄 حلقة PIVOT / REFINE** | المرحلة 15 تقرر بشكل مستقل: PROCEED أو REFINE (تعديل المعاملات) أو PIVOT (اتجاه جديد). المخرجات تُحفظ بإصدارات تلقائياً. | | **🤖 نقاش متعدد الوكلاء** | توليد الفرضيات وتحليل النتائج ومراجعة الأقران تستخدم نقاشاً منظماً بوجهات نظر متعددة. | | **🧬 التعلّم الذاتي** | دروس مستخلصة من كل تشغيل (مبررات القرارات، تحذيرات وقت التشغيل، شذوذ المقاييس) مع تناقص زمني بنصف عمر 30 يوماً. التشغيلات المستقبلية تتعلم من الأخطاء السابقة. | | **📚 قاعدة المعرفة** | كل تشغيل يبني قاعدة معرفة منظمة عبر 6 فئات (قرارات، تجارب، اكتشافات، أدبيات، أسئلة، مراجعات). | | **🛡️ الحارس المراقب Sentinel** | مراقب جودة في الخلفية: كشف NaN/Inf، اتساق الورقة والأدلة، تقييم صلة الاستشهادات، حماية ضد التلفيق. | --- ## 🦞 تكامل OpenClaw **AutoResearchClaw هو خدمة متوافقة مع [OpenClaw](https://github.com/openclaw/openclaw).** قم بتثبيته في OpenClaw وابدأ بحثاً مستقلاً برسالة واحدة — أو استخدمه بشكل مستقل عبر سطر الأوامر أو Claude Code أو أي مساعد برمجة بالذكاء الاصطناعي.
### 🚀 الاستخدام مع OpenClaw (موصى به) إذا كنت تستخدم [OpenClaw](https://github.com/openclaw/openclaw) بالفعل كمساعد ذكاء اصطناعي: ``` 1️⃣ شارك رابط مستودع GitHub مع OpenClaw 2️⃣ OpenClaw يقرأ تلقائياً RESEARCHCLAW_AGENTS.md → يفهم خط الأنابيب 3️⃣ قل: "ابحث عن [موضوعك]" 4️⃣ تم — OpenClaw يستنسخ، يثبّت، يهيّئ، يشغّل، ويعيد النتائج ``` **هذا كل شيء.** يتعامل OpenClaw مع `git clone`، `pip install`، إعداد التهيئة، وتنفيذ خط الأنابيب تلقائياً. أنت فقط تتحدث.
💡 ماذا يحدث خلف الكواليس 1. يقرأ OpenClaw ملف `RESEARCHCLAW_AGENTS.md` → يتعلم دور منسّق البحث 2. يقرأ OpenClaw ملف `README.md` → يفهم التثبيت وبنية خط الأنابيب 3. يقرأ OpenClaw ملف `config.researchclaw.example.yaml` → `config.yaml` 4. يسأل عن مفتاح API لنموذج اللغة (أو يستخدم متغير البيئة) 5. يشغّل `pip install -e .` + `researchclaw run --topic "..." --auto-approve` 6. يعيد الورقة و LaTeX والتجارب والاستشهادات
### 🔌 جسر OpenClaw (متقدم) للتكامل الأعمق، يتضمن AutoResearchClaw **نظام محوّلات جسر** مع 6 إمكانيات اختيارية: ```yaml # config.arc.yaml openclaw_bridge: use_cron: true # ⏰ عمليات تشغيل بحث مجدولة use_message: true # 💬 إشعارات التقدم (Discord/Slack/Telegram) use_memory: true # 🧠 استمرارية المعرفة عبر الجلسات use_sessions_spawn: true # 🔀 إطلاق جلسات فرعية متوازية للمراحل المتزامنة use_web_fetch: true # 🌐 بحث ويب مباشر أثناء مراجعة الأدبيات use_browser: false # 🖥️ جمع الأوراق عبر المتصفح ``` كل علامة تفعّل بروتوكول محوّل مُحدد النوع. عندما يوفر OpenClaw هذه الإمكانيات، تستهلكها المحوّلات بدون تغييرات في الكود. راجع [`integration-guide.md`](integration-guide.md) للتفاصيل الكاملة. ### ACP (Agent Client Protocol) يمكن لـ AutoResearchClaw استخدام **أي وكيل برمجة متوافق مع ACP** كواجهة خلفية لنموذج اللغة — بدون الحاجة لمفاتيح API. يتواصل الوكيل عبر [acpx](https://github.com/openclaw/acpx)، ويحافظ على جلسة واحدة مستمرة عبر جميع مراحل خط الأنابيب الـ 23. | الوكيل | الأمر | ملاحظات | |-------|---------|-------| | Claude Code | `claude` | Anthropic | | Codex CLI | `codex` | OpenAI | | Copilot CLI | `gh` | GitHub | | Gemini CLI | `gemini` | Google | | OpenCode | `opencode` | SST | | Kimi CLI | `kimi` | Moonshot | ```yaml # config.yaml — مثال ACP llm: provider: "acp" acp: agent: "claude" # أي أمر CLI لوكيل متوافق مع ACP cwd: "." # دليل العمل للوكيل # لا حاجة لـ base_url أو api_key — الوكيل يدير مصادقته بنفسه. ``` ```bash # فقط شغّل — الوكيل يستخدم بيانات اعتماده الخاصة researchclaw run --config config.yaml --topic "Your research idea" --auto-approve ``` ### 🛠️ طرق أخرى للتشغيل | الطريقة | الكيفية | |--------|-----| | **سطر أوامر مستقل** | `researchclaw setup` → `researchclaw init` → `researchclaw run --topic "..." 
--auto-approve` | | **واجهة Python البرمجية** | `from researchclaw.pipeline import Runner; Runner(config).run()` | | **Claude Code** | يقرأ `RESEARCHCLAW_CLAUDE.md` — فقط قل *"شغّل بحثاً عن [موضوع]"* | | **Copilot CLI** | `researchclaw run --topic "..."` مع `llm.acp.agent: "gh"` | | **OpenCode** | يقرأ `.claude/skills/` — نفس واجهة اللغة الطبيعية | | **أي واجهة ذكاء اصطناعي** | قدّم `RESEARCHCLAW_AGENTS.md` كسياق → الوكيل يبدأ تلقائياً | --- ## 🔬 خط الأنابيب: 23 مرحلة، 8 أطوار ``` Phase A: تحديد نطاق البحث Phase E: تنفيذ التجارب 1. TOPIC_INIT 12. EXPERIMENT_RUN 2. PROBLEM_DECOMPOSE 13. ITERATIVE_REFINE ← إصلاح ذاتي Phase B: اكتشاف الأدبيات Phase F: التحليل والقرار 3. SEARCH_STRATEGY 14. RESULT_ANALYSIS ← متعدد الوكلاء 4. LITERATURE_COLLECT ← API حقيقي 15. RESEARCH_DECISION ← PIVOT/REFINE 5. LITERATURE_SCREEN [بوابة] 6. KNOWLEDGE_EXTRACT Phase G: كتابة الورقة 16. PAPER_OUTLINE Phase C: توليف المعرفة 17. PAPER_DRAFT 7. SYNTHESIS 18. PEER_REVIEW ← فحص الأدلة 8. HYPOTHESIS_GEN ← نقاش 19. PAPER_REVISION Phase D: تصميم التجارب Phase H: الإنهاء 9. EXPERIMENT_DESIGN [بوابة] 20. QUALITY_GATE [بوابة] 10. CODE_GENERATION 21. KNOWLEDGE_ARCHIVE 11. RESOURCE_PLANNING 22. EXPORT_PUBLISH ← LaTeX 23. CITATION_VERIFY ← فحص الصلة ``` > **مراحل البوابات** (5، 9، 20) تتوقف للحصول على موافقة بشرية أو موافقة تلقائية مع `--auto-approve`. عند الرفض، يعود خط الأنابيب للخلف. > **حلقات القرار**: يمكن للمرحلة 15 تفعيل REFINE (→ المرحلة 13) أو PIVOT (→ المرحلة 8)، مع إصدار تلقائي للمخرجات.
📋 ماذا يفعل كل طور | الطور | ما يحدث | |-------|-------------| | **A: تحديد النطاق** | يفكك نموذج اللغة الموضوع إلى شجرة مشاكل منظمة مع أسئلة بحثية | | **A+: العتاد** | كشف تلقائي لـ GPU (NVIDIA CUDA / Apple MPS / CPU فقط)، تحذير إذا كان العتاد المحلي محدوداً، تكييف توليد الكود وفقاً لذلك | | **B: الأدبيات** | بحث متعدد المصادر (OpenAlex → Semantic Scholar → arXiv) عن أوراق حقيقية، فرز حسب الصلة، استخلاص بطاقات معرفية | | **C: التوليف** | تجميع النتائج، تحديد فجوات البحث، توليد فرضيات قابلة للاختبار عبر نقاش متعدد الوكلاء | | **D: التصميم** | تصميم خطة التجارب، توليد كود Python قابل للتشغيل واعٍ بالعتاد (مستوى GPU → اختيار الحزم)، تقدير احتياجات الموارد | | **E: التنفيذ** | تشغيل التجارب في بيئة معزولة، كشف NaN/Inf وأخطاء وقت التشغيل، إصلاح ذاتي للكود عبر إصلاح مُستهدف بنموذج اللغة | | **F: التحليل** | تحليل متعدد الوكلاء للنتائج؛ قرار مستقل PROCEED / REFINE / PIVOT مع المبررات | | **G: الكتابة** | مخطط → صياغة قسم بقسم (5,000-6,500 كلمة) → مراجعات أقران (مع اتساق المنهجية والأدلة) → مراجعة مع حماية الطول | | **H: الإنهاء** | بوابة جودة، أرشفة المعرفة، تصدير LaTeX مع قالب المؤتمر، التحقق من سلامة الاستشهادات + الصلة |
--- ## ✨ الميزات الرئيسية | الميزة | الوصف | |---------|------------| | **📚 أدبيات متعددة المصادر** | أوراق حقيقية من OpenAlex و Semantic Scholar و arXiv — توسيع الاستعلام، إزالة التكرار، قاطع دائرة مع تدهور أنيق | | **🔍 تحقق من الاستشهادات على 4 طبقات** | فحص arXiv ID → CrossRef/DataCite DOI → مطابقة عنوان Semantic Scholar → تقييم صلة LLM. المراجع المُلفّقة تُزال تلقائياً. | | **🖥️ تنفيذ واعٍ بالعتاد** | كشف تلقائي لـ GPU (NVIDIA CUDA / Apple MPS / CPU فقط) مع تكييف توليد الكود والاستيرادات ونطاق التجارب | | **🦾 OpenCode Beast Mode** | التجارب المعقدة تُوجّه تلقائياً إلى [OpenCode](https://github.com/anomalyco/opencode) — يولّد مشاريع متعددة الملفات مع بنى مخصصة وحلقات تدريب ودراسات استئصال. التثبيت عبر `researchclaw setup`. | | **🧪 تجارب في بيئة معزولة** | كود مُتحقق بـ AST، إطار غير قابل للتعديل، فشل سريع عند NaN/Inf، إصلاح ذاتي، تحسين تكراري (حتى 10 جولات)، التقاط نتائج جزئية | | **📝 كتابة بمستوى المؤتمرات** | قوالب NeurIPS/ICML/ICLR، صياغة قسم بقسم (5,000-6,500 كلمة)، حماية ضد التلفيق، حماية طول المراجعة، فرض مضاد لإخلاءات المسؤولية | | **📐 تبديل القوالب** | `neurips_2025`، `iclr_2026`، `icml_2026` — Markdown → LaTeX مع رياضيات وجداول وأشكال ومراجع تبادلية و `\cite{}` | | **🚦 بوابات الجودة** | 3 بوابات بمشاركة بشرية (المراحل 5، 9، 20) مع إمكانية التراجع. تخطّ باستخدام `--auto-approve`. | --- ## 🧠 تكامل MetaClaw **AutoResearchClaw + [MetaClaw](https://github.com/aiming-lab/MetaClaw) = خط أنابيب يتعلم من كل تشغيل.** يضيف MetaClaw **نقل المعرفة عبر التشغيلات** إلى AutoResearchClaw. عند التفعيل، يلتقط خط الأنابيب تلقائياً الدروس من الإخفاقات والتحذيرات، ويحوّلها إلى مهارات قابلة لإعادة الاستخدام، ويحقنها في جميع مراحل خط الأنابيب الـ 23 في التشغيلات اللاحقة — بحيث لا تتكرر نفس الأخطاء أبداً. 
### كيف يعمل ``` Run N ينفّذ → الإخفاقات/التحذيرات تُلتقط كـ Lessons ↓ MetaClaw Lesson → تحويل إلى Skill ↓ ملفات arc-* Skill تُخزّن في ~/.metaclaw/skills/ ↓ Run N+1 → build_overlay() يحقن المهارات في كل أمر LLM ↓ LLM يتجنب المزالق المعروفة → جودة أعلى، محاولات أقل ``` ### الإعداد السريع ```bash # 1. تثبيت MetaClaw (إذا لم يكن مُثبّتاً) pip install metaclaw # 2. التفعيل في التهيئة ``` ```yaml # config.arc.yaml metaclaw_bridge: enabled: true proxy_url: "http://localhost:30000" # وكيل MetaClaw (اختياري) skills_dir: "~/.metaclaw/skills" # أين تُخزّن المهارات fallback_url: "https://api.openai.com/v1" # بديل LLM مباشر fallback_api_key: "" # مفتاح API لعنوان البديل lesson_to_skill: enabled: true min_severity: "warning" # تحويل التحذيرات + الأخطاء max_skills_per_run: 3 ``` ```bash # 3. شغّل كالمعتاد — MetaClaw يعمل بشفافية researchclaw run --config config.arc.yaml --topic "Your idea" --auto-approve ``` بعد كل تشغيل، تحقق من `~/.metaclaw/skills/arc-*/SKILL.md` لمشاهدة المهارات التي تعلّمها خط أنابيبك. ### نتائج التجارب في تجارب A/B مُحكمة (نفس الموضوع، نفس LLM، نفس التهيئة): | المقياس | خط الأساس | مع MetaClaw | التحسين | |---------|----------|---------------|----------| | معدل إعادة المحاولة لكل مرحلة | 10.5% | 7.9% | **-24.8%** | | عدد دورات REFINE | 2.0 | 1.2 | **-40.0%** | | إكمال مراحل خط الأنابيب | 18/19 | 19/19 | **+5.3%** | | درجة المتانة الإجمالية (مركّبة) | 0.714 | 0.845 | **+18.3%** | > درجة المتانة المركّبة هي متوسط مرجّح لمعدل إكمال المراحل (40%) وتقليل المحاولات (30%) وكفاءة دورات REFINE (30%). ### التوافق العكسي - **الافتراضي: مُعطّل.** إذا كان `metaclaw_bridge` غائباً أو `enabled: false`، يعمل خط الأنابيب تماماً كما كان. - **بدون تبعيات جديدة.** MetaClaw اختياري — خط الأنابيب الأساسي يعمل بدونه. - **جميع الاختبارات الـ 1,823 الحالية تنجح** مع وجود كود التكامل. --- ## ⚙️ مرجع التهيئة
انقر لتوسيع مرجع التهيئة الكامل ```yaml # === المشروع === project: name: "my-research" # معرّف المشروع mode: "docs-first" # docs-first | semi-auto | full-auto # === البحث === research: topic: "..." # موضوع البحث (مطلوب) domains: ["ml", "nlp"] # مجالات البحث للبحث في الأدبيات daily_paper_count: 8 # عدد الأوراق المستهدف لكل استعلام بحث quality_threshold: 4.0 # الحد الأدنى لدرجة جودة الأوراق # === وقت التشغيل === runtime: timezone: "America/New_York" # للطوابع الزمنية max_parallel_tasks: 3 # حد التجارب المتزامنة approval_timeout_hours: 12 # مهلة مرحلة البوابة retry_limit: 2 # عدد إعادة المحاولة عند فشل المرحلة # === نموذج اللغة === llm: provider: "openai-compatible" # openai | openrouter | deepseek | minimax | acp | openai-compatible base_url: "https://..." # نقطة نهاية API (مطلوب لـ openai-compatible) api_key_env: "OPENAI_API_KEY" # متغير بيئة لمفتاح API (مطلوب لـ openai-compatible) api_key: "" # أو ضع المفتاح هنا مباشرة primary_model: "gpt-4o" # النموذج الأساسي fallback_models: ["gpt-4o-mini"] # سلسلة النماذج الاحتياطية s2_api_key: "" # مفتاح Semantic Scholar API (اختياري، حدود معدل أعلى) acp: # يُستخدم فقط عند provider: "acp" agent: "claude" # أمر CLI لوكيل ACP (claude، codex، gemini، إلخ) cwd: "." 
# دليل العمل للوكيل # === التجارب === experiment: mode: "sandbox" # simulated | sandbox | docker | ssh_remote time_budget_sec: 300 # أقصى وقت تنفيذ لكل تشغيل (الافتراضي: 300 ثانية) max_iterations: 10 # أقصى عدد تكرارات التحسين metric_key: "val_loss" # اسم المقياس الأساسي metric_direction: "minimize" # minimize | maximize sandbox: python_path: ".venv/bin/python" gpu_required: false allowed_imports: [math, random, json, csv, numpy, torch, sklearn] max_memory_mb: 4096 docker: image: "researchclaw/experiment:latest" network_policy: "setup_only" # none | setup_only | pip_only | full gpu_enabled: true memory_limit_mb: 8192 auto_install_deps: true # كشف تلقائي للاستيراد → requirements.txt ssh_remote: host: "" # اسم مضيف خادم GPU gpu_ids: [] # معرّفات GPU المتاحة remote_workdir: "/tmp/researchclaw_experiments" opencode: # OpenCode Beast Mode (يُثبّت تلقائياً عبر `researchclaw setup`) enabled: true # المفتاح الرئيسي (الافتراضي: true) auto: true # تشغيل تلقائي بدون تأكيد (الافتراضي: true) complexity_threshold: 0.2 # 0.0-1.0 — أعلى = فقط للتجارب المعقدة model: "" # تجاوز النموذج (فارغ = يستخدم llm.primary_model) timeout_sec: 600 # أقصى ثوانٍ لتوليد OpenCode max_retries: 1 # عدد المحاولات عند الفشل workspace_cleanup: true # حذف مساحة العمل المؤقتة بعد الجمع # === التصدير === export: target_conference: "neurips_2025" # neurips_2025 | iclr_2026 | icml_2026 authors: "Anonymous" bib_file: "references" # === الأوامر النصية === prompts: custom_file: "" # مسار ملف YAML للأوامر المخصصة (فارغ = الافتراضي) # === الأمان === security: hitl_required_stages: [5, 9, 20] # المراحل التي تتطلب موافقة بشرية allow_publish_without_approval: false redact_sensitive_logs: true # === قاعدة المعرفة === knowledge_base: backend: "markdown" # markdown | obsidian root: "docs/kb" # === الإشعارات === notifications: channel: "console" # console | discord | slack target: "" # === جسر MetaClaw (اختياري) === metaclaw_bridge: enabled: false # اضبط على true لتفعيل التعلم عبر التشغيلات proxy_url: 
"http://localhost:30000" # عنوان وكيل MetaClaw skills_dir: "~/.metaclaw/skills" # أين تُخزّن مهارات arc-* fallback_url: "" # بديل LLM مباشر عند عدم توفر الوكيل fallback_api_key: "" # مفتاح API لنقطة نهاية البديل lesson_to_skill: enabled: true # تحويل الدروس إلى مهارات تلقائياً min_severity: "warning" # أدنى شدة للتحويل max_skills_per_run: 3 # أقصى مهارات جديدة لكل تشغيل # === جسر OpenClaw === openclaw_bridge: use_cron: false # عمليات تشغيل بحث مجدولة use_message: false # إشعارات التقدم use_memory: false # استمرارية المعرفة عبر الجلسات use_sessions_spawn: false # إطلاق جلسات فرعية متوازية use_web_fetch: false # بحث ويب مباشر use_browser: false # جمع الأوراق عبر المتصفح ```
--- ## 🙏 شكر وتقدير مستوحى من: - 🔬 [AI Scientist](https://github.com/SakanaAI/AI-Scientist) (Sakana AI) — رائد البحث الآلي - 🧠 [AutoResearch](https://github.com/karpathy/autoresearch) (Andrej Karpathy) — أتمتة البحث من البداية إلى النهاية - 🌐 [FARS](https://analemma.ai/blog/introducing-fars/) (Analemma) — نظام بحث مؤتمت بالكامل --- ## 📄 الرخصة MIT — راجع [LICENSE](../LICENSE) للتفاصيل. --- ## 📌 الاستشهاد إذا وجدت AutoResearchClaw مفيداً، يرجى الاستشهاد: ```bibtex @misc{liu2026autoresearchclaw, author = {Liu, Jiaqi and Xia, Peng and Han, Siwei and Qiu, Shi and Zhang, Letian and Chen, Guiming and Tu, Haoqin and Yang, Xinyu and Zhou, Jiawei and Zhu, Hongtu and Li, Yun and Zhou, Yuyin and Zheng, Zeyu and Xie, Cihang and Ding, Mingyu and Yao, Huaxiu}, title = {AutoResearchClaw: Fully Autonomous Research from Idea to Paper}, year = {2026}, organization = {GitHub}, url = {https://github.com/aiming-lab/AutoResearchClaw}, } ```

بُني بـ 🦞 بواسطة فريق AutoResearchClaw

================================================ FILE: docs/README_CN.md ================================================

AutoResearchClaw Logo

聊一个想法。出一篇论文。全自动 & 自演化。

直接与 OpenClaw 对话:"研究 X" → 搞定。

AutoResearchClaw Framework

MIT License Python 3.11+ 1823 Tests Passed GitHub OpenClaw Compatible Discord

🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية

🏆 论文展示 · 📖 集成指南 · 💬 Discord 社区

---
Sample Paper 🏆 生成论文展示

8 篇论文覆盖 8 个领域 — 数学、统计、生物、计算、NLP、RL、视觉、鲁棒性 — 完全自主生成,零人工干预。

View Showcase
--- > **🧪 我们正在寻找测试者!** 用你自己的研究想法试试这个流水线 — 任何领域 — 然后 [告诉我们你的反馈](TESTER_GUIDE.md)。你的反馈将直接影响下一个版本。 **[→ Testing Guide](TESTER_GUIDE.md)** | **[→ 中文测试指南](TESTER_GUIDE_CN.md)** | **[→ 日本語テストガイド](TESTER_GUIDE_JA.md)** --- ## 🔥 News - **[03/22/2026]** [v0.3.2](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.2) — **跨平台支持 + 重大稳定性更新** — AutoResearchClaw 现已支持任何 ACP 兼容的 AI 代理后端(Claude Code、Codex CLI、Copilot CLI、Gemini CLI、Kimi CLI),并通过 OpenClaw 桥接支持消息平台(Discord、Telegram、飞书、微信)。新增 CLI-agent 代码生成后端,将 Stage 10 和 13 委托给外部 CLI agent,支持预算控制和超时管理。同时包含反数据捏造系统(VerifiedRegistry + 实验诊断与修复循环),100+ 个 bug 修复,模块化 executor 重构,`--resume` 自动检测,LLM 重试加固,以及社区反馈修复。 - **[03/18/2026]** [v0.3.1](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.1) — **OpenCode Beast Mode + Community Contributions** — New "Beast Mode" routes complex code generation to [OpenCode](https://github.com/anomalyco/opencode) with automatic complexity scoring and graceful fallback. Added Novita AI provider support, thread-safety hardening, improved LLM output parsing robustness, and 20+ bug fixes from community PRs and internal audit. - **[03/17/2026]** [v0.3.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.0) — **MetaClaw Integration** — AutoResearchClaw now supports [MetaClaw](https://github.com/aiming-lab/MetaClaw) cross-run learning: pipeline failures → structured lessons → reusable skills, injected into all 23 stages. **+18.3%** robustness in controlled experiments. Opt-in (`metaclaw_bridge.enabled: true`), fully backward-compatible. See [Integration Guide](#-metaclaw-integration). - **[03/16/2026]** [v0.2.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.2.0) — Three multi-agent subsystems (CodeAgent, BenchmarkAgent, FigureAgent), hardened Docker sandbox with network-policy-aware execution, 4-round paper quality audit (AI-slop detection, 7-dim review scoring, NeurIPS checklist), and 15+ bug fixes from production runs. 
- **[03/15/2026]** [v0.1.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.1.0) — We release AutoResearchClaw: a fully autonomous 23-stage research pipeline that turns a single research idea into a conference-ready paper. No human intervention required. --- ## ⚡ 一行命令。一篇论文。 ```bash pip install -e . && researchclaw setup && researchclaw init && researchclaw run --topic "Your research idea here" --auto-approve ``` --- ## 🤔 这是什么? **你有一个灵感,AutoResearchClaw 把它写出来。** 输入一个研究主题——获得一篇完整的学术论文,包含来自 OpenAlex、Semantic Scholar 和 arXiv 的真实文献,硬件感知沙箱实验(自动检测 GPU/MPS/CPU),统计分析,多 Agent 同行评审,以及面向 NeurIPS/ICML/ICLR 的顶会级 LaTeX。不需要盯着看。不需要复制粘贴。不会出现幻觉引用。
📄paper_draft.md完整学术论文(引言、相关工作、方法、实验、结果、结论)
📐paper.tex适配顶会模板的 LaTeX 文件(NeurIPS / ICLR / ICML)
📚references.bib来自 OpenAlex、Semantic Scholar 和 arXiv 的真实 BibTeX 引用——自动精简至与正文引用一致
🔍verification_report.json四层引用完整性 + 相关性核查(arXiv、CrossRef、DataCite、LLM)
🧪experiment runs/生成的代码 + 沙箱结果 + 结构化 JSON 指标
📊charts/自动生成的条件对比图(含误差线和置信区间)
📝reviews.md多 Agent 同行评审(含方法论-证据一致性检查)
🧬evolution/从每次运行中提取的自学习教训
📦deliverables/所有最终产出集中在一个文件夹——可直接上传 Overleaf 编译
流水线**端到端无需人工介入**运行。实验失败时自动修复。假设不成立时自主转向。引用是假的?自动删除。 🌍 **随处可用。** AutoResearchClaw 不绑定任何单一平台。你可以通过 CLI 独立运行,接入 [OpenClaw](https://github.com/openclaw/openclaw),或对接任何 ACP 兼容的 AI 代理 —— 🤖 Claude Code、💻 Codex CLI、🐙 Copilot CLI、♊ Gemini CLI、🌙 Kimi CLI,应有尽有。而且,借助 OpenClaw 的消息桥接能力,你还可以从 💬 Discord、✈️ Telegram、🐦 飞书、💚 微信,或任何你团队日常使用的平台发起一次完整的研究。输入一个课题,输出一篇论文 —— 无论你在哪里输入。 --- ## 🚀 快速开始 ```bash # 1. 克隆 & 安装 git clone https://github.com/aiming-lab/AutoResearchClaw.git cd AutoResearchClaw python3 -m venv .venv && source .venv/bin/activate pip install -e . # 2. 初始化(交互式 — 安装 OpenCode Beast Mode,检查 Docker/LaTeX) researchclaw setup # 3. 配置 researchclaw init # 交互式:选择 LLM 提供商,创建 config.arc.yaml # 或手动:cp config.researchclaw.example.yaml config.arc.yaml # 4. 运行 export OPENAI_API_KEY="sk-..." researchclaw run --config config.arc.yaml --topic "Your research idea" --auto-approve ``` 输出 → `artifacts/rc-YYYYMMDD-HHMMSS-/deliverables/` — 可编译的 LaTeX、BibTeX、实验代码、图表。
📝 最小必要配置 ```yaml project: name: "my-research" research: topic: "Your research topic here" llm: base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" primary_model: "gpt-4o" fallback_models: ["gpt-4o-mini"] experiment: mode: "sandbox" sandbox: python_path: ".venv/bin/python" ```
--- ## 🧠 有什么不同 | 能力 | 工作原理 | |------|----------| | **🔄 PIVOT / REFINE 循环** | 第 15 阶段自主决策:PROCEED、REFINE(调参)或 PIVOT(新方向)。产物自动版本化。 | | **🤖 多 Agent 辩论** | 假设生成、结果分析、同行评审均使用结构化的多视角辩论。 | | **🧬 自学习** | 每次运行提取教训(决策理由、运行时警告、指标异常),30 天时间衰减。未来运行从过去的错误中学习。 | | **📚 知识库** | 每次运行在 6 个类别(决策、实验、发现、文献、问题、评审)中构建结构化知识库。 | | **🛡️ Sentinel 看门狗** | 后台质量监控:NaN/Inf 检测、论文-证据一致性、引用相关性评分、反数据捏造守卫。 | --- ## 🦞 OpenClaw 集成 **AutoResearchClaw 是 [OpenClaw](https://github.com/openclaw/openclaw) 兼容服务。** 在 OpenClaw 中安装后,一句话即可启动自主研究——也可通过 CLI、Claude Code 或其他 AI 编码助手独立使用。
### 🚀 通过 OpenClaw 使用(推荐) 如果你已经在使用 [OpenClaw](https://github.com/openclaw/openclaw) 作为 AI 助手: ``` 1️⃣ 把 GitHub 仓库地址分享给 OpenClaw 2️⃣ OpenClaw 自动读取 RESEARCHCLAW_AGENTS.md → 理解流水线 3️⃣ 对它说:"帮我研究 [你的主题]" 4️⃣ 完成 — OpenClaw 自动克隆、安装、配置、运行,然后返回结果 ``` **就这么简单。** OpenClaw 自动处理 `git clone`、`pip install`、配置和流水线执行。你只需聊天。
💡 底层发生了什么 1. OpenClaw 读取 `RESEARCHCLAW_AGENTS.md` → 学习研究编排器角色 2. OpenClaw 读取 `README.md` → 理解安装方式和流水线结构 3. OpenClaw 复制 `config.researchclaw.example.yaml` → `config.yaml` 4. 向你询问 LLM API Key(或使用环境变量) 5. 运行 `pip install -e .` + `researchclaw run --topic "..." --auto-approve` 6. 返回论文、LaTeX、实验结果和引用
### 🔌 OpenClaw Bridge(高级功能) AutoResearchClaw 内置了 **Bridge 适配器系统**,提供 6 个可选集成能力: ```yaml # config.arc.yaml openclaw_bridge: use_cron: true # ⏰ 定时研究任务 use_message: true # 💬 进度通知(Discord/Slack/Telegram) use_memory: true # 🧠 跨会话知识持久化 use_sessions_spawn: true # 🔀 为并行阶段派生子会话 use_web_fetch: true # 🌐 文献检索中的实时网络搜索 use_browser: false # 🖥️ 基于浏览器的论文采集 ``` 每个标志激活一个类型化适配器协议。当 OpenClaw 提供对应能力时,适配器无需改代码即可消费。详见 [`integration-guide.md`](integration-guide.md)。 ### ACP (Agent Client Protocol) AutoResearchClaw 可以使用**任何 ACP 兼容的编码 Agent** 作为其 LLM 后端——无需 API 密钥。Agent 通过 [acpx](https://github.com/openclaw/acpx) 通信,在全部 23 个流水线阶段中维持单个持久会话。 | Agent | 命令 | 备注 | |-------|------|------| | Claude Code | `claude` | Anthropic | | Codex CLI | `codex` | OpenAI | | Copilot CLI | `gh` | GitHub | | Gemini CLI | `gemini` | Google | | OpenCode | `opencode` | SST | | Kimi CLI | `kimi` | Moonshot | ```yaml # config.yaml — ACP 示例 llm: provider: "acp" acp: agent: "claude" # 任何 ACP 兼容的 Agent CLI 命令 cwd: "." # Agent 的工作目录 # 无需 base_url 或 api_key — Agent 自行处理认证。 ``` ```bash # 直接运行 — Agent 使用自己的凭据 researchclaw run --config config.yaml --topic "Your research idea" --auto-approve ``` ### 🛠️ 其他运行方式 | 方式 | 怎么用 | |------|--------| | **独立 CLI** | `researchclaw setup` → `researchclaw init` → `researchclaw run --topic "..." --auto-approve` | | **Python API** | `from researchclaw.pipeline import Runner; Runner(config).run()` | | **Claude Code** | 读取 `RESEARCHCLAW_CLAUDE.md` — 直接说 *"Run research on [主题]"* | | **Copilot CLI** | `researchclaw run --topic "..."` 配合 `llm.acp.agent: "gh"` | | **OpenCode** | 读取 `.claude/skills/` — 同样的自然语言交互 | | **任何 AI CLI** | 提供 `RESEARCHCLAW_AGENTS.md` 作为上下文 → agent 自动引导 | --- ## 🔬 流水线:23 个阶段,8 个阶段组 ``` 阶段组 A:研究定义 阶段组 E:实验执行 1. TOPIC_INIT 12. EXPERIMENT_RUN 2. PROBLEM_DECOMPOSE 13. ITERATIVE_REFINE ← 自修复 阶段组 B:文献发现 阶段组 F:分析与决策 3. SEARCH_STRATEGY 14. RESULT_ANALYSIS ← 多Agent 4. LITERATURE_COLLECT ← 真实API 15. RESEARCH_DECISION ← PIVOT/REFINE 5. LITERATURE_SCREEN [门控] 6. 
KNOWLEDGE_EXTRACT 阶段组 G:论文撰写 16. PAPER_OUTLINE 阶段组 C:知识综合 17. PAPER_DRAFT 7. SYNTHESIS 18. PEER_REVIEW ← 证据审查 8. HYPOTHESIS_GEN ← 辩论 19. PAPER_REVISION 阶段组 D:实验设计 阶段组 H:终稿 9. EXPERIMENT_DESIGN [门控] 20. QUALITY_GATE [门控] 10. CODE_GENERATION 21. KNOWLEDGE_ARCHIVE 11. RESOURCE_PLANNING 22. EXPORT_PUBLISH ← LaTeX 23. CITATION_VERIFY ← 相关性审查 ``` > **门控阶段**(5、9、20)可暂停等待人工审批,也可用 `--auto-approve` 自动通过。拒绝后流水线回滚。 > **决策循环**:第 15 阶段可触发 REFINE(→ 第 13 阶段)或 PIVOT(→ 第 8 阶段),自动版本化之前的产物。
📋 各阶段组职责 | 阶段组 | 做什么 | |--------|--------| | **A:定义** | LLM 将主题分解为结构化问题树和研究问题 | | **A+:硬件检测** | 自动检测 GPU(NVIDIA CUDA / Apple MPS / 纯 CPU),性能不足时警告用户,据此调整代码生成策略 | | **B:文献** | 多源搜索(OpenAlex → Semantic Scholar → arXiv)获取真实论文,按相关性筛选,提取知识卡片 | | **C:综合** | 聚类研究发现,识别研究空白,通过多 Agent 辩论生成可验证假设 | | **D:设计** | 设计实验方案,生成硬件感知的可运行 Python 代码(GPU 等级 → 包选择),估算资源需求 | | **E:执行** | 在沙箱中运行实验,检测 NaN/Inf 和运行时 Bug,通过定向 LLM 修复自愈代码 | | **F:分析** | 多 Agent 分析实验结果;自主 PROCEED / REFINE / PIVOT 决策并附理由 | | **G:写作** | 大纲 → 分段撰写初稿(5,000-6,500 词)→ 同行评审(含方法论-证据一致性)→ 带长度保障的修订 | | **H:终稿** | 质量门控,知识归档,LaTeX 导出(适配顶会模板),引用完整性 + 相关性核查 |
--- ## ✨ 核心功能 | 功能 | 说明 | |------|------| | **📚 多源文献** | 来自 OpenAlex、Semantic Scholar 和 arXiv 的真实论文——查询扩展、去重、三态熔断器与优雅降级 | | **🔍 四层引用核查** | arXiv ID 校验 → CrossRef/DataCite DOI → Semantic Scholar 标题匹配 → LLM 相关性评分。幻觉引用自动删除。 | | **🖥️ 硬件感知执行** | 自动检测 GPU(NVIDIA CUDA / Apple MPS / 纯 CPU),据此调整代码生成、import 和实验规模 | | **🦾 OpenCode Beast Mode** | 复杂实验自动路由至 [OpenCode](https://github.com/anomalyco/opencode)——生成多文件项目,含自定义架构、训练循环和消融实验。通过 `researchclaw setup` 安装。 | | **🧪 沙箱实验** | AST 验证代码、不可变 harness、NaN/Inf 快速失败、自修复、迭代优化(最多 10 轮)、部分结果捕获 | | **📝 顶会级写作** | NeurIPS/ICML/ICLR 模板,分段撰写(5,000-6,500 词),反数据捏造守卫、修订长度保障、反免责声明强制 | | **📐 模板切换** | `neurips_2025`、`iclr_2026`、`icml_2026` — Markdown → LaTeX,含数学公式、表格、图片、交叉引用、`\cite{}` | | **🛡️ 反数据捏造** | VerifiedRegistry 强制论文中使用经过验证的实验数据。自动诊断失败实验并在写作前修复。未验证数字被清理。 | | **🚦 质量门控** | 3 个人工审批门控(阶段 5、9、20),支持回滚。用 `--auto-approve` 跳过。 | --- ## 🧠 MetaClaw 集成 **AutoResearchClaw + [MetaClaw](https://github.com/aiming-lab/MetaClaw) = 一个能从每次运行中学习的流水线。** MetaClaw 为 AutoResearchClaw 添加了**跨运行知识迁移**。启用后,流水线会自动从失败和警告中提取教训,将其转化为可复用的技能,并在后续运行中注入到全部 23 个阶段——让同样的错误不再重犯。 ### 工作原理 ``` 运行 N 执行 → 失败/警告被捕获为 Lessons ↓ MetaClaw Lesson → Skill 转换 ↓ arc-* Skill 文件存储在 ~/.metaclaw/skills/ ↓ 运行 N+1 → build_overlay() 将技能注入每个 LLM 提示 ↓ LLM 规避已知陷阱 → 更高质量,更少重试 ``` ### 快速配置 ```bash # 1. 安装 MetaClaw(如未安装) pip install metaclaw # 2. 在配置中启用 ``` ```yaml # config.arc.yaml metaclaw_bridge: enabled: true proxy_url: "http://localhost:30000" # MetaClaw 代理(可选) skills_dir: "~/.metaclaw/skills" # 技能存储位置 fallback_url: "https://api.openai.com/v1" # 直连 LLM 回退 fallback_api_key: "" # 回退 URL 的 API key lesson_to_skill: enabled: true min_severity: "warning" # 转换 warning + error max_skills_per_run: 3 ``` ```bash # 3. 
照常运行 — MetaClaw 透明运作 researchclaw run --config config.arc.yaml --topic "Your idea" --auto-approve ``` 每次运行后,查看 `~/.metaclaw/skills/arc-*/SKILL.md` 以了解流水线学到了哪些技能。 ### 实验结果 在对照 A/B 实验中(相同主题、相同 LLM、相同配置): | 指标 | 基线 | 使用 MetaClaw | 改善 | |------|------|---------------|------| | 阶段重试率 | 10.5% | 7.9% | **-24.8%** | | Refine 循环次数 | 2.0 | 1.2 | **-40.0%** | | 流水线阶段完成率 | 18/19 | 19/19 | **+5.3%** | | 整体鲁棒性得分(综合) | 0.714 | 0.845 | **+18.3%** | > 综合鲁棒性得分是阶段完成率(40%)、重试减少(30%)和 Refine 循环效率(30%)的加权平均。 ### 向后兼容性 - **默认:关闭。** 如果 `metaclaw_bridge` 不存在或 `enabled: false`,流水线行为与之前完全一致。 - **无新依赖。** MetaClaw 是可选的——核心流水线无需它即可运行。 - **所有 1,823 项现有测试通过**(包含集成代码)。 --- ## ⚙️ 配置参考
点击展开完整配置参考 ```yaml # === 项目 === project: name: "my-research" # 项目标识符 mode: "docs-first" # docs-first | semi-auto | full-auto # === 研究 === research: topic: "..." # 研究主题(必填) domains: ["ml", "nlp"] # 文献搜索的研究领域 daily_paper_count: 8 # 每个搜索查询的目标论文数 quality_threshold: 4.0 # 论文最低质量分 # === 运行时 === runtime: timezone: "America/New_York" # 用于时间戳 max_parallel_tasks: 3 # 并发实验限制 approval_timeout_hours: 12 # 门控阶段超时 retry_limit: 2 # 阶段失败重试次数 # === LLM === llm: provider: "openai-compatible" # openai | openrouter | deepseek | minimax | acp | openai-compatible base_url: "https://..." # API 端点(openai-compatible 必填) api_key_env: "OPENAI_API_KEY" # API key 环境变量(openai-compatible 必填) api_key: "" # 或直接填写 key primary_model: "gpt-4o" # 主模型 fallback_models: ["gpt-4o-mini"] # 回退链 s2_api_key: "" # Semantic Scholar API key(可选,更高速率限制) acp: # 仅在 provider: "acp" 时使用 agent: "claude" # ACP Agent CLI 命令(claude, codex, gemini 等) cwd: "." # Agent 的工作目录 # === 实验 === experiment: mode: "sandbox" # simulated | sandbox | docker | ssh_remote time_budget_sec: 300 # 每次运行最大执行时间(默认:300 秒) max_iterations: 10 # 最大优化迭代次数 metric_key: "val_loss" # 主指标名称 metric_direction: "minimize" # minimize | maximize sandbox: python_path: ".venv/bin/python" gpu_required: false allowed_imports: [math, random, json, csv, numpy, torch, sklearn] max_memory_mb: 4096 docker: image: "researchclaw/experiment:latest" network_policy: "setup_only" # none | setup_only | pip_only | full gpu_enabled: true memory_limit_mb: 8192 auto_install_deps: true # 自动检测 import → requirements.txt ssh_remote: host: "" # GPU 服务器主机名 gpu_ids: [] # 可用 GPU ID remote_workdir: "/tmp/researchclaw_experiments" opencode: # OpenCode Beast Mode(通过 `researchclaw setup` 自动安装) enabled: true # 主开关(默认:true) auto: true # 无需确认自动触发(默认:true) complexity_threshold: 0.2 # 0.0-1.0 — 越高 = 仅在复杂实验时触发 model: "" # 覆盖模型(空 = 使用 llm.primary_model) timeout_sec: 600 # OpenCode 生成最大秒数 max_retries: 1 # 失败重试次数 workspace_cleanup: true # 采集后清理临时工作区 code_agent: # CodeAgent v2 — 多阶段代码生成 enabled: true 
# 使用 CodeAgent 替代传统单 prompt 代码生成 architecture_planning: true # 生成代码前先生成深度实现蓝图 sequential_generation: true # 按依赖 DAG 逐文件生成 hard_validation: true # 基于 AST 的验证门控(拦截相同消融、硬编码指标) hard_validation_max_repairs: 2 # 验证失败时最大修复次数 exec_fix_max_iterations: 3 # 执行修复循环最大次数 exec_fix_timeout_sec: 60 # 每次执行修复超时(秒) benchmark_agent: # BenchmarkAgent — 自动数据集和基线选择 enabled: true # 启用 4-agent 基准测试流水线(Surveyor→Selector→Acquirer→Validator) enable_hf_search: true # 搜索 HuggingFace Datasets enable_web_search: true # 搜索 Google Scholar 获取基准 tier_limit: 2 # 数据集级别过滤(1=小型/已缓存,2=中型,3=大型) min_benchmarks: 1 # 最少需要的数据集数量 min_baselines: 2 # 最少需要的基线方法数量 figure_agent: # FigureAgent — 学术图表生成 enabled: true # 启用 5-agent 图表流水线(Planner→CodeGen→Renderer→Critic→Integrator) min_figures: 3 # 最少生成图表数 max_figures: 8 # 最多生成图表数 max_iterations: 3 # Critic 驱动的迭代优化次数 dpi: 300 # 输出分辨率 strict_mode: false # 图表生成失败时是否阻塞流水线 repair: # 反数据捏造实验修复 enabled: true # 自动诊断并修复失败的实验 max_cycles: 3 # 修复重试循环数 min_completion_rate: 0.5 # >=50% 条件必须完成才可继续 min_conditions: 2 # 有效实验至少需要 2 个条件 use_opencode: true # 通过 OpenCode Beast Mode 进行修复 # === 网络搜索(可选)=== web_search: enabled: true # 启用网络增强文献搜索 tavily_api_key_env: "TAVILY_API_KEY" # Tavily API key 环境变量(可选) enable_scholar: true # Google Scholar 搜索 enable_pdf_extraction: true # 从 PDF 中提取文本 max_web_results: 10 # 每次查询最大网络结果数 # === 导出 === export: target_conference: "neurips_2025" # neurips_2025 | iclr_2026 | icml_2026 authors: "Anonymous" bib_file: "references" # === Prompts === prompts: custom_file: "" # 自定义 Prompt YAML 路径(空 = 使用默认) # === 安全 === security: hitl_required_stages: [5, 9, 20] # 需要人工审批的阶段 allow_publish_without_approval: false redact_sensitive_logs: true # === 知识库 === knowledge_base: backend: "markdown" # markdown | obsidian root: "docs/kb" # === 通知 === notifications: channel: "console" # console | discord | slack target: "" # === MetaClaw Bridge(可选)=== metaclaw_bridge: enabled: false # 设为 true 以启用跨运行学习 proxy_url: "http://localhost:30000" # MetaClaw 代理 URL skills_dir: "~/.metaclaw/skills" 
# arc-* 技能的存储位置 fallback_url: "" # 代理不可用时的直连 LLM 回退 fallback_api_key: "" # 回退端点的 API key lesson_to_skill: enabled: true # 自动将教训转换为技能 min_severity: "warning" # 转换的最低严重级别 max_skills_per_run: 3 # 每次流水线运行的最大新技能数 prm: # 过程奖励模型质量门控(可选) enabled: false # 使用 LLM-as-judge 评分阶段产出 model: "gpt-5.4" # PRM 评判模型 votes: 3 # 多数投票次数 gate_stages: [5, 9, 15, 20] # 应用 PRM 门控的阶段 # === OpenClaw Bridge === openclaw_bridge: use_cron: false # 定时研究运行 use_message: false # 进度通知 use_memory: false # 跨会话知识持久化 use_sessions_spawn: false # 派生并行子会话 use_web_fetch: false # 实时网络搜索 use_browser: false # 基于浏览器的论文采集 ```
--- ## 🙏 致谢 灵感来源: - 🔬 [AI Scientist](https://github.com/SakanaAI/AI-Scientist)(Sakana AI)— 自动化研究先驱 - 🧠 [AutoResearch](https://github.com/karpathy/autoresearch)(Andrej Karpathy)— 端到端研究自动化 - 🌐 [FARS](https://analemma.ai/blog/introducing-fars/)(Analemma)— 全自动研究系统 --- ## 📄 许可证 MIT — 详见 [LICENSE](../LICENSE)。 --- ## 📌 引用 如果你觉得 AutoResearchClaw 有用,请引用: ```bibtex @misc{liu2026autoresearchclaw, author = {Liu, Jiaqi and Xia, Peng and Han, Siwei and Qiu, Shi and Zhang, Letian and Chen, Guiming and Tu, Haoqin and Yang, Xinyu and Zhou, Jiawei and Zhu, Hongtu and Li, Yun and Zhou, Yuyin and Zheng, Zeyu and Xie, Cihang and Ding, Mingyu and Yao, Huaxiu}, title = {AutoResearchClaw: Fully Autonomous Research from Idea to Paper}, year = {2026}, organization = {GitHub}, url = {https://github.com/aiming-lab/AutoResearchClaw}, } ```

Built with 🦞 by the AutoResearchClaw team

================================================ FILE: docs/README_DE.md ================================================

AutoResearchClaw Logo

Idee besprechen. Paper erhalten. Vollautomatisch & selbstentwickelnd.

Einfach mit OpenClaw chatten: "Research X" → erledigt.

AutoResearchClaw Framework

MIT License Python 3.11+ 1823 Tests Passed GitHub OpenClaw Compatible Discord

🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية

🏆 Paper-Showcase · 📖 Integrationsanleitung · 💬 Discord-Community

---
Sample Paper 🏆 Showcase generierter Paper

8 Paper aus 8 Disziplinen — Mathematik, Statistik, Biologie, Informatik, NLP, RL, Vision, Robustheit — vollstaendig autonom generiert ohne menschliches Eingreifen.

View Showcase
--- > **🧪 Wir suchen Tester!** Teste die Pipeline mit deiner eigenen Forschungsidee — aus jedem Fachgebiet — und [sag uns, was du denkst](TESTER_GUIDE.md). Dein Feedback beeinflusst direkt die naechste Version. **[→ Testing Guide](TESTER_GUIDE.md)** | **[→ 中文测试指南](TESTER_GUIDE_CN.md)** | **[→ 日本語テストガイド](TESTER_GUIDE_JA.md)** --- ## 🔥 News - **[03/22/2026]** [v0.3.2](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.2) — **Plattformuebergreifende Unterstuetzung + grosse Stabilitaet** — AutoResearchClaw laeuft jetzt mit jedem ACP-kompatiblen Agenten-Backend (Claude Code, Codex CLI, Copilot CLI, Gemini CLI, Kimi CLI) und unterstuetzt Messaging-Plattformen (Discord, Telegram, Lark, WeChat) ueber die OpenClaw-Bruecke. Neues CLI-Agent-Code-Generierungs-Backend delegiert Stages 10 und 13 an externe CLI-Agenten mit Budgetkontrolle und Timeout-Management. Enthaelt Anti-Fabrication-System (VerifiedRegistry + Experiment-Diagnose- und Reparaturschleife), 100+ Bugfixes, modulares Executor-Refactoring, `--resume` Auto-Erkennung, LLM-Retry-Haertung und Community-Fixes. - **[03/18/2026]** [v0.3.1](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.1) — **OpenCode Beast Mode + Community Contributions** — New "Beast Mode" routes complex code generation to [OpenCode](https://github.com/anomalyco/opencode) with automatic complexity scoring and graceful fallback. Added Novita AI provider support, thread-safety hardening, improved LLM output parsing robustness, and 20+ bug fixes from community PRs and internal audit. - **[03/17/2026]** [v0.3.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.0) — **MetaClaw Integration** — AutoResearchClaw now supports [MetaClaw](https://github.com/aiming-lab/MetaClaw) cross-run learning: pipeline failures → structured lessons → reusable skills, injected into all 23 stages. **+18.3%** robustness in controlled experiments. Opt-in (`metaclaw_bridge.enabled: true`), fully backward-compatible. 
See [Integration Guide](#-metaclaw-integration). - **[03/16/2026]** [v0.2.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.2.0) — Three multi-agent subsystems (CodeAgent, BenchmarkAgent, FigureAgent), hardened Docker sandbox with network-policy-aware execution, 4-round paper quality audit (AI-slop detection, 7-dim review scoring, NeurIPS checklist), and 15+ bug fixes from production runs. - **[03/15/2026]** [v0.1.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.1.0) — We release AutoResearchClaw: a fully autonomous 23-stage research pipeline that turns a single research idea into a conference-ready paper. No human intervention required. --- ## ⚡ Ein Befehl. Ein Paper. ```bash pip install -e . && researchclaw setup && researchclaw init && researchclaw run --topic "Your research idea here" --auto-approve ``` --- ## 🤔 Was ist das? **Du denkst es. AutoResearchClaw schreibt es.** Gib ein Forschungsthema ein — erhalte ein vollstaendiges wissenschaftliches Paper mit echter Literatur von OpenAlex, Semantic Scholar und arXiv, hardwarebewussten Sandbox-Experimenten (automatische GPU/MPS/CPU-Erkennung), statistischer Analyse, Multi-Agenten-Peer-Review und konferenzfertigem LaTeX fuer NeurIPS/ICML/ICLR. Kein Babysitting. Kein Kopieren. Keine halluzinierten Referenzen.
📄paper_draft.mdVollstaendiges wissenschaftliches Paper (Einleitung, Verwandte Arbeiten, Methode, Experimente, Ergebnisse, Fazit)
📐paper.texKonferenzfertiges LaTeX (NeurIPS / ICLR / ICML Templates)
📚references.bibEchte BibTeX-Referenzen von OpenAlex, Semantic Scholar und arXiv — automatisch bereinigt, um Inline-Zitationen zu entsprechen
🔍verification_report.json4-Schicht-Zitationsintegritaets- und Relevanzpruefung (arXiv, CrossRef, DataCite, LLM)
🧪experiment runs/Generierter Code + Sandbox-Ergebnisse + strukturierte JSON-Metriken
📊charts/Automatisch generierte Vergleichsdiagramme mit Fehlerbalken und Konfidenzintervallen
📝reviews.mdMulti-Agenten-Peer-Review mit Methodik-Evidenz-Konsistenzpruefungen
🧬evolution/Selbstlernende Erkenntnisse aus jedem Durchlauf
📦deliverables/Alle finalen Ergebnisse in einem Ordner — kompilierbereit fuer Overleaf
Die Pipeline laeuft **vollstaendig ohne menschliches Eingreifen**. Wenn Experimente fehlschlagen, repariert sie sich selbst. Wenn Hypothesen nicht bestaetigt werden, schwenkt sie um. Wenn Zitationen gefaelscht sind, entfernt sie diese. 🌍 **Ueberall ausfuehrbar.** AutoResearchClaw ist nicht an eine einzelne Plattform gebunden. Nutzen Sie es eigenstaendig ueber die CLI, verbinden Sie es mit [OpenClaw](https://github.com/openclaw/openclaw), oder integrieren Sie es mit jedem ACP-kompatiblen AI-Agenten — 🤖 Claude Code, 💻 Codex CLI, 🐙 Copilot CLI, ♊ Gemini CLI, 🌙 Kimi CLI und mehr. Dank der Messaging-Bruecke von OpenClaw koennen Sie eine komplette Forschung von 💬 Discord, ✈️ Telegram, 🐦 Lark (飞书), 💚 WeChat oder jeder anderen Plattform starten, die Ihr Team bereits nutzt. Ein Thema rein, ein Paper raus — egal wo Sie tippen. --- ## 🚀 Schnellstart ```bash # 1. Klonen & installieren git clone https://github.com/aiming-lab/AutoResearchClaw.git cd AutoResearchClaw python3 -m venv .venv && source .venv/bin/activate pip install -e . # 2. Setup (interaktiv — installiert OpenCode Beast Mode, prueft Docker/LaTeX) researchclaw setup # 3. Konfigurieren researchclaw init # Interaktiv: LLM-Anbieter waehlen, erstellt config.arc.yaml # Oder manuell: cp config.researchclaw.example.yaml config.arc.yaml # 4. Ausfuehren export OPENAI_API_KEY="sk-..." researchclaw run --config config.arc.yaml --topic "Your research idea" --auto-approve ``` Ausgabe → `artifacts/rc-YYYYMMDD-HHMMSS-/deliverables/` — kompilierfertiges LaTeX, BibTeX, Experimentcode, Diagramme.
📝 Minimale erforderliche Konfiguration ```yaml project: name: "my-research" research: topic: "Your research topic here" llm: base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" primary_model: "gpt-4o" fallback_models: ["gpt-4o-mini"] experiment: mode: "sandbox" sandbox: python_path: ".venv/bin/python" ```
--- ## 🧠 Was macht es anders | Faehigkeit | Funktionsweise | |-----------|---------------| | **🔄 PIVOT / REFINE Schleife** | Stufe 15 entscheidet autonom: PROCEED, REFINE (Parameter anpassen) oder PIVOT (neue Richtung). Artefakte automatisch versioniert. | | **🤖 Multi-Agenten-Debatte** | Hypothesengenerierung, Ergebnisanalyse und Peer-Review verwenden jeweils strukturierte Multi-Perspektiven-Debatten. | | **🧬 Selbstlernen** | Erkenntnisse pro Durchlauf extrahiert (Entscheidungsbegruendungen, Laufzeitwarnungen, Metrikanomalien) mit 30-Tage-Zeitabklingung. Zukuenftige Durchlaeufe lernen aus vergangenen Fehlern. | | **📚 Wissensdatenbank** | Jeder Durchlauf baut eine strukturierte KB ueber 6 Kategorien auf (Entscheidungen, Experimente, Ergebnisse, Literatur, Fragen, Reviews). | | **🛡️ Sentinel Watchdog** | Hintergrund-Qualitaetsmonitor: NaN/Inf-Erkennung, Paper-Evidenz-Konsistenz, Zitationsrelevanz-Bewertung, Anti-Fabrikationsschutz. | --- ## 🦞 OpenClaw-Integration **AutoResearchClaw ist ein [OpenClaw](https://github.com/openclaw/openclaw)-kompatibler Dienst.** Installiere es in OpenClaw und starte autonome Forschung mit einer einzigen Nachricht — oder verwende es eigenstaendig ueber CLI, Claude Code oder jeden anderen KI-Coding-Assistenten.
### 🚀 Verwendung mit OpenClaw (empfohlen) Wenn du bereits [OpenClaw](https://github.com/openclaw/openclaw) als KI-Assistenten nutzt: ``` 1️⃣ Teile die GitHub-Repo-URL mit OpenClaw 2️⃣ OpenClaw liest automatisch RESEARCHCLAW_AGENTS.md → versteht die Pipeline 3️⃣ Sage: "Research [dein Thema]" 4️⃣ Fertig — OpenClaw klont, installiert, konfiguriert, fuehrt aus und liefert Ergebnisse ``` **Das war's.** OpenClaw uebernimmt `git clone`, `pip install`, Konfiguration und Pipeline-Ausfuehrung automatisch. Du chattest einfach.
💡 Was unter der Haube passiert 1. OpenClaw liest `RESEARCHCLAW_AGENTS.md` → lernt die Forschungs-Orchestrator-Rolle 2. OpenClaw liest `README.md` → versteht Installation und Pipeline-Struktur 3. OpenClaw kopiert `config.researchclaw.example.yaml` → `config.yaml` 4. Fragt nach deinem LLM-API-Schluessel (oder verwendet deine Umgebungsvariable) 5. Fuehrt `pip install -e .` + `researchclaw run --topic "..." --auto-approve` aus 6. Liefert Paper, LaTeX, Experimente und Zitationen zurueck
### 🔌 OpenClaw Bridge (Fortgeschritten) Fuer tiefere Integration enthaelt AutoResearchClaw ein **Bridge-Adapter-System** mit 6 optionalen Faehigkeiten: ```yaml # config.arc.yaml openclaw_bridge: use_cron: true # ⏰ Geplante Forschungsdurchlaeufe use_message: true # 💬 Fortschrittsbenachrichtigungen (Discord/Slack/Telegram) use_memory: true # 🧠 Sitzungsuebergreifende Wissenspersistenz use_sessions_spawn: true # 🔀 Parallele Sub-Sessions fuer gleichzeitige Stufen use_web_fetch: true # 🌐 Live-Websuche waehrend der Literaturrecherche use_browser: false # 🖥️ Browserbasierte Paper-Sammlung ``` Jedes Flag aktiviert ein typisiertes Adapter-Protokoll. Wenn OpenClaw diese Faehigkeiten bereitstellt, nutzen die Adapter sie ohne Codeaenderungen. Siehe [`integration-guide.md`](integration-guide.md) fuer vollstaendige Details. ### ACP (Agent Client Protocol) AutoResearchClaw kann **jeden ACP-kompatiblen Coding-Agenten** als LLM-Backend verwenden — keine API-Schluessel erforderlich. Der Agent kommuniziert ueber [acpx](https://github.com/openclaw/acpx) und haelt eine einzige persistente Sitzung ueber alle 23 Pipeline-Stufen aufrecht. | Agent | Befehl | Hinweise | |-------|--------|----------| | Claude Code | `claude` | Anthropic | | Codex CLI | `codex` | OpenAI | | Copilot CLI | `gh` | GitHub | | Gemini CLI | `gemini` | Google | | OpenCode | `opencode` | SST | | Kimi CLI | `kimi` | Moonshot | ```yaml # config.yaml — ACP-Beispiel llm: provider: "acp" acp: agent: "claude" # Jeder ACP-kompatible Agent-CLI-Befehl cwd: "." # Arbeitsverzeichnis fuer den Agenten # Kein base_url oder api_key noetig — der Agent verwaltet seine eigene Authentifizierung. 
``` ```bash # Einfach ausfuehren — der Agent verwendet seine eigenen Anmeldedaten researchclaw run --config config.yaml --topic "Your research idea" --auto-approve ``` ### 🛠️ Weitere Ausfuehrungsmoeglichkeiten | Methode | Anleitung | |---------|-----------| | **Standalone CLI** | `researchclaw setup` → `researchclaw init` → `researchclaw run --topic "..." --auto-approve` | | **Python API** | `from researchclaw.pipeline import Runner; Runner(config).run()` | | **Claude Code** | Liest `RESEARCHCLAW_CLAUDE.md` — sage einfach *"Run research on [Thema]"* | | **Copilot CLI** | `researchclaw run --topic "..."` mit `llm.acp.agent: "gh"` | | **OpenCode** | Liest `.claude/skills/` — gleiche natuerliche Sprachschnittstelle | | **Jeder KI-CLI** | Uebergib `RESEARCHCLAW_AGENTS.md` als Kontext → Agent bootstrappt automatisch | --- ## 🔬 Pipeline: 23 Stufen, 8 Phasen ``` Phase A: Forschungsplanung Phase E: Experimentausfuehrung 1. TOPIC_INIT 12. EXPERIMENT_RUN 2. PROBLEM_DECOMPOSE 13. ITERATIVE_REFINE ← Selbstheilung Phase B: Literaturrecherche Phase F: Analyse & Entscheidung 3. SEARCH_STRATEGY 14. RESULT_ANALYSIS ← Multi-Agent 4. LITERATURE_COLLECT ← echte API 15. RESEARCH_DECISION ← PIVOT/REFINE 5. LITERATURE_SCREEN [Gate] 6. KNOWLEDGE_EXTRACT Phase G: Papiererstellung 16. PAPER_OUTLINE Phase C: Wissenssynthese 17. PAPER_DRAFT 7. SYNTHESIS 18. PEER_REVIEW ← Evidenzpruefung 8. HYPOTHESIS_GEN ← Debatte 19. PAPER_REVISION Phase D: Experimentdesign Phase H: Finalisierung 9. EXPERIMENT_DESIGN [Gate] 20. QUALITY_GATE [Gate] 10. CODE_GENERATION 21. KNOWLEDGE_ARCHIVE 11. RESOURCE_PLANNING 22. EXPORT_PUBLISH ← LaTeX 23. CITATION_VERIFY ← Relevanzpruefung ``` > **Gate-Stufen** (5, 9, 20) pausieren fuer menschliche Genehmigung oder werden mit `--auto-approve` automatisch genehmigt. Bei Ablehnung wird die Pipeline zurueckgesetzt. > **Entscheidungsschleifen**: Stufe 15 kann REFINE (→ Stufe 13) oder PIVOT (→ Stufe 8) ausloesen, mit automatischer Artefakt-Versionierung.
📋 Was jede Phase bewirkt | Phase | Beschreibung | |-------|-------------| | **A: Planung** | LLM zerlegt das Thema in einen strukturierten Problembaum mit Forschungsfragen | | **A+: Hardware** | Automatische GPU-Erkennung (NVIDIA CUDA / Apple MPS / nur CPU), Warnung bei eingeschraenkter Hardware, Codegenerierung wird entsprechend angepasst | | **B: Literatur** | Multi-Source-Suche (OpenAlex → Semantic Scholar → arXiv) nach echten Papern, Relevanzscreening, Extraktion von Wissenskarten | | **C: Synthese** | Clustering der Ergebnisse, Identifizierung von Forschungsluecken, Generierung testbarer Hypothesen via Multi-Agenten-Debatte | | **D: Design** | Experimentplan entwerfen, hardwarebewussten ausfuehrbaren Python-Code generieren (GPU-Stufe → Paketauswahl), Ressourcenbedarf schaetzen | | **E: Ausfuehrung** | Experimente in Sandbox ausfuehren, NaN/Inf und Laufzeitfehler erkennen, Code via gezielter LLM-Reparatur selbst heilen | | **F: Analyse** | Multi-Agenten-Analyse der Ergebnisse; autonome PROCEED / REFINE / PIVOT Entscheidung mit Begruendung | | **G: Schreiben** | Gliederung → abschnittsweises Verfassen (5.000-6.500 Woerter) → Peer-Review (mit Methodik-Evidenz-Konsistenz) → Revision mit Laengenpruefung | | **H: Finalisierung** | Qualitaets-Gate, Wissensarchivierung, LaTeX-Export mit Konferenztemplate, Zitationsintegritaets- und Relevanzpruefung |
--- ## ✨ Hauptfunktionen | Funktion | Beschreibung | |----------|-------------| | **📚 Multi-Source-Literatur** | Echte Paper von OpenAlex, Semantic Scholar und arXiv — Abfrageerweiterung, Deduplizierung, Circuit Breaker mit Graceful Degradation | | **🔍 4-Schicht-Zitationsverifikation** | arXiv-ID-Pruefung → CrossRef/DataCite-DOI → Semantic-Scholar-Titelabgleich → LLM-Relevanzbewertung. Halluzinierte Refs automatisch entfernt. | | **🖥️ Hardwarebewusste Ausfuehrung** | Automatische GPU-Erkennung (NVIDIA CUDA / Apple MPS / nur CPU) und Anpassung von Codegenerierung, Imports und Experimentumfang | | **🦾 OpenCode Beast Mode** | Komplexe Experimente werden automatisch an [OpenCode](https://github.com/anomalyco/opencode) weitergeleitet — generiert Multi-File-Projekte mit individuellen Architekturen, Trainingsschleifen und Ablationsstudien. Installation ueber `researchclaw setup`. | | **🧪 Sandbox-Experimente** | AST-validierter Code, unveraenderlicher Harness, NaN/Inf-Schnellabbruch, selbstheilende Reparatur, iterative Verfeinerung (bis zu 10 Runden), Teilergebnis-Erfassung | | **📝 Konferenzqualitaet** | NeurIPS/ICML/ICLR-Templates, abschnittsweises Verfassen (5.000-6.500 Woerter), Anti-Fabrikationsschutz, Revisions-Laengenschutz, Anti-Disclaimer-Durchsetzung | | **📐 Template-Umschaltung** | `neurips_2025`, `iclr_2026`, `icml_2026` — Markdown → LaTeX mit Mathematik, Tabellen, Abbildungen, Querverweisen, `\cite{}` | | **🚦 Qualitaets-Gates** | 3 Human-in-the-Loop-Gates (Stufen 5, 9, 20) mit Rollback. Ueberspringen mit `--auto-approve`. | --- ## 🧠 MetaClaw-Integration **AutoResearchClaw + [MetaClaw](https://github.com/aiming-lab/MetaClaw) = Eine Pipeline, die aus jedem Durchlauf lernt.** MetaClaw fuegt **durchlaufuebergreifenden Wissenstransfer** zu AutoResearchClaw hinzu. 
Wenn aktiviert, erfasst die Pipeline automatisch Erkenntnisse aus Fehlern und Warnungen, konvertiert sie in wiederverwendbare Skills und injiziert diese Skills in alle 23 Pipeline-Stufen bei nachfolgenden Durchlaeufen — damit dieselben Fehler nie wiederholt werden. ### Funktionsweise ``` Durchlauf N wird ausgefuehrt → Fehler/Warnungen als Lektionen erfasst ↓ MetaClaw Lektion → Skill-Konvertierung ↓ arc-* Skill-Dateien in ~/.metaclaw/skills/ gespeichert ↓ Durchlauf N+1 → build_overlay() injiziert Skills in jeden LLM-Prompt ↓ LLM vermeidet bekannte Fallstricke → hoehere Qualitaet, weniger Wiederholungen ``` ### Schnelleinrichtung ```bash # 1. MetaClaw installieren (falls nicht vorhanden) pip install metaclaw # 2. In der Konfiguration aktivieren ``` ```yaml # config.arc.yaml metaclaw_bridge: enabled: true proxy_url: "http://localhost:30000" # MetaClaw-Proxy (optional) skills_dir: "~/.metaclaw/skills" # Wo Skills gespeichert werden fallback_url: "https://api.openai.com/v1" # Direkter LLM-Fallback fallback_api_key: "" # API-Schluessel fuer Fallback-URL lesson_to_skill: enabled: true min_severity: "warning" # Warnungen + Fehler konvertieren max_skills_per_run: 3 ``` ```bash # 3. Wie gewohnt ausfuehren — MetaClaw arbeitet transparent researchclaw run --config config.arc.yaml --topic "Your idea" --auto-approve ``` Nach jedem Durchlauf kannst du `~/.metaclaw/skills/arc-*/SKILL.md` pruefen, um die erlernten Skills deiner Pipeline zu sehen. 
### Experimentergebnisse In kontrollierten A/B-Experimenten (gleiches Thema, gleiches LLM, gleiche Konfiguration): | Metrik | Baseline | Mit MetaClaw | Verbesserung | |--------|----------|--------------|--------------| | Stufen-Wiederholungsrate | 10.5% | 7.9% | **-24.8%** | | Anzahl REFINE-Zyklen | 2.0 | 1.2 | **-40.0%** | | Pipeline-Stufenabschluss | 18/19 | 19/19 | **+5.3%** | | Gesamtrobustheitswert (Komposit) | 0.714 | 0.845 | **+18.3%** | > Der Komposit-Robustheitswert ist ein gewichteter Durchschnitt aus Stufenabschlussrate (40%), Wiederholungsreduktion (30%) und REFINE-Zykluseffizienz (30%). ### Abwaertskompatibilitaet - **Standard: AUS.** Wenn `metaclaw_bridge` fehlt oder `enabled: false`, verhaelt sich die Pipeline exakt wie zuvor. - **Keine neuen Abhaengigkeiten.** MetaClaw ist optional — die Kern-Pipeline funktioniert ohne. - **Alle 1.823 bestehenden Tests bestehen** mit dem Integrationscode. --- ## ⚙️ Konfigurationsreferenz
Klicken zum Aufklappen der vollstaendigen Konfigurationsreferenz ```yaml # === Projekt === project: name: "my-research" # Projektbezeichner mode: "docs-first" # docs-first | semi-auto | full-auto # === Forschung === research: topic: "..." # Forschungsthema (erforderlich) domains: ["ml", "nlp"] # Forschungsdomaenen fuer Literatursuche daily_paper_count: 8 # Ziel-Paperzahl pro Suchabfrage quality_threshold: 4.0 # Mindestqualitaetswert fuer Paper # === Laufzeit === runtime: timezone: "America/New_York" # Fuer Zeitstempel max_parallel_tasks: 3 # Limit gleichzeitiger Experimente approval_timeout_hours: 12 # Gate-Stufen-Timeout retry_limit: 2 # Wiederholungsanzahl bei Stufenfehler # === LLM === llm: provider: "openai-compatible" # openai | openrouter | deepseek | minimax | acp | openai-compatible base_url: "https://..." # API-Endpunkt (erforderlich fuer openai-compatible) api_key_env: "OPENAI_API_KEY" # Umgebungsvariable fuer API-Schluessel (erforderlich fuer openai-compatible) api_key: "" # Oder Schluessel direkt eintragen primary_model: "gpt-4o" # Primaeres Modell fallback_models: ["gpt-4o-mini"] # Fallback-Kette s2_api_key: "" # Semantic Scholar API-Schluessel (optional, hoehere Rate-Limits) acp: # Nur verwendet wenn provider: "acp" agent: "claude" # ACP-Agent-CLI-Befehl (claude, codex, gemini, etc.) cwd: "." # Arbeitsverzeichnis fuer den Agenten # === Experiment === experiment: mode: "sandbox" # simulated | sandbox | docker | ssh_remote time_budget_sec: 300 # Max. Ausfuehrungszeit pro Durchlauf (Standard: 300s) max_iterations: 10 # Max. 
Optimierungsiterationen metric_key: "val_loss" # Primaerer Metrikname metric_direction: "minimize" # minimize | maximize sandbox: python_path: ".venv/bin/python" gpu_required: false allowed_imports: [math, random, json, csv, numpy, torch, sklearn] max_memory_mb: 4096 docker: image: "researchclaw/experiment:latest" network_policy: "setup_only" # none | setup_only | pip_only | full gpu_enabled: true memory_limit_mb: 8192 auto_install_deps: true # Automatische Import-Erkennung → requirements.txt ssh_remote: host: "" # GPU-Server-Hostname gpu_ids: [] # Verfuegbare GPU-IDs remote_workdir: "/tmp/researchclaw_experiments" opencode: # OpenCode Beast Mode (auto-installiert ueber `researchclaw setup`) enabled: true # Hauptschalter (Standard: true) auto: true # Auto-Ausloesung ohne Bestaetigung (Standard: true) complexity_threshold: 0.2 # 0.0-1.0 — hoeher = nur bei komplexen Experimenten ausloesen model: "" # Modell ueberschreiben (leer = llm.primary_model verwenden) timeout_sec: 600 # Max. Sekunden fuer OpenCode-Generierung max_retries: 1 # Wiederholungsanzahl bei Fehler workspace_cleanup: true # Temporaeren Workspace nach Sammlung entfernen # === Export === export: target_conference: "neurips_2025" # neurips_2025 | iclr_2026 | icml_2026 authors: "Anonymous" bib_file: "references" # === Prompts === prompts: custom_file: "" # Pfad zur benutzerdefinierten Prompts-YAML (leer = Standardwerte) # === Sicherheit === security: hitl_required_stages: [5, 9, 20] # Stufen, die menschliche Genehmigung erfordern allow_publish_without_approval: false redact_sensitive_logs: true # === Wissensdatenbank === knowledge_base: backend: "markdown" # markdown | obsidian root: "docs/kb" # === Benachrichtigungen === notifications: channel: "console" # console | discord | slack target: "" # === MetaClaw Bridge (Optional) === metaclaw_bridge: enabled: false # Auf true setzen fuer durchlaufuebergreifendes Lernen proxy_url: "http://localhost:30000" # MetaClaw-Proxy-URL skills_dir: "~/.metaclaw/skills" # 
Wo arc-* Skills gespeichert werden fallback_url: "" # Direkter LLM-Fallback wenn Proxy nicht erreichbar fallback_api_key: "" # API-Schluessel fuer Fallback-Endpunkt lesson_to_skill: enabled: true # Lektionen automatisch in Skills konvertieren min_severity: "warning" # Mindestschwere fuer Konvertierung max_skills_per_run: 3 # Max. neue Skills pro Pipeline-Durchlauf # === OpenClaw Bridge === openclaw_bridge: use_cron: false # Geplante Forschungsdurchlaeufe use_message: false # Fortschrittsbenachrichtigungen use_memory: false # Sitzungsuebergreifende Wissenspersistenz use_sessions_spawn: false # Parallele Sub-Sessions starten use_web_fetch: false # Live-Websuche use_browser: false # Browserbasierte Paper-Sammlung ```
--- ## 🙏 Danksagungen Inspiriert von: - 🔬 [AI Scientist](https://github.com/SakanaAI/AI-Scientist) (Sakana AI) — Pionier der automatisierten Forschung - 🧠 [AutoResearch](https://github.com/karpathy/autoresearch) (Andrej Karpathy) — End-to-End-Forschungsautomatisierung - 🌐 [FARS](https://analemma.ai/blog/introducing-fars/) (Analemma) — Fully Automated Research System --- ## 📄 Lizenz MIT — siehe [LICENSE](../LICENSE) fuer Details. --- ## 📌 Zitation Wenn du AutoResearchClaw nuetzlich findest, zitiere bitte: ```bibtex @misc{liu2026autoresearchclaw, author = {Liu, Jiaqi and Xia, Peng and Han, Siwei and Qiu, Shi and Zhang, Letian and Chen, Guiming and Tu, Haoqin and Yang, Xinyu and Zhou, Jiawei and Zhu, Hongtu and Li, Yun and Zhou, Yuyin and Zheng, Zeyu and Xie, Cihang and Ding, Mingyu and Yao, Huaxiu}, title = {AutoResearchClaw: Fully Autonomous Research from Idea to Paper}, year = {2026}, organization = {GitHub}, url = {https://github.com/aiming-lab/AutoResearchClaw}, } ```

Gebaut mit 🦞 vom AutoResearchClaw-Team

================================================ FILE: docs/README_ES.md ================================================

AutoResearchClaw Logo

Comparte una idea. Obten un articulo. Totalmente autonomo & autoevolutivo.

Chatea con OpenClaw: "Investiga X" → hecho.

AutoResearchClaw Framework

MIT License Python 3.11+ 1823 Tests Passed GitHub OpenClaw Compatible Discord

🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية

🏆 Galeria de articulos · 📖 Guia de integracion · 💬 Comunidad Discord

---
Sample Paper 🏆 Galeria de articulos generados

8 articulos en 8 dominios — matematicas, estadistica, biologia, computacion, NLP, RL, vision, robustez — generados de forma completamente autonoma sin intervencion humana.

View Showcase
--- > **🧪 Buscamos testers!** Prueba el pipeline con tu propia idea de investigacion — de cualquier campo — y [cuentanos que piensas](TESTER_GUIDE.md). Tu feedback da forma directamente a la proxima version. **[→ Testing Guide](TESTER_GUIDE.md)** | **[→ 中文测试指南](TESTER_GUIDE_CN.md)** | **[→ 日本語テストガイド](TESTER_GUIDE_JA.md)** --- ## 🔥 News - **[03/22/2026]** [v0.3.2](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.2) — **Soporte multiplataforma + estabilidad mayor** — AutoResearchClaw ahora funciona con cualquier agente compatible con ACP (Claude Code, Codex CLI, Copilot CLI, Gemini CLI, Kimi CLI) y soporta plataformas de mensajeria (Discord, Telegram, Lark, WeChat) via el puente OpenClaw. Nuevo backend de generacion de codigo CLI-agent que delega las Stages 10 y 13 a agentes CLI externos con control de presupuesto y gestion de timeouts. Incluye sistema anti-fabricacion (VerifiedRegistry + bucle de diagnostico y reparacion), 100+ correcciones de bugs, refactorizacion modular del executor, auto-deteccion de `--resume`, endurecimiento de reintentos LLM y correcciones de la comunidad. - **[03/18/2026]** [v0.3.1](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.1) — **OpenCode Beast Mode + Community Contributions** — New "Beast Mode" routes complex code generation to [OpenCode](https://github.com/anomalyco/opencode) with automatic complexity scoring and graceful fallback. Added Novita AI provider support, thread-safety hardening, improved LLM output parsing robustness, and 20+ bug fixes from community PRs and internal audit. - **[03/17/2026]** [v0.3.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.0) — **MetaClaw Integration** — AutoResearchClaw now supports [MetaClaw](https://github.com/aiming-lab/MetaClaw) cross-run learning: pipeline failures → structured lessons → reusable skills, injected into all 23 stages. **+18.3%** robustness in controlled experiments. 
Opt-in (`metaclaw_bridge.enabled: true`), fully backward-compatible. See [Integration Guide](#-integracion-metaclaw). - **[03/16/2026]** [v0.2.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.2.0) — Three multi-agent subsystems (CodeAgent, BenchmarkAgent, FigureAgent), hardened Docker sandbox with network-policy-aware execution, 4-round paper quality audit (AI-slop detection, 7-dim review scoring, NeurIPS checklist), and 15+ bug fixes from production runs. - **[03/15/2026]** [v0.1.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.1.0) — We release AutoResearchClaw: a fully autonomous 23-stage research pipeline that turns a single research idea into a conference-ready paper. No human intervention required. --- ## ⚡ Un comando. Un articulo. ```bash pip install -e . && researchclaw setup && researchclaw init && researchclaw run --topic "Your research idea here" --auto-approve ``` --- ## 🤔 Que es esto? **Tu lo piensas. AutoResearchClaw lo escribe.** Proporciona un tema de investigacion — recibe un articulo academico completo con literatura real de OpenAlex, Semantic Scholar y arXiv, experimentos en sandbox adaptados al hardware (deteccion automatica GPU/MPS/CPU), analisis estadistico, revision multi-agentes, y LaTeX listo para conferencia orientado a NeurIPS/ICML/ICLR. Sin supervision. Sin copiar y pegar. Sin referencias alucinadas.
📄paper_draft.mdArticulo academico completo (Introduccion, Trabajo relacionado, Metodo, Experimentos, Resultados, Conclusion)
📐paper.texLaTeX listo para conferencia (plantillas NeurIPS / ICLR / ICML)
📚references.bibReferencias BibTeX reales de OpenAlex, Semantic Scholar y arXiv — auto-depuradas para coincidir con las citas en linea
🔍verification_report.jsonVerificacion de integridad + relevancia de citas en 4 capas (arXiv, CrossRef, DataCite, LLM)
🧪experiment runs/Codigo generado + resultados en sandbox + metricas JSON estructuradas
📊charts/Graficos de comparacion de condiciones auto-generados con barras de error e intervalos de confianza
📝reviews.mdRevision por pares multi-agente con verificacion de consistencia metodologia-evidencia
🧬evolution/Lecciones de auto-aprendizaje extraidas de cada ejecucion
📦deliverables/Todos los entregables finales en una sola carpeta — listos para compilar en Overleaf
El pipeline se ejecuta **de principio a fin sin intervencion humana**. Cuando los experimentos fallan, se auto-repara. Cuando las hipotesis no se sostienen, pivotea. Cuando las citas son falsas, las elimina. 🌍 **Ejecutalo en cualquier lugar.** AutoResearchClaw no esta atado a una sola plataforma. Usalo de forma independiente por CLI, conectalo a [OpenClaw](https://github.com/openclaw/openclaw), o integralo con cualquier agente compatible con ACP — 🤖 Claude Code, 💻 Codex CLI, 🐙 Copilot CLI, ♊ Gemini CLI, 🌙 Kimi CLI, y mas. Gracias al puente de mensajeria de OpenClaw, puedes iniciar una investigacion completa desde 💬 Discord, ✈️ Telegram, 🐦 Lark (飞书), 💚 WeChat, o cualquier plataforma que tu equipo ya utilice. Un tema de entrada, un paper de salida — sin importar donde lo escribas. --- ## 🚀 Inicio rapido ```bash # 1. Clonar e instalar git clone https://github.com/aiming-lab/AutoResearchClaw.git cd AutoResearchClaw python3 -m venv .venv && source .venv/bin/activate pip install -e . # 2. Setup (interactivo — instala OpenCode beast mode, verifica Docker/LaTeX) researchclaw setup # 3. Configurar researchclaw init # Interactivo: elegir proveedor LLM, crea config.arc.yaml # O manualmente: cp config.researchclaw.example.yaml config.arc.yaml # 4. Ejecutar export OPENAI_API_KEY="sk-..." researchclaw run --config config.arc.yaml --topic "Your research idea" --auto-approve ``` Salida → `artifacts/rc-YYYYMMDD-HHMMSS-/deliverables/` — LaTeX listo para compilar, BibTeX, codigo experimental, graficos.
📝 Configuracion minima requerida ```yaml project: name: "my-research" research: topic: "Your research topic here" llm: base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" primary_model: "gpt-4o" fallback_models: ["gpt-4o-mini"] experiment: mode: "sandbox" sandbox: python_path: ".venv/bin/python" ```
--- ## 🧠 Que lo hace diferente | Capacidad | Como funciona | |-----------|--------------| | **🔄 Bucle PIVOT / REFINE** | La etapa 15 decide de forma autonoma: PROCEED, REFINE (ajustar parametros) o PIVOT (nueva direccion). Artefactos auto-versionados. | | **🤖 Debate multi-agente** | La generacion de hipotesis, el analisis de resultados y la revision por pares utilizan cada uno debate estructurado multi-perspectiva. | | **🧬 Auto-aprendizaje** | Lecciones extraidas por ejecucion (justificacion de decisiones, advertencias de ejecucion, anomalias de metricas) con decaimiento temporal de 30 dias. Las ejecuciones futuras aprenden de errores pasados. | | **📚 Base de conocimiento** | Cada ejecucion construye una KB estructurada en 6 categorias (decisiones, experimentos, hallazgos, literatura, preguntas, revisiones). | | **🛡️ Vigilante Sentinel** | Monitor de calidad en segundo plano: deteccion NaN/Inf, consistencia articulo-evidencia, puntuacion de relevancia de citas, guardia anti-fabricacion. | --- ## 🦞 Integracion con OpenClaw **AutoResearchClaw es un servicio compatible con [OpenClaw](https://github.com/openclaw/openclaw).** Instalalo en OpenClaw y lanza investigacion autonoma con un solo mensaje — o usalo de forma independiente via CLI, Claude Code o cualquier asistente de programacion con IA.
### 🚀 Uso con OpenClaw (Recomendado) Si ya usas [OpenClaw](https://github.com/openclaw/openclaw) como tu asistente de IA: ``` 1️⃣ Comparte la URL del repositorio de GitHub con OpenClaw 2️⃣ OpenClaw lee automaticamente RESEARCHCLAW_AGENTS.md → comprende el pipeline 3️⃣ Di: "Research [tu tema]" 4️⃣ Listo — OpenClaw clona, instala, configura, ejecuta y devuelve los resultados ``` **Eso es todo.** OpenClaw se encarga de `git clone`, `pip install`, configuracion y ejecucion del pipeline automaticamente. Tu solo chateas.
💡 Que sucede internamente 1. OpenClaw lee `RESEARCHCLAW_AGENTS.md` → aprende el rol de orquestador de investigacion 2. OpenClaw lee `README.md` → comprende la instalacion y la estructura del pipeline 3. OpenClaw copia `config.researchclaw.example.yaml` → `config.yaml` 4. Solicita tu clave API del LLM (o usa tu variable de entorno) 5. Ejecuta `pip install -e .` + `researchclaw run --topic "..." --auto-approve` 6. Devuelve el articulo, LaTeX, experimentos y citas
### 🔌 Bridge de OpenClaw (Avanzado) Para una integracion mas profunda, AutoResearchClaw incluye un **sistema de adaptadores bridge** con 6 capacidades opcionales: ```yaml # config.arc.yaml openclaw_bridge: use_cron: true # ⏰ Ejecuciones de investigacion programadas use_message: true # 💬 Notificaciones de progreso (Discord/Slack/Telegram) use_memory: true # 🧠 Persistencia de conocimiento entre sesiones use_sessions_spawn: true # 🔀 Generar sub-sesiones paralelas para etapas concurrentes use_web_fetch: true # 🌐 Busqueda web en vivo durante la revision de literatura use_browser: false # 🖥️ Recopilacion de articulos basada en navegador ``` Cada flag activa un protocolo de adaptador tipado. Cuando OpenClaw proporciona estas capacidades, los adaptadores las consumen sin cambios en el codigo. Consulta [`integration-guide.md`](integration-guide.md) para mas detalles. ### ACP (Agent Client Protocol) AutoResearchClaw puede usar **cualquier agente de programacion compatible con ACP** como backend LLM — sin necesidad de claves API. El agente se comunica via [acpx](https://github.com/openclaw/acpx), manteniendo una sola sesion persistente a traves de las 23 etapas del pipeline. | Agente | Comando | Notas | |--------|---------|-------| | Claude Code | `claude` | Anthropic | | Codex CLI | `codex` | OpenAI | | Copilot CLI | `gh` | GitHub | | Gemini CLI | `gemini` | Google | | OpenCode | `opencode` | SST | | Kimi CLI | `kimi` | Moonshot | ```yaml # config.yaml — ejemplo ACP llm: provider: "acp" acp: agent: "claude" # Cualquier comando CLI de agente compatible con ACP cwd: "." # Directorio de trabajo para el agente # No se necesita base_url ni api_key — el agente gestiona su propia autenticacion. 
``` ```bash # Solo ejecuta — el agente usa sus propias credenciales researchclaw run --config config.yaml --topic "Your research idea" --auto-approve ``` ### 🛠️ Otras formas de ejecucion | Metodo | Como | |--------|------| | **CLI independiente** | `researchclaw setup` → `researchclaw init` → `researchclaw run --topic "..." --auto-approve` | | **API de Python** | `from researchclaw.pipeline import Runner; Runner(config).run()` | | **Claude Code** | Lee `RESEARCHCLAW_CLAUDE.md` — solo di *"Run research on [tema]"* | | **Copilot CLI** | `researchclaw run --topic "..."` con `llm.acp.agent: "gh"` | | **OpenCode** | Lee `.claude/skills/` — la misma interfaz en lenguaje natural | | **Cualquier CLI de IA** | Proporciona `RESEARCHCLAW_AGENTS.md` como contexto → el agente se auto-configura | --- ## 🔬 Pipeline: 23 etapas, 8 fases ``` Fase A: Alcance de investigacion Fase E: Ejecucion de experimentos 1. TOPIC_INIT 12. EXPERIMENT_RUN 2. PROBLEM_DECOMPOSE 13. ITERATIVE_REFINE ← auto-reparacion Fase B: Descubrimiento de literatura Fase F: Analisis y decision 3. SEARCH_STRATEGY 14. RESULT_ANALYSIS ← multi-agente 4. LITERATURE_COLLECT ← API real 15. RESEARCH_DECISION ← PIVOT/REFINE 5. LITERATURE_SCREEN [compuerta] 6. KNOWLEDGE_EXTRACT Fase G: Redaccion del articulo 16. PAPER_OUTLINE Fase C: Sintesis de conocimiento 17. PAPER_DRAFT 7. SYNTHESIS 18. PEER_REVIEW ← verif. evidencia 8. HYPOTHESIS_GEN ← debate 19. PAPER_REVISION Fase D: Diseno experimental Fase H: Finalizacion 9. EXPERIMENT_DESIGN [compuerta] 20. QUALITY_GATE [compuerta] 10. CODE_GENERATION 21. KNOWLEDGE_ARCHIVE 11. RESOURCE_PLANNING 22. EXPORT_PUBLISH ← LaTeX 23. CITATION_VERIFY ← verif. relevancia ``` > Las **etapas con compuerta** (5, 9, 20) se pausan para aprobacion humana o se auto-aprueban con `--auto-approve`. Al rechazar, el pipeline retrocede. > **Bucles de decision**: La etapa 15 puede activar REFINE (→ Etapa 13) o PIVOT (→ Etapa 8), con versionado automatico de artefactos.
📋 Que hace cada fase | Fase | Que sucede | |------|-----------| | **A: Alcance** | El LLM descompone el tema en un arbol de problemas estructurado con preguntas de investigacion | | **A+: Hardware** | Deteccion automatica de GPU (NVIDIA CUDA / Apple MPS / solo CPU), advierte si el hardware local es limitado, adapta la generacion de codigo en consecuencia | | **B: Literatura** | Busqueda multi-fuente (OpenAlex → Semantic Scholar → arXiv) de articulos reales, filtrado por relevancia, extraccion de fichas de conocimiento | | **C: Sintesis** | Agrupa hallazgos, identifica brechas de investigacion, genera hipotesis comprobables mediante debate multi-agente | | **D: Diseno** | Disena plan experimental, genera Python ejecutable adaptado al hardware (nivel de GPU → seleccion de paquetes), estima necesidades de recursos | | **E: Ejecucion** | Ejecuta experimentos en sandbox, detecta NaN/Inf y errores en tiempo de ejecucion, auto-repara codigo mediante reparacion LLM dirigida | | **F: Analisis** | Analisis multi-agente de resultados; decision autonoma PROCEED / REFINE / PIVOT con justificacion | | **G: Redaccion** | Esquema → redaccion seccion por seccion (5,000-6,500 palabras) → revision por pares (con consistencia metodologia-evidencia) → revision con guardia de longitud | | **H: Finalizacion** | Compuerta de calidad, archivado de conocimiento, exportacion LaTeX con plantilla de conferencia, verificacion de integridad + relevancia de citas |
--- ## ✨ Caracteristicas principales | Caracteristica | Descripcion | |----------------|------------| | **📚 Literatura multi-fuente** | Articulos reales de OpenAlex, Semantic Scholar y arXiv — expansion de consultas, deduplicacion, circuit breaker con degradacion gradual | | **🔍 Verificacion de citas en 4 capas** | Verificacion de arXiv ID → DOI CrossRef/DataCite → coincidencia de titulo Semantic Scholar → puntuacion de relevancia LLM. Referencias alucinadas auto-eliminadas. | | **🖥️ Ejecucion adaptada al hardware** | Deteccion automatica de GPU (NVIDIA CUDA / Apple MPS / solo CPU) y adaptacion de la generacion de codigo, imports y escala experimental | | **🦾 OpenCode Beast Mode** | Los experimentos complejos se enrutan automaticamente a [OpenCode](https://github.com/anomalyco/opencode) — genera proyectos multi-archivo con arquitecturas personalizadas, bucles de entrenamiento y estudios de ablacion. Instalacion via `researchclaw setup`. | | **🧪 Experimentos en sandbox** | Codigo validado por AST, harness inmutable, fallo rapido NaN/Inf, reparacion auto-curativa, refinamiento iterativo (hasta 10 rondas), captura de resultados parciales | | **📝 Redaccion de calidad conferencia** | Plantillas NeurIPS/ICML/ICLR, redaccion seccion por seccion (5,000-6,500 palabras), guardia anti-fabricacion, guardia de longitud en revision, enforcement anti-disclaimer | | **📐 Cambio de plantilla** | `neurips_2025`, `iclr_2026`, `icml_2026` — Markdown → LaTeX con formulas, tablas, figuras, referencias cruzadas, `\cite{}` | | **🚦 Compuertas de calidad** | 3 compuertas con intervencion humana posible (etapas 5, 9, 20) con retroceso. Omitir con `--auto-approve`. | --- ## 🧠 Integracion MetaClaw **AutoResearchClaw + [MetaClaw](https://github.com/aiming-lab/MetaClaw) = Un pipeline que aprende de cada ejecucion.** MetaClaw agrega **transferencia de conocimiento entre ejecuciones** a AutoResearchClaw. 
Cuando esta habilitado, el pipeline captura automaticamente lecciones de fallos y advertencias, las convierte en habilidades reutilizables, e inyecta esas habilidades en las 23 etapas del pipeline en ejecuciones posteriores — para que los mismos errores nunca se repitan. ### Como funciona ``` Ejecucion N se ejecuta → fallos/advertencias capturados como Lecciones ↓ MetaClaw Leccion → conversion a Habilidad ↓ Archivos de habilidades arc-* almacenados en ~/.metaclaw/skills/ ↓ Ejecucion N+1 → build_overlay() inyecta habilidades en cada prompt LLM ↓ El LLM evita trampas conocidas → mayor calidad, menos reintentos ``` ### Configuracion rapida ```bash # 1. Instalar MetaClaw (si no esta instalado) pip install metaclaw # 2. Habilitar en tu configuracion ``` ```yaml # config.arc.yaml metaclaw_bridge: enabled: true proxy_url: "http://localhost:30000" # Proxy MetaClaw (opcional) skills_dir: "~/.metaclaw/skills" # Donde se almacenan las habilidades fallback_url: "https://api.openai.com/v1" # Fallback directo al LLM fallback_api_key: "" # Clave API para la URL de fallback lesson_to_skill: enabled: true min_severity: "warning" # Convertir advertencias + errores max_skills_per_run: 3 ``` ```bash # 3. Ejecuta como siempre — MetaClaw funciona de forma transparente researchclaw run --config config.arc.yaml --topic "Your idea" --auto-approve ``` Despues de cada ejecucion, revisa `~/.metaclaw/skills/arc-*/SKILL.md` para ver las habilidades que tu pipeline ha aprendido. 
### Resultados experimentales En experimentos controlados A/B (mismo tema, mismo LLM, misma configuracion): | Metrica | Linea base | Con MetaClaw | Mejora | |---------|------------|--------------|--------| | Tasa de reintento de etapas | 10.5% | 7.9% | **-24.8%** | | Conteo de ciclos REFINE | 2.0 | 1.2 | **-40.0%** | | Completacion de etapas del pipeline | 18/19 | 19/19 | **+5.3%** | | Puntuacion de robustez global (compuesta) | 0.714 | 0.845 | **+18.3%** | > La puntuacion de robustez compuesta es un promedio ponderado de la tasa de completacion de etapas (40%), reduccion de reintentos (30%) y eficiencia de ciclos REFINE (30%). ### Retrocompatibilidad - **Por defecto: DESACTIVADO.** Si `metaclaw_bridge` esta ausente o `enabled: false`, el pipeline se comporta exactamente como antes. - **Sin nuevas dependencias.** MetaClaw es opcional — el pipeline base funciona sin el. - **Los 1,823 tests existentes pasan** con el codigo de integracion presente. --- ## ⚙️ Referencia de configuracion
Haz clic para expandir la referencia completa de configuracion ```yaml # === Proyecto === project: name: "my-research" # Identificador del proyecto mode: "docs-first" # docs-first | semi-auto | full-auto # === Investigacion === research: topic: "..." # Tema de investigacion (requerido) domains: ["ml", "nlp"] # Dominios de investigacion para busqueda de literatura daily_paper_count: 8 # Articulos objetivo por consulta de busqueda quality_threshold: 4.0 # Puntuacion minima de calidad para articulos # === Tiempo de ejecucion === runtime: timezone: "America/New_York" # Para marcas de tiempo max_parallel_tasks: 3 # Limite de experimentos concurrentes approval_timeout_hours: 12 # Timeout de etapas con compuerta retry_limit: 2 # Numero de reintentos por fallo de etapa # === LLM === llm: provider: "openai-compatible" # openai | openrouter | deepseek | minimax | acp | openai-compatible base_url: "https://..." # Endpoint de API (requerido para openai-compatible) api_key_env: "OPENAI_API_KEY" # Variable de entorno para la clave API (requerido para openai-compatible) api_key: "" # O codifica la clave aqui directamente primary_model: "gpt-4o" # Modelo principal fallback_models: ["gpt-4o-mini"] # Cadena de fallback s2_api_key: "" # Clave API de Semantic Scholar (opcional, mayores limites de tasa) acp: # Solo se usa cuando provider: "acp" agent: "claude" # Comando CLI del agente ACP (claude, codex, gemini, etc.) cwd: "." 
# Directorio de trabajo para el agente # === Experimento === experiment: mode: "sandbox" # simulated | sandbox | docker | ssh_remote time_budget_sec: 300 # Tiempo maximo de ejecucion por corrida (por defecto: 300s) max_iterations: 10 # Maximo de iteraciones de optimizacion metric_key: "val_loss" # Nombre de la metrica principal metric_direction: "minimize" # minimize | maximize sandbox: python_path: ".venv/bin/python" gpu_required: false allowed_imports: [math, random, json, csv, numpy, torch, sklearn] max_memory_mb: 4096 docker: image: "researchclaw/experiment:latest" network_policy: "setup_only" # none | setup_only | pip_only | full gpu_enabled: true memory_limit_mb: 8192 auto_install_deps: true # Deteccion automatica de imports → requirements.txt ssh_remote: host: "" # Nombre de host del servidor GPU gpu_ids: [] # IDs de GPU disponibles remote_workdir: "/tmp/researchclaw_experiments" opencode: # OpenCode Beast Mode (auto-instalado via `researchclaw setup`) enabled: true # Interruptor principal (por defecto: true) auto: true # Auto-activacion sin confirmacion (por defecto: true) complexity_threshold: 0.2 # 0.0-1.0 — mas alto = solo se activa para experimentos complejos model: "" # Modelo a forzar (vacio = usa llm.primary_model) timeout_sec: 600 # Segundos maximos para generacion OpenCode max_retries: 1 # Numero de reintentos por fallo workspace_cleanup: true # Eliminar workspace temporal despues de recoleccion # === Exportacion === export: target_conference: "neurips_2025" # neurips_2025 | iclr_2026 | icml_2026 authors: "Anonymous" bib_file: "references" # === Prompts === prompts: custom_file: "" # Ruta a YAML de prompts personalizados (vacio = valores por defecto) # === Seguridad === security: hitl_required_stages: [5, 9, 20] # Etapas que requieren aprobacion humana allow_publish_without_approval: false redact_sensitive_logs: true # === Base de conocimiento === knowledge_base: backend: "markdown" # markdown | obsidian root: "docs/kb" # === Notificaciones === 
notifications: channel: "console" # console | discord | slack target: "" # === Puente MetaClaw (Opcional) === metaclaw_bridge: enabled: false # Establecer en true para habilitar aprendizaje entre ejecuciones proxy_url: "http://localhost:30000" # URL del proxy MetaClaw skills_dir: "~/.metaclaw/skills" # Donde se almacenan las habilidades arc-* fallback_url: "" # Fallback directo al LLM cuando el proxy esta caido fallback_api_key: "" # Clave API para el endpoint de fallback lesson_to_skill: enabled: true # Convertir lecciones en habilidades automaticamente min_severity: "warning" # Severidad minima para conversion max_skills_per_run: 3 # Max de nuevas habilidades por ejecucion del pipeline # === Bridge de OpenClaw === openclaw_bridge: use_cron: false # Ejecuciones de investigacion programadas use_message: false # Notificaciones de progreso use_memory: false # Persistencia de conocimiento entre sesiones use_sessions_spawn: false # Generar sub-sesiones paralelas use_web_fetch: false # Busqueda web en vivo use_browser: false # Recopilacion de articulos basada en navegador ```
--- ## 🙏 Agradecimientos Inspirado por: - 🔬 [AI Scientist](https://github.com/SakanaAI/AI-Scientist) (Sakana AI) — Pionero en investigacion automatizada - 🧠 [AutoResearch](https://github.com/karpathy/autoresearch) (Andrej Karpathy) — Automatizacion de investigacion de principio a fin - 🌐 [FARS](https://analemma.ai/blog/introducing-fars/) (Analemma) — Sistema de investigacion completamente automatizado --- ## 📄 Licencia MIT — consulta [LICENSE](../LICENSE) para mas detalles. --- ## 📌 Citacion Si encuentras AutoResearchClaw util, por favor cita: ```bibtex @misc{liu2026autoresearchclaw, author = {Liu, Jiaqi and Xia, Peng and Han, Siwei and Qiu, Shi and Zhang, Letian and Chen, Guiming and Tu, Haoqin and Yang, Xinyu and Zhou, Jiawei and Zhu, Hongtu and Li, Yun and Zhou, Yuyin and Zheng, Zeyu and Xie, Cihang and Ding, Mingyu and Yao, Huaxiu}, title = {AutoResearchClaw: Fully Autonomous Research from Idea to Paper}, year = {2026}, organization = {GitHub}, url = {https://github.com/aiming-lab/AutoResearchClaw}, } ```

Construido con 🦞 por el equipo de AutoResearchClaw

================================================ FILE: docs/README_FR.md ================================================

AutoResearchClaw Logo

Discutez une idee. Obtenez un article. Entierement autonome & auto-evolutif.

Discutez avec OpenClaw : "Recherche X" → termine.

AutoResearchClaw Framework

MIT License Python 3.11+ 1823 Tests Passed GitHub OpenClaw Compatible Discord

🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية

🏆 Vitrine des articles · 📖 Guide d'integration · 💬 Communaute Discord

---
Sample Paper 🏆 Vitrine des articles generes

8 articles couvrant 8 domaines — mathematiques, statistiques, biologie, informatique, NLP, RL, vision, robustesse — generes de maniere entierement autonome sans aucune intervention humaine.

View Showcase
--- > **🧪 Nous recherchons des testeurs !** Essayez le pipeline avec votre propre idee de recherche — dans n'importe quel domaine — et [dites-nous ce que vous en pensez](TESTER_GUIDE.md). Vos retours faconnent directement la prochaine version. **[→ Testing Guide](TESTER_GUIDE.md)** | **[→ 中文测试指南](TESTER_GUIDE_CN.md)** | **[→ 日本語テストガイド](TESTER_GUIDE_JA.md)** --- ## 🔥 News - **[03/22/2026]** [v0.3.2](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.2) — **Support multiplateforme + stabilite majeure** — AutoResearchClaw fonctionne desormais avec tout agent compatible ACP (Claude Code, Codex CLI, Copilot CLI, Gemini CLI, Kimi CLI) et supporte les plateformes de messagerie (Discord, Telegram, Lark, WeChat) via le pont OpenClaw. Nouveau backend de generation de code CLI-agent qui delegue les Stages 10 et 13 a des agents CLI externes avec controle de budget et gestion des timeouts. Inclut le systeme anti-fabrication (VerifiedRegistry + boucle diagnostic/reparation), 100+ corrections de bugs, refactoring modulaire de l'executor, auto-detection `--resume`, renforcement des retries LLM, et corrections communautaires. - **[03/18/2026]** [v0.3.1](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.1) — **OpenCode Beast Mode + Community Contributions** — New "Beast Mode" routes complex code generation to [OpenCode](https://github.com/anomalyco/opencode) with automatic complexity scoring and graceful fallback. Added Novita AI provider support, thread-safety hardening, improved LLM output parsing robustness, and 20+ bug fixes from community PRs and internal audit. - **[03/17/2026]** [v0.3.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.0) — **MetaClaw Integration** — AutoResearchClaw now supports [MetaClaw](https://github.com/aiming-lab/MetaClaw) cross-run learning: pipeline failures → structured lessons → reusable skills, injected into all 23 stages. **+18.3%** robustness in controlled experiments. 
Opt-in (`metaclaw_bridge.enabled: true`), fully backward-compatible. See [Integration Guide](#-integration-metaclaw). - **[03/16/2026]** [v0.2.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.2.0) — Three multi-agent subsystems (CodeAgent, BenchmarkAgent, FigureAgent), hardened Docker sandbox with network-policy-aware execution, 4-round paper quality audit (AI-slop detection, 7-dim review scoring, NeurIPS checklist), and 15+ bug fixes from production runs. - **[03/15/2026]** [v0.1.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.1.0) — We release AutoResearchClaw: a fully autonomous 23-stage research pipeline that turns a single research idea into a conference-ready paper. No human intervention required. --- ## ⚡ Une commande. Un article. ```bash pip install -e . && researchclaw setup && researchclaw init && researchclaw run --topic "Your research idea here" --auto-approve ``` --- ## 🤔 De quoi s'agit-il ? **Vous y pensez. AutoResearchClaw l'ecrit.** Donnez un sujet de recherche — recevez un article academique complet avec de la vraie litterature provenant d'OpenAlex, Semantic Scholar et arXiv, des experiences en sandbox adaptees au materiel (detection automatique GPU/MPS/CPU), une analyse statistique, une relecture multi-agents, et du LaTeX pret pour les conferences ciblant NeurIPS/ICML/ICLR. Aucune supervision. Aucun copier-coller. Aucune reference hallucinee.
📄paper_draft.mdArticle academique complet (Introduction, Travaux connexes, Methode, Experiences, Resultats, Conclusion)
📐paper.texLaTeX pret pour les conferences (templates NeurIPS / ICLR / ICML)
📚references.bibReferences BibTeX reelles provenant d'OpenAlex, Semantic Scholar et arXiv — auto-elaguees pour correspondre aux citations dans le texte
🔍verification_report.jsonVerification d'integrite et de pertinence des citations sur 4 couches (arXiv, CrossRef, DataCite, LLM)
🧪experiment runs/Code genere + resultats sandbox + metriques JSON structurees
📊charts/Graphiques de comparaison de conditions auto-generes avec barres d'erreur et intervalles de confiance
📝reviews.mdRelecture multi-agents avec verification de coherence methodologie-preuves
🧬evolution/Lecons d'auto-apprentissage extraites de chaque execution
📦deliverables/Tous les livrables finaux dans un seul dossier — pret a compiler pour Overleaf
Le pipeline s'execute **de bout en bout sans intervention humaine**. Quand les experiences echouent, il s'auto-repare. Quand les hypotheses ne tiennent pas, il pivote. Quand les citations sont fausses, il les supprime. 🌍 **Utilisable partout.** AutoResearchClaw n'est pas verrouille sur une seule plateforme. Utilisez-le en CLI autonome, connectez-le a [OpenClaw](https://github.com/openclaw/openclaw), ou integrez-le avec n'importe quel agent compatible ACP — 🤖 Claude Code, 💻 Codex CLI, 🐙 Copilot CLI, ♊ Gemini CLI, 🌙 Kimi CLI, et bien d'autres. Grace au pont de messagerie d'OpenClaw, vous pouvez lancer une recherche complete depuis 💬 Discord, ✈️ Telegram, 🐦 Lark (飞书), 💚 WeChat, ou la plateforme que votre equipe utilise deja. Un sujet en entree, un article en sortie — peu importe d'ou vous l'envoyez. --- ## 🚀 Demarrage rapide ```bash # 1. Cloner & installer git clone https://github.com/aiming-lab/AutoResearchClaw.git cd AutoResearchClaw python3 -m venv .venv && source .venv/bin/activate pip install -e . # 2. Setup (interactif — installe OpenCode beast mode, verifie Docker/LaTeX) researchclaw setup # 3. Configurer researchclaw init # Interactif : choisir le fournisseur LLM, cree config.arc.yaml # Ou manuellement : cp config.researchclaw.example.yaml config.arc.yaml # 4. Executer export OPENAI_API_KEY="sk-..." researchclaw run --config config.arc.yaml --topic "Your research idea" --auto-approve ``` Sortie → `artifacts/rc-YYYYMMDD-HHMMSS-/deliverables/` — LaTeX pret a compiler, BibTeX, code d'experience, graphiques.
📝 Configuration minimale requise ```yaml project: name: "my-research" research: topic: "Your research topic here" llm: base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" primary_model: "gpt-4o" fallback_models: ["gpt-4o-mini"] experiment: mode: "sandbox" sandbox: python_path: ".venv/bin/python" ```
--- ## 🧠 Ce qui le distingue | Capacite | Fonctionnement | |----------|---------------| | **🔄 Boucle PIVOT / REFINE** | L'etape 15 decide de maniere autonome : PROCEED, REFINE (ajuster les parametres) ou PIVOT (nouvelle direction). Artefacts auto-versionnes. | | **🤖 Debat multi-agents** | La generation d'hypotheses, l'analyse de resultats et la relecture par les pairs utilisent chacune un debat structure multi-perspectives. | | **🧬 Auto-apprentissage** | Lecons extraites a chaque execution (justification des decisions, avertissements d'execution, anomalies de metriques) avec decroissance temporelle a 30 jours. Les executions futures apprennent des erreurs passees. | | **📚 Base de connaissances** | Chaque execution construit une KB structuree couvrant 6 categories (decisions, experiences, resultats, litterature, questions, relectures). | | **🛡️ Sentinel Watchdog** | Moniteur de qualite en arriere-plan : detection NaN/Inf, coherence article-preuves, score de pertinence des citations, protection anti-fabrication. | --- ## 🦞 Integration OpenClaw **AutoResearchClaw est un service compatible [OpenClaw](https://github.com/openclaw/openclaw).** Installez-le dans OpenClaw et lancez une recherche autonome avec un seul message — ou utilisez-le de maniere autonome via CLI, Claude Code, ou tout assistant de codage IA.
### 🚀 Utilisation avec OpenClaw (recommande) Si vous utilisez deja [OpenClaw](https://github.com/openclaw/openclaw) comme assistant IA : ``` 1️⃣ Partagez l'URL du depot GitHub avec OpenClaw 2️⃣ OpenClaw lit automatiquement RESEARCHCLAW_AGENTS.md → comprend le pipeline 3️⃣ Dites : "Research [votre sujet]" 4️⃣ C'est fait — OpenClaw clone, installe, configure, execute et renvoie les resultats ``` **C'est tout.** OpenClaw gere `git clone`, `pip install`, la configuration et l'execution du pipeline automatiquement. Vous n'avez qu'a discuter.
💡 Ce qui se passe en coulisses 1. OpenClaw lit `RESEARCHCLAW_AGENTS.md` → apprend le role d'orchestrateur de recherche 2. OpenClaw lit `README.md` → comprend l'installation et la structure du pipeline 3. OpenClaw copie `config.researchclaw.example.yaml` → `config.yaml` 4. Demande votre cle API LLM (ou utilise votre variable d'environnement) 5. Execute `pip install -e .` + `researchclaw run --topic "..." --auto-approve` 6. Renvoie l'article, le LaTeX, les experiences et les citations
### 🔌 Pont OpenClaw (avance) Pour une integration plus poussee, AutoResearchClaw inclut un **systeme d'adaptateurs pont** avec 6 fonctionnalites optionnelles : ```yaml # config.arc.yaml openclaw_bridge: use_cron: true # ⏰ Executions de recherche planifiees use_message: true # 💬 Notifications de progression (Discord/Slack/Telegram) use_memory: true # 🧠 Persistance des connaissances inter-sessions use_sessions_spawn: true # 🔀 Lancement de sous-sessions paralleles pour les etapes concurrentes use_web_fetch: true # 🌐 Recherche web en direct pendant la revue de litterature use_browser: false # 🖥️ Collecte d'articles via navigateur ``` Chaque option active un protocole d'adaptateur type. Quand OpenClaw fournit ces fonctionnalites, les adaptateurs les consomment sans modification de code. Voir [`integration-guide.md`](integration-guide.md) pour tous les details. ### ACP (Agent Client Protocol) AutoResearchClaw peut utiliser **n'importe quel agent de codage compatible ACP** comme backend LLM — sans cle API requise. L'agent communique via [acpx](https://github.com/openclaw/acpx), en maintenant une session persistante unique a travers les 23 etapes du pipeline. | Agent | Commande | Notes | |-------|----------|-------| | Claude Code | `claude` | Anthropic | | Codex CLI | `codex` | OpenAI | | Copilot CLI | `gh` | GitHub | | Gemini CLI | `gemini` | Google | | OpenCode | `opencode` | SST | | Kimi CLI | `kimi` | Moonshot | ```yaml # config.yaml — exemple ACP llm: provider: "acp" acp: agent: "claude" # N'importe quel agent CLI compatible ACP cwd: "." # Repertoire de travail pour l'agent # Pas besoin de base_url ou api_key — l'agent gere sa propre authentification. 
``` ```bash # Executez simplement — l'agent utilise ses propres identifiants researchclaw run --config config.yaml --topic "Your research idea" --auto-approve ``` ### 🛠️ Autres methodes d'execution | Methode | Comment | |---------|---------| | **CLI autonome** | `researchclaw setup` → `researchclaw init` → `researchclaw run --topic "..." --auto-approve` | | **API Python** | `from researchclaw.pipeline import Runner; Runner(config).run()` | | **Claude Code** | Lit `RESEARCHCLAW_CLAUDE.md` — dites simplement *"Run research on [sujet]"* | | **Copilot CLI** | `researchclaw run --topic "..."` avec `llm.acp.agent: "gh"` | | **OpenCode** | Lit `.claude/skills/` — meme interface en langage naturel | | **Tout CLI IA** | Fournissez `RESEARCHCLAW_AGENTS.md` comme contexte → l'agent s'auto-initialise | --- ## 🔬 Pipeline : 23 etapes, 8 phases ``` Phase A : Cadrage de la recherche Phase E : Execution des experiences 1. TOPIC_INIT 12. EXPERIMENT_RUN 2. PROBLEM_DECOMPOSE 13. ITERATIVE_REFINE ← auto-reparation Phase B : Decouverte de litterature Phase F : Analyse et decision 3. SEARCH_STRATEGY 14. RESULT_ANALYSIS ← multi-agents 4. LITERATURE_COLLECT ← API reelle 15. RESEARCH_DECISION ← PIVOT/REFINE 5. LITERATURE_SCREEN [porte] 6. KNOWLEDGE_EXTRACT Phase G : Redaction de l'article 16. PAPER_OUTLINE Phase C : Synthese des connaissances 17. PAPER_DRAFT 7. SYNTHESIS 18. PEER_REVIEW ← verif. preuves 8. HYPOTHESIS_GEN ← debat 19. PAPER_REVISION Phase D : Conception experimentale Phase H : Finalisation 9. EXPERIMENT_DESIGN [porte] 20. QUALITY_GATE [porte] 10. CODE_GENERATION 21. KNOWLEDGE_ARCHIVE 11. RESOURCE_PLANNING 22. EXPORT_PUBLISH ← LaTeX 23. CITATION_VERIFY ← verif. pertinence ``` > **Etapes de validation** (5, 9, 20) : pause pour approbation humaine ou approbation automatique avec `--auto-approve`. En cas de rejet, le pipeline revient en arriere. 
> **Boucles de decision** : l'etape 15 peut declencher REFINE (→ etape 13) ou PIVOT (→ etape 8), avec versionnement automatique des artefacts.
📋 Ce que fait chaque phase | Phase | Ce qui se passe | |-------|-----------------| | **A : Cadrage** | Le LLM decompose le sujet en un arbre de problemes structure avec des questions de recherche | | **A+ : Materiel** | Detection automatique du GPU (NVIDIA CUDA / Apple MPS / CPU uniquement), avertissement si le materiel local est limite, adaptation de la generation de code en consequence | | **B : Litterature** | Recherche multi-sources (OpenAlex → Semantic Scholar → arXiv) de vrais articles, filtrage par pertinence, extraction de fiches de connaissances | | **C : Synthese** | Regroupement des resultats, identification des lacunes de recherche, generation d'hypotheses testables via debat multi-agents | | **D : Conception** | Conception du plan experimental, generation de Python executable adapte au materiel (niveau GPU → selection de packages), estimation des besoins en ressources | | **E : Execution** | Execution des experiences en sandbox, detection de NaN/Inf et bugs d'execution, auto-reparation du code via reparation ciblee par LLM | | **F : Analyse** | Analyse multi-agents des resultats ; decision autonome PROCEED / REFINE / PIVOT avec justification | | **G : Redaction** | Plan → redaction section par section (5 000-6 500 mots) → relecture (avec verification de coherence methodologie-preuves) → revision avec controle de longueur | | **H : Finalisation** | Porte qualite, archivage des connaissances, export LaTeX avec template de conference, verification d'integrite et de pertinence des citations |
--- ## ✨ Fonctionnalites cles | Fonctionnalite | Description | |----------------|------------| | **📚 Litterature multi-sources** | Vrais articles depuis OpenAlex, Semantic Scholar et arXiv — expansion de requetes, deduplication, disjoncteur avec degradation gracieuse | | **🔍 Verification des citations en 4 couches** | Verification arXiv ID → DOI CrossRef/DataCite → correspondance de titre Semantic Scholar → score de pertinence LLM. References hallucinées auto-supprimees. | | **🖥️ Execution adaptee au materiel** | Detection automatique du GPU (NVIDIA CUDA / Apple MPS / CPU uniquement) et adaptation de la generation de code, des imports et de l'echelle experimentale | | **🦾 OpenCode Beast Mode** | Les experiences complexes sont automatiquement dirigees vers [OpenCode](https://github.com/anomalyco/opencode) — genere des projets multi-fichiers avec architectures personnalisees, boucles d'entrainement et etudes d'ablation. Installation via `researchclaw setup`. | | **🧪 Experiences en sandbox** | Code valide par AST, harnais immuable, echec rapide NaN/Inf, reparation auto-guerison, raffinement iteratif (jusqu'a 10 tours), capture de resultats partiels | | **📝 Redaction de qualite conference** | Templates NeurIPS/ICML/ICLR, redaction section par section (5 000-6 500 mots), protection anti-fabrication, controle de longueur en revision, application anti-clause de non-responsabilite | | **📐 Changement de template** | `neurips_2025`, `iclr_2026`, `icml_2026` — Markdown → LaTeX avec formules, tableaux, figures, references croisees, `\cite{}` | | **🚦 Portes qualite** | 3 portes avec intervention humaine possible (etapes 5, 9, 20) avec retour en arriere. A passer avec `--auto-approve`. | --- ## 🧠 Integration MetaClaw **AutoResearchClaw + [MetaClaw](https://github.com/aiming-lab/MetaClaw) = Un pipeline qui apprend de chaque execution.** MetaClaw ajoute le **transfert de connaissances inter-executions** a AutoResearchClaw. 
Lorsqu'il est active, le pipeline capture automatiquement les lecons des echecs et avertissements, les convertit en competences reutilisables, et injecte ces competences dans les 23 etapes du pipeline lors des executions suivantes — pour ne jamais repeter les memes erreurs. ### Fonctionnement ``` Execution N s'execute → echecs/avertissements captures comme Lecons ↓ MetaClaw Lecon → conversion en Competence ↓ Fichiers de competences arc-* stockes dans ~/.metaclaw/skills/ ↓ Execution N+1 → build_overlay() injecte les competences dans chaque prompt LLM ↓ Le LLM evite les pieges connus → meilleure qualite, moins de tentatives ``` ### Configuration rapide ```bash # 1. Installer MetaClaw (si ce n'est pas deja fait) pip install metaclaw # 2. Activer dans votre configuration ``` ```yaml # config.arc.yaml metaclaw_bridge: enabled: true proxy_url: "http://localhost:30000" # Proxy MetaClaw (optionnel) skills_dir: "~/.metaclaw/skills" # Ou les competences sont stockees fallback_url: "https://api.openai.com/v1" # Repli direct vers le LLM fallback_api_key: "" # Cle API pour l'URL de repli lesson_to_skill: enabled: true min_severity: "warning" # Convertir avertissements + erreurs max_skills_per_run: 3 ``` ```bash # 3. Executez comme d'habitude — MetaClaw fonctionne de maniere transparente researchclaw run --config config.arc.yaml --topic "Your idea" --auto-approve ``` Apres chaque execution, verifiez `~/.metaclaw/skills/arc-*/SKILL.md` pour voir les competences que votre pipeline a apprises. 
### Resultats experimentaux Dans des experiences controlees A/B (meme sujet, meme LLM, meme configuration) : | Metrique | Reference | Avec MetaClaw | Amelioration | |----------|-----------|---------------|--------------| | Taux de relance des etapes | 10.5% | 7.9% | **-24.8%** | | Nombre de cycles REFINE | 2.0 | 1.2 | **-40.0%** | | Completion des etapes du pipeline | 18/19 | 19/19 | **+5.3%** | | Score de robustesse global (composite) | 0.714 | 0.845 | **+18.3%** | > Le score de robustesse composite est une moyenne ponderee du taux de completion des etapes (40%), de la reduction des tentatives (30%) et de l'efficacite des cycles REFINE (30%). ### Retrocompatibilite - **Par defaut : DESACTIVE.** Si `metaclaw_bridge` est absent ou `enabled: false`, le pipeline se comporte exactement comme avant. - **Aucune nouvelle dependance.** MetaClaw est optionnel — le pipeline de base fonctionne sans. - **Les 1 823 tests existants passent** avec le code d'integration present. --- ## ⚙️ Reference de configuration
Cliquez pour afficher la reference complete de configuration ```yaml # === Projet === project: name: "my-research" # Identifiant du projet mode: "docs-first" # docs-first | semi-auto | full-auto # === Recherche === research: topic: "..." # Sujet de recherche (requis) domains: ["ml", "nlp"] # Domaines de recherche pour la revue de litterature daily_paper_count: 8 # Nombre cible d'articles par requete de recherche quality_threshold: 4.0 # Score qualite minimum pour les articles # === Execution === runtime: timezone: "America/New_York" # Pour les horodatages max_parallel_tasks: 3 # Limite d'experiences concurrentes approval_timeout_hours: 12 # Timeout des etapes de validation retry_limit: 2 # Nombre de tentatives en cas d'echec d'etape # === LLM === llm: provider: "openai-compatible" # openai | openrouter | deepseek | minimax | acp | openai-compatible base_url: "https://..." # Point d'acces API (requis pour openai-compatible) api_key_env: "OPENAI_API_KEY" # Variable d'env pour la cle API (requis pour openai-compatible) api_key: "" # Ou cle en dur ici primary_model: "gpt-4o" # Modele principal fallback_models: ["gpt-4o-mini"] # Chaine de repli s2_api_key: "" # Cle API Semantic Scholar (optionnel, limites de debit plus elevees) acp: # Utilise uniquement quand provider: "acp" agent: "claude" # Commande CLI de l'agent ACP (claude, codex, gemini, etc.) cwd: "." 
# Repertoire de travail pour l'agent # === Experience === experiment: mode: "sandbox" # simulated | sandbox | docker | ssh_remote time_budget_sec: 300 # Temps d'execution max par lancement (defaut : 300s) max_iterations: 10 # Iterations d'optimisation max metric_key: "val_loss" # Nom de la metrique principale metric_direction: "minimize" # minimize | maximize sandbox: python_path: ".venv/bin/python" gpu_required: false allowed_imports: [math, random, json, csv, numpy, torch, sklearn] max_memory_mb: 4096 docker: image: "researchclaw/experiment:latest" network_policy: "setup_only" # none | setup_only | pip_only | full gpu_enabled: true memory_limit_mb: 8192 auto_install_deps: true # Detection auto des imports → requirements.txt ssh_remote: host: "" # Nom d'hote du serveur GPU gpu_ids: [] # Identifiants GPU disponibles remote_workdir: "/tmp/researchclaw_experiments" opencode: # OpenCode Beast Mode (auto-installe via `researchclaw setup`) enabled: true # Interrupteur principal (defaut : true) auto: true # Declenchement auto sans confirmation (defaut : true) complexity_threshold: 0.2 # 0.0-1.0 — plus eleve = ne se declenche que pour les experiences complexes model: "" # Modele a forcer (vide = utilise llm.primary_model) timeout_sec: 600 # Duree max en secondes pour la generation OpenCode max_retries: 1 # Nombre de tentatives en cas d'echec workspace_cleanup: true # Supprimer l'espace de travail temporaire apres collecte # === Export === export: target_conference: "neurips_2025" # neurips_2025 | iclr_2026 | icml_2026 authors: "Anonymous" bib_file: "references" # === Prompts === prompts: custom_file: "" # Chemin vers un YAML de prompts personnalises (vide = defauts) # === Securite === security: hitl_required_stages: [5, 9, 20] # Etapes necessitant une approbation humaine allow_publish_without_approval: false redact_sensitive_logs: true # === Base de connaissances === knowledge_base: backend: "markdown" # markdown | obsidian root: "docs/kb" # === Notifications === 
notifications: channel: "console" # console | discord | slack target: "" # === Pont MetaClaw (Optionnel) === metaclaw_bridge: enabled: false # Mettre a true pour activer l'apprentissage inter-executions proxy_url: "http://localhost:30000" # URL du proxy MetaClaw skills_dir: "~/.metaclaw/skills" # Ou les competences arc-* sont stockees fallback_url: "" # Repli direct vers le LLM quand le proxy est indisponible fallback_api_key: "" # Cle API pour le point d'acces de repli lesson_to_skill: enabled: true # Conversion automatique des lecons en competences min_severity: "warning" # Severite minimum pour la conversion max_skills_per_run: 3 # Max de nouvelles competences par execution # === Pont OpenClaw === openclaw_bridge: use_cron: false # Executions de recherche planifiees use_message: false # Notifications de progression use_memory: false # Persistance des connaissances inter-sessions use_sessions_spawn: false # Lancement de sous-sessions paralleles use_web_fetch: false # Recherche web en direct use_browser: false # Collecte d'articles via navigateur ```
--- ## 🙏 Remerciements Inspire par : - 🔬 [AI Scientist](https://github.com/SakanaAI/AI-Scientist) (Sakana AI) — Pionnier de la recherche automatisee - 🧠 [AutoResearch](https://github.com/karpathy/autoresearch) (Andrej Karpathy) — Automatisation de la recherche de bout en bout - 🌐 [FARS](https://analemma.ai/blog/introducing-fars/) (Analemma) — Systeme de recherche entierement automatise --- ## 📄 Licence MIT — voir [LICENSE](../LICENSE) pour les details. --- ## 📌 Citation Si vous trouvez AutoResearchClaw utile, veuillez citer : ```bibtex @misc{liu2026autoresearchclaw, author = {Liu, Jiaqi and Xia, Peng and Han, Siwei and Qiu, Shi and Zhang, Letian and Chen, Guiming and Tu, Haoqin and Yang, Xinyu and Zhou, Jiawei and Zhu, Hongtu and Li, Yun and Zhou, Yuyin and Zheng, Zeyu and Xie, Cihang and Ding, Mingyu and Yao, Huaxiu}, title = {AutoResearchClaw: Fully Autonomous Research from Idea to Paper}, year = {2026}, organization = {GitHub}, url = {https://github.com/aiming-lab/AutoResearchClaw}, } ```

Construit avec 🦞 par l'equipe AutoResearchClaw

================================================ FILE: docs/README_JA.md ================================================

AutoResearchClaw Logo

アイデアを話す。論文を手に入れる。完全自動 & 自己進化。

OpenClaw にチャットするだけ:「Xを研究して」→ 完了。

AutoResearchClaw Framework

MIT License Python 3.11+ 1823 Tests Passed GitHub OpenClaw Compatible Discord

🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية

🏆 論文ショーケース · 📖 統合ガイド · 💬 Discordコミュニティ

---
Sample Paper 🏆 生成論文ショーケース

8つの分野にわたる8本の論文 — 数学、統計、生物学、コンピューティング、NLP、RL、ビジョン、ロバスト性 — 人間の介入なしに完全自律生成。

View Showcase
--- > **🧪 テスターを募集しています!** あなた自身の研究アイデアで — どの分野からでも — パイプラインをお試しください。[ご意見をお聞かせください](TESTER_GUIDE.md)。あなたのフィードバックが次のバージョンに直接反映されます。 **[→ Testing Guide](TESTER_GUIDE.md)** | **[→ 中文测试指南](TESTER_GUIDE_CN.md)** | **[→ 日本語テストガイド](TESTER_GUIDE_JA.md)** --- ## 🔥 News - **[03/22/2026]** [v0.3.2](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.2) — **クロスプラットフォーム対応 + 安定性大幅向上** — ACP互換AIエージェントバックエンド(Claude Code、Codex CLI、Copilot CLI、Gemini CLI、Kimi CLI)に対応し、OpenClawブリッジ経由でメッセージングプラットフォーム(Discord、Telegram、Lark、WeChat)もサポート。新しいCLIエージェントコード生成バックエンドにより、ステージ10と13を外部CLIエージェントに委任し、予算制御とタイムアウト管理に対応。反データ捏造システム(VerifiedRegistry + 実験診断・修復ループ)、100件以上のバグ修正、モジュラーexecutorリファクタリング、`--resume`自動検出、LLMリトライ強化、コミュニティ報告の修正を含む。 - **[03/18/2026]** [v0.3.1](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.1) — **OpenCode Beast Mode + Community Contributions** — New "Beast Mode" routes complex code generation to [OpenCode](https://github.com/anomalyco/opencode) with automatic complexity scoring and graceful fallback. Added Novita AI provider support, thread-safety hardening, improved LLM output parsing robustness, and 20+ bug fixes from community PRs and internal audit. - **[03/17/2026]** [v0.3.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.0) — **MetaClaw Integration** — AutoResearchClaw now supports [MetaClaw](https://github.com/aiming-lab/MetaClaw) cross-run learning: pipeline failures → structured lessons → reusable skills, injected into all 23 stages. **+18.3%** robustness in controlled experiments. Opt-in (`metaclaw_bridge.enabled: true`), fully backward-compatible. See [Integration Guide](#-metaclaw-integration). 
- **[03/16/2026]** [v0.2.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.2.0) — Three multi-agent subsystems (CodeAgent, BenchmarkAgent, FigureAgent), hardened Docker sandbox with network-policy-aware execution, 4-round paper quality audit (AI-slop detection, 7-dim review scoring, NeurIPS checklist), and 15+ bug fixes from production runs. - **[03/15/2026]** [v0.1.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.1.0) — We release AutoResearchClaw: a fully autonomous 23-stage research pipeline that turns a single research idea into a conference-ready paper. No human intervention required. --- ## ⚡ ワンコマンド。ワンペーパー。 ```bash pip install -e . && researchclaw setup && researchclaw init && researchclaw run --topic "Your research idea here" --auto-approve ``` --- ## 🤔 これは何? **あなたが考える。AutoResearchClawが書く。** 研究トピックを入力するだけで — OpenAlex、Semantic Scholar、arXivからの実際の文献、ハードウェア対応のサンドボックス実験(GPU/MPS/CPUを自動検出)、統計分析、マルチエージェント査読、NeurIPS/ICML/ICLR対応の学会グレードLaTeXを含む完全な学術論文が得られます。監視不要。コピペ不要。幻覚された参考文献なし。
📄paper_draft.md完全な学術論文(序論、関連研究、手法、実験、結果、結論)
📐paper.tex学会対応LaTeX(NeurIPS / ICLR / ICMLテンプレート)
📚references.bibOpenAlex、Semantic Scholar、arXivからの実際のBibTeX参考文献 — 本文中の引用に合わせて自動整理
🔍verification_report.json4層の引用整合性 + 関連性検証(arXiv、CrossRef、DataCite、LLM)
🧪experiment runs/生成されたコード + サンドボックス実行結果 + 構造化JSONメトリクス
📊charts/誤差棒と信頼区間付きの条件比較チャートを自動生成
📝reviews.md手法-証拠の一貫性チェック付きマルチエージェント査読
🧬evolution/各実行から抽出された自己学習の教訓
📦deliverables/すべての最終成果物を1フォルダに集約 — Overleafですぐにコンパイル可能
パイプラインは**人手の介入なしにエンドツーエンドで実行**されます。実験が失敗すれば自己修復します。仮説が成り立たなければ方向転換します。引用が偽物なら削除します。 🌍 **どこでも実行可能。** AutoResearchClaw は特定のプラットフォームに縛られません。CLI でスタンドアロン実行、[OpenClaw](https://github.com/openclaw/openclaw) に接続、または ACP 互換の AI エージェント —— 🤖 Claude Code、💻 Codex CLI、🐙 Copilot CLI、♊ Gemini CLI、🌙 Kimi CLI など —— と連携できます。さらに OpenClaw のメッセージブリッジにより、💬 Discord、✈️ Telegram、🐦 Lark(飛書)、💚 WeChat など、チームが普段使っているプラットフォームから研究を開始できます。トピックを入力すれば、論文が出力されます —— どこからでも。 --- ## 🚀 クイックスタート ```bash # 1. クローン & インストール git clone https://github.com/aiming-lab/AutoResearchClaw.git cd AutoResearchClaw python3 -m venv .venv && source .venv/bin/activate pip install -e . # 2. セットアップ(対話式 — OpenCode Beast Modeのインストール、Docker/LaTeXの確認) researchclaw setup # 3. 設定 researchclaw init # 対話式:LLMプロバイダーを選択、config.arc.yamlを作成 # または手動:cp config.researchclaw.example.yaml config.arc.yaml # 4. 実行 export OPENAI_API_KEY="sk-..." researchclaw run --config config.arc.yaml --topic "Your research idea" --auto-approve ``` 出力先 → `artifacts/rc-YYYYMMDD-HHMMSS-/deliverables/` — コンパイル可能なLaTeX、BibTeX、実験コード、チャート。
📝 最小限の必要設定 ```yaml project: name: "my-research" research: topic: "Your research topic here" llm: base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" primary_model: "gpt-4o" fallback_models: ["gpt-4o-mini"] experiment: mode: "sandbox" sandbox: python_path: ".venv/bin/python" ```
--- ## 🧠 他と何が違うのか | 機能 | 仕組み | |------|--------| | **🔄 PIVOT / REFINE ループ** | ステージ15が自律的に判定:PROCEED、REFINE(パラメータ調整)、またはPIVOT(新方向)。成果物は自動バージョン管理。 | | **🤖 マルチエージェント討論** | 仮説生成、結果分析、査読のそれぞれで構造化された多視点討論を実施。 | | **🧬 自己学習** | 各実行から教訓を抽出(判定根拠、ランタイム警告、メトリクス異常)、30日の時間減衰付き。将来の実行が過去のミスから学習。 | | **📚 知識ベース** | 各実行で6カテゴリ(判定、実験、発見、文献、質問、レビュー)にわたる構造化知識ベースを構築。 | | **🛡️ Sentinel Watchdog** | バックグラウンド品質モニター:NaN/Inf検出、論文-証拠の一貫性、引用関連性スコアリング、捏造防止ガード。 | --- ## 🦞 OpenClaw統合 **AutoResearchClawは[OpenClaw](https://github.com/openclaw/openclaw)互換サービスです。** OpenClawにインストールして、メッセージ1つで自律研究を開始できます — CLI、Claude Code、その他のAIコーディングアシスタントを使ってスタンドアロンでも利用可能です。
### 🚀 OpenClawで使う(推奨) [OpenClaw](https://github.com/openclaw/openclaw)をすでにAIアシスタントとしてお使いの場合: ``` 1️⃣ GitHubリポジトリのURLをOpenClawに共有 2️⃣ OpenClawがRESEARCHCLAW_AGENTS.mdを自動読み込み → パイプラインを理解 3️⃣ 「Research [あなたのトピック]」と話しかける 4️⃣ 完了 — OpenClawがクローン、インストール、設定、実行、結果の返却まですべて自動実行 ``` **以上です。** OpenClawが`git clone`、`pip install`、設定、パイプライン実行を自動的に処理します。チャットするだけです。
💡 内部で何が起きているか 1. OpenClawが`RESEARCHCLAW_AGENTS.md`を読み取り → 研究オーケストレーターの役割を学習 2. OpenClawが`README.md`を読み取り → インストールとパイプライン構造を理解 3. OpenClawが`config.researchclaw.example.yaml` → `config.yaml`にコピー 4. LLMのAPIキーを要求(または環境変数を使用) 5. `pip install -e .` + `researchclaw run --topic "..." --auto-approve`を実行 6. 論文、LaTeX、実験、引用を返却
### 🔌 OpenClaw Bridge(上級) より深い統合のために、AutoResearchClawには6つのオプション機能を備えた**ブリッジアダプターシステム**が含まれています: ```yaml # config.arc.yaml openclaw_bridge: use_cron: true # ⏰ スケジュール実行 use_message: true # 💬 進捗通知(Discord/Slack/Telegram) use_memory: true # 🧠 セッション間の知識永続化 use_sessions_spawn: true # 🔀 並列サブセッションの生成 use_web_fetch: true # 🌐 文献レビュー中のライブWeb検索 use_browser: false # 🖥️ ブラウザベースの論文収集 ``` 各フラグは型付きアダプタープロトコルをアクティブにします。OpenClawがこれらの機能を提供する場合、アダプターはコード変更なしにそれらを利用します。詳細は[`integration-guide.md`](integration-guide.md)をご覧ください。 ### ACP (Agent Client Protocol) AutoResearchClawは**任意のACP互換コーディングエージェント**をLLMバックエンドとして使用できます — APIキーは不要です。エージェントは[acpx](https://github.com/openclaw/acpx)を介して通信し、全23パイプラインステージにわたって単一の永続セッションを維持します。 | エージェント | コマンド | 備考 | |-------------|---------|------| | Claude Code | `claude` | Anthropic | | Codex CLI | `codex` | OpenAI | | Copilot CLI | `gh` | GitHub | | Gemini CLI | `gemini` | Google | | OpenCode | `opencode` | SST | | Kimi CLI | `kimi` | Moonshot | ```yaml # config.yaml — ACP例 llm: provider: "acp" acp: agent: "claude" # 任意のACP互換エージェントCLIコマンド cwd: "." # エージェントの作業ディレクトリ # base_urlやapi_keyは不要 — エージェントが独自の認証を処理します。 ``` ```bash # そのまま実行 — エージェントは独自の認証情報を使用 researchclaw run --config config.yaml --topic "Your research idea" --auto-approve ``` ### 🛠️ その他の実行方法 | 方法 | 手順 | |------|------| | **スタンドアロンCLI** | `researchclaw setup` → `researchclaw init` → `researchclaw run --topic "..." --auto-approve` | | **Python API** | `from researchclaw.pipeline import Runner; Runner(config).run()` | | **Claude Code** | `RESEARCHCLAW_CLAUDE.md`を読み取り — *「Run research on [トピック]」*と言うだけ | | **Copilot CLI** | `researchclaw run --topic "..."` で `llm.acp.agent: "gh"` を使用 | | **OpenCode** | `.claude/skills/`を読み取り — 同じ自然言語インターフェース | | **任意のAI CLI** | `RESEARCHCLAW_AGENTS.md`をコンテキストとして提供 → エージェントが自動ブートストラップ | --- ## 🔬 パイプライン:23ステージ、8フェーズ ``` フェーズ A: 研究スコーピング フェーズ E: 実験実行 1. TOPIC_INIT 12. EXPERIMENT_RUN 2. PROBLEM_DECOMPOSE 13. ITERATIVE_REFINE ← 自己修復 フェーズ B: 文献探索 フェーズ F: 分析と判定 3. 
SEARCH_STRATEGY 14. RESULT_ANALYSIS ← マルチエージェント 4. LITERATURE_COLLECT ← 実API 15. RESEARCH_DECISION ← PIVOT/REFINE 5. LITERATURE_SCREEN [ゲート] 6. KNOWLEDGE_EXTRACT フェーズ G: 論文執筆 16. PAPER_OUTLINE フェーズ C: 知識統合 17. PAPER_DRAFT 7. SYNTHESIS 18. PEER_REVIEW ← 証拠チェック 8. HYPOTHESIS_GEN ← 討論 19. PAPER_REVISION フェーズ D: 実験設計 フェーズ H: 最終処理 9. EXPERIMENT_DESIGN [ゲート] 20. QUALITY_GATE [ゲート] 10. CODE_GENERATION 21. KNOWLEDGE_ARCHIVE 11. RESOURCE_PLANNING 22. EXPORT_PUBLISH ← LaTeX 23. CITATION_VERIFY ← 関連性チェック ``` > **ゲートステージ**(5, 9, 20)は人間の承認を待つか、`--auto-approve`で自動承認されます。却下時にはパイプラインがロールバックします。 > **判定ループ**: ステージ15はREFINE(→ ステージ13)またはPIVOT(→ ステージ8)をトリガーでき、成果物のバージョン管理が自動的に行われます。
📋 各フェーズの詳細 | フェーズ | 処理内容 | |---------|----------| | **A: スコーピング** | LLMがトピックを研究質問を含む構造化された問題ツリーに分解 | | **A+: ハードウェア** | GPU(NVIDIA CUDA / Apple MPS / CPUのみ)を自動検出、ローカルハードウェアが限定的な場合は警告、コード生成を適応 | | **B: 文献** | マルチソース検索(OpenAlex → Semantic Scholar → arXiv)で実際の論文を取得、関連性でスクリーニング、知識カードを抽出 | | **C: 統合** | 発見事項をクラスタリング、研究ギャップを特定、マルチエージェント討論で検証可能な仮説を生成 | | **D: 設計** | 実験計画を設計、ハードウェア対応の実行可能Python(GPUティア→パッケージ選択)を生成、リソース需要を推定 | | **E: 実行** | サンドボックスで実験を実行、NaN/Infとランタイムバグを検出、LLMによる的確な修復で自己修復 | | **F: 分析** | マルチエージェントによる結果分析;根拠付きの自律的PROCEED / REFINE / PIVOT判定 | | **G: 執筆** | アウトライン → セクション別ドラフト(5,000〜6,500語)→ 査読(手法-証拠の一貫性付き)→ 文字数ガード付き改訂 | | **H: 最終処理** | 品質ゲート、知識アーカイブ、学会テンプレート付きLaTeXエクスポート、引用の整合性 + 関連性検証 |
--- ## ✨ 主な機能 | 機能 | 説明 | |------|------| | **📚 マルチソース文献** | OpenAlex、Semantic Scholar、arXivからの実際の論文 — クエリ拡張、重複排除、三状態サーキットブレーカーとグレースフルデグラデーション | | **🔍 4層引用検証** | arXiv IDチェック → CrossRef/DataCite DOI → Semantic Scholarタイトルマッチ → LLM関連性スコアリング。幻覚された参考文献は自動削除。 | | **🖥️ ハードウェア対応実行** | GPU(NVIDIA CUDA / Apple MPS / CPUのみ)を自動検出し、コード生成、インポート、実験スケールを適応 | | **🦾 OpenCode Beast Mode** | 複雑な実験を自動的に[OpenCode](https://github.com/anomalyco/opencode)にルーティング — カスタムアーキテクチャ、トレーニングループ、アブレーション研究を含むマルチファイルプロジェクトを生成。`researchclaw setup`でインストール。 | | **🧪 サンドボックス実験** | AST検証済みコード、不変ハーネス、NaN/Inf早期停止、自己修復、反復的改良(最大10ラウンド)、部分結果の保持 | | **📝 学会グレード執筆** | NeurIPS/ICML/ICLRテンプレート、セクション別ドラフト(5,000〜6,500語)、捏造防止ガード、改訂文字数ガード、免責事項抑制 | | **📐 テンプレート切り替え** | `neurips_2025`、`iclr_2026`、`icml_2026` — Markdown → LaTeX(数式、表、図、相互参照、`\cite{}`対応) | | **🚦 品質ゲート** | 3つのHuman-in-the-loopゲート(ステージ5, 9, 20)、ロールバック対応。`--auto-approve`でスキップ。 | --- ## 🧠 MetaClaw統合 **AutoResearchClaw + [MetaClaw](https://github.com/aiming-lab/MetaClaw) = すべての実行から学習するパイプライン。** MetaClawはAutoResearchClawに**クロスラン知識転移**を追加します。有効にすると、パイプラインは失敗や警告から自動的に教訓を抽出し、再利用可能なスキルに変換し、後続の実行で全23ステージに注入します — 同じ過ちを二度と繰り返しません。 ### 仕組み ``` Run N executes → failures/warnings captured as Lessons ↓ MetaClaw Lesson → Skill conversion ↓ arc-* Skill files stored in ~/.metaclaw/skills/ ↓ Run N+1 → build_overlay() injects skills into every LLM prompt ↓ LLM avoids known pitfalls → higher quality, fewer retries ``` ### クイックセットアップ ```bash # 1. MetaClawをインストール(未インストールの場合) pip install metaclaw # 2. 設定で有効化 ``` ```yaml # config.arc.yaml metaclaw_bridge: enabled: true proxy_url: "http://localhost:30000" # MetaClawプロキシ(オプション) skills_dir: "~/.metaclaw/skills" # スキルの保存場所 fallback_url: "https://api.openai.com/v1" # 直接LLMフォールバック fallback_api_key: "" # フォールバックURLのAPIキー lesson_to_skill: enabled: true min_severity: "warning" # warning + errorを変換 max_skills_per_run: 3 ``` ```bash # 3. 
通常通り実行 — MetaClawは透過的に動作 researchclaw run --config config.arc.yaml --topic "Your idea" --auto-approve ``` 各実行後、`~/.metaclaw/skills/arc-*/SKILL.md`を確認して、パイプラインが学習したスキルを確認できます。 ### 実験結果 対照A/B実験(同じトピック、同じLLM、同じ設定): | メトリクス | ベースライン | MetaClaw使用時 | 改善 | |-----------|------------|---------------|------| | ステージリトライ率 | 10.5% | 7.9% | **-24.8%** | | Refineサイクル数 | 2.0 | 1.2 | **-40.0%** | | パイプラインステージ完了率 | 18/19 | 19/19 | **+5.3%** | | 総合ロバスト性スコア(複合) | 0.714 | 0.845 | **+18.3%** | > 複合ロバスト性スコアは、ステージ完了率(40%)、リトライ削減(30%)、Refineサイクル効率(30%)の加重平均です。 ### 後方互換性 - **デフォルト: オフ。** `metaclaw_bridge`が存在しないか`enabled: false`の場合、パイプラインは以前と全く同じように動作します。 - **新しい依存関係なし。** MetaClawはオプションです — コアパイプラインはMetaClawなしで動作します。 - **既存の1,823テストすべてがパス**(統合コードを含む)。 --- ## ⚙️ 設定リファレンス
クリックして設定リファレンスの全体を展開 ```yaml # === プロジェクト === project: name: "my-research" # プロジェクト識別子 mode: "docs-first" # docs-first | semi-auto | full-auto # === 研究 === research: topic: "..." # 研究トピック(必須) domains: ["ml", "nlp"] # 文献検索の研究ドメイン daily_paper_count: 8 # 検索クエリあたりの目標論文数 quality_threshold: 4.0 # 論文の最小品質スコア # === ランタイム === runtime: timezone: "America/New_York" # タイムスタンプ用 max_parallel_tasks: 3 # 同時実験数の上限 approval_timeout_hours: 12 # ゲートステージのタイムアウト retry_limit: 2 # ステージ失敗時のリトライ回数 # === LLM === llm: provider: "openai-compatible" # openai | openrouter | deepseek | minimax | acp | openai-compatible base_url: "https://..." # APIエンドポイント(openai-compatible必須) api_key_env: "OPENAI_API_KEY" # APIキーの環境変数(openai-compatible必須) api_key: "" # またはここにキーを直接記入 primary_model: "gpt-4o" # プライマリモデル fallback_models: ["gpt-4o-mini"] # フォールバックチェーン s2_api_key: "" # Semantic Scholar APIキー(オプション、レート制限緩和) acp: # provider: "acp" の場合のみ使用 agent: "claude" # ACP Agent CLIコマンド(claude, codex, gemini等) cwd: "." # エージェントの作業ディレクトリ # === 実験 === experiment: mode: "sandbox" # simulated | sandbox | docker | ssh_remote time_budget_sec: 300 # 実行あたりの最大実行時間(デフォルト: 300秒) max_iterations: 10 # 最大最適化反復回数 metric_key: "val_loss" # プライマリメトリクス名 metric_direction: "minimize" # minimize | maximize sandbox: python_path: ".venv/bin/python" gpu_required: false allowed_imports: [math, random, json, csv, numpy, torch, sklearn] max_memory_mb: 4096 docker: image: "researchclaw/experiment:latest" network_policy: "setup_only" # none | setup_only | pip_only | full gpu_enabled: true memory_limit_mb: 8192 auto_install_deps: true # importを自動検出 → requirements.txt ssh_remote: host: "" # GPUサーバーのホスト名 gpu_ids: [] # 利用可能なGPU ID remote_workdir: "/tmp/researchclaw_experiments" opencode: # OpenCode Beast Mode(`researchclaw setup`で自動インストール) enabled: true # マスタースイッチ(デフォルト: true) auto: true # 確認なしで自動トリガー(デフォルト: true) complexity_threshold: 0.2 # 0.0-1.0 — 高い = 複雑な実験のみトリガー model: "" # モデルのオーバーライド(空 = llm.primary_modelを使用) timeout_sec: 600 # 
OpenCode生成の最大秒数 max_retries: 1 # 失敗時のリトライ回数 workspace_cleanup: true # 収集後に一時ワークスペースを削除 # === エクスポート === export: target_conference: "neurips_2025" # neurips_2025 | iclr_2026 | icml_2026 authors: "Anonymous" bib_file: "references" # === プロンプト === prompts: custom_file: "" # カスタムプロンプトYAMLのパス(空 = デフォルト) # === セキュリティ === security: hitl_required_stages: [5, 9, 20] # 人間の承認が必要なステージ allow_publish_without_approval: false redact_sensitive_logs: true # === 知識ベース === knowledge_base: backend: "markdown" # markdown | obsidian root: "docs/kb" # === 通知 === notifications: channel: "console" # console | discord | slack target: "" # === MetaClaw Bridge(オプション)=== metaclaw_bridge: enabled: false # trueに設定してクロスラン学習を有効化 proxy_url: "http://localhost:30000" # MetaClawプロキシURL skills_dir: "~/.metaclaw/skills" # arc-*スキルの保存場所 fallback_url: "" # プロキシがダウン時の直接LLMフォールバック fallback_api_key: "" # フォールバックエンドポイントのAPIキー lesson_to_skill: enabled: true # 教訓をスキルに自動変換 min_severity: "warning" # 変換する最小重大度 max_skills_per_run: 3 # パイプライン実行あたりの最大新規スキル数 # === OpenClaw Bridge === openclaw_bridge: use_cron: false # スケジュール研究実行 use_message: false # 進捗通知 use_memory: false # セッション間の知識永続化 use_sessions_spawn: false # 並列サブセッションの生成 use_web_fetch: false # ライブWeb検索 use_browser: false # ブラウザベースの論文収集 ```
--- ## 🙏 謝辞 以下のプロジェクトに着想を得ています: - 🔬 [AI Scientist](https://github.com/SakanaAI/AI-Scientist) (Sakana AI) — 自動研究のパイオニア - 🧠 [AutoResearch](https://github.com/karpathy/autoresearch) (Andrej Karpathy) — エンドツーエンドの研究自動化 - 🌐 [FARS](https://analemma.ai/blog/introducing-fars/) (Analemma) — 完全自動研究システム --- ## 📄 ライセンス MIT — 詳細は[LICENSE](../LICENSE)をご覧ください。 --- ## 📌 引用 AutoResearchClawが役に立った場合は、以下を引用してください: ```bibtex @misc{liu2026autoresearchclaw, author = {Liu, Jiaqi and Xia, Peng and Han, Siwei and Qiu, Shi and Zhang, Letian and Chen, Guiming and Tu, Haoqin and Yang, Xinyu and Zhou, Jiawei and Zhu, Hongtu and Li, Yun and Zhou, Yuyin and Zheng, Zeyu and Xie, Cihang and Ding, Mingyu and Yao, Huaxiu}, title = {AutoResearchClaw: Fully Autonomous Research from Idea to Paper}, year = {2026}, organization = {GitHub}, url = {https://github.com/aiming-lab/AutoResearchClaw}, } ```

Built with 🦞 by the AutoResearchClaw team

================================================ FILE: docs/README_KO.md ================================================

AutoResearchClaw Logo

아이디어를 말하다. 논문을 받다. 완전 자동 & 자기 진화.

OpenClaw에 채팅하세요: "X 연구해줘" → 완료.

AutoResearchClaw Framework

MIT License Python 3.11+ 1823 Tests Passed GitHub OpenClaw Compatible Discord

🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية

🏆 논문 쇼케이스 · 📖 통합 가이드 · 💬 Discord 커뮤니티

---
Sample Paper 🏆 생성된 논문 쇼케이스

8개 분야에 걸친 8편의 논문 — 수학, 통계, 생물학, 컴퓨팅, NLP, RL, 비전, 견고성 — 인간 개입 없이 완전 자율 생성.

View Showcase
--- > **🧪 테스터를 모집합니다!** 여러분의 연구 아이디어로 — 어떤 분야든 — 파이프라인을 시험해 보시고 [의견을 들려주세요](TESTER_GUIDE.md). 여러분의 피드백이 다음 버전에 직접 반영됩니다. **[→ Testing Guide](TESTER_GUIDE.md)** | **[→ 中文测试指南](TESTER_GUIDE_CN.md)** | **[→ 日本語テストガイド](TESTER_GUIDE_JA.md)** --- ## 🔥 News - **[03/22/2026]** [v0.3.2](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.2) — **크로스 플랫폼 지원 + 주요 안정성 개선** — ACP 호환 AI 에이전트 백엔드(Claude Code, Codex CLI, Copilot CLI, Gemini CLI, Kimi CLI) 지원 및 OpenClaw 브릿지를 통한 메시징 플랫폼(Discord, Telegram, Lark, WeChat) 지원 추가. 새로운 CLI-agent 코드 생성 백엔드가 Stage 10 및 13을 외부 CLI 에이전트에 위임하며, 예산 제어 및 타임아웃 관리를 지원. 반데이터 조작 시스템(VerifiedRegistry + 실험 진단 및 복구 루프), 100건 이상의 버그 수정, 모듈러 executor 리팩토링, `--resume` 자동 감지, LLM 재시도 강화, 커뮤니티 보고 수정 포함. - **[03/18/2026]** [v0.3.1](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.1) — **OpenCode Beast Mode + Community Contributions** — New "Beast Mode" routes complex code generation to [OpenCode](https://github.com/anomalyco/opencode) with automatic complexity scoring and graceful fallback. Added Novita AI provider support, thread-safety hardening, improved LLM output parsing robustness, and 20+ bug fixes from community PRs and internal audit. - **[03/17/2026]** [v0.3.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.0) — **MetaClaw Integration** — AutoResearchClaw now supports [MetaClaw](https://github.com/aiming-lab/MetaClaw) cross-run learning: pipeline failures → structured lessons → reusable skills, injected into all 23 stages. **+18.3%** robustness in controlled experiments. Opt-in (`metaclaw_bridge.enabled: true`), fully backward-compatible. See [Integration Guide](#-metaclaw-integration). 
- **[03/16/2026]** [v0.2.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.2.0) — Three multi-agent subsystems (CodeAgent, BenchmarkAgent, FigureAgent), hardened Docker sandbox with network-policy-aware execution, 4-round paper quality audit (AI-slop detection, 7-dim review scoring, NeurIPS checklist), and 15+ bug fixes from production runs. - **[03/15/2026]** [v0.1.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.1.0) — We release AutoResearchClaw: a fully autonomous 23-stage research pipeline that turns a single research idea into a conference-ready paper. No human intervention required. --- ## ⚡ 하나의 명령. 하나의 논문. ```bash pip install -e . && researchclaw setup && researchclaw init && researchclaw run --topic "Your research idea here" --auto-approve ``` --- ## 🤔 이것은 무엇인가요? **당신이 생각하면, AutoResearchClaw가 씁니다.** 연구 주제를 입력하면 — OpenAlex, Semantic Scholar, arXiv의 실제 문헌, 하드웨어 인식 샌드박스 실험 (GPU/MPS/CPU 자동 감지), 통계 분석, 멀티 에이전트 피어 리뷰, NeurIPS/ICML/ICLR 대상 학회 수준 LaTeX를 포함한 완전한 학술 논문을 받을 수 있습니다. 관리가 필요 없습니다. 복사-붙여넣기도 필요 없습니다. 환각된 참고문헌도 없습니다.
📄paper_draft.md완성된 학술 논문 (서론, 관련 연구, 방법론, 실험, 결과, 결론)
📐paper.tex학회 제출용 LaTeX (NeurIPS / ICLR / ICML 템플릿)
📚references.bibOpenAlex, Semantic Scholar, arXiv에서 가져온 실제 BibTeX 참고문헌 — 인라인 인용과 일치하도록 자동 정리
🔍verification_report.json4계층 인용 무결성 + 관련성 검증 (arXiv, CrossRef, DataCite, LLM)
🧪experiment runs/생성된 코드 + 샌드박스 결과 + 구조화된 JSON 메트릭
📊charts/오차 막대와 신뢰 구간이 포함된 자동 생성 조건 비교 차트
📝reviews.md방법론-증거 일관성 검사를 포함한 멀티 에이전트 피어 리뷰
🧬evolution/각 실행에서 추출된 자기 학습 교훈
📦deliverables/모든 최종 산출물을 하나의 폴더에 — Overleaf에 바로 컴파일 가능
파이프라인은 **사람의 개입 없이 처음부터 끝까지 실행**됩니다. 실험이 실패하면 자가 복구합니다. 가설이 성립하지 않으면 방향을 전환합니다. 인용이 가짜면 삭제합니다. 🌍 **어디서든 실행 가능.** AutoResearchClaw는 특정 플랫폼에 종속되지 않습니다. CLI로 독립 실행하거나, [OpenClaw](https://github.com/openclaw/openclaw)에 연결하거나, ACP 호환 AI 에이전트 —— 🤖 Claude Code, 💻 Codex CLI, 🐙 Copilot CLI, ♊ Gemini CLI, 🌙 Kimi CLI 등 —— 와 연동할 수 있습니다. OpenClaw의 메시지 브릿지 덕분에 💬 Discord, ✈️ Telegram, 🐦 Lark(飞书), 💚 WeChat 등 팀이 이미 사용 중인 플랫폼에서 연구를 시작할 수 있습니다. 주제 하나 입력하면 논문 하나 완성 — 어디서 입력하든 상관없습니다. --- ## 🚀 빠른 시작 ```bash # 1. 클론 & 설치 git clone https://github.com/aiming-lab/AutoResearchClaw.git cd AutoResearchClaw python3 -m venv .venv && source .venv/bin/activate pip install -e . # 2. 설정 (대화형 — OpenCode Beast Mode 설치, Docker/LaTeX 확인) researchclaw setup # 3. 구성 researchclaw init # 대화형: LLM 제공자 선택, config.arc.yaml 생성 # 또는 수동: cp config.researchclaw.example.yaml config.arc.yaml # 4. 실행 export OPENAI_API_KEY="sk-..." researchclaw run --config config.arc.yaml --topic "Your research idea" --auto-approve ``` 출력 → `artifacts/rc-YYYYMMDD-HHMMSS-/deliverables/` — 컴파일 가능한 LaTeX, BibTeX, 실험 코드, 차트.
📝 최소 필수 설정 ```yaml project: name: "my-research" research: topic: "Your research topic here" llm: base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" primary_model: "gpt-4o" fallback_models: ["gpt-4o-mini"] experiment: mode: "sandbox" sandbox: python_path: ".venv/bin/python" ```
--- ## 🧠 차별화 요소 | 기능 | 작동 방식 | |------|----------| | **🔄 PIVOT / REFINE 루프** | 15단계에서 자율적으로 결정: PROCEED, REFINE (매개변수 조정), 또는 PIVOT (새 방향). 산출물 자동 버전 관리. | | **🤖 멀티 에이전트 토론** | 가설 생성, 결과 분석, 피어 리뷰 각각에서 구조화된 다관점 토론을 수행. | | **🧬 자기 학습** | 각 실행에서 교훈 추출 (의사결정 근거, 런타임 경고, 메트릭 이상), 30일 시간 감쇠. 향후 실행이 과거의 실수에서 학습. | | **📚 지식 기반** | 각 실행에서 6개 카테고리 (결정, 실험, 발견, 문헌, 질문, 리뷰)에 걸친 구조화된 지식 기반 구축. | | **🛡️ 센티넬 감시견** | 백그라운드 품질 모니터: NaN/Inf 감지, 논문-증거 일관성, 인용 관련성 점수, 날조 방지 가드. | --- ## 🦞 OpenClaw 통합 **AutoResearchClaw는 [OpenClaw](https://github.com/openclaw/openclaw) 호환 서비스입니다.** OpenClaw에 설치하고 단일 메시지로 자율 연구를 시작하거나 — CLI, Claude Code 또는 기타 AI 코딩 어시스턴트를 통해 독립적으로 사용하세요.
### 🚀 OpenClaw와 함께 사용 (권장) [OpenClaw](https://github.com/openclaw/openclaw)을 이미 AI 어시스턴트로 사용하고 있다면: ``` 1️⃣ GitHub 저장소 URL을 OpenClaw에 공유 2️⃣ OpenClaw이 자동으로 RESEARCHCLAW_AGENTS.md를 읽고 → 파이프라인을 이해 3️⃣ "Research [주제]"라고 말하기 4️⃣ 완료 — OpenClaw이 클론, 설치, 설정, 실행, 결과 반환까지 자동 처리 ``` **그게 전부입니다.** OpenClaw이 `git clone`, `pip install`, 설정 구성, 파이프라인 실행을 자동으로 처리합니다. 채팅만 하면 됩니다.
💡 내부 동작 과정 1. OpenClaw이 `RESEARCHCLAW_AGENTS.md`를 읽고 → 연구 오케스트레이터 역할을 학습 2. OpenClaw이 `README.md`를 읽고 → 설치 및 파이프라인 구조를 이해 3. OpenClaw이 `config.researchclaw.example.yaml`을 → `config.yaml`로 복사 4. LLM API 키를 요청 (또는 환경 변수를 사용) 5. `pip install -e .` + `researchclaw run --topic "..." --auto-approve` 실행 6. 논문, LaTeX, 실험, 인용을 반환
### 🔌 OpenClaw 브릿지 (고급) 더 깊은 통합을 위해 AutoResearchClaw는 6가지 선택적 기능을 갖춘 **브릿지 어댑터 시스템**을 포함합니다: ```yaml # config.arc.yaml openclaw_bridge: use_cron: true # ⏰ 예약된 연구 실행 use_message: true # 💬 진행 상황 알림 (Discord/Slack/Telegram) use_memory: true # 🧠 세션 간 지식 영속성 use_sessions_spawn: true # 🔀 동시 단계를 위한 병렬 서브세션 생성 use_web_fetch: true # 🌐 문헌 검토 중 실시간 웹 검색 use_browser: false # 🖥️ 브라우저 기반 논문 수집 ``` 각 플래그는 타입이 지정된 어댑터 프로토콜을 활성화합니다. OpenClaw이 이러한 기능을 제공하면 어댑터가 코드 변경 없이 이를 소비합니다. 전체 세부 사항은 [`integration-guide.md`](integration-guide.md)를 참조하세요. ### ACP (Agent Client Protocol) AutoResearchClaw는 **모든 ACP 호환 코딩 에이전트**를 LLM 백엔드로 사용할 수 있습니다 — API 키가 필요 없습니다. 에이전트는 [acpx](https://github.com/openclaw/acpx)를 통해 통신하며, 전체 23개 파이프라인 단계에 걸쳐 단일 영구 세션을 유지합니다. | 에이전트 | 명령어 | 비고 | |---------|--------|------| | Claude Code | `claude` | Anthropic | | Codex CLI | `codex` | OpenAI | | Copilot CLI | `gh` | GitHub | | Gemini CLI | `gemini` | Google | | OpenCode | `opencode` | SST | | Kimi CLI | `kimi` | Moonshot | ```yaml # config.yaml — ACP 예시 llm: provider: "acp" acp: agent: "claude" # 모든 ACP 호환 에이전트 CLI 명령어 cwd: "." # 에이전트의 작업 디렉토리 # base_url이나 api_key 불필요 — 에이전트가 자체 인증을 처리합니다. ``` ```bash # 바로 실행 — 에이전트가 자체 자격 증명 사용 researchclaw run --config config.yaml --topic "Your research idea" --auto-approve ``` ### 🛠️ 기타 실행 방법 | 방법 | 사용법 | |------|--------| | **독립형 CLI** | `researchclaw setup` → `researchclaw init` → `researchclaw run --topic "..." --auto-approve` | | **Python API** | `from researchclaw.pipeline import Runner; Runner(config).run()` | | **Claude Code** | `RESEARCHCLAW_CLAUDE.md`를 읽음 — *"Run research on [주제]"*라고 말하기 | | **Copilot CLI** | `researchclaw run --topic "..."` 에 `llm.acp.agent: "gh"` 사용 | | **OpenCode** | `.claude/skills/`를 읽음 — 동일한 자연어 인터페이스 | | **기타 AI CLI** | `RESEARCHCLAW_AGENTS.md`를 컨텍스트로 제공 → 에이전트가 자동 부트스트랩 | --- ## 🔬 파이프라인: 23단계, 8페이즈 ``` 페이즈 A: 연구 범위 설정 페이즈 E: 실험 실행 1. TOPIC_INIT 12. EXPERIMENT_RUN 2. PROBLEM_DECOMPOSE 13. ITERATIVE_REFINE ← 자가 복구 페이즈 B: 문헌 탐색 페이즈 F: 분석 및 의사결정 3. 
SEARCH_STRATEGY 14. RESULT_ANALYSIS ← 멀티 에이전트 4. LITERATURE_COLLECT ← 실제 API 15. RESEARCH_DECISION ← PIVOT/REFINE 5. LITERATURE_SCREEN [게이트] 6. KNOWLEDGE_EXTRACT 페이즈 G: 논문 작성 16. PAPER_OUTLINE 페이즈 C: 지식 종합 17. PAPER_DRAFT 7. SYNTHESIS 18. PEER_REVIEW ← 증거 확인 8. HYPOTHESIS_GEN ← 토론 19. PAPER_REVISION 페이즈 D: 실험 설계 페이즈 H: 최종화 9. EXPERIMENT_DESIGN [게이트] 20. QUALITY_GATE [게이트] 10. CODE_GENERATION 21. KNOWLEDGE_ARCHIVE 11. RESOURCE_PLANNING 22. EXPORT_PUBLISH ← LaTeX 23. CITATION_VERIFY ← 관련성 확인 ``` > **게이트 단계** (5, 9, 20)는 사람의 승인을 기다리거나 `--auto-approve`로 자동 승인합니다. 거부 시 파이프라인이 롤백됩니다. > **의사결정 루프**: 15단계에서 REFINE (→ 13단계) 또는 PIVOT (→ 8단계)을 트리거할 수 있으며, 산출물 버전 관리가 자동으로 이루어집니다.
📋 각 페이즈별 상세 설명 | 페이즈 | 수행 내용 | |--------|----------| | **A: 범위 설정** | LLM이 주제를 연구 질문이 포함된 구조화된 문제 트리로 분해 | | **A+: 하드웨어** | GPU 자동 감지 (NVIDIA CUDA / Apple MPS / CPU 전용), 로컬 하드웨어가 제한적인 경우 경고, 이에 맞게 코드 생성 적응 | | **B: 문헌** | 다중 소스 검색 (OpenAlex → Semantic Scholar → arXiv)으로 실제 논문 수집, 관련성별 선별, 지식 카드 추출 | | **C: 종합** | 연구 결과 클러스터링, 연구 갭 식별, 멀티 에이전트 토론을 통한 검증 가능한 가설 생성 | | **D: 설계** | 실험 계획 설계, 하드웨어 인식 실행 가능 Python 생성 (GPU 등급 → 패키지 선택), 리소스 요구 사항 추정 | | **E: 실행** | 샌드박스에서 실험 실행, NaN/Inf 및 런타임 버그 감지, LLM을 통한 표적화된 코드 자가 복구 | | **F: 분석** | 결과에 대한 멀티 에이전트 분석; 근거가 포함된 자율 PROCEED / REFINE / PIVOT 결정 | | **G: 작성** | 개요 → 섹션별 작성 (5,000-6,500단어) → 피어 리뷰 (방법론-증거 일관성 포함) → 길이 제한 적용 수정 | | **H: 최종화** | 품질 게이트, 지식 아카이빙, 학회 템플릿 포함 LaTeX 내보내기, 인용 무결성 + 관련성 검증 |
--- ## ✨ 주요 기능 | 기능 | 설명 | |------|------| | **📚 다중 소스 문헌** | OpenAlex, Semantic Scholar, arXiv에서 실제 논문 — 쿼리 확장, 중복 제거, 3상태 서킷 브레이커와 단계적 성능 저하 | | **🔍 4계층 인용 검증** | arXiv ID 확인 → CrossRef/DataCite DOI → Semantic Scholar 제목 매칭 → LLM 관련성 점수. 환각된 참고문헌 자동 삭제. | | **🖥️ 하드웨어 인식 실행** | GPU (NVIDIA CUDA / Apple MPS / CPU 전용) 자동 감지, 이에 맞게 코드 생성, import, 실험 규모 적응 | | **🦾 OpenCode Beast Mode** | 복잡한 실험을 [OpenCode](https://github.com/anomalyco/opencode)로 자동 라우팅 — 커스텀 아키텍처, 학습 루프, 절제 연구가 포함된 다중 파일 프로젝트 생성. `researchclaw setup`으로 설치. | | **🧪 샌드박스 실험** | AST 검증 코드, 불변 하네스, NaN/Inf 즉시 실패, 자가 복구, 반복적 개선 (최대 10라운드), 부분 결과 캡처 | | **📝 학회 수준 작성** | NeurIPS/ICML/ICLR 템플릿, 섹션별 작성 (5,000-6,500단어), 날조 방지 가드, 수정 길이 제한, 면책 조항 방지 적용 | | **📐 템플릿 전환** | `neurips_2025`, `iclr_2026`, `icml_2026` — Markdown → LaTeX (수학, 표, 그림, 교차 참조, `\cite{}` 포함) | | **🚦 품질 게이트** | 3개의 Human-in-the-loop 게이트 (단계 5, 9, 20), 롤백 지원. `--auto-approve`로 건너뛰기. | --- ## 🧠 MetaClaw 통합 **AutoResearchClaw + [MetaClaw](https://github.com/aiming-lab/MetaClaw) = 모든 실행에서 학습하는 파이프라인.** MetaClaw는 AutoResearchClaw에 **교차 실행 지식 전이**를 추가합니다. 활성화되면 파이프라인이 실패와 경고에서 자동으로 교훈을 추출하고, 이를 재사용 가능한 스킬로 변환하여 후속 실행의 전체 23단계에 주입합니다 — 같은 실수를 다시 반복하지 않습니다. ### 작동 방식 ``` Run N executes → failures/warnings captured as Lessons ↓ MetaClaw Lesson → Skill conversion ↓ arc-* Skill files stored in ~/.metaclaw/skills/ ↓ Run N+1 → build_overlay() injects skills into every LLM prompt ↓ LLM avoids known pitfalls → higher quality, fewer retries ``` ### 빠른 설정 ```bash # 1. MetaClaw 설치 (미설치 시) pip install metaclaw # 2. 설정에서 활성화 ``` ```yaml # config.arc.yaml metaclaw_bridge: enabled: true proxy_url: "http://localhost:30000" # MetaClaw 프록시 (선택) skills_dir: "~/.metaclaw/skills" # 스킬 저장 위치 fallback_url: "https://api.openai.com/v1" # 직접 LLM 폴백 fallback_api_key: "" # 폴백 URL의 API 키 lesson_to_skill: enabled: true min_severity: "warning" # warning + error 변환 max_skills_per_run: 3 ``` ```bash # 3. 
평소대로 실행 — MetaClaw가 투명하게 작동 researchclaw run --config config.arc.yaml --topic "Your idea" --auto-approve ``` 각 실행 후 `~/.metaclaw/skills/arc-*/SKILL.md`를 확인하여 파이프라인이 학습한 스킬을 확인하세요. ### 실험 결과 대조 A/B 실험 (동일 주제, 동일 LLM, 동일 설정): | 메트릭 | 기준선 | MetaClaw 사용 시 | 개선 | |--------|--------|-----------------|------| | 단계 재시도율 | 10.5% | 7.9% | **-24.8%** | | Refine 사이클 수 | 2.0 | 1.2 | **-40.0%** | | 파이프라인 단계 완료율 | 18/19 | 19/19 | **+5.3%** | | 전체 견고성 점수 (종합) | 0.714 | 0.845 | **+18.3%** | > 종합 견고성 점수는 단계 완료율 (40%), 재시도 감소 (30%), Refine 사이클 효율성 (30%)의 가중 평균입니다. ### 하위 호환성 - **기본값: 꺼짐.** `metaclaw_bridge`가 없거나 `enabled: false`이면 파이프라인은 이전과 정확히 동일하게 동작합니다. - **새로운 종속성 없음.** MetaClaw는 선택 사항입니다 — 핵심 파이프라인은 MetaClaw 없이도 동작합니다. - **기존 1,823개 테스트 모두 통과** (통합 코드 포함). --- ## ⚙️ 설정 참고서
전체 설정 참고서 펼치기 ```yaml # === 프로젝트 === project: name: "my-research" # 프로젝트 식별자 mode: "docs-first" # docs-first | semi-auto | full-auto # === 연구 === research: topic: "..." # 연구 주제 (필수) domains: ["ml", "nlp"] # 문헌 검색용 연구 분야 daily_paper_count: 8 # 검색 쿼리당 목표 논문 수 quality_threshold: 4.0 # 논문 최소 품질 점수 # === 런타임 === runtime: timezone: "America/New_York" # 타임스탬프용 max_parallel_tasks: 3 # 동시 실험 제한 approval_timeout_hours: 12 # 게이트 단계 타임아웃 retry_limit: 2 # 단계 실패 시 재시도 횟수 # === LLM === llm: provider: "openai-compatible" # openai | openrouter | deepseek | minimax | acp | openai-compatible base_url: "https://..." # API 엔드포인트 (openai-compatible 필수) api_key_env: "OPENAI_API_KEY" # API 키용 환경 변수 (openai-compatible 필수) api_key: "" # 또는 키를 직접 입력 primary_model: "gpt-4o" # 기본 모델 fallback_models: ["gpt-4o-mini"] # 폴백 체인 s2_api_key: "" # Semantic Scholar API 키 (선택, 더 높은 속도 제한) acp: # provider: "acp" 인 경우에만 사용 agent: "claude" # ACP 에이전트 CLI 명령어 (claude, codex, gemini 등) cwd: "." # 에이전트의 작업 디렉토리 # === 실험 === experiment: mode: "sandbox" # simulated | sandbox | docker | ssh_remote time_budget_sec: 300 # 실행당 최대 실행 시간 (기본값: 300초) max_iterations: 10 # 최대 최적화 반복 횟수 metric_key: "val_loss" # 기본 메트릭 이름 metric_direction: "minimize" # minimize | maximize sandbox: python_path: ".venv/bin/python" gpu_required: false allowed_imports: [math, random, json, csv, numpy, torch, sklearn] max_memory_mb: 4096 docker: image: "researchclaw/experiment:latest" network_policy: "setup_only" # none | setup_only | pip_only | full gpu_enabled: true memory_limit_mb: 8192 auto_install_deps: true # import 자동 감지 → requirements.txt ssh_remote: host: "" # GPU 서버 호스트명 gpu_ids: [] # 사용 가능한 GPU ID remote_workdir: "/tmp/researchclaw_experiments" opencode: # OpenCode Beast Mode (`researchclaw setup`으로 자동 설치) enabled: true # 마스터 스위치 (기본값: true) auto: true # 확인 없이 자동 트리거 (기본값: true) complexity_threshold: 0.2 # 0.0-1.0 — 높을수록 복잡한 실험에서만 트리거 model: "" # 모델 오버라이드 (비어있으면 llm.primary_model 사용) timeout_sec: 600 # OpenCode 생성 최대 초 max_retries: 
1 # 실패 시 재시도 횟수 workspace_cleanup: true # 수집 후 임시 작업 공간 제거 # === 내보내기 === export: target_conference: "neurips_2025" # neurips_2025 | iclr_2026 | icml_2026 authors: "Anonymous" bib_file: "references" # === 프롬프트 === prompts: custom_file: "" # 사용자 정의 프롬프트 YAML 경로 (비어 있으면 기본값) # === 보안 === security: hitl_required_stages: [5, 9, 20] # 사람의 승인이 필요한 단계 allow_publish_without_approval: false redact_sensitive_logs: true # === 지식 기반 === knowledge_base: backend: "markdown" # markdown | obsidian root: "docs/kb" # === 알림 === notifications: channel: "console" # console | discord | slack target: "" # === MetaClaw Bridge (선택) === metaclaw_bridge: enabled: false # true로 설정하여 교차 실행 학습 활성화 proxy_url: "http://localhost:30000" # MetaClaw 프록시 URL skills_dir: "~/.metaclaw/skills" # arc-* 스킬 저장 위치 fallback_url: "" # 프록시 장애 시 직접 LLM 폴백 fallback_api_key: "" # 폴백 엔드포인트의 API 키 lesson_to_skill: enabled: true # 교훈을 스킬로 자동 변환 min_severity: "warning" # 변환할 최소 심각도 max_skills_per_run: 3 # 파이프라인 실행당 최대 새 스킬 수 # === OpenClaw 브릿지 === openclaw_bridge: use_cron: false # 예약된 연구 실행 use_message: false # 진행 상황 알림 use_memory: false # 세션 간 지식 영속성 use_sessions_spawn: false # 병렬 서브세션 생성 use_web_fetch: false # 실시간 웹 검색 use_browser: false # 브라우저 기반 논문 수집 ```
--- ## 🙏 감사의 말 다음 프로젝트에서 영감을 받았습니다: - 🔬 [AI Scientist](https://github.com/SakanaAI/AI-Scientist) (Sakana AI) — 자동화 연구의 선구자 - 🧠 [AutoResearch](https://github.com/karpathy/autoresearch) (Andrej Karpathy) — 엔드투엔드 연구 자동화 - 🌐 [FARS](https://analemma.ai/blog/introducing-fars/) (Analemma) — 완전 자동 연구 시스템 --- ## 📄 라이선스 MIT — 자세한 내용은 [LICENSE](../LICENSE)를 참조하세요. --- ## 📌 인용 AutoResearchClaw가 유용했다면, 아래를 인용해 주세요: ```bibtex @misc{liu2026autoresearchclaw, author = {Liu, Jiaqi and Xia, Peng and Han, Siwei and Qiu, Shi and Zhang, Letian and Chen, Guiming and Tu, Haoqin and Yang, Xinyu and Zhou, Jiawei and Zhu, Hongtu and Li, Yun and Zhou, Yuyin and Zheng, Zeyu and Xie, Cihang and Ding, Mingyu and Yao, Huaxiu}, title = {AutoResearchClaw: Fully Autonomous Research from Idea to Paper}, year = {2026}, organization = {GitHub}, url = {https://github.com/aiming-lab/AutoResearchClaw}, } ```

Built with 🦞 by the AutoResearchClaw team

================================================ FILE: docs/README_PT.md ================================================

AutoResearchClaw Logo

Converse uma ideia. Receba um artigo. Totalmente autônomo & autoevolutivo.

Converse com o OpenClaw: "Pesquise X" → pronto.

AutoResearchClaw Framework

MIT License Python 3.11+ 1823 Tests Passed GitHub OpenClaw Compatible Discord

🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية

🏆 Galeria de Artigos · 📖 Guia de Integração · 💬 Comunidade Discord

---
Artigo Exemplo 🏆 Galeria de Artigos Gerados

8 artigos em 8 domínios — matemática, estatística, biologia, computação, NLP, RL, visão, robustez — gerados de forma totalmente autônoma sem intervenção humana.

Ver Galeria
--- > **🧪 Estamos procurando testadores!** Experimente o pipeline com sua própria ideia de pesquisa — de qualquer área — e [diga-nos o que achou](TESTER_GUIDE.md). Seu feedback molda diretamente a próxima versão. **[→ Testing Guide](TESTER_GUIDE.md)** | **[→ 中文测试指南](TESTER_GUIDE_CN.md)** | **[→ 日本語テストガイド](TESTER_GUIDE_JA.md)** --- ## 🔥 News - **[03/22/2026]** [v0.3.2](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.2) — **Suporte multiplataforma + grande estabilidade** — O AutoResearchClaw agora funciona com qualquer agente compatível com ACP (Claude Code, Codex CLI, Copilot CLI, Gemini CLI, Kimi CLI) e suporta plataformas de mensagens (Discord, Telegram, Lark, WeChat) via ponte OpenClaw. Novo backend de geração de código CLI-agent que delega os Stages 10 e 13 a agentes CLI externos com controle de orçamento e gerenciamento de timeout. Inclui sistema anti-fabricação (VerifiedRegistry + loop de diagnóstico e reparo), 100+ correções de bugs, refatoração modular do executor, auto-detecção de `--resume`, endurecimento de retries LLM e correções da comunidade. - **[03/18/2026]** [v0.3.1](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.1) — **OpenCode Beast Mode + Community Contributions** — New "Beast Mode" routes complex code generation to [OpenCode](https://github.com/anomalyco/opencode) with automatic complexity scoring and graceful fallback. Added Novita AI provider support, thread-safety hardening, improved LLM output parsing robustness, and 20+ bug fixes from community PRs and internal audit. - **[03/17/2026]** [v0.3.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.0) — **MetaClaw Integration** — AutoResearchClaw now supports [MetaClaw](https://github.com/aiming-lab/MetaClaw) cross-run learning: pipeline failures → structured lessons → reusable skills, injected into all 23 stages. **+18.3%** robustness in controlled experiments. Opt-in (`metaclaw_bridge.enabled: true`), fully backward-compatible. 
See [Integration Guide](#-metaclaw-integration). - **[03/16/2026]** [v0.2.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.2.0) — Three multi-agent subsystems (CodeAgent, BenchmarkAgent, FigureAgent), hardened Docker sandbox with network-policy-aware execution, 4-round paper quality audit (AI-slop detection, 7-dim review scoring, NeurIPS checklist), and 15+ bug fixes from production runs. - **[03/15/2026]** [v0.1.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.1.0) — We release AutoResearchClaw: a fully autonomous 23-stage research pipeline that turns a single research idea into a conference-ready paper. No human intervention required. --- ## ⚡ Um Comando. Um Artigo. ```bash pip install -e . && researchclaw setup && researchclaw init && researchclaw run --topic "Your research idea here" --auto-approve ``` --- ## 🤔 O Que É Isto? **Você pensa. AutoResearchClaw escreve.** Forneça um tópico de pesquisa — receba de volta um artigo acadêmico completo com literatura real do OpenAlex, Semantic Scholar & arXiv, experimentos em sandbox com detecção automática de hardware (GPU/MPS/CPU), análise estatística, revisão por pares multi-agente, e LaTeX pronto para conferência mirando NeurIPS/ICML/ICLR. Sem babá. Sem copiar e colar. Sem referências alucinadas.
📄paper_draft.mdArtigo acadêmico completo (Introdução, Trabalhos Relacionados, Método, Experimentos, Resultados, Conclusão)
📐paper.texLaTeX pronto para conferência (templates NeurIPS / ICLR / ICML)
📚references.bibReferências BibTeX reais do OpenAlex, Semantic Scholar e arXiv — auto-podadas para corresponder às citações inline
🔍verification_report.jsonVerificação de integridade + relevância de citações em 4 camadas (arXiv, CrossRef, DataCite, LLM)
🧪experiment runs/Código gerado + resultados do sandbox + métricas JSON estruturadas
📊charts/Gráficos de comparação de condições gerados automaticamente com barras de erro e intervalos de confiança
📝reviews.mdRevisão por pares multi-agente com verificações de consistência metodologia-evidência
🧬evolution/Lições de autoaprendizagem extraídas de cada execução
📦deliverables/Todas as saídas finais em uma pasta — pronto para compilar no Overleaf
O pipeline roda **de ponta a ponta sem intervenção humana**. Quando experimentos falham, ele se auto-repara. Quando hipóteses não se sustentam, ele pivota. Quando citações são falsas, ele as elimina. 🌍 **Execute em qualquer lugar.** O AutoResearchClaw não está preso a uma única plataforma. Use-o de forma independente via CLI, conecte-o ao [OpenClaw](https://github.com/openclaw/openclaw), ou integre-o com qualquer agente compatível com ACP — 🤖 Claude Code, 💻 Codex CLI, 🐙 Copilot CLI, ♊ Gemini CLI, 🌙 Kimi CLI, e muito mais. Graças à ponte de mensagens do OpenClaw, você pode iniciar uma pesquisa completa pelo 💬 Discord, ✈️ Telegram, 🐦 Lark (飞书), 💚 WeChat, ou qualquer plataforma que sua equipe já utiliza. Um tópico na entrada, um artigo na saída — não importa de onde você digita. --- ## 🚀 Início Rápido ```bash # 1. Clone & instale git clone https://github.com/aiming-lab/AutoResearchClaw.git cd AutoResearchClaw python3 -m venv .venv && source .venv/bin/activate pip install -e . # 2. Setup (interativo — instala OpenCode beast mode, verifica Docker/LaTeX) researchclaw setup # 3. Configure researchclaw init # Interativo: escolha provedor LLM, cria config.arc.yaml # Ou manualmente: cp config.researchclaw.example.yaml config.arc.yaml # 4. Execute export OPENAI_API_KEY="sk-..." researchclaw run --config config.arc.yaml --topic "Your research idea" --auto-approve ``` Saída → `artifacts/rc-YYYYMMDD-HHMMSS-/deliverables/` — LaTeX, BibTeX, código de experimentos, gráficos prontos para compilação.
📝 Configuração mínima necessária ```yaml project: name: "my-research" research: topic: "Your research topic here" llm: base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" primary_model: "gpt-4o" fallback_models: ["gpt-4o-mini"] experiment: mode: "sandbox" sandbox: python_path: ".venv/bin/python" ```
--- ## 🧠 O Que o Torna Diferente | Capacidade | Como Funciona | |-----------|-------------| | **🔄 Loop PIVOT / REFINE** | O Estágio 15 decide autonomamente: PROCEED, REFINE (ajustar parâmetros) ou PIVOT (nova direção). Artefatos versionados automaticamente. | | **🤖 Debate Multi-Agente** | Geração de hipóteses, análise de resultados e revisão por pares usam debate estruturado com múltiplas perspectivas. | | **🧬 Autoaprendizagem** | Lições extraídas por execução (justificativa de decisões, avisos de runtime, anomalias em métricas) com decaimento temporal de 30 dias. Execuções futuras aprendem com erros passados. | | **📚 Base de Conhecimento** | Cada execução constrói uma KB estruturada com 6 categorias (decisões, experimentos, descobertas, literatura, questões, revisões). | | **🛡️ Sentinel Watchdog** | Monitor de qualidade em segundo plano: detecção de NaN/Inf, consistência artigo-evidência, pontuação de relevância de citações, guarda anti-fabricação. | --- ## 🦞 Integração OpenClaw **AutoResearchClaw é um serviço compatível com [OpenClaw](https://github.com/openclaw/openclaw).** Instale-o no OpenClaw e inicie pesquisa autônoma com uma única mensagem — ou use-o de forma independente via CLI, Claude Code ou qualquer assistente de codificação IA.
### 🚀 Usar com OpenClaw (Recomendado) Se você já usa o [OpenClaw](https://github.com/openclaw/openclaw) como seu assistente de IA: ``` 1️⃣ Compartilhe a URL do repositório GitHub com o OpenClaw 2️⃣ O OpenClaw lê automaticamente RESEARCHCLAW_AGENTS.md → entende o pipeline 3️⃣ Diga: "Pesquise [seu tópico]" 4️⃣ Pronto — o OpenClaw clona, instala, configura, executa e retorna os resultados ``` **É isso.** O OpenClaw gerencia `git clone`, `pip install`, configuração e execução do pipeline automaticamente. Você apenas conversa.
💡 O que acontece por baixo dos panos 1. O OpenClaw lê `RESEARCHCLAW_AGENTS.md` → aprende o papel de orquestrador de pesquisa 2. O OpenClaw lê `README.md` → entende a instalação e estrutura do pipeline 3. O OpenClaw copia `config.researchclaw.example.yaml` → `config.yaml` 4. Solicita sua chave de API do LLM (ou usa sua variável de ambiente) 5. Executa `pip install -e .` + `researchclaw run --topic "..." --auto-approve` 6. Retorna o artigo, LaTeX, experimentos e citações
### 🔌 Bridge OpenClaw (Avançado) Para integração mais profunda, o AutoResearchClaw inclui um **sistema de adaptadores bridge** com 6 capacidades opcionais: ```yaml # config.arc.yaml openclaw_bridge: use_cron: true # ⏰ Execuções de pesquisa agendadas use_message: true # 💬 Notificações de progresso (Discord/Slack/Telegram) use_memory: true # 🧠 Persistência de conhecimento entre sessões use_sessions_spawn: true # 🔀 Criar sub-sessões paralelas para estágios concorrentes use_web_fetch: true # 🌐 Busca web ao vivo durante revisão de literatura use_browser: false # 🖥️ Coleta de artigos baseada em navegador ``` Cada flag ativa um protocolo de adaptador tipado. Quando o OpenClaw fornece essas capacidades, os adaptadores as consomem sem alterações no código. Consulte [`integration-guide.md`](integration-guide.md) para detalhes completos. ### ACP (Agent Client Protocol) O AutoResearchClaw pode usar **qualquer agente de codificação compatível com ACP** como seu backend LLM — sem necessidade de chaves de API. O agente se comunica via [acpx](https://github.com/openclaw/acpx), mantendo uma única sessão persistente ao longo de todos os 23 estágios do pipeline. | Agente | Comando | Notas | |-------|---------|-------| | Claude Code | `claude` | Anthropic | | Codex CLI | `codex` | OpenAI | | Copilot CLI | `gh` | GitHub | | Gemini CLI | `gemini` | Google | | OpenCode | `opencode` | SST | | Kimi CLI | `kimi` | Moonshot | ```yaml # config.yaml — exemplo ACP llm: provider: "acp" acp: agent: "claude" # Qualquer comando CLI de agente compatível com ACP cwd: "." # Diretório de trabalho para o agente # Sem base_url ou api_key necessários — o agente gerencia sua própria autenticação. 
``` ```bash # Basta executar — o agente usa suas próprias credenciais researchclaw run --config config.yaml --topic "Your research idea" --auto-approve ``` ### 🛠️ Outras Formas de Executar | Método | Como | |--------|------| | **CLI Independente** | `researchclaw setup` → `researchclaw init` → `researchclaw run --topic "..." --auto-approve` | | **API Python** | `from researchclaw.pipeline import Runner; Runner(config).run()` | | **Claude Code** | Lê `RESEARCHCLAW_CLAUDE.md` — basta dizer *"Execute pesquisa sobre [tópico]"* | | **Copilot CLI** | `researchclaw run --topic "..."` com `llm.acp.agent: "gh"` | | **OpenCode** | Lê `.claude/skills/` — mesma interface em linguagem natural | | **Qualquer CLI de IA** | Forneça `RESEARCHCLAW_AGENTS.md` como contexto → o agente faz bootstrap automaticamente | --- ## 🔬 Pipeline: 23 Estágios, 8 Fases ``` Fase A: Escopo da Pesquisa Fase E: Execução de Experimentos 1. TOPIC_INIT 12. EXPERIMENT_RUN 2. PROBLEM_DECOMPOSE 13. ITERATIVE_REFINE ← auto-reparo Fase B: Descoberta de Literatura Fase F: Análise & Decisão 3. SEARCH_STRATEGY 14. RESULT_ANALYSIS ← multi-agente 4. LITERATURE_COLLECT ← API real 15. RESEARCH_DECISION ← PIVOT/REFINE 5. LITERATURE_SCREEN [gate] 6. KNOWLEDGE_EXTRACT Fase G: Escrita do Artigo 16. PAPER_OUTLINE Fase C: Síntese de Conhecimento 17. PAPER_DRAFT 7. SYNTHESIS 18. PEER_REVIEW ← verif. evidência 8. HYPOTHESIS_GEN ← debate 19. PAPER_REVISION Fase D: Design de Experimentos Fase H: Finalização 9. EXPERIMENT_DESIGN [gate] 20. QUALITY_GATE [gate] 10. CODE_GENERATION 21. KNOWLEDGE_ARCHIVE 11. RESOURCE_PLANNING 22. EXPORT_PUBLISH ← LaTeX 23. CITATION_VERIFY ← verif. relevância ``` > **Estágios gate** (5, 9, 20) pausam para aprovação humana ou aprovam automaticamente com `--auto-approve`. Em caso de rejeição, o pipeline faz rollback. > **Loops de decisão**: O Estágio 15 pode acionar REFINE (→ Estágio 13) ou PIVOT (→ Estágio 8), com versionamento automático de artefatos.
📋 O Que Cada Fase Faz | Fase | O Que Acontece | |------|----------------| | **A: Escopo** | O LLM decompõe o tópico em uma árvore de problemas estruturada com questões de pesquisa | | **A+: Hardware** | Detecta automaticamente GPU (NVIDIA CUDA / Apple MPS / apenas CPU), avisa se o hardware local é limitado, adapta a geração de código adequadamente | | **B: Literatura** | Busca multi-fonte (OpenAlex → Semantic Scholar → arXiv) por artigos reais, triagem por relevância, extração de fichas de conhecimento | | **C: Síntese** | Agrupa descobertas, identifica lacunas de pesquisa, gera hipóteses testáveis via debate multi-agente | | **D: Design** | Projeta plano de experimento, gera Python executável com consciência de hardware (tier de GPU → seleção de pacotes), estima necessidades de recursos | | **E: Execução** | Executa experimentos em sandbox, detecta NaN/Inf e bugs de runtime, auto-repara código via reparo direcionado por LLM | | **F: Análise** | Análise multi-agente dos resultados; decisão autônoma PROCEED / REFINE / PIVOT com justificativa | | **G: Escrita** | Outline → redação seção por seção (5.000-6.500 palavras) → revisão por pares (com consistência metodologia-evidência) → revisão com guarda de tamanho | | **H: Finalização** | Quality gate, arquivamento de conhecimento, exportação LaTeX com template de conferência, verificação de integridade + relevância de citações |
--- ## ✨ Funcionalidades Principais | Funcionalidade | Descrição | |---------|------------| | **📚 Literatura Multi-Fonte** | Artigos reais do OpenAlex, Semantic Scholar & arXiv — expansão de consultas, deduplicação, circuit breaker com degradação graciosa | | **🔍 Verificação de Citações em 4 Camadas** | Verificação de arXiv ID → CrossRef/DataCite DOI → correspondência de título no Semantic Scholar → pontuação de relevância por LLM. Referências alucinadas removidas automaticamente. | | **🖥️ Execução com Consciência de Hardware** | Detecta automaticamente GPU (NVIDIA CUDA / Apple MPS / apenas CPU) e adapta geração de código, imports e escala de experimentos | | **🦾 OpenCode Beast Mode** | Experimentos complexos roteados automaticamente para o [OpenCode](https://github.com/anomalyco/opencode) — gera projetos multi-arquivo com arquiteturas customizadas, loops de treinamento e estudos de ablação. Instale via `researchclaw setup`. | | **🧪 Experimentos em Sandbox** | Código validado por AST, harness imutável, fast-fail para NaN/Inf, reparo auto-reparável, refinamento iterativo (até 10 rodadas), captura de resultados parciais | | **📝 Escrita com Qualidade de Conferência** | Templates NeurIPS/ICML/ICLR, redação seção por seção (5.000-6.500 palavras), guarda anti-fabricação, guarda de tamanho na revisão, imposição anti-disclaimer | | **📐 Troca de Template** | `neurips_2025`, `iclr_2026`, `icml_2026` — Markdown → LaTeX com matemática, tabelas, figuras, referências cruzadas, `\cite{}` | | **🚦 Quality Gates** | 3 gates com human-in-the-loop (Estágios 5, 9, 20) com rollback. Pule com `--auto-approve`. | --- ## 🧠 Integração MetaClaw **AutoResearchClaw + [MetaClaw](https://github.com/aiming-lab/MetaClaw) = Um pipeline que aprende com cada execução.** MetaClaw adiciona **transferência de conhecimento entre execuções** ao AutoResearchClaw. 
Quando ativado, o pipeline captura automaticamente lições de falhas e avisos, converte-as em habilidades reutilizáveis e injeta essas habilidades em todos os 23 estágios do pipeline em execuções subsequentes — para que os mesmos erros nunca se repitam. ### Como Funciona ``` Run N executa → falhas/avisos capturados como Lessons ↓ MetaClaw Lesson → conversão em Skill ↓ Arquivos arc-* Skill armazenados em ~/.metaclaw/skills/ ↓ Run N+1 → build_overlay() injeta skills em cada prompt LLM ↓ LLM evita armadilhas conhecidas → maior qualidade, menos retentativas ``` ### Configuração Rápida ```bash # 1. Instale o MetaClaw (se ainda não tiver) pip install metaclaw # 2. Ative na sua configuração ``` ```yaml # config.arc.yaml metaclaw_bridge: enabled: true proxy_url: "http://localhost:30000" # Proxy MetaClaw (opcional) skills_dir: "~/.metaclaw/skills" # Onde as skills são armazenadas fallback_url: "https://api.openai.com/v1" # Fallback direto para LLM fallback_api_key: "" # Chave de API para URL de fallback lesson_to_skill: enabled: true min_severity: "warning" # Converte warnings + errors max_skills_per_run: 3 ``` ```bash # 3. Execute normalmente — MetaClaw funciona de forma transparente researchclaw run --config config.arc.yaml --topic "Your idea" --auto-approve ``` Após cada execução, verifique `~/.metaclaw/skills/arc-*/SKILL.md` para ver as skills que seu pipeline aprendeu. 
### Resultados dos Experimentos Em experimentos A/B controlados (mesmo tópico, mesmo LLM, mesma configuração): | Métrica | Baseline | Com MetaClaw | Melhoria | |---------|----------|---------------|----------| | Taxa de retentativa por estágio | 10.5% | 7.9% | **-24.8%** | | Contagem de ciclos REFINE | 2.0 | 1.2 | **-40.0%** | | Conclusão de estágios do pipeline | 18/19 | 19/19 | **+5.3%** | | Pontuação de robustez geral (composta) | 0.714 | 0.845 | **+18.3%** | > A pontuação composta de robustez é uma média ponderada da taxa de conclusão de estágios (40%), redução de retentativas (30%) e eficiência de ciclos REFINE (30%). ### Compatibilidade Retroativa - **Padrão: DESATIVADO.** Se `metaclaw_bridge` estiver ausente ou `enabled: false`, o pipeline funciona exatamente como antes. - **Sem novas dependências.** MetaClaw é opcional — o pipeline principal funciona sem ele. - **Todos os 1.823 testes existentes passam** com o código de integração presente. --- ## ⚙️ Referência de Configuração
Clique para expandir a referência completa de configuração ```yaml # === Projeto === project: name: "my-research" # Identificador do projeto mode: "docs-first" # docs-first | semi-auto | full-auto # === Pesquisa === research: topic: "..." # Tópico de pesquisa (obrigatório) domains: ["ml", "nlp"] # Domínios de pesquisa para busca de literatura daily_paper_count: 8 # Artigos alvo por consulta de busca quality_threshold: 4.0 # Pontuação mínima de qualidade para artigos # === Runtime === runtime: timezone: "America/New_York" # Para timestamps max_parallel_tasks: 3 # Limite de experimentos concorrentes approval_timeout_hours: 12 # Timeout de estágios gate retry_limit: 2 # Contagem de retentativas em falha de estágio # === LLM === llm: provider: "openai-compatible" # openai | openrouter | deepseek | minimax | acp | openai-compatible base_url: "https://..." # Endpoint da API (obrigatório para openai-compatible) api_key_env: "OPENAI_API_KEY" # Variável de ambiente para chave da API (obrigatório para openai-compatible) api_key: "" # Ou insira a chave diretamente aqui primary_model: "gpt-4o" # Modelo primário fallback_models: ["gpt-4o-mini"] # Cadeia de fallback s2_api_key: "" # Chave API do Semantic Scholar (opcional, limites de taxa maiores) acp: # Usado apenas quando provider: "acp" agent: "claude" # Comando CLI do agente ACP (claude, codex, gemini, etc.) cwd: "." 
# Diretório de trabalho para o agente # === Experimento === experiment: mode: "sandbox" # simulated | sandbox | docker | ssh_remote time_budget_sec: 300 # Tempo máximo de execução por run (padrão: 300s) max_iterations: 10 # Máximo de iterações de otimização metric_key: "val_loss" # Nome da métrica primária metric_direction: "minimize" # minimize | maximize sandbox: python_path: ".venv/bin/python" gpu_required: false allowed_imports: [math, random, json, csv, numpy, torch, sklearn] max_memory_mb: 4096 docker: image: "researchclaw/experiment:latest" network_policy: "setup_only" # none | setup_only | pip_only | full gpu_enabled: true memory_limit_mb: 8192 auto_install_deps: true # Detecção automática de imports → requirements.txt ssh_remote: host: "" # Hostname do servidor GPU gpu_ids: [] # IDs de GPU disponíveis remote_workdir: "/tmp/researchclaw_experiments" opencode: # OpenCode Beast Mode (auto-instalado via `researchclaw setup`) enabled: true # Interruptor principal (padrão: true) auto: true # Acionamento automático sem confirmação (padrão: true) complexity_threshold: 0.2 # 0.0-1.0 — maior = só aciona em experimentos complexos model: "" # Modelo override (vazio = usa llm.primary_model) timeout_sec: 600 # Máximo de segundos para geração OpenCode max_retries: 1 # Contagem de retentativas em falha workspace_cleanup: true # Remove workspace temporário após coleta # === Exportação === export: target_conference: "neurips_2025" # neurips_2025 | iclr_2026 | icml_2026 authors: "Anonymous" bib_file: "references" # === Prompts === prompts: custom_file: "" # Caminho para YAML de prompts customizados (vazio = padrões) # === Segurança === security: hitl_required_stages: [5, 9, 20] # Estágios que requerem aprovação humana allow_publish_without_approval: false redact_sensitive_logs: true # === Base de Conhecimento === knowledge_base: backend: "markdown" # markdown | obsidian root: "docs/kb" # === Notificações === notifications: channel: "console" # console | discord | slack 
target: "" # === MetaClaw Bridge (Opcional) === metaclaw_bridge: enabled: false # Defina como true para ativar aprendizado entre execuções proxy_url: "http://localhost:30000" # URL do proxy MetaClaw skills_dir: "~/.metaclaw/skills" # Onde as skills arc-* são armazenadas fallback_url: "" # Fallback direto para LLM quando o proxy está fora fallback_api_key: "" # Chave de API para endpoint de fallback lesson_to_skill: enabled: true # Auto-converter lições em skills min_severity: "warning" # Severidade mínima para converter max_skills_per_run: 3 # Máximo de novas skills por execução do pipeline # === Bridge OpenClaw === openclaw_bridge: use_cron: false # Execuções de pesquisa agendadas use_message: false # Notificações de progresso use_memory: false # Persistência de conhecimento entre sessões use_sessions_spawn: false # Criar sub-sessões paralelas use_web_fetch: false # Busca web ao vivo use_browser: false # Coleta de artigos baseada em navegador ```
--- ## 🙏 Agradecimentos Inspirado por: - 🔬 [AI Scientist](https://github.com/SakanaAI/AI-Scientist) (Sakana AI) — Pioneiro em pesquisa automatizada - 🧠 [AutoResearch](https://github.com/karpathy/autoresearch) (Andrej Karpathy) — Automação de pesquisa de ponta a ponta - 🌐 [FARS](https://analemma.ai/blog/introducing-fars/) (Analemma) — Fully Automated Research System --- ## 📄 Licença MIT — veja [LICENSE](../LICENSE) para detalhes. --- ## 📌 Citação Se você achar o AutoResearchClaw útil, por favor cite: ```bibtex @misc{liu2026autoresearchclaw, author = {Liu, Jiaqi and Xia, Peng and Han, Siwei and Qiu, Shi and Zhang, Letian and Chen, Guiming and Tu, Haoqin and Yang, Xinyu and Zhou, Jiawei and Zhu, Hongtu and Li, Yun and Zhou, Yuyin and Zheng, Zeyu and Xie, Cihang and Ding, Mingyu and Yao, Huaxiu}, title = {AutoResearchClaw: Fully Autonomous Research from Idea to Paper}, year = {2026}, organization = {GitHub}, url = {https://github.com/aiming-lab/AutoResearchClaw}, } ```

Construído com 🦞 pela equipe AutoResearchClaw

================================================ FILE: docs/README_RU.md ================================================

AutoResearchClaw Logo

Напишите идею. Получите статью. Полностью автономно и с самообучением.

Просто напишите OpenClaw: «Исследуй X» → готово.

AutoResearchClaw Framework

MIT License Python 3.11+ 1634 Tests Passed GitHub OpenClaw Compatible Discord

🇺🇸 English · 🇨🇳 中文 · 🇯🇵 日本語 · 🇰🇷 한국어 · 🇫🇷 Français · 🇩🇪 Deutsch · 🇪🇸 Español · 🇧🇷 Português · 🇷🇺 Русский · 🇸🇦 العربية

🏆 Галерея статей · 📖 Руководство по интеграции · 💬 Сообщество в Discord

---
Пример статьи 🏆 Галерея сгенерированных статей

8 статей в 8 областях — математика, статистика, биология, информатика, NLP, RL, компьютерное зрение, робастность — сгенерированы полностью автономно без участия человека.

Посмотреть галерею
--- > **🧪 Мы ищем тестировщиков!** Попробуйте запустить пайплайн со своей исследовательской идеей из любой области и [расскажите нам о результатах](TESTER_GUIDE.md). Ваш фидбек напрямую влияет на развитие проекта. **[→ Руководство по тестированию](TESTER_GUIDE.md)** | **[→ 中文测试指南](TESTER_GUIDE_CN.md)** | **[→ 日本語テストガイド](TESTER_GUIDE_JA.md)** --- ## 🔥 Новости - **[22.03.2026]** [v0.3.2](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.2) — **Кроссплатформенная поддержка + крупное обновление стабильности** — AutoResearchClaw теперь работает с любым ACP-совместимым агентом (Claude Code, Codex CLI, Copilot CLI, Gemini CLI, Kimi CLI) и поддерживает мессенджеры (Discord, Telegram, Lark, WeChat) через мост OpenClaw. Новый CLI-agent бэкенд генерации кода делегирует Stage 10 и 13 внешним CLI-агентам с контролем бюджета и управлением таймаутами. Включает систему защиты от фабрикации (VerifiedRegistry + цикл диагностики и ремонта экспериментов), 100+ исправлений багов, модульный рефакторинг executor, автоопределение `--resume`, усиление повторов LLM и исправления от сообщества. - **[18.03.2026]** [v0.3.1](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.1) — **OpenCode Beast Mode + Контрибьюты сообщества** — Новый режим "Beast Mode" перенаправляет сложную генерацию кода в [OpenCode](https://github.com/anomalyco/opencode) с автоматической оценкой сложности и безопасным фоллбэком. Добавлена поддержка провайдера Novita AI, улучшена потокобезопасность, повышена надежность парсинга ответов LLM, а также исправлено более 20 багов благодаря PR от сообщества и внутреннему аудиту. - **[17.03.2026]** [v0.3.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.3.0) — **Интеграция с MetaClaw** — AutoResearchClaw теперь поддерживает кросс-сессионное обучение через [MetaClaw](https://github.com/aiming-lab/MetaClaw): ошибки пайплайна → структурированные уроки → переиспользуемые навыки, которые внедряются во все 23 этапа. 
Робастность в контролируемых экспериментах выросла на **+18.3%**. Фича опциональна (`metaclaw_bridge.enabled: true`) и полностью обратно совместима. См. [Руководство по интеграции](#-интеграция-с-metaclaw). - **[16.03.2026]** [v0.2.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.2.0) — Три мультиагентные подсистемы (CodeAgent, BenchmarkAgent, FigureAgent), защищенная Docker-песочница с поддержкой сетевых политик, 4-этапный аудит качества статьи (поиск ИИ-галлюцинаций, оценка по 7 критериям, чек-лист NeurIPS) и более 15 исправлений багов с продакшена. - **[15.03.2026]** [v0.1.0](https://github.com/aiming-lab/AutoResearchClaw/releases/tag/v0.1.0) — Релиз AutoResearchClaw: полностью автономный исследовательский пайплайн из 23 этапов, который превращает одну идею в готовую для конференции статью. Без вмешательства человека. --- ## ⚡ Одна команда. Одна статья. ```bash pip install -e . && researchclaw setup && researchclaw init && researchclaw run --topic "Ваша исследовательская идея" --auto-approve ``` --- ## 🤔 Что это такое? **Вы придумываете. AutoResearchClaw пишет.** Задайте тему исследования — и получите полноценную академическую статью с реальным обзором литературы из OpenAlex, Semantic Scholar и arXiv, экспериментами в песочнице с учетом вашего железа (автоопределение GPU/MPS/CPU), статистическим анализом, мультиагентным рецензированием и готовым LaTeX-кодом для конференций NeurIPS/ICML/ICLR. Никакого ручного контроля. Никакого копипаста. Никаких выдуманных ссылок.
📄paper_draft.mdПолная академическая статья (Введение, Обзор литературы, Метод, Эксперименты, Результаты, Заключение)
📐paper.texГотовый LaTeX-код (шаблоны NeurIPS / ICLR / ICML)
📚references.bibРеальные BibTeX-ссылки из OpenAlex, Semantic Scholar и arXiv — автоматически отфильтрованные под цитаты в тексте
🔍verification_report.json4-уровневая проверка целостности и релевантности цитирования (arXiv, CrossRef, DataCite, LLM)
🧪experiment runs/Сгенерированный код + результаты из песочницы + структурированные JSON-метрики
📊charts/Автоматически сгенерированные графики сравнения с планками погрешностей и доверительными интервалами
📝reviews.mdМультиагентное рецензирование с проверкой согласованности методологии и результатов
🧬evolution/Уроки для самообучения, извлеченные из каждого запуска
📦deliverables/Все итоговые материалы в одной папке — готовы к загрузке в Overleaf
Пайплайн работает **от начала до конца без вмешательства человека**. Если эксперименты падают — он чинит код. Если гипотезы не подтверждаются — он меняет направление. Если цитаты оказываются фейковыми — он их удаляет. --- ## 🚀 Быстрый старт ```bash # 1. Клонируйте и установите git clone https://github.com/aiming-lab/AutoResearchClaw.git cd AutoResearchClaw python3 -m venv .venv && source .venv/bin/activate pip install -e . # 2. Настройка (интерактивная — устанавливает OpenCode beast mode, проверяет Docker/LaTeX) researchclaw setup # 3. Конфигурация researchclaw init # Интерактивный режим: выбор провайдера LLM, создание config.arc.yaml # Или вручную: cp config.researchclaw.example.yaml config.arc.yaml # 4. Запуск export OPENAI_API_KEY="sk-..." researchclaw run --config config.arc.yaml --topic "Ваша исследовательская идея" --auto-approve ``` Результаты → `artifacts/rc-YYYYMMDD-HHMMSS-/deliverables/` — готовые к компиляции LaTeX, BibTeX, код экспериментов, графики.
📝 Минимальная конфигурация ```yaml project: name: "my-research" research: topic: "Ваша тема исследования" llm: base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" primary_model: "gpt-4o" fallback_models: ["gpt-4o-mini"] experiment: mode: "sandbox" sandbox: python_path: ".venv/bin/python" ```
--- ## 🧠 В чем отличие от других | Фича | Как это работает | |-----------|-------------| | **🔄 Цикл PIVOT / REFINE** | На 15-м этапе система автономно решает: ПРОДОЛЖИТЬ, УЛУЧШИТЬ (подобрать параметры) или СМЕНИТЬ КУРС (PIVOT). Артефакты версионируются автоматически. | | **🤖 Мультиагентные дебаты** | Генерация гипотез, анализ результатов и рецензирование проходят в формате структурированных дебатов с разных точек зрения. | | **🧬 Самообучение** | Из каждого запуска извлекаются уроки (обоснование решений, ошибки в коде, аномалии в метриках) с периодом полураспада в 30 дней. Будущие запуски учатся на прошлых ошибках. | | **📚 База знаний** | Каждый запуск пополняет структурированную базу знаний по 6 категориям (решения, эксперименты, находки, литература, вопросы, рецензии). | | **🛡️ Сторожевой модуль Sentinel** | Фоновый мониторинг качества: обнаружение NaN/Inf, проверка соответствия текста статьи реальным данным, оценка релевантности цитат, защита от фабрикации фактов. | --- ## 🦞 Интеграция с OpenClaw **AutoResearchClaw полностью совместим с [OpenClaw](https://github.com/openclaw/openclaw).** Установите его в OpenClaw и запускайте автономные исследования одним сообщением — или используйте отдельно через CLI, Claude Code или любой другой ИИ-ассистент.
### 🚀 Использование с OpenClaw (Рекомендуется) Если вы уже используете [OpenClaw](https://github.com/openclaw/openclaw) как своего ИИ-ассистента: ``` 1️⃣ Отправьте URL репозитория в OpenClaw 2️⃣ OpenClaw автоматически прочитает RESEARCHCLAW_AGENTS.md → поймет структуру пайплайна 3️⃣ Напишите: "Проведи исследование на тему [ваша тема]" 4️⃣ Готово — OpenClaw сам склонирует, установит, настроит, запустит и вернет результаты ``` **Вот и всё.** OpenClaw берет на себя `git clone`, `pip install`, настройку конфигов и запуск пайплайна. Вы просто общаетесь в чате.
💡 Что происходит под капотом 1. OpenClaw читает `RESEARCHCLAW_AGENTS.md` → принимает на себя роль исследовательского оркестратора 2. OpenClaw читает `README.md` → понимает процесс установки и структуру пайплайна 3. OpenClaw копирует `config.researchclaw.example.yaml` → `config.yaml` 4. Запрашивает ваш API-ключ (или использует переменную окружения) 5. Выполняет `pip install -e .` + `researchclaw run --topic "..." --auto-approve` 6. Возвращает готовую статью, LaTeX, код экспериментов и список литературы
### 🔌 Мост OpenClaw (Продвинутый уровень) Для более глубокой интеграции в AutoResearchClaw встроена **система адаптеров** с 6 опциональными возможностями: ```yaml # config.arc.yaml openclaw_bridge: use_cron: true # ⏰ Запуск исследований по расписанию use_message: true # 💬 Уведомления о прогрессе (Discord/Slack/Telegram) use_memory: true # 🧠 Сохранение знаний между сессиями use_sessions_spawn: true # 🔀 Запуск параллельных подсессий для независимых этапов use_web_fetch: true # 🌐 Поиск в интернете в реальном времени при обзоре литературы use_browser: false # 🖥️ Сбор статей через браузер ``` Каждый флаг активирует типизированный протокол адаптера. Если OpenClaw поддерживает эти функции, адаптеры используют их без изменения кода. Подробности см. в [`docs/integration-guide.md`](docs/integration-guide.md). ### ACP (Agent Client Protocol) AutoResearchClaw может использовать **любого ACP-совместимого агента** в качестве LLM-бэкенда — API-ключи не требуются. Агент общается через [acpx](https://github.com/openclaw/acpx), поддерживая единую сессию на протяжении всех 23 этапов. | Агент | Команда | Примечания | |-------|---------|-------| | Claude Code | `claude` | Anthropic | | Codex CLI | `codex` | OpenAI | | Copilot CLI | `gh` | GitHub | | Gemini CLI | `gemini` | Google | | OpenCode | `opencode` | SST | | Kimi CLI | `kimi` | Moonshot | ```yaml # config.yaml — пример ACP llm: provider: "acp" acp: agent: "claude" # Любая команда CLI ACP-совместимого агента cwd: "." # Рабочая директория для агента # base_url и api_key не нужны — агент сам управляет авторизацией. ``` ```bash # Просто запускайте — агент использует свои собственные учетные данные researchclaw run --config config.yaml --topic "Ваша идея" --auto-approve ``` ### 🛠️ Другие способы запуска | Способ | Как запустить | |--------|-----| | **CLI** | `researchclaw setup` → `researchclaw init` → `researchclaw run --topic "..." 
--auto-approve` | | **Python API** | `from researchclaw.pipeline import Runner; Runner(config).run()` | | **Claude Code** | Читает `RESEARCHCLAW_CLAUDE.md` — просто напишите *"Run research on [topic]"* | | **Copilot CLI** | `researchclaw run --topic "..."` с `llm.acp.agent: "gh"` | | **OpenCode** | Читает `.claude/skills/` — такой же интерфейс на естественном языке | | **Любой AI CLI** | Скормите `RESEARCHCLAW_AGENTS.md` в контекст → агент сам поймет, что делать | --- ## 🔬 Пайплайн: 23 этапа, 8 фаз ``` Фаза A: Определение области Фаза E: Выполнение экспериментов 1. TOPIC_INIT 12. EXPERIMENT_RUN 2. PROBLEM_DECOMPOSE 13. ITERATIVE_REFINE ← самовосстановление Фаза B: Поиск литературы Фаза F: Анализ и принятие решений 3. SEARCH_STRATEGY 14. RESULT_ANALYSIS ← мультиагентный анализ 4. LITERATURE_COLLECT ← API 15. RESEARCH_DECISION ← PIVOT/REFINE 5. LITERATURE_SCREEN [гейт] 6. KNOWLEDGE_EXTRACT Фаза G: Написание статьи 16. PAPER_OUTLINE Фаза C: Синтез знаний 17. PAPER_DRAFT 7. SYNTHESIS 18. PEER_REVIEW ← проверка доказательств 8. HYPOTHESIS_GEN ← дебаты 19. PAPER_REVISION Фаза D: Дизайн экспериментов Фаза H: Финализация 9. EXPERIMENT_DESIGN [гейт] 20. QUALITY_GATE [гейт] 10. CODE_GENERATION 21. KNOWLEDGE_ARCHIVE 11. RESOURCE_PLANNING 22. EXPORT_PUBLISH ← LaTeX 23. CITATION_VERIFY ← проверка релевантности ``` > **Гейты (Контрольные точки)** (5, 9, 20) ставят пайплайн на паузу для апрува человеком (или пропускаются флагом `--auto-approve`). При отклонении пайплайн откатывается назад. > **Циклы принятия решений**: На 15-м этапе система может уйти на доработку (REFINE → Этап 13) или сменить курс (PIVOT → Этап 8), автоматически сохраняя версии артефактов.
📋 Что происходит на каждой фазе | Фаза | Описание | |-------|-------------| | **A: Определение области** | LLM разбивает тему на структурированное дерево проблем с исследовательскими вопросами. | | **A+: Железо** | Автоопределение GPU (NVIDIA CUDA / Apple MPS / CPU), предупреждения о нехватке ресурсов, адаптация генерации кода под доступное железо. | | **B: Литература** | Поиск по нескольким базам (OpenAlex → Semantic Scholar → arXiv) реальных статей, фильтрация по релевантности, извлечение карточек знаний. | | **C: Синтез** | Кластеризация находок, поиск пробелов в исследованиях, генерация проверяемых гипотез через мультиагентные дебаты. | | **D: Дизайн** | Проектирование плана экспериментов, генерация Python-кода с учетом железа (выбор пакетов под GPU), оценка требуемых ресурсов. | | **E: Выполнение** | Запуск экспериментов в песочнице, отлов NaN/Inf и багов в рантайме, самовосстановление кода через LLM. | | **F: Анализ** | Мультиагентный анализ результатов; автономное решение ПРОДОЛЖИТЬ / УЛУЧШИТЬ / СМЕНИТЬ КУРС с подробным обоснованием. | | **G: Написание** | План → написание по разделам (5,000-6,500 слов) → рецензирование (с проверкой соответствия методологии и результатов) → редактура с контролем объема. | | **H: Финализация** | Контроль качества, архивация знаний, экспорт в LaTeX по шаблонам конференций, проверка целостности и релевантности цитат. |
--- ## ✨ Ключевые фичи | Фича | Описание | |---------|------------| | **📚 Мультиисточниковая литература** | Реальные статьи из OpenAlex, Semantic Scholar и arXiv — расширение запросов, дедупликация, защита от падений API с постепенной деградацией. | | **🔍 4-уровневая проверка цитат** | Проверка arXiv ID → CrossRef/DataCite DOI → совпадение заголовков в Semantic Scholar → оценка релевантности через LLM. Выдуманные ссылки удаляются автоматически. | | **🖥️ Адаптация под железо** | Автоопределение GPU (NVIDIA CUDA / Apple MPS / CPU) и адаптация генерации кода, импортов и масштаба экспериментов. | | **🦾 OpenCode Beast Mode** | Сложные эксперименты автоматически перенаправляются в [OpenCode](https://github.com/anomalyco/opencode) — генерация многофайловых проектов с кастомными архитектурами, циклами обучения и ablation studies. Устанавливается через `researchclaw setup`. | | **🧪 Эксперименты в песочнице** | Валидация кода через AST, неизменяемая обвязка, быстрый отказ при NaN/Inf, самовосстановление, итеративное улучшение (до 10 раундов), сохранение частичных результатов. | | **📝 Написание уровня конференций** | Шаблоны NeurIPS/ICML/ICLR, написание по разделам (5,000-6,500 слов), защита от выдуманных фактов, контроль объема при редактуре, удаление типичных ИИ-оговорок. | | **📐 Переключение шаблонов** | `neurips_2025`, `iclr_2026`, `icml_2026` — Markdown → LaTeX с формулами, таблицами, графиками, перекрестными ссылками и `\cite{}`. | | **🚦 Гейты качества** | 3 точки контроля человеком (Этапы 5, 9, 20) с возможностью отката. Можно пропустить флагом `--auto-approve`. | --- ## 🧠 Интеграция с MetaClaw **AutoResearchClaw + [MetaClaw](https://github.com/aiming-lab/MetaClaw) = Пайплайн, который учится на каждом запуске.** MetaClaw добавляет **перенос знаний между запусками**. 
Если эта функция включена, пайплайн автоматически извлекает уроки из ошибок и предупреждений, превращает их в переиспользуемые навыки и внедряет во все 23 этапа при следующих запусках — чтобы больше никогда не повторять одни и те же ошибки. ### Как это работает ``` Запуск N выполняется → ошибки/предупреждения сохраняются как Уроки (Lessons) ↓ MetaClaw конвертирует Урок → Навык (Skill) ↓ Файлы навыков arc-* сохраняются в ~/.metaclaw/skills/ ↓ Запуск N+1 → build_overlay() внедряет навыки в каждый промпт LLM ↓ LLM избегает известных ошибок → выше качество, меньше ретраев ``` ### Быстрая настройка ```bash # 1. Установите MetaClaw (если еще не установлен) pip install metaclaw # 2. Включите в конфиге ``` ```yaml # config.arc.yaml metaclaw_bridge: enabled: true proxy_url: "http://localhost:30000" # Прокси MetaClaw (опционально) skills_dir: "~/.metaclaw/skills" # Папка для хранения навыков fallback_url: "https://api.openai.com/v1" # Прямой фоллбэк к LLM fallback_api_key: "" # API-ключ для фоллбэка lesson_to_skill: enabled: true min_severity: "warning" # Конвертировать предупреждения и ошибки max_skills_per_run: 3 ``` ```bash # 3. Запускайте как обычно — MetaClaw работает прозрачно researchclaw run --config config.arc.yaml --topic "Ваша идея" --auto-approve ``` После каждого запуска заглядывайте в `~/.metaclaw/skills/arc-*/SKILL.md`, чтобы посмотреть, чему научился ваш пайплайн. ### Результаты экспериментов В контролируемых A/B тестах (одна тема, одна LLM, один конфиг): | Метрика | База | С MetaClaw | Улучшение | |--------|----------|---------------|-------------| | Частота ретраев на этапах | 10.5% | 7.9% | **-24.8%** | | Количество циклов доработки (Refine) | 2.0 | 1.2 | **-40.0%** | | Успешное завершение пайплайна | 18/19 | 19/19 | **+5.3%** | | Общий индекс робастности (композитный) | 0.714 | 0.845 | **+18.3%** | > Композитный индекс робастности — это взвешенное среднее из процента завершения (40%), снижения ретраев (30%) и эффективности циклов доработки (30%). 
### Обратная совместимость - **По умолчанию: ВЫКЛЮЧЕНО.** Если блока `metaclaw_bridge` нет или `enabled: false`, пайплайн работает как раньше. - **Никаких новых зависимостей.** MetaClaw опционален — ядро работает и без него. - **Все 1 935 тестов проходят успешно** даже с кодом интеграции. --- ## ⚙️ Справочник по конфигурации
Нажмите, чтобы развернуть полный конфиг ```yaml # === Проект === project: name: "my-research" # Идентификатор проекта mode: "docs-first" # docs-first | semi-auto | full-auto # === Исследование === research: topic: "..." # Тема исследования (обязательно) domains: ["ml", "nlp"] # Домены для поиска литературы daily_paper_count: 8 # Целевое количество статей на один запрос quality_threshold: 4.0 # Минимальный порог качества для статей # === Рантайм === runtime: timezone: "Europe/Moscow" # Для таймстемпов max_parallel_tasks: 3 # Лимит параллельных экспериментов approval_timeout_hours: 12 # Таймаут ожидания на гейтах retry_limit: 2 # Количество ретраев при падении этапа # === LLM === llm: provider: "openai-compatible" # openai | openrouter | deepseek | minimax | acp | openai-compatible base_url: "https://..." # API endpoint (обязательно для openai-compatible) api_key_env: "OPENAI_API_KEY" # Переменная окружения с ключом (обязательно для openai-compatible) api_key: "" # Или можно захардкодить ключ здесь primary_model: "gpt-4o" # Основная модель fallback_models: ["gpt-4o-mini"] # Цепочка фоллбэков s2_api_key: "" # API-ключ Semantic Scholar (опционально, дает лимиты выше) acp: # Используется только если provider: "acp" agent: "claude" # Команда CLI ACP-агента (claude, codex, gemini и т.д.) cwd: "." # Рабочая директория агента # === Эксперименты === experiment: mode: "sandbox" # simulated | sandbox | docker | ssh_remote time_budget_sec: 300 # Макс. время на один запуск (по умолчанию: 300с) max_iterations: 10 # Макс. 
количество итераций оптимизации metric_key: "val_loss" # Название главной метрики metric_direction: "minimize" # minimize | maximize sandbox: python_path: ".venv/bin/python" gpu_required: false allowed_imports: [math, random, json, csv, numpy, torch, sklearn] max_memory_mb: 4096 docker: image: "researchclaw/experiment:latest" network_policy: "setup_only" # none | setup_only | pip_only | full gpu_enabled: true memory_limit_mb: 8192 auto_install_deps: true # Автоопределение импортов → requirements.txt ssh_remote: host: "" # Хостнейм GPU-сервера gpu_ids: [] # Доступные ID видеокарт remote_workdir: "/tmp/researchclaw_experiments" opencode: # OpenCode Beast Mode (устанавливается через `researchclaw setup`) enabled: true # Главный рубильник (по умолчанию: true) auto: true # Автозапуск без подтверждения (по умолчанию: true) complexity_threshold: 0.2 # 0.0-1.0 — чем выше, тем реже триггерится (только на сложных задачах) model: "" # Переопределение модели (пусто = использовать llm.primary_model) timeout_sec: 600 # Макс. 
время на генерацию в OpenCode max_retries: 1 # Количество ретраев при падении workspace_cleanup: true # Удалять временный воркспейс после сбора результатов # === Экспорт === export: target_conference: "neurips_2025" # neurips_2025 | iclr_2026 | icml_2026 authors: "Anonymous" bib_file: "references" # === Промпты === prompts: custom_file: "" # Путь к кастомному YAML с промптами (пусто = дефолтные) # === Безопасность === security: hitl_required_stages: [5, 9, 20] # Этапы, требующие апрува человеком (Human-in-the-loop) allow_publish_without_approval: false redact_sensitive_logs: true # === База знаний === knowledge_base: backend: "markdown" # markdown | obsidian root: "docs/kb" # === Уведомления === notifications: channel: "console" # console | discord | slack target: "" # === Мост MetaClaw (Опционально) === metaclaw_bridge: enabled: false # Включить кросс-сессионное обучение proxy_url: "http://localhost:30000" # URL прокси MetaClaw skills_dir: "~/.metaclaw/skills" # Папка для хранения навыков arc-* fallback_url: "" # Прямой фоллбэк к LLM, если прокси лежит fallback_api_key: "" # API-ключ для фоллбэка lesson_to_skill: enabled: true # Автоматически конвертировать уроки в навыки min_severity: "warning" # Минимальная серьезность для конвертации max_skills_per_run: 3 # Макс. количество новых навыков за один запуск # === Мост OpenClaw === openclaw_bridge: use_cron: false # Запуск исследований по расписанию use_message: false # Уведомления о прогрессе use_memory: false # Сохранение знаний между сессиями use_sessions_spawn: false # Запуск параллельных подсессий use_web_fetch: false # Поиск в интернете в реальном времени use_browser: false # Сбор статей через браузер ```
--- ## 🙏 Благодарности Вдохновлено проектами: - 🔬 [AI Scientist](https://github.com/SakanaAI/AI-Scientist) (Sakana AI) — Пионер автоматизированных исследований - 🧠 [AutoResearch](https://github.com/karpathy/autoresearch) (Andrej Karpathy) — Сквозная автоматизация исследований - 🌐 [FARS](https://analemma.ai/blog/introducing-fars/) (Analemma) — Полностью автоматизированная исследовательская система --- ## 📄 Лицензия MIT — подробности см. в [LICENSE](../LICENSE). --- ## 📌 Цитирование Если AutoResearchClaw оказался вам полезен, пожалуйста, процитируйте: ```bibtex @misc{liu2026autoresearchclaw, author = {Liu, Jiaqi and Xia, Peng and Han, Siwei and Qiu, Shi and Zhang, Letian and Chen, Guiming and Tu, Haoqin and Yang, Xinyu and Zhou, Jiawei and Zhu, Hongtu and Li, Yun and Zhou, Yuyin and Zheng, Zeyu and Xie, Cihang and Ding, Mingyu and Yao, Huaxiu}, title = {AutoResearchClaw: Fully Autonomous Research from Idea to Paper}, year = {2026}, organization = {GitHub}, url = {https://github.com/aiming-lab/AutoResearchClaw}, } ```

Создано с 🦞 командой AutoResearchClaw

================================================ FILE: docs/TESTER_GUIDE.md ================================================

AutoResearchClaw Logo

🧪 Community Testing Guide

Help us stress-test the world's first fully autonomous research pipeline — across every domain.

⭐ Star the Repo · 🚀 Quick Start · 📋 Feedback Template · 🇨🇳 中文测试指南 · 🇯🇵 日本語テストガイド

--- ## 👋 Welcome, Tester! **AutoResearchClaw** is a fully autonomous academic paper generation pipeline. You give it a research idea — it handles everything else: literature search, experiment design, code generation, experiment execution, paper writing, peer review, and final delivery. **23 stages, zero human intervention.** We're looking for testers from **all disciplines and backgrounds** — machine learning, NLP, computer vision, reinforcement learning, bioinformatics, physics, social sciences, and beyond. The more diverse the testing, the better the pipeline becomes. **Your mission:** Run the pipeline with your own research idea, inspect the output, and submit a detailed feedback report. That's it. Every piece of feedback directly shapes the next version. --- ## 📋 Table of Contents 1. [Prerequisites](#-prerequisites) 2. [Installation & Setup](#-installation--setup) 3. [Running the Pipeline](#-running-the-pipeline) 4. [Inspecting the Output](#-inspecting-the-output) 5. [Feedback Report Requirements](#-feedback-report-requirements) 6. [Feedback Template](#-feedback-template) 7. [FAQ](#-faq) --- ## 📦 Prerequisites | Item | Minimum | Recommended | |------|---------|-------------| | OS | macOS / Linux / WSL2 | Linux (Ubuntu 22.04+) | | Python | 3.11+ | 3.11 or 3.12 | | Disk | 500 MB | 2 GB+ | | RAM | 8 GB | 16 GB+ | | GPU | Not required (sandbox mode) | NVIDIA GPU + CUDA 12.x (docker mode) | | Network | Required (LLM API + literature search) | Stable connection | | LLM API Key | **Required** | OpenAI or Anthropic | ### 🔑 About API Keys The pipeline calls a large language model (LLM) at every stage — writing, coding, reviewing, and more. You'll need an API key from **OpenAI** or **Anthropic**. 
> **We strongly recommend using the most capable models available for the best results:** > > | Provider | Recommended Model | Fallback | > |----------|------------------|----------| > | **OpenAI** | **GPT-5.4** (best) | GPT-5.1 or GPT-4.1 | > | **Anthropic** | **Claude Opus 4.6** (best) | Claude Sonnet 4.6 | > > Using a top-tier model significantly improves paper quality, code correctness, and experiment design. Older models (e.g., GPT-4o) may produce noticeably weaker output. --- ## 🛠 Installation & Setup ### ⚠️ Always Use the Latest Version > **This project is under active development.** The codebase is updated frequently, and different versions can produce significantly different results. > > **Before every test run, always pull the latest code:** > > ```bash > cd AutoResearchClaw > git pull origin main > pip install -e . # Re-install to pick up changes > ``` > > Record your version for the feedback report: > ```bash > git log --oneline -1 > ``` --- ### Option A: Claude Code (Fastest — Recommended ⚡) If you have [Claude Code](https://claude.ai/claude-code) (Anthropic's CLI tool), just paste this: ``` Please clone and install AutoResearchClaw: https://github.com/aiming-lab/AutoResearchClaw.git If already cloned, run git pull origin main to update to the latest version first. Then create a config file with: - LLM: OpenAI with gpt-5.4 (or Anthropic Claude Opus 4.6) - Experiment mode: sandbox (local execution) - Research topic: "" - Auto-approve all gate stages My API key is: sk-xxxx (set it as an environment variable, don't hardcode it) ``` Claude Code will handle cloning, dependencies, configuration, and execution automatically. ### Option B: Manual Installation ```bash # 1. Clone the repo git clone https://github.com/aiming-lab/AutoResearchClaw.git cd AutoResearchClaw # 2. Create a virtual environment python3 -m venv .venv source .venv/bin/activate # macOS / Linux # .venv\Scripts\activate # Windows (prefer WSL2) # 3. Install pip install -e . # 4. 
Verify researchclaw --help ``` ### ⚙️ Configuration ```bash cp config.researchclaw.example.yaml config.yaml ``` Edit `config.yaml` — here are the key fields: ```yaml # === Project === project: name: "my-test" mode: "full-auto" # === Research Topic — describe your idea in English === research: topic: "Your research idea in 1-2 sentences" domains: - "machine-learning" # Options: nlp, cv, rl, graph-learning, etc. # === LLM — use the strongest model you have access to! === # # Option 1: OpenAI (GPT-5.4 recommended) llm: provider: "openai-compatible" base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" primary_model: "gpt-5.4" # Best available fallback_models: - "gpt-5.1" - "gpt-4.1" # Option 2: Anthropic Claude (Claude Opus 4.6 recommended) # llm: # provider: "openai-compatible" # base_url: "https://api.anthropic.com/v1" # api_key_env: "ANTHROPIC_API_KEY" # primary_model: "claude-opus-4-6" # fallback_models: # - "claude-sonnet-4-6" # === Experiment === experiment: mode: "sandbox" # sandbox = local execution (recommended) time_budget_sec: 600 # Max seconds per experiment run max_iterations: 10 metric_key: "primary_metric" metric_direction: "minimize" # or "maximize" ``` ### 🔐 Set Your API Key ```bash # OpenAI users: export OPENAI_API_KEY="sk-xxxxxxxxxxxxxxxxxxxxxxxx" # Anthropic users: export ANTHROPIC_API_KEY="sk-ant-xxxxxxxxxxxxxxxxxxxxxxxx" # Optional: Semantic Scholar API key (speeds up literature search) export S2_API_KEY="your-s2-key" ``` > **🔒 Security:** Never hardcode API keys in files. Use `api_key_env` in the config to reference an environment variable. 
--- ## 🚀 Running the Pipeline ### Quick Start ```bash source .venv/bin/activate export OPENAI_API_KEY="sk-xxxx" # or ANTHROPIC_API_KEY researchclaw run --config config.yaml --auto-approve ``` ### With a Specific Topic ```bash researchclaw run \ --config config.yaml \ --topic "Investigating the effect of curriculum learning on image classification with adaptive difficulty scheduling" \ --auto-approve ``` ### ⏱ Expected Runtime | Mode | Estimated Time | Notes | |------|---------------|-------| | sandbox | 30 min – 2 hours | Depends on experiment complexity & API speed | | docker (GPU) | 1 – 4 hours | For heavier deep learning experiments | The terminal shows real-time progress. **No manual intervention needed** — sit back and let it run. ### ✅ How to Know It's Done You'll see output like: ``` [Stage 23/23] ✓ Deliverables packaged Pipeline complete — deliverables at: artifacts/rc-20260315-XXXXXX-YYYY/deliverables/ ``` ### 🔄 If It Gets Interrupted The pipeline supports checkpointing — just resume: ```bash researchclaw run --config config.yaml --resume ``` --- ## 🔍 Inspecting the Output After completion, find your results in `artifacts/rc-YYYYMMDD-HHMMSS-/deliverables/`. ### 📂 Deliverables | File / Directory | Description | |-----------------|-------------| | `paper_final.md` | Final paper in Markdown (5,000–6,500 words) | | `paper.tex` | Conference-ready LaTeX source (directly compilable) | | `references.bib` | BibTeX bibliography (verified citations) | | `code/main.py` | Auto-generated experiment code | | `code/requirements.txt` | Python dependencies for experiments | | `charts/` | Result visualization charts (PNG) | | `verification_report.json` | Citation integrity verification report | | `manifest.json` | Deliverable manifest with metadata | ### 🔎 What to Check 1. **Paper Content** (`paper_final.md` or `paper.tex`) - Is the title relevant to the topic? - Does the abstract clearly state problem, method, and results? - Does Related Work cite key papers in the field? 
- Is the method description technically correct? - Is the experiment design sound (datasets, baselines, metrics)? - Are results meaningful (not all zeros, not NaN)? - Are conclusions consistent with experimental findings? 2. **Experiment Code** (`code/main.py`) - Can it run independently? - Does it use real datasets (not randomly generated fake data)? - Does it implement what the paper describes? - Are hyperparameters reasonable? 3. **Charts** (`charts/`) - Are they readable and clean? - Are axis labels correct? - Does the data match the paper's claims? 4. **References** (`references.bib`) - Do the cited papers actually exist? - Are citations relevant to the discussion? ### 📊 Auto-Generated Quality Report The pipeline produces a quality assessment at `stage-20/quality_report.json` containing: - `score_1_to_10` — automated quality score - `verdict` — accept / reject recommendation - `strengths` — what went well - `weaknesses` — identified issues - `required_actions` — suggested improvements Please reference this in your feedback, and add your own expert judgment. --- ## 📝 Feedback Report Requirements **Your feedback is the single most important input for improving this project.** Please be thorough and honest — critical feedback is just as valuable as praise. ### What to Submit | # | Item | Details | |---|------|---------| | F1 | **Feedback Report** (use template below) | Markdown format, named `feedback_.md` | | F2 | **Full Output Directory** | Zip the entire `artifacts/rc-XXXXXX/` directory | | F3 | **Config File** | Your `config.yaml` (**remove API keys first!**) | | F4 | **Terminal Log** (optional but helpful) | Copy of the terminal output during the run | ### The Four Dimensions of Feedback #### 🎯 (a) Quality Assessment From your domain expertise: - If this were a paper in your field, what level would it reach? (top venue / mid-tier / workshop / unpublishable) - How does the writing compare to papers you normally read? - Is the method technically correct? 
Any obvious errors? - Is the experiment design reasonable? #### 💡 (b) Improvement Suggestions - Which stage produced the weakest output? (literature search / experiment design / code generation / paper writing) - Any obvious code errors or poor design choices? - Specific suggestions for improving the paper structure or writing? #### ⚖️ (c) Pipeline Design Assessment - Are the 23 stages well-designed? Any redundant or missing steps? - Is the iterative experiment refinement effective? - Is the LLM guidance at each stage appropriate? #### 🐛 (d) Bug Reports Please report any issues you find, as specifically as possible: - **Writing bugs:** grammar errors, repeated paragraphs, contradictions, references to non-existent figures - **Code bugs:** runtime errors, logic errors, data handling issues - **Result bugs:** all-zero results, NaN values, unreasonable metrics - **Pipeline bugs:** stages getting stuck, unexpected crashes, resource exhaustion --- ## 📋 Feedback Template Copy the template below, fill it out, and save as `feedback_.md`: ````markdown # AutoResearchClaw — Test Feedback Report ## Basic Information - **Tester Name:** - **Domain / Field:** (e.g., Computer Vision / NLP / Reinforcement Learning / Bioinformatics / ...) - **Test Date:** - **Code Version:** (output of `git log --oneline -1`, e.g., `44151b1 fix: Phase 3 regression test findings`) - **Research Topic (English):** - **LLM Model Used:** (e.g., gpt-5.4 / gpt-5.1 / claude-opus-4-6 / claude-sonnet-4-6) - **Experiment Mode:** (sandbox / docker) - **Total Runtime:** (~X minutes) - **Completed All 23 Stages?:** Yes / No (if No, which stage failed?) --- ## 1. Quality Assessment (Score: 1–10) **My Score:** X / 10 ### 1.1 Overall Paper Quality - What level paper does this correspond to? 
(top venue / mid-tier / workshop / unpublishable) - Reason for score: ### 1.2 Section-by-Section Assessment | Section | Score (1-10) | Comments | |---------|-------------|----------| | Title | | | | Abstract | | | | Introduction | | | | Related Work | | | | Method | | | | Experiment Design | | | | Results & Analysis | | | | Conclusion | | | | References | | | | Charts / Figures | | | | Code Quality | | | ### 1.3 Comparison with Human-Written Papers - Compared to papers you normally read/write, where are the gaps? - Anything surprisingly good? --- ## 2. Improvement Suggestions ### 2.1 Top Issues (list 3-5, in priority order) 1. 2. 3. ### 2.2 Code Issues - Can the code run independently? - Does it use real datasets and baselines? - Specific code issues (if any): ### 2.3 Writing Issues - Is the paper structure reasonable? - Is the technical description accurate? - Specific writing issues (if any): --- ## 3. Pipeline Design Assessment ### 3.1 Pipeline Flow - Is the 23-stage design reasonable? - Any redundant or missing steps? ### 3.2 Experiment Execution - Is the experiment design sound? (dataset choices, comparison methods, metrics) - Is the iterative refinement effective? ### 3.3 LLM Usage - How well did the LLM perform at each stage? - Any obvious "hallucinations" or unreasonable outputs? --- ## 4. 
Bug Reports ### 4.1 Writing Bugs | # | Location (section/paragraph) | Description | Severity (High/Med/Low) | |---|------------------------------|-------------|------------------------| | W1 | | | | | W2 | | | | ### 4.2 Code Bugs | # | File / Line | Description | Severity (High/Med/Low) | |---|-------------|-------------|------------------------| | C1 | | | | | C2 | | | | ### 4.3 Result Bugs | # | Description | Affected Metrics/Charts | Severity (High/Med/Low) | |---|-------------|------------------------|------------------------| | R1 | | | | | R2 | | | | ### 4.4 Pipeline Bugs | # | Stage | Description | Severity (High/Med/Low) | |---|-------|-------------|------------------------| | P1 | | | | | P2 | | | | --- ## 5. Additional Comments (Free-form: any observations, ideas, or suggestions you think would be valuable) --- ## Attachments Checklist - [ ] Feedback report (`feedback_.md`) - [ ] Full output directory (`artifacts/rc-XXXXXX.zip`) - [ ] Config file (`config.yaml`, API keys removed) - [ ] Terminal log (optional) ```` --- ## ❓ FAQ ### Q1: Can I test without a GPU? **Yes!** Use `experiment.mode: "sandbox"` — the pipeline runs experiments on your CPU. The experiments will be simpler, but still enough for a full end-to-end test. ### Q2: How much does an API call cost? A full pipeline run costs roughly **$5–15** in API fees, depending on the model, number of revision iterations, and experiment complexity. Top-tier models (GPT-5.4, Claude Opus 4.6) cost a bit more but produce significantly better results. ### Q3: What if the pipeline crashes mid-run? Resume from the checkpoint: ```bash researchclaw run --config config.yaml --resume ``` ### Q4: Can I use a non-English research topic? We recommend describing your topic in **English**. The pipeline's prompts, literature search, and paper generation are all English-based. If your idea is originally in another language, please translate it first. ### Q5: What kind of research topic should I pick? 
Choose a **specific research question in a field you know well** — that way you can meaningfully assess whether the output is technically correct. Tips: - ✅ Pick topics with clear experimental validation (classification, regression, RL tasks, etc.) - ❌ Avoid overly broad or abstract topics (e.g., "AGI", "general intelligence") - ✅ Be specific: *"Investigating the effect of data augmentation strategies on few-shot learning for medical image classification"* ### Q6: How do I use Docker mode? (Advanced) If you have an NVIDIA GPU with Docker + NVIDIA Container Toolkit: ```bash # 1. Build the experiment image docker build -t researchclaw/experiment:latest researchclaw/docker/ # 2. Update config.yaml: # experiment: # mode: "docker" # docker: # gpu_enabled: true # memory_limit_mb: 8192 # network_policy: "setup_only" # recommended default # 3. Run researchclaw run --config config.yaml --auto-approve ``` Docker mode uses a three-phase execution model: pip install (network on) → setup.py (network on) → experiment (network off). The image includes pre-cached datasets (CIFAR-10/100, MNIST, FashionMNIST, STL-10, SVHN) so standard benchmarks work without network access. ### Q7: I tested before — what should I do for a re-test? **Always pull the latest code** before each test: ```bash cd AutoResearchClaw git pull origin main pip install -e . ``` Then verify your version: ```bash git log --oneline -1 ``` Different versions can produce very different results. Always note the commit hash in your feedback report. ### Q8: Where do I submit my feedback? 
Submit your feedback report and attachments through one of these channels: - **GitHub Issues:** [Open an issue](https://github.com/aiming-lab/AutoResearchClaw/issues) with the label `feedback` - **Pull Request:** Submit your `feedback_.md` to the `community-feedback/` directory - **Email:** Contact the project maintainers (see repo for details) --- ## 🌍 We Need Testers from Every Field The pipeline has been tested primarily on ML topics so far. We especially welcome testers from: - 🧬 **Bioinformatics & Computational Biology** - 🧪 **Chemistry & Materials Science** - 📊 **Statistics & Applied Mathematics** - 🤖 **Robotics & Control Systems** - 🗣️ **NLP & Computational Linguistics** - 👁️ **Computer Vision & Graphics** - 🎮 **Reinforcement Learning & Game Theory** - 🏥 **Medical AI & Healthcare** - 🌐 **Graph Learning & Network Science** - 💹 **Financial ML & Econometrics** - 🛰️ **Remote Sensing & Geospatial AI** ...and any other field where computational experiments are involved! --- ## 🙏 Thank You Every piece of feedback — big or small — directly improves AutoResearchClaw. Thank you for being part of this journey.

⭐ If you find this project interesting, please give us a star on GitHub!

================================================ FILE: docs/TESTER_GUIDE_CN.md ================================================

AutoResearchClaw Logo

🧪 社区测试指南

欢迎来自各个领域的你,一起测试全球首个全自动学术论文生成 Pipeline。

⭐ Star 项目 · 🚀 快速开始 · 📋 反馈模板 · 🇬🇧 English · 🇯🇵 日本語テストガイド

--- ## 👋 你好,测试者! **AutoResearchClaw** 是一个全自动学术论文生成 Pipeline。你只需提供一个研究 idea,系统就会自动完成文献检索、实验设计、代码生成、实验执行、论文撰写、同行评审到最终交付的全部 **23 个阶段**——无需任何人工干预。 我们正在寻找来自**各个学科和领域**的测试者——机器学习、NLP、计算机视觉、强化学习、生物信息学、物理学、社会科学……领域越多样,Pipeline 就能变得越好。 **你的任务:** 用你自己的研究 idea 运行一次完整的 Pipeline,检查输出质量,然后向我们提交一份详细的反馈报告。就这么简单——你的每一条反馈都会直接推动下一个版本的改进。 --- ## 📋 目录 1. [环境要求](#-环境要求) 2. [安装与配置](#-安装与配置) 3. [运行测试](#-运行测试) 4. [查看交付结果](#-查看交付结果) 5. [反馈报告要求](#-反馈报告要求) 6. [反馈报告模板](#-反馈报告模板) 7. [常见问题](#-常见问题) --- ## 📦 环境要求 | 项目 | 最低要求 | 推荐配置 | |------|---------|---------| | 操作系统 | macOS / Linux / WSL2 | Linux (Ubuntu 22.04+) | | Python | 3.11+ | 3.11 或 3.12 | | 磁盘空间 | 500 MB | 2 GB+ | | 内存 | 8 GB | 16 GB+ | | GPU | 非必须(sandbox 模式) | NVIDIA GPU + CUDA 12.x(docker 模式) | | 网络 | 需要(调用 LLM API + 文献检索) | 稳定的网络连接 | | LLM API Key | **必须** | OpenAI 或 Anthropic | ### 🔑 关于 API Key Pipeline 在每个阶段都会调用大语言模型(LLM)来完成写作、编码、评审等任务。你需要准备一个 **OpenAI** 或 **Anthropic** 的 API Key。 > **强烈建议使用最新、最强的模型以获得最佳效果:** > > | 提供商 | 推荐模型 | 备选 | > |--------|---------|------| > | **OpenAI** | **GPT-5.4**(首选) | GPT-5.1 或 GPT-4.1 | > | **Anthropic** | **Claude Opus 4.6**(首选) | Claude Sonnet 4.6 | > > 使用顶级模型会显著提升论文写作质量、代码生成准确性和实验设计合理性。较低版本的模型(如 gpt-4o)可能导致输出质量明显下降。 --- ## 🛠 安装与配置 ### ⚠️ 请务必使用最新版本 > **本项目处于快速迭代阶段,** 代码更新频繁,不同版本之间的生成效果可能存在较大差异。 > > **每次测试前,请务必拉取最新代码:** > > ```bash > cd AutoResearchClaw > git pull origin main > pip install -e . # 重新安装以确保更新生效 > ``` > > 记录你的版本号,方便填写反馈报告: > ```bash > git log --oneline -1 > ``` --- ### 方式 A:使用 Claude Code(最快 ⚡ 推荐) 如果你正在使用 [Claude Code](https://claude.ai/claude-code)(Anthropic 的 CLI 工具),直接粘贴以下内容即可: ``` 请帮我克隆并安装 AutoResearchClaw 项目: https://github.com/aiming-lab/AutoResearchClaw.git 如果已经克隆过,请先 git pull origin main 更新到最新版本。 安装完成后,帮我创建一个配置文件,使用以下参数: - LLM: OpenAI,模型选择 gpt-5.4(或 Anthropic Claude Opus 4.6) - 实验模式: sandbox(本地沙盒执行) - 研究主题: "<在这里填入你的研究 idea>" - 自动审批所有 gate stage 我的 API Key 是: sk-xxxx(请设为环境变量,不要写在配置文件里) ``` Claude Code 会自动完成克隆、安装依赖、创建配置文件、运行 Pipeline 的全部步骤。 ### 方式 B:手动安装 ```bash # 1. 
克隆项目 git clone https://github.com/aiming-lab/AutoResearchClaw.git cd AutoResearchClaw # ⚠️ 如果已经克隆过,务必先更新! # git pull origin main # 2. 创建 Python 虚拟环境 python3 -m venv .venv source .venv/bin/activate # macOS / Linux # .venv\Scripts\activate # Windows(推荐使用 WSL2) # 3. 安装项目 pip install -e . # 4. 验证安装成功 researchclaw --help ``` ### ⚙️ 配置文件 ```bash cp config.researchclaw.example.yaml config.yaml ``` 编辑 `config.yaml`,修改以下关键字段: ```yaml # === 项目设置 === project: name: "my-test" mode: "full-auto" # === 研究主题——用英文描述你的 idea === research: topic: "你的研究 idea,用英文描述,一两句话即可" domains: - "machine-learning" # 可选: nlp, cv, rl, graph-learning, etc. # === LLM 配置——请使用最强模型! === # # 方案一:OpenAI(推荐 GPT-5.4) llm: provider: "openai-compatible" base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" primary_model: "gpt-5.4" # 首选最强模型 fallback_models: - "gpt-5.1" - "gpt-4.1" # 方案二:Anthropic Claude(推荐 Claude Opus 4.6) # llm: # provider: "openai-compatible" # base_url: "https://api.anthropic.com/v1" # api_key_env: "ANTHROPIC_API_KEY" # primary_model: "claude-opus-4-6" # fallback_models: # - "claude-sonnet-4-6" # === 实验模式 === experiment: mode: "sandbox" # sandbox = 本地执行(推荐) time_budget_sec: 600 # 每次实验最长运行时间(秒) max_iterations: 10 metric_key: "primary_metric" metric_direction: "minimize" # 或 "maximize" ``` ### 🔐 设置 API Key ```bash # OpenAI 用户: export OPENAI_API_KEY="sk-xxxxxxxxxxxxxxxxxxxxxxxx" # Anthropic 用户: export ANTHROPIC_API_KEY="sk-ant-xxxxxxxxxxxxxxxxxxxxxxxx" # 可选:Semantic Scholar API Key(可加快文献检索) export S2_API_KEY="your-s2-key" ``` > **🔒 安全提醒:** 请勿将 API Key 硬编码在任何文件中。使用 `api_key_env` 指定环境变量名即可。 --- ## 🚀 运行测试 ### 快速开始 ```bash source .venv/bin/activate export OPENAI_API_KEY="sk-xxxx" # 或 ANTHROPIC_API_KEY researchclaw run --config config.yaml --auto-approve ``` ### 指定研究主题运行 ```bash researchclaw run \ --config config.yaml \ --topic "Investigating the effect of curriculum learning on image classification with adaptive difficulty scheduling" \ --auto-approve ``` ### ⏱ 预估运行时间 | 实验模式 | 预估时间 | 
说明 | |---------|---------|------| | sandbox | 30 分钟 ~ 2 小时 | 取决于实验复杂度和 API 响应速度 | | docker (GPU) | 1 ~ 4 小时 | 可运行更复杂的深度学习实验 | 运行过程中终端会实时显示当前阶段和进度。**无需任何手动操作**,安心等待即可。 ### ✅ 如何知道运行结束 当看到类似以下输出时,表示 Pipeline 已成功完成: ``` [Stage 23/23] ✓ Deliverables packaged Pipeline complete — deliverables at: artifacts/rc-20260315-XXXXXX-YYYY/deliverables/ ``` ### 🔄 如果运行中断 Pipeline 支持断点续跑: ```bash researchclaw run --config config.yaml --resume ``` --- ## 🔍 查看交付结果 运行结束后,输出文件位于 `artifacts/rc-YYYYMMDD-HHMMSS-/deliverables/` 目录下。 ### 📂 交付物清单 | 文件/目录 | 内容 | |----------|------| | `paper_final.md` | 最终论文(Markdown 格式,5,000–6,500 词) | | `paper.tex` | 会议格式 LaTeX 源文件(可直接编译为 PDF) | | `references.bib` | BibTeX 参考文献(经过引用验证) | | `code/main.py` | 自动生成的实验代码 | | `code/requirements.txt` | 实验代码的 Python 依赖 | | `charts/` | 实验结果可视化图表(PNG 格式) | | `verification_report.json` | 引用完整性验证报告 | | `manifest.json` | 交付物清单及元信息 | ### 🔎 重点检查项 1. **论文内容**(`paper_final.md` 或 `paper.tex`) - 标题是否合理、与主题相关 - 摘要是否清晰概述了问题、方法、结果 - 相关工作是否引用了该领域的关键文献 - 方法描述是否清晰、技术上正确 - 实验设计是否合理(数据集、baselines、评估指标) - 结果是否有意义(不是全零、不是 NaN) - 结论是否与实验结果一致 2. **实验代码**(`code/main.py`) - 代码是否能独立运行 - 是否使用了真实数据集(而非随机生成的假数据) - 是否实现了论文中描述的方法 - 是否包含合理的超参数设置 3. **图表**(`charts/`) - 图表是否清晰可读 - 坐标轴标签是否正确 - 数据是否与论文描述一致 4. 
**引用**(`references.bib`) - 引用的论文是否真实存在 - 引用是否与论文讨论的内容相关 ### 📊 自动质量评估报告 Pipeline 会自动生成一份质量评估报告,位于 `stage-20/quality_report.json`,其中包含: - `score_1_to_10` — 自动评分 - `verdict` — 接收/拒绝建议 - `strengths` — 优点列表 - `weaknesses` — 缺点列表 - `required_actions` — 建议的改进事项 请在你的反馈报告中参考此评估,并补充你自己的专业判断。 --- ## 📝 反馈报告要求 **你的反馈是本项目改进的核心依据。** 无论是批评还是肯定,对我们都同样重要——请务必认真、详细地填写。 ### 需要提交的内容 | # | 提交内容 | 说明 | |---|---------|------| | F1 | **反馈报告**(按下方模板填写) | Markdown 格式,命名为 `feedback_<你的名字>.md` | | F2 | **完整输出目录** | 将整个 `artifacts/rc-XXXXXX/` 目录打包提交(`.zip` 或 `.tar.gz`) | | F3 | **配置文件** | 你使用的 `config.yaml`(**删除 API Key 后**提交) | | F4 | **终端日志**(可选但推荐) | 运行时的终端输出,便于我们排查问题 | ### 反馈的四个维度 #### 🎯 (a) 质量评价 请从你的专业领域角度评价产出论文的质量: - 如果这是你所在领域的论文,它能达到什么水平?(顶会 / 一般会议 / 无法发表) - 与你读过的该领域论文相比,写作质量如何? - 方法的技术正确性如何?有无明显错误? - 实验设计的合理性如何? #### 💡 (b) 优化建议 请指出你认为可以改进的地方: - 哪个阶段的输出质量最差?(文献检索 / 实验设计 / 代码生成 / 论文撰写) - 代码中有没有明显写错或不合理的地方? - 论文结构或表述有什么具体的改进建议? #### ⚖️ (c) 合理性评估 请评估 Pipeline 流程的合理性: - 23 个阶段的设计是否合理?有没有多余或缺失的步骤? - 实验迭代优化的过程是否有效? - LLM 生成内容的引导方式是否合理? #### 🐛 (d) Bug 报告 请尽可能详细地报告你发现的任何问题: - **写作 Bug**:语法错误、重复段落、前后矛盾、引用不存在的图表 - **代码 Bug**:运行报错、逻辑错误、数据处理问题 - **结果 Bug**:全零结果、NaN 值、指标不合理 - **流程 Bug**:阶段卡住、异常中断、资源耗尽 --- ## 📋 反馈报告模板 请复制以下模板,填写后保存为 `feedback_<你的名字>.md`: ````markdown # AutoResearchClaw 测试反馈报告 ## 基本信息 - **测试人员**: - **所属领域**:(例如:计算机视觉 / 自然语言处理 / 强化学习 / 生物信息 / ...) - **测试日期**: - **代码版本**:(运行 `git log --oneline -1` 的输出,例如:`44151b1 fix: Phase 3 regression test findings`) - **研究主题(英文)**: - **使用的 LLM 模型**:(例如:gpt-5.4 / gpt-5.1 / claude-opus-4-6 / claude-sonnet-4-6) - **实验模式**:(sandbox / docker) - **运行总时长**:(约 X 分钟) - **是否成功完成 23 个阶段**:是 / 否(如否,请说明卡在哪个阶段) --- ## 一、质量评价(总分 1-10) **我的评分**:X / 10 ### 1.1 论文整体质量 - 相当于什么级别的论文?(顶会 / 一般会议 / workshop / 无法发表) - 简要说明评分理由: ### 1.2 各部分质量评价 | 部分 | 评分 (1-10) | 评价说明 | |------|-----------|---------| | 标题 | | | | 摘要 | | | | 引言 | | | | 相关工作 | | | | 方法 | | | | 实验设计 | | | | 结果与分析 | | | | 结论 | | | | 参考文献 | | | | 图表质量 | | | | 代码质量 | | | ### 1.3 与人工撰写论文的对比 - 与你平时阅读/撰写的论文相比,差距在哪里? 
- 有哪些方面出乎意料地好? --- ## 二、优化建议 ### 2.1 最需要改进的环节 (请列出 3-5 个最需要改进的具体问题,按优先级排序) 1. 2. 3. ### 2.2 代码问题 - 代码是否能独立运行? - 是否使用了真实数据集和基线方法? - 具体代码问题(如有): ### 2.3 写作问题 - 论文结构是否合理? - 技术描述是否准确? - 具体写作问题(如有): --- ## 三、合理性评估 ### 3.1 Pipeline 流程评价 - 23 个阶段的流程设计是否合理? - 有没有你认为多余或缺失的步骤? ### 3.2 实验执行评价 - 实验设计是否合理?(数据集选择、对比方法、评估指标) - 迭代优化过程是否有效? ### 3.3 LLM 使用评价 - LLM 在各阶段的表现如何? - 有没有明显的"幻觉"或不合理的生成内容? --- ## 四、Bug 报告 ### 4.1 写作 Bug | 编号 | 位置(章节/段落) | 描述 | 严重程度 (高/中/低) | |------|-----------------|------|-------------------| | W1 | | | | | W2 | | | | ### 4.2 代码 Bug | 编号 | 文件/行号 | 描述 | 严重程度 (高/中/低) | |------|----------|------|-------------------| | C1 | | | | | C2 | | | | ### 4.3 结果 Bug | 编号 | 描述 | 涉及指标/图表 | 严重程度 (高/中/低) | |------|------|-------------|-------------------| | R1 | | | | | R2 | | | | ### 4.4 流程 Bug | 编号 | 阶段 | 描述 | 严重程度 (高/中/低) | |------|------|------|-------------------| | P1 | | | | | P2 | | | | --- ## 五、其他建议 (自由发挥:任何你觉得有价值的观察、建议或想法) --- ## 附件清单 - [ ] 反馈报告 (`feedback_<名字>.md`) - [ ] 完整输出目录 (`artifacts/rc-XXXXXX.zip`) - [ ] 配置文件 (`config.yaml`,已删除 API Key) - [ ] 终端日志(可选) ```` --- ## ❓ 常见问题 ### Q1: 没有 GPU 能测试吗? **当然可以!** 使用 `experiment.mode: "sandbox"` 模式,Pipeline 会在本地 CPU 上运行实验。虽然实验规模会受限,但足以完成一次完整的端到端测试。 ### Q2: API 调用大概要花多少钱? 一次完整的 Pipeline 运行约消耗 **$5–15** 的 API 费用,取决于所选模型、论文修订次数和实验复杂度。顶级模型(GPT-5.4、Claude Opus 4.6)费用稍高,但产出质量显著更好,推荐优先使用。 ### Q3: Pipeline 运行中断了怎么办? 从断点继续即可: ```bash researchclaw run --config config.yaml --resume ``` ### Q4: 可以用中文主题吗? 建议使用 **英文** 描述你的研究主题。Pipeline 的提示词、文献检索和论文生成均以英文为主。如果你的 idea 原始语言是中文,请先翻译成英文。 ### Q5: 我应该选什么样的研究主题? 选择你**熟悉的领域内的一个具体研究问题**——这样你才能有效评估论文的技术正确性。建议: - ✅ 选择有明确实验验证方法的主题(分类、回归、强化学习任务等) - ❌ 避免过于宏大或抽象的主题(如 "AGI" 或 "通用人工智能") - ✅ 描述要具体,例如:*"Investigating the effect of data augmentation strategies on few-shot learning for medical image classification"* ### Q6: 如何使用 Docker 模式?(进阶) 如果你有 NVIDIA GPU 并安装了 Docker + NVIDIA Container Toolkit: ```bash # 1. 构建实验镜像 docker build -t researchclaw/experiment:latest researchclaw/docker/ # 2. 
修改 config.yaml: # experiment: # mode: "docker" # docker: # gpu_enabled: true # memory_limit_mb: 8192 # network_policy: "setup_only" # 推荐默认值 # 3. 运行 researchclaw run --config config.yaml --auto-approve ``` Docker 模式采用三阶段执行:pip install(联网)→ setup.py(联网)→ 实验代码(断网)。镜像已预缓存常用数据集(CIFAR-10/100、MNIST、FashionMNIST、STL-10、SVHN),标准基准测试无需网络。 ### Q7: 我之前已经测试过了,再次测试需要注意什么? **每次测试前务必拉取最新代码:** ```bash cd AutoResearchClaw git pull origin main pip install -e . ``` 然后确认版本号: ```bash git log --oneline -1 ``` 不同版本的生成效果可能差异很大,请在反馈报告中注明你使用的 commit hash。 ### Q8: 反馈提交到哪里? 你可以通过以下任一渠道提交反馈: - **GitHub Issues:** [提交 Issue](https://github.com/aiming-lab/AutoResearchClaw/issues),添加 `feedback` 标签 - **Pull Request:** 将 `feedback_<名字>.md` 提交到 `community-feedback/` 目录 - **邮件:** 联系项目维护者(详见仓库主页) --- ## 🌍 我们需要来自各个领域的测试者 目前 Pipeline 主要在机器学习领域进行了测试,我们特别欢迎来自以下领域的测试者: - 🧬 **生物信息学与计算生物学** - 🧪 **化学与材料科学** - 📊 **统计学与应用数学** - 🤖 **机器人学与控制系统** - 🗣️ **NLP 与计算语言学** - 👁️ **计算机视觉与图形学** - 🎮 **强化学习与博弈论** - 🏥 **医学 AI 与医疗健康** - 🌐 **图学习与网络科学** - 💹 **金融 ML 与计量经济学** - 🛰️ **遥感与地理空间 AI** ……以及任何涉及计算实验的领域! --- ## 🙏 感谢你的参与 你的每一条反馈——无论大小——都在直接推动 AutoResearchClaw 变得更好。感谢你成为这段旅程的一部分。

⭐ 如果你觉得这个项目有趣,请在 GitHub 上给我们一颗 Star!

================================================ FILE: docs/TESTER_GUIDE_JA.md ================================================

AutoResearchClaw Logo

🧪 コミュニティテストガイド

世界初の完全自律型研究パイプラインを、あらゆる分野でストレステストするためにご協力ください。

⭐ リポジトリにスターを付ける · 🚀 クイックスタート · 📋 フィードバックテンプレート · 🇺🇸 English Testing Guide · 🇨🇳 中文测试指南

--- ## 👋 テスターの皆さんへ **AutoResearchClaw** は、完全自律型の学術論文生成パイプラインです。研究アイデアを入力するだけで、文献検索、実験設計、コード生成、実験実行、論文執筆、査読、最終成果物の作成まで、すべてを自動で処理します。**23ステージ、人手介入ゼロ。** **あらゆる分野・バックグラウンド**のテスターを募集しています — 機械学習、NLP、コンピュータビジョン、強化学習、バイオインフォマティクス、物理学、社会科学など。テストが多様であるほど、パイプラインの改善に繋がります。 **あなたのミッション:** 自分の研究アイデアでパイプラインを実行し、出力を検査して、詳細なフィードバックレポートを提出してください。それだけです。すべてのフィードバックが次のバージョンに直接反映されます。 --- ## 📋 目次 1. [前提条件](#-前提条件) 2. [インストールとセットアップ](#-インストールとセットアップ) 3. [パイプラインの実行](#-パイプラインの実行) 4. [出力の確認](#-出力の確認) 5. [フィードバックレポートの要件](#-フィードバックレポートの要件) 6. [フィードバックテンプレート](#-フィードバックテンプレート) 7. [FAQ](#-faq) --- ## 📦 前提条件 | 項目 | 最小要件 | 推奨 | |------|---------|------| | OS | macOS / Linux / WSL2 | Linux (Ubuntu 22.04+) | | Python | 3.11+ | 3.11 または 3.12 | | ディスク | 500 MB | 2 GB+ | | RAM | 8 GB | 16 GB+ | | GPU | 不要(sandboxモード) | NVIDIA GPU + CUDA 12.x(dockerモード) | | ネットワーク | 必要(LLM API + 文献検索) | 安定した接続 | | LLM APIキー | **必須** | OpenAI または Anthropic | ### 🔑 APIキーについて パイプラインは、執筆、コーディング、レビューなど、すべてのステージで大規模言語モデル(LLM)を呼び出します。**OpenAI** または **Anthropic** のAPIキーが必要です。 > **最良の結果を得るために、利用可能な最も高性能なモデルの使用を強く推奨します:** > > | プロバイダー | 推奨モデル | フォールバック | > |-------------|-----------|--------------| > | **OpenAI** | **GPT-5.4**(最良) | GPT-5.1 または GPT-4.1 | > | **Anthropic** | **Claude Opus 4.6**(最良) | Claude Sonnet 4.6 | > > トップティアのモデルを使用することで、論文の品質、コードの正確性、実験設計が大幅に向上します。古いモデル(例:GPT-4o)では、出力品質が著しく低下する可能性があります。 --- ## 🛠 インストールとセットアップ ### ⚠️ 常に最新バージョンを使用してください > **このプロジェクトは活発に開発中です。** コードベースは頻繁に更新され、バージョンによって結果が大きく異なる場合があります。 > > **テスト実行の前に、必ず最新のコードをプルしてください:** > > ```bash > cd AutoResearchClaw > git pull origin main > pip install -e . # 変更を反映するために再インストール > ``` > > フィードバックレポート用にバージョンを記録してください: > ```bash > git log --oneline -1 > ``` --- ### オプションA:Claude Code(最速 — 推奨 ⚡) [Claude Code](https://claude.ai/claude-code)(AnthropicのCLIツール)をお持ちの場合、以下を貼り付けるだけです: ``` Please clone and install AutoResearchClaw: https://github.com/aiming-lab/AutoResearchClaw.git If already cloned, run git pull origin main to update to the latest version first. 
Then create a config file with: - LLM: OpenAI with gpt-5.4 (or Anthropic Claude Opus 4.6) - Experiment mode: sandbox (local execution) - Research topic: "<ここに研究アイデアを入力>" - Auto-approve all gate stages My API key is: sk-xxxx (set it as an environment variable, don't hardcode it) ``` Claude Codeがクローン、依存関係、設定、実行をすべて自動で処理します。 ### オプションB:手動インストール ```bash # 1. リポジトリをクローン git clone https://github.com/aiming-lab/AutoResearchClaw.git cd AutoResearchClaw # 2. 仮想環境を作成 python3 -m venv .venv source .venv/bin/activate # macOS / Linux # .venv\Scripts\activate # Windows(WSL2推奨) # 3. インストール pip install -e . # 4. 動作確認 researchclaw --help ``` ### ⚙️ 設定 ```bash cp config.researchclaw.example.yaml config.yaml ``` `config.yaml` を編集してください — 主要なフィールドは以下の通りです: ```yaml # === プロジェクト === project: name: "my-test" mode: "full-auto" # === 研究トピック — アイデアを英語で記述してください === research: topic: "Your research idea in 1-2 sentences" domains: - "machine-learning" # 選択肢: nlp, cv, rl, graph-learning など # === LLM — 利用可能な最も高性能なモデルを使用してください! 
=== # # オプション1: OpenAI(GPT-5.4推奨) llm: provider: "openai-compatible" base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" primary_model: "gpt-5.4" # 最良のモデル fallback_models: - "gpt-5.1" - "gpt-4.1" # オプション2: Anthropic Claude(Claude Opus 4.6推奨) # llm: # provider: "openai-compatible" # base_url: "https://api.anthropic.com/v1" # api_key_env: "ANTHROPIC_API_KEY" # primary_model: "claude-opus-4-6" # fallback_models: # - "claude-sonnet-4-6" # === 実験 === experiment: mode: "sandbox" # sandbox = ローカル実行(推奨) time_budget_sec: 600 # 実験実行あたりの最大秒数 max_iterations: 10 metric_key: "primary_metric" metric_direction: "minimize" # または "maximize" ``` ### 🔐 APIキーの設定 ```bash # OpenAIユーザー: export OPENAI_API_KEY="sk-xxxxxxxxxxxxxxxxxxxxxxxx" # Anthropicユーザー: export ANTHROPIC_API_KEY="sk-ant-xxxxxxxxxxxxxxxxxxxxxxxx" # オプション:Semantic Scholar APIキー(文献検索を高速化) export S2_API_KEY="your-s2-key" ``` > **🔒 セキュリティ:** APIキーをファイルにハードコードしないでください。設定ファイルの `api_key_env` を使用して環境変数を参照してください。 --- ## 🚀 パイプラインの実行 ### クイックスタート ```bash source .venv/bin/activate export OPENAI_API_KEY="sk-xxxx" # または ANTHROPIC_API_KEY researchclaw run --config config.yaml --auto-approve ``` ### 特定のトピックを指定する場合 ```bash researchclaw run \ --config config.yaml \ --topic "Investigating the effect of curriculum learning on image classification with adaptive difficulty scheduling" \ --auto-approve ``` ### ⏱ 想定実行時間 | モード | 推定時間 | 備考 | |--------|---------|------| | sandbox | 30分 〜 2時間 | 実験の複雑さとAPIの速度に依存 | | docker (GPU) | 1 〜 4時間 | より大規模なディープラーニング実験向け | ターミナルにリアルタイムで進捗が表示されます。**手動介入は不要です** — あとは実行完了を待つだけです。 ### ✅ 完了の確認方法 以下のような出力が表示されます: ``` [Stage 23/23] ✓ Deliverables packaged Pipeline complete — deliverables at: artifacts/rc-20260315-XXXXXX-YYYY/deliverables/ ``` ### 🔄 中断された場合 パイプラインはチェックポイントをサポートしています — 再開するだけです: ```bash researchclaw run --config config.yaml --resume ``` --- ## 🔍 出力の確認 完了後、結果は `artifacts/rc-YYYYMMDD-HHMMSS-/deliverables/` に格納されます。 ### 📂 成果物 | ファイル / ディレクトリ | 説明 | |------------------------|------| | 
`paper_final.md` | Markdown形式の最終論文(5,000〜6,500語) | | `paper.tex` | 学会投稿可能なLaTeXソース(直接コンパイル可能) | | `references.bib` | BibTeX参考文献(検証済みの引用) | | `code/main.py` | 自動生成された実験コード | | `code/requirements.txt` | 実験用のPython依存関係 | | `charts/` | 結果の可視化チャート(PNG) | | `verification_report.json` | 引用整合性の検証レポート | | `manifest.json` | メタデータ付きの成果物マニフェスト | ### 🔎 確認すべきポイント 1. **論文の内容** (`paper_final.md` または `paper.tex`) - タイトルはトピックに関連しているか? - アブストラクトは問題、手法、結果を明確に述べているか? - 関連研究はその分野の主要な論文を引用しているか? - 手法の記述は技術的に正確か? - 実験設計は妥当か(データセット、ベースライン、指標)? - 結果は有意義か(すべてゼロやNaNではないか)? - 結論は実験結果と一貫しているか? 2. **実験コード** (`code/main.py`) - 単独で実行できるか? - 実際のデータセットを使用しているか(ランダム生成の偽データではないか)? - 論文に記述された内容を実装しているか? - ハイパーパラメータは妥当か? 3. **チャート** (`charts/`) - 読みやすく整理されているか? - 軸ラベルは正しいか? - データは論文の主張と一致しているか? 4. **参考文献** (`references.bib`) - 引用された論文は実在するか? - 引用は議論に関連しているか? ### 📊 自動生成品質レポート パイプラインは `stage-20/quality_report.json` に品質評価を出力します。内容は以下の通りです: - `score_1_to_10` — 自動品質スコア - `verdict` — 受理 / 却下の推奨 - `strengths` — 良かった点 - `weaknesses` — 特定された問題点 - `required_actions` — 改善提案 フィードバックでこれを参照し、ご自身の専門的な判断も加えてください。 --- ## 📝 フィードバックレポートの要件 **あなたのフィードバックは、このプロジェクトを改善するための最も重要なインプットです。** 徹底的かつ正直に記述してください — 批判的なフィードバックも称賛と同様に価値があります。 ### 提出物 | # | 項目 | 詳細 | |---|------|------| | F1 | **フィードバックレポート**(以下のテンプレートを使用) | Markdown形式、ファイル名は `feedback_.md` | | F2 | **出力ディレクトリ一式** | `artifacts/rc-XXXXXX/` ディレクトリ全体をZip圧縮 | | F3 | **設定ファイル** | `config.yaml`(**APIキーを事前に削除してください!**) | | F4 | **ターミナルログ**(任意だが推奨) | 実行中のターミナル出力のコピー | ### フィードバックの4つの観点 #### 🎯 (a) 品質評価 あなたの専門知識から: - この論文があなたの分野で発表されたとしたら、どのレベルに達するか?(トップ会議 / 中堅 / ワークショップ / 出版不可) - 普段読む論文と比較して、文章の質はどうか? - 手法は技術的に正確か?明らかな誤りはないか? - 実験設計は妥当か? #### 💡 (b) 改善提案 - どのステージの出力が最も弱いか?(文献検索 / 実験設計 / コード生成 / 論文執筆) - 明らかなコードエラーや設計上の問題はないか? - 論文の構成や執筆の改善に関する具体的な提案は? #### ⚖️ (c) パイプライン設計の評価 - 23ステージの設計は適切か?冗長または不足しているステップはないか? - 反復的な実験改善は効果的か? - 各ステージでのLLMの指示は適切か? 
#### 🐛 (d) バグ報告 発見した問題をできるだけ具体的に報告してください: - **文章のバグ:** 文法エラー、段落の繰り返し、矛盾、存在しない図への参照 - **コードのバグ:** ランタイムエラー、ロジックエラー、データ処理の問題 - **結果のバグ:** すべてゼロの結果、NaN値、不合理な指標 - **パイプラインのバグ:** ステージの停止、予期しないクラッシュ、リソース枯渇 --- ## 📋 フィードバックテンプレート 以下のテンプレートをコピーし、記入して `feedback_<名前>.md` として保存してください: ````markdown # AutoResearchClaw — テストフィードバックレポート ## 基本情報 - **テスター名:** - **専門分野:** (例:コンピュータビジョン / NLP / 強化学習 / バイオインフォマティクス / ...) - **テスト日:** - **コードバージョン:** (`git log --oneline -1` の出力、例:`44151b1 fix: Phase 3 regression test findings`) - **研究トピック(英語):** - **使用したLLMモデル:** (例:gpt-5.4 / gpt-5.1 / claude-opus-4-6 / claude-sonnet-4-6) - **実験モード:** (sandbox / docker) - **合計実行時間:** (約X分) - **全23ステージ完了?:** はい / いいえ(いいえの場合、どのステージで失敗?) --- ## 1. 品質評価(スコア:1〜10) **私のスコア:** X / 10 ### 1.1 論文全体の品質 - この論文はどのレベルに相当するか?(トップ会議 / 中堅 / ワークショップ / 出版不可) - スコアの理由: ### 1.2 セクション別評価 | セクション | スコア (1-10) | コメント | |-----------|-------------|---------| | タイトル | | | | アブストラクト | | | | イントロダクション | | | | 関連研究 | | | | 手法 | | | | 実験設計 | | | | 結果と分析 | | | | 結論 | | | | 参考文献 | | | | チャート / 図表 | | | | コード品質 | | | ### 1.3 人間が書いた論文との比較 - 普段読み書きする論文と比較して、どこにギャップがあるか? - 意外に良かった点は? --- ## 2. 改善提案 ### 2.1 主要な問題点(優先順位で3〜5つ) 1. 2. 3. ### 2.2 コードの問題 - コードは単独で実行できるか? - 実際のデータセットとベースラインを使用しているか? - 具体的なコードの問題(もしあれば): ### 2.3 文章の問題 - 論文の構成は妥当か? - 技術的な記述は正確か? - 具体的な文章の問題(もしあれば): --- ## 3. パイプライン設計の評価 ### 3.1 パイプラインフロー - 23ステージの設計は妥当か? - 冗長または不足しているステップはないか? ### 3.2 実験実行 - 実験設計は妥当か?(データセットの選択、比較手法、指標) - 反復的な改善は効果的か? ### 3.3 LLMの使用 - 各ステージでのLLMのパフォーマンスはどうか? - 明らかな「ハルシネーション」や不合理な出力はないか? --- ## 4.
バグ報告 ### 4.1 文章のバグ | # | 場所(セクション/段落) | 説明 | 重要度(高/中/低) | |---|------------------------|------|-------------------| | W1 | | | | | W2 | | | | ### 4.2 コードのバグ | # | ファイル / 行 | 説明 | 重要度(高/中/低) | |---|--------------|------|-------------------| | C1 | | | | | C2 | | | | ### 4.3 結果のバグ | # | 説明 | 影響を受ける指標/チャート | 重要度(高/中/低) | |---|------|--------------------------|-------------------| | R1 | | | | | R2 | | | | ### 4.4 パイプラインのバグ | # | ステージ | 説明 | 重要度(高/中/低) | |---|---------|------|-------------------| | P1 | | | | | P2 | | | | --- ## 5. その他のコメント (自由記述:有益と思われる観察、アイデア、提案など) --- ## 添付チェックリスト - [ ] フィードバックレポート (`feedback_.md`) - [ ] 出力ディレクトリ一式 (`artifacts/rc-XXXXXX.zip`) - [ ] 設定ファイル (`config.yaml`、APIキー削除済み) - [ ] ターミナルログ(任意) ```` --- ## ❓ FAQ ### Q1: GPUなしでテストできますか? **はい!** `experiment.mode: "sandbox"` を使用してください — パイプラインはCPU上で実験を実行します。実験はシンプルになりますが、エンドツーエンドの完全なテストには十分です。 ### Q2: API呼び出しの費用はどのくらいですか? パイプラインの完全な実行は、モデル、修正反復回数、実験の複雑さに応じて、APIの費用が約**$5〜15**かかります。トップティアのモデル(GPT-5.4、Claude Opus 4.6)はやや高価ですが、大幅に良い結果を生成します。 ### Q3: パイプラインが実行中にクラッシュした場合は? チェックポイントから再開してください: ```bash researchclaw run --config config.yaml --resume ``` ### Q4: 英語以外の研究トピックを使用できますか? トピックは**英語**で記述することを推奨します。パイプラインのプロンプト、文献検索、論文生成はすべて英語ベースです。アイデアが他の言語の場合は、事前に翻訳してください。 ### Q5: どのような研究トピックを選べばよいですか? **自分がよく知っている分野の具体的な研究課題**を選んでください — そうすることで、出力が技術的に正確かどうかを意味のある形で評価できます。ヒント: - ✅ 明確な実験的検証があるトピックを選ぶ(分類、回帰、強化学習タスクなど) - ❌ 過度に広範または抽象的なトピックは避ける(例:「AGI」、「汎用知能」) - ✅ 具体的に:*"医用画像分類におけるFew-shot学習に対するデータ拡張戦略の効果の調査"* ### Q6: Dockerモードの使用方法は?(上級者向け) NVIDIA GPUとDocker + NVIDIA Container Toolkitがある場合: ```bash # 1. 実験用イメージをビルド docker build -t researchclaw/experiment:latest researchclaw/docker/ # 2. config.yamlを更新: # experiment: # mode: "docker" # docker: # gpu_enabled: true # memory_limit_mb: 8192 # network_policy: "setup_only" # 推奨デフォルト # 3. 
実行 researchclaw run --config config.yaml --auto-approve ``` Dockerモードは3フェーズの実行モデルを使用します:pip install(ネットワーク有効)→ setup.py(ネットワーク有効)→ 実験(ネットワーク無効)。イメージにはプリキャッシュされたデータセット(CIFAR-10/100、MNIST、FashionMNIST、STL-10、SVHN)が含まれているため、標準的なベンチマークはネットワークアクセスなしで動作します。 ### Q7: 以前テストしましたが、再テストの場合はどうすればよいですか? テストの前に**必ず最新のコードをプル**してください: ```bash cd AutoResearchClaw git pull origin main pip install -e . ``` バージョンを確認してください: ```bash git log --oneline -1 ``` バージョンが異なると、結果が大きく変わる可能性があります。フィードバックレポートには必ずコミットハッシュを記載してください。 ### Q8: フィードバックはどこに提出しますか? フィードバックレポートと添付ファイルは、以下のいずれかの方法で提出してください: - **GitHub Issues:** [Issueを作成](https://github.com/aiming-lab/AutoResearchClaw/issues)し、`feedback` ラベルを付ける - **Pull Request:** `feedback_<名前>.md` を `community-feedback/` ディレクトリに提出 - **メール:** プロジェクトのメンテナーに連絡(詳細はリポジトリを参照) --- ## 🌍 あらゆる分野のテスターを募集しています パイプラインはこれまで主にML関連のトピックでテストされてきました。特に以下の分野のテスターを歓迎します: - 🧬 **バイオインフォマティクス・計算生物学** - 🧪 **化学・材料科学** - 📊 **統計学・応用数学** - 🤖 **ロボティクス・制御システム** - 🗣️ **NLP・計算言語学** - 👁️ **コンピュータビジョン・グラフィックス** - 🎮 **強化学習・ゲーム理論** - 🏥 **医療AI・ヘルスケア** - 🌐 **グラフ学習・ネットワーク科学** - 💹 **金融ML・計量経済学** - 🛰️ **リモートセンシング・地理空間AI** ...その他、計算実験が関わるあらゆる分野! --- ## 🙏 ありがとうございます 大小問わず、すべてのフィードバックがAutoResearchClawの改善に直接つながります。この取り組みに参加していただき、ありがとうございます。

⭐ このプロジェクトに興味を持たれたら、GitHubでスターをお願いします!

================================================ FILE: docs/agent_figure_and_benchmark_plan.md ================================================ # Multi-Agent Figure Generation & Benchmark Selection — Task Requirements > **Created**: 2026-03-15 > **Updated**: 2026-03-15 > **Status**: BenchmarkAgent IMPLEMENTED, FigureAgent IMPLEMENTED > **Scope**: Two new multi-agent subsystems for AutoResearchClaw pipeline > > **Implementation Progress**: > - [x] Part B: BenchmarkAgent — fully implemented (4 agents + orchestrator + config + pipeline integration + 43 tests) > - [x] Part A: FigureAgent — fully implemented (5 agents + orchestrator + config + pipeline integration + 45 tests) > > **Key Research Findings (supplemental)**: > - Papers With Code was shut down by Meta in July 2025; HuggingFace Hub API is now the primary dataset discovery source > - AI Scientist v2 and MLR-Copilot both use pure LLM-driven dataset selection (no API search) — our API-based approach is more structured > - MLE-bench (OpenAI) validates the pre-download + container-mount pattern (matches our `setup_only` network policy) > - CodeSOTA (codesota.com) provides a lighter-weight benchmark database as an alternative to Papers With Code --- ## Executive Summary 当前 Pipeline 的图表生成和数据集/基准选择存在根本性缺陷: **图表问题**(实测产出): - 每次固定只生成 2 张图(`method_comparison.png` + `experiment_comparison.png`) - 图表类型单一:只有柱状图,无训练曲线、热力图、消融分析图等 - 数据无差异化:所有方法都显示 1.000,完全无信息量 - 样式简陋:默认 matplotlib 风格,远低于 AI 顶会标准 - 不适应实验内容:无论做什么研究都画一样的图 - DPI=150,不满足出版要求(300+ DPI) **数据集/基准问题**: - 当前仅通过 `dataset_guidance` 提示词列出预缓存数据集 - 无法根据研究领域动态搜索和选择最合适的 benchmark - 无法自动下载非预缓存数据集 - 缺乏 baseline 方法的自动复现能力 **解决方案**:设计两个独立的多 Agent 子系统: 1. **FigureAgent** — 智能图表生成系统(6 个子 Agent 协作) 2. 
**BenchmarkAgent** — 数据集与基准选择系统(4 个子 Agent 协作) --- ## Part A: FigureAgent — 多 Agent 图表生成系统 ### A.1 问题分析 #### 当前架构缺陷 ``` 现状:Stage 14 → visualize.py (5 个硬编码函数) → 固定 2 张图 → Stage 17/22 嵌入论文 ``` | 问题 | 严重程度 | 说明 | |------|---------|------| | 图表类型固定 | Critical | 只有 bar chart 和 line chart,缺少 heatmap、scatter、violin、architecture diagram 等 | | 不适应实验内容 | Critical | 知识蒸馏实验和 RL 实验画的图完全一样 | | 无智能决策 | Critical | 不分析"应该画什么",直接调用固定函数 | | 数据正确性无验证 | High | 不验证图中数据是否与实验结果一致 | | 样式不达标 | High | 默认 matplotlib,不符合学术论文视觉标准 | | 无架构图能力 | High | 不能生成方法流程图 / 模型架构图(顶会 Figure 1 必备) | | DPI 不足 | Medium | 150 DPI,出版要求 300+ | | 无 VLM 审查 | Medium | 生成后不检查质量,直接用 | #### 业界参考方案 | 项目 | 图表策略 | 核心创新 | |------|---------|---------| | AI Scientist v1 (Sakana) | 人工编写 `plot.py` 模板,LLM 不参与 | 可靠但不灵活 | | AI Scientist v2 (Sakana) | LLM 自主生成画图代码 + VLM 审查反馈循环 | **VLM-as-critic**,首篇通过 ICLR workshop 审稿 | | PlotGen (Adobe) | 三模态反馈:数值准确性 + 文本正确性 + 视觉质量 | **Tri-modal feedback**,MatPlotBench 最优 | | PaperBanana (Google) | 3 阶段 pipeline:Caption 精炼 → 参考检索 → 迭代渲染 | **Caption sharpening** + 参考图库 | ### A.2 目标架构 ``` ┌─────────────────────┐ │ FigureAgent │ │ (Orchestrator) │ └──────────┬──────────┘ │ ┌──────────┬───────────┼───────────┬──────────┐ ▼ ▼ ▼ ▼ ▼ ┌──────────┐┌──────────┐┌──────────┐┌──────────┐┌──────────┐ │ Planner ││ CodeGen ││ Renderer ││ Critic ││ Integra- │ │ Agent ││ Agent ││ Agent ││ Agent ││ tor Agent│ └──────────┘└──────────┘└──────────┘└──────────┘└──────────┘ │ │ │ │ │ ▼ ▼ ▼ ▼ ▼ 图表规划 代码生成 执行渲染 质量审查 论文嵌入 ``` #### Agent 职责定义 **1. Orchestrator(编排器)** - 接收:实验结果 JSON、论文草稿 markdown、研究主题描述 - 协调所有子 Agent 的执行顺序 - 管理迭代循环(Critic 不满意时回到 CodeGen) - 输出:最终图表集合 + 嵌入指令 **2. 
Planner Agent(图表规划)** - 输入:实验结果数据结构、论文 idea、研究领域 - 职责: - 分析实验数据,确定需要哪些图、每张图展示什么 - 为每张图生成精确的 caption specification(非模糊描述) - 确定图表类型(bar / line / heatmap / scatter / architecture / ablation 等) - 确定布局(single / subplot / multi-panel) - 输出图表规划清单(JSON 格式) - 关键规则: - 至少规划 4 张图:1 架构图 + 1 主结果图 + 1 消融图 + 1 分析图 - 根据研究领域自动选择合适的图表类型 - Caption sharpening:将模糊描述转化为精确视觉规范 **3. CodeGen Agent(代码生成)** - 输入:Planner 输出的图表规划 + 实验数据 - 职责: - 为每张图生成独立的 Python 绘图脚本 - 使用 SciencePlots 学术样式 (`plt.style.use(['science', 'ieee'])`) - 确保 colorblind-safe 配色 - 300+ DPI 输出 - 代码保存到 `charts/scripts/` 供复现 - 代码模板库: - 内置常用学术图表模板(training curve, bar comparison, heatmap, confusion matrix 等) - 新图表可基于模板扩展 **4. Renderer Agent(渲染执行)** - 输入:CodeGen 生成的 Python 脚本 - 职责: - 在 Docker sandbox 中执行绘图脚本 - 捕获执行错误并反馈给 CodeGen 修复 - 验证输出文件存在且可读 - 检查图像尺寸和分辨率 **5. Critic Agent(质量审查 — 三模态反馈)** - 输入:渲染后的图像 + 源数据 + caption 规范 - 职责(三维度审查,参考 PlotGen): - **数值准确性**:验证图中呈现的数值与源数据一致(读取 JSON → 对比图中数据点) - **文本正确性**:检查标题、坐标轴标签、图例是否准确完整 - **视觉质量**:通过 VLM(如 GPT-4o vision)审查整体美观度、可读性、学术规范 - 输出:pass / fail + 具体修改建议 - 如果 fail:将反馈发回 CodeGen Agent,最多迭代 3 次 **6. 
Integrator Agent(论文嵌入)** - 输入:通过审查的图表集合 + 论文草稿 - 职责: - 确定每张图在论文中的最佳位置 - 生成 LaTeX figure 环境代码(支持 subfigure 多面板) - 生成交叉引用(`\ref{fig:xxx}`) - 确保图表在正确的 section(架构图在 Method,结果图在 Results) - 更新论文文本中的图表引用语句 ### A.3 图表类型矩阵 根据研究领域和实验类型,Planner Agent 应遵循以下决策矩阵: | 实验类型 | 必须包含的图表 | 可选图表 | |---------|--------------|---------| | **分类任务** | 精度对比 bar chart、confusion matrix | ROC 曲线、t-SNE 可视化 | | **生成模型** | 生成样本 grid、FID/IS 曲线 | 插值可视化、attention map | | **强化学习** | reward curve (mean±std shading)、episode length | 策略可视化、环境截图 | | **知识蒸馏** | teacher-student 精度对比、知识迁移效率曲线 | 特征对齐热力图 | | **NLP** | BLEU/ROUGE 对比表、attention heatmap | 样本输出对比 | | **图神经网络** | 节点分类精度、图可视化 | 消息传递可视化 | | **元学习** | few-shot 精度 vs shot 数曲线 | 任务适应速度 | | **持续学习** | 遗忘率曲线、任务精度矩阵 | 表征漂移可视化 | | **所有类型** | 消融分析 (grouped bar)、训练 loss 曲线 | 超参敏感性热力图 | ### A.4 样式规范 所有图表必须遵循以下学术出版标准: ```python # 全局样式配置 (charts/style_config.py) STYLE_CONFIG = { "matplotlib_style": ["science", "ieee"], # SciencePlots "dpi": 300, # 出版级 "font_size": {"title": 12, "axis": 10, "tick": 8, "legend": 9}, "figure_width": { "single_column": 3.5, # IEEE single column (inches) "double_column": 7.0, # IEEE double column "full_page": 7.0, # Full width }, "colors": "bright", # colorblind-safe (Paul Tol) "line_styles": ["-", "--", "-.", ":"], # 配合 B&W 打印 "marker_styles": ["o", "s", "^", "D", "v", "P"], "error_bar_style": "shading", # mean ± std 用阴影而非 error bar "format": "pdf", # 矢量格式优先 "fallback_format": "png", # PNG 备用 } ``` ### A.5 实现计划 #### 文件结构 ``` researchclaw/ ├── agents/ │ └── figure_agent/ │ ├── __init__.py │ ├── orchestrator.py # FigureAgent 主编排器 │ ├── planner.py # Planner Agent │ ├── codegen.py # CodeGen Agent │ ├── renderer.py # Renderer Agent │ ├── critic.py # Critic Agent (三模态审查) │ ├── integrator.py # Integrator Agent │ ├── templates/ # 图表代码模板库 │ │ ├── bar_comparison.py │ │ ├── training_curve.py │ │ ├── heatmap.py │ │ ├── confusion_matrix.py │ │ ├── scatter_plot.py │ │ ├── ablation_grouped.py │ │ ├── violin_box.py │ │ └── multi_panel.py │ └── 
style_config.py # 全局样式配置 ``` #### 开发任务清单 | ID | 任务 | 依赖 | 估计改动量 | |----|------|------|-----------| | FA-01 | 创建 `agents/figure_agent/` 目录结构和基础类 | 无 | 新建 | | FA-02 | 实现 Planner Agent:图表规划逻辑 + 类型决策矩阵 | FA-01 | ~300 行 | | FA-03 | 实现 CodeGen Agent:代码生成 + 模板库 | FA-01 | ~500 行 | | FA-04 | 实现 Renderer Agent:sandbox 执行 + 错误处理 | FA-01, FA-03 | ~200 行 | | FA-05 | 实现 Critic Agent:三模态审查(数值 / 文本 / VLM) | FA-01, FA-04 | ~400 行 | | FA-06 | 实现 Integrator Agent:论文嵌入 + LaTeX subfigure 支持 | FA-01 | ~250 行 | | FA-07 | 实现 Orchestrator:编排循环 + 最大迭代控制 | FA-02 ~ FA-06 | ~300 行 | | FA-08 | 添加 SciencePlots 到 Docker 镜像 + 样式配置 | 无 | ~50 行 | | FA-09 | 修改 executor.py:Stage 14 调用 FigureAgent 替代 `visualize.py` | FA-07 | ~100 行 | | FA-10 | 修改 executor.py:Stage 17/22 使用 Integrator 输出 | FA-07 | ~100 行 | | FA-11 | 修改 converter.py:支持 subfigure 和 PDF 格式 | FA-06 | ~80 行 | | FA-12 | 添加图表代码模板库(8+ 模板) | FA-03 | ~600 行 | | FA-13 | 测试:单元测试 + 集成测试 | FA-01 ~ FA-12 | ~400 行 | | FA-14 | 向后兼容:保留 `visualize.py` 作为 fallback | FA-09 | ~30 行 | #### Pipeline 集成点 ``` Stage 12-13: 实验执行完成,生成 results.json │ ▼ Stage 14: Result Analysis │── 调用 FigureAgent.orchestrate() │ ├── Planner: 分析 results.json → 图表规划 │ ├── CodeGen: 生成绘图脚本 → charts/scripts/ │ ├── Renderer: 执行脚本 → charts/*.pdf + charts/*.png │ ├── Critic: 审查图表质量 (max 3 iterations) │ └── 输出: charts/ 目录 + figure_manifest.json │ ▼ Stage 17: Paper Draft │── Integrator: 读取 figure_manifest.json │ ├── 确定每张图的论文位置 │ ├── 注入 markdown 图片引用 + caption │ └── 更新交叉引用文本 │ ▼ Stage 22: Paper Export │── 复制 charts/ 到 submission/ │── converter.py 处理 subfigure 环境 └── 最终 LaTeX 编译验证 ``` --- ## Part B: BenchmarkAgent — 多 Agent 数据集与基准选择系统 ### B.1 问题分析 #### 当前架构缺陷 ``` 现状:dataset_guidance 提示词 (硬编码列表) + dataset_registry.yaml (静态清单) → LLM 自行选择 ``` | 问题 | 严重程度 | 说明 | |------|---------|------| | 数据集选择不智能 | Critical | 仅列出预缓存数据集,LLM 可能选择不合适的 benchmark | | 无领域适配 | Critical | 不根据研究领域搜索该领域的标准 benchmark | | 无最新性保证 | High | 不检查是否有更新、更好的 benchmark 可用 | | baseline 无法复现 | High | 不提供已有方法的参考实现 / 预训练权重 | | 下载路径硬编码 | 
Medium | 非预缓存数据集无法自动获取 | | 无数据集验证 | Medium | 不验证下载的数据集是否完整、格式正确 | #### 理想工作流 一个好的数据集/基准选择流程应该: 1. **理解研究问题** → 确定评估维度(分类精度?生成质量?推理速度?) 2. **搜索领域标准** → 查找该领域顶会论文常用的 benchmark 3. **评估适用性** → 数据集大小、难度、License、可获取性 4. **获取数据** → 自动下载或生成下载脚本 5. **获取 baseline** → 找到对比方法的开源实现或预训练权重 6. **验证完整性** → 确认数据集可正常加载和使用 ### B.2 目标架构 ``` ┌─────────────────────┐ │ BenchmarkAgent │ │ (Orchestrator) │ └──────────┬──────────┘ │ ┌──────────┬───────────┼───────────┐ ▼ ▼ ▼ ▼ ┌──────────┐┌──────────┐┌──────────┐┌──────────┐ │ Surveyor ││ Selector ││ Acquirer ││ Validator│ │ Agent ││ Agent ││ Agent ││ Agent │ └──────────┘└──────────┘└──────────┘└──────────┘ │ │ │ │ ▼ ▼ ▼ ▼ 领域调研 选择决策 数据获取 验证确认 ``` #### Agent 职责定义 **1. Orchestrator(编排器)** - 接收:研究主题、假设、实验设计方案 - 协调 4 个子 Agent 的执行 - 输出:`benchmark_plan.json`(包含数据集列表、下载脚本、baseline 方案) **2. Surveyor Agent(领域调研)** - 输入:研究主题关键词、相关文献列表 - 职责: - 搜索 Papers With Code 的领域 benchmark 排行榜 - 搜索 HuggingFace Datasets 的相关数据集 - 搜索 OpenML、Kaggle 的相关 benchmark - 分析近 2 年顶会论文(ICML、NeurIPS、ICLR)使用的数据集 - 汇总领域标准 benchmark 清单(含引用频次、数据规模、难度级别) - 输出:`survey_results.json` — 候选 benchmark 列表(按推荐度排序) - 数据源优先级: 1. Papers With Code (Benchmarks API) 2. HuggingFace Datasets Hub 3. torchvision / torchaudio / torchtext 内置 4. 顶会论文附录中的数据集描述 **3. Selector Agent(选择决策)** - 输入:survey_results.json + 实验约束(GPU 内存、时间预算、网络可用性) - 职责: - 根据约束过滤不可行的数据集(太大 / 需要申请 / License 不兼容) - 考虑 Docker sandbox 已缓存的数据集(优先使用) - 选择 primary benchmark(必须是领域标准)+ secondary benchmarks(补充验证) - 选择 baseline 方法(至少 2 个有开源实现的对比方法) - 生成选择理由文档(供论文 Experimental Setup section 使用) - 约束规则: - Tier 1(已缓存):无网络需求,最优先 - Tier 2(torchvision/HF datasets 可直接下载):需 setup 阶段网络 - Tier 3(需自定义下载脚本):仅在 `network_policy: full` 时可用 - 输出:`selected_benchmarks.json` + `baseline_methods.json` **4. 
Acquirer Agent(数据获取)** - 输入:selected_benchmarks.json - 职责: - 生成 `setup.py` 中的数据集下载代码 - 为每个数据集生成加载 boilerplate 代码 - 为 baseline 方法生成安装和调用代码 - 处理 HuggingFace `datasets.load_dataset()` / `torchvision.datasets` 等接口 - 生成 `requirements.txt` 中需要额外安装的包 - 输出: - `data_loading_snippets.py` — 数据加载代码片段(注入 CodeAgent) - `baseline_snippets.py` — baseline 调用代码片段 - `setup.py` 追加内容 — 下载脚本 **5. Validator Agent(验证确认)** - 输入:Acquirer 生成的下载/加载代码 - 职责: - 验证数据集 API 调用语法正确 - 验证数据集分割(train/val/test)存在 - 验证数据格式与实验代码兼容 - 验证 baseline 方法可运行 - 如果验证失败,反馈给 Acquirer 修复 - 输出:validation_report.json ### B.3 知识库设计 BenchmarkAgent 需要一个结构化知识库来支持决策: ```yaml # researchclaw/data/benchmark_knowledge.yaml domains: image_classification: standard_benchmarks: - name: CIFAR-10/100 source: torchvision tier: 1 # 已缓存 difficulty: easy/medium use_when: "小规模验证、快速原型" - name: ImageNet-1K source: torchvision tier: 3 # 需要下载 ~150GB difficulty: hard use_when: "大规模验证、与 SOTA 对比" common_baselines: - name: ResNet-50 source: "torchvision.models.resnet50(pretrained=True)" paper: "He et al., 2016" - name: ViT-B/16 source: "timm.create_model('vit_base_patch16_224', pretrained=True)" paper: "Dosovitskiy et al., 2021" reinforcement_learning: standard_benchmarks: - name: Gymnasium (MuJoCo) source: "gymnasium[mujoco]" tier: 2 - name: Atari source: "gymnasium[atari]" tier: 2 common_baselines: - name: PPO source: "stable-baselines3" paper: "Schulman et al., 2017" # ... 
更多领域 ``` ### B.4 实现计划 #### 文件结构 ``` researchclaw/ ├── agents/ │ └── benchmark_agent/ │ ├── __init__.py │ ├── orchestrator.py # BenchmarkAgent 主编排器 │ ├── surveyor.py # Surveyor Agent (领域调研) │ ├── selector.py # Selector Agent (选择决策) │ ├── acquirer.py # Acquirer Agent (数据获取) │ ├── validator.py # Validator Agent (验证确认) │ └── knowledge_base.py # 知识库加载和查询 ├── data/ │ ├── benchmark_knowledge.yaml # 领域 benchmark 知识库 │ └── dataset_registry.yaml # 已有数据集注册表 (保留) ``` #### 开发任务清单 | ID | 任务 | 依赖 | 估计改动量 | |----|------|------|-----------| | BA-01 | 创建 `agents/benchmark_agent/` 目录结构和基础类 | 无 | 新建 | | BA-02 | 编写 `benchmark_knowledge.yaml` 知识库(覆盖 10+ 领域) | 无 | ~500 行 YAML | | BA-03 | 实现 Surveyor Agent:Papers With Code API + HF Datasets 搜索 | BA-01 | ~350 行 | | BA-04 | 实现 Selector Agent:约束过滤 + Tier 匹配 + 选择逻辑 | BA-01, BA-02 | ~300 行 | | BA-05 | 实现 Acquirer Agent:代码生成 + 下载脚本 | BA-01, BA-04 | ~350 行 | | BA-06 | 实现 Validator Agent:语法/可用性验证 | BA-01, BA-05 | ~250 行 | | BA-07 | 实现 Orchestrator:编排 + 迭代修复 | BA-02 ~ BA-06 | ~250 行 | | BA-08 | 修改 executor.py:Stage 6/7 调用 BenchmarkAgent | BA-07 | ~150 行 | | BA-09 | 修改 executor.py:将 benchmark_plan 注入 CodeAgent | BA-07 | ~100 行 | | BA-10 | 更新 prompts.py:基于 BenchmarkAgent 输出动态构建提示词 | BA-07 | ~100 行 | | BA-11 | 测试:单元测试 + 集成测试 | BA-01 ~ BA-10 | ~300 行 | | BA-12 | 向后兼容:保留 `dataset_registry.yaml` 作为 fallback | BA-08 | ~30 行 | #### Pipeline 集成点 ``` Stage 3: Topic Initialization │── 研究主题确定 ▼ Stage 4-5: Literature Collection & Screening │── 文献列表生成 ▼ Stage 6: Hypothesis Generation │── 调用 BenchmarkAgent.orchestrate() │ ├── Surveyor: 搜索领域标准 benchmark │ ├── Selector: 根据约束选择最优 benchmark + baseline │ ├── Acquirer: 生成下载/加载代码 │ └── Validator: 验证代码可执行 │── 输出: benchmark_plan.json ▼ Stage 7: Experiment Design │── benchmark_plan.json 注入实验设计 │── 实验方案明确使用哪些数据集和 baseline ▼ Stage 8-9: Code Generation (CodeAgent) │── data_loading_snippets 注入生成代码 │── baseline_snippets 注入对比方法 ▼ Stage 10-11: Experiment Execution │── setup.py 执行数据集下载 │── main.py 使用生成的数据加载代码 ▼ Stage 14: Result 
Analysis │── 对比结果基于真实 baseline,可信度高 ``` --- ## Part C: 共同基础设施 ### C.1 Agent 基类 两个多 Agent 系统共享同一套基础设施: ```python # researchclaw/agents/base.py class BaseAgent: """所有子 Agent 的基类""" def __init__(self, llm_client, config): self.llm = llm_client self.config = config self.logger = logging.getLogger(self.__class__.__name__) async def execute(self, context: dict) -> dict: """执行 Agent 任务,返回结果""" raise NotImplementedError def _call_llm(self, system_prompt, user_prompt, **kwargs): """统一 LLM 调用接口""" return self.llm.chat(system_prompt, user_prompt, **kwargs) class AgentOrchestrator: """Agent 编排器基类""" def __init__(self, agents: list[BaseAgent], max_iterations=3): self.agents = agents self.max_iterations = max_iterations async def orchestrate(self, context: dict) -> dict: """执行多 Agent 编排流程""" raise NotImplementedError ``` ### C.2 与现有 LLM Client 的集成 两个系统都通过现有的 `researchclaw/llm/client.py` 调用 LLM: - Planner / Selector / Critic 等决策类 Agent → 使用 `gpt-4.1` 或 `gpt-4o` - CodeGen 类 Agent → 使用 `gpt-4.1`(代码生成能力最强) - VLM Critic → 使用 `gpt-4o`(支持 vision) ### C.3 配置扩展 ```yaml # config.yaml 新增配置 agents: figure_agent: enabled: true max_iterations: 3 # Critic 反馈最大迭代次数 min_figures: 4 # 最少图表数 style: "science+ieee" # matplotlib 样式 dpi: 300 format: "pdf" # 优先格式 vlm_review: true # 是否启用 VLM 视觉审查 benchmark_agent: enabled: true max_search_results: 20 # Papers With Code 最大搜索结果 prefer_cached: true # 优先使用已缓存数据集 tier_limit: 2 # 最高允许的 Tier 级别 (1=缓存, 2=可下载, 3=大型) min_baselines: 2 # 最少 baseline 方法数 ``` --- ## Part D: 风险与兜底 ### D.1 向后兼容 | 组件 | 兜底策略 | |------|---------| | FigureAgent 失败 | 回退到现有 `visualize.py` 生成基础图表 | | BenchmarkAgent 失败 | 回退到 `dataset_registry.yaml` + `dataset_guidance` 提示词 | | VLM 审查不可用 | 跳过视觉审查,仅做数值 + 文本验证 | | SciencePlots 未安装 | 使用 `seaborn-v0_8-whitegrid` 样式 | | 网络不可用 | Surveyor 使用本地 `benchmark_knowledge.yaml` | ### D.2 Token 成本控制 | 操作 | 预估 Token 消耗 | 控制策略 | |------|----------------|---------| | Planner (1 次) | ~2K input + ~1K output | 固定 | | CodeGen (4 图 × 最多 3 次迭代) | ~3K × 12 = ~36K | 迭代次数上限 
| | Critic (4 图 × 最多 3 次) | ~2K × 12 = ~24K | 迭代次数上限 | | VLM 审查 (4 图) | ~4K × 4 = ~16K | 仅终轮审查 | | Surveyor (1 次) | ~2K input + ~2K output | API 调用为主 | | Selector (1 次) | ~3K input + ~1K output | 固定 | | **总增量** | **~80K tokens** | 约增加 $0.30-0.50/run | ### D.3 测试策略 1. **单元测试**:每个 Agent 独立测试(mock LLM 响应) 2. **集成测试**:使用固定 results.json 测试 FigureAgent 完整流程 3. **回归测试**:确认 fallback 到旧系统仍可正常工作 4. **端到端测试**:Run 14+ 完整 Pipeline 运行,对比图表质量 --- ## Part E: 执行优先级 建议按以下顺序实施: ### Phase 1: FigureAgent 核心(优先级最高) 1. FA-01 ~ FA-03: 基础类 + Planner + CodeGen 2. FA-04 ~ FA-05: Renderer + Critic 3. FA-08: SciencePlots 集成 4. FA-12: 模板库 ### Phase 2: FigureAgent 集成 5. FA-06 ~ FA-07: Integrator + Orchestrator 6. FA-09 ~ FA-11: Pipeline 集成 7. FA-13 ~ FA-14: 测试 + 向后兼容 ### Phase 3: BenchmarkAgent 核心 8. BA-01 ~ BA-02: 基础类 + 知识库 9. BA-03 ~ BA-06: 4 个子 Agent 10. BA-07: Orchestrator ### Phase 4: BenchmarkAgent 集成 11. BA-08 ~ BA-10: Pipeline 集成 12. BA-11 ~ BA-12: 测试 + 向后兼容 ### Phase 5: 端到端验证 13. 完整 Pipeline 运行(Run 14+) 14. 对比图表质量和数据集选择质量 15. 
根据结果调优 --- ## Appendix: 参考资料 | 来源 | 关键收获 | |------|---------| | [AI Scientist v2](https://github.com/SakanaAI/AI-Scientist-v2) | VLM-as-critic, 首篇通过 ICLR workshop 审稿 | | [PlotGen (Adobe)](https://arxiv.org/abs/2502.00988) | 三模态反馈:数值 + 文本 + 视觉 | | [PaperBanana (Google)](https://github.com/llmsresearch/paperbanana) | Caption sharpening + 参考图库检索 | | [SciencePlots](https://github.com/garrettj403/SciencePlots) | 学术论文 matplotlib 样式库 | | [VLM-Enhanced Discovery](https://arxiv.org/html/2511.14631) | Correction mode + Discovery mode | | [Papers With Code API](https://paperswithcode.com/api/v1/) | 领域 benchmark 排行榜搜索 | | [HuggingFace Datasets](https://huggingface.co/docs/datasets/) | 数据集搜索和加载 API | ================================================ FILE: docs/figure_prompts/case_a_meta_learning.md ================================================ # Case A: Continual Meta-Learning — Image Generation Prompt ## Prompt A premium, modern data visualization infographic on a clean white background with subtle light-gray grid lines. The chart is a **line chart** showing progressive performance improvement across 5 data points on the X-axis (labeled "Self-Iteration Round"). **Overall title** at the top in bold dark navy sans-serif font: "Case A: Continual Meta-Learning for Few-Shot Adaptation" **Y-axis:** "Few-Shot Accuracy (%)" ranging from 15% to 105%. **X-axis:** "Self-Iteration Round" with 5 labeled tick marks. **Data points and line:** - Point 0 (Baseline): 25.9% — large circle marker, colored **slate gray** (#757575). X-label below: "Baseline" with a small gray beaker/flask icon, subtitle "(Initial Code)". - Point 1 (Iter 1): 81.2% — large circle marker, colored **emerald green** (#2E7D32). X-label: "Iter 1" with a small green brain/neural-network icon, subtitle "(Deep Encoder + Meta-SGD)". - Point 2 (Iter 2): 77.5% — large circle marker, colored **crimson red** (#C62828). X-label: "Iter 2" with a small red warning-triangle icon, subtitle "(Prototype Net — Regression)". 
- Point 3 (Iter 3): 93.4% — large circle marker, colored **emerald green** (#2E7D32). X-label: "Iter 3" with a small green rocket icon, subtitle "(Linear Clf + L2 Anchor)". - Point 4 (Iter 4): 93.4% — large circle marker, colored **slate gray** (#757575). X-label: "Iter 4" with a small gray checkmark-circle icon, subtitle "(Converged)". **Connecting line:** Thick (3px) solid line in **royal blue** (#1565C0) connecting all 5 points in order. The area below the line (above the baseline value of 25.9%) is filled with a very light semi-transparent blue wash (#1565C0 at 8% opacity). **Annotations with callout arrows:** - Near Point 1: A green callout box with text "+55.3 pts" in bold green, below it "Deep encoder + context-gated replay" in smaller green text. A thin green arrow points from the callout to Point 1. Include a small upward-arrow icon. - Near Point 2: A red italic callout "Prototype net too simple" with a thin red arrow pointing to Point 2. Include a small X-mark icon. - Near Point 3: A green callout box with text "+15.9 pts" in bold green, below it "Linear clf + L2 anchor + cosine gating" in smaller green text. A thin green arrow points from the callout to Point 3. Include a small upward-arrow icon. **Reference line:** A horizontal **dashed orange line** (#E65100) at y=100% with a small label "Oracle (100%)" at the right end in italic orange text. Include a small trophy/target icon next to the label. **Summary stats box:** Upper-left corner, a rounded rectangle with light blue background (#E3F2FD) and blue border (#1565C0), containing monospace text: ``` Baseline: 25.9% → Best: 93.4% Improvement: +67.5 pts (261% rel.) ``` **Legend** at the bottom center with three items, each with a colored square swatch: - Green square: "Improved" - Red square: "Regressed (auto-recovered)" - Gray square: "No change / Baseline" **Style:** Clean, professional, tech-forward aesthetic. Use a modern sans-serif font (like Inter, SF Pro, or Helvetica Neue). 
Subtle drop shadows on the summary box and annotation callouts. Smooth anti-aliased lines. The overall feel should be suitable for a top-tier AI research company's product page or investor deck — polished, data-rich, and visually compelling. High contrast text. No 3D effects. Flat design with depth through subtle shadows and layering. **Dimensions:** 1200 x 900 pixels, 2x retina resolution. ================================================ FILE: docs/figure_prompts/case_b_rlhf_alignment.md ================================================ # Case B: RLHF with Curriculum Reward Shaping — Image Generation Prompt ## Prompt A premium, modern data visualization infographic on a clean white background with subtle light-gray grid lines. The chart is a **line chart with square markers** showing progressive performance improvement across 5 data points on the X-axis (labeled "Self-Iteration Round"). **Overall title** at the top in bold dark navy sans-serif font: "Case B: RLHF with Curriculum-Based Reward Shaping for LLM Alignment" **Y-axis:** "LLM Alignment Score (%)" ranging from 15% to 105%. **X-axis:** "Self-Iteration Round" with 5 labeled tick marks. **Data points and line:** - Point 0 (Baseline): 35.6% — large square marker, colored **slate gray** (#757575). X-label below: "Baseline" with a small gray play-button icon, subtitle "(Vanilla PPO)". - Point 1 (Iter 1): 35.6% — large square marker, colored **slate gray** (#757575). X-label: "Iter 1" with a small gray pause icon, subtitle "(No Change)". - Point 2 (Iter 2): 61.6% — large square marker, colored **emerald green** (#2E7D32). X-label: "Iter 2" with a small green sparkle/star icon, subtitle "(+Reward Model +Curriculum)". - Point 3 (Iter 3): 63.0% — large square marker, colored **emerald green** (#2E7D32). X-label: "Iter 3" with a small green chart-trending-up icon, subtitle "(+Rank-Norm +Policy EMA)". - Point 4 (Iter 4): 66.6% — large square marker, colored **emerald green** (#2E7D32). 
X-label: "Iter 4" with a small green shield-check icon, subtitle "(+Confidence Gating)". **Connecting line:** Thick (3px) solid line in **deep purple** (#6A1B9A) connecting all 5 points in order. The area below the line (above the baseline value of 35.6%) is filled with a very light semi-transparent purple wash (#6A1B9A at 8% opacity). **Annotations with callout arrows:** - Near Point 1: A gray italic callout "No improvement (minor code fix)" with a thin gray arrow pointing down to Point 1. Include a small minus-circle icon. - Near Point 2: A green callout box with text "+26.0 pts" in bold green, below it "+Learned reward model" and "+Curriculum scheduling" in smaller green text. A thin green arrow points from the callout to Point 2. Include a small upward-arrow icon and a tiny brain icon. - Near Point 3: A smaller green callout with text "+1.4 pts" in green, below it "+Rank-norm +Policy EMA" in smaller text. A thin green arrow points to Point 3. Include a small upward-arrow icon. - Near Point 4: A green callout box with text "+3.6 pts" in bold green, below it "+Confidence gating" and "+Mini-batch RM" in smaller green text. A thin green arrow points to Point 4. Include a small upward-arrow icon and a tiny lock/shield icon. **Summary stats box:** Upper-left corner, a rounded rectangle with light purple background (#F3E5F5) and purple border (#6A1B9A), containing monospace text: ``` Baseline: 35.6% → Best: 66.6% Improvement: +31.0 pts (87% rel.) ``` **Legend** at the bottom center with three items, each with a colored square swatch: - Green square: "Improved" - Red square: "Regressed (auto-recovered)" - Gray square: "No change / Baseline" **Style:** Clean, professional, tech-forward aesthetic. Use a modern sans-serif font (like Inter, SF Pro, or Helvetica Neue). Subtle drop shadows on the summary box and annotation callouts. Smooth anti-aliased lines. 
The overall feel should be suitable for a top-tier AI research company's product page or investor deck — polished, data-rich, and visually compelling. High contrast text. No 3D effects. Flat design with depth through subtle shadows and layering. **Dimensions:** 1200 x 900 pixels, 2x retina resolution. ================================================ FILE: docs/integration-guide.md ================================================ # AutoResearchClaw Integration Guide > **The simplest way to use AutoResearchClaw**: give the repo URL to [OpenClaw](https://github.com/openclaw/openclaw) and say *"Research [your topic]."* That's it — OpenClaw handles cloning, installing, configuring, and running the entire 23-stage pipeline for you. This guide is for humans who want to understand what's happening under the hood, or who prefer to set things up manually. --- ## Table of Contents 1. [The Easy Way: OpenClaw](#1-the-easy-way-openclaw) 2. [Manual Setup](#2-manual-setup) 3. [Configuration Walkthrough](#3-configuration-walkthrough) 4. [Running the Pipeline](#4-running-the-pipeline) 5. [Understanding the 23 Stages](#5-understanding-the-23-stages) 6. [Output Artifacts](#6-output-artifacts) 7. [Experiment Modes](#7-experiment-modes) 8. [Conference Templates](#8-conference-templates) 9. [OpenClaw Bridge (Advanced)](#9-openclaw-bridge-advanced) 10. [MetaClaw Integration (Cross-Run Learning)](#10-metaclaw-integration-cross-run-learning) 11. [Other AI Platforms](#11-other-ai-platforms) 12. [Python API](#12-python-api) 13. [Troubleshooting](#13-troubleshooting) 14. [FAQ](#14-faq) --- ## 1. The Easy Way: OpenClaw If you use [OpenClaw](https://github.com/openclaw/openclaw) as your AI assistant, you don't need to read the rest of this guide. ### Steps 1. Share the GitHub repo URL with OpenClaw: ``` https://github.com/aiming-lab/AutoResearchClaw ``` 2. OpenClaw reads `RESEARCHCLAW_AGENTS.md` and `README.md` — it now understands the entire system. 
> **Note:** `RESEARCHCLAW_AGENTS.md` is generated locally and listed in `.gitignore`. If it doesn't exist, OpenClaw can bootstrap from `README.md` and the project structure. 3. Say something like: ``` Research the application of graph neural networks in drug discovery ``` 4. OpenClaw will: - Clone the repo - Create a virtual environment and install dependencies (`pip install -e .`) - Copy `config.researchclaw.example.yaml` → `config.yaml` - Ask you for an OpenAI API key (or use your environment variable) - Run the full 23-stage pipeline - Return the paper, experiment code, charts, and citations **That's the whole process.** OpenClaw is designed to read agent definition files and bootstrap itself. AutoResearchClaw ships with these files specifically so that any OpenClaw-compatible AI assistant can pick it up and run. ### What if I want to tweak settings? Tell OpenClaw in natural language: - *"Use GPT-5.2 instead of GPT-4o"* - *"Run experiments in sandbox mode, not simulated"* - *"Target ICLR 2025 format instead of NeurIPS"* - *"Skip the quality gate, just auto-approve everything"* OpenClaw will modify `config.yaml` accordingly before running the pipeline. --- ## 2. Manual Setup ### Prerequisites | Requirement | Details | |-------------|---------| | **Python** | 3.11 or newer | | **LLM API** | Any OpenAI-compatible endpoint (OpenAI, Azure, local proxy, etc.) | | **Disk space** | ~100 MB for the repo + artifacts per run | | **Network** | Required for LLM API calls and literature search (Semantic Scholar, arXiv) | ### Installation ```bash # Clone the repository git clone https://github.com/aiming-lab/AutoResearchClaw.git cd AutoResearchClaw # Create a virtual environment (recommended) python3 -m venv .venv source .venv/bin/activate # macOS/Linux # .venv\Scripts\activate # Windows # Install pip install -e . 
``` ### Verify Installation ```bash # Check the CLI is available researchclaw --help # Validate your configuration researchclaw validate --config config.yaml ``` --- ## 3. Configuration Walkthrough Start from the provided template: ```bash cp config.researchclaw.example.yaml config.yaml ``` Open `config.yaml` in your editor. Here's what each section does: ### LLM Settings (Required) This is the only section you **must** configure. Everything else has sensible defaults. ```yaml llm: base_url: "https://api.openai.com/v1" # Your LLM API endpoint api_key_env: "OPENAI_API_KEY" # Environment variable name... api_key: "" # ...or paste the key directly here primary_model: "gpt-4o" # Model to use (gpt-4o, gpt-5.2, etc.) fallback_models: # Tried in order if primary fails - "gpt-4.1" - "gpt-4o-mini" s2_api_key: "" # Optional: Semantic Scholar API key for higher rate limits ``` **Using an environment variable** (recommended for security): ```bash export OPENAI_API_KEY="sk-..." ``` **Using a direct key** (simpler, less secure): ```yaml llm: api_key: "sk-your-key-here" ``` **Using a proxy or alternative provider**: ```yaml llm: base_url: "https://your-proxy.example.com/v1" api_key: "your-proxy-key" primary_model: "gpt-4o" # Must be supported by your endpoint ``` ### Research Settings ```yaml research: topic: "Your research topic here" # Can also be set via CLI --topic flag domains: - "machine-learning" # Guides literature search scope daily_paper_count: 10 # Target papers to collect quality_threshold: 4.0 # Minimum paper quality score (1-5) ``` ### Experiment Settings ```yaml experiment: mode: "sandbox" # How experiments run (see Section 7) time_budget_sec: 300 # Max seconds per experiment run max_iterations: 10 # Max refinement loops in Stage 13 metric_key: "primary_metric" # What metric to optimize metric_direction: "minimize" # "minimize" or "maximize" sandbox: python_path: ".venv/bin/python3" # Python binary for sandbox execution gpu_required: false max_memory_mb: 4096 
code_agent: # CodeAgent v2 (multi-phase code generation) enabled: true # Architecture planning + sequential file gen + hard validation benchmark_agent: # Automated dataset & baseline selection enabled: true # 4-agent pipeline: Surveyor→Selector→Acquirer→Validator figure_agent: # Academic figure generation enabled: true # 5-agent pipeline: Planner→CodeGen→Renderer→Critic→Integrator repair: # Anti-fabrication experiment repair enabled: true # Diagnose and fix failed experiments before paper writing max_cycles: 3 # Repair retry loops opencode: # OpenCode Beast Mode (see README for details) enabled: true ``` ### Export Settings ```yaml export: target_conference: "neurips_2025" # See Section 8 for all available templates authors: "Anonymous" # Author line in the paper bib_file: "references" # BibTeX file name (without .bib) ``` ### Everything Else (Optional) These have reasonable defaults. Change them only if you need to: ```yaml project: name: "my-research" # Just an identifier for your run mode: "full-auto" # "docs-first", "semi-auto", or "full-auto" runtime: timezone: "America/New_York" max_parallel_tasks: 3 approval_timeout_hours: 12 retry_limit: 2 security: hitl_required_stages: [5, 9, 20] # Stages that pause for human approval allow_publish_without_approval: false notifications: channel: "console" # "console", "discord", or "slack" knowledge_base: backend: "markdown" root: "docs/kb" ``` --- ## 4. 
Running the Pipeline ### Basic Run ```bash # Run with topic from config.yaml researchclaw run --config config.yaml --auto-approve # Override topic from command line researchclaw run --config config.yaml --topic "Transformer attention for time series" --auto-approve ``` ### CLI Commands | Command | What It Does | |---------|-------------| | `researchclaw setup` | Interactive first-time setup (installs OpenCode Beast Mode, checks Docker/LaTeX) | | `researchclaw init` | Interactive config creation (choose LLM provider, creates `config.arc.yaml`) | | `researchclaw run` | Run the full 23-stage pipeline | | `researchclaw validate` | Check your config file for errors | | `researchclaw doctor` | Diagnose environment issues (Python, dependencies, API connectivity) | | `researchclaw report --run-dir <run-dir>` | Generate a human-readable summary of a completed run | ### Run Flags | Flag | Effect | |------|--------| | `--topic "..."` | Override the topic in config.yaml | | `--config path` | Config file path (default: `config.yaml`) | | `--output path` | Output directory (default: `artifacts/<run-id>/`) | | `--auto-approve` | Skip manual approval at gate stages (5, 9, 20) | | `--from-stage STAGE_NAME` | Start from a specific stage (e.g., `PAPER_OUTLINE`) | | `--resume` | Resume from the last checkpoint (auto-detects the most recent run matching your topic) | | `--skip-preflight` | Skip LLM connectivity check before starting | | `--skip-noncritical-stage` | Skip non-critical stages on failure instead of aborting | | `--no-graceful-degradation` | Fail pipeline on quality gate failure instead of degrading gracefully | ### Examples ```bash # Full autonomous run — no human intervention researchclaw run -c config.yaml -t "Graph neural networks for protein folding" --auto-approve # Resume a failed run from where it stopped researchclaw run -c config.yaml --resume --auto-approve # Re-run just the paper writing stages researchclaw run -c config.yaml --from-stage PAPER_OUTLINE --auto-approve # Check 
your setup before running researchclaw doctor -c config.yaml ``` --- ## 5. Understanding the 23 Stages The pipeline runs in 8 phases. Each stage reads artifacts from previous stages and produces new ones. ### Phase A: Research Scoping | # | Stage | What Happens | Produces | |---|-------|-------------|----------| | 1 | TOPIC_INIT | LLM formulates a SMART research goal; auto-detects GPU hardware (NVIDIA/MPS/CPU) | `goal.md`, `hardware_profile.json` | | 2 | PROBLEM_DECOMPOSE | Breaks the goal into prioritized sub-questions | `problem_tree.md` | ### Phase B: Literature Discovery | # | Stage | What Happens | Produces | |---|-------|-------------|----------| | 3 | SEARCH_STRATEGY | Plans search queries and data sources | `search_plan.yaml`, `sources.json` | | 4 | LITERATURE_COLLECT | Queries **real APIs** (arXiv-first, then Semantic Scholar) with expanded queries for broad coverage | `candidates.jsonl` | | 5 | LITERATURE_SCREEN | **[Gate]** Filters by relevance and quality | `shortlist.jsonl` | | 6 | KNOWLEDGE_EXTRACT | Extracts structured knowledge cards from each paper | `cards/` | ### Phase C: Knowledge Synthesis | # | Stage | What Happens | Produces | |---|-------|-------------|----------| | 7 | SYNTHESIS | Clusters findings, identifies research gaps | `synthesis.md` | | 8 | HYPOTHESIS_GEN | Generates falsifiable hypotheses | `hypotheses.md` | ### Phase D: Experiment Design | # | Stage | What Happens | Produces | |---|-------|-------------|----------| | 9 | EXPERIMENT_DESIGN | **[Gate]** Designs experiment plan with baselines and metrics | `exp_plan.yaml` | | 10 | CODE_GENERATION | LLM writes hardware-aware experiment code (adapts packages/constraints to GPU tier) | `experiment.py`, `experiment_spec.md` | | 11 | RESOURCE_PLANNING | Estimates GPU/time requirements | `schedule.json` | ### Phase E: Experiment Execution | # | Stage | What Happens | Produces | |---|-------|-------------|----------| | 12 | EXPERIMENT_RUN | Runs the experiment code (sandbox or simulated); 
immutable harness injected for time guard and metric validation; partial results captured on timeout | `runs/` | | 13 | ITERATIVE_REFINE | LLM analyzes results, improves code, re-runs (up to 10 iterations); timeout-aware prompts; NaN/divergence fast-fail; stdout truncated for context efficiency | `refinement_log.json`, `experiment_final.py` | ### Phase F: Analysis & Decision | # | Stage | What Happens | Produces | |---|-------|-------------|----------| | 14 | RESULT_ANALYSIS | Statistical analysis of experiment results | `analysis.md` | | 15 | RESEARCH_DECISION | PROCEED / PIVOT decision with evidence | `decision.md` | ### Phase G: Paper Writing | # | Stage | What Happens | Produces | |---|-------|-------------|----------| | 16 | PAPER_OUTLINE | Creates section-level paper outline | `outline.md` | | 17 | PAPER_DRAFT | Writes paper section-by-section (3 LLM calls, 5,000-6,500 words); **hard-blocked when no experiment metrics** (anti-fabrication); conference-grade title guidelines and abstract structure injected | `paper_draft.md` | | 18 | PEER_REVIEW | Simulates 2+ reviewer perspectives with NeurIPS/ICML rubric (1-10 scoring); checks baselines, ablations, claims vs evidence | `reviews.md` | | 19 | PAPER_REVISION | Addresses review comments with length guard (auto-retries if revised paper is shorter than draft) | `paper_revised.md` | ### Phase H: Finalization | # | Stage | What Happens | Produces | |---|-------|-------------|----------| | 20 | QUALITY_GATE | **[Gate]** Checks paper quality score | `quality_report.json` | | 21 | KNOWLEDGE_ARCHIVE | Saves retrospective + reproducibility bundle | `archive.md`, `bundle_index.json` | | 22 | EXPORT_PUBLISH | Generates LaTeX, charts, and code package | `paper_final.md`, `paper.tex`, `code/` | | 23 | CITATION_VERIFY | Fact-checks all references against real APIs | `verification_report.json`, `references_verified.bib` | ### Gate Stages Three stages pause for human review (unless `--auto-approve` is set): | Gate | What's Being 
Reviewed | On Reject, Rolls Back To | |------|-----------------------|--------------------------| | Stage 5 | Are the collected papers relevant and sufficient? | Stage 4 (re-collect literature) | | Stage 9 | Is the experiment design sound? | Stage 8 (re-generate hypotheses) | | Stage 20 | Does the paper meet quality standards? | Stage 16 (re-write from outline) | For fully autonomous operation, always use `--auto-approve`. --- ## 6. Output Artifacts Each run creates a timestamped directory under `artifacts/`: ``` artifacts/rc-20260310-143200-a1b2c3/ ├── stage-1/goal.md # Research goal ├── stage-2/problem_tree.md # Problem decomposition ├── stage-3/search_plan.yaml # Search strategy ├── stage-4/candidates.jsonl # Raw literature results ├── stage-5/shortlist.jsonl # Screened papers ├── stage-6/cards/ # Knowledge cards (one per paper) ├── stage-7/synthesis.md # Research gap analysis ├── stage-8/hypotheses.md # Research hypotheses ├── stage-9/exp_plan.yaml # Experiment plan ├── stage-10/experiment.py # Generated experiment code ├── stage-10/experiment_spec.md # Experiment specification ├── stage-11/schedule.json # Resource schedule ├── stage-12/runs/run-1.json # Experiment results ├── stage-13/experiment_final.py # Refined experiment code ├── stage-13/experiment_v1.py # Iteration 1 snapshot ├── stage-13/refinement_log.json # Refinement history ├── stage-14/analysis.md # Statistical analysis ├── stage-14/experiment_summary.json # Metrics summary ├── stage-15/decision.md # Proceed/Pivot decision ├── stage-16/outline.md # Paper outline ├── stage-17/paper_draft.md # Full paper draft ├── stage-18/reviews.md # Simulated peer reviews ├── stage-19/paper_revised.md # Revised paper ├── stage-20/quality_report.json # Quality assessment ├── stage-21/archive.md # Knowledge retrospective ├── stage-22/ │ ├── paper_final.md # Final paper (Markdown) │ ├── paper.tex # Conference-ready LaTeX │ ├── references.bib # BibTeX references │ ├── charts/ # Result visualizations │ └── code/ # 
Open-source code package │ ├── experiment.py │ ├── requirements.txt │ └── README.md ├── stage-23/ │ ├── verification_report.json # Citation fact-check results │ └── references_verified.bib # Cleaned bibliography └── pipeline_summary.json # Overall execution summary ``` ### Key Output Files | File | What You'll Use It For | |------|----------------------| | `stage-22/paper.tex` | Submit to a conference (compile with `pdflatex` or `tectonic`) | | `stage-22/paper_final.md` | Read or further edit the paper | | `stage-22/references.bib` | Bibliography for LaTeX compilation | | `stage-22/code/` | Share experiment code alongside the paper | | `stage-23/verification_report.json` | Check which citations are real vs. hallucinated | | `stage-13/experiment_final.py` | The best-performing experiment code | | `stage-22/charts/` | Figures for the paper | --- ## 7. Experiment Modes AutoResearchClaw supports four modes for running experiments: ### Simulated (Default) ```yaml experiment: mode: "simulated" ``` The LLM **generates synthetic experiment results** without executing any code. This is fast and requires no special setup, but the results are not real. **Best for**: Quick prototyping, testing the pipeline end-to-end, environments without Python scientific packages. ### Sandbox ```yaml experiment: mode: "sandbox" sandbox: python_path: ".venv/bin/python3" gpu_required: false max_memory_mb: 4096 ``` The pipeline **generates Python code and actually runs it** in a subprocess. The code is validated before execution (AST parsing, import whitelist, no file I/O outside sandbox). **Hardware-aware**: Stage 1 auto-detects your GPU (NVIDIA CUDA / Apple MPS / CPU-only) and adapts the generated code accordingly — high-tier GPUs get full PyTorch code, limited GPUs get lightweight experiments, CPU-only gets NumPy/sklearn only. **Best for**: Real experiments on your local machine. 
Supports numpy and stdlib; deep learning frameworks (torch, tensorflow) are available if installed in your environment and GPU is detected. **Safety features**: - Code validation blocks dangerous operations (subprocess, eval, exec, network calls) - Configurable memory limit and execution timeout - Auto-repair: if generated code has validation errors, the LLM fixes them (up to 3 attempts) ### Docker ```yaml experiment: mode: "docker" docker: image: "researchclaw/experiment:latest" gpu_enabled: true memory_limit_mb: 8192 network_policy: "setup_only" # none | setup_only | pip_only | full auto_install_deps: true shm_size_mb: 2048 ``` The pipeline runs generated code inside a **Docker container** with GPU passthrough, dependency auto-installation, and network isolation. Execution follows a **three-phase model** within a single container: 1. **Phase 0 (pip install)**: Installs auto-detected dependencies from `requirements.txt` (network enabled) 2. **Phase 1 (setup.py)**: Runs `setup.py` for dataset downloads and environment preparation (network enabled) 3. **Phase 2 (experiment)**: Executes the experiment code (network disabled by default via iptables) **Network policies**: - `none` — No network at all (all phases offline). Requires all deps pre-installed in image. - `setup_only` (default) — Network during Phase 0+1, disabled before Phase 2 via iptables (`--cap-add=NET_ADMIN`). - `pip_only` — Network only during Phase 0 (pip install), disabled for Phase 1+2. - `full` — Network available throughout all phases. **Pre-cached datasets**: The Docker image includes CIFAR-10/100, MNIST, FashionMNIST, STL-10, and SVHN at `/opt/datasets`, mounted read-only as `/workspace/data`. No download needed for these standard benchmarks. **Best for**: Reproducible experiments with full dependency isolation. Supports GPU passthrough (NVIDIA) and configurable network policies. 
**Setup**: Build the image first: ```bash docker build -t researchclaw/experiment:latest researchclaw/docker/ ``` ### SSH Remote ```yaml experiment: mode: "ssh_remote" ssh_remote: host: "gpu-server.example.com" gpu_ids: [0, 1] remote_workdir: "/tmp/researchclaw_experiments" ``` The pipeline sends generated code to a remote GPU server for execution. **Best for**: Experiments that require GPU hardware you don't have locally. --- ## 8. Conference Templates AutoResearchClaw generates LaTeX files formatted for specific conferences: ```yaml export: target_conference: "neurips_2025" ``` | Conference | Config Value | Layout | |------------|-------------|--------| | NeurIPS 2025 | `neurips_2025` (default) | Single-column, `neurips_2025` style | | NeurIPS 2024 | `neurips_2024` | Single-column, `neurips_2024` style | | ICLR 2026 | `iclr_2026` | Single-column, `iclr2026_conference` style | | ICLR 2025 | `iclr_2025` | Single-column, `iclr2025_conference` style | | ICML 2026 | `icml_2026` | Double-column, `icml2026` style | | ICML 2025 | `icml_2025` | Double-column, `icml2025` style | Short aliases are also accepted: `neurips` (→ 2025), `iclr` (→ 2026), `icml` (→ 2026). The Markdown-to-LaTeX converter handles: - Section headings (`#`, `##`, `###`) - Inline and display math (`$...$`, `$$...$$`) - Bold and italic text - Ordered and unordered lists - Tables - Code blocks - Citation references (`[cite_key]` → `\cite{cite_key}`) ### Compiling the LaTeX ```bash # Using tectonic (recommended) tectonic artifacts/<run-dir>/stage-22/paper.tex # Using pdflatex cd artifacts/<run-dir>/stage-22/ pdflatex paper.tex bibtex paper pdflatex paper.tex pdflatex paper.tex ``` --- ## 9. OpenClaw Bridge (Advanced) For deeper integration with OpenClaw, AutoResearchClaw includes a bridge adapter system.
Each flag in the config activates a typed protocol interface: ```yaml openclaw_bridge: use_cron: true # Scheduled research runs use_message: true # Progress notifications (Discord/Slack/Telegram) use_memory: true # Cross-session knowledge persistence use_sessions_spawn: true # Spawn parallel sub-sessions for concurrent stages use_web_fetch: true # Live web search during literature review use_browser: false # Browser-based paper collection ``` ### What Each Adapter Does | Adapter | Protocol | Use Case | |---------|----------|----------| | **Cron** | `CronAdapter.schedule_resume(run_id, stage_id, reason)` | Schedule pipeline resumption (e.g., daily re-runs) | | **Message** | `MessageAdapter.notify(channel, subject, body)` | Send progress updates to chat platforms | | **Memory** | `MemoryAdapter.append(namespace, content)` | Persist knowledge across sessions | | **Sessions** | `SessionsAdapter.spawn(name, command)` | Run pipeline stages in parallel sub-sessions | | **WebFetch** | `WebFetchAdapter.fetch(url)` | Fetch web pages during literature search | | **Browser** | `BrowserAdapter.open(url)` | Open and interact with web pages | When OpenClaw provides a capability (e.g., message sending), the adapter consumes it automatically. When running standalone, recording stubs capture all calls for debugging without side effects. This is an **extension point** — you don't need to configure it for basic usage. --- ## 10. MetaClaw Integration (Cross-Run Learning) [MetaClaw](https://github.com/aiming-lab/MetaClaw) adds **cross-run knowledge transfer** to AutoResearchClaw. When enabled, the pipeline automatically captures lessons from failures and converts them into reusable skills that improve subsequent runs. ### Architecture ``` ┌──────────────────────────────────────────────────────┐ │ AutoResearchClaw Pipeline │ │ Stage 1 → 2 → ... 
→ 23 │ │ │ │ ┌─────────────┐ ┌──────────────────────────────┐ │ │ │ LLMClient │───▶│ MetaClaw Integration Layer │ │ │ │ │ │ (metaclaw_bridge module) │ │ │ └─────────────┘ └──────────┬───────────────────┘ │ │ │ │ │ ┌─────────────┐ ┌──────────▼───────────────────┐ │ │ │ Evolution │◀──▶│ Lesson ↔ Skill Bridge │ │ │ │ Store │ └─────────────────────────────┘ │ │ └─────────────┘ │ └──────────────────────────┬───────────────────────────┘ │ ┌──────────────▼──────────────┐ │ MetaClaw Proxy Server │ │ (optional, :30000) │ │ ┌────────────────────────┐ │ │ │ SkillManager (40+ skills)│ │ │ │ + arc-* learned skills │ │ │ └────────────────────────┘ │ └─────────────────────────────┘ ``` ### How It Works 1. **Lesson Capture**: During each pipeline run, the `EvolutionStore` automatically records failures, warnings, and anomalies as structured lessons in `evolution/lessons.jsonl`. 2. **Lesson → Skill Conversion**: After a run completes, lessons above a configurable severity threshold are converted into `arc-*` skill files stored in `~/.metaclaw/skills/`. Each skill contains: trigger conditions, failure root cause, and actionable guidance. 3. **Skill Injection**: On the next run, `build_overlay()` reads all `arc-*` skills and injects them into the LLM prompt for every stage via the `evolution_overlay` parameter. The LLM receives explicit instructions to avoid previously encountered pitfalls. 4. **Proxy Routing (Optional)**: When the MetaClaw proxy is running, LLM requests are routed through it for additional skill matching and session tracking. If the proxy is unavailable, requests automatically fall back to the direct LLM endpoint. ### Setup #### Step 1: Install MetaClaw ```bash pip install metaclaw # Or clone from source: git clone https://github.com/aiming-lab/MetaClaw.git cd metaclaw && pip install -e . 
``` #### Step 2: Configure Add the `metaclaw_bridge` section to your `config.arc.yaml`: ```yaml metaclaw_bridge: enabled: true proxy_url: "http://localhost:30000/v1" # MetaClaw proxy (optional) skills_dir: "~/.metaclaw/skills" # Skill storage directory fallback_url: "https://api.openai.com/v1" # Direct LLM fallback fallback_api_key_env: "OPENAI_API_KEY" lesson_to_skill: enabled: true min_severity: "warning" # Convert warnings + errors max_skills_per_run: 5 # Max new skills per run ``` #### Step 3: Run ```bash # First run — captures lessons, generates initial skills researchclaw run --config config.arc.yaml --topic "Your idea" --auto-approve # Check generated skills ls ~/.metaclaw/skills/arc-*/SKILL.md # Second run — skills from Run 1 are automatically injected researchclaw run --config config.arc.yaml --topic "Your idea" --auto-approve ``` #### Optional: Start MetaClaw Proxy For full skill matching and session tracking: ```bash metaclaw start --mode skills_only --port 30000 # Or use the provided script: bash scripts/metaclaw_start.sh ``` The proxy is optional — without it, the pipeline still benefits from skill injection via `build_overlay()` and falls back to your configured LLM endpoint. ### Experiment Results In controlled A/B experiments (same topic, same LLM, same configuration): | Metric | Baseline | With MetaClaw | Improvement | |--------|----------|---------------|-------------| | Stage retry rate | 10.5% | 7.9% | **-24.8%** | | Refine cycle count | 2.0 | 1.2 | **-40.0%** | | Pipeline stage completion | 18/19 | 19/19 | **+5.3%** | | Overall robustness score (composite) | 0.714 | 0.845 | **+18.3%** | > Composite robustness score is a weighted average of stage completion rate (40%), retry reduction (30%), and refine cycle efficiency (30%). 
### Key Files | File | Purpose | |------|---------| | `researchclaw/metaclaw_bridge/` | Integration module (config, session, lesson_to_skill, prm_gate, skill_feedback) | | `researchclaw/evolution.py` | `build_overlay()` — reads intra-run lessons + cross-run arc-* skills | | `researchclaw/llm/client.py` | Proxy routing with automatic fallback | | `~/.metaclaw/skills/arc-*/SKILL.md` | Learned skill files (auto-generated) | | `scripts/metaclaw_start.sh` | Helper script to launch MetaClaw proxy | ### Backward Compatibility - **Default: OFF.** Without `metaclaw_bridge.enabled: true`, the pipeline is completely unchanged. - **No new required dependencies.** MetaClaw is optional. - **All 1,823 existing tests pass** with the integration code. --- ## 11. Other AI Platforms AutoResearchClaw works with any AI coding assistant that can read project context files. ### Claude Code Claude Code automatically reads `RESEARCHCLAW_CLAUDE.md` (if present) when you open the project. It also loads the skill definition from `.claude/skills/researchclaw/SKILL.md`. > **Note:** `RESEARCHCLAW_CLAUDE.md` is generated locally and listed in `.gitignore`. The `.claude/skills/researchclaw/SKILL.md` file is always available in the repo. ``` You: Research the impact of attention mechanisms on speech recognition Claude: [Reads project context, runs the pipeline, returns results] ``` ### Copilot CLI (GitHub) GitHub Copilot can be used as an ACP agent via the `gh` CLI command (GitHub CLI with Copilot extension). Set the ACP agent to `gh` in your config: ```yaml llm: provider: "acp" acp: agent: "gh" cwd: "." ``` Prerequisites: 1. Install [GitHub CLI](https://cli.github.com/) (`gh`) 2. Install the Copilot extension: `gh extension install github/gh-copilot` 3. Authenticate: `gh auth login` ### OpenCode OpenCode loads skills from `.claude/skills/`. The `researchclaw` skill activates on research-related queries and guides the agent through the pipeline. 
### Any AI CLI Provide `RESEARCHCLAW_AGENTS.md` (if generated locally) or `README.md` as context to any AI assistant. `RESEARCHCLAW_AGENTS.md` contains: - The agent role definition (research orchestrator) - Quick setup instructions - Pipeline stage reference - Decision guide for common scenarios The agent reads this file and knows how to install, configure, and run the pipeline. If the file is not present, the `README.md` and `.claude/skills/researchclaw/SKILL.md` provide sufficient context for any AI assistant to operate the pipeline. --- ## 12. Python API For programmatic use or custom integrations: ```python from researchclaw.pipeline.runner import execute_pipeline from researchclaw.config import RCConfig from researchclaw.adapters import AdapterBundle from pathlib import Path # Load configuration config = RCConfig.load("config.yaml", check_paths=False) # Run the full pipeline results = execute_pipeline( run_dir=Path("artifacts/my-run"), run_id="run-001", config=config, adapters=AdapterBundle(), auto_approve_gates=True, ) # Check results for result in results: print(f"Stage {result.stage.name}: {result.status.value}") ``` ### Iterative Pipeline (Multiple Paper Revisions) ```python from researchclaw.pipeline.runner import execute_iterative_pipeline results = execute_iterative_pipeline( run_dir=Path("artifacts/my-run"), run_id="run-001", config=config, adapters=AdapterBundle(), max_iterations=3, # Re-run paper writing up to 3 times convergence_rounds=2, # Stop if quality stabilizes for 2 rounds ) ``` ### Literature Search Only ```python from researchclaw.literature.search import search_papers papers = search_papers("transformer attention mechanisms", limit=20) for p in papers: print(f"{p.title} ({p.year}) — cited {p.citation_count}x") print(p.to_bibtex()) ``` --- ## 13. 
Troubleshooting ### Pre-Run Diagnostics ```bash # Check everything: Python version, dependencies, API connectivity, config validity researchclaw doctor --config config.yaml ``` ### Common Issues | Problem | Cause | Solution | |---------|-------|----------| | `Missing required field: llm.base_url` | Config incomplete | Set `llm.base_url` and `llm.api_key` (or `api_key_env`) | | `Config validation FAILED` | Invalid YAML or missing fields | Run `researchclaw validate -c config.yaml` for details | | `Preflight check... FAILED` | LLM API unreachable | Check `base_url`, API key, and network connectivity | | Sandbox execution fails | Python path wrong or missing packages | Verify `experiment.sandbox.python_path` exists; ensure numpy is installed | | Code validation rejects all attempts | LLM generates unsafe code | Switch to `simulated` mode, or try a more capable model | | Gate stage blocks pipeline | Manual approval required | Use `--auto-approve` for autonomous mode | | Pipeline fails mid-run | Transient API error | Run with `--resume` to continue from the last checkpoint | | Citations marked HALLUCINATED | LLM invented fake references | This is expected — Stage 23 catches these. Use `references_verified.bib` instead | | LaTeX won't compile | Missing style packages | Install the conference style files, or use `tectonic` which auto-downloads them | ### Resuming a Failed Run ```bash # Resume from the exact point of failure researchclaw run -c config.yaml --resume --auto-approve # Or restart from a specific stage researchclaw run -c config.yaml --from-stage EXPERIMENT_RUN --auto-approve --output artifacts/ ``` ### Reading a Run Report ```bash researchclaw report --run-dir artifacts/rc-20260310-143200-a1b2c3 ``` This prints a human-readable summary: which stages passed, which failed, key metrics, and paper quality scores. --- ## 14. FAQ **Q: How much does a full pipeline run cost in API credits?** A: Depends on your model and topic complexity. 
A typical run with GPT-4o makes ~35-60 API calls across all 23 stages (paper drafting now uses 3 sequential calls for section-by-section writing). Expect roughly $3-12 per run. Simulated mode uses slightly fewer tokens since it doesn't generate real experiment code. **Q: Can I use a local LLM (Ollama, vLLM, etc.)?** A: Yes — any OpenAI-compatible endpoint works. Set `llm.base_url` to your local server (e.g., `http://localhost:11434/v1` for Ollama). Quality depends heavily on the model's capabilities. **Q: Can I run only part of the pipeline?** A: Yes. Use `--from-stage STAGE_NAME` to start from any stage. The stage reads its inputs from previously generated artifacts, so the earlier stages must have completed at least once. **Q: Are the literature references real?** A: Yes. Stage 4 uses a multi-source strategy (arXiv-first, then Semantic Scholar) with query expansion to find real papers with real titles, DOIs, and citation counts. The pipeline typically collects 100-200 candidates and aims for 30-60 references in the final paper. Stage 23 then verifies every reference to catch any that the LLM might have hallucinated during paper writing. **Q: Can I use this for a real paper submission?** A: AutoResearchClaw is a research tool, not a paper mill. The output is a strong first draft that should be reviewed, improved, and validated by a human researcher before submission. Think of it as an extremely thorough research assistant. **Q: What happens if the LLM API goes down mid-run?** A: The pipeline checkpoints after every stage. Use `--resume` to pick up where it left off. Failed stages are retried according to the `max_retries` setting in each stage's contract. **Q: Can I change the research topic mid-run?** A: Not recommended — the pipeline builds on prior stages' outputs. Start a new run with the new topic instead. 
--- *Last updated: March 2026 · AutoResearchClaw v0.3.1+* ================================================ FILE: docs/issue_tracker_v9.md ================================================ # AutoResearchClaw — Issue Tracker v9 > Created: 2026-03-15 > Status: **Active** — tracking all known issues from Phase 0-3 regression tests > Covers: Run 7-13 findings, V8 merge improvements, upstream sync --- ## Issue Summary | Category | Total | Fixed | Partial | Open | |----------|-------|-------|---------|------| | LaTeX & Title | 5 | 5 | 0 | 0 | | Experiment Quality | 6 | 6 | 0 | 0 | | Code Generation | 4 | 4 | 0 | 0 | | Writing Quality | 5 | 5 | 0 | 0 | | Literature & Citations | 4 | 4 | 0 | 0 | | Infrastructure (Docker) | 5 | 5 | 0 | 0 | | Pipeline Logic | 3 | 3 | 0 | 0 | | New Feature Requests | 2 | 1 | 0 | 1 | | Run 13 Findings | 3 | 3 | 0 | 0 | | **Total** | **37** | **36** | **0** | **1** | --- ## 1. LaTeX & Title Issues ### I-01: Title extraction fails on `##` headings (FIXED) - **Severity**: High - **Status**: FIXED — v9 patch - **Root Cause**: `_extract_paper_title()` in `executor.py:242` only matched `# ` (H1). When LLM generates `## Title ...` (H2), no candidates were found → returned `"Untitled Paper"`. - **Affected Runs**: Run 12 (`\title{Untitled Paper}`) - **Files**: - `researchclaw/pipeline/executor.py:240-253` — regex now matches `#{1,2}`, strips "Title " prefix - `researchclaw/templates/converter.py:429-451` — `_extract_title()` now handles level 1 and 2 - **Fix**: Added H2 fallback; handles `## Title ` pattern by stripping literal "Title " prefix. ### I-02: Converter `_extract_title` also level-1 only (FIXED) - **Severity**: Medium - **Status**: FIXED — v9 patch (same fix as I-01) - **File**: `researchclaw/templates/converter.py:434,442,447` — `sec.level in (1, 2)` + "Title " prefix strip - **Note**: Both I-01 and I-02 fixed together. 
### I-03: LaTeX outer fence not stripped (FIXED) - **Severity**: High - **Status**: FIXED — commit `3792fd6` - **File**: `converter.py:107-117` - **Fix**: Greedy regex + boundary strip ### I-04: Metric values 16 decimal places (FIXED) - **Severity**: Medium - **Status**: FIXED — commit `3792fd6` - **File**: `converter.py:119-133` - **Fix**: `_round_raw_metrics()` rounds to 4 places ### I-05: Duplicate tables in LaTeX output (FIXED) - **Severity**: Medium - **Status**: FIXED — IMP-30, commit `b88aba2` - **File**: `converter.py:542-575` - **Fix**: `_deduplicate_tables()` by header row matching --- ## 2. Experiment Quality Issues ### I-06: Experiments only run n=1 seeds (FIXED) - **Severity**: High - **Status**: FIXED — v9 patch - **Evidence**: Run 11 (n=1), Run 12 (n=1), Run 13 (n=1) - **Root Cause**: Time budget pressure + weak enforcement in prompts. - **Fix**: Added `multi_seed_enforcement` block in `prompts.py` with mandatory implementation pattern (3-5 seeds), adaptive seed count, and concrete code template. Injected into code_generation for all sandbox/docker experiments via `executor.py`. - **Files**: `prompts.py` (new `multi_seed_enforcement` block), `executor.py:2145-2149` ### I-07: Ablation methods produce identical outputs (FIXED) - **Severity**: High - **Status**: FIXED — v9.1 patch - **Evidence**: Run 12 — ablation checker flagged many identical conditions - **Files**: - `executor.py:3838-3866` — identical condition detection (WORKS) - `executor.py:3876+` — zero-variance detection across all conditions (NEW) - `validator.py:607-658` — deep AST ablation override check (NEW) - `prompts.py:969+` — stronger ablation guidance - **Fix**: Added Check 5 in `validate_experiment_classes()` — compares AST dumps of overridden methods between child and parent classes. If all overrides are identical AST, warns that ablation is fake. Also added R13-1 zero-variance detection in executor analysis stage. 
### I-08: RL training steps insufficient (FIXED) - **Severity**: High - **Status**: FIXED — v9 patch - **Evidence**: Run 13 PPO/SAC/TD3 all near-zero reward after 60k steps - **Root Cause**: RL algorithms need 500k-1M+ steps for MuJoCo tasks - **Fix**: Added `rl_step_guidance` block in `prompts.py` with per-algorithm minimum steps table (PPO MuJoCo: 500K min / 1M-3M recommended, SAC/TD3: 300K min, etc.), step budget allocation strategy, and evaluation protocol. Auto-detected via topic keywords and injected into both experiment_design and code_generation prompts. - **Files**: `prompts.py` (new `rl_step_guidance` block), `executor.py:2161-2174` (code_gen), `executor.py:1960-1963` (exp_design) ### I-09: All experiment methods fail (zero metrics) (FIXED) - **Severity**: Critical - **Status**: FIXED — Run 7-10 all had this, Runs 11-12 improved - **Fixes applied**: Docker deps (commit `787172d`), training epochs (commit `787172d`), anti-simulation rules (commit `44151b1`) - **Verification**: Run 11 still has 0 metrics (QLoRA instability), Run 12 has valid metrics ### I-10: `.ptp()` NumPy 2.0 API removed (FIXED) - **Severity**: High - **Status**: FIXED — commit `44151b1` - **File**: `validator.py` — forbidden patterns detection - **Fix**: Detect and replace deprecated NumPy APIs before execution ### I-11: Experiment results not framed correctly in paper (FIXED) - **Severity**: Medium - **Status**: FIXED — IMP-10 contradiction detection - **File**: `executor.py:_detect_result_contradictions()` - **Fix**: Auto-frames null results, warns about negative results --- ## 3. 
Code Generation Issues ### I-12: Code too simplistic / lazy implementations (FIXED) - **Severity**: Critical - **Status**: FIXED — commit `cb4af26` - **Files**: `validator.py` (AST analysis), `executor.py` (LLM code review stage 10.5) - **Fix**: Minimum 50 lines per algorithm class, empty subclass detection ### I-13: dict[key] crashes without .get() (FIXED) - **Severity**: Medium - **Status**: FIXED — commit `44151b1` - **File**: `validator.py` — forbidden patterns - **Fix**: Detect unsafe dict access in generated code ### I-14: LLM tasks use synthetic simulation (FIXED) - **Severity**: Critical - **Status**: FIXED — commit `44151b1` - **File**: `prompts.py` — CRITICAL NO SIMULATION rule - **Fix**: Prohibit fake training loops with synthetic loss values ### I-15: Missing experiment harness integration (FIXED) - **Severity**: Medium - **Status**: FIXED — v9.1 patch - **File**: `docker_sandbox.py:215-222` — `_inject_harness()`, `prompts.py:288-302` — harness guidance - **Fix**: Changed harness from "RECOMMENDED" to "MANDATORY" in compute_budget prompt block. Added explicit `check_value()` NaN detection and `finalize()` requirement with code examples. --- ## 4. 
Writing Quality Issues ### I-16: Academic style violations (FIXED) - **Status**: FIXED — IMP-20, commit `b88aba2` - **File**: `prompts.py` — `academic_style_guide` block ### I-17: Hedging language throughout paper (FIXED) - **Status**: FIXED — IMP-31, commit `b88aba2` - **File**: `prompts.py` — `anti_hedging_rules` block ### I-18: Number repetition across sections (FIXED) - **Status**: FIXED — IMP-24, commit `b88aba2` - **File**: `prompts.py` — `anti_repetition_rules` block ### I-19: Title too long / not formatted (FIXED) - **Status**: FIXED — `title_guidelines` rewrite, commit `b88aba2` - **File**: `prompts.py` — 14-word limit, MethodName: Subtitle format ### I-20: Abstract too verbose (FIXED) - **Status**: FIXED — `abstract_structure` rewrite, commit `b88aba2` - **File**: `prompts.py` — PMR+ format, 180-220 words --- ## 5. Literature & Citation Issues ### I-21: Hallucinated citations (FIXED) - **Status**: FIXED — Run 11: 90% verified, Run 12: 97.1% verified - **Files**: `literature/verify.py`, `literature/search.py` ### I-22: Invalid citation markers [?key:NOT_IN_BIB] (FIXED) - **Status**: FIXED — IMP-29, silent removal - **File**: `executor.py` ### I-23: Missing seminal papers (FIXED) - **Status**: FIXED — `data/seminal_papers.yaml` seed library - **File**: `researchclaw/data/seminal_papers.yaml` ### I-24: Rate-limited API searches (FIXED) - **Status**: FIXED — commit `63c5a7d` - **Files**: `arxiv_client.py` (circuit breaker), `openalex_client.py` (new), `semantic_scholar.py` (batch API) --- ## 6. Infrastructure Issues ### I-25: Docker missing ML packages (FIXED) - **Status**: FIXED — transformers, peft, trl, bitsandbytes, MuJoCo, etc. 
- **Commits**: `e72a818`, `787172d` ### I-26: HF cache mount duplication (FIXED) - **Status**: FIXED — commit `44151b1` - **File**: `docker_sandbox.py` ### I-27: Dataset pre-caching (FIXED) - **Status**: FIXED — CIFAR-10/100, FashionMNIST, MNIST in Docker image - **Commits**: `787172d` ### I-28: Time budget too short for LLM tasks (FIXED) - **Status**: FIXED — adaptive time budget by task type - **File**: `executor.py:2145-2160` ### I-29: Non-root pip install failure in Docker (FIXED) - **Status**: FIXED — `--break-system-packages` flag - **File**: `docker_sandbox.py` --- ## 7. Pipeline Logic Issues ### I-30: Pipeline proceeds after MAX_DECISION_PIVOTS=2 — quality gate added (FIXED) - **Severity**: Medium - **Status**: FIXED — v9.1 patch (quality gate added, MAX_PIVOTS=2 kept by design) - **Files**: `runner.py:299-321` — quality gate check, `runner.py:697-756` — `_check_experiment_quality()` - **Fix**: Added `_check_experiment_quality()` function that runs before forced PROCEED. Checks: (1) all metrics zero, (2) all conditions identical primary_metric (R13-1), (3) too many ablation warnings, (4) analysis quality score < 3. If any check fails, writes `quality_warning.txt` to run directory and logs QUALITY WARNING. Pipeline still proceeds but the warning is preserved for review. ### I-31: LLM code review JSON parsing failure (FIXED) - **Status**: FIXED — commit `44151b1` - **File**: `executor.py:2300-2330` - **Fix**: Markdown fence stripping, graceful fallback ### I-32: Topic quality not validated against trends (FIXED) - **Severity**: Medium - **Status**: FIXED — v9.1 patch - **File**: `prompts.py:986-996` — topic_init prompt - **Fix**: Added TREND VALIDATION requirement to topic_init prompt: must identify 2-3 recent papers (2024-2026) for relevance, name specific benchmark/dataset, state SOTA results, and include a 'Benchmark' subsection. --- ## 8. 
New Feature Requests ### F-01: Training Framework Documentation Retrieval (FIXED — Phase 1) - **Severity**: High — impacts LLM fine-tuning code quality - **Status**: FIXED (Phase 1: static docs) — v9 patch - **Description**: When the pipeline needs to generate code using training frameworks (LlamaFactory, TRL, Axolotl), the backbone LLM may not know the correct API usage. The pipeline should: 1. Detect which framework is needed based on the experiment design 2. Fetch the framework's official API documentation and example code 3. Inject relevant documentation into the code generation prompt 4. Generate code that correctly uses the framework APIs - **Current Problem**: Generated training code may use incorrect or outdated API calls, leading to experiment failures (e.g., Run 11 QLoRA training diverged) - **Proposed Approaches**: - **Option A: Static doc snippets** — Bundle curated API reference snippets for common frameworks in `researchclaw/data/framework_docs/`. Simple, fast, but requires manual updates. - **Option B: Context7-style MCP** — Use Context7 (upstash/context7) to fetch live documentation at runtime via MCP protocol. Always up-to-date, but adds network dependency. - **Option C: Git clone + extract** — Clone framework repos at pipeline startup, extract README/docs/examples, summarize via LLM, inject into prompts. Most complete, but slow and requires network. - **Option D: Hybrid** — Bundle static docs for top frameworks + fallback to web fetch for unknown ones. - **Reference Tools**: Cursor `@Docs`, Context7 MCP, Aider web context, OpenHands SDK - **Target**: Phase 4 or Phase 5 - **Dependencies**: Network access during code generation stage ### F-01 Detailed Design: Framework Doc-RAG #### Problem Statement When the pipeline generates experiment code that uses ML training frameworks (LlamaFactory, TRL, Axolotl, transformers Trainer), the backbone LLM (GPT-5.1/GPT-4.1) may not know current API signatures, default parameters, or correct usage patterns. 
This leads to: 1. **Incorrect API calls** — using removed or renamed functions 2. **Missing config fields** — e.g. LlamaFactory YAML missing required keys 3. **Wrong training patterns** — e.g. calling `Trainer.train()` without `TrainingArguments` 4. **Version mismatch** — framework APIs change between versions installed in Docker vs LLM training data Evidence: Run 11 (QLoRA) — all 8 methods diverged; likely caused by incorrect training setup. #### Industry Survey | Tool | Approach | Pros | Cons | |------|----------|------|------| | **DocPrompting** (ICLR 2023) | BM25/dense retrieval over docs → inject into code gen prompt | Academic validation (+2.85% pass@1), open source | Requires pre-built index | | **Cursor @Docs** | User adds doc URLs, IDE crawls/indexes, injects relevant snippets into LLM context | Real-time, version-aware | Requires IDE, manual URL management | | **Context7 MCP** | MCP server with 9000+ pre-indexed libraries, `resolve-library-id` + `query-docs` tools | Automatic, 9k libraries, version-specific | Network dependency, closed backend | | **DSPy DocLearner** | BeautifulSoup scraper → LLM analysis → code generation chain | Fully automated pipeline | Slow, brittle scraping | | **llms.txt** | Standardized `/llms.txt` markdown file in project root for LLM consumption | Simple, no crawling needed | Requires framework authors to adopt | | **AI Scientist v2** | No templates, relies purely on LLM knowledge + tree search debugging | Zero setup | Lower success rate, no doc awareness | | **Continue + MCP** | DeepWiki for GitHub repos + Context7 for docs + `.continue/rules` | Extensible MCP ecosystem | Complex setup | | **AGENTS.md** | Project-level instructions for AI agents (60k+ projects adopted) | No infra needed | Only project conventions, not API docs | **Key finding**: No existing autonomous research agent (AI Scientist v1/v2, CodeScientist) dynamically reads documentation at runtime. They all rely on pre-built templates or LLM training data. 
**Doc-RAG would be a differentiating feature for AutoResearchClaw.** **Academic evidence**: IBM study shows well-structured documentation improves AI response accuracy by up to **47%**. #### Available Framework Documentation | Framework | Docs URL | Format | Key Content | |-----------|----------|--------|-------------| | **TRL** | `huggingface.co/docs/trl` | HTML/MD | SFTTrainer, DPOTrainer, GRPOTrainer, RewardTrainer | | **LlamaFactory** | `llamafactory.readthedocs.io` | HTML/RST | YAML config, CLI, SFT/DPO/RLHF/KTO/ORPO | | **Axolotl** | `docs.axolotl.ai` | HTML/MD | YAML config, LoRA/QLoRA/GPTQ, full/DPO/GRPO | | **PEFT** | `huggingface.co/docs/peft` | HTML/MD | LoRA/QLoRA config, get_peft_model | | **transformers** | `huggingface.co/docs/transformers` | HTML/MD | Trainer, TrainingArguments, AutoModel | #### Recommended Implementation: Hybrid Static + Web Fetch **Phase 1 (Static — immediate):** - Create `researchclaw/data/framework_docs/` directory - Bundle curated API snippets for top 5 frameworks: - `trl.md` — SFTTrainer, DPOTrainer, PPOTrainer usage + config - `llamafactory.md` — YAML config format, CLI usage, dataset format - `transformers_trainer.md` — TrainingArguments, Trainer, PEFT integration - `peft.md` — LoRA/QLoRA config, get_peft_model, prepare_model_for_kbit_training - `axolotl.md` — YAML config format, training modes - In `executor.py:_execute_code_generation()`, detect framework from experiment design - Inject matching doc snippet into code generation prompt as `{framework_reference}` - **Effort**: ~4 hours, no network dependency **Phase 2 (Web Fetch — later):** - Add `FrameworkDocFetcher` class in `researchclaw/literature/framework_docs.py` - On experiment_design detection of framework name: 1. Check if `llms.txt` exists at framework's docs URL 2. If yes, fetch and extract relevant sections 3. 
If no, fall back to static bundle - Cache fetched docs locally (`.researchclaw_cache/framework_docs/`) - TTL: 7 days (frameworks don't change API that often) - **Effort**: ~8 hours, requires network during code gen stage **Phase 3 (Context7 MCP — optional):** - Integrate Context7 MCP client for automatic library discovery - `resolve-library-id("trl")` → `"/huggingface/trl"` - `query-docs("/huggingface/trl", "SFTTrainer config")` → relevant docs - Most complete solution but adds external service dependency #### Phase 1 Implementation (COMPLETED — v9 patch) - Created `researchclaw/data/framework_docs/` with 5 curated API reference files: - `trl.md` — SFTTrainer, DPOTrainer, GRPOTrainer, PPOTrainer, PEFT integration - `peft.md` — LoRA, QLoRA, DoRA configs, save/load, target_modules by model - `transformers_training.md` — TrainingArguments, Trainer, tokenization, causal LM - `llamafactory.md` — YAML config, CLI, dataset formats, DPO, export - `axolotl.md` — YAML config, dataset formats, DPO, multi-GPU - Added `detect_frameworks()` and `load_framework_docs()` in `researchclaw/data/__init__.py` - Injected into both `experiment_design` and `code_generation` stages in `executor.py` - Auto-detection based on topic + hypothesis + experiment plan keywords - Max 8000 chars for code_generation, 4000 chars for experiment_design (to avoid context overflow) #### Integration Point in Pipeline ``` Stage 9: experiment_design → detects framework (e.g., "use TRL SFTTrainer") ↓ Stage 10: code_generation prompt += framework_reference doc snippet ↓ Stage 10.5: code_review → validates API calls against doc snippet ↓ Stage 12: execution → framework is already installed in Docker ``` #### Framework Detection Heuristics ```python FRAMEWORK_KEYWORDS = { "trl": ["SFTTrainer", "DPOTrainer", "PPOTrainer", "trl", "RewardTrainer"], "llamafactory": ["LlamaFactory", "llama_factory", "llamafactory"], "peft": ["LoRA", "QLoRA", "get_peft_model", "PeftConfig"], "transformers": ["Trainer", 
"TrainingArguments", "AutoModelForCausalLM"], "axolotl": ["axolotl"], } ``` --- ## 9. Run 13 Findings (RL Benchmark — PPO/SAC/TD3 with PER on MuJoCo) ### R13-1: All conditions produce identical metrics (FIXED) - **Severity**: Critical - **Status**: FIXED — v9.1 patch - **Evidence**: Run 13 — all 6 algorithm/PER conditions had identical primary_metric (0.1074) - **Root Cause**: Condition → implementation mapping broken in generated code; ablation checker caught it but too late - **Fix**: Added zero-variance detection in executor.py analysis stage (line 3876+). Added to `_check_experiment_quality()` gate in runner.py. AST validation in validator.py now catches fake ablation subclasses. ### R13-2: Gymnasium v4 environments deprecated (FIXED) - **Severity**: Medium - **Status**: FIXED — v9.1 patch - **Evidence**: Run 13 warnings: "The environment HalfCheetah-v4 is out of date" - **Fix**: Added v5 environment requirement to `rl_step_guidance` prompt block in prompts.py. ### R13-3: No learning curve logging for RL (FIXED) - **Severity**: Medium - **Status**: FIXED — v9.1 patch - **Evidence**: Run 13 only reported final metrics, no step-by-step evaluation - **Fix**: Added learning curve logging requirement to `rl_step_guidance` prompt: `EVAL:` lines every N_eval steps, `LEARNING_CURVE:` summary at end. --- ## 10. Feature Requests — Advanced Code Generation ### F-02: Advanced Coding Agent for Experiment Code Generation (OPEN) - **Severity**: Critical (pipeline capability ceiling) - **Status**: OPEN — research complete, implementation pending - **Problem**: Current code generation stage produces relatively simple, single-file experiments. Cannot design large-scale multi-file projects (e.g., complex RL systems with custom environments, multi-component fine-tuning pipelines). This limits paper quality and experiment sophistication. 
- **Goal**: Replace single-shot code generation with an agentic coding system capable of iterative development, debugging, and multi-file project design — analogous to how Claude Code or Devin can build complex projects from scratch. #### Research Summary **Design Patterns Identified** (from survey of 12+ systems: Claude Code, Cursor, Devin, SWE-Agent, OpenHands, Aider, MetaGPT, ChatDev, AI Scientist v2, AIDE, AgentCoder, AlphaCodium): | Pattern | Description | Key Systems | Impact | |---------|-------------|-------------|--------| | A: Architect-then-Code | Separate planning step → architecture spec → code generation | Aider, MetaGPT | HIGH | | B: Solution Tree Search | Solutions as tree nodes; branch, evaluate, prune | AI Scientist v2, AIDE | CRITICAL | | C: Execution-in-the-Loop | Generate → execute → parse error → fix loop | Claude Code, SWE-Agent | HIGH | | D: Multi-Agent Review | Coder + reviewer dialog with iterative refinement | ChatDev, AgentCoder | MEDIUM | | E: Tool-Augmented Generation | File R/W, terminal, linting, search as LLM tools | Claude Code, SWE-Agent | HIGH | | F: Context Engineering | Repo maps, compression, selective context inclusion | Aider, Claude Code | MEDIUM | #### Implementation Plan — 4 Phases **Phase 1: Architect-then-Code** (Priority: HIGH) - Add architecture planning substage before code generation - LLM produces file structure, class hierarchy, data flow diagram - Code generation uses architecture spec as constraint - Files: `researchclaw/pipeline/executor.py` (new substage), `researchclaw/prompts.py` (architecture prompt) **Phase 2: Execution-in-the-Loop** (Priority: HIGH) - After initial code generation, run code in sandbox - Parse stderr/stdout for errors - Feed errors back to LLM for iterative fix (max N iterations) - Already partially exists in current REFINE loop — needs to be tightened into inner code-fix loop - Files: `researchclaw/pipeline/executor.py`, `researchclaw/experiment/docker_sandbox.py` **Phase 3: Solution Tree 
Search** (Priority: CRITICAL) - Multiple candidate solutions generated in parallel - Each evaluated via sandbox execution (runtime errors, metric quality) - Best candidate selected or merged; backtrack on failures - Inspired by AIDE/AI Scientist v2 tree search pattern - Files: New `researchclaw/pipeline/code_agent.py`, `researchclaw/pipeline/executor.py` **Phase 4: Multi-Agent Review** (Priority: MEDIUM) - Coder agent generates code, reviewer agent critiques - Dialog continues until reviewer approves or max rounds reached - Catches logical errors, missing edge cases, poor experiment design - Files: `researchclaw/pipeline/code_agent.py` #### Task Breakdown | Task ID | Phase | Description | Status | Depends On | |---------|-------|-------------|--------|------------| | F-02-1 | 1 | Design architecture planning prompt and substage | DONE | — | | F-02-2 | 1 | Implement architect substage in executor.py | DONE | F-02-1 | | F-02-3 | 1 | Wire architecture spec into code generation prompt | DONE | F-02-2 | | F-02-4 | 2 | Implement inner code-fix loop (generate → run → fix) | DONE | F-02-3 | | F-02-5 | 2 | Add error parsing and structured feedback extraction | DONE | F-02-4 | | F-02-6 | 2 | Configure max iterations and timeout for fix loop | DONE | F-02-5 | | F-02-7 | 3 | Design solution tree data structure and evaluation | DONE | F-02-6 | | F-02-8 | 3 | Implement parallel candidate generation | DONE | F-02-7 | | F-02-9 | 3 | Implement tree search: branch, evaluate, prune, select | DONE | F-02-8 | | F-02-10 | 4 | Implement reviewer agent prompt and dialog loop | DONE | F-02-9 | | F-02-11 | — | End-to-end test with complex RL experiment | DONE | F-02-10 | #### Live Test Results (2026-03-15) Branch: `feat/advanced-code-agent` | Commits: `93d3233`, `4b91ac9` | Test | Topic | Model | Score | Total Lines | Eff. 
Lines | Classes | Key Quality | |------|-------|-------|-------|-------------|------------|---------|-------------| | T1 | ViT CIFAR-10 | GPT-4.1 (v1) | 9.1/10 | 505 | 409 | 11 | Thin wrappers (3 lines each) | | T1 | ViT CIFAR-10 | GPT-4.1 (v2) | 10.0/10 | 589 | 488 | 4 | Substantial (45-79 lines/class) | | T1 | ViT CIFAR-10 | GPT-5.1 (v2) | 9.7/10 | 1425 | 1120 | 12 | 6 files, custom transformer blocks | | T2 | OOD Detection | GPT-5.2 (v2) | 10.0/10 | 1309 | 1033 | 14 | SNGP, Deep Ensemble, MC Dropout | | T2 | OOD Detection | GPT-5.1 (v2) | 9.7/10 | 1541 | 1257 | 10 | Spectral norm, RFF, GP posterior | | T3 | Meta-Learning | GPT-5.1 (v2) | 10.0/10 | 1366 | 1140 | 9 | MAML functional_call, Reptile, ProtoNet | **Key findings:** - v1→v2 prompt fix: method_richness improved from 4.4/10 to 10/10 (thin wrapper elimination) - GPT-5.1 consistently produces 1100-1250 effective lines with proper algorithmic depth - MAML implementation uses `torch.nn.utils.stateless.functional_call` for correct second-order gradients - SNGP implementation includes Random Fourier Features with diagonal GP posterior - All generated code passes syntax validation; no runtime errors in static analysis **Remaining observations (not blocking):** - MAMLFirstOrderLearner has significant code duplication with MAMLLearner (could use flag) - LinearClassifierHead flagged as thin (9 lines) — acceptable for utility module - GPT-5.2 timed out on preflight check (reasoning model min 32768 tokens) --- ## Appendix A: Issue-to-Run Mapping | Run | Issues Hit | Quality Score | |-----|-----------|---------------| | Run 7 | I-03, I-04, I-09, I-12, I-16-I-20 | 3/10 | | Run 8-10 | I-09, I-10, I-25, I-27 | Not scored | | Run 11 | I-01 (OK), I-06, I-09 (QLoRA diverge) | 7/10 | | Run 12 | I-01, I-06, I-07, I-15 | 7.5/10 | | Run 13 | I-06, I-07, I-08, I-30, R13-1/2/3 | 3/10 (REFINE decision) | ## Appendix B: Fix Commit Reference | Commit | Description | Issues Fixed | |--------|-------------|-------------| | `3792fd6` | 
V4 improvements | I-03, I-04 | | `cb4af26` | Phase 1 code quality | I-12 | | `e72a818` | Phase 2 LLM fine-tuning | I-25, I-28 | | `787172d` | Phase 0 diagnostics | I-09, I-10, I-27 | | `44151b1` | Phase 3 regression fixes | I-13, I-14, I-26, I-29, I-31 | | `b88aba2` | V8 merge | I-05, I-16-I-20, I-22 | | `63c5a7d` | Rate-limit defense | I-24 | ## Appendix C: Files Most Frequently Modified | File | Issue Count | Lines | |------|------------|-------| | `researchclaw/pipeline/executor.py` | 14 | ~6000 | | `researchclaw/prompts.py` | 10 | ~2500 | | `researchclaw/templates/converter.py` | 5 | ~1200 | | `researchclaw/experiment/docker_sandbox.py` | 3 | ~420 | | `researchclaw/docker/Dockerfile` | 3 | ~45 | ================================================ FILE: docs/iteration_plan_v8.md ================================================ # AutoResearchClaw Pipeline — 持续迭代改进方案 V8 > 创建日期: 2026-03-15 > 基于: V7 质量修复 (P1-P14) + Run 1-7 测试反馈 > 目标: 将 pipeline 从 3/10 提升至 7+/10 审稿人评分 --- ## 一、当前问题总览 ### 1.1 已确认的核心问题 | ID | 问题 | 严重程度 | 类别 | |----|------|----------|------| | Q1 | **代码过于简单/偷懒** — LLM 生成的实验代码复杂度不足,缺乏真正的算法实现深度 | 🔴 Critical | 代码质量 | | Q2 | **不支持 LLM 微调任务** — 无法使用 Llama-Factory/TRL/Axolotl 等框架进行模型训练 | 🔴 Critical | 能力缺失 | | Q3 | **Docker 环境缺失关键包** — transformers, PEFT, TRL, datasets, accelerate 未预装 | 🔴 Critical | 基础设施 | | Q4 | **计算预算不匹配** — 默认 600s 完全不够 LLM 微调/复杂训练任务 | 🟡 High | 配置 | | Q5 | **数据集指导不全** — 只覆盖图像分类(CIFAR-10/FashionMNIST),缺少 NLP/多模态数据集 | 🟡 High | 提示工程 | | Q6 | **缺少先进训练技巧指导** — 无混合精度、梯度累积、LoRA/QLoRA 等指导 | 🟡 High | 提示工程 | | Q7 | **选题缺乏前沿性验证** — topic_init 阶段无法确保选题与最新会议趋势对齐 | 🟡 High | 提示工程 | | Q8 | **实验设计与代码脱节** — experiment_design 阶段产出的方案过于抽象,代码难以还原 | 🟠 Medium | 流程 | | Q9 | **消融实验质量低** — 消融 variant 经常与 baseline 结果相同(代码偷懒) | 🟠 Medium | 代码质量 | | Q10 | **论文写作质量待提升** — 数字重复、结构松散、结论与实验脱节 | 🟠 Medium | 写作 | ### 1.2 硬件环境 | 资源 | 配置 | |------|------| | GPU | NVIDIA RTX 6000 Ada (49GB VRAM) | | 可训练模型 | Full FT: ≤3B; LoRA 16-bit: ≤14B; QLoRA 4-bit: ≤72B | | 最优甜点 | Qwen-2.5-14B + 
QLoRA (rank 32, batch 2-4, seq 4096) | | 极限模型 | Qwen-2.5-72B + QLoRA (rank 8-16, batch 1, seq 1024) | | 推荐框架 | Llama-Factory (内置 Unsloth 加速)、TRL (RLHF/DPO) | --- ## 二、迭代方案概览 ### 总体路线 ``` Phase 0: 诊断测试 (Run 8-10) ← 当前阶段 ↓ 发现问题 Phase 1: 代码质量根本性改进 ↓ 修复后 Phase 2: LLM 微调能力扩展 ↓ 新能力 Phase 3: 回归测试 (Run 11-13) ↓ 验证 Phase 4: 高级特性 & 持续迭代 ↓ 长期 Phase N: 持续监控与改进 ``` --- ## 三、Phase 0: 诊断测试 (Run 8-10) ### 目标 并行运行 3 个精心选择的主题,覆盖不同类型的研究任务,以审稿人视角全面评估代码和论文质量。 ### 3 个测试主题 #### Run 8: 经典 ML + 视觉任务 **主题**: "Adaptive Per-Layer LoRA Rank Allocation for Memory-Optimal Fine-Tuning of Vision Transformers" - **为什么选这个**: 测试 pipeline 能否生成涉及 LoRA、ViT、多实验对比的复杂代码 - **预期难点**: 需要 transformers + PEFT 库,需要多层分析逻辑 - **关注指标**: 代码是否真正实现了逐层 rank 分配,还是偷懒用了统一 rank #### Run 9: 强化学习 + 策略优化 **主题**: "Comparing Flow Matching, Diffusion, and Consistency Models as Generative Trajectory Policies for Offline Reinforcement Learning" - **为什么选这个**: 测试 pipeline 能否正确实现 3 种不同的生成模型并进行公平对比 - **预期难点**: 需要 D4RL 数据集,三种算法各有复杂实现 - **关注指标**: 每种算法是否有独立的完整实现,还是共享同一套代码换个名字 #### Run 10: LLM 推理 + 计算效率 **主题**: "First-Token Reasoning Quality as a Predictor for Adaptive Test-Time Compute Allocation in Language Models" - **为什么选这个**: 测试 pipeline 能否处理 LLM 推理/效率优化类任务 - **预期难点**: 需要加载 Qwen-2.5-7B/14B,需要 token-level 分析 - **关注指标**: 是否真正加载了模型进行推理,还是用假数据模拟 ### 评审清单 对每个 Run 的输出,按以下维度打分(1-10): | 维度 | 评审要点 | |------|----------| | **代码完整性** | 是否实现了实验设计中描述的所有算法?有无偷懒/跳过? | | **代码复杂度** | 代码是否达到了论文级别的复杂度?是否有非平凡的算法实现? | | **框架使用** | 是否正确使用了所需的框架/库?调用方式是否正确? | | **实验公平性** | 对比实验是否使用了相同的随机种子、数据划分、评估协议? | | **结果可信度** | 结果是否合理?是否有明显的造假/随机数伪造痕迹? | | **消融有效性** | 消融实验是否真正去除了关键组件?结果是否有区分度? | | **论文与代码一致性** | 论文中描述的方法是否与代码实现一致? | | **写作质量** | 论文结构、数字使用、引用质量是否达标? | ### 期望产出 - 每个 Run 的详细评审报告 - 发现的新问题列表(追加到本文档) - 更新 Phase 1 的优先级排序 --- ## 四、Phase 1: 代码质量根本性改进 ### P1.1 代码复杂度强制要求 **文件**: `researchclaw/prompts.py` **问题**: 当前 code_generation prompt 虽然有很多规则,但缺乏对算法实现深度的硬性要求 **改进方案**: 1. 
添加 `code_complexity` block: - 每个算法/方法必须有独立的 class 实现(不能是函数别名) - 每个 class 必须有 `__init__`, `forward/predict`, `train_step` 三个核心方法 - 主要算法 class 不少于 50 行有效代码 - 消融变体必须通过修改算法逻辑实现,不能仅改超参数 - 禁止 `class MethodB(MethodA): pass` 这种空继承 2. 添加 `implementation_depth` 检查: - 在 validator.py 中新增复杂度评分 - 检查每个 class 的方法数量和代码行数 - 检查是否存在 "名不副实" 的类(如 BayesianOpt 但没有 acquisition function) **状态**: ⬜ 待实施 ### P1.2 算法实现正确性验证 **文件**: `researchclaw/pipeline/executor.py` **问题**: 当前仅做语法检查和安全扫描,不验证算法是否正确实现 **改进方案**: 1. 在代码生成后增加 `_verify_algorithm_implementation()` 阶段: - 用 LLM 审查代码,逐条检查实验设计中的每个组件是否在代码中实现 - 生成 checklist: `✅ PPO clipped surrogate objective implemented` / `❌ Missing value function baseline` - 如果有 ❌ 项,触发代码修复循环 2. 添加 `_verify_condition_independence()`: - 解析代码 AST,检查每个实验条件/方法的 class 是否有独立的逻辑 - 如果两个 class 的方法体完全相同(hash 匹配),标记为 "identical implementation" - 注入警告到修复 prompt 中 **状态**: ⬜ 待实施 ### P1.3 实验设计→代码 桥接增强 **文件**: `researchclaw/prompts.py`, `researchclaw/pipeline/executor.py` **问题**: experiment_design 产出的方案过于抽象,code_generation 难以还原 **改进方案**: 1. 在 experiment_design prompt 中增加 "Pseudocode" 要求: - 每个方法/算法必须给出伪代码级别的描述 - 明确输入输出 tensor shape - 明确 loss function 公式 - 明确 training loop 结构 2. 在 code_generation 中注入 pseudocode 上下文: - 从 experiment_design 输出中提取伪代码部分 - 作为 code_generation prompt 的一部分传入 - 明确要求 "代码必须与伪代码逻辑一致" **状态**: ⬜ 待实施 ### P1.4 代码审查自动化(LLM-as-Reviewer) **文件**: `researchclaw/pipeline/executor.py` (新阶段) **问题**: 代码生成后无系统性审查 **改进方案**: 1. 在 Stage 10 (code_generation) 和 Stage 11 (experiment_run) 之间插入 Stage 10.5: - `_execute_code_review()` 方法 - 用 LLM 以审稿人视角审查代码 - 生成审查报告: 实现完整性、算法正确性、代码质量 - 如果审查不通过,返回 Stage 10 重新生成(最多 2 次) 2. 审查 prompt 关注点: - 算法命名是否与实现一致? - 是否存在 "假实现"(名字是 X 但代码是 Y)? - 数据处理是否合理? - 损失函数是否正确? - 评估协议是否科学? 
**状态**: ⬜ 待实施 --- ## 五、Phase 2: LLM 微调能力扩展 ### P2.1 Docker 环境升级 **文件**: `researchclaw/docker/Dockerfile` **新增包**: ```dockerfile # LLM Training Stack (版本约束需加引号,否则 shell 会把 ">=" 解析为输出重定向) RUN pip install --no-cache-dir \ "transformers>=4.46.0" \ "datasets>=3.0.0" \ "accelerate>=1.0.0" \ "peft>=0.13.0" \ "trl>=0.12.0" \ "bitsandbytes>=0.44.0" \ sentencepiece \ protobuf \ tokenizers \ safetensors \ flash-attn --no-build-isolation \ wandb # Optional: Llama-Factory RUN pip install --no-cache-dir "llamafactory>=0.9.0" ``` **状态**: ⬜ 待实施 ### P2.2 LLM 微调 Prompt 体系 **文件**: `researchclaw/prompts.py` 新增 prompt blocks: 1. **`llm_training_guidance`** block: - 何时使用 LoRA vs QLoRA vs Full FT - GPU 内存估算公式 - 推荐框架选择指南(Llama-Factory / TRL / 原生 transformers) - 训练超参数模板(lr, warmup, scheduler, gradient accumulation) - 模型加载方式 (AutoModelForCausalLM + BitsAndBytesConfig) 2. **`llm_eval_guidance`** block: - 标准评估基准 (MMLU, MT-Bench, AlpacaEval, HumanEval) - 评估框架 (lm-eval-harness, vllm 推理加速) - 评估指标定义 3. **`llm_data_guidance`** block: - 指令微调数据格式 (Alpaca, ShareGPT, OpenAI chat) - HuggingFace datasets 加载方式 - 数据预处理 pipeline (tokenization, padding, truncation) - 常用数据集列表 (Alpaca, ShareGPT, MetaMathQA, CodeAlpaca) **状态**: ⬜ 待实施 ### P2.3 计算预算自适应 **文件**: `researchclaw/pipeline/executor.py`, `researchclaw/config.py` **改进方案**: 1. 根据研究主题自动调整 time_budget: - 经典 ML (CIFAR 级): 600s - 中等 (ViT/ResNet 训练): 1800s - LLM 微调 (7B LoRA): 7200s (2h) - LLM 微调 (14B QLoRA): 14400s (4h) - 大规模训练 (72B): 43200s (12h) 2. 在 experiment_design 阶段估算所需计算量: - 根据模型大小、数据量、训练 epoch 预估时间 - 自动设置合理的 time_budget **状态**: ⬜ 待实施 ### P2.4 模型缓存与下载管理 **文件**: `researchclaw/experiment/docker_sandbox.py` **改进方案**: 1. 支持 HuggingFace Hub 模型缓存目录挂载: - 宿主机 `~/.cache/huggingface` → 容器 `/root/.cache/huggingface` - 避免每次运行重新下载模型 2. 
网络策略调整: - LLM 微调任务: `network_policy: "huggingface_only"` (仅允许 HF Hub 下载) - 传统 ML 任务: `network_policy: "pip_only"` 或 `"none"` **状态**: ⬜ 待实施 --- ## 六、Phase 3: 回归测试 (Run 11-13) ### 测试主题(Phase 1-2 完成后执行) #### Run 11: LLM 微调任务 **主题**: "QLoRA Rank Allocation: Adaptive Per-Layer Rank Selection for Memory-Optimal Fine-Tuning of Qwen-2.5" - **目的**: 验证 P2 (LLM 微调能力) 是否正确工作 - **验证点**: 能否正确调用 PEFT/QLoRA,能否加载 Qwen-2.5 模型 #### Run 12: VLM 推理分析 **主题**: "Modular Causal Attribution for Hallucination Mitigation in Vision-Language Models via MHA Intervention" - **目的**: 验证 pipeline 能否处理多模态任务 - **验证点**: 代码复杂度是否达标,分析方法是否正确 #### Run 13: 经典 RL 复杂实验 **主题**: "Generative Trajectory Policies: Flow Matching vs Diffusion vs Consistency Models for Offline RL on D4RL" - **目的**: 验证 P1 (代码质量改进) 是否有效 - **验证点**: 三种算法是否有独立完整实现 ### 评分标准 - 每个 Run 使用 Phase 0 的评审清单打分 - 目标: 所有维度 ≥ 6/10,平均 ≥ 7/10 - 如果不达标,回到 Phase 1/2 继续修复 --- ## 七、Phase 4: 高级特性 & 持续迭代 ### P4.1 基准发现系统 (Benchmark Discovery) - 在 experiment_design 阶段新增 LLM 调用,自动推荐相关基准和 SOTA 基线 - 已测试: LLM 知识法(Plan 2)效果极佳,可找到 40+ 基准 ### P4.2 实验复现性保障 - 记录完整的环境信息 (pip freeze, CUDA version, GPU type) - 自动生成 requirements.txt - 支持实验结果复现 ### P4.3 多 GPU 分布式训练支持 - DeepSpeed / FSDP 集成 - 多节点训练配置 ### P4.4 论文质量进一步提升 - LaTeX 格式化增强 - 图表自动优化(配色、字体、分辨率) - 引用格式严格化 ### P4.5 端到端自动评估 - 集成 LLM-as-Judge 对生成论文自动打分 - 与人工审稿打分对比校准 - 建立质量基线 --- ## 八、跟踪记录 ### 测试运行记录 | Run | 日期 | 主题 | 模式 | 代码评分 | 论文评分 | 发现的问题 | |-----|------|------|------|----------|----------|------------| | 1 | 2026-03-xx | Continual Learning | sandbox | - | - | Bug 1-4 | | 2 | 2026-03-xx | RIM Agents | sandbox | - | - | Bug 1-4 | | 3 | 2026-03-xx | (与 Run 1 同主题) | sandbox | - | - | Bug 1-4 | | 4 | 2026-03-xx | RL for AI4Science | sandbox | 4/10 | - | Bug 5-8, 变量作用域, 5/7条件崩溃 | | 5 | 2026-03-xx | Graph Neural ODE | sandbox | 4/10 | - | Bug 5-8, nn.Linear in forward, no-op ablation | | 6 | 2026-03-xx | Meta-Learning | sandbox | - | - | Bug 5-8 | | 7 | 2026-03-14 | Normalization Techniques | docker | 3/10 | 3/10 | P1-P14 | | 
8 | 2026-03-15 | KD Comparison (CIFAR-10) | docker | 5/10 | - | Q13-Q15, 随机水平结果 | | 9 | 2026-03-15 | PPO/SAC/TD3+PER | docker | 7/10 | - | Q11, MuJoCo缺失致完全失败 | | 10 | 2026-03-15 | Neural ODE Robustness | docker | 7/10 | - | Q12/Q16, CIFAR-10挂载失败 | | 11 | 2026-03-15 | QLoRA Rank Allocation | docker | 4/10 | 7/10 | Q17-Q20, 合成模拟非真实训练 | | 12 | 2026-03-15 | VLM Hallucination | docker | 3/10 | TBD | Q21-Q23, KeyError崩溃, 训练/验证数据重叠 | | 13 | 2026-03-15 | PPO/SAC/TD3 MuJoCo | docker | 6/10 | TBD | Q24-Q26, 60k步不够收敛, PPO容量不公平 | ### 问题追踪 | 问题 ID | 描述 | Phase | 状态 | 修复 Commit | |---------|------|-------|------|-------------| | Q1 | 代码过于简单/偷懒 | P1 | ✅ 已修复 | cb4af26 | | Q2 | 不支持 LLM 微调 | P2 | ✅ 已修复 | e72a818 | | Q3 | Docker 缺关键包 | P2 | ✅ 已修复 | e72a818 | | Q4 | 计算预算不匹配 | P2 | ✅ 已修复 | e72a818 | | Q5 | 数据集指导不全 | P1/P2 | ✅ 已修复 | (本次) | | Q6 | 缺先进训练技巧 | P2 | ✅ 已修复 | e72a818 | | Q7 | 选题前沿性验证 | P4 | ⬜ 待实施 | - | | Q8 | 实验设计与代码脱节 | P1 | ✅ 已修复 | cb4af26 | | Q9 | 消融实验质量低 | P1 | ✅ 已修复 | cb4af26 | | Q10 | 论文写作质量 | P4 | 🟡 V7已部分修复 | - | | Q11 | Docker 缺 MuJoCo | P0 | ✅ 已修复 | (本次) | | Q12 | CIFAR-10 挂载失效 | P0 | ✅ 已修复 | (本次,重建镜像) | | Q13 | 训练 epoch 过少 | P0 | ✅ 已修复 | (本次) | | Q14 | Feature KD 维度不匹配 | P1 | 🟡 P1代码审查会捕获 | - | | Q15 | 消融与 baseline 重复 | P1 | 🟡 P1深度验证会捕获 | - | | Q16 | 缺少关键实验条件 | P1 | 🟡 P1代码审查会捕获 | - | | Q17 | Docker HF缓存重复挂载 | P3 | ✅ 已修复 | (本次) | | Q18 | LLM代码审查JSON解析失败 | P3 | ✅ 已修复 | (本次) | | Q19 | LLM任务用合成模拟代替真实训练 | P3 | ✅ 已修复(提示) | (本次) | | Q20 | ndarray.ptp()等NumPy 2.0移除API | P3 | ✅ 已修复(检测+提示) | (本次) | | Q21 | dict[key]无默认值致KeyError | P3 | ✅ 已修复(提示) | (本次) | | Q22 | 训练/验证数据集重叠 | P4 | ⬜ 待实施 | - | | Q23 | 损失函数方向错误(鼓励而非惩罚) | P4 | ⬜ 待实施 | - | | Q24 | RL训练步数不足(60k vs 需1M) | P3 | 🟡 已有epoch指导,需扩展到RL | - | | Q25 | 实验条件间模型容量不公平 | P4 | ⬜ 待实施 | - | | Q26 | proposed_method_variant与主方法相同 | P1 | 🟡 P1深度验证会捕获 | - | --- ## 九、测试选题库 ### 优先级 A — Phase 0 诊断测试用 | # | 主题 | 类型 | 预期复杂度 | GPU 时间 | |---|------|------|-----------|----------| | A1 | Adaptive Per-Layer LoRA Rank Allocation for ViT | 高效训练 | 高 | 
4-8h | | A2 | Flow Matching vs Diffusion vs Consistency for Offline RL | 强化学习 | 高 | 6-10h | | A3 | First-Token Reasoning Quality for Compute Allocation | LLM 推理 | 中 | 3-6h | ### 优先级 B — Phase 2 后 LLM 微调测试 | # | 主题 | 类型 | 预期复杂度 | GPU 时间 | |---|------|------|-----------|----------| | B1 | QLoRA Fine-Tuning of Qwen-2.5-14B for Medical QA | LLM 微调 | 高 | 4-8h | | B2 | GainLoRA++ for LLM Continual Learning | 持续学习 | 高 | 6-10h | | B3 | Spurious Forgetting Analysis in Instruction-Tuned LLMs | LLM 分析 | 中 | 4-8h | ### 优先级 C — 多样性覆盖测试 | # | 主题 | 类型 | 预期复杂度 | GPU 时间 | |---|------|------|-----------|----------| | C1 | Modular Causal Attribution for VLM Hallucination | VLM 分析 | 中 | 3-6h | | C2 | Neural Operator Downscaling for Weather Prediction | AI4Science | 中 | 4-8h | | C3 | Meta-Learned LoRA Initialization for Few-Shot Adaptation | Meta-Learning | 中 | 4-8h | | C4 | Prune-Then-LoRA for Parameter-Efficient Fine-Tuning | 高效训练 | 中 | 4-8h | | C5 | Decomposition-of-Thought for VLM Reasoning | VLM 推理 | 低 | 2-4h | --- ## 十、执行计划 ### 执行进度 #### Phase 0: 诊断测试 ✅ 完成 1. ✅ 调研热门主题,筛选测试用 idea 2. ✅ 为 Run 8/9/10 创建配置文件 3. ✅ 并行启动 3 个 Run 4. ✅ 监控中间输出,特别关注 Stage 10 (代码生成) 产出 5. ✅ 以审稿人视角评审代码 + 论文 6. ✅ 汇总发现的问题 (Q11-Q16) 7. ✅ 确定 Phase 1 优先级 #### Phase 1: 代码质量改进 ✅ 完成 (commit cb4af26) - P1.1: 深度代码质量检查 (AST分析: 类质量, 变量作用域, API正确性) - P1.2: 自动修复循环 (深度验证 → LLM修复 → 重验证) - P1.3: 实验设计增加 implementation_spec (伪代码级描述) - P1.4: LLM代码审查 (Stage 10.5, 评分1-10, 严重问题触发修复) #### Phase 2: LLM微调能力 ✅ 完成 (commit e72a818) - P2.1: Docker新增transformers/peft/trl/bitsandbytes/datasets - P2.2: llm_training_guidance + llm_eval_guidance 提示块 - P2.3: 自动检测LLM主题注入指导; time_budget警告 - P2.4: HuggingFace缓存挂载 + HF_TOKEN透传 #### Phase 3: 回归测试 🔄 进行中 Run 11-13 结果分析: - **Run 11 (QLoRA)**: 代码4/10, 论文7/10 — 合成模拟非真实训练, 但论文质量达标 - **Run 12 (VLM)**: 代码3/10 — KeyError崩溃, 训练/验证重叠, 损失方向错误 - **Run 13 (RL)**: 代码6/10 — MuJoCo成功! 
但60k步不够收敛, PPO容量不公平 Phase 3 修复 (本次commit): - Q17: Docker HF缓存重复挂载 → 优先HF_HOME, 避免重复 - Q18: LLM代码审查JSON解析失败 → 正确提取LLMResponse.content + 去除markdown fence - Q19: LLM任务合成模拟 → 添加"CRITICAL — NO SIMULATION"规则 - Q20: NumPy 2.0移除API → 检测器 + 禁止模式更新 - Q21: dict[key]无默认值 → 禁止模式更新 ### 注意事项 - 每次迭代结束后更新本文档 - 新发现的问题立即追加到问题追踪表 - 修复后必须有对应的回归测试 Run - 配置文件中 API key 已在 .gitignore 中排除 ================================================ FILE: docs/iteration_showcase_narrative.md ================================================ # AutoResearchClaw: Self-Iterating Experiment Optimization — Showcase > Figure: `docs/figures/iteration_improvement_showcase.png` / `.pdf` --- ## Overview This figure demonstrates AutoResearchClaw's core capability: **autonomous self-iteration of experimental methods**. Starting from an initial experiment design, the pipeline automatically: 1. Runs the experiment in a sandboxed environment 2. Analyzes the results and identifies weaknesses 3. Proposes algorithmic improvements via LLM reasoning 4. Implements code modifications and re-runs 5. Retains the best-performing version, discards regressions Below we describe two representative cases from actual pipeline runs. --- ## Case A: Continual Meta-Learning for Few-Shot Adaptation **Research Topic:** Designing meta-learning algorithms that adapt to non-stationary task distributions, where the underlying data distribution shifts over time. **Metric:** Post-adaptation query error on held-out tasks (lower = better). Converted to accuracy (%) in the figure. ### Iteration Progression | Round | Accuracy | What the Pipeline Did | |-------|----------|----------------------| | **Baseline** | **25.9%** | Initial experiment code with 6 standard conditions (random search, Bayesian optimization, PPO, etc.). Basic meta-learning framework without domain-specific adaptations. 
| | **Iter 1** | **81.2%** (+55.3 pts) | **Major architectural redesign.** The pipeline identified that the baseline methods were generic RL algorithms ill-suited for meta-learning. It autonomously: (1) Replaced generic methods with domain-specific ones: `replay_meta`, `context_gated_replay`, `online_meta_sgd`, `adaptive_lr_meta`; (2) Implemented a two-layer neural encoder with MAML-style inner-loop adaptation; (3) Added context-gated experience replay that modulates replay intensity based on context similarity; (4) Introduced per-parameter meta-SGD learning rates. | | **Iter 2** | **77.5%** (-3.7 pts) | **Failed experiment — automatically detected and recovered.** The pipeline attempted to simplify the architecture by replacing the deep encoder with a prototype network. This reduced model expressiveness and degraded performance. The pipeline automatically detected the regression and retained the Iter 1 code as the best version. | | **Iter 3** | **93.4%** (+15.9 pts) | **Architecture refinement with regularization.** Learning from both the success of Iter 1 and the failure of Iter 2, the pipeline: (1) Adopted a linear classifier with proper gradient-based inner-loop adaptation (simpler than Iter 1's deep encoder but more expressive than Iter 2's prototypes); (2) Added L2 anchor regularization to prevent catastrophic forgetting during adaptation; (3) Implemented cosine similarity-based context gating (more robust than prototype-distance gating); (4) Increased seed count from 24 to 28 for more robust statistics; (5) Added new comparison conditions: `prototype_regularized_meta`, `drift_aware_meta`. | | **Iter 4** | **93.4%** (converged) | Minor hyperparameter adjustments. Pipeline recognized convergence and stopped. | **Key Insight:** The pipeline demonstrated the ability to **recover from a failed approach** (Iter 2's prototype networks) by synthesizing lessons from both successful (Iter 1) and failed (Iter 2) attempts to arrive at a superior solution (Iter 3). 
--- ## Case B: RLHF with Curriculum-Based Reward Shaping for LLM Alignment **Research Topic:** Improving LLM alignment through reinforcement learning from human feedback, with a curriculum-based approach that gradually increases task difficulty. **Metric:** 1 − alignment_error (higher = better). Represents how well the trained policy aligns with human preferences. ### Iteration Progression | Round | Alignment | What the Pipeline Did | |-------|-----------|----------------------| | **Baseline** | **35.6%** | Vanilla PPO policy with linear reward function. Direct preference feedback from environment oracle. No learned reward model, no curriculum scheduling. | | **Iter 1** | **35.6%** (no change) | Minor code modifications that did not affect performance. Pipeline correctly identified no improvement and continued iterating. | | **Iter 2** | **61.6%** (+26.0 pts) | **Core algorithmic innovation.** The pipeline introduced three key components: (1) **Learned preference reward model** — a logistic regression model trained on preference pairs: P(prefer chosen \| feature delta), updated online with Adam optimizer; (2) **Reward mixing schedule** — gradually increases reliance on the learned reward model from 10% to 80% over training (coefficient ramp); (3) **Curriculum power shaping** — nonlinear difficulty progression (power=1.4) that gives the agent more time on easier problems before advancing. | | **Iter 3** | **63.0%** (+1.4 pts) | **Multi-signal evaluation.** Added: (1) **Rank-normalized multi-action evaluation** — samples up to 4 actions per state and evaluates preference feedback for each, converting to rank-based scores in [-1, +1]; (2) **Direct reward regression head** — a second regression-based reward predictor using ridge regression, blended with the classification head; (3) **Policy EMA** — exponential moving average of policy parameters (decay=0.92) with anchor regularization for training stability. 
| | **Iter 4** | **66.6%** (+3.6 pts) | **Confidence-aware reward integration.** Added: (1) **Confidence-gated reward** — measures learned reward model accuracy, then uses softmax entropy to modulate how much the reward signal influences actions; (2) **Mini-batch reward model updates** — trains on 3 randomly sampled past preference pairs per step (not just current); (3) **Margin bonus** — early-curriculum episodes receive extra reward shaping from preference margins (coef=0.18 × (1−level) × tanh(margin)). | **Key Insight:** The pipeline demonstrated **incremental technical sophistication** — each iteration built upon the previous one by adding a specific, well-motivated technique. The progression from vanilla PPO → learned reward model → multi-signal evaluation → confidence gating mirrors how a human researcher would iteratively refine an RLHF system. --- ## What This Demonstrates 1. **Autonomous Problem Diagnosis:** The pipeline identifies *why* performance is limited (e.g., "generic RL methods are unsuitable for meta-learning") and proposes targeted solutions. 2. **Failure Recovery:** When an iteration produces worse results (Case A, Iter 2), the pipeline automatically detects the regression, retains the previous best version, and learns from the failure to produce a better solution in the next iteration. 3. **Progressive Refinement:** Rather than making random changes, the pipeline demonstrates cumulative improvement — each iteration builds on insights from previous ones (Case B: reward model → rank normalization → confidence gating). 4. **Domain-Appropriate Innovation:** The pipeline generates methods that are appropriate for the specific research domain (context-gated replay for meta-learning, preference reward models for RLHF), not just generic hyperparameter tuning. 5. **Convergence Detection:** The pipeline automatically recognizes when further iterations are unlikely to yield improvement and terminates, avoiding wasted computation. 
--- ## Data Sources - Case A: `artifacts/rc-20260314-132748-0ec2c9/stage-13_v2/refinement_log.json` - Case B: `artifacts/rc-20260314-132748-91c516/stage-13/refinement_log.json` - Figure script: `scripts/plot_iteration_showcase.py` ================================================ FILE: docs/metaclaw-integration-plan.md ================================================ # MetaClaw × AutoResearchClaw 集成方案 > **Status**: ✅ **Implemented & Merged to main** (v0.3.0, 2026-03-16) > > **目标**: 将 MetaClaw 的持续学习能力(技能注入、技能进化、PRM 评分、RL 训练)接入 AutoResearchClaw 的 23 阶段研究流水线,提升端到端论文生成质量。 --- ## 一、项目概览 | 项目 | 定位 | 核心能力 | |------|------|----------| | **AutoResearchClaw** | 全自主研究流水线(Idea → Paper) | 23 阶段 Pipeline、文献检索、实验执行、论文写作、引用验证 | | **MetaClaw** | Agent 持续进化平台 | 技能注入(Skill Injection)、技能进化(Skill Evolution)、PRM 奖励评分、RL 微调、空闲调度器 | **集成核心思路**: MetaClaw 作为 AutoResearchClaw 的 **LLM 增强层**,通过多层次赋能提升每个阶段的 LLM 输出质量,并建立从研究失败中持续学习的闭环。 --- ## 二、架构设计 ### 2.1 集成架构总览 ``` ┌──────────────────────────────────────────────────────┐ │ AutoResearchClaw Pipeline │ │ Stage 1 → 2 → ... 
→ 23 │ │ │ │ ┌─────────────┐ ┌──────────────────────────────┐ │ │ │ LLMClient │───▶│ MetaClaw Integration Layer │ │ │ │ (原有) │ │ (新增 metaclaw_bridge 模块) │ │ │ └─────────────┘ └──────────┬───────────────────┘ │ │ │ │ │ ┌─────────────┐ ┌──────────▼───────────────────┐ │ │ │ Evolution │◀──▶│ Lesson ↔ Skill 双向桥接 │ │ │ │ (原有) │ └─────────────────────────────┘ │ │ └─────────────┘ │ └──────────────────────────┬───────────────────────────┘ │ ┌──────────────▼──────────────┐ │ MetaClaw Proxy Server │ │ (FastAPI :30000) │ │ │ │ ┌────────────────────────┐ │ │ │ SkillManager │ │ │ │ - 通用技能 (40+) │ │ │ │ - 研究专属技能 (新增) │ │ │ │ - 阶段映射技能检索 │ │ │ └────────────────────────┘ │ │ │ │ ┌────────────────────────┐ │ │ │ SkillEvolver │ │ │ │ - 从失败中自动生成技能 │ │ │ └────────────────────────┘ │ │ │ │ ┌────────────────────────┐ │ │ │ PRMScorer │ │ │ │ - 阶段输出质量评分 │ │ │ └────────────────────────┘ │ │ │ └──────────────┬──────────────┘ │ ┌────────────▼────────────┐ │ Upstream LLM API │ │ (OpenAI / Kimi / etc.) │ └─────────────────────────┘ ``` ### 2.2 集成层次 | 层次 | 名称 | 改动范围 | 效果 | |------|------|----------|------| | **L1** | Proxy 透传 | 仅改配置 | AutoResearchClaw → MetaClaw Proxy → LLM,自动获得通用技能注入 | | **L2** | 阶段感知技能 | 新增研究技能库 + 阶段映射 | 每个 Pipeline 阶段注入最相关的研究技能 | | **L3** | Evolution 桥接 | 新增 bridge 模块 | AutoResearchClaw 失败教训 → MetaClaw 技能;双向学习闭环 | | **L4** | PRM 质量门控 | 集成 PRMScorer | 在质量门控阶段(5/9/15/20)使用 PRM 提供客观评分 | | **L5** | RL 持续训练 | MetaClaw RL 模式 | 从研究对话中持续微调模型(可选,需 GPU) | --- ## 三、详细任务分解 ### Phase 0: 环境准备与分支管理 #### Task 0.1: 创建集成分支 ```bash cd /home/jqliu/projects/AutoResearchClaw git checkout -b feat/metaclaw-integration ``` - 所有开发工作在此分支进行 - 定期 rebase main 保持同步 #### Task 0.2: MetaClaw 环境配置 ```bash cd /home/jqliu/projects/MetaClaw python -m venv .venv source .venv/bin/activate pip install -e ".[evolve]" # 安装核心 + 技能进化依赖 ``` - 只安装 `skills_only` 模式所需依赖(不需要 GPU / RL) - 如需 embedding 检索:`pip install -e ".[embedding]"` #### Task 0.3: MetaClaw 基础配置 创建 `~/.metaclaw/config.yaml`: ```yaml mode: skills_only llm: 
provider: custom model_id: <与 AutoResearchClaw 相同的模型> api_base: <上游 LLM API 地址> api_key: proxy: port: 30000 api_key: "" # 内部调用,无需鉴权 skills: enabled: true dir: ~/.metaclaw/skills retrieval_mode: template top_k: 6 task_specific_top_k: 10 auto_evolve: true ``` #### Task 0.4: 验证 MetaClaw 代理可用 ```bash # 启动 MetaClaw metaclaw start --mode skills_only # 测试连通性 curl -X POST http://localhost:30000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{"model":"test","messages":[{"role":"user","content":"ping"}],"max_tokens":50}' ``` --- ### Phase 1: L1 — Proxy 透传接入(最小改动) **目标**: 零代码改动,仅通过配置让 AutoResearchClaw 经由 MetaClaw 代理调用 LLM。 #### Task 1.1: 修改 AutoResearchClaw 配置 修改 `config.researchclaw.yaml`: ```yaml llm: provider: "openai-compatible" base_url: "http://localhost:30000" # 指向 MetaClaw 代理 api_key_env: "" api_key: "" # MetaClaw 无需鉴权 primary_model: "<原模型名>" # MetaClaw 会透传到上游 fallback_models: [] ``` #### Task 1.2: 兼容性适配 在 `researchclaw/llm/client.py` 中处理 MetaClaw 可能返回的 503 状态码(权重更新中): **文件**: `researchclaw/llm/client.py` **改动**: 将 503 加入可重试状态码列表 ```python # 原有 _RETRYABLE_STATUS = {429, 500, 502, 504} # 改为 _RETRYABLE_STATUS = {429, 500, 502, 503, 504} ``` #### Task 1.3: 端到端冒烟测试 ```bash # 1. 启动 MetaClaw metaclaw start --mode skills_only # 2. 
运行 AutoResearchClaw 短流程 researchclaw run --topic "test topic" --config config.yaml ``` **验证点**: - [x] AutoResearchClaw 能正常调用 LLM - [x] MetaClaw 日志显示技能注入 - [x] 输出质量与直连 LLM 相当或更好 **预期交付**: AutoResearchClaw 透过 MetaClaw 运行,自动获得通用技能加持。 --- ### Phase 2: L2 — 研究专属技能库 + 阶段映射 **目标**: 为 AutoResearchClaw 的 23 个阶段创建专属技能,并实现精准注入。 #### Task 2.1: 创建研究专属技能 在 `~/.metaclaw/skills/` 下新增以下技能(每个技能一个目录 + `SKILL.md`): | 技能名 | 类别 | 适用阶段 | 内容要点 | |--------|------|----------|----------| | `literature-search-strategy` | research | 3, 4 | 查询扩展、布尔组合、避免过宽搜索 | | `paper-relevance-screening` | research | 5 | 相关性评分标准、排除低质量源 | | `knowledge-card-extraction` | research | 6 | 结构化提取模板:方法/结果/局限 | | `research-gap-identification` | research | 7 | 聚类分析、空白识别、创新角度发现 | | `hypothesis-formulation` | research | 8 | SMART 假设、可证伪性检查 | | `experiment-design-rigor` | research | 9 | 对照组设计、消融实验、统计功效 | | `hardware-aware-coding` | coding | 10 | GPU/CPU 适配、内存管理、batch size 选择 | | `experiment-debugging` | coding | 12, 13 | NaN/Inf 检测、收敛诊断、梯度检查 | | `statistical-analysis` | data_analysis | 14 | p 值计算、效应量、置信区间 | | `research-pivot-decision` | research | 15 | 证据权衡、PROCEED/PIVOT 决策框架 | | `academic-writing-structure` | communication | 16, 17 | IMRaD 结构、段落逻辑、术语一致性 | | `peer-review-methodology` | communication | 18 | 审稿视角、方法论-证据一致性检查 | | `citation-integrity` | research | 23 | 引用验证层次、防幻觉策略 | 每个技能文件格式: ```markdown --- name: literature-search-strategy description: Design effective literature search queries for academic research category: research --- # Literature Search Strategy ## When to Use When designing search queries for arXiv, Semantic Scholar, or other academic databases. ## Steps 1. **Decompose research topic** into 3-5 core concepts 2. **Generate synonyms** for each concept (e.g., "reinforcement learning" → "RL", "reward-based learning") 3. **Combine with Boolean operators**: (concept1 OR synonym1) AND (concept2 OR synonym2) 4. **Add temporal filters**: Prefer recent 3 years for fast-moving fields 5. 
**Iterative refinement**: If >200 results, narrow; if <10, broaden ## Anti-Patterns - Avoid single-keyword queries (too broad, causes timeout) - Avoid overly specific queries that miss relevant work - Never rely on a single database source ``` #### Task 2.2: 阶段-技能映射模块 **新增文件**: `researchclaw/metaclaw_bridge/stage_skill_map.py` ```python """Maps AutoResearchClaw pipeline stages to MetaClaw skill categories.""" # 每个阶段对应的 MetaClaw 任务类型 + 推荐注入的研究专属技能 STAGE_SKILL_MAP: dict[str, dict] = { "topic_init": { "task_type": "research", "skills": ["literature-search-strategy"], "top_k": 4, }, "problem_decompose": { "task_type": "research", "skills": ["research-gap-identification"], "top_k": 4, }, "search_strategy": { "task_type": "research", "skills": ["literature-search-strategy"], "top_k": 6, }, "literature_collect": { "task_type": "research", "skills": ["literature-search-strategy"], "top_k": 4, }, "literature_screen": { "task_type": "research", "skills": ["paper-relevance-screening"], "top_k": 6, }, "knowledge_extract": { "task_type": "research", "skills": ["knowledge-card-extraction"], "top_k": 4, }, "synthesis": { "task_type": "research", "skills": ["research-gap-identification"], "top_k": 6, }, "hypothesis_gen": { "task_type": "research", "skills": ["hypothesis-formulation"], "top_k": 6, }, "experiment_design": { "task_type": "research", "skills": ["experiment-design-rigor"], "top_k": 6, }, "code_generation": { "task_type": "coding", "skills": ["hardware-aware-coding"], "top_k": 6, }, "resource_planning": { "task_type": "productivity", "skills": [], "top_k": 3, }, "experiment_run": { "task_type": "automation", "skills": ["experiment-debugging"], "top_k": 4, }, "iterative_refine": { "task_type": "coding", "skills": ["experiment-debugging"], "top_k": 6, }, "result_analysis": { "task_type": "data_analysis", "skills": ["statistical-analysis"], "top_k": 6, }, "research_decision": { "task_type": "research", "skills": ["research-pivot-decision"], "top_k": 4, }, "paper_outline": 
{ "task_type": "communication", "skills": ["academic-writing-structure"], "top_k": 4, }, "paper_draft": { "task_type": "communication", "skills": ["academic-writing-structure"], "top_k": 6, }, "peer_review": { "task_type": "communication", "skills": ["peer-review-methodology"], "top_k": 6, }, "paper_revision": { "task_type": "communication", "skills": ["academic-writing-structure", "peer-review-methodology"], "top_k": 6, }, "quality_gate": { "task_type": "research", "skills": ["peer-review-methodology"], "top_k": 4, }, "knowledge_archive": { "task_type": "automation", "skills": [], "top_k": 2, }, "export_publish": { "task_type": "automation", "skills": [], "top_k": 2, }, "citation_verify": { "task_type": "research", "skills": ["citation-integrity"], "top_k": 4, }, } ``` #### Task 2.3: 阶段感知 HTTP Header 注入 修改 `researchclaw/llm/client.py`,在发送请求时附带阶段上下文 Header: ```python # 在 _request() 方法中新增 headers["X-Session-Id"] = f"arc-{run_id}" headers["X-AutoRC-Stage"] = stage_name # 自定义 header,供 MetaClaw 日志追踪 headers["X-Turn-Type"] = "main" # 确保触发技能注入 ``` #### Task 2.4: MetaClaw 端自定义技能检索(可选增强) 如果需要更精准的阶段感知,可在 MetaClaw 的 `api_server.py` 中读取 `X-AutoRC-Stage` header,根据 `STAGE_SKILL_MAP` 调整 skill 检索策略。但这需要修改 MetaClaw 代码,可作为后续优化。 **预期交付**: 13 个研究专属技能 + 阶段映射配置,每个 Pipeline 阶段获得最相关的技能注入。 --- ### Phase 3: L3 — Evolution ↔ Skill 双向桥接 **目标**: 让 AutoResearchClaw 的失败教训自动转化为 MetaClaw 技能,形成学习闭环。 #### Task 3.1: Lesson → Skill 转化器 **新增文件**: `researchclaw/metaclaw_bridge/lesson_to_skill.py` **功能**: 1. 从 `evolution/lessons.jsonl` 读取高严重性教训(severity = "error") 2. 按类别聚合同类失败 3. 调用 LLM 将教训批量转化为 MetaClaw 技能格式 4. 
写入 `~/.metaclaw/skills/arc-xxx/SKILL.md` **类别映射**: ```python LESSON_CATEGORY_TO_SKILL_CATEGORY = { "SYSTEM": "automation", "EXPERIMENT": "coding", "WRITING": "communication", "ANALYSIS": "data_analysis", "LITERATURE": "research", "PIPELINE": "automation", } ``` **转化 Prompt 模板**: ``` 以下是自动化研究流水线中反复出现的失败教训。请将它们转化为可复用的技能指南。 失败教训: {lessons_text} 请为每个关键失败模式生成一个技能,格式如下: - name: 小写连字符命名 (如 arc-avoid-broad-queries) - description: 一句话描述何时使用此技能 - category: {target_category} - content: Markdown 格式的步骤指南 (5-10 行) ``` #### Task 3.2: Skill 效果回馈 **新增文件**: `researchclaw/metaclaw_bridge/skill_feedback.py` **功能**: 1. 在每次 Pipeline 运行结束后,统计各阶段成功/失败 2. 关联当次运行中注入了哪些技能 3. 计算技能-成功率关联 4. 将低效技能标记,供 MetaClaw SkillEvolver 参考 **数据结构**: ```python @dataclass class SkillEffectivenessRecord: skill_name: str stage_name: str run_id: str stage_success: bool timestamp: str ``` 存储位置: `evolution/skill_effectiveness.jsonl` #### Task 3.3: 自动进化触发 在 `researchclaw/pipeline/executor.py` 的运行结束钩子中,添加: ```python # Pipeline 完成后触发 async def _post_pipeline_hook(self, run_results: list[StageResult]): # 1. 提取教训 lessons = extract_lessons(run_results) self.evolution_store.append_many(lessons) # 2. 
将高严重性教训转化为技能 (如果 MetaClaw bridge 启用) if self.config.metaclaw_bridge.enabled: from researchclaw.metaclaw_bridge.lesson_to_skill import convert_lessons_to_skills new_skills = await convert_lessons_to_skills( lessons=[l for l in lessons if l.severity == "error"], llm=self.llm, skills_dir=self.config.metaclaw_bridge.skills_dir, ) logger.info(f"Generated {len(new_skills)} new MetaClaw skills from run failures") ``` **预期交付**: 失败 → 教训 → 技能的自动闭环,每次运行都让系统变得更好。 --- ### Phase 4: L4 — PRM 质量门控 **目标**: 在关键质量门控阶段使用 MetaClaw 的 PRM 评分器提供客观质量评估。 #### Task 4.1: PRM 评分器集成 **新增文件**: `researchclaw/metaclaw_bridge/prm_gate.py` **功能**: 封装 MetaClaw PRMScorer,为 AutoResearchClaw 的质量门控提供评分。 ```python class ResearchPRMGate: """Uses MetaClaw's PRM scorer for objective quality assessment.""" def __init__(self, prm_config: dict): self.scorer = PRMScorer( api_base=prm_config["api_base"], api_key=prm_config["api_key"], model=prm_config["model"], majority_votes=prm_config.get("votes", 3), ) async def evaluate_stage_output( self, stage_name: str, instruction: str, output: str, ) -> float: """Returns -1.0 (fail), 0.0 (ambiguous), or 1.0 (pass).""" score = await self.scorer.score(instruction, output) return score ``` #### Task 4.2: 集成到质量门控阶段 在以下 4 个门控阶段添加 PRM 评分: | 阶段 | 评分对象 | 评分指令 | |------|----------|----------| | Stage 5 (LITERATURE_SCREEN) | 筛选后的论文列表 | "评估文献筛选的相关性和覆盖度" | | Stage 9 (EXPERIMENT_DESIGN) | 实验设计方案 | "评估实验设计的严谨性:对照组、消融实验、统计方法" | | Stage 15 (RESEARCH_DECISION) | PROCEED/PIVOT 决策 | "评估决策依据的证据充分性" | | Stage 20 (QUALITY_GATE) | 最终论文 | "评估论文的学术质量、创新性、方法论严谨性" | **决策逻辑**: ```python prm_score = await prm_gate.evaluate_stage_output(stage, instruction, output) if prm_score == 1.0: # 通过,继续 pass elif prm_score == 0.0: # 模糊,使用原有逻辑决策 pass elif prm_score == -1.0: # 不通过,触发回退 # Stage 5 → 回退到 Stage 4 # Stage 9 → 回退到 Stage 8 # Stage 15 → REFINE 或 PIVOT # Stage 20 → 回退到 Stage 16 ``` #### Task 4.3: 配置项扩展 在 `config.researchclaw.yaml` 中新增: ```yaml metaclaw_bridge: enabled: true proxy_url: 
"http://localhost:30000" skills_dir: "~/.metaclaw/skills" prm: enabled: true api_base: "https://api.openai.com/v1" # PRM 评分用的 LLM API api_key_env: "PRM_API_KEY" model: "gpt-4o" votes: 3 # 多数投票次数 gate_stages: [5, 9, 15, 20] # 启用 PRM 的阶段 lesson_to_skill: enabled: true min_severity: "error" # 仅转化 error 级别教训 max_skills_per_run: 3 # 每次运行最多生成 3 个新技能 ``` **预期交付**: 关键阶段获得客观质量评分,降低低质量输出流入后续阶段的概率。 --- ### Phase 5: L5 — RL 持续训练(可选) > **注意**: 此阶段需要 GPU 和 Tinker/MinT 后端,如当前环境不具备可跳过。 **目标**: 利用研究流水线的对话数据持续微调模型。 #### Task 5.1: 切换 MetaClaw 到 MadMax 模式 ```yaml # ~/.metaclaw/config.yaml mode: madmax rl: enabled: true backend: tinker # 或 mint model: <模型名> api_key: batch_size: 4 lora_rank: 32 scheduler: enabled: true sleep_start: "23:00" sleep_end: "07:00" idle_threshold_minutes: 30 ``` #### Task 5.2: 会话生命周期管理 在 AutoResearchClaw 的 Pipeline runner 中管理 MetaClaw 会话: ```python # Pipeline 开始时 headers["X-Session-Id"] = f"arc-{run_id}" # Pipeline 结束时 headers["X-Session-Done"] = "true" # 通知 MetaClaw 一次研究会话结束 ``` 这使 MetaClaw 能够在每次研究运行结束后触发技能进化和 RL 训练数据收集。 **预期交付**: 每次研究运行都成为模型改进的训练数据,长期持续提升。 --- ## 四、阶段-技能映射总表 | 阶段 | 阶段名称 | MetaClaw 任务类型 | 注入技能 | top_k | |------|----------|-------------------|----------|-------| | 1 | TOPIC_INIT | research | literature-search-strategy | 4 | | 2 | PROBLEM_DECOMPOSE | research | research-gap-identification | 4 | | 3 | SEARCH_STRATEGY | research | literature-search-strategy | 6 | | 4 | LITERATURE_COLLECT | research | literature-search-strategy | 4 | | 5 | LITERATURE_SCREEN | research | paper-relevance-screening | 6 | | 6 | KNOWLEDGE_EXTRACT | research | knowledge-card-extraction | 4 | | 7 | SYNTHESIS | research | research-gap-identification | 6 | | 8 | HYPOTHESIS_GEN | research | hypothesis-formulation | 6 | | 9 | EXPERIMENT_DESIGN | research | experiment-design-rigor | 6 | | 10 | CODE_GENERATION | coding | hardware-aware-coding | 6 | | 11 | RESOURCE_PLANNING | productivity | — | 3 | | 12 | EXPERIMENT_RUN | automation | experiment-debugging | 4 | | 13 | 
ITERATIVE_REFINE | coding | experiment-debugging | 6 | | 14 | RESULT_ANALYSIS | data_analysis | statistical-analysis | 6 | | 15 | RESEARCH_DECISION | research | research-pivot-decision | 4 | | 16 | PAPER_OUTLINE | communication | academic-writing-structure | 4 | | 17 | PAPER_DRAFT | communication | academic-writing-structure | 6 | | 18 | PEER_REVIEW | communication | peer-review-methodology | 6 | | 19 | PAPER_REVISION | communication | academic-writing-structure, peer-review-methodology | 6 | | 20 | QUALITY_GATE | research | peer-review-methodology | 4 | | 21 | KNOWLEDGE_ARCHIVE | automation | — | 2 | | 22 | EXPORT_PUBLISH | automation | — | 2 | | 23 | CITATION_VERIFY | research | citation-integrity | 4 | --- ## 五、新增文件清单 ``` AutoResearchClaw/ ├── researchclaw/ │ └── metaclaw_bridge/ # 新增模块 │ ├── __init__.py │ ├── config.py # MetaClaw 集成配置 │ ├── stage_skill_map.py # 阶段-技能映射 │ ├── lesson_to_skill.py # 教训→技能转化器 │ ├── skill_feedback.py # 技能效果追踪 │ ├── prm_gate.py # PRM 质量门控 │ └── session.py # MetaClaw 会话管理 ├── docs/ │ └── metaclaw-integration-plan.md # 本文档 └── tests/ └── test_metaclaw_bridge/ # 集成测试 ├── test_stage_skill_map.py ├── test_lesson_to_skill.py ├── test_prm_gate.py └── test_e2e_integration.py ``` **需修改的现有文件**: | 文件 | 改动内容 | |------|----------| | `researchclaw/llm/client.py` | 添加 503 重试 + X-Session-Id/X-Turn-Type header | | `researchclaw/config.py` | 新增 `metaclaw_bridge` 配置段 | | `researchclaw/pipeline/executor.py` | 添加 post-pipeline hook 调用 lesson_to_skill | | `config.researchclaw.example.yaml` | 添加 metaclaw_bridge 配置示例 | --- ## 六、实施路线图 ``` Week 1: Phase 0 + Phase 1 (环境 + Proxy 透传) ├── Day 1-2: 环境配置、分支创建、MetaClaw 安装验证 ├── Day 3-4: 配置修改、503 重试适配、冒烟测试 └── Day 5: 端到端验证、记录基线指标 Week 2: Phase 2 (研究技能库) ├── Day 1-3: 编写 13 个研究专属技能 SKILL.md ├── Day 4: 实现阶段映射模块 + header 注入 └── Day 5: A/B 对比测试(有/无技能注入) Week 3: Phase 3 (Evolution 桥接) ├── Day 1-2: 实现 lesson_to_skill 转化器 ├── Day 3: 实现 skill_feedback 追踪 ├── Day 4: 集成到 executor post-pipeline hook └── Day 5: 测试闭环:运行 → 失败 → 教训 → 
技能 → 再运行改进 Week 4: Phase 4 (PRM 门控) + 收尾 ├── Day 1-2: 实现 prm_gate + 集成到 4 个门控阶段 ├── Day 3: 全流程端到端测试 ├── Day 4: 性能调优、文档完善 └── Day 5: 代码审查、合并准备 ``` --- ## 七、风险与缓解 | 风险 | 影响 | 缓解措施 | |------|------|----------| | MetaClaw 代理增加延迟 | 每次 LLM 调用额外 ~50ms | 可接受;技能注入带来的质量提升远超延迟代价 | | 技能注入增加 prompt token 消耗 | 每次调用增加 ~500-1000 tokens | 控制 top_k;关键阶段多注入,非关键阶段少注入 | | MetaClaw 上下文截断 | 长 prompt 被静默截断 | 配置 `max_context_tokens` ≥ 32000 | | Lesson→Skill 转化质量不稳定 | 生成无效技能 | 限制每次运行最多 3 个新技能;人工审核 | | MetaClaw 进程崩溃 | Pipeline 中断 | 在 LLMClient 中添加 fallback:MetaClaw 不可用时直连上游 | | 分支冲突 | 合并困难 | 改动集中在新模块,对原有代码侵入极小 | ### 关键缓解:Fallback 机制 在 `researchclaw/llm/client.py` 中实现: ```python async def _request(self, ...): try: # 优先走 MetaClaw 代理 return await self._http_post(self.base_url, ...) except (ConnectionError, Timeout): if self.config.metaclaw_bridge.fallback_url: # 降级直连上游 LLM logger.warning("MetaClaw proxy unavailable, falling back to direct LLM") return await self._http_post(self.config.metaclaw_bridge.fallback_url, ...) raise ``` --- ## 八、成功指标 | 指标 | 基线(无 MetaClaw) | 目标(集成后) | 测量方法 | |------|---------------------|----------------|----------| | Pipeline 完成率 | 现有水平 | +15% | 统计 Stage 15 PROCEED 率 | | 实验代码首次运行成功率 | 现有水平 | +20% | 统计 Stage 12 无需 Stage 13 的比例 | | 论文 PRM 评分 | — | ≥ 0.6 平均分 | Stage 20 PRM 评分统计 | | 引用验证通过率 | 现有水平 | +10% | Stage 23 验证通过率 | | 技能库增长 | 40 (MetaClaw 原有) | +13 (研究专属) + 自动进化 | 技能目录文件数 | --- ## 九、API 需求清单 集成过程中可能需要以下 API(请确认可用性): | API | 用途 | 是否必需 | |-----|------|----------| | OpenAI-compatible LLM API | AutoResearchClaw + MetaClaw 共用 | **必需** | | PRM 评分用 LLM API | Phase 4 质量门控(可与上述相同) | Phase 4 需要 | | Tinker/MinT API | Phase 5 RL 训练 | **可选** | | arXiv API | AutoResearchClaw 文献检索(已有) | 已配置 | | Semantic Scholar API | AutoResearchClaw 文献检索(已有) | 已配置 | --- ## 十、快速启动命令 完成集成后的典型使用流程: ```bash # 1. 启动 MetaClaw 代理 cd /home/jqliu/projects/MetaClaw source .venv/bin/activate metaclaw start --mode skills_only --port 30000 # 2. 
运行增强版 AutoResearchClaw cd /home/jqliu/projects/AutoResearchClaw git checkout feat/metaclaw-integration researchclaw run \ --topic "Your research idea" \ --config config.researchclaw.yaml # 3. 查看 MetaClaw 技能注入日志 metaclaw status # 4. 查看新进化出的技能 ls ~/.metaclaw/skills/arc-*/ ``` ================================================ FILE: docs/next_phase_showcase_plan.md ================================================ # AutoResearchClaw — Phase 5: Showcase Website & Sample Papers > Created: 2026-03-15 > Status: **Website Built** — static site deployed, showcase papers pending generation > Prerequisites: Phase 3 regression tests complete, all fixes pushed to origin/main --- ## 1. Goals 1. **Generate representative showcase papers** across diverse research domains to demonstrate pipeline capabilities 2. **Build a static website** to publicly present AutoResearchClaw's pipeline, features, and sample outputs 3. **Establish a paper gallery** with downloadable PDFs and code for each showcase paper --- ## 2. 
Showcase Paper Generation ### 2.1 Test Case Selection Strategy Select 5-6 topics across different ML subfields, difficulty levels, and experiment types to maximize diversity: | # | Topic | Domain | Experiment Type | Why | |---|-------|--------|-----------------|-----| | S1 | "Curriculum Learning with Adaptive Difficulty Scheduling for Image Classification" | CV + Training Strategy | CIFAR-10/100, standard benchmark | Accessible, clear baselines, testable | | S2 | "Prompt-Length-Aware Routing for Mixture-of-LoRA Experts in Instruction-Following" | NLP + PEFT | QLoRA + Qwen-2.5-3B fine-tuning | Showcases LLM fine-tuning capability | | S3 | "Graph Attention Networks with Learnable Edge Features for Molecular Property Prediction" | GNN + Chemistry | OGB-MolHIV benchmark | Cross-domain application | | S4 | "Entropy-Guided Exploration Bonuses for Sparse-Reward Continuous Control" | RL | MuJoCo locomotion tasks | Complex multi-algorithm comparison | | S5 | "Spectral Normalization Effects on Mode Collapse in Conditional GANs for CIFAR-10" | Generative Models | GAN training on CIFAR-10 | Visual results + quantitative metrics | | S6 | "Test-Time Adaptation via Batch Normalization Statistics for Distribution Shift" | Domain Adaptation | CIFAR-10-C corruption benchmark | Practical, real-world relevance | ### 2.2 Selection Criteria for Showcase From the generated papers, select 3-4 best ones based on: - **Paper Quality Score**: >= 7/10 from the built-in quality assessment - **Experiment Completeness**: All methods ran, ablations show differentiation - **Visual Quality**: Charts are clean, metrics are meaningful - **Topic Diversity**: No two showcase papers from the same subfield - **Narrative Quality**: Clear story from motivation through results to conclusions ### 2.3 Configuration Template ```yaml # showcase_config_template.yaml llm: provider: "azure_openai" model: "gpt-5.1" max_tokens: 16384 experiment: backend: "docker" timeout_sec: 3600 # generous budget for quality 
docker: gpu_enabled: true memory_limit_mb: 40960 network_policy: "full" pipeline: target_conference: "iclr_2026" max_refinement_iterations: 3 enable_code_review: true ``` --- ## 3. Static Website Design ### 3.1 Technology Stack | Component | Choice | Rationale | |-----------|--------|-----------| | Static Site Generator | **Astro** or **Next.js (static export)** | Modern, fast, Markdown-native | | Styling | **Tailwind CSS** | Utility-first, rapid prototyping | | Hosting | **GitHub Pages** or **Vercel** | Free, auto-deploy from repo | | PDF Rendering | **PDF.js** embedded viewer | In-browser paper viewing | | Domain | `autoresearchclaw.github.io` or custom | GitHub Pages default | **Alternative (simpler):** Pure HTML/CSS/JS with no build step — suitable if we want zero dependencies and maximum portability. ### 3.2 Site Structure ``` / → Landing page (hero, pipeline overview, CTA) /pipeline → Interactive pipeline visualization (23 stages) /papers → Gallery of showcase papers /papers/{paper-id} → Individual paper page (PDF viewer + metadata) /features → Feature highlights and comparison /getting-started → Quick start guide ``` ### 3.3 Page Designs #### Landing Page (`/`) - **Hero section**: Logo, tagline ("Chat an Idea. 
Get a Paper."), demo GIF/video - **Pipeline overview**: Animated or scrollable 23-stage diagram - **Key stats**: "1039 tests passed", "23 autonomous stages", "GPU-accelerated experiments" - **Paper carousel**: 3-4 showcase papers with thumbnails - **CTA**: GitHub link, quickstart command #### Pipeline Page (`/pipeline`) - Interactive visualization of the 23-stage pipeline - Each stage clickable → shows description, inputs, outputs, example - Stage groups: Topic Discovery → Literature → Experiment Design → Code → Execution → Writing → Review - Highlight: Docker sandbox, multi-agent review, citation verification #### Paper Gallery (`/papers`) - Grid/card layout with paper thumbnails - Each card shows: title, topic domain, quality score badge, abstract preview - Filter by domain (CV, NLP, RL, etc.) - Sort by quality score #### Individual Paper Page (`/papers/{id}`) - Embedded PDF viewer (PDF.js) - Metadata sidebar: topic, quality score, stages completed, runtime, GPU used - Download buttons: PDF, LaTeX source, experiment code - Quality assessment summary (strengths, weaknesses from internal review) - Experiment charts gallery #### Features Page (`/features`) - Feature cards with icons: - Real literature search (arXiv + Semantic Scholar) - Docker-sandboxed experiments with GPU passthrough - Multi-agent peer review - Iterative refinement loop - Conference-ready LaTeX output - Hardware-aware experiment design - Citation verification - LLM fine-tuning support (QLoRA/LoRA) ### 3.4 Assets Needed | Asset | Source | Status | |-------|--------|--------| | Logo | Existing (`image/logo.png`) | Done | | Framework diagram | Existing (`image/framework.png`) | Done | | Pipeline stage icons | Need to create | TODO | | Paper thumbnails | Generate from LaTeX PDFs | TODO | | Demo video/GIF | Screen recording of pipeline run | TODO | | Quality score badges | SVG badges | TODO | --- ## 4. 
Repository Structure for Website ``` website/ ├── public/ │ ├── papers/ │ │ ├── paper-01/ │ │ │ ├── paper.pdf │ │ │ ├── paper.tex │ │ │ ├── code/ │ │ │ ├── charts/ │ │ │ └── metadata.json │ │ └── paper-02/ │ │ └── ... │ └── assets/ │ ├── logo.png │ ├── framework.png │ └── icons/ ├── src/ │ ├── pages/ │ │ ├── index.astro (or .html) │ │ ├── pipeline.astro │ │ ├── papers/ │ │ │ ├── index.astro │ │ │ └── [id].astro │ │ ├── features.astro │ │ └── getting-started.astro │ ├── components/ │ │ ├── Header.astro │ │ ├── PipelineStage.astro │ │ ├── PaperCard.astro │ │ └── QualityBadge.astro │ ├── layouts/ │ │ └── Base.astro │ └── styles/ │ └── global.css ├── astro.config.mjs ├── tailwind.config.js └── package.json ``` --- ## 5. Paper Metadata Format Each showcase paper includes a `metadata.json`: ```json { "id": "curriculum-learning-cifar", "title": "Curriculum Learning with Adaptive Difficulty Scheduling...", "domain": "Computer Vision", "tags": ["curriculum-learning", "image-classification", "CIFAR-10"], "quality_score": 7.5, "verdict": "accept", "target_conference": "ICLR 2026", "generated_date": "2026-03-15", "runtime_minutes": 45, "gpu": "NVIDIA RTX 6000 Ada (49GB)", "stages_completed": 23, "abstract": "...", "strengths": ["...", "..."], "weaknesses": ["...", "..."], "files": { "pdf": "paper.pdf", "latex": "paper.tex", "code": "code/", "charts": "charts/", "references": "references.bib" } } ``` --- ## 6. 
Implementation Timeline | Phase | Task | Estimated Effort | Dependencies | |-------|------|-----------------|--------------| | 5.1 | Generate 5-6 showcase papers | 1 day (parallel runs) | Phase 3 complete | | 5.2 | Review & select 3-4 best papers | 2 hours | 5.1 | | 5.3 | Compile LaTeX → PDF for selected papers | 1 hour | 5.2 | | 5.4 | Set up website repo structure | 1 hour | — | | 5.5 | Build landing page + pipeline visualization | 4 hours | 5.4 | | 5.6 | Build paper gallery + individual pages | 3 hours | 5.2, 5.4 | | 5.7 | Build features page | 2 hours | 5.4 | | 5.8 | Deploy to GitHub Pages | 30 min | 5.5-5.7 | | 5.9 | Create demo video/GIF | 2 hours | Pipeline working | **Total estimated**: ~2 days --- ## 7. Deployment Options ### Option A: GitHub Pages (Recommended) - Free hosting on `autoresearchclaw.github.io` - Auto-deploy via GitHub Actions on push to `website` branch or `docs/` folder - No server costs, CDN included ### Option B: Vercel - Free tier supports static sites - Faster builds, preview deployments for PRs - Custom domain support ### Option C: Netlify - Similar to Vercel, free tier available - Form handling if needed later **Recommendation**: Start with GitHub Pages for simplicity, migrate to Vercel if we need preview deployments or custom domain. --- ## 8. Content Checklist - [x] Finalize showcase paper topics (Section 2.1) - [ ] Run all showcase experiments - [ ] Review and select best 3-4 papers - [ ] Compile PDFs from LaTeX - [ ] Create paper metadata.json for each - [x] Design pipeline visualization (interactive or static) — interactive click-to-expand - [x] Write feature descriptions — 16 feature cards + comparison table - [x] Create getting-started guide (adapted from README) — 7-step guide - [ ] Record demo video/GIF - [x] Build and deploy website — pure HTML/CSS, GitHub Pages via Actions - [x] Test on mobile/tablet — responsive CSS with nav toggle - [ ] Add analytics (optional, e.g., Plausible) --- ## 9. Open Questions 1. 
**Custom domain?** — Do we want a custom domain (e.g., `autoresearchclaw.com`) or is `github.io` sufficient? 2. **Video demo?** — Should we include a screen recording of a full pipeline run, or is a GIF of key stages enough? 3. **Interactive pipeline?** — Full interactive SVG/Canvas pipeline diagram vs. static image with tooltips? 4. **Paper format** — Show papers as embedded PDFs, or convert to HTML for better web rendering? 5. **Localization** — Website in English only, or mirror the multi-language READMEs? --- ## 10. Success Metrics - At least 3 showcase papers with quality score >= 7/10 - Website loads in < 2 seconds on 3G connection - All showcase paper PDFs downloadable - Pipeline visualization accurately represents all 23 stages - GitHub stars / traffic increase after website launch (track via GitHub Insights) ================================================ FILE: docs/pipeline_critical_fixes_v8.md ================================================ # Pipeline Critical Fixes V8 — 投稿级论文质量修复 ## 目标 修复所有阻止 Pipeline 产出符合 AI 顶会投稿标准论文的问题。 --- ## Tier 1: 阻断性问题(必须立即修复) ### T1.1 Title 提取 Bug - **问题**: converter 从 markdown 提取 title 时,可能错误抓取表格标题(如 "Table 1 – Aggregate primary_metric across methods.") - **文件**: `researchclaw/templates/converter.py` - **修复**: 加入 `_TITLE_REJECT_RE` 和 `_METRIC_DUMP_RE` 正则,`_is_bad_title()` 过滤表格/图表/指标标题,回退到真正的论文标题 - **状态**: ✅ 已修复 ### T1.2 Markdown Fence 泄漏到 LaTeX - **问题**: LLM 输出的 ` ```markdown ` fence 没有被 converter 清除,直接出现在 .tex 中 - **文件**: `researchclaw/templates/converter.py` - **修复**: 增加智能 fence 清洗,仅移除文档类 fence(markdown/latex/text/bibtex),保留代码 fence(python/java 等) - **状态**: ✅ 已修复 ### T1.3 Section 层级错误 - **问题**: LLM 输出用 `##` (H2) 作为主要章节标题,converter 将其映射为 `\subsection` 而非 `\section` - **文件**: `researchclaw/templates/converter.py` + `researchclaw/prompts.py` - **修复**: converter 自动检测 body 最低 heading level 并做 level shift(H2→`\section`),prompts 明确要求用 `#` 做主标题 - **状态**: ✅ 已修复 ### T1.4 BibTeX journal 字段填 arXiv 分类代码 - **问题**: `journal = {cs.CY}` 应该是 
arXiv preprint 格式,不是 journal 名 - **文件**: `researchclaw/literature/models.py` - **修复**: 检测 arXiv category 模式 → 自动转换为 `journal = {arXiv preprint arXiv:XXXX.XXXXX}` 格式 - **状态**: ✅ 已修复 ### T1.5 Abstract 长度失控 + 含原始变量名 - **问题**: Abstract ~500 词(应 150-250),且包含 `bayesian_optimization/primary_metric = 0.8607` 等原始键名 - **文件**: `researchclaw/templates/converter.py` - **修复**: `check_paper_completeness()` 增加 abstract 长度检查(>300 词警告)和原始变量名检测 - **状态**: ✅ 已修复 --- ## Tier 2: 高优先级(显著提升论文质量) ### T2.1 Quality Gate 真正执行 - **问题**: Stage 20 总是返回 DONE,verdict 从未被 runner 检查 - **文件**: `researchclaw/pipeline/executor.py` - **修复**: `_execute_quality_gate()` 当 score < threshold 时返回 `StageStatus.FAILED`,增加 pass/fail 日志 - **状态**: ✅ 已修复 ### T2.2 文献筛选过于激进 - **问题**: 87 篇候选 → 仅 5 篇通过(94% 拒绝率),会议需要 20-40 篇 - **文件**: `researchclaw/pipeline/executor.py` - **修复**: keyword pre-filter 从 ≥2 放宽到 ≥1,最低保留数从 6 提高到 15,LLM 返回太少时自动补充 - **状态**: ✅ 已修复 ### T2.3 跨域论文过滤 - **问题**: 量子计算、社会学论文混入 RL 论文的参考文献 - **文件**: `researchclaw/prompts.py` - **修复**: 已在 V7 修复中通过 `literature_screen` prompt 强化领域匹配规则(P2/P6 fixes) - **状态**: ✅ 已修复(V7) ### T2.4 图表 DPI 不达标 - **问题**: 全部 savefig 使用 dpi=150(会议要求 ≥300) - **文件**: `researchclaw/experiment/visualize.py` - **修复**: 所有 `dpi=150` → `dpi=300`(5 处) - **状态**: ✅ 已修复 ### T2.5 强制必需章节验证 - **问题**: NeurIPS/ICLR 要求 Limitations 章节 — 当前不检查 - **文件**: `researchclaw/templates/converter.py` + `researchclaw/prompts.py` - **修复**: `check_paper_completeness()` 增加 Limitations 章节检测;`writing_structure` block 增加 MARKDOWN FORMATTING 规则 - **状态**: ✅ 已修复 --- ## Tier 3: 高价值架构级修复 ### T3.1 RESEARCH_DECISION 质量验证 - **问题**: _parse_decision() 仅提取关键词,不验证最低标准(≥2 baselines, ≥3 seeds 等) - **文件**: `researchclaw/pipeline/executor.py` - **修复**: 在 decision 提取后增加质量检查,验证决策文本是否提及 baselines/seeds/metrics,警告缺失项并写入 decision_structured.json - **状态**: ✅ 已修复 ### T3.2 FigureAgent 合并到当前分支 - **问题**: FigureAgent 代码在 main 分支但不在 feat/metaclaw-integration - **修复**: `git checkout main -- researchclaw/agents/figure_agent/ 
researchclaw/agents/__init__.py researchclaw/agents/base.py tests/test_figure_agent.py` - **状态**: ✅ 已修复 ### T3.3 负面结果处理 - **问题**: 当方法表现不如 baseline 时,论文仍写成 positive contribution - **文件**: `researchclaw/pipeline/executor.py` + `researchclaw/prompts.py` - **修复**: `_detect_result_contradictions()` 已实现 NULL/NEGATIVE 结果检测,advisories 注入 paper_draft prompt 上下文;prompts 中 `hypothesis_gen`、`paper_draft`、`paper_revision` 均已包含 negative result 处理指导 - **状态**: ✅ 已修复(已有实现) ### T3.4 Citation Verify 改为阻断性 - **问题**: CITATION_VERIFY 在 NONCRITICAL_STAGES 中,失败不阻断导出 - **文件**: `researchclaw/pipeline/stages.py` - **修复**: 从 NONCRITICAL_STAGES 移除 CITATION_VERIFY - **状态**: ✅ 已修复 ### T3.5 论文分段写作容错 - **问题**: 3 次 LLM 调用中任一超时,对应章节丢失 - **文件**: `researchclaw/pipeline/executor.py` - **修复**: `_write_paper_sections()` 三次 LLM 调用均增加 `retries=1`(自动重试 1 次),仍失败则用 `[PLACEHOLDER]` 标记缺失章节,确保后续流程不中断 - **状态**: ✅ 已修复 --- ## 额外修复 ### T-extra.1 Agent Config 集成 - **问题**: feat/metaclaw-integration 分支缺少 CodeAgentConfig / BenchmarkAgentConfig / FigureAgentConfig - **文件**: `researchclaw/config.py` - **修复**: 添加三个 agent config dataclass 及其解析函数,集成到 ExperimentConfig - **状态**: ✅ 已修复 --- ## 完成记录 | 时间 | 修复项 | 状态 | |------|--------|------| | 2026-03-15 | T1.1 Title 提取 Bug | ✅ | | 2026-03-15 | T1.2 Markdown Fence 泄漏 | ✅ | | 2026-03-15 | T1.3 Section 层级错误 | ✅ | | 2026-03-15 | T1.4 BibTeX journal 字段 | ✅ | | 2026-03-15 | T1.5 Abstract 验证 | ✅ | | 2026-03-15 | T2.1 Quality Gate 执行 | ✅ | | 2026-03-15 | T2.2 文献筛选放宽 | ✅ | | 2026-03-15 | T2.4 图表 DPI 升级 | ✅ | | 2026-03-15 | T2.5 必需章节验证 | ✅ | | 2026-03-15 | T3.2 FigureAgent 合并 | ✅ | | 2026-03-15 | T3.4 Citation Verify 阻断性 | ✅ | | 2026-03-15 | T-extra.1 Agent Config | ✅ | | 2026-03-15 | T3.1 Decision 质量验证 | ✅ | | 2026-03-15 | T3.3 负面结果处理 | ✅ (已有) | | 2026-03-15 | T3.5 分段写作容错 | ✅ | **已完成**: 15/15 (100%) ================================================ FILE: docs/rate_limit_fix_plan.md ================================================ # arXiv / 文献检索限流问题 — 调研报告与修复方案 > Created: 2026-03-15 > 
Status: **DONE** — All 7 tasks completed, 1117/1117 tests passing > Severity: High — 直接影响用户体验和 Pipeline 稳定性 --- ## 1. 问题描述 Pipeline 中多个阶段需要通过 API 调用外部论文数据库(arXiv、Semantic Scholar、OpenAlex),在高频请求时遭遇 **HTTP 429 (Too Many Requests)** 限流错误,导致文献检索失败或降级。 **受影响阶段:** | 阶段 | 功能 | API 调用量 | 严重程度 | |------|------|-----------|---------| | Stage 4 | 文献收集 | ~12 次 (6 query × 2 source) | **高** — 直接影响论文质量 | | Stage 8 | 假设新颖性检查 | ~8-12 次 | 中 — 非阻塞 | | Stage 23 | 引用验证 | ~40-50 次 | **高** — 最密集的 API 调用 | --- ## 2. 根因分析 ### 2.1 代码层面定位 | 文件 | 问题 | 影响 | |------|------|------| | `researchclaw/literature/arxiv_client.py` | 无显式 HTTP 429 检测 — URLError/OSError 统一捕获,无法区分限流和真正的网络错误 | 限流时无法做针对性处理 | | `researchclaw/literature/arxiv_client.py` | 无熔断器 (Circuit Breaker) — S2 有但 arXiv 没有 | 连续 429 时仍不停重试 | | `researchclaw/literature/arxiv_client.py` | 未解析 `Retry-After` 响应头 | 服务器建议的等待时间被忽略 | | `researchclaw/literature/semantic_scholar.py` | 虽有熔断器,但 Stage 23 的密集调用仍可能触发 | 一旦熔断,所有后续 S2 请求被跳过 | | `researchclaw/literature/verify.py` | Stage 23 逐条顺序验证 40+ 引用,每条间隔 1.5s | 总耗时 60-80s,集中 burst 可触发限流 | | `researchclaw/literature/search.py` | OpenAlex 仅用于 L3 title search fallback,未作为主搜索源 | 浪费了最宽松的 API 额度 | ### 2.2 各 API 官方限流策略 | API | 限制 | 我们当前的间隔 | 是否合规 | |-----|------|---------------|---------| | **arXiv** | 1 request / 3 seconds | 3.1s (`_RATE_LIMIT_SEC`) | 合规,但无 429 重试 | | **Semantic Scholar** (无 Key) | 共享池 5000/5min | 1.5s | 偏保守但可行 | | **Semantic Scholar** (有 Key) | 1 req/s (可申请更高) | 0.3s | 合规 | | **OpenAlex** (有 Key) | 10,000 list/day; 1,000 search/day | 仅 L3 fallback | 远未用满 | | **CrossRef** | 50 req/s (polite pool) | 1.5s | 远低于上限 | ### 2.3 arXiv 特殊性 - arXiv 元数据每天午夜更新一次 → 同一查询 24h 内重复请求无意义 - arXiv 返回 HTTP 200 但内容为空 (phantom empty page) 是已知 bug - arXiv ToS 明确要求:所有你控制的机器合计不超过 1 req/3s --- ## 3. 
竞品方案调研 ### 3.1 PaperClaw (guhaohao0991/PaperClaw) | 方面 | 实现 | |------|------| | arXiv 搜索 | `urllib.request` + 3s 固定延迟,**无重试,无 429 处理** | | S2 搜索 | `requests` + `_request_with_retry()`: 指数退避 (2^attempt),3 次重试 | | S2 缓存 | 文件 JSON 缓存,按类型 TTL(论文 7 天,作者 30 天,引用 1 天) | | arXiv 缓存 | **无** — 每次直接调 API | **可借鉴**:S2 缓存按类型差异化 TTL 的思路。 ### 3.2 Sibyl Research System (Sibyl-Research-Team/sibyl-research-system) | 方面 | 实现 | |------|------| | arXiv 搜索 | 通过 `arxiv-mcp-server` MCP 工具,依赖 `arxiv` Python 库的 3s delay | | 429 处理 | **代码层面无** — 依赖 CLAUDE.md 指令让 LLM agent 行为级重试 | | 体量控制 | 刻意限制在 15-30 篇论文,"速度优先" | | S2 | **未使用** | **可借鉴**:控制搜索体量、不过度爬取的理念。 ### 3.3 Idea2Paper (AgentAlphaAGI/Idea2Paper) | 方面 | 实现 | |------|------| | 文献来源 | **完全离线** — 预构建 ICLR Knowledge Graph,运行时不调任何论文 API | | 重试策略 | `urllib3.util.retry.Retry(total=3, backoff_factor=2, status_forcelist=[429,500,502,503,504])` | | 降级策略 | Embedding 失败后降级为 Jaccard 相似度 | **可借鉴**:`urllib3.Retry` 的干净实现;embedding 失败的优雅降级。 ### 3.4 对比总结 | 项目 | arXiv 429 处理 | S2 429 处理 | 缓存 | 备选源 | |------|---------------|-------------|------|--------| | **AutoResearchClaw (当前)** | 无显式处理 | 熔断器 ✓ | 7 天 TTL ✓ | OpenAlex (仅 L3) | | PaperClaw | 无 | 指数退避 | S2 有/arXiv 无 | 无 | | Sibyl | 无 (靠 LLM) | 未使用 | 论文下载缓存 | WebSearch | | Idea2Paper | 不涉及 | 不涉及 | 离线 KG | 不涉及 | --- ## 4. 
综合解决方案 ### 4.1 方案总览 ``` ┌─────────────────────────────────────────────────────┐ │ 防御层次 │ ├─────────────────────────────────────────────────────┤ │ L1: 智能限速器 (Adaptive Rate Limiter) │ │ - 根据 API 类型自动调节请求间隔 │ │ - 解析 Retry-After 响应头 │ │ │ │ L2: 熔断器 (Circuit Breaker) │ │ - arXiv 也加熔断器 (参考 S2 实现) │ │ - 三态切换: CLOSED → OPEN → HALF_OPEN │ │ │ │ L3: 多源降级 (Source Fallback) │ │ - arXiv 限流 → 切换 OpenAlex/S2 │ │ - S2 限流 → 切换 OpenAlex/arXiv │ │ - 全部限流 → 返回缓存结果 │ │ │ │ L4: 结果缓存 (Cache Layer) │ │ - 24h TTL for arXiv (每天只更新一次) │ │ - 差异化 TTL (论文元数据 vs 搜索结果) │ │ - 引用验证结果永久缓存 │ │ │ │ L5: 请求优化 (Request Optimization) │ │ - S2 batch API 批量查询 │ │ - 合并重复查询 │ │ - OpenAlex 提升为主搜索源 │ └─────────────────────────────────────────────────────┘ ``` --- ## 5. 实施任务列表 ### Task 1: arXiv 客户端增强 — 显式 429 处理 + 熔断器 **文件**: `researchclaw/literature/arxiv_client.py` **改动**: - [x]1.1 改用 `urllib.request.urlopen` 的 `HTTPError` 子类捕获,区分 429 和其他错误 - [x]1.2 解析 `Retry-After` 响应头,优先使用服务器建议的等待时间 - [x]1.3 添加 arXiv 熔断器(复用 S2 的三态模式): 3 次连续 429 → OPEN (180s cooldown) - [x]1.4 增加 `_RATE_LIMIT_SEC` 动态调整: 收到 429 后临时提升到 5s,成功后恢复 3.1s **预期效果**: arXiv 429 错误从"静默失败/重试 3 次放弃"变为"智能等待 + 熔断保护" ### Task 2: OpenAlex 提升为主搜索源 **文件**: `researchclaw/literature/search.py`, 新建 `researchclaw/literature/openalex_client.py` **改动**: - [x]2.1 新建 `openalex_client.py`: 封装 OpenAlex Works API (`https://api.openalex.org/works`) - 支持 `title.search` / `default.search` 两种查询模式 - 字段映射到 `Paper` 数据类 (title, abstract, year, venue, citation_count, authors, doi, arxiv_id) - 配置 polite pool email (`researchclaw@users.noreply.github.com`) - 指数退避 + 3 次重试 - [x]2.2 在 `search.py` 的 `search_papers()` 中注册 OpenAlex 为第三个源 - [x]2.3 调整 `search_papers_multi_query()` 的源顺序策略: - 默认: OpenAlex → Semantic Scholar → arXiv - 任一源 429 → 跳过该源,增加其他源的 limit - [x]2.4 在 `config.researchclaw.example.yaml` 中添加 `openalex_email` 配置项 **预期效果**: 文献检索默认走 OpenAlex (10K/day),arXiv 和 S2 作为补充和验证,大幅降低 429 风险 ### Task 3: 搜索结果缓存增强 **文件**: `researchclaw/literature/cache.py` **改动**: - [x]3.1 arXiv 搜索结果 TTL 
从 7 天改为 24 小时(arXiv 每天午夜更新一次) - [x]3.2 添加"源级别"缓存策略: 如果 arXiv 缓存存在且 <24h,直接返回而不请求 API - [x]3.3 缓存命中时记录日志 `[cache] HIT query=... source=... age=...` **预期效果**: 同一 Pipeline 运行中不会重复请求同一查询,跨运行也可复用 24h 内的结果 ### Task 4: S2 batch API + 去重优化 **文件**: `researchclaw/literature/semantic_scholar.py` **改动**: - [x]4.1 新增 `batch_fetch_papers(paper_ids: list[str]) -> list[Paper]` - 使用 `POST /graph/v1/paper/batch` 端点 - 一次最多 500 个 ID(S2 限制) - 单次请求替代 N 次请求 - [x]4.2 在 Stage 23 引用验证中使用 batch API: 先收集所有有 S2 ID 的引用,一次性批量获取 **预期效果**: Stage 23 的 S2 API 调用从 ~20 次降至 1-2 次 ### Task 5: Stage 23 引用验证并行化 + 智能调度 **文件**: `researchclaw/literature/verify.py` **改动**: - [x]5.1 按源分组,相同源的验证串行(遵守限速),不同源的验证并行 - L1 (arXiv) 和 L2 (CrossRef/DataCite) 和 L3 (OpenAlex) 可并行 - [x]5.2 引用验证缓存标记为"永久有效"(已验证的论文不会变) - [x]5.3 优先使用 DOI → OpenAlex 验证(比 arXiv API 限制宽松得多),L1 arXiv 降为备选 **预期效果**: Stage 23 耗时从 60-80s 降至 20-30s,arXiv API 调用减少 50%+ ### Task 6: 用户反馈 + 日志改善 **文件**: `researchclaw/pipeline/executor.py`, `researchclaw/literature/search.py` **改动**: - [x]6.1 文献检索阶段添加进度日志: `[literature] Searching OpenAlex... (1/3 sources)` - [x]6.2 429 错误时输出友好提示: `[rate-limit] arXiv rate limit hit, switching to OpenAlex...` - [x]6.3 最终搜索统计: `[literature] Found 47 papers (OpenAlex: 28, S2: 12, arXiv: 7, cache: 23 hits)` **预期效果**: 用户能清楚看到搜索进度和限流处理过程,不再困惑 ### Task 7: 测试覆盖 **文件**: `tests/test_rc_literature.py` **改动**: - [x]7.1 arXiv 429 + Retry-After header 解析测试 - [x]7.2 arXiv 熔断器三态切换测试 - [x]7.3 OpenAlex 客户端正常搜索 + 429 退避测试 - [x]7.4 多源降级测试: 模拟 arXiv 429 → 自动切换到 OpenAlex - [x]7.5 S2 batch API 测试 - [x]7.6 缓存 24h TTL 测试 --- ## 6. 
优先级排序 | 优先级 | 任务 | 理由 | 预计工时 | |--------|------|------|---------| | **P0** | Task 1: arXiv 429 显式处理 + 熔断器 | 直接修复当前 crash 问题 | 30min | | **P0** | Task 3: 缓存 TTL 调整 | 零成本减少请求量 | 15min | | **P1** | Task 2: OpenAlex 主搜索源 | 根本性降低 arXiv 依赖 | 1.5h | | **P1** | Task 6: 用户反馈日志 | 提升用户体验 | 20min | | **P2** | Task 4: S2 batch API | 优化 Stage 23 | 45min | | **P2** | Task 5: Stage 23 并行化 | 性能优化 | 1h | | **P3** | Task 7: 测试覆盖 | 质量保障 | 1h | **总计**: ~5 小时 --- ## 7. 实施进度 | 任务 | 状态 | 完成时间 | 备注 | |------|------|---------|------| | Task 1: arXiv 429 + 熔断器 | [x] 完成 | 2026-03-15 | 三态熔断器 + Retry-After 解析 + 动态限速 | | Task 2: OpenAlex 主搜索源 | [x] 完成 | 2026-03-15 | 新建 openalex_client.py,搜索源顺序 OA→S2→arXiv | | Task 3: 缓存增强 | [x] 完成 | 2026-03-15 | 按源差异化 TTL (arXiv 24h, S2/OA 3d, verify 永久) | | Task 4: S2 batch API | [x] 完成 | 2026-03-15 | batch_fetch_papers() POST 批量端点,500 ID/batch | | Task 5: Stage 23 优化 | [x] 完成 | 2026-03-15 | 验证顺序 DOI→CrossRef→OpenAlex→arXiv→S2,差异化延迟 | | Task 6: 用户反馈日志 | [x] 完成 | 2026-03-15 | Stage 4/23 进度日志 + 源统计 | | Task 7: 测试覆盖 | [x] 完成 | 2026-03-15 | +14 新测试 (熔断器×5, OpenAlex×4, 降级×1, TTL×2, 现有修复×2) | --- ## 附录 A: API 速率限制速查表 | API | 端点 | 免费限制 | 认证限制 | 我们的使用量 | |-----|------|---------|---------|------------| | arXiv | `export.arxiv.org/api/query` | 1 req / 3s | 无认证选项 | ~20 req/run | | Semantic Scholar | `api.semanticscholar.org/graph/v1` | 共享池 5K/5min | 1 req/s (API key) | ~30 req/run | | OpenAlex | `api.openalex.org/works` | 10K list/day, 1K search/day | 同左 (polite pool) | 待启用 | | CrossRef | `api.crossref.org/works` | 50 req/s (polite) | 同左 | ~15 req/run | | DataCite | `api.datacite.org/dois` | 无明确限制 | — | ~5 req/run | ## 附录 B: 参考实现 ### arXiv 显式 429 处理(目标代码) ```python try: resp = urllib.request.urlopen(req, timeout=_TIMEOUT_SEC) except urllib.error.HTTPError as exc: if exc.code == 429: retry_after = exc.headers.get("Retry-After") wait = int(retry_after) if retry_after else _RATE_LIMIT_SEC * (2 ** attempt) _cb_on_429() # 通知熔断器 time.sleep(wait + random.uniform(0, wait * 0.2)) 
continue raise ``` ### OpenAlex 搜索客户端(目标签名) ```python def search_openalex( query: str, limit: int = 50, year_min: int | None = None, email: str = "researchclaw@users.noreply.github.com", ) -> list[Paper]: """Search OpenAlex Works API with polite pool access.""" ... ``` ### S2 Batch API(目标签名) ```python def batch_fetch_papers( paper_ids: list[str], fields: str = "paperId,title,abstract,year,venue,citationCount,authors,externalIds", ) -> list[dict]: """Batch fetch paper details via POST /graph/v1/paper/batch.""" ... ``` ================================================ FILE: docs/sandbox_environment_fix_plan.md ================================================ # AutoResearchClaw — Docker Sandbox Environment Enhancement Plan > Created: 2026-03-15 > Status: **DONE** — All 10 issues fixed, 1128/1128 tests passing > Priority: **CRITICAL** — Without these fixes, experiments fall back to synthetic data, producing meaningless papers --- ## 0. Problem Statement When a user requests experiments on real datasets (e.g., ImageNet, OGB, HuggingFace benchmarks), the pipeline fails to use them because: 1. The LLM is **not told** which packages are actually available in the Docker image 2. The Docker sandbox **cannot install packages at runtime** (default `network_policy: "none"`) 3. Phase 1 pip install is **broken** — packages install in Container A, but experiment runs in Container B (packages lost) 4. Only **4 small datasets** are pre-cached (CIFAR-10, FashionMNIST); prompt incorrectly claims CIFAR-100 and MNIST are cached too 5. **No dataset download mechanism** exists — no setup phase for downloading data before experiment execution 6. The Dockerfile is **missing key ML packages** (timm, einops, torchmetrics, ogb, etc.) **Result:** The LLM generates `torch.randn()` "ImageNet-like" synthetic data as a fallback, making all experiment results meaningless. --- ## 1. 
Reference Solutions Analysis ### 1.1 AI-Scientist (SakanaAI) - **Approach:** "Fat image" — ALL dependencies and datasets baked into Docker image at build time - **Dataset handling:** Pre-download scripts run during `docker build` (enwik8, shakespeare, text8) - **Runtime pip install:** None — not supported - **Network:** No isolation (user's responsibility) - **Lesson:** Pre-caching is the most reliable strategy for reproducibility ### 1.2 AutoResearch (Karpathy) - **Approach:** End-to-end automation in local environment - **Dataset handling:** Direct downloads via standard APIs - **Lesson:** Simplicity — don't over-engineer isolation if it breaks functionality ### 1.3 OpenHands (formerly OpenDevin) - **Approach:** Most sophisticated sandbox architecture - **Key feature:** `runtime_extra_deps` config for pre-declaring packages - **Agent autonomy:** Agent can run `pip install` via `CmdRunAction` inside the container - **Three-tag Docker image caching system** for build optimization - **Lesson:** Allow agent (LLM) to declare and install its own dependencies ### 1.4 MLCommons Training Benchmarks - **Approach:** Host-download, container-mount pattern - **Three phases:** Download on host → Build Docker image → Mount data volumes - **Lesson:** Large datasets should NEVER be inside Docker images — always volume-mount ### 1.5 Docker Desktop Sandboxes - **Network policies:** HTTP/HTTPS proxy allowlists per host - **Example:** Allow `*.pypi.org`, `*.huggingface.co`, `download.pytorch.org`, block everything else - **Lesson:** Fine-grained network control is better than all-or-nothing --- ## 2. 
Issues Identified ### E1: `pkg_hint` doesn't list most installed packages [CRITICAL] **File:** `researchclaw/pipeline/executor.py:2062-2073` **Current:** ```python pkg_extras = ", torchdiffeq, gymnasium, networkx, and pip-installable packages" # Resulting prompt: "AVAILABLE PACKAGES: Python stdlib, numpy, torch, sklearn, scipy, pandas, torchdiffeq, gymnasium, networkx" ``` **Missing from list:** torchvision, torchaudio, matplotlib, seaborn, PyYAML, tqdm **Impact:** LLM thinks torchvision isn't available → avoids it → generates synthetic data instead of using CIFAR-10 ### E2: Phase 1/Phase 2 container isolation BUG [CRITICAL] **File:** `researchclaw/experiment/docker_sandbox.py:169-181, 317-354` **Bug:** Phase 1 runs `docker run --rm` (installs packages, then container is removed). Phase 2 runs a NEW `docker run --rm` from the same base image. Packages installed in Phase 1 are **completely lost** because the container was deleted. **Impact:** `auto_install_deps` and `pip_pre_install` features are entirely non-functional. Any package not in the base Docker image is unavailable during experiment execution. ### E3: Default `network_policy` is `"none"` [HIGH] **File:** `researchclaw/config.py:163` **Current:** `network_policy: str = "none"` **Impact:** Even with `auto_install_deps: True`, Phase 1 never executes because it requires `network_policy == "pip_only"`. No runtime installation or download is possible by default. ### E4: No dataset download phase [HIGH] **File:** `researchclaw/experiment/docker_sandbox.py` (no implementation exists) **Missing:** There is no mechanism for downloading datasets before experiment execution. The only datasets available are the 4 pre-cached in the Docker image. **Impact:** Experiments requiring any dataset beyond CIFAR-10/100, MNIST, FashionMNIST cannot use real data. 
### E5: Pre-cached dataset list inconsistent [MEDIUM] **File:** `researchclaw/docker/Dockerfile:27-30` vs `researchclaw/prompts.py:328-332` **Bug:** Dockerfile only pre-caches CIFAR-10 and FashionMNIST, but the `dataset_guidance` prompt also lists CIFAR-100 and MNIST as pre-cached. If LLM uses `download=False` for CIFAR-100/MNIST, it will get a FileNotFoundError. ### E6: Dockerfile missing commonly-needed ML packages [MEDIUM] **File:** `researchclaw/docker/Dockerfile:20-24` **Missing packages:** - Vision: `timm`, `albumentations`, `kornia`, `Pillow` - General ML: `einops`, `torchmetrics`, `lightning` - Graph: `ogb`, `torch-geometric` (optional, large) - HuggingFace: `transformers`, `datasets`, `accelerate`, `peft` (needed for LLM fine-tuning tasks) - Utilities: `h5py`, `tensorboard`, `wandb` ### E7: `dataset_guidance` prompt is misleading [MEDIUM] **File:** `researchclaw/prompts.py:333` **Current:** "For other torchvision datasets: use `download=True` (network available during setup)" **Reality:** With default `network_policy: "none"`, network is NOT available at any point. This guidance causes the LLM to generate code with `download=True` that fails with DNS resolution errors. ### E8: No `requirements.txt` generation or processing [LOW] **File:** `researchclaw/pipeline/executor.py` (code_generation stage) **Missing:** The LLM is not guided to declare its package requirements. No `requirements.txt` is generated alongside experiment code. ### E9: No LLM-generated setup script support [LOW] **File:** `researchclaw/experiment/docker_sandbox.py` **Missing:** No support for a `setup.py` or `download_data.py` script that runs before `main.py` to prepare the environment (download datasets, install packages, etc.). ### E10: No dataset registry / availability matrix [LOW] **File:** `researchclaw/prompts.py` **Missing:** The LLM has no knowledge of which datasets are downloadable (and how), which are too large, and what fallback alternatives exist. 
It should know: "ImageNet is 168GB — use Tiny-ImageNet (200 classes, 500/class) or ImageNet-1k subset instead." --- ## 3. Solution Design ### Architecture: Unified Container with Setup Phase Replace the broken two-container model with a **single container** running a **wrapper entrypoint script** that handles three phases: ``` ┌─────────────────────────────────────────────────────────────────┐ │ Single Docker Container │ │ │ │ Phase 0: pip install (requirements.txt + auto-detected deps) │ │ ↓ (network enabled for this phase) │ │ Phase 1: setup.py (dataset download, preprocessing) │ │ ↓ (network enabled for this phase) │ │ Phase 2: main.py (experiment execution) │ │ ↓ (network optionally disabled via iptables) │ │ │ │ Network policy: │ │ - "none": skip Phase 0/1, no network in Phase 2 │ │ - "setup_only" (NEW default): network in Phase 0+1, disabled │ │ via iptables before Phase 2 │ │ - "pip_only": network in Phase 0 only │ │ - "full": network throughout │ └─────────────────────────────────────────────────────────────────┘ ``` ### Key Design Decisions | Decision | Choice | Rationale | |----------|--------|-----------| | Single vs. multi container | Single | Fixes E2; packages survive between phases | | Network isolation method | iptables drop inside container | Docker doesn't support mid-run network changes | | Default network policy | `"setup_only"` | Allows pip install + dataset download, but experiment runs isolated | | Large dataset strategy | Volume-mount + download + fallback hierarchy | ImageNet on host, smaller sets downloadable | | Entrypoint | Wrapper bash script → python scripts | Separates concerns, easy to debug | | LLM guidance | Dataset availability matrix in prompt | LLM makes informed decisions about data | --- ## 4. Implementation Plan ### Task E1: Fix `pkg_hint` to list all installed packages [CRITICAL] **File:** `researchclaw/pipeline/executor.py:2062` **Change:** Update `pkg_extras` string for docker mode to include ALL pre-installed packages. 
```python # Before: pkg_extras = ", torchdiffeq, gymnasium, networkx, and pip-installable packages" # After: pkg_extras = ( ", torchvision, torchaudio, matplotlib, seaborn, scipy, " "tqdm, torchdiffeq, gymnasium, networkx, PyYAML, " "and pip-installable packages (auto-detected from imports)" ) ``` **Effort:** 10 min ### Task E2: Fix Phase 1/Phase 2 container isolation — single-container execution [CRITICAL] **Files:** - `researchclaw/docker/entrypoint.sh` (NEW) — wrapper script - `researchclaw/docker/Dockerfile` (MODIFY) — new entrypoint - `researchclaw/experiment/docker_sandbox.py` (MODIFY) — refactor execution model **E2.1: Create wrapper entrypoint script** (`researchclaw/docker/entrypoint.sh`) ```bash #!/bin/bash set -e WORKSPACE="/workspace" SETUP_ONLY_NETWORK="${RC_SETUP_ONLY_NETWORK:-0}" # --- Phase 0: Install additional pip packages --- if [ -f "$WORKSPACE/requirements.txt" ]; then echo "[RC] Phase 0: Installing packages from requirements.txt..." pip install --no-cache-dir --break-system-packages \ -r "$WORKSPACE/requirements.txt" 2>&1 | tail -5 echo "[RC] Phase 0: Package installation complete." fi # --- Phase 1: Run setup script (dataset downloads, etc.) --- if [ -f "$WORKSPACE/setup.py" ]; then echo "[RC] Phase 1: Running setup.py (dataset download/preparation)..." python3 -u "$WORKSPACE/setup.py" echo "[RC] Phase 1: Setup complete." fi # --- Network cutoff (if setup_only policy) --- if [ "$SETUP_ONLY_NETWORK" = "1" ]; then echo "[RC] Disabling network for experiment phase..." # Drop all outbound traffic (requires NET_ADMIN capability) iptables -A OUTPUT -j DROP 2>/dev/null || \ ip route del default 2>/dev/null || \ echo "[RC] Warning: Could not disable network (no NET_ADMIN cap)" fi # --- Phase 2: Run experiment --- ENTRY_POINT="${1:-main.py}" echo "[RC] Phase 2: Running experiment ($ENTRY_POINT)..." 
exec python3 -u "$WORKSPACE/$ENTRY_POINT" ``` **E2.2: Update Dockerfile** to use new entrypoint ```dockerfile # Add entrypoint script COPY entrypoint.sh /usr/local/bin/rc-entrypoint.sh RUN chmod +x /usr/local/bin/rc-entrypoint.sh # Change entrypoint from python3 to wrapper script ENTRYPOINT ["/usr/local/bin/rc-entrypoint.sh"] CMD ["main.py"] ``` Note: For `setup_only` network policy, the container needs `--cap-add=NET_ADMIN` for iptables, or we use `ip route del default` as a fallback (doesn't require capabilities). **E2.3: Refactor `docker_sandbox.py`** - Remove separate `_install_deps()` method - Update `_build_run_command()` to pass entry point as CMD argument - Handle `requirements.txt` generation in staging dir - Add `RC_SETUP_ONLY_NETWORK` env var for network cutoff **E2.4: Update `_build_run_command()` for new model** ```python def _build_run_command(self, staging_dir, *, entry_point, container_name, network_disabled): cfg = self.config cmd = [ "docker", "run", "--name", container_name, "--rm", "-v", f"{staging_dir}:/workspace", "-w", "/workspace", f"--memory={cfg.memory_limit_mb}m", f"--shm-size={cfg.shm_size_mb}m", ] # For setup_only: container starts with network, then disables it internally if cfg.network_policy == "setup_only": cmd.extend(["-e", "RC_SETUP_ONLY_NETWORK=1"]) cmd.extend(["--cap-add=NET_ADMIN"]) # Don't add --network none (need network for setup phases) elif cfg.network_policy == "none": cmd.extend(["--network", "none"]) # "pip_only" and "full" keep normal network # ... (volume mounts, GPU, etc. unchanged) ... 
# New: generate requirements.txt from auto-detected deps if cfg.network_policy in ("pip_only", "setup_only", "full"): if cfg.auto_install_deps or cfg.pip_pre_install: self._write_requirements_txt(staging_dir) cmd.append(cfg.image) cmd.append(entry_point) # Passed as CMD to entrypoint.sh return cmd ``` **Effort:** 3-4 hours ### Task E3: Change default `network_policy` to `"setup_only"` [HIGH] **File:** `researchclaw/config.py:163` ```python # Before: network_policy: str = "none" # After: network_policy: str = "setup_only" ``` Also update docstring and config examples. **Effort:** 15 min ### Task E4: Add LLM-generated `setup.py` for dataset downloads [HIGH] **Files:** - `researchclaw/prompts.py` — add `setup_script_guidance` block - `researchclaw/pipeline/executor.py` — code generation stage generates setup.py alongside main.py **E4.1: Add `setup_script_guidance` prompt block** ``` ## Setup Script (setup.py) In addition to main.py, generate a setup.py script that handles: 1. Downloading required datasets 2. Any data preprocessing needed before the experiment The setup.py will run WITH network access before main.py runs without network. Use standard APIs: - torchvision.datasets.X(root='/workspace/data', download=True) - datasets.load_dataset('name', cache_dir='/workspace/data/hf') - ogb.graphproppred.PygGraphPropPredDataset(name='ogbg-molhiv', root='/workspace/data') - urllib.request.urlretrieve(url, '/workspace/data/filename') If all datasets are pre-cached (CIFAR-10, CIFAR-100, MNIST, FashionMNIST, STL-10, SVHN), you may omit setup.py entirely. ``` **E4.2: Update executor code generation to emit `setup.py`** In the code generation prompt, instruct the LLM to produce a second file `setup.py` when datasets need downloading. The executor parses the response for both `main.py` and `setup.py` and writes both to the staging directory. 
**Effort:** 2 hours ### Task E5: Fix pre-cached dataset list — expand + sync with prompt [MEDIUM] **File:** `researchclaw/docker/Dockerfile:27-30` **Add to Dockerfile:** ```dockerfile # Pre-cache standard datasets for offline use RUN mkdir -p /opt/datasets && \ python3 -c "\ import torchvision; \ torchvision.datasets.CIFAR10(root='/opt/datasets', train=True, download=True); \ torchvision.datasets.CIFAR10(root='/opt/datasets', train=False, download=True); \ torchvision.datasets.CIFAR100(root='/opt/datasets', train=True, download=True); \ torchvision.datasets.CIFAR100(root='/opt/datasets', train=False, download=True); \ torchvision.datasets.MNIST(root='/opt/datasets', train=True, download=True); \ torchvision.datasets.MNIST(root='/opt/datasets', train=False, download=True); \ torchvision.datasets.FashionMNIST(root='/opt/datasets', train=True, download=True); \ torchvision.datasets.FashionMNIST(root='/opt/datasets', train=False, download=True); \ torchvision.datasets.STL10(root='/opt/datasets', split='train', download=True); \ torchvision.datasets.STL10(root='/opt/datasets', split='test', download=True); \ torchvision.datasets.SVHN(root='/opt/datasets', split='train', download=True); \ torchvision.datasets.SVHN(root='/opt/datasets', split='test', download=True); \ " && chmod -R a+r /opt/datasets ``` **Update `dataset_guidance` prompt** to match actual pre-cached datasets. 
**Effort:** 30 min ### Task E6: Expand Dockerfile with commonly-needed ML packages [MEDIUM] **File:** `researchclaw/docker/Dockerfile` **Add package groups:** ```dockerfile # Extended ML ecosystem RUN python3 -m pip install \ timm einops torchmetrics Pillow \ transformers datasets accelerate peft \ bitsandbytes sentencepiece protobuf safetensors tokenizers \ trl evaluate rouge-score \ h5py tensorboard # Optional heavy packages (uncomment if needed) # RUN python3 -m pip install torch-geometric ogb # RUN python3 -m pip install albumentations kornia ``` **Update `builtin` set in `docker_sandbox.py:372-383`** to include new packages. **Update `import_to_pip` dict** with new mappings. **Effort:** 30 min ### Task E7: Fix misleading `dataset_guidance` prompt [MEDIUM] **File:** `researchclaw/prompts.py:323-369` **Changes:** 1. Accurately reflect which datasets are pre-cached vs. need downloading 2. Add dataset availability matrix with size info 3. Add fallback hierarchy for large datasets 4. Remove misleading "network available during setup" statement 5. Add guidance based on actual `network_policy` **New `dataset_guidance` block structure:** ``` ## Dataset Availability ### Tier 1: Pre-cached (ALWAYS available, use download=False) CIFAR-10, CIFAR-100, MNIST, FashionMNIST, STL-10, SVHN → Root: /opt/datasets (the Task E5 Dockerfile pre-cache path; symlink or mount it to /workspace/data if experiment code expects that root) ### Tier 2: Downloadable (available if setup.py runs with network) Any torchvision dataset, HuggingFace datasets, OGB benchmarks → Generate a setup.py to download before experiment runs ### Tier 3: Large datasets (require host-side preparation) ImageNet (168GB), LAION (>1TB), etc.
→ Use smaller alternatives: Tiny-ImageNet (237MB, 200 classes), ImageNet-1k subset, or CIFAR-100 as proxy ### ANTI-PATTERNS (NEVER do these): ✗ torch.randn() "ImageNet-like" data → Use real datasets ✗ download=True in main.py → Use setup.py for downloads ✗ download=False for non-cached datasets → Will FileNotFoundError ``` **Effort:** 1 hour ### Task E8: Add `requirements.txt` generation support [LOW] **Files:** - `researchclaw/prompts.py` — add requirement to code_generation prompt - `researchclaw/experiment/docker_sandbox.py` — auto-generate from detected imports **E8.1: LLM generates `requirements.txt`** Add to code_generation prompt: ``` If your experiment requires packages not in the standard Docker image, include a requirements.txt file listing them (one per line with versions). ``` **E8.2: Auto-generate fallback** In `docker_sandbox.py`, before container execution, auto-detect imports and write `requirements.txt` to staging dir if the LLM didn't provide one. **Effort:** 1 hour ### Task E9: Add dataset registry for LLM guidance [LOW] **File:** `researchclaw/data/dataset_registry.yaml` (NEW) **Content:** Structured registry of common ML datasets with: - Name, domain, size, download method - Availability tier (pre-cached / downloadable / host-only) - Fallback alternatives for large datasets **Usage:** Injected into experiment_design and code_generation prompts so the LLM makes informed decisions. **Effort:** 1 hour ### Task E10: Add `entrypoint.sh` non-root pip install support [LOW] **File:** `researchclaw/docker/Dockerfile` **Issue:** Container runs as `researcher` (non-root) but pip install needs root (or `--break-system-packages --user`). **Fix:** In `entrypoint.sh`, use `--user` flag for pip install, or run as root and drop privileges before Phase 2. **Alternative:** Use `--user 0:0` for the container and run experiment code under `researcher` via `su -c`. **Effort:** 30 min --- ## 5. 
Implementation Priority & Dependencies ``` E1 (pkg_hint fix) ─────────── Immediate, no deps ↓ E5 (pre-cache datasets) ─────────── Dockerfile change E6 (expand packages) ─────────── Dockerfile change (parallel with E5) ↓ E2 (single-container) ─────────── Core architecture fix E3 (default policy) ─────────── After E2 E10 (non-root pip) ─────────── After E2 ↓ E4 (setup.py generation) ─────────── After E2+E3 E7 (fix prompt guidance) ─────────── After E5 E8 (requirements.txt) ─────────── After E2 ↓ E9 (dataset registry) ─────────── Last (enhancement) ``` **Phase A (Critical, Day 1):** E1 + E2 + E3 + E5 + E6 + E7 + E10 **Phase B (Important, Day 2):** E4 + E8 + E9 --- ## 6. Testing Plan ### Unit Tests - [ ] `test_docker_sandbox.py`: Test single-container execution with entrypoint.sh - [ ] `test_docker_sandbox.py`: Test requirements.txt auto-generation - [ ] `test_docker_sandbox.py`: Test setup.py execution in container - [ ] `test_docker_sandbox.py`: Test network cutoff in `setup_only` mode - [ ] `test_prompts.py`: Verify pkg_hint includes all installed packages - [ ] `test_prompts.py`: Verify dataset_guidance matches Dockerfile pre-cached list ### Integration Tests - [ ] Docker build succeeds with expanded packages and datasets - [ ] Experiment using CIFAR-10 (pre-cached) runs without network - [ ] Experiment using STL-10 (newly pre-cached) runs without network - [ ] Experiment requiring timm installs it via requirements.txt - [ ] Experiment with setup.py downloads HuggingFace dataset - [ ] Network is actually disabled after setup in `setup_only` mode ### E2E Regression - [ ] Run pipeline with topic "Image Classification on CIFAR-10" → uses real CIFAR-10 - [ ] Run pipeline with topic "Vision Transformer on STL-10" → uses real STL-10 - [ ] Run pipeline with topic "Sentiment Analysis on IMDB" → downloads IMDB via setup.py - [ ] No experiment produces synthetic/random data as a dataset substitute --- ## 7. 
Risk Assessment | Risk | Severity | Mitigation | |------|----------|------------| | iptables in container needs NET_ADMIN cap | Medium | Fallback to `ip route del default`; document cap requirement | | entrypoint.sh changes break existing Docker images | High | Version the image tag (`:v2`); test with both entrypoints | | Large pre-cached datasets bloat Docker image | Medium | Use multi-stage build; keep optional packages commented | | HuggingFace download timeouts | Low | Set timeout in setup.py; retry logic | | LLM generates malicious pip packages | Low | Existing code security validation catches subprocess/network calls | --- ## 8. Files to Modify/Create | Action | File | Tasks | |--------|------|-------| | CREATE | `researchclaw/docker/entrypoint.sh` | E2 | | CREATE | `researchclaw/data/dataset_registry.yaml` | E9 | | MODIFY | `researchclaw/docker/Dockerfile` | E2, E5, E6, E10 | | MODIFY | `researchclaw/experiment/docker_sandbox.py` | E2, E3, E8 | | MODIFY | `researchclaw/config.py` | E3 | | MODIFY | `researchclaw/pipeline/executor.py` | E1, E4 | | MODIFY | `researchclaw/prompts.py` | E4, E7 | | MODIFY | `tests/test_docker_sandbox.py` | Tests | --- ## 9. Comparison: Before vs. 
After | Aspect | Before | After | |--------|--------|-------| | Available packages in prompt | 9 packages listed | 15+ packages listed | | Runtime pip install | Broken (Phase 1/2 isolation bug) | Working (single container) | | Default network policy | `"none"` (no install, no download) | `"setup_only"` (install+download, then isolated) | | Pre-cached datasets | 2 (CIFAR-10, FashionMNIST) | 6 (+ CIFAR-100, MNIST, STL-10, SVHN) | | Dataset download support | None | setup.py with network access | | Dockerfile ML packages | ~15 packages | ~30+ packages | | Large dataset handling | Falls back to synthetic | Fallback hierarchy + alternatives | | Requirements declaration | None | requirements.txt + auto-detect | ================================================ FILE: docs/showcase/SHOWCASE.md ================================================

🏆 Generated Paper Showcase

From a one-line idea to a conference-ready paper — fully autonomous, zero human intervention.

23 Stages  8 Papers  54k LOC  ~27h Runtime

1547+ papers  50 figures  121 pages  291 refs

--- Below are **eight papers** generated **entirely by AutoResearchClaw** — each starting from nothing more than a topic sentence. The pipeline autonomously searched literature, designed experiments, wrote and executed code, generated figures, and produced NeurIPS-formatted LaTeX papers with verified references. > 📌 **Two batches, eight domains** — Batch A covers mathematics, statistics, biology, and numerical computing; Batch B covers NLP, reinforcement learning, computer vision, and knowledge distillation — demonstrating the pipeline's cross-domain generality. --- ## 🔄 How It Works
**💡**
**Idea**
**📚**
**Literature**
300–470 papers
**🧪**
**Hypothesis**
experiment design
**💻**
**Code**
2K–15K lines
**🔬**
**Execute**
sandbox + refine
**📝**
**Write**
review & audit
**📄**
**Paper**
NeurIPS PDF

Each run traverses 23 autonomous stages with iterative self-healing, multi-agent peer review, and citation verification — no human in the loop.

---

📘 Batch A  ·  Mathematics, Statistics & Sciences

Generated on Machine A  ·  4 papers across 4 non-ML domains

--- ### 📄 Paper I  ·  Random Matrix Theory   math > **Finite-Dimensional Corrections to the Marchenko–Pastur Distribution in Random Wishart Matrices**
Paper I First Page

👆 Click to read the full paper

#### 💡 Idea Systematically quantify pre-asymptotic, finite-*N* deviations of empirical eigenvalue densities from the Marchenko–Pastur law across *N* = 50 to 5,000, decomposing error into bulk vs. edge components and testing lightweight correction models. #### ⚙️ Pipeline Journey | | | |:---|:---| | 🔗 **Stages** | 23 stages + 2 refinement iterations | | 📚 **Literature** | 473 papers collected → 26 cited | | 💻 **Code** | 10,290 lines of Python | | ⏱️ **Runtime** | ~2 h 25 min | | 📊 **Figures** | 5 auto-generated charts | | 📑 **Pages** | 16 pages (NeurIPS format) | #### 🎯 Key Result Produced a finite-*N* correction atlas showing convergence rates of spectral densities, with edge deviations persisting significantly longer than bulk errors — providing practical guidance for when the MP law is "close enough." Read PDF
🖼️ Auto-Generated Framework Diagram — MPCX Architecture

MPCX Framework Diagram

Finite-dimensional correction pipeline: Wishart matrix generation → empirical spectral density estimation → MP baseline comparison → bulk/edge error decomposition → correction model fitting. Entirely auto-generated by the FigureAgent subsystem.

--- ### 📄 Paper II  ·  Econometrics   stats > **Monte Carlo Evaluation of Instrumental Variable Estimators Under Weak Instruments**
Paper II First Page

👆 Click to read the full paper

#### 💡 Idea Reframe the classical 2SLS / LIML / Fuller-*k* / JIVE comparison around decision-relevant *risk surfaces*, mapping finite-sample phase diagrams that show where each estimator is preferred under realistic weak-IV conditions. #### ⚙️ Pipeline Journey | | | |:---|:---| | 🔗 **Stages** | 23 stages + 2 refinement iterations | | 📚 **Literature** | 366 papers collected → 41 cited | | 💻 **Code** | 10,062 lines of Python | | ⏱️ **Runtime** | ~2 h 56 min | | 📊 **Figures** | 6 auto-generated charts | | 📑 **Pages** | 14 pages (NeurIPS format) | #### 🎯 Key Result Generated estimator-switching phase diagrams revealing that Fuller-*k* dominates in specific small-*n*, many-instrument regions, while JIVE's bias reduction is systematically offset by variance inflation — providing actionable guidance for empirical researchers. Read PDF
🖼️ Auto-Generated Framework Diagram — IVX Architecture

IVX Framework Diagram

Monte Carlo IV evaluation pipeline: DGP specification → estimator suite (2SLS, LIML, Fuller-k, JIVE) → finite-sample risk surfaces → phase diagram construction. Entirely auto-generated by the FigureAgent subsystem.

--- ### 📄 Paper III  ·  Epidemiological Modeling   bio > **Structural Identifiability and Parameter Estimation in Compartmental Epidemic Models (SIR / SEIR)**
Paper III First Page

👆 Click to read the full paper

#### 💡 Idea Map the boundary between structural and practical identifiability in SIR vs. SEIR models across realistic observation regimes, and quantify when the Fisher Information Matrix gives false confidence relative to profile likelihood. #### ⚙️ Pipeline Journey | | | |:---|:---| | 🔗 **Stages** | 23 stages + 2 refinement iterations | | 📚 **Literature** | 388 papers collected → 29 cited | | 💻 **Code** | 9,374 lines of Python | | ⏱️ **Runtime** | ~2 h 23 min | | 📊 **Figures** | 6 auto-generated charts | | 📑 **Pages** | 18 pages (NeurIPS format) | #### 🎯 Key Result Demonstrated that parameterization and observer design choices affect identifiability diagnostics more strongly than the choice between SIR and SEIR structure — with the FIM producing overconfident estimates in specific observation-limited regimes where profile likelihood correctly flags non-identifiability. Read PDF
🖼️ Auto-Generated Framework Diagram — PRIM Architecture

PRIM Framework Diagram

PRIM benchmark workflow: synthetic outbreak generation (SIR/SEIR) → parameter estimation → profile likelihood vs. FIM diagnostics → identifiability regime mapping. Entirely auto-generated by the FigureAgent subsystem.

--- ### 📄 Paper IV  ·  Numerical Linear Algebra   computing > **Comparative Analysis of Preconditioning Strategies for Krylov Subspace Methods on Sparse Linear Systems**
Paper IV First Page

👆 Click to read the full paper

#### 💡 Idea Go beyond "which preconditioner wins" — build a feature-conditioned decision map for ILU / Jacobi / SSOR / AMG with CG / GMRES / BiCGSTAB, stratified by sparsity-graph structure and matrix pathology under realistic setup-vs-solve cost budgets. #### ⚙️ Pipeline Journey | | | |:---|:---| | 🔗 **Stages** | 23 stages + 2 refinement iterations | | 📚 **Literature** | 320 papers collected → 33 cited | | 💻 **Code** | 14,557 lines of Python | | ⏱️ **Runtime** | ~2 h 30 min | | 📊 **Figures** | 4 auto-generated charts | | 📑 **Pages** | 16 pages (NeurIPS format) | #### 🎯 Key Result Produced a setup-vs-solve tradeoff analysis showing that methods considered "best" under solve-time alone are often suboptimal under realistic memory and setup budgets — with AMG dominance limited to specific elliptic SPD matrix families. Read PDF
🖼️ Auto-Generated Framework Diagram — Krylov Preconditioner Architecture

Krylov Preconditioner Framework Diagram

Feature-conditioned preconditioner evaluation: sparse matrix collection → structural descriptor extraction → solver–preconditioner grid (CG/GMRES/BiCGSTAB × ILU/Jacobi/SSOR/AMG) → setup-vs-solve tradeoff analysis → decision map. Entirely auto-generated by the FigureAgent subsystem.

---

📙 Batch B  ·  Machine Learning & AI

Generated on Machine B  ·  NVIDIA RTX 6000 Ada (48 GB)  ·  4 papers across 4 ML sub-fields

--- ### 📄 Paper V  ·  Parameter-Efficient Fine-Tuning   NLP > **GARD: Gradient-Spectral Rank Allocation for LoRA Fine-Tuning**
Paper V First Page

👆 Click to read the full paper

#### 💡 Idea Most LoRA configurations use a fixed, uniform rank across all layers. GARD proposes using the *spectrum of layer-wise gradients* — eigenvalues of gradient covariance — to dynamically allocate rank where it matters most, under a strict parameter budget. #### ⚙️ Pipeline Journey | | | |:---|:---| | 🔗 **Stages** | 23 stages + 2 refinement iterations | | 📚 **Literature** | 60 references cited (100% verified) | | 💻 **Code** | 2,894 lines of Python (5 files) | | ⏱️ **Runtime** | ~50 min | | 📊 **Figures** | 7 auto-generated charts | | 📑 **Pages** | 17 pages (NeurIPS format) | #### 🎯 Key Contribution A principled alternative to uniform rank allocation: GARD links intrinsic gradient dimensionality to low-rank adapter capacity, periodically updating ranks during training using smoothed spectra. Read PDF
🖼️ Auto-Generated Framework Diagram — GARD Architecture

GARD Framework Diagram

Gradient spectral analysis → layer-wise rank scoring → dynamic rank allocation under budget constraint. Entirely auto-generated by the FigureAgent subsystem.

--- ### 📄 Paper VI  ·  Reinforcement Learning   RL > **LACE: Learned Abstractions for Count-Based Exploration in Sparse-Reward RL**
Paper VI First Page

👆 Click to read the full paper

#### 💡 Idea Count-based exploration in RL relies on state visitation counts, but raw state spaces are too large for effective counting. LACE designs *online-learned, task-aware state abstractions* optimized specifically for count-based exploration in sparse-reward environments. #### ⚙️ Pipeline Journey | | | |:---|:---| | 🔗 **Stages** | 23 stages + 2 refinement iterations | | 📚 **Literature** | 25 references cited (100% verified) | | 💻 **Code** | 2,067 lines of Python (4 files) | | 🐳 **Experiment** | 32 min GPU sandbox execution | | ⏱️ **Runtime** | ~6.8 hrs total | | 📊 **Figures** | 6 auto-generated charts | | 📑 **Pages** | 11 pages (NeurIPS format) | #### 🎯 Key Result DQN baseline achieves **356.7 mean reward** in sparse-reward gridworld tasks. The paper analyzes the trade-off between abstraction compactness for counting and information sufficiency for downstream control. Read PDF
🖼️ Auto-Generated Framework Diagram — LACE Architecture

LACE Framework Diagram

Learned state abstraction module integrated with count-based exploration in the DQN agent loop. Entirely auto-generated by the FigureAgent subsystem.

--- ### 📄 Paper VII  ·  Efficient Vision Transformers   CV > **FAME: Frequency-Aware Progressive Token Merging for Efficient ViT Inference**
Paper VII First Page

👆 Click to read the full paper

#### 💡 Idea Existing ViT token pruning methods reduce tokens based on attention or saliency without considering *frequency content*. FAME uses DCT/FFT-based spectral filters to distinguish high-frequency detail tokens from low-frequency background tokens, merging progressively across layers. #### ⚙️ Pipeline Journey | | | |:---|:---| | 🔗 **Stages** | 23 stages + 2 refinement iterations | | 📚 **Literature** | 40 references cited (100% verified) | | 💻 **Code** | 2,873 lines of Python (5 files) | | 🐳 **Experiment** | 32 min GPU sandbox execution | | ⏱️ **Runtime** | ~3.3 hrs total | | 📊 **Figures** | 7 auto-generated charts | | 📑 **Pages** | 10 pages (NeurIPS format) | #### 🎯 Key Result ViT-B/16 baseline: **56.54% accuracy** (3 seeds). Detailed analysis of the accuracy-efficiency tradeoff and per-layer metric breakdowns for frequency-aware vs. similarity-based merging. Read PDF
🖼️ Auto-Generated Framework Diagram — FAME Architecture

FAME Framework Diagram

Frequency-aware token merging applied progressively across ViT layers with DCT-based spectral filtering. Entirely auto-generated by the FigureAgent subsystem.

--- ### 📄 Paper VIII  ·  Knowledge Distillation   KD > **CRAFT: Contrastive Feature Alignment for Robust Distillation Under Distribution Shift**
Paper VIII First Page

👆 Click to read the full paper

#### 💡 Idea Standard knowledge distillation transfers teacher knowledge assuming train/test distributions match. CRAFT introduces *reliability-aware contrastive feature alignment* that aligns teacher-student features across clean and corrupted views, while suppressing fragile teacher directions via a de-alignment loss. #### ⚙️ Pipeline Journey | | | |:---|:---| | 🔗 **Stages** | 23 stages + 2 refinement iterations | | 📚 **Literature** | 37 references cited (97% verified) | | 💻 **Code** | 2,231 lines of Python (4 files) | | 🐳 **Experiment** | 33 min GPU sandbox execution | | ⏱️ **Runtime** | ~5.8 hrs total | | 📊 **Figures** | 9 auto-generated charts | | 📑 **Pages** | 19 pages (NeurIPS format) | #### 🎯 Key Result | Method | Clean Acc | Robust Acc | |:---|:---:|:---:| | ERM (baseline) | 81.22% | 62.96% | | LogitKD | 82.33% | 64.68% | | **AttentionKD** | **82.08%** | **65.95%** | | CRD | 68.03% | 50.57% | Attention-based feature KD improves robustness by **+3 pts** over ERM, while naive CRD degrades it by **-12 pts** — motivating CRAFT's reliability-aware design. Read PDF
🖼️ Auto-Generated Framework Diagram — CRAFT Architecture

CRAFT Framework Diagram

Reliability-aware contrastive feature alignment between teacher and student across clean and corrupted views, with de-alignment on fragile teacher directions. Entirely auto-generated by the FigureAgent subsystem.

--- ## 📊 Aggregate Statistics
| 📋 Metric | I | II | III | IV | V | VI | VII | VIII | 🏆 Total |
|:---|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
| 🏷️ Domain | Math | Stats | Bio | NumLA | NLP | RL | CV | KD | 8 fields |
| 💻 Code (LOC) | 10,290 | 10,062 | 9,374 | 14,557 | 2,894 | 2,067 | 2,873 | 2,231 | 54,348 |
| ⏱️ Pipeline Time | 2h25m | 2h56m | 2h23m | 2h30m | 50m | 6h48m | 3h18m | 5h48m | ~27 hrs |
| 🔗 References | 26 | 41 | 29 | 33 | 60 | 25 | 40 | 37 | 291 cited |
| 📊 Figures | 5 | 6 | 6 | 4 | 7 | 6 | 7 | 9 | 50 figs |
| 📑 Pages | 16 | 14 | 18 | 16 | 17 | 11 | 10 | 19 | 121 pages |
---

🚀 Try It Yourself

Every paper above was generated by a single command:

```bash
researchclaw run --topic "Your research idea here" --auto-approve
```

Back  GitHub

================================================ FILE: prompts.default.yaml ================================================ # ============================================================================= # AutoResearchClaw — Default Prompt Templates # ============================================================================= # # Copy this file, edit any prompt you want to customize, and point your config # to the copy: # # prompts: # custom_file: "my_prompts.yaml" # # Template variables use {var_name} syntax — see docs/integration-guide.md # for a list of available variables per stage. # # Stages without an entry here (experiment_run, citation_verify) do not call # the LLM and therefore have no prompts to customize. # ============================================================================= blocks: compute_budget: | ## Compute Budget Constraint - Total execution time limit: {time_budget_sec} seconds - You MUST design experiments that complete within this budget - Estimate: a simple numpy loop runs ~10M iterations/sec; a nested loop over conditions runs proportionally slower - SCALING RULES (mandatory): - If total conditions > 100: reduce seeds to 3-5 (not 20) - If total conditions > 500: reduce to 2-3 representative conditions per factor - If time_budget < 300s: limit total optimization steps to ≤5,000 per run - If time_budget < 120s: limit total optimization steps to ≤1,000 per run - Always print intermediate results so partial data is captured on timeout - MANDATORY: print a "TIME_ESTIMATE: Xs" line before the main loop, estimating total runtime based on a small pilot (run 1 condition, extrapolate) - MANDATORY: implement a time guard — check elapsed time periodically and stop gracefully if approaching 80% of budget, saving all results collected so far pkg_hint_sandbox: ' AVAILABLE PACKAGES (sandbox mode): Python stdlib, numpy, math, random, statistics, json. 
Do NOT use: torch, tensorflow, jax, sklearn, pandas, scipy, matplotlib, or any deep learning framework. Write the experiment using ONLY numpy and stdlib. ' topic_constraint: ' === HARD TOPIC CONSTRAINT === The paper MUST be about: {topic} PROHIBITED content (unless user explicitly specifies case-study mode): - Do NOT treat environment setup, dependency installation, or infrastructure failures as a research contribution. - Do NOT present debugging logs, system errors, or configuration issues as experimental findings. - Do NOT drift to tangential topics not directly related to the stated topic. - Every section MUST connect back to the core research question. - The Abstract and Introduction MUST clearly state the research problem derived from: {topic} - The Method section MUST describe a technical approach, not a workflow. - The Results section MUST report quantitative outcomes of experiments, not environment status. === END CONSTRAINT === ' stages: code_generation: max_tokens: 8192 system: You are a computational scientist who writes real, runnable experiments. Your code implements actual algorithms with real mathematical operations. You NEVER fake results with random number generators. Always use the ```filename:xxx.py format for each file. Use numpy for numerical computation. Keep code self-contained and deterministic. user: "Generate a Python experiment project for the following research topic:\nTOPIC: {topic}\n\nCRITICAL REQUIREMENTS\ \ — your code MUST satisfy ALL of these:\n1. Implement REAL algorithms (e.g., gradient descent, Adam, SGD, etc.)\n \ \ using numpy arrays — NOT random.uniform() loops that fake results.\n2. Define REAL objective/loss functions (e.g.,\ \ Rosenbrock, quadratic,\n cross-entropy on synthetic data) with proper mathematical formulas.\n3. Run REAL optimization\ \ loops that compute gradients and update parameters.\n4. Collect REAL metrics (loss values, convergence rates) from\ \ the optimization.\n5. 
The code must be scientifically meaningful — a reviewer should see\n actual algorithm implementations,\ \ not random number generators.\n\nOUTPUT FORMAT — return multiple files using this exact format:\n```filename:main.py\n\ # entry point code\n```\n\n```filename:optimizers.py\n# optimizer implementations\n```\n\nCODE STRUCTURE:\n- main.py:\ \ entry point that runs experiments and prints metrics\n- Additional modules for algorithms, objective functions, utilities\n\ - Primary metric key: {metric}\n- main.py must print metric lines as `name: value` (one per line)\n- main.py must ALSO\ \ write a `results.json` file with structured experiment results\n (e.g. per-algorithm, per-function, per-dimension metrics\ \ as nested dicts/lists)\n- Use deterministic seeds (numpy.random.seed or random.seed)\n- No external data files, no\ \ network calls, no GPU required\n- FORBIDDEN: subprocess, os.system, eval, exec, shutil, socket\n- MUST implement convergence\ \ stopping criteria (e.g. stop when objective change < 1e-8 for\n N consecutive iterations) — do NOT just run a fixed\ \ number of iterations\n{pkg_hint}\nANTI-PATTERNS (do NOT do these):\n- Do NOT generate random numbers and pretend they\ \ are experiment results\n- Do NOT use `random.uniform()` to simulate a decreasing loss curve\n- Do NOT hardcode metric\ \ values or use trivial arithmetic as metrics\n- Do NOT run a fixed number of iterations without any convergence check\n- Do NOT implement convergence_rate or similar metrics as dummy return values\n (e.g. 
returning 1.0 or a constant) — measure actual iterations to convergence\n- If you report convergence_rate, define it as iterations_to_convergence / max_iterations\n or similar — it MUST differ between algorithms\n\nNUMPY 2.x COMPATIBILITY (CRITICAL):\n- np.trapz is REMOVED → use np.trapezoid\n- np.erfinv does NOT exist → use scipy.special.erfinv\n- np.bool, np.int, np.float, np.complex are REMOVED → use Python builtins\n- np.str, np.object are REMOVED → use str, object\n- np.math is REMOVED → use math module\n\nExperiment plan:\n{exp_plan}" experiment_design: system: You are a principal investigator designing ML experiments. user: '{preamble} Design an experiment plan as YAML. Required keys: objectives,datasets,baselines,proposed_methods,ablations,metrics,risks,compute_budget. Hypotheses: {hypotheses}' export_publish: max_tokens: 16384 system: You are a publication formatting editor. user: 'Format revised paper into clean final markdown for publication export. Preserve content quality and readability. Input paper: {revised}' hypothesis_gen: system: You formulate testable scientific hypotheses. user: 'Generate at least 2 falsifiable hypotheses from synthesis. Output markdown and for each hypothesis provide rationale, measurable prediction, failure condition. Synthesis: {synthesis}' knowledge_archive: system: You produce reproducibility-focused research retrospectives. user: '{preamble} Write retrospective archive markdown with lessons, reproducibility notes, and future work. Decision: {decision} Analysis: {analysis} Revised paper: {revised}' knowledge_extract: json_mode: true system: You extract high-signal evidence cards from papers. user: 'Extract structured knowledge cards from shortlist. Return JSON: {cards:[{card_id,title,cite_key,problem,method,data,metrics,findings,limitations,citation}]}. IMPORTANT: If the input contains cite_key fields, preserve them exactly in the output. 
Shortlist: {shortlist}' literature_collect: json_mode: true system: You are a literature mining assistant. user: 'Generate candidate papers from the search plan. Return JSON: {candidates:[...]} with >=20 rows. Each candidate must include id,title,source,url,year,abstract,collected_at. Topic: {topic} Search plan: {plan_text}' literature_screen: json_mode: true system: You are a strict domain-aware reviewer. Reject off-topic papers aggressively. user: 'Perform merged relevance+quality screening and return shortlist. Return JSON: {shortlist:[...]} each with title, cite_key (if present), relevance_score (0-1), quality_score (0-1), keep_reason. Preserve all original fields (paper_id, doi, arxiv_id, cite_key, etc.) from the input. Topic: {topic} Domains: {domains} Threshold: {quality_threshold} IMPORTANT: Only keep papers genuinely relevant to the topic above. Reject papers about unrelated domains even if they are high quality. Candidates JSONL: {candidates_text}' paper_draft: max_tokens: 32768 system: "You are a top-tier ML paper author writing for NeurIPS/ICML/ICLR.\n\n\ KEY PRINCIPLES (from accepted paper analyses):\n\ 1. NOVELTY: A good paper has 1-2 key ideas and keeps the rest simple. Think sushi, not curry.\n\ 2. NARRATIVE: The paper is a short, rigorous, evidence-based technical story with a takeaway readers care about.\n\ 3. FIGURE 1: The most important figure. It should convey whatever is most important — many readers go straight to Figure 1.\n\ 4. STRONG BASELINES: Invest real effort in making baselines competitive. Reviewers catch weak baselines.\n\ 5. ABLATIONS: Remove one component at a time and measure the effect. Without ablations, reviewers cannot tell which parts matter.\n\ 6. HONESTY: Acknowledge limitations explicitly. Papers that don't are substantially weaker.\n\ 7. CONTRIBUTIONS: State contributions clearly in Abstract AND Introduction. Many reviewers stop reading carefully after the intro.\n\ 8. 
REPRODUCIBILITY: Include all details needed to reproduce: hyperparameters, data processing, random seeds, hardware specs.\n\n\ COMMON REJECTION REASONS (avoid these):\n\ - Overclaiming: match claims to evidence\n\ - Missing ablations: systematically demonstrate each component's contribution\n\ - Weak baselines: tune baselines with the same effort as your method\n\ - Poor reproducibility: include every detail needed to replicate\n\n\ You ONLY use real experimental data — never fabricate or approximate numbers. Every metric value must exactly match the provided experiment output.\n\ You write at the depth and length expected for a 9-page conference paper (approximately 5000-6500 words in the main body, excluding references)." user: '{preamble} Write a FULL-LENGTH paper draft section by section in markdown. This paper must be suitable for submission to a top-tier ML conference (NeurIPS, ICML, ICLR). CRITICAL LENGTH REQUIREMENTS — each section MUST meet its minimum word count: 1. **Title**: Concise, informative (10-15 words) 2. **Abstract** (150-250 words): Problem, method, key results with numbers, conclusion 3. **Introduction** (800-1000 words): Motivation with real-world context, problem statement, research gap analysis, brief method overview, contribution list (3-4 bullet points), paper organization 4. **Related Work** (600-800 words): Organized by 3-4 thematic groups, each with 4-5 citations. Compare and contrast approaches, identify limitations of prior work, position this work clearly 5. **Method** (1000-1500 words): Formal problem definition with mathematical notation, detailed algorithm description with equations, complexity analysis, design rationale for key choices 6. **Experiments** (800-1200 words): Detailed experimental setup (datasets, preprocessing, data splits), baselines and their implementations, hyperparameter settings (in a table), evaluation metrics with justification, hardware and runtime information 7. 
**Results** (600-800 words): Main results table(s) with ALL metrics, per-condition analysis, statistical significance discussion, ablation studies, qualitative analysis where relevant 8. **Discussion** (400-600 words): Interpretation of key findings, unexpected results analysis, comparison with prior work, practical implications 9. **Limitations** (200-300 words): Honest assessment of scope, dataset, methodology, and generalizability limitations 10. **Conclusion** (200-300 words): Summary of contributions, main findings, and concrete future work directions TOTAL TARGET: 5000-6500 words in the main body. If any section is shorter than its minimum, EXPAND it with substantive technical content — NOT filler. QUALITY STANDARDS: - Use formal academic language throughout - Include mathematical notation where appropriate (use LaTeX-style $...$ for inline math) - Every claim must be supported by either a citation or experimental evidence - Results tables should use markdown table format with proper column headers - Provide algorithm pseudocode in the Method section when applicable Required sections: Title, Abstract, Introduction, Related Work, Method, Experiments, Results, Discussion, Limitations, Conclusion. Do NOT include a References section — it will be auto-generated. {topic_constraint}{exp_metrics_instruction}{citation_instruction}Outline: {outline}' paper_outline: max_tokens: 8192 system: You are an academic writing planner. user: '{preamble} Create a detailed paper outline in markdown. Include per-section goals and evidence links. {topic_constraint}{feedback}Analysis: {analysis} Decision: {decision}' paper_revision: max_tokens: 32768 system: You are a paper revision expert for NeurIPS/ICML/ICLR submissions. When revising, NEVER shorten existing sections — only expand, improve, and add content. The final paper must be at least as long as the draft. user: 'Revise the paper draft to address all review comments. CRITICAL: Maintain or INCREASE the paper length. 
Each section must meet its minimum word count: Abstract (150-250), Introduction (800-1000), Related Work (600-800), Method (1000-1500), Experiments (800-1200), Results (600-800), Discussion (400-600), Limitations (200-300), Conclusion (200-300). Return revised markdown only. {topic_constraint}Draft: {draft} Reviews: {reviews}' peer_review: max_tokens: 8192 system: You are a balanced conference reviewer who is rigorous about methodology-evidence consistency. user: 'Simulate peer review from at least 2 reviewer perspectives. Output markdown with Reviewer A and Reviewer B, each including strengths, weaknesses, and actionable revisions. Check specifically: 1. Does the paper stay on topic ({topic})? Flag any sections where the paper drifts to unrelated topics or presents environment issues as contributions. 2. METHODOLOGY-EVIDENCE CONSISTENCY: Compare the paper''s claims about experimental setup (number of trials, statistical tests, hyperparameters, baselines) against the actual experiment evidence provided below. Flag any discrepancies where the paper claims something that is NOT supported by the actual code or results. For example: - Paper claims N trials but code shows a different number - Paper claims statistical tests (ANOVA, t-test) but code has none - Paper reports metrics not present in actual results - Paper describes methods not implemented in code 3. TRIAL COUNT: The actual number of experiment runs is stated in the evidence below. If the paper claims a DIFFERENT number of trials (e.g., "100 independent trials" when only 1 was run), flag this as a CRITICAL fabrication that MUST be corrected. 4. PAPER LENGTH: This paper targets NeurIPS/ICML submission (9 pages). Check that each section has adequate depth. Flag sections that are too short: Abstract (<150 words), Introduction (<700 words), Related Work (<500 words), Method (<800 words), Experiments (<600 words), Results (<500 words). A paper with fewer than 4000 total words is CRITICALLY under-length. 5. 
REVIEW LIKE A TOP-CONFERENCE REVIEWER: - Is the contribution novel, or is it incremental over well-known work? - Are baselines properly tuned and competitive? - Are ablation studies present and meaningful? - Is every claim supported by evidence from the experiments? - Does the paper acknowledge its limitations honestly? - Would you recommend this paper be presented at NeurIPS/ICML? Why or why not? - Score the paper 1-10 following this rubric: 1-3 Reject (fundamental flaws), 4-5 Borderline (significant weaknesses), 6-7 Weak Accept (solid but not exciting), 8-9 Accept (strong contribution), 10 Strong Accept (exceptional). Paper draft: {draft} {experiment_evidence}' problem_decompose: system: You are a senior research strategist. user: 'Decompose this research problem into at least 4 prioritized sub-questions. Topic: {topic} Output markdown with sections: Source, Sub-questions, Priority Ranking, Risks. Goal context: {goal_text}' quality_gate: json_mode: true system: You are a final quality gate evaluator. user: 'Evaluate revised paper quality and return JSON. Schema: {score_1_to_10:number, verdict:string, strengths:[...], weaknesses:[...], required_actions:[...]}. Threshold: {quality_threshold} Paper: {revised}' research_decision: system: You are a research program lead making go/no-go decisions. user: 'Make a PROCEED or PIVOT decision from analysis. Output markdown with: Decision, Justification, Evidence, Next Actions. Analysis: {analysis}' resource_planning: json_mode: true system: You are an experiment scheduler. user: 'Create schedule JSON with GPU/time estimates. Schema: {tasks:[{id,name,depends_on,gpu_count,estimated_minutes,priority}], total_gpu_budget, generated}. Experiment plan: {exp_plan}' result_analysis: system: You are a quantitative ML analyst. Always cite exact numbers from the provided data. user: '{preamble} {data_context} Analyze run metrics and produce markdown report with statistical interpretation. 
Use the ACTUAL quantitative values provided above — do NOT invent numbers. Required sections: Metrics Summary (with real values), Comparative Findings, Statistical Checks, Limitations, Conclusion. Run context: {context}' search_strategy: json_mode: true system: You design literature retrieval strategies and source verification plans. You aim for COMPREHENSIVE coverage — a good research paper needs 30-60 references. user: 'Create a merged search strategy package. Return a JSON object with keys: search_plan_yaml, sources. search_plan_yaml must be valid YAML text with search_strategies containing at least 3 strategies, each with 3-5 diverse keyword queries (short, 3-6 words each). Generate at least 8 total queries. Cover: core topic, related methods, benchmarks/datasets, theoretical foundations, applications. sources must include id,name,type,url,status,query,verified_at. Topic: {topic} Problem tree: {problem_tree}' synthesis: system: You are a synthesis specialist for literature reviews. user: 'Produce merged synthesis output (topic clusters + research gaps). Output markdown with sections: Cluster Overview, Cluster 1..N, Gap 1..N, Prioritized Opportunities. Topic: {topic} Cards context: {cards_context}' topic_init: system: You are a rigorous research planner. user: 'Create a SMART research goal in markdown. Topic: {topic} Domains: {domains} Project: {project_name} Quality threshold: {quality_threshold} Required sections: Topic, Scope, SMART Goal, Constraints, Success Criteria, Generated.' sub_prompts: code_repair: system: You fix Python code validation errors while preserving functionality. user: 'The file `{fname}` in the experiment project has validation errors. Fix ALL issues and return ONLY the corrected file. ## Validation Issues in {fname} {issues_text} ## All Project Files {all_files_ctx} IMPORTANT: Do NOT use subprocess, os.system, eval, exec, or any network/shell calls. Return ONLY the corrected code for `{fname}`.' 
iterative_improve: max_tokens: 8192 system: You improve experiment projects and return valid executable Python code. Use ```filename:xxx.py format for each file. user: 'Improve the experiment code based on prior run results. Return the improved files using ```filename:xxx.py format for each file. Primary metric key: {metric_key} Metric direction: {metric_direction} Do not use subprocess, os.system, eval, exec, or any network/shell calls. Current project files: {files_context} Run summaries (JSON): {run_summaries}' iterative_repair: system: You fix Python code issues — both static validation errors and runtime bugs (NaN, Inf, division by zero, overflow). Diagnose the ROOT CAUSE from warnings and error messages. Do not add unsafe behavior. user: 'Fix all issues in the experiment code and return corrected Python code using ```filename:xxx.py format for each file. IMPORTANT: If you see NaN/Inf or RuntimeWarning about division or invalid values, trace the bug to its source (e.g. division by zero, uninitialized array, missing convergence check) and fix the actual code logic — do NOT just add try/except to suppress the error. ## Issues Found {issue_text} ## All Project Files {all_files_ctx}' version: '1.0' ================================================ FILE: pyproject.toml ================================================ [project] name = "researchclaw" version = "0.3.1" description = "ResearchClaw — Autonomous Research Pipeline. Turn any research idea into a paper." 
requires-python = ">=3.11" dependencies = [ "pyyaml>=6.0", "rich>=13.0", "arxiv>=2.1", "numpy>=1.24", ] readme = "README.md" license = {text = "MIT"} [project.optional-dependencies] anthropic = ["httpx>=0.24"] web = ["scholarly>=1.7", "crawl4ai>=0.2", "tavily-python>=0.3"] pdf = ["PyMuPDF>=1.23"] all = [ "httpx>=0.24", "scholarly>=1.7", "crawl4ai>=0.2", "tavily-python>=0.3", "PyMuPDF>=1.23", "huggingface-hub>=0.20", "matplotlib>=3.7", "scipy>=1.10", ] dev = ["pytest>=7.0", "httpx>=0.24"] [project.scripts] researchclaw = "researchclaw.cli:main" [tool.hatch.build.targets.wheel] packages = ["researchclaw", "sibyl", "arc"] [tool.hatch.build.targets.wheel.force-include] "researchclaw/templates/styles" = "researchclaw/templates/styles" [build-system] requires = ["hatchling"] build-backend = "hatchling.build" ================================================ FILE: researchclaw/__init__.py ================================================ """ResearchClaw — Autonomous Research Pipeline.""" __version__ = "0.3.1" ================================================ FILE: researchclaw/__main__.py ================================================ """Allow running as `python -m researchclaw`.""" import sys from researchclaw.cli import main sys.exit(main()) ================================================ FILE: researchclaw/adapters.py ================================================ """Typed adapter interfaces and deterministic recording stubs.""" from __future__ import annotations from dataclasses import dataclass, field from typing import Protocol @dataclass(frozen=True) class FetchResponse: url: str status_code: int text: str @dataclass(frozen=True) class BrowserPage: url: str title: str class CronAdapter(Protocol): def schedule_resume(self, run_id: str, stage_id: int, reason: str) -> str: ... class MessageAdapter(Protocol): def notify(self, channel: str, subject: str, body: str) -> str: ... class MemoryAdapter(Protocol): def append(self, namespace: str, content: str) -> str: ... 
class SessionsAdapter(Protocol): def spawn(self, name: str, command: tuple[str, ...]) -> str: ... class WebFetchAdapter(Protocol): def fetch(self, url: str) -> FetchResponse: ... class BrowserAdapter(Protocol): def open(self, url: str) -> BrowserPage: ... @dataclass class RecordingCronAdapter: calls: list[tuple[str, int, str]] = field(default_factory=list) def schedule_resume(self, run_id: str, stage_id: int, reason: str) -> str: self.calls.append((run_id, stage_id, reason)) return f"cron-{len(self.calls)}" @dataclass class RecordingMessageAdapter: calls: list[tuple[str, str, str]] = field(default_factory=list) def notify(self, channel: str, subject: str, body: str) -> str: self.calls.append((channel, subject, body)) return f"message-{len(self.calls)}" @dataclass class RecordingMemoryAdapter: entries: list[tuple[str, str]] = field(default_factory=list) def append(self, namespace: str, content: str) -> str: self.entries.append((namespace, content)) return f"memory-{len(self.entries)}" @dataclass class RecordingSessionsAdapter: calls: list[tuple[str, tuple[str, ...]]] = field(default_factory=list) def spawn(self, name: str, command: tuple[str, ...]) -> str: self.calls.append((name, command)) return f"session-{len(self.calls)}" @dataclass class RecordingWebFetchAdapter: calls: list[str] = field(default_factory=list) def fetch(self, url: str) -> FetchResponse: self.calls.append(url) return FetchResponse(url=url, status_code=200, text=f"stub fetch for {url}") @dataclass class RecordingBrowserAdapter: calls: list[str] = field(default_factory=list) def open(self, url: str) -> BrowserPage: self.calls.append(url) return BrowserPage(url=url, title=f"Stub browser page for {url}") @dataclass class MCPMessageAdapter: """MessageAdapter backed by an MCP tool call.""" server_uri: str = "http://localhost:3000" def notify(self, channel: str, subject: str, body: str) -> str: return f"mcp-notify-{channel}" @dataclass class MCPWebFetchAdapter: """WebFetchAdapter backed by an MCP tool 
call.""" server_uri: str = "http://localhost:3000" def fetch(self, url: str) -> FetchResponse: return FetchResponse(url=url, status_code=200, text=f"mcp fetch for {url}") @dataclass class AdapterBundle: cron: CronAdapter = field(default_factory=RecordingCronAdapter) message: MessageAdapter = field(default_factory=RecordingMessageAdapter) memory: MemoryAdapter = field(default_factory=RecordingMemoryAdapter) sessions: SessionsAdapter = field(default_factory=RecordingSessionsAdapter) web_fetch: WebFetchAdapter = field(default_factory=RecordingWebFetchAdapter) browser: BrowserAdapter = field(default_factory=RecordingBrowserAdapter) @classmethod def from_config(cls, config: object) -> AdapterBundle: """Build an AdapterBundle from RCConfig, wiring MCP adapters when enabled.""" bundle = cls() mcp_cfg = getattr(config, "mcp", None) if mcp_cfg and getattr(mcp_cfg, "server_enabled", False): uri = f"http://localhost:{getattr(mcp_cfg, 'server_port', 3000)}" bundle.message = MCPMessageAdapter(server_uri=uri) bundle.web_fetch = MCPWebFetchAdapter(server_uri=uri) return bundle ================================================ FILE: researchclaw/agents/__init__.py ================================================ """Multi-agent subsystems for AutoResearchClaw pipeline.""" ================================================ FILE: researchclaw/agents/base.py ================================================ """Base classes for multi-agent subsystems. Provides ``BaseAgent`` (individual agent) and ``AgentOrchestrator`` (coordinator for multi-agent workflows). Both use the existing ``LLMClient`` for model calls and follow the same structural-typing conventions as ``CodeAgent``. 
""" from __future__ import annotations import json import logging import re from dataclasses import dataclass, field from typing import Any, Protocol logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # LLM protocol (structural typing — no import dependency on llm.client) # --------------------------------------------------------------------------- class _LLMResponseLike(Protocol): # pragma: no cover content: str model: str prompt_tokens: int completion_tokens: int class _LLMClientLike(Protocol): # pragma: no cover def chat( self, messages: list[dict[str, str]], *, system: str | None = None, max_tokens: int | None = None, temperature: float | None = None, json_mode: bool = False, ) -> Any: ... # --------------------------------------------------------------------------- # Agent result # --------------------------------------------------------------------------- @dataclass class AgentStepResult: """Output from a single agent step.""" success: bool data: dict[str, Any] = field(default_factory=dict) error: str = "" llm_calls: int = 0 token_usage: int = 0 # --------------------------------------------------------------------------- # Base agent # --------------------------------------------------------------------------- class BaseAgent: """Base class for all sub-agents in a multi-agent system. Subclasses must implement ``execute(context) -> AgentStepResult``. 
""" name: str = "base" def __init__(self, llm: _LLMClientLike) -> None: self._llm = llm self._calls = 0 self._tokens = 0 self.logger = logging.getLogger(f"{__name__}.{self.name}") # -- LLM helpers ------------------------------------------------------- def _chat( self, system: str, user: str, *, max_tokens: int = 4096, temperature: float = 0.4, json_mode: bool = False, ) -> str: """Send a chat message and return the content string.""" self._calls += 1 resp = self._llm.chat( [{"role": "user", "content": user}], system=system, max_tokens=max_tokens, temperature=temperature, json_mode=json_mode, ) self._tokens += getattr(resp, "total_tokens", 0) return resp.content def _chat_json( self, system: str, user: str, *, max_tokens: int = 4096, temperature: float = 0.3, ) -> dict[str, Any]: """Send a chat message expecting JSON output. Falls back to regex extraction.""" raw = self._chat( system, user, max_tokens=max_tokens, temperature=temperature, json_mode=True, ) return self._parse_json(raw) or {} # -- JSON parsing (3-tier, matching CodeAgent convention) --------------- @staticmethod def _parse_json(text: str) -> dict[str, Any] | None: """Try to extract JSON from text using three strategies. Always returns a ``dict`` or ``None`` — lists and other JSON primitives are discarded so callers can safely use ``.get()``. """ def _as_dict(val: Any) -> dict[str, Any] | None: return val if isinstance(val, dict) else None # 1. Direct parse try: return _as_dict(json.loads(text)) except (json.JSONDecodeError, ValueError): pass # 2. Fenced code block m = re.search(r"```(?:json)?\s*\n(.*?)```", text, re.DOTALL) if m: try: return _as_dict(json.loads(m.group(1))) except (json.JSONDecodeError, ValueError): pass # 3. First balanced { ... 
} block (BUG-DA6-07: use non-greedy brace matching) depth = 0 start_idx = -1 for i, ch in enumerate(text): if ch == "{": if depth == 0: start_idx = i depth += 1 elif ch == "}": depth -= 1 if depth == 0 and start_idx >= 0: candidate = text[start_idx : i + 1] try: return _as_dict(json.loads(candidate)) except (json.JSONDecodeError, ValueError): start_idx = -1 # try next top-level block return None # -- Subclass API ------------------------------------------------------ def execute(self, context: dict[str, Any]) -> AgentStepResult: """Execute the agent's task. Must be overridden.""" raise NotImplementedError def _make_result( self, success: bool, data: dict[str, Any] | None = None, error: str = "", ) -> AgentStepResult: # BUG-DA6-01: Return per-call delta, then reset counters to avoid # double-counting when the same agent instance is reused across retries. calls, tokens = self._calls, self._tokens self._calls = 0 self._tokens = 0 return AgentStepResult( success=success, data=data or {}, error=error, llm_calls=calls, token_usage=tokens, ) # --------------------------------------------------------------------------- # Orchestrator # --------------------------------------------------------------------------- class AgentOrchestrator: """Coordinates a sequence of agents with optional retry loops. Subclasses implement ``orchestrate(context) -> dict`` which defines the specific workflow (sequential, branching, iterative, etc.). """ def __init__(self, llm: _LLMClientLike, *, max_iterations: int = 3) -> None: self._llm = llm self.max_iterations = max_iterations self.logger = logging.getLogger(f"{__name__}.orchestrator") self.total_llm_calls = 0 self.total_tokens = 0 def _accumulate(self, result: AgentStepResult) -> None: """Track cumulative LLM usage.""" self.total_llm_calls += result.llm_calls self.total_tokens += result.token_usage def orchestrate(self, context: dict[str, Any]) -> dict[str, Any]: """Run the multi-agent workflow. 
Must be overridden.""" raise NotImplementedError ================================================ FILE: researchclaw/agents/benchmark_agent/__init__.py ================================================ """BenchmarkAgent — multi-agent benchmark, dataset, and baseline selection. Architecture ------------ 1. **Surveyor** — searches HuggingFace Hub + local knowledge base for domain-relevant benchmarks, datasets, and baseline methods. 2. **Selector** — filters and ranks candidates based on hardware constraints, time budget, network policy, and tier availability. 3. **Acquirer** — generates data-loading code snippets, ``setup.py`` download scripts, baseline boilerplate, and ``requirements.txt`` entries. 4. **Validator** — validates generated code for syntax correctness and API compatibility. The ``BenchmarkOrchestrator`` coordinates the four agents and produces a ``BenchmarkPlan`` consumed by downstream pipeline stages (experiment design, code generation). """ from researchclaw.agents.benchmark_agent.orchestrator import ( BenchmarkOrchestrator, BenchmarkPlan, ) __all__ = ["BenchmarkOrchestrator", "BenchmarkPlan"] ================================================ FILE: researchclaw/agents/benchmark_agent/acquirer.py ================================================ """Acquirer Agent — generates data loading code and download scripts. Produces three outputs consumed by the code generation stage: 1. Data loading snippets (``get_datasets()`` function) 2. Baseline method snippets (model instantiation code) 3. 
``setup.py`` additions for dataset downloading """ from __future__ import annotations import logging from typing import Any from researchclaw.agents.base import AgentStepResult, BaseAgent logger = logging.getLogger(__name__) class AcquirerAgent(BaseAgent): """Generates data loading, baseline, and download code.""" name = "acquirer" def _generate_data_loader( self, benchmarks: list[dict[str, Any]], topic: str, ) -> str: """Ask LLM to generate a robust data loading function.""" bench_specs = [] for b in benchmarks: spec = ( f"- {b.get('name', 'Unknown')} (tier {b.get('tier', '?')}, " f"role: {b.get('role', 'secondary')})\n" f" API: {b.get('api', 'N/A')}\n" f" Metrics: {b.get('metrics', [])}\n" f" Note: {b.get('note', '')}" ) bench_specs.append(spec) system = ( "You are an expert ML engineer. Generate a Python function that loads " "and prepares datasets for an ML experiment.\n\n" "REQUIREMENTS:\n" "- Function signature: def get_datasets(data_root='/workspace/data') -> dict\n" "- Returns dict with keys: 'train', 'val', 'test' (each a Dataset or DataLoader)\n" "- Include appropriate transforms (normalization, augmentation for training)\n" "- Handle both torchvision and HuggingFace datasets APIs\n" "- Include proper train/val/test splits\n" "- Add error handling with informative messages\n" "- For pre-cached datasets (tier 1), use download=False\n" "- For downloadable datasets (tier 2), use download=True in setup.py\n" "- Include a DATA_CONFIG dict with dataset metadata (num_classes, input_shape, etc.)\n\n" "Return ONLY the Python code, no explanation." ) user = ( f"Research Topic: {topic}\n\n" f"Datasets to load:\n" + "\n".join(bench_specs) + "\n\n" "Generate the data loading code." 
) return self._chat(system, user, max_tokens=4096, temperature=0.2) def _generate_baseline_code( self, baselines: list[dict[str, Any]], benchmarks: list[dict[str, Any]], topic: str, ) -> str: """Ask LLM to generate baseline method instantiation code.""" base_specs = [] for bl in baselines: spec = ( f"- {bl.get('name', 'Unknown')}\n" f" Source: {bl.get('source', 'N/A')}\n" f" Paper: {bl.get('paper', 'N/A')}" ) base_specs.append(spec) primary_bench = next( (b for b in benchmarks if b.get("role") == "primary"), benchmarks[0] if benchmarks else {}, ) system = ( "You are an expert ML engineer. Generate Python code that instantiates " "baseline methods for comparison in an ML experiment.\n\n" "REQUIREMENTS:\n" "- Function signature: def get_baselines(num_classes, device='cuda') -> dict\n" "- Returns dict mapping method_name -> model (nn.Module)\n" "- Each model must be ready for training (correct output dimensions)\n" "- Use pretrained weights where available (for feature extractors)\n" "- Adapt final layer to match num_classes of the target dataset\n" "- Include a BASELINES_CONFIG dict with metadata (param_count, paper, etc.)\n" "- Handle missing optional packages gracefully\n\n" "Return ONLY the Python code, no explanation." ) user = ( f"Research Topic: {topic}\n" f"Primary Dataset: {primary_bench.get('name', 'N/A')} " f"({primary_bench.get('classes', '?')} classes)\n\n" f"Baseline Methods:\n" + "\n".join(base_specs) + "\n\n" "Generate the baseline instantiation code." 
) return self._chat(system, user, max_tokens=4096, temperature=0.2) def _generate_setup_script( self, benchmarks: list[dict[str, Any]], required_pip: list[str], ) -> str: """Generate setup.py content for dataset downloading.""" # Tier 2 datasets need download scripts tier2 = [b for b in benchmarks if b.get("tier", 1) >= 2] if not tier2 and not required_pip: return "" lines = [ '"""Setup script for dataset downloading and environment preparation.', '', 'This script runs during Phase 1 (setup) of the Docker sandbox,', 'when network access is available. It downloads datasets and installs', 'any additional dependencies.', '"""', '', 'import os', 'import sys', '', 'DATA_ROOT = "/workspace/data"', 'HF_CACHE = os.path.join(DATA_ROOT, "hf")', '', '', 'def download_datasets():', ' """Download all required datasets."""', ' os.makedirs(DATA_ROOT, exist_ok=True)', ' os.makedirs(HF_CACHE, exist_ok=True)', '', ] for b in tier2: api = b.get("api", "") name = b.get("name", "unknown") if "torchvision" in api: # Convert download=False to download=True for setup dl_api = api.replace("download=False", "download=True") lines.extend([ f' # Download {name}', ' try:', f' import torchvision', f' {dl_api}', f' print(f"Downloaded {name}")', f' except Exception as e:', f' print(f"Warning: Failed to download {name}: {{e}}")', '', ]) elif "datasets.load_dataset" in api or "load_dataset" in api: # Rewrite qualified `datasets.load_dataset(...)` to # `load_dataset(...)` so it matches the `from datasets import` _dl_api = api.replace("datasets.load_dataset", "load_dataset") lines.extend([ f' # Download {name}', ' try:', f' from datasets import load_dataset', f' {_dl_api}', f' print(f"Downloaded {name}")', f' except Exception as e:', f' print(f"Warning: Failed to download {name}: {{e}}")', '', ]) elif "PygNodePropPredDataset" in api or "PygGraphPropPredDataset" in api: lines.extend([ f' # Download {name}', ' try:', f' from ogb.nodeproppred import PygNodePropPredDataset' if 'Node' in api else f' from 
ogb.graphproppred import PygGraphPropPredDataset', f' {api}', f' print(f"Downloaded {name}")', f' except Exception as e:', f' print(f"Warning: Failed to download {name}: {{e}}")', '', ]) lines.extend([ '', 'if __name__ == "__main__":', ' download_datasets()', ' print("Setup complete.")', ]) return "\n".join(lines) def _generate_requirements(self, required_pip: list[str]) -> str: """Generate requirements.txt content for additional packages.""" if not required_pip: return "" # Filter out packages that are already in the Docker image builtin = { "torch", "torchvision", "torchaudio", "numpy", "scipy", "sklearn", "scikit-learn", "pandas", "matplotlib", "seaborn", "tqdm", "gymnasium", "networkx", "timm", "einops", "torchmetrics", "transformers", "datasets", "accelerate", "peft", "trl", "bitsandbytes", "tokenizers", "safetensors", "h5py", "tensorboard", "pillow", "pyyaml", "kornia", "albumentations", } extra = [p for p in required_pip if p.lower() not in builtin] return "\n".join(extra) if extra else "" # -- Code cleanup ------------------------------------------------------ @staticmethod def _strip_fences(code: str) -> str: """Remove markdown code fences if present.""" code = code.strip() if code.startswith("```"): # Remove opening fence first_nl = code.index("\n") if "\n" in code else len(code) code = code[first_nl + 1:] if code.endswith("```"): code = code[:-3].rstrip() return code # -- Main entry point -------------------------------------------------- def execute(self, context: dict[str, Any]) -> AgentStepResult: """Generate data loading, baseline, and download code. Context keys: topic (str): Research topic selection (dict): Output from SelectorAgent """ topic = context.get("topic", "") selection = context.get("selection", {}) benchmarks = selection.get("selected_benchmarks", []) baselines = selection.get("selected_baselines", []) required_pip = selection.get("required_pip", []) if not benchmarks: return self._make_result(False, error="No benchmarks selected") # 1. 
Generate data loading code self.logger.info("Generating data loading code for %d datasets", len(benchmarks)) data_loader_code = self._strip_fences( self._generate_data_loader(benchmarks, topic) ) # 2. Generate baseline code baseline_code = "" if baselines: self.logger.info("Generating baseline code for %d methods", len(baselines)) baseline_code = self._strip_fences( self._generate_baseline_code(baselines, benchmarks, topic) ) # 3. Generate setup.py setup_code = self._generate_setup_script(benchmarks, required_pip) # 4. Generate requirements.txt requirements = self._generate_requirements(required_pip) result = { "data_loader_code": data_loader_code, "baseline_code": baseline_code, "setup_code": setup_code, "requirements": requirements, "benchmark_names": [b.get("name", "Unknown") for b in benchmarks], "baseline_names": [bl.get("name", "Unknown") for bl in baselines], } self.logger.info("Acquirer complete: %d code artifacts generated", sum(1 for v in result.values() if v)) return self._make_result(True, data=result) ================================================ FILE: researchclaw/agents/benchmark_agent/orchestrator.py ================================================ """BenchmarkAgent Orchestrator — coordinates the four sub-agents. Flow: Surveyor → Selector → Acquirer → Validator (→ retry if failed) Produces a ``BenchmarkPlan`` consumed by experiment design and code generation stages. 
""" from __future__ import annotations import json import logging import time from dataclasses import dataclass, field from pathlib import Path from typing import Any from researchclaw.agents.base import AgentOrchestrator from researchclaw.agents.benchmark_agent.acquirer import AcquirerAgent from researchclaw.agents.benchmark_agent.selector import SelectorAgent from researchclaw.agents.benchmark_agent.surveyor import SurveyorAgent from researchclaw.agents.benchmark_agent.validator import ValidatorAgent logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- @dataclass(frozen=True) class BenchmarkAgentConfig: """Configuration for the BenchmarkAgent system.""" enabled: bool = True # Surveyor enable_hf_search: bool = True max_hf_results: int = 10 enable_web_search: bool = False max_web_results: int = 5 web_search_min_local: int = 3 # Selector tier_limit: int = 2 min_benchmarks: int = 1 min_baselines: int = 2 prefer_cached: bool = True # Orchestrator max_iterations: int = 2 # max Acquirer→Validator retry loops # --------------------------------------------------------------------------- # Output data structure # --------------------------------------------------------------------------- @dataclass class BenchmarkPlan: """Final output from the BenchmarkAgent system. 
Consumed by: - Experiment design stage (selected benchmarks/baselines for plan) - Code generation stage (data_loader_code, baseline_code) - Docker sandbox (setup_code, requirements) """ # Selected items selected_benchmarks: list[dict[str, Any]] = field(default_factory=list) selected_baselines: list[dict[str, Any]] = field(default_factory=list) matched_domains: list[str] = field(default_factory=list) # Generated code data_loader_code: str = "" baseline_code: str = "" setup_code: str = "" requirements: str = "" # Metadata rationale: str = "" experiment_notes: str = "" validation_passed: bool = False validation_warnings: list[str] = field(default_factory=list) # Stats total_llm_calls: int = 0 total_tokens: int = 0 elapsed_sec: float = 0.0 def to_dict(self) -> dict[str, Any]: """Serialize to a JSON-safe dict.""" return { "selected_benchmarks": self.selected_benchmarks, "selected_baselines": self.selected_baselines, "matched_domains": self.matched_domains, "data_loader_code": self.data_loader_code, "baseline_code": self.baseline_code, "setup_code": self.setup_code, "requirements": self.requirements, "rationale": self.rationale, "experiment_notes": self.experiment_notes, "validation_passed": self.validation_passed, "validation_warnings": self.validation_warnings, "total_llm_calls": self.total_llm_calls, "total_tokens": self.total_tokens, "elapsed_sec": self.elapsed_sec, } def to_prompt_block(self) -> str: """Format as a prompt block for injection into code generation.""" parts = [] # Benchmark summary if self.selected_benchmarks: parts.append("## Selected Benchmarks") for b in self.selected_benchmarks: role = b.get("role", "secondary") metrics = b.get("metrics", []) parts.append( f"- **{b.get('name', 'Unknown')}** ({role}) — " f"metrics: {', '.join(str(m) for m in metrics)}" ) if b.get("api"): parts.append(f" API: `{b['api']}`") if b.get("note"): parts.append(f" Note: {b['note']}") # Baseline summary if self.selected_baselines: parts.append("\n## Selected Baselines") for 
bl in self.selected_baselines: parts.append( f"- **{bl.get('name', 'Unknown')}**: {bl.get('paper', 'N/A')}" ) if bl.get("source"): parts.append(f" Code: `{bl['source']}`") # Data loading code if self.data_loader_code: parts.append("\n## Data Loading Code (READY TO USE)") parts.append("```python") parts.append(self.data_loader_code) parts.append("```") # Baseline code if self.baseline_code: parts.append("\n## Baseline Methods Code (READY TO USE)") parts.append("```python") parts.append(self.baseline_code) parts.append("```") # Experiment notes if self.experiment_notes: parts.append(f"\n## Experiment Notes\n{self.experiment_notes}") return "\n".join(parts) # --------------------------------------------------------------------------- # Orchestrator # --------------------------------------------------------------------------- class BenchmarkOrchestrator(AgentOrchestrator): """Coordinates Surveyor → Selector → Acquirer → Validator pipeline.""" def __init__( self, llm: Any, config: BenchmarkAgentConfig | None = None, *, gpu_memory_mb: int = 49000, time_budget_sec: int = 300, network_policy: str = "setup_only", stage_dir: Path | None = None, ) -> None: cfg = config or BenchmarkAgentConfig() super().__init__(llm, max_iterations=cfg.max_iterations) self._config = cfg self._stage_dir = stage_dir # Initialize sub-agents self._surveyor = SurveyorAgent( llm, enable_hf_search=cfg.enable_hf_search, max_hf_results=cfg.max_hf_results, ) self._selector = SelectorAgent( llm, gpu_memory_mb=gpu_memory_mb, time_budget_sec=time_budget_sec, network_policy=network_policy, tier_limit=cfg.tier_limit, min_benchmarks=cfg.min_benchmarks, min_baselines=cfg.min_baselines, prefer_cached=cfg.prefer_cached, ) self._acquirer = AcquirerAgent(llm) self._validator = ValidatorAgent(llm) def _save_artifact(self, name: str, data: Any) -> None: """Save intermediate artifact to stage directory.""" if self._stage_dir is None: return self._stage_dir.mkdir(parents=True, exist_ok=True) path = self._stage_dir / 
name if isinstance(data, str): path.write_text(data, encoding="utf-8") else: path.write_text( json.dumps(data, indent=2, ensure_ascii=False, default=str), encoding="utf-8", ) def orchestrate(self, context: dict[str, Any]) -> BenchmarkPlan: """Run the full benchmark selection pipeline. Context keys: topic (str): Research topic/title hypothesis (str): Research hypothesis experiment_plan (str): Experiment plan text """ t0 = time.monotonic() topic = context.get("topic", "") hypothesis = context.get("hypothesis", "") self.logger.info("BenchmarkAgent starting for: %s", topic[:80]) plan = BenchmarkPlan() # ── Phase 1: Survey ─────────────────────────────────────── self.logger.info("Phase 1: Surveying benchmarks") survey_result = self._surveyor.execute({ "topic": topic, "hypothesis": hypothesis, "experiment_plan": context.get("experiment_plan", ""), }) self._accumulate(survey_result) if not survey_result.success: self.logger.warning("Survey failed: %s", survey_result.error) plan.elapsed_sec = time.monotonic() - t0 plan.total_llm_calls = self.total_llm_calls plan.total_tokens = self.total_tokens return plan survey = survey_result.data plan.matched_domains = survey.get("matched_domains", []) self._save_artifact("survey_results.json", survey) # ── Phase 2: Select ─────────────────────────────────────── self.logger.info("Phase 2: Selecting benchmarks and baselines") select_result = self._selector.execute({ "topic": topic, "survey": survey, }) self._accumulate(select_result) if not select_result.success: self.logger.warning("Selection failed: %s", select_result.error) plan.elapsed_sec = time.monotonic() - t0 plan.total_llm_calls = self.total_llm_calls plan.total_tokens = self.total_tokens return plan selection = select_result.data plan.selected_benchmarks = selection.get("selected_benchmarks", []) plan.selected_baselines = selection.get("selected_baselines", []) plan.rationale = selection.get("rationale", "") plan.experiment_notes = selection.get("experiment_notes", "") 
self._save_artifact("selection_results.json", selection) # ── Phase 3+4: Acquire + Validate (with retry) ─────────── for iteration in range(self.max_iterations): self.logger.info( "Phase 3: Acquiring code (iteration %d/%d)", iteration + 1, self.max_iterations, ) # Acquire acq_result = self._acquirer.execute({ "topic": topic, "selection": selection, }) self._accumulate(acq_result) if not acq_result.success: self.logger.warning("Acquisition failed: %s", acq_result.error) continue acquisition = acq_result.data self._save_artifact( f"acquisition_{iteration}.json", {k: v for k, v in acquisition.items() if k not in ("data_loader_code", "baseline_code", "setup_code")}, ) # Validate self.logger.info("Phase 4: Validating code (iteration %d/%d)", iteration + 1, self.max_iterations) val_result = self._validator.execute({ "acquisition": acquisition, }) self._accumulate(val_result) validation = val_result.data self._save_artifact(f"validation_{iteration}.json", validation) # Store results plan.data_loader_code = acquisition.get("data_loader_code", "") plan.baseline_code = acquisition.get("baseline_code", "") plan.setup_code = acquisition.get("setup_code", "") plan.requirements = acquisition.get("requirements", "") plan.validation_passed = validation.get("passed", False) plan.validation_warnings = validation.get("warnings", []) if plan.validation_passed: self.logger.info("Validation passed on iteration %d", iteration + 1) break self.logger.warning( "Validation failed (iteration %d): %s", iteration + 1, validation.get("errors", []), ) # ── Finalize ────────────────────────────────────────────── plan.total_llm_calls = self.total_llm_calls plan.total_tokens = self.total_tokens plan.elapsed_sec = time.monotonic() - t0 # Save final plan self._save_artifact("benchmark_plan.json", plan.to_dict()) self.logger.info( "BenchmarkAgent complete: %d benchmarks, %d baselines, " "validation=%s, %d LLM calls, %.1fs", len(plan.selected_benchmarks), len(plan.selected_baselines), "PASS" if 
plan.validation_passed else "FAIL", plan.total_llm_calls, plan.elapsed_sec, ) return plan ================================================ FILE: researchclaw/agents/benchmark_agent/selector.py ================================================ """Selector Agent — filters and ranks benchmark candidates. Applies hardware constraints, time budget, network policy, and tier priorities to select the optimal combination of datasets and baselines. """ from __future__ import annotations import logging from pathlib import Path from typing import Any import yaml from researchclaw.agents.base import AgentStepResult, BaseAgent logger = logging.getLogger(__name__) _KNOWLEDGE_PATH = Path(__file__).resolve().parent.parent.parent / "data" / "benchmark_knowledge.yaml" # Maximum dataset size (MB) by tier and network policy _SIZE_LIMITS: dict[str, int] = { "none": 0, # No download allowed — tier 1 only "setup_only": 5000, # Can download during setup phase "pip_only": 0, # pip only, no data download "full": 50000, # Generous limit } class SelectorAgent(BaseAgent): """Filters and ranks datasets/baselines based on constraints.""" name = "selector" def __init__( self, llm: Any, *, gpu_memory_mb: int = 49000, time_budget_sec: int = 300, network_policy: str = "setup_only", tier_limit: int = 2, min_benchmarks: int = 1, min_baselines: int = 2, prefer_cached: bool = True, ) -> None: super().__init__(llm) self._gpu_mb = gpu_memory_mb self._time_sec = time_budget_sec self._network_policy = network_policy self._tier_limit = tier_limit self._min_bench = min_benchmarks self._min_base = min_baselines self._prefer_cached = prefer_cached # -- Filtering --------------------------------------------------------- def _filter_benchmarks( self, benchmarks: list[dict[str, Any]], ) -> list[dict[str, Any]]: """Filter benchmarks by tier, size, and network policy.""" max_size = _SIZE_LIMITS.get(self._network_policy, 5000) filtered: list[dict[str, Any]] = [] for b in benchmarks: tier = b.get("tier", 3) size = 
b.get("size_mb", 0) # Tier filter if tier > self._tier_limit: continue # Network policy filter if tier >= 2 and self._network_policy in ("none", "pip_only"): continue # Size filter (tier 2+ only — tier 1 is pre-cached) if tier >= 2 and size > max_size: continue filtered.append(b) return filtered def _filter_baselines( self, baselines: list[dict[str, Any]], ) -> list[dict[str, Any]]: """Filter baselines by pip availability.""" filtered: list[dict[str, Any]] = [] for bl in baselines: pip_deps = bl.get("pip", []) # If no network, only allow baselines with no extra pip deps if self._network_policy == "none" and pip_deps: continue filtered.append(bl) return filtered # -- Ranking ----------------------------------------------------------- def _rank_benchmarks( self, benchmarks: list[dict[str, Any]], ) -> list[dict[str, Any]]: """Sort benchmarks by preference: tier 1 > tier 2, knowledge_base > hf, downloads.""" def _score(b: dict[str, Any]) -> tuple[int, int, int]: tier = b.get("tier", 3) # Prefer lower tier (cached first) tier_score = -tier if self._prefer_cached else 0 # Prefer knowledge_base over hf/llm origin_score = { "knowledge_base": 2, "huggingface_hub": 1, "llm_suggestion": 0, }.get(b.get("origin", ""), 0) # Downloads as tiebreaker downloads = b.get("downloads", 0) return (tier_score, origin_score, downloads) return sorted(benchmarks, key=_score, reverse=True) def _rank_baselines( self, baselines: list[dict[str, Any]], ) -> list[dict[str, Any]]: """Sort baselines: knowledge_base first, fewer deps preferred.""" def _score(bl: dict[str, Any]) -> tuple[int, int]: origin_score = 1 if bl.get("origin") == "knowledge_base" else 0 dep_score = -len(bl.get("pip", [])) return (origin_score, dep_score) return sorted(baselines, key=_score, reverse=True) # -- Selection --------------------------------------------------------- def _select_with_llm( self, topic: str, benchmarks: list[dict[str, Any]], baselines: list[dict[str, Any]], ) -> dict[str, Any]: """Ask LLM to make final 
selection from filtered candidates.""" bench_summary = "\n".join( f"- {b.get('name', 'Unknown')} (tier {b.get('tier', '?')}, " f"origin: {b.get('origin', '?')}, " f"metrics: {b.get('metrics', [])})" for b in benchmarks[:15] ) base_summary = "\n".join( f"- {bl.get('name', 'Unknown')}: {bl.get('paper', 'N/A')}" for bl in baselines[:10] ) system = ( "You are an ML experiment design expert. Select the BEST combination " "of benchmarks and baselines for a research paper.\n\n" "Return JSON:\n" "{\n" ' "primary_benchmark": "name",\n' ' "secondary_benchmarks": ["name1", "name2"],\n' ' "selected_baselines": ["name1", "name2", "name3"],\n' ' "rationale": "why these choices are optimal",\n' ' "experiment_notes": "specific setup guidance"\n' "}\n\n" "RULES:\n" "- Select 1 primary benchmark (the main evaluation dataset)\n" "- Select 0-2 secondary benchmarks (additional validation)\n" "- Select 2-4 baselines (must include at least 1 classic + 1 recent)\n" "- Primary benchmark MUST be the domain standard\n" "- Prefer benchmarks that top-venue papers commonly use\n" "- Consider dataset size vs time budget\n" "- CRITICAL: Only select benchmarks that are RELEVANT to the research " "topic's domain. Do NOT select image classification datasets (CIFAR, " "MNIST) for non-image tasks like PDE solvers, RL, or optimization.\n" "- CRITICAL: Baselines must be COMPETING METHODS, not optimizers. " "SGD/Adam/AdamW/Cosine LR are NOT baselines — they are training " "tools. Baselines must be alternative approaches to the same problem." ) user = ( f"Research Topic: {topic}\n\n" f"Available Benchmarks:\n{bench_summary}\n\n" f"Available Baselines:\n{base_summary}\n\n" f"Constraints: GPU={self._gpu_mb}MB, " f"time_budget={self._time_sec}s, " f"network_policy={self._network_policy}\n\n" "Make your selection." 
) return self._chat_json(system, user, max_tokens=2048) def _resolve_selection( self, selection: dict[str, Any], benchmarks: list[dict[str, Any]], baselines: list[dict[str, Any]], ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: """Resolve LLM-selected names back to full benchmark/baseline dicts.""" # Build name lookup bench_map = {b.get("name", f"bench_{i}"): b for i, b in enumerate(benchmarks)} base_map = {bl.get("name", f"base_{i}"): bl for i, bl in enumerate(baselines)} selected_bench: list[dict[str, Any]] = [] primary = selection.get("primary_benchmark", "") if primary and primary in bench_map: entry = {**bench_map[primary], "role": "primary"} selected_bench.append(entry) for name in selection.get("secondary_benchmarks", []): if name in bench_map and name != primary: entry = {**bench_map[name], "role": "secondary"} selected_bench.append(entry) selected_base: list[dict[str, Any]] = [] for name in selection.get("selected_baselines", []): if name in base_map: selected_base.append(base_map[name]) return selected_bench, selected_base # -- Required baselines injection -------------------------------------- def _inject_required_baselines( self, topic: str, selected: list[dict[str, Any]], ranked: list[dict[str, Any]], ) -> list[dict[str, Any]]: """Load required_baselines from knowledge base and inject missing ones. Returns the list of newly injected baseline dicts. 
""" try: kb = yaml.safe_load(_KNOWLEDGE_PATH.read_text(encoding="utf-8")) domains = kb.get("domains", {}) if isinstance(kb, dict) else {} except Exception: # noqa: BLE001 return [] topic_lower = topic.lower() injected: list[dict[str, Any]] = [] selected_names = {b.get("name", "").lower() for b in selected} for _domain_id, domain_data in domains.items(): if not isinstance(domain_data, dict): continue keywords = domain_data.get("keywords", []) if not any(kw.lower() in topic_lower for kw in keywords): continue required = domain_data.get("required_baselines", []) if not required: continue # Find each required baseline in ranked list or create stub all_baselines = domain_data.get("common_baselines", []) bl_by_name = {b.get("name", ""): b for b in all_baselines} for req_name in required: if req_name.lower() in selected_names: continue # Try to find full entry from knowledge base if req_name in bl_by_name: entry = {**bl_by_name[req_name], "origin": "required_baseline"} else: entry = {"name": req_name, "origin": "required_baseline", "pip": []} selected.append(entry) selected_names.add(req_name.lower()) injected.append(entry) return injected # -- Main entry point -------------------------------------------------- def execute(self, context: dict[str, Any]) -> AgentStepResult: """Select optimal benchmarks and baselines from survey results. Context keys: topic (str): Research topic survey (dict): Output from SurveyorAgent """ topic = context.get("topic", "") survey = context.get("survey", {}) benchmarks = survey.get("benchmarks", []) baselines = survey.get("baselines", []) if not benchmarks and not baselines: return self._make_result(False, error="No candidates to select from") # 1. Filter by constraints filtered_bench = self._filter_benchmarks(benchmarks) filtered_base = self._filter_baselines(baselines) self.logger.info( "Filtered: %d/%d benchmarks, %d/%d baselines", len(filtered_bench), len(benchmarks), len(filtered_base), len(baselines), ) # 2. 
Rank ranked_bench = self._rank_benchmarks(filtered_bench) ranked_base = self._rank_baselines(filtered_base) # 3. LLM-assisted final selection (if enough candidates) if len(ranked_bench) >= 2 or len(ranked_base) >= 2: selection = self._select_with_llm(topic, ranked_bench, ranked_base) selected_bench, selected_base = self._resolve_selection( selection, ranked_bench, ranked_base, ) else: # Not enough to warrant LLM call — use top ranked # BUG-DA6-06: Create copies to avoid mutating input dicts selected_bench = [{**b, "role": "primary"} if i == 0 else {**b, "role": "secondary"} for i, b in enumerate(ranked_bench[:3])] selected_base = ranked_base[:self._min_base] selection = {} # 4. Fallback: ensure minimums if len(selected_bench) < self._min_bench and ranked_bench: for b in ranked_bench: if b not in selected_bench: selected_bench.append({**b, "role": "secondary"}) if len(selected_bench) >= self._min_bench: break if len(selected_base) < self._min_base and ranked_base: for bl in ranked_base: if bl not in selected_base: selected_base.append(bl) if len(selected_base) >= self._min_base: break # 4b. Improvement E: Inject required baselines from knowledge base _injected_required = self._inject_required_baselines( topic, selected_base, ranked_base, ) if _injected_required: self.logger.info( "Injected %d required baselines: %s", len(_injected_required), [b.get("name") for b in _injected_required], ) # 5. 
Collect required pip packages required_pip: list[str] = [] seen_pip: set[str] = set() for item in selected_bench + selected_base: for pkg in item.get("pip", []): if pkg not in seen_pip: seen_pip.add(pkg) required_pip.append(pkg) result = { "selected_benchmarks": selected_bench, "selected_baselines": selected_base, "required_pip": required_pip, "rationale": selection.get("rationale", ""), "experiment_notes": selection.get("experiment_notes", ""), "total_filtered": len(filtered_bench), } self.logger.info( "Selected: %d benchmarks, %d baselines, %d pip packages", len(selected_bench), len(selected_base), len(required_pip), ) return self._make_result(True, data=result) ================================================ FILE: researchclaw/agents/benchmark_agent/surveyor.py ================================================ """Surveyor Agent — searches for domain-relevant benchmarks and baselines. Data sources (in priority order): 1. Local ``benchmark_knowledge.yaml`` — always available, no network. 2. HuggingFace Hub API (``huggingface_hub``) — dataset discovery by task/keyword. 3. LLM fallback — asks the LLM to suggest benchmarks when APIs unavailable. 
""" from __future__ import annotations import logging from pathlib import Path from typing import Any import yaml from researchclaw.agents.base import AgentStepResult, BaseAgent logger = logging.getLogger(__name__) _KNOWLEDGE_PATH = Path(__file__).resolve().parent.parent.parent / "data" / "benchmark_knowledge.yaml" # --------------------------------------------------------------------------- # HuggingFace Hub helpers (optional dependency) # --------------------------------------------------------------------------- _HF_AVAILABLE = False try: from huggingface_hub import HfApi # type: ignore[import-untyped] _HF_AVAILABLE = True except ImportError: pass # Mapping from our domain keywords to HuggingFace task_categories filters _DOMAIN_TO_HF_TASK: dict[str, list[str]] = { "image_classification": ["image-classification"], "text_classification": ["text-classification", "sentiment-analysis"], "language_modeling": ["text-generation"], "question_answering": ["question-answering"], "generative_models": ["unconditional-image-generation"], "graph_neural_networks": ["graph-ml"], "reinforcement_learning": ["reinforcement-learning"], "tabular_learning": ["tabular-classification", "tabular-regression"], "llm_finetuning": ["text-generation"], } class SurveyorAgent(BaseAgent): """Searches local knowledge base and HuggingFace Hub for benchmarks.""" name = "surveyor" def __init__( self, llm: Any, *, enable_hf_search: bool = True, max_hf_results: int = 10, ) -> None: super().__init__(llm) self._enable_hf = enable_hf_search and _HF_AVAILABLE self._max_hf = max_hf_results self._knowledge = self._load_knowledge() # -- Knowledge base ---------------------------------------------------- @staticmethod def _load_knowledge() -> dict[str, Any]: """Load the local benchmark knowledge base.""" try: data = yaml.safe_load(_KNOWLEDGE_PATH.read_text(encoding="utf-8")) return data.get("domains", {}) if isinstance(data, dict) else {} except Exception: # noqa: BLE001 logger.warning("Failed to load 
benchmark_knowledge.yaml", exc_info=True) return {} def _match_domains(self, topic: str) -> list[str]: """Return domain IDs whose keywords appear in the topic.""" topic_lower = topic.lower() matched: list[str] = [] for domain_id, info in self._knowledge.items(): keywords = info.get("keywords", []) for kw in keywords: if kw in topic_lower: matched.append(domain_id) break return matched def _get_local_candidates(self, domain_ids: list[str]) -> dict[str, Any]: """Retrieve benchmarks and baselines from local knowledge base.""" benchmarks: list[dict[str, Any]] = [] baselines: list[dict[str, Any]] = [] seen_bench: set[str] = set() seen_base: set[str] = set() for did in domain_ids: info = self._knowledge.get(did, {}) for b in info.get("standard_benchmarks", []): name = b.get("name", "") if name not in seen_bench: seen_bench.add(name) benchmarks.append({**b, "source_domain": did, "origin": "knowledge_base"}) for bl in info.get("common_baselines", []): name = bl.get("name", "") if name not in seen_base: seen_base.add(name) baselines.append({**bl, "source_domain": did, "origin": "knowledge_base"}) return {"benchmarks": benchmarks, "baselines": baselines} # -- HuggingFace Hub --------------------------------------------------- def _search_hf_datasets(self, topic: str, domain_ids: list[str]) -> list[dict[str, Any]]: """Search HuggingFace Hub for relevant datasets.""" if not self._enable_hf: return [] results: list[dict[str, Any]] = [] seen: set[str] = set() try: api = HfApi() # Strategy 1: Search by task category for did in domain_ids: for task_cat in _DOMAIN_TO_HF_TASK.get(did, []): try: datasets = api.list_datasets( filter=[f"task_categories:{task_cat}"], sort="downloads", direction=-1, limit=self._max_hf, ) for ds in datasets: if ds.id not in seen: seen.add(ds.id) results.append({ "name": ds.id, "downloads": getattr(ds, "downloads", 0), "origin": "huggingface_hub", "api": f"datasets.load_dataset('{ds.id}', cache_dir='/workspace/data/hf')", "tier": 2, }) except Exception: # 
noqa: BLE001 logger.debug("HF task search failed for %s", task_cat) # Strategy 2: Keyword search on topic keywords = self._extract_search_keywords(topic) for kw in keywords[:3]: try: datasets = api.list_datasets( search=kw, sort="downloads", direction=-1, limit=self._max_hf, ) for ds in datasets: if ds.id not in seen: seen.add(ds.id) results.append({ "name": ds.id, "downloads": getattr(ds, "downloads", 0), "origin": "huggingface_hub", "api": f"datasets.load_dataset('{ds.id}', cache_dir='/workspace/data/hf')", "tier": 2, }) except Exception: # noqa: BLE001 logger.debug("HF keyword search failed for %s", kw) except Exception as exc: # noqa: BLE001 logger.warning("HuggingFace Hub search failed: %s", exc) return results @staticmethod def _extract_search_keywords(topic: str) -> list[str]: """Extract 1-3 word search keywords from a topic string.""" # Remove common filler words to get meaningful search terms stop = { "a", "an", "the", "for", "in", "on", "of", "to", "with", "and", "or", "is", "are", "using", "via", "based", "towards", "novel", "new", "improved", "approach", "method", "methods", "study", } words = [w.lower().strip(".,;:!?()[]") for w in topic.split()] filtered = [w for w in words if w and w not in stop and len(w) > 2] # Return 2-3 keyword phrases keywords: list[str] = [] if len(filtered) >= 2: keywords.append(" ".join(filtered[:2])) if len(filtered) >= 3: keywords.append(" ".join(filtered[:3])) if filtered: keywords.append(filtered[0]) return keywords # -- LLM fallback ------------------------------------------------------ def _llm_suggest_benchmarks(self, topic: str, hypothesis: str) -> dict[str, Any]: """Ask LLM to suggest benchmarks and baselines when APIs unavailable.""" system = ( "You are an expert ML researcher. 
Given a research topic and hypothesis, " "suggest appropriate benchmarks, datasets, and baseline methods.\n\n" "Return a JSON object with:\n" "- benchmarks: array of {name, domain, metrics: [], api (Python one-liner), " " tier (1=pre-cached, 2=downloadable), size_mb}\n" "- baselines: array of {name, source (Python code), paper (citation), pip: []}\n" "- rationale: string explaining why these are the right choices\n\n" "CRITICAL RULES:\n" "- Benchmarks and baselines MUST be DOMAIN-APPROPRIATE for the topic.\n" "- Do NOT suggest image classification datasets (CIFAR, ImageNet, MNIST) " "for non-image topics like PDE solvers, RL, combinatorial optimization, etc.\n" "- Do NOT suggest optimizers (SGD, Adam, AdamW) as METHOD baselines — " "optimizers are training tools, NOT research methods to compare against.\n" "- Baselines must be COMPETING METHODS from the same research domain.\n\n" "DOMAIN-SPECIFIC GUIDANCE:\n" "- Physics/PDE/Scientific computing: Use SYNTHETIC data (Burgers eq, " "Darcy flow, Navier-Stokes, heat equation). Baselines: FNO, DeepONet, " "PINN, spectral methods.\n" "- Combinatorial optimization (TSP, graph coloring, scheduling): Use " "SYNTHETIC instances (random TSP, Erdos-Renyi graphs). Baselines: " "classical MCTS, LKH, OR-Tools, Concorde, RL-based methods.\n" "- Reinforcement learning: Use Gymnasium environments (CartPole, " "LunarLander, HalfCheetah). Baselines: PPO, SAC, DQN, TD3.\n" "- Graph learning: Use standard graph benchmarks (Cora, CiteSeer, " "ogbn-arxiv). 
Baselines: GCN, GAT, GraphSAGE.\n" "- If the domain naturally requires SYNTHETIC data (PDE, optimization, " "theoretical analysis), explicitly set tier=1 and api='synthetic' and " "describe the data generation procedure in the 'source' field.\n\n" "- Prefer well-known, widely-used benchmarks from top venues\n" "- Prefer baselines with open-source PyTorch implementations\n" "- Include at least 2 datasets and 2 baselines" ) user = ( f"Research Topic: {topic}\n" f"Hypothesis: {hypothesis}\n\n" "Suggest appropriate benchmarks, datasets, and baseline methods. " "Make sure they are relevant to the specific domain of this research." ) result = self._chat_json(system, user, max_tokens=4096) return result # -- Main entry point -------------------------------------------------- def execute(self, context: dict[str, Any]) -> AgentStepResult: """Survey available benchmarks and baselines for the given topic. Context keys: topic (str): Research topic/title hypothesis (str): Research hypothesis experiment_plan (str): Experiment plan from previous stages """ topic = context.get("topic", "") hypothesis = context.get("hypothesis", "") if not topic: return self._make_result(False, error="No topic provided") self.logger.info("Surveying benchmarks for topic: %s", topic[:80]) # 1. Match domains from knowledge base domain_ids = self._match_domains(topic) if hypothesis: domain_ids = list(dict.fromkeys( domain_ids + self._match_domains(hypothesis) )) self.logger.info("Matched domains: %s", domain_ids) # 2. Get local candidates local = self._get_local_candidates(domain_ids) # 3. Search HuggingFace Hub (if available) hf_datasets = self._search_hf_datasets(topic, domain_ids) # 4. LLM fallback if no local matches llm_suggestions: dict[str, Any] = {} if not local["benchmarks"] and not hf_datasets: self.logger.info("No local/HF matches — falling back to LLM") llm_suggestions = self._llm_suggest_benchmarks(topic, hypothesis) # 5. 
Combine results all_benchmarks = local["benchmarks"] + hf_datasets if llm_suggestions.get("benchmarks"): for b in llm_suggestions["benchmarks"]: b["origin"] = "llm_suggestion" all_benchmarks.append(b) all_baselines = local["baselines"] if llm_suggestions.get("baselines"): for bl in llm_suggestions["baselines"]: bl["origin"] = "llm_suggestion" all_baselines.append(bl) survey_result = { "matched_domains": domain_ids, "benchmarks": all_benchmarks, "baselines": all_baselines, "hf_datasets_found": len(hf_datasets), "llm_fallback_used": bool(llm_suggestions), "rationale": llm_suggestions.get("rationale", ""), } self.logger.info( "Survey complete: %d benchmarks, %d baselines, %d HF datasets", len(all_benchmarks), len(all_baselines), len(hf_datasets), ) return self._make_result(True, data=survey_result) ================================================ FILE: researchclaw/agents/benchmark_agent/validator.py ================================================ """Validator Agent — validates generated code for correctness. Performs three levels of validation: 1. **Syntax check** — ``ast.parse()`` on generated Python code. 2. **Import check** — verifies that referenced modules are importable or listed in requirements. 3. **LLM review** — asks the LLM to review code for common pitfalls (wrong API usage, missing transforms, incorrect splits). 
""" from __future__ import annotations import ast import logging import re from typing import Any from researchclaw.agents.base import AgentStepResult, BaseAgent logger = logging.getLogger(__name__) # Packages available in Docker image (no pip install needed) _BUILTIN_MODULES = { "torch", "torchvision", "torchaudio", "numpy", "scipy", "sklearn", "pandas", "matplotlib", "seaborn", "tqdm", "gymnasium", "networkx", "timm", "einops", "torchmetrics", "transformers", "datasets", "accelerate", "peft", "trl", "bitsandbytes", "tokenizers", "safetensors", "h5py", "tensorboard", "PIL", "yaml", "kornia", "albumentations", "cv2", "mujoco", "os", "sys", "json", "re", "pathlib", "typing", "collections", "functools", "itertools", "math", "random", "copy", "dataclasses", "abc", "io", "csv", "glob", "shutil", "time", "datetime", "logging", "warnings", "argparse", "pickle", "struct", "hashlib", } class ValidatorAgent(BaseAgent): """Validates generated code artifacts for syntax and API correctness.""" name = "validator" def _check_syntax(self, code: str, label: str) -> list[str]: """Check Python syntax via ast.parse. 
Returns list of errors.""" if not code.strip(): return [] try: ast.parse(code) return [] except SyntaxError as e: return [f"{label}: SyntaxError at line {e.lineno}: {e.msg}"] def _check_imports( self, code: str, label: str, extra_requirements: list[str], ) -> list[str]: """Check that imported modules are available or declared.""" if not code.strip(): return [] warnings: list[str] = [] # Extract import statements import_pattern = re.compile( r"^\s*(?:import|from)\s+(\w+)", re.MULTILINE, ) imports = set(import_pattern.findall(code)) # Build allowed set allowed = set(_BUILTIN_MODULES) # Map pip package names to import names pip_to_import = { "torch-geometric": "torch_geometric", "ogb": "ogb", "stable-baselines3": "stable_baselines3", "xgboost": "xgboost", "opencv-python": "cv2", "scikit-learn": "sklearn", "gymnasium[mujoco]": "gymnasium", "huggingface_hub": "huggingface_hub", } for pkg in extra_requirements: import_name = pip_to_import.get(pkg, pkg.replace("-", "_")) allowed.add(import_name) for mod in imports: if mod not in allowed: warnings.append( f"{label}: import '{mod}' not in Docker image or requirements" ) return warnings def _llm_review( self, data_code: str, baseline_code: str, setup_code: str, benchmark_names: list[str], baseline_names: list[str], ) -> dict[str, Any]: """Ask LLM to review generated code for common pitfalls.""" system = ( "You are a code reviewer specializing in ML experiment code. " "Review the following generated code for correctness.\n\n" "Check for:\n" "1. Correct API usage (torchvision, HuggingFace datasets, PyG, etc.)\n" "2. Proper data transforms and normalization\n" "3. Correct train/val/test split handling\n" "4. Compatible input/output dimensions between data and models\n" "5. Missing error handling for optional dependencies\n" "6. Hardcoded paths that should use variables\n" "7. 
Missing download=True in setup.py for tier 2 datasets\n\n" "Return JSON:\n" "{\n" ' "passed": true/false,\n' ' "issues": ["issue 1", "issue 2"],\n' ' "suggestions": ["suggestion 1"],\n' ' "severity": "none" | "warning" | "error"\n' "}" ) code_sections = [] if data_code: code_sections.append(f"## Data Loading Code\n```python\n{data_code}\n```") if baseline_code: code_sections.append(f"## Baseline Code\n```python\n{baseline_code}\n```") if setup_code: code_sections.append(f"## Setup Script\n```python\n{setup_code}\n```") user = ( f"Benchmarks: {', '.join(benchmark_names)}\n" f"Baselines: {', '.join(baseline_names)}\n\n" + "\n\n".join(code_sections) ) return self._chat_json(system, user, max_tokens=2048) # -- Main entry point -------------------------------------------------- def execute(self, context: dict[str, Any]) -> AgentStepResult: """Validate all generated code artifacts. Context keys: acquisition (dict): Output from AcquirerAgent """ acq = context.get("acquisition", {}) data_code = acq.get("data_loader_code", "") baseline_code = acq.get("baseline_code", "") setup_code = acq.get("setup_code", "") requirements = acq.get("requirements", "") benchmark_names = acq.get("benchmark_names", []) baseline_names = acq.get("baseline_names", []) extra_pip = [r.strip() for r in requirements.split("\n") if r.strip()] all_errors: list[str] = [] all_warnings: list[str] = [] # 1. Syntax checks for code, label in [ (data_code, "data_loader"), (baseline_code, "baseline"), (setup_code, "setup"), ]: errors = self._check_syntax(code, label) all_errors.extend(errors) # 2. Import checks for code, label in [ (data_code, "data_loader"), (baseline_code, "baseline"), ]: warnings = self._check_imports(code, label, extra_pip) all_warnings.extend(warnings) # 3. 
LLM review (only if no syntax errors) llm_review: dict[str, Any] = {} if not all_errors: llm_review = self._llm_review( data_code, baseline_code, setup_code, benchmark_names, baseline_names, ) if llm_review.get("severity") == "error": all_errors.extend(llm_review.get("issues", [])) elif llm_review.get("issues"): all_warnings.extend(llm_review.get("issues", [])) passed = len(all_errors) == 0 severity = "error" if all_errors else ("warning" if all_warnings else "none") result = { "passed": passed, "errors": all_errors, "warnings": all_warnings, "severity": severity, "llm_review": llm_review, "suggestions": llm_review.get("suggestions", []), } self.logger.info( "Validation %s: %d errors, %d warnings", "PASSED" if passed else "FAILED", len(all_errors), len(all_warnings), ) return self._make_result(passed, data=result) ================================================ FILE: researchclaw/agents/code_searcher/__init__.py ================================================ """Code Searcher agent — searches GitHub for reference code before generation. This agent searches GitHub repositories and code to find relevant examples that inform the blueprint generation process, especially for domains where the LLM's internal knowledge may be insufficient. """ from researchclaw.agents.code_searcher.agent import CodeSearchAgent, CodeSearchResult __all__ = ["CodeSearchAgent", "CodeSearchResult"] ================================================ FILE: researchclaw/agents/code_searcher/agent.py ================================================ """Code Search Agent — orchestrates GitHub search, pattern extraction, and caching. This is the main entry point for code search. It: 1. Checks cache for existing results 2. Generates search queries (LLM or heuristic) 3. Searches GitHub for repos and code 4. Reads key files from top repos 5. Extracts patterns using LLM 6. 
Caches results for future use """ from __future__ import annotations import logging from dataclasses import dataclass, field from typing import Any from researchclaw.agents.code_searcher.cache import SearchCache from researchclaw.agents.code_searcher.github_client import ( CodeSnippet, GitHubClient, RepoAnalysis, RepoInfo, ) from researchclaw.agents.code_searcher.pattern_extractor import CodePatterns, extract_patterns from researchclaw.agents.code_searcher.query_gen import generate_search_queries from researchclaw.domains.detector import DomainProfile logger = logging.getLogger(__name__) @dataclass class CodeSearchResult: """Complete result from a code search operation.""" patterns: CodePatterns = field(default_factory=CodePatterns) repos_found: list[RepoInfo] = field(default_factory=list) snippets_found: list[CodeSnippet] = field(default_factory=list) repo_analyses: list[RepoAnalysis] = field(default_factory=list) queries_used: list[str] = field(default_factory=list) from_cache: bool = False github_requests: int = 0 def to_prompt_context(self) -> str: """Format as context block for injection into code generation prompts.""" if not self.patterns.has_content: return "" return self.patterns.to_prompt_context() def to_cache_dict(self) -> dict[str, Any]: """Serialize for caching.""" return { "api_patterns": self.patterns.api_patterns, "file_structure": self.patterns.file_structure, "evaluation_patterns": self.patterns.evaluation_patterns, "library_versions": self.patterns.library_versions, "repos": [ { "full_name": r.full_name, "description": r.description, "stars": r.stars, "html_url": r.html_url, } for r in self.repos_found[:5] ], "queries": self.queries_used, } @classmethod def from_cache_dict(cls, data: dict[str, Any]) -> CodeSearchResult: """Deserialize from cache.""" patterns = CodePatterns( api_patterns=data.get("api_patterns", []), file_structure=data.get("file_structure", {}), evaluation_patterns=data.get("evaluation_patterns", []), 
library_versions=data.get("library_versions", {}), ) repos = [ RepoInfo( full_name=r.get("full_name", ""), description=r.get("description", ""), stars=r.get("stars", 0), html_url=r.get("html_url", ""), ) for r in data.get("repos", []) ] return cls( patterns=patterns, repos_found=repos, queries_used=data.get("queries", []), from_cache=True, ) class CodeSearchAgent: """Orchestrates code search for reference material before code generation. Usage:: agent = CodeSearchAgent(llm=llm_client) result = agent.search( topic="PDE solver comparison", domain=domain_profile, specific_needs=["finite element method", "convergence test"], ) context = result.to_prompt_context() """ def __init__( self, llm: Any | None = None, github_token: str | None = None, cache: SearchCache | None = None, max_repos_to_analyze: int = 3, max_code_searches: int = 3, ) -> None: self._llm = llm self._github = GitHubClient(token=github_token) self._cache = cache or SearchCache() self._max_repos = max_repos_to_analyze self._max_code_searches = max_code_searches def search( self, topic: str, domain: DomainProfile, specific_needs: list[str] | None = None, ) -> CodeSearchResult: """Execute a complete code search for a research topic. Flow: 1. Check cache 2. Generate search queries 3. Search GitHub repos + code 4. Read key files from top repos 5. Extract patterns 6. Cache results Parameters ---------- topic : str Research topic. domain : DomainProfile Detected domain profile. specific_needs : list[str], optional Specific library/API needs. Returns ------- CodeSearchResult """ logger.info("Code search started for: %.60s (domain=%s)", topic, domain.domain_id) # 1. Check cache cached = self._cache.get(domain.domain_id, topic) if cached: logger.info("Using cached code search results") return CodeSearchResult.from_cache_dict(cached) # 2. 
Generate search queries queries = generate_search_queries( topic=topic, domain_name=domain.display_name, core_libraries=domain.core_libraries, specific_needs=specific_needs, llm=self._llm, ) # Add domain-specific search terms from profile if domain.github_search_terms: for term in domain.github_search_terms[:2]: if term not in queries: queries.append(term) result = CodeSearchResult(queries_used=queries) # 3. Search GitHub repos (use first query) if queries: try: repos = self._github.search_repos(queries[0], max_results=10) # Filter: recent, well-starred repos = [ r for r in repos if r.stars >= 10 # minimum quality threshold ] result.repos_found = repos[:self._max_repos * 2] except Exception: logger.warning("Repo search failed, continuing", exc_info=True) # 4. Search GitHub code (use remaining queries) code_snippets: list[str] = [] for query in queries[1:self._max_code_searches + 1]: try: snippets = self._github.search_code(query, max_results=5) result.snippets_found.extend(snippets) except Exception: logger.warning("Code search failed for query: %s", query) # 5. Read key files from top repos for repo in result.repos_found[:self._max_repos]: try: analysis = self._analyze_repo(repo) if analysis: result.repo_analyses.append(analysis) # Collect code snippets for content in analysis.key_files.values(): if content: code_snippets.append(content) except Exception: logger.warning("Failed to analyze repo: %s", repo.full_name) # Also fetch content for code search results for snippet in result.snippets_found[:5]: try: content = self._github.get_file_content( snippet.repo_full_name, snippet.file_path, ) if content: snippet.content = content code_snippets.append(content) except Exception: pass # 6. Extract patterns if code_snippets: result.patterns = extract_patterns( code_snippets=code_snippets, topic=topic, domain_name=domain.display_name, llm=self._llm, ) result.github_requests = self._github.request_count # 7. 
Cache results if result.patterns.has_content: self._cache.put(domain.domain_id, topic, result.to_cache_dict()) logger.info( "Code search complete: %d repos, %d snippets, %d patterns, %d API calls", len(result.repos_found), len(result.snippets_found), len(result.patterns.api_patterns), result.github_requests, ) return result def _analyze_repo(self, repo: RepoInfo) -> RepoAnalysis | None: """Analyze a repository by reading key files.""" analysis = RepoAnalysis(repo=repo) # Get README readme = self._github.get_readme(repo.full_name) if readme: analysis.readme = readme[:3000] # truncate # Get file tree file_tree = self._github.get_repo_tree( repo.full_name, repo.default_branch, ) analysis.file_tree = file_tree # Identify and read key files key_patterns = [ "main.py", "run.py", "train.py", "experiment.py", "requirements.txt", "setup.py", "pyproject.toml", ] for pattern in key_patterns: matches = [f for f in file_tree if f.endswith(pattern)] for match in matches[:1]: # first match only content = self._github.get_file_content( repo.full_name, match, max_size_kb=50, ) if content: analysis.key_files[match] = content # Parse requirements req_content = analysis.key_files.get("requirements.txt", "") if req_content: analysis.requirements = [ line.strip().split("==")[0].split(">=")[0] for line in req_content.splitlines() if line.strip() and not line.startswith("#") ] return analysis ================================================ FILE: researchclaw/agents/code_searcher/cache.py ================================================ """Disk-based cache for code search results. Caches search results by domain + topic hash with a configurable TTL (default 30 days). This avoids redundant GitHub API calls for similar topics within the same domain. 
""" from __future__ import annotations import hashlib import json import logging import time from dataclasses import asdict from pathlib import Path from typing import Any logger = logging.getLogger(__name__) _DEFAULT_CACHE_DIR = Path(__file__).parent.parent.parent / "data" / "code_search_cache" _DEFAULT_TTL_DAYS = 30 class SearchCache: """Disk-based cache for code search results. Cache structure:: code_search_cache/ {domain_id}/ {topic_hash}.json """ def __init__( self, cache_dir: Path | None = None, ttl_days: int = _DEFAULT_TTL_DAYS, ) -> None: self._cache_dir = cache_dir or _DEFAULT_CACHE_DIR self._ttl_sec = ttl_days * 86400 def get(self, domain_id: str, topic: str) -> dict[str, Any] | None: """Get cached result if it exists and is not expired.""" cache_path = self._cache_path(domain_id, topic) if not cache_path.exists(): return None try: data = json.loads(cache_path.read_text(encoding="utf-8")) timestamp = data.get("_cached_at", 0) if time.time() - timestamp > self._ttl_sec: logger.debug("Cache expired for %s/%s", domain_id, topic[:40]) cache_path.unlink(missing_ok=True) return None logger.info("Cache hit for %s/%s", domain_id, topic[:40]) return data except Exception: logger.warning("Failed to read cache", exc_info=True) return None def put(self, domain_id: str, topic: str, data: dict[str, Any]) -> None: """Store a result in the cache.""" cache_path = self._cache_path(domain_id, topic) cache_path.parent.mkdir(parents=True, exist_ok=True) data["_cached_at"] = time.time() data["_domain_id"] = domain_id data["_topic_hash"] = self._topic_hash(topic) try: cache_path.write_text( json.dumps(data, indent=2, default=str), encoding="utf-8", ) logger.debug("Cached result for %s/%s", domain_id, topic[:40]) except Exception: logger.warning("Failed to write cache", exc_info=True) def clear(self, domain_id: str | None = None) -> int: """Clear cache. 
Returns number of entries removed.""" count = 0 if domain_id: domain_dir = self._cache_dir / domain_id if domain_dir.is_dir(): for f in domain_dir.glob("*.json"): f.unlink() count += 1 else: if self._cache_dir.is_dir(): for f in self._cache_dir.rglob("*.json"): f.unlink() count += 1 return count def stats(self) -> dict[str, int]: """Return cache statistics.""" total = 0 expired = 0 by_domain: dict[str, int] = {} if not self._cache_dir.is_dir(): return {"total": 0, "expired": 0} for f in self._cache_dir.rglob("*.json"): total += 1 domain = f.parent.name by_domain[domain] = by_domain.get(domain, 0) + 1 try: data = json.loads(f.read_text(encoding="utf-8")) if time.time() - data.get("_cached_at", 0) > self._ttl_sec: expired += 1 except Exception: pass return {"total": total, "expired": expired, **by_domain} def _cache_path(self, domain_id: str, topic: str) -> Path: return self._cache_dir / domain_id / f"{self._topic_hash(topic)}.json" @staticmethod def _topic_hash(topic: str) -> str: return hashlib.sha256(topic.lower().strip().encode()).hexdigest()[:16] ================================================ FILE: researchclaw/agents/code_searcher/github_client.py ================================================ """GitHub REST API client for code and repository search. 
Handles rate limiting, authentication, and response parsing for:

- Repository search (``/search/repositories``)
- Code search (``/search/code``)
- File content retrieval (``/repos/{owner}/{repo}/contents/{path}``)
- README retrieval

Rate limits:

- Authenticated: 30 req/min for search, 5000 req/hr for core
- Code search: 10 req/min
- Unauthenticated: 10 req/min for search
"""

from __future__ import annotations

import logging
import os
import time
from dataclasses import dataclass, field
from typing import Any
from urllib.parse import quote

logger = logging.getLogger(__name__)

# Base URL for all GitHub REST API endpoints used below.
_GITHUB_API = "https://api.github.com"


@dataclass
class RepoInfo:
    """Summary of a GitHub repository (subset of the search-API item payload)."""

    full_name: str  # "owner/repo"
    description: str = ""
    stars: int = 0  # "stargazers_count" from the API
    language: str = ""  # primary language reported by GitHub
    updated_at: str = ""  # last-update timestamp string from the API
    html_url: str = ""  # browser URL of the repository
    default_branch: str = "main"  # used when fetching the file tree
    topics: list[str] = field(default_factory=list)  # repository topic tags


@dataclass
class CodeSnippet:
    """A code snippet found via GitHub code search."""

    repo_full_name: str  # owning repository, "owner/repo"
    file_path: str  # path of the matched file within the repo
    file_url: str = ""  # browser URL of the matched file
    content: str = ""  # populated after fetching
    score: float = 0.0  # search relevance score from the API


@dataclass
class RepoAnalysis:
    """Analysis of a repository's structure and content."""

    repo: RepoInfo  # the repository being analyzed
    readme: str = ""  # README text (may be truncated by the caller)
    requirements: list[str] = field(default_factory=list)  # parsed package names
    key_files: dict[str, str] = field(default_factory=dict)  # path -> content
    file_tree: list[str] = field(default_factory=list)  # flat list of blob paths


class GitHubClient:
    """GitHub REST API client with rate limiting and caching.

    Uses ``GITHUB_TOKEN`` env var for authentication (strongly recommended).
    Falls back to unauthenticated access (much lower rate limits).
""" def __init__(self, token: str | None = None) -> None: self._token = token or os.environ.get("GITHUB_TOKEN", "") self._last_search_time: float = 0 self._search_interval: float = 6.0 # 10 req/min → 6s between requests self._request_count: int = 0 def _headers(self) -> dict[str, str]: headers = { "Accept": "application/vnd.github+json", "X-GitHub-Api-Version": "2022-11-28", } if self._token: headers["Authorization"] = f"Bearer {self._token}" return headers def _rate_limit_wait(self) -> None: """Enforce rate limiting between search requests.""" elapsed = time.time() - self._last_search_time if elapsed < self._search_interval: wait = self._search_interval - elapsed logger.debug("Rate limit: waiting %.1fs", wait) time.sleep(wait) self._last_search_time = time.time() def _get(self, url: str, params: dict[str, str] | None = None) -> dict[str, Any] | None: """Make a GET request to the GitHub API.""" import urllib.request import urllib.error import json if params: query_str = "&".join(f"{k}={quote(str(v))}" for k, v in params.items()) url = f"{url}?{query_str}" req = urllib.request.Request(url, headers=self._headers()) self._request_count += 1 try: with urllib.request.urlopen(req, timeout=15) as resp: return json.loads(resp.read().decode("utf-8")) except urllib.error.HTTPError as e: if e.code == 403: logger.warning("GitHub API rate limited (403). Skipping.") return None if e.code == 422: logger.warning("GitHub API validation error (422): %s", url) return None logger.warning("GitHub API error %d: %s", e.code, url) return None except Exception: logger.warning("GitHub API request failed: %s", url, exc_info=True) return None def search_repos( self, query: str, language: str = "Python", sort: str = "stars", max_results: int = 10, ) -> list[RepoInfo]: """Search for repositories matching a query. Parameters ---------- query : str Search query (e.g., "PDE solver finite element"). language : str Filter by programming language. 
sort : str Sort order: "stars", "updated", "best-match". max_results : int Maximum number of results to return. Returns ------- list[RepoInfo] """ self._rate_limit_wait() search_q = f"{query} language:{language}" params = { "q": search_q, "sort": sort, "order": "desc", "per_page": str(min(max_results, 30)), } data = self._get(f"{_GITHUB_API}/search/repositories", params) if data is None: return [] repos: list[RepoInfo] = [] for item in data.get("items", [])[:max_results]: repos.append(RepoInfo( full_name=item.get("full_name", ""), description=item.get("description", "") or "", stars=item.get("stargazers_count", 0), language=item.get("language", "") or "", updated_at=item.get("updated_at", ""), html_url=item.get("html_url", ""), default_branch=item.get("default_branch", "main"), topics=item.get("topics", []), )) logger.info("Found %d repos for query: %.60s", len(repos), query) return repos def search_code( self, query: str, language: str = "Python", max_results: int = 10, ) -> list[CodeSnippet]: """Search for code snippets matching a query. Note: Code search has stricter rate limits (10 req/min). Parameters ---------- query : str Search query (e.g., "from pyscf import gto scf"). language : str Filter by programming language. max_results : int Maximum results. 
Returns ------- list[CodeSnippet] """ self._rate_limit_wait() search_q = f"{query} language:{language}" params = { "q": search_q, "per_page": str(min(max_results, 30)), } data = self._get(f"{_GITHUB_API}/search/code", params) if data is None: return [] snippets: list[CodeSnippet] = [] for item in data.get("items", [])[:max_results]: repo = item.get("repository", {}) snippets.append(CodeSnippet( repo_full_name=repo.get("full_name", ""), file_path=item.get("path", ""), file_url=item.get("html_url", ""), score=item.get("score", 0.0), )) logger.info("Found %d code snippets for query: %.60s", len(snippets), query) return snippets def get_file_content( self, repo_full_name: str, path: str, max_size_kb: int = 100, ) -> str | None: """Get the content of a file from a repository. Parameters ---------- repo_full_name : str Repository in "owner/repo" format. path : str File path within the repository. max_size_kb : int Skip files larger than this. Returns ------- str or None File content, or None if not found/too large. 
""" import base64 url = f"{_GITHUB_API}/repos/{repo_full_name}/contents/{quote(path, safe='/')}" data = self._get(url) if data is None: return None size = data.get("size", 0) if size > max_size_kb * 1024: logger.debug("File too large (%d KB): %s/%s", size // 1024, repo_full_name, path) return None content = data.get("content", "") encoding = data.get("encoding", "") if encoding == "base64": try: return base64.b64decode(content).decode("utf-8", errors="replace") except Exception: return None return content def get_readme(self, repo_full_name: str) -> str | None: """Get the README content of a repository.""" import base64 url = f"{_GITHUB_API}/repos/{repo_full_name}/readme" data = self._get(url) if data is None: return None content = data.get("content", "") encoding = data.get("encoding", "") if encoding == "base64": try: return base64.b64decode(content).decode("utf-8", errors="replace") except Exception: return None return content def get_repo_tree( self, repo_full_name: str, branch: str = "main", ) -> list[str]: """Get the file tree of a repository (flat list of paths).""" url = f"{_GITHUB_API}/repos/{repo_full_name}/git/trees/{branch}" params = {"recursive": "1"} data = self._get(url, params) if data is None: return [] tree = data.get("tree", []) return [item["path"] for item in tree if item.get("type") == "blob"] @property def request_count(self) -> int: return self._request_count @property def has_token(self) -> bool: return bool(self._token) ================================================ FILE: researchclaw/agents/code_searcher/pattern_extractor.py ================================================ """Extract reusable code patterns from GitHub search results. 
Uses LLM to analyze reference code and extract: - API call patterns (how to use a specific library) - File organization patterns (project structure) - Data processing patterns (data loading / preprocessing) - Evaluation patterns (how to compute and report metrics) """ from __future__ import annotations import json import logging import re from dataclasses import dataclass, field from typing import Any logger = logging.getLogger(__name__) @dataclass class CodePatterns: """Extracted patterns from reference code.""" api_patterns: list[str] = field(default_factory=list) file_structure: dict[str, str] = field(default_factory=dict) data_patterns: list[str] = field(default_factory=list) evaluation_patterns: list[str] = field(default_factory=list) library_versions: dict[str, str] = field(default_factory=dict) raw_snippets: list[str] = field(default_factory=list) def to_prompt_context(self) -> str: """Format patterns as context for code generation prompts.""" parts: list[str] = [] if self.api_patterns: parts.append("## Reference API Usage Patterns") for i, pattern in enumerate(self.api_patterns[:5], 1): parts.append(f"### Pattern {i}") parts.append(f"```python\n{pattern}\n```") if self.file_structure: parts.append("\n## Reference Project Structure") for fname, desc in self.file_structure.items(): parts.append(f"- `{fname}`: {desc}") if self.evaluation_patterns: parts.append("\n## Reference Evaluation Patterns") for pattern in self.evaluation_patterns[:3]: parts.append(f"```python\n{pattern}\n```") return "\n".join(parts) @property def has_content(self) -> bool: return bool(self.api_patterns or self.file_structure or self.evaluation_patterns) _EXTRACT_PROMPT = """\ You are analyzing reference code to extract reusable patterns for a research project. 
Research topic: {topic} Domain: {domain_name} Here are code snippets from relevant GitHub repositories: {code_snippets} Extract the following patterns as JSON: {{ "api_patterns": [ "# Short, self-contained code snippet showing key API usage", "# Each should be 3-10 lines showing one specific API call pattern" ], "file_structure": {{ "filename.py": "what this file does" }}, "evaluation_patterns": [ "# How results are computed and reported" ], "library_versions": {{ "library_name": "recommended version" }} }} Focus on: 1. How the core libraries are imported and used 2. Common data loading / preprocessing patterns 3. How experiments are structured 4. How results are computed and reported Return ONLY valid JSON.""" def extract_patterns( code_snippets: list[str], topic: str, domain_name: str, llm: Any | None = None, ) -> CodePatterns: """Extract code patterns from reference snippets. Parameters ---------- code_snippets : list[str] Code content from GitHub repos. topic : str Research topic for context. domain_name : str Domain name for context. llm : LLMClient, optional LLM for pattern extraction. Falls back to heuristic if not provided. 
Returns ------- CodePatterns """ if not code_snippets: return CodePatterns() if llm is not None: return _llm_extract(code_snippets, topic, domain_name, llm) return _heuristic_extract(code_snippets) def _llm_extract( snippets: list[str], topic: str, domain_name: str, llm: Any, ) -> CodePatterns: """Extract patterns using LLM analysis.""" try: # Truncate snippets to fit context combined = "" for i, snippet in enumerate(snippets[:5]): truncated = snippet[:2000] if len(snippet) > 2000 else snippet combined += f"\n--- Snippet {i+1} ---\n{truncated}\n" prompt = _EXTRACT_PROMPT.format( topic=topic, domain_name=domain_name, code_snippets=combined, ) if hasattr(llm, "chat"): import asyncio try: loop = asyncio.get_running_loop() except RuntimeError: loop = None if loop and loop.is_running(): return _heuristic_extract(snippets) resp = llm.chat( [{"role": "user", "content": prompt}], system="You extract code patterns as JSON.", max_tokens=1500, ) else: return _heuristic_extract(snippets) content = resp.content if hasattr(resp, "content") else str(resp) # Parse JSON from response json_match = re.search(r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}", content, re.DOTALL) if json_match: data = json.loads(json_match.group()) return CodePatterns( api_patterns=data.get("api_patterns", []), file_structure=data.get("file_structure", {}), evaluation_patterns=data.get("evaluation_patterns", []), library_versions=data.get("library_versions", {}), raw_snippets=snippets[:5], ) except Exception: logger.warning("LLM pattern extraction failed", exc_info=True) return _heuristic_extract(snippets) def _heuristic_extract(snippets: list[str]) -> CodePatterns: """Extract patterns using regex heuristics (no LLM needed).""" patterns = CodePatterns(raw_snippets=snippets[:5]) for snippet in snippets: # Extract import statements as API patterns imports = re.findall(r"^(?:from|import)\s+.+$", snippet, re.MULTILINE) for imp in imports[:10]: if imp not in patterns.api_patterns: patterns.api_patterns.append(imp) # 
Extract function/class definitions for structure hints defs = re.findall(r"^(?:def|class)\s+(\w+)", snippet, re.MULTILINE) for d in defs[:5]: if d not in patterns.file_structure: patterns.file_structure[d] = "detected function/class" # Deduplicate patterns.api_patterns = list(dict.fromkeys(patterns.api_patterns))[:10] return patterns ================================================ FILE: researchclaw/agents/code_searcher/query_gen.py ================================================ """LLM-based search query generation for code search. Given a research topic and domain, generates targeted search queries for GitHub repository and code search. """ from __future__ import annotations import json import logging import re from typing import Any logger = logging.getLogger(__name__) _QUERY_GEN_PROMPT = """\ You are generating GitHub search queries to find reference code for a research experiment. Research topic: {topic} Domain: {domain_name} Core libraries: {libraries} Specific needs: {needs} Generate 3-5 search queries that will help find: 1. Example implementations using the domain's core libraries 2. Similar research projects or experiments 3. Specific API usage patterns needed for this experiment Rules: - Each query should be 3-8 words (GitHub search works best with short queries) - Include library names when searching for API usage - Include domain-specific terms - Focus on FINDING CODE, not documentation Respond as a JSON array of strings. Example: ["pyscf DFT hartree fock example", "molecular energy calculation python"] Queries:""" def generate_search_queries( topic: str, domain_name: str, core_libraries: list[str], specific_needs: list[str] | None = None, llm: Any | None = None, ) -> list[str]: """Generate search queries for GitHub code search. If no LLM is provided, generates queries from topic keywords and library names using heuristic rules. Parameters ---------- topic : str Research topic. domain_name : str Domain display name. 
core_libraries : list[str] Domain's core libraries. specific_needs : list[str], optional Specific API/library needs. llm : LLMClient, optional LLM for query generation. Returns ------- list[str] 3-5 search queries. """ if llm is not None: return _llm_generate(topic, domain_name, core_libraries, specific_needs or [], llm) return _heuristic_generate(topic, domain_name, core_libraries, specific_needs or []) def _heuristic_generate( topic: str, domain_name: str, libraries: list[str], needs: list[str], ) -> list[str]: """Generate queries without LLM using keyword extraction.""" queries: list[str] = [] # Clean topic: extract key phrases topic_words = _extract_key_phrases(topic) # Query 1: Topic + main library if libraries: queries.append(f"{topic_words} {libraries[0]}") # Query 2: Domain + "python example" queries.append(f"{domain_name.lower()} python example") # Query 3: Specific library usage for lib in libraries[:2]: queries.append(f"{lib} example tutorial python") # Query 4: Specific needs for need in needs[:2]: queries.append(f"{need} python") # Deduplicate and limit seen: set[str] = set() unique: list[str] = [] for q in queries: q_norm = q.lower().strip() if q_norm not in seen: seen.add(q_norm) unique.append(q) return unique[:5] def _llm_generate( topic: str, domain_name: str, libraries: list[str], needs: list[str], llm: Any, ) -> list[str]: """Generate queries using LLM.""" try: prompt = _QUERY_GEN_PROMPT.format( topic=topic, domain_name=domain_name, libraries=", ".join(libraries), needs=", ".join(needs) if needs else "general usage", ) # Synchronous LLM call — LLMClient.chat() is sync and takes # (messages, *, system=, max_tokens=) signature. 
if hasattr(llm, "chat"): resp = llm.chat( [{"role": "user", "content": prompt}], system="You generate concise GitHub search queries.", max_tokens=200, ) else: return _heuristic_generate(topic, domain_name, libraries, needs) content = resp.content if hasattr(resp, "content") else str(resp) # Parse JSON array from response json_match = re.search(r"\[.*\]", content, re.DOTALL) if json_match: queries = json.loads(json_match.group()) if isinstance(queries, list) and all(isinstance(q, str) for q in queries): return queries[:5] logger.warning("Failed to parse LLM query response, using heuristic") return _heuristic_generate(topic, domain_name, libraries, needs) except Exception: logger.warning("LLM query generation failed", exc_info=True) return _heuristic_generate(topic, domain_name, libraries, needs) def _extract_key_phrases(text: str, max_words: int = 5) -> str: """Extract key phrases from a research topic.""" # Remove common filler words stop_words = { "a", "an", "the", "of", "for", "in", "on", "with", "and", "or", "to", "by", "is", "are", "using", "based", "via", "through", "novel", "new", "improved", "efficient", "towards", } words = text.lower().split() key_words = [w for w in words if w not in stop_words and len(w) > 2] return " ".join(key_words[:max_words]) ================================================ FILE: researchclaw/agents/figure_agent/__init__.py ================================================ """FigureAgent — multi-agent intelligent chart generation system. Architecture ------------ 1. **Planner** — analyzes experiment results and determines which charts to generate, their types, layouts, and captions. 2. **CodeGen** — generates Python matplotlib plotting scripts using academic styling (SciencePlots, 300 DPI, colorblind-safe palettes). 3. **Renderer** — executes plotting scripts and verifies output files. 4. **Critic** — tri-modal review: numerical accuracy, text correctness, and visual quality assessment. 5. 
**Integrator** — determines figure placement in the paper and generates markdown references with captions. The ``FigureOrchestrator`` coordinates all agents and produces a ``FigurePlan`` consumed by downstream pipeline stages (paper draft, paper export). """ from researchclaw.agents.figure_agent.orchestrator import ( FigureOrchestrator, FigurePlan, ) __all__ = ["FigureOrchestrator", "FigurePlan"] ================================================ FILE: researchclaw/agents/figure_agent/codegen.py ================================================ """CodeGen Agent — generates visualization code for each figure. Takes the Planner's figure specifications and experiment data, then generates either: - Standalone Python scripts (Matplotlib/Seaborn) — run by Renderer - LaTeX code (TikZ/PGFPlots) — embedded directly in the paper Architecture follows Visual ChatGPT (Wu et al., 2023): the LLM acts as a *controller* calling deterministic render tools instead of generating pixels directly. """ from __future__ import annotations import json import logging import re from pathlib import Path from typing import Any from researchclaw.agents.base import BaseAgent, AgentStepResult from researchclaw.agents.figure_agent.style_config import get_style_preamble from researchclaw.utils.sanitize import sanitize_figure_id from researchclaw.utils.thinking_tags import strip_thinking_tags logger = logging.getLogger(__name__) def _esc(s: str) -> str: """Escape curly braces in user-provided strings for str.format().""" return s.replace("{", "{{").replace("}", "}}") # --------------------------------------------------------------------------- # Degenerate data detection # --------------------------------------------------------------------------- def _is_degenerate_data(values: list[float]) -> bool: """Return True if data values are too degenerate to produce a useful chart. Rejects: empty lists, all-zero, all-identical, or single-value data. 
""" if not values or len(values) < 1: return True if all(v == 0 for v in values): return True if len(values) >= 2 and len(set(round(v, 6) for v in values)) <= 1: return True return False # --------------------------------------------------------------------------- # Metric name humanization # --------------------------------------------------------------------------- _METRIC_DISPLAY_NAMES: dict[str, str] = { "primary_metric": "Performance", "accuracy": "Accuracy (%)", "loss": "Loss", "f1_score": "F1 Score", "precision": "Precision", "recall": "Recall", "reward": "Reward", "return": "Return", "mse": "MSE", "mae": "MAE", "rmse": "RMSE", "bleu": "BLEU", "rouge": "ROUGE", "perplexity": "Perplexity", "auc": "AUC", } def _humanize_label(raw: str) -> str: """Convert raw metric names like 'primary_metric' to human-readable labels.""" if not raw: return "" low = raw.lower().strip() if low in _METRIC_DISPLAY_NAMES: return _METRIC_DISPLAY_NAMES[low] # Convert snake_case to Title Case return raw.replace("_", " ").title() # --------------------------------------------------------------------------- # Built-in chart templates # --------------------------------------------------------------------------- _TEMPLATE_BAR_COMPARISON = ''' {style_preamble} # Data conditions = {conditions} values = {values} ci_low = {ci_low} ci_high = {ci_high} # Plot fig, ax = plt.subplots(figsize=({width}, {height}), constrained_layout=True) x = np.arange(len(conditions)) bar_colors = [COLORS[i % len(COLORS)] for i in range(len(conditions))] yerr_lo = [max(0, v - lo) for v, lo in zip(values, ci_low)] yerr_hi = [max(0, hi - v) for v, hi in zip(values, ci_high)] bars = ax.bar(x, values, color=bar_colors, alpha=0.85, edgecolor="white", linewidth=0.5) ax.errorbar(x, values, yerr=[yerr_lo, yerr_hi], fmt="none", ecolor="#333", capsize=4, capthick=1.2, linewidth=1.2) # Value labels offset = max(yerr_hi) * 0.08 if yerr_hi and max(yerr_hi) > 0 else max(values) * 0.02 for i, v in enumerate(values): ax.text(i, v 
+ offset, f"{{v:.4f}}", ha="center", va="bottom", fontweight="bold") ax.set_xlabel("{x_label}") ax.set_ylabel("{y_label}") ax.set_title("{title}") ax.set_xticks(x) ax.set_xticklabels([c.replace("_", " ") for c in conditions], rotation=25, ha="right") ax.grid(True, axis="y", alpha=0.3) ax.set_axisbelow(True) fig.savefig("{output_path}") plt.close(fig) print(f"Saved: {output_path}") ''' _TEMPLATE_GROUPED_BAR = ''' {style_preamble} # Data: conditions x metrics conditions = {conditions} metric_names = {metric_names} # data_matrix[i][j] = value for condition i, metric j data_matrix = {data_matrix} # Plot n_groups = len(conditions) n_bars = len(metric_names) fig, ax = plt.subplots(figsize=({width}, {height}), constrained_layout=True) x = np.arange(n_groups) bar_width = 0.8 / n_bars for j, metric in enumerate(metric_names): offset = (j - n_bars / 2 + 0.5) * bar_width vals = [data_matrix[i][j] for i in range(n_groups)] ax.bar(x + offset, vals, bar_width, label=metric.replace("_", " "), color=COLORS[j % len(COLORS)], alpha=0.85, edgecolor="white", linewidth=0.5) ax.set_xlabel("{x_label}") ax.set_ylabel("{y_label}") ax.set_title("{title}") ax.set_xticks(x) ax.set_xticklabels([c.replace("_", " ") for c in conditions], rotation=25, ha="right") ax.legend(loc="upper left", bbox_to_anchor=(0, 1), framealpha=0.9, edgecolor="gray") ax.grid(True, axis="y", alpha=0.3) ax.set_axisbelow(True) fig.savefig("{output_path}") plt.close(fig) print(f"Saved: {output_path}") ''' _TEMPLATE_TRAINING_CURVE = ''' {style_preamble} # Data: each series is (label, epochs, values, [optional std]) series_data = {series_data} fig, ax = plt.subplots(figsize=({width}, {height}), constrained_layout=True) for idx, series in enumerate(series_data): label = series["label"] epochs = series["epochs"] values = series["values"] color = COLORS[idx % len(COLORS)] ls = LINE_STYLES[idx % len(LINE_STYLES)] marker = MARKERS[idx % len(MARKERS)] ax.plot(epochs, values, linestyle=ls, color=color, linewidth=1.5, 
marker=marker, markersize=4, markevery=max(1, len(epochs)//10), label=label.replace("_", " ")) if "std" in series and series["std"]: std = series["std"] lower = [v - s for v, s in zip(values, std)] upper = [v + s for v, s in zip(values, std)] ax.fill_between(epochs, lower, upper, alpha=0.15, color=color) ax.set_xlabel("{x_label}") ax.set_ylabel("{y_label}") ax.set_title("{title}") ax.legend(loc="best", framealpha=0.9, edgecolor="gray") ax.grid(True, alpha=0.3) fig.savefig("{output_path}") plt.close(fig) print(f"Saved: {output_path}") ''' _TEMPLATE_HEATMAP = ''' {style_preamble} # Data row_labels = {row_labels} col_labels = {col_labels} data = np.array({data_matrix}) fig, ax = plt.subplots(figsize=({width}, {height}), constrained_layout=True) im = ax.imshow(data, cmap="cividis", aspect="auto") ax.set_xticks(np.arange(len(col_labels))) ax.set_yticks(np.arange(len(row_labels))) ax.set_xticklabels(col_labels, rotation=45, ha="right") ax.set_yticklabels(row_labels) # Annotate cells for i in range(len(row_labels)): for j in range(len(col_labels)): val = data[i, j] color = "white" if val > (data.max() + data.min()) / 2 else "black" ax.text(j, i, f"{{val:.3f}}", ha="center", va="center", color=color) ax.set_xlabel("{x_label}") ax.set_ylabel("{y_label}") ax.set_title("{title}") fig.colorbar(im, ax=ax, shrink=0.8) fig.savefig("{output_path}") plt.close(fig) print(f"Saved: {output_path}") ''' _TEMPLATE_LINE_MULTI = ''' {style_preamble} # Data: list of series dicts with label, x, y, [std] series_data = {series_data} fig, ax = plt.subplots(figsize=({width}, {height}), constrained_layout=True) for idx, series in enumerate(series_data): label = series["label"] x = series["x"] y = series["y"] color = COLORS[idx % len(COLORS)] ls = LINE_STYLES[idx % len(LINE_STYLES)] marker = MARKERS[idx % len(MARKERS)] ax.plot(x, y, linestyle=ls, color=color, linewidth=1.5, marker=marker, markersize=4, markevery=max(1, len(x)//8), label=label.replace("_", " ")) if "std" in series and 
series["std"]: std = series["std"] lower = [v - s for v, s in zip(y, std)] upper = [v + s for v, s in zip(y, std)] ax.fill_between(x, lower, upper, alpha=0.15, color=color) ax.set_xlabel("{x_label}") ax.set_ylabel("{y_label}") ax.set_title("{title}") ax.legend(loc="best", framealpha=0.9, edgecolor="gray") ax.grid(True, alpha=0.3) fig.savefig("{output_path}") plt.close(fig) print(f"Saved: {output_path}") ''' _TEMPLATE_SCATTER = ''' {style_preamble} # Data: list of groups with label, x, y groups = {groups} fig, ax = plt.subplots(figsize=({width}, {height}), constrained_layout=True) for idx, group in enumerate(groups): label = group["label"] x = group["x"] y = group["y"] color = COLORS[idx % len(COLORS)] marker = MARKERS[idx % len(MARKERS)] ax.scatter(x, y, c=color, marker=marker, s=40, alpha=0.7, label=label.replace("_", " ")) ax.set_xlabel("{x_label}") ax.set_ylabel("{y_label}") ax.set_title("{title}") ax.legend(loc="best", framealpha=0.9, edgecolor="gray") ax.grid(True, alpha=0.3) fig.savefig("{output_path}") plt.close(fig) print(f"Saved: {output_path}") ''' _TEMPLATES: dict[str, str] = { "bar_comparison": _TEMPLATE_BAR_COMPARISON, "ablation_grouped": _TEMPLATE_BAR_COMPARISON, # Same template, different data "grouped_bar": _TEMPLATE_GROUPED_BAR, "training_curve": _TEMPLATE_TRAINING_CURVE, "loss_curve": _TEMPLATE_TRAINING_CURVE, "heatmap": _TEMPLATE_HEATMAP, "confusion_matrix": _TEMPLATE_HEATMAP, "line_multi": _TEMPLATE_LINE_MULTI, "scatter_plot": _TEMPLATE_SCATTER, } # --------------------------------------------------------------------------- # LaTeX / PGFPlots templates — for direct LaTeX embedding # --------------------------------------------------------------------------- _LATEX_TEMPLATE_BAR_COMPARISON = r''' \begin{{figure}}[htbp] \centering \begin{{tikzpicture}} \begin{{axis}}[ ybar, bar width=15pt, width={width}cm, height={height}cm, xlabel={{{x_label}}}, ylabel={{{y_label}}}, title={{{title}}}, symbolic x coords={{{x_coords}}}, xtick=data, x tick label 
style={{rotate=25, anchor=east, font=\small}}, ymin=0, nodes near coords, nodes near coords align={{vertical}}, every node near coord/.append style={{font=\tiny}}, grid=major, grid style={{dashed, gray!30}}, ] \addplot[fill=blue!60, draw=blue!80] coordinates {{{coords}}}; \end{{axis}} \end{{tikzpicture}} \caption{{{caption}}} \label{{fig:{figure_id}}} \end{{figure}} ''' _LATEX_TEMPLATE_LINE = r''' \begin{{figure}}[htbp] \centering \begin{{tikzpicture}} \begin{{axis}}[ width={width}cm, height={height}cm, xlabel={{{x_label}}}, ylabel={{{y_label}}}, title={{{title}}}, legend pos=north west, grid=major, grid style={{dashed, gray!30}}, cycle list name=color list, ] {plot_commands} \end{{axis}} \end{{tikzpicture}} \caption{{{caption}}} \label{{fig:{figure_id}}} \end{{figure}} ''' _LATEX_TEMPLATE_HEATMAP = r''' \begin{{figure}}[htbp] \centering \begin{{tikzpicture}} \begin{{axis}}[ colormap/viridis, colorbar, width={width}cm, height={height}cm, xlabel={{{x_label}}}, ylabel={{{y_label}}}, title={{{title}}}, point meta min={meta_min}, point meta max={meta_max}, xtick={{{xtick}}}, ytick={{{ytick}}}, xticklabels={{{xticklabels}}}, yticklabels={{{yticklabels}}}, x tick label style={{rotate=45, anchor=east, font=\small}}, ] \addplot[matrix plot*, mesh/cols={cols}, mesh/rows={rows}, point meta=explicit] coordinates {{ {matrix_coords} }}; \end{{axis}} \end{{tikzpicture}} \caption{{{caption}}} \label{{fig:{figure_id}}} \end{{figure}} ''' _LATEX_TEMPLATES: dict[str, str] = { "bar_comparison": _LATEX_TEMPLATE_BAR_COMPARISON, "ablation_grouped": _LATEX_TEMPLATE_BAR_COMPARISON, "training_curve": _LATEX_TEMPLATE_LINE, "loss_curve": _LATEX_TEMPLATE_LINE, "line_multi": _LATEX_TEMPLATE_LINE, "heatmap": _LATEX_TEMPLATE_HEATMAP, "confusion_matrix": _LATEX_TEMPLATE_HEATMAP, } class CodeGenAgent(BaseAgent): """Generates visualization code (Python or LaTeX) for each planned figure. 
Supports two output formats: - ``"python"`` (default): Matplotlib/Seaborn scripts executed by Renderer - ``"latex"``: TikZ/PGFPlots code embedded directly in the paper """ name = "figure_codegen" def __init__(self, llm: Any, *, output_format: str = "python", use_docker: bool = False) -> None: super().__init__(llm) self._output_format = output_format # "python" or "latex" self._use_docker = use_docker # BUG-60: generate Docker paths when True # ------------------------------------------------------------------ # Public API # ------------------------------------------------------------------ def execute(self, context: dict[str, Any]) -> AgentStepResult: """Generate plotting scripts for all planned figures. Context keys: figures (list[dict]): Figure plan from Planner experiment_results (dict): Raw experiment data condition_summaries (dict): Per-condition aggregated stats metrics_summary (dict): Per-metric aggregated stats metric_key (str): Primary metric name output_dir (str): Directory for output scripts critic_feedback (list[dict], optional): Previous Critic feedback """ try: figures = context.get("figures", []) experiment_results = context.get("experiment_results", {}) condition_summaries = context.get("condition_summaries", {}) metrics_summary = context.get("metrics_summary", {}) metric_key = context.get("metric_key", "primary_metric") output_dir = context.get("output_dir", "charts") critic_feedback = context.get("critic_feedback", []) scripts: list[dict[str, Any]] = [] for fig_spec in figures: # BUG-36: skip non-dict entries (LLM may return strings) if not isinstance(fig_spec, dict): self.logger.warning("Skipping non-dict fig_spec: %s", type(fig_spec)) continue figure_id = fig_spec.get("figure_id", "unknown") chart_type = fig_spec.get("chart_type", "bar_comparison") # Check for critic feedback on this specific figure fig_feedback = None for fb in critic_feedback: # BUG-FIX: guard against non-dict entries in feedback if isinstance(fb, dict) and fb.get("figure_id") 
== figure_id: fig_feedback = fb break script = self._generate_script( fig_spec=fig_spec, chart_type=chart_type, condition_summaries=condition_summaries, metrics_summary=metrics_summary, experiment_results=experiment_results, metric_key=metric_key, output_dir=output_dir, critic_feedback=fig_feedback, ) scripts.append({ "figure_id": figure_id, "chart_type": chart_type, "script": script, "output_filename": f"{figure_id}.png", "title": fig_spec.get("title", ""), "caption": fig_spec.get("caption", ""), "section": fig_spec.get("section", "results"), "width": fig_spec.get("width", "single_column"), }) return self._make_result(True, data={"scripts": scripts}) except Exception as exc: self.logger.error("CodeGen failed: %s", exc) return self._make_result(False, error=str(exc)) # ------------------------------------------------------------------ # Script generation # ------------------------------------------------------------------ def _generate_script( self, *, fig_spec: dict[str, Any], chart_type: str, condition_summaries: dict[str, Any], metrics_summary: dict[str, Any], experiment_results: dict[str, Any], metric_key: str, output_dir: str, critic_feedback: dict[str, Any] | None, ) -> str: """Generate a plotting script for a single figure.""" figure_id = sanitize_figure_id(fig_spec.get("figure_id", "figure")) # BUG-20: Use absolute path to avoid CWD-relative savefig errors # BUG-60: When running in Docker, use container path directly so # renderer doesn't need fragile regex rewriting of host paths. if self._use_docker: output_path = f"/workspace/output/{figure_id}.png" else: output_path = str((Path(output_dir) / f"{figure_id}.png").resolve()) title = fig_spec.get("title", "") x_label = fig_spec.get("x_label", "") y_label = fig_spec.get("y_label", "") width_key = fig_spec.get("width", "single_column") # BUG-FIX: LLM may return data_source as a plain string (e.g. # "condition_comparison") instead of a dict. Normalize to dict. 
_raw_ds = fig_spec.get("data_source", {}) if isinstance(_raw_ds, str): data_source = {"type": _raw_ds} elif isinstance(_raw_ds, dict): data_source = _raw_ds else: data_source = {} from researchclaw.agents.figure_agent.style_config import FIGURE_WIDTH, DEFAULT_FIGURE_HEIGHT width = FIGURE_WIDTH.get(width_key, FIGURE_WIDTH["single_column"]) height = DEFAULT_FIGURE_HEIGHT # Try template-based generation first template = _TEMPLATES.get(chart_type) if template and not critic_feedback: script = self._fill_template( template=template, chart_type=chart_type, data_source=data_source, condition_summaries=condition_summaries, metrics_summary=metrics_summary, experiment_results=experiment_results, metric_key=metric_key, output_path=output_path, title=title, x_label=x_label, y_label=y_label, width=width, height=height, width_key=width_key, ) if script: return script # Fall back to LLM-generated script return self._llm_generate_script( fig_spec=fig_spec, chart_type=chart_type, condition_summaries=condition_summaries, metrics_summary=metrics_summary, experiment_results=experiment_results, metric_key=metric_key, output_path=output_path, width=width, height=height, critic_feedback=critic_feedback, width_key=width_key, ) def _fill_template( self, *, template: str, chart_type: str, data_source: dict[str, Any], condition_summaries: dict[str, Any], metrics_summary: dict[str, Any], experiment_results: dict[str, Any], metric_key: str, output_path: str, title: str, x_label: str, y_label: str, width: float, height: float, width_key: str = "single_column", ) -> str: """Fill a template with actual data values.""" style_preamble = get_style_preamble(width_key=width_key) source_type = data_source.get("type", "condition_comparison") if chart_type in ("bar_comparison", "ablation_grouped"): return self._fill_bar_template( template=template, condition_summaries=condition_summaries, metric_key=data_source.get("metric", metric_key), output_path=output_path, title=title, x_label=x_label, 
y_label=y_label, width=width, height=height, style_preamble=style_preamble, ) if chart_type == "grouped_bar" and source_type == "multi_metric": # BUG-37: LLM may return nested lists in metrics — flatten to list[str] _raw_metrics = data_source.get("metrics", []) _flat_metrics: list[str] = [] for _mi in (_raw_metrics if isinstance(_raw_metrics, list) else []): if isinstance(_mi, str): _flat_metrics.append(_mi) elif isinstance(_mi, list): _flat_metrics.extend(str(x) for x in _mi) else: _flat_metrics.append(str(_mi)) return self._fill_grouped_bar_template( template=template, condition_summaries=condition_summaries, metrics=_flat_metrics, output_path=output_path, title=title, x_label=x_label, y_label=y_label, width=width, height=height, style_preamble=style_preamble, ) if chart_type in ("heatmap", "confusion_matrix"): return self._fill_heatmap_template( template=template, condition_summaries=condition_summaries, metrics_summary=metrics_summary, output_path=output_path, title=title, x_label=x_label, y_label=y_label, width=width, height=height, style_preamble=style_preamble, ) # For other types, fall through to LLM generation return "" def _fill_bar_template( self, *, template: str, condition_summaries: dict[str, Any], metric_key: str, output_path: str, title: str, x_label: str, y_label: str, width: float, height: float, style_preamble: str, ) -> str: """Fill bar comparison template with condition data.""" conditions: list[str] = [] values: list[float] = [] ci_low: list[float] = [] ci_high: list[float] = [] for cond, cdata in condition_summaries.items(): if not isinstance(cdata, dict): continue metrics = cdata.get("metrics", {}) val = metrics.get(f"{metric_key}_mean") or metrics.get(metric_key) if val is None: continue try: fval = float(val) except (ValueError, TypeError): continue conditions.append(cond) values.append(fval) ci_low.append(float(cdata.get("ci95_low", fval))) ci_high.append(float(cdata.get("ci95_high", fval))) if not conditions: return "" # Skip degenerate 
data (all zeros, all identical) if _is_degenerate_data(values): logger.warning("Skipping degenerate bar chart: all values are identical or zero") return "" # Humanize empty/raw labels if not y_label or y_label.lower().replace("_", "") in ("primarymetric", "metric"): y_label = _humanize_label(metric_key) if not x_label: x_label = "Method" return template.format( style_preamble=style_preamble, conditions=repr(conditions), values=repr(values), ci_low=repr(ci_low), ci_high=repr(ci_high), output_path=output_path, title=_esc(title), x_label=_esc(x_label), y_label=_esc(y_label), width=width, height=height, ) def _fill_grouped_bar_template( self, *, template: str, condition_summaries: dict[str, Any], metrics: list[str], output_path: str, title: str, x_label: str, y_label: str, width: float, height: float, style_preamble: str, ) -> str: """Fill grouped bar template with multi-metric data.""" conditions: list[str] = list(condition_summaries.keys()) if not conditions or not metrics: return "" data_matrix: list[list[float]] = [] for cond in conditions: cdata = condition_summaries.get(cond, {}) cmetrics = cdata.get("metrics", {}) if isinstance(cdata, dict) else {} row = [] for m in metrics: val = cmetrics.get(f"{m}_mean") or cmetrics.get(m, 0) try: row.append(float(val)) except (ValueError, TypeError): row.append(0.0) data_matrix.append(row) return template.format( style_preamble=style_preamble, conditions=repr(conditions), metric_names=repr(metrics), data_matrix=repr(data_matrix), output_path=output_path, title=_esc(title), x_label=_esc(x_label), y_label=_esc(y_label), width=width, height=height, ) def _fill_heatmap_template( self, *, template: str, condition_summaries: dict[str, Any], metrics_summary: dict[str, Any], output_path: str, title: str, x_label: str, y_label: str, width: float, height: float, style_preamble: str, ) -> str: """Fill heatmap template — rows=conditions, cols=metrics.""" conditions = list(condition_summaries.keys()) # Select non-timing metrics 
metric_names = [ m for m in metrics_summary if not any(t in m.lower() for t in ["time", "elapsed", "seed", "runtime"]) ][:8] if not conditions or not metric_names: return "" data_matrix: list[list[float]] = [] for cond in conditions: cdata = condition_summaries.get(cond, {}) cmetrics = cdata.get("metrics", {}) if isinstance(cdata, dict) else {} row = [] for m in metric_names: val = cmetrics.get(f"{m}_mean") or cmetrics.get(m, 0) try: row.append(round(float(val), 4)) except (ValueError, TypeError): row.append(0.0) data_matrix.append(row) # Skip degenerate heatmaps (all values identical) all_vals = [v for row in data_matrix for v in row] if _is_degenerate_data(all_vals): logger.warning("Skipping degenerate heatmap: all values are identical or zero") return "" # Also skip single-row heatmaps (meaningless) if len(conditions) < 2: logger.warning("Skipping heatmap with only %d row(s)", len(conditions)) return "" return template.format( style_preamble=style_preamble, row_labels=repr(conditions), col_labels=repr(metric_names), data_matrix=repr(data_matrix), output_path=output_path, title=_esc(title), x_label=_esc(x_label or "Metric"), y_label=_esc(y_label or "Method"), width=max(width, len(metric_names) * 0.8), height=max(height, len(conditions) * 0.6), ) # ------------------------------------------------------------------ # LLM-based script generation # ------------------------------------------------------------------ def _llm_generate_script( self, *, fig_spec: dict[str, Any], chart_type: str, condition_summaries: dict[str, Any], metrics_summary: dict[str, Any], experiment_results: dict[str, Any], metric_key: str, output_path: str, width: float, height: float, critic_feedback: dict[str, Any] | None, width_key: str = "single_column", ) -> str: """Generate a plotting script using LLM.""" if self._output_format == "latex": return self._llm_generate_latex( fig_spec=fig_spec, chart_type=chart_type, condition_summaries=condition_summaries, metrics_summary=metrics_summary, 
    def _llm_generate_script(
        self,
        *,
        fig_spec: dict[str, Any],
        chart_type: str,
        condition_summaries: dict[str, Any],
        metrics_summary: dict[str, Any],
        experiment_results: dict[str, Any],
        metric_key: str,
        output_path: str,
        width: float,
        height: float,
        critic_feedback: dict[str, Any] | None,
        width_key: str = "single_column",
    ) -> str:
        """Generate a plotting script using LLM.

        Delegates to ``_llm_generate_latex`` when the agent was configured
        with ``output_format="latex"``; otherwise prompts the LLM for a
        self-contained matplotlib script, strips thinking tags and markdown
        fences from the reply, and prepends the style preamble if the reply
        does not already reference matplotlib.
        """
        # LaTeX mode is handled by a dedicated generator.
        if self._output_format == "latex":
            return self._llm_generate_latex(
                fig_spec=fig_spec,
                chart_type=chart_type,
                condition_summaries=condition_summaries,
                metrics_summary=metrics_summary,
                metric_key=metric_key,
                width=width,
                height=height,
                critic_feedback=critic_feedback,
            )
        style_preamble = get_style_preamble(width_key=width_key)
        # NOTE(review): two rules below look garbled by text extraction —
        # "Print 'Saved: ' after saving" likely lost a path placeholder, and
        # "Do NOT include any or tags" likely lost angle-bracketed tag names.
        # TODO confirm against the original file.
        system_prompt = (
            "You are an expert scientific visualization programmer. "
            "Generate a standalone Python script that creates a publication-quality "
            "matplotlib chart.\n\n"
            "RULES:\n"
            "- The script must be completely self-contained (no external imports "
            "beyond matplotlib, numpy, seaborn)\n"
            "- All data values must be hardcoded in the script (no file I/O)\n"
            "- Use the provided style preamble at the top of the script\n"
            "- Output format: PNG at 300 DPI\n"
            "- Use colorblind-safe colors from the COLORS list\n"
            "- Include descriptive axis labels and title\n"
            "- Use constrained_layout=True in plt.subplots() — do NOT call fig.tight_layout()\n"
            "- Call fig.savefig() and plt.close(fig) at the end\n"
            "- Print 'Saved: ' after saving\n"
            "- NEVER embed caption, description, or subtitle text inside the figure "
            "using fig.text() or ax.text() for long descriptions. "
            "All captions are added by LaTeX \\caption{}\n"
            "- Place legends OUTSIDE the data area when possible. "
            "Use bbox_to_anchor=(1.02, 1) with loc='upper left' for legends "
            "that would overlap bars or data points\n"
            "- Do NOT include any or tags\n\n"
            "Return ONLY the Python script, no explanation."
        )
        # Build data context (truncated to avoid token overflow)
        data_context = {
            "conditions": list(condition_summaries.keys())[:10],
            "metric_key": metric_key,
        }
        # Add condition values, dropping timing metrics that would clutter
        # the prompt without improving the chart.
        for cond, cdata in list(condition_summaries.items())[:10]:
            if isinstance(cdata, dict):
                data_context[cond] = {
                    "metrics": {k: v for k, v in (cdata.get("metrics") or {}).items()
                                if not any(t in k.lower() for t in ["time", "elapsed", "runtime"])},
                    "ci95_low": cdata.get("ci95_low"),
                    "ci95_high": cdata.get("ci95_high"),
                }
        user_prompt = (
            f"Style preamble (paste at top of script):\n```python\n{style_preamble}\n```\n\n"
            f"Figure specification:\n{json.dumps(fig_spec, indent=2)}\n\n"
            f"Experiment data:\n{json.dumps(data_context, indent=2, default=str)}\n\n"
            f"Output path: {output_path}\n"
            f"Figure size: ({width}, {height})\n"
        )
        # On retry, surface the Critic's issue list so the LLM can fix them.
        if critic_feedback:
            user_prompt += (
                f"\n\nPREVIOUS ATTEMPT FAILED REVIEW. Fix these issues:\n"
                f"{json.dumps(critic_feedback.get('issues', []), indent=2)}\n"
            )
        raw = self._chat(system_prompt, user_prompt, max_tokens=4096, temperature=0.3)
        # Strip reasoning model thinking tags before parsing
        raw = strip_thinking_tags(raw)
        # Strip markdown fences
        script = self._strip_fences(raw)
        # Ensure style preamble is present
        if "matplotlib" not in script:
            script = style_preamble + "\n\n" + script
        return script
""" system_prompt = ( "You are an expert scientific visualization programmer specializing " "in LaTeX/TikZ/PGFPlots.\n\n" "Generate LaTeX code using PGFPlots that creates a publication-quality " "chart suitable for a top-tier AI conference paper.\n\n" "RULES:\n" "- Use pgfplots (version ≥ 1.18) with \\pgfplotsset{compat=1.18}\n" "- All data values must be hardcoded in the LaTeX source\n" "- Use the colorbrewer palette or viridis colormap\n" "- Include descriptive axis labels and title\n" "- Wrap in a figure environment with \\caption and \\label\n" "- Font sizes should match: title 12pt, labels 10pt, ticks 9pt\n" "- Width should be \\columnwidth or 0.48\\textwidth for single column\n" "- Do NOT include any or tags\n\n" "Return ONLY the LaTeX code, no explanation." ) # Build data context data_context = { "conditions": list(condition_summaries.keys())[:10], "metric_key": metric_key, } for cond, cdata in list(condition_summaries.items())[:10]: if isinstance(cdata, dict): data_context[cond] = { "metrics": {k: v for k, v in (cdata.get("metrics") or {}).items() if not any(t in k.lower() for t in ["time", "elapsed", "runtime"])}, } user_prompt = ( f"Chart type: {chart_type}\n" f"Figure specification:\n{json.dumps(fig_spec, indent=2)}\n\n" f"Experiment data:\n{json.dumps(data_context, indent=2, default=str)}\n\n" f"Figure dimensions: width={width}in, height={height}in\n" ) if critic_feedback: user_prompt += ( f"\n\nPREVIOUS ATTEMPT FAILED REVIEW. Fix these issues:\n" f"{json.dumps(critic_feedback.get('issues', []), indent=2)}\n" ) raw = self._chat(system_prompt, user_prompt, max_tokens=4096, temperature=0.3) # Strip reasoning model thinking tags before parsing raw = strip_thinking_tags(raw) # Strip markdown fences (```latex ... 
```) return self._strip_latex_fences(raw) @staticmethod def _strip_fences(text: str) -> str: """Remove markdown code fences from LLM output.""" m = re.search(r"```(?:python)?\s*\n(.*?)```", text, re.DOTALL) if m: return m.group(1).strip() return text.strip() @staticmethod def _strip_latex_fences(text: str) -> str: """Remove markdown code fences from LaTeX LLM output.""" m = re.search(r"```(?:latex|tex)?\s*\n(.*?)```", text, re.DOTALL) if m: return m.group(1).strip() return text.strip() ================================================ FILE: researchclaw/agents/figure_agent/critic.py ================================================ """Critic Agent — tri-modal review of rendered charts. Reviews each chart on three dimensions (inspired by PlotGen): 1. **Numerical accuracy** — verifies plotted values match source data 2. **Text correctness** — checks labels, legends, captions are accurate 3. **Visual quality** — LLM-based assessment of academic publication standards Outputs pass/fail per figure with specific fix suggestions. """ from __future__ import annotations import json import logging import re from typing import Any from researchclaw.agents.base import BaseAgent, AgentStepResult logger = logging.getLogger(__name__) class CriticAgent(BaseAgent): """Reviews rendered charts for accuracy and quality.""" name = "figure_critic" def __init__( self, llm: Any, *, strict_mode: bool = False, ) -> None: super().__init__(llm) self._strict = strict_mode # ------------------------------------------------------------------ # Public API # ------------------------------------------------------------------ def execute(self, context: dict[str, Any]) -> AgentStepResult: """Review all rendered figures. 
Context keys: rendered (list[dict]): From Renderer — each has 'figure_id', 'success', 'output_path', 'script_path', 'title', 'caption' scripts (list[dict]): From CodeGen — has the source scripts condition_summaries (dict): Source data for numerical verification metrics_summary (dict): Source metrics metric_key (str): Primary metric name """ try: rendered = context.get("rendered", []) scripts = context.get("scripts", []) condition_summaries = context.get("condition_summaries", {}) metrics_summary = context.get("metrics_summary", {}) metric_key = context.get("metric_key", "primary_metric") # Build script lookup script_map: dict[str, dict[str, Any]] = {} for s in scripts: # BUG-38: skip non-dict entries if not isinstance(s, dict): self.logger.warning("Skipping non-dict script entry: %s", type(s)) continue script_map[s.get("figure_id", "")] = s reviews: list[dict[str, Any]] = [] all_passed = True for fig in rendered: # BUG-38: skip non-dict entries if not isinstance(fig, dict): self.logger.warning("Skipping non-dict rendered entry: %s", type(fig)) continue figure_id = fig.get("figure_id", "unknown") if not fig.get("success"): reviews.append({ "figure_id": figure_id, "passed": False, "issues": [{"type": "render_failure", "message": fig.get("error", "Render failed")}], }) all_passed = False continue script_info = script_map.get(figure_id, {}) script_code = script_info.get("script", "") review = self._review_figure( figure_id=figure_id, script_code=script_code, fig_info=fig, condition_summaries=condition_summaries, metrics_summary=metrics_summary, metric_key=metric_key, ) reviews.append(review) if not review["passed"]: all_passed = False passed_count = sum(1 for r in reviews if r["passed"]) self.logger.info( "Critic review: %d/%d figures passed", passed_count, len(reviews), ) return self._make_result( success=True, data={ "reviews": reviews, "all_passed": all_passed, "passed_count": passed_count, "total_count": len(reviews), }, ) except Exception as exc: 
self.logger.error("Critic failed: %s", exc) return self._make_result(False, error=str(exc)) # ------------------------------------------------------------------ # Per-figure review # ------------------------------------------------------------------ def _review_figure( self, *, figure_id: str, script_code: str, fig_info: dict[str, Any], condition_summaries: dict[str, Any], metrics_summary: dict[str, Any], metric_key: str, ) -> dict[str, Any]: """Review a single rendered figure on four dimensions.""" issues: list[dict[str, str]] = [] # Dimension 1: Numerical accuracy num_issues = self._check_numerical_accuracy( script_code, condition_summaries, metric_key ) issues.extend(num_issues) # Dimension 2: Text correctness text_issues = self._check_text_correctness( script_code, fig_info ) issues.extend(text_issues) # Dimension 3: Visual quality (LLM-based) quality_issues = self._check_visual_quality( script_code, fig_info ) issues.extend(quality_issues) # Dimension 4: Rendered image validation (pixel-level) output_path = fig_info.get("output_path", "") if output_path: pixel_issues = self._check_rendered_image(output_path) issues.extend(pixel_issues) # Determine pass/fail critical_issues = [i for i in issues if i.get("severity") == "critical"] passed = len(critical_issues) == 0 if self._strict: passed = len(issues) == 0 return { "figure_id": figure_id, "passed": passed, "issues": issues, "issue_count": len(issues), } # ------------------------------------------------------------------ # Dimension 1: Numerical accuracy # ------------------------------------------------------------------ def _check_numerical_accuracy( self, script_code: str, condition_summaries: dict[str, Any], metric_key: str, ) -> list[dict[str, str]]: """Verify that data values in the script match source data.""" issues: list[dict[str, str]] = [] if not condition_summaries or not script_code: return issues # Extract numerical values from script script_numbers = set() for m in re.finditer(r"(\d+\.\d{2,})", 
script_code): try: script_numbers.add(round(float(m.group(1)), 4)) except ValueError: pass # Extract expected values from condition summaries expected_values = set() for cond, cdata in condition_summaries.items(): if not isinstance(cdata, dict): continue metrics = cdata.get("metrics", {}) for key in [metric_key, f"{metric_key}_mean"]: val = metrics.get(key) if val is not None: try: expected_values.add(round(float(val), 4)) except (ValueError, TypeError): pass if not expected_values: return issues # Check for degenerate data (all identical or all zero) vals_list = sorted(expected_values) if len(vals_list) >= 2 and len(set(round(v, 6) for v in vals_list)) <= 1: issues.append({ "type": "numerical_accuracy", "severity": "critical", "message": "All expected metric values are identical — chart will be uninformative", }) if all(v == 0 for v in vals_list): issues.append({ "type": "numerical_accuracy", "severity": "critical", "message": "All expected metric values are zero — chart will show no meaningful data", }) # Check if script contains the expected values found = expected_values & script_numbers missing = expected_values - script_numbers if missing and len(missing) > len(expected_values) / 2: issues.append({ "type": "numerical_accuracy", "severity": "critical", "message": ( f"Script may not contain correct data values. " f"Expected values like {list(missing)[:3]} not found in script. 
" f"Found values: {list(found)[:5]}" ), }) return issues # ------------------------------------------------------------------ # Dimension 2: Text correctness # ------------------------------------------------------------------ def _check_text_correctness( self, script_code: str, fig_info: dict[str, Any], ) -> list[dict[str, str]]: """Check labels, titles, and legends in the script.""" issues: list[dict[str, str]] = [] if not script_code: return issues # Check for axis labels has_xlabel = "set_xlabel" in script_code or "xlabel" in script_code has_ylabel = "set_ylabel" in script_code or "ylabel" in script_code has_title = "set_title" in script_code or ".title(" in script_code if not has_xlabel: issues.append({ "type": "text_correctness", "severity": "warning", "message": "Missing x-axis label", }) if not has_ylabel: issues.append({ "type": "text_correctness", "severity": "warning", "message": "Missing y-axis label", }) if not has_title: issues.append({ "type": "text_correctness", "severity": "warning", "message": "Missing chart title", }) # Check for savefig call if "savefig" not in script_code: issues.append({ "type": "text_correctness", "severity": "critical", "message": "Missing fig.savefig() call — chart will not be saved", }) # Check for plt.close to prevent memory leaks if "plt.close" not in script_code and "close()" not in script_code: issues.append({ "type": "text_correctness", "severity": "warning", "message": "Missing plt.close() — may cause memory leaks", }) return issues # ------------------------------------------------------------------ # Dimension 3: Visual quality (LLM review) # ------------------------------------------------------------------ def _check_visual_quality( self, script_code: str, fig_info: dict[str, Any], ) -> list[dict[str, str]]: """Use LLM to assess visual quality of the chart code.""" if not script_code: return [] system_prompt = ( "You are an expert reviewer of scientific figures for AI conferences " "(NeurIPS, ICML, ICLR). 
    def _check_visual_quality(
        self,
        script_code: str,
        fig_info: dict[str, Any],
    ) -> list[dict[str, str]]:
        """Use LLM to assess visual quality of the chart code.

        Sends the (truncated) script to the LLM via ``_chat_json`` and maps
        the returned issue list into this module's issue-dict shape; also
        flags a quality score below 4 as a critical issue.
        """
        if not script_code:
            return []
        system_prompt = (
            "You are an expert reviewer of scientific figures for AI conferences "
            "(NeurIPS, ICML, ICLR). Review the following matplotlib script and "
            "identify any quality issues.\n\n"
            "Check for:\n"
            "1. DPI setting (should be 300+ for publication)\n"
            "2. Font sizes (readable when printed: title ≥12pt, labels ≥10pt)\n"
            "3. Color choices (colorblind-safe, not default matplotlib)\n"
            "4. Layout (tight_layout or constrained_layout used)\n"
            "5. Grid and styling (clean, professional)\n"
            "6. Legend placement (visible, not overlapping data)\n"
            "7. Data representation (appropriate chart type for the data)\n\n"
            "Return a JSON object with:\n"
            "- quality_score: 1-10 (10 = publication ready)\n"
            "- issues: list of objects with 'type', 'severity' ('warning' or 'critical'), 'message'\n"
            "- If score >= 7 with no critical issues, the figure passes.\n"
        )
        # Truncate the script to bound prompt size.
        user_prompt = (
            f"Chart title: {fig_info.get('title', 'Unknown')}\n"
            f"Chart caption: {fig_info.get('caption', '')}\n\n"
            f"Script:\n```python\n{script_code[:3000]}\n```"
        )
        result = self._chat_json(system_prompt, user_prompt, max_tokens=2048)
        issues: list[dict[str, str]] = []
        # NOTE(review): assumes the LLM returns a numeric quality_score — a
        # non-numeric value would make the `< 4` comparison raise. TODO confirm
        # _chat_json normalizes this.
        quality_score = result.get("quality_score", 5)
        # Keep only well-formed issue entries that carry a message.
        for issue in result.get("issues", []):
            if isinstance(issue, dict) and issue.get("message"):
                issues.append({
                    "type": "visual_quality",
                    "severity": issue.get("severity", "warning"),
                    "message": str(issue["message"]),
                })
        # A very low overall score is itself a critical issue.
        if quality_score < 4:
            issues.append({
                "type": "visual_quality",
                "severity": "critical",
                "message": f"Overall quality score too low: {quality_score}/10",
            })
        return issues
Detects: - Near-blank images (>95% white) indicating degenerate/empty charts - Text/graphics touching image edges (possible label clipping) """ issues: list[dict[str, str]] = [] try: from PIL import Image import numpy as np img = Image.open(output_path).convert("RGB") arr = np.array(img) h, w, _ = arr.shape # Check 1: Near-blank image (>95% white pixels) white_mask = np.all(arr > 250, axis=2) white_ratio = float(np.mean(white_mask)) if white_ratio > 0.95: issues.append({ "type": "rendered_quality", "severity": "critical", "message": ( f"Image is {white_ratio:.0%} white — likely degenerate " f"or empty chart" ), }) # Check 2: Non-white pixels touching edges (possible clipping) margin = 3 # pixels for edge_name, edge_slice in [ ("top", arr[:margin, :]), ("bottom", arr[-margin:, :]), ("left", arr[:, :margin]), ("right", arr[:, -margin:]), ]: dark_mask = np.any(edge_slice < 80, axis=-1) dark_ratio = float(np.mean(dark_mask)) if dark_ratio > 0.05: issues.append({ "type": "rendered_quality", "severity": "warning", "message": ( f"Content touching {edge_name} edge ({dark_ratio:.0%} " f"dark pixels) — possible label/title clipping" ), }) except ImportError: logger.debug("PIL not available — skipping rendered image checks") except Exception as exc: logger.debug("Rendered image check failed: %s", exc) return issues ================================================ FILE: researchclaw/agents/figure_agent/decision.py ================================================ """Decision Agent — decides what figures are needed and how to generate them. Analyzes the paper draft/outline and experiment data to determine: - Which sections need figures - What TYPE of figure each section needs - Which generation BACKEND to use: * ``code`` — Code-to-Viz (Matplotlib/TikZ) for data-driven charts * ``image`` — Nano Banana (Gemini) for architecture/conceptual diagrams This agent acts as the "director" before the Planner/CodeGen/NanoBanana sub-agents execute. It does NOT generate any figures itself. 
References: - Visual ChatGPT (Wu et al., 2023): LLM as controller - Nano Banana: Gemini native image generation (google.genai) """ from __future__ import annotations import json import logging from typing import Any from researchclaw.agents.base import BaseAgent, AgentStepResult from researchclaw.utils.thinking_tags import strip_thinking_tags logger = logging.getLogger(__name__) def _safe_priority(val: object, default: int = 2) -> int: """Convert priority to int, clamped to 1-3.""" try: return max(1, min(3, int(val))) # type: ignore[arg-type] except (ValueError, TypeError): return default # --------------------------------------------------------------------------- # Figure categories # --------------------------------------------------------------------------- FIGURE_CATEGORY_DATA = "code" # data-driven → Matplotlib / TikZ FIGURE_CATEGORY_IMAGE = "image" # conceptual → Nano Banana (Gemini) _DECISION_SYSTEM_PROMPT = """\ You are an expert academic paper analyst. Your job is to analyze a research paper's content and decide which figures are needed. For each figure, decide: 1. **section** — Which section of the paper it belongs to (e.g. "Method", "Results", "Introduction", "Architecture") 2. **figure_type** — A descriptive type: - For data/experiment figures: "bar_comparison", "line_chart", "heatmap", "confusion_matrix", "training_curve", "ablation_chart", "scatter_plot" - For conceptual/architecture figures: "architecture_diagram", "method_flowchart", "pipeline_overview", "concept_illustration", "system_diagram", "attention_visualization", "comparison_illustration" 3. **backend** — Which generation backend: - "code" for data-driven charts (bar charts, line plots, heatmaps) → will be generated via Matplotlib/Seaborn or TikZ/PGFPlots - "image" for conceptual diagrams (architecture, pipeline, method) → will be generated via Gemini Nano Banana image generation 4. **description** — A detailed description of what the figure should show 5. 
    def __init__(
        self,
        llm: Any,
        *,
        min_figures: int = 3,
        max_figures: int = 10,
    ) -> None:
        """Initialize the decision agent with figure-count bounds.

        Args:
            llm: LLM client forwarded to :class:`BaseAgent`.
            min_figures: Lower bound on total figures to request; interpolated
                into the constraints line of the LLM prompt in ``_llm_decide``.
            max_figures: Upper bound on total figures, used the same way.
        """
        super().__init__(llm)
        # Figure-count bounds used when prompting the LLM.
        self._min_figures = min_figures
        self._max_figures = max_figures
Context keys: topic (str): Research topic hypothesis (str): Research hypothesis paper_draft (str): Current paper draft / outline (markdown) experiment_results (dict): Parsed experiment data (if any) condition_summaries (dict): Per-condition stats (if any) has_experiments (bool): Whether experiments were conducted """ topic = context.get("topic", "") hypothesis = context.get("hypothesis", "") paper_draft = context.get("paper_draft", "") has_experiments = context.get("has_experiments", True) experiment_results = context.get("experiment_results", {}) condition_summaries = context.get("condition_summaries", {}) # ── Try LLM-based decision ──────────────────────────────────── if self._llm is not None: try: decisions = self._llm_decide( topic=topic, hypothesis=hypothesis, paper_draft=paper_draft, has_experiments=has_experiments, experiment_results=experiment_results, condition_summaries=condition_summaries, ) # Enforce bounds decisions = self._enforce_bounds(decisions, has_experiments) return self._make_result( True, data={ "decisions": decisions, "code_figures": [ d for d in decisions if d["backend"] == "code" ], "image_figures": [ d for d in decisions if d["backend"] == "image" ], "total": len(decisions), }, ) except Exception as e: logger.warning("LLM decision failed, using heuristic: %s", e) # ── Fallback: heuristic decision ────────────────────────────── decisions = self._heuristic_decide( topic=topic, has_experiments=has_experiments, condition_summaries=condition_summaries, ) return self._make_result( True, data={ "decisions": decisions, "code_figures": [ d for d in decisions if d["backend"] == "code" ], "image_figures": [ d for d in decisions if d["backend"] == "image" ], "total": len(decisions), }, ) # ------------------------------------------------------------------ # LLM-based decision # ------------------------------------------------------------------ def _llm_decide( self, *, topic: str, hypothesis: str, paper_draft: str, has_experiments: bool, 
experiment_results: dict[str, Any], condition_summaries: dict[str, Any], ) -> list[dict[str, Any]]: """Ask LLM to analyze paper and decide on figures.""" # Build user context user_parts = [ f"Research topic: {topic}", f"Hypothesis: {hypothesis}", ] if paper_draft: # Truncate to avoid token overflow draft_preview = paper_draft[:4000] user_parts.append(f"\nPaper draft (preview):\n{draft_preview}") if has_experiments and condition_summaries: conditions_preview = json.dumps( {k: v for k, v in list(condition_summaries.items())[:8]}, indent=2, default=str, ) user_parts.append( f"\nExperiment conditions:\n{conditions_preview}" ) if has_experiments and experiment_results: metrics = list(experiment_results.keys())[:20] user_parts.append(f"\nAvailable metrics: {metrics}") user_parts.append( f"\nConstraints: Generate between {self._min_figures} " f"and {self._max_figures} figures total." ) user_prompt = "\n".join(user_parts) raw = self._chat( _DECISION_SYSTEM_PROMPT, user_prompt, max_tokens=2048, temperature=0.3, ) # Strip reasoning model thinking tags before JSON parsing raw = strip_thinking_tags(raw) # Parse JSON response return self._parse_decisions(raw) def _parse_decisions(self, raw: str) -> list[dict[str, Any]]: """Parse LLM response into decision list.""" import re # Strip markdown fences m = re.search(r"```(?:json)?\s*\n(.*?)```", raw, re.DOTALL) text = m.group(1).strip() if m else raw.strip() # Find JSON array start = text.find("[") end = text.rfind("]") if start == -1 or end == -1: raise ValueError("No JSON array found in LLM response") decisions_raw = json.loads(text[start : end + 1]) # Validate and normalize decisions = [] for d in decisions_raw: if not isinstance(d, dict): continue decision = { "section": str(d.get("section", "Results")), "figure_type": str(d.get("figure_type", "bar_comparison")), "backend": str(d.get("backend", "code")), "description": str(d.get("description", "")), "priority": _safe_priority(d.get("priority", 2)), } # Validate backend if 
decision["backend"] not in ("code", "image"): # Auto-assign based on figure_type decision["backend"] = self._infer_backend( decision["figure_type"] ) decisions.append(decision) return decisions # ------------------------------------------------------------------ # Heuristic fallback # ------------------------------------------------------------------ def _heuristic_decide( self, *, topic: str, has_experiments: bool, condition_summaries: dict[str, Any], ) -> list[dict[str, Any]]: """Generate figure decisions without LLM (rule-based fallback).""" decisions: list[dict[str, Any]] = [] # Always suggest an architecture/method diagram decisions.append({ "section": "Method", "figure_type": "architecture_diagram", "backend": "image", "description": ( f"Architecture overview diagram for the proposed method " f"in the paper about: {topic[:100]}" ), "priority": 1, }) if has_experiments: # Main results comparison n_conditions = len(condition_summaries) decisions.append({ "section": "Results", "figure_type": "bar_comparison", "backend": "code", "description": ( f"Bar chart comparing main metric across " f"{n_conditions} experimental conditions" ), "priority": 1, }) # Training/convergence curve decisions.append({ "section": "Results", "figure_type": "training_curve", "backend": "code", "description": "Training convergence curves with loss/metric over epochs", "priority": 2, }) # Ablation study if n_conditions >= 4: decisions.append({ "section": "Results", "figure_type": "bar_comparison", "backend": "code", "description": "Ablation study showing contribution of each component", "priority": 2, }) # Pipeline/method flowchart decisions.append({ "section": "Method", "figure_type": "pipeline_overview", "backend": "image", "description": ( f"Step-by-step pipeline flowchart showing the method's " f"workflow for: {topic[:100]}" ), "priority": 2, }) return decisions[:self._max_figures] # ------------------------------------------------------------------ # Helpers # 
------------------------------------------------------------------ @staticmethod def _infer_backend(figure_type: str) -> str: """Infer generation backend from figure type.""" code_types = { "bar_comparison", "line_chart", "heatmap", "confusion_matrix", "training_curve", "ablation_chart", "scatter_plot", "line_multi", "grouped_bar", "loss_curve", } if figure_type in code_types: return "code" return "image" def _enforce_bounds( self, decisions: list[dict[str, Any]], has_experiments: bool, ) -> list[dict[str, Any]]: """Enforce min/max figure counts and required categories.""" # Sort by priority (1 = highest) decisions.sort(key=lambda d: d.get("priority", 2)) # Ensure at least one architecture figure has_image = any(d["backend"] == "image" for d in decisions) if not has_image: decisions.insert(0, { "section": "Method", "figure_type": "architecture_diagram", "backend": "image", "description": "Model architecture overview", "priority": 1, }) # Ensure at least one data figure if experiments exist if has_experiments: has_code = any(d["backend"] == "code" for d in decisions) if not has_code: decisions.append({ "section": "Results", "figure_type": "bar_comparison", "backend": "code", "description": "Main results comparison", "priority": 1, }) # Enforce bounds if len(decisions) < self._min_figures: # Pad with lower-priority suggestions while len(decisions) < self._min_figures: decisions.append({ "section": "Discussion", "figure_type": "concept_illustration", "backend": "image", "description": "Conceptual illustration of key findings", "priority": 3, }) return decisions[:self._max_figures] ================================================ FILE: researchclaw/agents/figure_agent/integrator.py ================================================ """Integrator Agent — determines figure placement in the paper. 
Maps each rendered figure to the correct paper section, generates markdown image references with captions, and produces a ``figure_manifest.json`` that downstream stages use for paper embedding. """ from __future__ import annotations import json import logging from pathlib import Path from typing import Any from researchclaw.agents.base import BaseAgent, AgentStepResult logger = logging.getLogger(__name__) # Mapping from figure section → paper section heading _SECTION_MAP = { "method": "Method", "methods": "Method", "methodology": "Method", "architecture": "Method", "results": "Results", "experiment": "Results", "experiments": "Results", "analysis": "Analysis", "discussion": "Discussion", "ablation": "Results", "introduction": "Introduction", } class IntegratorAgent(BaseAgent): """Determines figure placement and generates paper references.""" name = "figure_integrator" def __init__(self, llm: Any) -> None: super().__init__(llm) # ------------------------------------------------------------------ # Public API # ------------------------------------------------------------------ def execute(self, context: dict[str, Any]) -> AgentStepResult: """Generate figure manifest and markdown references. 
Context keys: rendered (list[dict]): Successfully rendered figures with 'figure_id', 'output_path', 'title', 'caption', 'section' topic (str): Research topic output_dir (str|Path): Charts directory """ try: rendered = context.get("rendered", []) topic = context.get("topic", "") output_dir = Path(context.get("output_dir", "charts")) # Filter to successfully rendered figures only successful = [r for r in rendered if r.get("success")] if not successful: return self._make_result( True, data={"manifest": [], "markdown_refs": "", "figure_count": 0}, ) # Build manifest manifest = self._build_manifest(successful, output_dir) # Generate markdown references for paper embedding markdown_refs = self._generate_markdown_refs(manifest) # Generate figure descriptions for paper writing prompt figure_descriptions = self._generate_descriptions(manifest) # Save manifest manifest_path = output_dir / "figure_manifest.json" manifest_path.write_text( json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8", ) self.logger.info( "Generated figure manifest: %d figures", len(manifest) ) return self._make_result( True, data={ "manifest": manifest, "markdown_refs": markdown_refs, "figure_descriptions": figure_descriptions, "figure_count": len(manifest), "manifest_path": str(manifest_path), }, ) except Exception as exc: self.logger.error("Integrator failed: %s", exc) return self._make_result(False, error=str(exc)) # ------------------------------------------------------------------ # Manifest building # ------------------------------------------------------------------ def _build_manifest( self, rendered: list[dict[str, Any]], output_dir: Path, ) -> list[dict[str, Any]]: """Build a structured manifest of all figures.""" manifest: list[dict[str, Any]] = [] # Sort by priority: priority 1 (must-have) first sorted_figs = sorted( rendered, key=lambda f: ( self._section_order(f.get("section", "results")), f.get("priority", 2), ), ) for i, fig in enumerate(sorted_figs, 1): figure_id = 
fig.get("figure_id", f"fig_{i}") output_path = fig.get("output_path", "") section = fig.get("section", "results") paper_section = _SECTION_MAP.get(section.lower(), "Results") # Relative path for paper embedding (charts/filename.png) if output_path: rel_path = f"charts/{Path(output_path).name}" else: rel_path = f"charts/{figure_id}.png" entry = { "figure_number": i, "figure_id": figure_id, "file_path": rel_path, "absolute_path": output_path, "title": fig.get("title", f"Figure {i}"), "caption": fig.get("caption", ""), "paper_section": paper_section, "width": fig.get("width", "single_column"), "label": f"fig:{figure_id}", "script_path": fig.get("script_path", ""), } manifest.append(entry) return manifest @staticmethod def _section_order(section: str) -> int: """Order sections for figure numbering.""" order = { "introduction": 0, "method": 1, "methods": 1, "methodology": 1, "architecture": 1, "results": 2, "experiment": 2, "experiments": 2, "ablation": 3, "analysis": 4, "discussion": 5, } return order.get(section.lower(), 3) # ------------------------------------------------------------------ # Markdown reference generation # ------------------------------------------------------------------ def _generate_markdown_refs( self, manifest: list[dict[str, Any]] ) -> str: """Generate markdown image references for paper embedding.""" refs: list[str] = [] for entry in manifest: fig_num = entry["figure_number"] file_path = entry["file_path"] caption = entry.get("caption") or entry.get("title", f"Figure {fig_num}") refs.append( f"![Figure {fig_num}: {caption}]({file_path})" ) return "\n\n".join(refs) # ------------------------------------------------------------------ # Description generation for paper writing prompt # ------------------------------------------------------------------ def _generate_descriptions( self, manifest: list[dict[str, Any]] ) -> str: """Generate figure descriptions for injection into paper writing prompt.""" parts: list[str] = [] parts.append("## 
AVAILABLE FIGURES (embed in the paper)") parts.append( "The following figures were generated from actual experiment data. " "Reference them in the appropriate paper sections using markdown " "image syntax: `![Caption](charts/filename.png)`\n" ) for entry in manifest: fig_num = entry["figure_number"] file_path = entry["file_path"] title = entry.get("title", "") caption = entry.get("caption", "") section = entry.get("paper_section", "Results") parts.append( f"**Figure {fig_num}** (`{file_path}`) — {title}\n" f" Caption: {caption}\n" f" Place in: **{section}** section\n" ) parts.append( "\nFor each figure referenced, write a descriptive caption and " "discuss what the figure shows in 2-3 sentences.\n" ) return "\n".join(parts) ================================================ FILE: researchclaw/agents/figure_agent/nano_banana.py ================================================ """Nano Banana Agent — generates conceptual/architectural images via Gemini. Uses Google's Gemini native image generation (Nano Banana) to create non-data figures such as: - Model architecture diagrams - Method pipeline flowcharts - System overview illustrations - Concept/intuition diagrams These figures complement the Code-to-Viz agent which handles data-driven charts (bar plots, line charts, heatmaps, etc.). Requires: ``pip install google-genai Pillow`` API key: Set ``GEMINI_API_KEY`` or ``GOOGLE_API_KEY`` env var, or pass via config. 
References: - Nano Banana docs: https://ai.google.dev/gemini-api/docs/image-generation - Gemini 3.1 Flash Image Preview: high-efficiency, high-volume - Gemini 3 Pro Image Preview: professional asset production """ from __future__ import annotations import base64 import json import logging import os import re import urllib.error import urllib.request from pathlib import Path from typing import Any from researchclaw.agents.base import BaseAgent, AgentStepResult from researchclaw.utils.sanitize import sanitize_figure_id logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Default config # --------------------------------------------------------------------------- _DEFAULT_MODEL = "gemini-2.5-flash-image" _FALLBACK_MODELS = [ "gemini-3.1-flash-image-preview", "gemini-3-pro-image-preview", "gemini-2.5-flash-image", ] _ACADEMIC_STYLE_PROMPT = ( "The image should be in a clean, professional ACADEMIC style suitable " "for a top-tier AI/ML research paper (NeurIPS, ICML, ICLR). " "Use a white or light background. Use clear labels and annotations. " "Avoid excessive decoration. Use a consistent color palette. " "Text should be legible at column width (~3.25 inches). " "Style: technical illustration, vector-like, clean lines." ) class NanoBananaAgent(BaseAgent): """Generates conceptual/architectural figures using Gemini image generation. This agent uses the Gemini API (Nano Banana) to create publication-quality conceptual figures that complement data-driven charts from Code-to-Viz. 
""" name = "nano_banana" def __init__( self, llm: Any, *, gemini_api_key: str | None = None, model: str = _DEFAULT_MODEL, output_dir: str | Path | None = None, aspect_ratio: str = "16:9", use_sdk: bool | None = None, # None = auto-detect ) -> None: super().__init__(llm) self._api_key = ( gemini_api_key or os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY") or "" ) self._model = model self._output_dir = Path(output_dir) if output_dir else None self._aspect_ratio = aspect_ratio # Detect SDK availability self._use_sdk = use_sdk if self._use_sdk is None: try: import google.genai # noqa: F401 self._use_sdk = True except ImportError: self._use_sdk = False if not self._api_key: logger.warning( "No Gemini API key found. Set GEMINI_API_KEY or " "GOOGLE_API_KEY env var for Nano Banana image generation." ) # ------------------------------------------------------------------ # Public API # ------------------------------------------------------------------ def execute(self, context: dict[str, Any]) -> AgentStepResult: """Generate images for figure decisions marked as 'image' backend. 
Context keys: image_figures (list[dict]): Decisions from FigureDecisionAgent with backend="image" topic (str): Research topic output_dir (str|Path): Output directory for images """ image_figures = context.get("image_figures", []) topic = context.get("topic", "") output_dir = Path( context.get("output_dir", self._output_dir or "charts") ) output_dir.mkdir(parents=True, exist_ok=True) if not image_figures: # BUG-DA8-10: Use _make_result() to properly track LLM counters return self._make_result( success=True, data={"generated": [], "count": 0}, ) if not self._api_key: return self._make_result( success=False, error="No Gemini API key configured for Nano Banana", data={"generated": [], "count": 0}, ) generated: list[dict[str, Any]] = [] for i, fig in enumerate(image_figures): figure_id = sanitize_figure_id( fig.get("figure_id", f"conceptual_{i + 1}") ) description = fig.get("description", "") figure_type = fig.get("figure_type", "architecture_diagram") section = fig.get("section", "Method") # Build prompt for Gemini prompt = self._build_prompt( description=description, figure_type=figure_type, section=section, topic=topic, ) # Generate image output_path = output_dir / f"{figure_id}.png" try: success = self._generate_image( prompt=prompt, output_path=output_path, ) if success: generated.append({ "figure_id": figure_id, "figure_type": figure_type, "section": section, "description": description, "output_path": str(output_path), "path": str(output_path), "title": description[:80] if description else f"Figure {figure_id}", "caption": description or "", "prompt": prompt, "success": True, "backend": "nano_banana", }) logger.info( "Generated %s: %s", figure_id, output_path ) else: generated.append({ "figure_id": figure_id, "success": False, "error": "Generation returned no image", "backend": "nano_banana", }) except Exception as e: logger.warning( "Failed to generate %s via Nano Banana: %s", figure_id, e, ) generated.append({ "figure_id": figure_id, "success": False, "error": 
str(e), "backend": "nano_banana", }) success_count = sum(1 for g in generated if g.get("success")) return self._make_result( success=success_count > 0, data={ "generated": generated, "count": success_count, "total_attempted": len(image_figures), }, ) # ------------------------------------------------------------------ # Prompt building # ------------------------------------------------------------------ def _build_prompt( self, *, description: str, figure_type: str, section: str, topic: str, ) -> str: """Build a Gemini prompt for academic figure generation.""" type_guidelines = self._get_type_guidelines(figure_type) prompt = ( f"Create a professional academic figure for the '{section}' " f"section of a research paper about: {topic}\n\n" f"Figure description: {description}\n\n" f"Style guidelines:\n{type_guidelines}\n\n" f"{_ACADEMIC_STYLE_PROMPT}\n\n" f"The figure must be publication-ready for a top-tier " f"AI/ML conference paper." ) return prompt @staticmethod def _get_type_guidelines(figure_type: str) -> str: """Get specific guidelines for each figure type.""" guidelines = { "architecture_diagram": ( "- Show the model layers, connections, and data flow\n" "- Use boxes for layers/modules with clear labels\n" "- Use arrows to show data flow direction\n" "- Include dimensions/shapes where relevant\n" "- Group related components with dashed borders\n" "- Use a consistent left-to-right or top-to-bottom flow" ), "method_flowchart": ( "- Show the step-by-step process flow\n" "- Use rounded rectangles for processes\n" "- Use diamonds for decision points\n" "- Use arrows with labels for transitions\n" "- Number the steps if sequential\n" "- Highlight key/novel steps with color" ), "pipeline_overview": ( "- Show the full pipeline from input to output\n" "- Use distinct visual blocks for each stage\n" "- Include example inputs/outputs at each stage\n" "- Use consistent arrow style for data flow\n" "- Label each stage clearly\n" "- Show parallel/branching paths if 
applicable" ), "concept_illustration": ( "- Illustrate the key concept or intuition\n" "- Use simple, clean diagrams\n" "- Include before/after or problem/solution comparison\n" "- Use visual metaphors where appropriate\n" "- Keep it simple enough to understand at a glance" ), "system_diagram": ( "- Show the overall system architecture\n" "- Include all major components and their interactions\n" "- Use standard UML-like notation where appropriate\n" "- Show data stores, APIs, and external services\n" "- Include protocols/data formats for connections" ), "attention_visualization": ( "- Show attention weights or patterns\n" "- Use heatmap-style coloring for attention scores\n" "- Include input/output sequences\n" "- Label attention heads if multi-head attention\n" "- Use clear color scale legend" ), "comparison_illustration": ( "- Show side-by-side comparison of approaches\n" "- Highlight key differences with visual cues\n" "- Use consistent styling across comparisons\n" "- Include labels for each approach\n" "- Use checkmarks/crosses for feature comparison" ), } return guidelines.get(figure_type, guidelines["concept_illustration"]) # ------------------------------------------------------------------ # Image generation backends # ------------------------------------------------------------------ def _generate_image( self, prompt: str, output_path: Path, ) -> bool: """Generate image via Gemini API. Tries google-genai SDK first, falls back to REST API. 
""" if self._use_sdk: return self._generate_via_sdk(prompt, output_path) return self._generate_via_rest(prompt, output_path) def _generate_via_sdk( self, prompt: str, output_path: Path, ) -> bool: """Generate image using google-genai SDK.""" try: from google import genai from google.genai import types client = genai.Client(api_key=self._api_key) response = client.models.generate_content( model=self._model, contents=[prompt], config=types.GenerateContentConfig( response_modalities=["IMAGE"], image_config=types.ImageConfig( aspect_ratio=self._aspect_ratio, ), ), ) for part in response.parts: if part.inline_data is not None: image = part.as_image() image.save(str(output_path)) return True logger.warning("Gemini SDK returned no image data") return False except ImportError: logger.warning("google-genai SDK not installed, falling back to REST") self._use_sdk = False return self._generate_via_rest(prompt, output_path) except Exception as e: logger.warning("Gemini SDK error: %s, falling back to REST", e) return self._generate_via_rest(prompt, output_path) def _generate_via_rest( self, prompt: str, output_path: Path, ) -> bool: """Generate image using Gemini REST API (no SDK dependency).""" # Validate model name to prevent URL injection if not re.fullmatch(r"[a-zA-Z0-9._-]+", self._model): logger.error("Invalid Gemini model name: %r", self._model) return False url = ( f"https://generativelanguage.googleapis.com/v1beta/" f"models/{self._model}:generateContent" ) payload = { "contents": [{"parts": [{"text": prompt}]}], "generationConfig": { "responseModalities": ["IMAGE"], "imageConfig": { "aspectRatio": self._aspect_ratio, }, }, } data = json.dumps(payload).encode("utf-8") req = urllib.request.Request( url, data=data, headers={ "Content-Type": "application/json", "x-goog-api-key": self._api_key, }, method="POST", ) try: with urllib.request.urlopen(req, timeout=120) as resp: result = json.loads(resp.read().decode("utf-8")) # Extract image from response candidates = 
result.get("candidates", []) if not candidates: logger.warning("Gemini REST API returned no candidates") return False parts = candidates[0].get("content", {}).get("parts", []) for part in parts: inline_data = part.get("inlineData", {}) if inline_data.get("mimeType", "").startswith("image/"): image_bytes = base64.b64decode(inline_data["data"]) output_path.write_bytes(image_bytes) return True logger.warning("Gemini REST API returned no image parts") return False except urllib.error.HTTPError as e: body = e.read().decode("utf-8", errors="replace")[:500] logger.warning("Gemini REST API error %d: %s", e.code, body) return False except Exception as e: logger.warning("Gemini REST API error: %s", e) return False ================================================ FILE: researchclaw/agents/figure_agent/orchestrator.py ================================================ """FigureAgent Orchestrator — coordinates the figure generation sub-agents. Flow: Decision Agent → analyzes paper → decides what figures are needed ├── code figures → Planner → CodeGen → Renderer → Critic → retry └── image figures → Nano Banana (Gemini image generation) → Integrator (combines all figures into manifest) Produces a ``FigurePlan`` consumed by paper draft and export stages. 
""" from __future__ import annotations import json import logging import time from dataclasses import dataclass, field from pathlib import Path from typing import Any from researchclaw.agents.base import AgentOrchestrator from researchclaw.agents.figure_agent.codegen import CodeGenAgent from researchclaw.agents.figure_agent.critic import CriticAgent from researchclaw.agents.figure_agent.decision import FigureDecisionAgent from researchclaw.agents.figure_agent.integrator import IntegratorAgent from researchclaw.agents.figure_agent.nano_banana import NanoBananaAgent from researchclaw.agents.figure_agent.planner import PlannerAgent from researchclaw.agents.figure_agent.renderer import RendererAgent logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- @dataclass(frozen=True) class FigureAgentConfig: """Configuration for the FigureAgent system.""" enabled: bool = True # Planner min_figures: int = 3 max_figures: int = 8 # Orchestrator max_iterations: int = 3 # max CodeGen→Renderer→Critic retry loops # Renderer security render_timeout_sec: int = 30 use_docker: bool | None = None # None = auto-detect docker_image: str = "researchclaw/experiment:latest" # Code generation output_format: str = "python" # "python" or "latex" # Nano Banana (Gemini image generation) gemini_api_key: str = "" # or set GEMINI_API_KEY env var gemini_model: str = "gemini-2.5-flash-image" nano_banana_enabled: bool = True # enable/disable image generation # Critic strict_mode: bool = False # if True, any issue = fail # Output dpi: int = 300 # --------------------------------------------------------------------------- # Output data structure # --------------------------------------------------------------------------- @dataclass class FigurePlan: """Final output from the FigureAgent system. 
Consumed by: - Paper draft stage (figure_descriptions for writing prompt) - Paper export stage (manifest for LaTeX figure embedding) - Charts directory (scripts + rendered images) """ # Figure manifest (list of figure metadata dicts) manifest: list[dict[str, Any]] = field(default_factory=list) # Generated references markdown_refs: str = "" figure_descriptions: str = "" # Paths output_dir: str = "" manifest_path: str = "" # Stats figure_count: int = 0 passed_count: int = 0 total_llm_calls: int = 0 total_tokens: int = 0 elapsed_sec: float = 0.0 def to_dict(self) -> dict[str, Any]: """Serialize to a JSON-safe dict.""" return { "manifest": self.manifest, "markdown_refs": self.markdown_refs, "figure_descriptions": self.figure_descriptions, "output_dir": self.output_dir, "manifest_path": self.manifest_path, "figure_count": self.figure_count, "passed_count": self.passed_count, "total_llm_calls": self.total_llm_calls, "total_tokens": self.total_tokens, "elapsed_sec": self.elapsed_sec, } def get_chart_files(self) -> list[str]: """Return list of chart filenames from manifest.""" return [ Path(entry["file_path"]).name for entry in self.manifest if entry.get("file_path") ] # --------------------------------------------------------------------------- # Orchestrator # --------------------------------------------------------------------------- class FigureOrchestrator(AgentOrchestrator): """Coordinates Decision → (Code-to-Viz | Nano Banana) → Integrator.""" def __init__( self, llm: Any, config: FigureAgentConfig | None = None, *, stage_dir: Path | None = None, ) -> None: cfg = config or FigureAgentConfig() super().__init__(llm, max_iterations=cfg.max_iterations) self._config = cfg self._stage_dir = stage_dir # Decision agent self._decision = FigureDecisionAgent( llm, min_figures=cfg.min_figures, max_figures=cfg.max_figures, ) # Code-to-Viz sub-agents (for data-driven charts) self._planner = PlannerAgent( llm, min_figures=cfg.min_figures, max_figures=cfg.max_figures, ) # BUG-60: 
Pass use_docker so CodeGen generates container-aware paths self._codegen = CodeGenAgent( llm, output_format=cfg.output_format, use_docker=bool(cfg.use_docker) if cfg.use_docker is not None else False, ) self._renderer = RendererAgent( llm, timeout_sec=cfg.render_timeout_sec, use_docker=cfg.use_docker, docker_image=cfg.docker_image, ) self._critic = CriticAgent( llm, strict_mode=cfg.strict_mode, ) # Nano Banana agent (for conceptual/architectural images) self._nano_banana: NanoBananaAgent | None = None if cfg.nano_banana_enabled: self._nano_banana = NanoBananaAgent( llm, gemini_api_key=cfg.gemini_api_key or None, model=cfg.gemini_model, ) self._integrator = IntegratorAgent(llm) def _save_artifact(self, name: str, data: Any) -> None: """Save intermediate artifact to stage directory.""" if self._stage_dir is None: return self._stage_dir.mkdir(parents=True, exist_ok=True) path = self._stage_dir / name if isinstance(data, str): path.write_text(data, encoding="utf-8") else: path.write_text( json.dumps(data, indent=2, ensure_ascii=False, default=str), encoding="utf-8", ) def orchestrate(self, context: dict[str, Any]) -> FigurePlan: """Run the full figure generation pipeline. Context keys: experiment_results (dict): Parsed results.json condition_summaries (dict): Per-condition aggregated stats metrics_summary (dict): Per-metric aggregated stats metric_key (str): Primary metric name topic (str): Research topic hypothesis (str): Research hypothesis paper_draft (str): Current paper draft (for decision agent) output_dir (str|Path): Directory for chart output """ t0 = time.monotonic() topic = context.get("topic", "") output_dir = Path(context.get("output_dir", "charts")) output_dir.mkdir(parents=True, exist_ok=True) self.logger.info("FigureAgent starting for: %s", topic[:80]) plan = FigurePlan(output_dir=str(output_dir)) # ── Phase 0: Decision — what figures are needed? 
def orchestrate(self, context: dict[str, Any]) -> FigurePlan:
    """Run the full figure generation pipeline.

    Three phases: (0) a decision agent splits requested figures into
    code-generated data charts vs. image-generated conceptual figures,
    (A/B) each backend renders its share, (C) an integrator merges all
    rendered figures into the paper manifest.

    Context keys:
        experiment_results (dict): Parsed results.json
        condition_summaries (dict): Per-condition aggregated stats
        metrics_summary (dict): Per-metric aggregated stats
        metric_key (str): Primary metric name
        topic (str): Research topic
        hypothesis (str): Research hypothesis
        paper_draft (str): Current paper draft (for decision agent)
        output_dir (str|Path): Directory for chart output
    """
    t0 = time.monotonic()
    topic = context.get("topic", "")
    output_dir = Path(context.get("output_dir", "charts"))
    output_dir.mkdir(parents=True, exist_ok=True)
    # topic[:80] keeps the log line bounded for long topics
    self.logger.info("FigureAgent starting for: %s", topic[:80])
    plan = FigurePlan(output_dir=str(output_dir))
    # ── Phase 0: Decision — what figures are needed? ──────────────
    self.logger.info("Phase 0: Deciding what figures are needed")
    decision_result = self._decision.execute({
        "topic": topic,
        "hypothesis": context.get("hypothesis", ""),
        "paper_draft": context.get("paper_draft", ""),
        "has_experiments": bool(context.get("experiment_results")),
        "experiment_results": context.get("experiment_results", {}),
        "condition_summaries": context.get("condition_summaries", {}),
    })
    # _accumulate feeds self.total_llm_calls / self.total_tokens (read below)
    self._accumulate(decision_result)
    self._save_artifact("figure_decisions.json", decision_result.data)
    code_figures = decision_result.data.get("code_figures", [])
    image_figures = decision_result.data.get("image_figures", [])
    self.logger.info(
        "Decision: %d code figures, %d image figures",
        len(code_figures), len(image_figures),
    )
    # Track all rendered figures (from both backends)
    all_rendered: list[dict[str, Any]] = []
    # ── Phase A: Code-to-Viz for data figures ─────────────────────
    if code_figures:
        rendered_code = self._run_code_pipeline(
            code_figures=code_figures,
            context=context,
            output_dir=output_dir,
        )
        all_rendered.extend(rendered_code)
    # ── Phase B: Nano Banana for image figures ────────────────────
    if image_figures and self._nano_banana is not None:
        rendered_images = self._run_nano_banana(
            image_figures=image_figures,
            context=context,
            output_dir=output_dir,
        )
        all_rendered.extend(rendered_images)
    elif image_figures:
        # Decision asked for images but the backend is disabled — warn
        # instead of failing the whole pipeline.
        self.logger.warning(
            "Nano Banana disabled — skipping %d image figures",
            len(image_figures),
        )
    # ── Phase C: Integrate all figures ────────────────────────────
    self.logger.info(
        "Phase C: Integrating %d figures into paper", len(all_rendered)
    )
    integrate_result = self._integrator.execute({
        "rendered": all_rendered,
        "topic": topic,
        "output_dir": str(output_dir),
    })
    self._accumulate(integrate_result)
    # ── Finalize ─────────────────────────────────────────────────
    plan.manifest = integrate_result.data.get("manifest", [])
    plan.markdown_refs = integrate_result.data.get("markdown_refs", "")
    plan.figure_descriptions = integrate_result.data.get("figure_descriptions", "")
    plan.manifest_path = integrate_result.data.get("manifest_path", "")
    plan.figure_count = integrate_result.data.get("figure_count", 0)
    # passed_count counts render successes, independent of integrator output
    plan.passed_count = sum(
        1 for r in all_rendered if r.get("success")
    )
    plan.total_llm_calls = self.total_llm_calls
    plan.total_tokens = self.total_tokens
    plan.elapsed_sec = time.monotonic() - t0
    # Save final plan
    self._save_artifact("figure_plan_final.json", plan.to_dict())
    self.logger.info(
        "FigureAgent complete: %d figures (%d code + %d image), "
        "%d passed, %d LLM calls, %.1fs",
        plan.figure_count, len(code_figures), len(image_figures),
        plan.passed_count, plan.total_llm_calls, plan.elapsed_sec,
    )
    return plan
def _run_code_pipeline(
    self,
    code_figures: list[dict[str, Any]],
    context: dict[str, Any],
    output_dir: Path,
) -> list[dict[str, Any]]:
    """Run Planner → CodeGen → Renderer → Critic for data figures.

    Iterates up to ``self.max_iterations`` times; each pass regenerates
    only the figures the critic rejected in the previous pass, merging
    fresh renders over previously passed ones.

    NOTE(review): ``code_figures`` (from the decision agent) is accepted
    but never read here — the planner re-plans from the raw experiment
    data instead. Confirm whether that is intended.
    """
    # Phase 1: Plan (uses experiment data)
    self.logger.info("Phase A1: Planning data figures")
    plan_result = self._planner.execute({
        "experiment_results": context.get("experiment_results", {}),
        "topic": context.get("topic", ""),
        "hypothesis": context.get("hypothesis", ""),
        "conditions": context.get("conditions", []),
        "metric_key": context.get("metric_key", "primary_metric"),
        "metrics_summary": context.get("metrics_summary", {}),
        "condition_summaries": context.get("condition_summaries", {}),
    })
    self._accumulate(plan_result)
    if not plan_result.success:
        self.logger.warning("Planning failed: %s", plan_result.error)
        return []
    figures = plan_result.data.get("figures", [])
    self._save_artifact("figure_plan_code.json", figures)
    self.logger.info("Planned %d data figures", len(figures))
    # Phase 2+3+4: CodeGen → Render → Critic (with retry)
    critic_feedback: list[dict[str, Any]] = []
    final_rendered: list[dict[str, Any]] = []
    for iteration in range(self.max_iterations):
        self.logger.info(
            "Phase A2: CodeGen (iteration %d/%d)",
            iteration + 1, self.max_iterations,
        )
        # CodeGen — critic_feedback from the previous pass steers revisions
        codegen_result = self._codegen.execute({
            "figures": figures,
            "experiment_results": context.get("experiment_results", {}),
            "condition_summaries": context.get("condition_summaries", {}),
            "metrics_summary": context.get("metrics_summary", {}),
            "metric_key": context.get("metric_key", "primary_metric"),
            "output_dir": str(output_dir),
            "critic_feedback": critic_feedback,
        })
        self._accumulate(codegen_result)
        if not codegen_result.success:
            self.logger.warning("CodeGen failed: %s", codegen_result.error)
            continue
        scripts = codegen_result.data.get("scripts", [])
        # Drop the (large) script source from the saved artifact
        self._save_artifact(f"scripts_{iteration}.json", [
            {k: v for k, v in s.items() if k != "script"}
            for s in scripts
        ])
        # Render
        self.logger.info(
            "Phase A3: Rendering (iteration %d/%d)",
            iteration + 1, self.max_iterations,
        )
        render_result = self._renderer.execute({
            "scripts": scripts,
            "output_dir": str(output_dir),
        })
        self._accumulate(render_result)
        if not render_result.success:
            self.logger.warning("Rendering failed: %s", render_result.error)
            continue
        rendered = render_result.data.get("rendered", [])
        # Merge newly rendered figures with previously passed figures
        # (on retries, only failed figures are re-rendered)
        if iteration == 0:
            final_rendered = rendered
        else:
            # Replace entries for re-rendered figures, keep previously passed ones
            re_rendered_ids = {r.get("figure_id") for r in rendered}
            final_rendered = [
                r for r in final_rendered
                if r.get("figure_id") not in re_rendered_ids
            ] + rendered
        # Critic
        self.logger.info(
            "Phase A4: Critic review (iteration %d/%d)",
            iteration + 1, self.max_iterations,
        )
        critic_result = self._critic.execute({
            "rendered": rendered,
            "scripts": scripts,
            "condition_summaries": context.get("condition_summaries", {}),
            "metrics_summary": context.get("metrics_summary", {}),
            "metric_key": context.get("metric_key", "primary_metric"),
        })
        self._accumulate(critic_result)
        reviews = critic_result.data.get("reviews", [])
        all_passed = critic_result.data.get("all_passed", False)
        self._save_artifact(f"reviews_{iteration}.json", reviews)
        if all_passed:
            self.logger.info(
                "All data figures passed review on iteration %d",
                iteration + 1,
            )
            break
        # Collect feedback for failed figures
        critic_feedback = [
            r for r in reviews if not r.get("passed")
        ]
        # Only retry figures that failed
        # BUG-37: figure_id may be non-hashable (list) — force str
        failed_ids = set()
        for r in critic_feedback:
            _fid = r.get("figure_id")
            if isinstance(_fid, str):
                failed_ids.add(_fid)
            elif isinstance(_fid, list) and _fid:
                failed_ids.add(str(_fid[0]))
        figures = [f for f in figures if f.get("figure_id") in failed_ids]
        self.logger.warning(
            "Critic: %d/%d figures need revision",
            len(failed_ids), len(rendered),
        )
    return final_rendered
def _run_nano_banana(
    self,
    image_figures: list[dict[str, Any]],
    context: dict[str, Any],
    output_dir: Path,
) -> list[dict[str, Any]]:
    """Generate conceptual/architectural figures via the Nano Banana backend.

    Returns the list of generated-figure records, or an empty list when
    the backend is not configured.
    """
    backend = self._nano_banana
    if backend is None:
        return []
    self.logger.info(
        "Phase B: Generating %d image figures via Nano Banana",
        len(image_figures),
    )
    # Give every figure a stable id before dispatching to the backend.
    for position, spec in enumerate(image_figures, start=1):
        if "figure_id" not in spec:
            kind = spec.get("figure_type", "conceptual")
            spec["figure_id"] = f"{kind}_{position}"
    outcome = backend.execute({
        "image_figures": image_figures,
        "topic": context.get("topic", ""),
        "output_dir": str(output_dir),
    })
    self._accumulate(outcome)
    self._save_artifact("nano_banana_results.json", outcome.data)
    produced = outcome.data.get("generated", [])
    ok_count = outcome.data.get("count", 0)
    self.logger.info(
        "Nano Banana: %d/%d images generated successfully",
        ok_count,
        len(image_figures),
    )
    return produced
generated ================================================ FILE: researchclaw/agents/figure_agent/planner.py ================================================ """Planner Agent — analyzes experiment results and determines chart plan. Examines the experiment results data structure, research topic, and paper idea to decide: - How many figures to generate - What type each figure should be (bar, line, heatmap, etc.) - What data each figure should display - Caption specifications for each figure - Layout (single / subplot / multi-panel) """ from __future__ import annotations import json import logging from typing import Any from researchclaw.agents.base import BaseAgent, AgentStepResult logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Chart type decision matrix — maps experiment characteristics to chart types # --------------------------------------------------------------------------- _CHART_TYPE_MATRIX: dict[str, list[dict[str, str]]] = { "classification": [ {"type": "bar_comparison", "purpose": "accuracy comparison across methods"}, {"type": "confusion_matrix", "purpose": "per-class prediction analysis"}, {"type": "training_curve", "purpose": "convergence behavior"}, ], "generation": [ {"type": "line_multi", "purpose": "FID/IS curves over training"}, {"type": "bar_comparison", "purpose": "generation quality metrics comparison"}, ], "reinforcement_learning": [ {"type": "training_curve", "purpose": "reward curve with mean±std shading"}, {"type": "bar_comparison", "purpose": "final performance comparison"}, ], "knowledge_distillation": [ {"type": "bar_comparison", "purpose": "teacher-student accuracy comparison"}, {"type": "line_multi", "purpose": "knowledge transfer efficiency curve"}, {"type": "heatmap", "purpose": "feature alignment heatmap"}, ], "nlp": [ {"type": "bar_comparison", "purpose": "BLEU/ROUGE metric comparison"}, {"type": "heatmap", "purpose": "attention heatmap"}, ], 
"graph_neural_networks": [ {"type": "bar_comparison", "purpose": "node classification accuracy"}, {"type": "training_curve", "purpose": "convergence on graph tasks"}, ], "meta_learning": [ {"type": "line_multi", "purpose": "few-shot accuracy vs number of shots"}, {"type": "bar_comparison", "purpose": "cross-task performance comparison"}, ], "continual_learning": [ {"type": "line_multi", "purpose": "forgetting rate curve across tasks"}, {"type": "heatmap", "purpose": "task accuracy matrix"}, ], "optimization": [ {"type": "training_curve", "purpose": "convergence speed comparison"}, {"type": "line_multi", "purpose": "loss landscape analysis"}, ], "default": [ {"type": "bar_comparison", "purpose": "main results comparison across methods"}, {"type": "training_curve", "purpose": "training convergence"}, ], } # Keywords for domain detection _DOMAIN_KEYWORDS: dict[str, list[str]] = { "classification": ["classif", "accuracy", "cifar", "imagenet", "image recognition"], "generation": ["generat", "gan", "diffusion", "vae", "fid", "inception score"], "reinforcement_learning": ["reinforcement", "reward", "policy", "gymnasium", "mujoco", "atari"], "knowledge_distillation": ["distill", "teacher", "student", "knowledge transfer"], "nlp": ["bleu", "rouge", "language model", "translation", "summariz"], "graph_neural_networks": ["graph", "node classif", "gnn", "gcn", "message passing"], "meta_learning": ["meta-learn", "few-shot", "maml", "prototyp"], "continual_learning": ["continual", "lifelong", "catastrophic forgetting", "incremental"], "optimization": ["optim", "convergence", "learning rate", "sgd", "adam"], } class PlannerAgent(BaseAgent): """Analyzes experiment data and generates a figure plan.""" name = "figure_planner" def __init__( self, llm: Any, *, min_figures: int = 3, max_figures: int = 8, ) -> None: super().__init__(llm) self._min_figures = min_figures self._max_figures = max_figures # ------------------------------------------------------------------ # Public API # 
def execute(self, context: dict[str, Any]) -> AgentStepResult:
    """Generate a figure plan from experiment results.

    Three steps: detect the research domain from the topic, analyze
    what data is actually available, then ask the LLM for a plan.
    Any exception is caught and converted into a failed result.

    Context keys:
        experiment_results (dict): Parsed results.json / experiment_summary
        topic (str): Research topic
        hypothesis (str): Research hypothesis
        conditions (list[str]): Experiment condition names
        metric_key (str): Primary metric name
        metrics_summary (dict): Per-metric aggregated statistics
        condition_summaries (dict): Per-condition aggregated statistics

    NOTE(review): ``hypothesis`` is documented in the contract but is
    never read in this method — confirm whether it should feed the plan.
    """
    try:
        results = context.get("experiment_results", {})
        topic = context.get("topic", "")
        metric_key = context.get("metric_key", "primary_metric")
        conditions = context.get("conditions", [])
        metrics_summary = context.get("metrics_summary", {})
        condition_summaries = context.get("condition_summaries", {})
        # Step 1: Detect research domain
        domain = self._detect_domain(topic)
        self.logger.info("Detected research domain: %s", domain)
        # Step 2: Analyze available data
        data_analysis = self._analyze_data(
            results, conditions, metrics_summary, condition_summaries, metric_key
        )
        # Step 3: Generate figure plan via LLM
        figure_plan = self._generate_plan(
            topic=topic,
            domain=domain,
            data_analysis=data_analysis,
            metric_key=metric_key,
            conditions=conditions,
        )
        return self._make_result(True, data={
            "figures": figure_plan,
            "domain": domain,
            "data_analysis": data_analysis,
        })
    except Exception as exc:
        self.logger.error("Planner failed: %s", exc)
        return self._make_result(False, error=str(exc))
key=scores.get) # type: ignore[arg-type] return "default" # ------------------------------------------------------------------ # Data analysis # ------------------------------------------------------------------ def _analyze_data( self, results: dict[str, Any], conditions: list[str], metrics_summary: dict[str, Any], condition_summaries: dict[str, Any], metric_key: str, ) -> dict[str, Any]: """Analyze available experiment data to determine chart potential.""" analysis: dict[str, Any] = { "num_conditions": len(conditions), "conditions": conditions[:10], "num_metrics": len(metrics_summary), "metric_names": list(metrics_summary.keys())[:15], "has_training_history": False, "has_per_condition_data": bool(condition_summaries), "has_ablation": False, "has_multiple_seeds": False, "primary_metric": metric_key, } # Check for training history data for key in results: if any(t in str(key).lower() for t in ["history", "curve", "epoch", "step"]): analysis["has_training_history"] = True break # Check for ablation conditions for cond in conditions: cond_lower = cond.lower() if any(t in cond_lower for t in ["ablat", "without", "no_", "reduced", "remove"]): analysis["has_ablation"] = True break # Check for multi-seed data for cond_data in condition_summaries.values(): if isinstance(cond_data, dict): n_seeds = cond_data.get("n_seeds", 0) if n_seeds and int(n_seeds) > 1: analysis["has_multiple_seeds"] = True break # Extract key metric values per condition condition_values: dict[str, float] = {} for cond, cdata in condition_summaries.items(): if isinstance(cdata, dict): metrics = cdata.get("metrics", {}) val = metrics.get(f"{metric_key}_mean") or metrics.get(metric_key) if val is not None: try: condition_values[cond] = float(val) except (ValueError, TypeError): pass analysis["condition_values"] = condition_values return analysis # ------------------------------------------------------------------ # Plan generation # ------------------------------------------------------------------ def 
def _generate_plan(
    self,
    *,
    topic: str,
    domain: str,
    data_analysis: dict[str, Any],
    metric_key: str,
    conditions: list[str],
) -> list[dict[str, Any]]:
    """Use LLM to generate a detailed figure plan.

    Builds a system prompt with figure-count rules and the figure-spec
    schema, plus a user prompt summarizing the available data, then asks
    the LLM for JSON. Falls back to a deterministic domain-based plan
    when the LLM returns nothing, augments below the minimum count, caps
    at the maximum, and filters/IDs the resulting specs.
    """
    # Get domain-specific chart suggestions
    domain_charts = _CHART_TYPE_MATRIX.get(domain, _CHART_TYPE_MATRIX["default"])
    system_prompt = (
        "You are an expert scientific visualization advisor. "
        "Given experiment data from an ML research paper, you plan which "
        "figures to include in the paper.\n\n"
        "RULES:\n"
        f"- Generate between {self._min_figures} and {self._max_figures} figures\n"
        "- Each figure must serve a distinct purpose\n"
        "- At minimum include: 1 main results comparison + 1 ablation/analysis figure\n"
        "- If training history data exists, include a training curve\n"
        "- Figures should tell a coherent story about the research contributions\n"
        "- Do NOT generate figures for data that doesn't exist\n"
        "- Caption should be precise and descriptive (not generic)\n\n"
        "Available chart types: bar_comparison, grouped_bar, training_curve, "
        "loss_curve, heatmap, scatter_plot, violin_box, ablation_grouped, "
        "line_multi, radar_chart\n\n"
        "Return a JSON object with key 'figures' containing a list of figure "
        "specifications. Each figure spec must have:\n"
        "- figure_id: string (e.g. 'fig_main_results')\n"
        "- chart_type: one of the available types\n"
        "- title: short title for the chart\n"
        "- caption: detailed caption text (1-2 sentences)\n"
        "- data_source: what data to plot (metric names, conditions)\n"
        "- x_label: x-axis label\n"
        "- y_label: y-axis label\n"
        "- width: 'single_column' or 'double_column'\n"
        "- priority: 1 (must-have) to 3 (nice-to-have)\n"
        "- section: which paper section ('method', 'results', 'analysis')\n"
    )
    user_prompt = (
        f"Research topic: {topic}\n"
        f"Domain: {domain}\n"
        f"Primary metric: {metric_key}\n"
        f"Number of conditions: {data_analysis['num_conditions']}\n"
        f"Conditions: {', '.join(data_analysis.get('conditions', []))}\n"
        f"Available metrics: {', '.join(data_analysis.get('metric_names', []))}\n"
        f"Has training history: {data_analysis.get('has_training_history', False)}\n"
        f"Has ablation conditions: {data_analysis.get('has_ablation', False)}\n"
        f"Has multiple seeds: {data_analysis.get('has_multiple_seeds', False)}\n"
        f"Condition values: {json.dumps(data_analysis.get('condition_values', {}))}\n\n"
        f"Suggested chart types for this domain:\n"
    )
    for chart in domain_charts:
        user_prompt += f"- {chart['type']}: {chart['purpose']}\n"
    user_prompt += "\nGenerate the figure plan JSON."
    result = self._chat_json(system_prompt, user_prompt, max_tokens=4096)
    figures = result.get("figures", [])
    if not figures:
        # Fallback: generate a basic plan from domain matrix
        self.logger.warning("LLM returned no figures, using domain-based fallback")
        figures = self._fallback_plan(domain, data_analysis, metric_key, conditions)
    # Ensure minimum figure count
    if len(figures) < self._min_figures:
        self.logger.info(
            "LLM returned %d figures (min %d), adding defaults",
            len(figures), self._min_figures,
        )
        figures = self._augment_plan(figures, data_analysis, metric_key, conditions)
    # Cap at max
    figures = figures[:self._max_figures]
    # BUG-36: LLM may return figures as list of strings instead of dicts
    figures = [f for f in figures if isinstance(f, dict)]
    # Assign IDs if missing
    for i, fig in enumerate(figures):
        if not fig.get("figure_id"):
            fig["figure_id"] = f"fig_{i + 1}"
    return figures
result = self._chat_json(system_prompt, user_prompt, max_tokens=4096) figures = result.get("figures", []) if not figures: # Fallback: generate a basic plan from domain matrix self.logger.warning("LLM returned no figures, using domain-based fallback") figures = self._fallback_plan(domain, data_analysis, metric_key, conditions) # Ensure minimum figure count if len(figures) < self._min_figures: self.logger.info( "LLM returned %d figures (min %d), adding defaults", len(figures), self._min_figures, ) figures = self._augment_plan(figures, data_analysis, metric_key, conditions) # Cap at max figures = figures[:self._max_figures] # BUG-36: LLM may return figures as list of strings instead of dicts figures = [f for f in figures if isinstance(f, dict)] # Assign IDs if missing for i, fig in enumerate(figures): if not fig.get("figure_id"): fig["figure_id"] = f"fig_{i + 1}" return figures # ------------------------------------------------------------------ # Fallback plan (no LLM needed) # ------------------------------------------------------------------ def _fallback_plan( self, domain: str, data_analysis: dict[str, Any], metric_key: str, conditions: list[str], ) -> list[dict[str, Any]]: """Generate a basic plan without LLM (used as fallback).""" figures: list[dict[str, Any]] = [] # Always include a main results comparison if data_analysis["num_conditions"] >= 2: figures.append({ "figure_id": "fig_main_results", "chart_type": "bar_comparison", "title": "Method Comparison", "caption": f"Comparison of {metric_key.replace('_', ' ')} across all evaluated methods. 
" f"Error bars show 95% confidence intervals.", "data_source": {"type": "condition_comparison", "metric": metric_key}, "x_label": "Method", "y_label": metric_key.replace("_", " ").title(), "width": "single_column", "priority": 1, "section": "results", }) # Ablation grouped bar if ablation exists if data_analysis.get("has_ablation"): figures.append({ "figure_id": "fig_ablation", "chart_type": "ablation_grouped", "title": "Ablation Study", "caption": "Ablation study showing the contribution of each component. " "Removing each component independently reveals its importance.", "data_source": {"type": "ablation_comparison", "metric": metric_key}, "x_label": "Variant", "y_label": metric_key.replace("_", " ").title(), "width": "single_column", "priority": 1, "section": "results", }) # Training curve if history exists if data_analysis.get("has_training_history"): figures.append({ "figure_id": "fig_training_curve", "chart_type": "training_curve", "title": "Training Convergence", "caption": "Training loss curves for all methods. 
" "Shaded regions indicate standard deviation across seeds.", "data_source": {"type": "training_history"}, "x_label": "Epoch", "y_label": "Loss", "width": "single_column", "priority": 2, "section": "results", }) # Multi-metric comparison if multiple metrics if data_analysis["num_metrics"] > 2: metrics_to_show = [ m for m in data_analysis.get("metric_names", []) if m != metric_key and not any( t in m.lower() for t in ["time", "elapsed", "seed", "runtime"] ) ][:5] if metrics_to_show: figures.append({ "figure_id": "fig_multi_metric", "chart_type": "grouped_bar", "title": "Multi-Metric Comparison", "caption": "Performance comparison across multiple evaluation metrics.", "data_source": {"type": "multi_metric", "metrics": metrics_to_show}, "x_label": "Method", "y_label": "Score", "width": "double_column", "priority": 2, "section": "analysis", }) return figures def _augment_plan( self, existing: list[dict[str, Any]], data_analysis: dict[str, Any], metric_key: str, conditions: list[str], ) -> list[dict[str, Any]]: """Add default figures to meet minimum count.""" # BUG-37: chart_type may be non-hashable (list) — force str existing_types = { f.get("chart_type") for f in existing if isinstance(f.get("chart_type"), str) } augmented = list(existing) # Add main comparison if missing if "bar_comparison" not in existing_types and data_analysis["num_conditions"] >= 2: augmented.append({ "figure_id": "fig_main_results", "chart_type": "bar_comparison", "title": "Method Comparison", "caption": f"Comparison of {metric_key.replace('_', ' ')} across all methods.", "data_source": {"type": "condition_comparison", "metric": metric_key}, "x_label": "Method", "y_label": metric_key.replace("_", " ").title(), "width": "single_column", "priority": 1, "section": "results", }) # Add ablation if applicable and missing if ( "ablation_grouped" not in existing_types and data_analysis.get("has_ablation") ): augmented.append({ "figure_id": "fig_ablation", "chart_type": "ablation_grouped", "title": 
"Ablation Study", "caption": "Ablation analysis showing component contributions.", "data_source": {"type": "ablation_comparison", "metric": metric_key}, "x_label": "Variant", "y_label": metric_key.replace("_", " ").title(), "width": "single_column", "priority": 1, "section": "results", }) return augmented ================================================ FILE: researchclaw/agents/figure_agent/renderer.py ================================================ """Renderer Agent — executes plotting scripts and verifies output. Runs generated Python scripts in a subprocess (or Docker sandbox when available), captures stdout/stderr, verifies output files exist with correct format, and returns rendered image paths. Security: When Docker is available, visualization code is executed inside an isolated container (``--network none``) to prevent RCE from LLM-generated code. Falls back to a local subprocess when Docker is not available. Architecture ref: Visual ChatGPT (Wu et al., 2023) — LLMs as controllers calling deterministic render tools instead of generating pixels directly. """ from __future__ import annotations import logging import os import shutil import subprocess import sys from pathlib import Path from typing import Any from researchclaw.agents.base import BaseAgent, AgentStepResult from researchclaw.utils.sanitize import sanitize_figure_id logger = logging.getLogger(__name__) # Minimum acceptable file size (bytes) — filters out corrupt/empty PNGs _MIN_FILE_SIZE = 1024 # 1 KB # Docker image for sandboxed visualization rendering. # The experiment image already has matplotlib, numpy, seaborn pre-installed. 
# Docker image used for sandboxed visualization rendering; the experiment
# image already ships with matplotlib, numpy and seaborn.
_VIZ_DOCKER_IMAGE = "researchclaw/experiment:latest"


def _docker_available() -> bool:
    """Probe the local Docker daemon; True when ``docker info`` succeeds."""
    probe_cmd = ["docker", "info"]
    try:
        completed = subprocess.run(
            probe_cmd,
            capture_output=True,
            timeout=10,
            check=False,
        )
    except FileNotFoundError:
        # docker binary is not installed / not on PATH
        return False
    except subprocess.TimeoutExpired:
        # daemon hung — treat as unavailable
        return False
    return completed.returncode == 0
def execute(self, context: dict[str, Any]) -> AgentStepResult:
    """Execute plotting scripts and verify outputs.

    Each script is rendered via :meth:`_render_one`; presentation
    metadata (title/caption/section/width) is copied from the script
    record onto the render result. The step succeeds when at least one
    figure rendered.

    Context keys:
        scripts (list[dict]): From CodeGen — each has 'figure_id',
            'script', 'output_filename'
        output_dir (str|Path): Directory for output charts and scripts
    """
    try:
        scripts = context.get("scripts", [])
        output_dir = Path(context.get("output_dir", "charts")).resolve()
        output_dir.mkdir(parents=True, exist_ok=True)
        # Generated scripts are saved alongside the charts for reproducibility
        scripts_dir = output_dir / "scripts"
        scripts_dir.mkdir(parents=True, exist_ok=True)
        results: list[dict[str, Any]] = []
        for script_info in scripts:
            figure_id = script_info.get("figure_id", "unknown")
            script_code = script_info.get("script", "")
            output_filename = script_info.get("output_filename", f"{figure_id}.png")
            result = self._render_one(
                figure_id=figure_id,
                script_code=script_code,
                output_filename=output_filename,
                output_dir=output_dir,
                scripts_dir=scripts_dir,
            )
            # Carry presentation metadata through for the integrator
            result["title"] = script_info.get("title", "")
            result["caption"] = script_info.get("caption", "")
            result["section"] = script_info.get("section", "results")
            result["width"] = script_info.get("width", "single_column")
            results.append(result)
        success_count = sum(1 for r in results if r["success"])
        self.logger.info(
            "Rendered %d/%d figures successfully", success_count, len(scripts),
        )
        return self._make_result(
            success=success_count > 0,
            data={"rendered": results, "output_dir": str(output_dir)},
            error="" if success_count > 0 else "All renders failed",
        )
    except Exception as exc:
        self.logger.error("Renderer failed: %s", exc)
        return self._make_result(False, error=str(exc))
figure_id, "success": False, "output_path": "", "script_path": "", "error": "", } if not script_code.strip(): result["error"] = "Empty script" return result # Save script for reproducibility script_path = scripts_dir / f"{figure_id}.py" # BUG-60: When running in Docker, rewrite absolute host paths to # Docker-mapped paths. Generated scripts use savefig() with absolute # host paths (e.g. /home/user/.../charts/fig.png) but inside Docker # the output dir is mounted at /workspace/output. if self._use_docker: import re as _re_path _host_out = str(output_dir.resolve()) # Replace host output dir with Docker-mapped path script_code = script_code.replace(_host_out, "/workspace/output") # Also catch any other absolute paths pointing to output_dir parent script_code = _re_path.sub( r'savefig\(["\'](?:/[^"\']*/)(' + _re_path.escape(output_filename) + r')["\']', r'savefig("/workspace/output/\1"', script_code, ) script_path.write_text(script_code, encoding="utf-8") result["script_path"] = str(script_path) # Choose execution backend if self._use_docker: proc_result = self._execute_in_docker( script_path=script_path, output_dir=output_dir, figure_id=figure_id, ) else: proc_result = self._execute_local( script_path=script_path, output_dir=output_dir, ) if proc_result["error"]: result["error"] = proc_result["error"] self.logger.warning( "Render failed for %s: %s", figure_id, result["error"][:200] ) return result # Verify output file exists output_path = output_dir / output_filename if not output_path.exists(): # Check if it was saved relative to script CWD alt_path = output_dir.parent / output_dir.name / output_filename if alt_path.exists(): output_path = alt_path else: result["error"] = f"Output file not found: {output_path}" self.logger.warning("Output missing for %s", figure_id) return result # Verify file size file_size = output_path.stat().st_size if file_size < _MIN_FILE_SIZE: result["error"] = f"Output file too small ({file_size} bytes)" self.logger.warning( "Output too small 
for %s: %d bytes", figure_id, file_size ) return result result["success"] = True result["output_path"] = str(output_path) result["file_size"] = file_size self.logger.info("Rendered %s: %s (%d bytes)", figure_id, output_path, file_size) return result # ------------------------------------------------------------------ # Execution backends # ------------------------------------------------------------------ def _execute_local( self, *, script_path: Path, output_dir: Path, ) -> dict[str, str]: """Execute script in a local subprocess (no sandbox).""" try: proc = subprocess.run( [self._python, str(script_path.resolve())], capture_output=True, text=True, timeout=self._timeout, # BUG-20: Use output_dir as CWD so relative paths # like fig.savefig("comparison.png") resolve correctly cwd=str(output_dir.resolve()), ) except subprocess.TimeoutExpired: return {"error": f"Script timed out after {self._timeout}s"} except FileNotFoundError: return {"error": f"Python executable not found: {self._python}"} if proc.returncode != 0: stderr = proc.stderr[:2000] if proc.stderr else "Unknown error" return {"error": f"Script failed (exit {proc.returncode}): {stderr}"} return {"error": ""} def _execute_in_docker( self, *, script_path: Path, output_dir: Path, figure_id: str, ) -> dict[str, str]: """Execute script inside an isolated Docker container. Security measures: - ``--network none``: No network access (prevents data exfiltration) - ``--read-only``: Root filesystem is read-only - ``--tmpfs /tmp``: Writable /tmp only in-memory - ``--memory 512m``: Hard memory limit - Volume mounts are restricted to the output directory - Script is bind-mounted read-only - Container is auto-removed after execution This prevents RCE from LLM-generated visualization code. 
""" import uuid as _uuid_renderer container_name = f"rc-viz-{figure_id}-{os.getpid()}-{_uuid_renderer.uuid4().hex[:8]}" cmd = [ "docker", "run", "--name", container_name, "--rm", "--network", "none", "--read-only", "--tmpfs", "/tmp:rw,noexec,nosuid,size=64m", f"--memory=512m", "-e", "MPLCONFIGDIR=/tmp/matplotlib", "-e", "XDG_CONFIG_HOME=/tmp", "-v", f"{script_path.resolve()}:/workspace/script.py:ro", "-v", f"{output_dir.resolve()}:/workspace/output:rw", "-w", "/workspace/output", # BUG-60: CWD = output dir so relative paths work "--user", f"{os.getuid()}:{os.getgid()}", "--entrypoint", "python3", self._docker_image, "/workspace/script.py", ] try: proc = subprocess.run( cmd, capture_output=True, text=True, timeout=self._timeout, check=False, ) except subprocess.TimeoutExpired: # Kill the container on timeout try: subprocess.run( ["docker", "kill", container_name], capture_output=True, timeout=10, check=False, ) except (FileNotFoundError, subprocess.TimeoutExpired): pass return {"error": f"Docker script timed out after {self._timeout}s"} except FileNotFoundError: return {"error": "Docker executable not found"} except Exception as exc: return {"error": f"Docker execution error: {exc}"} if proc.returncode != 0: stderr = proc.stderr[:2000] if proc.stderr else "Unknown error" return {"error": f"Docker script failed (exit {proc.returncode}): {stderr}"} return {"error": ""} ================================================ FILE: researchclaw/agents/figure_agent/style_config.py ================================================ """Academic chart styling configuration for FigureAgent. Defines global constants for chart styling that conform to AI conference publication standards (IEEE, NeurIPS, ICML, ICLR). Used by CodeGen Agent when generating matplotlib plotting scripts. 
from __future__ import annotations

# ---------------------------------------------------------------------------
# Style presets
# ---------------------------------------------------------------------------

# SciencePlots style list — CodeGen Agent inserts this into generated scripts.
# Fallback: seaborn-v0_8-whitegrid if SciencePlots is not installed.
MATPLOTLIB_STYLES = ["science", "ieee"]
MATPLOTLIB_STYLES_FALLBACK = ["seaborn-v0_8-whitegrid"]

# Output resolution (DPI) — 300+ for publication, 150 for draft
DPI_PUBLICATION = 300
DPI_DRAFT = 150

# ---------------------------------------------------------------------------
# Font sizes (points) — width-aware to avoid oversized text in paper columns
# ---------------------------------------------------------------------------

# For single-column figures (≤3.5in) — fonts must be small to match 10pt body
FONT_SIZE_SINGLE_COL = {
    "title": 9,
    "axis_label": 8,
    "tick": 7,
    "legend": 7,
    "annotation": 7,
}

# For double-column / full-page figures (≥7.0in) — normal academic sizes
FONT_SIZE_DOUBLE_COL = {
    "title": 11,
    "axis_label": 10,
    "tick": 9,
    "legend": 9,
    "annotation": 9,
}

# Legacy alias (default to single-column, the most common case)
FONT_SIZE = FONT_SIZE_SINGLE_COL


def get_font_sizes(width_key: str = "single_column") -> dict[str, int]:
    """Return the font-size mapping for the given figure width class.

    Wide layouts ("double_column" / "full_page") get the normal academic
    sizes; any other key — including unknown ones — falls back to the
    conservative single-column sizes.
    """
    wide_layouts = {"double_column", "full_page"}
    return FONT_SIZE_DOUBLE_COL if width_key in wide_layouts else FONT_SIZE_SINGLE_COL


# ---------------------------------------------------------------------------
# Figure dimensions (inches) — column-width aware
# ---------------------------------------------------------------------------

FIGURE_WIDTH = {
    "single_column": 3.5,  # IEEE / NeurIPS single column
    "double_column": 7.0,  # IEEE / NeurIPS double column
    "full_page": 7.0,      # Full width
}

DEFAULT_FIGURE_HEIGHT = 3.0  # reasonable default height

# ---------------------------------------------------------------------------
# Colorblind-safe palette
(Paul Tol's "bright" scheme) # --------------------------------------------------------------------------- COLORS_BRIGHT = [ "#4477AA", # blue "#EE6677", # red "#228833", # green "#CCBB44", # yellow "#66CCEE", # cyan "#AA3377", # purple "#BBBBBB", # grey ] # Extended palette for > 7 categories COLORS_EXTENDED = COLORS_BRIGHT + [ "#332288", # indigo "#88CCEE", # light blue "#44AA99", # teal "#117733", # dark green "#999933", # olive "#CC6677", # rose "#882255", # wine ] # --------------------------------------------------------------------------- # Line and marker styles (for B&W printing compatibility) # --------------------------------------------------------------------------- LINE_STYLES = ["-", "--", "-.", ":"] MARKER_STYLES = ["o", "s", "^", "D", "v", "P", "*", "X"] # --------------------------------------------------------------------------- # Output format preferences # --------------------------------------------------------------------------- OUTPUT_FORMAT_PRIMARY = "pdf" # Vector — preferred for publication OUTPUT_FORMAT_FALLBACK = "png" # Raster — for markdown embedding OUTPUT_FORMATS = ["pdf", "png"] # Generate both # --------------------------------------------------------------------------- # Chart type constants # --------------------------------------------------------------------------- CHART_TYPES = { "bar_comparison", "grouped_bar", "training_curve", "loss_curve", "heatmap", "confusion_matrix", "scatter_plot", "violin_box", "ablation_grouped", "line_multi", "radar_chart", "architecture_diagram", # Placeholder — generated via description } # --------------------------------------------------------------------------- # Style snippet for injection into generated scripts # --------------------------------------------------------------------------- STYLE_PREAMBLE = ''' import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt import numpy as np # Academic styling try: plt.style.use({styles}) except Exception: try: 
plt.style.use({fallback}) except Exception: pass # Use default matplotlib style # Colorblind-safe palette COLORS = {colors} LINE_STYLES = {line_styles} MARKERS = {markers} # Publication settings plt.rcParams.update({{ "font.size": {font_axis}, "axes.titlesize": {font_title}, "axes.labelsize": {font_axis}, "xtick.labelsize": {font_tick}, "ytick.labelsize": {font_tick}, "legend.fontsize": {font_legend}, "figure.dpi": {dpi}, "savefig.dpi": {dpi}, "savefig.bbox": "tight", "savefig.pad_inches": 0.15, }}) '''.strip() def get_style_preamble( *, dpi: int = DPI_PUBLICATION, width_key: str = "single_column", ) -> str: """Return the style preamble string for injection into chart scripts.""" fonts = get_font_sizes(width_key) return STYLE_PREAMBLE.format( styles=repr(MATPLOTLIB_STYLES), fallback=repr(MATPLOTLIB_STYLES_FALLBACK), colors=repr(COLORS_BRIGHT), line_styles=repr(LINE_STYLES), markers=repr(MARKER_STYLES), font_title=fonts["title"], font_axis=fonts["axis_label"], font_tick=fonts["tick"], font_legend=fonts["legend"], dpi=dpi, ) ================================================ FILE: researchclaw/assessor/__init__.py ================================================ """Paper quality assessment and venue recommendation.""" from researchclaw.assessor.rubrics import RUBRICS, Rubric from researchclaw.assessor.scorer import PaperScorer from researchclaw.assessor.venue_recommender import VenueRecommender from researchclaw.assessor.comparator import HistoryComparator __all__ = [ "RUBRICS", "HistoryComparator", "PaperScorer", "Rubric", "VenueRecommender", ] ================================================ FILE: researchclaw/assessor/comparator.py ================================================ """Historical score comparison and tracking.""" from __future__ import annotations import json import logging from datetime import datetime, timezone from pathlib import Path from typing import Any logger = logging.getLogger(__name__) class HistoryComparator: """Track and compare paper 
class HistoryComparator:
    """Track and compare paper quality scores across runs.

    Entries are persisted as JSON under ``history_dir/quality_history.json``
    when a ``history_dir`` is supplied; otherwise history is in-memory only.
    """

    def __init__(self, history_dir: Path | None = None):
        # Directory holding quality_history.json; None disables persistence.
        self._history_dir = history_dir
        self._entries: list[dict[str, Any]] = []
        if history_dir:
            self._load_history()

    def _load_history(self) -> None:
        """Load score history from disk (best-effort; logs on failure)."""
        if self._history_dir is None:
            return
        history_file = self._history_dir / "quality_history.json"
        if not history_file.exists():
            return
        try:
            data = json.loads(history_file.read_text(encoding="utf-8"))
            # Only accept the expected list-of-entries shape.
            if isinstance(data, list):
                self._entries = data
        except (json.JSONDecodeError, OSError) as exc:
            logger.warning("Failed to load quality history: %s", exc)

    def record(
        self,
        run_id: str,
        topic: str,
        scores: dict[str, Any],
    ) -> None:
        """Record a quality assessment result and persist the history."""
        entry = {
            "run_id": run_id,
            "topic": topic,
            "overall": scores.get("overall", 0.0),
            "scores": scores.get("scores", {}),
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }
        self._entries.append(entry)
        self._save_history()

    def _save_history(self) -> None:
        """Persist history to disk (best-effort; logs on failure).

        FIX: mirror ``_load_history``'s error handling — an ``OSError``
        while writing (read-only dir, disk full) must not crash ``record``.
        """
        if self._history_dir is None:
            return
        try:
            self._history_dir.mkdir(parents=True, exist_ok=True)
            history_file = self._history_dir / "quality_history.json"
            history_file.write_text(
                json.dumps(self._entries, indent=2), encoding="utf-8"
            )
        except OSError as exc:
            logger.warning("Failed to save quality history: %s", exc)

    def compare(
        self,
        current_scores: dict[str, Any],
        previous_run_id: str | None = None,
    ) -> dict[str, Any]:
        """Compare current scores with a previous run or the best historical.

        Returns a dict whose ``comparison`` key is one of ``no_history``,
        ``not_found``, or ``success`` (with delta / trend details).
        """
        if not self._entries:
            return {
                "comparison": "no_history",
                "message": "No previous runs to compare against.",
            }

        if previous_run_id:
            prev = next(
                (e for e in self._entries if e["run_id"] == previous_run_id),
                None,
            )
        else:
            # Default comparison target: highest-scoring historical run.
            prev = max(self._entries, key=lambda e: e.get("overall", 0))

        if prev is None:
            return {
                "comparison": "not_found",
                "message": f"Run '{previous_run_id}' not found in history.",
            }

        current_overall = current_scores.get("overall", 0.0)
        prev_overall = prev.get("overall", 0.0)
        delta = round(current_overall - prev_overall, 2)

        # Per-dimension deltas over the union of both score sets; a
        # dimension missing on one side counts as 0.0 there.
        dim_deltas = {}
        current_dims = current_scores.get("scores", {})
        prev_dims = prev.get("scores", {})
        for dim in set(current_dims) | set(prev_dims):
            cur = current_dims.get(dim, 0.0)
            prv = prev_dims.get(dim, 0.0)
            dim_deltas[dim] = round(cur - prv, 2)

        # ±0.5 dead-band so small fluctuations read as "stable".
        trend = "improved" if delta > 0.5 else ("declined" if delta < -0.5 else "stable")

        return {
            "comparison": "success",
            "previous_run_id": prev.get("run_id", "unknown"),
            "current_overall": current_overall,
            "previous_overall": prev_overall,
            "delta": delta,
            "trend": trend,
            "dimension_deltas": dim_deltas,
        }

    def get_best_run(self) -> dict[str, Any] | None:
        """Return the highest-scoring historical run, or None if empty."""
        if not self._entries:
            return None
        return max(self._entries, key=lambda e: e.get("overall", 0))

    def get_history(self) -> list[dict[str, Any]]:
        """Return a copy of all historical entries."""
        return list(self._entries)
class PaperScorer:
    """Score a paper across multiple quality dimensions using an LLM.

    Each dimension is scored 1-10 against its rubric; the overall score is
    the rubric-weight-weighted mean. When no LLM client is configured (or a
    call fails), a cheap text heuristic is used instead.
    """

    def __init__(
        self,
        dimensions: tuple[str, ...] | None = None,
        llm_client: Any = None,
    ):
        # Dimensions to evaluate; defaults to every rubric in RUBRICS.
        self.dimensions = dimensions or tuple(RUBRICS.keys())
        # Optional LLM client exposing chat_async(prompt) -> str.
        self.llm = llm_client

    async def score(
        self,
        paper_md: str,
        experiment_results: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """Score a paper across all configured dimensions.

        Returns a dict with per-dimension ``scores``, a weighted ``overall``,
        per-dimension ``details`` text, and ``dimensions_evaluated``.
        """
        scores: dict[str, float] = {}
        details: dict[str, str] = {}
        for dim in self.dimensions:
            rubric = RUBRICS.get(dim)
            if rubric is None:
                logger.warning("Unknown rubric dimension: %s", dim)
                continue
            score, detail = await self._score_dimension(
                paper_md, experiment_results, rubric
            )
            scores[dim] = score
            details[dim] = detail

        if scores:
            # Weighted mean over rubric weights; unweighted mean if all
            # weights are zero (avoids division by zero).
            total_weight = sum(
                RUBRICS[d].weight for d in scores if d in RUBRICS
            )
            if total_weight > 0:
                weighted_sum = sum(
                    scores[d] * RUBRICS[d].weight for d in scores if d in RUBRICS
                )
                overall = round(weighted_sum / total_weight, 2)
            else:
                overall = round(sum(scores.values()) / len(scores), 2)
        else:
            overall = 0.0

        return {
            "scores": scores,
            "overall": overall,
            "details": details,
            "dimensions_evaluated": list(scores.keys()),
        }

    async def _score_dimension(
        self,
        paper_md: str,
        experiment_results: dict[str, Any] | None,
        rubric: Rubric,
    ) -> tuple[float, str]:
        """Score a single dimension via the LLM, with heuristic fallback."""
        if self.llm is None:
            return self._heuristic_score(paper_md, rubric)

        exp_context = ""
        if experiment_results:
            exp_context = (
                "\n\nExperiment results summary:\n"
                f"{json.dumps(experiment_results, indent=2, default=str)[:2000]}"
            )

        # BUG FIX: the response-format placeholders after "SCORE:" and
        # "REASON:" were missing (likely stripped as angle-bracket markup),
        # leaving the model with no hint of what to emit. Restore them so
        # replies match what _parse_score_response expects.
        prompt = (
            f"Rate the following research paper on '{rubric.name}' from 1 to 10.\n\n"
            f"Criteria: {rubric.criteria}\n"
            f"Scale: {rubric.scale}\n\n"
            f"Paper content (first 6000 chars):\n{paper_md[:6000]}"
            f"{exp_context}\n\n"
            f"Respond in this exact format:\n"
            f"SCORE: <number 1-10>\n"
            f"REASON: <one-sentence justification>"
        )
        try:
            response = await self.llm.chat_async(prompt)
            return self._parse_score_response(response, rubric.name)
        except Exception as exc:
            logger.warning("LLM scoring failed for %s: %s", rubric.name, exc)
            return self._heuristic_score(paper_md, rubric)

    @staticmethod
    def _parse_score_response(
        response: str,
        dim_name: str,
    ) -> tuple[float, str]:
        """Parse ``SCORE: <n>`` / ``REASON: <text>`` from an LLM reply.

        A missing score defaults to 5.0; parsed scores are clamped to
        [1.0, 10.0]. ``dim_name`` is kept for interface compatibility.
        """
        score_match = re.search(r"SCORE:\s*(\d+(?:\.\d+)?)", response)
        reason_match = re.search(r"REASON:\s*(.+)", response)
        if score_match:
            score = float(score_match.group(1))
            score = max(1.0, min(10.0, score))
        else:
            score = 5.0
        reason = reason_match.group(1).strip() if reason_match else "No detail provided"
        return score, reason

    @staticmethod
    def _heuristic_score(
        paper_md: str,
        rubric: Rubric,
    ) -> tuple[float, str]:
        """Simple heuristic scoring when the LLM is unavailable.

        Clarity is scored from word count, Experiments from table/figure
        presence; every other dimension gets a neutral 5.0.
        """
        word_count = len(paper_md.split())
        if rubric.name == "Clarity":
            if word_count > 3000:
                score = 6.0
            elif word_count > 1000:
                score = 5.0
            else:
                score = 3.0
            return score, f"Heuristic: {word_count} words"
        if rubric.name == "Experiments":
            # "|" catches markdown tables without the word "table".
            has_table = "table" in paper_md.lower() or "|" in paper_md
            has_figure = "figure" in paper_md.lower() or "fig." in paper_md.lower()
            score = 4.0
            if has_table:
                score += 1.5
            if has_figure:
                score += 1.5
            return min(score, 10.0), "Heuristic: table/figure presence"
        return 5.0, "Heuristic: default score (no LLM)"
and not any(d in venue_domains for d in domains): continue recommendations.append({ "venue": venue, "tier": tier_name, "match_score": overall, "suggestion": self._get_suggestion(venue, scores), "venue_domains": venue_domains, }) recommendations.sort( key=lambda r: ( -r["match_score"], {"tier_1": 0, "tier_2": 1, "tier_3": 2, "workshop": 3}.get( r["tier"], 4 ), ) ) return recommendations @staticmethod def _get_suggestion(venue: str, scores: dict[str, Any]) -> str: """Generate a brief suggestion for improving chances at this venue.""" dim_scores = scores.get("scores", {}) if not dim_scores: return "Evaluate paper quality to get specific suggestions." weakest_dim = min(dim_scores, key=dim_scores.get) weakest_score = dim_scores[weakest_dim] if weakest_score < 5: return f"Strengthen '{weakest_dim}' (currently {weakest_score}/10) before submitting." if weakest_score < 7: return f"Consider improving '{weakest_dim}' ({weakest_score}/10) for better chances." return "Paper quality looks strong for this venue." def format_recommendations( self, recommendations: list[dict[str, Any]], ) -> str: """Format recommendations as a readable string.""" if not recommendations: return "No suitable venues found for current paper quality." 
lines = ["Venue Recommendations:", ""] for rec in recommendations: lines.append( f" {rec['venue']} ({rec['tier']}) — " f"score {rec['match_score']}/10" ) lines.append(f" {rec['suggestion']}") lines.append("") return "\n".join(lines) ================================================ FILE: researchclaw/calendar/__init__.py ================================================ """Conference deadline calendar and submission planning.""" from researchclaw.calendar.deadlines import ConferenceCalendar from researchclaw.calendar.planner import SubmissionPlanner from researchclaw.calendar.reminder import ReminderCalculator __all__ = [ "ConferenceCalendar", "ReminderCalculator", "SubmissionPlanner", ] ================================================ FILE: researchclaw/calendar/data/conferences.yaml ================================================ conferences: - name: "NeurIPS 2026" full_name: "Conference on Neural Information Processing Systems" domains: ["ml", "ai", "deep-learning"] abstract_deadline: "2026-05-15" paper_deadline: "2026-05-22" notification: "2026-09-25" camera_ready: "2026-10-15" conference_date: "2026-12-08" url: "https://neurips.cc/" tier: 1 - name: "ICLR 2027" full_name: "International Conference on Learning Representations" domains: ["ml", "deep-learning", "representation-learning"] abstract_deadline: "2026-09-28" paper_deadline: "2026-10-05" notification: "2027-01-22" conference_date: "2027-04-28" url: "https://iclr.cc/" tier: 1 - name: "ICML 2026" full_name: "International Conference on Machine Learning" domains: ["ml"] paper_deadline: "2026-01-31" notification: "2026-05-01" conference_date: "2026-07-21" url: "https://icml.cc/" tier: 1 - name: "CVPR 2027" full_name: "Conference on Computer Vision and Pattern Recognition" domains: ["cv", "deep-learning"] paper_deadline: "2026-11-15" notification: "2027-02-25" conference_date: "2027-06-15" url: "https://cvpr.thecvf.com/" tier: 1 - name: "ACL 2026" full_name: "Annual Meeting of the Association for Computational 
Linguistics" domains: ["nlp", "ai"] paper_deadline: "2026-02-15" notification: "2026-05-10" conference_date: "2026-08-10" url: "https://www.aclweb.org/" tier: 1 - name: "AAAI 2027" full_name: "AAAI Conference on Artificial Intelligence" domains: ["ai", "ml"] abstract_deadline: "2026-08-08" paper_deadline: "2026-08-15" notification: "2026-11-20" conference_date: "2027-02-22" url: "https://aaai.org/" tier: 1 - name: "IJCAI 2026" full_name: "International Joint Conference on Artificial Intelligence" domains: ["ai", "ml"] paper_deadline: "2026-01-17" notification: "2026-04-20" conference_date: "2026-08-09" url: "https://www.ijcai.org/" tier: 1 - name: "ECCV 2026" full_name: "European Conference on Computer Vision" domains: ["cv", "deep-learning"] paper_deadline: "2026-03-07" notification: "2026-07-01" conference_date: "2026-10-05" url: "https://eccv.ecva.net/" tier: 1 - name: "EMNLP 2026" full_name: "Conference on Empirical Methods in Natural Language Processing" domains: ["nlp"] paper_deadline: "2026-06-01" notification: "2026-08-15" conference_date: "2026-12-01" url: "https://www.aclweb.org/" tier: 1 - name: "AISTATS 2027" full_name: "International Conference on Artificial Intelligence and Statistics" domains: ["ml", "statistics"] paper_deadline: "2026-10-10" notification: "2027-01-15" conference_date: "2027-04-15" url: "https://aistats.org/" tier: 2 - name: "ACML 2026" full_name: "Asian Conference on Machine Learning" domains: ["ml"] paper_deadline: "2026-06-15" notification: "2026-08-30" conference_date: "2026-11-15" url: "https://www.acml-conf.org/" tier: 2 - name: "WACV 2027" full_name: "Winter Conference on Applications of Computer Vision" domains: ["cv"] paper_deadline: "2026-08-01" notification: "2026-10-20" conference_date: "2027-01-06" url: "https://wacv2027.thecvf.com/" tier: 2 - name: "COLING 2026" full_name: "International Conference on Computational Linguistics" domains: ["nlp"] paper_deadline: "2026-05-10" notification: "2026-07-15" conference_date: 
"""Conference deadline data management."""

import logging
from dataclasses import dataclass
from datetime import date, datetime
from pathlib import Path
from typing import Any

logger = logging.getLogger(__name__)

# Directory containing the bundled conferences.yaml.
_DATA_DIR = Path(__file__).parent / "data"


@dataclass(frozen=True)
class Conference:
    """A single conference entry with its deadline dates."""

    name: str
    full_name: str
    domains: tuple[str, ...]
    tier: int
    url: str = ""
    abstract_deadline: date | None = None
    paper_deadline: date | None = None
    notification: date | None = None
    camera_ready: date | None = None
    conference_date: date | None = None

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Conference:
        """Parse a conference from a YAML dict.

        Raises KeyError when "name" is missing and ValueError for malformed
        date strings — ConferenceCalendar.load() catches and skips these.
        """

        def _parse_date(val: Any) -> date | None:
            if val is None:
                return None
            if isinstance(val, date):
                # yaml.safe_load already turns ISO dates into date objects.
                return val
            return datetime.strptime(str(val), "%Y-%m-%d").date()

        return cls(
            name=str(data["name"]),
            full_name=str(data.get("full_name", data["name"])),
            domains=tuple(data.get("domains") or ()),
            tier=int(data.get("tier", 3)),
            url=str(data.get("url", "")),
            abstract_deadline=_parse_date(data.get("abstract_deadline")),
            paper_deadline=_parse_date(data.get("paper_deadline")),
            notification=_parse_date(data.get("notification")),
            camera_ready=_parse_date(data.get("camera_ready")),
            conference_date=_parse_date(data.get("conference_date")),
        )

    @property
    def next_deadline(self) -> date | None:
        """Earliest upcoming deadline (abstract or paper), today inclusive."""
        today = date.today()
        candidates = [
            d
            for d in (self.abstract_deadline, self.paper_deadline)
            if d is not None and d >= today
        ]
        return min(candidates) if candidates else None

    @property
    def days_until_deadline(self) -> int | None:
        """Days until the next deadline, or None if all have passed."""
        nd = self.next_deadline
        if nd is None:
            return None
        return (nd - date.today()).days


class ConferenceCalendar:
    """Manage conference deadline data."""

    def __init__(self, conferences: list[Conference] | None = None):
        self._conferences: list[Conference] = conferences or []

    @classmethod
    def load_builtin(cls) -> ConferenceCalendar:
        """Load the built-in conferences.yaml (empty calendar if absent)."""
        yaml_path = _DATA_DIR / "conferences.yaml"
        if not yaml_path.exists():
            logger.warning("Built-in conferences.yaml not found at %s", yaml_path)
            return cls([])
        return cls.load(yaml_path)

    @classmethod
    def load(cls, path: Path | str) -> ConferenceCalendar:
        """Load conferences from a YAML file, skipping invalid entries."""
        # Imported lazily at the single call site so importing this module
        # does not require PyYAML unless a YAML file is actually loaded.
        import yaml

        path = Path(path)
        with path.open(encoding="utf-8") as f:
            data = yaml.safe_load(f) or {}
        entries = data.get("conferences", [])
        conferences = []
        for entry in entries:
            try:
                conferences.append(Conference.from_dict(entry))
            except (KeyError, ValueError, TypeError) as exc:
                logger.warning("Skipping invalid conference entry: %s", exc)
        return cls(conferences)

    @property
    def conferences(self) -> list[Conference]:
        # Defensive copy so callers cannot mutate internal state.
        return list(self._conferences)

    def get_upcoming(
        self,
        domains: list[str] | None = None,
        days: int = 90,
        tier: int | None = None,
    ) -> list[Conference]:
        """Get conferences with deadlines in the next N days, soonest first.

        FIX: ``next_deadline`` is a property that calls ``date.today()``;
        compute it once per conference and sort on the cached value instead
        of re-evaluating the property inside the sort key.
        """
        today = date.today()
        keyed: list[tuple[date, Conference]] = []
        for conf in self._conferences:
            nd = conf.next_deadline
            if nd is None:
                continue
            delta = (nd - today).days
            if delta < 0 or delta > days:
                continue
            if domains and not any(d in conf.domains for d in domains):
                continue
            if tier is not None and conf.tier > tier:
                continue
            keyed.append((nd, conf))
        keyed.sort(key=lambda pair: pair[0])
        return [conf for _, conf in keyed]

    def get_by_name(self, name: str) -> Conference | None:
        """Find a conference by name (case-insensitive partial match)."""
        name_lower = name.lower()
        for conf in self._conferences:
            if name_lower in conf.name.lower():
                return conf
        return None

    def get_by_domain(self, domain: str) -> list[Conference]:
        """Get all conferences tagged with a domain."""
        return [c for c in self._conferences if domain in c.domains]

    def format_upcoming(
        self,
        domains: list[str] | None = None,
        days: int = 90,
    ) -> str:
        """Format upcoming deadlines as a readable string."""
        upcoming = self.get_upcoming(domains=domains, days=days)
        if not upcoming:
            return "No upcoming deadlines in the next {} days.".format(days)
        lines = [f"Upcoming Conference Deadlines (next {days} days):", ""]
        for conf in upcoming:
            nd = conf.next_deadline
            days_left = conf.days_until_deadline
            dl_type = "abstract" if nd == conf.abstract_deadline else "paper"
            lines.append(f"  {conf.name} (Tier {conf.tier})")
            lines.append(
                f"    {dl_type} deadline: {nd} ({days_left} days left)"
            )
            if conf.url:
                lines.append(f"    URL: {conf.url}")
            lines.append("")
        return "\n".join(lines)
@dataclass(frozen=True)
class Reminder:
    """A deadline reminder."""

    conference_name: str
    deadline_type: str  # "abstract" or "paper"
    deadline_date: date
    days_until: int
    urgency: str  # "critical" | "warning" | "info"


class ReminderCalculator:
    """Calculate deadline reminders based on configuration."""

    def __init__(
        self,
        reminder_days: tuple[int, ...] = (30, 14, 7, 3, 1),
    ):
        # Stored in descending order (largest window first).
        self.reminder_days = sorted(reminder_days, reverse=True)

    @staticmethod
    def _deadline_pairs(conf: Conference) -> list[tuple[str, date | None]]:
        """The (type, date) deadline pairs a conference can trigger on."""
        return [
            ("abstract", conf.abstract_deadline),
            ("paper", conf.paper_deadline),
        ]

    def check(
        self,
        conferences: list[Conference],
        check_date: date | None = None,
    ) -> list[Reminder]:
        """Return reminders that fire exactly on ``check_date``.

        A reminder fires when the remaining days match one of the
        configured ``reminder_days`` milestones.
        """
        today = check_date or date.today()
        hits: list[Reminder] = []
        for conf in conferences:
            for dl_type, dl_date in self._deadline_pairs(conf):
                if dl_date is None:
                    continue
                remaining = (dl_date - today).days
                if remaining < 0 or remaining not in self.reminder_days:
                    continue
                hits.append(Reminder(
                    conference_name=conf.name,
                    deadline_type=dl_type,
                    deadline_date=dl_date,
                    days_until=remaining,
                    urgency=self._classify_urgency(remaining),
                ))
        return sorted(hits, key=lambda r: r.days_until)

    def get_active_reminders(
        self,
        conferences: list[Conference],
        check_date: date | None = None,
    ) -> list[Reminder]:
        """Return every reminder whose deadline lies inside the window.

        The window is the largest configured milestone (default 30 days).
        """
        today = check_date or date.today()
        window = max(self.reminder_days) if self.reminder_days else 30
        hits: list[Reminder] = []
        for conf in conferences:
            for dl_type, dl_date in self._deadline_pairs(conf):
                if dl_date is None:
                    continue
                remaining = (dl_date - today).days
                if not (0 <= remaining <= window):
                    continue
                hits.append(Reminder(
                    conference_name=conf.name,
                    deadline_type=dl_type,
                    deadline_date=dl_date,
                    days_until=remaining,
                    urgency=self._classify_urgency(remaining),
                ))
        return sorted(hits, key=lambda r: r.days_until)

    @staticmethod
    def _classify_urgency(days_until: int) -> str:
        """Map days-remaining to an urgency bucket."""
        if days_until <= 3:
            return "critical"
        return "warning" if days_until <= 14 else "info"

    def format_reminders(self, reminders: list[Reminder]) -> str:
        """Format reminders as a readable string."""
        if not reminders:
            return "No upcoming deadline reminders."
        icons = {"critical": "!!!", "warning": "!!", "info": "i"}
        lines = ["Deadline Reminders:", ""]
        for r in reminders:
            lines.append(
                f"  [{icons[r.urgency]}] {r.conference_name} — {r.deadline_type} deadline "
                f"in {r.days_until} days ({r.deadline_date})"
            )
        return "\n".join(lines)
Cannot install OpenCode.") return False try: r = subprocess.run( [npm_cmd, "i", "-g", "opencode-ai@latest"], capture_output=True, text=True, timeout=120, ) if r.returncode == 0: print(" OpenCode installed successfully!") return True else: print(f" Installation failed (exit {r.returncode}):") if r.stderr: for line in r.stderr.strip().splitlines()[:5]: print(f" {line}") return False except subprocess.TimeoutExpired: print(" Installation timed out.") return False except Exception as exc: # noqa: BLE001 print(f" Installation failed: {exc}") return False def _prompt_opencode_install() -> bool: """Interactively prompt the user to install OpenCode. Returns True if OpenCode is now available (already installed or just installed successfully). Returns False otherwise. """ if _is_opencode_installed(): return True if not sys.stdin.isatty(): return False print() print("=" * 60) print(" OpenCode Beast Mode (Recommended)") print("=" * 60) print() print(" OpenCode is an AI coding agent that dramatically improves") print(" experiment code generation for complex research tasks.") print() print(" With OpenCode enabled, ResearchClaw can generate multi-file") print(" experiment projects with custom architectures, training") print(" loops, and ablation studies — far beyond single-file limits.") print() if not _is_npm_installed(): print(" Node.js/npm is required but not installed.") print(" To install OpenCode later:") print(" 1. Install Node.js: https://nodejs.org/") print(" 2. Run: npm i -g opencode-ai@latest") print(" — or: researchclaw setup") print() return False try: answer = input(" Install OpenCode now? [Y/n]: ").strip().lower() except (EOFError, KeyboardInterrupt): print() return False if answer in ("", "y", "yes"): success = _install_opencode() if not success: print(" You can retry later with: researchclaw setup") return success else: print(" Skipped. 
You can install later with: researchclaw setup") return False def _resolve_config_or_exit(args: argparse.Namespace) -> Path | None: """Resolve config path from args, printing helpful errors on failure. Returns the resolved Path on success, or None if the config cannot be found (after printing an error message to stderr). """ path = resolve_config_path(getattr(args, "config", None)) if path is not None and not path.exists(): print(f"Error: config file not found: {path}", file=sys.stderr) return None if path is None: search_list = ", ".join(CONFIG_SEARCH_ORDER) print( f"Error: no config file found (searched: {search_list}).\n" f"Run 'researchclaw init' to create one from the example template.", file=sys.stderr, ) return None return path def _generate_run_id(topic: str) -> str: ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S") topic_hash = hashlib.sha256(topic.encode()).hexdigest()[:6] return f"rc-{ts}-{topic_hash}" def cmd_run(args: argparse.Namespace) -> int: resolved = _resolve_config_or_exit(args) if resolved is None: return 1 config_path = resolved topic = cast(str | None, args.topic) output = cast(str | None, args.output) from_stage_name = cast(str | None, args.from_stage) auto_approve = cast(bool, args.auto_approve) skip_preflight = cast(bool, args.skip_preflight) resume = cast(bool, args.resume) skip_noncritical = cast(bool, args.skip_noncritical_stage) no_graceful_degradation = cast(bool, args.no_graceful_degradation) kb_root_path = None config = RCConfig.load(config_path, check_paths=False) # Override graceful_degradation if CLI flag is set if no_graceful_degradation: import dataclasses as _dc_gd new_research = _dc_gd.replace(config.research, graceful_degradation=False) config = _dc_gd.replace(config, research=new_research) # Derive gate behavior from project.mode (CLI --auto-approve overrides) mode = config.project.mode.lower() if auto_approve: # Explicit CLI flag takes precedence over config mode stop_on_gate = False elif mode == "full-auto": 
        # full-auto behaves as if --auto-approve had been passed.
        auto_approve = True
        stop_on_gate = False
    else:
        # "semi-auto" and "docs-first" should block on gates
        stop_on_gate = True
    if topic:
        import dataclasses

        # Config objects are immutable dataclasses; swap in the CLI topic.
        new_research = dataclasses.replace(config.research, topic=topic)
        config = dataclasses.replace(config, research=new_research)

    # --- LLM Preflight ---
    if not skip_preflight:
        from researchclaw.llm import create_llm_client

        client = create_llm_client(config)
        print("Preflight check...", end=" ", flush=True)
        ok, msg = client.preflight()
        if ok:
            print(msg)
        else:
            # Abort early: no point starting the pipeline without a working LLM.
            print(f"FAILED — {msg}", file=sys.stderr)
            return 1

    run_id = _generate_run_id(config.research.topic)
    run_dir = Path(output or f"artifacts/{run_id}")

    # BUG-119: When --resume without --output, search for the most recent
    # existing run directory that matches the topic and has a checkpoint.
    if resume and not output:
        # Same 6-char topic hash scheme as _generate_run_id, so directory
        # names of earlier runs for this topic end in "-<topic_hash>".
        topic_hash = hashlib.sha256(config.research.topic.encode()).hexdigest()[:6]
        artifacts_root = Path("artifacts")
        if artifacts_root.is_dir():
            candidates = sorted(
                (
                    d
                    for d in artifacts_root.iterdir()
                    if d.is_dir()
                    and d.name.startswith("rc-")
                    and d.name.endswith(f"-{topic_hash}")
                    and (d / "checkpoint.json").exists()
                ),
                key=lambda d: d.name,
                reverse=True,  # newest first (timestamp in name)
            )
            if candidates:
                run_dir = candidates[0]
                run_id = run_dir.name
                print(f"Found existing run to resume: {run_dir}")
            else:
                print(
                    "Warning: --resume specified but no checkpoint found "
                    f"for topic hash '{topic_hash}'. Starting new run.",
                    file=sys.stderr,
                )
    run_dir.mkdir(parents=True, exist_ok=True)
    if config.knowledge_base.root:
        kb_root_path = Path(config.knowledge_base.root)
        kb_root_path.mkdir(parents=True, exist_ok=True)
    adapters = AdapterBundle()
    from researchclaw.pipeline.runner import execute_pipeline, read_checkpoint
    from researchclaw.pipeline.stages import Stage

    # --- Determine start stage ---
    # Priority: explicit --from-stage > --resume checkpoint > TOPIC_INIT.
    from_stage = Stage.TOPIC_INIT
    if from_stage_name:
        try:
            from_stage = Stage[from_stage_name.upper()]
        except KeyError:
            valid = ", ".join(s.name for s in Stage)
            print(
                f"Error: unknown stage '{from_stage_name}'. "
                f"Valid stages: {valid}",
                file=sys.stderr,
            )
            return 1
    elif resume:
        resumed = read_checkpoint(run_dir)
        if resumed is not None:
            from_stage = resumed
            print(f"Resuming from checkpoint: Stage {int(from_stage)}: {from_stage.name}")

    from researchclaw import __version__

    print(f"ResearchClaw v{__version__} — Starting pipeline")
    print(f" Run ID: {run_id}")
    print(f" Topic: {config.research.topic}")
    print(f" Output: {run_dir}")
    print(f" Mode: {config.project.mode}")
    print(f" From: Stage {int(from_stage)}: {from_stage.name}")
    # Hint: OpenCode beast mode
    exp_cfg = getattr(config, "experiment", None)
    oc_cfg = getattr(exp_cfg, "opencode", None)
    if oc_cfg and getattr(oc_cfg, "enabled", False) and not _is_opencode_installed():
        print()
        print(" Hint: OpenCode beast mode is enabled but not installed.")
        print(" Run 'researchclaw setup' to install for better code generation.")
        print()
    results = execute_pipeline(
        run_dir=run_dir,
        run_id=run_id,
        config=config,
        adapters=adapters,
        from_stage=from_stage,
        auto_approve_gates=auto_approve,
        stop_on_gate=stop_on_gate,
        skip_noncritical=skip_noncritical,
        kb_root=kb_root_path,
    )
    # Exit code 1 if any stage failed, 0 otherwise.
    done = sum(1 for r in results if r.status.value == "done")
    failed = sum(1 for r in results if r.status.value == "failed")
    print(f"\nPipeline complete: {done}/{len(results)} stages done, {failed} failed")
    return 0 if failed == 0 else 1


def cmd_validate(args: argparse.Namespace) -> int:
    from researchclaw.config import validate_config
    import yaml

    resolved = _resolve_config_or_exit(args)
    if resolved is None:
        return 1
    config_path = resolved
    no_check_paths = cast(bool, args.no_check_paths)
    with config_path.open(encoding="utf-8") as f:
        loaded = cast(object, yaml.safe_load(f))
    # Normalize the YAML root: empty file -> {}, mapping -> str-keyed dict,
    # anything else (list/scalar) is rejected.
    if loaded is None:
        data: dict[str, object] = {}
    elif isinstance(loaded, dict):
        loaded_map = cast(Mapping[object, object], loaded)
        data = {str(key): value for key, value in loaded_map.items()}
    else:
        print("Config validation FAILED:")
        print(" Error: Config root must be a mapping")
        return 1
    result = validate_config(data, check_paths=not no_check_paths)
    if result.ok:
        print("Config validation passed")
        for w in result.warnings:
            print(f" Warning: {w}")
        return 0
    else:
        print("Config validation FAILED:")
        for e in result.errors:
            print(f" Error: {e}")
        return 1


def cmd_doctor(args: argparse.Namespace) -> int:
    # Exit 0 only when the doctor report passes overall.
    resolved = _resolve_config_or_exit(args)
    if resolved is None:
        return 1
    config_path = resolved
    output = cast(str | None, args.output)
    report = run_doctor(config_path)
    print_doctor_report(report)
    if output:
        write_doctor_report(report, Path(output))
    return 0 if report.overall == "pass" else 1


def cmd_project(args: argparse.Namespace) -> int:
    """C1: Multi-project management commands."""
    from researchclaw.project.manager import ProjectManager

    action = cast(str, args.project_action)
    config_path = Path(cast(str, args.config))
    config = RCConfig.load(config_path, check_paths=False)
    pm = ProjectManager(Path(config.multi_project.projects_dir))
    if action == "list":
        projects = pm.list_all()
        if not projects:
            print("No projects found.")
        for p in projects:
            # "*" marks the currently active project.
            marker = " *" if pm.active and pm.active.name == p.name else ""
            print(f" {p.name} [{p.status}]{marker}")
        return 0
    elif action == "status":
        status = pm.get_status()
        print(f"Total projects: {status['total']}")
        print(f"Active: {status.get('active', 'none')}")
        return 0
    elif action == "create":
        name = cast(str, args.name)
        topic = cast(str | None, getattr(args, "topic", None))
        proj = pm.create(name, str(config_path), topic=topic or "")
        print(f"Created project: {proj.name}")
        return 0
    elif action == "switch":
        name = cast(str, args.name)
        pm.switch(name)
        print(f"Switched to project: {name}")
        return 0
    elif action == "compare":
        names = cast(list[str], args.names)
        if len(names) != 2:
            print("Error: compare requires exactly 2 project names", file=sys.stderr)
            return 1
        result = pm.compare(names[0], names[1])
        print(f"Comparing {names[0]} vs {names[1]}:")
        for k, v in result.get("metric_diff", {}).items():
            print(f" {k}: delta={v['delta']:.4f}")
        return 0
    else:
        print(f"Unknown project action: {action}", file=sys.stderr)
        return 1


def cmd_mcp(args: argparse.Namespace) -> int:
    """C3: MCP integration commands."""
    import asyncio

    start = cast(bool, args.start)
    if start:
        from researchclaw.mcp.server import ResearchClawMCPServer

        server = ResearchClawMCPServer()
        print("Starting MCP server...")
        asyncio.run(server.start())
        return 0
    else:
        # Without --start, just list the available MCP tools.
        from researchclaw.mcp.tools import list_tool_names

        names = list_tool_names()
        print("Available MCP tools:")
        for name in names:
            print(f" {name}")
        return 0


def cmd_overleaf(args: argparse.Namespace) -> int:
    """C4: Overleaf sync commands."""
    config_path = Path(cast(str, args.config))
    config = RCConfig.load(config_path, check_paths=False)
    if not config.overleaf.enabled:
        print("Overleaf sync is not enabled in config.", file=sys.stderr)
        return 1
    from researchclaw.overleaf.sync import OverleafSync

    sync = OverleafSync(
        git_url=config.overleaf.git_url,
        branch=config.overleaf.branch,
    )
    do_sync = cast(bool, args.sync)
    do_status = cast(bool, args.status)
    if do_status:
        status = sync.get_status()
        for k, v in status.items():
            print(f" {k}: {v}")
        return 0
    elif do_sync:
        run_dir = Path(cast(str, args.run_dir))
        if not run_dir.exists():
            print(f"Error: run_dir not found: {run_dir}", file=sys.stderr)
            return 1
        sync.setup(run_dir)
        sync.pull_changes()
        print("Overleaf sync complete.")
        return 0
    else:
        print("Use --sync or --status", file=sys.stderr)
        return 1


def cmd_serve(args: argparse.Namespace) -> int:
    """Start the FastAPI web server."""
    config_path = Path(cast(str, args.config))
    if not config_path.exists():
        print(f"Error: config file not found: {config_path}", file=sys.stderr)
        return 1
    config = RCConfig.load(config_path, check_paths=False)
    # CLI --host/--port override the config values when non-empty/non-zero.
    host = cast(str, args.host) or config.server.host
    port = int(cast(int, args.port) or config.server.port)
    try:
        from researchclaw.server.app import create_app
        import uvicorn
    except ImportError as exc:
        print(
            f"Error: web dependencies not installed — pip install researchclaw[web]\n{exc}",
            file=sys.stderr,
        )
        return 1
    app = create_app(config, monitor_dir=args.monitor_dir)
    uvicorn.run(app, host=host, port=port)
    return 0


def cmd_dashboard(args: argparse.Namespace) -> int:
    """Start dashboard-only server (no pipeline control)."""
    config_path = Path(cast(str, args.config))
    if not config_path.exists():
        print(f"Error: config file not found: {config_path}", file=sys.stderr)
        return 1
    config = RCConfig.load(config_path, check_paths=False)
    host = cast(str, args.host) or config.server.host
    port = int(cast(int, args.port) or config.server.port)
    try:
        from researchclaw.server.app import create_app
        import uvicorn
    except ImportError as exc:
        print(
            f"Error: web dependencies not installed — pip install researchclaw[web]\n{exc}",
            file=sys.stderr,
        )
        return 1
    # dashboard_only disables pipeline control endpoints.
    app = create_app(config, dashboard_only=True, monitor_dir=args.monitor_dir)
    uvicorn.run(app, host=host, port=port)
    return 0


def cmd_wizard(args: argparse.Namespace) -> int:
    """Run the interactive setup wizard."""
    from researchclaw.wizard.quickstart import QuickStartWizard

    wizard = QuickStartWizard()
    output = cast(str | None, args.output)
    import yaml

    config = wizard.run_interactive()
    if output:
        Path(output).write_text(yaml.dump(config, default_flow_style=False))
        print(f"Config written to {output}")
    else:
        print(yaml.dump(config, default_flow_style=False))
    return 0


# Menu choice -> (provider name, env var holding the API key).
# An empty env var means the provider needs no key (ACP).
_PROVIDER_CHOICES = {
    "1": ("openai", "OPENAI_API_KEY"),
    "2": ("openrouter", "OPENROUTER_API_KEY"),
    "3": ("deepseek", "DEEPSEEK_API_KEY"),
    "4": ("minimax", "MINIMAX_API_KEY"),
    "5": ("acp", ""),
}

# Provider name -> OpenAI-compatible base URL.
_PROVIDER_URLS = {
    "openai": "https://api.openai.com/v1",
    "openrouter": "https://openrouter.ai/api/v1",
    "deepseek": "https://api.deepseek.com/v1",
    "minimax": "https://api.minimax.io/v1",
}

# Provider name -> (primary model, fallback models).
_PROVIDER_MODELS = {
    "openai": ("gpt-4o", ["gpt-4.1", "gpt-4o-mini"]),
    "openrouter": (
        "anthropic/claude-3.5-sonnet",
        ["google/gemini-pro-1.5", "meta-llama/llama-3.1-70b-instruct"],
    ),
    "deepseek": ("deepseek-chat", ["deepseek-reasoner"]),
    "minimax": ("MiniMax-M2.5", ["MiniMax-M2.5-highspeed"]),
}


def cmd_init(args: argparse.Namespace) -> int:
    force = cast(bool, args.force)
    dest = Path("config.arc.yaml")
    if dest.exists() and not force:
        print(f"{dest} already exists. Use --force to overwrite.", file=sys.stderr)
        return 1
    # Look for the example config: first in repo root (relative to package),
    # then in CWD (for development), then bundled in the package data dir.
_candidates = [ Path(__file__).resolve().parent.parent / EXAMPLE_CONFIG, # repo root Path.cwd() / EXAMPLE_CONFIG, # cwd fallback Path(__file__).resolve().parent / "data" / EXAMPLE_CONFIG, # packaged ] example = next((p for p in _candidates if p.exists()), None) if example is None: print( f"Error: example config not found.\n" f"Searched: {', '.join(str(c) for c in _candidates)}", file=sys.stderr, ) return 1 # Interactive provider prompt (TTY only, else default to openai) choice = "1" if sys.stdin.isatty(): print("Select LLM provider:") print(" 1) openai (requires OPENAI_API_KEY)") print(" 2) openrouter (requires OPENROUTER_API_KEY)") print(" 3) deepseek (requires DEEPSEEK_API_KEY)") print(" 4) minimax (requires MINIMAX_API_KEY)") print(" 5) acp (local AI agent — no API key needed)") try: raw = input("Choice [1]: ").strip() except (EOFError, KeyboardInterrupt): raw = "" if raw in _PROVIDER_CHOICES: choice = raw provider, api_key_env = _PROVIDER_CHOICES[choice] content = example.read_text(encoding="utf-8") # String-based replacement to preserve YAML comments content = content.replace( 'provider: "openai-compatible"', f'provider: "{provider}"' ) if provider == "acp": # ACP doesn't need base_url or api_key content = content.replace( 'base_url: "https://api.openai.com/v1"', 'base_url: ""' ) content = content.replace('api_key_env: "OPENAI_API_KEY"', 'api_key_env: ""') else: base_url = _PROVIDER_URLS.get(provider, "https://api.openai.com/v1") content = content.replace( 'base_url: "https://api.openai.com/v1"', f'base_url: "{base_url}"' ) if api_key_env: content = content.replace( 'api_key_env: "OPENAI_API_KEY"', f'api_key_env: "{api_key_env}"' ) if provider in _PROVIDER_MODELS: primary, fallbacks = _PROVIDER_MODELS[provider] content = content.replace('primary_model: "gpt-4o"', f'primary_model: "{primary}"') # Replace fallback models block old_fallbacks = ' fallback_models:\n - "gpt-4.1"\n - "gpt-4o-mini"' new_fallbacks = " fallback_models:\n" + "".join( f' - "{m}"\n' for m 
in fallbacks ) content = content.replace(old_fallbacks, new_fallbacks.rstrip("\n")) dest.write_text(content, encoding="utf-8") print(f"Created {dest} (provider: {provider})") if provider == "acp": print("\nNext steps:") print(" 1. Ensure your ACP agent is installed and on PATH") print(" 2. Edit config.arc.yaml to set llm.acp.agent if needed") print(" 3. Run: researchclaw doctor") else: env_var = api_key_env or "OPENAI_API_KEY" print(f"\nNext steps:") print(f" 1. Export your API key: export {env_var}=sk-...") print(" 2. Edit config.arc.yaml to customize your settings") print(" 3. Run: researchclaw doctor") # Offer OpenCode installation _prompt_opencode_install() return 0 def cmd_setup(args: argparse.Namespace) -> int: """Post-install setup — check and install optional tools.""" print("ResearchClaw — Environment Setup\n") # 1. OpenCode if _is_opencode_installed(): try: opencode_cmd = shutil.which("opencode") or "opencode" r = subprocess.run( [opencode_cmd, "--version"], capture_output=True, text=True, timeout=15, ) ver = r.stdout.strip() or "unknown" except Exception: # noqa: BLE001 ver = "unknown" print(f" [OK] OpenCode is installed (version: {ver})") else: installed = _prompt_opencode_install() if installed: print(" [OK] OpenCode is now available") else: print(" [--] OpenCode not installed (beast mode will be unavailable)") # 2. Docker (informational) print() if shutil.which("docker"): print(" [OK] Docker is available (sandbox execution enabled)") else: print(" [--] Docker not found (experiment sandbox unavailable)") print(" Install: https://docs.docker.com/get-docker/") # 3. 
LaTeX (informational) if shutil.which("pdflatex"): print(" [OK] LaTeX is available (PDF paper compilation enabled)") else: print(" [--] LaTeX not found (paper will be exported as .tex only)") print(" Install: sudo apt install texlive-full (or equivalent)") print() print("Run 'researchclaw doctor' for a full environment health check.") return 0 def cmd_report(args: argparse.Namespace) -> int: from researchclaw.report import generate_report, write_report run_dir = Path(cast(str, args.run_dir)) output = cast(str | None, args.output) try: report = generate_report(run_dir) except (FileNotFoundError, ValueError) as e: print(f"Error: {e}", file=sys.stderr) return 1 print(report) if output: write_report(run_dir, Path(output)) print(f"\nReport written to {output}") return 0 # ── Research Enhancement commands (Agent D) ─────────────────────── def cmd_trends(args: argparse.Namespace) -> int: """Research trend tracking commands.""" config_path = Path(cast(str, args.config)) if not config_path.exists(): print(f"Error: config file not found: {config_path}", file=sys.stderr) return 1 config = RCConfig.load(config_path, check_paths=False) import asyncio from researchclaw.trends.feeds import FeedManager from researchclaw.trends.trend_analyzer import TrendAnalyzer domains = cast(list[str] | None, args.domains) or list(config.research.domains) if not domains: domains = ["machine learning"] feed_manager = FeedManager( sources=config.trends.sources, s2_api_key=config.llm.s2_api_key, ) if cast(bool, args.digest): from researchclaw.trends.daily_digest import DailyDigest digest = DailyDigest(feed_manager) result = asyncio.run(digest.generate(domains, config.trends.max_papers_per_day)) print(result) return 0 if cast(bool, args.analyze): papers = feed_manager.fetch_recent_papers(domains, max_papers=50) analyzer = TrendAnalyzer() analysis = analyzer.analyze(papers, config.trends.trend_window_days) print(analyzer.generate_trend_report(analysis)) return 0 if cast(bool, args.suggest_topics): 
        from researchclaw.trends.auto_topic import AutoTopicGenerator
        from researchclaw.trends.opportunity_finder import OpportunityFinder

        papers = feed_manager.fetch_recent_papers(domains, max_papers=50)
        analyzer = TrendAnalyzer()
        finder = OpportunityFinder()
        generator = AutoTopicGenerator(analyzer, finder)
        candidates = asyncio.run(generator.generate_candidates(domains, papers))
        print(generator.format_candidates(candidates))
        return 0
    # No action flag given: show usage.
    print("Usage: researchclaw trends --digest|--analyze|--suggest-topics")
    return 0


def cmd_calendar(args: argparse.Namespace) -> int:
    """Conference deadline calendar commands."""
    from researchclaw.calendar.deadlines import ConferenceCalendar
    from researchclaw.calendar.planner import SubmissionPlanner

    calendar = ConferenceCalendar.load_builtin()
    domains = cast(list[str] | None, args.domains)
    if cast(bool, args.upcoming):
        print(calendar.format_upcoming(domains=domains))
        return 0
    plan_venue = cast(str | None, args.plan)
    if plan_venue:
        planner = SubmissionPlanner(calendar)
        print(planner.format_plan(plan_venue))
        return 0
    print("Usage: researchclaw calendar --upcoming|--plan ")
    return 0


def main(argv: list[str] | None = None) -> int:
    # Build the argparse tree: one subparser per CLI command.
    parser = argparse.ArgumentParser(
        prog="researchclaw",
        description="ResearchClaw — Autonomous Research Pipeline",
    )
    sub = parser.add_subparsers(dest="command")
    run_p = sub.add_parser("run", help="Run the 23-stage research pipeline")
    _ = run_p.add_argument("--topic", "-t", help="Override research topic")
    _ = run_p.add_argument(
        "--config",
        "-c",
        default=None,
        help="Config file (default: auto-detect config.arc.yaml or config.yaml)",
    )
    _ = run_p.add_argument("--output", "-o", help="Output directory")
    _ = run_p.add_argument(
        "--from-stage", help="Start from a specific stage (e.g. PAPER_OUTLINE)"
    )
    _ = run_p.add_argument(
        "--auto-approve", action="store_true", help="Auto-approve gate stages"
    )
    _ = run_p.add_argument(
        "--skip-preflight", action="store_true", help="Skip LLM preflight check"
    )
    _ = run_p.add_argument(
        "--resume", action="store_true", help="Resume from last checkpoint"
    )
    _ = run_p.add_argument(
        "--skip-noncritical-stage",
        action="store_true",
        help="Skip noncritical stages on failure instead of aborting"
    )
    _ = run_p.add_argument(
        "--no-graceful-degradation",
        action="store_true",
        help="Disable graceful degradation: fail pipeline on quality gate failure"
    )
    val_p = sub.add_parser("validate", help="Validate config file")
    _ = val_p.add_argument(
        "--config",
        "-c",
        default=None,
        help="Config file (default: auto-detect config.arc.yaml or config.yaml)",
    )
    _ = val_p.add_argument(
        "--no-check-paths", action="store_true", help="Skip path existence checks"
    )
    doc_p = sub.add_parser("doctor", help="Check environment and configuration health")
    _ = doc_p.add_argument(
        "--config",
        "-c",
        default=None,
        help="Config file (default: auto-detect config.arc.yaml or config.yaml)",
    )
    _ = doc_p.add_argument("--output", "-o", help="Write JSON report to file")
    init_p = sub.add_parser("init", help="Create config.arc.yaml from example template")
    _ = init_p.add_argument(
        "--force", action="store_true", help="Overwrite existing config.arc.yaml"
    )
    _ = sub.add_parser("setup", help="Check and install optional tools (OpenCode, etc.)")
    rpt_p = sub.add_parser("report", help="Generate human-readable run report")
    _ = rpt_p.add_argument(
        "--run-dir", required=True, help="Path to run artifacts directory"
    )
    _ = rpt_p.add_argument("--output", "-o", help="Write report to file")
    # A: Web platform
    srv_p = sub.add_parser("serve", help="Start the web server")
    _ = srv_p.add_argument("--config", "-c", default="config.yaml", help="Config file path")
    _ = srv_p.add_argument("--host", default="", help="Host to bind (default from config)")
    _ = srv_p.add_argument("--port", type=int, default=0, help="Port (default from config)")
    _ = srv_p.add_argument("--monitor-dir", help="Artifacts dir to monitor")
    dash_p = sub.add_parser("dashboard", help="Start dashboard-only server")
    _ = dash_p.add_argument("--config", "-c", default="config.yaml", help="Config file path")
    _ = dash_p.add_argument("--host", default="", help="Host to bind")
    _ = dash_p.add_argument("--port", type=int, default=0, help="Port")
    _ = dash_p.add_argument("--monitor-dir", help="Artifacts dir to monitor")
    wiz_p = sub.add_parser("wizard", help="Run the setup wizard")
    _ = wiz_p.add_argument("--output", "-o", help="Write config to file")
    # C1: Multi-project management
    proj_p = sub.add_parser("project", help="Multi-project management")
    _ = proj_p.add_argument(
        "project_action",
        choices=["list", "status", "create", "switch", "compare"],
        help="Project action",
    )
    _ = proj_p.add_argument("--name", "-n", help="Project name")
    _ = proj_p.add_argument("--names", nargs="*", help="Project names (for compare)")
    _ = proj_p.add_argument("--topic", "-t", help="Research topic")
    _ = proj_p.add_argument(
        "--config", "-c", default="config.yaml", help="Config file path"
    )
    # C3: MCP integration
    mcp_p = sub.add_parser("mcp", help="MCP integration")
    _ = mcp_p.add_argument(
        "--start", action="store_true", help="Start MCP server"
    )
    # C4: Overleaf sync
    ovl_p = sub.add_parser("overleaf", help="Overleaf bidirectional sync")
    _ = ovl_p.add_argument("--sync", action="store_true", help="Run sync")
    _ = ovl_p.add_argument("--status", action="store_true", help="Show status")
    _ = ovl_p.add_argument("--run-dir", help="Run artifacts directory")
    _ = ovl_p.add_argument(
        "--config", "-c", default="config.yaml", help="Config file path"
    )
    # D1: Research trend tracking
    trends_p = sub.add_parser("trends", help="Research trend tracking")
    _ = trends_p.add_argument("--digest", action="store_true", help="Generate daily digest")
    _ = trends_p.add_argument("--analyze", action="store_true", help="Analyze trends")
    _ = trends_p.add_argument(
        "--suggest-topics", action="store_true", help="Suggest research topics"
    )
    _ = trends_p.add_argument("--config", "-c", default="config.yaml", help="Config file path")
    _ = trends_p.add_argument("--domains", nargs="+", help="Override domains")
    # D4: Conference deadline calendar
    cal_p = sub.add_parser("calendar", help="Conference deadline calendar")
    _ = cal_p.add_argument("--upcoming", action="store_true", help="Show upcoming deadlines")
    _ = cal_p.add_argument("--plan", help="Generate submission timeline for a venue")
    _ = cal_p.add_argument("--domains", nargs="+", help="Filter by domain")
    args = parser.parse_args(argv)
    command = cast(str | None, args.command)
    # Dispatch to the matching subcommand handler; no command -> help text.
    if command == "run":
        return cmd_run(args)
    elif command == "validate":
        return cmd_validate(args)
    elif command == "doctor":
        return cmd_doctor(args)
    elif command == "init":
        return cmd_init(args)
    elif command == "setup":
        return cmd_setup(args)
    elif command == "report":
        return cmd_report(args)
    elif command == "serve":
        return cmd_serve(args)
    elif command == "dashboard":
        return cmd_dashboard(args)
    elif command == "wizard":
        return cmd_wizard(args)
    elif command == "project":
        return cmd_project(args)
    elif command == "mcp":
        return cmd_mcp(args)
    elif command == "overleaf":
        return cmd_overleaf(args)
    elif command == "trends":
        return cmd_trends(args)
    elif command == "calendar":
        return cmd_calendar(args)
    else:
        parser.print_help()
        return 0


if __name__ == "__main__":
    sys.exit(main())


================================================
FILE: researchclaw/collaboration/__init__.py
================================================
"""Agent collaboration and knowledge sharing system.

Enables multiple AutoResearchClaw instances to share research artifacts
(literature summaries, experiment results, code templates, review feedback)
through a file-system-based shared repository.
""" from researchclaw.collaboration.repository import ResearchRepository from researchclaw.collaboration.publisher import ArtifactPublisher from researchclaw.collaboration.subscriber import ArtifactSubscriber from researchclaw.collaboration.dedup import deduplicate_artifacts __all__ = [ "ResearchRepository", "ArtifactPublisher", "ArtifactSubscriber", "deduplicate_artifacts", ] ================================================ FILE: researchclaw/collaboration/dedup.py ================================================ """Cross-instance deduplication for shared artifacts.""" from __future__ import annotations import hashlib import logging from typing import Any logger = logging.getLogger(__name__) def content_hash(content: Any) -> str: """Compute a content hash for deduplication. Args: content: Content to hash (str, dict, or list). Returns: Hex digest string. """ if isinstance(content, (dict, list)): import json text = json.dumps(content, sort_keys=True, default=str) else: text = str(content) return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16] def deduplicate_artifacts( artifacts: list[dict[str, Any]], key: str = "content", ) -> list[dict[str, Any]]: """Remove duplicate artifacts based on content hash. Args: artifacts: List of artifact dicts. key: Dict key containing the content to compare. Returns: Deduplicated list, preserving first occurrence. 
""" seen: set[str] = set() unique: list[dict[str, Any]] = [] for artifact in artifacts: h = content_hash(artifact.get(key, "")) if h not in seen: seen.add(h) unique.append(artifact) removed = len(artifacts) - len(unique) if removed > 0: logger.info("Deduplication removed %d artifacts", removed) return unique ================================================ FILE: researchclaw/collaboration/publisher.py ================================================ """Artifact publisher — extracts and publishes research artifacts from pipeline runs.""" from __future__ import annotations import json import logging from pathlib import Path from typing import Any from researchclaw.collaboration.repository import ResearchRepository logger = logging.getLogger(__name__) class ArtifactPublisher: """Extracts artifacts from pipeline run directories and publishes them. Scans stage output directories for relevant files and publishes structured summaries to the shared repository. """ def __init__(self, repository: ResearchRepository) -> None: self._repo = repository def publish_from_run_dir( self, run_id: str, run_dir: Path, ) -> int: """Extract and publish all artifacts from a pipeline run directory. Args: run_id: Unique run identifier. run_dir: Path to the pipeline run output directory. Returns: Number of artifacts published. 
""" artifacts: dict[str, Any] = {} # Literature summary (from stage 7 - synthesis) lit_summary = self._extract_literature(run_dir) if lit_summary: artifacts["literature_summary"] = lit_summary # Experiment results (from stage 14 - result_analysis) exp_results = self._extract_experiments(run_dir) if exp_results: artifacts["experiment_results"] = exp_results # Code template (from stage 10 - code_generation) code_template = self._extract_code(run_dir) if code_template: artifacts["code_template"] = code_template # Review feedback (from stage 18 - peer_review) review = self._extract_review(run_dir) if review: artifacts["review_feedback"] = review if not artifacts: logger.info("No artifacts found in run dir: %s", run_dir) return 0 return self._repo.publish(run_id, artifacts) def _extract_literature(self, run_dir: Path) -> Any: """Extract literature summary from stage 7.""" for stage_dir in run_dir.glob("stage-07*"): for name in ("synthesis.md", "synthesis.json"): path = stage_dir / name if path.exists(): return path.read_text(encoding="utf-8")[:5000] return None def _extract_experiments(self, run_dir: Path) -> Any: """Extract experiment results from stage 14.""" for stage_dir in run_dir.glob("stage-14*"): summary = stage_dir / "experiment_summary.json" if summary.exists(): try: return json.loads(summary.read_text(encoding="utf-8")) except json.JSONDecodeError: pass return None def _extract_code(self, run_dir: Path) -> Any: """Extract code template from stage 10.""" for stage_dir in run_dir.glob("stage-10*"): main_py = stage_dir / "main.py" if main_py.exists(): return main_py.read_text(encoding="utf-8")[:10000] return None def _extract_review(self, run_dir: Path) -> Any: """Extract review feedback from stage 18.""" for stage_dir in run_dir.glob("stage-18*"): review = stage_dir / "review.md" if review.exists(): return review.read_text(encoding="utf-8")[:5000] return None ================================================ FILE: researchclaw/collaboration/repository.py 
================================================ """Shared knowledge repository for cross-instance collaboration.""" from __future__ import annotations import json import logging from datetime import datetime, timezone from pathlib import Path from typing import Any logger = logging.getLogger(__name__) ARTIFACT_TYPES = ( "literature_summary", "experiment_results", "code_template", "review_feedback", ) class ResearchRepository: """File-system-based shared knowledge repository. Artifacts are organized by run_id and type, enabling cross-instance search and import of research outputs. """ def __init__(self, repo_dir: str | Path = ".researchclaw/shared") -> None: self._repo_dir = Path(repo_dir) @property def repo_dir(self) -> Path: """Return the repository directory path.""" return self._repo_dir def publish(self, run_id: str, artifacts: dict[str, Any]) -> int: """Publish research artifacts to the shared repository. Args: run_id: Unique identifier for the pipeline run. artifacts: Dict mapping artifact type to content. Supported types: literature_summary, experiment_results, code_template, review_feedback. Returns: Number of artifacts published. """ run_dir = self._repo_dir / run_id run_dir.mkdir(parents=True, exist_ok=True) count = 0 for artifact_type, content in artifacts.items(): if artifact_type not in ARTIFACT_TYPES: logger.warning("Unknown artifact type: %s", artifact_type) continue artifact_path = run_dir / f"{artifact_type}.json" payload = { "run_id": run_id, "type": artifact_type, "content": content, "published_at": datetime.now(timezone.utc).isoformat( timespec="seconds" ), } artifact_path.write_text( json.dumps(payload, indent=2, ensure_ascii=False, default=str), encoding="utf-8", ) count += 1 logger.info("Published %d artifacts for run %s", count, run_id) return count def search( self, query: str, artifact_type: str | None = None, max_results: int = 10, ) -> list[dict[str, Any]]: """Search for artifacts matching a query. 
Simple keyword-based search (case-insensitive substring match). Args: query: Search query string. artifact_type: Filter by type (optional). max_results: Maximum number of results. Returns: List of matching artifact dicts. """ if not self._repo_dir.exists(): return [] results: list[dict[str, Any]] = [] query_lower = query.lower() for run_dir in sorted(self._repo_dir.iterdir(), reverse=True): if not run_dir.is_dir(): continue for artifact_file in run_dir.glob("*.json"): if artifact_type and artifact_file.stem != artifact_type: continue try: payload = json.loads( artifact_file.read_text(encoding="utf-8") ) except (json.JSONDecodeError, OSError): continue content_str = json.dumps(payload.get("content", ""), default=str).lower() if query_lower in content_str: results.append(payload) if len(results) >= max_results: return results return results def list_runs(self) -> list[str]: """List all run IDs in the repository. Returns: List of run ID strings, most recent first. """ if not self._repo_dir.exists(): return [] return sorted( [d.name for d in self._repo_dir.iterdir() if d.is_dir()], reverse=True, ) def get_run_artifacts(self, run_id: str) -> dict[str, Any]: """Get all artifacts for a specific run. Args: run_id: The run identifier. Returns: Dict mapping artifact type to content. """ run_dir = self._repo_dir / run_id if not run_dir.exists(): return {} artifacts: dict[str, Any] = {} for artifact_file in run_dir.glob("*.json"): try: payload = json.loads(artifact_file.read_text(encoding="utf-8")) artifacts[artifact_file.stem] = payload.get("content") except (json.JSONDecodeError, OSError): continue return artifacts def import_literature(self, source_run_id: str) -> list[dict[str, Any]]: """Import literature summaries from another run. Args: source_run_id: The source run ID to import from. Returns: List of literature summary dicts. 
""" artifacts = self.get_run_artifacts(source_run_id) content = artifacts.get("literature_summary") if content is None: return [] if isinstance(content, list): return content return [content] def import_code_template( self, source_run_id: str, pattern: str, ) -> str | None: """Import a code template from another run. Args: source_run_id: The source run ID. pattern: Substring to search for in the code template. Returns: Matching code template string, or None. """ artifacts = self.get_run_artifacts(source_run_id) content = artifacts.get("code_template") if content is None: return None content_str = str(content) if pattern.lower() in content_str.lower(): return content_str return None ================================================ FILE: researchclaw/collaboration/subscriber.py ================================================ """Artifact subscriber — queries and imports shared artifacts.""" from __future__ import annotations import logging from typing import Any from researchclaw.collaboration.repository import ResearchRepository logger = logging.getLogger(__name__) class ArtifactSubscriber: """Subscribes to and imports artifacts from the shared repository. Provides convenience methods for finding and importing relevant artifacts for a new pipeline run. """ def __init__(self, repository: ResearchRepository) -> None: self._repo = repository def find_relevant_literature( self, topic: str, max_results: int = 5, ) -> list[dict[str, Any]]: """Find literature summaries relevant to a topic. Args: topic: Research topic to search for. max_results: Maximum results. Returns: List of matching literature artifacts. """ return self._repo.search( topic, artifact_type="literature_summary", max_results=max_results ) def find_similar_experiments( self, query: str, max_results: int = 5, ) -> list[dict[str, Any]]: """Find experiment results similar to the current task. Args: query: Search query describing the experiment. max_results: Maximum results. 
Returns: List of matching experiment artifacts. """ return self._repo.search( query, artifact_type="experiment_results", max_results=max_results ) def find_code_templates( self, query: str, max_results: int = 3, ) -> list[dict[str, Any]]: """Find reusable code templates. Args: query: Search query. max_results: Maximum results. Returns: List of matching code template artifacts. """ return self._repo.search( query, artifact_type="code_template", max_results=max_results ) def import_best_practices( self, topic: str, ) -> str: """Compile best practices from historical runs for a topic. Args: topic: Current research topic. Returns: Formatted string of best practices for prompt injection. """ parts: list[str] = [] # Literature insights lit_results = self.find_relevant_literature(topic, max_results=3) if lit_results: parts.append("### Related Literature (from prior runs)") for result in lit_results: content = result.get("content", "") if isinstance(content, str): parts.append(f"- {content[:200]}...") run_id = result.get("run_id", "?") parts.append(f" (from run: {run_id})") # Experiment insights exp_results = self.find_similar_experiments(topic, max_results=3) if exp_results: parts.append("\n### Related Experiments (from prior runs)") for result in exp_results: run_id = result.get("run_id", "?") parts.append(f"- Experiment from run {run_id}") if not parts: return "" return "\n".join(parts) ================================================ FILE: researchclaw/config.py ================================================ """ResearchClaw config loading and validation.""" from __future__ import annotations from dataclasses import asdict, dataclass, field from pathlib import Path from typing import Any import sys import yaml DEFAULT_PYTHON_PATH = ".venv/Scripts/python.exe" if sys.platform == "win32" else ".venv/bin/python3" CONFIG_SEARCH_ORDER: tuple[str, ...] 
= ("config.arc.yaml", "config.yaml") def _safe_int(val: Any, default: int) -> int: """Convert value to int, handling None/null YAML values.""" if val is None: return default try: return int(val) except (ValueError, TypeError): return default _VALID_NETWORK_POLICIES = {"none", "setup_only", "pip_only", "full"} def _validate_network_policy(val: object, default: str = "setup_only") -> str: """Validate network_policy, falling back to *default* on bad values.""" s = str(val).strip().lower() if val else default if s not in _VALID_NETWORK_POLICIES: import logging as _cfg_log _cfg_log.getLogger(__name__).warning( "Invalid network_policy %r, using %r", val, default, ) return default return s def _safe_float(val: Any, default: float) -> float: """Convert value to float, handling None/null YAML values. BUG-DA8-11: Also rejects NaN/Inf which YAML can produce via .nan/.inf. """ if val is None: return default try: import math result = float(val) if not math.isfinite(result): return default return result except (ValueError, TypeError): return default EXAMPLE_CONFIG = "config.researchclaw.example.yaml" def resolve_config_path(explicit: str | None) -> Path | None: """Return first existing config from search order, or explicit path if given.""" if explicit is not None: return Path(explicit) for name in CONFIG_SEARCH_ORDER: candidate = Path(name) if candidate.exists(): return candidate return None REQUIRED_FIELDS = ( "project.name", "research.topic", "runtime.timezone", "notifications.channel", "knowledge_base.root", "llm.base_url", "llm.api_key_env", ) KB_SUBDIRS = ( "questions", "literature", "experiments", "findings", "decisions", "reviews", ) PROJECT_MODES = {"docs-first", "semi-auto", "full-auto"} KB_BACKENDS = {"markdown", "obsidian"} EXPERIMENT_MODES = {"simulated", "sandbox", "docker", "ssh_remote", "colab_drive", "agentic"} CLI_AGENT_PROVIDERS = {"llm", "claude_code", "codex"} def _get_by_path(data: dict[str, Any], dotted_key: str) -> Any: cur: Any = data for part in 
dotted_key.split("."): if not isinstance(cur, dict) or part not in cur: return None cur = cur[part] return cur def _is_blank(value: Any) -> bool: return value is None or (isinstance(value, str) and not value.strip()) @dataclass(frozen=True) class ValidationResult: ok: bool errors: tuple[str, ...] = () warnings: tuple[str, ...] = () @dataclass(frozen=True) class ProjectConfig: name: str mode: str = "docs-first" @dataclass(frozen=True) class ResearchConfig: topic: str domains: tuple[str, ...] = () daily_paper_count: int = 0 quality_threshold: float = 0.0 graceful_degradation: bool = True @dataclass(frozen=True) class RuntimeConfig: timezone: str max_parallel_tasks: int = 1 approval_timeout_hours: int = 12 retry_limit: int = 0 @dataclass(frozen=True) class NotificationsConfig: channel: str target: str = "" on_stage_start: bool = False on_stage_fail: bool = False on_gate_required: bool = True @dataclass(frozen=True) class KnowledgeBaseConfig: backend: str root: str obsidian_vault: str = "" @dataclass(frozen=True) class OpenClawBridgeConfig: use_cron: bool = False use_message: bool = False use_memory: bool = False use_sessions_spawn: bool = False use_web_fetch: bool = False use_browser: bool = False @dataclass(frozen=True) class AcpConfig: """ACP (Agent Client Protocol) settings.""" agent: str = "claude" cwd: str = "." acpx_command: str = "" session_name: str = "researchclaw" timeout_sec: int = 1800 @dataclass(frozen=True) class LlmConfig: provider: str base_url: str = "" api_key_env: str = "" api_key: str = "" primary_model: str = "" fallback_models: tuple[str, ...] = () s2_api_key: str = "" notes: str = "" acp: AcpConfig = field(default_factory=AcpConfig) @dataclass(frozen=True) class SecurityConfig: hitl_required_stages: tuple[int, ...] 
@dataclass(frozen=True)
class SandboxConfig:
    # Local-process experiment sandbox limits.
    python_path: str = DEFAULT_PYTHON_PATH
    gpu_required: bool = False
    # Import allowlist applied to generated experiment code.
    allowed_imports: tuple[str, ...] = (
        "math",
        "random",
        "json",
        "csv",
        "numpy",
        "torch",
        "sklearn",
    )
    max_memory_mb: int = 4096


@dataclass(frozen=True)
class SshRemoteConfig:
    # Remote SSH experiment execution (optionally inside Docker on the host).
    host: str = ""
    user: str = ""
    port: int = 22
    key_path: str = ""
    gpu_ids: tuple[int, ...] = ()
    remote_workdir: str = "/tmp/researchclaw_experiments"
    remote_python: str = "python3"
    setup_commands: tuple[str, ...] = ()
    use_docker: bool = False
    docker_image: str = "researchclaw/experiment:latest"
    docker_network_policy: str = "none"
    docker_memory_limit_mb: int = 8192
    docker_shm_size_mb: int = 2048
    timeout_sec: int = 600  # default 10 min for experiment execution
    scp_timeout_sec: int = 300  # default 5 min for file uploads
    setup_timeout_sec: int = 300  # default 5 min for setup commands


@dataclass(frozen=True)
class ColabDriveConfig:
    """Configuration for Google Drive-based async Colab execution."""

    drive_root: str = ""  # local mount path, e.g. ~/Google Drive/MyDrive/researchclaw
    poll_interval_sec: int = 30
    timeout_sec: int = 3600
    setup_script: str = ""  # commands to run before experiment, written to setup.sh


@dataclass(frozen=True)
class DockerSandboxConfig:
    """Configuration for Docker-based experiment sandbox."""

    image: str = "researchclaw/experiment:latest"
    gpu_enabled: bool = True
    gpu_device_ids: tuple[int, ...] = ()
    memory_limit_mb: int = 8192
    network_policy: str = "setup_only"  # none | setup_only | pip_only | full
    pip_pre_install: tuple[str, ...] = ()
    auto_install_deps: bool = True
    shm_size_mb: int = 2048
    container_python: str = "/usr/bin/python3"
    keep_containers: bool = False
@dataclass(frozen=True)
class AgenticConfig:
    """Configuration for the agentic experiment mode.

    Launches a coding agent (e.g. Claude Code) inside a Docker container
    with full shell access so it can run arbitrary CLI commands, write
    code, and iteratively complete the experiment.
    """

    image: str = "researchclaw/experiment:latest"
    agent_cli: str = "claude"
    agent_install_cmd: str = "npm install -g @anthropic-ai/claude-code"
    network_policy: str = "full"  # Agent needs network access
    timeout_sec: int = 1800  # 30 min per session
    memory_limit_mb: int = 8192
    gpu_enabled: bool = False
    mount_skills: bool = True
    allow_shell_commands: bool = True
    max_turns: int = 50


@dataclass(frozen=True)
class CodeAgentConfig:
    """Configuration for the advanced multi-phase code generation agent."""

    enabled: bool = True
    # Phase 1: Blueprint planning (deep implementation blueprint)
    architecture_planning: bool = True
    # Phase 2: Sequential file generation (one-by-one following blueprint)
    sequential_generation: bool = True
    # Phase 2.5: Hard validation gates (AST-based)
    hard_validation: bool = True
    hard_validation_max_repairs: int = 4
    # Phase 3: Execution-in-the-loop (run → parse error → fix)
    exec_fix_max_iterations: int = 3
    exec_fix_timeout_sec: int = 60
    # Phase 4: Solution tree search (off by default — higher cost)
    tree_search_enabled: bool = False
    tree_search_candidates: int = 3
    tree_search_max_depth: int = 2
    tree_search_eval_timeout_sec: int = 120
    # Phase 5: Multi-agent review dialog
    review_max_rounds: int = 2
@dataclass(frozen=True)
class OpenCodeConfig:
    """OpenCode 'Beast Mode' — external AI coding agent for complex experiments.

    Requires: npm i -g opencode-ai@latest
    """

    enabled: bool = True
    auto: bool = True  # Auto-trigger without user confirmation
    complexity_threshold: float = 0.2  # 0.0-1.0
    model: str = ""  # Empty = use llm.primary_model
    timeout_sec: int = 600  # Max seconds for opencode run
    max_retries: int = 1
    workspace_cleanup: bool = True


@dataclass(frozen=True)
class BenchmarkAgentConfig:
    """Configuration for the BenchmarkAgent multi-agent system."""

    enabled: bool = True
    # Surveyor
    enable_hf_search: bool = True
    max_hf_results: int = 10
    # Surveyor — web search
    enable_web_search: bool = True
    max_web_results: int = 5
    web_search_min_local: int = 3  # skip web search when local benchmarks >= this
    # Selector
    tier_limit: int = 2
    min_benchmarks: int = 1
    min_baselines: int = 2
    prefer_cached: bool = True
    # Orchestrator
    max_iterations: int = 2


@dataclass(frozen=True)
class FigureAgentConfig:
    """Configuration for the FigureAgent multi-agent system."""

    enabled: bool = True
    # Planner
    min_figures: int = 3
    max_figures: int = 8
    # Orchestrator
    max_iterations: int = 3  # max CodeGen→Renderer→Critic retry loops
    # Renderer security
    render_timeout_sec: int = 30
    use_docker: bool | None = None  # None = auto-detect, True/False to force
    docker_image: str = "researchclaw/experiment:latest"
    # Code generation output format
    output_format: str = "python"  # "python" (matplotlib) or "latex" (TikZ/PGFPlots)
    # Nano Banana (Gemini image generation)
    gemini_api_key: str = ""  # or set GEMINI_API_KEY / GOOGLE_API_KEY env var
    gemini_model: str = "gemini-2.5-flash-image"
    nano_banana_enabled: bool = True  # enable/disable Gemini image generation
    # Critic
    strict_mode: bool = False
    # Output
    dpi: int = 300
@dataclass(frozen=True)
class ExperimentRepairConfig:
    """Experiment repair loop — diagnose and fix failed experiments before paper writing.

    When enabled, after Stage 14 (result_analysis) the pipeline:
    1. Diagnoses experiment failures (missing deps, crashes, OOM, time guard, etc.)
    2. Assesses experiment quality (full_paper / preliminary_study / technical_report)
    3. If quality is insufficient, generates targeted repair prompts
    4. Re-runs experiment with fixes, up to ``max_cycles`` times
    5. Selects best results across all cycles for paper writing
    """

    enabled: bool = True
    max_cycles: int = 3
    min_completion_rate: float = 0.5  # At least 50% conditions must complete
    min_conditions: int = 2  # At least 2 conditions for a valid experiment
    use_opencode: bool = True  # Use OpenCode agent for repairs (vs LLM prompt)
    timeout_sec_per_cycle: int = 600  # Max time per repair cycle


@dataclass(frozen=True)
class CliAgentConfig:
    """CLI-based code generation backend for Stages 10 & 13.

    provider:
      "llm" — use existing LLM chat API (default, backward-compatible)
      "claude_code" — Claude Code CLI (``claude -p``)
      "codex" — OpenAI Codex CLI (``codex exec``)

    Auth for claude_code: ANTHROPIC_AUTH_TOKEN + ANTHROPIC_BASE_URL env vars.
    Auth for codex: OPENAI_API_KEY env var.
    """

    provider: str = "llm"
    binary_path: str = ""  # auto-detected via PATH if empty
    model: str = ""  # model override for the CLI agent
    max_budget_usd: float = 5.0
    timeout_sec: int = 600
    extra_args: tuple[str, ...] = ()
@dataclass(frozen=True)
class ExperimentConfig:
    # Experiment execution settings; mode must be one of EXPERIMENT_MODES.
    mode: str = "simulated"
    time_budget_sec: int = 300
    max_iterations: int = 10
    max_refine_duration_sec: int = 0  # 0 = auto (3× time_budget_sec)
    metric_key: str = "primary_metric"
    metric_direction: str = "minimize"  # "minimize" or "maximize"
    keep_threshold: float = 0.0
    # Per-mode sub-configurations (only the one matching `mode` is used).
    sandbox: SandboxConfig = field(default_factory=SandboxConfig)
    docker: DockerSandboxConfig = field(default_factory=DockerSandboxConfig)
    agentic: AgenticConfig = field(default_factory=AgenticConfig)
    ssh_remote: SshRemoteConfig = field(default_factory=SshRemoteConfig)
    colab_drive: ColabDriveConfig = field(default_factory=ColabDriveConfig)
    code_agent: CodeAgentConfig = field(default_factory=CodeAgentConfig)
    opencode: OpenCodeConfig = field(default_factory=OpenCodeConfig)
    benchmark_agent: BenchmarkAgentConfig = field(default_factory=BenchmarkAgentConfig)
    figure_agent: FigureAgentConfig = field(default_factory=FigureAgentConfig)
    repair: ExperimentRepairConfig = field(default_factory=ExperimentRepairConfig)
    cli_agent: CliAgentConfig = field(default_factory=CliAgentConfig)


@dataclass(frozen=True)
class MetaClawPRMConfig:
    """PRM quality gate settings for MetaClaw bridge."""

    enabled: bool = False
    api_base: str = ""
    api_key_env: str = ""
    api_key: str = ""
    model: str = "gpt-5.4"
    votes: int = 3
    temperature: float = 0.6
    gate_stages: tuple[int, ...] = (5, 9, 15, 20)
@dataclass(frozen=True)
class MetaClawLessonToSkillConfig:
    """Settings for converting lessons into MetaClaw skills."""

    enabled: bool = True
    min_severity: str = "warning"
    max_skills_per_run: int = 3


@dataclass(frozen=True)
class MetaClawBridgeConfig:
    """MetaClaw integration bridge configuration."""

    enabled: bool = False
    proxy_url: str = "http://localhost:30000"
    skills_dir: str = "~/.metaclaw/skills"
    fallback_url: str = ""
    fallback_api_key: str = ""
    prm: MetaClawPRMConfig = field(default_factory=MetaClawPRMConfig)
    lesson_to_skill: MetaClawLessonToSkillConfig = field(
        default_factory=MetaClawLessonToSkillConfig
    )


@dataclass(frozen=True)
class WebSearchConfig:
    """Configuration for web search and crawling capabilities."""

    enabled: bool = True
    tavily_api_key: str = ""
    tavily_api_key_env: str = "TAVILY_API_KEY"
    enable_scholar: bool = True
    enable_crawling: bool = True
    enable_pdf_extraction: bool = True
    max_web_results: int = 10
    max_scholar_results: int = 10
    max_crawl_urls: int = 5


@dataclass(frozen=True)
class ExportConfig:
    """Configuration for paper export and LaTeX generation."""

    target_conference: str = "neurips_2025"
    authors: str = "Anonymous"
    bib_file: str = "references"


@dataclass(frozen=True)
class PromptsConfig:
    """Configuration for prompt externalization."""

    custom_file: str = ""  # Path to custom prompts YAML (empty = use defaults)


# ── Agent B: Intelligence & Memory configs ────────────────────────
@dataclass(frozen=True)
class MemoryConfig:
    """Configuration for the persistent evolutionary memory system."""

    enabled: bool = True
    store_dir: str = ".researchclaw/memory"
    embedding_model: str = "text-embedding-3-small"
    max_entries_per_category: int = 500
    decay_half_life_days: int = 90
    confidence_threshold: float = 0.3
    inject_at_stages: tuple[int, ...] = (1, 9, 10, 17)
@dataclass(frozen=True)
class SkillsConfig:
    """Configuration for the dynamic skills library."""

    enabled: bool = True
    builtin_dir: str = ""  # empty = use package default
    custom_dirs: tuple[str, ...] = ()
    external_dirs: tuple[str, ...] = ()
    auto_match: bool = True
    max_skills_per_stage: int = 3
    fallback_matching: bool = True


@dataclass(frozen=True)
class KnowledgeGraphConfig:
    """Configuration for the research knowledge graph."""

    enabled: bool = False
    store_path: str = ".researchclaw/knowledge_graph"
    max_entities: int = 10000
    auto_update: bool = True


# ── Web platform configs (Agent A) ──────────────────────────────
@dataclass(frozen=True)
class ServerConfig:
    """Web server configuration."""

    enabled: bool = False
    host: str = "0.0.0.0"
    port: int = 8080
    cors_origins: tuple[str, ...] = ("*",)
    auth_token: str = ""  # empty = no authentication
    voice_enabled: bool = False
    whisper_model: str = "whisper-1"
    whisper_api_url: str = ""  # empty = use OpenAI default


@dataclass(frozen=True)
class DashboardConfig:
    """Dashboard configuration."""

    enabled: bool = True
    refresh_interval_sec: int = 5
    max_log_lines: int = 1000
    browser_notifications: bool = True


# ── Agent C: Infrastructure configs ────────────────────────────────
@dataclass(frozen=True)
class MultiProjectConfig:
    """C1: Multi-project parallel management."""

    enabled: bool = False
    projects_dir: str = ".researchclaw/projects"
    max_concurrent: int = 2
    shared_knowledge: bool = True


@dataclass(frozen=True)
class ServerEntryConfig:
    """Single compute server entry for C2."""

    name: str = ""
    host: str = ""
    server_type: str = "ssh"
    gpu: str = ""
    vram_gb: int = 0
    priority: int = 1
    cost_per_hour: float = 0.0
    scheduler: str = ""
    cloud_provider: str = ""


@dataclass(frozen=True)
class ServersConfig:
    """C2: Multi-server resource scheduling."""

    enabled: bool = False
    servers: tuple[ServerEntryConfig, ...] = ()
    prefer_free: bool = True
    failover: bool = True
    monitor_interval_sec: int = 60
@dataclass(frozen=True)
class MCPIntegrationConfig:
    """C3: MCP standardized integration."""

    server_enabled: bool = False
    server_port: int = 3000
    server_transport: str = "stdio"
    external_servers: tuple[dict, ...] = ()


@dataclass(frozen=True)
class OverleafConfig:
    """C4: Overleaf bidirectional sync."""

    enabled: bool = False
    git_url: str = ""
    branch: str = "main"
    auto_push: bool = True
    auto_pull: bool = False
    poll_interval_sec: int = 300


# Valid values for CoPilotConfig.mode.
COPILOT_MODES = ("co-pilot", "auto-pilot", "zero-touch")


@dataclass(frozen=True)
class TrendsConfig:
    """D1: Research trend tracking."""

    enabled: bool = False
    domains: tuple[str, ...] = ()
    daily_digest: bool = True
    digest_time: str = "08:00"
    max_papers_per_day: int = 20
    trend_window_days: int = 30
    sources: tuple[str, ...] = ("arxiv", "semantic_scholar")


@dataclass(frozen=True)
class CoPilotConfig:
    """D2: Interactive co-pilot mode."""

    mode: str = "auto-pilot"  # one of COPILOT_MODES
    pause_at_gates: bool = True
    pause_at_every_stage: bool = False
    feedback_timeout_sec: int = 3600
    allow_branching: bool = True
    max_branches: int = 3


@dataclass(frozen=True)
class QualityAssessorConfig:
    """D3: Paper quality assessor."""

    enabled: bool = True
    dimensions: tuple[str, ...] = (
        "novelty", "rigor", "clarity", "impact", "experiments"
    )
    venue_recommendation: bool = True
    score_history: bool = True


@dataclass(frozen=True)
class CalendarConfig:
    """D4: Conference deadline calendar."""

    enabled: bool = False
    target_venues: tuple[str, ...] = ()
    reminder_days_before: tuple[int, ...] = (30, 14, 7, 3, 1)
    auto_plan: bool = True
@dataclass(frozen=True)
class RCConfig:
    """Root ResearchClaw configuration aggregating all sub-configs.

    Construct via :meth:`load` (YAML file) or :meth:`from_dict` (raw
    mapping); both validate first and raise ``ValueError`` on errors.
    """

    project: ProjectConfig
    research: ResearchConfig
    runtime: RuntimeConfig
    notifications: NotificationsConfig
    knowledge_base: KnowledgeBaseConfig
    openclaw_bridge: OpenClawBridgeConfig
    llm: LlmConfig
    security: SecurityConfig = field(default_factory=SecurityConfig)
    experiment: ExperimentConfig = field(default_factory=ExperimentConfig)
    export: ExportConfig = field(default_factory=ExportConfig)
    prompts: PromptsConfig = field(default_factory=PromptsConfig)
    web_search: WebSearchConfig = field(default_factory=WebSearchConfig)
    metaclaw_bridge: MetaClawBridgeConfig = field(
        default_factory=MetaClawBridgeConfig
    )
    # Agent B: Intelligence & Memory
    memory: MemoryConfig = field(default_factory=MemoryConfig)
    skills: SkillsConfig = field(default_factory=SkillsConfig)
    knowledge_graph: KnowledgeGraphConfig = field(default_factory=KnowledgeGraphConfig)
    # Agent C: Infrastructure
    multi_project: MultiProjectConfig = field(default_factory=MultiProjectConfig)
    compute_servers: ServersConfig = field(default_factory=ServersConfig)
    mcp: MCPIntegrationConfig = field(default_factory=MCPIntegrationConfig)
    overleaf: OverleafConfig = field(default_factory=OverleafConfig)
    # Agent A: Web platform
    server: ServerConfig = field(default_factory=ServerConfig)
    dashboard: DashboardConfig = field(default_factory=DashboardConfig)
    # Agent D: Research Enhancement
    trends: TrendsConfig = field(default_factory=TrendsConfig)
    copilot: CoPilotConfig = field(default_factory=CoPilotConfig)
    quality_assessor: QualityAssessorConfig = field(default_factory=QualityAssessorConfig)
    calendar: CalendarConfig = field(default_factory=CalendarConfig)

    def to_dict(self) -> dict[str, Any]:
        # Plain nested-dict view (dataclasses.asdict recurses into sub-configs).
        return asdict(self)

    @classmethod
    def from_dict(
        cls,
        data: dict[str, Any],
        *,
        project_root: Path | None = None,
        check_paths: bool = True,
    ) -> RCConfig:
        # Validate the raw mapping first; raise with all errors joined so the
        # user sees every problem at once rather than one per attempt.
        result = validate_config(
            data, project_root=project_root, check_paths=check_paths
        )
        if not result.ok:
            raise ValueError("; ".join(result.errors))
        # Required sections — validate_config guarantees these keys exist.
        project = data["project"]
        research = data["research"]
        runtime = data["runtime"]
        notifications = data["notifications"]
        knowledge_base = data["knowledge_base"]
        bridge = data.get("openclaw_bridge") or {}
        llm = data["llm"]
        # Optional sections — `or {}` also normalises explicit YAML nulls.
        security = data.get("security") or {}
        experiment = data.get("experiment") or {}
        export = data.get("export") or {}
        prompts = data.get("prompts") or {}
        web_search = data.get("web_search") or {}
        metaclaw = data.get("metaclaw_bridge") or {}
        memory_data = data.get("memory") or {}
        skills_data = data.get("skills") or {}
        knowledge_graph_data = data.get("knowledge_graph") or {}
        multi_project = data.get("multi_project") or {}
        compute_servers = data.get("compute_servers") or {}
        mcp_data = data.get("mcp") or {}
        overleaf = data.get("overleaf") or {}
        server = data.get("server") or {}
        dashboard_data = data.get("dashboard") or {}
        trends_data = data.get("trends") or {}
        copilot_data = data.get("copilot") or {}
        quality_assessor_data = data.get("quality_assessor") or {}
        calendar_data = data.get("calendar") or {}
        return cls(
            project=ProjectConfig(
                name=project["name"], mode=project.get("mode", "docs-first")
            ),
            research=ResearchConfig(
                topic=research["topic"],
                domains=tuple(research.get("domains") or ()),
                daily_paper_count=int(research.get("daily_paper_count", 0)),
                quality_threshold=float(research.get("quality_threshold", 0.0)),
                graceful_degradation=bool(research.get("graceful_degradation", True)),
            ),
            runtime=RuntimeConfig(
                timezone=runtime["timezone"],
                max_parallel_tasks=int(runtime.get("max_parallel_tasks", 1)),
                approval_timeout_hours=int(runtime.get("approval_timeout_hours", 12)),
                retry_limit=int(runtime.get("retry_limit", 0)),
            ),
            notifications=NotificationsConfig(
                channel=notifications["channel"],
                target=notifications.get("target", ""),
                on_stage_start=bool(notifications.get("on_stage_start", False)),
                on_stage_fail=bool(notifications.get("on_stage_fail", False)),
                on_gate_required=bool(notifications.get("on_gate_required", True)),
            ),
            knowledge_base=KnowledgeBaseConfig(
                backend=knowledge_base.get("backend", "markdown"),
                root=knowledge_base["root"],
                obsidian_vault=knowledge_base.get("obsidian_vault", ""),
            ),
            openclaw_bridge=OpenClawBridgeConfig(
                use_cron=bool(bridge.get("use_cron", False)),
                use_message=bool(bridge.get("use_message", False)),
                use_memory=bool(bridge.get("use_memory", False)),
                use_sessions_spawn=bool(bridge.get("use_sessions_spawn", False)),
                use_web_fetch=bool(bridge.get("use_web_fetch", False)),
                use_browser=bool(bridge.get("use_browser", False)),
            ),
            llm=_parse_llm_config(llm),
            security=SecurityConfig(
                hitl_required_stages=tuple(
                    int(s) for s in security.get("hitl_required_stages", (5, 9, 20))
                ),
                allow_publish_without_approval=bool(
                    security.get("allow_publish_without_approval", False)
                ),
                redact_sensitive_logs=bool(security.get("redact_sensitive_logs", True)),
            ),
            experiment=_parse_experiment_config(experiment),
            export=ExportConfig(
                target_conference=export.get("target_conference", "neurips_2025"),
                authors=export.get("authors", "Anonymous"),
                bib_file=export.get("bib_file", "references"),
            ),
            prompts=PromptsConfig(
                custom_file=prompts.get("custom_file", ""),
            ),
            web_search=WebSearchConfig(
                enabled=bool(web_search.get("enabled", True)),
                tavily_api_key=str(web_search.get("tavily_api_key", "")),
                tavily_api_key_env=str(web_search.get("tavily_api_key_env", "TAVILY_API_KEY")),
                enable_scholar=bool(web_search.get("enable_scholar", True)),
                enable_crawling=bool(web_search.get("enable_crawling", True)),
                enable_pdf_extraction=bool(web_search.get("enable_pdf_extraction", True)),
                max_web_results=int(web_search.get("max_web_results", 10)),
                max_scholar_results=int(web_search.get("max_scholar_results", 10)),
                max_crawl_urls=int(web_search.get("max_crawl_urls", 5)),
            ),
            metaclaw_bridge=_parse_metaclaw_bridge_config(metaclaw),
            memory=_parse_memory_config(memory_data),
            skills=_parse_skills_config(skills_data),
            knowledge_graph=_parse_knowledge_graph_config(knowledge_graph_data),
            multi_project=_parse_multi_project_config(multi_project),
            compute_servers=_parse_servers_config(compute_servers),
            mcp=_parse_mcp_config(mcp_data),
            overleaf=_parse_overleaf_config(overleaf),
            server=_parse_server_config(server),
            dashboard=_parse_dashboard_config(dashboard_data),
            trends=_parse_trends_config(trends_data),
            copilot=_parse_copilot_config(copilot_data),
            quality_assessor=_parse_quality_assessor_config(quality_assessor_data),
            calendar=_parse_calendar_config(calendar_data),
        )

    @classmethod
    def load(
        cls,
        path: str | Path,
        *,
        project_root: str | Path | None = None,
        check_paths: bool = True,
    ) -> RCConfig:
        # Load YAML from *path* and build a validated RCConfig. The project
        # root defaults to the config file's own directory.
        config_path = Path(path).expanduser().resolve()
        with config_path.open(encoding="utf-8") as handle:
            data = yaml.safe_load(handle) or {}
        if not isinstance(data, dict):
            raise ValueError(
                f"Config root must be a mapping, got {type(data).__name__}. "
                f"Check that {config_path} is valid YAML."
            )
        resolved_root = (
            Path(project_root).expanduser().resolve()
            if project_root
            else config_path.parent
        )
        return cls.from_dict(data, project_root=resolved_root, check_paths=check_paths)
"security.hitl_required_stages") if hitl_required_stages is not None: if not isinstance(hitl_required_stages, list): errors.append("security.hitl_required_stages must be a list") else: for stage in hitl_required_stages: if not isinstance(stage, int) or not 1 <= stage <= 23: errors.append( f"Invalid security.hitl_required_stages entry: {stage}" ) exp_mode = _get_by_path(data, "experiment.mode") if not _is_blank(exp_mode) and exp_mode not in EXPERIMENT_MODES: errors.append(f"Invalid experiment.mode: {exp_mode}") exp_direction = _get_by_path(data, "experiment.metric_direction") if not _is_blank(exp_direction) and exp_direction not in ("minimize", "maximize"): errors.append(f"Invalid experiment.metric_direction: {exp_direction}") cli_agent_provider = _get_by_path(data, "experiment.cli_agent.provider") if not _is_blank(cli_agent_provider) and cli_agent_provider not in CLI_AGENT_PROVIDERS: errors.append(f"Invalid experiment.cli_agent.provider: {cli_agent_provider}") kb_root_raw = _get_by_path(data, "knowledge_base.root") if check_paths and not _is_blank(kb_root_raw) and project_root is not None: kb_root = project_root / str(kb_root_raw) if not kb_root.exists(): errors.append(f"Missing path: {kb_root}") else: for subdir in KB_SUBDIRS: candidate = kb_root / subdir if not candidate.exists(): warnings.append(f"Missing recommended kb subdir: {candidate}") return ValidationResult( ok=not errors, errors=tuple(errors), warnings=tuple(warnings) ) def _parse_llm_config(data: dict[str, Any]) -> LlmConfig: acp_data = data.get("acp") or {} return LlmConfig( provider=data.get("provider", "openai-compatible"), base_url=data.get("base_url", ""), api_key_env=data.get("api_key_env", ""), api_key=data.get("api_key", ""), primary_model=data.get("primary_model", ""), fallback_models=tuple(data.get("fallback_models") or ()), s2_api_key=data.get("s2_api_key", ""), notes=data.get("notes", ""), acp=AcpConfig( agent=acp_data.get("agent", "claude"), cwd=acp_data.get("cwd", "."), 
acpx_command=acp_data.get("acpx_command", ""), session_name=acp_data.get("session_name", "researchclaw"), timeout_sec=int(acp_data.get("timeout_sec", 1800)), ), ) def _parse_agentic_config(data: dict[str, Any]) -> AgenticConfig: if not data: return AgenticConfig() return AgenticConfig( image=data.get("image", "researchclaw/experiment:latest"), agent_cli=data.get("agent_cli", "claude"), agent_install_cmd=data.get( "agent_install_cmd", "npm install -g @anthropic-ai/claude-code" ), network_policy=data.get("network_policy", "full"), timeout_sec=int(data.get("timeout_sec", 1800)), memory_limit_mb=int(data.get("memory_limit_mb", 8192)), gpu_enabled=bool(data.get("gpu_enabled", False)), mount_skills=bool(data.get("mount_skills", True)), allow_shell_commands=bool(data.get("allow_shell_commands", True)), max_turns=int(data.get("max_turns", 50)), ) def _parse_experiment_config(data: dict[str, Any]) -> ExperimentConfig: sandbox_data = data.get("sandbox") or {} docker_data = data.get("docker") or {} ssh_data = data.get("ssh_remote") or {} colab_data = data.get("colab_drive") or {} return ExperimentConfig( mode=data.get("mode", "simulated"), time_budget_sec=_safe_int(data.get("time_budget_sec"), 300), max_iterations=_safe_int(data.get("max_iterations"), 10), max_refine_duration_sec=_safe_int(data.get("max_refine_duration_sec"), 0), metric_key=data.get("metric_key", "primary_metric"), metric_direction=data.get("metric_direction", "minimize"), keep_threshold=_safe_float(data.get("keep_threshold"), 0.0), sandbox=SandboxConfig( python_path=sandbox_data.get("python_path", DEFAULT_PYTHON_PATH), gpu_required=bool(sandbox_data.get("gpu_required", False)), allowed_imports=tuple( sandbox_data.get("allowed_imports", SandboxConfig.allowed_imports) ), max_memory_mb=_safe_int(sandbox_data.get("max_memory_mb"), 4096), ), docker=DockerSandboxConfig( image=docker_data.get("image", "researchclaw/experiment:latest"), gpu_enabled=bool(docker_data.get("gpu_enabled", True)), gpu_device_ids=tuple( 
int(g) for g in docker_data.get("gpu_device_ids", ()) ), memory_limit_mb=_safe_int(docker_data.get("memory_limit_mb"), 8192), network_policy=_validate_network_policy( docker_data.get("network_policy", "setup_only"), ), pip_pre_install=tuple(docker_data.get("pip_pre_install", ())), auto_install_deps=bool(docker_data.get("auto_install_deps", True)), shm_size_mb=_safe_int(docker_data.get("shm_size_mb"), 2048), container_python=docker_data.get("container_python", "/usr/bin/python3"), keep_containers=bool(docker_data.get("keep_containers", False)), ), ssh_remote=SshRemoteConfig( host=ssh_data.get("host", ""), user=ssh_data.get("user", ""), port=_safe_int(ssh_data.get("port"), 22), key_path=ssh_data.get("key_path", ""), gpu_ids=tuple(int(g) for g in ssh_data.get("gpu_ids", ())), remote_workdir=ssh_data.get( "remote_workdir", "/tmp/researchclaw_experiments" ), remote_python=ssh_data.get("remote_python", "python3"), setup_commands=tuple(ssh_data.get("setup_commands") or ()), use_docker=bool(ssh_data.get("use_docker", False)), docker_image=ssh_data.get("docker_image", "researchclaw/experiment:latest"), docker_network_policy=_validate_network_policy( ssh_data.get("docker_network_policy", "none"), ), docker_memory_limit_mb=_safe_int(ssh_data.get("docker_memory_limit_mb"), 8192), docker_shm_size_mb=_safe_int(ssh_data.get("docker_shm_size_mb"), 2048), timeout_sec=_safe_int(ssh_data.get("timeout_sec"), 600), scp_timeout_sec=_safe_int(ssh_data.get("scp_timeout_sec"), 300), setup_timeout_sec=_safe_int(ssh_data.get("setup_timeout_sec"), 300), ), colab_drive=ColabDriveConfig( drive_root=colab_data.get("drive_root", ""), poll_interval_sec=_safe_int(colab_data.get("poll_interval_sec"), 30), timeout_sec=_safe_int(colab_data.get("timeout_sec"), 3600), setup_script=colab_data.get("setup_script", ""), ), agentic=_parse_agentic_config(data.get("agentic") or {}), code_agent=_parse_code_agent_config(data.get("code_agent") or {}), opencode=_parse_opencode_config(data.get("opencode") or {}), 
        benchmark_agent=_parse_benchmark_agent_config(
            data.get("benchmark_agent") or {}
        ),
        figure_agent=_parse_figure_agent_config(data.get("figure_agent") or {}),
        repair=_parse_experiment_repair_config(data.get("repair") or {}),
        cli_agent=_parse_cli_agent_config(data.get("cli_agent") or {}),
    )


def _parse_benchmark_agent_config(data: dict[str, Any]) -> BenchmarkAgentConfig:
    """Build a BenchmarkAgentConfig; an empty section yields all defaults."""
    if not data:
        return BenchmarkAgentConfig()
    return BenchmarkAgentConfig(
        enabled=bool(data.get("enabled", True)),
        enable_hf_search=bool(data.get("enable_hf_search", True)),
        max_hf_results=_safe_int(data.get("max_hf_results"), 10),
        enable_web_search=bool(data.get("enable_web_search", True)),
        max_web_results=_safe_int(data.get("max_web_results"), 5),
        web_search_min_local=_safe_int(data.get("web_search_min_local"), 3),
        tier_limit=_safe_int(data.get("tier_limit"), 2),
        min_benchmarks=_safe_int(data.get("min_benchmarks"), 1),
        min_baselines=_safe_int(data.get("min_baselines"), 2),
        prefer_cached=bool(data.get("prefer_cached", True)),
        max_iterations=_safe_int(data.get("max_iterations"), 2),
    )


def _parse_figure_agent_config(data: dict[str, Any]) -> FigureAgentConfig:
    """Build a FigureAgentConfig; an empty section yields all defaults."""
    if not data:
        return FigureAgentConfig()
    # use_docker is tri-state: None means "auto", so a plain bool() cast
    # on a missing key would lose that distinction.
    use_docker_raw = data.get("use_docker", None)
    return FigureAgentConfig(
        enabled=bool(data.get("enabled", True)),
        min_figures=_safe_int(data.get("min_figures"), 3),
        max_figures=_safe_int(data.get("max_figures"), 8),
        max_iterations=_safe_int(data.get("max_iterations"), 3),
        render_timeout_sec=_safe_int(data.get("render_timeout_sec"), 30),
        use_docker=(
            None if use_docker_raw is None else bool(use_docker_raw)
        ),
        docker_image=data.get("docker_image", "researchclaw/experiment:latest"),
        output_format=data.get("output_format", "python"),
        gemini_api_key=data.get("gemini_api_key", ""),
        gemini_model=data.get("gemini_model", "gemini-2.5-flash-image"),
        nano_banana_enabled=bool(data.get("nano_banana_enabled", True)),
        strict_mode=bool(data.get("strict_mode", False)),
        dpi=_safe_int(data.get("dpi"), 300),
    )


def _parse_experiment_repair_config(data: dict[str, Any]) -> ExperimentRepairConfig:
    """Build an ExperimentRepairConfig; an empty section yields all defaults."""
    if not data:
        return ExperimentRepairConfig()
    return ExperimentRepairConfig(
        enabled=bool(data.get("enabled", True)),
        max_cycles=_safe_int(data.get("max_cycles"), 3),
        min_completion_rate=_safe_float(data.get("min_completion_rate"), 0.5),
        min_conditions=_safe_int(data.get("min_conditions"), 2),
        use_opencode=bool(data.get("use_opencode", True)),
        timeout_sec_per_cycle=_safe_int(data.get("timeout_sec_per_cycle"), 600),
    )


def _parse_cli_agent_config(data: dict[str, Any]) -> CliAgentConfig:
    """Build a CliAgentConfig; an empty section yields all defaults."""
    if not data:
        return CliAgentConfig()
    return CliAgentConfig(
        provider=data.get("provider", "llm"),
        binary_path=data.get("binary_path", ""),
        model=data.get("model", ""),
        max_budget_usd=_safe_float(data.get("max_budget_usd"), 5.0),
        timeout_sec=_safe_int(data.get("timeout_sec"), 600),
        extra_args=tuple(data.get("extra_args") or ()),
    )


def _parse_code_agent_config(data: dict[str, Any]) -> CodeAgentConfig:
    """Build a CodeAgentConfig; an empty section yields all defaults."""
    if not data:
        return CodeAgentConfig()
    return CodeAgentConfig(
        enabled=bool(data.get("enabled", True)),
        architecture_planning=bool(data.get("architecture_planning", True)),
        sequential_generation=bool(data.get("sequential_generation", True)),
        hard_validation=bool(data.get("hard_validation", True)),
        hard_validation_max_repairs=_safe_int(data.get("hard_validation_max_repairs"), 4),
        exec_fix_max_iterations=_safe_int(data.get("exec_fix_max_iterations"), 3),
        exec_fix_timeout_sec=_safe_int(data.get("exec_fix_timeout_sec"), 60),
        tree_search_enabled=bool(data.get("tree_search_enabled", False)),
        tree_search_candidates=_safe_int(data.get("tree_search_candidates"), 3),
        tree_search_max_depth=_safe_int(data.get("tree_search_max_depth"), 2),
        tree_search_eval_timeout_sec=_safe_int(
            data.get("tree_search_eval_timeout_sec"), 120
        ),
        review_max_rounds=_safe_int(data.get("review_max_rounds"), 2),
    )


def _parse_opencode_config(data: dict[str, Any]) -> OpenCodeConfig:
    """Build an OpenCodeConfig; an empty section yields all defaults."""
    if not data:
        return OpenCodeConfig()
    return OpenCodeConfig(
        enabled=bool(data.get("enabled", True)),
        auto=bool(data.get("auto", True)),
        complexity_threshold=_safe_float(data.get("complexity_threshold"), 0.2),
        model=str(data.get("model", "")),
        timeout_sec=_safe_int(data.get("timeout_sec"), 600),
        max_retries=_safe_int(data.get("max_retries"), 1),
        workspace_cleanup=bool(data.get("workspace_cleanup", True)),
    )


def _parse_metaclaw_bridge_config(data: dict[str, Any]) -> MetaClawBridgeConfig:
    """Build a MetaClawBridgeConfig with nested PRM and lesson-to-skill parts."""
    prm_data = data.get("prm") or {}
    l2s_data = data.get("lesson_to_skill") or {}
    return MetaClawBridgeConfig(
        enabled=bool(data.get("enabled", False)),
        proxy_url=data.get("proxy_url", "http://localhost:30000"),
        skills_dir=data.get("skills_dir", "~/.metaclaw/skills"),
        fallback_url=data.get("fallback_url", ""),
        fallback_api_key=data.get("fallback_api_key", ""),
        prm=MetaClawPRMConfig(
            enabled=bool(prm_data.get("enabled", False)),
            api_base=prm_data.get("api_base", ""),
            api_key_env=prm_data.get("api_key_env", ""),
            api_key=prm_data.get("api_key", ""),
            model=prm_data.get("model", "gpt-5.4"),
            votes=_safe_int(prm_data.get("votes"), 3),
            temperature=_safe_float(prm_data.get("temperature"), 0.6),
            gate_stages=tuple(
                int(s) for s in prm_data.get("gate_stages", (5, 9, 15, 20))
            ),
        ),
        lesson_to_skill=MetaClawLessonToSkillConfig(
            enabled=bool(l2s_data.get("enabled", True)),
            min_severity=l2s_data.get("min_severity", "warning"),
            max_skills_per_run=_safe_int(l2s_data.get("max_skills_per_run"), 3),
        ),
    )


def _parse_memory_config(data: dict[str, Any]) -> MemoryConfig:
    """Build a MemoryConfig; an empty section yields all defaults."""
    if not data:
        return MemoryConfig()
    stages = data.get("inject_at_stages", (1, 9, 10, 17))
    return MemoryConfig(
        enabled=bool(data.get("enabled", True)),
        store_dir=str(data.get("store_dir", ".researchclaw/memory")),
        embedding_model=str(data.get("embedding_model", "text-embedding-3-small")),
        max_entries_per_category=int(data.get("max_entries_per_category", 500)),
        decay_half_life_days=int(data.get("decay_half_life_days", 90)),
        confidence_threshold=float(data.get("confidence_threshold", 0.3)),
        inject_at_stages=tuple(int(s) for s in stages),
    )


def _parse_skills_config(data: dict[str, Any]) -> SkillsConfig:
    """Build a SkillsConfig; an empty section yields all defaults."""
    if not data:
        return SkillsConfig()
    return SkillsConfig(
        enabled=bool(data.get("enabled", True)),
        builtin_dir=str(data.get("builtin_dir", "")),
        custom_dirs=tuple(str(d) for d in (data.get("custom_dirs") or ())),
        external_dirs=tuple(str(d) for d in (data.get("external_dirs") or ())),
        auto_match=bool(data.get("auto_match", True)),
        max_skills_per_stage=int(data.get("max_skills_per_stage", 3)),
        fallback_matching=bool(data.get("fallback_matching", True)),
    )


def _parse_knowledge_graph_config(data: dict[str, Any]) -> KnowledgeGraphConfig:
    """Build a KnowledgeGraphConfig; an empty section yields all defaults."""
    if not data:
        return KnowledgeGraphConfig()
    return KnowledgeGraphConfig(
        enabled=bool(data.get("enabled", False)),
        store_path=str(data.get("store_path", ".researchclaw/knowledge_graph")),
        max_entities=int(data.get("max_entities", 10000)),
        auto_update=bool(data.get("auto_update", True)),
    )


def _parse_multi_project_config(data: dict[str, Any]) -> MultiProjectConfig:
    """Build a MultiProjectConfig; an empty section yields all defaults."""
    if not data:
        return MultiProjectConfig()
    return MultiProjectConfig(
        enabled=bool(data.get("enabled", False)),
        projects_dir=data.get("projects_dir", ".researchclaw/projects"),
        max_concurrent=int(data.get("max_concurrent", 2)),
        shared_knowledge=bool(data.get("shared_knowledge", True)),
    )


def _parse_servers_config(data: dict[str, Any]) -> ServersConfig:
    """Build a ServersConfig, converting each raw server mapping to an entry."""
    if not data:
        return ServersConfig()
    raw_servers = data.get("servers") or ()
    servers = tuple(
        ServerEntryConfig(
            name=s.get("name", ""),
            host=s.get("host", ""),
            server_type=s.get("server_type", "ssh"),
            gpu=s.get("gpu", ""),
            vram_gb=int(s.get("vram_gb", 0)),
            priority=int(s.get("priority", 1)),
            cost_per_hour=float(s.get("cost_per_hour", 0.0)),
            scheduler=s.get("scheduler", ""),
            cloud_provider=s.get("cloud_provider", ""),
        )
        for s in raw_servers
    )
    return ServersConfig(
        enabled=bool(data.get("enabled", False)),
        servers=servers,
        prefer_free=bool(data.get("prefer_free", True)),
        failover=bool(data.get("failover", True)),
        monitor_interval_sec=int(data.get("monitor_interval_sec", 60)),
    )


def _parse_mcp_config(data: dict[str, Any]) -> MCPIntegrationConfig:
    """Build an MCPIntegrationConfig; an empty section yields all defaults."""
    if not data:
        return MCPIntegrationConfig()
    return MCPIntegrationConfig(
        server_enabled=bool(data.get("server_enabled", False)),
        server_port=int(data.get("server_port", 3000)),
        server_transport=data.get("server_transport", "stdio"),
        external_servers=tuple(data.get("external_servers") or ()),
    )


def _parse_overleaf_config(data: dict[str, Any]) -> OverleafConfig:
    """Build an OverleafConfig; an empty section yields all defaults."""
    if not data:
        return OverleafConfig()
    return OverleafConfig(
        enabled=bool(data.get("enabled", False)),
        git_url=data.get("git_url", ""),
        branch=data.get("branch", "main"),
        auto_push=bool(data.get("auto_push", True)),
        auto_pull=bool(data.get("auto_pull", False)),
        poll_interval_sec=int(data.get("poll_interval_sec", 300)),
    )


def _parse_server_config(data: dict[str, Any]) -> ServerConfig:
    """Build a ServerConfig, normalizing cors_origins to a tuple."""
    if not data:
        return ServerConfig()
    # cors_origins may be a list, missing (default "*"), or a scalar.
    cors = data.get("cors_origins")
    if isinstance(cors, list):
        cors = tuple(cors)
    elif cors is None:
        cors = ("*",)
    else:
        cors = (str(cors),)
    return ServerConfig(
        enabled=bool(data.get("enabled", False)),
        host=data.get("host", "0.0.0.0"),
        port=int(data.get("port", 8080)),
        cors_origins=cors,
        auth_token=data.get("auth_token", ""),
        voice_enabled=bool(data.get("voice_enabled", False)),
        whisper_model=data.get("whisper_model", "whisper-1"),
        whisper_api_url=data.get("whisper_api_url", ""),
    )


def _parse_dashboard_config(data: dict[str, Any]) -> DashboardConfig:
    """Build a DashboardConfig; an empty section yields all defaults."""
    if not data:
        return DashboardConfig()
    return DashboardConfig(
        enabled=bool(data.get("enabled", True)),
        refresh_interval_sec=int(data.get("refresh_interval_sec", 5)),
        max_log_lines=int(data.get("max_log_lines", 1000)),
        browser_notifications=bool(data.get("browser_notifications", True)),
    )


def _parse_trends_config(data: dict[str, Any]) -> TrendsConfig:
    """Build a TrendsConfig, coercing list-valued fields to tuples."""
    if not data:
        return TrendsConfig()
    sources = data.get("sources", ("arxiv", "semantic_scholar"))
    if isinstance(sources, list):
        sources = tuple(sources)
    domains = data.get("domains", ())
    if isinstance(domains, list):
        domains = tuple(domains)
    return TrendsConfig(
        enabled=bool(data.get("enabled", False)),
        domains=domains,
        daily_digest=bool(data.get("daily_digest", True)),
        digest_time=data.get("digest_time", "08:00"),
        max_papers_per_day=int(data.get("max_papers_per_day", 20)),
        trend_window_days=int(data.get("trend_window_days", 30)),
        sources=sources,
    )


def _parse_copilot_config(data: dict[str, Any]) -> CoPilotConfig:
    """Build a CoPilotConfig; an empty section yields all defaults."""
    if not data:
        return CoPilotConfig()
    return CoPilotConfig(
        mode=data.get("mode", "auto-pilot"),
        pause_at_gates=bool(data.get("pause_at_gates", True)),
        pause_at_every_stage=bool(data.get("pause_at_every_stage", False)),
        feedback_timeout_sec=int(data.get("feedback_timeout_sec", 3600)),
        allow_branching=bool(data.get("allow_branching", True)),
        max_branches=int(data.get("max_branches", 3)),
    )


def _parse_quality_assessor_config(data: dict[str, Any]) -> QualityAssessorConfig:
    """Build a QualityAssessorConfig, coercing dimensions to a tuple."""
    if not data:
        return QualityAssessorConfig()
    dimensions = data.get("dimensions", ("novelty", "rigor", "clarity", "impact", "experiments"))
    if isinstance(dimensions, list):
        dimensions = tuple(dimensions)
    return QualityAssessorConfig(
        enabled=bool(data.get("enabled", True)),
        dimensions=dimensions,
        venue_recommendation=bool(data.get("venue_recommendation", True)),
        score_history=bool(data.get("score_history", True)),
    )


def _parse_calendar_config(data: dict[str, Any]) -> CalendarConfig:
    """Build a CalendarConfig, coercing list-valued fields to tuples."""
    if not data:
        return CalendarConfig()
    venues = data.get("target_venues", ())
    if isinstance(venues, list):
        venues = tuple(venues)
    reminder = data.get("reminder_days_before", (30, 14, 7, 3, 1))
    if isinstance(reminder, list):
        reminder = tuple(reminder)
    return CalendarConfig(
        enabled=bool(data.get("enabled", False)),
        target_venues=venues,
        reminder_days_before=reminder,
        auto_plan=bool(data.get("auto_plan", True)),
    )


def load_config(
    path: str | Path,
    *,
    project_root: str | Path | None = None,
    check_paths: bool = True,
) -> RCConfig:
    """Module-level convenience wrapper around ``RCConfig.load``."""
    return RCConfig.load(path,
        project_root=project_root, check_paths=check_paths)


================================================
FILE: researchclaw/copilot/__init__.py
================================================
"""Interactive Co-Pilot mode for human-AI research collaboration."""

from researchclaw.copilot.modes import ResearchMode
from researchclaw.copilot.controller import CoPilotController
from researchclaw.copilot.feedback import FeedbackHandler
from researchclaw.copilot.branching import BranchManager

__all__ = [
    "BranchManager",
    "CoPilotController",
    "FeedbackHandler",
    "ResearchMode",
]


================================================
FILE: researchclaw/copilot/branching.py
================================================
"""Exploration branch management for Co-Pilot mode."""

from __future__ import annotations

import json
import logging
import shutil
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

logger = logging.getLogger(__name__)


class BranchManager:
    """Manage exploration branches during Co-Pilot sessions.

    Branches live under ``<run_dir>/branches/<name>`` and hold copies of
    completed stage directories plus a ``branch_meta.json`` metadata file.
    """

    def __init__(self, run_dir: Path, max_branches: int = 3):
        self.run_dir = run_dir
        self.max_branches = max_branches
        # All branches are stored under a single sub-directory of the run.
        self._branches_dir = run_dir / "branches"

    def create_branch(
        self,
        name: str,
        from_stage: int,
    ) -> str:
        """Create a new exploration branch by copying state up to from_stage.

        Raises ValueError when the branch limit is reached or the name is
        already taken.  Returns the new branch directory as a string.
        """
        if len(self.list_branches()) >= self.max_branches:
            raise ValueError(
                f"Maximum branches ({self.max_branches}) reached. "
                f"Delete a branch before creating a new one."
            )
        branch_dir = self._branches_dir / name
        if branch_dir.exists():
            raise ValueError(f"Branch '{name}' already exists.")
        branch_dir.mkdir(parents=True, exist_ok=True)
        # Copy stage directories up to from_stage
        for stage_num in range(1, from_stage + 1):
            src = self.run_dir / f"stage-{stage_num:02d}"
            if src.exists():
                dest = branch_dir / f"stage-{stage_num:02d}"
                shutil.copytree(src, dest, dirs_exist_ok=True)
        # Write branch metadata
        meta = {
            "name": name,
            "from_stage": from_stage,
            "created_at": datetime.now(timezone.utc).isoformat(),
            "status": "active",
        }
        (branch_dir / "branch_meta.json").write_text(
            json.dumps(meta, indent=2), encoding="utf-8"
        )
        logger.info("Created branch '%s' from stage %d", name, from_stage)
        return str(branch_dir)

    def list_branches(self) -> list[dict[str, Any]]:
        """List all branches with their metadata.

        Unreadable or missing metadata degrades to a minimal entry with
        status "unknown" rather than raising.
        """
        if not self._branches_dir.exists():
            return []
        branches = []
        for branch_dir in sorted(self._branches_dir.iterdir()):
            if not branch_dir.is_dir():
                continue
            meta_path = branch_dir / "branch_meta.json"
            if meta_path.exists():
                try:
                    meta = json.loads(meta_path.read_text(encoding="utf-8"))
                    meta["path"] = str(branch_dir)
                    branches.append(meta)
                except (json.JSONDecodeError, OSError):
                    branches.append({
                        "name": branch_dir.name,
                        "path": str(branch_dir),
                        "status": "unknown",
                    })
            else:
                branches.append({
                    "name": branch_dir.name,
                    "path": str(branch_dir),
                    "status": "unknown",
                })
        return branches

    def switch_branch(self, name: str) -> Path:
        """Get the directory path for a branch (for resuming execution)."""
        branch_dir = self._branches_dir / name
        if not branch_dir.exists():
            raise ValueError(f"Branch '{name}' does not exist.")
        return branch_dir

    def delete_branch(self, name: str) -> None:
        """Delete a branch and its data."""
        branch_dir = self._branches_dir / name
        if not branch_dir.exists():
            raise ValueError(f"Branch '{name}' does not exist.")
        shutil.rmtree(branch_dir)
        logger.info("Deleted branch '%s'", name)

    def compare_branches(
        self,
        branch_a: str,
        branch_b: str,
    ) -> dict[str, Any]:
        """Compare results between two branches.

        Returns an ``{"error": ...}`` mapping (instead of raising) when a
        branch is missing; otherwise stage counts and, when both branches
        have experiment summaries, their metric summaries.
        """
        dir_a = self._branches_dir / branch_a
        dir_b = self._branches_dir / branch_b
        if not dir_a.exists():
            return {"error": f"Branch '{branch_a}' does not exist."}
        if not dir_b.exists():
            return {"error": f"Branch '{branch_b}' does not exist."}
        # Compare experiment summaries if available
        result: dict[str, Any] = {
            "branch_a": branch_a,
            "branch_b": branch_b,
            "stages_a": self._count_stages(dir_a),
            "stages_b": self._count_stages(dir_b),
        }
        summary_a = self._read_experiment_summary(dir_a)
        summary_b = self._read_experiment_summary(dir_b)
        if summary_a and summary_b:
            result["metrics_a"] = summary_a.get("metrics_summary", {})
            result["metrics_b"] = summary_b.get("metrics_summary", {})
        return result

    @staticmethod
    def _count_stages(branch_dir: Path) -> int:
        """Count completed stages in a branch directory."""
        count = 0
        for d in branch_dir.iterdir():
            if d.is_dir() and d.name.startswith("stage-"):
                count += 1
        return count

    @staticmethod
    def _read_experiment_summary(
        branch_dir: Path,
    ) -> dict[str, Any] | None:
        """Read experiment summary from a branch.

        Returns None when the file is missing or unreadable.
        """
        summary_path = branch_dir / "stage-14" / "experiment_summary.json"
        if not summary_path.exists():
            return None
        try:
            return json.loads(summary_path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            return None


================================================
FILE: researchclaw/copilot/controller.py
================================================
"""Co-Pilot controller — orchestrates pause/feedback/branch logic."""

from __future__ import annotations

import logging
from pathlib import Path
from typing import Any

from researchclaw.config import CoPilotConfig
from researchclaw.copilot.branching import BranchManager
from researchclaw.copilot.feedback import Feedback, FeedbackHandler
from researchclaw.copilot.modes import ResearchMode
from researchclaw.pipeline.stages import GATE_STAGES

logger = logging.getLogger(__name__)


class CoPilotController:
    """Control Co-Pilot
    mode during pipeline execution."""

    def __init__(
        self,
        config: CoPilotConfig,
        run_dir: Path,
    ):
        self.config = config
        # config.mode is a string; ResearchMode(...) raises on unknown values.
        self.mode = ResearchMode(config.mode)
        self.run_dir = run_dir
        self.feedback_handler = FeedbackHandler(run_dir)
        self.branch_manager = BranchManager(
            run_dir, max_branches=config.max_branches
        )

    def should_pause(self, stage_num: int, is_gate: bool) -> bool:
        """Determine if the pipeline should pause at this stage.

        zero-touch never pauses; auto-pilot pauses only at gates (when
        configured); co-pilot pauses at gates, or everywhere when
        ``pause_at_every_stage`` is set.
        """
        if self.mode == ResearchMode.ZERO_TOUCH:
            return False
        if self.mode == ResearchMode.AUTO_PILOT:
            return is_gate and self.config.pause_at_gates
        # CO_PILOT mode
        if self.config.pause_at_every_stage:
            return True
        return is_gate

    def present_stage_result(
        self,
        stage_num: int,
        stage_name: str,
        artifacts: list[str],
        status: str,
        error: str | None = None,
    ) -> str:
        """Format stage result summary for user review."""
        lines = [
            f"Stage {stage_num}: {stage_name}",
            f"Status: {status}",
        ]
        if error:
            lines.append(f"Error: {error}")
        if artifacts:
            lines.append(f"Artifacts: {', '.join(artifacts)}")
        lines.extend([
            "",
            "Available actions: approve, modify, retry, skip, branch, rollback",
        ])
        return "\n".join(lines)

    def request_feedback(
        self,
        stage_num: int,
        stage_name: str,
        summary: str,
    ) -> Feedback | None:
        """Request and wait for user feedback.

        Blocks (polling) until feedback arrives or the configured timeout
        elapses; returns None on timeout.
        """
        self.feedback_handler.write_feedback_request(
            stage=stage_num,
            stage_name=stage_name,
            summary=summary,
        )
        logger.info(
            "Co-Pilot: waiting for feedback on stage %d (%s)",
            stage_num,
            stage_name,
        )
        feedback = self.feedback_handler.wait_for_feedback(
            stage=stage_num,
            timeout_sec=self.config.feedback_timeout_sec,
        )
        self.feedback_handler.clear_request()
        return feedback

    def handle_feedback(
        self,
        feedback: Feedback,
    ) -> dict[str, Any]:
        """Process user feedback and return action instructions.

        Maps each feedback action to an ``instruction`` the pipeline runner
        understands; unknown actions fall through to "continue".
        """
        result: dict[str, Any] = {
            "action": feedback.action,
            "stage": feedback.stage,
        }
        if feedback.action == "approve":
            result["instruction"] = "continue"
        elif feedback.action == "modify":
            result["instruction"] = "apply_modifications"
            result["modifications"] = feedback.modifications or {}
            result["message"] = feedback.message
        elif feedback.action == "retry":
            result["instruction"] = "rerun_stage"
        elif feedback.action == "skip":
            result["instruction"] = "skip_stage"
        elif feedback.action == "branch":
            if self.config.allow_branching:
                branch_name = feedback.branch_name or f"branch_{feedback.stage}"
                try:
                    branch_path = self.branch_manager.create_branch(
                        branch_name, feedback.stage
                    )
                    result["instruction"] = "branch_created"
                    result["branch_name"] = branch_name
                    result["branch_path"] = branch_path
                except ValueError as exc:
                    # Branch limit reached or name collision.
                    result["instruction"] = "branch_failed"
                    result["error"] = str(exc)
            else:
                result["instruction"] = "branching_disabled"
        elif feedback.action == "rollback":
            result["instruction"] = "rollback"
            result["rollback_to"] = feedback.rollback_to
        else:
            result["instruction"] = "continue"
        return result

    @classmethod
    def from_config(
        cls,
        config: CoPilotConfig,
        run_dir: Path,
    ) -> CoPilotController | None:
        """Create a controller, or None if mode is zero-touch."""
        mode = ResearchMode(config.mode)
        if mode == ResearchMode.ZERO_TOUCH:
            return None
        return cls(config, run_dir)


================================================
FILE: researchclaw/copilot/feedback.py
================================================
"""User feedback processing for Co-Pilot mode."""

from __future__ import annotations

import json
import logging
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any

logger = logging.getLogger(__name__)

FEEDBACK_ACTIONS = frozenset({
    "approve",   # Continue to next stage
    "modify",    # Apply modifications and continue
    "retry",     # Re-run current stage
    "skip",      # Skip current stage
    "discuss",   # Enter discussion mode (future)
    "branch",    # Create exploration branch
    "rollback",  # Roll back to a previous stage
})


@dataclass(frozen=True)
class Feedback:
    """Structured user feedback for a pipeline stage."""

    action: str
    stage: int
    message: str = ""
    modifications: dict[str, Any] | None = None
    branch_name: str = ""
    rollback_to: int | None = None
    timestamp: str = ""


class FeedbackHandler:
    """Handle feedback input/output for Co-Pilot mode.

    Communication happens through two JSON files in the run directory:
    ``copilot_feedback_request.json`` (written here, read by the UI/CLI)
    and ``copilot_feedback_response.json`` (written by the UI/CLI, read
    here).
    """

    def __init__(self, run_dir: Path):
        self.run_dir = run_dir

    def write_feedback_request(
        self,
        stage: int,
        stage_name: str,
        summary: str,
        options: list[str] | None = None,
    ) -> Path:
        """Write a feedback request file for external consumers."""
        from datetime import datetime, timezone
        request = {
            "stage": stage,
            "stage_name": stage_name,
            "summary": summary,
            "options": options or list(FEEDBACK_ACTIONS),
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "status": "waiting",
        }
        request_path = self.run_dir / "copilot_feedback_request.json"
        request_path.write_text(
            json.dumps(request, indent=2), encoding="utf-8"
        )
        return request_path

    def read_feedback_response(self) -> Feedback | None:
        """Read feedback response from file (written by UI or CLI).

        Returns None when the file is absent, unparsable, or carries an
        unknown action.
        """
        response_path = self.run_dir / "copilot_feedback_response.json"
        if not response_path.exists():
            return None
        try:
            data = json.loads(response_path.read_text(encoding="utf-8"))
            action = data.get("action", "approve")
            if action not in FEEDBACK_ACTIONS:
                logger.warning("Invalid feedback action: %s", action)
                return None
            return Feedback(
                action=action,
                stage=int(data.get("stage", 0)),
                message=str(data.get("message", "")),
                modifications=data.get("modifications"),
                branch_name=str(data.get("branch_name", "")),
                rollback_to=(
                    int(data["rollback_to"])
                    if data.get("rollback_to") is not None
                    else None
                ),
                timestamp=str(data.get("timestamp", "")),
            )
        except (json.JSONDecodeError, TypeError, ValueError) as exc:
            logger.warning("Failed to parse feedback response: %s", exc)
            return None

    def wait_for_feedback(
        self,
        stage: int,
        timeout_sec: int = 3600,
        poll_interval_sec: float = 1.0,
    ) -> Feedback | None:
        """Wait for feedback by polling the response file."""
        response_path = self.run_dir / "copilot_feedback_response.json"
        # Clear any stale response
        if response_path.exists():
            response_path.unlink()
deadline = time.monotonic() + timeout_sec while time.monotonic() < deadline: feedback = self.read_feedback_response() if feedback is not None and feedback.stage == stage: # Clean up response file if response_path.exists(): response_path.unlink() return feedback time.sleep(poll_interval_sec) logger.info("Feedback timeout for stage %d after %ds", stage, timeout_sec) return None def clear_request(self) -> None: """Clear the feedback request file.""" request_path = self.run_dir / "copilot_feedback_request.json" if request_path.exists(): request_path.unlink() ================================================ FILE: researchclaw/copilot/modes.py ================================================ """Research mode definitions for Co-Pilot.""" from __future__ import annotations from enum import Enum class ResearchMode(Enum): """Pipeline execution modes.""" CO_PILOT = "co-pilot" # Pause at every stage for feedback AUTO_PILOT = "auto-pilot" # Pause only at gate stages ZERO_TOUCH = "zero-touch" # Fully automatic, no pauses ================================================ FILE: researchclaw/dashboard/__init__.py ================================================ """Real-time pipeline monitoring dashboard.""" ================================================ FILE: researchclaw/dashboard/broadcaster.py ================================================ """Dashboard state broadcaster — pushes updates via WebSocket.""" from __future__ import annotations import asyncio import logging from typing import Any from researchclaw.dashboard.collector import DashboardCollector, RunSnapshot from researchclaw.server.websocket.events import Event, EventType from researchclaw.server.websocket.manager import ConnectionManager logger = logging.getLogger(__name__) class DashboardBroadcaster: """Periodically collect run data and broadcast changes.""" def __init__( self, manager: ConnectionManager, collector: DashboardCollector, ) -> None: self._manager = manager self._collector = collector 
self._prev_snapshots: dict[str, dict[str, Any]] = {} async def tick(self) -> None: """Collect current state and broadcast changes.""" snapshots = self._collector.collect_all() current: dict[str, dict[str, Any]] = {} for snap in snapshots: d = snap.to_dict() current[snap.run_id] = d prev = self._prev_snapshots.get(snap.run_id) if prev is None: # New run discovered await self._manager.broadcast( Event(type=EventType.RUN_DISCOVERED, data=d) ) else: # Check for stage changes if d["current_stage"] != prev.get("current_stage"): await self._manager.broadcast( Event( type=EventType.STAGE_COMPLETE if d["current_stage"] > prev.get("current_stage", 0) else EventType.RUN_STATUS_CHANGED, data=d, ) ) elif d["status"] != prev.get("status"): await self._manager.broadcast( Event(type=EventType.RUN_STATUS_CHANGED, data=d) ) # Check for metric updates if d["metrics"] and d["metrics"] != prev.get("metrics"): await self._manager.broadcast( Event( type=EventType.METRIC_UPDATE, data={"run_id": snap.run_id, "metrics": d["metrics"]}, ) ) self._prev_snapshots = current async def start_dashboard_loop( manager: ConnectionManager, interval: int = 5, monitor_dir: str | None = None, ) -> None: """Background task that periodically broadcasts dashboard updates.""" collector = DashboardCollector() broadcaster = DashboardBroadcaster(manager, collector) while True: try: await broadcaster.tick() except Exception: logger.exception("Dashboard broadcast error") await asyncio.sleep(interval) ================================================ FILE: researchclaw/dashboard/collector.py ================================================ """Run data collector — scans artifacts/ for pipeline state.""" from __future__ import annotations import json import logging import time from dataclasses import dataclass, field from pathlib import Path from typing import Any logger = logging.getLogger(__name__) @dataclass class RunSnapshot: """Point-in-time snapshot of a pipeline run.""" run_id: str path: str status: str = 
"unknown" current_stage: int = 0 current_stage_name: str = "" total_stages: int = 23 start_time: str = "" elapsed_sec: float = 0.0 is_active: bool = False topic: str = "" metrics: dict[str, Any] = field(default_factory=dict) stages_completed: list[str] = field(default_factory=list) last_log_lines: list[str] = field(default_factory=list) error: str = "" def to_dict(self) -> dict[str, Any]: return { "run_id": self.run_id, "path": self.path, "status": self.status, "current_stage": self.current_stage, "current_stage_name": self.current_stage_name, "total_stages": self.total_stages, "start_time": self.start_time, "elapsed_sec": round(self.elapsed_sec, 1), "is_active": self.is_active, "topic": self.topic, "metrics": self.metrics, "stages_completed": self.stages_completed, "error": self.error, } class DashboardCollector: """Collect run state from artifacts/ directory.""" def __init__( self, artifacts_dir: str = "artifacts", max_log_lines: int = 200, ) -> None: self._artifacts = Path(artifacts_dir) self._max_log_lines = max_log_lines def collect_all(self) -> list[RunSnapshot]: """Scan all rc-* directories and return snapshots.""" if not self._artifacts.exists(): return [] runs: list[RunSnapshot] = [] for d in sorted(self._artifacts.iterdir(), reverse=True): if d.is_dir() and d.name.startswith("rc-"): try: snap = self._collect_run(d) runs.append(snap) except Exception as exc: logger.debug("Failed to collect %s: %s", d, exc) return runs def collect_run(self, run_dir: str | Path) -> RunSnapshot: """Collect a single run.""" return self._collect_run(Path(run_dir)) def _collect_run(self, run_dir: Path) -> RunSnapshot: snap = RunSnapshot(run_id=run_dir.name, path=str(run_dir)) # --- checkpoint.json --- ckpt_path = run_dir / "checkpoint.json" if ckpt_path.exists(): try: with ckpt_path.open() as f: ckpt = json.load(f) snap.current_stage = ckpt.get("stage", 0) snap.current_stage_name = ckpt.get("stage_name", "") snap.status = ckpt.get("status", "running") snap.topic = 
def aggregate_metrics(runs: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate metrics across multiple runs for overview display.

    Counts active/completed/failed runs and computes the mean current
    stage (0.0 when there are no runs), rounded to one decimal place.
    """
    active = completed = failed = 0
    stage_sum = 0
    for run in runs:
        if run.get("is_active"):
            active += 1
        status = run.get("status")
        if status == "completed":
            completed += 1
        elif status == "failed":
            failed += 1
        stage_sum += run.get("current_stage", 0)
    total = len(runs)
    mean_stage = stage_sum / total if total else 0.0
    return {
        "total_runs": total,
        "active_runs": active,
        "completed_runs": completed,
        "failed_runs": failed,
        "average_stage": round(mean_stage, 1),
    }
isinstance(training_log, list): for entry in training_log: if isinstance(entry, dict): point = {} for key in ("epoch", "step", "loss", "accuracy", "lr"): if key in entry: try: point[key] = float(entry[key]) except (ValueError, TypeError): pass if point: curve.append(point) return curve ================================================ FILE: researchclaw/data/__init__.py ================================================ """Static data assets for the ResearchClaw pipeline.""" from __future__ import annotations import logging from pathlib import Path from typing import Any import yaml logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Framework documentation # --------------------------------------------------------------------------- _FRAMEWORK_DOCS_DIR = Path(__file__).parent / "framework_docs" # Map of framework identifier -> (doc filename, keyword patterns for detection) _FRAMEWORK_REGISTRY: dict[str, dict[str, Any]] = { "trl": { "file": "trl.md", "keywords": ["trl", "sft", "dpo", "grpo", "ppo trainer", "rlhf", "sfttrainer", "dpotrainer", "grpotrainer"], }, "peft": { "file": "peft.md", "keywords": ["peft", "lora", "qlora", "adapter", "low-rank", "parameter-efficient", "dora"], }, "transformers_training": { "file": "transformers_training.md", "keywords": ["transformers", "huggingface", "trainer", "trainingarguments", "automodel", "fine-tun"], }, "llamafactory": { "file": "llamafactory.md", "keywords": ["llamafactory", "llama-factory", "llama factory"], }, "axolotl": { "file": "axolotl.md", "keywords": ["axolotl"], }, } def detect_frameworks(topic: str, hypothesis: str = "", plan: str = "") -> list[str]: """Detect which ML training frameworks are relevant based on topic/hypothesis/plan. Returns a list of framework identifiers (e.g., ["trl", "peft"]). 
""" combined = (topic + " " + hypothesis + " " + plan).lower() matched: list[str] = [] for fw_id, info in _FRAMEWORK_REGISTRY.items(): for kw in info["keywords"]: if kw in combined: matched.append(fw_id) break return matched def load_framework_docs(framework_ids: list[str], max_chars: int = 8000) -> str: """Load and concatenate framework API documentation for the given IDs. Returns a single string with all relevant docs, truncated to max_chars to avoid overwhelming the prompt context. """ parts: list[str] = [] total = 0 for fw_id in framework_ids: info = _FRAMEWORK_REGISTRY.get(fw_id) if not info: continue doc_path = _FRAMEWORK_DOCS_DIR / info["file"] if not doc_path.exists(): logger.warning("Framework doc not found: %s", doc_path) continue content = doc_path.read_text(encoding="utf-8") if total + len(content) > max_chars: remaining = max_chars - total if remaining > 500: content = content[:remaining] + "\n... (truncated)\n" else: break parts.append(content) total += len(content) if not parts: return "" header = ( "\n## Framework API Documentation (auto-detected)\n" "The following API references are relevant to your experiment. " "Use these exact APIs and patterns — do NOT guess the API.\n\n" ) return header + "\n---\n\n".join(parts) _SEMINAL_PAPERS_PATH = Path(__file__).parent / "seminal_papers.yaml" _CACHE: list[dict[str, Any]] | None = None def _load_all() -> list[dict[str, Any]]: """Load and cache the seminal papers list.""" global _CACHE # noqa: PLW0603 if _CACHE is not None: return _CACHE try: data = yaml.safe_load(_SEMINAL_PAPERS_PATH.read_text(encoding="utf-8")) _CACHE = data.get("papers", []) if isinstance(data, dict) else [] except Exception: # noqa: BLE001 logger.warning("Failed to load seminal_papers.yaml", exc_info=True) _CACHE = [] return _CACHE def load_seminal_papers(topic: str) -> list[dict[str, Any]]: """Return seminal papers whose keywords overlap with *topic*. Each returned dict has: title, authors, year, venue, cite_key, keywords. 
Matching is case-insensitive substring on the topic string. """ all_papers = _load_all() topic_lower = topic.lower() matched: list[dict[str, Any]] = [] seen_keys: set[str] = set() for paper in all_papers: keywords = paper.get("keywords", []) if not isinstance(keywords, list): continue for kw in keywords: if isinstance(kw, str) and kw.lower() in topic_lower: ck = paper.get("cite_key", "") if ck not in seen_keys: seen_keys.add(ck) matched.append(paper) break logger.debug( "load_seminal_papers(%r): matched %d papers", topic, len(matched) ) return matched ================================================ FILE: researchclaw/data/benchmark_knowledge.yaml ================================================ # BenchmarkAgent Knowledge Base # Domain-indexed registry of standard benchmarks, datasets, and baselines # for automated experiment design. # # Fields: # - standard_benchmarks: canonical datasets for the domain # - tier: 1 (pre-cached) | 2 (downloadable) | 3 (too large) # - api: Python one-liner to load the dataset # - size_mb: approximate download size # - metrics: standard evaluation metrics # - common_baselines: well-known methods with open-source implementations # - source: Python code to instantiate the model/method # - paper: original paper citation # - pip: additional pip packages needed (if any) domains: # ── Computer Vision ───────────────────────────────────────────── image_classification: required_baselines: - ResNet-50 - ViT-B/16 keywords: - image classification - visual recognition - object recognition - image categorization standard_benchmarks: - name: CIFAR-10 tier: 1 domain: image_classification size_mb: 170 classes: 10 samples: 60000 metrics: [accuracy, top1_accuracy] api: "torchvision.datasets.CIFAR10(root='/workspace/data', download=False, transform=transform)" - name: CIFAR-100 tier: 1 domain: image_classification size_mb: 170 classes: 100 samples: 60000 metrics: [accuracy, top1_accuracy, top5_accuracy] api: 
"torchvision.datasets.CIFAR100(root='/workspace/data', download=False, transform=transform)" - name: FashionMNIST tier: 1 domain: image_classification size_mb: 50 classes: 10 samples: 70000 metrics: [accuracy] api: "torchvision.datasets.FashionMNIST(root='/workspace/data', download=False, transform=transform)" - name: STL-10 tier: 1 domain: image_classification size_mb: 2640 classes: 10 samples: 113000 metrics: [accuracy] api: "torchvision.datasets.STL10(root='/workspace/data', split='train', download=False, transform=transform)" - name: Tiny-ImageNet tier: 2 domain: image_classification size_mb: 237 classes: 200 samples: 110000 metrics: [top1_accuracy, top5_accuracy] api: "datasets.load_dataset('Maysee/tiny-imagenet', cache_dir='/workspace/data/hf')" - name: Caltech-101 tier: 2 domain: image_classification size_mb: 131 classes: 101 samples: 9146 metrics: [accuracy] api: "torchvision.datasets.Caltech101(root='/workspace/data', download=True, transform=transform)" - name: ImageNet-1K tier: 3 domain: image_classification size_mb: 168000 classes: 1000 samples: 1281167 metrics: [top1_accuracy, top5_accuracy] alternatives: ["Tiny-ImageNet", "CIFAR-100", "STL-10"] common_baselines: - name: ResNet-18 source: "torchvision.models.resnet18(weights='IMAGENET1K_V1')" paper: "He et al., Deep Residual Learning, CVPR 2016" pip: [] - name: ResNet-50 source: "torchvision.models.resnet50(weights='IMAGENET1K_V2')" paper: "He et al., Deep Residual Learning, CVPR 2016" pip: [] - name: ViT-B/16 source: "timm.create_model('vit_base_patch16_224', pretrained=True)" paper: "Dosovitskiy et al., An Image is Worth 16x16 Words, ICLR 2021" pip: [timm] - name: EfficientNet-B0 source: "torchvision.models.efficientnet_b0(weights='IMAGENET1K_V1')" paper: "Tan & Le, EfficientNet, ICML 2019" pip: [] - name: MobileNetV3 source: "torchvision.models.mobilenet_v3_small(weights='IMAGENET1K_V1')" paper: "Howard et al., MobileNetV3, ICCV 2019" pip: [] knowledge_distillation: required_baselines: - KD (Hinton) 
keywords: - knowledge distillation - model compression - teacher student - dark knowledge - distill standard_benchmarks: - name: CIFAR-100 tier: 1 domain: image_classification size_mb: 170 classes: 100 samples: 60000 metrics: [accuracy, top1_accuracy] api: "torchvision.datasets.CIFAR100(root='/workspace/data', download=False, transform=transform)" note: "Most common KD benchmark — 100 classes provides rich teacher knowledge" - name: CIFAR-10 tier: 1 domain: image_classification size_mb: 170 classes: 10 samples: 60000 metrics: [accuracy] api: "torchvision.datasets.CIFAR10(root='/workspace/data', download=False, transform=transform)" - name: Tiny-ImageNet tier: 2 domain: image_classification size_mb: 237 classes: 200 samples: 110000 metrics: [top1_accuracy, top5_accuracy] api: "datasets.load_dataset('Maysee/tiny-imagenet', cache_dir='/workspace/data/hf')" common_baselines: - name: KD (Hinton) source: "Custom: KL-divergence on softened logits (temperature=4)" paper: "Hinton et al., Distilling the Knowledge in a Neural Network, 2015" pip: [] - name: FitNet source: "Custom: Feature map regression (intermediate layers)" paper: "Romero et al., FitNets, ICLR 2015" pip: [] - name: CRD source: "Custom: Contrastive representation distillation" paper: "Tian et al., Contrastive Representation Distillation, ICLR 2020" pip: [] - name: DKD source: "Custom: Decoupled knowledge distillation (target + non-target)" paper: "Zhao et al., Decoupled Knowledge Distillation, CVPR 2022" pip: [] continual_learning: required_baselines: - EWC - Fine-tuning (lower bound) keywords: - continual learning - lifelong learning - catastrophic forgetting - incremental learning - class-incremental standard_benchmarks: - name: Split CIFAR-100 tier: 1 domain: continual_learning size_mb: 170 classes: 100 samples: 60000 metrics: [average_accuracy, forgetting_rate, backward_transfer] api: "torchvision.datasets.CIFAR100(root='/workspace/data', download=False)" note: "Split into 10 or 20 tasks (10 or 5 classes 
each)" - name: Split MNIST tier: 1 domain: continual_learning size_mb: 50 classes: 10 samples: 70000 metrics: [average_accuracy, forgetting_rate] api: "torchvision.datasets.MNIST(root='/workspace/data', download=False)" note: "Split into 5 tasks (2 classes each)" - name: Permuted MNIST tier: 1 domain: continual_learning size_mb: 50 classes: 10 samples: 70000 metrics: [average_accuracy, backward_transfer] api: "torchvision.datasets.MNIST(root='/workspace/data', download=False)" note: "Each task applies a random pixel permutation" common_baselines: - name: EWC source: "Custom: Elastic Weight Consolidation (Fisher penalty)" paper: "Kirkpatrick et al., Overcoming catastrophic forgetting, PNAS 2017" pip: [] - name: SI (Synaptic Intelligence) source: "Custom: Path integral of parameter importance" paper: "Zenke et al., Continual Learning Through Synaptic Intelligence, ICML 2017" pip: [] - name: PackNet source: "Custom: Progressive pruning and re-training" paper: "Mallya & Lazebnik, PackNet, CVPR 2018" pip: [] - name: Experience Replay source: "Custom: Maintain memory buffer of past examples" paper: "Chaudhry et al., Continual Learning with Tiny Episodic Memories, 2019" pip: [] # ── Natural Language Processing ───────────────────────────────── text_classification: keywords: - text classification - sentiment analysis - document classification - topic classification - natural language understanding - nlu standard_benchmarks: - name: IMDB tier: 2 domain: nlp_sentiment size_mb: 80 classes: 2 samples: 50000 metrics: [accuracy, f1_score] api: "datasets.load_dataset('imdb', cache_dir='/workspace/data/hf')" - name: SST-2 tier: 2 domain: nlp_sentiment size_mb: 7 classes: 2 samples: 70000 metrics: [accuracy] api: "datasets.load_dataset('glue', 'sst2', cache_dir='/workspace/data/hf')" - name: AG News tier: 2 domain: nlp_classification size_mb: 30 classes: 4 samples: 127600 metrics: [accuracy, f1_macro] api: "datasets.load_dataset('ag_news', cache_dir='/workspace/data/hf')" - name: 
MNLI tier: 2 domain: nlp_nli size_mb: 310 classes: 3 samples: 433000 metrics: [accuracy] api: "datasets.load_dataset('glue', 'mnli', cache_dir='/workspace/data/hf')" common_baselines: - name: BERT-base source: "transformers.AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')" paper: "Devlin et al., BERT, NAACL 2019" pip: [transformers] - name: RoBERTa-base source: "transformers.AutoModelForSequenceClassification.from_pretrained('roberta-base')" paper: "Liu et al., RoBERTa, 2019" pip: [transformers] - name: DistilBERT source: "transformers.AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased')" paper: "Sanh et al., DistilBERT, 2019" pip: [transformers] language_modeling: keywords: - language modeling - language model - perplexity - text generation - causal language model - autoregressive standard_benchmarks: - name: WikiText-103 tier: 2 domain: language_modeling size_mb: 500 samples: 28000 metrics: [perplexity, bits_per_character] api: "datasets.load_dataset('wikitext', 'wikitext-103-v1', cache_dir='/workspace/data/hf')" - name: WikiText-2 tier: 2 domain: language_modeling size_mb: 12 samples: 2000 metrics: [perplexity] api: "datasets.load_dataset('wikitext', 'wikitext-2-v1', cache_dir='/workspace/data/hf')" - name: Penn Treebank tier: 2 domain: language_modeling size_mb: 5 metrics: [perplexity] api: "datasets.load_dataset('ptb_text_only', cache_dir='/workspace/data/hf')" common_baselines: - name: GPT-2 Small source: "transformers.AutoModelForCausalLM.from_pretrained('gpt2')" paper: "Radford et al., Language Models are Unsupervised Multitask Learners, 2019" pip: [transformers] - name: LSTM LM source: "Custom: 2-layer LSTM with tied embeddings" paper: "Merity et al., AWD-LSTM, ICLR 2018" pip: [] question_answering: keywords: - question answering - reading comprehension - qa - extractive qa standard_benchmarks: - name: SQuAD v2 tier: 2 domain: question_answering size_mb: 45 samples: 150000 metrics: [exact_match, f1_score] 
api: "datasets.load_dataset('squad_v2', cache_dir='/workspace/data/hf')" - name: SQuAD v1.1 tier: 2 domain: question_answering size_mb: 35 samples: 100000 metrics: [exact_match, f1_score] api: "datasets.load_dataset('squad', cache_dir='/workspace/data/hf')" common_baselines: - name: BERT-base QA source: "transformers.AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')" paper: "Devlin et al., BERT, NAACL 2019" pip: [transformers] # ── Reinforcement Learning ────────────────────────────────────── reinforcement_learning: required_baselines: - PPO - Random Policy keywords: - reinforcement learning - reward shaping - policy gradient - value function - q-learning - actor-critic - markov decision process - mdp standard_benchmarks: - name: CartPole-v1 tier: 1 domain: classic_control size_mb: 0 metrics: [episode_reward, episode_length] api: "gymnasium.make('CartPole-v1')" note: "Simple baseline environment — episodic, discrete action" - name: LunarLander-v3 tier: 1 domain: classic_control size_mb: 0 metrics: [episode_reward] api: "gymnasium.make('LunarLander-v3')" - name: Acrobot-v1 tier: 1 domain: classic_control size_mb: 0 metrics: [episode_reward, steps_to_solve] api: "gymnasium.make('Acrobot-v1')" - name: HalfCheetah-v5 tier: 2 domain: mujoco size_mb: 50 metrics: [average_return, episode_reward] api: "gymnasium.make('HalfCheetah-v5')" pip: ["gymnasium[mujoco]"] - name: Hopper-v5 tier: 2 domain: mujoco size_mb: 50 metrics: [average_return] api: "gymnasium.make('Hopper-v5')" pip: ["gymnasium[mujoco]"] common_baselines: - name: PPO source: "stable_baselines3.PPO('MlpPolicy', env)" paper: "Schulman et al., PPO, 2017" pip: [stable-baselines3] - name: SAC source: "stable_baselines3.SAC('MlpPolicy', env)" paper: "Haarnoja et al., Soft Actor-Critic, ICML 2018" pip: [stable-baselines3] - name: DQN source: "stable_baselines3.DQN('MlpPolicy', env)" paper: "Mnih et al., Playing Atari with Deep RL, 2015" pip: [stable-baselines3] - name: A2C source: 
"stable_baselines3.A2C('MlpPolicy', env)" paper: "Mnih et al., Asynchronous Methods for Deep RL, ICML 2016" pip: [stable-baselines3] # ── Graph Neural Networks ─────────────────────────────────────── graph_neural_networks: keywords: - graph neural network - gnn - node classification - graph classification - link prediction - message passing - graph convolution - gcn - graph attention standard_benchmarks: - name: Cora tier: 2 domain: node_classification size_mb: 5 classes: 7 samples: 2708 metrics: [accuracy, f1_macro] api: "torch_geometric.datasets.Planetoid(root='/workspace/data', name='Cora')" pip: [torch-geometric] - name: CiteSeer tier: 2 domain: node_classification size_mb: 5 classes: 6 samples: 3327 metrics: [accuracy] api: "torch_geometric.datasets.Planetoid(root='/workspace/data', name='CiteSeer')" pip: [torch-geometric] - name: PubMed tier: 2 domain: node_classification size_mb: 20 classes: 3 samples: 19717 metrics: [accuracy] api: "torch_geometric.datasets.Planetoid(root='/workspace/data', name='PubMed')" pip: [torch-geometric] - name: ogbn-arxiv tier: 2 domain: node_classification size_mb: 200 samples: 169343 metrics: [accuracy] api: "ogb.nodeproppred.PygNodePropPredDataset(name='ogbn-arxiv', root='/workspace/data')" pip: [ogb, torch-geometric] - name: ogbg-molhiv tier: 2 domain: graph_classification size_mb: 50 samples: 41127 metrics: [auroc] api: "ogb.graphproppred.PygGraphPropPredDataset(name='ogbg-molhiv', root='/workspace/data')" pip: [ogb, torch-geometric] common_baselines: - name: GCN source: "torch_geometric.nn.GCNConv" paper: "Kipf & Welling, Semi-Supervised Classification with GCN, ICLR 2017" pip: [torch-geometric] - name: GAT source: "torch_geometric.nn.GATConv" paper: "Velickovic et al., Graph Attention Networks, ICLR 2018" pip: [torch-geometric] - name: GraphSAGE source: "torch_geometric.nn.SAGEConv" paper: "Hamilton et al., Inductive Representation Learning, NeurIPS 2017" pip: [torch-geometric] - name: GIN source: 
"torch_geometric.nn.GINConv" paper: "Xu et al., How Powerful are GNNs?, ICLR 2019" pip: [torch-geometric] # ── Meta-Learning ─────────────────────────────────────────────── meta_learning: keywords: - meta-learning - few-shot learning - learning to learn - meta learning - few shot - n-way k-shot standard_benchmarks: - name: Omniglot tier: 2 domain: few_shot_classification size_mb: 17 classes: 1623 samples: 32460 metrics: [accuracy] api: "torchvision.datasets.Omniglot(root='/workspace/data', download=True)" - name: Mini-ImageNet tier: 2 domain: few_shot_classification size_mb: 2900 classes: 100 samples: 60000 metrics: [accuracy] api: "datasets.load_dataset('nielsr/mini-imagenet', cache_dir='/workspace/data/hf')" note: "Standard 5-way 1-shot and 5-way 5-shot evaluation" - name: CIFAR-FS tier: 1 domain: few_shot_classification size_mb: 170 classes: 100 samples: 60000 metrics: [accuracy] api: "torchvision.datasets.CIFAR100(root='/workspace/data', download=False)" note: "CIFAR-100 re-split into 64/16/20 base/val/novel classes" common_baselines: - name: MAML source: "Custom: Model-Agnostic Meta-Learning with inner/outer loop" paper: "Finn et al., MAML, ICML 2017" pip: [] - name: Prototypical Networks source: "Custom: Prototype-based classification in embedding space" paper: "Snell et al., Prototypical Networks, NeurIPS 2017" pip: [] - name: Matching Networks source: "Custom: Attention-based few-shot classifier" paper: "Vinyals et al., Matching Networks, NeurIPS 2016" pip: [] # ── Generative Models ────────────────────────────────────────── generative_models: required_baselines: - VAE keywords: - generative model - gan - vae - variational autoencoder - generative adversarial - diffusion model - image generation - score matching standard_benchmarks: - name: CIFAR-10 tier: 1 domain: image_generation size_mb: 170 metrics: [fid, inception_score, is] api: "torchvision.datasets.CIFAR10(root='/workspace/data', download=False)" - name: MNIST tier: 1 domain: image_generation 
size_mb: 50 metrics: [fid, reconstruction_error] api: "torchvision.datasets.MNIST(root='/workspace/data', download=False)" - name: CelebA tier: 2 domain: image_generation size_mb: 1400 samples: 202599 metrics: [fid, lpips] api: "torchvision.datasets.CelebA(root='/workspace/data', split='train', download=True)" common_baselines: - name: DCGAN source: "Custom: Deep Convolutional GAN" paper: "Radford et al., Unsupervised Representation Learning, ICLR 2016" pip: [] - name: VAE source: "Custom: Variational Autoencoder with KL divergence" paper: "Kingma & Welling, Auto-Encoding Variational Bayes, ICLR 2014" pip: [] - name: DDPM source: "Custom: Denoising Diffusion Probabilistic Model" paper: "Ho et al., Denoising Diffusion Probabilistic Models, NeurIPS 2020" pip: [] # ── Transfer Learning & Domain Adaptation ─────────────────────── transfer_learning: keywords: - transfer learning - domain adaptation - domain shift - distribution shift - covariate shift - domain generalization - out-of-distribution - ood standard_benchmarks: - name: CIFAR-10-C tier: 2 domain: robustness size_mb: 2700 classes: 10 metrics: [accuracy, mean_corruption_error] api: "datasets.load_dataset('cifar10_corrupted', cache_dir='/workspace/data/hf')" note: "15 corruption types × 5 severity levels" - name: CIFAR-10 tier: 1 domain: image_classification size_mb: 170 classes: 10 metrics: [accuracy] api: "torchvision.datasets.CIFAR10(root='/workspace/data', download=False)" note: "Source domain for corruption/shift experiments" - name: Office-31 tier: 2 domain: domain_adaptation size_mb: 800 classes: 31 samples: 4652 metrics: [accuracy] api: "Custom download from official source" common_baselines: - name: DANN source: "Custom: Domain-Adversarial Neural Network (gradient reversal)" paper: "Ganin et al., Domain-Adversarial Training, JMLR 2016" pip: [] - name: CORAL source: "Custom: Correlation alignment loss" paper: "Sun & Saenko, Deep CORAL, ECCV 2016" pip: [] - name: AugMax source: "Custom: Adversarial data 
augmentation" paper: "Wang et al., AugMax, NeurIPS 2021" pip: [] # ── Neural Architecture Search ────────────────────────────────── neural_architecture_search: keywords: - neural architecture search - nas - architecture search - automl architecture standard_benchmarks: - name: CIFAR-10 tier: 1 domain: image_classification size_mb: 170 classes: 10 metrics: [accuracy, params, flops] api: "torchvision.datasets.CIFAR10(root='/workspace/data', download=False)" - name: NAS-Bench-201 tier: 2 domain: nas_benchmark size_mb: 2100 metrics: [accuracy, latency] api: "Custom: nats_bench.create('/workspace/data/NATS-tss-v1_0-3ffb9-simple', 'tss', fast_mode=True)" pip: [nats_bench] common_baselines: - name: DARTS source: "Custom: Differentiable Architecture Search" paper: "Liu et al., DARTS, ICLR 2019" pip: [] - name: Random Search source: "Custom: Random architecture sampling + training" paper: "Li & Talwalkar, Random Search and Reproducibility, UAI 2020" pip: [] # ── Self-Supervised Learning ──────────────────────────────────── self_supervised_learning: keywords: - self-supervised - contrastive learning - self supervised - pretext task - representation learning - unsupervised representation - simclr - byol - moco standard_benchmarks: - name: CIFAR-10 tier: 1 domain: image_classification size_mb: 170 classes: 10 metrics: [linear_probe_accuracy, knn_accuracy] api: "torchvision.datasets.CIFAR10(root='/workspace/data', download=False)" - name: STL-10 tier: 1 domain: image_classification size_mb: 2640 classes: 10 metrics: [linear_probe_accuracy] api: "torchvision.datasets.STL10(root='/workspace/data', split='unlabeled', download=False)" note: "100K unlabeled images for pre-training, 13K labeled for evaluation" common_baselines: - name: SimCLR source: "Custom: Contrastive learning with NT-Xent loss" paper: "Chen et al., SimCLR, ICML 2020" pip: [] - name: BYOL source: "Custom: Bootstrap Your Own Latent (no negatives)" paper: "Grill et al., BYOL, NeurIPS 2020" pip: [] - name: MoCo v2 
source: "Custom: Momentum Contrast with MLP head" paper: "Chen et al., Improved Baselines for MoCo, 2020" pip: [] # ── Optimizer Research ────────────────────────────────────────── # NOTE: This domain is specifically for research ON optimizers/learning # rate schedules. Do NOT match generic topics that merely USE optimization. # Keywords are narrow to avoid false matches (e.g., "combinatorial # optimization" or "PDE optimization" should NOT match this domain). optimizer_research: keywords: - optimizer design - learning rate schedule - adaptive learning rate - gradient descent method - second-order optimization - optimizer convergence standard_benchmarks: - name: CIFAR-10 tier: 1 domain: image_classification size_mb: 170 metrics: [accuracy, convergence_speed, final_loss] api: "torchvision.datasets.CIFAR10(root='/workspace/data', download=False)" - name: CIFAR-100 tier: 1 domain: image_classification size_mb: 170 metrics: [accuracy, convergence_speed] api: "torchvision.datasets.CIFAR100(root='/workspace/data', download=False)" - name: FashionMNIST tier: 1 domain: image_classification size_mb: 50 metrics: [accuracy, convergence_speed] api: "torchvision.datasets.FashionMNIST(root='/workspace/data', download=False)" common_baselines: - name: AdamW source: "torch.optim.AdamW(params, lr=1e-3, weight_decay=0.01)" paper: "Loshchilov & Hutter, Decoupled Weight Decay, ICLR 2019" pip: [] - name: SGD + Cosine Annealing source: "torch.optim.SGD + CosineAnnealingLR" paper: "Loshchilov & Hutter, SGDR, ICLR 2017" pip: [] - name: LAMB source: "Custom: Layer-wise Adaptive Moments for Batch Training" paper: "You et al., Large Batch Optimization, ICLR 2020" pip: [] - name: SAM (Sharpness-Aware Minimization) source: "Custom: Sharpness-Aware Minimization" paper: "Foret et al., SAM, ICLR 2021" pip: [] # ── LLM Fine-Tuning ──────────────────────────────────────────── llm_finetuning: keywords: - fine-tuning - fine tuning - finetuning - instruction tuning - alignment - rlhf - dpo - lora - 
qlora - adapter tuning standard_benchmarks: - name: Alpaca tier: 2 domain: instruction_following size_mb: 25 samples: 52000 metrics: [loss, perplexity] api: "datasets.load_dataset('tatsu-lab/alpaca', cache_dir='/workspace/data/hf')" - name: MMLU tier: 2 domain: general_knowledge size_mb: 12 classes: 57 metrics: [accuracy] api: "datasets.load_dataset('cais/mmlu', 'all', cache_dir='/workspace/data/hf')" - name: HellaSwag tier: 2 domain: commonsense_reasoning size_mb: 70 metrics: [accuracy] api: "datasets.load_dataset('Rowan/hellaswag', cache_dir='/workspace/data/hf')" common_baselines: - name: Full Fine-Tuning source: "Standard supervised fine-tuning on all parameters" paper: "Baseline approach" pip: [transformers] - name: LoRA source: "peft.LoraConfig(r=16, lora_alpha=32, target_modules=['q_proj', 'v_proj'])" paper: "Hu et al., LoRA, ICLR 2022" pip: [peft] - name: QLoRA source: "BitsAndBytesConfig(load_in_4bit=True) + LoRA" paper: "Dettmers et al., QLoRA, NeurIPS 2023" pip: [peft, bitsandbytes] # ── Tabular / Classical ML ────────────────────────────────────── tabular_learning: keywords: - tabular - tabular data - tabular learning - structured data - feature engineering - gradient boosting - xgboost standard_benchmarks: - name: UCI Adult tier: 2 domain: binary_classification size_mb: 5 classes: 2 samples: 48842 metrics: [accuracy, auroc, f1_score] api: "datasets.load_dataset('scikit-learn/adult-census-income', cache_dir='/workspace/data/hf')" - name: California Housing tier: 2 domain: regression size_mb: 1 samples: 20640 metrics: [rmse, mae, r2] api: "sklearn.datasets.fetch_california_housing()" - name: Forest Cover Type tier: 2 domain: multiclass_classification size_mb: 75 classes: 7 samples: 581012 metrics: [accuracy, f1_macro] api: "sklearn.datasets.fetch_covtype()" common_baselines: - name: XGBoost source: "xgboost.XGBClassifier(n_estimators=100, max_depth=6)" paper: "Chen & Guestrin, XGBoost, KDD 2016" pip: [xgboost] - name: Random Forest source: 
"sklearn.ensemble.RandomForestClassifier(n_estimators=100)" paper: "Breiman, Random Forests, Machine Learning 2001" pip: [] - name: MLP source: "sklearn.neural_network.MLPClassifier(hidden_layer_sizes=(256, 128))" paper: "Standard neural network baseline" pip: [] ================================================ FILE: researchclaw/data/dataset_registry.yaml ================================================ # ResearchClaw Dataset Registry # Tiers: # 1 = pre-cached in Docker image (no network needed) # 2 = downloadable via setup.py (network during setup phase) # 3 = too large for in-experiment download (use alternatives) # ── Tier 1: Pre-cached ────────────────────────────────────────── - name: CIFAR-10 tier: 1 domain: image_classification size_mb: 170 classes: 10 samples: 60000 api: "torchvision.datasets.CIFAR10(root='/workspace/data', download=False)" - name: CIFAR-100 tier: 1 domain: image_classification size_mb: 170 classes: 100 samples: 60000 api: "torchvision.datasets.CIFAR100(root='/workspace/data', download=False)" - name: MNIST tier: 1 domain: image_classification size_mb: 50 classes: 10 samples: 70000 api: "torchvision.datasets.MNIST(root='/workspace/data', download=False)" - name: FashionMNIST tier: 1 domain: image_classification size_mb: 50 classes: 10 samples: 70000 api: "torchvision.datasets.FashionMNIST(root='/workspace/data', download=False)" - name: STL-10 tier: 1 domain: image_classification size_mb: 2640 classes: 10 samples: 113000 api: "torchvision.datasets.STL10(root='/workspace/data', download=False)" - name: SVHN tier: 1 domain: image_classification size_mb: 450 classes: 10 samples: 99289 api: "torchvision.datasets.SVHN(root='/workspace/data', download=False)" # ── Tier 2: Downloadable ──────────────────────────────────────── - name: Tiny-ImageNet tier: 2 domain: image_classification size_mb: 237 classes: 200 samples: 110000 download: "setup.py with torchvision or direct URL download" note: "Best proxy for ImageNet experiments" - name: 
Caltech-101 tier: 2 domain: image_classification size_mb: 131 classes: 101 samples: 9146 api: "torchvision.datasets.Caltech101(root='/workspace/data', download=True)" - name: Flowers-102 tier: 2 domain: image_classification size_mb: 340 classes: 102 samples: 8189 api: "torchvision.datasets.Flowers102(root='/workspace/data', download=True)" - name: IMDB tier: 2 domain: nlp_sentiment size_mb: 80 classes: 2 samples: 50000 api: "datasets.load_dataset('imdb', cache_dir='/workspace/data/hf')" - name: AG News tier: 2 domain: nlp_classification size_mb: 30 classes: 4 samples: 127600 api: "datasets.load_dataset('ag_news', cache_dir='/workspace/data/hf')" - name: WikiText-103 tier: 2 domain: language_modeling size_mb: 500 samples: 28000 api: "datasets.load_dataset('wikitext', 'wikitext-103-v1', cache_dir='/workspace/data/hf')" - name: SST-2 tier: 2 domain: nlp_sentiment size_mb: 7 classes: 2 samples: 70000 api: "datasets.load_dataset('glue', 'sst2', cache_dir='/workspace/data/hf')" - name: SQuAD v2 tier: 2 domain: question_answering size_mb: 45 samples: 150000 api: "datasets.load_dataset('squad_v2', cache_dir='/workspace/data/hf')" - name: ogbg-molhiv tier: 2 domain: graph_classification size_mb: 50 samples: 41127 api: "PygGraphPropPredDataset(name='ogbg-molhiv', root='/workspace/data')" note: "Requires ogb and torch-geometric" - name: ogbn-arxiv tier: 2 domain: node_classification size_mb: 200 samples: 169343 api: "PygNodePropPredDataset(name='ogbn-arxiv', root='/workspace/data')" # ── Tier 3: Too large ─────────────────────────────────────────── - name: ImageNet-1K tier: 3 domain: image_classification size_mb: 168000 classes: 1000 samples: 1281167 alternatives: - "Tiny-ImageNet (200 classes, 237MB)" - "CIFAR-100 (100 classes, 170MB)" - "STL-10 (10 classes, 96x96 images)" - name: LAION-400M tier: 3 domain: vision_language size_mb: 1000000 alternatives: - "COCO Captions (HuggingFace, ~1GB)" - "Flickr30K (HuggingFace)" - name: The Pile tier: 3 domain: language_modeling 
size_mb: 825000 alternatives: - "WikiText-103 (500MB)" - "C4 subset via HuggingFace streaming" ================================================ FILE: researchclaw/data/docker_profiles.yaml ================================================ # Docker image profiles for domain-specific experiment execution. # Each domain can specify its own image with pre-installed packages. # The sandbox engine uses domain_id → docker profile mapping. profiles: ml_base: image: "researchclaw/sandbox-ml:latest" packages: - torch - torchvision - transformers - datasets - scikit-learn - numpy - scipy - matplotlib - tqdm - gymnasium - stable-baselines3 gpu: true memory_limit_mb: 8192 physics: image: "researchclaw/sandbox-physics:latest" packages: - numpy - scipy - matplotlib - jax - jaxlib - findiff gpu: false memory_limit_mb: 4096 chemistry: image: "researchclaw/sandbox-chemistry:latest" packages: - pyscf - rdkit - numpy - scipy - matplotlib - pandas gpu: false memory_limit_mb: 4096 biology: image: "researchclaw/sandbox-biology:latest" packages: - scanpy - anndata - leidenalg - biopython - scikit-learn - numpy - matplotlib - pandas gpu: false memory_limit_mb: 16384 economics: image: "researchclaw/sandbox-economics:latest" packages: - statsmodels - linearmodels - pandas - numpy - scipy - matplotlib gpu: false memory_limit_mb: 4096 math: image: "researchclaw/sandbox-math:latest" packages: - numpy - scipy - sympy - matplotlib - findiff gpu: false memory_limit_mb: 4096 security: image: "researchclaw/sandbox-security:latest" packages: - scikit-learn - xgboost - pandas - numpy - matplotlib gpu: false memory_limit_mb: 4096 network: "none" robotics: image: "researchclaw/sandbox-robotics:latest" packages: - gymnasium - mujoco - stable-baselines3 - torch - numpy - matplotlib gpu: true memory_limit_mb: 8192 generic: image: "researchclaw/sandbox-generic:latest" packages: - numpy - scipy - matplotlib - pandas - scikit-learn gpu: false memory_limit_mb: 4096 # Domain ID → profile mapping domain_map: 
ml_vision: ml_base ml_nlp: ml_base ml_rl: ml_base ml_graph: ml_base ml_tabular: ml_base ml_generative: ml_base ml_compression: ml_base ml_generic: ml_base physics_simulation: physics physics_pde: physics physics_quantum: chemistry chemistry_qm: chemistry chemistry_molprop: chemistry chemistry_general: chemistry biology_singlecell: biology biology_genomics: biology biology_protein: biology biology_general: biology economics_empirical: economics economics_general: economics mathematics_numerical: math mathematics_optimization: math mathematics_general: math security_detection: security robotics_control: robotics generic: generic ================================================ FILE: researchclaw/data/framework_docs/axolotl.md ================================================ # Axolotl — API Quick Reference ## Installation ```bash pip install axolotl # or git clone https://github.com/axolotl-ai-cloud/axolotl.git cd axolotl && pip install -e . ``` ## CLI Usage ```bash # Train accelerate launch -m axolotl.cli.train config.yaml # Inference accelerate launch -m axolotl.cli.inference config.yaml --lora_model_dir=./output # Merge LoRA adapter python -m axolotl.cli.merge_lora config.yaml --lora_model_dir=./output ``` ## Training Config (YAML) ```yaml base_model: Qwen/Qwen2.5-3B model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer trust_remote_code: true # Load in 4-bit (QLoRA) load_in_4bit: true adapter: qlora # qlora, lora, or omit for full fine-tune lora_r: 16 lora_alpha: 32 lora_dropout: 0.05 lora_target_linear: true # target all linear layers # Dataset datasets: - path: my_data.jsonl type: alpaca # alpaca, sharegpt, completion, etc. 
# OR custom format: # type: # field_instruction: instruction # field_input: input # field_output: output # format: "{instruction}\n{input}" # Training sequence_len: 2048 sample_packing: true # pack short sequences together pad_to_sequence_len: true num_epochs: 3 micro_batch_size: 4 gradient_accumulation_steps: 4 learning_rate: 2e-5 lr_scheduler: cosine warmup_ratio: 0.1 optimizer: paged_adamw_8bit # Precision bf16: auto # auto-detect GPU capability tf32: true # Memory optimization gradient_checkpointing: true flash_attention: true # use flash attention 2 # Logging & saving logging_steps: 10 save_strategy: steps save_steps: 500 save_total_limit: 3 output_dir: ./output # Evaluation val_set_size: 0.05 eval_steps: 500 # Weights & Biases (optional) wandb_project: my-project wandb_run_id: ``` ## Dataset Formats ```yaml # Alpaca format datasets: - path: data.jsonl type: alpaca # {"instruction": "...", "input": "...", "output": "..."} # ShareGPT format datasets: - path: data.jsonl type: sharegpt # {"conversations": [{"from": "human", "value": "..."}, {"from": "gpt", "value": "..."}]} # Completion (raw text) datasets: - path: data.jsonl type: completion # {"text": "full text here"} # HuggingFace dataset datasets: - path: tatsu-lab/alpaca type: alpaca ``` ## DPO Config ```yaml rl: dpo # enables DPO training # Dataset must have "chosen" and "rejected" fields datasets: - path: dpo_data.jsonl type: chat_template.default field_messages: chosen # for chosen responses # split into chosen/rejected pairs ``` ## Key Tips - `sample_packing: true` greatly improves throughput on short sequences - `flash_attention: true` reduces memory and speeds up attention (requires compatible GPU) - `lora_target_linear: true` is the easiest way to target all linear layers - `bf16: auto` auto-detects GPU capability - DeepSpeed integration: add `deepspeed: deepspeed_configs/zero2.json` - Multi-GPU: use `accelerate launch` with proper config ================================================ FILE: 
researchclaw/data/framework_docs/llamafactory.md ================================================ # LLaMA-Factory — API Quick Reference ## Installation ```bash pip install llamafactory # or git clone https://github.com/hiyouga/LLaMA-Factory.git cd LLaMA-Factory && pip install -e . ``` ## CLI Usage (Primary Interface) ```bash # Fine-tune with a YAML config llamafactory-cli train config.yaml # Chat with a fine-tuned model llamafactory-cli chat config.yaml # Export/merge LoRA adapter llamafactory-cli export config.yaml # Launch web UI llamafactory-cli webui ``` ## Training Config (YAML) ```yaml ### Model model_name_or_path: Qwen/Qwen2.5-3B trust_remote_code: true ### Method (LoRA) stage: sft # sft, pt (pretrain), rm, ppo, dpo, kto, orpo do_train: true finetuning_type: lora # lora, freeze, full lora_rank: 16 lora_alpha: 32 lora_dropout: 0.05 lora_target: all # "all" for all linear, or comma-separated names ### Dataset dataset: alpaca_en # registered dataset name or custom path template: qwen # chat template: qwen, llama3, mistral, chatglm, etc. 
cutoff_len: 1024 preprocessing_num_workers: 8 ### Training output_dir: ./output num_train_epochs: 3.0 per_device_train_batch_size: 4 gradient_accumulation_steps: 4 learning_rate: 2.0e-5 lr_scheduler_type: cosine warmup_ratio: 0.1 bf16: true gradient_checkpointing: true ### Evaluation val_size: 0.05 per_device_eval_batch_size: 8 eval_strategy: steps eval_steps: 500 ### Logging logging_steps: 10 save_steps: 500 save_total_limit: 3 report_to: none ### Quantization (QLoRA) quantization_bit: 4 # 4 or 8 quantization_method: bitsandbytes # bitsandbytes, gptq, awq ``` ## Custom Dataset Registration ```json // data/dataset_info.json — register your dataset { "my_dataset": { "file_name": "my_data.json", "formatting": "alpaca", "columns": { "prompt": "instruction", "query": "input", "response": "output" } } } ``` ### Dataset Formats ```json // Alpaca format {"instruction": "Translate to French", "input": "Hello", "output": "Bonjour"} // ShareGPT format {"conversations": [ {"from": "human", "value": "Hello"}, {"from": "gpt", "value": "Hi there!"} ]} ``` ## DPO Training Config ```yaml stage: dpo do_train: true model_name_or_path: Qwen/Qwen2.5-3B finetuning_type: lora dataset: dpo_en # needs chosen/rejected columns template: qwen dpo_beta: 0.1 dpo_loss: sigmoid # sigmoid, hinge, ipo ``` ## Export/Merge Config ```yaml model_name_or_path: Qwen/Qwen2.5-3B adapter_name_or_path: ./output/checkpoint-1000 template: qwen finetuning_type: lora export_dir: ./merged_model export_size: 2 # shard size in GB export_legacy_format: false ``` ## Key Tips - `template` must match the model's chat format (qwen, llama3, mistral, etc.) - `lora_target: all` targets all linear layers (recommended for quality) - Use `quantization_bit: 4` for QLoRA to fit large models on limited VRAM - `cutoff_len` controls max sequence length — reduce for memory savings - Always set `gradient_checkpointing: true` for models > 1B parameters - Check supported models: Qwen, LLaMA, Mistral, Phi, ChatGLM, Baichuan, Yi, etc. 
================================================ FILE: researchclaw/data/framework_docs/peft.md ================================================ # PEFT (Parameter-Efficient Fine-Tuning) — API Quick Reference ## Installation ```bash pip install peft ``` ## LoRA (Low-Rank Adaptation) ```python from peft import LoraConfig, get_peft_model, TaskType from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B", torch_dtype=torch.bfloat16) lora_config = LoraConfig( r=16, # rank (4, 8, 16, 32, 64) lora_alpha=32, # scaling factor (typically 2*r) lora_dropout=0.05, # dropout probability target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], task_type=TaskType.CAUSAL_LM, bias="none", # "none", "all", or "lora_only" ) model = get_peft_model(model, lora_config) model.print_trainable_parameters() # trainable params: 13,631,488 || all params: 3,098,746,880 || trainable%: 0.44% ``` ## QLoRA (Quantized LoRA) ```python from transformers import BitsAndBytesConfig import torch # 4-bit quantization config bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", # "nf4" or "fp4" bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True, # nested quantization ) model = AutoModelForCausalLM.from_pretrained( "Qwen/Qwen2.5-3B", quantization_config=bnb_config, device_map="auto", ) # Then apply LoRA on top from peft import prepare_model_for_kbit_training model = prepare_model_for_kbit_training(model) model = get_peft_model(model, lora_config) ``` ## Saving and Loading ```python # Save adapter only (small file) model.save_pretrained("./lora_adapter") # Load adapter from peft import PeftModel base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B") model = PeftModel.from_pretrained(base_model, "./lora_adapter") # Merge adapter into base model (for deployment) merged_model = model.merge_and_unload() merged_model.save_pretrained("./merged_model") ``` ## Common 
target_modules by Model Family | Model | target_modules | |-------|---------------| | LLaMA/Qwen/Mistral | q_proj, v_proj, k_proj, o_proj, gate_proj, up_proj, down_proj | | GPT-2/GPT-J | c_attn, c_proj, c_fc | | BLOOM | query_key_value, dense, dense_h_to_4h, dense_4h_to_h | | T5/Flan-T5 | q, v, k, o, wi, wo | | Phi | q_proj, v_proj, dense, fc1, fc2 | ## DoRA (Weight-Decomposed Low-Rank Adaptation) ```python lora_config = LoraConfig( r=16, lora_alpha=32, use_dora=True, # enables DoRA decomposition target_modules=["q_proj", "v_proj"], task_type=TaskType.CAUSAL_LM, ) ``` ## Key Tips - Higher `r` = more parameters = more capacity but slower training - `lora_alpha / r` is the effective scaling. Common: alpha=2*r - For QLoRA: always use `prepare_model_for_kbit_training()` before `get_peft_model()` - Target more modules (all linear layers) for better quality at marginal compute cost - Use `modules_to_save=["lm_head", "embed_tokens"]` if you want to train the head too ================================================ FILE: researchclaw/data/framework_docs/transformers_training.md ================================================ # HuggingFace Transformers Training — API Quick Reference ## TrainingArguments (key parameters) ```python from transformers import TrainingArguments args = TrainingArguments( output_dir="./output", # Training num_train_epochs=3, per_device_train_batch_size=8, per_device_eval_batch_size=16, gradient_accumulation_steps=4, # effective batch = 8 * 4 = 32 gradient_checkpointing=True, # saves memory at ~20% speed cost # Optimizer learning_rate=2e-5, weight_decay=0.01, warmup_ratio=0.1, # or warmup_steps=100 lr_scheduler_type="cosine", # "linear", "cosine", "constant" optim="adamw_torch", # "adamw_torch", "adamw_8bit", "paged_adamw_8bit" # Precision bf16=True, # bfloat16 (Ampere+ GPUs) # fp16=True, # float16 (older GPUs) # Logging logging_steps=10, logging_strategy="steps", report_to="none", # "wandb", "tensorboard", "none" # Saving save_strategy="epoch", 
save_total_limit=2, load_best_model_at_end=True, metric_for_best_model="eval_loss", # Evaluation eval_strategy="epoch", # Other dataloader_num_workers=4, remove_unused_columns=True, seed=42, max_grad_norm=1.0, ) ``` ## Trainer ```python from transformers import Trainer, AutoModelForSequenceClassification model = AutoModelForSequenceClassification.from_pretrained( "bert-base-uncased", num_labels=2 ) trainer = Trainer( model=model, args=args, train_dataset=train_ds, eval_dataset=eval_ds, tokenizer=tokenizer, compute_metrics=compute_metrics, # optional metric function data_collator=data_collator, # optional custom collator ) # Train train_result = trainer.train() # Evaluate metrics = trainer.evaluate() # Save trainer.save_model("./best_model") ``` ## Custom compute_metrics ```python import numpy as np from sklearn.metrics import accuracy_score, f1_score def compute_metrics(eval_pred): logits, labels = eval_pred predictions = np.argmax(logits, axis=-1) return { "accuracy": accuracy_score(labels, predictions), "f1": f1_score(labels, predictions, average="weighted"), } ``` ## Tokenization / Data Preparation ```python from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") def tokenize_function(examples): return tokenizer( examples["text"], padding="max_length", truncation=True, max_length=512, ) tokenized_ds = dataset.map(tokenize_function, batched=True) ``` ## Causal LM Training (GPT-style) ```python from transformers import AutoModelForCausalLM, DataCollatorForLanguageModeling model = AutoModelForCausalLM.from_pretrained("gpt2") data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=False, # causal LM, not masked LM ) trainer = Trainer( model=model, args=args, train_dataset=tokenized_ds, data_collator=data_collator, ) ``` ## Key Tips - `gradient_checkpointing=True` + `gradient_accumulation_steps` for memory efficiency - `optim="paged_adamw_8bit"` reduces optimizer memory by ~50% - `bf16=True` is preferred 
over `fp16=True` on Ampere+ GPUs (no loss scaling needed) - Set `TOKENIZERS_PARALLELISM=false` to avoid fork warnings - Use `model.config.use_cache = False` when using gradient_checkpointing ================================================ FILE: researchclaw/data/framework_docs/trl.md ================================================ # TRL (Transformer Reinforcement Learning) — API Quick Reference ## Installation ```bash pip install trl ``` ## SFTTrainer — Supervised Fine-Tuning ```python from trl import SFTTrainer, SFTConfig from transformers import AutoModelForCausalLM, AutoTokenizer from datasets import load_dataset model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B") tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B") dataset = load_dataset("json", data_files="train.jsonl", split="train") # SFTConfig inherits from TrainingArguments training_args = SFTConfig( output_dir="./output", num_train_epochs=3, per_device_train_batch_size=4, gradient_accumulation_steps=4, learning_rate=2e-5, max_seq_length=1024, logging_steps=10, save_strategy="epoch", bf16=True, gradient_checkpointing=True, # Dataset formatting dataset_text_field="text", # column name for text packing=True, # pack short samples for efficiency ) trainer = SFTTrainer( model=model, tokenizer=tokenizer, train_dataset=dataset, args=training_args, ) trainer.train() trainer.save_model("./final_model") ``` ### Dataset Format Options ```python # Option 1: "text" column (conversational or plain text) # {"text": "### Human: question\n### Assistant: answer"} # Option 2: "messages" column (chat format, auto-applies chat template) # {"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]} # Option 3: formatting_func callback def formatting_func(example): return f"### Question: {example['question']}\n### Answer: {example['answer']}" trainer = SFTTrainer( ..., formatting_func=formatting_func, ) ``` ## DPOTrainer — Direct Preference Optimization ```python from trl 
import DPOTrainer, DPOConfig training_args = DPOConfig( output_dir="./dpo_output", num_train_epochs=1, per_device_train_batch_size=2, gradient_accumulation_steps=8, learning_rate=5e-7, beta=0.1, # KL penalty coefficient max_length=1024, max_prompt_length=512, bf16=True, gradient_checkpointing=True, loss_type="sigmoid", # "sigmoid" (default), "hinge", "ipo" ) # Dataset must have columns: "prompt", "chosen", "rejected" # OR "chosen" and "rejected" as full conversations trainer = DPOTrainer( model=model, ref_model=None, # None = use implicit reference (PEFT) train_dataset=dpo_dataset, tokenizer=tokenizer, args=training_args, ) trainer.train() ``` ## GRPOTrainer — Group Relative Policy Optimization ```python from trl import GRPOTrainer, GRPOConfig training_args = GRPOConfig( output_dir="./grpo_output", num_train_epochs=1, per_device_train_batch_size=2, learning_rate=1e-6, num_generations=4, # samples per prompt for group scoring max_completion_length=256, bf16=True, ) # Requires a reward function def reward_fn(completions, prompts): # Return list of float scores return [score_completion(c) for c in completions] trainer = GRPOTrainer( model=model, reward_funcs=reward_fn, train_dataset=dataset, # needs "prompt" column tokenizer=tokenizer, args=training_args, ) trainer.train() ``` ## PPOTrainer — Proximal Policy Optimization for RLHF ```python from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead config = PPOConfig( model_name="Qwen/Qwen2.5-3B", learning_rate=1.41e-5, batch_size=16, mini_batch_size=4, ppo_epochs=4, gradient_accumulation_steps=1, ) model = AutoModelForCausalLMWithValueHead.from_pretrained("Qwen/Qwen2.5-3B") tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B") trainer = PPOTrainer( config=config, model=model, tokenizer=tokenizer, dataset=dataset, ) # Training loop for batch in trainer.dataloader: query_tensors = batch["input_ids"] response_tensors = trainer.generate(query_tensors, max_new_tokens=128) rewards = [reward_model(q, r) 
for q, r in zip(query_tensors, response_tensors)] stats = trainer.step(query_tensors, response_tensors, rewards) ``` ## Integration with PEFT/LoRA ```python from peft import LoraConfig peft_config = LoraConfig( r=16, lora_alpha=32, lora_dropout=0.05, target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], task_type="CAUSAL_LM", ) # Pass peft_config to any TRL trainer trainer = SFTTrainer( model=model, peft_config=peft_config, # automatically wraps model with LoRA ..., ) ``` ## Key Tips - Always set `tokenizer.pad_token = tokenizer.eos_token` if pad_token is None - Use `gradient_checkpointing=True` for memory efficiency - Use `bf16=True` on Ampere+ GPUs (A100, RTX 3090+, RTX 4090, RTX 6000 Ada) - For multi-GPU: TRL uses accelerate under the hood, just launch with `accelerate launch` - `packing=True` in SFTConfig significantly speeds up training on short samples ================================================ FILE: researchclaw/data/seminal_papers.yaml ================================================ # Foundational ML papers that should be cited when discussing specific topics. # Indexed by keyword/technique for automatic injection into literature candidates. # Format: each entry has title, authors, year, venue, cite_key, and keywords. papers: # --- Normalization --- - title: "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift" authors: "Ioffe and Szegedy" year: 2015 venue: "ICML" cite_key: "ioffe2015batch" keywords: ["batch normalization", "normalization", "BN", "internal covariate shift"] - title: "Layer Normalization" authors: "Ba et al." 
year: 2016 venue: "arXiv" cite_key: "ba2016layer" keywords: ["layer normalization", "normalization", "LN"] - title: "Group Normalization" authors: "Wu and He" year: 2018 venue: "ECCV" cite_key: "wu2018group" keywords: ["group normalization", "normalization", "GN"] - title: "Instance Normalization: The Missing Ingredient for Fast Stylization" authors: "Ulyanov et al." year: 2016 venue: "arXiv" cite_key: "ulyanov2016instance" keywords: ["instance normalization", "normalization", "style transfer"] # --- Residual Networks --- - title: "Deep Residual Learning for Image Recognition" authors: "He et al." year: 2016 venue: "CVPR" cite_key: "he2016deep" keywords: ["ResNet", "residual network", "residual learning", "skip connection", "deep learning"] - title: "Identity Mappings in Deep Residual Networks" authors: "He et al." year: 2016 venue: "ECCV" cite_key: "he2016identity" keywords: ["ResNet", "residual network", "pre-activation"] # --- Transformers / Attention --- - title: "Attention Is All You Need" authors: "Vaswani et al." year: 2017 venue: "NeurIPS" cite_key: "vaswani2017attention" keywords: ["transformer", "attention", "self-attention", "multi-head attention"] - title: "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" authors: "Dosovitskiy et al." year: 2021 venue: "ICLR" cite_key: "dosovitskiy2021image" keywords: ["ViT", "vision transformer", "image transformer"] - title: "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding" authors: "Devlin et al." 
year: 2019 venue: "NAACL" cite_key: "devlin2019bert" keywords: ["BERT", "pre-training", "language model", "NLP", "transformer"] # --- Optimization --- - title: "Adam: A Method for Stochastic Optimization" authors: "Kingma and Ba" year: 2015 venue: "ICLR" cite_key: "kingma2015adam" keywords: ["Adam", "optimizer", "stochastic optimization", "adaptive learning rate"] - title: "Decoupled Weight Decay Regularization" authors: "Loshchilov and Hutter" year: 2019 venue: "ICLR" cite_key: "loshchilov2019decoupled" keywords: ["AdamW", "weight decay", "optimizer", "regularization"] - title: "SGD with Momentum" authors: "Sutskever et al." year: 2013 venue: "ICML" cite_key: "sutskever2013importance" keywords: ["SGD", "momentum", "optimization", "learning rate"] # --- Regularization --- - title: "Dropout: A Simple Way to Prevent Neural Networks from Overfitting" authors: "Srivastava et al." year: 2014 venue: "JMLR" cite_key: "srivastava2014dropout" keywords: ["dropout", "regularization", "overfitting"] # --- Distribution Shift / Domain Adaptation --- - title: "Benchmarking Neural Network Robustness to Common Corruptions and Perturbations" authors: "Hendrycks and Dietterich" year: 2019 venue: "ICLR" cite_key: "hendrycks2019benchmarking" keywords: ["distribution shift", "corruption", "robustness", "CIFAR-C", "ImageNet-C"] - title: "Domain Adaptation for Object Recognition: An Unsupervised Approach" authors: "Saenko et al." year: 2010 venue: "ECCV" cite_key: "saenko2010adapting" keywords: ["domain adaptation", "transfer learning", "distribution shift"] - title: "In Search of Lost Domain Generalization" authors: "Gulrajani and Lopez-Paz" year: 2021 venue: "ICLR" cite_key: "gulrajani2021search" keywords: ["domain generalization", "distribution shift", "benchmark"] # --- Reinforcement Learning --- - title: "Proximal Policy Optimization Algorithms" authors: "Schulman et al." 
year: 2017 venue: "arXiv" cite_key: "schulman2017proximal" keywords: ["PPO", "policy gradient", "reinforcement learning", "RL"] - title: "Playing Atari with Deep Reinforcement Learning" authors: "Mnih et al." year: 2013 venue: "NIPS Workshop" cite_key: "mnih2013playing" keywords: ["DQN", "deep reinforcement learning", "Atari", "RL"] - title: "Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor" authors: "Haarnoja et al." year: 2018 venue: "ICML" cite_key: "haarnoja2018soft" keywords: ["SAC", "soft actor-critic", "entropy", "RL", "continuous control"] # --- Generative Models --- - title: "Generative Adversarial Nets" authors: "Goodfellow et al." year: 2014 venue: "NeurIPS" cite_key: "goodfellow2014generative" keywords: ["GAN", "generative adversarial", "generative model"] - title: "Denoising Diffusion Probabilistic Models" authors: "Ho et al." year: 2020 venue: "NeurIPS" cite_key: "ho2020denoising" keywords: ["diffusion model", "denoising", "generative model", "DDPM"] - title: "Auto-Encoding Variational Bayes" authors: "Kingma and Welling" year: 2014 venue: "ICLR" cite_key: "kingma2014auto" keywords: ["VAE", "variational autoencoder", "generative model", "latent variable"] # --- Graph Neural Networks --- - title: "Semi-Supervised Classification with Graph Convolutional Networks" authors: "Kipf and Welling" year: 2017 venue: "ICLR" cite_key: "kipf2017semi" keywords: ["GCN", "graph neural network", "graph convolution", "GNN"] - title: "Neural Ordinary Differential Equations" authors: "Chen et al." year: 2018 venue: "NeurIPS" cite_key: "chen2018neural" keywords: ["neural ODE", "ODE", "differential equation", "continuous depth"] # --- Continual Learning --- - title: "Overcoming Catastrophic Forgetting in Neural Networks" authors: "Kirkpatrick et al." 
year: 2017 venue: "PNAS" cite_key: "kirkpatrick2017overcoming" keywords: ["EWC", "elastic weight consolidation", "continual learning", "catastrophic forgetting"] - title: "Continual Lifelong Learning with Neural Networks: A Review" authors: "Parisi et al." year: 2019 venue: "Neural Networks" cite_key: "parisi2019continual" keywords: ["continual learning", "lifelong learning", "catastrophic forgetting"] # --- Meta-Learning --- - title: "Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks" authors: "Finn et al." year: 2017 venue: "ICML" cite_key: "finn2017model" keywords: ["MAML", "meta-learning", "few-shot", "fast adaptation"] # --- Contrastive / Self-supervised --- - title: "A Simple Framework for Contrastive Learning of Visual Representations" authors: "Chen et al." year: 2020 venue: "ICML" cite_key: "chen2020simple" keywords: ["SimCLR", "contrastive learning", "self-supervised", "representation learning"] # --- Multi-agent Systems --- - title: "Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments" authors: "Lowe et al." year: 2017 venue: "NeurIPS" cite_key: "lowe2017multi" keywords: ["multi-agent", "MADDPG", "cooperative", "competitive", "multi-agent RL"] # --- Bayesian Optimization --- - title: "A Tutorial on Bayesian Optimization" authors: "Frazier" year: 2018 venue: "arXiv" cite_key: "frazier2018tutorial" keywords: ["Bayesian optimization", "surrogate model", "acquisition function", "hyperparameter tuning"] ================================================ FILE: researchclaw/docker/Dockerfile ================================================ # ResearchClaw experiment sandbox — GPU-enabled, isolated environment. 
# Build: docker build -t researchclaw/experiment:latest researchclaw/docker/ FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 ENV DEBIAN_FRONTEND=noninteractive ENV PYTHONUNBUFFERED=1 ENV PIP_NO_CACHE_DIR=1 # System packages: Python 3.11, build tools, common native deps RUN apt-get update && apt-get install -y --no-install-recommends \ python3.11 python3.11-dev python3.11-venv python3-pip \ gcc g++ cmake make gfortran \ libopenblas-dev liblapack-dev \ git curl ca-certificates \ iptables iproute2 \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 \ && python3 -m pip install --upgrade pip setuptools wheel \ && apt-get clean && rm -rf /var/lib/apt/lists/* # Core ML/scientific Python stack RUN python3 -m pip install \ torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 \ && python3 -m pip install \ numpy scipy scikit-learn pandas matplotlib seaborn \ tqdm gymnasium networkx torchdiffeq PyYAML \ && python3 -m pip install "gymnasium[mujoco]" "gymnasium[box2d]" mujoco # Extended ML ecosystem — vision, metrics, utilities RUN python3 -m pip install \ timm einops torchmetrics Pillow \ h5py tensorboard albumentations kornia # LLM / Transformer training stack # NOTE: version specifiers must be quoted — unquoted ">=X" is parsed by the # shell as an output redirection, silently installing unpinned versions. RUN python3 -m pip install \ "transformers>=4.46.0" \ "datasets>=3.0.0" \ "accelerate>=1.0.0" \ "peft>=0.13.0" \ "trl>=0.12.0" \ "bitsandbytes>=0.44.0" \ sentencepiece protobuf tokenizers safetensors \ evaluate rouge-score # Pre-cache standard datasets for offline use RUN mkdir -p /opt/datasets && \ python3 -c "import torchvision; torchvision.datasets.CIFAR10(root='/opt/datasets', train=True, download=True); torchvision.datasets.CIFAR10(root='/opt/datasets', train=False, download=True)" && \ python3 -c "import torchvision; torchvision.datasets.CIFAR100(root='/opt/datasets', train=True, download=True); torchvision.datasets.CIFAR100(root='/opt/datasets', train=False, download=True)" && \ python3 -c "import torchvision; torchvision.datasets.FashionMNIST(root='/opt/datasets',
train=True, download=True); torchvision.datasets.FashionMNIST(root='/opt/datasets', train=False, download=True)" && \ python3 -c "import torchvision; torchvision.datasets.MNIST(root='/opt/datasets', train=True, download=True); torchvision.datasets.MNIST(root='/opt/datasets', train=False, download=True)" && \ python3 -c "import torchvision; torchvision.datasets.STL10(root='/opt/datasets', split='train', download=True); torchvision.datasets.STL10(root='/opt/datasets', split='test', download=True)" && \ python3 -c "import torchvision; torchvision.datasets.SVHN(root='/opt/datasets', split='train', download=True); torchvision.datasets.SVHN(root='/opt/datasets', split='test', download=True)" && \ chmod -R a+r /opt/datasets # Non-root user for security RUN groupadd -r researcher && useradd -r -g researcher -m -s /bin/bash researcher WORKDIR /workspace RUN chown researcher:researcher /workspace # Entrypoint script — three-phase execution (pip install → setup → experiment) COPY entrypoint.sh /usr/local/bin/rc-entrypoint.sh RUN chmod +x /usr/local/bin/rc-entrypoint.sh USER researcher # Default: run entrypoint wrapper with main.py as the experiment script ENTRYPOINT ["/usr/local/bin/rc-entrypoint.sh"] CMD ["main.py"] ================================================ FILE: researchclaw/docker/Dockerfile.biology ================================================ # ResearchClaw Biology sandbox — bioinformatics, single-cell analysis. 
# Build: docker build -f Dockerfile.biology -t researchclaw/sandbox-biology:latest researchclaw/docker/ FROM python:3.11-slim ENV PYTHONUNBUFFERED=1 ENV PIP_NO_CACHE_DIR=1 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc g++ \ libhdf5-dev \ && apt-get clean && rm -rf /var/lib/apt/lists/* RUN pip install --upgrade pip setuptools wheel && \ pip install \ numpy scipy matplotlib pandas \ scikit-learn tqdm PyYAML h5py # Biology packages RUN pip install \ scanpy anndata leidenalg \ biopython # Non-root user RUN groupadd -r researcher && useradd -r -g researcher -m -s /bin/bash researcher WORKDIR /workspace RUN chown researcher:researcher /workspace COPY entrypoint.sh /usr/local/bin/rc-entrypoint.sh RUN chmod +x /usr/local/bin/rc-entrypoint.sh USER researcher ENTRYPOINT ["/usr/local/bin/rc-entrypoint.sh"] CMD ["main.py"] ================================================ FILE: researchclaw/docker/Dockerfile.chemistry ================================================ # ResearchClaw Chemistry sandbox — quantum chemistry, molecular property. 
# Build: docker build -f Dockerfile.chemistry -t researchclaw/sandbox-chemistry:latest researchclaw/docker/ FROM python:3.11-slim ENV PYTHONUNBUFFERED=1 ENV PIP_NO_CACHE_DIR=1 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc g++ gfortran cmake \ libopenblas-dev liblapack-dev \ && apt-get clean && rm -rf /var/lib/apt/lists/* RUN pip install --upgrade pip setuptools wheel && \ pip install \ numpy scipy matplotlib pandas \ tqdm PyYAML # Chemistry packages — pyscf is mandatory (its failure must fail the build); rdkit is best-effort: prefer the official "rdkit" wheel, fall back to the deprecated "rdkit-pypi" name RUN pip install pyscf RUN pip install rdkit || pip install rdkit-pypi || true # Non-root user RUN groupadd -r researcher && useradd -r -g researcher -m -s /bin/bash researcher WORKDIR /workspace RUN chown researcher:researcher /workspace COPY entrypoint.sh /usr/local/bin/rc-entrypoint.sh RUN chmod +x /usr/local/bin/rc-entrypoint.sh USER researcher ENTRYPOINT ["/usr/local/bin/rc-entrypoint.sh"] CMD ["main.py"] ================================================ FILE: researchclaw/docker/Dockerfile.economics ================================================ # ResearchClaw Economics sandbox — econometrics, regression analysis. 
# Build: docker build -f Dockerfile.economics -t researchclaw/sandbox-economics:latest researchclaw/docker/ FROM python:3.11-slim ENV PYTHONUNBUFFERED=1 ENV PIP_NO_CACHE_DIR=1 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc g++ \ && apt-get clean && rm -rf /var/lib/apt/lists/* RUN pip install --upgrade pip setuptools wheel && \ pip install \ numpy scipy matplotlib pandas seaborn \ statsmodels linearmodels \ tqdm PyYAML # Non-root user RUN groupadd -r researcher && useradd -r -g researcher -m -s /bin/bash researcher WORKDIR /workspace RUN chown researcher:researcher /workspace COPY entrypoint.sh /usr/local/bin/rc-entrypoint.sh RUN chmod +x /usr/local/bin/rc-entrypoint.sh USER researcher ENTRYPOINT ["/usr/local/bin/rc-entrypoint.sh"] CMD ["main.py"] ================================================ FILE: researchclaw/docker/Dockerfile.generic ================================================ # ResearchClaw Generic sandbox — lightweight Python scientific stack. # Build: docker build -f Dockerfile.generic -t researchclaw/sandbox-generic:latest researchclaw/docker/ FROM python:3.11-slim ENV PYTHONUNBUFFERED=1 ENV PIP_NO_CACHE_DIR=1 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc g++ \ && apt-get clean && rm -rf /var/lib/apt/lists/* RUN pip install --upgrade pip setuptools wheel && \ pip install \ numpy scipy matplotlib pandas \ scikit-learn tqdm PyYAML seaborn # Non-root user RUN groupadd -r researcher && useradd -r -g researcher -m -s /bin/bash researcher WORKDIR /workspace RUN chown researcher:researcher /workspace COPY entrypoint.sh /usr/local/bin/rc-entrypoint.sh RUN chmod +x /usr/local/bin/rc-entrypoint.sh USER researcher ENTRYPOINT ["/usr/local/bin/rc-entrypoint.sh"] CMD ["main.py"] ================================================ FILE: researchclaw/docker/Dockerfile.math ================================================ # ResearchClaw Math sandbox — numerical methods, optimization. 
# Build: docker build -f Dockerfile.math -t researchclaw/sandbox-math:latest researchclaw/docker/ FROM python:3.11-slim ENV PYTHONUNBUFFERED=1 ENV PIP_NO_CACHE_DIR=1 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc g++ gfortran \ libopenblas-dev liblapack-dev \ && apt-get clean && rm -rf /var/lib/apt/lists/* RUN pip install --upgrade pip setuptools wheel && \ pip install \ numpy scipy matplotlib pandas \ sympy findiff tqdm PyYAML # Non-root user RUN groupadd -r researcher && useradd -r -g researcher -m -s /bin/bash researcher WORKDIR /workspace RUN chown researcher:researcher /workspace COPY entrypoint.sh /usr/local/bin/rc-entrypoint.sh RUN chmod +x /usr/local/bin/rc-entrypoint.sh USER researcher ENTRYPOINT ["/usr/local/bin/rc-entrypoint.sh"] CMD ["main.py"] ================================================ FILE: researchclaw/docker/Dockerfile.physics ================================================ # ResearchClaw Physics sandbox — CPU-focused, numerical computing. # Build: docker build -f Dockerfile.physics -t researchclaw/sandbox-physics:latest researchclaw/docker/ FROM python:3.11-slim ENV PYTHONUNBUFFERED=1 ENV PIP_NO_CACHE_DIR=1 RUN apt-get update && apt-get install -y --no-install-recommends \ gcc g++ gfortran cmake make \ libopenblas-dev liblapack-dev \ && apt-get clean && rm -rf /var/lib/apt/lists/* # Core scientific Python stack RUN pip install --upgrade pip setuptools wheel && \ pip install \ numpy scipy matplotlib pandas seaborn \ tqdm PyYAML h5py # Physics-specific packages RUN pip install \ findiff \ sympy # JAX (CPU only for physics sandbox) RUN pip install \ jax jaxlib # Non-root user RUN groupadd -r researcher && useradd -r -g researcher -m -s /bin/bash researcher WORKDIR /workspace RUN chown researcher:researcher /workspace COPY entrypoint.sh /usr/local/bin/rc-entrypoint.sh RUN chmod +x /usr/local/bin/rc-entrypoint.sh USER researcher ENTRYPOINT ["/usr/local/bin/rc-entrypoint.sh"] CMD ["main.py"] 
================================================ FILE: researchclaw/docker/entrypoint.sh ================================================ #!/bin/bash # ResearchClaw experiment entrypoint — unified three-phase execution. # # Phase 0: pip install from requirements.txt (if present) # Phase 1: Run setup.py for dataset downloads / preparation (if present) # Phase 2: Run the main experiment script # # Environment variables: # RC_SETUP_ONLY_NETWORK=1 — disable network after Phase 1 (iptables/route) # RC_ENTRY_POINT — override entry point (default: first CLI arg or main.py) # pipefail: without it, a failed "pip install | tail" is silently ignored set -eo pipefail WORKSPACE="/workspace" ENTRY_POINT="${RC_ENTRY_POINT:-${1:-main.py}}" # ---------------------------------------------------------------- # Phase 0: Install additional pip packages # ---------------------------------------------------------------- if [ -f "$WORKSPACE/requirements.txt" ]; then echo "[RC] Phase 0: Installing packages from requirements.txt..." pip install --no-cache-dir --break-system-packages \ -r "$WORKSPACE/requirements.txt" 2>&1 | tail -20 echo "[RC] Phase 0: Package installation complete." fi # ---------------------------------------------------------------- # Phase 1: Run setup script (dataset download / preparation) # ---------------------------------------------------------------- if [ -f "$WORKSPACE/setup.py" ]; then echo "[RC] Phase 1: Running setup.py (dataset download/preparation)..." python3 -u "$WORKSPACE/setup.py" echo "[RC] Phase 1: Setup complete." fi # ---------------------------------------------------------------- # Network cutoff (if setup_only policy) # ---------------------------------------------------------------- if [ "${RC_SETUP_ONLY_NETWORK:-0}" = "1" ]; then echo "[RC] Disabling network for experiment phase..." # Try iptables first (requires NET_ADMIN capability) if iptables -A OUTPUT -j DROP 2>/dev/null; then echo "[RC] Network disabled via iptables." elif ip route del default 2>/dev/null; then echo "[RC] Network disabled via route removal." 
else echo "[RC] Warning: Could not disable network (no NET_ADMIN cap or ip route). Continuing with network." fi fi # ---------------------------------------------------------------- # Phase 2: Run experiment # ---------------------------------------------------------------- echo "[RC] Phase 2: Running experiment ($ENTRY_POINT)..." exec python3 -u "$WORKSPACE/$ENTRY_POINT" ================================================ FILE: researchclaw/domains/__init__.py ================================================ """Universal cross-domain research code generation framework. This package provides domain detection, prompt adaptation, and experiment schema generalization so the pipeline can generate code for any computational research domain — not just ML/AI. """ from researchclaw.domains.detector import DomainProfile, detect_domain __all__ = ["DomainProfile", "detect_domain"] ================================================ FILE: researchclaw/domains/adapters/__init__.py ================================================ """Domain-specific prompt adapters. Each adapter customizes prompt blocks for a specific research domain while the ML adapter preserves existing behavior unchanged. 
""" from researchclaw.domains.adapters.ml import MLPromptAdapter from researchclaw.domains.adapters.generic import GenericPromptAdapter from researchclaw.domains.adapters.physics import PhysicsPromptAdapter from researchclaw.domains.adapters.economics import EconomicsPromptAdapter from researchclaw.domains.adapters.biology import BiologyPromptAdapter from researchclaw.domains.adapters.chemistry import ChemistryPromptAdapter from researchclaw.domains.adapters.neuroscience import NeurosciencePromptAdapter from researchclaw.domains.adapters.robotics import RoboticsPromptAdapter __all__ = [ "MLPromptAdapter", "GenericPromptAdapter", "PhysicsPromptAdapter", "EconomicsPromptAdapter", "BiologyPromptAdapter", "ChemistryPromptAdapter", "NeurosciencePromptAdapter", "RoboticsPromptAdapter", ] ================================================ FILE: researchclaw/domains/adapters/biology.py ================================================ """Biology domain prompt adapter. Provides domain-specific prompt blocks for bioinformatics experiments (single-cell analysis, genomics, protein science). 
""" from __future__ import annotations from typing import Any from researchclaw.domains.prompt_adapter import PromptAdapter, PromptBlocks class BiologyPromptAdapter(PromptAdapter): """Adapter for biology/bioinformatics domains.""" def get_code_generation_blocks(self, context: dict[str, Any]) -> PromptBlocks: domain = self.domain return PromptBlocks( compute_budget=domain.compute_budget_guidance or ( "Bioinformatics analyses can be memory-intensive:\n" "- Use small/subsampled datasets for testing\n" "- Single-cell: cap at 5000 cells for benchmarks\n" "- Genomics: use small chromosomes/regions" ), dataset_guidance=domain.dataset_guidance or ( "Generate synthetic biological data in code:\n" "- Single-cell: use scanpy.datasets or simulate with splatter\n" "- Genomics: generate synthetic sequences\n" "- Do NOT download external datasets" ), hp_reporting=domain.hp_reporting_guidance or ( "Report analysis parameters:\n" "HYPERPARAMETERS: {'n_cells': ..., 'n_genes': ..., " "'n_hvg': ..., 'n_pcs': ..., 'resolution': ...}" ), code_generation_hints=domain.code_generation_hints or self._default_hints(), output_format_guidance=( "Output results to results.json:\n" '{"conditions": {"method": {"ARI": 0.85, "NMI": 0.82}},\n' ' "metadata": {"domain": "biology_singlecell"}}' ), ) def get_experiment_design_blocks(self, context: dict[str, Any]) -> PromptBlocks: domain = self.domain design_context = ( f"This is a **{domain.display_name}** experiment.\n\n" "Key principles:\n" "1. Proper preprocessing is critical (QC, normalization)\n" "2. Use standard evaluation metrics (ARI, NMI for clustering)\n" "3. Compare against established methods in the field\n" "4. Include sensitivity analysis for key parameters\n" ) return PromptBlocks( experiment_design_context=design_context, statistical_test_guidance=( "Use Wilcoxon rank-sum test with FDR correction " "for differential expression. Use ARI/NMI for clustering." 
), ) def get_result_analysis_blocks(self, context: dict[str, Any]) -> PromptBlocks: return PromptBlocks( result_analysis_hints=( "Biology result analysis:\n" "- Clustering: ARI, NMI, silhouette score\n" "- DE: number of DEGs at FDR < 0.05\n" "- Trajectory: pseudotime correlation\n" "- Report runtime alongside quality metrics" ), ) def _default_hints(self) -> str: return ( "Bioinformatics code requirements:\n" "1. Use scanpy for single-cell analysis\n" "2. Standard pipeline: load → QC → normalize → log1p → HVG → PCA → neighbors\n" "3. Compare clustering methods (Leiden, Louvain, K-means)\n" "4. Evaluate with ARI against known cell types\n" "5. Output results to results.json\n" ) ================================================ FILE: researchclaw/domains/adapters/chemistry.py ================================================ """Chemistry domain prompt adapter.""" from __future__ import annotations from typing import Any from researchclaw.domains.prompt_adapter import PromptAdapter, PromptBlocks class ChemistryPromptAdapter(PromptAdapter): """Adapter for chemistry domains (quantum chemistry, molecular property).""" def get_code_generation_blocks(self, context: dict[str, Any]) -> PromptBlocks: domain = self.domain return PromptBlocks( compute_budget=domain.compute_budget_guidance or ( "Quantum chemistry calculations can be slow:\n" "- Use small basis sets for testing (STO-3G)\n" "- Limit molecule size (< 20 atoms)\n" "- DFT is faster than post-HF methods" ), dataset_guidance=domain.dataset_guidance or ( "Define molecular systems in code:\n" "- Atomic coordinates and basis sets\n" "- Standard test molecules (H2, H2O, CH4)\n" "- Do NOT download external datasets" ), code_generation_hints=domain.code_generation_hints or self._default_hints(), output_format_guidance=( "Output results to results.json:\n" '{"conditions": {"method": {"energy_hartree": -1.13, "error_kcal_mol": 0.5}},\n' ' "metadata": {"domain": "chemistry_qm"}}' ), ) def get_experiment_design_blocks(self, 
context: dict[str, Any]) -> PromptBlocks: domain = self.domain design_context = ( f"This is a **{domain.display_name}** experiment.\n\n" "Key principles:\n" "1. Use well-defined molecular test sets\n" "2. Compare against high-level reference (CCSD(T) or experimental)\n" "3. Report energies in Hartree, errors in kcal/mol\n" "4. Vary basis set for convergence if applicable\n" ) return PromptBlocks( experiment_design_context=design_context, statistical_test_guidance="Use MAE, RMSE, max error for method comparison.", ) def get_result_analysis_blocks(self, context: dict[str, Any]) -> PromptBlocks: return PromptBlocks( result_analysis_hints=( "Chemistry result analysis:\n" "- Report MAE, RMSE in kcal/mol against reference\n" "- 'Chemical accuracy' = MAE < 1 kcal/mol\n" "- Compare computation time vs accuracy trade-off" ), ) def _default_hints(self) -> str: if self.domain.domain_id == "chemistry_qm": return ( "Quantum chemistry code with PySCF:\n" "1. mol = gto.M(atom='...', basis='sto-3g')\n" "2. mf = scf.RHF(mol); mf.kernel()\n" "3. For post-HF: mp2 = mp.MP2(mf); mp2.kernel()\n" "4. Compare methods on same molecule set\n" "5. Energy conversion: 1 Ha = 627.509 kcal/mol\n" "6. Output results.json\n" ) return ( "Molecular property prediction:\n" "1. Define molecules via SMILES strings\n" "2. Use RDKit for featurization\n" "3. Train/test split on molecular data\n" "4. Compare ML models (RF, XGBoost, GCN)\n" "5. Output results.json with MAE, RMSE\n" ) ================================================ FILE: researchclaw/domains/adapters/economics.py ================================================ """Economics domain prompt adapter. Provides domain-specific prompt blocks for empirical economics experiments (regression analysis, causal inference, panel data). 
""" from __future__ import annotations from typing import Any from researchclaw.domains.prompt_adapter import PromptAdapter, PromptBlocks class EconomicsPromptAdapter(PromptAdapter): """Adapter for economics domains.""" def get_code_generation_blocks(self, context: dict[str, Any]) -> PromptBlocks: domain = self.domain return PromptBlocks( compute_budget=domain.compute_budget_guidance or ( "Economics regressions are fast. Focus on:\n" "- Multiple specifications (4-6 columns)\n" "- Bootstrap SE if needed (100-500 reps)\n" "- Cluster-robust SE for panel data" ), dataset_guidance=domain.dataset_guidance or ( "Generate synthetic data with known treatment effect (DGP):\n" "- Include treatment, outcome, controls, fixed effects\n" "- Simulate realistic correlations and confounders\n" "- Do NOT download external datasets" ), hp_reporting=domain.hp_reporting_guidance or ( "Report specification details:\n" "HYPERPARAMETERS: {'n_obs': ..., 'n_controls': ..., " "'true_effect': ..., 'fe_groups': ..., 'cluster_var': ...}" ), code_generation_hints=domain.code_generation_hints or self._default_hints(), output_format_guidance=self._output_format(), ) def get_experiment_design_blocks(self, context: dict[str, Any]) -> PromptBlocks: domain = self.domain design_context = ( f"This is an **{domain.display_name}** experiment.\n" f"Paradigm: progressive specification\n\n" "Key principles for economics experiments:\n" "1. Start simple (OLS), add complexity progressively\n" "2. Report each specification as a column in a regression table\n" "3. Use robust/clustered standard errors\n" "4. Include at least one robustness check\n" "5. Data should be generated with a known DGP for validation\n" ) return PromptBlocks( experiment_design_context=design_context, statistical_test_guidance=( "Use Hausman test for FE vs RE choice, " "F-test for joint significance, " "robust/clustered SE for inference." 
), ) def get_result_analysis_blocks(self, context: dict[str, Any]) -> PromptBlocks: return PromptBlocks( result_analysis_hints=( "Economics result analysis:\n" "- Compare coefficient estimates across specifications\n" "- Check if treatment effect is robust to controls/FE\n" "- Report significance levels (*/**/***)\n" "- Discuss economic magnitude, not just statistical significance" ), statistical_test_guidance=( "Use Hausman test, robust SE, cluster SE. " "Report R², N, F-statistic for each specification." ), ) def _default_hints(self) -> str: return ( "Economics code requirements:\n" "1. Generate synthetic data with statsmodels/numpy\n" "2. Implement progressive specifications:\n" " - Spec 1: Simple OLS (Y ~ treatment)\n" " - Spec 2: OLS + controls (Y ~ treatment + X1 + X2)\n" " - Spec 3: Fixed effects (Y ~ treatment + X + entity FE)\n" " - Spec 4: IV / 2SLS if applicable\n" "3. Use robust/clustered standard errors\n" "4. Output regression table to results.json\n" "5. Use linearmodels for panel FE, statsmodels for OLS/IV\n" ) def _output_format(self) -> str: return ( "Output regression table to results.json:\n" '{"regression_table": {\n' ' "spec_1_ols": {"coeff": 0.15, "se": 0.03, "p": 0.001, "n": 5000, "r2": 0.12},\n' ' "spec_2_controls": {"coeff": 0.12, "se": 0.02, "p": 0.001, "n": 5000, "r2": 0.25}\n' '},\n' ' "metadata": {"domain": "economics_empirical", "total_runtime_sec": ...}}' ) ================================================ FILE: researchclaw/domains/adapters/generic.py ================================================ """Generic domain adapter — fallback for unknown/new domains. Re-exports GenericPromptAdapter from prompt_adapter.py so that the adapters package has a consistent interface. 
""" from researchclaw.domains.prompt_adapter import GenericPromptAdapter __all__ = ["GenericPromptAdapter"] ================================================ FILE: researchclaw/domains/adapters/math.py ================================================ """Mathematics domain prompt adapter.""" from __future__ import annotations from typing import Any from researchclaw.domains.prompt_adapter import PromptAdapter, PromptBlocks class MathPromptAdapter(PromptAdapter): """Adapter for numerical mathematics and optimization domains.""" def get_code_generation_blocks(self, context: dict[str, Any]) -> PromptBlocks: domain = self.domain paradigm = domain.experiment_paradigm return PromptBlocks( compute_budget=domain.compute_budget_guidance or ( "Numerical methods are typically fast.\n" "Use 5-8 refinement levels for convergence plots.\n" "Step sizes: geometric sequence (h, h/2, h/4, ...)" ), dataset_guidance=domain.dataset_guidance or ( "Use standard test problems with known solutions:\n" "- ODE: Lotka-Volterra, Van der Pol, stiff systems\n" "- Quadrature: smooth, oscillatory, singular integrands\n" "- Linear algebra: Hilbert matrix, tridiagonal\n" "- Do NOT download external datasets" ), code_generation_hints=domain.code_generation_hints or self._hints(paradigm), output_format_guidance=self._output_format(paradigm), ) def get_experiment_design_blocks(self, context: dict[str, Any]) -> PromptBlocks: return PromptBlocks( experiment_design_context=( f"This is a **{self.domain.display_name}** experiment.\n" "Focus on:\n" "1. Correctness (verify against known solutions)\n" "2. Convergence order (expected vs observed)\n" "3. 
Efficiency (operations count, wall time)\n" ), statistical_test_guidance="Use convergence order fitting for accuracy analysis.", ) def get_result_analysis_blocks(self, context: dict[str, Any]) -> PromptBlocks: return PromptBlocks( result_analysis_hints=( "Numerical methods analysis:\n" "- Convergence: fit log(error) vs log(h)\n" "- Stability: check for growth in error over long runs\n" "- Efficiency: compare accuracy per unit computation" ), ) def _hints(self, paradigm: str) -> str: if paradigm == "convergence": return ( "Numerical methods convergence study:\n" "1. Implement methods from scratch (not just scipy wrappers)\n" "2. Use test problems with KNOWN exact solutions\n" "3. Run at 5+ refinement levels\n" "4. Compute error: ||u_h - u_exact||_2\n" "5. Report convergence order: p = log(e_h / e_{h/2}) / log(2)\n" "6. Output results.json with convergence data" ) return ( "Numerical/optimization code:\n" "1. Implement algorithms from scratch\n" "2. Test on standard benchmark functions\n" "3. Compare accuracy and efficiency\n" "4. Output results.json" ) def _output_format(self, paradigm: str) -> str: if paradigm == "convergence": return ( "Output convergence results to results.json:\n" '{"convergence": {"method": [{"h": 0.1, "error": 0.05}, ...]}}' ) return ( "Output results to results.json:\n" '{"conditions": {"optimizer": {"iterations": 100, "final_value": 0.001}}}' ) ================================================ FILE: researchclaw/domains/adapters/ml.py ================================================ """ML domain prompt adapter — preserves existing behavior exactly. This adapter returns empty PromptBlocks for all stages, which signals the pipeline to use the existing hardcoded ML behavior in prompts.py. This is the **zero-regression guarantee** for ML functionality. 
""" from __future__ import annotations from typing import Any from researchclaw.domains.prompt_adapter import PromptAdapter, PromptBlocks class MLPromptAdapter(PromptAdapter): """ML adapter: delegates to existing prompts.py behavior unchanged.""" def get_code_generation_blocks(self, context: dict[str, Any]) -> PromptBlocks: return PromptBlocks() def get_experiment_design_blocks(self, context: dict[str, Any]) -> PromptBlocks: return PromptBlocks() def get_result_analysis_blocks(self, context: dict[str, Any]) -> PromptBlocks: return PromptBlocks() ================================================ FILE: researchclaw/domains/adapters/neuroscience.py ================================================ """Computational neuroscience domain prompt adapter. Provides domain-specific prompt blocks for neural simulation experiments (spiking networks, neural dynamics, population coding, brain imaging analysis). """ from __future__ import annotations from typing import Any from researchclaw.domains.prompt_adapter import PromptAdapter, PromptBlocks class NeurosciencePromptAdapter(PromptAdapter): """Adapter for computational neuroscience domains.""" def get_code_generation_blocks(self, context: dict[str, Any]) -> PromptBlocks: domain = self.domain paradigm = domain.experiment_paradigm libs = ( ", ".join(domain.core_libraries) if domain.core_libraries else "numpy, scipy, brian2" ) return PromptBlocks( compute_budget=domain.compute_budget_guidance or self._default_compute_budget(), dataset_guidance=domain.dataset_guidance or self._default_dataset_guidance(), hp_reporting=domain.hp_reporting_guidance or self._default_hp_reporting(), code_generation_hints=domain.code_generation_hints or self._default_code_hints(paradigm), output_format_guidance=self._output_format(paradigm), ) def get_experiment_design_blocks(self, context: dict[str, Any]) -> PromptBlocks: domain = self.domain design_context = ( f"This is a **{domain.display_name}** experiment.\n" f"Paradigm: 
{domain.experiment_paradigm}\n\n" "Key principles for neuroscience simulations:\n" "1. Use biologically plausible parameters (membrane time constants, " "synaptic weights, firing rates)\n" "2. Validate single-neuron dynamics before scaling to networks\n" "3. Report spike statistics: firing rate, CV of ISI, Fano factor\n" "4. For network models, specify connectivity (E/I ratio, sparsity)\n" "5. Compare against established neuron models as baselines\n" ) if domain.standard_baselines: design_context += ( f"\nStandard reference models: " f"{', '.join(domain.standard_baselines)}\n" ) stats = ( ", ".join(domain.statistical_tests) if domain.statistical_tests else "paired t-test, KS test" ) return PromptBlocks( experiment_design_context=design_context, statistical_test_guidance=( f"Use {stats} for result analysis. " "For spike train comparison, use van Rossum distance " "or SPIKE-distance when appropriate." ), ) def get_result_analysis_blocks(self, context: dict[str, Any]) -> PromptBlocks: return PromptBlocks( result_analysis_hints=self.domain.result_analysis_hints or ( "Neuroscience result analysis:\n" "- Firing rate: mean ± std across neurons and trials\n" "- Regularity: CV of inter-spike intervals (CV < 1 regular, " "CV ≈ 1 Poisson-like)\n" "- Synchrony: pairwise spike-count correlation\n" "- Population: Fano factor, dimensionality\n" "- Decoding: accuracy with cross-validation, information (bits)\n" "- Use raster plots and PSTHs for visualization" ), statistical_test_guidance=( "Use paired t-test or permutation test for firing rate " "comparisons. Use KS test for ISI distribution comparisons." ), ) # ------------------------------------------------------------------ # Private helpers # ------------------------------------------------------------------ def _default_code_hints(self, paradigm: str) -> str: if paradigm == "simulation": return ( "Neural simulation code:\n" "1. Define neuron model (LIF, Izhikevich, or Hodgkin-Huxley)\n" "2. 
Set biologically plausible parameters " "(tau_m=20ms, V_thresh=-50mV, V_reset=-65mV for LIF)\n" "3. Generate input stimulus (Poisson spikes or step current)\n" "4. Run simulation, record spikes and membrane potential\n" "5. Compute spike statistics: rate, CV ISI, Fano factor\n" "6. Compare multiple models on the same stimulus\n" "7. Output results.json with comparison data\n" ) return ( "Neural analysis code:\n" "1. Generate or load neural activity data\n" "2. Preprocess: spike sorting / binning / filtering\n" "3. Compute relevant metrics\n" "4. Compare methods on the same data\n" "5. Output results to results.json\n" ) def _default_compute_budget(self) -> str: return ( "Time budget for neural simulations:\n" "- Single neuron models: very fast, run many trials\n" "- Small networks (< 1000 neurons): seconds per run\n" "- Large networks: use vectorized code or Brian2\n" "- Keep biological time reasonable (100ms–10s)" ) def _default_dataset_guidance(self) -> str: return ( "Neuroscience experiments generate data programmatically:\n" "- Define neuron parameters and connectivity in code\n" "- Generate Poisson spike trains for inputs\n" "- Use standard test circuits (E/I balanced, WTA)\n" "- Do NOT download external neural datasets\n" "- For brain imaging: generate synthetic fMRI/EEG signals" ) def _default_hp_reporting(self) -> str: return ( "Report simulation parameters:\n" "HYPERPARAMETERS: {'n_neurons': ..., 'tau_m_ms': ..., " "'v_thresh_mV': ..., 'sim_duration_ms': ..., 'dt_ms': ..., " "'connectivity': ...}" ) def _output_format(self, paradigm: str) -> str: return ( "Output neural simulation results to results.json:\n" '{"conditions": {"model_name": {"firing_rate_hz": 12.5, ' '"cv_isi": 0.85, "fano_factor": 1.02}},\n' ' "metadata": {"domain": "neuroscience_computational", ' '"sim_duration_ms": ..., "n_neurons": ...}}' ) ================================================ FILE: researchclaw/domains/adapters/physics.py ================================================ 
"""Physics domain prompt adapter. Provides domain-specific prompt blocks for computational physics experiments (simulations, PDE solvers, convergence studies). """ from __future__ import annotations from typing import Any from researchclaw.domains.prompt_adapter import PromptAdapter, PromptBlocks class PhysicsPromptAdapter(PromptAdapter): """Adapter for physics domains (simulation, PDE, quantum).""" def get_code_generation_blocks(self, context: dict[str, Any]) -> PromptBlocks: domain = self.domain paradigm = domain.experiment_paradigm libs = ", ".join(domain.core_libraries) if domain.core_libraries else "numpy, scipy" code_hints = domain.code_generation_hints or self._default_code_hints(paradigm) return PromptBlocks( compute_budget=domain.compute_budget_guidance or self._default_compute_budget(), dataset_guidance=domain.dataset_guidance or self._default_dataset_guidance(), hp_reporting=domain.hp_reporting_guidance or self._default_hp_reporting(), code_generation_hints=code_hints, output_format_guidance=self._output_format(paradigm), ) def get_experiment_design_blocks(self, context: dict[str, Any]) -> PromptBlocks: domain = self.domain design_context = ( f"This is a **{domain.display_name}** experiment.\n" f"Paradigm: {domain.experiment_paradigm}\n\n" "Key principles for physics experiments:\n" "1. Conservation laws must be respected (energy, momentum, etc.)\n" "2. Use appropriate units (reduced units for MD, SI otherwise)\n" "3. Validate against known analytical solutions when possible\n" "4. 
For convergence: vary grid size/timestep systematically\n" ) if domain.standard_baselines: design_context += f"\nStandard reference methods: {', '.join(domain.standard_baselines)}\n" stats = ", ".join(domain.statistical_tests) if domain.statistical_tests else "convergence order analysis" return PromptBlocks( experiment_design_context=design_context, statistical_test_guidance=f"Use {stats} for result analysis.", ) def get_result_analysis_blocks(self, context: dict[str, Any]) -> PromptBlocks: return PromptBlocks( result_analysis_hints=self.domain.result_analysis_hints or ( "Physics result analysis:\n" "- Convergence: fit log(error) vs log(h) for order\n" "- Conservation: track energy/momentum drift\n" "- Accuracy: compare with analytical solutions\n" "- Use log-log plots for convergence studies" ), statistical_test_guidance="Use convergence order fitting and relative error analysis.", ) def _default_code_hints(self, paradigm: str) -> str: if paradigm == "convergence": return ( "This is a convergence study:\n" "1. Implement the numerical method(s)\n" "2. Run at 5+ refinement levels (e.g., h, h/2, h/4, h/8, h/16)\n" "3. Compute error norms at each level (L2, L-inf)\n" "4. Output results.json with convergence data\n" "5. Expected format:\n" ' {"convergence": {"method_name": [{"h": 0.1, "l2_error": 0.05}]}}\n' ) return ( "Physics simulation code:\n" "1. Define the physical system (particles, fields, etc.)\n" "2. Implement integrator(s)\n" "3. Run simulation, track observables\n" "4. Compare methods on the same system\n" "5. 
Output results.json with comparison data\n" ) def _default_compute_budget(self) -> str: return ( "Time budget for physics computations:\n" "- Keep simulation sizes manageable\n" "- Use small test systems for validation\n" "- Scale up only if time permits\n" "- Focus on accuracy, not scale" ) def _default_dataset_guidance(self) -> str: return ( "Physics experiments generate data programmatically:\n" "- Define initial conditions in code\n" "- Use standard test problems with known solutions\n" "- Do NOT download external datasets\n" "- Generate particle positions, velocities, or grid values in code" ) def _default_hp_reporting(self) -> str: return ( "Report simulation parameters:\n" "HYPERPARAMETERS: {'dt': ..., 'N_particles': ..., 'grid_size': ..., " "'num_steps': ..., 'method': ...}" ) def _output_format(self, paradigm: str) -> str: if paradigm == "convergence": return ( "Output convergence results to results.json:\n" '{"convergence": {"method": [{"h": 0.1, "error": 0.05}, ...]},\n' ' "metadata": {"domain": "...", "total_runtime_sec": ...}}' ) return ( "Output simulation results to results.json:\n" '{"conditions": {"method": {"metric_name": value}},\n' ' "metadata": {"domain": "...", "total_runtime_sec": ...}}' ) ================================================ FILE: researchclaw/domains/adapters/robotics.py ================================================ """Robotics & control domain prompt adapter. Provides domain-specific prompt blocks for robotics experiments (control policies, RL-based manipulation, locomotion, sim-to-real). 
""" from __future__ import annotations from typing import Any from researchclaw.domains.prompt_adapter import PromptAdapter, PromptBlocks class RoboticsPromptAdapter(PromptAdapter): """Adapter for robotics and control domains.""" def get_code_generation_blocks(self, context: dict[str, Any]) -> PromptBlocks: domain = self.domain libs = ( ", ".join(domain.core_libraries) if domain.core_libraries else "gymnasium, stable-baselines3, torch" ) return PromptBlocks( compute_budget=domain.compute_budget_guidance or self._default_compute_budget(), dataset_guidance=domain.dataset_guidance or self._default_dataset_guidance(), hp_reporting=domain.hp_reporting_guidance or self._default_hp_reporting(), code_generation_hints=domain.code_generation_hints or self._default_code_hints(), output_format_guidance=self._output_format(), ) def get_experiment_design_blocks(self, context: dict[str, Any]) -> PromptBlocks: domain = self.domain design_context = ( f"This is a **{domain.display_name}** experiment.\n" f"Paradigm: {domain.experiment_paradigm}\n\n" "Key principles for robotics experiments:\n" "1. Use standardized environments (Gymnasium, MuJoCo) for " "reproducibility\n" "2. Report mean ± std of episode return over multiple seeds\n" "3. Include learning curves (return vs. training steps)\n" "4. Compare against established RL baselines (PPO, SAC, TD3)\n" "5. Report success rate for goal-conditioned tasks\n" ) if domain.standard_baselines: design_context += ( f"\nStandard baselines: " f"{', '.join(domain.standard_baselines)}\n" ) stats = ( ", ".join(domain.statistical_tests) if domain.statistical_tests else "paired t-test" ) return PromptBlocks( experiment_design_context=design_context, statistical_test_guidance=( f"Use {stats} across random seeds to assess significance. " "Report results over at least 5 seeds. Include confidence " "intervals on learning curves." 
), ) def get_result_analysis_blocks(self, context: dict[str, Any]) -> PromptBlocks: return PromptBlocks( result_analysis_hints=self.domain.result_analysis_hints or ( "Robotics result analysis:\n" "- Episode return: mean ± std across seeds and evaluation " "episodes\n" "- Success rate: fraction of episodes reaching goal\n" "- Sample efficiency: return at fixed training step count\n" "- Learning curves: smoothed return vs. environment steps\n" "- Wall-clock time if comparing algorithm efficiency" ), statistical_test_guidance=( "Use paired t-test or Welch's t-test across seeds. " "Report 95% confidence intervals on all metrics." ), ) # ------------------------------------------------------------------ # Private helpers # ------------------------------------------------------------------ def _default_code_hints(self) -> str: return ( "Robotics/control code:\n" "1. Create Gymnasium environment (use built-in envs or define " "custom wrappers)\n" "2. Implement or instantiate RL agent (PPO, SAC, TD3 via " "stable-baselines3)\n" "3. Train for a fixed number of environment steps\n" "4. Evaluate over 100 episodes, record returns\n" "5. Repeat across multiple seeds (>= 5)\n" "6. 
class SecurityPromptAdapter(PromptAdapter):
    """Adapter for security/intrusion detection domains.

    Emphasises class imbalance and false-positive-rate reporting in the
    generated prompts. Profile guidance wins over the built-in defaults.
    """

    def get_code_generation_blocks(self, context: dict[str, Any]) -> PromptBlocks:
        """Build prompt blocks steering code generation for detection tasks."""
        domain = self.domain
        # Built-in fallbacks used when the profile leaves a field empty.
        default_data = (
            "Generate synthetic network/security data in code:\n"
            "- Normal traffic patterns + attack patterns\n"
            "- Class-imbalanced (realistic: ~5% attacks)\n"
            "- Do NOT download external datasets"
        )
        default_hints = (
            "Security detection code:\n"
            "1. Generate synthetic tabular features (packet size, duration, etc.)\n"
            "2. Train classifiers (RF, XGBoost, SVM)\n"
            "3. Evaluate with TPR, FPR, F1, per-class metrics\n"
            "4. Report confusion matrix\n"
            "5. Output results to results.json\n"
        )
        output_spec = (
            "Output results to results.json:\n"
            '{"conditions": {"detector": {"TPR": 0.95, "FPR": 0.02, "F1": 0.93}}}'
        )
        return PromptBlocks(
            dataset_guidance=domain.dataset_guidance or default_data,
            code_generation_hints=domain.code_generation_hints or default_hints,
            output_format_guidance=output_spec,
        )

    def get_experiment_design_blocks(self, context: dict[str, Any]) -> PromptBlocks:
        """Build prompt blocks describing security experiment design."""
        return PromptBlocks(
            experiment_design_context=(
                "This is a **security/intrusion detection** experiment.\n"
                "Key: class imbalance, low false positive rate is critical.\n"
                "Compare detectors on same data splits.\n"
            ),
        )

    def get_result_analysis_blocks(self, context: dict[str, Any]) -> PromptBlocks:
        """Build prompt blocks guiding analysis of detection results."""
        return PromptBlocks(
            result_analysis_hints=(
                "Security analysis: focus on FPR (false alarm rate) alongside TPR.\n"
                "Per-class F1 is important for multi-class attack detection."
            ),
        )
""" from __future__ import annotations import logging import re from dataclasses import dataclass, field from enum import Enum from pathlib import Path from typing import Any import yaml logger = logging.getLogger(__name__) _PROFILES_DIR = Path(__file__).parent / "profiles" # --------------------------------------------------------------------------- # Enums # --------------------------------------------------------------------------- class ExperimentParadigm(str, Enum): """High-level experiment structure used by different domains.""" COMPARISON = "comparison" # A vs B (ML, security) CONVERGENCE = "convergence" # error vs refinement (math, physics) PROGRESSIVE_SPEC = "progressive_spec" # OLS → +FE → +IV (economics) SIMULATION = "simulation" # run → observe → analyze (physics) ABLATION_STUDY = "ablation_study" # systematic component removal class MetricType(str, Enum): SCALAR = "scalar" TABLE = "table" CONVERGENCE = "convergence" LEARNING_CURVE = "learning_curve" CONFUSION_MATRIX = "confusion" STRUCTURED = "structured" PARETO = "pareto" # --------------------------------------------------------------------------- # DomainProfile # --------------------------------------------------------------------------- @dataclass class DomainProfile: """Complete description of a research domain's experiment conventions. Loaded from YAML files in ``researchclaw/domains/profiles/``. """ # Identity domain_id: str # e.g. "computational_physics" display_name: str # e.g. "Computational Physics" parent_domain: str = "" # e.g. 
"physics" # Experiment paradigm experiment_paradigm: str = ExperimentParadigm.COMPARISON.value condition_terminology: dict[str, str] = field(default_factory=lambda: { "baseline": "baseline", "proposed": "proposed method", "variant": "ablation", "input": "dataset", "metric": "accuracy/loss", }) # Code structure typical_file_structure: dict[str, str] = field(default_factory=dict) entry_point: str = "main.py" # Dependencies & environment core_libraries: list[str] = field(default_factory=list) docker_image: str = "researchclaw/sandbox-generic:latest" gpu_required: bool = False pip_packages: list[str] = field(default_factory=list) # Metrics & evaluation metric_types: list[str] = field(default_factory=lambda: ["scalar"]) standard_baselines: list[str] = field(default_factory=list) evaluation_protocol: str = "" statistical_tests: list[str] = field(default_factory=lambda: ["paired_t_test"]) # Output & presentation output_formats: list[str] = field(default_factory=lambda: ["latex_table"]) figure_types: list[str] = field(default_factory=lambda: ["bar_chart", "line_plot"]) # Search keywords (for Code Searcher and literature) github_search_terms: list[str] = field(default_factory=list) paper_keywords: list[str] = field(default_factory=list) # Prompt guidance blocks (domain-specific instruction text) compute_budget_guidance: str = "" dataset_guidance: str = "" hp_reporting_guidance: str = "" code_generation_hints: str = "" result_analysis_hints: str = "" # --------------------------------------------------------------------------- # Profile loading # --------------------------------------------------------------------------- _profile_cache: dict[str, DomainProfile] = {} def _load_profile(path: Path) -> DomainProfile: """Load a single YAML profile into a DomainProfile.""" with path.open(encoding="utf-8") as fh: data: dict[str, Any] = yaml.safe_load(fh) or {} return DomainProfile( domain_id=data.get("domain_id", path.stem), display_name=data.get("display_name", 
# Module-level cache: profiles are parsed from disk at most once per process.
_profile_cache: dict[str, DomainProfile] = {}


def _load_profile(path: Path) -> DomainProfile:
    """Parse a single YAML profile file into a :class:`DomainProfile`.

    Missing keys fall back to the same defaults the dataclass would use,
    except identity fields which are derived from the file name.
    """
    with path.open(encoding="utf-8") as fh:
        data: dict[str, Any] = yaml.safe_load(fh) or {}
    # (key, fallback) pairs mirroring the DomainProfile constructor.
    spec: list[tuple[str, Any]] = [
        ("domain_id", path.stem),
        ("display_name", path.stem.replace("_", " ").title()),
        ("parent_domain", ""),
        ("experiment_paradigm", "comparison"),
        ("condition_terminology", {}),
        ("typical_file_structure", {}),
        ("entry_point", "main.py"),
        ("core_libraries", []),
        ("docker_image", "researchclaw/sandbox-generic:latest"),
        ("gpu_required", False),
        ("pip_packages", []),
        ("metric_types", ["scalar"]),
        ("standard_baselines", []),
        ("evaluation_protocol", ""),
        ("statistical_tests", ["paired_t_test"]),
        ("output_formats", ["latex_table"]),
        ("figure_types", ["bar_chart", "line_plot"]),
        ("github_search_terms", []),
        ("paper_keywords", []),
        ("compute_budget_guidance", ""),
        ("dataset_guidance", ""),
        ("hp_reporting_guidance", ""),
        ("code_generation_hints", ""),
        ("result_analysis_hints", ""),
    ]
    return DomainProfile(**{key: data.get(key, fallback) for key, fallback in spec})


def load_all_profiles() -> dict[str, DomainProfile]:
    """Load every ``*.yaml`` profile from the profiles directory.

    Results are cached; subsequent calls return the cached dict. A profile
    that fails to parse is logged and skipped rather than aborting the load.
    """
    if _profile_cache:
        return _profile_cache
    if not _PROFILES_DIR.is_dir():
        logger.warning("Profiles directory not found: %s", _PROFILES_DIR)
        return {}
    for candidate in sorted(_PROFILES_DIR.glob("*.yaml")):
        try:
            loaded = _load_profile(candidate)
        except Exception:
            logger.warning("Failed to load profile %s", candidate, exc_info=True)
        else:
            _profile_cache[loaded.domain_id] = loaded
    logger.info("Loaded %d domain profiles", len(_profile_cache))
    return _profile_cache
def get_profile(domain_id: str) -> DomainProfile | None:
    """Look up a domain profile by its ID; ``None`` when unknown."""
    return load_all_profiles().get(domain_id)


def get_generic_profile() -> DomainProfile:
    """Return the generic fallback profile.

    Prefers the ``generic`` YAML profile when it loaded successfully;
    otherwise constructs a minimal hardcoded equivalent so callers always
    receive a usable profile.
    """
    generic = get_profile("generic")
    if generic is None:
        # Hardcoded fallback if the YAML file is missing or failed to load.
        generic = DomainProfile(
            domain_id="generic",
            display_name="Generic Computational Research",
            experiment_paradigm="comparison",
            core_libraries=["numpy", "scipy", "matplotlib", "pandas"],
            docker_image="researchclaw/sandbox-generic:latest",
        )
    return generic
potential", "action potential", "neural circuit", "neural dynamics", "population coding", "neural decoding", "raster plot", "firing rate", "synaptic", "connectome"], "neuroscience_computational"), (["fmri", "eeg", "meg", "neuroimaging", "brain imaging", "nilearn", "mne-python", "bold signal", "brain network", "functional connectivity"], "neuroscience_imaging"), (["neuroscience", "neuron model", "brain simulation", "neural computation", "neural encoding"], "neuroscience_computational"), # Catch-all ML (["neural network", "deep learning", "machine learning", "training loop", "backpropagation", "gradient descent", "pytorch", "tensorflow", "torch", "sklearn"], "ml_generic"), # Physics (["molecular dynamics", "n-body", "lennard-jones", "force field", "jax-md", "ase", "openmm"], "physics_simulation"), (["partial differential", "pde", "finite element", "finite difference", "fenics", "navier-stokes", "heat equation", "wave equation", "poisson", "laplace"], "physics_pde"), (["quantum mechanics", "schrodinger", "hamiltonian", "wavefunction", "density functional"], "physics_quantum"), (["physics", "simulation", "integrator", "conservation", "energy drift", "symplectic"], "physics_simulation"), # Chemistry (["quantum chemistry", "dft", "hartree-fock", "pyscf", "ccsd", "molecular orbital", "basis set"], "chemistry_qm"), (["molecular property", "smiles", "rdkit", "fingerprint", "drug", "binding affinity", "admet"], "chemistry_molprop"), (["chemistry", "molecule", "reaction", "catalyst"], "chemistry_general"), # Biology (["single-cell", "scrna", "scanpy", "anndata", "leiden", "differential expression", "pseudotime"], "biology_singlecell"), (["genomics", "genome", "variant calling", "sequencing", "biopython", "alignment"], "biology_genomics"), (["protein", "alphafold", "protein folding", "amino acid", "esm"], "biology_protein"), (["biology", "bioinformatics", "omics"], "biology_general"), # Economics (["econometrics", "regression", "instrumental variable", "fixed effect", "panel 
data", "difference-in-difference", "causal inference", "statsmodels", "linearmodels"], "economics_empirical"), (["economics", "economic", "market", "equilibrium", "utility", "welfare"], "economics_general"), # Mathematics (["numerical method", "numerical analysis", "convergence order", "finite difference", "quadrature", "interpolation", "ode solver", "runge-kutta", "sympy"], "mathematics_numerical"), (["optimization", "convex", "linear programming", "gradient-free", "evolutionary algorithm"], "mathematics_optimization"), (["mathematics", "mathematical", "theorem", "proof", "algebra", "topology"], "mathematics_general"), # Security (["intrusion detection", "malware", "anomaly detection", "network traffic", "cybersecurity", "vulnerability", "threat detection", "scapy"], "security_detection"), # Robotics / Control (["robot", "robotic", "control", "manipulation", "mujoco", "pybullet", "locomotion", "navigation"], "robotics_control"), ] def _keyword_detect(text: str) -> str | None: """Match text against keyword rules. Returns domain_id or None.""" lower = text.lower() for keywords, domain_id in _KEYWORD_RULES: for kw in keywords: if kw in lower: return domain_id return None # --------------------------------------------------------------------------- # LLM-based detection # --------------------------------------------------------------------------- _LLM_CLASSIFY_PROMPT = """\ You are a domain classifier for computational research topics. Given the research topic and context, classify it into EXACTLY ONE domain. 
_LLM_CLASSIFY_PROMPT = """\
You are a domain classifier for computational research topics.
Given the research topic and context, classify it into EXACTLY ONE domain.

Available domains:
- ml_vision: Computer vision (image classification, detection, segmentation)
- ml_nlp: Natural language processing (text, language models, transformers)
- ml_rl: Reinforcement learning (agents, environments, rewards)
- ml_graph: Graph neural networks (node/edge/graph tasks)
- ml_tabular: Tabular ML (XGBoost, feature engineering)
- ml_generative: Generative models (GANs, diffusion, VAE)
- ml_compression: Model compression (distillation, pruning, quantization)
- ml_generic: Other ML/AI research
- physics_simulation: Molecular dynamics, N-body, classical simulations
- physics_pde: PDE solvers (FEM, FDM, spectral methods)
- physics_quantum: Quantum mechanics, quantum chemistry
- chemistry_qm: Quantum chemistry (DFT, Hartree-Fock, PySCF)
- chemistry_molprop: Molecular property prediction (SMILES, RDKit)
- biology_singlecell: Single-cell analysis (scRNA-seq, scanpy)
- biology_genomics: Genomics (sequencing, variant calling)
- biology_protein: Protein science (folding, property prediction)
- economics_empirical: Empirical economics (regression, causal inference)
- mathematics_numerical: Numerical methods (ODE/PDE solvers, convergence)
- mathematics_optimization: Optimization (convex, evolutionary)
- security_detection: Security/intrusion detection
- neuroscience_computational: Computational neuroscience (spiking networks, neural dynamics, population coding)
- neuroscience_imaging: Brain imaging analysis (fMRI, EEG, MEG, functional connectivity)
- robotics_control: Robotics and control
- generic: Cannot classify / cross-domain

Topic: {topic}

Context: {context}

Respond with ONLY the domain_id (e.g., "ml_vision"). Nothing else."""


def _llm_detect(
    topic: str,
    context: str,
    llm: Any,
) -> str | None:
    """Use LLM to classify a research topic into a domain.

    Returns a known ``domain_id`` (or ``"generic"``) on success, ``None``
    on any failure. Synchronous — ``llm.chat()`` is a blocking call.
    """
    try:
        message = _LLM_CLASSIFY_PROMPT.format(topic=topic, context=context)
        reply = llm.chat(
            [{"role": "user", "content": message}],
            system="You are a precise domain classifier.",
            max_tokens=50,
        )
        text = getattr(reply, "content", None)
        if not text or not text.strip():
            logger.warning("LLM domain detection returned empty response")
            return None
        candidate = text.strip().strip('"').strip("'").lower()
        # Exact match against loaded profiles (plus the generic fallback id).
        known = load_all_profiles()
        if candidate in known or candidate == "generic":
            return candidate
        # Fuzzy fallback: substring containment either way; require at least
        # 4 characters so trivial fragments don't over-match.
        if len(candidate) >= 4:
            for known_id in known:
                if known_id in candidate or candidate in known_id:
                    return known_id
        logger.warning("LLM returned unknown domain: %s", candidate)
        return None
    except Exception:
        logger.warning("LLM domain detection failed", exc_info=True)
        return None
""" try: prompt = _LLM_CLASSIFY_PROMPT.format(topic=topic, context=context) response = llm.chat( [{"role": "user", "content": prompt}], system="You are a precise domain classifier.", max_tokens=50, ) content = getattr(response, "content", None) if not content or not content.strip(): logger.warning("LLM domain detection returned empty response") return None domain_id = content.strip().strip('"').strip("'").lower() # Validate it's a known domain profiles = load_all_profiles() if domain_id in profiles or domain_id == "generic": return domain_id # Try fuzzy match (require at least 4 chars to avoid over-matching) if len(domain_id) >= 4: for pid in profiles: if pid in domain_id or domain_id in pid: return pid logger.warning("LLM returned unknown domain: %s", domain_id) return None except Exception: logger.warning("LLM domain detection failed", exc_info=True) return None # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- def detect_domain( topic: str, hypotheses: str = "", literature: str = "", llm: Any | None = None, ) -> DomainProfile: """Detect the research domain from topic and context. Three-level detection: 1. Keyword matching (fast, deterministic) 2. LLM classification (if llm is provided, for ambiguous topics) 3. Fallback to generic profile Parameters ---------- topic : str Research topic description. hypotheses : str Hypotheses text for additional context. literature : str Literature review text for additional context. llm : LLMClient, optional LLM client for classification fallback. Returns ------- DomainProfile The detected domain profile. 
""" combined_text = f"{topic} {hypotheses} {literature}" # Level 1: Keyword matching domain_id = _keyword_detect(combined_text) if domain_id: profile = get_profile(domain_id) if profile: logger.info( "Domain detected via keywords: %s (%s)", profile.display_name, domain_id, ) return profile logger.warning( "Keyword matched domain_id=%s but no profile found, falling back", domain_id, ) # Level 2: LLM classification if llm is not None: domain_id = _llm_detect(combined_text, f"hypotheses: {hypotheses}", llm) if domain_id: profile = get_profile(domain_id) if profile: logger.info( "Domain detected via LLM: %s (%s)", profile.display_name, domain_id, ) return profile # Level 3: Fallback to generic logger.info("Using generic domain profile for topic: %.80s", topic) return get_generic_profile() async def detect_domain_async( topic: str, hypotheses: str = "", literature: str = "", llm: Any | None = None, ) -> DomainProfile: """Async version of detect_domain with LLM classification support.""" combined_text = f"{topic} {hypotheses} {literature}" # Level 1: Keyword matching domain_id = _keyword_detect(combined_text) if domain_id: profile = get_profile(domain_id) if profile: logger.info( "Domain detected via keywords: %s (%s)", profile.display_name, domain_id, ) return profile # Level 2: LLM classification if llm is not None: domain_id = _llm_detect(topic, combined_text, llm) if domain_id: profile = get_profile(domain_id) if profile: logger.info( "Domain detected via LLM: %s (%s)", profile.display_name, domain_id, ) return profile # Level 3: Fallback logger.info("Using generic domain profile for topic: %.80s", topic) return get_generic_profile() def detect_domain_id(topic: str, hypotheses: str = "", literature: str = "") -> str: """Quick keyword-only detection that returns a domain_id string. Useful for lightweight checks where a full profile isn't needed. 
""" combined = f"{topic} {hypotheses} {literature}" return _keyword_detect(combined) or "generic" def is_ml_domain(domain: DomainProfile) -> bool: """Check if a domain profile represents an ML/AI domain.""" return domain.domain_id.startswith("ml_") or domain.domain_id in ( "ml_generic", "ml_vision", "ml_nlp", "ml_rl", "ml_graph", "ml_tabular", "ml_generative", "ml_compression", ) ================================================ FILE: researchclaw/domains/experiment_schema.py ================================================ """Universal experiment schema — domain-agnostic experiment plan structure. Replaces the fixed ``baselines/proposed_methods/ablations`` keys with a generic ``conditions`` list that uses role-based terminology, adaptable to any research domain. """ from __future__ import annotations from dataclasses import dataclass, field from enum import Enum from typing import Any import yaml class ConditionRole(str, Enum): """Role of an experimental condition.""" REFERENCE = "reference" # baseline / reference solver / standard pipeline PROPOSED = "proposed" # the method being investigated VARIANT = "variant" # ablation / parameter variation / robustness check class ExperimentType(str, Enum): COMPARISON = "comparison" CONVERGENCE = "convergence" PROGRESSIVE_SPEC = "progressive_spec" SIMULATION = "simulation" ABLATION_STUDY = "ablation_study" @dataclass class Condition: """A single experimental condition (method, configuration, etc.).""" name: str role: str = ConditionRole.PROPOSED.value description: str = "" varies_from: str = "" # parent condition for variants variation: str = "" # what is varied parameters: dict[str, Any] = field(default_factory=dict) @dataclass class MetricSpec: """Specification of a metric to evaluate.""" name: str direction: str = "minimize" # "minimize" | "maximize" unit: str = "" description: str = "" @dataclass class EvaluationSpec: """Evaluation protocol for the experiment.""" primary_metric: MetricSpec = field(default_factory=lambda: 
class ConditionRole(str, Enum):
    """Role of an experimental condition.

    Subclasses ``str`` so ``.value`` strings compare directly against the
    role stored on :class:`Condition`.
    """

    REFERENCE = "reference"  # baseline / reference solver / standard pipeline
    PROPOSED = "proposed"  # the method being investigated
    VARIANT = "variant"  # ablation / parameter variation / robustness check


class ExperimentType(str, Enum):
    """High-level shape of the experiment (mirrors ExperimentParadigm)."""

    COMPARISON = "comparison"
    CONVERGENCE = "convergence"
    PROGRESSIVE_SPEC = "progressive_spec"
    SIMULATION = "simulation"
    ABLATION_STUDY = "ablation_study"


@dataclass
class Condition:
    """A single experimental condition (method, configuration, etc.)."""

    name: str
    role: str = ConditionRole.PROPOSED.value
    description: str = ""
    varies_from: str = ""  # parent condition for variants
    variation: str = ""  # what is varied
    parameters: dict[str, Any] = field(default_factory=dict)


@dataclass
class MetricSpec:
    """Specification of a metric to evaluate."""

    name: str
    direction: str = "minimize"  # "minimize" | "maximize"
    unit: str = ""
    description: str = ""


@dataclass
class EvaluationSpec:
    """Evaluation protocol for the experiment."""

    # Default primary metric is a placeholder; callers normally overwrite it.
    primary_metric: MetricSpec = field(default_factory=lambda:
                                       MetricSpec(name="primary_metric"))
    secondary_metrics: list[MetricSpec] = field(default_factory=list)
    protocol: str = ""
    statistical_test: str = "paired_t_test"
    num_seeds: int = 3
""" baselines = [ {"name": c.name, "description": c.description} for c in self.references ] proposed = [ {"name": c.name, "description": c.description} for c in self.proposed ] ablations = [ { "name": c.name, "description": c.description, "varies_from": c.varies_from, "variation": c.variation, } for c in self.variants ] return { "baselines": baselines, "proposed_methods": proposed, "ablations": ablations, "metrics": { self.evaluation.primary_metric.name: { "direction": self.evaluation.primary_metric.direction, } }, } def to_yaml(self) -> str: """Serialize to YAML string.""" data: dict[str, Any] = { "experiment": { "type": self.experiment_type, "domain": self.domain_id, "problem": {"description": self.problem_description}, "conditions": [ { "name": c.name, "role": c.role, "description": c.description, **({"varies_from": c.varies_from} if c.varies_from else {}), **({"variation": c.variation} if c.variation else {}), } for c in self.conditions ], "inputs": { "type": self.input_type, "description": self.input_description, }, "evaluation": { "primary_metric": { "name": self.evaluation.primary_metric.name, "direction": self.evaluation.primary_metric.direction, }, "protocol": self.evaluation.protocol, "statistical_test": self.evaluation.statistical_test, }, "presentation": { "main_figure": self.main_figure_type, "main_table": self.main_table_type, }, } } return yaml.dump(data, default_flow_style=False, sort_keys=False) def from_legacy_exp_plan( plan_yaml: str | dict[str, Any], domain_id: str = "", ) -> UniversalExperimentPlan: """Convert a legacy exp_plan.yaml (baselines/proposed/ablations) to the universal format. This allows existing ML experiment plans to work with the new system. 
""" if isinstance(plan_yaml, str): data = yaml.safe_load(plan_yaml) or {} else: data = plan_yaml conditions: list[Condition] = [] # Parse baselines → reference for b in data.get("baselines", []): if isinstance(b, str): conditions.append(Condition(name=b, role=ConditionRole.REFERENCE.value)) elif isinstance(b, dict): conditions.append(Condition( name=b.get("name", "baseline"), role=ConditionRole.REFERENCE.value, description=b.get("description", ""), )) # Parse proposed_methods → proposed for p in data.get("proposed_methods", []): if isinstance(p, str): conditions.append(Condition(name=p, role=ConditionRole.PROPOSED.value)) elif isinstance(p, dict): conditions.append(Condition( name=p.get("name", "proposed"), role=ConditionRole.PROPOSED.value, description=p.get("description", ""), )) # Parse ablations → variant for a in data.get("ablations", []): if isinstance(a, str): conditions.append(Condition(name=a, role=ConditionRole.VARIANT.value)) elif isinstance(a, dict): conditions.append(Condition( name=a.get("name", "ablation"), role=ConditionRole.VARIANT.value, description=a.get("description", ""), varies_from=a.get("varies_from", ""), variation=a.get("variation", ""), )) # Parse metrics metrics = data.get("metrics", {}) primary_name = "primary_metric" primary_direction = "minimize" if isinstance(metrics, dict): for name, spec in metrics.items(): primary_name = name if isinstance(spec, dict): primary_direction = spec.get("direction", "minimize") break elif isinstance(metrics, list) and metrics: primary_name = metrics[0] if isinstance(metrics[0], str) else "primary_metric" return UniversalExperimentPlan( experiment_type=data.get("experiment_type", "comparison"), domain_id=domain_id, problem_description=data.get("objective", ""), conditions=conditions, evaluation=EvaluationSpec( primary_metric=MetricSpec(name=primary_name, direction=primary_direction), ), raw_yaml=yaml.dump(data, default_flow_style=False) if isinstance(data, dict) else str(plan_yaml), ) 
================================================ FILE: researchclaw/domains/profiles/_generic.yaml ================================================ domain_id: generic display_name: Generic Computational Research parent_domain: "" experiment_paradigm: comparison condition_terminology: baseline: baseline proposed: proposed method variant: variant input: input data metric: primary metric typical_file_structure: config.py: "Experiment configuration and parameters" data.py: "Data generation or loading" methods.py: "Method implementations" evaluate.py: "Evaluation and metrics" main.py: "Entry point: setup → run → evaluate → report" entry_point: main.py core_libraries: - numpy - scipy - matplotlib - pandas docker_image: researchclaw/sandbox-generic:latest gpu_required: false pip_packages: - numpy - scipy - matplotlib - pandas - scikit-learn metric_types: - scalar - structured evaluation_protocol: "Compare methods on standard test problems. Report primary metric with error bars." statistical_tests: - paired_t_test github_search_terms: - scientific computing python - computational experiment paper_keywords: - computational methods code_generation_hints: | General guidelines for any computational experiment: 1. Implement all methods from the experiment plan 2. Use the same test problems / data for all methods 3. Report results as JSON to results.json 4. Include error bars / multiple seeds where applicable 5. 
Follow good software practices: clear variable names, modular code ================================================ FILE: researchclaw/domains/profiles/biology_genomics.yaml ================================================ domain_id: biology_genomics display_name: Genomics Analysis parent_domain: biology experiment_paradigm: comparison condition_terminology: baseline: standard pipeline proposed: proposed method variant: parameter variant input: sequence data metric: accuracy / F1 typical_file_structure: config.py: "Analysis parameters" data.py: "Sequence data loading" methods.py: "Analysis methods" main.py: "Entry point" entry_point: main.py core_libraries: [biopython, numpy, scikit-learn, pandas] docker_image: researchclaw/sandbox-biology:latest gpu_required: false pip_packages: [biopython, numpy, scikit-learn, pandas, matplotlib] metric_types: [scalar] standard_baselines: [BLAST, BWA, GATK] evaluation_protocol: "Compare methods on standard genomic tasks." statistical_tests: [paired_t_test] github_search_terms: [genomics python, sequence analysis biopython] paper_keywords: [genomics, sequence analysis] ================================================ FILE: researchclaw/domains/profiles/biology_protein.yaml ================================================ domain_id: biology_protein display_name: Protein Science parent_domain: biology experiment_paradigm: comparison condition_terminology: baseline: baseline model proposed: proposed model variant: ablation input: protein data metric: RMSD / accuracy typical_file_structure: config.py: "Model parameters" data.py: "Protein data loading" model.py: "Prediction model" main.py: "Entry point" entry_point: main.py core_libraries: [numpy, torch, scikit-learn, biopython] docker_image: researchclaw/sandbox-biology:latest gpu_required: false pip_packages: [numpy, torch, scikit-learn, biopython, pandas] metric_types: [scalar] standard_baselines: [ESM-2, ProtBERT, Random Forest] evaluation_protocol: "Evaluate on standard protein 
benchmarks." statistical_tests: [paired_t_test] github_search_terms: [protein prediction pytorch, protein folding python] paper_keywords: [protein structure prediction] ================================================ FILE: researchclaw/domains/profiles/biology_singlecell.yaml ================================================ domain_id: biology_singlecell display_name: Single-Cell Analysis parent_domain: biology experiment_paradigm: comparison condition_terminology: baseline: standard pipeline proposed: proposed method variant: sensitivity analysis input: dataset metric: ARI / NMI typical_file_structure: config.py: "Analysis parameters" data.py: "Data loading and preprocessing (QC, normalization, HVG)" methods.py: "Clustering/analysis method implementations" evaluation.py: "Evaluation metrics (ARI, NMI, silhouette)" main.py: "Entry point: load → preprocess → cluster → evaluate → report" entry_point: main.py core_libraries: - scanpy - anndata - numpy - scikit-learn docker_image: researchclaw/sandbox-biology:latest gpu_required: false pip_packages: - scanpy - anndata - leidenalg - numpy - scikit-learn - matplotlib - pandas metric_types: - scalar - table standard_baselines: - Leiden clustering - Louvain clustering - K-means - Spectral clustering evaluation_protocol: "Apply clustering methods, evaluate against known cell types. Report ARI, NMI, silhouette score." 
statistical_tests: - wilcoxon_rank_sum - fdr_correction github_search_terms: - scanpy tutorial - single cell clustering - scRNA-seq analysis python paper_keywords: - single-cell RNA-seq - cell clustering - differential expression dataset_guidance: | Single-cell analysis datasets: - Generate synthetic data with scanpy.datasets or simulate with splatter - Use small reference datasets: PBMC3k, Paul15 - Format: AnnData (h5ad) - Preprocessing: QC → normalize → log1p → HVG selection → PCA → neighbors ================================================ FILE: researchclaw/domains/profiles/chemistry_molprop.yaml ================================================ domain_id: chemistry_molprop display_name: Molecular Property Prediction parent_domain: chemistry experiment_paradigm: comparison condition_terminology: baseline: baseline model proposed: proposed model variant: ablation input: molecular dataset metric: MAE / RMSE typical_file_structure: config.py: "Model and training configuration" data.py: "Molecular data loading and featurization" model.py: "Property prediction model" train.py: "Training loop" main.py: "Entry point: load data → featurize → train → evaluate → report" entry_point: main.py core_libraries: - rdkit - numpy - scikit-learn - torch docker_image: researchclaw/sandbox-chemistry:latest gpu_required: false pip_packages: - rdkit - numpy - scikit-learn - torch - pandas - matplotlib metric_types: - scalar standard_baselines: - Random Forest + Morgan FP - XGBoost + ECFP - GCN - SchNet - MPNN evaluation_protocol: "Train/test split or cross-validation. Report MAE, RMSE on test set." 
statistical_tests: - paired_t_test github_search_terms: - molecular property prediction - rdkit fingerprint model - cheminformatics python - SMILES prediction paper_keywords: - molecular property prediction - cheminformatics - QSAR dataset_guidance: | Use standard molecular datasets: - Generate small datasets from SMILES strings in code - Use RDKit for featurization (Morgan fingerprints, descriptors) - Standard datasets: QM9, ESOL, FreeSolv, Lipophilicity - For benchmarking, use simple built-in data or generated molecular data ================================================ FILE: researchclaw/domains/profiles/chemistry_qm.yaml ================================================ domain_id: chemistry_qm display_name: Quantum Chemistry parent_domain: chemistry experiment_paradigm: comparison condition_terminology: baseline: "reference method (e.g., CCSD(T))" proposed: proposed method variant: basis set / parameter variant input: molecular system metric: energy error (kcal/mol) typical_file_structure: config.py: "Molecule definitions and calculation parameters" molecules.py: "Molecular geometry and basis set setup" calculations.py: "Quantum chemistry calculation routines" analysis.py: "Error analysis and comparison with reference" main.py: "Entry point: define molecules → calculate → compare → report" entry_point: main.py core_libraries: - pyscf - numpy - scipy docker_image: researchclaw/sandbox-chemistry:latest gpu_required: false pip_packages: - pyscf - numpy - scipy - matplotlib - pandas metric_types: - scalar - table standard_baselines: - HF (Hartree-Fock) - DFT/B3LYP - MP2 - CCSD - CCSD(T) evaluation_protocol: "Calculate molecular properties with different methods, compare against high-level reference (CCSD(T) or experimental)." 
statistical_tests: - mae - rmse - max_error output_formats: - latex_table figure_types: - bar_chart - correlation_plot - potential_energy_surface github_search_terms: - pyscf example - quantum chemistry python - DFT calculation tutorial - hartree fock python paper_keywords: - quantum chemistry - density functional theory - electronic structure dataset_guidance: | Quantum chemistry uses molecular systems defined in code: - Define molecular geometries using atomic coordinates - Use standard basis sets (STO-3G for testing, cc-pVDZ/cc-pVTZ for production) - Standard test molecules: H2, H2O, CH4, benzene - Do NOT download external datasets code_generation_hints: | Quantum chemistry code with PySCF: 1. Define molecules: mol = gto.M(atom='H 0 0 0; H 0 0 0.74', basis='sto-3g') 2. Run calculations: mf = scf.RHF(mol); mf.kernel() 3. Compare multiple methods on the same molecule set 4. Report energies in Hartree and errors in kcal/mol (1 Ha = 627.509 kcal/mol) 5. Output results.json with method comparison data ================================================ FILE: researchclaw/domains/profiles/economics_empirical.yaml ================================================ domain_id: economics_empirical display_name: Empirical Economics parent_domain: economics experiment_paradigm: progressive_spec condition_terminology: baseline: "specification (1)" proposed: full specification variant: robustness check input: sample metric: coefficient estimate typical_file_structure: config.py: "Specification definitions and variable lists" data_prep.py: "Data cleaning, variable construction, panel setup" models.py: "Regression specifications (OLS, FE, IV)" robustness.py: "Robustness checks and sensitivity analysis" main.py: "Entry point: load → estimate → robustness → tables" entry_point: main.py core_libraries: - statsmodels - linearmodels - pandas - numpy - scipy docker_image: researchclaw/sandbox-economics:latest gpu_required: false pip_packages: - statsmodels - linearmodels - pandas - numpy 
- scipy - matplotlib metric_types: - table - scalar standard_baselines: - OLS - OLS + controls - Fixed Effects - 2SLS / IV evaluation_protocol: "Progressive specification: OLS → +controls → +FE → +IV. Report regression table with coefficients, SE, R², N." statistical_tests: - hausman_test - robust_se - cluster_se - f_test output_formats: - regression_table - latex_table figure_types: - coefficient_plot - scatter_plot - residual_plot github_search_terms: - statsmodels regression example - panel data python - instrumental variable python - econometrics python paper_keywords: - causal inference - instrumental variables - panel data - regression analysis compute_budget_guidance: | Economics regressions are fast. Focus on: - Multiple specifications (4-6 columns in regression table) - Bootstrap for standard errors if needed (100-500 replications) - Cluster-robust SE for panel data dataset_guidance: | Economics data: - Generate synthetic panel/cross-section data in code - Simulate treatment effects with known DGP - Include controls, fixed effects structure - Standard test: wage equation, returns to education - Do NOT download external datasets code_generation_hints: | Economics regression code: 1. Generate synthetic data with known treatment effect 2. Implement progressive specifications (OLS → +controls → +FE → +IV) 3. Report coefficient, SE, p-value, R², N for each spec 4. Use robust/clustered standard errors 5. 
Output regression table to results.json: {"regression_table": {"spec_1_ols": {"coeff": 0.15, "se": 0.03, ...}}} ================================================ FILE: researchclaw/domains/profiles/mathematics_numerical.yaml ================================================ domain_id: mathematics_numerical display_name: Numerical Mathematics parent_domain: mathematics experiment_paradigm: convergence condition_terminology: baseline: reference method proposed: proposed method variant: variant input: test problem metric: error / convergence order typical_file_structure: config.py: "Problem parameters and method configurations" problems.py: "Test problem definitions with exact solutions" methods.py: "Numerical method implementations" analysis.py: "Error analysis and convergence computation" main.py: "Entry point: define problems → solve → analyze → report" entry_point: main.py core_libraries: - numpy - scipy - sympy - matplotlib docker_image: researchclaw/sandbox-math:latest gpu_required: false pip_packages: - numpy - scipy - sympy - matplotlib metric_types: - scalar - convergence standard_baselines: - Euler method - RK4 - Adams-Bashforth - Crank-Nicolson - Gauss-Legendre quadrature evaluation_protocol: "Run method at multiple step sizes, compute error against reference solution. Report convergence order." statistical_tests: - convergence_order_fit output_formats: - latex_table - convergence_plot figure_types: - convergence_plot - error_plot - solution_comparison_plot github_search_terms: - numerical methods python - ODE solver convergence - numerical analysis numpy - quadrature python paper_keywords: - numerical methods - convergence analysis - error estimates compute_budget_guidance: | Numerical methods are typically fast. Focus on accuracy: - Use 5-8 refinement levels for clear convergence plots - Step sizes: geometric sequence (e.g., h, h/2, h/4, h/8, ...) 
dataset_guidance: | Numerical methods use standard test problems: - ODE: Lotka-Volterra, Van der Pol, stiff problems - Quadrature: polynomials, oscillatory integrands, singular integrands - Linear algebra: Hilbert matrix, sparse tridiagonal systems - Define all problems in code; do NOT download external data code_generation_hints: | Numerical methods code: 1. Implement methods from scratch (not just calling scipy) 2. Use test problems with KNOWN exact solutions 3. Compute error at multiple refinement levels 4. Report convergence order: p = log(e_h / e_{h/2}) / log(2) 5. Output results.json with convergence data ================================================ FILE: researchclaw/domains/profiles/mathematics_optimization.yaml ================================================ domain_id: mathematics_optimization display_name: Mathematical Optimization parent_domain: mathematics experiment_paradigm: comparison condition_terminology: baseline: reference optimizer proposed: proposed optimizer variant: parameter variant input: test function metric: convergence rate typical_file_structure: config.py: "Optimization parameters" problems.py: "Test function definitions" optimizers.py: "Optimizer implementations" main.py: "Entry point" entry_point: main.py core_libraries: [numpy, scipy, matplotlib] docker_image: researchclaw/sandbox-math:latest gpu_required: false pip_packages: [numpy, scipy, matplotlib] metric_types: [scalar, convergence] standard_baselines: [Gradient Descent, L-BFGS, Nelder-Mead, Adam, CMA-ES] evaluation_protocol: "Run optimizers on standard test functions." 
statistical_tests: [paired_t_test, convergence_order_fit] github_search_terms: [optimization benchmark python, convex optimization scipy] paper_keywords: [optimization, convergence analysis] ================================================ FILE: researchclaw/domains/profiles/ml_compression.yaml ================================================ domain_id: ml_compression display_name: Model Compression (ML) parent_domain: ml experiment_paradigm: comparison condition_terminology: baseline: uncompressed model proposed: compressed model variant: ablation input: dataset metric: accuracy / compression ratio typical_file_structure: config.py: "Hyperparameters and compression configuration" data.py: "Dataset loading with train/val/test splits" model.py: "Teacher and student model architecture definitions" compress.py: "Compression methods (distillation, pruning, quantization)" main.py: "Entry point: setup -> compress -> evaluate -> report" entry_point: main.py core_libraries: - torch - numpy - scikit-learn docker_image: researchclaw/sandbox-ml:latest gpu_required: true pip_packages: - torch - torchvision - numpy - scikit-learn - matplotlib - tqdm metric_types: - scalar - learning_curve - table standard_baselines: - Uncompressed teacher model - Vanilla knowledge distillation - Magnitude pruning - Post-training quantization (INT8) - Random pruning evaluation_protocol: "Compare compressed model against full model. Report accuracy, model size, FLOPs, and inference latency. Mean +/- std over 3+ seeds." 
statistical_tests: - paired_t_test github_search_terms: - knowledge distillation pytorch - model pruning neural network - quantization aware training - model compression benchmark paper_keywords: - knowledge distillation - network pruning - model quantization - model compression - efficient inference ================================================ FILE: researchclaw/domains/profiles/ml_generative.yaml ================================================ domain_id: ml_generative display_name: Generative Models (ML) parent_domain: ml experiment_paradigm: comparison condition_terminology: baseline: baseline proposed: proposed method variant: ablation input: dataset metric: FID / IS typical_file_structure: config.py: "Hyperparameters and model configuration" data.py: "Dataset loading and preprocessing" model.py: "Generative model architecture (generator, discriminator, encoder, decoder)" train.py: "Training loop with sample generation and metric tracking" main.py: "Entry point: setup -> train -> generate -> evaluate -> report" entry_point: main.py core_libraries: - torch - torchvision - numpy - scipy docker_image: researchclaw/sandbox-ml:latest gpu_required: true pip_packages: - torch - torchvision - numpy - scipy - scikit-learn - matplotlib - tqdm - pillow metric_types: - scalar - learning_curve - image_grid standard_baselines: - DCGAN - WGAN-GP - VAE - DDPM - StyleGAN2 evaluation_protocol: "Train generative model, compute FID and IS on generated samples vs real data. Report mean +/- std over 3+ seeds." 
statistical_tests: - paired_t_test github_search_terms: - generative adversarial network pytorch - diffusion model training - VAE image generation - GAN benchmark FID paper_keywords: - generative adversarial network - diffusion model - variational autoencoder - image generation - score matching ================================================ FILE: researchclaw/domains/profiles/ml_generic.yaml ================================================ domain_id: ml_generic display_name: Machine Learning (General) parent_domain: ml experiment_paradigm: comparison condition_terminology: baseline: baseline proposed: proposed method variant: ablation input: dataset metric: accuracy/loss typical_file_structure: config.py: "Hyperparameters and model configuration" data.py: "Dataset loading and preprocessing" model.py: "Model architecture definition" train.py: "Training loop with metric tracking" main.py: "Entry point: setup → train → evaluate → report" entry_point: main.py core_libraries: - torch - numpy - scikit-learn docker_image: researchclaw/sandbox-ml:latest gpu_required: true metric_types: - scalar - learning_curve standard_baselines: - MLP - Linear - Random Forest - XGBoost evaluation_protocol: "Standard train/eval split. Report mean ± std over 3+ seeds." 
statistical_tests: - paired_t_test github_search_terms: - machine learning pytorch - deep learning benchmark paper_keywords: - machine learning - deep learning ================================================ FILE: researchclaw/domains/profiles/ml_graph.yaml ================================================ domain_id: ml_graph display_name: Graph Neural Networks (ML) parent_domain: ml experiment_paradigm: comparison condition_terminology: baseline: baseline proposed: proposed method variant: ablation input: dataset metric: accuracy / F1 typical_file_structure: config.py: "Hyperparameters and model configuration" data.py: "Graph dataset loading and preprocessing" model.py: "GNN architecture definition (message passing, pooling)" train.py: "Training loop with metric tracking" main.py: "Entry point: setup -> train -> evaluate -> report" entry_point: main.py core_libraries: - torch - torch_geometric - numpy - scikit-learn docker_image: researchclaw/sandbox-ml:latest gpu_required: true pip_packages: - torch - torch-geometric - torch-scatter - torch-sparse - numpy - scikit-learn - matplotlib - tqdm - networkx - ogb metric_types: - scalar - learning_curve standard_baselines: - GCN - GAT - GraphSAGE - GIN - MLP (no graph structure) evaluation_protocol: "Train on training set, evaluate on test set. Report mean +/- std over 3+ seeds. Use standard dataset splits when available." 
statistical_tests: - paired_t_test github_search_terms: - graph neural network pytorch - node classification PyG - graph classification benchmark - GNN message passing paper_keywords: - graph neural network - node classification - graph classification - message passing - link prediction ================================================ FILE: researchclaw/domains/profiles/ml_nlp.yaml ================================================ domain_id: ml_nlp display_name: Natural Language Processing (ML) parent_domain: ml experiment_paradigm: comparison condition_terminology: baseline: baseline proposed: proposed method variant: ablation input: dataset metric: accuracy/F1 typical_file_structure: config.py: "Hyperparameters and model configuration" data.py: "Dataset loading and tokenization" model.py: "Model architecture definition" train.py: "Training loop with metric tracking" main.py: "Entry point: setup → train → evaluate → report" entry_point: main.py core_libraries: - torch - transformers - datasets - numpy - scikit-learn docker_image: researchclaw/sandbox-ml:latest gpu_required: true pip_packages: - torch - transformers - datasets - tokenizers - numpy - scikit-learn metric_types: - scalar - learning_curve standard_baselines: - BERT-base - RoBERTa-base - DistilBERT - LSTM - TF-IDF + SVM evaluation_protocol: "Fine-tune on training set, evaluate on test set. Report mean ± std over 3+ seeds." 
statistical_tests: - paired_t_test github_search_terms: - text classification transformers - NLP benchmark pytorch - sentiment analysis huggingface paper_keywords: - natural language processing - text classification - language model ================================================ FILE: researchclaw/domains/profiles/ml_rl.yaml ================================================ domain_id: ml_rl display_name: Reinforcement Learning parent_domain: ml experiment_paradigm: comparison condition_terminology: baseline: baseline algorithm proposed: proposed algorithm variant: ablation input: environment metric: episode return typical_file_structure: config.py: "Hyperparameters and environment configuration" env.py: "Environment wrappers and setup" agent.py: "RL agent implementation" train.py: "Training loop with episode tracking" main.py: "Entry point: setup → train → evaluate → report" entry_point: main.py core_libraries: - torch - gymnasium - stable-baselines3 - numpy docker_image: researchclaw/sandbox-ml:latest gpu_required: true pip_packages: - torch - gymnasium - stable-baselines3 - numpy - matplotlib metric_types: - scalar - learning_curve standard_baselines: - PPO - SAC - TD3 - DQN - A2C evaluation_protocol: "Train for specified steps, evaluate over 100 episodes. Report mean ± std over 3+ seeds." 
statistical_tests: - paired_t_test github_search_terms: - reinforcement learning pytorch - RL benchmark gymnasium - stable baselines3 example paper_keywords: - reinforcement learning - policy gradient - deep reinforcement learning ================================================ FILE: researchclaw/domains/profiles/ml_tabular.yaml ================================================ domain_id: ml_tabular display_name: Tabular ML parent_domain: ml experiment_paradigm: comparison condition_terminology: baseline: baseline proposed: proposed method variant: ablation input: dataset metric: accuracy / RMSE / AUC typical_file_structure: config.py: "Hyperparameters and model configuration" data.py: "Tabular data loading, feature engineering, train/val/test splits" model.py: "Model definitions and hyperparameter tuning" evaluation.py: "Evaluation metrics and cross-validation" main.py: "Entry point: setup -> preprocess -> train -> evaluate -> report" entry_point: main.py core_libraries: - xgboost - lightgbm - numpy - scikit-learn - pandas docker_image: researchclaw/sandbox-ml:latest gpu_required: false pip_packages: - xgboost - lightgbm - catboost - numpy - scikit-learn - pandas - matplotlib - optuna metric_types: - scalar - table standard_baselines: - XGBoost - LightGBM - CatBoost - Random Forest - Logistic Regression / Linear Regression - MLP evaluation_protocol: "Use 5-fold cross-validation or fixed train/test splits. Report mean +/- std of primary metric across folds or seeds." 
statistical_tests: - paired_t_test - wilcoxon_signed_rank github_search_terms: - xgboost classification benchmark - lightgbm tabular data - tabular benchmark gradient boosting - catboost vs xgboost paper_keywords: - gradient boosting - tabular data - feature engineering - ensemble methods - tree-based models ================================================ FILE: researchclaw/domains/profiles/ml_vision.yaml ================================================ domain_id: ml_vision display_name: Computer Vision (ML) parent_domain: ml experiment_paradigm: comparison condition_terminology: baseline: baseline proposed: proposed method variant: ablation input: dataset metric: accuracy/loss typical_file_structure: config.py: "Hyperparameters and model configuration" data.py: "Dataset loading with train/val/test splits" model.py: "Model architecture definition" train.py: "Training loop with metric tracking" main.py: "Entry point: setup → train → evaluate → report" entry_point: main.py core_libraries: - torch - torchvision - numpy - scikit-learn docker_image: researchclaw/sandbox-ml:latest gpu_required: true pip_packages: - torch - torchvision - numpy - scikit-learn - matplotlib - tqdm metric_types: - scalar - learning_curve standard_baselines: - ResNet-18 - ResNet-50 - VGG-16 - DenseNet-121 - EfficientNet-B0 evaluation_protocol: "Train on training set, evaluate on test set. Report mean ± std over 3+ seeds." 
statistical_tests: - paired_t_test output_formats: - latex_table - convergence_plot figure_types: - accuracy_curve - loss_curve - confusion_matrix - bar_chart github_search_terms: - image classification pytorch - computer vision benchmark - CNN training pipeline paper_keywords: - image classification - convolutional neural network - visual recognition ================================================ FILE: researchclaw/domains/profiles/neuroscience_computational.yaml ================================================ domain_id: neuroscience_computational display_name: Computational Neuroscience parent_domain: neuroscience experiment_paradigm: simulation condition_terminology: baseline: reference model proposed: proposed model variant: ablation / parameter variant input: neural data / stimulus protocol metric: firing rate / spike correlation / decoding accuracy typical_file_structure: config.py: "Simulation parameters (neuron count, connectivity, time constants)" network.py: "Neural network/circuit model definition" neuron.py: "Single-neuron model (LIF, Hodgkin-Huxley, Izhikevich)" stimulus.py: "Input stimulus generation (Poisson spikes, current injection)" analysis.py: "Spike train analysis, firing rate, correlation, decoding" main.py: "Entry point: setup → simulate → analyze → report" entry_point: main.py core_libraries: - numpy - scipy - brian2 - neo - matplotlib docker_image: researchclaw/sandbox-generic:latest gpu_required: false pip_packages: - numpy - scipy - brian2 - neo - elephant - matplotlib - mne - nilearn metric_types: - scalar - learning_curve - convergence standard_baselines: - Leaky Integrate-and-Fire (LIF) - Hodgkin-Huxley - Izhikevich model - Rate-coded network evaluation_protocol: "Run neural simulation for specified duration. Compare spike statistics (firing rate, CV of ISI, pairwise correlation). For decoding tasks, report accuracy with cross-validation." 
statistical_tests: - paired_t_test - ks_test - permutation_test output_formats: - latex_table - raster_plot - firing_rate_plot figure_types: - raster_plot - firing_rate_histogram - membrane_potential_trace - connectivity_matrix - tuning_curve - spike_correlation_matrix github_search_terms: - spiking neural network python - brian2 simulation - computational neuroscience model - neural population dynamics - neural decoding python paper_keywords: - spiking neural network - neural dynamics - population coding - neural circuit model - brain-inspired computing compute_budget_guidance: | Time budget for neuroscience simulations: - Single-neuron models: fast, can run thousands of trials - Small networks (< 1000 neurons): seconds per trial - Large networks (> 10000 neurons): use Brian2 or vectorized NumPy - Keep simulation duration reasonable (100ms–10s biological time) - Reduce network size if simulations exceed time budget dataset_guidance: | Computational neuroscience experiments generate synthetic data: - Define neuron models and connectivity in code - Generate input stimuli programmatically (Poisson spike trains, step currents) - Use standard benchmark circuits (E/I balanced network, winner-take-all) - Do NOT download external neural datasets - For brain imaging analysis, generate synthetic fMRI/EEG signals code_generation_hints: | Neuroscience simulation code requirements: 1. Implement neuron models with biologically plausible parameters 2. Use standard time integration (Euler for LIF, RK4 for Hodgkin-Huxley) 3. Generate spike trains and compute standard statistics 4. Compare models on same stimulus and initial conditions 5. Report: firing rate (Hz), CV of ISI, pairwise correlation, Fano factor 6. For decoding: train decoder on spike counts, report accuracy with k-fold CV 7. 
Output all results to results.json result_analysis_hints: | Neuroscience result analysis: - Firing rate: mean and std across neurons and trials - Regularity: coefficient of variation of inter-spike intervals (CV ISI) - Synchrony: pairwise spike-count correlation - Population: Fano factor, dimensionality of population activity - Decoding: accuracy, confusion matrix, information content (bits) - Use raster plots and PSTHs for qualitative assessment ================================================ FILE: researchclaw/domains/profiles/neuroscience_imaging.yaml ================================================ domain_id: neuroscience_imaging display_name: Brain Imaging Analysis parent_domain: neuroscience experiment_paradigm: comparison condition_terminology: baseline: standard analysis pipeline proposed: proposed analysis method variant: ablation / preprocessing variant input: imaging dataset metric: classification accuracy / connectivity strength typical_file_structure: config.py: "Analysis parameters (TR, voxel size, atlas, frequency bands)" preprocess.py: "Preprocessing pipeline (motion correction, filtering)" features.py: "Feature extraction (ROI signals, connectivity matrices)" classify.py: "Decoding / classification pipeline" main.py: "Entry point: preprocess → extract → analyze → report" entry_point: main.py core_libraries: - numpy - scipy - nilearn - mne - scikit-learn - matplotlib docker_image: researchclaw/sandbox-generic:latest gpu_required: false pip_packages: - numpy - scipy - nilearn - mne - scikit-learn - matplotlib - nibabel metric_types: - scalar - confusion standard_baselines: - SVM classifier - Correlation-based connectivity - Atlas-based parcellation - Band-pass filtering evaluation_protocol: "Generate synthetic imaging data or use standard atlases. Run analysis pipeline, report classification accuracy with cross-validation, or connectivity metrics." 
statistical_tests: - paired_t_test - permutation_test - fdr_correction output_formats: - latex_table - brain_map - connectivity_matrix figure_types: - brain_map - connectivity_matrix - bar_chart - confusion_matrix github_search_terms: - nilearn fmri analysis - mne eeg python - brain decoding sklearn - functional connectivity python paper_keywords: - brain imaging - functional connectivity - neural decoding - fMRI analysis - EEG signal processing compute_budget_guidance: | Time budget for brain imaging analysis: - Synthetic data generation: fast - Feature extraction from ROIs: seconds - Classification with cross-validation: seconds to minutes - Keep number of subjects/trials manageable for time budget dataset_guidance: | Brain imaging experiments should generate synthetic data: - Use nilearn.datasets for atlas references only - Generate synthetic fMRI signals with known activation patterns - Generate synthetic EEG with known frequency components - Do NOT download large imaging datasets - Simulate realistic noise characteristics (scanner drift, eye blinks) code_generation_hints: | Brain imaging analysis code: 1. Generate synthetic imaging data with known ground truth 2. Apply standard preprocessing (filtering, artifact removal) 3. Extract features (ROI timeseries, connectivity, power spectra) 4. Run classification/analysis pipeline 5. Report accuracy with cross-validation (k-fold, leave-one-out) 6. Compare methods on the same synthetic data 7. 
Output results to results.json result_analysis_hints: | Brain imaging result analysis: - Classification: accuracy, F1, confusion matrix (with cross-validation) - Connectivity: correlation strength, graph metrics (modularity, efficiency) - Frequency analysis: power spectral density, band power ratios - Use permutation tests for statistical significance - Apply FDR correction for multiple comparisons ================================================ FILE: researchclaw/domains/profiles/physics_pde.yaml ================================================ domain_id: physics_pde display_name: PDE Solvers parent_domain: physics experiment_paradigm: convergence condition_terminology: baseline: reference solver proposed: proposed solver variant: parameter variant input: test problem metric: L2 error norm typical_file_structure: config.py: "Problem parameters (domain, boundary conditions, grid sizes)" problem.py: "PDE definition and analytical solution (if available)" solver.py: "Numerical solver implementation" analysis.py: "Error computation and convergence analysis" main.py: "Entry point: define problem → solve at multiple resolutions → analyze → report" entry_point: main.py core_libraries: - numpy - scipy - matplotlib docker_image: researchclaw/sandbox-physics:latest gpu_required: false pip_packages: - numpy - scipy - matplotlib - findiff metric_types: - scalar - convergence standard_baselines: - Finite Difference (2nd order) - Finite Element (P1) - Spectral Method - Crank-Nicolson evaluation_protocol: "Solve on progressively finer grids, compute L2/L-inf error against analytical solution. Determine convergence order." 
statistical_tests: - convergence_order_fit output_formats: - latex_table - convergence_plot figure_types: - convergence_plot - solution_surface_plot - error_distribution_plot github_search_terms: - PDE solver python - finite difference python - finite element numpy - poisson equation solver paper_keywords: - partial differential equations - finite element method - finite difference method - numerical PDE compute_budget_guidance: | PDE solver time budget: - 2D problems: grid sizes 16, 32, 64, 128, 256 (5 levels) - 3D problems: grid sizes 8, 16, 32, 64 (4 levels) - Time-dependent: keep simulation time short, focus on spatial convergence dataset_guidance: | PDE problems use manufactured or standard test problems: - Use problems with known analytical solutions for error measurement - Standard: Poisson, Heat equation, Wave equation, Advection - Define boundary conditions and source terms in code - Do NOT download external datasets code_generation_hints: | PDE solver requirements: 1. Implement the discretization correctly (stencils, assembly) 2. Run at MULTIPLE grid sizes for convergence study 3. Compute error norms: L2 = sqrt(h^d * sum((u_h - u_exact)^2)) 4. Report convergence order: p = log(e1/e2) / log(h1/h2) 5. Handle boundary conditions properly 6. 
Output results.json with convergence data: {"convergence": {"method": [{"h": 0.1, "l2_error": 0.05, "linf_error": 0.1}]}} result_analysis_hints: | PDE convergence analysis: - Fit log-log regression: log(error) = p * log(h) + C - Expected: 2nd order FD → p≈2, 4th order → p≈4 - Report observed vs expected convergence order - Flag if convergence order is significantly below expected ================================================ FILE: researchclaw/domains/profiles/physics_quantum.yaml ================================================ domain_id: physics_quantum display_name: Quantum Mechanics / Quantum Physics parent_domain: physics experiment_paradigm: comparison condition_terminology: baseline: reference method proposed: proposed method variant: parameter variant input: quantum system metric: energy / fidelity typical_file_structure: config.py: "System parameters (potential, basis, grid)" system.py: "Quantum system definition" solver.py: "Solver implementation" main.py: "Entry point: setup → solve → analyze → report" entry_point: main.py core_libraries: [numpy, scipy, matplotlib] docker_image: researchclaw/sandbox-physics:latest gpu_required: false pip_packages: [numpy, scipy, matplotlib] metric_types: [scalar, convergence] standard_baselines: [Exact diagonalization, Variational method, Perturbation theory] evaluation_protocol: "Compare methods on standard quantum systems. Report energy accuracy, fidelity." 
statistical_tests: [relative_error, convergence_order_fit] github_search_terms: [quantum mechanics python, schrodinger equation solver] paper_keywords: [quantum mechanics, Schrodinger equation] ================================================ FILE: researchclaw/domains/profiles/physics_simulation.yaml ================================================ domain_id: physics_simulation display_name: Computational Physics (Simulation) parent_domain: physics experiment_paradigm: simulation condition_terminology: baseline: reference method proposed: proposed method variant: parameter variant input: initial conditions metric: error norm typical_file_structure: config.py: "Simulation parameters (grid, timestep, boundary conditions)" system.py: "Physical system definition (potentials, Hamiltonian)" integrator.py: "Numerical integrator implementation" analysis.py: "Observable computation from simulation data" main.py: "Entry point: setup → simulate → analyze → report" entry_point: main.py core_libraries: - numpy - scipy - matplotlib - jax docker_image: researchclaw/sandbox-physics:latest gpu_required: false pip_packages: - numpy - scipy - matplotlib - jax - jaxlib metric_types: - scalar - convergence standard_baselines: - Velocity Verlet - Leapfrog - RK4 - Euler evaluation_protocol: "Run simulation with multiple methods, compare energy conservation and trajectory accuracy. Report energy drift, relative error." 
statistical_tests: - convergence_order_fit - relative_error output_formats: - latex_table - convergence_plot figure_types: - trajectory_plot - energy_conservation_plot - convergence_plot - phase_space_plot github_search_terms: - molecular dynamics python - N-body simulation numpy - symplectic integrator python - physics simulation jax paper_keywords: - molecular dynamics - N-body simulation - symplectic integrator - energy conservation compute_budget_guidance: | Time budget considerations for physics simulations: - Short simulations (< 100 steps): suitable for convergence tests - Medium simulations (100-10000 steps): typical production runs - Long simulations (> 10000 steps): only with efficient integrators Adjust timestep (dt) and number of steps to fit within the time budget. dataset_guidance: | Physics simulations generate their own data: - Define initial conditions programmatically (positions, velocities) - Use standard test problems (harmonic oscillator, Kepler, Lennard-Jones) - Do NOT download external datasets - Generate test systems with known analytical solutions for validation code_generation_hints: | Physics simulation code requirements: 1. Implement actual physics — conserve energy, momentum as appropriate 2. Use appropriate units (reduced units for MD, SI for classical mechanics) 3. Compare methods at the SAME initial conditions and timestep 4. For convergence studies: run at multiple dt values (e.g., 0.1, 0.05, 0.025, 0.0125) 5. Report energy drift as relative error: |E(t) - E(0)| / |E(0)| 6. 
Output all results to results.json result_analysis_hints: | Physics result analysis: - For convergence: fit log(error) vs log(h) to determine convergence order - For energy conservation: report max and mean energy drift - For trajectory: compare against analytical solution if available - Use log-log plots for convergence, linear plots for time evolution ================================================ FILE: researchclaw/domains/profiles/robotics_control.yaml ================================================ domain_id: robotics_control display_name: Robotics & Control parent_domain: robotics experiment_paradigm: comparison condition_terminology: baseline: baseline controller/algorithm proposed: proposed controller/algorithm variant: ablation input: environment metric: episode return / success rate typical_file_structure: config.py: "Environment and training configuration" env.py: "Environment setup and wrappers" agent.py: "Control policy / RL agent" train.py: "Training loop" main.py: "Entry point: setup → train → evaluate → report" entry_point: main.py core_libraries: - gymnasium - mujoco - stable-baselines3 - torch - numpy docker_image: researchclaw/sandbox-robotics:latest gpu_required: true pip_packages: - gymnasium - mujoco - stable-baselines3 - torch - numpy - matplotlib metric_types: - scalar - learning_curve standard_baselines: - PPO - SAC - TD3 - PID controller evaluation_protocol: "Train in environment, evaluate over 100 episodes. Report mean return, success rate." 
statistical_tests: - paired_t_test github_search_terms: - mujoco python control - robot learning gymnasium - manipulation RL paper_keywords: - robot learning - control policy - manipulation ================================================ FILE: researchclaw/domains/profiles/security_detection.yaml ================================================ domain_id: security_detection display_name: Security / Intrusion Detection parent_domain: security experiment_paradigm: comparison condition_terminology: baseline: baseline detector proposed: proposed detector variant: feature variant input: dataset metric: TPR / FPR / F1 typical_file_structure: config.py: "Detection parameters and thresholds" data.py: "Data loading and feature extraction" model.py: "Detection model implementation" evaluate.py: "Evaluation with per-class metrics" main.py: "Entry point: load → extract features → train → evaluate → report" entry_point: main.py core_libraries: - scikit-learn - numpy - pandas docker_image: researchclaw/sandbox-security:latest gpu_required: false pip_packages: - scikit-learn - numpy - pandas - matplotlib - xgboost metric_types: - scalar - confusion_matrix standard_baselines: - Random Forest - XGBoost - SVM - Isolation Forest - One-Class SVM evaluation_protocol: "Train/test split. Report TPR, FPR, F1, per-class metrics." statistical_tests: - paired_t_test - mcnemar_test github_search_terms: - intrusion detection python - network anomaly detection - malware classification sklearn paper_keywords: - intrusion detection - anomaly detection - network security dataset_guidance: | Security detection datasets: - Generate synthetic network traffic or tabular data - Simulate normal vs attack patterns - Use class-imbalanced data (realistic for security) - Do NOT download external datasets ================================================ FILE: researchclaw/domains/prompt_adapter.py ================================================ """Domain-aware prompt adaptation layer. 
Instead of rewriting ``prompts.py`` (2395+ lines of battle-tested code), this module wraps existing prompt blocks with domain-specific overrides via the **adapter pattern**. Usage:: adapter = get_adapter(domain_profile) blocks = adapter.get_code_generation_blocks(context) # blocks dict can be injected into the existing prompt system """ from __future__ import annotations import logging from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Any from researchclaw.domains.detector import DomainProfile, is_ml_domain logger = logging.getLogger(__name__) @dataclass class PromptBlocks: """Collection of prompt blocks for a specific pipeline stage. Each field is a string block that gets injected into the prompt template. Empty strings mean "use the default from prompts.py". """ compute_budget: str = "" dataset_guidance: str = "" hp_reporting: str = "" code_generation_hints: str = "" result_analysis_hints: str = "" experiment_design_context: str = "" statistical_test_guidance: str = "" output_format_guidance: str = "" class PromptAdapter(ABC): """Base class for domain-specific prompt adapters. Subclasses override methods to provide domain-specific prompt blocks. The ML adapter returns empty strings for everything (meaning: use the existing hardcoded behavior in prompts.py unchanged). """ def __init__(self, domain: DomainProfile) -> None: self.domain = domain @abstractmethod def get_code_generation_blocks(self, context: dict[str, Any]) -> PromptBlocks: """Return prompt blocks for the code generation stage.""" @abstractmethod def get_experiment_design_blocks(self, context: dict[str, Any]) -> PromptBlocks: """Return prompt blocks for the experiment design stage.""" @abstractmethod def get_result_analysis_blocks(self, context: dict[str, Any]) -> PromptBlocks: """Return prompt blocks for the result analysis stage.""" def get_blueprint_context(self) -> str: """Extra context injected into the blueprint generation prompt. 
Includes domain-specific file structure guidance, library hints, etc. """ parts: list[str] = [] if self.domain.typical_file_structure: parts.append("## Recommended File Structure") for fname, desc in self.domain.typical_file_structure.items(): parts.append(f"- `{fname}`: {desc}") if self.domain.core_libraries: parts.append(f"\n## Core Libraries: {', '.join(self.domain.core_libraries)}") if self.domain.code_generation_hints: parts.append(f"\n## Domain-Specific Hints\n{self.domain.code_generation_hints}") return "\n".join(parts) def get_condition_terminology(self) -> dict[str, str]: """Return the domain's terminology mapping.""" return self.domain.condition_terminology # --------------------------------------------------------------------------- # ML Adapter — wraps ALL current behavior unchanged # --------------------------------------------------------------------------- class MLPromptAdapter(PromptAdapter): """ML adapter: returns empty blocks so the existing prompts.py behavior is used verbatim. This is the zero-regression guarantee. """ def get_code_generation_blocks(self, context: dict[str, Any]) -> PromptBlocks: # Empty = use existing hardcoded ML blocks in prompts.py return PromptBlocks() def get_experiment_design_blocks(self, context: dict[str, Any]) -> PromptBlocks: return PromptBlocks() def get_result_analysis_blocks(self, context: dict[str, Any]) -> PromptBlocks: return PromptBlocks() # --------------------------------------------------------------------------- # Generic Adapter — LLM-knowledge-only fallback for unknown domains # --------------------------------------------------------------------------- class GenericPromptAdapter(PromptAdapter): """Generic adapter for domains without a specialized adapter. Uses the DomainProfile's guidance fields (loaded from YAML) to construct prompt blocks. Falls back to sensible generic guidance. 
""" def get_code_generation_blocks(self, context: dict[str, Any]) -> PromptBlocks: domain = self.domain paradigm = domain.experiment_paradigm libs = ", ".join(domain.core_libraries) if domain.core_libraries else "numpy, scipy" code_hints = domain.code_generation_hints or self._default_code_hints(paradigm, libs) dataset_guidance = domain.dataset_guidance or self._default_dataset_guidance(paradigm) hp_guidance = domain.hp_reporting_guidance or self._default_hp_guidance() return PromptBlocks( compute_budget=domain.compute_budget_guidance, dataset_guidance=dataset_guidance, hp_reporting=hp_guidance, code_generation_hints=code_hints, output_format_guidance=self._output_format_guidance(), ) def get_experiment_design_blocks(self, context: dict[str, Any]) -> PromptBlocks: domain = self.domain terminology = domain.condition_terminology paradigm = domain.experiment_paradigm design_context = ( f"This is a {domain.display_name} experiment.\n" f"Experiment paradigm: {paradigm}\n" ) if terminology: design_context += "Terminology:\n" for key, term in terminology.items(): design_context += f" - {key}: {term}\n" if domain.standard_baselines: design_context += f"Standard baselines in this domain: {', '.join(domain.standard_baselines)}\n" stats = ", ".join(domain.statistical_tests) if domain.statistical_tests else "appropriate statistical tests" stat_guidance = f"Use {stats} for result significance testing." 
return PromptBlocks( experiment_design_context=design_context, statistical_test_guidance=stat_guidance, ) def get_result_analysis_blocks(self, context: dict[str, Any]) -> PromptBlocks: domain = self.domain analysis_hints = domain.result_analysis_hints or "" if domain.statistical_tests: stat_guidance = ( "Statistical tests to use for this domain:\n" + "\n".join(f" - {t}" for t in domain.statistical_tests) ) else: stat_guidance = "" return PromptBlocks( result_analysis_hints=analysis_hints, statistical_test_guidance=stat_guidance, ) def _default_code_hints(self, paradigm: str, libs: str) -> str: hints = f"Core libraries for this domain: {libs}\n" if paradigm == "convergence": hints += ( "This is a convergence study. The code should:\n" "1. Run the method at multiple refinement levels (e.g., grid sizes, timesteps)\n" "2. Compute error norms at each level\n" "3. Report results in a format suitable for convergence analysis\n" "4. Output results as JSON to results.json\n" ) elif paradigm == "progressive_spec": hints += ( "This uses progressive specification (common in economics):\n" "1. Start with a simple model (e.g., OLS)\n" "2. Progressively add complexity (controls, fixed effects, IV)\n" "3. Present results as a regression table\n" "4. Output results as JSON to results.json\n" ) elif paradigm == "simulation": hints += ( "This is a simulation study. The code should:\n" "1. Set up the physical/computational system\n" "2. Run the simulation\n" "3. Compute observables from simulation data\n" "4. 
Output results as JSON to results.json\n" ) else: hints += ( "Output all results as JSON to results.json with the structure:\n" '{"conditions": {"method_name": {"seed_X": {"metric": value}}}}\n' ) return hints def _default_dataset_guidance(self, paradigm: str) -> str: if paradigm in ("convergence", "simulation"): return ( "Data/input for this experiment should be generated programmatically.\n" "Define initial conditions, parameters, or test problems in code.\n" "Do NOT attempt to download external datasets." ) return "" def _default_hp_guidance(self) -> str: return ( "Report all experiment parameters in a dictionary printed to stdout:\n" "HYPERPARAMETERS: {'param1': value1, 'param2': value2, ...}" ) def _output_format_guidance(self) -> str: domain = self.domain if "convergence" in domain.metric_types: return ( "Output results as JSON to results.json with convergence data:\n" '{"convergence": {"method": [{"h": 0.1, "error": 0.05}, ...]}}' ) if "table" in domain.metric_types: return ( "Output results as JSON to results.json with table data:\n" '{"regression_table": {"spec_1": {"coeff": 0.15, "se": 0.03, ...}}}' ) return ( "Output results as JSON to results.json:\n" '{"conditions": {"method": {"seed_X": {"metric": value}}}}' ) # --------------------------------------------------------------------------- # Adapter registry # --------------------------------------------------------------------------- # Maps domain_id prefixes to adapter classes. # If a domain_id starts with "ml_", the ML adapter is used. 
def _build_adapter_registry() -> dict[str, type[PromptAdapter]]: """Build the adapter registry with lazy imports for domain adapters.""" registry: dict[str, type[PromptAdapter]] = { "ml_": MLPromptAdapter, "generic": GenericPromptAdapter, } try: from researchclaw.domains.adapters.physics import PhysicsPromptAdapter registry["physics_"] = PhysicsPromptAdapter except ImportError: pass try: from researchclaw.domains.adapters.economics import EconomicsPromptAdapter registry["economics_"] = EconomicsPromptAdapter except ImportError: pass try: from researchclaw.domains.adapters.biology import BiologyPromptAdapter registry["biology_"] = BiologyPromptAdapter except ImportError: pass try: from researchclaw.domains.adapters.chemistry import ChemistryPromptAdapter registry["chemistry_"] = ChemistryPromptAdapter except ImportError: pass try: from researchclaw.domains.adapters.security import SecurityPromptAdapter registry["security_"] = SecurityPromptAdapter except ImportError: pass try: from researchclaw.domains.adapters.math import MathPromptAdapter registry["mathematics_"] = MathPromptAdapter except ImportError: pass try: from researchclaw.domains.adapters.neuroscience import NeurosciencePromptAdapter registry["neuroscience_"] = NeurosciencePromptAdapter except ImportError: pass try: from researchclaw.domains.adapters.robotics import RoboticsPromptAdapter registry["robotics_"] = RoboticsPromptAdapter except ImportError: pass return registry _ADAPTER_REGISTRY: dict[str, type[PromptAdapter]] = _build_adapter_registry() def register_adapter(domain_prefix: str, adapter_cls: type[PromptAdapter]) -> None: """Register a custom adapter for a domain prefix.""" _ADAPTER_REGISTRY[domain_prefix] = adapter_cls def get_adapter(domain: DomainProfile) -> PromptAdapter: """Get the appropriate PromptAdapter for a given domain. Lookup order: 1. Exact domain_id match 2. Prefix match (e.g., "ml_" for all ML domains) 3. 
Generic fallback """ # Exact match if domain.domain_id in _ADAPTER_REGISTRY: return _ADAPTER_REGISTRY[domain.domain_id](domain) # Prefix match for prefix, adapter_cls in _ADAPTER_REGISTRY.items(): if prefix.endswith("_") and domain.domain_id.startswith(prefix): return adapter_cls(domain) # ML domain check if is_ml_domain(domain): return MLPromptAdapter(domain) # Generic fallback return GenericPromptAdapter(domain) ================================================ FILE: researchclaw/evolution.py ================================================ """Self-evolution system for the ResearchClaw pipeline. Records lessons from each pipeline run (failures, slow stages, quality issues) and injects them into future runs as prompt overlays. Inspired by Sibyl's time-weighted evolution mechanism. Architecture ------------ * ``LessonCategory`` — 6 issue categories for classification. * ``LessonEntry`` — single lesson (stage, category, severity, description, ts). * ``EvolutionStore`` — JSONL-backed persistent store with append + query. * ``extract_lessons()`` — auto-extract lessons from ``StageResult`` lists. * ``build_overlay()`` — generate per-stage prompt overlay text. 
Usage ----- :: from researchclaw.evolution import EvolutionStore, extract_lessons store = EvolutionStore(Path("evolution")) lessons = extract_lessons(results) store.append_many(lessons) overlay = store.build_overlay("hypothesis_gen", max_lessons=5) """ from __future__ import annotations import json import logging import math from dataclasses import asdict, dataclass from datetime import datetime, timezone from enum import Enum from pathlib import Path logger = logging.getLogger(__name__) class LessonCategory(str, Enum): """Issue classification for extracted lessons.""" SYSTEM = "system" # Environment / network / timeout EXPERIMENT = "experiment" # Code validation, sandbox timeout WRITING = "writing" # Paper quality issues ANALYSIS = "analysis" # Weak analysis, missing comparison LITERATURE = "literature" # Search / verification failures PIPELINE = "pipeline" # Stage orchestration issues @dataclass class LessonEntry: """A single lesson extracted from a pipeline run.""" stage_name: str stage_num: int category: str severity: str # "info", "warning", "error" description: str timestamp: str # ISO 8601 run_id: str = "" def to_dict(self) -> dict[str, object]: return asdict(self) @classmethod def from_dict(cls, data: dict[str, object]) -> LessonEntry: return cls( stage_name=str(data.get("stage_name", "")), stage_num=int(data.get("stage_num", 0)), category=str(data.get("category", "pipeline")), severity=str(data.get("severity", "info")), description=str(data.get("description", "")), timestamp=str(data.get("timestamp", "")), run_id=str(data.get("run_id", "")), ) # --------------------------------------------------------------------------- # Lesson classification keywords # --------------------------------------------------------------------------- _CATEGORY_KEYWORDS: dict[str, list[str]] = { LessonCategory.SYSTEM: [ "timeout", "connection", "network", "oom", "memory", "permission", "ssh", "socket", "dns", ], LessonCategory.EXPERIMENT: [ "sandbox", "validation", "import", 
"syntax", "subprocess", "experiment", "code", "execution", ], LessonCategory.WRITING: [ "paper", "draft", "outline", "revision", "review", "template", "latex", ], LessonCategory.ANALYSIS: [ "analysis", "metric", "statistic", "comparison", "baseline", ], LessonCategory.LITERATURE: [ "search", "citation", "verify", "hallucin", "arxiv", "semantic_scholar", "literature", "collect", ], } def _classify_error(stage_name: str, error_text: str) -> str: """Classify an error into a LessonCategory based on keywords.""" combined = f"{stage_name} {error_text}".lower() best_category = LessonCategory.PIPELINE best_score = 0 for category, keywords in _CATEGORY_KEYWORDS.items(): score = sum(1 for kw in keywords if kw in combined) if score > best_score: best_score = score best_category = category return best_category # --------------------------------------------------------------------------- # Lesson extraction from pipeline results # --------------------------------------------------------------------------- # Stage name mapping (import-free to avoid circular deps) _STAGE_NAMES: dict[int, str] = { 1: "topic_init", 2: "problem_decompose", 3: "search_strategy", 4: "literature_collect", 5: "literature_screen", 6: "knowledge_extract", 7: "synthesis", 8: "hypothesis_gen", 9: "experiment_design", 10: "code_generation", 11: "resource_planning", 12: "experiment_run", 13: "iterative_refine", 14: "result_analysis", 15: "research_decision", 16: "paper_outline", 17: "paper_draft", 18: "peer_review", 19: "paper_revision", 20: "quality_gate", 21: "knowledge_archive", 22: "export_publish", 23: "citation_verify", } def extract_lessons( results: list[object], run_id: str = "", run_dir: Path | None = None, ) -> list[LessonEntry]: """Extract lessons from a list of StageResult objects. 
Detects: - Failed stages → error lesson - Blocked stages → pipeline lesson - Decision pivots/refines → pipeline lesson (with rationale if available) - Runtime warnings from experiment stderr → code_bug lesson - Metric anomalies (NaN, identical convergence) → metric_anomaly lesson """ now = datetime.now(timezone.utc).isoformat(timespec="seconds") lessons: list[LessonEntry] = [] for result in results: stage_num = int(getattr(result, "stage", 0)) stage_name = _STAGE_NAMES.get(stage_num, f"stage_{stage_num}") status = str(getattr(result, "status", "")) error = getattr(result, "error", None) decision = str(getattr(result, "decision", "proceed")) # Failed stages if "failed" in status.lower() and error: category = _classify_error(stage_name, str(error)) lessons.append(LessonEntry( stage_name=stage_name, stage_num=stage_num, category=category, severity="error", description=f"Stage {stage_name} failed: {str(error)[:300]}", timestamp=now, run_id=run_id, )) # Blocked stages if "blocked" in status.lower(): lessons.append(LessonEntry( stage_name=stage_name, stage_num=stage_num, category=LessonCategory.PIPELINE, severity="warning", description=f"Stage {stage_name} blocked awaiting approval", timestamp=now, run_id=run_id, )) # PIVOT / REFINE decisions — extract rationale if available if decision in ("pivot", "refine"): rationale = _extract_decision_rationale(run_dir) if run_dir else "" desc = f"Research decision was {decision.upper()}" if rationale: desc += f": {rationale[:200]}" else: desc += " — prior hypotheses/experiments were insufficient" lessons.append(LessonEntry( stage_name=stage_name, stage_num=stage_num, category=LessonCategory.PIPELINE, severity="warning", description=desc, timestamp=now, run_id=run_id, )) # --- Extract lessons from experiment artifacts --- if run_dir is not None: lessons.extend(_extract_runtime_lessons(run_dir, now, run_id)) return lessons def _extract_decision_rationale(run_dir: Path) -> str: """Extract rationale from the most recent 
decision_structured.json. Supports multiple field formats: - ``rationale`` or ``reason`` key (direct) - ``raw_text_excerpt`` containing ``## Justification`` section (LLM output) """ for stage_dir in sorted(run_dir.glob("stage-15*"), reverse=True): decision_file = stage_dir / "decision_structured.json" if decision_file.exists(): try: data = json.loads(decision_file.read_text(encoding="utf-8")) if not isinstance(data, dict): continue # Try direct rationale/reason keys first direct = data.get("rationale", "") or data.get("reason", "") if direct: return str(direct) # Parse raw_text_excerpt for Justification section raw = data.get("raw_text_excerpt", "") if raw: return _parse_justification_from_excerpt(str(raw)) except (json.JSONDecodeError, OSError): pass return "" def _parse_justification_from_excerpt(text: str) -> str: """Extract the Justification/Rationale section from LLM decision text.""" import re # Match ## Justification, ## Rationale, or similar headings pattern = re.compile( r"##\s*(?:Justification|Rationale|Reason)\s*\n(.*?)(?=\n##|\Z)", re.DOTALL | re.IGNORECASE, ) match = pattern.search(text) if match: return match.group(1).strip()[:300] # Fallback: skip the first line (## Decision / **REFINE**) and return the rest lines = [l.strip() for l in text.splitlines() if l.strip()] # Skip heading lines starting with ## or ** content_lines = [ l for l in lines if not l.startswith("##") and not (l.startswith("**") and l.endswith("**")) ] if content_lines: return " ".join(content_lines)[:300] return "" def _extract_runtime_lessons( run_dir: Path, timestamp: str, run_id: str ) -> list[LessonEntry]: """Extract fine-grained lessons from experiment run artifacts.""" import math lessons: list[LessonEntry] = [] # Check sandbox run results for stderr warnings and NaN for runs_dir in run_dir.glob("stage-*/runs"): for run_file in runs_dir.glob("*.json"): if run_file.name == "results.json": continue try: payload = json.loads(run_file.read_text(encoding="utf-8")) except 
(json.JSONDecodeError, OSError): continue if not isinstance(payload, dict): continue # Check stderr for runtime warnings stderr = payload.get("stderr", "") if stderr and any( kw in stderr for kw in ("Warning", "Error", "divide", "overflow", "invalid value") ): lessons.append(LessonEntry( stage_name="experiment_run", stage_num=12, category=LessonCategory.EXPERIMENT, severity="warning", description=f"Runtime warning in experiment: {stderr[:200]}", timestamp=timestamp, run_id=run_id, )) # Check metrics for NaN/Inf metrics = payload.get("metrics", {}) if isinstance(metrics, dict): for key, val in metrics.items(): try: fval = float(val) if math.isnan(fval) or math.isinf(fval): lessons.append(LessonEntry( stage_name="experiment_run", stage_num=12, category=LessonCategory.EXPERIMENT, severity="error", description=f"Metric '{key}' was {val} — code bug (division by zero or overflow)", timestamp=timestamp, run_id=run_id, )) except (TypeError, ValueError): pass return lessons # --------------------------------------------------------------------------- # Time-decay weighting # --------------------------------------------------------------------------- HALF_LIFE_DAYS: float = 30.0 MAX_AGE_DAYS: float = 90.0 def _time_weight(timestamp_iso: str) -> float: """Compute exponential decay weight for a lesson based on age. Uses 30-day half-life: weight = exp(-age_days * ln(2) / 30). Returns 0.0 for lessons older than 90 days. 
""" try: ts = datetime.fromisoformat(timestamp_iso) if ts.tzinfo is None: ts = ts.replace(tzinfo=timezone.utc) age = datetime.now(timezone.utc) - ts age_days = age.total_seconds() / 86400.0 if age_days > MAX_AGE_DAYS: return 0.0 return math.exp(-age_days * math.log(2) / HALF_LIFE_DAYS) except (ValueError, TypeError): return 0.0 # --------------------------------------------------------------------------- # Evolution store # --------------------------------------------------------------------------- class EvolutionStore: """JSONL-backed store for pipeline lessons.""" def __init__(self, store_dir: Path) -> None: self._dir = store_dir self._dir.mkdir(parents=True, exist_ok=True) self._lessons_path = self._dir / "lessons.jsonl" @property def lessons_path(self) -> Path: return self._lessons_path def append(self, lesson: LessonEntry) -> None: """Append a single lesson to the store.""" with self._lessons_path.open("a", encoding="utf-8") as f: f.write(json.dumps(lesson.to_dict(), ensure_ascii=False) + "\n") def append_many(self, lessons: list[LessonEntry]) -> None: """Append multiple lessons atomically.""" if not lessons: return with self._lessons_path.open("a", encoding="utf-8") as f: for lesson in lessons: f.write(json.dumps(lesson.to_dict(), ensure_ascii=False) + "\n") logger.info("Appended %d lessons to evolution store", len(lessons)) def load_all(self) -> list[LessonEntry]: """Load all lessons from disk.""" if not self._lessons_path.exists(): return [] lessons: list[LessonEntry] = [] for line in self._lessons_path.read_text(encoding="utf-8").splitlines(): line = line.strip() if not line: continue try: data = json.loads(line) lessons.append(LessonEntry.from_dict(data)) except (json.JSONDecodeError, TypeError): continue return lessons def query_for_stage( self, stage_name: str, *, max_lessons: int = 5 ) -> list[LessonEntry]: """Return the most relevant lessons for a stage, weighted by recency. 
    def query_for_stage(
        self, stage_name: str, *, max_lessons: int = 5
    ) -> list[LessonEntry]:
        """Return the most relevant lessons for a stage, weighted by recency.

        Includes lessons that directly match the stage, plus high-severity
        lessons from related stages.

        Ranking: each lesson's ``_time_weight`` (zero-weight entries are
        dropped) is doubled for an exact ``stage_name`` match and scaled by
        1.5 for "error" severity; the top *max_lessons* are returned.
        """
        all_lessons = self.load_all()
        scored: list[tuple[float, LessonEntry]] = []
        for lesson in all_lessons:
            weight = _time_weight(lesson.timestamp)
            if weight <= 0.0:
                # Expired (past retention window) or unparseable timestamp.
                continue
            # Boost direct stage matches
            if lesson.stage_name == stage_name:
                weight *= 2.0
            # Boost errors over warnings/info
            if lesson.severity == "error":
                weight *= 1.5
            scored.append((weight, lesson))
        scored.sort(key=lambda x: x[0], reverse=True)
        return [entry for _, entry in scored[:max_lessons]]

    def build_overlay(
        self,
        stage_name: str,
        *,
        max_lessons: int = 5,
        skills_dir: str = "",
    ) -> str:
        """Generate a prompt overlay string for a given stage.

        Combines two sources:

        1. Current-run lessons from ``lessons.jsonl`` (intra-run learning).
        2. Cross-run MetaClaw ``arc-*`` skills from *skills_dir* (inter-run
           learning via the MetaClaw skill-generation feedback loop).

        Returns empty string if no relevant lessons or skills exist.
        """
        parts: list[str] = []

        # --- Section 1: intra-run lessons ---
        lessons = self.query_for_stage(stage_name, max_lessons=max_lessons)
        if lessons:
            parts.append("## Lessons from Prior Runs")
            for i, lesson in enumerate(lessons, 1):
                # Unknown severities fall back to a plain bullet.
                severity_icon = {"error": "❌", "warning": "⚠️", "info": "ℹ️"}.get(
                    lesson.severity, "•"
                )
                parts.append(
                    f"{i}. {severity_icon} [{lesson.category}] {lesson.description}"
                )
            parts.append(
                "\nUse these lessons to avoid repeating past mistakes."
            )

        # --- Section 2: cross-run MetaClaw arc-* skills ---
        if skills_dir:
            from pathlib import Path as _Path

            sd = _Path(skills_dir).expanduser()
            if sd.is_dir():
                arc_skills: list[str] = []
                # Deterministic order: sorted directory listing.
                for skill_dir in sorted(sd.iterdir()):
                    if skill_dir.is_dir() and skill_dir.name.startswith("arc-"):
                        skill_file = skill_dir / "SKILL.md"
                        if skill_file.is_file():
                            try:
                                text = skill_file.read_text(encoding="utf-8").strip()
                                if text:
                                    arc_skills.append(text)
                            except OSError:
                                continue
                if arc_skills:
                    parts.append("\n## Learned Skills from Prior Runs")
                    # At most five skills are inlined into the overlay.
                    for skill_text in arc_skills[:5]:
                        parts.append(skill_text)
                    parts.append(
                        "\nApply these skills proactively to improve quality."
                    )

        return "\n".join(parts)

    def count(self) -> int:
        """Return total number of stored lessons."""
        return len(self.load_all())
""" add_fn = getattr(memory_store, "add", None) if add_fn is None or not callable(add_fn): logger.warning("export_to_memory: memory_store has no add() method") return 0 lessons = self.load_all() exported = 0 for lesson in lessons: weight = _time_weight(lesson.timestamp) if weight <= 0.0: continue try: # Map lesson categories to valid MemoryStore categories _CAT_MAP = { "system": "experiment", "analysis": "experiment", "literature": "ideation", "pipeline": "experiment", "experiment": "experiment", "writing": "writing", "ideation": "ideation", } _mem_cat = _CAT_MAP.get(lesson.category, "experiment") add_fn( content=lesson.description, category=_mem_cat, metadata={ "source": "evolution", "stage": lesson.stage_name, "severity": lesson.severity, "run_id": lesson.run_id, "timestamp": lesson.timestamp, }, ) exported += 1 except Exception: logger.debug("Failed to export lesson: %s", lesson.description[:80]) return exported def get_lessons_for_stage_with_memory( self, stage_name: str, memory_store: object, *, max_lessons: int = 5, ) -> str: """Combine evolution overlay with memory context for a stage. *memory_store* must expose a ``recall(query, category, max_results)`` method returning objects with a ``.content`` attribute. """ overlay = self.build_overlay(stage_name, max_lessons=max_lessons) recall_fn = getattr(memory_store, "recall", None) if recall_fn is None or not callable(recall_fn): return overlay try: memories = recall_fn( query=stage_name, category=None, max_results=max_lessons, ) if memories: parts = ["\n## Recalled Memories"] for i, mem in enumerate(memories, 1): content = getattr(mem, "content", str(mem)) parts.append(f"{i}. 
{content}") memory_text = "\n".join(parts) return f"{overlay}\n{memory_text}" if overlay else memory_text except Exception: logger.debug("Failed to recall memories for stage %s", stage_name) return overlay ================================================ FILE: researchclaw/experiment/__init__.py ================================================ """Experiment execution — sandbox, runner, git manager.""" from researchclaw.experiment.factory import create_sandbox from researchclaw.experiment.sandbox import ( ExperimentSandbox, SandboxProtocol, SandboxResult, parse_metrics, ) __all__ = [ "ExperimentSandbox", "SandboxProtocol", "SandboxResult", "create_sandbox", "parse_metrics", ] ================================================ FILE: researchclaw/experiment/agentic_sandbox.py ================================================ """Agentic sandbox: launches a coding agent inside a Docker container. The agent (e.g. Claude Code, Codex) gets full shell access and can run arbitrary CLI commands, read/write files, and iteratively complete the experiment. This replaces the traditional code-generation + sandbox-execution pipeline with a single agentic session. 
""" from __future__ import annotations import json import logging import os import subprocess import threading import time from dataclasses import dataclass, field from pathlib import Path from researchclaw.config import AgenticConfig from researchclaw.experiment.sandbox import SandboxResult, parse_metrics logger = logging.getLogger(__name__) _CONTAINER_COUNTER = 0 _counter_lock = threading.Lock() def _next_container_name() -> str: global _CONTAINER_COUNTER # noqa: PLW0603 with _counter_lock: _CONTAINER_COUNTER += 1 return f"rc-agentic-{_CONTAINER_COUNTER}-{os.getpid()}" @dataclass class AgenticResult: """Result of an agentic experiment session.""" returncode: int stdout: str stderr: str elapsed_sec: float output_files: list[str] = field(default_factory=list) output_dirs: list[str] = field(default_factory=list) metrics: dict[str, float] = field(default_factory=dict) agent_log: str = "" steps_completed: int = 0 class AgenticSandbox: """Run a coding agent inside a Docker container with full shell access.""" def __init__( self, config: AgenticConfig, workdir: Path, skills_dir: Path | None = None, ) -> None: self.config = config self.workdir = workdir.resolve() self.workdir.mkdir(parents=True, exist_ok=True) self.skills_dir = skills_dir self._container_name: str | None = None # -- public API ---------------------------------------------------------- def run_agent_session( self, prompt: str, workspace: Path, *, timeout_sec: int | None = None, ) -> AgenticResult: """Launch the agent inside Docker, send *prompt*, and collect results. 1. ``docker run -d`` a long-lived container 2. Install agent CLI (if ``agent_install_cmd`` is set) 3. ``docker exec`` the agent with *prompt* 4. Collect output files from ``/workspace`` 5. 
    def run_agent_session(
        self,
        prompt: str,
        workspace: Path,
        *,
        timeout_sec: int | None = None,
    ) -> AgenticResult:
        """Launch the agent inside Docker, send *prompt*, and collect results.

        1. ``docker run -d`` a long-lived container
        2. Install agent CLI (if ``agent_install_cmd`` is set)
        3. ``docker exec`` the agent with *prompt*
        4. Collect output files from ``/workspace``
        5. Stop + remove the container

        On timeout, partial results are still collected and an
        ``AgenticResult`` with returncode -1 and a sentinel "timed out"
        stderr message is returned (this sentinel is what
        ``to_sandbox_result`` keys its ``timed_out`` flag on).  Other
        exceptions also yield returncode -1.  The container is always
        cleaned up in the ``finally`` block.
        """
        timeout = timeout_sec or self.config.timeout_sec
        container = _next_container_name()
        self._container_name = container
        workspace = workspace.resolve()
        workspace.mkdir(parents=True, exist_ok=True)
        start = time.monotonic()
        try:
            # 1. Start the container
            self._start_container(container, workspace)
            # 2. Install agent CLI
            if self.config.agent_install_cmd:
                # Install step is capped at 5 minutes (or the session
                # timeout, whichever is smaller).
                self._docker_exec(
                    container,
                    self.config.agent_install_cmd,
                    timeout=min(300, timeout),
                )
            # 3. Run the agent
            agent_cmd = self._build_agent_command(prompt)
            proc = self._docker_exec(
                container,
                agent_cmd,
                timeout=timeout,
            )
            stdout = proc.stdout or ""
            stderr = proc.stderr or ""
            returncode = proc.returncode
            # 4. Collect results
            output_files, output_dirs = self._collect_outputs(workspace)
            metrics = self._parse_result_metrics(workspace, stdout)
            agent_log = stdout
            steps = self._count_agent_steps(stdout)
            elapsed = time.monotonic() - start
            return AgenticResult(
                returncode=returncode,
                stdout=stdout,
                stderr=stderr,
                elapsed_sec=elapsed,
                output_files=output_files,
                output_dirs=output_dirs,
                metrics=metrics,
                agent_log=agent_log,
                steps_completed=steps,
            )
        except subprocess.TimeoutExpired:
            elapsed = time.monotonic() - start
            logger.warning(
                "Agentic session timed out after %ds (container %s)",
                timeout,
                container,
            )
            # Still try to collect partial results
            output_files, output_dirs = self._collect_outputs(workspace)
            metrics = self._parse_result_metrics(workspace, "")
            return AgenticResult(
                returncode=-1,
                stdout="",
                stderr=f"Agent session timed out after {timeout}s",
                elapsed_sec=elapsed,
                output_files=output_files,
                output_dirs=output_dirs,
                metrics=metrics,
                agent_log="",
                steps_completed=0,
            )
        except Exception as exc:
            elapsed = time.monotonic() - start
            logger.exception("Agentic session failed: %s", exc)
            return AgenticResult(
                returncode=-1,
                stdout="",
                stderr=str(exc),
                elapsed_sec=elapsed,
            )
        finally:
            # Always tear the container down, even on success.
            self._cleanup_container(container)
    def to_sandbox_result(self, result: AgenticResult) -> SandboxResult:
        """Convert an AgenticResult to a SandboxResult for pipeline compat."""
        return SandboxResult(
            returncode=result.returncode,
            stdout=result.stdout,
            stderr=result.stderr,
            elapsed_sec=result.elapsed_sec,
            # Shallow copy so the SandboxResult does not alias result.metrics.
            metrics={k: v for k, v in result.metrics.items()},
            # Timeout is signalled by the sentinel stderr text written in
            # run_agent_session's TimeoutExpired handler.
            timed_out=(result.returncode == -1 and "timed out" in result.stderr),
        )

    # -- Docker helpers ------------------------------------------------------

    def _start_container(self, container: str, workspace: Path) -> None:
        """Start a long-lived Docker container with workspace mounted.

        The container idles on ``tail -f /dev/null`` so later commands can
        be sent via ``docker exec``.  Raises CalledProcessError when
        ``docker run`` fails (check=True).
        """
        cmd = [
            "docker",
            "run",
            "-d",
            "--name",
            container,
            "-v",
            f"{workspace}:/workspace",
            "-w",
            "/workspace",
            f"--memory={self.config.memory_limit_mb}m",
        ]
        # Mount skills directory as read-only reference
        if self.config.mount_skills and self.skills_dir and self.skills_dir.is_dir():
            cmd.extend(["-v", f"{self.skills_dir}:/skills:ro"])
        # Network
        if self.config.network_policy == "none":
            cmd.extend(["--network", "none"])
        # GPU passthrough
        if self.config.gpu_enabled:
            cmd.extend(["--gpus", "all"])
        cmd.extend([self.config.image, "tail", "-f", "/dev/null"])
        logger.info("Starting agentic container: %s", container)
        subprocess.run(cmd, check=True, capture_output=True, text=True)

    def _docker_exec(
        self,
        container: str,
        command: str,
        *,
        timeout: int = 300,
    ) -> subprocess.CompletedProcess[str]:
        """Run *command* inside the container via ``bash -c``.

        Does not raise on non-zero exit (check=False); may raise
        TimeoutExpired, which callers handle.
        """
        cmd = ["docker", "exec", container, "bash", "-c", command]
        return subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
            check=False,
        )
{escaped}" else: # Generic: pass prompt via -p flag return f"{cli} -p {escaped}" def _cleanup_container(self, container: str) -> None: """Stop and remove the container.""" try: subprocess.run( ["docker", "stop", "-t", "5", container], capture_output=True, timeout=30, check=False, ) subprocess.run( ["docker", "rm", "-f", container], capture_output=True, timeout=30, check=False, ) logger.debug("Cleaned up container %s", container) except Exception: # noqa: BLE001 logger.warning("Failed to cleanup container %s", container) # -- Result collection --------------------------------------------------- @staticmethod def _collect_outputs(workspace: Path) -> tuple[list[str], list[str]]: """Walk workspace and return lists of output files and directories.""" output_files: list[str] = [] output_dirs: list[str] = [] if not workspace.exists(): return output_files, output_dirs for item in sorted(workspace.rglob("*")): rel = str(item.relative_to(workspace)) if item.is_dir(): output_dirs.append(rel) elif item.is_file(): output_files.append(rel) return output_files, output_dirs @staticmethod def _parse_result_metrics( workspace: Path, stdout: str ) -> dict[str, float]: """Parse metrics from results.json (preferred) or stdout.""" metrics: dict[str, float] = {} # Try results.json first results_json = workspace / "results.json" if results_json.exists(): try: data = json.loads(results_json.read_text(encoding="utf-8")) if isinstance(data, dict): # Flatten metrics from various common formats raw = data.get("metrics", data) for k, v in raw.items(): try: metrics[k] = float(v) except (TypeError, ValueError): pass except (json.JSONDecodeError, OSError): pass # Fall back to stdout metric parsing if not metrics and stdout: metrics = parse_metrics(stdout) return metrics @staticmethod def _count_agent_steps(stdout: str) -> int: """Estimate the number of agent turns from the output.""" # For JSON-format Claude output, count tool-use entries try: data = json.loads(stdout) if isinstance(data, list): 
return len(data) if isinstance(data, dict): # Claude Code JSON output has a "messages" or similar key messages = data.get("messages", data.get("turns", [])) if isinstance(messages, list): return len(messages) except (json.JSONDecodeError, TypeError): pass # Fallback: count lines that look like agent actions count = 0 for line in stdout.splitlines(): stripped = line.strip() if stripped.startswith(("$", ">>>", ">>", "claude>", "Agent:")): count += 1 return count # -- Static checks ------------------------------------------------------- @staticmethod def check_docker_available() -> bool: """Return True if Docker daemon is reachable.""" try: result = subprocess.run( ["docker", "info"], capture_output=True, timeout=10, check=False, ) return result.returncode == 0 except (FileNotFoundError, subprocess.TimeoutExpired): return False ================================================ FILE: researchclaw/experiment/code_agent.py ================================================ """Pluggable code-generation backends for experiment Stages 10 & 13. Supports three providers: - ``llm`` — existing LLM chat API (backward-compatible default) - ``claude_code`` — Claude Code CLI (``claude -p``) - ``codex`` — OpenAI Codex CLI (``codex exec``) Usage:: from researchclaw.experiment.code_agent import create_code_agent agent = create_code_agent(config, llm=llm_client, prompts=pm) result = agent.generate(exp_plan=plan, topic=topic, ...) 
if result.ok: files = result.files # dict[str, str] """ from __future__ import annotations import logging import os import shutil import signal import subprocess import time from dataclasses import dataclass from pathlib import Path from typing import Any, Protocol from researchclaw.config import RCConfig logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Result dataclass # --------------------------------------------------------------------------- @dataclass(frozen=True) class CodeAgentResult: """Output from a code agent invocation.""" files: dict[str, str] # filename -> code content provider_name: str # "llm", "claude_code", "codex" elapsed_sec: float raw_output: str = "" error: str | None = None @property def ok(self) -> bool: return self.error is None and bool(self.files) # --------------------------------------------------------------------------- # Protocol # --------------------------------------------------------------------------- class CodeAgentProvider(Protocol): """Protocol for code generation backends.""" @property def name(self) -> str: ... def generate( self, *, exp_plan: str, topic: str, metric_key: str, pkg_hint: str, compute_budget: str, extra_guidance: str, workdir: Path, timeout_sec: int = 600, ) -> CodeAgentResult: """Generate experiment code from scratch (Stage 10).""" ... def refine( self, *, current_files: dict[str, str], run_summaries: list[str], metric_key: str, metric_direction: str, topic: str, extra_hints: str, workdir: Path, timeout_sec: int = 600, ) -> CodeAgentResult: """Refine existing experiment code based on run results (Stage 13).""" ... def repair( self, *, files: dict[str, str], issues: str, workdir: Path, timeout_sec: int = 300, ) -> CodeAgentResult: """Fix validation or runtime issues in code.""" ... 
# --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _to_text(value: str | bytes | None) -> str: if value is None: return "" if isinstance(value, bytes): return value.decode("utf-8", errors="replace") return value def _collect_py_files(workdir: Path) -> dict[str, str]: """Read all .py files from a directory (flat, no subdirs).""" files: dict[str, str] = {} for pyfile in sorted(workdir.glob("*.py")): if pyfile.name.startswith("_codex_") or pyfile.name.startswith("_agent_"): continue files[pyfile.name] = pyfile.read_text(encoding="utf-8") return files def _seed_workdir(workdir: Path, files: dict[str, str]) -> None: """Pre-populate workdir with files for refinement/repair.""" workdir.mkdir(parents=True, exist_ok=True) for fname, content in files.items(): (workdir / fname).write_text(content, encoding="utf-8") def format_feedback_for_agent( sandbox_result: Any, metric_key: str, metric_direction: str, best_metric: float | None, ) -> str: """Format sandbox run results as structured feedback for CLI agents.""" parts = ["## Previous Run Results"] parts.append(f"Return code: {sandbox_result.returncode}") parts.append(f"Elapsed: {sandbox_result.elapsed_sec:.1f}s") parts.append(f"Timed out: {sandbox_result.timed_out}") if sandbox_result.metrics: parts.append("Metrics:") for k, v in sandbox_result.metrics.items(): parts.append(f" {k}: {v}") if sandbox_result.stderr: parts.append(f"Stderr (last 1000 chars):\n{sandbox_result.stderr[-1000:]}") parts.append(f"\nTarget: {metric_direction} '{metric_key}'") if best_metric is not None: parts.append(f"Best so far: {best_metric}") return "\n".join(parts) # --------------------------------------------------------------------------- # LlmCodeAgent — wraps existing LLM chat API (backward-compatible) # --------------------------------------------------------------------------- class LlmCodeAgent: """Code agent backed by the 
class LlmCodeAgent:
    """Code agent backed by the existing OpenAI-compatible LLM chat API.

    This implementation extracts the LLM call + response parsing logic that
    was previously inline in ``_execute_code_generation`` and
    ``_execute_iterative_refine``, preserving exact behavior.
    """

    def __init__(
        self,
        llm: Any,
        prompts: Any,
        config: RCConfig,
    ) -> None:
        self._llm = llm  # OpenAI-compatible chat client
        self._pm = prompts  # PromptManager-like prompt source
        self._config = config

    @property
    def name(self) -> str:
        return "llm"

    def generate(
        self,
        *,
        exp_plan: str,
        topic: str,
        metric_key: str,
        pkg_hint: str,
        compute_budget: str,
        extra_guidance: str,
        workdir: Path,
        timeout_sec: int = 600,
    ) -> CodeAgentResult:
        """Generate experiment files via one (or, on empty output, two) chat calls.

        Returns a CodeAgentResult with the extracted filename->code map;
        errors are captured in ``result.error`` rather than raised.
        """
        # Local import avoids a circular dependency with the executor module.
        from researchclaw.pipeline.executor import (
            _chat_with_prompt,
            _extract_multi_file_blocks,
        )

        start = time.monotonic()
        sp = self._pm.for_stage(
            "code_generation",
            topic=topic,
            metric=metric_key,
            pkg_hint=pkg_hint + "\n" + compute_budget + "\n" + extra_guidance,
            exp_plan=exp_plan,
        )
        # Higher max_tokens for reasoning models
        _code_max_tokens = sp.max_tokens or 8192
        if any(
            self._config.llm.primary_model.startswith(p)
            for p in ("gpt-5", "o3", "o4")
        ):
            _code_max_tokens = max(_code_max_tokens, 16384)
        try:
            resp = _chat_with_prompt(
                self._llm,
                sp.system,
                sp.user,
                json_mode=sp.json_mode,
                max_tokens=_code_max_tokens,
            )
            files = _extract_multi_file_blocks(resp.content)
            # Retry on empty response with higher token limit
            if not files and not resp.content.strip():
                logger.warning(
                    "LlmCodeAgent: empty response (len=%d, finish=%s). "
                    "Retrying with 32768 tokens.",
                    len(resp.content),
                    resp.finish_reason,
                )
                resp = _chat_with_prompt(
                    self._llm,
                    sp.system,
                    sp.user,
                    json_mode=sp.json_mode,
                    max_tokens=32768,
                )
                files = _extract_multi_file_blocks(resp.content)
            elapsed = time.monotonic() - start
            if not files:
                logger.warning(
                    "LlmCodeAgent: no files extracted (resp len=%d)",
                    len(resp.content),
                )
            return CodeAgentResult(
                files=files,
                provider_name="llm",
                elapsed_sec=elapsed,
                raw_output=resp.content[:2000],
            )
        except Exception as exc:
            elapsed = time.monotonic() - start
            logger.error("LlmCodeAgent.generate failed: %s", exc)
            return CodeAgentResult(
                files={},
                provider_name="llm",
                elapsed_sec=elapsed,
                error=str(exc),
            )

    def refine(
        self,
        *,
        current_files: dict[str, str],
        run_summaries: list[str],
        metric_key: str,
        metric_direction: str,
        topic: str,
        extra_hints: str,
        workdir: Path,
        timeout_sec: int = 600,
    ) -> CodeAgentResult:
        """Ask the LLM to improve *current_files* given prior run summaries.

        Falls back to a single ``main.py`` when the response contains only
        one unlabeled code block.
        """
        from researchclaw.pipeline.executor import (
            _chat_with_prompt,
            _extract_code_block,
            _extract_multi_file_blocks,
        )

        start = time.monotonic()

        def _files_to_context(project_files: dict[str, str]) -> str:
            # Render files as fenced blocks labelled "filename:<name>".
            parts = []
            for fname, code in sorted(project_files.items()):
                parts.append(f"```filename:{fname}\n{code}\n```")
            return "\n\n".join(parts)

        try:
            ip = self._pm.sub_prompt(
                "iterative_improve",
                metric_key=metric_key,
                metric_direction=metric_direction,
                files_context=_files_to_context(current_files),
                # At most the 20 most recent summaries, newline-joined.
                run_summaries=chr(10).join(run_summaries[:20]),
                condition_coverage_hint="",
                topic=topic,
            )
            user_prompt = ip.user + extra_hints
            response = _chat_with_prompt(
                self._llm,
                ip.system,
                user_prompt,
                max_tokens=ip.max_tokens or 8192,
            )
            extracted_files = _extract_multi_file_blocks(response.content)
            if not extracted_files:
                single_code = _extract_code_block(response.content)
                if single_code.strip():
                    extracted_files = {"main.py": single_code}
            elapsed = time.monotonic() - start
            return CodeAgentResult(
                files=extracted_files,
                provider_name="llm",
                elapsed_sec=elapsed,
                raw_output=response.content[:2000],
            )
        except Exception as exc:
            elapsed = time.monotonic() - start
            logger.error("LlmCodeAgent.refine failed: %s", exc)
            return CodeAgentResult(
                files={},
                provider_name="llm",
                elapsed_sec=elapsed,
                error=str(exc),
            )
as exc: elapsed = time.monotonic() - start logger.error("LlmCodeAgent.refine failed: %s", exc) return CodeAgentResult( files={}, provider_name="llm", elapsed_sec=elapsed, error=str(exc), ) def repair( self, *, files: dict[str, str], issues: str, workdir: Path, timeout_sec: int = 300, ) -> CodeAgentResult: from researchclaw.pipeline.executor import ( _chat_with_prompt, _extract_code_block, _extract_multi_file_blocks, ) start = time.monotonic() all_files_ctx = "\n\n".join( f"```filename:{f}\n{c}\n```" for f, c in files.items() ) try: rp = self._pm.sub_prompt( "code_repair", fname="main.py", issues_text=issues, all_files_ctx=all_files_ctx, ) resp = _chat_with_prompt(self._llm, rp.system, rp.user) # Try multi-file extraction first, then single-block repaired = _extract_multi_file_blocks(resp.content) if not repaired: code = _extract_code_block(resp.content) if code.strip(): repaired = {"main.py": code} elapsed = time.monotonic() - start return CodeAgentResult( files=repaired, provider_name="llm", elapsed_sec=elapsed, raw_output=resp.content[:2000], ) except Exception as exc: elapsed = time.monotonic() - start return CodeAgentResult( files={}, provider_name="llm", elapsed_sec=elapsed, error=str(exc), ) # --------------------------------------------------------------------------- # CLI agent base — shared subprocess logic for Claude Code / Codex # --------------------------------------------------------------------------- class _CliAgentBase: """Shared infrastructure for CLI-based coding agents.""" _provider_name: str = "" def __init__( self, binary_path: str, model: str = "", max_budget_usd: float = 5.0, timeout_sec: int = 600, extra_args: list[str] | None = None, ) -> None: self._binary = binary_path self._model = model self._max_budget_usd = max_budget_usd self._default_timeout = timeout_sec self._extra_args = extra_args or [] @property def name(self) -> str: return self._provider_name def _run_subprocess( self, cmd: list[str], workdir: Path, timeout_sec: int, ) -> 
tuple[int, str, str, float, bool]: """Run command as subprocess with process-group cleanup on timeout. Returns (returncode, stdout, stderr, elapsed_sec, timed_out). """ workdir.mkdir(parents=True, exist_ok=True) start = time.monotonic() timed_out = False proc = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=workdir, env={**os.environ}, start_new_session=True, ) try: stdout_bytes, stderr_bytes = proc.communicate(timeout=timeout_sec) except subprocess.TimeoutExpired: timed_out = True # Kill entire process group try: os.killpg(os.getpgid(proc.pid), signal.SIGTERM) except OSError: pass try: stdout_bytes, stderr_bytes = proc.communicate(timeout=5) except subprocess.TimeoutExpired: try: os.killpg(os.getpgid(proc.pid), signal.SIGKILL) except OSError: pass stdout_bytes, stderr_bytes = proc.communicate(timeout=5) elapsed = time.monotonic() - start return ( proc.returncode or -1, _to_text(stdout_bytes), _to_text(stderr_bytes), elapsed, timed_out, ) def _build_result( self, workdir: Path, returncode: int, stdout: str, stderr: str, elapsed: float, timed_out: bool, ) -> CodeAgentResult: """Collect .py files from workdir and build result.""" files = _collect_py_files(workdir) error = None if timed_out: error = f"Timed out after {elapsed:.0f}s" elif returncode != 0 and not files: error = f"Exited {returncode}: {stderr[:500]}" return CodeAgentResult( files=files, provider_name=self._provider_name, elapsed_sec=elapsed, raw_output=stdout[:3000], error=error, ) @staticmethod def _generate_prompt( topic: str, exp_plan: str, metric_key: str, pkg_hint: str, compute_budget: str, extra_guidance: str, ) -> str: return ( "You are generating experiment code for a research paper.\n\n" f"TOPIC: {topic}\n\n" f"EXPERIMENT PLAN:\n{exp_plan}\n\n" f"PRIMARY METRIC: {metric_key}\n" f"{pkg_hint}\n{compute_budget}\n{extra_guidance}\n\n" "INSTRUCTIONS:\n" "1. Create a multi-file Python project in the current directory.\n" "2. The entry point MUST be main.py.\n" "3. 
    @staticmethod
    def _generate_prompt(
        topic: str,
        exp_plan: str,
        metric_key: str,
        pkg_hint: str,
        compute_budget: str,
        extra_guidance: str,
    ) -> str:
        """Build the from-scratch code-generation prompt (Stage 10)."""
        return (
            "You are generating experiment code for a research paper.\n\n"
            f"TOPIC: {topic}\n\n"
            f"EXPERIMENT PLAN:\n{exp_plan}\n\n"
            f"PRIMARY METRIC: {metric_key}\n"
            f"{pkg_hint}\n{compute_budget}\n{extra_guidance}\n\n"
            "INSTRUCTIONS:\n"
            "1. Create a multi-file Python project in the current directory.\n"
            "2. The entry point MUST be main.py.\n"
            "3. main.py must print metrics as 'name: value' lines to stdout.\n"
            f"4. Use condition labels: 'condition= {metric_key}: '\n"
            "5. FORBIDDEN: subprocess, os.system, eval, exec, shutil, socket, "
            "network calls, external data files.\n"
            "6. Use deterministic seeds (numpy.random.seed or random.seed).\n"
            "7. Write ALL files to the current working directory.\n"
            "8. Do NOT create subdirectories.\n"
        )

    @staticmethod
    def _refine_prompt(
        current_files: dict[str, str],
        run_summaries: list[str],
        metric_key: str,
        metric_direction: str,
        topic: str,
        extra_hints: str,
    ) -> str:
        """Build the improvement prompt (Stage 13), listing existing files.

        At most the first 10 run summaries are included.
        """
        files_listing = "\n".join(
            f" - {fname} ({len(code)} chars)" for fname, code in current_files.items()
        )
        summaries_text = "\n".join(run_summaries[:10]) if run_summaries else "(no prior runs)"
        return (
            "You are improving experiment code for a research paper.\n\n"
            f"TOPIC: {topic}\n"
            f"TARGET: {metric_direction} '{metric_key}'\n\n"
            f"EXISTING FILES in current directory:\n{files_listing}\n\n"
            "Read the existing files, then improve them based on these run results:\n\n"
            f"PRIOR RUN SUMMARIES:\n{summaries_text}\n\n"
            f"{extra_hints}\n\n"
            "INSTRUCTIONS:\n"
            "1. Read existing code, understand the experiment structure.\n"
            "2. Modify files to improve the metric.\n"
            "3. Keep the entry point as main.py.\n"
            "4. Write modified files to the current directory.\n"
            "5. FORBIDDEN: subprocess, os.system, eval, exec, shutil, socket.\n"
        )

    @staticmethod
    def _repair_prompt(
        files: dict[str, str],
        issues: str,
    ) -> str:
        """Build the fix-these-issues prompt used by ``repair``."""
        files_listing = "\n".join(
            f" - {fname} ({len(code)} chars)" for fname, code in files.items()
        )
        return (
            "The experiment code has validation or runtime issues.\n\n"
            f"ISSUES:\n{issues}\n\n"
            f"FILES in current directory:\n{files_listing}\n\n"
            "INSTRUCTIONS:\n"
            "1. Read the existing files in the current directory.\n"
            "2. Fix ALL reported issues.\n"
            "3. Write the corrected files back.\n"
            "4. FORBIDDEN: subprocess, os.system, eval, exec, shutil, socket.\n"
        )
# ---------------------------------------------------------------------------
# ClaudeCodeAgent
# ---------------------------------------------------------------------------


class ClaudeCodeAgent(_CliAgentBase):
    """Code agent backed by Claude Code CLI (``claude -p``)."""

    _provider_name = "claude_code"

    def _build_cmd(self, prompt: str, workdir: Path) -> list[str]:
        """Assemble the ``claude -p`` argv for *prompt* against *workdir*."""
        cmd = [
            self._binary,
            "-p",
            prompt,
            "--dangerously-skip-permissions",
            "--output-format",
            "text",
            "--allowed-tools",
            "Bash Edit Write Read",
            "--add-dir",
            str(workdir),
        ]
        if self._model:
            cmd += ["--model", self._model]
        if self._max_budget_usd:
            cmd += ["--max-budget-usd", str(self._max_budget_usd)]
        cmd.extend(self._extra_args)
        return cmd

    def generate(
        self,
        *,
        exp_plan: str,
        topic: str,
        metric_key: str,
        pkg_hint: str,
        compute_budget: str,
        extra_guidance: str,
        workdir: Path,
        timeout_sec: int = 600,
    ) -> CodeAgentResult:
        """Run the CLI with the generation prompt; collect .py files from workdir."""
        prompt = self._generate_prompt(
            topic, exp_plan, metric_key, pkg_hint, compute_budget, extra_guidance,
        )
        cmd = self._build_cmd(prompt, workdir)
        rc, stdout, stderr, elapsed, to = self._run_subprocess(
            cmd, workdir, timeout_sec or self._default_timeout,
        )
        return self._build_result(workdir, rc, stdout, stderr, elapsed, to)

    def refine(
        self,
        *,
        current_files: dict[str, str],
        run_summaries: list[str],
        metric_key: str,
        metric_direction: str,
        topic: str,
        extra_hints: str,
        workdir: Path,
        timeout_sec: int = 600,
    ) -> CodeAgentResult:
        """Seed workdir with current files, then run the CLI with the refine prompt."""
        _seed_workdir(workdir, current_files)
        prompt = self._refine_prompt(
            current_files, run_summaries, metric_key, metric_direction,
            topic, extra_hints,
        )
        cmd = self._build_cmd(prompt, workdir)
        rc, stdout, stderr, elapsed, to = self._run_subprocess(
            cmd, workdir, timeout_sec or self._default_timeout,
        )
        return self._build_result(workdir, rc, stdout, stderr, elapsed, to)

    def repair(
        self,
        *,
        files: dict[str, str],
        issues: str,
        workdir: Path,
        timeout_sec: int = 300,
    ) -> CodeAgentResult:
        """Seed workdir with the broken files, then run the CLI with the repair prompt."""
        _seed_workdir(workdir, files)
        prompt = self._repair_prompt(files, issues)
        cmd = self._build_cmd(prompt, workdir)
        rc, stdout, stderr, elapsed, to = self._run_subprocess(
            cmd, workdir, timeout_sec or self._default_timeout,
        )
        return self._build_result(workdir, rc, stdout, stderr, elapsed, to)


# ---------------------------------------------------------------------------
# CodexAgent
# ---------------------------------------------------------------------------


class CodexAgent(_CliAgentBase):
    """Code agent backed by OpenAI Codex CLI (``codex exec``)."""

    _provider_name = "codex"

    def _build_cmd(self, prompt: str, workdir: Path) -> list[str]:
        """Assemble the ``codex exec`` argv for *prompt* against *workdir*."""
        cmd = [
            self._binary,
            "exec",
            prompt,
            "--sandbox",
            "workspace-write",
            "--json",
            "-C",
            str(workdir),
        ]
        if self._model:
            cmd += ["-m", self._model]
        cmd.extend(self._extra_args)
        return cmd

    def generate(
        self,
        *,
        exp_plan: str,
        topic: str,
        metric_key: str,
        pkg_hint: str,
        compute_budget: str,
        extra_guidance: str,
        workdir: Path,
        timeout_sec: int = 600,
    ) -> CodeAgentResult:
        """Run the CLI with the generation prompt; collect .py files from workdir."""
        prompt = self._generate_prompt(
            topic, exp_plan, metric_key, pkg_hint, compute_budget, extra_guidance,
        )
        cmd = self._build_cmd(prompt, workdir)
        rc, stdout, stderr, elapsed, to = self._run_subprocess(
            cmd, workdir, timeout_sec or self._default_timeout,
        )
        return self._build_result(workdir, rc, stdout, stderr, elapsed, to)

    def refine(
        self,
        *,
        current_files: dict[str, str],
        run_summaries: list[str],
        metric_key: str,
        metric_direction: str,
        topic: str,
        extra_hints: str,
        workdir: Path,
        timeout_sec: int = 600,
    ) -> CodeAgentResult:
        """Seed workdir with current files, then run the CLI with the refine prompt."""
        _seed_workdir(workdir, current_files)
        prompt = self._refine_prompt(
            current_files, run_summaries, metric_key, metric_direction,
            topic, extra_hints,
        )
        cmd = self._build_cmd(prompt, workdir)
        rc, stdout, stderr, elapsed, to = self._run_subprocess(
            cmd, workdir, timeout_sec or self._default_timeout,
        )
        return self._build_result(workdir, rc, stdout, stderr, elapsed, to)

    def repair(
        self,
        *,
        files: dict[str, str],
        issues: str,
        workdir: Path,
        timeout_sec: int = 300,
    ) -> CodeAgentResult:
        """Seed workdir with the broken files, then run the CLI with the repair prompt."""
        _seed_workdir(workdir, files)
        prompt = self._repair_prompt(files, issues)
        cmd = self._build_cmd(prompt, workdir)
        rc, stdout, stderr, elapsed, to = self._run_subprocess(
            cmd, workdir, timeout_sec or self._default_timeout,
        )
        return self._build_result(workdir, rc, stdout, stderr, elapsed, to)
CodeAgentResult: _seed_workdir(workdir, files) prompt = self._repair_prompt(files, issues) cmd = self._build_cmd(prompt, workdir) rc, stdout, stderr, elapsed, to = self._run_subprocess( cmd, workdir, timeout_sec or self._default_timeout, ) return self._build_result(workdir, rc, stdout, stderr, elapsed, to) # --------------------------------------------------------------------------- # Factory # --------------------------------------------------------------------------- def create_code_agent( config: RCConfig, llm: Any | None = None, prompts: Any | None = None, ) -> CodeAgentProvider: """Create the appropriate code agent based on config.experiment.cli_agent.""" agent_cfg = config.experiment.cli_agent provider = agent_cfg.provider if provider == "llm": if llm is None: raise RuntimeError("LLM code agent requires an LLM client") from researchclaw.prompts import PromptManager return LlmCodeAgent(llm, prompts or PromptManager(), config) # type: ignore[return-value] if provider == "claude_code": binary = agent_cfg.binary_path or shutil.which("claude") if not binary: raise RuntimeError( "Claude Code binary not found. " "Install it or set experiment.code_agent.binary_path." ) return ClaudeCodeAgent( # type: ignore[return-value] binary_path=binary, model=agent_cfg.model or "sonnet", max_budget_usd=agent_cfg.max_budget_usd, timeout_sec=agent_cfg.timeout_sec, extra_args=list(agent_cfg.extra_args), ) if provider == "codex": binary = agent_cfg.binary_path or shutil.which("codex") if not binary: raise RuntimeError( "Codex binary not found. " "Install it or set experiment.code_agent.binary_path." 
) return CodexAgent( # type: ignore[return-value] binary_path=binary, model=agent_cfg.model or "", max_budget_usd=agent_cfg.max_budget_usd, timeout_sec=agent_cfg.timeout_sec, extra_args=list(agent_cfg.extra_args), ) raise ValueError(f"Unknown code agent provider: {provider}") ================================================ FILE: researchclaw/experiment/colab_sandbox.py ================================================ """Google Drive-based async sandbox for Colab experiment execution. Execution model: 1. Write experiment code to a shared Google Drive folder (pending/) 2. A Colab notebook polls pending/, runs each script, writes results to done/ 3. This sandbox polls done/ until results appear or timeout 4. Parse metrics from the result file and return This approach is more robust than direct SSH to Colab because: - No SSH tunnel to maintain - Colab session timeouts only kill the current experiment, not the pipeline - Google Drive sync handles reconnects transparently Requirements: - Google Drive for Desktop installed and syncing (or any Drive mount) - A Colab notebook running the worker loop (template provided below) """ from __future__ import annotations import json import logging import shutil import time import uuid from pathlib import Path from researchclaw.config import ColabDriveConfig from researchclaw.experiment.sandbox import SandboxResult, parse_metrics logger = logging.getLogger(__name__) # Template for the Colab worker notebook COLAB_WORKER_TEMPLATE = '''\ # === ResearchClaw Colab Worker === # Run this cell in Google Colab with GPU enabled. # It polls Google Drive for experiment tasks and executes them. 
import os, json, time, subprocess, traceback from pathlib import Path from google.colab import drive drive.mount("/content/drive") DRIVE_ROOT = Path("/content/drive/MyDrive/researchclaw") PENDING = DRIVE_ROOT / "pending" RUNNING = DRIVE_ROOT / "running" DONE = DRIVE_ROOT / "done" for d in [PENDING, RUNNING, DONE]: d.mkdir(parents=True, exist_ok=True) print(f"Worker ready. Watching {PENDING}") print("Press Ctrl+C or stop the cell to quit.\\n") while True: for task_dir in sorted(PENDING.iterdir()): if not task_dir.is_dir(): continue task_id = task_dir.name run_dir = RUNNING / task_id done_dir = DONE / task_id # Move to running task_dir.rename(run_dir) print(f"[{task_id}] Running...") # Run setup.sh if present setup_sh = run_dir / "setup.sh" if setup_sh.exists(): subprocess.run(["bash", str(setup_sh)], cwd=str(run_dir), capture_output=True, timeout=300) # Find entry point entry = run_dir / "main.py" if not entry.exists(): # Try first .py file py_files = sorted(run_dir.glob("*.py")) entry = py_files[0] if py_files else None result = {"returncode": -1, "stdout": "", "stderr": "entry point not found"} if entry: try: cp = subprocess.run( ["python3", "-u", str(entry)], cwd=str(run_dir), capture_output=True, text=True, timeout=1800, # 30 min max per experiment ) result = { "returncode": cp.returncode, "stdout": cp.stdout, "stderr": cp.stderr, } except subprocess.TimeoutExpired as e: result = { "returncode": -1, "stdout": (e.stdout or b"").decode("utf-8", errors="replace"), "stderr": "Timed out after 1800s", "timed_out": True, } except Exception: result = { "returncode": -1, "stdout": "", "stderr": traceback.format_exc(), } # Write result and move to done (run_dir / "result.json").write_text(json.dumps(result)) run_dir.rename(done_dir) print(f"[{task_id}] Done (exit {result['returncode']})") time.sleep(10) ''' class ColabDriveSandbox: """Execute experiments asynchronously via Google Drive + Colab worker. Same public API as ExperimentSandbox/DockerSandbox/SshRemoteSandbox. 
""" def __init__(self, config: ColabDriveConfig, workdir: Path) -> None: self.config = config self.workdir = workdir.resolve() self.workdir.mkdir(parents=True, exist_ok=True) self._run_counter = 0 # Resolve drive root self.drive_root = Path(config.drive_root).expanduser().resolve() self.pending_dir = self.drive_root / "pending" self.done_dir = self.drive_root / "done" # ------------------------------------------------------------------ # Public API (matches SandboxProtocol) # ------------------------------------------------------------------ def run(self, code: str, *, timeout_sec: int = 300) -> SandboxResult: self._run_counter += 1 task_id = f"rc-{uuid.uuid4().hex[:8]}" # Stage locally staging = self.workdir / f"_colab_{self._run_counter}" staging.mkdir(parents=True, exist_ok=True) (staging / "main.py").write_text(code, encoding="utf-8") self._inject_harness(staging) self._write_setup_script(staging) return self._submit_and_wait(staging, task_id, timeout_sec) def run_project( self, project_dir: Path, *, entry_point: str = "main.py", timeout_sec: int = 300, ) -> SandboxResult: # BUG-DA8-07: Validate entry_point (path traversal, etc.) 
like other backends from researchclaw.experiment.sandbox import validate_entry_point err = validate_entry_point(entry_point) if err: return SandboxResult( returncode=-1, stdout="", stderr=err, elapsed_sec=0.0, metrics={}, ) self._run_counter += 1 task_id = f"rc-{uuid.uuid4().hex[:8]}" staging = self.workdir / f"_colab_project_{self._run_counter}" if staging.exists(): shutil.rmtree(staging) staging.mkdir(parents=True, exist_ok=True) self._inject_harness(staging) for src_item in project_dir.iterdir(): dest = staging / src_item.name if dest.name == "experiment_harness.py": continue if src_item.is_dir(): shutil.copytree(src_item, dest, dirs_exist_ok=True) elif src_item.is_file(): dest.write_bytes(src_item.read_bytes()) if not (staging / entry_point).exists(): return SandboxResult( returncode=-1, stdout="", stderr=f"Entry point {entry_point} not found", elapsed_sec=0.0, metrics={}, ) # BUG-DA8-07: Check resolved path doesn't escape staging dir from researchclaw.experiment.sandbox import validate_entry_point_resolved err2 = validate_entry_point_resolved(staging, entry_point) if err2: return SandboxResult( returncode=-1, stdout="", stderr=err2, elapsed_sec=0.0, metrics={}, ) self._write_setup_script(staging) return self._submit_and_wait(staging, task_id, timeout_sec) # ------------------------------------------------------------------ # Static helpers # ------------------------------------------------------------------ @staticmethod def check_drive_available(config: ColabDriveConfig) -> tuple[bool, str]: """Check if the Google Drive mount is accessible.""" if not config.drive_root: return False, "colab_drive.drive_root is empty" root = Path(config.drive_root).expanduser().resolve() try: exists = root.exists() except OSError: exists = False if not exists: return False, ( f"Drive root not found: {root}. " f"Is Google Drive for Desktop running and syncing?" 
        )
        return True, f"Google Drive accessible at {root}"

    @staticmethod
    def write_worker_notebook(output_path: Path) -> None:
        """Write the Colab worker template to a file for the user to upload."""
        output_path.write_text(COLAB_WORKER_TEMPLATE, encoding="utf-8")
        logger.info("Colab worker template written to %s", output_path)

    @staticmethod
    def _inject_harness(target_dir: Path) -> None:
        """Copy the bundled experiment harness into *target_dir* (best-effort).

        If ``harness_template.py`` is missing next to this module, the task is
        staged without a harness; no error is raised.
        """
        harness_src = Path(__file__).parent / "harness_template.py"
        if harness_src.exists():
            dest = target_dir / "experiment_harness.py"
            dest.write_text(
                harness_src.read_text(encoding="utf-8"), encoding="utf-8"
            )

    def _write_setup_script(self, staging: Path) -> None:
        """Write setup.sh if setup_script is configured."""
        if self.config.setup_script:
            setup_path = staging / "setup.sh"
            # "set -e" makes the worker's setup phase fail fast on errors.
            setup_path.write_text(
                f"#!/bin/bash\nset -e\n{self.config.setup_script}\n",
                encoding="utf-8",
            )

    # ------------------------------------------------------------------
    # Core: submit task and poll for result
    # ------------------------------------------------------------------
    def _submit_and_wait(
        self,
        staging: Path,
        task_id: str,
        timeout_sec: int,
    ) -> SandboxResult:
        """Submit an experiment task and wait for the Colab worker to complete it.

        Protocol:
        1. Copy experiment files to ``pending/<task_id>/`` in the shared Drive
        2. The Colab worker notebook polls ``pending/``, moves task to
           ``running/``, executes it, writes ``result.json``, moves to ``done/``
        3. This method polls ``done/<task_id>/`` until result appears or timeout

        Google Drive sync latency (typically 5-30s) is handled by the
        configurable poll_interval_sec. If the worker never picks up the task,
        the pending directory is cleaned up on timeout.
""" # Ensure directories exist self.pending_dir.mkdir(parents=True, exist_ok=True) self.done_dir.mkdir(parents=True, exist_ok=True) # Copy task to pending/ task_pending = self.pending_dir / task_id if task_pending.exists(): shutil.rmtree(task_pending) shutil.copytree(staging, task_pending) logger.info("Task %s submitted to %s", task_id, self.pending_dir) # Poll done/ for result task_done = self.done_dir / task_id effective_timeout = max(timeout_sec, self.config.timeout_sec) poll_interval = self.config.poll_interval_sec start = time.monotonic() while time.monotonic() - start < effective_timeout: if task_done.exists(): return self._collect_result(task_done, time.monotonic() - start) time.sleep(poll_interval) # Timeout — clean up pending task if still there elapsed = time.monotonic() - start if task_pending.exists(): shutil.rmtree(task_pending) return SandboxResult( returncode=-1, stdout="", stderr=( f"Colab worker did not complete task {task_id} " f"within {effective_timeout}s. " f"Is the Colab worker notebook running?" ), elapsed_sec=elapsed, metrics={}, timed_out=True, ) def _collect_result( self, task_done: Path, elapsed: float, ) -> SandboxResult: """Read result.json from the done task directory and clean up. The result.json schema (written by the Colab worker): {"returncode": int, "stdout": str, "stderr": str, "timed_out"?: bool} Metrics are parsed from stdout using the same ``metric_name: value`` format as the local and Docker sandboxes. 
""" result_file = task_done / "result.json" if not result_file.exists(): return SandboxResult( returncode=-1, stdout="", stderr="Colab worker did not write result.json", elapsed_sec=elapsed, metrics={}, ) try: data = json.loads(result_file.read_text(encoding="utf-8")) except (json.JSONDecodeError, OSError) as exc: return SandboxResult( returncode=-1, stdout="", stderr=f"Failed to read result.json: {exc}", elapsed_sec=elapsed, metrics={}, ) stdout = data.get("stdout", "") stderr = data.get("stderr", "") returncode = data.get("returncode", -1) timed_out = data.get("timed_out", False) metrics = parse_metrics(stdout) # Clean up shutil.rmtree(task_done, ignore_errors=True) return SandboxResult( returncode=returncode, stdout=stdout, stderr=stderr, elapsed_sec=elapsed, metrics=metrics, timed_out=timed_out, ) ================================================ FILE: researchclaw/experiment/docker_sandbox.py ================================================ """Docker-based sandbox for experiment code execution with GPU passthrough. Uses a single-container, three-phase execution model: Phase 0: pip install from requirements.txt (if present) Phase 1: Run setup.py for dataset downloads (if present) Phase 2: Run the experiment script (main.py) All phases run in the same container, so pip-installed packages persist into the experiment phase. Network can be disabled after setup via iptables (``setup_only`` policy). 
""" from __future__ import annotations import json import logging import os import re import shutil import subprocess import threading import time from pathlib import Path from researchclaw.config import DockerSandboxConfig from researchclaw.experiment.sandbox import ( SandboxResult, parse_metrics, validate_entry_point, validate_entry_point_resolved, ) logger = logging.getLogger(__name__) _CONTAINER_COUNTER = 0 _counter_lock = threading.Lock() def _next_container_name() -> str: global _CONTAINER_COUNTER # noqa: PLW0603 with _counter_lock: _CONTAINER_COUNTER += 1 return f"rc-exp-{_CONTAINER_COUNTER}-{os.getpid()}" # Packages already in the Docker image — skip during auto-detect. _BUILTIN_PACKAGES = { # PyTorch ecosystem "torch", "torchvision", "torchaudio", "torchdiffeq", # Scientific / ML "numpy", "scipy", "sklearn", "pandas", "matplotlib", "seaborn", "tqdm", "gymnasium", "networkx", # Extended ML ecosystem "timm", "einops", "torchmetrics", "albumentations", "kornia", "h5py", "tensorboard", # HuggingFace / LLM stack "transformers", "datasets", "accelerate", "peft", "trl", "bitsandbytes", "sentencepiece", "protobuf", "tokenizers", "safetensors", "evaluate", # Other pre-installed "yaml", "PIL", "mujoco", # Python stdlib "os", "sys", "math", "random", "json", "csv", "re", "time", "collections", "itertools", "functools", "pathlib", "typing", "dataclasses", "abc", "copy", "io", "logging", "argparse", "datetime", "hashlib", "pickle", "subprocess", "shutil", "tempfile", "warnings", "unittest", "contextlib", "operator", "string", "textwrap", "struct", "statistics", "glob", "urllib", "http", "email", "html", "xml", } # Map import names to pip package names. 
_IMPORT_TO_PIP = { "torchdiffeq": "torchdiffeq", "torch_geometric": "torch-geometric", "torchvision": "torchvision", "torchaudio": "torchaudio", "cv2": "opencv-python", "PIL": "Pillow", "sklearn": "scikit-learn", "yaml": "PyYAML", "gym": "gymnasium", "ogb": "ogb", "dgl": "dgl", "lightning": "lightning", "pytorch_lightning": "pytorch-lightning", "wandb": "wandb", "optuna": "optuna", } class DockerSandbox: """Execute experiment code inside a Docker container. Same public API as :class:`ExperimentSandbox` so the pipeline can use either backend transparently. The container uses ``entrypoint.sh`` which runs three phases in sequence: 0. ``pip install -r requirements.txt`` (if file present in /workspace) 1. ``python3 setup.py`` (if file present in /workspace) 2. ``python3 `` Network policy controls when network is available: - ``"none"``: No network at any point (``--network none``) - ``"setup_only"``: Network during Phase 0+1, disabled via iptables before Phase 2 - ``"pip_only"``: Network during Phase 0 only (legacy compat, same as setup_only) - ``"full"``: Network available throughout all phases """ def __init__(self, config: DockerSandboxConfig, workdir: Path) -> None: self.config = config self.workdir = workdir.resolve() self.workdir.mkdir(parents=True, exist_ok=True) self._run_counter = 0 # ------------------------------------------------------------------ # Public API # ------------------------------------------------------------------ def run(self, code: str, *, timeout_sec: int = 300) -> SandboxResult: """Run a single Python code string inside a container.""" self._run_counter += 1 staging = self.workdir / f"_docker_run_{self._run_counter}" staging.mkdir(parents=True, exist_ok=True) script_path = staging / "main.py" script_path.write_text(code, encoding="utf-8") # Inject experiment harness self._inject_harness(staging) return self._execute(staging, entry_point="main.py", timeout_sec=timeout_sec) def run_project( self, project_dir: Path, *, entry_point: str = 
"main.py", timeout_sec: int = 300, ) -> SandboxResult: """Run a multi-file experiment project inside a container.""" self._run_counter += 1 staging = self.workdir / f"_docker_project_{self._run_counter}" if staging.exists(): shutil.rmtree(staging) staging.mkdir(parents=True, exist_ok=True) # Pre-copy syntax validation — fail fast before any I/O err = validate_entry_point(entry_point) if err: return SandboxResult( returncode=-1, stdout="", stderr=err, elapsed_sec=0.0, metrics={}, ) # Inject harness first (immutable) self._inject_harness(staging) # Copy project files and subdirectories (skip harness overwrite) import shutil as _shutil for src_item in project_dir.iterdir(): dest = staging / src_item.name if src_item.name == "experiment_harness.py": logger.warning( "Project contains experiment_harness.py — skipping (immutable)" ) continue if src_item.is_file(): dest.write_bytes(src_item.read_bytes()) elif src_item.is_dir() and not src_item.name.startswith((".", "__")): _shutil.copytree(src_item, dest, dirs_exist_ok=True) # Post-copy resolve check — catches symlink-based escapes err = validate_entry_point_resolved(staging, entry_point) if err: return SandboxResult( returncode=-1, stdout="", stderr=err, elapsed_sec=0.0, metrics={}, ) entry = staging / entry_point if not entry.exists(): return SandboxResult( returncode=-1, stdout="", stderr=f"Entry point {entry_point} not found in project", elapsed_sec=0.0, metrics={}, ) return self._execute(staging, entry_point=entry_point, timeout_sec=timeout_sec) # ------------------------------------------------------------------ # Static helpers # ------------------------------------------------------------------ @staticmethod def check_docker_available() -> bool: """Return True if the Docker daemon is reachable.""" try: cp = subprocess.run( ["docker", "info"], capture_output=True, timeout=10, check=False, ) return cp.returncode == 0 except (FileNotFoundError, subprocess.TimeoutExpired): return False @staticmethod def 
check_nvidia_runtime() -> bool: """Return True if the NVIDIA Container Toolkit is available.""" try: cp = subprocess.run( ["docker", "run", "--rm", "--gpus", "all", "nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04", "nvidia-smi"], capture_output=True, timeout=30, check=False, ) return cp.returncode == 0 except (FileNotFoundError, subprocess.TimeoutExpired): return False @staticmethod def ensure_image(image: str) -> bool: """Return True if *image* exists locally (does NOT pull).""" try: cp = subprocess.run( ["docker", "image", "inspect", image], capture_output=True, timeout=10, check=False, ) return cp.returncode == 0 except (FileNotFoundError, subprocess.TimeoutExpired): return False @staticmethod def _inject_harness(target_dir: Path) -> None: harness_src = Path(__file__).parent / "harness_template.py" if harness_src.exists(): dest = target_dir / "experiment_harness.py" dest.write_text(harness_src.read_text(encoding="utf-8"), encoding="utf-8") logger.debug("Injected experiment harness into %s", target_dir) else: logger.warning("Harness template not found at %s", harness_src) # ------------------------------------------------------------------ # Internals # ------------------------------------------------------------------ def _execute( self, staging_dir: Path, *, entry_point: str, timeout_sec: int ) -> SandboxResult: """Core execution: single container, three-phase via entrypoint.sh.""" cfg = self.config container_name = _next_container_name() # Auto-generate requirements.txt if packages need installing if cfg.network_policy in ("pip_only", "setup_only", "full"): self._write_requirements_txt(staging_dir) # Build the docker run command cmd = self._build_run_command( staging_dir, entry_point=entry_point, container_name=container_name, ) start = time.monotonic() timed_out = False try: logger.debug("Docker run command: %s", cmd) completed = subprocess.run( cmd, capture_output=True, text=True, timeout=timeout_sec, check=False, ) stdout = completed.stdout stderr = 
completed.stderr returncode = completed.returncode elapsed = time.monotonic() - start except subprocess.TimeoutExpired as exc: timed_out = True stdout = exc.stdout or "" stderr = exc.stderr or "" if isinstance(stdout, bytes): stdout = stdout.decode("utf-8", errors="replace") if isinstance(stderr, bytes): stderr = stderr.decode("utf-8", errors="replace") returncode = -1 # Force-kill the container on timeout self._kill_container(container_name) elapsed = time.monotonic() - start except Exception as exc: # noqa: BLE001 elapsed = time.monotonic() - start return SandboxResult( returncode=-1, stdout="", stderr=f"Docker execution error: {exc}", elapsed_sec=elapsed, metrics={}, ) finally: # Always clean up the container regardless of how we exit. # docker rm -f is idempotent: safe even if container was # already removed by --rm, already dead, or never created. if not cfg.keep_containers: self._remove_container(container_name) # Parse metrics from stdout metrics = parse_metrics(stdout) # Try to read structured results.json from staging dir (volume-mounted) results_json_path = staging_dir / "results.json" if results_json_path.exists(): try: structured = json.loads( results_json_path.read_text(encoding="utf-8") ) if isinstance(structured, dict): for k, v in structured.items(): if k not in metrics: try: metrics[k] = float(v) except (TypeError, ValueError): pass except (json.JSONDecodeError, OSError): pass return SandboxResult( returncode=returncode, stdout=stdout, stderr=stderr, elapsed_sec=elapsed, metrics=metrics, timed_out=timed_out, ) def _build_run_command( self, staging_dir: Path, *, entry_point: str, container_name: str, ) -> list[str]: """Build the ``docker run`` command list. The container uses ``entrypoint.sh`` which handles: Phase 0: pip install requirements.txt Phase 1: python3 setup.py Phase 2: python3 Network policy determines --network and RC_SETUP_ONLY_NETWORK env. 
""" cfg = self.config cmd = [ "docker", "run", "--name", container_name, "--rm", "-v", f"{staging_dir}:/workspace", "-w", "/workspace", f"--memory={cfg.memory_limit_mb}m", f"--shm-size={cfg.shm_size_mb}m", ] # --- Network policy --- if cfg.network_policy == "none": # Fully isolated — no network at any point cmd.extend(["--network", "none"]) cmd.extend(["--user", f"{os.getuid()}:{os.getgid()}"]) elif cfg.network_policy in ("setup_only", "pip_only"): # Network during Phase 0+1, disabled via iptables before Phase 2. # Run as host user so experiment can write results.json to volume. # iptables requires NET_ADMIN but will gracefully degrade if # the user lacks root — network remains available but the code # has already been validated by the pipeline security check. cmd.extend(["-e", "RC_SETUP_ONLY_NETWORK=1"]) cmd.extend(["--user", f"{os.getuid()}:{os.getgid()}"]) cmd.extend(["--cap-add=NET_ADMIN"]) elif cfg.network_policy == "full": # Full network throughout — for development/debugging cmd.extend(["--user", f"{os.getuid()}:{os.getgid()}"]) # Mount pre-cached datasets # Priority: /opt/datasets (system) > ~/.cache/datasets (user) datasets_host = Path("/opt/datasets") user_datasets = Path.home() / ".cache" / "datasets" if datasets_host.is_dir(): cmd.extend(["-v", f"{datasets_host}:/workspace/data:ro"]) elif user_datasets.is_dir(): cmd.extend(["-v", f"{user_datasets}:/workspace/data:rw"]) else: # Create user-level cache so containers can download datasets user_datasets.mkdir(parents=True, exist_ok=True) cmd.extend(["-v", f"{user_datasets}:/workspace/data:rw"]) # Mount HuggingFace model cache (read-only for model weights). # BUG-103 fix: Don't set HF_HOME to the read-only mount — the # transformers library writes token/telemetry files under HF_HOME. # Instead, use HF_HUB_CACHE for read-only model access and let # HF_HOME default to a writable location inside the container. 
hf_mounted = False _hf_hub_cache = "/home/researcher/.cache/huggingface/hub" hf_home_env = os.environ.get("HF_HOME", "").strip() if hf_home_env: xdg_hf = Path(hf_home_env).resolve() if xdg_hf.is_dir(): cmd.extend(["-v", f"{xdg_hf}:{_hf_hub_cache}:ro"]) cmd.extend(["-e", f"HF_HUB_CACHE={_hf_hub_cache}"]) hf_mounted = True if not hf_mounted: hf_cache_host = Path.home() / ".cache" / "huggingface" if hf_cache_host.is_dir(): cmd.extend(["-v", f"{hf_cache_host}:{_hf_hub_cache}:ro"]) cmd.extend(["-e", f"HF_HUB_CACHE={_hf_hub_cache}"]) # BUG-107 fix: Set TORCH_HOME to writable location so torchvision # can download pretrained model weights (e.g., Inception-v3 for FID). cmd.extend(["-e", "TORCH_HOME=/workspace/.cache/torch"]) # BUG-R52-03: Set HOME to a writable directory. The container runs # as the host user (--user UID:GID) whose HOME defaults to "/" when # no matching passwd entry exists. pip --user then fails with # "Permission denied: '/.local'". cmd.extend(["-e", "HOME=/workspace/.home"]) # Pass HF token if available (for gated model downloads) hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") if hf_token: cmd.extend(["-e", f"HF_TOKEN={hf_token}"]) # GPU passthrough if cfg.gpu_enabled: if cfg.gpu_device_ids: device_spec = ",".join(str(d) for d in cfg.gpu_device_ids) cmd.extend(["--gpus", f"device={device_spec}"]) else: cmd.extend(["--gpus", "all"]) # Image + entry point (passed as CMD arg to entrypoint.sh) cmd.append(cfg.image) cmd.append(entry_point) return cmd def _write_requirements_txt(self, staging_dir: Path) -> None: """Generate requirements.txt in staging dir from auto-detected imports and explicit pip_pre_install, unless one already exists (LLM-generated). """ req_path = staging_dir / "requirements.txt" # If the LLM already generated a requirements.txt, respect it but # append any pip_pre_install packages not already listed. 
existing_reqs: set[str] = set() if req_path.exists(): for line in req_path.read_text(encoding="utf-8").splitlines(): line = line.strip() if line and not line.startswith("#"): # Extract package name (before any version specifier) pkg = re.split(r"[><=!~\[]", line)[0].strip().lower() existing_reqs.add(pkg) # Collect additional packages to install packages: list[str] = [] # From config pip_pre_install for pkg in self.config.pip_pre_install: pkg_base = re.split(r"[><=!~\[]", pkg)[0].strip().lower() if pkg_base not in existing_reqs: packages.append(pkg) existing_reqs.add(pkg_base) # Auto-detect from imports if self.config.auto_install_deps: detected = self._detect_pip_packages(staging_dir) for pkg in detected: pkg_base = pkg.lower() if pkg_base not in existing_reqs: packages.append(pkg) existing_reqs.add(pkg_base) if not packages and not req_path.exists(): return # Nothing to install if packages: mode = "a" if req_path.exists() else "w" with open(req_path, mode, encoding="utf-8") as f: if mode == "a": f.write("\n# Auto-detected by ResearchClaw\n") for pkg in packages: f.write(pkg + "\n") logger.info("requirements.txt updated: %s", packages) @staticmethod def _detect_pip_packages(staging_dir: Path) -> list[str]: """Scan Python files for import statements and return pip package names.""" import_re = re.compile( r"^\s*(?:import|from)\s+([\w.]+)", re.MULTILINE ) # Exclude local project modules (any .py file in staging_dir, recursive) # BUG-DA8-13: Use rglob to also scan subdirectories local_modules = { pyf.stem for pyf in staging_dir.rglob("*.py") } detected: list[str] = [] for pyf in staging_dir.rglob("*.py"): if pyf.name == "setup.py": continue # Don't scan setup.py for experiment deps text = pyf.read_text(encoding="utf-8", errors="replace") for m in import_re.finditer(text): top_module = m.group(1).split(".")[0] if top_module in _BUILTIN_PACKAGES: continue if top_module in local_modules: continue # Skip local project modules pip_name = _IMPORT_TO_PIP.get(top_module, 
top_module) if pip_name not in detected: detected.append(pip_name) return detected @staticmethod def _kill_container(name: str) -> None: try: subprocess.run( ["docker", "kill", name], capture_output=True, timeout=10, check=False, ) except (FileNotFoundError, subprocess.TimeoutExpired): pass @staticmethod def _remove_container(name: str) -> None: try: subprocess.run( ["docker", "rm", "-f", name], capture_output=True, timeout=10, check=False, ) except (FileNotFoundError, subprocess.TimeoutExpired): pass ================================================ FILE: researchclaw/experiment/evaluators/__init__.py ================================================ """Domain-specific experiment evaluators.""" ================================================ FILE: researchclaw/experiment/evaluators/convergence.py ================================================ """Convergence study evaluator for physics/math domains. Analyzes convergence data (error vs grid size/timestep) to determine convergence order and quality of numerical methods. """ from __future__ import annotations import logging import math from dataclasses import dataclass, field from typing import Any import numpy as np logger = logging.getLogger(__name__) @dataclass class ConvergenceResult: """Result of convergence analysis for one method.""" method: str convergence_order: float = 0.0 r_squared: float = 0.0 points: list[dict[str, float]] = field(default_factory=list) is_converging: bool = False expected_order: float | None = None # if known order_matches_expected: bool = False @dataclass class ConvergenceReport: """Full convergence analysis report.""" methods: list[ConvergenceResult] = field(default_factory=list) best_method: str = "" summary: str = "" def compute_convergence_order( h_values: list[float], errors: list[float], ) -> tuple[float, float]: """Compute convergence order via log-log linear regression. Parameters ---------- h_values : list[float] Grid sizes / timesteps (must be decreasing). 
    errors : list[float]
        Error norms corresponding to each h value.

    Returns
    -------
    order : float
        Estimated convergence order (slope in log-log space).
    r_squared : float
        R² of the log-log fit.
    """
    if len(h_values) < 2 or len(errors) < 2:
        return 0.0, 0.0
    # Filter out non-positive values — log() requires strictly positive
    # inputs; NaN/inf points would poison the regression.
    valid = [
        (h, e)
        for h, e in zip(h_values, errors)
        if h > 0 and e > 0 and math.isfinite(h) and math.isfinite(e)
    ]
    if len(valid) < 2:
        return 0.0, 0.0
    hs, es = zip(*valid)
    log_h = np.log(np.array(hs, dtype=np.float64))
    log_e = np.log(np.array(es, dtype=np.float64))
    # Linear regression: log(e) = p * log(h) + C
    # Closed-form least squares; the slope p is the convergence order.
    n = len(log_h)
    sum_x = np.sum(log_h)
    sum_y = np.sum(log_e)
    sum_xy = np.sum(log_h * log_e)
    sum_x2 = np.sum(log_h ** 2)
    denom = n * sum_x2 - sum_x ** 2
    if abs(denom) < 1e-15:
        # Degenerate fit: all h values (nearly) identical — slope undefined.
        return 0.0, 0.0
    slope = (n * sum_xy - sum_x * sum_y) / denom
    intercept = (sum_y - slope * sum_x) / n
    # R² — coefficient of determination of the log-log fit; guarded against
    # zero total variance (all errors equal).
    y_pred = slope * log_h + intercept
    ss_res = np.sum((log_e - y_pred) ** 2)
    ss_tot = np.sum((log_e - np.mean(log_e)) ** 2)
    r_squared = 1.0 - ss_res / ss_tot if ss_tot > 1e-15 else 0.0
    return float(slope), float(r_squared)


def analyze_convergence(
    convergence_data: dict[str, list[dict[str, float]]],
    expected_orders: dict[str, float] | None = None,
) -> ConvergenceReport:
    """Analyze convergence data for multiple methods.

    Parameters
    ----------
    convergence_data : dict
        Maps method name to list of {"h": ..., "error": ...} dicts.
    expected_orders : dict, optional
        Known convergence orders per method (for validation).
Returns ------- ConvergenceReport """ results: list[ConvergenceResult] = [] for method, points in convergence_data.items(): if not points: continue # Sort by h (descending — coarsest first) sorted_pts = sorted(points, key=lambda p: p.get("h", 0), reverse=True) h_vals = [p["h"] for p in sorted_pts if "h" in p] # Try "error", "l2_error", "linf_error" error_key = "error" for key in ("error", "l2_error", "linf_error"): if key in sorted_pts[0]: error_key = key break errors = [p.get(error_key, 0) for p in sorted_pts] order, r2 = compute_convergence_order(h_vals, errors) expected = None matches = False if expected_orders and method in expected_orders: expected = expected_orders[method] matches = abs(order - expected) < 0.5 # within half an order is_converging = order > 0.5 and r2 > 0.8 results.append(ConvergenceResult( method=method, convergence_order=order, r_squared=r2, points=sorted_pts, is_converging=is_converging, expected_order=expected, order_matches_expected=matches, )) # Find best method (highest convergence order) best = "" if results: best_result = max(results, key=lambda r: r.convergence_order) best = best_result.method # Generate summary summary_lines = [] for r in results: line = f"{r.method}: order={r.convergence_order:.2f} (R²={r.r_squared:.3f})" if r.expected_order is not None: status = "✓" if r.order_matches_expected else "✗" line += f" [expected={r.expected_order:.1f} {status}]" summary_lines.append(line) return ConvergenceReport( methods=results, best_method=best, summary="\n".join(summary_lines), ) ================================================ FILE: researchclaw/experiment/factory.py ================================================ """Factory for creating sandbox backends based on experiment config.""" from __future__ import annotations import logging from pathlib import Path from typing import TYPE_CHECKING from researchclaw.config import ExperimentConfig from researchclaw.experiment.sandbox import ExperimentSandbox, SandboxProtocol if 
def create_sandbox(config: ExperimentConfig, workdir: Path) -> SandboxProtocol:
    """Build the sandbox backend that matches ``config.mode``.

    - ``"sandbox"`` → :class:`ExperimentSandbox` (subprocess)
    - ``"docker"`` → ``DockerSandbox`` (Docker container)
    - ``"ssh_remote"`` → ``SshRemoteSandbox``
    - ``"colab_drive"`` → ``ColabDriveSandbox``

    Raises ``RuntimeError`` for an unknown mode or a failed backend
    precondition (missing image, unreachable host, etc.).
    """
    mode = config.mode

    if mode == "docker":
        from researchclaw.experiment.docker_sandbox import DockerSandbox

        cfg = config.docker
        if not DockerSandbox.check_docker_available():
            # Degrade gracefully to a plain subprocess sandbox.
            logger.warning(
                "Docker daemon is not reachable — "
                "falling back to subprocess sandbox."
            )
            return ExperimentSandbox(config.sandbox, workdir)
        if not DockerSandbox.ensure_image(cfg.image):
            raise RuntimeError(
                f"Docker image '{cfg.image}' not found locally. "
                f"Build it: docker build -t {cfg.image} researchclaw/docker/"
            )
        if cfg.gpu_enabled:
            logger.info("Docker sandbox: GPU passthrough enabled")
        return DockerSandbox(cfg, workdir)

    if mode == "ssh_remote":
        from researchclaw.experiment.ssh_sandbox import SshRemoteSandbox

        cfg = config.ssh_remote
        if not cfg.host:
            raise RuntimeError(
                "ssh_remote mode requires experiment.ssh_remote.host in config."
            )
        reachable, detail = SshRemoteSandbox.check_ssh_available(cfg)
        if not reachable:
            raise RuntimeError(f"SSH connectivity check failed: {detail}")
        logger.info("SSH remote sandbox: %s", detail)
        return SshRemoteSandbox(cfg, workdir)

    if mode == "colab_drive":
        from researchclaw.experiment.colab_sandbox import ColabDriveSandbox

        cfg = config.colab_drive
        reachable, detail = ColabDriveSandbox.check_drive_available(cfg)
        if not reachable:
            raise RuntimeError(f"Colab Drive check failed: {detail}")
        logger.info("Colab Drive sandbox: %s", detail)

        # Drop a worker template on the drive root once, so the user can
        # upload it to Colab without digging through the package.
        worker_path = Path(cfg.drive_root).expanduser() / "colab_worker.py"
        if not worker_path.exists():
            ColabDriveSandbox.write_worker_notebook(worker_path)
            logger.info(
                "Colab worker template written to %s — "
                "upload this to Colab and run it.",
                worker_path,
            )
        return ColabDriveSandbox(cfg, workdir)

    if mode != "sandbox":
        raise RuntimeError(
            f"Unsupported experiment mode for create_sandbox(): {mode}"
        )
    return ExperimentSandbox(config.sandbox, workdir)
class ExperimentGitManager:
    """Git-native experiment versioning, inspired by autoresearch.

    Every successful experiment is a commit; failed experiments are reset.
    This enables git log as an experiment journal and easy rollback.
    """

    def __init__(self, repo_dir: Path) -> None:
        # Directory used as cwd for every git command.
        self.repo_dir: Path = repo_dir
        # Branch created by create_experiment_branch(), if any.
        self._active_branch: str | None = None
        # Branch checked out at construction time; restored by
        # return_to_original_branch(). None when detection fails.
        self._original_branch: str | None = self._detect_current_branch()

    def create_experiment_branch(self, tag: str) -> str:
        """Create and check out branch ``experiment/<tag>``.

        Returns the branch name, or "" if the git command failed.
        """
        branch = f"experiment/{tag}"
        result = self._run_git(["checkout", "-b", branch])
        if result is None or result.returncode != 0:
            self._log_git_failure("create_experiment_branch", result)
            return ""
        self._active_branch = branch
        return branch

    def commit_experiment(
        self, run_id: str, metrics: dict[str, object], description: str
    ) -> str:
        """Stage all changes and commit them as an experiment record.

        Returns the new commit hash, or "" if add/commit/rev-parse fails
        at any step.
        """
        add_result = self._run_git(["add", "-A"])
        if add_result is None or add_result.returncode != 0:
            self._log_git_failure("git add", add_result)
            return ""
        message = self._format_commit_message(
            run_id=run_id, metrics=metrics, description=description
        )
        commit_result = self._run_git(["commit", "-m", message])
        if commit_result is None or commit_result.returncode != 0:
            self._log_git_failure("git commit", commit_result)
            return ""
        hash_result = self._run_git(["rev-parse", "HEAD"])
        if hash_result is None or hash_result.returncode != 0:
            self._log_git_failure("git rev-parse HEAD", hash_result)
            return ""
        return self._clean_output(hash_result.stdout)

    def discard_experiment(self, run_id: str, reason: str) -> bool:
        """Hard-reset the working tree to HEAD, dropping the experiment's changes."""
        logger.info("Discarding experiment %s: %s", run_id, reason)
        result = self._run_git(["reset", "--hard", "HEAD"])
        if result is None or result.returncode != 0:
            self._log_git_failure("discard_experiment", result)
            return False
        return True

    def get_experiment_history(self) -> list[dict[str, str]]:
        """Return parsed experiment commits from ``git log``.

        Only commits whose message contains the literal "experiment(" marker
        are selected; each parsed entry has "hash", "run_id", and "message".
        """
        result = self._run_git(["log", "--oneline", "--fixed-strings", "--grep", "experiment("])
        if result is None or result.returncode != 0:
            self._log_git_failure("git log", result)
            return []
        history: list[dict[str, str]] = []
        for line in result.stdout.splitlines():
            parsed = self._parse_experiment_log_line(line)
            if parsed is not None:
                history.append(parsed)
        return history

    def is_git_repo(self) -> bool:
        """Check whether repo_dir is inside a git repository."""
        result = self._run_git(["rev-parse", "--is-inside-work-tree"])
        return result is not None and result.returncode == 0

    def get_current_branch(self) -> str:
        """Return the name of the current branch, or '' on failure."""
        name = self._detect_current_branch()
        return name or ""

    def return_to_original_branch(self) -> bool:
        """Switch back to the branch that was active when the manager was created."""
        if not self._original_branch:
            return False
        result = self._run_git(["checkout", self._original_branch])
        if result is None or result.returncode != 0:
            self._log_git_failure("return_to_original_branch", result)
            return False
        self._active_branch = self._original_branch
        return True

    def get_experiment_diff(self) -> str:
        """Return the git diff of uncommitted changes (for logging/debugging)."""
        result = self._run_git(["diff", "--stat"])
        if result is None or result.returncode != 0:
            return ""
        return result.stdout.strip()

    def clean_untracked(self) -> bool:
        """Remove untracked files in the experiment workspace."""
        result = self._run_git(["clean", "-fd"])
        return result is not None and result.returncode == 0

    def _run_git(self, args: list[str]) -> subprocess.CompletedProcess[str] | None:
        """Run ``git <args>`` in repo_dir.

        Returns the completed process (check=False, so callers inspect
        returncode themselves), or None when the subprocess call itself
        raised (e.g. git not installed).
        """
        try:
            logger.debug("Running git command: git %s", " ".join(args))
            return subprocess.run(
                ["git", *args],
                cwd=self.repo_dir,
                capture_output=True,
                text=True,
                check=False,
            )
        except Exception as exc:  # noqa: BLE001
            logger.warning("Git operation failed (%s): %s", " ".join(args), exc)
            return None

    @staticmethod
    def _format_commit_message(
        *, run_id: str, metrics: dict[str, object], description: str
    ) -> str:
        """Build a commit message: "experiment(<run_id>): ..." subject plus a metrics JSON body."""
        metrics_json = json.dumps(metrics, sort_keys=True)
        return f"experiment({run_id}): {description}\n\nMetrics: {metrics_json}"

    @staticmethod
    def _clean_output(output: str) -> str:
        """Strip surrounding whitespace from raw git output."""
        return output.strip()

    @staticmethod
    def _parse_experiment_log_line(line: str) -> dict[str, str] | None:
        """Parse one ``git log --oneline`` line into hash/run_id/message, or None if it is not an experiment commit."""
        pattern = re.compile(r"^([0-9a-fA-F]+)\s+experiment\(([^)]+)\):\s*(.*)$")
        match = pattern.match(line.strip())
        if match is None:
            return None
        commit_hash, run_id, message = match.groups()
        return {"hash": commit_hash, "run_id": run_id, "message": message}

    @staticmethod
    def _log_git_failure(
        operation: str, result: subprocess.CompletedProcess[str] | None
    ) -> None:
        """Log a failed git operation, preferring stderr detail when present."""
        if result is None:
            logger.warning("Git operation failed for %s", operation)
            return
        stderr = result.stderr.strip()
        if stderr:
            logger.warning("Git operation failed for %s: %s", operation, stderr)
        else:
            logger.warning(
                "Git operation failed for %s with code %s", operation, result.returncode
            )

    def _detect_current_branch(self) -> str | None:
        """Detect the current git branch name, or None if not in a repo."""
        result = self._run_git(["rev-parse", "--abbrev-ref", "HEAD"])
        if result is None or result.returncode != 0:
            return None
        name = result.stdout.strip()
        return name if name else None
class ExperimentHarness:
    """Immutable experiment infrastructure for time and metric management."""

    def __init__(self, time_budget: int = 120):
        # Record the start time first so elapsed is measured from creation.
        self._start = time.time()
        self._time_budget = max(1, int(time_budget))
        self._metrics: dict[str, float] = {}
        self._partial_results: list[dict[str, object]] = []
        self._step_count = 0
        self._nan_count = 0

    @property
    def elapsed(self) -> float:
        """Seconds elapsed since harness creation."""
        now = time.time()
        return now - self._start

    @property
    def progress(self) -> float:
        """Fraction of the time budget consumed, clamped to [0.0, 1.0]."""
        used = self.elapsed / self._time_budget
        return used if used < 1.0 else 1.0

    def should_stop(self) -> bool:
        """Return True once 80% of the time budget has been consumed."""
        cutoff = self._time_budget * 0.8
        return self.elapsed >= cutoff

    def check_value(self, value: float, name: str = "metric") -> bool:
        """Return True if *value* is finite; warn and count NaN/Inf otherwise.

        After five non-finite values the harness finalizes results and
        exits the process with status 1.
        """
        if math.isfinite(value):
            return True
        self._nan_count += 1
        print(
            f"WARNING: {name} = {value} (non-finite, skipped)",
            file=sys.stderr,
        )
        if self._nan_count >= 5:
            print(
                "FAIL: Too many NaN/Inf values detected. "
                "Stopping experiment early.",
                file=sys.stderr,
            )
            self.finalize()
            sys.exit(1)
        return False

    def report_metric(self, name: str, value: float) -> None:
        """Validate and record a metric, echoing it in the standard format.

        Values that cannot be coerced to float are dropped with a warning;
        non-finite values (NaN, Inf) are rejected via check_value().
        """
        if not isinstance(value, (int, float)):
            try:
                value = float(value)
            except (TypeError, ValueError):
                print(
                    f"WARNING: Cannot convert {name}={value!r} to float",
                    file=sys.stderr,
                )
                return
        if self.check_value(value, name):
            self._metrics[name] = value
            # Standard format recognized by the sandbox metric parser.
            print(f"{name}: {value}")

    def log_result(self, result_dict: dict[str, object]) -> None:
        """Append a structured result row (e.g., per-condition results)."""
        self._partial_results.append(result_dict)

    def finalize(self) -> None:
        """Write results.json with all reported metrics and partial results."""
        payload = {
            "metrics": self._metrics,
            "elapsed_sec": round(self.elapsed, 2),
            "time_budget_sec": self._time_budget,
            "steps_completed": self._step_count,
            "nan_count": self._nan_count,
        }
        if self._partial_results:
            payload["results"] = self._partial_results
        try:
            with open("results.json", "w", encoding="utf-8") as fh:
                json.dump(payload, fh, indent=2, default=str)
        except OSError as e:
            print(f"WARNING: Could not write results.json: {e}", file=sys.stderr)

    def step(self) -> None:
        """Advance the step counter; call once per experiment step."""
        self._step_count += 1
@dataclass
class ExperimentResults:
    """Unified experiment results container.

    Works for all domains — ML scalar metrics, physics convergence data,
    economics regression tables, etc.
    """

    # Flat scalar metrics (backward-compatible with existing pipeline)
    scalars: dict[str, float] = field(default_factory=dict)
    # Per-condition results (new universal format)
    conditions: dict[str, dict[str, Any]] = field(default_factory=dict)
    # Convergence data (for physics/math domains)
    convergence: dict[str, list[dict[str, float]]] = field(default_factory=dict)
    # Regression tables (for economics)
    regression_table: dict[str, dict[str, Any]] = field(default_factory=dict)
    # Full structured data (raw JSON)
    structured: dict[str, Any] = field(default_factory=dict)
    # Metadata
    experiment_type: str = ""
    domain: str = ""
    total_runtime_sec: float = 0.0
    source: str = ""  # "json" | "csv" | "stdout"

    def to_flat_metrics(self) -> dict[str, float]:
        """Flatten every result type into a single ``dict[str, float]``.

        The existing pipeline expects dict[str, float] from parse_metrics();
        this method collapses conditions, convergence data, and regression
        tables into that format. Only finite int/float values are kept.
        """
        def usable(v: Any) -> bool:
            # Only finite numbers survive flattening.
            return isinstance(v, (int, float)) and math.isfinite(v)

        flat: dict[str, float] = dict(self.scalars)

        # Conditions: either {cond: {seed: {metric: val}}} (two levels)
        # or {cond: {metric: val}} (one level).
        for cond, inner in self.conditions.items():
            if not isinstance(inner, dict):
                continue
            for key, value in inner.items():
                if isinstance(value, dict):
                    for mname, mval in value.items():
                        if usable(mval):
                            flat[f"{cond}/{mname}"] = float(mval)
                elif usable(value):
                    flat[f"{cond}/{key}"] = float(value)

        # Convergence: keep only the final point per method, excluding "h".
        for method, pts in self.convergence.items():
            if not pts:
                continue
            for key, value in pts[-1].items():
                if key != "h" and usable(value):
                    flat[f"{method}/{key}"] = float(value)

        # Regression tables: {spec: {coefficient: value}}.
        for spec, coeffs in self.regression_table.items():
            if isinstance(coeffs, dict):
                for key, value in coeffs.items():
                    if usable(value):
                        flat[f"{spec}/{key}"] = float(value)

        return flat
class UniversalMetricParser:
    """Parse experiment results from multiple output formats.

    Usage::

        parser = UniversalMetricParser()
        results = parser.parse(run_dir)
        flat = results.to_flat_metrics()  # backward-compatible
    """

    def parse(self, run_dir: Path, stdout: str = "") -> ExperimentResults:
        """Parse experiment results from a run directory.

        Tries formats in order: JSON → CSV → stdout regex.
        """
        # 1. Try JSON
        results_json = run_dir / "results.json"
        if results_json.exists():
            try:
                result = self._parse_json(results_json)
                # Only accept the JSON result if it actually produced data;
                # otherwise fall through to the CSV / stdout parsers.
                if result.scalars or result.conditions or result.convergence or result.regression_table:
                    logger.info("Parsed results from results.json")
                    return result
            except Exception:
                logger.warning("Failed to parse results.json", exc_info=True)
        # 2. Try CSV
        results_csv = run_dir / "results.csv"
        if results_csv.exists():
            try:
                result = self._parse_csv(results_csv)
                # _parse_csv downgrades source to "none" when zero rows parsed.
                if result.source == "csv":
                    logger.info("Parsed results from results.csv")
                    return result
            except Exception:
                logger.warning("Failed to parse results.csv", exc_info=True)
        # 3. Fallback: stdout regex (existing behavior)
        if stdout:
            return self._parse_stdout(stdout)
        # Try reading stdout.log from run_dir
        stdout_log = run_dir / "stdout.log"
        if stdout_log.exists():
            try:
                stdout_text = stdout_log.read_text(encoding="utf-8", errors="replace")
                return self._parse_stdout(stdout_text)
            except Exception:
                logger.warning("Failed to read stdout.log", exc_info=True)
        return ExperimentResults(source="none")

    def _parse_json(self, path: Path) -> ExperimentResults:
        """Parse structured JSON results."""
        with path.open(encoding="utf-8") as fh:
            data = json.load(fh)
        if not isinstance(data, dict):
            return ExperimentResults(source="json")
        result = ExperimentResults(
            source="json",
            experiment_type=data.get("experiment_type", ""),
            structured=data,
        )
        # Extract metadata
        meta = data.get("metadata", {})
        if isinstance(meta, dict):
            result.domain = meta.get("domain", "")
            result.total_runtime_sec = float(meta.get("total_runtime_sec", 0))
        # Extract conditions (comparison experiments)
        conditions = data.get("conditions", {})
        if isinstance(conditions, dict):
            result.conditions = conditions
            # Also extract scalar metrics for backward compatibility
            for cond_name, seeds in conditions.items():
                if isinstance(seeds, dict):
                    for seed_key, metrics in seeds.items():
                        if isinstance(metrics, dict):
                            for metric_name, val in metrics.items():
                                if isinstance(val, (int, float)) and math.isfinite(val):
                                    result.scalars[f"{cond_name}/{metric_name}"] = float(val)
                                    # Bare metric name is also set, so the last
                                    # condition processed wins for that key.
                                    result.scalars[metric_name] = float(val)
                        elif isinstance(metrics, (int, float)) and math.isfinite(metrics):
                            result.scalars[f"{cond_name}/{seed_key}"] = float(metrics)
        # Extract convergence data
        convergence = data.get("convergence", {})
        if isinstance(convergence, dict):
            result.convergence = convergence
        # Extract regression table
        reg_table = data.get("regression_table", {})
        if isinstance(reg_table, dict):
            result.regression_table = reg_table
        # Top-level scalar metrics
        for key, val in data.items():
            if key not in ("conditions", "convergence", "regression_table", "metadata", "experiment_type"):
                if isinstance(val, (int, float)) and math.isfinite(val):
                    result.scalars[key] = float(val)
        return result

    def _parse_csv(self, path: Path) -> ExperimentResults:
        """Parse CSV results (one row per condition/seed/metric)."""
        text = path.read_text(encoding="utf-8", errors="replace")
        reader = csv.DictReader(StringIO(text))
        result = ExperimentResults(source="csv")
        rows_processed = 0
        for row in reader:
            rows_processed += 1
            # Expected columns: condition, seed, metric, value
            # Or: method, h, error (for convergence)
            cond = row.get("condition", row.get("method", ""))
            metric = row.get("metric", "")
            value_str = row.get("value", row.get("error", ""))
            try:
                val = float(value_str)
            except (ValueError, TypeError):
                continue
            if not math.isfinite(val):
                continue
            if metric:
                key = f"{cond}/{metric}" if cond else metric
                result.scalars[key] = val
            elif cond:
                # Convergence-style: method, h, error
                h_str = row.get("h", "")
                try:
                    h = float(h_str)
                except (ValueError, TypeError):
                    continue
                if cond not in result.convergence:
                    result.convergence[cond] = []
                result.convergence[cond].append({"h": h, "error": val})
        # Mark as CSV source if we processed any rows (even if no valid data)
        if rows_processed == 0:
            result.source = "none"
        return result

    def _parse_stdout(self, stdout: str) -> ExperimentResults:
        """Parse stdout using the existing regex-based parser.

        Delegates to ``sandbox.parse_metrics`` for backward compatibility.
        """
        # Imported lazily to avoid a module-level circular import with sandbox.py
        # — NOTE(review): presumed reason; confirm against import graph.
        from researchclaw.experiment.sandbox import parse_metrics
        metrics = parse_metrics(stdout)
        return ExperimentResults(
            scalars={k: float(v) for k, v in metrics.items() if isinstance(v, (int, float))},
            source="stdout",
        )
""" from researchclaw.experiment.sandbox import parse_metrics metrics = parse_metrics(stdout) return ExperimentResults( scalars={k: float(v) for k, v in metrics.items() if isinstance(v, (int, float))}, source="stdout", ) ================================================ FILE: researchclaw/experiment/runner.py ================================================ """Experiment execution engine inspired by autoresearch's edit→run→eval→keep/discard loop.""" from __future__ import annotations import json import logging import re from dataclasses import asdict, dataclass, field from pathlib import Path from typing import Protocol, cast from researchclaw.config import ExperimentConfig, SandboxConfig, SshRemoteConfig from researchclaw.experiment.factory import create_sandbox from researchclaw.experiment.sandbox import SandboxProtocol logger = logging.getLogger(__name__) @dataclass(frozen=True) class ExperimentResult: run_id: str iteration: int code: str metrics: dict[str, object] primary_metric: float | None improved: bool kept: bool elapsed_sec: float stdout: str stderr: str error: str | None = None @dataclass class ExperimentHistory: results: list[ExperimentResult] = field(default_factory=list) best_result: ExperimentResult | None = None baseline_metric: float | None = None def add(self, result: ExperimentResult) -> None: self.results.append(result) if self.baseline_metric is None and result.primary_metric is not None: self.baseline_metric = result.primary_metric def to_dict(self) -> dict[str, object]: return { "results": [asdict(result) for result in self.results], "best_result": asdict(self.best_result) if self.best_result else None, "baseline_metric": self.baseline_metric, } @classmethod def from_dict(cls, data: dict[str, object]) -> ExperimentHistory: results: list[ExperimentResult] = [] raw_results = data.get("results") if isinstance(raw_results, list): for item in cast(list[object], raw_results): if isinstance(item, dict): item_map = cast(dict[object, object], item) 
normalized_item: dict[str, object] = {} for key, value in item_map.items(): normalized_item[str(key)] = value parsed = _result_from_dict(normalized_item) if parsed is not None: results.append(parsed) best_raw = data.get("best_result") best_result = ( _result_from_dict( { str(key): value for key, value in cast(dict[object, object], best_raw).items() } ) if isinstance(best_raw, dict) else None ) baseline_metric_raw = data.get("baseline_metric") baseline_metric = ( float(baseline_metric_raw) if isinstance(baseline_metric_raw, (int, float)) else None ) return cls( results=results, best_result=best_result, baseline_metric=baseline_metric ) class _ChatResponse(Protocol): content: str class _ChatClient(Protocol): def chat( self, messages: list[dict[str, str]], *, system: str | None = None ) -> _ChatResponse: ... class _GitManager(Protocol): def is_git_repo(self) -> bool: ... def create_experiment_branch(self, tag: str) -> str: ... def commit_experiment(self, run_id: str, metrics: dict[str, object], description: str) -> str: ... def discard_experiment(self, run_id: str, reason: str) -> bool: ... def return_to_original_branch(self) -> bool: ... 
class ExperimentRunner:
    """Edit→run→eval→keep/discard experiment loop.

    Runs candidate code in a sandbox, tracks the best result by the
    configured primary metric, optionally asks an LLM for improved code
    each iteration, and (Phase 3) commits kept iterations to git.
    """

    def __init__(
        self,
        config: ExperimentConfig,
        workspace: Path,
        *,
        git_repo_dir: Path | None = None,
    ) -> None:
        self.config: ExperimentConfig = config
        self.workspace: Path = workspace
        self.workspace.mkdir(parents=True, exist_ok=True)
        self.remote_config: SshRemoteConfig = config.ssh_remote
        self.sandbox: SandboxProtocol = create_sandbox(config, workspace / "sandbox")
        self.history: ExperimentHistory = ExperimentHistory()
        # Git integration (Phase 3): enabled only when git_repo_dir points
        # at an actual git repository.
        self._git: _GitManager | None = None
        if git_repo_dir is not None:
            from researchclaw.experiment.git_manager import ExperimentGitManager
            mgr = ExperimentGitManager(git_repo_dir)
            if mgr.is_git_repo():
                self._git = mgr
            else:
                logger.warning("git_repo_dir %s is not a git repo; git integration disabled", git_repo_dir)

    def run_experiment(
        self, code: str, *, run_id: str, iteration: int = 0
    ) -> ExperimentResult:
        """Run *code* once in the sandbox and record the outcome in history.

        A result is "improved" when its primary metric beats the current
        best (or is the first metric seen); it is "kept" when the
        improvement also clears ``config.keep_threshold``.
        """
        sandbox_result = self.sandbox.run(code, timeout_sec=self.config.time_budget_sec)
        primary_metric = self._to_float(
            sandbox_result.metrics.get(self.config.metric_key)
        )
        current_best = (
            self.history.best_result.primary_metric
            if self.history.best_result
            else None
        )
        improved = False
        kept = False
        if primary_metric is not None:
            if current_best is None:
                # First measurable result is always kept as the baseline best.
                improved = True
                kept = True
            elif self._is_improvement(primary_metric, current_best):
                improved = True
                # Keep only if the improvement clears the configured threshold.
                kept = abs(primary_metric - current_best) > self.config.keep_threshold
        error: str | None = None
        if sandbox_result.timed_out:
            error = f"Timed out after {self.config.time_budget_sec}s"
        elif sandbox_result.returncode != 0:
            error = (
                sandbox_result.stderr.strip()
                or f"Process exited with {sandbox_result.returncode}"
            )
        result = ExperimentResult(
            run_id=run_id,
            iteration=iteration,
            code=code,
            metrics=sandbox_result.metrics,
            primary_metric=primary_metric,
            improved=improved,
            kept=kept,
            elapsed_sec=sandbox_result.elapsed_sec,
            stdout=sandbox_result.stdout,
            stderr=sandbox_result.stderr,
            error=error,
        )
        if kept:
            self.history.best_result = result
        self.history.add(result)
        return result

    def run_loop(
        self, initial_code: str, *, run_id: str, llm: _ChatClient | None = None
    ) -> ExperimentHistory:
        """Run the baseline, then iterate LLM-improved variants.

        Without an *llm* only the baseline runs. Iteration stops after
        ``config.max_iterations`` or three consecutive non-improving runs.
        """
        # Phase 3: Create experiment branch if git is available
        if self._git is not None:
            branch = self._git.create_experiment_branch(run_id)
            if branch:
                logger.info("Created experiment branch: %s", branch)
        current_code = initial_code
        baseline = self.run_experiment(current_code, run_id=run_id, iteration=0)
        # Phase 3: Commit baseline
        if self._git is not None and baseline.kept:
            self._git.commit_experiment(
                run_id=f"{run_id}-iter0",
                metrics=baseline.metrics,
                description=f"Baseline: {self.config.metric_key}={baseline.primary_metric}",
            )
        if llm is None:
            return self.history
        no_improvement_count = 0
        for iteration in range(1, self.config.max_iterations + 1):
            next_code = self._improve_code(llm, current_code, self.history)
            result = self.run_experiment(next_code, run_id=run_id, iteration=iteration)
            current_code = next_code
            # Phase 3: Git commit/discard based on result
            if self._git is not None:
                if result.kept:
                    self._git.commit_experiment(
                        run_id=f"{run_id}-iter{iteration}",
                        metrics=result.metrics,
                        description=f"Iter {iteration}: {self.config.metric_key}={result.primary_metric}",
                    )
                else:
                    self._git.discard_experiment(
                        run_id=f"{run_id}-iter{iteration}",
                        reason=f"No improvement at iteration {iteration}",
                    )
            if result.improved:
                no_improvement_count = 0
            else:
                no_improvement_count += 1
                if no_improvement_count >= 3:
                    logger.info("Stopping early due to 3 non-improving iterations")
                    break
        # Phase 3: Return to original branch
        if self._git is not None:
            self._git.return_to_original_branch()
        return self.history

    def _improve_code(
        self, llm: _ChatClient, current_code: str, history: ExperimentHistory
    ) -> str:
        """Ask the LLM for an improved version of *current_code*.

        Falls back to *current_code* on any LLM failure, empty response,
        or empty extracted code block.
        """
        direction = self.config.metric_direction
        last_result = history.results[-1] if history.results else None
        last_metrics = last_result.metrics if last_result else {}
        best_metrics = history.best_result.metrics if history.best_result else {}
        last_metric = last_result.primary_metric if last_result else None
        best_metric = (
            history.best_result.primary_metric if history.best_result else None
        )
        prompt = (
            "Improve the experiment code to optimize the primary metric.\n\n"
            f"Metric key: {self.config.metric_key}\n"
            f"Direction: {direction}\n"
            f"Last primary metric: {last_metric}\n"
            f"Best primary metric: {best_metric}\n"
            f"Last metrics JSON: {json.dumps(last_metrics, ensure_ascii=True)}\n"
            f"Best metrics JSON: {json.dumps(best_metrics, ensure_ascii=True)}\n\n"
            "Current code:\n"
            "```python\n"
            f"{current_code}\n"
            "```\n\n"
            "Return only the updated Python code."
        )
        try:
            response = llm.chat(
                [{"role": "user", "content": prompt}],
                system="You are an expert ML experimentation assistant.",
            )
        except Exception as exc:  # noqa: BLE001
            logger.exception("Code improvement call failed: %s", exc)
            return current_code
        candidate = getattr(response, "content", "")
        if not isinstance(candidate, str) or not candidate.strip():
            logger.warning("LLM returned empty code; keeping current version")
            return current_code
        extracted = self._extract_python_code(candidate)
        return extracted if extracted.strip() else current_code

    def save_history(self, path: Path) -> None:
        """Write the serialized history to *path* as JSON."""
        path.parent.mkdir(parents=True, exist_ok=True)
        _ = path.write_text(
            json.dumps(self.history.to_dict(), indent=2), encoding="utf-8"
        )

    def _is_improvement(self, new_value: float, best_value: float) -> bool:
        """Compare metrics according to ``config.metric_direction``."""
        if self.config.metric_direction == "maximize":
            return new_value > best_value
        return new_value < best_value

    @staticmethod
    def _to_float(value: object) -> float | None:
        """Coerce a raw metric value to float; bools and bad strings yield None."""
        if isinstance(value, bool):
            return None
        if isinstance(value, (int, float)):
            return float(value)
        if isinstance(value, str):
            try:
                return float(value)
            except ValueError:
                return None
        return None

    @staticmethod
    def _extract_python_code(content: str) -> str:
        """Extract the first fenced code block, or the whole text if none."""
        match = re.search(r"```(?:python)?\s*(.*?)\s*```", content, flags=re.DOTALL)
        if match is None:
            return content.strip()
        return match.group(1).strip()
def _result_from_dict(data: dict[str, object]) -> ExperimentResult | None:
    """Reconstruct an :class:`ExperimentResult` from its serialized form.

    Every field is type-checked; returns None when the payload does not
    match the expected schema.
    """
    fields = {
        key: data.get(key)
        for key in (
            "run_id", "iteration", "code", "metrics", "primary_metric",
            "improved", "kept", "elapsed_sec", "stdout", "stderr", "error",
        )
    }

    # Required string fields.
    for key in ("run_id", "code", "stdout", "stderr"):
        if not isinstance(fields[key], str):
            return None
    # Required typed fields.
    if not isinstance(fields["iteration"], int):
        return None
    if not isinstance(fields["metrics"], dict):
        return None
    if not isinstance(fields["improved"], bool) or not isinstance(fields["kept"], bool):
        return None
    if not isinstance(fields["elapsed_sec"], (int, float)):
        return None
    # Optional fields: None is allowed, otherwise type-checked.
    pm = fields["primary_metric"]
    if pm is not None and not isinstance(pm, (int, float)):
        return None
    err = fields["error"]
    if err is not None and not isinstance(err, str):
        return None

    # Normalize metric keys to str regardless of the stored key type.
    metrics_map: dict[str, object] = {
        str(key): value
        for key, value in cast(dict[object, object], fields["metrics"]).items()
    }

    return ExperimentResult(
        run_id=fields["run_id"],
        iteration=fields["iteration"],
        code=fields["code"],
        metrics=metrics_map,
        primary_metric=float(pm) if isinstance(pm, (int, float)) else None,
        improved=fields["improved"],
        kept=fields["kept"],
        elapsed_sec=float(fields["elapsed_sec"]),
        stdout=fields["stdout"],
        stderr=fields["stderr"],
        error=err,
    )
def validate_entry_point_resolved(staging: Path, entry_point: str) -> str | None:
    """Validate that *entry_point* resolves inside *staging*.

    Returns an error message if invalid, ``None`` if valid.

    Call this **after** copying files so that symlinks are resolved
    against the real staging contents.
    """
    staging_real = staging.resolve()
    target = (staging / entry_point).resolve()
    if target.is_relative_to(staging_real):
        return None
    return f"Entry point escapes staging directory: {entry_point}"
# Plain metric line: "metric: value", optionally preceded by a single
# "key=value" tag (e.g. "epoch=3 loss: 0.5").
_METRIC_PATTERN = re.compile(
    rf"^(?:\S+=\S+\s+)?(\w[\w.]*)\s*:\s*({_FLOAT_RE})\s*$"
)

# R17: Extract per-condition metrics with optional extra tags:
# "condition= [regime=] [H=] [seed=] metric: value"
# Captures: (condition_name, extra_tags_string, metric_name, value)
_CONDITION_METRIC_PATTERN = re.compile(
    rf"^condition=(\S+)\s+((?:\S+=\S+\s+)*)(\w[\w.]*)\s*:\s*({_FLOAT_RE})\s*$"
)

# R16-1: Ratio format with optional extra tags ("metric: N/M")
_CONDITION_RATIO_PATTERN = re.compile(
    r"^condition=(\S+)\s+((?:\S+=\S+\s+)*)(\w[\w.]*)\s*:\s*(\d+)/(\d+)\s*$"
)

# BUG-181: Parse SUMMARY lines: "SUMMARY condition=X metric=Y mean=M std=S [success_rate=R]"
_SUMMARY_PATTERN = re.compile(
    r"^SUMMARY\s+condition=(\S+)\s+metric=(\S+)\s+mean=("
    + _FLOAT_RE
    + r")\s+std=("
    + _FLOAT_RE
    + r")"
)

# BUG-181: Multi-metric condition line: extract all "metric: value" pairs
_CONDITION_MULTI_METRIC_RE = re.compile(
    r"(\w[\w.]*)\s*:\s*(" + _FLOAT_RE + r")"
)


def _to_text(value: str | bytes | None) -> str:
    """Coerce raw process output (str, bytes, or None) to a str."""
    if value is None:
        return ""
    if isinstance(value, bytes):
        return value.decode("utf-8", errors="replace")
    return value


def parse_metrics(stdout: str) -> dict[str, float]:
    """Extract metrics from experiment stdout, one candidate format per line.

    Formats are tried in priority order: SUMMARY lines, per-condition
    ratio lines, per-condition single-metric lines, per-condition
    multi-metric lines, then plain "metric: value" lines. Non-finite
    values and names rejected by is_metric_name() are skipped.
    """
    metrics: dict[str, float] = {}
    for line in stdout.splitlines():
        stripped = line.strip()

        # BUG-181: Parse SUMMARY lines first (most reliable, one metric per line)
        # Format: "SUMMARY condition=X metric=Y mean=M std=S [success_rate=R]"
        summary_match = _SUMMARY_PATTERN.match(stripped)
        if summary_match:
            cond_name, metric_name, mean_str, std_str = summary_match.groups()
            if is_metric_name(metric_name):
                try:
                    mean_val = float(mean_str)
                    std_val = float(std_str)
                except ValueError:
                    continue
                if not (math.isnan(mean_val) or math.isinf(mean_val)):
                    # Record condition-scoped, mean/std-suffixed, and bare keys.
                    metrics[f"{cond_name}/{metric_name}"] = mean_val
                    metrics[f"{cond_name}/{metric_name}_mean"] = mean_val
                    metrics[f"{cond_name}/{metric_name}_std"] = std_val
                    metrics[metric_name] = mean_val
                continue

        # R16-1: Try ratio format first: "condition=X [tags] metric: N/M"
        ratio_match = _CONDITION_RATIO_PATTERN.match(stripped)
        if ratio_match:
            cond_name, extra_tags, name, num, den = ratio_match.groups()
            if is_metric_name(name):
                try:
                    # Zero denominator maps to 0.0 rather than an error.
                    val = float(num) / float(den) if float(den) != 0 else 0.0
                except (ValueError, ZeroDivisionError):
                    continue
                # Build composite key from condition + extra tags
                tag_parts = [cond_name]
                for tag in extra_tags.strip().split():
                    if "=" in tag:
                        tag_parts.append(tag.split("=", 1)[1])
                composite_key = "/".join(tag_parts)
                metrics[f"{composite_key}/{name}"] = val
                metrics[f"{cond_name}/{name}"] = val
                metrics[name] = val
                continue

        # Try condition-prefixed format: "condition=X [tags] metric: value"
        cond_match = _CONDITION_METRIC_PATTERN.match(stripped)
        if cond_match:
            cond_name, extra_tags, name, value = cond_match.groups()
            if is_metric_name(name):
                try:
                    val = float(value)
                except ValueError:
                    continue
                if math.isnan(val) or math.isinf(val):
                    logger.warning("Skipping non-finite metric %s=%s", name, value)
                    continue
                # Build composite key from condition + extra tags
                tag_parts = [cond_name]
                for tag in extra_tags.strip().split():
                    if "=" in tag:
                        tag_parts.append(tag.split("=", 1)[1])
                composite_key = "/".join(tag_parts)
                metrics[f"{composite_key}/{name}"] = val
                metrics[f"{cond_name}/{name}"] = val
                metrics[name] = val
                continue

        # BUG-181: Multi-metric condition line fallback
        # Handles: "condition=X seed=S metric1: v1 metric2: v2 ..."
        # (lines not matched by _CONDITION_METRIC_PATTERN due to multiple metrics)
        if stripped.startswith("condition="):
            _parts = stripped.split()
            _cond = _parts[0].split("=", 1)[1] if "=" in _parts[0] else None
            _seed = None
            for _p in _parts[1:]:
                if _p.startswith("seed="):
                    _seed = _p.split("=", 1)[1]
                    break
            if _cond:
                for _mm in _CONDITION_MULTI_METRIC_RE.finditer(stripped):
                    _mname, _mval_str = _mm.groups()
                    if is_metric_name(_mname):
                        try:
                            _mval = float(_mval_str)
                        except ValueError:
                            continue
                        if math.isnan(_mval) or math.isinf(_mval):
                            continue
                        if _seed is not None:
                            metrics[f"{_cond}/{_seed}/{_mname}"] = _mval
                        metrics[f"{_cond}/{_mname}"] = _mval
                        metrics[_mname] = _mval
            continue

        # Plain format: "metric: value"
        match = _METRIC_PATTERN.match(stripped)
        if match is None:
            continue
        name, value = match.groups()
        if not is_metric_name(name):
            continue
        try:
            val = float(value)
        except ValueError:
            continue
        # R5-3: Skip NaN/Inf values — they indicate divergence
        if math.isnan(val) or math.isinf(val):
            logger.warning("Skipping non-finite metric %s=%s", name, value)
            continue
        metrics[name] = val
    return metrics
""" results: list[dict[str, object]] = [] pattern = re.compile( r"^PAIRED:\s+(\S+)\s+vs\s+(\S+)\s*(.*?)mean_diff=([+-]?\d+\.?\d*)" r".*?std_diff=([+-]?\d+\.?\d*)" r".*?t_stat=([+-]?\d+\.?\d*)" r".*?p_value=([+-]?\d+\.?\d*)" ) for line in stdout.splitlines(): m = pattern.match(line.strip()) if m: method, baseline, tags, mean_diff, std_diff, t_stat, p_value = m.groups() entry: dict[str, object] = { "method": method, "baseline": baseline, "mean_diff": float(mean_diff), "std_diff": float(std_diff), "t_stat": float(t_stat), "p_value": float(p_value), } # Extract regime if present regime_m = re.search(r"regime=(\S+)", tags) if regime_m: entry["regime"] = regime_m.group(1) # Extract CI if present ci_m = re.search(r"ci95=\(([^,]+),([^)]+)\)", line) if ci_m: entry["ci95_low"] = float(ci_m.group(1)) entry["ci95_high"] = float(ci_m.group(2)) results.append(entry) return results def detect_nan_divergence(stdout: str, stderr: str) -> str | None: """Check stdout/stderr for NaN/Inf/divergence indicators. Returns a description of the issue if detected, None otherwise. 
""" issues: list[str] = [] combined = (stdout or "") + "\n" + (stderr or "") lower = combined.lower() # Check for NaN indicators if "nan" in lower: for pattern in ("loss: nan", "nan loss", "math domain error", "loss is nan"): if pattern in lower: issues.append(f"NaN detected: '{pattern}' found in output") break else: # Generic NaN mention — could be a false positive but worth flagging if re.search(r"\bnan\b", lower): issues.append("Possible NaN detected in output") # Check for Inf indicators if "inf" in lower: if re.search(r"\binf\b", lower) and "info" not in lower.split("inf")[0][-4:]: issues.append("Possible Inf value detected in output") # Check for divergence (loss > 100 is a common fast-fail threshold) for line in stdout.splitlines(): match = _METRIC_PATTERN.match(line.strip()) if match: name, value = match.groups() try: val = float(value) if math.isnan(val) or math.isinf(val): issues.append(f"Non-finite metric: {name}={value}") elif "loss" in name.lower() and val > 100: issues.append(f"Diverging loss: {name}={val} (>100)") except ValueError: pass return "; ".join(issues) if issues else None @dataclass(frozen=True) class SandboxResult: returncode: int stdout: str stderr: str elapsed_sec: float metrics: dict[str, object] timed_out: bool = False class SandboxProtocol(Protocol): """Structural type for sandbox backends (ExperimentSandbox, DockerSandbox).""" def run(self, code: str, *, timeout_sec: int = 300) -> SandboxResult: ... def run_project( self, project_dir: Path, *, entry_point: str = "main.py", timeout_sec: int = 300, ) -> SandboxResult: ... 
class ExperimentSandbox: def __init__(self, config: SandboxConfig, workdir: Path) -> None: self.config: SandboxConfig = config self.workdir: Path = workdir.resolve() self.workdir.mkdir(parents=True, exist_ok=True) self._run_counter: int = 0 def run(self, code: str, *, timeout_sec: int = 300) -> SandboxResult: script_path = self._next_script_path() self._write_script(script_path, code) start = time.monotonic() command = self._build_command(script_path) logger.debug("Running sandbox command: %s", command) result: SandboxResult try: env = {**os.environ, "PYTHONUNBUFFERED": "1"} completed = subprocess.run( command, capture_output=True, text=True, timeout=timeout_sec, cwd=self.workdir, env=env, check=False, ) result = self._result_from_completed( completed, elapsed_sec=time.monotonic() - start ) except subprocess.TimeoutExpired as exc: result = self._result_from_timeout( exc, timeout_sec=timeout_sec, elapsed_sec=time.monotonic() - start ) except Exception as exc: # noqa: BLE001 result = self._result_from_exception( exc, elapsed_sec=time.monotonic() - start ) if self._should_cleanup(result): self._cleanup_script(script_path) return result def run_project( self, project_dir: Path, *, entry_point: str = "main.py", timeout_sec: int = 300, ) -> SandboxResult: """Run a multi-file experiment project in the sandbox. Copies all ``.py`` files from *project_dir* into the sandbox work directory and executes *entry_point*. 
""" import shutil # BUG-DA8-06: Use unique dir name to prevent races under concurrent calls self._run_counter += 1 sandbox_project = self.workdir / f"_project_{self._run_counter}" if sandbox_project.exists(): shutil.rmtree(sandbox_project) sandbox_project.mkdir(parents=True, exist_ok=True) # Pre-copy syntax validation — fail fast before any I/O err = validate_entry_point(entry_point) if err: return SandboxResult( returncode=-1, stdout="", stderr=err, elapsed_sec=0.0, metrics={}, ) # R5-4: Inject immutable experiment harness before copying project files self._inject_harness(sandbox_project) # Copy all project files (will NOT overwrite harness — harness name is unique) for src_file in project_dir.iterdir(): if src_file.is_file(): dest = sandbox_project / src_file.name # Do not allow project to overwrite the harness if dest.name == "experiment_harness.py": logger.warning("Project contains experiment_harness.py — skipping (immutable)") continue dest.write_bytes(src_file.read_bytes()) elif src_file.is_dir() and not src_file.name.startswith("."): import shutil as _shutil_proj dest_dir = sandbox_project / src_file.name _shutil_proj.copytree(src_file, dest_dir, dirs_exist_ok=True) # Post-copy resolve check — catches symlink-based escapes err = validate_entry_point_resolved(sandbox_project, entry_point) if err: return SandboxResult( returncode=-1, stdout="", stderr=err, elapsed_sec=0.0, metrics={}, ) entry = sandbox_project / entry_point if not entry.exists(): return SandboxResult( returncode=-1, stdout="", stderr=f"Entry point {entry_point} not found in project", elapsed_sec=0.0, metrics={}, ) start = time.monotonic() command = self._build_command(entry) logger.debug("Running project sandbox command: %s (cwd=%s)", command, sandbox_project) result: SandboxResult try: env = {**os.environ, "PYTHONUNBUFFERED": "1"} completed = subprocess.run( command, capture_output=True, text=True, timeout=timeout_sec, cwd=sandbox_project, env=env, check=False, ) result = 
self._result_from_completed( completed, elapsed_sec=time.monotonic() - start ) except subprocess.TimeoutExpired as exc: result = self._result_from_timeout( exc, timeout_sec=timeout_sec, elapsed_sec=time.monotonic() - start ) except Exception as exc: # noqa: BLE001 result = self._result_from_exception( exc, elapsed_sec=time.monotonic() - start ) return result @staticmethod def _inject_harness(target_dir: Path) -> None: """Copy the immutable experiment harness into the target directory.""" harness_src = Path(__file__).parent / "harness_template.py" if harness_src.exists(): dest = target_dir / "experiment_harness.py" dest.write_text(harness_src.read_text(encoding="utf-8"), encoding="utf-8") logger.debug("Injected experiment harness into %s", target_dir) else: logger.warning("Harness template not found at %s", harness_src) def _next_script_path(self) -> Path: self._run_counter += 1 return self.workdir / f"_experiment_{self._run_counter}.py" @staticmethod def _write_script(script_path: Path, code: str) -> None: _ = script_path.write_text(code, encoding="utf-8") def _build_command(self, script_path: Path) -> list[str]: # Convert relative python_path to absolute WITHOUT resolving symlinks. # Using .resolve() would follow venv symlinks to the system Python binary, # which loses the venv context (site-packages like numpy become unavailable). 
python = self.config.python_path python_path = Path(python) if not python_path.is_absolute(): python_path = Path.cwd() / python_path # -u: unbuffered stdout/stderr so subprocess.run captures all output return [str(python_path), "-u", str(script_path)] @staticmethod def _result_from_completed( completed: subprocess.CompletedProcess[str], *, elapsed_sec: float ) -> SandboxResult: metrics = parse_metrics(completed.stdout) return SandboxResult( returncode=completed.returncode, stdout=completed.stdout, stderr=completed.stderr, elapsed_sec=elapsed_sec, metrics={key: value for key, value in metrics.items()}, ) @staticmethod def _result_from_timeout( exc: subprocess.TimeoutExpired, *, timeout_sec: int, elapsed_sec: float, ) -> SandboxResult: stdout = _to_text(exc.stdout) stderr = _to_text(exc.stderr) metrics = parse_metrics(stdout) logger.warning("Sandbox execution timed out after %ss", timeout_sec) return SandboxResult( returncode=-1, stdout=stdout, stderr=stderr, elapsed_sec=elapsed_sec, metrics={key: value for key, value in metrics.items()}, timed_out=True, ) @staticmethod def _result_from_exception(exc: Exception, *, elapsed_sec: float) -> SandboxResult: logger.exception("Sandbox execution failed: %s", exc) return SandboxResult( returncode=-1, stdout="", stderr=str(exc), elapsed_sec=elapsed_sec, metrics={}, ) @staticmethod def _should_cleanup(result: SandboxResult) -> bool: return result.returncode == 0 and not result.timed_out @staticmethod def _cleanup_script(script_path: Path) -> None: try: script_path.unlink(missing_ok=True) except Exception: # noqa: BLE001 logger.warning("Failed to delete temporary file: %s", script_path) ================================================ FILE: researchclaw/experiment/ssh_sandbox.py ================================================ """SSH remote sandbox for experiment code execution on remote GPU servers. Uploads experiment code via scp, executes via ssh, and collects results. 
Supports any SSH-accessible machine including cloud VMs, lab servers, and Colab instances with SSH tunnels. """ from __future__ import annotations import logging import os import shlex import shutil import subprocess import time import uuid from pathlib import Path from researchclaw.config import SshRemoteConfig from researchclaw.experiment.sandbox import ( SandboxResult, parse_metrics, validate_entry_point, validate_entry_point_resolved, ) logger = logging.getLogger(__name__) class SshRemoteSandbox: """Execute experiment code on a remote machine via SSH. Same public API as :class:`ExperimentSandbox` and :class:`DockerSandbox` so the pipeline can use any backend transparently. Execution model: 1. Create a unique run directory on the remote host 2. Upload code (and harness) via scp 3. Optionally run setup commands (pip install, conda activate, etc.) 4. Execute the experiment script via ssh 5. Parse stdout for metrics 6. Clean up the remote run directory """ def __init__(self, config: SshRemoteConfig, workdir: Path) -> None: self.config = config self.workdir = workdir.resolve() self.workdir.mkdir(parents=True, exist_ok=True) self._run_counter = 0 # ------------------------------------------------------------------ # Public API (matches SandboxProtocol) # ------------------------------------------------------------------ def run(self, code: str, *, timeout_sec: int = 300) -> SandboxResult: """Run a single Python code string on the remote host.""" self._run_counter += 1 staging = self.workdir / f"_ssh_run_{self._run_counter}" staging.mkdir(parents=True, exist_ok=True) script_path = staging / "main.py" script_path.write_text(code, encoding="utf-8") self._inject_harness(staging) return self._execute(staging, entry_point="main.py", timeout_sec=timeout_sec) def run_project( self, project_dir: Path, *, entry_point: str = "main.py", timeout_sec: int = 300, ) -> SandboxResult: """Run a multi-file experiment project on the remote host.""" self._run_counter += 1 staging = 
self.workdir / f"_ssh_project_{self._run_counter}" if staging.exists(): shutil.rmtree(staging) staging.mkdir(parents=True, exist_ok=True) # Pre-copy syntax validation — fail fast before any I/O err = validate_entry_point(entry_point) if err: return SandboxResult( returncode=-1, stdout="", stderr=err, elapsed_sec=0.0, metrics={}, ) self._inject_harness(staging) for src_item in project_dir.iterdir(): dest = staging / src_item.name if dest.name == "experiment_harness.py": logger.warning( "Project contains experiment_harness.py — skipping (immutable)" ) continue if src_item.is_dir(): shutil.copytree(src_item, dest, dirs_exist_ok=True) elif src_item.is_file(): dest.write_bytes(src_item.read_bytes()) # Post-copy resolve check — catches symlink-based escapes err = validate_entry_point_resolved(staging, entry_point) if err: return SandboxResult( returncode=-1, stdout="", stderr=err, elapsed_sec=0.0, metrics={}, ) entry = staging / entry_point if not entry.exists(): return SandboxResult( returncode=-1, stdout="", stderr=f"Entry point {entry_point} not found in project", elapsed_sec=0.0, metrics={}, ) return self._execute(staging, entry_point=entry_point, timeout_sec=timeout_sec) # ------------------------------------------------------------------ # Static helpers # ------------------------------------------------------------------ @staticmethod def check_ssh_available(config: SshRemoteConfig) -> tuple[bool, str]: """Return (ok, message) after testing SSH connectivity.""" if not config.host: return False, "ssh_remote.host is empty" cmd = _build_ssh_base(config, extra_opts=["-o", "ConnectTimeout=10"]) cmd.append("echo researchclaw-ssh-ok") try: cp = subprocess.run( cmd, capture_output=True, text=True, timeout=15, check=False, ) if cp.returncode == 0 and "researchclaw-ssh-ok" in cp.stdout: return True, f"SSH connection to {config.host} OK" return False, f"SSH test failed (exit {cp.returncode}): {cp.stderr.strip()}" except subprocess.TimeoutExpired: return False, f"SSH 
connection to {config.host} timed out" except FileNotFoundError: return False, "ssh command not found on PATH" @staticmethod def _inject_harness(target_dir: Path) -> None: harness_src = Path(__file__).parent / "harness_template.py" if harness_src.exists(): dest = target_dir / "experiment_harness.py" dest.write_text( harness_src.read_text(encoding="utf-8"), encoding="utf-8" ) # ------------------------------------------------------------------ # Internals # ------------------------------------------------------------------ def _execute( self, staging_dir: Path, *, entry_point: str, timeout_sec: int ) -> SandboxResult: """Core execution flow for remote experiments. Steps: 1. Create a unique temporary directory on the remote host 2. Upload experiment files via scp 3. Run any user-defined setup commands (pip install, etc.) 4. Execute the experiment (bare Python or Docker container) 5. Parse metrics from stdout (same format as local sandbox) 6. Clean up the remote directory regardless of outcome """ cfg = self.config run_id = f"rc-{uuid.uuid4().hex[:8]}" remote_dir = f"{cfg.remote_workdir}/{run_id}" remote_dir_q = shlex.quote(remote_dir) # 1. Create remote directory mkdir_ok = self._ssh_run(f"mkdir -p {remote_dir_q}") if mkdir_ok.returncode != 0: return SandboxResult( returncode=-1, stdout="", stderr=f"Failed to create remote directory: {mkdir_ok.stderr}", elapsed_sec=0.0, metrics={}, ) # 2. Upload code upload_ok = self._scp_upload(staging_dir, remote_dir) if not upload_ok: self._ssh_run(f"rm -rf {remote_dir_q}", timeout_sec=15) return SandboxResult( returncode=-1, stdout="", stderr=f"Failed to upload code to {cfg.host}:{remote_dir}", elapsed_sec=0.0, metrics={}, ) # 3. Run setup commands (pip install, conda activate, etc.) 
for setup_cmd in cfg.setup_commands: setup_result = self._ssh_run( f"cd {remote_dir_q} && {setup_cmd}", timeout_sec=cfg.setup_timeout_sec, ) if setup_result.returncode != 0: logger.warning( "Setup command failed: %s (exit %d): %s", setup_cmd, setup_result.returncode, setup_result.stderr, ) # 4. Execute experiment if cfg.use_docker: exec_cmd = self._build_docker_exec_cmd( remote_dir, entry_point=entry_point, ) else: exec_cmd = self._build_bare_exec_cmd( remote_dir, entry_point=entry_point, ) start = time.monotonic() result = self._ssh_run(exec_cmd, timeout_sec=timeout_sec) elapsed = time.monotonic() - start timed_out = result.timed_out # 5. Parse metrics from stdout metrics = parse_metrics(result.stdout) # 6. Clean up remote directory self._ssh_run(f"rm -rf {remote_dir_q}", timeout_sec=15) return SandboxResult( returncode=result.returncode, stdout=result.stdout, stderr=result.stderr, elapsed_sec=elapsed, metrics=metrics, timed_out=timed_out, ) def _build_bare_exec_cmd( self, remote_dir: str, *, entry_point: str, ) -> str: """Build command to run Python directly on remote host (with basic sandboxing).""" cfg = self.config rd = shlex.quote(remote_dir) ep = shlex.quote(entry_point) py = shlex.quote(cfg.remote_python) gpu_env = "" if cfg.gpu_ids: gpu_env = f"CUDA_VISIBLE_DEVICES={','.join(str(g) for g in cfg.gpu_ids)} " # Security layers: # 1. HOME override — prevents reading ~/.ssh, ~/.bashrc, etc. # 2. unshare --net — drops network access (Linux only). # 3. If unshare unavailable, still runs with HOME override but # logs a warning so the user knows network isolation is missing. 
return ( f"cd {rd} && " f"if command -v unshare >/dev/null 2>&1; then " f"HOME={rd} " f"{gpu_env}" f"unshare --net {py} -u {ep}; " f"else " f"echo 'WARNING: unshare not available, running without network isolation' >&2; " f"HOME={rd} " f"{gpu_env}" f"{py} -u {ep}; " f"fi" ) def _build_docker_exec_cmd( self, remote_dir: str, *, entry_point: str, ) -> str: """Build command to run inside a Docker container on the remote host. This is the most secure execution mode: code runs in an isolated container with restricted network, memory limits, and no access to the host filesystem beyond the experiment directory. """ cfg = self.config parts = [ "docker", "run", "--rm", "-v", f"{shlex.quote(remote_dir)}:/workspace", "-w", "/workspace", # BUG-DA8-14: Mirror local Docker sandbox security hardening "-e", "HOME=/workspace/.home", "-e", "TORCH_HOME=/workspace/.home/.cache/torch", "-e", "MPLCONFIGDIR=/tmp/matplotlib", f"--memory={cfg.docker_memory_limit_mb}m", f"--shm-size={cfg.docker_shm_size_mb}m", ] # Network isolation if cfg.docker_network_policy == "none": parts.extend(["--network", "none"]) # GPU passthrough if cfg.gpu_ids: device_spec = ",".join(str(g) for g in cfg.gpu_ids) parts.extend(["--gpus", f"device={device_spec}"]) else: # Try to pass all GPUs; fails gracefully if none available parts.extend(["--gpus", "all"]) parts.append(shlex.quote(cfg.docker_image)) parts.extend(["python3", "-u", shlex.quote(entry_point)]) return " ".join(parts) def _ssh_run( self, command: str, *, timeout_sec: int | None = None ) -> _SshResult: """Execute a command on the remote host via ssh.""" if timeout_sec is None: timeout_sec = self.config.timeout_sec cmd = _build_ssh_base(self.config) + [command] try: cp = subprocess.run( cmd, capture_output=True, text=True, timeout=timeout_sec, check=False, ) return _SshResult( returncode=cp.returncode, stdout=cp.stdout, stderr=cp.stderr, ) except subprocess.TimeoutExpired as exc: stdout = exc.stdout or "" stderr = exc.stderr or "" if isinstance(stdout, 
bytes): stdout = stdout.decode("utf-8", errors="replace") if isinstance(stderr, bytes): stderr = stderr.decode("utf-8", errors="replace") return _SshResult( returncode=-1, stdout=stdout, stderr=stderr, timed_out=True, ) except Exception as exc: # noqa: BLE001 return _SshResult( returncode=-1, stdout="", stderr=str(exc), ) def _scp_upload(self, local_dir: Path, remote_dir: str) -> bool: """Upload all files from local_dir to remote_dir via scp.""" cfg = self.config target = f"{_ssh_target(cfg)}:{remote_dir}/" cmd = ["scp", "-r", "-o", "StrictHostKeyChecking=no"] if cfg.port != 22: cmd.extend(["-P", str(cfg.port)]) if cfg.key_path: cmd.extend(["-i", os.path.expanduser(cfg.key_path)]) # Upload all files and directories in the staging directory items = [str(f) for f in local_dir.iterdir()] if not items: return True cmd.extend(items) cmd.append(target) try: cp = subprocess.run( cmd, capture_output=True, text=True, timeout=cfg.scp_timeout_sec, check=False, ) if cp.returncode != 0: logger.error("scp upload failed: %s", cp.stderr.strip()) return cp.returncode == 0 except (subprocess.TimeoutExpired, FileNotFoundError) as exc: logger.error("scp upload error: %s", exc) return False # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- class _SshResult: __slots__ = ("returncode", "stdout", "stderr", "timed_out") def __init__( self, returncode: int, stdout: str, stderr: str, timed_out: bool = False, ) -> None: self.returncode = returncode self.stdout = stdout self.stderr = stderr self.timed_out = timed_out def _ssh_target(cfg: SshRemoteConfig) -> str: """Build user@host string.""" if cfg.user: return f"{cfg.user}@{cfg.host}" return cfg.host def _build_ssh_base( cfg: SshRemoteConfig, extra_opts: list[str] | None = None, ) -> list[str]: """Build the base ssh command with common options. 
*extra_opts* are inserted **before** the hostname so that SSH interprets them as SSH options, not as part of the remote command. """ cmd = [ "ssh", "-o", "StrictHostKeyChecking=no", "-o", "BatchMode=yes", ] if cfg.port != 22: cmd.extend(["-p", str(cfg.port)]) if cfg.key_path: cmd.extend(["-i", os.path.expanduser(cfg.key_path)]) if extra_opts: cmd.extend(extra_opts) cmd.append(_ssh_target(cfg)) return cmd ================================================ FILE: researchclaw/experiment/validator.py ================================================ """Experiment code validation: syntax, security, and import checks. This module provides pre-execution validation for LLM-generated experiment code. It catches common issues *before* running code in the sandbox, enabling automated repair via LLM re-generation. """ from __future__ import annotations import ast import sys from dataclasses import dataclass, field from typing import Any # --------------------------------------------------------------------------- # Data types # --------------------------------------------------------------------------- @dataclass class ValidationIssue: """A single validation finding.""" severity: str # "error" | "warning" category: str # "syntax" | "security" | "import" | "style" message: str line: int | None = None col: int | None = None @dataclass class CodeValidation: """Aggregated validation result for a code snippet.""" issues: list[ValidationIssue] = field(default_factory=list) @property def ok(self) -> bool: return not any(i.severity == "error" for i in self.issues) @property def errors(self) -> list[ValidationIssue]: return [i for i in self.issues if i.severity == "error"] @property def warnings(self) -> list[ValidationIssue]: return [i for i in self.issues if i.severity == "warning"] def summary(self) -> str: errs = len(self.errors) warns = len(self.warnings) if errs == 0 and warns == 0: return "Code validation passed." 
parts: list[str] = [] if errs: parts.append(f"{errs} error(s)") if warns: parts.append(f"{warns} warning(s)") return "Code validation: " + ", ".join(parts) # --------------------------------------------------------------------------- # Dangerous call patterns (security scan) # --------------------------------------------------------------------------- # Fully-qualified call names that are forbidden in experiment code. DANGEROUS_CALLS: frozenset[str] = frozenset( { "os.system", "os.popen", "os.exec", "os.execl", "os.execle", "os.execlp", "os.execlpe", "os.execv", "os.execve", "os.execvp", "os.execvpe", "os.remove", "os.unlink", "os.rmdir", "os.removedirs", "subprocess.call", "subprocess.run", "subprocess.Popen", "subprocess.check_call", "subprocess.check_output", "shutil.rmtree", } ) # Bare built-in names that should never appear in experiment code. DANGEROUS_BUILTINS: frozenset[str] = frozenset( { "eval", "exec", "compile", "__import__", } ) # Modules that should not be imported at all. BANNED_MODULES: frozenset[str] = frozenset( { "subprocess", "shutil", "socket", "http", "urllib", "requests", "ftplib", "smtplib", "ctypes", "signal", } ) # Packages considered safe / always available in experiment sandbox. 
SAFE_STDLIB: frozenset[str] = frozenset( { "abc", "ast", "bisect", "builtins", "collections", "contextlib", "copy", "csv", "dataclasses", "datetime", "decimal", "enum", "functools", "glob", "gzip", "hashlib", "heapq", "io", "itertools", "json", "logging", "math", "operator", "os", # os itself is ok, certain calls aren't "pathlib", "pickle", "pprint", "random", "re", "statistics", "string", "struct", "sys", "tempfile", "textwrap", "time", "traceback", "typing", "unittest", "uuid", "warnings", "zipfile", } ) COMMON_SCIENCE: frozenset[str] = frozenset( { "numpy", "np", "pandas", "pd", "scipy", "sklearn", "matplotlib", "plt", "seaborn", "torch", "tensorflow", "tf", "jax", "transformers", "datasets", "tqdm", "yaml", "pyyaml", "rich", # LLM training stack "peft", "trl", "accelerate", "bitsandbytes", "sentencepiece", "tokenizers", "safetensors", "evaluate", "rouge_score", # Runtime-injected by the experiment harness "experiment_harness", } ) # --------------------------------------------------------------------------- # AST visitor for security checks # --------------------------------------------------------------------------- class _SecurityVisitor(ast.NodeVisitor): """Walk AST to detect dangerous calls and imports.""" def __init__(self) -> None: self.issues: list[ValidationIssue] = [] # -- function calls -- def visit_Call(self, node: ast.Call) -> None: name = _resolve_call_name(node.func) if name in DANGEROUS_BUILTINS: self.issues.append( ValidationIssue( severity="error", category="security", message=f"Dangerous built-in call: {name}()", line=node.lineno, col=node.col_offset, ) ) elif name in DANGEROUS_CALLS: self.issues.append( ValidationIssue( severity="error", category="security", message=f"Dangerous call: {name}()", line=node.lineno, col=node.col_offset, ) ) self.generic_visit(node) # -- import statements -- def visit_Import(self, node: ast.Import) -> None: for alias in node.names: top = alias.name.split(".")[0] if top in BANNED_MODULES: self.issues.append( 
ValidationIssue( severity="error", category="security", message=f"Banned module import: {alias.name}", line=node.lineno, ) ) self.generic_visit(node) def visit_ImportFrom(self, node: ast.ImportFrom) -> None: if node.module: top = node.module.split(".")[0] if top in BANNED_MODULES: self.issues.append( ValidationIssue( severity="error", category="security", message=f"Banned module import: from {node.module}", line=node.lineno, ) ) self.generic_visit(node) def _resolve_call_name(node: ast.expr) -> str: """Best-effort name resolution for a Call node's func.""" if isinstance(node, ast.Name): return node.id if isinstance(node, ast.Attribute): prefix = _resolve_call_name(node.value) if prefix: return f"{prefix}.{node.attr}" return node.attr return "" # --------------------------------------------------------------------------- # Import extractor # --------------------------------------------------------------------------- def extract_imports(code: str) -> set[str]: """Return top-level module names imported by *code*. Returns an empty set if the code can't be parsed. 
""" try: tree = ast.parse(code) except SyntaxError: return set() modules: set[str] = set() for node in ast.walk(tree): if isinstance(node, ast.Import): for alias in node.names: modules.add(alias.name.split(".")[0]) elif isinstance(node, ast.ImportFrom) and node.module: modules.add(node.module.split(".")[0]) return modules # --------------------------------------------------------------------------- # Public validation functions # --------------------------------------------------------------------------- def validate_syntax(code: str) -> CodeValidation: """Check *code* parses as valid Python.""" result = CodeValidation() try: ast.parse(code) except SyntaxError as exc: result.issues.append( ValidationIssue( severity="error", category="syntax", message=str(exc.msg) if exc.msg else str(exc), line=exc.lineno, col=exc.offset, ) ) return result def validate_security(code: str) -> CodeValidation: """Scan *code* AST for dangerous calls and imports.""" result = CodeValidation() try: tree = ast.parse(code) except SyntaxError: # If can't parse, skip security — syntax check will catch it. return result visitor = _SecurityVisitor() visitor.visit(tree) result.issues.extend(visitor.issues) return result def validate_imports( code: str, available: set[str] | None = None, ) -> CodeValidation: """Check that all imported modules are available. *available* defaults to ``SAFE_STDLIB | COMMON_SCIENCE`` plus any modules already in ``sys.modules``. 
""" result = CodeValidation() if available is None: available = set(SAFE_STDLIB) | set(COMMON_SCIENCE) | set(sys.modules.keys()) imports = extract_imports(code) for mod in sorted(imports): if mod not in available: result.issues.append( ValidationIssue( severity="warning", category="import", message=f"Module '{mod}' may not be available in sandbox", ) ) return result def validate_code( code: str, *, available_packages: set[str] | None = None, skip_security: bool = False, skip_imports: bool = False, ) -> CodeValidation: """Run all validations and return a combined :class:`CodeValidation`. 1. Syntax check (always) 2. Security scan (unless *skip_security*) 3. Import availability (unless *skip_imports*) """ combined = CodeValidation() # 1. Syntax syntax = validate_syntax(code) combined.issues.extend(syntax.issues) if not syntax.ok: # No point running further checks if code doesn't parse return combined # 2. Security if not skip_security: security = validate_security(code) combined.issues.extend(security.issues) # 3. Import availability if not skip_imports: imp = validate_imports(code, available=available_packages) combined.issues.extend(imp.issues) return combined # --------------------------------------------------------------------------- # Error description helper (for LLM repair prompt) # --------------------------------------------------------------------------- def format_issues_for_llm(validation: CodeValidation) -> str: """Format validation issues as a concise error report for LLM repair.""" if validation.ok and not validation.warnings: return "No issues found." 
lines: list[str] = [] for issue in validation.issues: loc = f"line {issue.line}" if issue.line else "unknown location" lines.append( f"- [{issue.severity.upper()}] ({issue.category}) {issue.message} @ {loc}" ) return "\n".join(lines) # --------------------------------------------------------------------------- # Code complexity and quality checks (R10-Fix6) # --------------------------------------------------------------------------- def check_code_complexity(code: str) -> list[str]: """Check whether generated experiment code is too simplistic. Returns a list of warning strings. Empty list means no quality concerns. """ warnings: list[str] = [] # Count non-blank, non-comment, non-import lines effective_lines = [ l for l in code.splitlines() if l.strip() and not l.strip().startswith("#") and not l.strip().startswith(("import ", "from ")) ] if len(effective_lines) < 10: warnings.append( f"Code has only {len(effective_lines)} effective lines " f"(excluding blanks/comments/imports) — likely too simple for " f"a research experiment" ) # Check for trivially short functions/methods try: tree = ast.parse(code) func_count = 0 for node in ast.walk(tree): if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): func_count += 1 if func_count == 0 and len(effective_lines) > 5: warnings.append( "Code has no function definitions — research experiments " "should be structured with reusable functions" ) except SyntaxError: pass # Check for hardcoded metrics (a common LLM failure mode) import re hardcoded_patterns = [ (r"print\(['\"].*:\s*0\.\d+['\"]\)", "print statement with hardcoded metric value"), (r"metric.*=\s*0\.\d{2,}", "hardcoded metric assignment"), ] for pattern, desc in hardcoded_patterns: if re.search(pattern, code): warnings.append(f"Possible hardcoded metric: {desc}") # Check for trivial computation patterns trivial_patterns = [ ("sum(x**2)", "trivial sum-of-squares computation"), ("np.sum(x**2)", "trivial sum-of-squares computation"), ("0.3 + idx * 0.03", 
"formulaic/simulated metric generation"), ] for pattern, desc in trivial_patterns: if pattern in code: warnings.append(f"Trivial computation detected: {desc}") return warnings # --------------------------------------------------------------------------- # Deep code quality analysis (Phase 1 / P1.1 + P1.2) # --------------------------------------------------------------------------- def check_class_quality(all_files: dict[str, str]) -> list[str]: """Analyze class implementations across all experiment files. Detects: - Empty or trivial class inheritance (class B(A): pass) - Classes with too few methods (< 2 non-dunder) - Duplicate class bodies (identical forward/train logic across variants) - nn.Module created inside forward() instead of __init__() """ warnings: list[str] = [] class_info: dict[str, dict[str, Any]] = {} for fname, code in all_files.items(): if not fname.endswith(".py"): continue try: tree = ast.parse(code) except SyntaxError: continue for node in ast.walk(tree): if not isinstance(node, ast.ClassDef): continue cls_name = node.name methods: list[str] = [] method_sources: dict[str, str] = {} has_forward_new_module = False body_lines = 0 for item in ast.walk(node): if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)): methods.append(item.name) # Approximate method body size m_start = item.lineno m_end = item.end_lineno or item.lineno body_len = m_end - m_start method_sources[item.name] = f"{fname}:{m_start}-{m_end}" # Check for nn.Module creation inside forward() if item.name in ("forward", "__call__"): for sub in ast.walk(item): if isinstance(sub, ast.Call): call_name = _resolve_call_name(sub.func) if call_name.startswith("nn.") and call_name != "nn.Module": has_forward_new_module = True # Count effective body lines code_lines = code.splitlines() if node.end_lineno and node.lineno: cls_body = code_lines[node.lineno - 1 : node.end_lineno] body_lines = sum( 1 for l in cls_body if l.strip() and not l.strip().startswith("#") and not 
l.strip().startswith(("import ", "from ")) ) non_dunder = [m for m in methods if not m.startswith("__")] class_info[f"{fname}:{cls_name}"] = { "methods": methods, "non_dunder": non_dunder, "body_lines": body_lines, "file": fname, "has_forward_new_module": has_forward_new_module, } # --- Check 1: Empty or trivial class --- if body_lines <= 2: warnings.append( f"[{fname}] Class '{cls_name}' has only {body_lines} body lines " f"— likely an empty or trivial subclass (class B(A): pass)" ) # --- Check 2: Too few methods for an algorithm class --- if body_lines > 5 and len(non_dunder) < 2: warnings.append( f"[{fname}] Class '{cls_name}' has only {len(non_dunder)} " f"non-dunder method(s) — algorithm classes should have at " f"least __init__ + one core method (forward/train_step/predict)" ) # --- Check 3: nn.Module created in forward() --- if has_forward_new_module: warnings.append( f"[{fname}] Class '{cls_name}' creates nn.Module (nn.Linear etc.) " f"inside forward() — these modules are unregistered and untrained. " f"Move to __init__() and register as submodules." 
) # --- Check 4: Duplicate class implementations --- # Compare class body hashes to find copy-paste variants class_names = list(class_info.keys()) for i, name_a in enumerate(class_names): info_a = class_info[name_a] for name_b in class_names[i + 1:]: info_b = class_info[name_b] if ( info_a["body_lines"] > 5 and info_b["body_lines"] > 5 and info_a["non_dunder"] == info_b["non_dunder"] and abs(info_a["body_lines"] - info_b["body_lines"]) <= 2 ): # Same methods, same body size — likely duplicates warnings.append( f"Classes '{name_a.split(':')[1]}' and '{name_b.split(':')[1]}' " f"have identical method signatures and similar body sizes " f"({info_a['body_lines']} vs {info_b['body_lines']} lines) — " f"may be copy-paste variants with no real algorithmic difference" ) # --- Check 5: Ablation subclasses must override with different logic --- # Parse inheritance relationships and compare method ASTs for fname_code, code in all_files.items(): if not fname_code.endswith(".py"): continue try: tree = ast.parse(code) except SyntaxError: continue # Build {class_name: ClassDef} map for this file file_classes: dict[str, ast.ClassDef] = {} for node in ast.walk(tree): if isinstance(node, ast.ClassDef): file_classes[node.name] = node for cls_name, cls_node in file_classes.items(): # Check if this class inherits from another class in the same file for base in cls_node.bases: base_name = None if isinstance(base, ast.Name): base_name = base.id elif isinstance(base, ast.Attribute): base_name = base.attr if not base_name or base_name not in file_classes: continue parent_node = file_classes[base_name] # Get method bodies as AST dumps for comparison child_methods = { m.name: ast.dump(m) for m in cls_node.body if isinstance(m, (ast.FunctionDef, ast.AsyncFunctionDef)) and not m.name.startswith("__") } parent_methods = { m.name: ast.dump(m) for m in parent_node.body if isinstance(m, (ast.FunctionDef, ast.AsyncFunctionDef)) and not m.name.startswith("__") } if not child_methods: # Already 
caught by Check 1 (empty class) continue # Check if all overridden methods have identical AST to parent identical_count = 0 override_count = 0 for method_name, method_dump in child_methods.items(): if method_name in parent_methods: override_count += 1 if method_dump == parent_methods[method_name]: identical_count += 1 if override_count > 0 and identical_count == override_count: warnings.append( f"[{fname_code}] Class '{cls_name}' inherits from " f"'{base_name}' and overrides {override_count} method(s), " f"but ALL overridden methods have identical AST to parent " f"— this is NOT a real ablation. Methods must differ." ) elif override_count == 0 and len(child_methods) > 0: # Has methods but none override parent — might be fine # (new methods that parent doesn't have) pass # --- Check 6: Ablation subclass must override >=1 parent method --- _lname = cls_name.lower() if ("ablation" in _lname or "no_" in _lname or "without" in _lname): parent_non_dunder = { m.name for m in parent_node.body if isinstance(m, (ast.FunctionDef, ast.AsyncFunctionDef)) and not m.name.startswith("__") } child_overrides = set(child_methods.keys()) & parent_non_dunder if not child_overrides and parent_non_dunder: warnings.append( f"[{fname_code}] Ablation class '{cls_name}' inherits " f"from '{base_name}' but does NOT override any of its " f"methods ({', '.join(sorted(parent_non_dunder))}). " f"An ablation MUST override the method that removes " f"the ablated component." ) return warnings def check_variable_scoping(code: str, fname: str = "main.py") -> list[str]: """Detect common variable scoping bugs in experiment code. Catches the pattern where a variable is defined inside an if-branch but used outside that branch (UnboundLocalError at runtime). 
""" warnings: list[str] = [] try: tree = ast.parse(code) except SyntaxError: return warnings for node in ast.walk(tree): if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): continue # Collect variables assigned only inside if/elif/else branches if_only_vars: dict[str, int] = {} top_level_vars: set[str] = set() for child in ast.iter_child_nodes(node): if isinstance(child, ast.If): _collect_if_only_assignments(child, if_only_vars) elif isinstance(child, (ast.Assign, ast.AugAssign, ast.AnnAssign)): for target in _extract_assign_targets(child): top_level_vars.add(target) # Check for variables used after the if block but only defined inside it for var_name, var_line in if_only_vars.items(): if var_name not in top_level_vars: # Check if this variable is used later in the function for later_node in ast.walk(node): if ( isinstance(later_node, ast.Name) and later_node.id == var_name and isinstance(later_node.ctx, ast.Load) and later_node.lineno > var_line ): warnings.append( f"[{fname}:{var_line}] Variable '{var_name}' is assigned " f"only inside an if-branch but used at line " f"{later_node.lineno} — will cause UnboundLocalError " f"if the branch is not taken" ) break return warnings def _collect_if_only_assignments( if_node: ast.If, result: dict[str, int] ) -> None: """Collect variables assigned only inside if/elif branches.""" for child in ast.iter_child_nodes(if_node): if isinstance(child, (ast.Assign, ast.AugAssign, ast.AnnAssign)): for target in _extract_assign_targets(child): result[target] = child.lineno elif isinstance(child, ast.If): _collect_if_only_assignments(child, result) def _extract_assign_targets(node: ast.AST) -> list[str]: """Extract variable names from assignment targets.""" names: list[str] = [] if isinstance(node, ast.Assign): for target in node.targets: if isinstance(target, ast.Name): names.append(target.id) elif isinstance(node, ast.AugAssign): if isinstance(node.target, ast.Name): names.append(node.target.id) elif isinstance(node, 
ast.AnnAssign): if isinstance(node.target, ast.Name): names.append(node.target.id) return names def auto_fix_unbound_locals(code: str) -> tuple[str, int]: """Programmatically fix UnboundLocalError patterns. For each variable assigned only inside an if-branch but used later, insert ``var = None`` before the if-statement. Returns (fixed_code, num_fixes). """ try: tree = ast.parse(code) except SyntaxError: return code, 0 lines = code.splitlines(keepends=True) insertions: dict[int, list[str]] = {} # lineno -> lines to insert before for node in ast.walk(tree): if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): continue if_only_vars: dict[str, int] = {} top_level_vars: set[str] = set() if_line_map: dict[str, int] = {} # var -> if-statement lineno for child in ast.iter_child_nodes(node): if isinstance(child, ast.If): before: dict[str, int] = {} _collect_if_only_assignments(child, before) for var_name, var_line in before.items(): if_only_vars[var_name] = var_line if_line_map[var_name] = child.lineno elif isinstance(child, (ast.Assign, ast.AugAssign, ast.AnnAssign)): for target in _extract_assign_targets(child): top_level_vars.add(target) for var_name, var_line in if_only_vars.items(): if var_name in top_level_vars: continue # Confirm it's actually used later used_later = False for later_node in ast.walk(node): if ( isinstance(later_node, ast.Name) and later_node.id == var_name and isinstance(later_node.ctx, ast.Load) and later_node.lineno > var_line ): used_later = True break if not used_later: continue if_lineno = if_line_map.get(var_name) if if_lineno is None: continue # Determine indentation of the if-statement if if_lineno <= len(lines): if_line = lines[if_lineno - 1] indent = if_line[: len(if_line) - len(if_line.lstrip())] else: indent = " " insertions.setdefault(if_lineno, []) fix_line = f"{indent}{var_name} = None\n" if fix_line not in insertions[if_lineno]: insertions[if_lineno].append(fix_line) if not insertions: return code, 0 # Apply insertions in 
    # reverse line order to keep line numbers stable
    num_fixes = sum(len(v) for v in insertions.values())
    for lineno in sorted(insertions, reverse=True):
        idx = lineno - 1
        for fix_line in reversed(insertions[lineno]):
            lines.insert(idx, fix_line)

    return "".join(lines), num_fixes


def check_api_correctness(code: str, fname: str = "main.py") -> list[str]:
    """Detect common API misuse patterns.

    Catches:
    - np.erf() (should be scipy.special.erf)
    - nn.Linear/nn.Conv2d inside forward() (unregistered module)
    - random.seed() without numpy.random.seed() (incomplete seeding)
    - NumPy 2.0 removed APIs (.ptp(), np.bool, etc.)
    """
    import re as _re

    warnings: list[str] = []
    lines = code.splitlines()

    for i, line in enumerate(lines, 1):
        stripped = line.strip()
        if stripped.startswith("#"):
            continue
        # np.erf doesn't exist
        if _re.search(r"\bnp\.erf\b", stripped):
            warnings.append(
                f"[{fname}:{i}] np.erf() does not exist — use "
                f"scipy.special.erf() or math.erf() instead"
            )
        # NumPy 2.0 removed ndarray methods
        if _re.search(r"\.ptp\s*\(", stripped):
            warnings.append(
                f"[{fname}:{i}] ndarray.ptp() was removed in NumPy 2.0 — "
                f"use np.ptp(arr) or arr.max() - arr.min() instead"
            )
        # NumPy 2.0 removed type aliases
        for old_alias in ("np.bool", "np.int", "np.float", "np.complex",
                          "np.object", "np.str"):
            # Negative lookahead avoids matching np.bool_, np.int64, etc.
            pattern = _re.escape(old_alias) + r"(?![_\w\d])"
            if _re.search(pattern, stripped):
                warnings.append(
                    f"[{fname}:{i}] {old_alias} was removed in NumPy 2.0 — "
                    f"use {old_alias}_ or Python builtin instead"
                )
        # np.random.RandomState with hardcoded seed in a function called multiple times
        if _re.search(r"RandomState\(\s*\d+\s*\)", stripped) and "def " not in stripped:
            warnings.append(
                f"[{fname}:{i}] Hardcoded RandomState seed inside a loop/function "
                f"may produce identical results across calls — pass seed as parameter"
            )

    # --- Import-usage mismatch detection ---
    # Detect `from X import Y` followed by `X.Y(...)` — guaranteed NameError
    import_from_map: dict[str, set[str]] = {}  # module -> {names}
    import_module_set: set[str] = set()  # modules imported with `import X`
    for i, line in enumerate(lines, 1):
        stripped = line.strip()
        m = _re.match(r"from\s+([\w.]+)\s+import\s+(.+)", stripped)
        if m:
            mod = m.group(1)
            # Keep the bound name (after `as`) for each imported symbol.
            names = {n.strip().split(" as ")[-1].strip() for n in m.group(2).split(",")}
            import_from_map.setdefault(mod, set()).update(names)
        elif _re.match(r"import\s+([\w.]+)", stripped) and "from" not in stripped:
            m2 = _re.match(r"import\s+([\w.]+)", stripped)
            if m2:
                import_module_set.add(m2.group(1).split(".")[0])

    # Now scan for qualified calls to modules that were only from-imported
    for i, line in enumerate(lines, 1):
        stripped = line.strip()
        if stripped.startswith("#"):
            continue
        for mod, _names in import_from_map.items():
            top_mod = mod.split(".")[0]
            # Only flag if the module was NOT also imported via `import X`
            if top_mod in import_module_set:
                continue
            # Check for `module.name(...)` usage when `name` was from-imported
            for name in _names:
                pattern = _re.escape(f"{mod}.{name}") + r"\s*\("
                if _re.search(pattern, stripped):
                    warnings.append(
                        f"[{fname}:{i}] Import-usage mismatch: '{name}' was imported "
                        f"via `from {mod} import {name}` but called as `{mod}.{name}()` "
                        f"— this will raise NameError. Use `{name}()` directly."
                    )

    return warnings


def check_undefined_calls(code: str, fname: str = "main.py") -> list[str]:
    """Detect calls to undefined functions/names in experiment code.

    Catches the pattern where a function is called but never defined or
    imported, which would cause NameError at runtime.
""" warnings: list[str] = [] try: tree = ast.parse(code) except SyntaxError: return warnings # Common builtins that are always available builtins = { "print", "len", "range", "enumerate", "zip", "map", "filter", "sorted", "list", "dict", "set", "tuple", "str", "int", "float", "bool", "bytes", "type", "isinstance", "issubclass", "hasattr", "getattr", "setattr", "delattr", "callable", "iter", "next", "reversed", "slice", "super", "property", "staticmethod", "classmethod", "abs", "all", "any", "bin", "chr", "ord", "hex", "oct", "pow", "round", "sum", "min", "max", "open", "input", "repr", "hash", "id", "dir", "vars", "globals", "locals", "format", "ascii", "object", "Exception", "ValueError", "TypeError", "KeyError", "IndexError", "AttributeError", "RuntimeError", "StopIteration", "NotImplementedError", "AssertionError", "ImportError", "FileNotFoundError", "OSError", "IOError", "ZeroDivisionError", "OverflowError", "MemoryError", "RecursionError", "SystemExit", "KeyboardInterrupt", "GeneratorExit", "BaseException", "Warning", "DeprecationWarning", "UserWarning", "FutureWarning", "PendingDeprecationWarning", "SyntaxWarning", "RuntimeWarning", "ResourceWarning", "BytesWarning", "UnicodeWarning", "breakpoint", "memoryview", "bytearray", "frozenset", "complex", "divmod", "eval", "exec", "compile", "__import__", "help", "exit", "quit", } # Collect all defined names in the module defined_names: set[str] = set() for node in ast.walk(tree): # Function definitions if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): defined_names.add(node.name) # Class definitions elif isinstance(node, ast.ClassDef): defined_names.add(node.name) # Imports elif isinstance(node, ast.Import): for alias in node.names: name = alias.asname if alias.asname else alias.name.split(".")[0] defined_names.add(name) elif isinstance(node, ast.ImportFrom): for alias in node.names: name = alias.asname if alias.asname else alias.name if name != "*": defined_names.add(name) # Assignments (including 
comprehensions) elif isinstance(node, ast.Assign): for target in node.targets: if isinstance(target, ast.Name): defined_names.add(target.id) elif isinstance(target, ast.Tuple): for elt in target.elts: if isinstance(elt, ast.Name): defined_names.add(elt.id) elif isinstance(node, ast.AnnAssign): if isinstance(node.target, ast.Name): defined_names.add(node.target.id) elif isinstance(node, ast.AugAssign): if isinstance(node.target, ast.Name): defined_names.add(node.target.id) # For loop targets elif isinstance(node, ast.For): if isinstance(node.target, ast.Name): defined_names.add(node.target.id) elif isinstance(node.target, ast.Tuple): for elt in node.target.elts: if isinstance(elt, ast.Name): defined_names.add(elt.id) # With statement targets elif isinstance(node, ast.With): for item in node.items: if item.optional_vars and isinstance(item.optional_vars, ast.Name): defined_names.add(item.optional_vars.id) # Exception handlers elif isinstance(node, ast.ExceptHandler): if node.name: defined_names.add(node.name) # Named expressions (walrus operator) elif isinstance(node, ast.NamedExpr): if isinstance(node.target, ast.Name): defined_names.add(node.target.id) # Also collect function parameters for node in ast.walk(tree): if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): for arg in node.args.args: defined_names.add(arg.arg) for arg in node.args.posonlyargs: defined_names.add(arg.arg) for arg in node.args.kwonlyargs: defined_names.add(arg.arg) if node.args.vararg: defined_names.add(node.args.vararg.arg) if node.args.kwarg: defined_names.add(node.args.kwarg.arg) # Now find all function calls to bare names (not attributes like obj.method()) for node in ast.walk(tree): if isinstance(node, ast.Call): # Only check bare name calls, not attribute calls (obj.method()) if isinstance(node.func, ast.Name): call_name = node.func.id if ( call_name not in defined_names and call_name not in builtins ): warnings.append( f"[{fname}:{node.lineno}] Call to undefined function " 
f"'{call_name}()' — this will raise NameError at runtime. " f"Either define the function or remove the call." ) return warnings def check_filename_collisions(files: dict[str, str]) -> list[str]: """BUG-202: Detect local .py filenames that shadow pip/stdlib packages. The LLM commonly generates ``config.py``, ``models.py``, etc. which get shadowed by pip-installed packages (e.g. ``pip install config``). The result is an import crash at runtime. """ # Filenames (without .py) that are known to collide with pip/stdlib packages. _SHADOW_RISK: set[str] = { # pip packages frequently installed as transitive deps "config", "test", "tests", "types", "typing_extensions", # stdlib modules the LLM might accidentally shadow "io", "logging", "json", "time", "random", "copy", "math", "os", "sys", "collections", "functools", "abc", "re", "statistics", "signal", "pickle", "itertools", "string", "tokenize", "token", "email", "calendar", "numbers", "operator", "queue", "code", "profile", } warnings: list[str] = [] for fname in files: stem = fname.removesuffix(".py") if fname.endswith(".py") else None if stem and stem in _SHADOW_RISK: warnings.append( f"[{fname}] Filename shadows stdlib/pip package '{stem}'. " f"Rename to e.g. '{stem}_config.py' or 'experiment_{stem}.py' " f"to avoid import collisions at runtime." ) return warnings def deep_validate_files( files: dict[str, str], ) -> list[str]: """Run all deep quality checks across all experiment files. Returns a list of warning strings. Empty = no concerns. 
""" warnings: list[str] = [] warnings.extend(check_class_quality(files)) warnings.extend(check_filename_collisions(files)) for fname, code in files.items(): if not fname.endswith(".py"): continue warnings.extend(check_variable_scoping(code, fname)) warnings.extend(check_api_correctness(code, fname)) return warnings ================================================ FILE: researchclaw/experiment/visualize.py ================================================ """Experiment result visualization. Generates publication-quality charts from experiment run data: - Condition comparison (grouped bar chart with CI error bars) - Metric heatmap (condition × metric matrix) - Metric trajectory (line chart with markers) - Ablation delta chart (horizontal bar showing delta from baseline) - Pipeline execution timeline - Iteration score history Uses Paul Tol colorblind-safe palette and academic styling. """ from __future__ import annotations import json import logging import math from pathlib import Path from typing import Any logger = logging.getLogger(__name__) try: import matplotlib matplotlib.use("Agg") # Non-interactive backend import matplotlib.pyplot as plt import matplotlib.colors as mcolors from matplotlib.patches import FancyBboxPatch import numpy as np HAS_MATPLOTLIB = True except ImportError: HAS_MATPLOTLIB = False # Paul Tol "bright" palette — colorblind-safe, publication-ready _PAUL_TOL_BRIGHT = [ "#4477AA", # blue "#EE6677", # red/pink "#228833", # green "#CCBB44", # yellow "#66CCEE", # cyan "#AA3377", # purple "#BBBBBB", # grey ] # Extended palette for many conditions (Tol muted + bright merged) _PAUL_TOL_EXTENDED = [ "#4477AA", "#EE6677", "#228833", "#CCBB44", "#66CCEE", "#AA3377", "#332288", "#88CCEE", "#44AA99", "#117733", "#999933", "#CC6677", "#882255", "#661100", "#6699CC", ] # Metrics to exclude from comparison charts (timing, meta, non-scientific) _EXCLUDED_METRICS: set[str] = { "time_budget_sec", "elapsed_sec", "elapsed_time", "execution_time", "wall_time", 
"runtime_sec", "total_time", "timeout", "seed", "seed_count", "n_seeds", "num_seeds", "success_rate", "num_conditions", "total_conditions", "calibration_iterations", } # Prefixes that indicate meta/timing metrics _EXCLUDED_PREFIXES: tuple[str, ...] = ("time_", "runtime_", "elapsed_", "wall_") def _is_excluded_metric(name: str) -> bool: """Return True if *name* is a timing/meta metric that shouldn't be charted.""" low = name.lower() if low in _EXCLUDED_METRICS: return True return any(low.startswith(p) for p in _EXCLUDED_PREFIXES) def _shorten_label(name: str, max_len: int = 22) -> str: """Shorten a metric label for chart readability.""" if len(name) <= max_len: return name return name[: max_len - 1] + "\u2026" def _format_cond_name(name: str) -> str: """Format condition name for display: underscores → spaces, title case.""" return name.replace("_", " ").title() def _ensure_dir(path: Path) -> Path: path.parent.mkdir(parents=True, exist_ok=True) return path def _setup_academic_style() -> None: """Apply academic styling via rcParams.""" plt.rcParams.update({ "font.family": "serif", "font.size": 11, "axes.labelsize": 12, "axes.titlesize": 13, "axes.titleweight": "bold", "xtick.labelsize": 10, "ytick.labelsize": 10, "legend.fontsize": 10, "figure.dpi": 300, "savefig.dpi": 300, "savefig.bbox": "tight", "axes.spines.top": False, "axes.spines.right": False, "axes.grid": True, "grid.alpha": 0.3, "grid.linestyle": "--", "axes.axisbelow": True, }) # --------------------------------------------------------------------------- # 1. Condition comparison — grouped bar with CI error bars # --------------------------------------------------------------------------- def plot_condition_comparison( condition_summaries: dict[str, dict[str, Any]], output_path: Path, *, metric_key: str = "primary_metric", title: str = "", ) -> Path | None: """Bar chart comparing conditions with mean +/- 95% CI error bars. Uses Paul Tol colorblind-safe palette with gradient shading. 
""" if not HAS_MATPLOTLIB or not condition_summaries: return None _setup_academic_style() names: list[str] = [] means: list[float] = [] ci_low: list[float] = [] ci_high: list[float] = [] for cond, info in condition_summaries.items(): m = info.get("metrics", {}) mean_val = m.get(f"{metric_key}_mean") or m.get(metric_key) if mean_val is None: continue fmean = float(mean_val) names.append(_format_cond_name(cond)) means.append(fmean) ci_low.append(float(info.get("ci95_low", fmean))) ci_high.append(float(info.get("ci95_high", fmean))) if not names: return None yerr_lo = [max(0, m - lo) for m, lo in zip(means, ci_low)] yerr_hi = [max(0, hi - m) for m, hi in zip(means, ci_high)] n = len(names) colors = [_PAUL_TOL_EXTENDED[i % len(_PAUL_TOL_EXTENDED)] for i in range(n)] fig, ax = plt.subplots(figsize=(max(7, n * 1.2), 5)) x = np.arange(n) bars = ax.bar( x, means, color=colors, alpha=0.88, edgecolor="white", linewidth=0.8, width=0.7, ) ax.errorbar( x, means, yerr=[yerr_lo, yerr_hi], fmt="none", ecolor="#333333", capsize=5, capthick=1.5, linewidth=1.5, ) # Value labels above bars y_max = max(m + h for m, h in zip(means, yerr_hi)) if yerr_hi else max(means) offset = y_max * 0.025 for i, m in enumerate(means): ax.text( i, m + yerr_hi[i] + offset, f"{m:.3f}", ha="center", va="bottom", fontsize=9, fontweight="bold", color="#333", ) ax.set_xlabel("Method / Condition") metric_label = metric_key.replace("_", " ").title() ax.set_ylabel(metric_label) ax.set_title(title or f"{metric_label} Comparison (Mean \u00b1 95% CI)") ax.set_xticks(x) ax.set_xticklabels(names, rotation=30, ha="right") ax.set_ylim(bottom=0) fig.tight_layout() fig.savefig(_ensure_dir(output_path), dpi=300, bbox_inches="tight") plt.close(fig) logger.info("Saved condition comparison: %s", output_path) return output_path # --------------------------------------------------------------------------- # 2. 
Metric heatmap — condition × metric matrix # --------------------------------------------------------------------------- def plot_metric_heatmap( condition_summaries: dict[str, dict[str, Any]], output_path: Path, *, title: str = "Performance Heatmap (Per-Condition Metrics)", max_metrics: int = 12, ) -> Path | None: """Heatmap of normalized metric values across conditions. Shows per-condition performance normalized to [0, 1] per metric column. """ if not HAS_MATPLOTLIB or not condition_summaries: return None _setup_academic_style() # Collect all metric keys across conditions all_metric_keys: set[str] = set() for info in condition_summaries.values(): m = info.get("metrics", {}) for k in m: if not _is_excluded_metric(k) and not k.endswith("_std"): all_metric_keys.add(k) # Filter to _mean variants or raw metrics, deduplicate clean_keys: list[str] = [] for k in sorted(all_metric_keys): base = k.replace("_mean", "") if base not in [ck.replace("_mean", "") for ck in clean_keys]: clean_keys.append(k) if len(clean_keys) < 2: return None clean_keys = clean_keys[:max_metrics] cond_names = list(condition_summaries.keys()) # Build matrix data = np.zeros((len(cond_names), len(clean_keys))) for i, cond in enumerate(cond_names): m = condition_summaries[cond].get("metrics", {}) for j, mk in enumerate(clean_keys): val = m.get(mk, 0) try: data[i, j] = float(val) except (ValueError, TypeError): data[i, j] = 0 # Normalize per column (min-max) for j in range(data.shape[1]): col = data[:, j] lo, hi = col.min(), col.max() if hi > lo: data[:, j] = (col - lo) / (hi - lo) else: data[:, j] = 0.5 fig, ax = plt.subplots( figsize=(max(6, len(clean_keys) * 1.3), max(4, len(cond_names) * 0.6)) ) im = ax.imshow(data, cmap="YlGnBu", aspect="auto", vmin=0, vmax=1) # Annotate cells for i in range(len(cond_names)): for j in range(len(clean_keys)): val = data[i, j] color = "white" if val > 0.6 else "#333" ax.text(j, i, f"{val:.2f}", ha="center", va="center", fontsize=8, color=color, fontweight="bold") 
ax.set_xticks(range(len(clean_keys))) ax.set_xticklabels( [_shorten_label(k.replace("_mean", "").replace("_", " ")) for k in clean_keys], rotation=35, ha="right", fontsize=9, ) ax.set_yticks(range(len(cond_names))) ax.set_yticklabels([_format_cond_name(c) for c in cond_names], fontsize=9) ax.set_title(title) cbar = fig.colorbar(im, ax=ax, shrink=0.8, label="Normalized Score") fig.tight_layout() fig.savefig(_ensure_dir(output_path), dpi=300, bbox_inches="tight") plt.close(fig) logger.info("Saved metric heatmap: %s", output_path) return output_path # --------------------------------------------------------------------------- # 3. Ablation delta chart — horizontal bars showing improvement over baseline # --------------------------------------------------------------------------- def plot_ablation_deltas( condition_summaries: dict[str, dict[str, Any]], output_path: Path, *, metric_key: str = "primary_metric", baseline_name: str = "", title: str = "", higher_is_better: bool = True, ) -> Path | None: """Horizontal bar chart showing delta from baseline for each ablation. Bars go left (worse) or right (better) from zero. 
""" if not HAS_MATPLOTLIB or not condition_summaries: return None _setup_academic_style() # Find baseline cond_keys = list(condition_summaries.keys()) if baseline_name: base_key = baseline_name else: # Heuristic: pick "baseline", "heuristic_baseline", or first condition for candidate in ["baseline", "heuristic_baseline", "random_baseline"]: if candidate in cond_keys: base_key = candidate break else: base_key = cond_keys[0] base_info = condition_summaries.get(base_key, {}) base_m = base_info.get("metrics", {}) base_val = float(base_m.get(f"{metric_key}_mean") or base_m.get(metric_key, 0)) if base_val == 0: return None names: list[str] = [] deltas: list[float] = [] for cond, info in condition_summaries.items(): if cond == base_key: continue m = info.get("metrics", {}) val = m.get(f"{metric_key}_mean") or m.get(metric_key) if val is None: continue fval = float(val) pct = ((fval - base_val) / abs(base_val)) * 100 names.append(_format_cond_name(cond)) deltas.append(pct) if not names: return None # Sort by delta pairs = sorted(zip(deltas, names), reverse=True) deltas, names = zip(*pairs) deltas = list(deltas) names = list(names) fig, ax = plt.subplots(figsize=(8, max(4, len(names) * 0.5))) y = np.arange(len(names)) bar_colors = [] for d in deltas: if higher_is_better: bar_colors.append("#228833" if d > 0 else "#EE6677") else: bar_colors.append("#228833" if d < 0 else "#EE6677") ax.barh(y, deltas, color=bar_colors, alpha=0.85, edgecolor="white", height=0.6) ax.axvline(x=0, color="#333", linewidth=1, linestyle="-") # Value labels for i, d in enumerate(deltas): ha = "left" if d >= 0 else "right" offset = 0.5 if d >= 0 else -0.5 ax.text(d + offset, i, f"{d:+.1f}%", ha=ha, va="center", fontsize=9, fontweight="bold") ax.set_yticks(y) ax.set_yticklabels(names) ax.set_xlabel(f"\u0394 {metric_key.replace('_', ' ').title()} vs. 
Baseline (%)") ax.set_title(title or f"Ablation Analysis (Baseline: {_format_cond_name(base_key)})") ax.invert_yaxis() fig.tight_layout() fig.savefig(_ensure_dir(output_path), dpi=300, bbox_inches="tight") plt.close(fig) logger.info("Saved ablation deltas: %s", output_path) return output_path # --------------------------------------------------------------------------- # 4. Metric trajectory — line chart across refinement iterations # --------------------------------------------------------------------------- def plot_metric_trajectory( runs: list[dict[str, Any]], metric_key: str, output_path: Path, *, title: str = "", ) -> Path | None: """Plot metric values across runs as a styled line chart with markers.""" if not HAS_MATPLOTLIB or not runs: return None _setup_academic_style() values: list[float] = [] labels: list[str] = [] for i, r in enumerate(runs): m = r.get("metrics") or r.get("key_metrics") or {} if isinstance(m, dict) and metric_key in m: try: values.append(float(m[metric_key])) labels.append(r.get("run_id", f"Iter {i + 1}")) except (ValueError, TypeError): continue if not values: return None fig, ax = plt.subplots(figsize=(max(6, len(values) * 1.5), 4.5)) x = range(len(values)) ax.plot( x, values, "o-", color=_PAUL_TOL_BRIGHT[0], linewidth=2.5, markersize=8, markerfacecolor="white", markeredgewidth=2, markeredgecolor=_PAUL_TOL_BRIGHT[0], ) # Fill area under curve ax.fill_between(x, values, alpha=0.08, color=_PAUL_TOL_BRIGHT[0]) # Value annotations for i, v in enumerate(values): ax.annotate( f"{v:.4f}", (i, v), textcoords="offset points", xytext=(0, 12), ha="center", fontsize=9, fontweight="bold", ) metric_label = metric_key.replace("_", " ").title() ax.set_xlabel("Refinement Iteration") ax.set_ylabel(metric_label) ax.set_title(title or f"{metric_label} Across Iterations") ax.set_xticks(list(x)) ax.set_xticklabels( [_shorten_label(lb, 15) for lb in labels], rotation=30, ha="right", ) fig.tight_layout() fig.savefig(_ensure_dir(output_path), dpi=300, 
bbox_inches="tight") plt.close(fig) logger.info("Saved metric trajectory: %s", output_path) return output_path # --------------------------------------------------------------------------- # 5. Experiment comparison — multi-metric bar chart # --------------------------------------------------------------------------- def plot_experiment_comparison( metrics_summary: dict[str, dict[str, float]], output_path: Path, *, title: str = "Experiment Results Comparison", ) -> Path | None: """Grouped bar chart comparing mean/min/max across metrics.""" if not HAS_MATPLOTLIB or not metrics_summary: return None _setup_academic_style() filtered = {k: v for k, v in metrics_summary.items() if not _is_excluded_metric(k)} if not filtered: return None # Limit to top 12 metrics if len(filtered) > 12: top = sorted(filtered.items(), key=lambda kv: abs(kv[1].get("mean", 0)), reverse=True)[:12] filtered = dict(top) names = list(filtered.keys()) means = [filtered[n].get("mean", 0) for n in names] mins = [filtered[n].get("min", 0) for n in names] maxs = [filtered[n].get("max", 0) for n in names] fig, ax = plt.subplots(figsize=(max(7, len(names) * 1.3), 5)) x = np.arange(len(names)) width = 0.6 bars = ax.bar( x, means, width=width, color=_PAUL_TOL_BRIGHT[0], alpha=0.88, edgecolor="white", linewidth=0.8, label="Mean", ) # Min-max range as thin lines for i, (lo, hi) in enumerate(zip(mins, maxs)): ax.plot([i, i], [lo, hi], color="#333", linewidth=2, solid_capstyle="round") ax.plot(i, lo, "_", color="#333", markersize=8, markeredgewidth=2) ax.plot(i, hi, "_", color="#333", markersize=8, markeredgewidth=2) ax.set_xlabel("Metric") ax.set_ylabel("Value") ax.set_title(title) ax.set_xticks(x) ax.set_xticklabels( [_shorten_label(n.replace("_", " ")) for n in names], rotation=35, ha="right", ) ax.legend(loc="upper right") fig.tight_layout() fig.savefig(_ensure_dir(output_path), dpi=300, bbox_inches="tight") plt.close(fig) logger.info("Saved experiment comparison: %s", output_path) return output_path # 
---------------------------------------------------------------------------
# 6. Pipeline execution timeline
# ---------------------------------------------------------------------------


def plot_pipeline_timeline(
    stage_results: list[dict[str, Any]],
    output_path: Path,
    *,
    title: str = "Pipeline Execution Timeline",
) -> Path | None:
    """Horizontal bar chart showing execution time per stage.

    Each entry is a dict; keys read here are "stage_name" (falling back to
    "stage"), "elapsed_sec", and "status".  Bars are green when
    status == "done", red otherwise.  Returns the saved path, or None when
    matplotlib is unavailable or there is no input.
    """
    if not HAS_MATPLOTLIB or not stage_results:
        return None
    _setup_academic_style()
    labels: list[str] = []
    durations: list[float] = []
    colors: list[str] = []
    for r in stage_results:
        name = r.get("stage_name", r.get("stage", "?"))
        elapsed = r.get("elapsed_sec", 0)
        status = r.get("status", "done")
        labels.append(str(name))
        # Zero/missing durations are drawn as 1.0 so every stage stays visible.
        durations.append(float(elapsed) if elapsed else 1.0)
        colors.append("#228833" if status == "done" else "#EE6677")
    if not labels:
        return None
    fig, ax = plt.subplots(figsize=(10, max(4, len(labels) * 0.35)))
    y = range(len(labels))
    ax.barh(list(y), durations, color=colors, alpha=0.85, edgecolor="white", height=0.6)
    ax.set_yticks(list(y))
    ax.set_yticklabels(labels, fontsize=9)
    ax.set_xlabel("Time (seconds)")
    ax.set_title(title)
    ax.invert_yaxis()  # first stage on top
    fig.tight_layout()
    fig.savefig(_ensure_dir(output_path), dpi=300, bbox_inches="tight")
    plt.close(fig)
    logger.info("Saved pipeline timeline: %s", output_path)
    return output_path


# ---------------------------------------------------------------------------
# 7. Iteration score history
# ---------------------------------------------------------------------------


def plot_iteration_scores(
    scores: list[float | None],
    output_path: Path,
    *,
    threshold: float = 7.0,
    title: str = "Quality Score by Iteration",
) -> Path | None:
    """Line chart of quality scores across iterations.

    None entries are skipped (iteration numbering is preserved as 1-based).
    A dashed horizontal line marks *threshold*.  Returns the saved path, or
    None when matplotlib is unavailable or no score is non-None.
    """
    if not HAS_MATPLOTLIB or not scores:
        return None
    _setup_academic_style()
    valid = [(i + 1, s) for i, s in enumerate(scores) if s is not None]
    if not valid:
        return None
    iters, vals = zip(*valid)
    fig, ax = plt.subplots(figsize=(6, 4.5))
    ax.plot(
        iters,
        vals,
        "o-",
        color=_PAUL_TOL_BRIGHT[5],
        linewidth=2.5,
        markersize=9,
        markerfacecolor="white",
        markeredgewidth=2,
        markeredgecolor=_PAUL_TOL_BRIGHT[5],
    )
    # Quality-gate reference line.
    ax.axhline(
        y=threshold,
        color=_PAUL_TOL_BRIGHT[1],
        linestyle="--",
        alpha=0.7,
        linewidth=1.5,
        label=f"Threshold ({threshold})",
    )
    ax.fill_between(iters, vals, alpha=0.06, color=_PAUL_TOL_BRIGHT[5])
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Quality Score")
    ax.set_title(title)
    ax.set_ylim(0, 10.5)  # scores are on a 0-10 scale
    ax.legend(loc="lower right")
    fig.tight_layout()
    fig.savefig(_ensure_dir(output_path), dpi=300, bbox_inches="tight")
    plt.close(fig)
    logger.info("Saved iteration scores: %s", output_path)
    return output_path


# ---------------------------------------------------------------------------
# 8. All-in-one: generate all charts from run directory
# ---------------------------------------------------------------------------


def generate_all_charts(
    run_dir: Path,
    output_dir: Path | None = None,
    *,
    metric_key: str = "val_loss",
    metric_direction: str = "minimize",
) -> list[Path]:
    """Scan run_dir and generate all applicable charts.

    Returns list of generated image paths.
""" if not HAS_MATPLOTLIB: logger.warning("matplotlib not available — skipping chart generation") return [] if output_dir is None: output_dir = run_dir / "charts" output_dir.mkdir(parents=True, exist_ok=True) generated: list[Path] = [] # Collect experiment runs runs: list[dict[str, Any]] = [] for stage_subdir in sorted(run_dir.glob("stage-*/runs")): for run_file in sorted(stage_subdir.glob("*.json")): try: data = json.loads(run_file.read_text(encoding="utf-8")) if isinstance(data, dict): runs.append(data) except (json.JSONDecodeError, OSError): continue # 1. Metric trajectory path = plot_metric_trajectory( runs, metric_key, output_dir / "metric_trajectory.png" ) if path: generated.append(path) # 2. Load experiment summary for condition-based charts # BUG-215: Also search stage-14* versioned dirs when stage-14/ is missing. summary_path = run_dir / "stage-14" / "experiment_summary.json" if not summary_path.exists(): for _s14 in sorted(run_dir.glob("stage-14*"), reverse=True): _alt = _s14 / "experiment_summary.json" if _alt.exists(): summary_path = _alt break if summary_path.exists(): try: summary = json.loads(summary_path.read_text(encoding="utf-8")) cs = summary.get("condition_summaries", {}) if cs: # 2a. Condition comparison (bar chart with CI) path = plot_condition_comparison( cs, output_dir / "method_comparison.png", metric_key=metric_key, ) if path: generated.append(path) # 2b. Ablation delta chart (horizontal bars) path = plot_ablation_deltas( cs, output_dir / "ablation_analysis.png", metric_key=metric_key, higher_is_better=(metric_direction != "minimize"), ) if path: generated.append(path) # 2c. Metric heatmap (condition × metric) path = plot_metric_heatmap( cs, output_dir / "metric_heatmap.png", ) if path: generated.append(path) # 2d. 
Raw metrics comparison (fallback, limited) ms = summary.get("metrics_summary", {}) if ms: ms = {k: v for k, v in ms.items() if not _is_excluded_metric(k)} if ms: path = plot_experiment_comparison( ms, output_dir / "experiment_comparison.png" ) if path: generated.append(path) except (json.JSONDecodeError, OSError): pass # 3. Iteration scores iter_path = run_dir / "iteration_summary.json" if iter_path.exists(): try: iter_data = json.loads(iter_path.read_text(encoding="utf-8")) scores = iter_data.get("iteration_scores", []) threshold = iter_data.get("quality_threshold", 7.0) path = plot_iteration_scores( scores, output_dir / "iteration_scores.png", threshold=threshold ) if path: generated.append(path) except (json.JSONDecodeError, OSError): pass logger.info("Generated %d chart(s) in %s", len(generated), output_dir) return generated ================================================ FILE: researchclaw/feedback/FEEDBACK_ANALYSIS_PROMPT.md ================================================ # Tester Feedback Analysis — Claude Code Prompt > **用途:** 在 Claude Code agent 窗口中读取本文件,agent 将自动完成「测试反馈分析 → Bug 修复文档生成」的全流程。 > > **使用方式:** 打开 Claude Code,输入: > ``` > 请读取 researchclaw/feedback/FEEDBACK_ANALYSIS_PROMPT.md,然后按照指示处理 feedback_inbox/ 目录下的所有测试反馈。 > ``` --- ## 你的角色 你是 AutoResearchClaw 项目的高级 QA 工程师和代码架构师。你需要分析来自不同学科领域测试者的反馈,与当前 Pipeline 代码进行对比,生成一份结构化的 **Bug 修复文档**。 --- ## 背景 AutoResearchClaw 是一个 23 阶段的全自动学术研究 Pipeline(从选题到论文生成)。我们招募了来自不同学科的测试者运行 Pipeline,他们提交了运行反馈和交付物。你需要帮我把这些反馈转化为可执行的 Bug 修复方案。 ### ⚠️ 关键认知:测试者使用的可能是旧版本 **这一点极其重要,必须贯穿你的整个分析过程:** 测试者运行 Pipeline 的时间点各不相同,他们使用的代码版本很可能**不是当前主分支的最新版本**。我们的代码在快速迭代中,很多问题在他们测试之后可能已经被修复、部分修复或因架构调整而不再适用。 因此你必须: - **不要无条件信任反馈中描述的 Bug** —— 它可能已经不存在了 - **对每个问题都要在当前代码中实际验证** —— 读代码确认,而不是仅凭反馈文字就下结论 - **保持批判性思维** —— 测试者的问题描述可能基于旧的代码行为、旧的配置格式、旧的依赖版本 - **如果反馈中提到的函数/类/文件已被重构或删除,直接标记为「已修复/架构已变更」** - **如果能从压缩包中识别出测试者使用的版本(如 git hash、版本号、时间戳),请记录下来,有助于判断问题时效性** --- ## 输入:反馈目录结构 所有测试反馈存放在 `feedback_inbox/` 目录下(如果实际路径不同,用户会告知)。结构如下: ``` feedback_inbox/ 
├── tester_alice/ │ ├── 反馈文档.md # 反馈文档(可能是 .md / .txt / .docx / .pdf) │ ├── screenshots/ # 截图文件夹(可选) │ │ ├── error1.png │ │ └── stage12_fail.png │ └── artifacts.zip # Pipeline 交付物压缩包(论文、代码、各阶段输出) ├── tester_bob/ │ ├── feedback.md │ └── deliverables.tar.gz ├── tester_charlie/ │ └── 测试报告.txt └── ... ``` **注意:** - 每个子文件夹 = 一个测试者 - 反馈文档命名不固定,但通常是唯一的文本文件 - 压缩包内是完整的 Pipeline 运行输出目录 - 有些测试者可能只有反馈文档没有压缩包,反之亦然 - 截图可能散落在子文件夹内或专门的 screenshots/ 目录里 --- ## 你的工作流程 ### 第一步:扫描并读取所有反馈 1. 列出 `feedback_inbox/` 下所有子目录 2. 对每个子目录: - 找到反馈文档并读取全文 - 如有截图,记录文件名(用于在报告中引用) - 如有压缩包,列出内容目录(不必完整解压),重点关注: - 错误日志(含 error / fail / traceback 的文件) - 阶段输出 JSON(stage_*.json / checkpoint.json) - Pipeline 元数据(run_meta.json / pipeline_summary.json) - 如果压缩包中有明显的报错信息,提取关键片段 ### 第二步:理解当前 Pipeline 架构 在分析之前,你需要了解当前代码的最新状态。请阅读以下关键文件: - `researchclaw/pipeline/stages.py` — 23 阶段定义和状态机 - `researchclaw/pipeline/executor.py` — 核心执行逻辑(重点关注各阶段的 execute 函数) - `researchclaw/pipeline/runner.py` — Pipeline 运行入口 - `researchclaw/config.py` — 配置结构 - `researchclaw/llm/client.py` — LLM 调用逻辑 - `researchclaw/literature/search.py` — 文献搜索 - `researchclaw/experiment/docker_sandbox.py` — Docker 沙箱执行 - `researchclaw/pipeline/code_agent.py` — 代码生成 Agent - `researchclaw/templates/converter.py` — LaTeX 转换 - `researchclaw/prompts.py` — Prompt 模板 **不需要逐行阅读,但要对整体架构和各模块职责有清晰认识。** ### 第三步:逐个分析反馈 对每个测试者的反馈,执行以下分析: #### 3a. 提取问题列表 从反馈文本中提取每一个独立的问题/Bug/需求,包括: - 明确报告的 Bug("XX 阶段报错了") - 模糊描述的问题("效果不太好"、"生成的论文有问题") - 功能需求("希望能支持 XX") - UX 问题("不知道怎么配置") - 性能问题("跑了 3 小时还没完") #### 3b. 对比代码进行验证 对每个提取的问题: 1. **定位相关代码** — 根据问题描述和涉及的 Pipeline 阶段,找到对应的源代码文件和函数 2. **判断是否仍然存在** — 阅读当前代码,判断这个 Bug 是否已被修复(主分支在快速迭代,部分问题可能已解决) 3. **分析根因** — 如果 Bug 仍存在,分析具体的根本原因(不是表面现象) 4. **评估价值** — 判断这个问题是否值得修复: - **值得修复:** 影响 Pipeline 正常运行、影响论文质量、多人反馈的共性问题 - **暂缓处理:** 边缘场景、个别配置问题、已有 workaround - **不处理:** 设计如此、超出范围的需求、无法复现 #### 3c. 
生成修复方案 对每个确认的 Bug,给出: - **具体是什么 Bug** — 一句话描述 - **根因在哪里** — 哪个文件、哪个函数、什么逻辑有问题 - **怎么修复** — 具体的代码修改方案(不需要写完整代码,但要足够具体,比如"在 executor.py 的 `_run_experiment` 函数中,第 XX 行的异常处理需要增加 TimeoutError 的 catch") - **修复后的预期行为** — 修好后应该是什么样的 ### 第四步:生成 Bug 修复文档 将所有分析结果汇总为一份 Markdown 文档,保存到 `docs/BUG_FIX_DOCUMENT_<日期>.md`。 --- ## 输出文档格式 ```markdown # Bug Fix Document — AutoResearchClaw Pipeline > 生成日期:YYYY-MM-DD > 反馈来源:N 位测试者 > 总计问题:N 个 ## 📊 总览 | 分类 | 数量 | |------|------| | 🔴 确认的 Bug(需修复) | N | | 🟢 已修复(无需处理) | N | | 🔵 功能需求 | N | | 🟡 需要更多信息 | N | | ⚪ 不处理 | N | ## 🔥 修复优先级 | 优先级 | ID | 问题 | 阶段 | 涉及文件 | |--------|----|------|------|----------| | 🔴 CRITICAL | xxx-001 | ... | ... | ... | | 🟠 HIGH | xxx-002 | ... | ... | ... | | ... | ... | ... | ... | ... | --- ## 确认的 Bug — 详细修复方案 ### 🔴 `xxx-001` — Bug 标题 | 字段 | 内容 | |------|------| | **严重程度** | CRITICAL / HIGH / MEDIUM / LOW | | **所属阶段** | STAGE_NAME | | **报告者** | tester_id | **问题描述:** xxx **根因分析:** xxx(具体到文件名、函数名、行号、逻辑问题) **涉及文件:** - `researchclaw/xxx/yyy.py` **修复方案:** xxx(具体的修改步骤,另一台机器上的 agent 能直接按此执行) **修复后预期行为:** xxx
原始反馈证据 (测试者的原话和截图引用)
---

(重复以上格式,直到所有确认的 Bug 都写完)

---

## 功能需求

### 🔵 `xxx-010` — 需求标题

- 报告者:xxx
- 描述:xxx
- 建议:xxx

---

## 已修复(无需处理)

| ID | 问题 | 报告者 | 已修复原因 |
|----|------|--------|-----------|
| ... | ... | ... | ... |

---

## 附录:按测试者分组

### 测试者:`tester_alice`

- 学科/领域:xxx(如果能从反馈中推断)
- 总计问题:N
- 确认 Bug:N
- 已修复:N

| ID | 问题 | 状态 | 严重程度 |
|----|------|------|---------|
| ... | ... | ... | ... |
```

---

## 重要原则

1. **代码为准:** 判断 Bug 是否存在时,以当前代码为准,不要猜测。实际读代码确认。
2. **具体到位:** 修复方案要具体到文件、函数、逻辑,让另一个 agent 能直接执行。不要只说"需要优化"这种模糊描述。
3. **合并去重:** 多个测试者报告同一个问题时,合并为一条,注明所有报告者。
4. **区分表里:** 测试者描述的可能是表面现象,你需要找到深层根因。
5. **务实判断:** 不是所有反馈都值得处理。有些是配置问题、有些是预期行为、有些修复代价远大于收益 —— 这些需要你做出判断。
6. **保留证据:** 每个问题都保留测试者的原始描述作为证据。
7. **中文输出:** 文档用中文书写(技术术语、代码、文件名保持英文)。

---

## 特殊情况处理

- **反馈文档是英文:** 正常分析,输出仍用中文。
- **没有压缩包只有反馈文档:** 仅基于反馈文本分析,标注"无法验证运行产物"。
- **没有反馈文档只有压缩包:** 从压缩包的错误日志和阶段输出中推断问题。
- **反馈内容模糊难以定位:** 归类为"需要更多信息",说明缺少什么信息。
- **反馈涉及已删除/重构的功能:** 标记为"已修复"或"架构已变更"。

---

## 第五步:提交并推送

分析完成、文档生成后,你需要完成以下操作:

1. **切换到主分支:** `git checkout main`
2. **将 Bug 修复文档提交到 `docs/` 目录:**
   - 文件命名格式:`docs/BUG_FIX_DOCUMENT_<日期>.md`
   - 如果当天已有同名文档,加序号:`docs/BUG_FIX_DOCUMENT_<日期>_02.md`
3. **提交并推送到远程主分支:**
   ```
   git add docs/BUG_FIX_DOCUMENT_<日期>.md
   git commit -m "docs: add bug fix document from tester feedback (<日期>)"
   git push origin main
   ```
4.
**告知用户:** 推送完成后,告知用户文档路径和 Bug 摘要,以便在其他机器上拉取并执行修复。

**重要:** 提交时不要加 Co-Authored-By,commit 作者只能是用户自己。

---

## 开始

现在请扫描 feedback inbox 目录,开始工作。


================================================
FILE: researchclaw/hardware.py
================================================
"""Hardware detection for GPU-aware experiment execution."""

from __future__ import annotations

import logging
import platform
import subprocess
from dataclasses import asdict, dataclass

logger = logging.getLogger(__name__)

# VRAM threshold (MB) — GPUs with less than this are "limited"
_HIGH_VRAM_THRESHOLD_MB = 8192

# Words that indicate a log/status line rather than a metric
LOG_WORDS: frozenset[str] = frozenset({
    "running", "loading", "saving", "processing", "starting", "finished",
    "completed", "initializing", "downloading", "training", "evaluating",
    "epoch", "step", "iteration", "experiment", "warning", "error", "info",
    "debug", "experiments", "using", "setting", "creating", "building",
    "computing", "reading", "writing", "opening", "closing",
})

# Maximum word count for a plausible metric name
_MAX_METRIC_NAME_WORDS = 6


@dataclass(frozen=True)
class HardwareProfile:
    """Detected hardware capabilities of the local machine."""

    has_gpu: bool
    gpu_type: str  # "cuda" | "mps" | "cpu"
    gpu_name: str  # e.g. "NVIDIA RTX 4090" / "Apple M3 Pro" / "CPU only"
    vram_mb: int | None  # NVIDIA only; None for MPS/CPU
    tier: str  # "high" | "limited" | "cpu_only"
    warning: str  # User-facing warning message (empty if tier=high)

    def to_dict(self) -> dict[str, object]:
        # Plain-dict form for JSON serialization / logging.
        return asdict(self)


def detect_hardware() -> HardwareProfile:
    """Detect local GPU hardware and return a HardwareProfile.

    Detection order:
    1. NVIDIA GPU via ``nvidia-smi``
    2. macOS Apple Silicon (MPS) via platform check
    3. Fallback to CPU-only
    """
    # --- Try NVIDIA ---
    profile = _detect_nvidia()
    if profile is not None:
        return profile
    # --- Try macOS MPS (Apple Silicon) ---
    profile = _detect_mps()
    if profile is not None:
        return profile
    # --- CPU only ---
    return HardwareProfile(
        has_gpu=False,
        gpu_type="cpu",
        gpu_name="CPU only",
        vram_mb=None,
        tier="cpu_only",
        warning=(
            "No GPU detected. Only CPU-based experiments (NumPy, sklearn) are supported. "
            "For deep learning research ideas, please use a machine with a GPU or a remote GPU server."
        ),
    )


def _detect_nvidia() -> HardwareProfile | None:
    """Detect NVIDIA GPU via nvidia-smi.

    Returns None when nvidia-smi is missing, times out, errors, or its
    output cannot be parsed.  Only the first GPU line is considered.
    """
    try:
        result = subprocess.run(
            [
                "nvidia-smi",
                "--query-gpu=name,memory.total",
                "--format=csv,noheader,nounits",
            ],
            capture_output=True,
            text=True,
            timeout=10,
            check=False,
        )
        if result.returncode != 0:
            return None
        # Parse first GPU line: "NVIDIA GeForce RTX 4090, 24564"
        line = result.stdout.strip().splitlines()[0].strip()
        parts = [p.strip() for p in line.split(",")]
        if len(parts) < 2:
            return None
        gpu_name = parts[0]
        try:
            vram_mb = int(float(parts[1]))
        except (ValueError, IndexError):
            vram_mb = 0  # unparsable VRAM is treated as "limited" below
        if vram_mb >= _HIGH_VRAM_THRESHOLD_MB:
            tier = "high"
            warning = ""
        else:
            tier = "limited"
            warning = (
                f"Local GPU ({gpu_name}, {vram_mb} MB VRAM) has limited memory. "
                "Complex deep learning experiments may be slow or run out of memory. "
                "Consider using a remote GPU server for best results."
            )
        return HardwareProfile(
            has_gpu=True,
            gpu_type="cuda",
            gpu_name=gpu_name,
            vram_mb=vram_mb,
            tier=tier,
            warning=warning,
        )
    except (FileNotFoundError, subprocess.TimeoutExpired, OSError):
        return None


def _detect_mps() -> HardwareProfile | None:
    """Detect macOS Apple Silicon GPU (MPS).

    Returns None on non-Darwin or non-arm64 machines.  The chip name lookup
    via sysctl is best-effort; a generic name is used when it fails.
    """
    if platform.system() != "Darwin":
        return None
    if platform.machine() != "arm64":
        return None
    # Get chip name via sysctl
    gpu_name = "Apple Silicon GPU"
    try:
        result = subprocess.run(
            ["sysctl", "-n", "machdep.cpu.brand_string"],
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        )
        if result.returncode == 0 and result.stdout.strip():
            gpu_name = result.stdout.strip()
    except (FileNotFoundError, subprocess.TimeoutExpired, OSError):
        pass
    return HardwareProfile(
        has_gpu=True,
        gpu_type="mps",
        gpu_name=gpu_name,
        vram_mb=None,  # MPS shares system memory
        tier="limited",
        warning=(
            f"macOS GPU detected ({gpu_name}). PyTorch MPS backend is available "
            "but has limited performance compared to NVIDIA CUDA GPUs. "
            "For large-scale experiments, consider using a remote GPU server."
        ),
    )


def ensure_torch_available(python_path: str, gpu_type: str) -> bool:
    """Check if PyTorch is importable; attempt install if not.

    Returns True if torch is available after this call.
    """
    from pathlib import Path

    python = Path(python_path)
    if not python.is_absolute():
        python = Path.cwd() / python
    # Check if already installed
    try:
        result = subprocess.run(
            [str(python), "-c", "import torch; print(torch.__version__)"],
            capture_output=True,
            text=True,
            timeout=30,
            check=False,
        )
        if result.returncode == 0:
            version = result.stdout.strip()
            logger.info("PyTorch %s already available at %s", version, python)
            return True
    except (FileNotFoundError, subprocess.TimeoutExpired, OSError):
        # Interpreter itself unusable — no point attempting an install.
        return False
    # Not installed — attempt install
    if gpu_type == "cpu":
        logger.info("No GPU available; skipping PyTorch installation")
        return False
    logger.info("PyTorch not found. Attempting install for %s...", gpu_type)
    pip_cmd = [str(python), "-m", "pip", "install", "--quiet", "torch"]
    try:
        result = subprocess.run(
            pip_cmd,
            capture_output=True,
            text=True,
            timeout=300,
            check=False,
        )
        if result.returncode == 0:
            logger.info("PyTorch installed successfully")
            return True
        logger.warning("PyTorch installation failed: %s", result.stderr[:300])
        return False
    except (FileNotFoundError, subprocess.TimeoutExpired, OSError) as exc:
        logger.warning("PyTorch installation error: %s", exc)
        return False


def is_metric_name(name: str) -> bool:
    """Return True if *name* looks like a metric name rather than a log line.

    Used to filter stdout lines when parsing ``name: value`` metric output.
    Rejects names that are too long or contain any known log/status word.
    """
    words = name.lower().split()
    if len(words) > _MAX_METRIC_NAME_WORDS:
        return False
    if any(w in LOG_WORDS for w in words):
        return False
    return True


================================================
FILE: researchclaw/health.py
================================================
from __future__ import annotations

import importlib
import json
import logging
import os
import shutil
import socket
import sys
import urllib.error
import urllib.request
from collections.abc import Callable as AbcCallable
from collections.abc import Mapping
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import ContextManager, cast

import yaml

from researchclaw.config import RCConfig, validate_config

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class CheckResult:
    """Outcome of a single doctor health check."""

    name: str  # machine-readable check identifier
    status: str  # "pass" | "warn" | "fail"
    detail: str  # human-readable explanation
    fix: str = ""  # actionable remediation hint (empty when no action needed)


@dataclass(frozen=True)
class DoctorReport:
    """Aggregate result of all doctor checks."""

    timestamp: str
    checks: list[CheckResult]
    overall: str

    @property
    def actionable_fixes(self) -> list[str]:
        # Non-empty fix strings collected across all checks.
        return [check.fix for check in self.checks if check.fix]

    def to_dict(self) -> dict[str, object]:
        # JSON-serializable representation of the report.
        return {
            "timestamp": self.timestamp,
            "overall": self.overall,
            "checks": [
                {
                    "name": check.name,
                    "status": check.status,
                    "detail": check.detail,
                    "fix": check.fix,
                }
                for check in
self.checks
            ],
            "actionable_fixes": self.actionable_fixes,
        }


def check_python_version() -> CheckResult:
    """Pass when the running interpreter is Python 3.11 or newer."""
    version_tuple = (
        int(sys.version_info.major),
        int(sys.version_info.minor),
        int(sys.version_info.micro),
    )
    if version_tuple >= (3, 11, 0):
        return CheckResult(
            name="python_version",
            status="pass",
            detail=(
                f"Python {sys.version_info.major}.{sys.version_info.minor}."
                f"{sys.version_info.micro}"
            ),
        )
    return CheckResult(
        name="python_version",
        status="fail",
        detail=(
            f"Python {sys.version_info.major}.{sys.version_info.minor}."
            f"{sys.version_info.micro} is unsupported"
        ),
        fix="Install Python 3.11 or newer",
    )


def check_yaml_import() -> CheckResult:
    """Pass when PyYAML can be imported."""
    try:
        _ = importlib.import_module("yaml")
    except ImportError:
        return CheckResult(
            name="yaml_import",
            status="fail",
            detail="PyYAML is not importable",
            fix="pip install pyyaml",
        )
    return CheckResult(name="yaml_import", status="pass", detail="PyYAML import ok")


def check_config_valid(config_path: str | Path) -> CheckResult:
    """Parse the YAML config file and run schema validation on it."""
    path = Path(config_path)
    if not path.exists():
        return CheckResult(
            name="config_valid",
            status="fail",
            detail=f"Config file not found: {path}",
            fix="Provide --config path to an existing YAML config file",
        )
    try:
        with path.open(encoding="utf-8") as handle:
            data_obj = _load_yaml_object(handle.read())
    except yaml.YAMLError as exc:
        return CheckResult(
            name="config_valid",
            status="fail",
            detail=f"Config YAML parse error: {exc}",
            fix="Fix YAML syntax errors in the config file",
        )
    except OSError as exc:
        return CheckResult(
            name="config_valid",
            status="fail",
            detail=f"Could not read config file: {exc}",
            fix="Verify file permissions and path",
        )
    # An empty YAML document loads as None — normalize to an empty mapping.
    data: object = {} if data_obj is None else data_obj
    if not isinstance(data, dict):
        return CheckResult(
            name="config_valid",
            status="fail",
            detail="Config root must be a mapping",
            fix="Ensure the config file starts with key-value mappings",
        )
    data_map = cast(Mapping[object, object], data)
    # Stringify keys so validate_config sees a uniform dict[str, ...].
    typed_data = {str(key): value for key, value in data_map.items()}
    result = validate_config(typed_data)
    if result.ok:
        return CheckResult(
            name="config_valid", status="pass", detail="Config validation ok"
        )
    return CheckResult(
        name="config_valid",
        status="fail",
        detail="; ".join(result.errors),
        fix="Fix validation errors in config file",
    )


def _models_url(base_url: str) -> str:
    # OpenAI-compatible model listing endpoint for the given base URL.
    return f"{base_url.rstrip('/')}/models"


def _is_timeout(exc: BaseException) -> bool:
    # True when the exception is a timeout, directly or wrapped in
    # URLError.reason.
    if isinstance(exc, TimeoutError):
        return True
    if isinstance(exc, socket.timeout):
        return True
    reason = getattr(exc, "reason", None)
    return isinstance(reason, (TimeoutError, socket.timeout))


def check_llm_connectivity(base_url: str) -> CheckResult:
    """Probe the LLM endpoint's /models URL (HEAD, then GET on 405)."""
    if not base_url.strip():
        return CheckResult(
            name="llm_connectivity",
            status="fail",
            detail="LLM base URL is empty",
            fix="Set llm.base_url in config",
        )
    url = _models_url(base_url)
    req = urllib.request.Request(url, method="HEAD")
    try:
        with urllib.request.urlopen(req, timeout=5):
            return CheckResult(
                name="llm_connectivity",
                status="pass",
                detail=f"Reachable: {url}",
            )
    except urllib.error.HTTPError as exc:
        if exc.code == 405:
            # Server rejects HEAD — retry with a plain GET before failing.
            try:
                with urllib.request.urlopen(url, timeout=5):
                    return CheckResult(
                        name="llm_connectivity",
                        status="pass",
                        detail=f"Reachable: {url}",
                    )
            except urllib.error.HTTPError as get_exc:
                return CheckResult(
                    name="llm_connectivity",
                    status="fail",
                    detail=f"LLM endpoint HTTP {get_exc.code}",
                    fix="Check llm.base_url and provider status",
                )
            except urllib.error.URLError as get_exc:
                if _is_timeout(get_exc):
                    return CheckResult(
                        name="llm_connectivity",
                        status="fail",
                        detail="LLM endpoint unreachable",
                        fix="Verify endpoint URL and network connectivity",
                    )
                return CheckResult(
                    name="llm_connectivity",
                    status="fail",
                    detail=f"LLM connectivity error: {get_exc.reason}",
                    fix="Verify endpoint URL and network connectivity",
                )
            except TimeoutError:
                return CheckResult(
                    name="llm_connectivity",
                    status="fail",
                    detail="LLM endpoint unreachable",
                    fix="Verify endpoint URL and network connectivity",
                )
        # Any non-405 HTTP error on the HEAD probe is a failure.
        return CheckResult(
            name="llm_connectivity",
            status="fail",
            detail=f"LLM endpoint HTTP {exc.code}",
            fix="Check llm.base_url and provider status",
        )
    except urllib.error.URLError as exc:
        if _is_timeout(exc):
            return CheckResult(
                name="llm_connectivity",
                status="fail",
                detail="LLM endpoint unreachable",
                fix="Verify endpoint URL and network connectivity",
            )
        return CheckResult(
            name="llm_connectivity",
            status="fail",
            detail=f"LLM connectivity error: {exc.reason}",
            fix="Verify endpoint URL and network connectivity",
        )
    except TimeoutError:
        return CheckResult(
            name="llm_connectivity",
            status="fail",
            detail="LLM endpoint unreachable",
            fix="Verify endpoint URL and network connectivity",
        )


def _fetch_models(base_url: str, api_key: str = "") -> tuple[int, dict[str, object]]:
    """GET the /models listing; returns (200, payload) or raises.

    The status is hard-coded to 200 because any non-2xx response raises
    HTTPError before this point.
    """
    headers: dict[str, str] = {}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    request = urllib.request.Request(_models_url(base_url), headers=headers)
    with _urlopen(request, timeout=5) as response:
        raw_bytes = _read_response_bytes(response)
        payload_map = _load_json_mapping(raw_bytes.decode("utf-8") or "{}")
        payload: dict[str, object] = {
            str(key): value for key, value in payload_map.items()
        }
        return 200, payload


def _read_response_bytes(response: object) -> bytes:
    """Defensively read the body of an opaque response object as bytes."""
    if not hasattr(response, "read"):
        raise ValueError("Response object has no read method")
    reader_obj = getattr(response, "read", None)
    if reader_obj is None or not isinstance(reader_obj, AbcCallable):
        raise ValueError("Response read attribute is not callable")
    reader = cast(AbcCallable[[], object], reader_obj)
    raw = reader()
    if not isinstance(raw, (bytes, bytearray)):
        raise ValueError("Response body is not bytes")
    return bytes(raw)


def _urlopen(req: str | urllib.request.Request, timeout: int) -> ContextManager[object]:
    # Typed wrapper around urllib.request.urlopen for the cast.
    return cast(ContextManager[object], urllib.request.urlopen(req, timeout=timeout))


def _load_yaml_object(content: str) -> object:
    # Typed wrapper around yaml.safe_load.
    return cast(object, yaml.safe_load(content))


def _load_json_mapping(content: str) -> Mapping[object, object]:
    """Parse JSON and require the top-level value to be an object."""
    payload_obj = cast(object,
json.loads(content))
    if not isinstance(payload_obj, dict):
        raise ValueError("models response must be a JSON object")
    return cast(Mapping[object, object], payload_obj)


def check_api_key_valid(base_url: str, api_key: str) -> CheckResult:
    """Verify the API key by hitting the authenticated /models endpoint.

    HTTP 401 means the key is invalid (fail); any other failure mode is a
    warn, since it may be a transient endpoint/network problem.
    """
    if not api_key.strip():
        return CheckResult(
            name="api_key_valid",
            status="fail",
            detail="API key is empty",
            fix="Set llm.api_key or environment variable defined by llm.api_key_env",
        )
    try:
        status, _ = _fetch_models(base_url, api_key)
        if status == 200:
            return CheckResult(
                name="api_key_valid",
                status="pass",
                detail="API key accepted",
            )
    except urllib.error.HTTPError as exc:
        if exc.code == 401:
            return CheckResult(
                name="api_key_valid",
                status="fail",
                detail="Invalid API key",
                fix="Set a valid API key for the configured endpoint",
            )
        return CheckResult(
            name="api_key_valid",
            status="warn",
            detail=f"API key check returned HTTP {exc.code}",
            fix="Verify endpoint health and API key permissions",
        )
    except urllib.error.URLError as exc:
        return CheckResult(
            name="api_key_valid",
            status="warn",
            detail=f"Could not verify API key: {exc.reason}",
            fix="Retry when endpoint/network is available",
        )
    except (json.JSONDecodeError, OSError, ValueError) as exc:
        return CheckResult(
            name="api_key_valid",
            status="warn",
            detail=f"Could not verify API key: {exc}",
            fix="Retry when endpoint/network is available",
        )
    # Unreachable with the current _fetch_models (status is always 200),
    # kept as a defensive fallback.
    return CheckResult(
        name="api_key_valid",
        status="warn",
        detail="Could not verify API key",
        fix="Retry when endpoint/network is available",
    )


def check_model_available(base_url: str, api_key: str, model: str) -> CheckResult:
    """Check if a single model is available (kept for backward compat)."""
    results = _check_models_against_endpoint(base_url, api_key, [model])
    if results is None:
        return CheckResult(
            name="model_available",
            status="warn",
            detail="Could not verify model availability",
            fix="Retry when endpoint/network is available",
        )
    available, _missing = results
    if model in available:
        return CheckResult(
            name="model_available",
            status="pass",
            detail=f"Model available: {model}",
        )
    return CheckResult(
        name="model_available",
        status="fail",
        detail=f"Model {model} not available",
        fix="Update llm.primary_model or endpoint model access",
    )


def check_model_chain(
    base_url: str,
    api_key: str,
    primary_model: str,
    fallback_models: tuple[str, ...] | list[str] = (),
) -> CheckResult:
    """Check the full model fallback chain — pass if ANY model works."""
    all_models = [m for m in [primary_model] + list(fallback_models) if m.strip()]
    if not all_models:
        return CheckResult(
            name="model_chain",
            status="warn",
            detail="No models configured",
            fix="Set llm.primary_model in config",
        )
    results = _check_models_against_endpoint(base_url, api_key, all_models)
    if results is None:
        return CheckResult(
            name="model_chain",
            status="warn",
            detail="Could not verify model availability",
            fix="Retry when endpoint/network is available",
        )
    available, missing = results
    if not available:
        return CheckResult(
            name="model_chain",
            status="fail",
            detail=f"No models available (tested: {', '.join(all_models)})",
            fix="Update llm.primary_model/fallback_models or endpoint model access",
        )
    if missing:
        # Partial availability still passes — the fallback chain can cope.
        return CheckResult(
            name="model_chain",
            status="pass",
            detail=(
                f"Fallback chain OK — available: {', '.join(sorted(available))}; "
                f"unavailable: {', '.join(sorted(missing))}"
            ),
        )
    return CheckResult(
        name="model_chain",
        status="pass",
        detail=f"All models available: {', '.join(sorted(available))}",
    )


def _check_models_against_endpoint(
    base_url: str, api_key: str, models: list[str]
) -> tuple[set[str], set[str]] | None:
    """Return (available, missing) sets, or None if endpoint unreachable."""
    # Drop blank entries up front; nothing to check when none remain.
    if not models or not all(m.strip() for m in models):
        models = [m for m in models if m.strip()]
        if not models:
            return set(), set()
    try:
        _, payload = _fetch_models(base_url, api_key)
    except (
        urllib.error.HTTPError,
        urllib.error.URLError,
        json.JSONDecodeError,
        OSError,
    ):
        return None
    models_obj = payload.get("data")
    endpoint_models = cast(
        list[object] | None,
        models_obj if isinstance(models_obj, list) else None,
    )
    if not isinstance(endpoint_models, list):
        return None
    available_ids: set[str] = set()
    for item in endpoint_models:
        if not isinstance(item, dict):
            continue
        item_map = cast(Mapping[object, object], item)
        model_id_obj = item_map.get("id")
        if isinstance(model_id_obj, str):
            available_ids.add(model_id_obj)
    requested = set(models)
    available = requested & available_ids
    missing = requested - available_ids
    return available, missing


def check_sandbox_python(python_path: str) -> CheckResult:
    """Warn unless the sandbox interpreter exists and is executable."""
    if not python_path.strip():
        return CheckResult(
            name="sandbox_python",
            status="warn",
            detail="Sandbox python path is empty",
            fix="Set experiment.sandbox.python_path in config",
        )
    path = Path(python_path)
    if path.exists() and os.access(path, os.X_OK):
        return CheckResult(
            name="sandbox_python",
            status="pass",
            detail=f"Sandbox python found: {path}",
        )
    return CheckResult(
        name="sandbox_python",
        status="warn",
        detail=f"Sandbox python missing or not executable: {path}",
        fix="Install sandbox interpreter or update experiment.sandbox.python_path",
    )


def check_matplotlib() -> CheckResult:
    """Warn when matplotlib is missing (charts are skipped, not fatal)."""
    try:
        _ = importlib.import_module("matplotlib")
    except ImportError:
        return CheckResult(
            name="matplotlib",
            status="warn",
            detail="Not installed; charts will be skipped",
            fix="pip install matplotlib",
        )
    return CheckResult(name="matplotlib", status="pass", detail="matplotlib import ok")


def check_experiment_mode(mode: str) -> CheckResult:
    """Warn for simulated mode; pass for any real execution mode."""
    if mode == "simulated":
        return CheckResult(
            name="experiment_mode",
            status="warn",
            detail="Experiment mode is simulated — results will be synthetic",
            fix="Use sandbox or docker mode for real execution",
        )
    return CheckResult(
        name="experiment_mode",
        status="pass",
        detail=f"Experiment mode: {mode}",
    )


def check_acp_agent(agent_command: str) -> CheckResult:
    """Check that the ACP agent CLI is available on PATH."""
    resolved = shutil.which(agent_command)
    if resolved:
        return CheckResult(
            name="acp_agent",
            status="pass",
            detail=f"ACP agent found: {resolved}",
        )
    return CheckResult(
        name="acp_agent",
        status="fail",
        detail=f"ACP agent '{agent_command}' not found on PATH",
        fix=f"Install {agent_command} or update llm.acp.agent in config",
    )


def check_docker_runtime(config: RCConfig) -> CheckResult:
    """Check Docker daemon, image availability, and optional NVIDIA runtime."""
    # Imported lazily so a missing docker stack does not break module import.
    from researchclaw.experiment.docker_sandbox import DockerSandbox

    if not DockerSandbox.check_docker_available():
        return CheckResult(
            name="docker_runtime",
            status="fail",
            detail="Docker daemon is not reachable",
            fix="Install and start Docker, or switch to mode: sandbox",
        )
    docker_cfg = config.experiment.docker
    if not DockerSandbox.ensure_image(docker_cfg.image):
        return CheckResult(
            name="docker_runtime",
            status="fail",
            detail=f"Docker image '{docker_cfg.image}' not found locally",
            fix=f"docker build -t {docker_cfg.image} researchclaw/docker/",
        )
    detail = f"Docker OK, image={docker_cfg.image}"
    if docker_cfg.gpu_enabled:
        detail += ", GPU passthrough enabled"
    return CheckResult(name="docker_runtime", status="pass", detail=detail)


def run_doctor(config_path: str | Path) -> DoctorReport:
    """Run all health checks and return report."""
    checks: list[CheckResult] = []
    path = Path(config_path)
    checks.append(check_python_version())
    checks.append(check_yaml_import())
    checks.append(check_config_valid(path))
    base_url = ""
    api_key = ""
    model = ""
    fallback_models: tuple[str, ...]
= () sandbox_python_path = "" experiment_mode = "" provider = "" acp_agent_command = "claude" try: config = RCConfig.load(path, check_paths=False) provider = config.llm.provider base_url = config.llm.base_url api_key = config.llm.api_key or os.environ.get(config.llm.api_key_env, "") model = config.llm.primary_model fallback_models = config.llm.fallback_models sandbox_python_path = config.experiment.sandbox.python_path experiment_mode = config.experiment.mode acp_agent_command = config.llm.acp.agent except (FileNotFoundError, OSError, ValueError, yaml.YAMLError) as exc: logger.debug("Could not fully load config for doctor checks: %s", exc) if provider == "acp": checks.append(check_acp_agent(acp_agent_command)) else: checks.append(check_llm_connectivity(base_url)) checks.append(check_api_key_valid(base_url, api_key)) checks.append(check_model_chain(base_url, api_key, model, fallback_models)) checks.append(check_sandbox_python(sandbox_python_path)) checks.append(check_matplotlib()) checks.append(check_experiment_mode(experiment_mode)) if experiment_mode == "docker": try: checks.append(check_docker_runtime(config)) except Exception as exc: # noqa: BLE001 logger.debug("Docker health check failed: %s", exc) checks.append( CheckResult( name="docker_runtime", status="fail", detail=f"Docker health check error: {exc}", fix="Ensure Docker is installed and the daemon is running", ) ) overall = "fail" if any(c.status == "fail" for c in checks) else "pass" return DoctorReport( timestamp=datetime.now(timezone.utc).isoformat(timespec="seconds"), checks=checks, overall=overall, ) def print_doctor_report(report: DoctorReport) -> None: """Pretty-print doctor report to stdout.""" icon_by_status = {"pass": "✅", "fail": "❌", "warn": "⚠️"} encoding = getattr(sys.stdout, "encoding", None) or "utf-8" try: for icon in icon_by_status.values(): icon.encode(encoding) except UnicodeEncodeError: icon_by_status = {"pass": "[OK]", "fail": "[FAIL]", "warn": "[WARN]"} print(f"ResearchClaw Doctor 
Report ({report.timestamp})") for check in report.checks: icon = icon_by_status.get(check.status, "-") print(f"{icon} {check.name}: {check.detail}") if check.fix: print(f" Fix: {check.fix}") fail_count = sum(1 for check in report.checks if check.status == "fail") warn_count = sum(1 for check in report.checks if check.status == "warn") if report.overall == "pass": print("Result: PASS") else: print(f"Result: FAIL ({fail_count} errors, {warn_count} warnings)") def write_doctor_report(report: DoctorReport, path: Path) -> None: """Write report as JSON.""" path.parent.mkdir(parents=True, exist_ok=True) _ = path.write_text(json.dumps(report.to_dict(), indent=2) + "\n", encoding="utf-8") ================================================ FILE: researchclaw/knowledge/__init__.py ================================================ """Knowledge management — base, adapters.""" ================================================ FILE: researchclaw/knowledge/base.py ================================================ """Knowledge base integration for ARC pipeline. Supports two backends: - ``markdown`` (default): Plain Markdown files in ``docs/kb/`` - ``obsidian``: Markdown with Obsidian-compatible wikilinks, tags, and frontmatter Both backends produce files that are valid Markdown and can be browsed without any special tooling — the Obsidian backend simply adds extra metadata that Obsidian can consume. 
""" from __future__ import annotations import json from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path from typing import Any import yaml def _utcnow_iso() -> str: return datetime.now(timezone.utc).isoformat(timespec="seconds") # --------------------------------------------------------------------------- # KB Entry # --------------------------------------------------------------------------- @dataclass class KBEntry: """A single knowledge-base entry to be written.""" category: ( str # questions | literature | experiments | findings | decisions | reviews ) entry_id: str # Unique ID (e.g. "goal-define-run-abc") title: str content: str # Markdown body source_stage: str # e.g. "01-goal_define" run_id: str evidence_refs: list[str] | None = None tags: list[str] | None = None links: list[str] | None = None # For Obsidian wikilinks # --------------------------------------------------------------------------- # Writers # --------------------------------------------------------------------------- def _markdown_frontmatter(entry: KBEntry) -> str: """Generate YAML frontmatter block.""" meta: dict[str, Any] = { "id": entry.entry_id, "title": entry.title, "stage": entry.source_stage, "run_id": entry.run_id, "created": _utcnow_iso(), } if entry.tags: meta["tags"] = entry.tags if entry.evidence_refs: meta["evidence"] = entry.evidence_refs return ( "---\n" + yaml.dump(meta, default_flow_style=False, allow_unicode=True).rstrip() + "\n---\n" ) def _obsidian_enhancements(entry: KBEntry) -> str: """Add Obsidian-compatible wikilinks and tag line at end of content.""" extras: list[str] = [] if entry.tags: tag_line = " ".join(f"#{t}" for t in entry.tags) extras.append(f"\n{tag_line}") if entry.links: link_line = "Related: " + ", ".join(f"[[{l}]]" for l in entry.links) extras.append(link_line) return "\n".join(extras) def write_kb_entry( kb_root: Path, entry: KBEntry, *, backend: str = "markdown", ) -> Path: """Write a single KB entry to the 
appropriate category directory. Returns the path to the written file. """ category_dir = kb_root / entry.category category_dir.mkdir(parents=True, exist_ok=True) # Build content parts: list[str] = [] parts.append(_markdown_frontmatter(entry)) parts.append(f"# {entry.title}\n") parts.append(entry.content) if backend == "obsidian": obs = _obsidian_enhancements(entry) if obs: parts.append(obs) filename = f"{entry.entry_id}.md" filepath = category_dir / filename filepath.write_text("\n".join(parts), encoding="utf-8") return filepath # --------------------------------------------------------------------------- # Pipeline KB integration # --------------------------------------------------------------------------- KB_CATEGORY_MAP: dict[int, str] = { 1: "questions", 2: "questions", 3: "decisions", 4: "literature", 5: "literature", 6: "literature", 7: "findings", 8: "questions", 9: "decisions", 10: "experiments", 11: "decisions", 12: "experiments", 13: "experiments", 14: "findings", 15: "decisions", 16: "reviews", 17: "reviews", 18: "reviews", 19: "reviews", 20: "decisions", 21: "decisions", 22: "reviews", } def write_stage_to_kb( kb_root: Path, stage_id: int, stage_name: str, run_id: str, artifacts: list[str], stage_dir: Path, *, backend: str = "markdown", topic: str = "", ) -> list[Path]: """Write stage results to the knowledge base. Reads the primary output artifact and creates a KB entry in the appropriate category directory. Returns list of paths written. """ category = KB_CATEGORY_MAP.get(stage_id, "findings") written: list[Path] = [] # Read the primary artifact content content_parts: list[str] = [] evidence: list[str] = [] for artifact_name in artifacts: artifact_path = stage_dir / artifact_name.rstrip("/") if artifact_path.is_file(): text = artifact_path.read_text(encoding="utf-8") # Truncate very large files for KB entry if len(text) > 5000: text = text[:5000] + "\n\n... 
(truncated, see full artifact)\n" content_parts.append(text) evidence.append(f"stage-{stage_id:02d}/{artifact_name}") elif artifact_path.is_dir(): files = sorted(artifact_path.iterdir()) content_parts.append( f"Directory with {len(files)} files: {', '.join(f.name for f in files[:10])}" ) evidence.append(f"stage-{stage_id:02d}/{artifact_name}/") if not content_parts: content_parts.append( f"Stage {stage_id:02d} ({stage_name}) completed. See artifacts directory for details." ) entry = KBEntry( category=category, entry_id=f"{stage_name}-{run_id}", title=f"Stage {stage_id:02d}: {stage_name.replace('_', ' ').title()}", content="\n\n".join(content_parts), source_stage=f"{stage_id:02d}-{stage_name}", run_id=run_id, evidence_refs=evidence, tags=[stage_name, f"stage-{stage_id:02d}", f"run-{run_id[:8]}"], links=[f"run-{run_id}"] if backend == "obsidian" else None, ) path = write_kb_entry(kb_root, entry, backend=backend) written.append(path) return written # --------------------------------------------------------------------------- # Weekly report generation (#19) # --------------------------------------------------------------------------- def generate_weekly_report( kb_root: Path, run_dirs: list[Path], *, backend: str = "markdown", week_label: str = "", ) -> Path: """Generate a weekly summary report from completed pipeline runs. Scans ``run_dirs`` for ``pipeline_summary.json`` files and aggregates statistics into a Markdown report written to ``kb_root/reviews/``. 
""" if not week_label: week_label = datetime.now(timezone.utc).strftime("%Y-W%W") runs_data: list[dict] = [] for run_dir in run_dirs: summary_path = run_dir / "pipeline_summary.json" if summary_path.exists(): runs_data.append(json.loads(summary_path.read_text(encoding="utf-8"))) # Build report total_runs = len(runs_data) total_stages = sum(r.get("stages_executed", 0) for r in runs_data) total_done = sum(r.get("stages_done", 0) for r in runs_data) total_failed = sum(r.get("stages_failed", 0) for r in runs_data) total_blocked = sum(r.get("stages_blocked", 0) for r in runs_data) report_lines = [ f"## Summary", f"- Week: {week_label}", f"- Pipeline runs: {total_runs}", f"- Stages executed: {total_stages}", f"- Stages completed: {total_done}", f"- Stages failed: {total_failed}", f"- Stages blocked (gate): {total_blocked}", f"- Success rate: {total_done / total_stages * 100:.1f}%" if total_stages > 0 else "- Success rate: N/A", "", "## Run Details", ] for rd in runs_data: run_id = rd.get("run_id", "unknown") report_lines.append( f"- **{run_id}**: {rd.get('stages_done', 0)}/{rd.get('stages_executed', 0)} stages done, final={rd.get('final_status', '?')}" ) report_lines.extend(["", "## Recommendations"]) if total_failed > 0: report_lines.append( f"- ⚠️ {total_failed} stage failures detected. Review error logs." 
) if total_blocked > 0: report_lines.append(f"- 🔒 {total_blocked} stages awaiting gate approval.") if total_failed == 0 and total_blocked == 0: report_lines.append("- ✅ All stages completed successfully.") content = "\n".join(report_lines) entry = KBEntry( category="reviews", entry_id=f"weekly-report-{week_label}", title=f"Weekly Report — {week_label}", content=content, source_stage="report", run_id=week_label, tags=["weekly-report", week_label], ) return write_kb_entry(kb_root, entry, backend=backend) ================================================ FILE: researchclaw/knowledge/graph/__init__.py ================================================ """Research knowledge graph built on NetworkX. Extracts entities (Papers, Methods, Datasets, Metrics) and relations (CITES, EXTENDS, OUTPERFORMS) from literature and experiment results, enabling research gap discovery and trend analysis. """ from researchclaw.knowledge.graph.entities import Entity, EntityType from researchclaw.knowledge.graph.relations import Relation, RelationType from researchclaw.knowledge.graph.builder import KnowledgeGraphBuilder from researchclaw.knowledge.graph.query import KnowledgeGraphQuery __all__ = [ "Entity", "EntityType", "Relation", "RelationType", "KnowledgeGraphBuilder", "KnowledgeGraphQuery", ] ================================================ FILE: researchclaw/knowledge/graph/builder.py ================================================ """Knowledge graph builder — constructs graph from literature and experiments.""" from __future__ import annotations import json import logging from pathlib import Path from typing import Any from researchclaw.knowledge.graph.entities import Entity, EntityType from researchclaw.knowledge.graph.relations import Relation, RelationType logger = logging.getLogger(__name__) class KnowledgeGraphBuilder: """Builds and manages a research knowledge graph. 
    Uses dictionaries for storage (compatible with NetworkX-style
    serialization) without requiring networkx as a hard dependency.
    """

    def __init__(self, max_entities: int = 10000) -> None:
        # Entity store keyed by entity id; relations kept as a flat list.
        self._entities: dict[str, Entity] = {}
        self._relations: list[Relation] = []
        self._max_entities = max_entities  # hard cap on graph size

    @property
    def entity_count(self) -> int:
        """Number of entities in the graph."""
        return len(self._entities)

    @property
    def relation_count(self) -> int:
        """Number of relations in the graph."""
        return len(self._relations)

    def add_entity(self, entity: Entity) -> bool:
        """Add an entity to the graph.

        Args:
            entity: The entity to add.

        Returns:
            True if added or merged into an existing entity with the same id;
            False only when the capacity limit is reached.
        """
        if entity.id in self._entities:
            # Update attributes of existing entity: new attributes win on
            # key conflicts, and an empty name keeps the existing one.
            existing = self._entities[entity.id]
            merged = {**existing.attributes, **entity.attributes}
            self._entities[entity.id] = Entity(
                id=entity.id,
                entity_type=entity.entity_type,
                name=entity.name or existing.name,
                attributes=merged,
            )
            return True
        if len(self._entities) >= self._max_entities:
            logger.warning("Knowledge graph capacity reached (%d)", self._max_entities)
            return False
        self._entities[entity.id] = entity
        return True

    def add_relation(self, relation: Relation) -> bool:
        """Add a relation to the graph.

        Args:
            relation: The relation to add.

        Returns:
            True if added, False if source or target entity doesn't exist.
        """
        if relation.source_id not in self._entities:
            logger.debug("Source entity not found: %s", relation.source_id)
            return False
        if relation.target_id not in self._entities:
            logger.debug("Target entity not found: %s", relation.target_id)
            return False
        # Check for duplicate (same endpoints + type); attributes are ignored
        # for the purpose of duplicate detection — linear scan.
        for existing in self._relations:
            if (
                existing.source_id == relation.source_id
                and existing.target_id == relation.target_id
                and existing.relation_type == relation.relation_type
            ):
                return True  # Already exists
        self._relations.append(relation)
        return True

    def get_entity(self, entity_id: str) -> Entity | None:
        """Get an entity by ID."""
        return self._entities.get(entity_id)

    def get_entities_by_type(self, entity_type: EntityType) -> list[Entity]:
        """Get all entities of a specific type."""
        return [
            e for e in self._entities.values() if e.entity_type == entity_type
        ]

    def get_relations_for(
        self,
        entity_id: str,
        direction: str = "both",
    ) -> list[Relation]:
        """Get relations involving an entity.

        Args:
            entity_id: The entity to query.
            direction: "outgoing", "incoming", or "both".

        Returns:
            List of matching relations.

        NOTE: with direction="both", a self-loop relation (source == target
        == entity_id) is appended twice by the two checks below.
        """
        results: list[Relation] = []
        for rel in self._relations:
            if direction in ("outgoing", "both") and rel.source_id == entity_id:
                results.append(rel)
            if direction in ("incoming", "both") and rel.target_id == entity_id:
                results.append(rel)
        return results

    def remove_entity(self, entity_id: str) -> bool:
        """Remove an entity and all its relations."""
        if entity_id not in self._entities:
            return False
        del self._entities[entity_id]
        # Drop every relation touching the removed entity.
        self._relations = [
            r
            for r in self._relations
            if r.source_id != entity_id and r.target_id != entity_id
        ]
        return True

    def add_paper(
        self,
        paper_id: str,
        title: str,
        year: int | None = None,
        authors: list[str] | None = None,
        abstract: str = "",
    ) -> Entity:
        """Convenience method to add a paper entity.

        Args:
            paper_id: Unique paper ID (e.g., arxiv ID).
            title: Paper title.
            year: Publication year.
            authors: List of author names.
            abstract: Paper abstract.

        Returns:
            The created Entity.
        """
        attrs: dict[str, Any] = {}
        if year:
            attrs["year"] = year
        if authors:
            attrs["authors"] = authors
        if abstract:
            attrs["abstract"] = abstract[:500]  # keep stored abstracts short
        entity = Entity(
            id=paper_id,
            entity_type=EntityType.PAPER,
            name=title,
            attributes=attrs,
        )
        self.add_entity(entity)
        return entity

    def add_method(
        self,
        method_id: str,
        name: str,
        description: str = "",
    ) -> Entity:
        """Convenience method to add a method entity."""
        entity = Entity(
            id=method_id,
            entity_type=EntityType.METHOD,
            name=name,
            attributes={"description": description} if description else {},
        )
        self.add_entity(entity)
        return entity

    def add_dataset(
        self,
        dataset_id: str,
        name: str,
        domain: str = "",
    ) -> Entity:
        """Convenience method to add a dataset entity."""
        entity = Entity(
            id=dataset_id,
            entity_type=EntityType.DATASET,
            name=name,
            attributes={"domain": domain} if domain else {},
        )
        self.add_entity(entity)
        return entity

    def save(self, path: str | Path) -> None:
        """Save graph to JSON file.

        Args:
            path: File path for output.
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        data = {
            "entities": [e.to_dict() for e in self._entities.values()],
            "relations": [r.to_dict() for r in self._relations],
        }
        path.write_text(
            json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8"
        )
        logger.info(
            "Saved knowledge graph: %d entities, %d relations to %s",
            len(self._entities),
            len(self._relations),
            path,
        )

    def load(self, path: str | Path) -> int:
        """Load graph from JSON file.

        Loads additively into the current graph (does not clear first);
        malformed entries are skipped individually.

        Args:
            path: File path to load from.

        Returns:
            Total number of entities loaded.
        """
        path = Path(path)
        if not path.exists():
            return 0
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError) as exc:
            logger.warning("Failed to load knowledge graph: %s", exc)
            return 0
        for entity_data in data.get("entities", []):
            try:
                entity = Entity.from_dict(entity_data)
                self.add_entity(entity)
            except (ValueError, KeyError) as exc:
                logger.debug("Skipping malformed entity: %s", exc)
        for rel_data in data.get("relations", []):
            try:
                relation = Relation.from_dict(rel_data)
                self.add_relation(relation)
            except (ValueError, KeyError) as exc:
                logger.debug("Skipping malformed relation: %s", exc)
        logger.info(
            "Loaded knowledge graph: %d entities, %d relations",
            self.entity_count,
            self.relation_count,
        )
        return self.entity_count



================================================
FILE: researchclaw/knowledge/graph/entities.py
================================================
"""Entity definitions for the research knowledge graph."""

from __future__ import annotations

from dataclasses import asdict, dataclass, field
from enum import Enum
from typing import Any


class EntityType(str, Enum):
    """Types of entities in the knowledge graph."""

    PAPER = "paper"
    METHOD = "method"
    DATASET = "dataset"
    METRIC = "metric"
    AUTHOR = "author"
    CONCEPT = "concept"


@dataclass
class Entity:
    """A node in the knowledge graph.

    Attributes:
        id: Unique identifier (e.g., arxiv ID, method name hash).
        entity_type: The type of entity.
        name: Display name.
        attributes: Additional key-value attributes.
""" id: str entity_type: EntityType name: str attributes: dict[str, Any] = field(default_factory=dict) def to_dict(self) -> dict[str, Any]: """Serialize to dictionary.""" d = asdict(self) d["entity_type"] = self.entity_type.value return d @classmethod def from_dict(cls, data: dict[str, Any]) -> Entity: """Deserialize from dictionary.""" return cls( id=str(data.get("id", "")), entity_type=EntityType(data.get("entity_type", "concept")), name=str(data.get("name", "")), attributes=data.get("attributes") or {}, ) ================================================ FILE: researchclaw/knowledge/graph/query.py ================================================ """Knowledge graph query engine.""" from __future__ import annotations import logging from collections import Counter, defaultdict from typing import Any from researchclaw.knowledge.graph.builder import KnowledgeGraphBuilder from researchclaw.knowledge.graph.entities import EntityType from researchclaw.knowledge.graph.relations import RelationType logger = logging.getLogger(__name__) class KnowledgeGraphQuery: """Query engine for the research knowledge graph. Provides high-level research-oriented queries like finding gaps, trending methods, method comparisons, and topic suggestions. """ def __init__(self, graph: KnowledgeGraphBuilder) -> None: self._graph = graph def find_research_gaps(self, domain: str = "") -> list[str]: """Find research gaps — datasets without many methods applied. Args: domain: Optional domain filter. Returns: List of gap descriptions. 
""" datasets = self._graph.get_entities_by_type(EntityType.DATASET) methods = self._graph.get_entities_by_type(EntityType.METHOD) if domain: datasets = [ d for d in datasets if domain.lower() in d.attributes.get("domain", "").lower() or domain.lower() in d.name.lower() ] gaps: list[str] = [] for dataset in datasets: # Count methods applied to this dataset rels = self._graph.get_relations_for(dataset.id, direction="incoming") method_rels = [ r for r in rels if r.relation_type == RelationType.USES_DATASET ] if len(method_rels) < 2 and methods: gaps.append( f"Dataset '{dataset.name}' has only {len(method_rels)} " f"method(s) evaluated — potential research opportunity" ) return gaps def find_trending_methods(self, min_citations: int = 2) -> list[str]: """Find methods with high citation/usage counts. Args: min_citations: Minimum citation count to qualify. Returns: List of trending method descriptions. """ methods = self._graph.get_entities_by_type(EntityType.METHOD) trending: list[tuple[int, str]] = [] for method in methods: rels = self._graph.get_relations_for(method.id, direction="incoming") citation_count = len([ r for r in rels if r.relation_type in ( RelationType.EXTENDS, RelationType.APPLIES_METHOD, RelationType.CITES, ) ]) if citation_count >= min_citations: trending.append((citation_count, method.name)) trending.sort(reverse=True) return [ f"'{name}' — referenced {count} time(s)" for count, name in trending ] def get_method_comparison( self, method_a: str, method_b: str, ) -> dict[str, Any]: """Compare two methods across shared datasets. Args: method_a: Name or ID of first method. method_b: Name or ID of second method. Returns: Dict with comparison results. 
""" entity_a = self._find_method(method_a) entity_b = self._find_method(method_b) if not entity_a or not entity_b: return { "error": "One or both methods not found", "method_a": method_a, "method_b": method_b, } # Find datasets used by each method datasets_a = self._get_datasets_for_method(entity_a.id) datasets_b = self._get_datasets_for_method(entity_b.id) shared = set(datasets_a.keys()) & set(datasets_b.keys()) comparison: dict[str, Any] = { "method_a": entity_a.name, "method_b": entity_b.name, "shared_datasets": list(shared), "unique_to_a": list(set(datasets_a.keys()) - shared), "unique_to_b": list(set(datasets_b.keys()) - shared), } # Check outperforms relations outperforms_a = [] outperforms_b = [] for rel in self._graph.get_relations_for(entity_a.id, direction="outgoing"): if rel.relation_type == RelationType.OUTPERFORMS and rel.target_id == entity_b.id: outperforms_a.append(rel.attributes) for rel in self._graph.get_relations_for(entity_b.id, direction="outgoing"): if rel.relation_type == RelationType.OUTPERFORMS and rel.target_id == entity_a.id: outperforms_b.append(rel.attributes) comparison["a_outperforms_b"] = outperforms_a comparison["b_outperforms_a"] = outperforms_b return comparison def suggest_topics( self, interests: list[str], top_k: int = 5, ) -> list[str]: """Suggest research topics based on graph structure and interests. Args: interests: List of interest keywords. top_k: Number of suggestions. Returns: List of suggested topics. 
""" suggestions: list[tuple[float, str]] = [] # Score entities by relevance to interests for entity in self._graph._entities.values(): score = 0.0 name_lower = entity.name.lower() desc = entity.attributes.get("description", "").lower() abstract = entity.attributes.get("abstract", "").lower() combined = f"{name_lower} {desc} {abstract}" for interest in interests: if interest.lower() in combined: score += 1.0 if score > 0: # Boost by connection count rels = self._graph.get_relations_for(entity.id) score += len(rels) * 0.1 suggestions.append((score, entity.name)) # Find gaps as additional suggestions gaps = self.find_research_gaps() for gap in gaps[:3]: for interest in interests: if interest.lower() in gap.lower(): suggestions.append((0.5, gap)) suggestions.sort(reverse=True) seen: set[str] = set() unique: list[str] = [] for _, text in suggestions: if text not in seen: seen.add(text) unique.append(text) if len(unique) >= top_k: break return unique def _find_method(self, name_or_id: str) -> Any: """Find a method entity by name or ID.""" entity = self._graph.get_entity(name_or_id) if entity: return entity for e in self._graph.get_entities_by_type(EntityType.METHOD): if e.name.lower() == name_or_id.lower(): return e return None def _get_datasets_for_method(self, method_id: str) -> dict[str, Any]: """Get datasets that a method has been evaluated on.""" datasets: dict[str, Any] = {} for rel in self._graph.get_relations_for(method_id, direction="outgoing"): if rel.relation_type == RelationType.USES_DATASET: entity = self._graph.get_entity(rel.target_id) if entity: datasets[entity.name] = rel.attributes return datasets ================================================ FILE: researchclaw/knowledge/graph/relations.py ================================================ """Relation definitions for the research knowledge graph.""" from __future__ import annotations from dataclasses import asdict, dataclass, field from enum import Enum from typing import Any class RelationType(str, 
Enum): """Types of relations between entities.""" CITES = "cites" EXTENDS = "extends" OUTPERFORMS = "outperforms" USES_DATASET = "uses_dataset" APPLIES_METHOD = "applies_method" EVALUATES_WITH = "evaluates_with" # metric used for evaluation AUTHORED_BY = "authored_by" RELATED_TO = "related_to" @dataclass class Relation: """A directed edge in the knowledge graph. Attributes: source_id: ID of the source entity. target_id: ID of the target entity. relation_type: The type of relation. attributes: Additional key-value attributes (e.g., metric value). """ source_id: str target_id: str relation_type: RelationType attributes: dict[str, Any] = field(default_factory=dict) def to_dict(self) -> dict[str, Any]: """Serialize to dictionary.""" d = asdict(self) d["relation_type"] = self.relation_type.value return d @classmethod def from_dict(cls, data: dict[str, Any]) -> Relation: """Deserialize from dictionary.""" return cls( source_id=str(data.get("source_id", "")), target_id=str(data.get("target_id", "")), relation_type=RelationType(data.get("relation_type", "related_to")), attributes=data.get("attributes") or {}, ) ================================================ FILE: researchclaw/knowledge/graph/visualizer.py ================================================ """Knowledge graph visualization export.""" from __future__ import annotations import json import logging from pathlib import Path from typing import Any from researchclaw.knowledge.graph.builder import KnowledgeGraphBuilder from researchclaw.knowledge.graph.entities import EntityType logger = logging.getLogger(__name__) # Color mapping for entity types _TYPE_COLORS: dict[str, str] = { EntityType.PAPER: "#4A90D9", EntityType.METHOD: "#E74C3C", EntityType.DATASET: "#2ECC71", EntityType.METRIC: "#F39C12", EntityType.AUTHOR: "#9B59B6", EntityType.CONCEPT: "#95A5A6", } def export_to_dot(graph: KnowledgeGraphBuilder, path: str | Path) -> None: """Export graph to Graphviz DOT format. Args: graph: The knowledge graph to export. 
        path: Output file path.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    lines = ["digraph KnowledgeGraph {"]
    lines.append(" rankdir=LR;")
    lines.append(" node [shape=box, style=filled, fontsize=10];")
    # Nodes
    # NOTE(review): only double quotes are escaped in labels; a label
    # containing a backslash could still produce invalid DOT output.
    for entity in graph._entities.values():
        color = _TYPE_COLORS.get(entity.entity_type, "#CCCCCC")
        label = entity.name[:40].replace('"', '\\"')
        lines.append(
            f' "{entity.id}" [label="{label}", '
            f'fillcolor="{color}", fontcolor="white"];'
        )
    # Edges
    for rel in graph._relations:
        label = rel.relation_type.value
        lines.append(f' "{rel.source_id}" -> "{rel.target_id}" [label="{label}"];')
    lines.append("}")
    path.write_text("\n".join(lines), encoding="utf-8")
    logger.info("Exported graph to DOT: %s", path)


def export_to_json_cytoscape(
    graph: KnowledgeGraphBuilder,
    path: str | Path,
) -> None:
    """Export graph to Cytoscape.js compatible JSON.

    Args:
        graph: The knowledge graph to export.
        path: Output file path.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    elements: list[dict[str, Any]] = []
    # Entity attributes are flattened into each node's data object.
    for entity in graph._entities.values():
        elements.append({
            "data": {
                "id": entity.id,
                "label": entity.name,
                "type": entity.entity_type.value,
                **entity.attributes,
            },
        })
    # Edges get synthetic sequential ids (edge_0, edge_1, ...).
    for i, rel in enumerate(graph._relations):
        elements.append({
            "data": {
                "id": f"edge_{i}",
                "source": rel.source_id,
                "target": rel.target_id,
                "label": rel.relation_type.value,
                **rel.attributes,
            },
        })
    path.write_text(
        json.dumps({"elements": elements}, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    logger.info("Exported graph to Cytoscape JSON: %s", path)


def graph_summary(graph: KnowledgeGraphBuilder) -> str:
    """Generate a text summary of the graph.

    Args:
        graph: The knowledge graph.

    Returns:
        Multi-line summary string.
    """
    from collections import Counter

    type_counts: Counter[str] = Counter()
    for entity in graph._entities.values():
        type_counts[entity.entity_type.value] += 1
    rel_counts: Counter[str] = Counter()
    for rel in graph._relations:
        rel_counts[rel.relation_type.value] += 1
    lines = [
        f"Knowledge Graph Summary: {graph.entity_count} entities, "
        f"{graph.relation_count} relations",
        "",
        "Entity types:",
    ]
    for etype, count in type_counts.most_common():
        lines.append(f" {etype}: {count}")
    lines.append("")
    lines.append("Relation types:")
    for rtype, count in rel_counts.most_common():
        lines.append(f" {rtype}: {count}")
    return "\n".join(lines)



================================================
FILE: researchclaw/literature/__init__.py
================================================
"""Real literature search and citation management for ResearchClaw.

Provides API clients for Semantic Scholar and arXiv, plus unified search
with deduplication and BibTeX generation. All network I/O uses stdlib
``urllib`` — **zero** extra pip dependencies.
"""

from researchclaw.literature.models import Author, Paper
from researchclaw.literature.search import search_papers
from researchclaw.literature.verify import (
    CitationResult,
    VerificationReport,
    VerifyStatus,
    verify_citations,
)

__all__ = [
    "Author",
    "CitationResult",
    "Paper",
    "VerificationReport",
    "VerifyStatus",
    "search_papers",
    "verify_citations",
]



================================================
FILE: researchclaw/literature/arxiv_client.py
================================================
"""arXiv API client powered by the ``arxiv`` library.

The ``arxiv`` pip package (2.4+) provides robust arXiv search with built-in
rate limiting, retries, pagination, and PDF download support.
Public API
----------
- ``search_arxiv(query, limit, sort_by, year_min)`` → ``list[Paper]``
- ``download_pdf(arxiv_id, dirpath)`` → ``Path | None``
- ``get_paper_by_id(arxiv_id)`` → ``Paper | None``

Circuit breaker is preserved for extra resilience beyond the library's
built-in retry logic.
"""

from __future__ import annotations

import logging
import re
import threading
import time
from pathlib import Path
from typing import Any

try:
    import arxiv  # pip install arxiv
except ImportError:
    arxiv = None  # type: ignore[assignment]

from researchclaw.literature.models import Author, Paper

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Circuit breaker (kept for extra safety on top of arxiv library retries)
# ---------------------------------------------------------------------------

# Trip after this many consecutive failures; cooldown doubles on a failed
# half-open probe (180s → 360s → capped at 600s).
_CB_THRESHOLD = 3
_CB_INITIAL_COOLDOWN = 180
_CB_MAX_COOLDOWN = 600

# Classic three-state breaker: CLOSED (normal) → OPEN (reject requests)
# → HALF_OPEN (allow a single probe after the cooldown).
_CB_CLOSED = "closed"
_CB_OPEN = "open"
_CB_HALF_OPEN = "half_open"

# Module-level mutable breaker state; every access goes through _cb_lock.
_cb_state: str = _CB_CLOSED
_cb_consecutive_429s: int = 0
_cb_cooldown_sec: float = _CB_INITIAL_COOLDOWN
_cb_open_since: float = 0.0
_cb_trip_count: int = 0
_cb_lock = threading.Lock()


def _reset_circuit_breaker() -> None:
    """Reset circuit breaker state (for tests)."""
    global _cb_state, _cb_consecutive_429s, _cb_cooldown_sec  # noqa: PLW0603
    global _cb_open_since, _cb_trip_count  # noqa: PLW0603
    with _cb_lock:
        _cb_state = _CB_CLOSED
        _cb_consecutive_429s = 0
        _cb_cooldown_sec = _CB_INITIAL_COOLDOWN
        _cb_open_since = 0.0
        _cb_trip_count = 0


def _cb_should_allow() -> bool:
    """Return True if a request may proceed under the breaker policy."""
    global _cb_state  # noqa: PLW0603
    with _cb_lock:
        if _cb_state == _CB_CLOSED:
            return True
        if _cb_state == _CB_OPEN:
            # OPEN → HALF_OPEN once the cooldown has elapsed, letting a
            # single probe request through.
            elapsed = time.monotonic() - _cb_open_since
            if elapsed >= _cb_cooldown_sec:
                _cb_state = _CB_HALF_OPEN
                logger.info("arXiv circuit breaker → HALF_OPEN (%.0fs cooldown elapsed)", elapsed)
                return True
            return False
        return True  # HALF_OPEN: allow probe


def _cb_on_success() -> None:
    """Record a successful request: close the breaker and reset the cooldown."""
    global _cb_state, _cb_consecutive_429s, _cb_cooldown_sec  # noqa: PLW0603
    with _cb_lock:
        _cb_consecutive_429s = 0
        if _cb_state != _CB_CLOSED:
            logger.info("arXiv circuit breaker → CLOSED (request succeeded)")
        _cb_state = _CB_CLOSED
        _cb_cooldown_sec = _CB_INITIAL_COOLDOWN


def _cb_on_failure() -> bool:
    """Record a failed request.

    Returns True when this failure trips the breaker into OPEN — either the
    consecutive-failure threshold was reached, or a HALF_OPEN probe failed
    (the latter also doubles the cooldown, capped at _CB_MAX_COOLDOWN).
    """
    global _cb_state, _cb_consecutive_429s, _cb_cooldown_sec  # noqa: PLW0603
    global _cb_open_since, _cb_trip_count  # noqa: PLW0603
    with _cb_lock:
        _cb_consecutive_429s += 1
        if _cb_state == _CB_HALF_OPEN or _cb_consecutive_429s >= _CB_THRESHOLD:
            if _cb_state == _CB_HALF_OPEN:
                _cb_cooldown_sec = min(_cb_cooldown_sec * 2, _CB_MAX_COOLDOWN)
            _cb_state = _CB_OPEN
            _cb_open_since = time.monotonic()
            _cb_trip_count += 1
            logger.warning(
                "arXiv circuit breaker TRIPPED (trip #%d, cooldown %.0fs)",
                _cb_trip_count,
                _cb_cooldown_sec,
            )
            return True
        return False


# ---------------------------------------------------------------------------
# Shared arxiv.Client instance (reuses connection, respects rate limits)
# ---------------------------------------------------------------------------

_client: arxiv.Client | None = None


def _get_client() -> arxiv.Client:
    """Get or create the shared arxiv Client (lazily-created module singleton)."""
    global _client  # noqa: PLW0603
    if _client is None:
        _client = arxiv.Client(
            page_size=100,  # fetch up to 100 per API call
            delay_seconds=3.1,  # arXiv requires ≥3s between requests
            num_retries=3,  # built-in retry on failure
        )
    return _client


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def search_arxiv(
    query: str,
    *,
    limit: int = 50,
    sort_by: str = "relevance",
    year_min: int = 0,
) -> list[Paper]:
    """Search arXiv for papers matching *query*.

    Parameters
    ----------
    query:
        Free-text search query. Supports arXiv field syntax
        (e.g., ``ti:transformer``, ``au:vaswani``, ``cat:cs.LG``).
    limit:
        Maximum number of results (up to 300).
    sort_by:
        Sort criterion: "relevance", "submitted_date", or "last_updated".
    year_min:
        If > 0, only return papers published in this year or later.

    Returns
    -------
    list[Paper]
        Parsed papers. Empty list on failure.
    """
    if arxiv is None:
        logger.warning("arxiv library not installed — skipping arXiv search")
        return []
    if not _cb_should_allow():
        logger.info("[rate-limit] arXiv circuit breaker OPEN — skipping")
        return []
    limit = min(limit, 300)
    sort_map = {
        "relevance": arxiv.SortCriterion.Relevance,
        "submitted_date": arxiv.SortCriterion.SubmittedDate,
        "last_updated": arxiv.SortCriterion.LastUpdatedDate,
    }
    # Unrecognised sort_by values silently fall back to relevance.
    criterion = sort_map.get(sort_by, arxiv.SortCriterion.Relevance)
    search = arxiv.Search(
        query=query,
        max_results=limit,
        sort_by=criterion,
        sort_order=arxiv.SortOrder.Descending,
    )
    papers: list[Paper] = []
    try:
        client = _get_client()
        for result in client.results(search):
            paper = _convert_result(result)
            # Client-side year filter: the arXiv query itself is not
            # restricted by year, so filter after conversion.
            if year_min > 0 and paper.year < year_min:
                continue
            papers.append(paper)
        _cb_on_success()
        logger.info("arXiv: found %d papers for %r", len(papers), query)
    except arxiv.HTTPError as exc:
        logger.warning("arXiv HTTP error: %s", exc)
        _cb_on_failure()
    except arxiv.UnexpectedEmptyPageError:
        logger.warning("arXiv returned unexpected empty page for %r", query)
        _cb_on_failure()
    except Exception as exc:  # noqa: BLE001
        # Any other failure still counts against the circuit breaker.
        logger.warning("arXiv search failed: %s", exc)
        _cb_on_failure()
    return papers


def get_paper_by_id(arxiv_id: str) -> Paper | None:
    """Fetch a single paper by arXiv ID (e.g., '2301.00001').

    Returns None when the library is missing, the lookup fails, or the
    ID matches no paper.
    """
    if arxiv is None:
        logger.warning("arxiv library not installed — cannot look up %s", arxiv_id)
        return None
    try:
        search = arxiv.Search(id_list=[arxiv_id])
        client = _get_client()
        # id_list lookups yield at most one result; return the first.
        for result in client.results(search):
            return _convert_result(result)
    except Exception as exc:  # noqa: BLE001
        logger.warning("arXiv ID lookup failed for %s: %s", arxiv_id, exc)
    return None


def download_pdf(
    arxiv_id: str,
    dirpath: str | Path = ".",
    filename: str = "",
) -> Path | None:
    """Download PDF for a given arXiv ID.
    Parameters
    ----------
    arxiv_id:
        arXiv paper ID (e.g., '2301.00001').
    dirpath:
        Directory to save the PDF.
    filename:
        Custom filename. If empty, uses ``{arxiv_id}.pdf``.

    Returns
    -------
    Path | None
        Path to downloaded PDF, or None on failure.
    """
    if arxiv is None:
        logger.warning("arxiv library not installed — cannot download PDF")
        return None
    try:
        search = arxiv.Search(id_list=[arxiv_id])
        client = _get_client()
        for result in client.results(search):
            dirpath = Path(dirpath)
            dirpath.mkdir(parents=True, exist_ok=True)
            # Old-style arXiv IDs contain '/', which is not filename-safe.
            fname = filename or f"{arxiv_id.replace('/', '_')}.pdf"
            result.download_pdf(dirpath=str(dirpath), filename=fname)
            pdf_path = dirpath / fname
            logger.info("Downloaded arXiv PDF: %s → %s", arxiv_id, pdf_path)
            return pdf_path
    except Exception as exc:  # noqa: BLE001
        logger.warning("PDF download failed for %s: %s", arxiv_id, exc)
    return None


def search_arxiv_advanced(
    *,
    title: str = "",
    author: str = "",
    abstract: str = "",
    category: str = "",
    limit: int = 50,
    year_min: int = 0,
) -> list[Paper]:
    """Advanced arXiv search using field-specific queries.
Example: search_arxiv_advanced(title="transformer", category="cs.LG") """ parts = [] if title: parts.append(f"ti:{title}") if author: parts.append(f"au:{author}") if abstract: parts.append(f"abs:{abstract}") if category: parts.append(f"cat:{category}") if not parts: return [] query = " AND ".join(parts) return search_arxiv(query, limit=limit, year_min=year_min) # --------------------------------------------------------------------------- # Internal: convert arxiv.Result → Paper # --------------------------------------------------------------------------- def _convert_result(result: arxiv.Result) -> Paper: """Convert an ``arxiv.Result`` to our ``Paper`` dataclass.""" # Extract arXiv ID from entry_id URL arxiv_id = "" if result.entry_id: m = re.search(r"(\d{4}\.\d{4,5})(v\d+)?$", result.entry_id) if m: arxiv_id = m.group(1) # Authors authors = tuple(Author(name=a.name) for a in result.authors) # Year from published date year = result.published.year if result.published else 0 # DOI doi = result.doi or "" # Primary category as venue venue = result.primary_category or "" # Prefer HTML abstract URL url = f"https://arxiv.org/abs/{arxiv_id}" if arxiv_id else result.entry_id return Paper( paper_id=f"arxiv-{arxiv_id}" if arxiv_id else f"arxiv-{result.entry_id}", title=result.title or "", authors=authors, year=year, abstract=result.summary or "", venue=venue, citation_count=0, # arXiv doesn't provide citation counts doi=doi, arxiv_id=arxiv_id, url=url, source="arxiv", ) ================================================ FILE: researchclaw/literature/cache.py ================================================ """Local query cache for literature search results. Caches search results by (query, source, limit) hash to avoid redundant API calls. Cache entries expire after TTL_SEC seconds. 
Cache directory: .researchclaw_cache/literature/
"""

from __future__ import annotations

import hashlib
import json
import logging
import time
from pathlib import Path
from typing import Any

logger = logging.getLogger(__name__)

_DEFAULT_CACHE_DIR = Path(".researchclaw_cache") / "literature"
_TTL_SEC = 86400 * 7  # 7 days (default for S2, OpenAlex)

# Per-source TTLs: arXiv updates daily at midnight, so 24h cache is optimal.
# Citation verification results are permanent (verified papers don't change).
_SOURCE_TTL: dict[str, float] = {
    "arxiv": 86400,  # 24 hours — arXiv metadata updates once/day
    "semantic_scholar": 86400 * 3,  # 3 days
    "openalex": 86400 * 3,  # 3 days
    "citation_verify": 86400 * 365,  # ~permanent
}


def _cache_dir(base: Path | None = None) -> Path:
    """Return the cache directory, creating it (and parents) if needed."""
    d = base or _DEFAULT_CACHE_DIR
    d.mkdir(parents=True, exist_ok=True)
    return d


def cache_key(query: str, source: str, limit: int) -> str:
    """Deterministic cache key from query parameters.

    Case/whitespace are normalised so equivalent queries share one entry;
    16 hex chars (64 bits) of SHA-256 keep filenames short.
    """
    raw = f"{query.strip().lower()}|{source.strip().lower()}|{limit}"
    return hashlib.sha256(raw.encode()).hexdigest()[:16]


def get_cached(
    query: str,
    source: str,
    limit: int,
    *,
    cache_base: Path | None = None,
    ttl: float | None = None,
) -> list[dict[str, Any]] | None:
    """Return cached results or None if miss/expired.

    If *ttl* is not provided, uses source-specific TTL from
    ``_SOURCE_TTL``, falling back to the global ``_TTL_SEC``.
""" d = _cache_dir(cache_base) key = cache_key(query, source, limit) path = d / f"{key}.json" if not path.exists(): return None effective_ttl = ttl if ttl is not None else _SOURCE_TTL.get(source, _TTL_SEC) try: data = json.loads(path.read_text(encoding="utf-8")) ts = data.get("timestamp", 0) age_sec = time.time() - ts if age_sec > effective_ttl: logger.debug("Cache expired for key %s (age=%.0fs > ttl=%.0fs)", key, age_sec, effective_ttl) return None papers = data.get("papers", []) if not isinstance(papers, list): return None age_str = _format_age(age_sec) logger.info( "[cache] HIT query=%r source=%s age=%s (%d papers)", query[:50], source, age_str, len(papers), ) return papers except (json.JSONDecodeError, TypeError, ValueError): return None def _format_age(seconds: float) -> str: """Human-readable age string.""" if seconds < 60: return f"{seconds:.0f}s" if seconds < 3600: return f"{seconds / 60:.0f}m" if seconds < 86400: return f"{seconds / 3600:.1f}h" return f"{seconds / 86400:.1f}d" def put_cache( query: str, source: str, limit: int, papers: list[dict[str, Any]], *, cache_base: Path | None = None, ) -> None: """Write search results to cache.""" d = _cache_dir(cache_base) key = cache_key(query, source, limit) path = d / f"{key}.json" payload = { "query": query, "source": source, "limit": limit, "timestamp": time.time(), "papers": papers, } path.write_text(json.dumps(payload, indent=2), encoding="utf-8") logger.debug("Cached %d papers for key %s", len(papers), key) def clear_cache(*, cache_base: Path | None = None) -> int: """Remove all cache files. 
    Return count of files deleted."""
    d = _cache_dir(cache_base)
    count = 0
    for f in d.glob("*.json"):
        f.unlink()
        count += 1
    return count


def cache_stats(*, cache_base: Path | None = None) -> dict[str, Any]:
    """Return cache statistics (entry count, total size, directory)."""
    d = _cache_dir(cache_base)
    files = list(d.glob("*.json"))
    total_bytes = sum(f.stat().st_size for f in files)
    return {
        "entries": len(files),
        "total_bytes": total_bytes,
        "cache_dir": str(d),
    }


================================================
FILE: researchclaw/literature/models.py
================================================
"""Data models for literature search results.

Paper and Author are frozen dataclasses — immutable after creation.
``Paper.to_bibtex()`` generates a valid BibTeX entry from metadata, and
``Paper.cite_key`` returns a normalised citation key.
"""

from __future__ import annotations

import re
import unicodedata
from dataclasses import dataclass, field


@dataclass(frozen=True)
class Author:
    """A paper author."""

    name: str
    affiliation: str = ""

    def last_name(self) -> str:
        """Return ASCII-folded last name for citation keys."""
        parts = self.name.strip().split()
        raw = parts[-1] if parts else "unknown"
        # Fold accented characters to ASCII: NFKD decomposition separates
        # base letters from combining marks, which the encode() then drops.
        nfkd = unicodedata.normalize("NFKD", raw)
        ascii_name = nfkd.encode("ascii", "ignore").decode("ascii")
        return re.sub(r"[^a-zA-Z]", "", ascii_name).lower() or "unknown"


@dataclass(frozen=True)
class Paper:
    """A single paper from Semantic Scholar, arXiv, or similar sources.

    Fields are designed to hold the union of metadata available from
    both Semantic Scholar and arXiv APIs.
    """

    paper_id: str
    title: str
    authors: tuple[Author, ...] = ()
    year: int = 0
    abstract: str = ""
    venue: str = ""
    citation_count: int = 0
    doi: str = ""
    arxiv_id: str = ""
    url: str = ""
    source: str = ""  # "semantic_scholar" | "arxiv"
    # Pre-formatted BibTeX (e.g. fetched from CrossRef); when non-empty,
    # to_bibtex() returns it verbatim (stripped).
    _bibtex_override: str = field(default="", repr=False)

    # ------------------------------------------------------------------
    # Citation key
    # ------------------------------------------------------------------

    @property
    def cite_key(self) -> str:
        """Normalised citation key: first author's last name + year + keyword.

        Example: ``smith2024transformer``
        """
        last = self.authors[0].last_name() if self.authors else "anon"
        yr = str(self.year) if self.year else "0000"
        # First meaningful noun-ish word from title (>3 chars, alpha only)
        kw = ""
        for word in self.title.split():
            cleaned = re.sub(r"[^a-zA-Z]", "", word).lower()
            if len(cleaned) > 3 and cleaned not in _STOPWORDS:
                kw = cleaned
                break
        return f"{last}{yr}{kw}"

    # ------------------------------------------------------------------
    # BibTeX generation
    # ------------------------------------------------------------------

    def to_bibtex(self) -> str:
        """Generate a BibTeX entry string.

        If ``_bibtex_override`` was populated (e.g. from CrossRef), return
        that directly. Otherwise construct from metadata: conference-like
        venues become ``@inproceedings``; arXiv preprints and everything
        else become ``@article``.
        """
        if self._bibtex_override:
            return self._bibtex_override.strip()
        key = self.cite_key
        authors_str = " and ".join(a.name for a in self.authors) or "Unknown"
        # T1.4: Detect arXiv category codes used as venue (e.g. "cs.CY", "math.OC")
        # These are NOT journal names and must be treated as arXiv preprints.
        import re as _re

        _venue = self.venue or ""
        _is_arxiv_category = bool(
            _re.match(
                r"^(?:cs|math|stat|eess|physics|q-bio|q-fin|astro-ph|cond-mat|"
                r"gr-qc|hep-ex|hep-lat|hep-ph|hep-th|nlin|nucl-ex|nucl-th|"
                r"quant-ph)\.[A-Z]{2}$",
                _venue,
            )
        )
        # Decide entry type
        if _venue and not _is_arxiv_category and any(
            kw in _venue.lower()
            for kw in (
                "conference", "proc", "workshop", "neurips", "icml",
                "iclr", "aaai", "cvpr", "acl", "emnlp", "naacl", "eccv",
                "iccv", "sigir", "kdd", "www", "ijcai",
            )
        ):
            entry_type = "inproceedings"
            venue_field = f" booktitle = {{{_venue}}},"
        elif self.arxiv_id and (not _venue or _is_arxiv_category):
            # arXiv paper: use standard format with eprint ID
            entry_type = "article"
            venue_field = f" journal = {{arXiv preprint arXiv:{self.arxiv_id}}},"
        else:
            entry_type = "article"
            venue_field = (
                f" journal = {{{_venue or 'Unknown'}}}," if _venue else ""
            )
        lines = [f"@{entry_type}{{{key},"]
        lines.append(f" title = {{{self.title}}},")
        lines.append(f" author = {{{authors_str}}},")
        lines.append(f" year = {{{self.year or 'Unknown'}}},")
        if venue_field:
            lines.append(venue_field)
        if self.doi:
            lines.append(f" doi = {{{self.doi}}},")
        if self.arxiv_id:
            lines.append(f" eprint = {{{self.arxiv_id}}},")
            lines.append(" archiveprefix = {arXiv},")
        if self.url:
            lines.append(f" url = {{{self.url}}},")
        lines.append("}")
        return "\n".join(lines)

    # ------------------------------------------------------------------
    # Serialisation helpers (for JSONL output)
    # ------------------------------------------------------------------

    def to_dict(self) -> dict[str, object]:
        """Serialise to a plain dict for JSON/JSONL output."""
        return {
            "paper_id": self.paper_id,
            "title": self.title,
            "authors": [
                {"name": a.name, "affiliation": a.affiliation} for a in self.authors
            ],
            "year": self.year,
            "abstract": self.abstract,
            "venue": self.venue,
            "citation_count": self.citation_count,
            "doi": self.doi,
            "arxiv_id": self.arxiv_id,
            "url": self.url,
            "source": self.source,
            "cite_key": self.cite_key,
        }


# Common English stopwords to skip when picking a keyword for cite_key
_STOPWORDS = frozenset(
    {
        "the", "and", "for", "with", "from", "that", "this", "into",
        "over", "upon", "about", "through", "using", "based", "towards",
        "toward", "between", "under", "more", "than", "when", "what",
        "which", "where", "does", "have", "been", "some", "each", "also",
        "much", "very",
        "learning",  # too generic for ML papers
    }
)


================================================
FILE: researchclaw/literature/novelty.py
================================================
"""Novelty checker — detects similar existing work before paper generation.

Searches real academic APIs (Semantic Scholar + arXiv) for papers that
may overlap with the proposed research hypotheses. Produces a structured
report with similarity scores and a go/differentiate/abort recommendation.

Usage
-----
::

    from researchclaw.literature.novelty import check_novelty

    report = check_novelty(
        topic="Adaptive learning rate schedules",
        hypotheses_text=hypotheses_md,
    )
    print(report["novelty_score"])  # 0.72
"""

from __future__ import annotations

import json
import logging
import re
from datetime import datetime, timezone
from difflib import SequenceMatcher
from typing import Any

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Stop words for keyword extraction (overlap with executor's but standalone)
# ---------------------------------------------------------------------------

_STOP_WORDS = frozenset(
    {
        "a", "an", "the", "and", "or", "but", "in", "on", "of", "for",
        "to", "with", "by", "at", "from", "as", "is", "are", "was",
        "were", "be", "been", "being", "have", "has", "had", "do",
        "does", "did", "will", "would", "could", "should", "may",
        "might", "can", "shall", "not", "no", "nor", "so", "yet",
        "both", "each", "every", "all", "any", "few", "more", "most",
        "other", "some", "such", "than", "too", "very", "just",
        "about", "above", "after", "again", "between", "into",
"through", "during", "before", "under", "over", "using", "based", "via", "toward", "towards", "new", "novel", "approach", "method", "study", "research", "paper", "work", "propose", "proposed", "show", "results", "performance", "evaluation", } ) # --------------------------------------------------------------------------- # Keyword extraction # --------------------------------------------------------------------------- def _extract_keywords(text: str) -> list[str]: """Extract meaningful keywords from text (lowercased, 3+ chars, no stops).""" tokens = re.findall(r"[a-zA-Z][a-zA-Z0-9_-]+", text.lower()) seen: set[str] = set() result: list[str] = [] for t in tokens: if t not in _STOP_WORDS and len(t) >= 3 and t not in seen: seen.add(t) result.append(t) return result # --------------------------------------------------------------------------- # Similarity metrics # --------------------------------------------------------------------------- def _jaccard_keywords(keywords_a: list[str], keywords_b: list[str]) -> float: """Jaccard similarity between two keyword lists.""" set_a = set(keywords_a) set_b = set(keywords_b) if not set_a or not set_b: return 0.0 return len(set_a & set_b) / len(set_a | set_b) def _title_similarity(title_a: str, title_b: str) -> float: """Sequence-based similarity between two titles (0-1).""" return SequenceMatcher(None, title_a.lower(), title_b.lower()).ratio() def _compute_similarity( hypothesis_keywords: list[str], paper_title: str, paper_abstract: str, hypothesis_title: str = "", ) -> float: """Combined similarity score between hypotheses keywords and a paper.""" paper_keywords = _extract_keywords(f"{paper_title} {paper_abstract}") kw_sim = _jaccard_keywords(hypothesis_keywords, paper_keywords) # Blend keyword overlap with title similarity when available if hypothesis_title and paper_title: t_sim = _title_similarity(hypothesis_title, paper_title) return round(0.7 * kw_sim + 0.3 * t_sim, 4) return round(kw_sim, 4) # 
--------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- def check_novelty( topic: str, hypotheses_text: str, *, papers_already_seen: list[dict[str, Any]] | None = None, max_search_results: int = 30, similarity_threshold: float = 0.25, s2_api_key: str = "", ) -> dict[str, Any]: """Check whether the proposed research has significant overlap with existing work. Parameters ---------- topic: Research topic string. hypotheses_text: Full text of generated hypotheses (markdown). papers_already_seen: Papers already collected by the pipeline (from candidates.jsonl). If provided, these are checked for overlap too. max_search_results: Max papers to retrieve from academic APIs. similarity_threshold: Minimum similarity to flag a paper as potentially overlapping. s2_api_key: Optional Semantic Scholar API key. Returns ------- dict with keys: topic, hypotheses_checked, similar_papers_found, novelty_score, assessment, similar_papers, recommendation, generated. 
""" # Extract keywords from topic + hypotheses combined_text = f"{topic}\n{hypotheses_text}" hyp_keywords = _extract_keywords(combined_text) # --- Search for similar existing work --- similar_papers: list[dict[str, Any]] = [] total_papers_retrieved = 0 # Track total API results (even below threshold) # Build search queries from hypotheses queries = _build_novelty_queries(topic, hypotheses_text) # Try real API search try: from researchclaw.literature.search import search_papers_multi_query found = search_papers_multi_query( queries, limit_per_query=min(15, max_search_results), s2_api_key=s2_api_key, ) total_papers_retrieved = len(found) for paper in found[:max_search_results]: sim = _compute_similarity(hyp_keywords, paper.title, paper.abstract) if sim >= similarity_threshold: similar_papers.append( { "title": paper.title, "paper_id": paper.paper_id, "year": paper.year, "venue": paper.venue, "citation_count": paper.citation_count, "similarity": sim, "url": paper.url, "cite_key": paper.cite_key, } ) logger.info( "Novelty search: %d papers found, %d above threshold %.2f", len(found), len(similar_papers), similarity_threshold, ) except Exception: # noqa: BLE001 logger.warning( "Real novelty search failed, checking pipeline papers only", exc_info=True ) # Also check papers already collected by the pipeline if papers_already_seen: for p in papers_already_seen: if not isinstance(p, dict): continue title = str(p.get("title", "")) abstract = str(p.get("abstract", "")) sim = _compute_similarity(hyp_keywords, title, abstract) if sim >= similarity_threshold: # Avoid duplicates existing_titles = {sp["title"].lower() for sp in similar_papers} if title.lower() not in existing_titles: similar_papers.append( { "title": title, "paper_id": str(p.get("paper_id", "")), "year": p.get("year", 0), "venue": str(p.get("venue", "")), "citation_count": p.get("citation_count", 0), "similarity": sim, "url": str(p.get("url", "")), "cite_key": str(p.get("cite_key", "")), } ) # Sort by similarity 
descending similar_papers.sort(key=lambda x: x["similarity"], reverse=True) # --- Compute novelty score --- novelty_score, assessment = _assess_novelty(similar_papers, similarity_threshold) # --- Determine search coverage quality --- # If API returned very few papers or none at all, the novelty score is unreliable. if total_papers_retrieved == 0 and not papers_already_seen: search_coverage = "insufficient" elif total_papers_retrieved < 5: search_coverage = "partial" else: search_coverage = "full" # When search coverage is insufficient, flag the assessment as unreliable # instead of reporting a misleading perfect novelty score. if search_coverage == "insufficient" and not similar_papers: assessment = "insufficient_data" recommendation = "proceed_with_caution" elif assessment == "critical": recommendation = "abort" elif assessment == "low": recommendation = "differentiate" else: recommendation = "proceed" # Count hypotheses hyp_count = len(re.findall(r"^##\s+H\d+", hypotheses_text, re.MULTILINE)) if hyp_count == 0: hyp_count = len(re.findall(r"hypothesis", hypotheses_text, re.IGNORECASE)) hyp_count = max(1, hyp_count) return { "topic": topic, "hypotheses_checked": hyp_count, "search_queries": queries, "similar_papers_found": len(similar_papers), "novelty_score": novelty_score, "assessment": assessment, "similar_papers": similar_papers[:20], # cap output size "recommendation": recommendation, "similarity_threshold": similarity_threshold, "search_coverage": search_coverage, "total_papers_retrieved": total_papers_retrieved, "generated": datetime.now(timezone.utc).isoformat(timespec="seconds"), } def _build_novelty_queries(topic: str, hypotheses_text: str) -> list[str]: """Build targeted search queries from topic and hypotheses.""" queries = [topic] # Extract hypothesis titles (## H1, ## H2, etc.) 
for match in re.finditer(r"^##\s+H\d+[:\s]*(.+)", hypotheses_text, re.MULTILINE): hyp_title = match.group(1).strip() if hyp_title and len(hyp_title) > 10: queries.append(hyp_title[:200]) # Extract key phrases from the hypotheses keywords = _extract_keywords(hypotheses_text)[:10] if keywords: # Build a query from top keywords kw_query = " ".join(keywords[:5]) if kw_query not in queries: queries.append(kw_query) return queries[:5] # Cap at 5 queries def _assess_novelty( similar_papers: list[dict[str, Any]], threshold: float, ) -> tuple[float, str]: """Compute overall novelty score and assessment. Returns (score, assessment) where score is 0-1 (higher = more novel) and assessment is 'high' | 'moderate' | 'low' | 'critical'. """ if not similar_papers: return 1.0, "high" # Take top-5 most similar top = similar_papers[:5] max_sim = max(p["similarity"] for p in top) avg_sim = sum(p["similarity"] for p in top) / len(top) # High-citation papers with high similarity are more concerning high_cite_overlap = sum( 1 for p in top if p["similarity"] >= 0.4 and p.get("citation_count", 0) >= 50 ) # Novelty score: inverse of max similarity, adjusted raw_score = 1.0 - max_sim if high_cite_overlap >= 2: raw_score *= 0.7 # penalty for multiple high-impact overlaps novelty_score = round(max(0.0, min(1.0, raw_score)), 3) # Assessment thresholds if novelty_score >= 0.7: assessment = "high" elif novelty_score >= 0.45: assessment = "moderate" elif novelty_score >= 0.25: assessment = "low" else: assessment = "critical" return novelty_score, assessment ================================================ FILE: researchclaw/literature/openalex_client.py ================================================ """OpenAlex API client. Uses stdlib ``urllib`` + ``json`` — zero extra dependencies. 
Public API
----------
- ``search_openalex(query, limit, year_min)`` → ``list[Paper]``

Rate limits (with polite pool email):
- List/filter: 10,000/day
- Full-text search: 1,000/day

OpenAlex provides generous rate limits and indexes arXiv, PubMed,
CrossRef, and many other sources — making it an excellent primary
search backend that reduces pressure on arXiv and Semantic Scholar.
"""

from __future__ import annotations

import json
import logging
import random
import re
import threading
import time
import urllib.error
import urllib.parse
import urllib.request
from typing import Any

from researchclaw.literature.models import Author, Paper

logger = logging.getLogger(__name__)

_BASE_URL = "https://api.openalex.org/works"
_POLITE_EMAIL = "researchclaw@users.noreply.github.com"
_MAX_PER_REQUEST = 50
_MAX_RETRIES = 3
_MAX_WAIT_SEC = 60
_TIMEOUT_SEC = 20
_RATE_LIMIT_SEC = 0.2  # OpenAlex is generous; 200ms is more than enough

# Last request timestamp for rate limiting (guarded by _rate_lock)
_last_request_time: float = 0.0
_rate_lock = threading.Lock()


def search_openalex(
    query: str,
    *,
    limit: int = 20,
    year_min: int = 0,
    email: str = _POLITE_EMAIL,
) -> list[Paper]:
    """Search OpenAlex for papers matching *query*.

    Parameters
    ----------
    query:
        Free-text search query.
    limit:
        Maximum number of results (capped at 50).
    year_min:
        If >0, restrict to papers published in this year or later.
    email:
        Polite pool email for higher rate limits.

    Returns
    -------
    list[Paper]
        Parsed papers. Empty list on network failure.
""" global _last_request_time # noqa: PLW0603 # Rate limiting (locked to serialize concurrent callers) with _rate_lock: now = time.monotonic() elapsed = now - _last_request_time if elapsed < _RATE_LIMIT_SEC: time.sleep(_RATE_LIMIT_SEC - elapsed) _last_request_time = time.monotonic() limit = min(limit, _MAX_PER_REQUEST) # Build filter string filters = [] if year_min > 0: filters.append(f"from_publication_date:{year_min}-01-01") params: dict[str, str] = { "search": query, "per_page": str(limit), "mailto": email, "select": ( "id,title,authorships,publication_year,primary_location," "cited_by_count,doi,ids,abstract_inverted_index,type" ), } if filters: params["filter"] = ",".join(filters) url = f"{_BASE_URL}?{urllib.parse.urlencode(params)}" data = _request_with_retry(url, email) if data is None: return [] results = data.get("results", []) if not isinstance(results, list): return [] papers: list[Paper] = [] for item in results: try: papers.append(_parse_openalex_work(item)) except Exception: # noqa: BLE001 logger.debug("Failed to parse OpenAlex work: %s", item.get("id", "?")) return papers # ------------------------------------------------------------------ # Internal helpers # ------------------------------------------------------------------ def _request_with_retry( url: str, email: str, ) -> dict[str, Any] | None: """GET *url* with exponential back-off retries.""" for attempt in range(_MAX_RETRIES): try: req = urllib.request.Request( url, headers={ "Accept": "application/json", "User-Agent": f"ResearchClaw/1.0 (mailto:{email})", }, ) with urllib.request.urlopen(req, timeout=_TIMEOUT_SEC) as resp: body = resp.read().decode("utf-8") return json.loads(body) except urllib.error.HTTPError as exc: if exc.code == 429: retry_after = exc.headers.get("Retry-After") if exc.headers else None if retry_after: try: wait = float(retry_after) except (ValueError, TypeError): wait = 2 ** (attempt + 1) else: wait = 2 ** (attempt + 1) # BUG-22: If Retry-After is absurdly long (>300s), 
skip immediately if wait > 300: logger.warning( "[rate-limit] OpenAlex Retry-After=%s (>300s). " "Skipping request instead of waiting.", retry_after, ) return None wait = min(wait, _MAX_WAIT_SEC) jitter = random.uniform(0, wait * 0.2) logger.warning( "[rate-limit] OpenAlex 429 (Retry-After: %s). " "Waiting %.1fs (attempt %d/%d)...", retry_after or "none", wait + jitter, attempt + 1, _MAX_RETRIES, ) time.sleep(wait + jitter) continue if exc.code in (500, 502, 503, 504): wait = 2 ** attempt jitter = random.uniform(0, wait * 0.2) logger.warning( "OpenAlex HTTP %d. Retry %d/%d in %.0fs...", exc.code, attempt + 1, _MAX_RETRIES, wait + jitter, ) time.sleep(wait + jitter) continue logger.warning("OpenAlex HTTP %d for %s", exc.code, url) return None except (urllib.error.URLError, OSError, json.JSONDecodeError) as exc: wait = min(2**attempt, _MAX_WAIT_SEC) jitter = random.uniform(0, wait * 0.2) logger.warning( "OpenAlex request failed (%s). Retry %d/%d in %ds...", exc, attempt + 1, _MAX_RETRIES, wait, ) time.sleep(wait + jitter) logger.error("OpenAlex request exhausted retries for: %s", url) return None def _reconstruct_abstract(inverted_index: dict[str, list[int]] | None) -> str: """Reconstruct abstract from OpenAlex inverted index format.""" if not inverted_index or not isinstance(inverted_index, dict): return "" # Build word -> position mapping words: list[tuple[int, str]] = [] for word, positions in inverted_index.items(): for pos in positions: words.append((pos, word)) words.sort(key=lambda x: x[0]) return " ".join(w for _, w in words) def _parse_openalex_work(item: dict[str, Any]) -> Paper: """Convert a single OpenAlex work JSON to a ``Paper``.""" # Title title = str(item.get("title") or "").strip() title = re.sub(r"\s+", " ", title) # Authors authorships = item.get("authorships") or [] authors = tuple( Author( name=str(a.get("author", {}).get("display_name", "Unknown")), affiliation=str( (a.get("institutions") or [{}])[0].get("display_name", "") if 
a.get("institutions") else "" ), ) for a in authorships if isinstance(a, dict) ) # Year year = int(item.get("publication_year") or 0) # Abstract (inverted index format) abstract = _reconstruct_abstract(item.get("abstract_inverted_index")) # Venue from primary_location primary_loc = item.get("primary_location") or {} source_info = primary_loc.get("source") or {} venue = str(source_info.get("display_name") or "").strip() # BUG-33: arXiv category codes (e.g. cs.LG, stat.ML) are not proper venue names if venue and re.match(r"^[a-z]{2,}\.[A-Z]{2}$", venue): venue = "" # Citation count citation_count = int(item.get("cited_by_count") or 0) # DOI raw_doi = str(item.get("doi") or "").strip() doi = raw_doi.replace("https://doi.org/", "").replace("http://doi.org/", "") # IDs ids = item.get("ids") or {} openalex_id = str(ids.get("openalex") or item.get("id") or "").strip() # arXiv ID from ids or DOI arxiv_id = "" raw_arxiv = str(ids.get("arxiv") or "").strip() if raw_arxiv: # Extract numeric ID from URLs like https://arxiv.org/abs/2301.00001 m = re.search(r"(\d{4}\.\d{4,5})", raw_arxiv) if m: arxiv_id = m.group(1) # URL url = "" if arxiv_id: url = f"https://arxiv.org/abs/{arxiv_id}" elif doi: url = f"https://doi.org/{doi}" elif openalex_id: url = openalex_id # Paper ID paper_id = f"oalex-{openalex_id.split('/')[-1]}" if openalex_id else f"oalex-{title[:20]}" return Paper( paper_id=paper_id, title=title, authors=authors, year=year, abstract=abstract, venue=venue, citation_count=citation_count, doi=doi, arxiv_id=arxiv_id, url=url, source="openalex", ) ================================================ FILE: researchclaw/literature/search.py ================================================ """Unified literature search with deduplication. Combines results from OpenAlex, Semantic Scholar, and arXiv, deduplicates by DOI → arXiv ID → fuzzy title match, and returns a merged list sorted by citation count (descending). 
Source priority: OpenAlex (most generous limits) → Semantic Scholar → arXiv. If any source hits rate limits, remaining sources compensate automatically. Public API ---------- - ``search_papers(query, limit, sources, year_min, deduplicate)`` → ``list[Paper]`` """ from __future__ import annotations from collections.abc import Callable, Sequence from dataclasses import asdict import importlib import logging import re import time import urllib.error from typing import cast from researchclaw.literature.arxiv_client import search_arxiv from researchclaw.literature.models import Author, Paper from researchclaw.literature.openalex_client import search_openalex from researchclaw.literature.semantic_scholar import search_semantic_scholar logger = logging.getLogger(__name__) # OpenAlex first (10K/day), then S2 (1K/5min), then arXiv (1/3s) — least # pressure on the most restrictive API. _DEFAULT_SOURCES = ("openalex", "semantic_scholar", "arxiv") CacheGet = Callable[[str, str, int], list[dict[str, object]] | None] CachePut = Callable[[str, str, int, list[dict[str, object]]], None] def _cache_api() -> tuple[CacheGet, CachePut]: cache_mod = importlib.import_module("researchclaw.literature.cache") return cast(CacheGet, cache_mod.get_cached), cast(CachePut, cache_mod.put_cache) def _papers_to_dicts(papers: list[Paper]) -> list[dict[str, object]]: """Convert papers to serializable dicts for caching.""" return [asdict(p) for p in papers] def _as_int(value: object, default: int = 0) -> int: if isinstance(value, int): return value if isinstance(value, float): return int(value) if isinstance(value, str): try: return int(value) except ValueError: return default return default def _dicts_to_papers(dicts: list[dict[str, object]]) -> list[Paper]: """Reconstruct Paper objects from cached dicts.""" papers: list[Paper] = [] for d in dicts: try: authors_raw = d.get("authors", ()) if not isinstance(authors_raw, list): authors_raw = [] authors = tuple( Author( name=str(cast(dict[str, object], 
a).get("name", "")), affiliation=str(cast(dict[str, object], a).get("affiliation", "")), ) for a in authors_raw if isinstance(a, dict) ) paper_id = cast(str, d["paper_id"]) title = cast(str, d["title"]) papers.append( Paper( paper_id=paper_id, title=title, authors=authors, year=_as_int(d.get("year", 0), 0), abstract=str(d.get("abstract", "")), venue=str(d.get("venue", "")), citation_count=_as_int(d.get("citation_count", 0), 0), doi=str(d.get("doi", "")), arxiv_id=str(d.get("arxiv_id", "")), url=str(d.get("url", "")), source=str(d.get("source", "")), ) ) except (KeyError, TypeError, ValueError): continue return papers def search_papers( query: str, *, limit: int = 20, sources: Sequence[str] = _DEFAULT_SOURCES, year_min: int = 0, deduplicate: bool = True, s2_api_key: str = "", ) -> list[Paper]: """Search multiple academic sources and return deduplicated results. Parameters ---------- query: Free-text search query. limit: Maximum results *per source*. sources: Which backends to query. Default: both S2 and arXiv. year_min: If >0, pass to backends that support year filtering. deduplicate: Whether to remove duplicates across sources. s2_api_key: Optional Semantic Scholar API key. Returns ------- list[Paper] Merged results, sorted by citation_count descending. 
""" all_papers: list[Paper] = [] cache_get: CacheGet cache_put: CachePut cache_get, cache_put = _cache_api() source_stats: dict[str, int] = {} # track per-source counts cache_hits = 0 for src in sources: src_lower = src.lower().replace("-", "_").replace(" ", "_") cache_source = ( "semantic_scholar" if src_lower in ("semantic_scholar", "s2") else src_lower ) try: if src_lower == "openalex": papers = search_openalex( query, limit=limit, year_min=year_min, ) all_papers.extend(papers) cache_put(query, "openalex", limit, _papers_to_dicts(papers)) source_stats["openalex"] = len(papers) logger.info( "OpenAlex returned %d papers for %r", len(papers), query ) time.sleep(0.5) elif src_lower in ("semantic_scholar", "s2"): papers = search_semantic_scholar( query, limit=limit, year_min=year_min, api_key=s2_api_key, ) all_papers.extend(papers) cache_put(query, "semantic_scholar", limit, _papers_to_dicts(papers)) source_stats["semantic_scholar"] = len(papers) logger.info( "Semantic Scholar returned %d papers for %r", len(papers), query ) # Rate-limit gap before next source time.sleep(1.0) elif src_lower == "arxiv": papers = search_arxiv(query, limit=limit, year_min=year_min) all_papers.extend(papers) cache_put(query, "arxiv", limit, _papers_to_dicts(papers)) source_stats["arxiv"] = len(papers) logger.info("arXiv returned %d papers for %r", len(papers), query) else: logger.warning("Unknown literature source: %s (skipped)", src) except ( OSError, RuntimeError, TypeError, ValueError, urllib.error.HTTPError, urllib.error.URLError, ): logger.warning( "[rate-limit] Source %s failed for %r — trying cache", src, query ) cached = cache_get(query, cache_source, limit) if cached: papers = _dicts_to_papers(cached) all_papers.extend(papers) cache_hits += len(papers) logger.info( "[cache] HIT: %d papers for %s/%r", len(papers), src, query ) else: logger.warning( "No cache available for %s/%r — skipping", src, query ) # Summary log total = len(all_papers) parts = [f"{src}: {n}" for src, n in 
source_stats.items()] if cache_hits: parts.append(f"cache: {cache_hits}") logger.info( "[literature] Found %d papers (%s) for %r", total, ", ".join(parts) if parts else "none", query, ) if deduplicate: all_papers = _deduplicate(all_papers) # Sort by citation count descending, then year descending all_papers.sort(key=lambda p: (p.citation_count, p.year), reverse=True) return all_papers def search_papers_multi_query( queries: list[str], *, limit_per_query: int = 20, sources: Sequence[str] = _DEFAULT_SOURCES, year_min: int = 0, s2_api_key: str = "", inter_query_delay: float = 1.5, ) -> list[Paper]: """Run multiple queries and return deduplicated union. Adds a delay between queries to respect rate limits. """ all_papers: list[Paper] = [] for i, q in enumerate(queries): if i > 0: time.sleep(inter_query_delay) results = search_papers( q, limit=limit_per_query, sources=sources, year_min=year_min, s2_api_key=s2_api_key, deduplicate=False, # we dedup globally below ) all_papers.extend(results) logger.info("Query %d/%d %r → %d papers", i + 1, len(queries), q, len(results)) deduped = _deduplicate(all_papers) deduped.sort(key=lambda p: (p.citation_count, p.year), reverse=True) return deduped # ------------------------------------------------------------------ # Deduplication # ------------------------------------------------------------------ def _normalise_title(title: str) -> str: """Lower-case, strip punctuation, collapse whitespace.""" t = title.lower() t = re.sub(r"[^a-z0-9\s]", "", t) return re.sub(r"\s+", " ", t).strip() def _deduplicate(papers: list[Paper]) -> list[Paper]: """Remove duplicates. Priority: DOI > arXiv ID > fuzzy title. When a duplicate is found, the entry with higher citation_count wins (i.e. Semantic Scholar data is preferred over arXiv-only data). 
    """
    # Identifier -> index into *result*, one lookup table per key type.
    # All three tables point at the same result list, so replacements must
    # keep them consistent (see _replace_at).
    seen_doi: dict[str, int] = {}
    seen_arxiv: dict[str, int] = {}
    seen_title: dict[str, int] = {}
    result: list[Paper] = []

    def _update_indices(p: Paper, idx: int) -> None:
        """Register all identifiers of *p* in the lookup dicts at *idx*."""
        if p.doi:
            seen_doi[p.doi.lower().strip()] = idx
        if p.arxiv_id:
            seen_arxiv[p.arxiv_id.strip()] = idx
        norm = _normalise_title(p.title)
        if norm:
            seen_title[norm] = idx

    def _replace_at(old: Paper, new: Paper, idx: int) -> None:
        """Replace paper at *idx* and clean up stale index entries."""
        # Remove old identifiers that the new paper does NOT share
        # (guarding on `== idx` so we never delete an entry that some
        # other result slot legitimately owns).
        if old.doi:
            old_doi = old.doi.lower().strip()
            new_doi = new.doi.lower().strip() if new.doi else ""
            if old_doi != new_doi and seen_doi.get(old_doi) == idx:
                del seen_doi[old_doi]
        if old.arxiv_id:
            old_ax = old.arxiv_id.strip()
            new_ax = new.arxiv_id.strip() if new.arxiv_id else ""
            if old_ax != new_ax and seen_arxiv.get(old_ax) == idx:
                del seen_arxiv[old_ax]
        old_norm = _normalise_title(old.title)
        new_norm = _normalise_title(new.title)
        if old_norm and old_norm != new_norm and seen_title.get(old_norm) == idx:
            del seen_title[old_norm]
        result[idx] = new
        _update_indices(new, idx)

    for paper in papers:
        is_dup = False
        # Check DOI (strongest identifier, checked first)
        if paper.doi:
            doi_key = paper.doi.lower().strip()
            if doi_key in seen_doi:
                idx = seen_doi[doi_key]
                # Higher citation count wins; the loser is dropped.
                if paper.citation_count > result[idx].citation_count:
                    _replace_at(result[idx], paper, idx)
                is_dup = True
        # Check arXiv ID
        if not is_dup and paper.arxiv_id:
            ax_key = paper.arxiv_id.strip()
            if ax_key in seen_arxiv:
                idx = seen_arxiv[ax_key]
                if paper.citation_count > result[idx].citation_count:
                    _replace_at(result[idx], paper, idx)
                is_dup = True
        # Check fuzzy title (normalised exact match, weakest identifier)
        if not is_dup:
            norm = _normalise_title(paper.title)
            if norm and norm in seen_title:
                idx = seen_title[norm]
                if paper.citation_count > result[idx].citation_count:
                    _replace_at(result[idx], paper, idx)
                is_dup = True
        if is_dup:
            continue
        # Not a duplicate — store indices and append
        new_idx = len(result)
        _update_indices(paper, new_idx)
        result.append(paper)
    return result


def papers_to_bibtex(papers: Sequence[Paper]) -> str:
    """Generate a combined BibTeX file from a list of papers."""
    entries = [p.to_bibtex() for p in papers]
    return "\n\n".join(entries) + "\n"


================================================
FILE: researchclaw/literature/semantic_scholar.py
================================================
"""Semantic Scholar API client.

Uses only stdlib ``urllib`` — zero extra dependencies.

Public API
----------
- ``search_semantic_scholar(query, limit, year_min)`` → ``list[Paper]``

Rate limit: 1 req/s (free, no API key). Retries up to 3 times with
exponential back-off on transient failures.

Circuit breaker has three states:

    CLOSED    → normal operation
    OPEN      → skip all requests, auto-recover after cooldown
    HALF_OPEN → try one probe request, success→CLOSED, fail→OPEN (doubled cooldown)
"""

from __future__ import annotations

import json
import logging
import random
import threading
import time
import urllib.error
import urllib.parse
import urllib.request
from typing import Any

from researchclaw.literature.models import Author, Paper

logger = logging.getLogger(__name__)

_BASE_URL = "https://api.semanticscholar.org/graph/v1/paper/search"
_FIELDS = "paperId,title,abstract,year,venue,citationCount,authors,externalIds,url"
_MAX_PER_REQUEST = 100
_RATE_LIMIT_SEC = 1.5  # conservative spacing between requests
_MAX_RETRIES = 3
_MAX_WAIT_SEC = 60
_TIMEOUT_SEC = 30

# ---------------------------------------------------------------------------
# Three-state circuit breaker
# ---------------------------------------------------------------------------
_CB_THRESHOLD = 3  # consecutive 429s to trip
_CB_INITIAL_COOLDOWN = 120  # seconds before first HALF_OPEN probe
_CB_MAX_COOLDOWN = 600  # cap cooldown at 10 minutes

# States
_CB_CLOSED = "closed"
_CB_OPEN = "open"
_CB_HALF_OPEN = "half_open"

# Module-level breaker state; every access goes through _cb_lock.
_cb_state: str = _CB_CLOSED
_cb_consecutive_429s: int = 0
_cb_cooldown_sec: float = _CB_INITIAL_COOLDOWN
_cb_open_since: float = 0.0  # monotonic timestamp when breaker opened
_cb_trip_count: int = 0  # total number of trips in this process
_cb_lock = threading.Lock()


def _reset_circuit_breaker() -> None:
    """Reset circuit breaker state (for tests)."""
    global _cb_state, _cb_consecutive_429s, _cb_cooldown_sec  # noqa: PLW0603
    global _cb_open_since, _cb_trip_count  # noqa: PLW0603
    with _cb_lock:
        _cb_state = _CB_CLOSED
        _cb_consecutive_429s = 0
        _cb_cooldown_sec = _CB_INITIAL_COOLDOWN
        _cb_open_since = 0.0
        _cb_trip_count = 0


def _cb_should_allow() -> bool:
    """Check if circuit breaker allows a request.

    Side effect: transitions OPEN → HALF_OPEN once the cooldown elapses,
    allowing exactly the probe request through.
    """
    global _cb_state  # noqa: PLW0603
    with _cb_lock:
        if _cb_state == _CB_CLOSED:
            return True
        if _cb_state == _CB_OPEN:
            elapsed = time.monotonic() - _cb_open_since
            if elapsed >= _cb_cooldown_sec:
                _cb_state = _CB_HALF_OPEN
                logger.info(
                    "S2 circuit breaker → HALF_OPEN after %.0fs cooldown. "
                    "Trying one probe request...",
                    elapsed,
                )
                return True
            return False
        # HALF_OPEN: allow the probe
        return True


def _cb_on_success() -> None:
    """Record a successful request.

    Any success closes the breaker and restores the initial cooldown.
    """
    global _cb_state, _cb_consecutive_429s, _cb_cooldown_sec  # noqa: PLW0603
    with _cb_lock:
        _cb_consecutive_429s = 0
        if _cb_state != _CB_CLOSED:
            logger.info("S2 circuit breaker → CLOSED (request succeeded)")
        _cb_state = _CB_CLOSED
        _cb_cooldown_sec = _CB_INITIAL_COOLDOWN  # reset cooldown


def _cb_on_429() -> bool:
    """Record a 429 response. Returns True if breaker is now OPEN."""
    global _cb_state, _cb_consecutive_429s, _cb_cooldown_sec  # noqa: PLW0603
    global _cb_open_since, _cb_trip_count  # noqa: PLW0603
    with _cb_lock:
        _cb_consecutive_429s += 1
        if _cb_state == _CB_HALF_OPEN:
            # Probe failed — back to OPEN with doubled cooldown
            _cb_cooldown_sec = min(_cb_cooldown_sec * 2, _CB_MAX_COOLDOWN)
            _cb_state = _CB_OPEN
            _cb_open_since = time.monotonic()
            _cb_trip_count += 1
            logger.warning(
                "S2 circuit breaker → OPEN (probe failed). "
                "Next cooldown: %.0fs (trip #%d)",
                _cb_cooldown_sec,
                _cb_trip_count,
            )
            return True
        if _cb_consecutive_429s >= _CB_THRESHOLD:
            # CLOSED → OPEN after the threshold of consecutive 429s.
            _cb_state = _CB_OPEN
            _cb_open_since = time.monotonic()
            _cb_trip_count += 1
            logger.warning(
                "S2 circuit breaker TRIPPED after %d consecutive 429s. "
                "Cooldown: %.0fs (trip #%d). arXiv still active.",
                _cb_consecutive_429s,
                _cb_cooldown_sec,
                _cb_trip_count,
            )
            return True
        return False


# Last request timestamp for rate limiting (guarded by _rate_lock).
_last_request_time: float = 0.0
_rate_lock = threading.Lock()


def search_semantic_scholar(
    query: str,
    *,
    limit: int = 20,
    year_min: int = 0,
    api_key: str = "",
) -> list[Paper]:
    """Search Semantic Scholar for papers matching *query*.

    Parameters
    ----------
    query:
        Free-text search query.
    limit:
        Maximum number of results (capped at 100 per API constraint).
    year_min:
        If >0, restrict to papers published in this year or later.
    api_key:
        Optional S2 API key (raises rate limit to 10 req/s).

    Returns
    -------
    list[Paper]
        Parsed papers. Empty list on network failure.
""" global _last_request_time # noqa: PLW0603 # Rate limiting: locked to serialize concurrent callers with _rate_lock: now = time.monotonic() rate_limit = 0.3 if api_key else _RATE_LIMIT_SEC elapsed_since_last = now - _last_request_time if elapsed_since_last < rate_limit: time.sleep(rate_limit - elapsed_since_last) _last_request_time = time.monotonic() limit = min(limit, _MAX_PER_REQUEST) params: dict[str, str] = { "query": query, "limit": str(limit), "fields": _FIELDS, } if year_min > 0: params["year"] = f"{year_min}-" url = f"{_BASE_URL}?{urllib.parse.urlencode(params)}" headers: dict[str, str] = {"Accept": "application/json"} if api_key: headers["x-api-key"] = api_key data = _request_with_retry(url, headers) if data is None: return [] raw_papers = data.get("data", []) if not isinstance(raw_papers, list): return [] papers: list[Paper] = [] for item in raw_papers: try: papers.append(_parse_s2_paper(item)) except Exception: # noqa: BLE001 logger.debug("Failed to parse S2 paper entry: %s", item) return papers # ------------------------------------------------------------------ # Internal helpers # ------------------------------------------------------------------ def _request_with_retry( url: str, headers: dict[str, str], ) -> dict[str, Any] | None: """GET *url* with exponential back-off retries.""" if not _cb_should_allow(): return None for attempt in range(_MAX_RETRIES): try: req = urllib.request.Request(url, headers=headers) with urllib.request.urlopen(req, timeout=_TIMEOUT_SEC) as resp: body = resp.read().decode("utf-8") _cb_on_success() return json.loads(body) except urllib.error.HTTPError as exc: if exc.code == 429: if _cb_on_429(): return None # breaker tripped delay = min(2 ** (attempt + 1), _MAX_WAIT_SEC) jitter = random.uniform(0, delay * 0.3) wait = delay + jitter logger.warning( "S2 rate-limited (429). 
Waiting %.1fs (attempt %d/%d)...", wait, attempt + 1, _MAX_RETRIES, ) time.sleep(wait) continue logger.warning("S2 HTTP %d for %s", exc.code, url) return None except (urllib.error.URLError, OSError, json.JSONDecodeError) as exc: wait = min(2**attempt, _MAX_WAIT_SEC) jitter = random.uniform(0, wait * 0.2) logger.warning( "S2 request failed (%s). Retry %d/%d in %ds \u2026", exc, attempt + 1, _MAX_RETRIES, wait, ) time.sleep(wait + jitter) logger.error("S2 request exhausted retries for: %s", url) return None _BATCH_URL = "https://api.semanticscholar.org/graph/v1/paper/batch" _BATCH_MAX = 500 # S2 batch endpoint max def batch_fetch_papers( paper_ids: list[str], *, api_key: str = "", fields: str = _FIELDS, ) -> list[Paper]: """Batch fetch paper details via POST /graph/v1/paper/batch. Accepts S2 paper IDs, arXiv IDs (prefixed ``ARXIV:``), or DOIs. Returns parsed papers; silently skips papers that fail to resolve. """ if not paper_ids: return [] if not _cb_should_allow(): return [] global _last_request_time # noqa: PLW0603 rate = 0.3 if api_key else _RATE_LIMIT_SEC with _rate_lock: now = time.monotonic() elapsed = now - _last_request_time if elapsed < rate: time.sleep(rate - elapsed) _last_request_time = time.monotonic() all_papers: list[Paper] = [] # Process in chunks of _BATCH_MAX for i in range(0, len(paper_ids), _BATCH_MAX): chunk = paper_ids[i : i + _BATCH_MAX] url = f"{_BATCH_URL}?fields={fields}" headers: dict[str, str] = { "Accept": "application/json", "Content-Type": "application/json", } if api_key: headers["x-api-key"] = api_key body = json.dumps({"ids": chunk}).encode("utf-8") result = _post_with_retry(url, headers, body) with _rate_lock: _last_request_time = time.monotonic() if result is None: continue for item in result: if item is None: continue # unresolved ID try: all_papers.append(_parse_s2_paper(item)) except Exception: # noqa: BLE001 logger.debug("Failed to parse batch S2 paper entry") # Delay between chunks (sleep outside lock to avoid contention) if 
i + _BATCH_MAX < len(paper_ids): time.sleep(rate) with _rate_lock: _last_request_time = time.monotonic() return all_papers def _post_with_retry( url: str, headers: dict[str, str], body: bytes, ) -> list[dict[str, Any]] | None: """POST *url* with exponential back-off retries.""" if not _cb_should_allow(): return None for attempt in range(_MAX_RETRIES): try: req = urllib.request.Request(url, data=body, headers=headers, method="POST") with urllib.request.urlopen(req, timeout=_TIMEOUT_SEC) as resp: data = json.loads(resp.read().decode("utf-8")) _cb_on_success() return data if isinstance(data, list) else None except urllib.error.HTTPError as exc: if exc.code == 429: if _cb_on_429(): return None delay = min(2 ** (attempt + 1), _MAX_WAIT_SEC) jitter = random.uniform(0, delay * 0.3) logger.warning( "S2 batch rate-limited (429). Waiting %.1fs (attempt %d/%d)...", delay + jitter, attempt + 1, _MAX_RETRIES, ) time.sleep(delay + jitter) continue logger.warning("S2 batch HTTP %d", exc.code) return None except (urllib.error.URLError, OSError, json.JSONDecodeError) as exc: wait = min(2**attempt, _MAX_WAIT_SEC) jitter = random.uniform(0, wait * 0.2) logger.warning( "S2 batch request failed (%s). 
Retry %d/%d in %ds…", exc, attempt + 1, _MAX_RETRIES, wait, ) time.sleep(wait + jitter) logger.error("S2 batch request exhausted retries") return None def _parse_s2_paper(item: dict[str, Any]) -> Paper: """Convert a single Semantic Scholar JSON entry to a ``Paper``.""" ext_ids = item.get("externalIds") or {} authors_raw = item.get("authors") or [] authors = tuple( Author(name=a.get("name", "Unknown")) for a in authors_raw if isinstance(a, dict) ) return Paper( paper_id=f"s2-{item.get('paperId', '')}", title=str(item.get("title", "")).strip(), authors=authors, year=int(item.get("year") or 0), abstract=str(item.get("abstract") or "").strip(), venue=str(item.get("venue") or "").strip(), citation_count=int(item.get("citationCount") or 0), doi=str(ext_ids.get("DOI") or "").strip(), arxiv_id=str(ext_ids.get("ArXiv") or "").strip(), url=str(item.get("url") or "").strip(), source="semantic_scholar", ) ================================================ FILE: researchclaw/literature/trends.py ================================================ """Literature trend analysis — analyze trends from search results.""" from __future__ import annotations import logging from typing import Any logger = logging.getLogger(__name__) class LiteratureTrendAnalyzer: """Analyze trends from literature search results.""" def __init__(self, search_client: Any = None): self.client = search_client def get_daily_papers( self, domains: list[str], max_papers: int = 20, ) -> list[dict[str, Any]]: """Get today's most relevant papers via literature search.""" if self.client is None: return [] try: from researchclaw.literature.search import search_papers query = " OR ".join(domains) if domains else "machine learning" papers = search_papers(query, limit=max_papers) return [ { "title": p.title, "authors": [a.name for a in p.authors], "abstract": p.abstract or "", "url": p.url or "", "year": p.year, "citation_count": p.citation_count, "source": p.source, } for p in papers ] except Exception as exc: 
            logger.warning("Literature trend fetch failed: %s", exc)
            return []

    def analyze_keyword_trends(
        self,
        domains: list[str],
        window_days: int = 30,
    ) -> dict[str, Any]:
        """Analyze keyword frequency trends.

        Fetches today's papers for *domains* and delegates keyword analysis
        to ``TrendAnalyzer`` (imported lazily to avoid an import cycle).
        """
        papers = self.get_daily_papers(domains)
        if not papers:
            return {"keywords": [], "total_papers": 0}

        from researchclaw.trends.trend_analyzer import TrendAnalyzer

        analyzer = TrendAnalyzer()
        analysis = analyzer.analyze(papers, window_days)
        return {
            "keywords": analysis.get("rising_keywords", []),
            "total_papers": len(papers),
            "methods": analysis.get("method_trends", []),
        }

    def find_emerging_topics(
        self,
        domains: list[str],
    ) -> list[dict[str, Any]]:
        """Discover emerging research directions.

        Returns up to 10 bigram keywords that appear at least 3 times in
        the latest batch of papers.
        """
        papers = self.get_daily_papers(domains, max_papers=50)
        if not papers:
            return []

        from researchclaw.trends.trend_analyzer import TrendAnalyzer

        analyzer = TrendAnalyzer()
        analysis = analyzer.analyze(papers)
        keywords = analysis.get("rising_keywords", [])
        # Emerging topics = high-frequency bigrams
        emerging = [
            {
                "topic": kw["keyword"],
                "frequency": kw["count"],
                "type": kw.get("type", "unigram"),
            }
            for kw in keywords
            if kw.get("type") == "bigram" and kw["count"] >= 3
        ]
        return emerging[:10]


================================================
FILE: researchclaw/literature/verify.py
================================================
"""Citation verification engine — detect hallucinated references.

Verifies each BibTeX entry against real academic APIs using a three-layer
strategy:

L1: **arXiv ID lookup** — direct ``id_list`` query to arXiv API
L2: **DOI resolution** — HTTP GET to CrossRef ``/works/{doi}``
L3: **Title search** — search Semantic Scholar + arXiv by title

Classifications:

- ``VERIFIED``: API confirms existence + title similarity ≥ 0.80
- ``SUSPICIOUS``: Found a paper but metadata diverges (0.50 ≤ sim < 0.80)
- ``HALLUCINATED``: Not found via any API or sim < 0.50
- ``SKIPPED``: Entry cannot be verified (no title, or all APIs unreachable)

All network I/O uses stdlib ``urllib`` — zero extra pip dependencies.
"""

from __future__ import annotations

import json
import logging
import re
import time
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from dataclasses import dataclass, field
from enum import Enum
from typing import Sequence

from researchclaw.literature.models import Author, Paper

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Public enums & data classes
# ---------------------------------------------------------------------------


class VerifyStatus(str, Enum):
    """Verification outcome for a single citation."""

    VERIFIED = "verified"
    SUSPICIOUS = "suspicious"
    HALLUCINATED = "hallucinated"
    SKIPPED = "skipped"


@dataclass
class CitationResult:
    """Verification result for one BibTeX entry."""

    cite_key: str
    title: str
    status: VerifyStatus
    confidence: float  # 0.0–1.0
    method: str  # "arxiv_id" | "doi" | "title_search" | "skipped"
    details: str = ""
    matched_paper: Paper | None = None
    relevance_score: float | None = None  # 0.0–1.0, set by LLM relevance check

    def to_dict(self) -> dict[str, object]:
        # Serialize for JSON reports; optional fields are included only
        # when populated.
        d: dict[str, object] = {
            "cite_key": self.cite_key,
            "title": self.title,
            "status": self.status.value,
            "confidence": round(self.confidence, 3),
            "method": self.method,
            "details": self.details,
        }
        if self.relevance_score is not None:
            d["relevance_score"] = round(self.relevance_score, 2)
        if self.matched_paper:
            d["matched_paper"] = {
                "title": self.matched_paper.title,
                "authors": [a.name for a in self.matched_paper.authors],
                "year": self.matched_paper.year,
                "source": self.matched_paper.source,
            }
        return d


@dataclass
class VerificationReport:
    """Aggregate report for all citations in a paper."""

    total: int = 0
    verified: int = 0
    suspicious: int = 0
    hallucinated: int = 0
    skipped: int = 0
    results: list[CitationResult] = field(default_factory=list)

    @property
    def integrity_score(self) -> float:
        """Fraction of verifiable citations that are verified (0.0–1.0)."""
        verifiable = self.total - self.skipped
        if verifiable <= 0:
            # Nothing was verifiable — treat as a perfect score.
            return 1.0
        return round(self.verified / verifiable, 3)

    def to_dict(self) -> dict[str, object]:
        return {
            "summary": {
                "total": self.total,
                "verified": self.verified,
                "suspicious": self.suspicious,
                "hallucinated": self.hallucinated,
                "skipped": self.skipped,
                "integrity_score": self.integrity_score,
            },
            "results": [r.to_dict() for r in self.results],
        }


# ---------------------------------------------------------------------------
# BibTeX parsing
# ---------------------------------------------------------------------------

# Groups: (1) entry type, (2) cite key, (3) field body. The lookahead stops
# each match at the next "@" or end of input.
_ENTRY_RE = re.compile(
    r"@(\w+)\s*\{\s*([^,\s]+)\s*,\s*(.*?)\s*\}(?=\s*(?:@|\Z))",
    re.DOTALL,
)
# Matches `name = {value}`; the value pattern tolerates braces nested up to
# two levels deep (common in BibTeX titles).
_FIELD_RE = re.compile(
    r"(\w+)\s*=\s*\{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*)\}",
    re.DOTALL,
)


def parse_bibtex_entries(bib_text: str) -> list[dict[str, str]]:
    """Parse BibTeX text into a list of field dicts.

    Each dict contains at least ``key`` and ``type``, plus any parsed
    fields (``title``, ``author``, ``year``, ``doi``, ``eprint``,
    ``url``, …).
    """
    entries: list[dict[str, str]] = []
    for m in _ENTRY_RE.finditer(bib_text):
        entry: dict[str, str] = {
            "type": m.group(1).lower(),
            "key": m.group(2).strip(),
        }
        body = m.group(3)
        for fm in _FIELD_RE.finditer(body):
            entry[fm.group(1).lower()] = fm.group(2).strip()
        entries.append(entry)
    return entries


# ---------------------------------------------------------------------------
# Title similarity
# ---------------------------------------------------------------------------


def title_similarity(a: str, b: str) -> float:
    """Word-overlap Jaccard-ish similarity between two titles.

    Returns 0.0–1.0. Uses max(len) as denominator so short titles
    don't inflate the score.
    """

    def _words(t: str) -> set[str]:
        # Lower-case, drop punctuation, split into a word set.
        return set(re.sub(r"[^a-z0-9\s]", "", t.lower()).split()) - {""}

    wa, wb = _words(a), _words(b)
    if not wa or not wb:
        return 0.0
    return len(wa & wb) / max(len(wa), len(wb))


# ---------------------------------------------------------------------------
# L1: arXiv ID verification
# ---------------------------------------------------------------------------

_ARXIV_API = "https://export.arxiv.org/api/query"
_ARXIV_NS = {"atom": "http://www.w3.org/2005/Atom"}
_ARXIV_TIMEOUT = 20


def verify_by_arxiv_id(arxiv_id: str, expected_title: str) -> CitationResult | None:
    """Look up a paper by arXiv ID and compare titles.

    Returns *None* on network failure so that the caller can fall
    through to the next verification layer.
    """
    # arXiv ID lookup uses id_list, not search_query
    params = urllib.parse.urlencode({"id_list": arxiv_id, "max_results": "1"})
    url = f"{_ARXIV_API}?{params}"
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "ResearchClaw/0.1"})
        with urllib.request.urlopen(req, timeout=_ARXIV_TIMEOUT) as resp:
            data = resp.read().decode("utf-8")
    except Exception as exc:
        logger.debug("arXiv ID verification failed for %s: %s", arxiv_id, exc)
        return None

    try:
        root = ET.fromstring(data)
    except ET.ParseError:
        # Unparseable feed — treat like a network failure (fall through).
        return None

    entries = root.findall("atom:entry", _ARXIV_NS)
    if not entries:
        return CitationResult(
            cite_key="",
            title=expected_title,
            status=VerifyStatus.HALLUCINATED,
            confidence=0.9,
            method="arxiv_id",
            details=f"arXiv ID {arxiv_id} not found in arXiv",
        )

    # arXiv returns an "error" entry when ID is invalid
    entry = entries[0]
    found_title_el = entry.find("atom:title", _ARXIV_NS)
    found_title = (
        (found_title_el.text or "").strip() if found_title_el is not None else ""
    )
    found_title = re.sub(r"\s+", " ", found_title)

    # Check for arXiv error responses (they return entry with id but title "Error")
    entry_id = entry.findtext("atom:id", "", _ARXIV_NS)
    if "api/errors" in entry_id or not found_title or found_title.lower() == "error":
        return CitationResult(
            cite_key="",
            title=expected_title,
            status=VerifyStatus.HALLUCINATED,
            confidence=0.9,
            method="arxiv_id",
            details=f"arXiv ID {arxiv_id} returned error or empty response",
        )

    # Classify by title similarity: >=0.80 verified, >=0.50 suspicious,
    # otherwise suspicious with a "mismatch" detail message.
    sim = title_similarity(expected_title, found_title)
    if sim >= 0.80:
        return CitationResult(
            cite_key="",
            title=expected_title,
            status=VerifyStatus.VERIFIED,
            confidence=sim,
            method="arxiv_id",
            details=f"Confirmed via arXiv: '{found_title}'",
        )
    elif sim >= 0.50:
        return CitationResult(
            cite_key="",
            title=expected_title,
            status=VerifyStatus.SUSPICIOUS,
            confidence=sim,
            method="arxiv_id",
            details=f"arXiv ID exists but title differs (sim={sim:.2f}): '{found_title}'",
        )
    else:
        return CitationResult(
            cite_key="",
            title=expected_title,
            status=VerifyStatus.SUSPICIOUS,
            confidence=sim,
            method="arxiv_id",
            details=f"arXiv ID exists but title mismatch (sim={sim:.2f}): '{found_title}'",
        )


# ---------------------------------------------------------------------------
# L2: DOI verification via CrossRef
# ---------------------------------------------------------------------------

_CROSSREF_API = "https://api.crossref.org/works"
_CROSSREF_TIMEOUT = 20
_DATACITE_API = "https://api.datacite.org/dois"
_DATACITE_TIMEOUT = 15


def _verify_doi_datacite(doi: str, expected_title: str) -> CitationResult | None:
    """Fallback DOI verification via DataCite API.

    arXiv DOIs (10.48550/arXiv.*) are registered with DataCite, not
    CrossRef. Returns *None* on network failure.
    """
    encoded_doi = urllib.parse.quote(doi, safe="")
    url = f"{_DATACITE_API}/{encoded_doi}"
    try:
        req = urllib.request.Request(
            url,
            headers={
                "User-Agent": "ResearchClaw/0.1",
                "Accept": "application/json",
            },
        )
        with urllib.request.urlopen(req, timeout=_DATACITE_TIMEOUT) as resp:
            body = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        if exc.code == 404:
            # 404 from DataCite (after a CrossRef miss) = DOI does not exist.
            return CitationResult(
                cite_key="",
                title=expected_title,
                status=VerifyStatus.HALLUCINATED,
                confidence=0.9,
                method="doi",
                details=f"DOI {doi} not found via CrossRef or DataCite",
            )
        logger.debug("DataCite HTTP error for DOI %s: %s", doi, exc)
        return None
    except Exception as exc:
        logger.debug("DataCite verification failed for %s: %s", doi, exc)
        return None

    # Extract title from DataCite response
    attrs = body.get("data", {}).get("attributes", {})
    dc_titles = attrs.get("titles", [])
    found_title = dc_titles[0].get("title", "") if dc_titles else ""

    if not found_title:
        # DOI exists in DataCite but no title — still counts as verified
        return CitationResult(
            cite_key="",
            title=expected_title,
            status=VerifyStatus.VERIFIED,
            confidence=0.85,
            method="doi",
            details=f"DOI {doi} resolves via DataCite (no title comparison)",
        )

    # Same similarity thresholds as the arXiv layer.
    sim = title_similarity(expected_title, found_title)
    if sim >= 0.80:
        return CitationResult(
            cite_key="",
title=expected_title, status=VerifyStatus.VERIFIED, confidence=sim, method="doi", details=f"Confirmed via DataCite: '{found_title}'", ) elif sim >= 0.50: return CitationResult( cite_key="", title=expected_title, status=VerifyStatus.SUSPICIOUS, confidence=sim, method="doi", details=f"DataCite DOI resolves but title differs (sim={sim:.2f}): '{found_title}'", ) else: return CitationResult( cite_key="", title=expected_title, status=VerifyStatus.SUSPICIOUS, confidence=sim, method="doi", details=f"DataCite DOI resolves but title mismatch (sim={sim:.2f}): '{found_title}'", ) def verify_by_doi(doi: str, expected_title: str) -> CitationResult | None: """Verify a DOI via CrossRef API, with DataCite fallback for arXiv DOIs. Returns *None* on network failure. """ encoded_doi = urllib.parse.quote(doi, safe="") url = f"{_CROSSREF_API}/{encoded_doi}" try: req = urllib.request.Request( url, headers={ "User-Agent": "ResearchClaw/0.1 (mailto:researchclaw@example.com)", "Accept": "application/json", }, ) with urllib.request.urlopen(req, timeout=_CROSSREF_TIMEOUT) as resp: body = json.loads(resp.read().decode("utf-8")) except urllib.error.HTTPError as exc: if exc.code == 404: # CrossRef 404 — try DataCite for arXiv/DataCite DOIs if doi.startswith("10.48550/") or doi.startswith("10.5281/"): dc_result = _verify_doi_datacite(doi, expected_title) if dc_result is not None: return dc_result return CitationResult( cite_key="", title=expected_title, status=VerifyStatus.HALLUCINATED, confidence=0.9, method="doi", details=f"DOI {doi} not found (HTTP 404)", ) logger.debug("CrossRef HTTP error for DOI %s: %s", doi, exc) return None except Exception as exc: logger.debug("DOI verification failed for %s: %s", doi, exc) return None # Extract title from CrossRef response message = body.get("message", {}) titles = message.get("title", []) found_title = titles[0] if titles else "" if not found_title: # DOI exists but no title in response — still counts as verified return CitationResult( cite_key="", 
title=expected_title, status=VerifyStatus.VERIFIED, confidence=0.85, method="doi", details=f"DOI {doi} resolves via CrossRef (no title comparison)", ) sim = title_similarity(expected_title, found_title) if sim >= 0.80: return CitationResult( cite_key="", title=expected_title, status=VerifyStatus.VERIFIED, confidence=sim, method="doi", details=f"Confirmed via CrossRef: '{found_title}'", ) elif sim >= 0.50: return CitationResult( cite_key="", title=expected_title, status=VerifyStatus.SUSPICIOUS, confidence=sim, method="doi", details=f"DOI resolves but title differs (sim={sim:.2f}): '{found_title}'", ) else: # DOI exists but title is very different — the DOI may be real # but the BibTeX entry may have wrong metadata return CitationResult( cite_key="", title=expected_title, status=VerifyStatus.SUSPICIOUS, confidence=sim, method="doi", details=f"DOI resolves but title mismatch (sim={sim:.2f}): '{found_title}'", ) # --------------------------------------------------------------------------- # L3-alt: OpenAlex title search (primary L3 source — higher rate limits) # --------------------------------------------------------------------------- _OPENALEX_API = "https://api.openalex.org/works" _OPENALEX_TIMEOUT = 15 _OPENALEX_EMAIL = "researchclaw@users.noreply.github.com" def verify_by_openalex(title: str) -> CitationResult | None: """Verify a paper via OpenAlex API (10K+ calls/day vs S2's ~1 req/s). Returns *None* only on network failure (allows fallthrough to S2). 
""" params = urllib.parse.urlencode({ "filter": "title.search:" + title.replace(",", " ").replace(":", " "), "per_page": "5", "mailto": _OPENALEX_EMAIL, }) url = f"{_OPENALEX_API}?{params}" try: req = urllib.request.Request( url, headers={ "User-Agent": f"ResearchClaw/0.1 (mailto:{_OPENALEX_EMAIL})", "Accept": "application/json", }, ) with urllib.request.urlopen(req, timeout=_OPENALEX_TIMEOUT) as resp: body = json.loads(resp.read().decode("utf-8")) except Exception as exc: logger.debug("OpenAlex search failed for %r: %s", title, exc) return None results = body.get("results", []) if not results: return CitationResult( cite_key="", title=title, status=VerifyStatus.HALLUCINATED, confidence=0.7, method="openalex", details="No results found via OpenAlex", ) best_sim = 0.0 best_result = None for r in results: found_title = r.get("title", "") if found_title: sim = title_similarity(title, found_title) if sim > best_sim: best_sim = sim best_result = r if best_sim >= 0.80: return CitationResult( cite_key="", title=title, status=VerifyStatus.VERIFIED, confidence=best_sim, method="openalex", details=f"Confirmed via OpenAlex: '{best_result.get('title', '')}'", ) elif best_sim >= 0.50: return CitationResult( cite_key="", title=title, status=VerifyStatus.SUSPICIOUS, confidence=best_sim, method="openalex", details=f"Partial match via OpenAlex (sim={best_sim:.2f}): '{best_result.get('title', '')}'", ) else: return CitationResult( cite_key="", title=title, status=VerifyStatus.HALLUCINATED, confidence=0.7, method="openalex", details="No close match found via OpenAlex", ) # --------------------------------------------------------------------------- # Verification result cache (avoids re-verifying known papers) # --------------------------------------------------------------------------- import hashlib from pathlib import Path _CACHE_DIR = Path.home() / ".cache" / "researchclaw" / "citation_verify" def _cache_key(title: str) -> str: return 
hashlib.sha256(title.lower().strip().encode()).hexdigest()[:16] def _read_cache(title: str) -> CitationResult | None: _CACHE_DIR.mkdir(parents=True, exist_ok=True) cache_file = _CACHE_DIR / f"{_cache_key(title)}.json" if cache_file.exists(): try: data = json.loads(cache_file.read_text(encoding="utf-8")) return CitationResult( cite_key=data.get("cite_key", ""), title=data.get("title", title), status=VerifyStatus(data["status"]), confidence=data["confidence"], method=data["method"], details=data.get("details", ""), ) except (json.JSONDecodeError, KeyError, ValueError): return None return None def _write_cache(title: str, result: CitationResult) -> None: _CACHE_DIR.mkdir(parents=True, exist_ok=True) cache_file = _CACHE_DIR / f"{_cache_key(title)}.json" cache_file.write_text( json.dumps(result.to_dict(), indent=2), encoding="utf-8", ) # --------------------------------------------------------------------------- # L3: Title search via Semantic Scholar + arXiv # --------------------------------------------------------------------------- def verify_by_title_search( title: str, *, s2_api_key: str = "", ) -> CitationResult | None: """Search for a paper by title and verify its existence. Uses the unified ``search_papers`` function from our literature module. Returns *None* only on total network failure. 
""" from researchclaw.literature.search import search_papers try: results = search_papers( title, limit=5, s2_api_key=s2_api_key, deduplicate=True, ) except Exception as exc: logger.debug("Title search failed for %r: %s", title, exc) return None if not results: return CitationResult( cite_key="", title=title, status=VerifyStatus.HALLUCINATED, confidence=0.7, method="title_search", details="No results found via Semantic Scholar + arXiv", ) # Find best title match best_sim = 0.0 best_paper: Paper | None = None for paper in results: sim = title_similarity(title, paper.title) if sim > best_sim: best_sim = sim best_paper = paper if best_sim >= 0.80: return CitationResult( cite_key="", title=title, status=VerifyStatus.VERIFIED, confidence=best_sim, method="title_search", details=f"Found via search: '{best_paper.title}'" if best_paper else "", matched_paper=best_paper, ) elif best_sim >= 0.50: return CitationResult( cite_key="", title=title, status=VerifyStatus.SUSPICIOUS, confidence=best_sim, method="title_search", details=( f"Partial match (sim={best_sim:.2f}): '{best_paper.title}'" if best_paper else "" ), matched_paper=best_paper, ) else: return CitationResult( cite_key="", title=title, status=VerifyStatus.HALLUCINATED, confidence=1.0 - best_sim, method="title_search", details=( f"Best match too weak (sim={best_sim:.2f}): '{best_paper.title}'" if best_paper else "No match found" ), ) # --------------------------------------------------------------------------- # Main entry point # --------------------------------------------------------------------------- def verify_citations( bib_text: str, *, s2_api_key: str = "", inter_verify_delay: float = 1.5, ) -> VerificationReport: """Verify all BibTeX entries against real academic APIs. Three-layer verification: 1. If entry has ``eprint`` (arXiv ID) → arXiv API lookup 2. If entry has ``doi`` → CrossRef API lookup 3. Otherwise → title search via Semantic Scholar + arXiv Parameters ---------- bib_text: Raw BibTeX string. 
s2_api_key: Optional Semantic Scholar API key for L3 title search. inter_verify_delay: Seconds to wait between API calls (rate limiting). """ entries = parse_bibtex_entries(bib_text) report = VerificationReport(total=len(entries)) # Adaptive delays: OpenAlex/CrossRef can be queried much faster than arXiv _DELAY_ARXIV = inter_verify_delay # arXiv: conservative (1.5s default) _DELAY_CROSSREF = 0.3 # CrossRef: 50 req/s polite pool _DELAY_OPENALEX = 0.2 # OpenAlex: 10K/day api_call_count = 0 # BUG-22: Global timeout — stop verifying after 5 minutes total _verify_start = time.monotonic() _VERIFY_TIMEOUT_SEC = 300 # 5 minutes for i, entry in enumerate(entries): # BUG-22: Check global timeout — mark remaining as SKIPPED if time.monotonic() - _verify_start > _VERIFY_TIMEOUT_SEC: logger.warning( "Verification timeout (%.0fs). Marking remaining %d/%d " "citations as SKIPPED.", _VERIFY_TIMEOUT_SEC, len(entries) - i, len(entries), ) for remaining_entry in entries[i:]: _rkey = remaining_entry.get("key", f"unknown_{i}") _rtitle = remaining_entry.get("title", "") report.results.append(CitationResult( cite_key=_rkey, title=_rtitle, status=VerifyStatus.SKIPPED, confidence=0.0, method="skipped", details="Verification timeout exceeded", )) report.skipped += 1 break key = entry.get("key", f"unknown_{i}") title = entry.get("title", "") arxiv_id = entry.get("eprint", "") doi = entry.get("doi", "") # Skip entries with no title if not title: result = CitationResult( cite_key=key, title="", status=VerifyStatus.SKIPPED, confidence=0.0, method="skipped", details="No title in BibTeX entry", ) report.skipped += 1 report.results.append(result) continue # Check cache first cached = _read_cache(title) if cached is not None: cached.cite_key = key report.results.append(cached) if cached.status == VerifyStatus.VERIFIED: report.verified += 1 elif cached.status == VerifyStatus.SUSPICIOUS: report.suspicious += 1 elif cached.status == VerifyStatus.HALLUCINATED: report.hallucinated += 1 else: 
report.skipped += 1 logger.debug("[cache] verify HIT [%s] %r → %s", key, title[:50], cached.status.value) continue result: CitationResult | None = None # Verification order optimized to minimize arXiv API pressure: # DOI → CrossRef (fast, high limit) # > OpenAlex title search (10K/day) # > arXiv ID lookup (only if others fail, 1/3s) # > S2 title search (last resort) # L2 first: DOI verification via CrossRef (fast, generous limits) if result is None and doi: if api_call_count > 0: time.sleep(_DELAY_CROSSREF) result = verify_by_doi(doi, title) api_call_count += 1 if result is not None: logger.info( "L2 DOI [%s] %s → %s (%.2f)", key, doi, result.status.value, result.confidence, ) # L3a: OpenAlex title search (high rate limits, good coverage) if result is None: if api_call_count > 0: time.sleep(_DELAY_OPENALEX) result = verify_by_openalex(title) api_call_count += 1 if result is not None: logger.info( "L3a OpenAlex [%s] %r → %s (%.2f)", key, title[:50], result.status.value, result.confidence, ) # L1: arXiv ID — only if DOI and OpenAlex both failed if result is None and arxiv_id: if api_call_count > 0: time.sleep(_DELAY_ARXIV) result = verify_by_arxiv_id(arxiv_id, title) api_call_count += 1 if result is not None: logger.info( "L1 arXiv ID [%s] %s → %s (%.2f)", key, arxiv_id, result.status.value, result.confidence, ) # L3b: S2 title search — last resort fallback if result is None: result = verify_by_title_search(title, s2_api_key=s2_api_key) api_call_count += 1 if result is not None: logger.info( "L3b S2 [%s] %r → %s (%.2f)", key, title[:50], result.status.value, result.confidence, ) # Fallback: all layers failed (network issues) if result is None: result = CitationResult( cite_key=key, title=title, status=VerifyStatus.SKIPPED, confidence=0.0, method="skipped", details="All verification methods failed (network error?)", ) result = CitationResult( cite_key=key, title=result.title, status=result.status, confidence=result.confidence, method=result.method, 
details=result.details, matched_paper=result.matched_paper, ) # Cache the result (skip SKIPPED — network failures shouldn't be cached) if result.status != VerifyStatus.SKIPPED: _write_cache(title, result) if result.status == VerifyStatus.VERIFIED: report.verified += 1 elif result.status == VerifyStatus.SUSPICIOUS: report.suspicious += 1 elif result.status == VerifyStatus.HALLUCINATED: report.hallucinated += 1 else: report.skipped += 1 report.results.append(result) return report # --------------------------------------------------------------------------- # Post-processing helpers # --------------------------------------------------------------------------- def filter_verified_bibtex( bib_text: str, report: VerificationReport, *, include_suspicious: bool = True, ) -> str: """Return a cleaned BibTeX string with only verified entries. Parameters ---------- bib_text: Original BibTeX string. report: Verification report from ``verify_citations()``. include_suspicious: If True, keep SUSPICIOUS entries. If False, only keep VERIFIED. """ # Build set of keys to keep keep_keys: set[str] = set() for r in report.results: if r.status == VerifyStatus.VERIFIED: keep_keys.add(r.cite_key) elif r.status == VerifyStatus.SUSPICIOUS and include_suspicious: keep_keys.add(r.cite_key) elif r.status == VerifyStatus.SKIPPED: keep_keys.add(r.cite_key) # keep unverifiable entries # Rebuild BibTeX keeping only entries whose keys are in keep_keys kept: list[str] = [] for m in _ENTRY_RE.finditer(bib_text): key = m.group(2).strip() if key in keep_keys: kept.append(m.group(0)) return "\n\n".join(kept) + "\n" if kept else "" def annotate_paper_hallucinations( paper_text: str, report: VerificationReport, ) -> str: """Remove hallucinated citations from paper text. - HALLUCINATED citations: removed from text (recorded in verification report) - SUSPICIOUS/VERIFIED/SKIPPED: left as-is Works with both ``\\cite{key}`` (LaTeX) and ``[key]`` (Markdown) formats. 
""" hallucinated_keys: set[str] = set() for r in report.results: if r.status == VerifyStatus.HALLUCINATED: hallucinated_keys.add(r.cite_key) if not hallucinated_keys: return paper_text result = paper_text # Handle \cite{key1, key2} format — remove only hallucinated keys def _replace_latex(m: re.Match[str]) -> str: keys = [k.strip() for k in m.group(1).split(",")] kept = [k for k in keys if k not in hallucinated_keys] if not kept: return "" # All keys hallucinated — remove entire cite return "\\cite{" + ", ".join(kept) + "}" result = re.sub(r"\\cite\{([^}]+)\}", _replace_latex, result) # Handle [key1, key2] and [key1; key2] format (Markdown multi-key) _CITE_KEY_PAT = r"[a-zA-Z]+\d{4}[a-zA-Z]*" def _replace_markdown_multi(m: re.Match[str]) -> str: keys = [k.strip() for k in re.split(r"[,;]\s*", m.group(1))] kept = [k for k in keys if k not in hallucinated_keys] if not kept: return "" return "[" + ", ".join(kept) + "]" result = re.sub( rf"\[({_CITE_KEY_PAT}(?:\s*[,;]\s*{_CITE_KEY_PAT})*)\]", _replace_markdown_multi, result, ) # Clean up artifacts: double spaces, empty parenthetical citations, orphan punctuation result = re.sub(r" {2,}", " ", result) result = re.sub(r"\(\s*\)", "", result) result = re.sub(r"\[\s*\]", "", result) return result ================================================ FILE: researchclaw/llm/__init__.py ================================================ """LLM integration — OpenAI-compatible and ACP agent clients.""" from __future__ import annotations from typing import TYPE_CHECKING, Union if TYPE_CHECKING: from researchclaw.config import RCConfig from researchclaw.llm.acp_client import ACPClient from researchclaw.llm.client import LLMClient # Provider presets for common LLM services PROVIDER_PRESETS = { "openai": { "base_url": "https://api.openai.com/v1", }, "openrouter": { "base_url": "https://openrouter.ai/api/v1", }, "deepseek": { "base_url": "https://api.deepseek.com/v1", }, "anthropic": { "base_url": "https://api.anthropic.com", }, 
"kimi-anthropic": { "base_url": "https://api.kimi.com/coding/", }, "novita": { "base_url": "https://api.novita.ai/openai", }, "minimax": { "base_url": "https://api.minimax.io/v1", }, "openai-compatible": { "base_url": None, # Use user-provided base_url }, } def create_llm_client(config: RCConfig) -> LLMClient | ACPClient: """Factory: return the right LLM client based on ``config.llm.provider``. - ``"acp"`` → :class:`ACPClient` (spawns an ACP-compatible agent) - ``"anthropic"`` → :class:`LLMClient` with Anthropic Messages API adapter - ``"kimi-anthropic"`` → :class:`LLMClient` with Kimi Coding Anthropic adapter - ``"openrouter"`` → :class:`LLMClient` with OpenRouter base URL - ``"openai"`` → :class:`LLMClient` with OpenAI base URL - ``"deepseek"`` → :class:`LLMClient` with DeepSeek base URL - ``"novita"`` → :class:`LLMClient` with Novita AI base URL - ``"minimax"`` → :class:`LLMClient` with MiniMax base URL - ``"openai-compatible"`` (default) → :class:`LLMClient` with custom base_url OpenRouter is fully compatible with the OpenAI API format, making it a drop-in replacement with access to 200+ models from Anthropic, Google, Meta, Mistral, and more. See: https://openrouter.ai/models """ if config.llm.provider == "acp": from researchclaw.llm.acp_client import ACPClient as _ACP return _ACP.from_rc_config(config) from researchclaw.llm.client import LLMClient as _LLM # Use from_rc_config to properly initialize adapters (e.g., Anthropic) return _LLM.from_rc_config(config) ================================================ FILE: researchclaw/llm/acp_client.py ================================================ """ACP (Agent Client Protocol) LLM client via acpx. Uses acpx as the ACP bridge to communicate with any ACP-compatible agent (Claude Code, Codex, Gemini CLI, etc.) via persistent named sessions. Key advantage: a single persistent session maintains context across all 23 pipeline stages — the agent remembers everything. 
""" from __future__ import annotations import atexit import logging import os import re import shutil import subprocess import sys import tempfile import weakref from dataclasses import dataclass from typing import Any from researchclaw.llm.client import LLMResponse logger = logging.getLogger(__name__) # acpx output markers _DONE_RE = re.compile(r"^\[done\]") _CLIENT_RE = re.compile(r"^\[client\]") _ACPX_RE = re.compile(r"^\[acpx\]") _TOOL_RE = re.compile(r"^\[tool\]") @dataclass class ACPConfig: """Configuration for ACP agent connection.""" agent: str = "claude" cwd: str = "." acpx_command: str = "" # auto-detect if empty session_name: str = "researchclaw" timeout_sec: int = 1800 # per-prompt timeout def _find_acpx() -> str | None: """Find the acpx binary — check PATH, then OpenClaw's plugin directory.""" found = shutil.which("acpx") if found: return found # Check OpenClaw's bundled acpx plugin openclaw_acpx = os.path.expanduser( "~/.openclaw/extensions/acpx/node_modules/.bin/acpx" ) if os.path.isfile(openclaw_acpx) and os.access(openclaw_acpx, os.X_OK): return openclaw_acpx return None class ACPClient: """LLM client that uses acpx to communicate with ACP agents. Spawns persistent named sessions via acpx, reusing them across ``.chat()`` calls so the agent maintains context across the full 23-stage pipeline. 
""" # Track live instances for atexit cleanup (weak refs to avoid preventing GC) _live_instances: list[weakref.ref[ACPClient]] = [] def __init__(self, acp_config: ACPConfig) -> None: self.config = acp_config self._acpx: str | None = acp_config.acpx_command or None self._session_ready = False # Register for atexit cleanup to prevent zombie acpx processes ACPClient._live_instances.append(weakref.ref(self)) atexit.register(ACPClient._atexit_cleanup) @classmethod def from_rc_config(cls, rc_config: Any) -> ACPClient: """Build from a ResearchClaw ``RCConfig``.""" acp = rc_config.llm.acp return cls(ACPConfig( agent=acp.agent, cwd=acp.cwd, acpx_command=getattr(acp, "acpx_command", ""), session_name=getattr(acp, "session_name", "researchclaw"), timeout_sec=getattr(acp, "timeout_sec", 1800), )) # ------------------------------------------------------------------ # Public interface (matches LLMClient) # ------------------------------------------------------------------ def chat( self, messages: list[dict[str, str]], *, model: str | None = None, max_tokens: int | None = None, temperature: float | None = None, json_mode: bool = False, system: str | None = None, strip_thinking: bool = False, ) -> LLMResponse: """Send a prompt and return the agent's response. Parameters mirror ``LLMClient.chat()`` for drop-in compatibility. ``model``, ``max_tokens``, ``temperature``, and ``json_mode`` are accepted but not forwarded — the agent manages its own model and parameters. """ prompt_text = self._messages_to_prompt(messages, system=system) content = self._send_prompt(prompt_text) if strip_thinking: from researchclaw.utils.thinking_tags import strip_thinking_tags content = strip_thinking_tags(content) return LLMResponse( content=content, model=f"acp:{self.config.agent}", finish_reason="stop", ) def preflight(self) -> tuple[bool, str]: """Check that acpx and the agent are available.""" acpx = self._resolve_acpx() if not acpx: return False, ( "acpx not found. 
Install it: npm install -g acpx " "or set llm.acp.acpx_command in config." ) # Check the agent binary exists agent = self.config.agent if not shutil.which(agent): return False, f"ACP agent CLI not found: {agent!r} (not on PATH)" # Create the session try: self._ensure_session() return True, f"OK - ACP session ready ({agent} via acpx)" except Exception as exc: # noqa: BLE001 return False, f"ACP session init failed: {exc}" def close(self) -> None: """Close the acpx session.""" if not self._session_ready: return acpx = self._resolve_acpx() if not acpx: return try: subprocess.run( [acpx, "--ttl", "0", "--cwd", self._abs_cwd(), self.config.agent, "sessions", "close", self.config.session_name], capture_output=True, timeout=15, ) except Exception: # noqa: BLE001 pass self._session_ready = False def __del__(self) -> None: """Best-effort cleanup on garbage collection.""" try: self.close() except Exception: # noqa: BLE001 pass @classmethod def _atexit_cleanup(cls) -> None: """Close all live ACP sessions on interpreter shutdown.""" for ref in cls._live_instances: inst = ref() if inst is not None: try: inst.close() except Exception: # noqa: BLE001 pass cls._live_instances.clear() # ------------------------------------------------------------------ # Internals # ------------------------------------------------------------------ def _resolve_acpx(self) -> str | None: """Resolve the acpx binary path (cached).""" if self._acpx: return self._acpx self._acpx = _find_acpx() return self._acpx def _abs_cwd(self) -> str: return os.path.abspath(self.config.cwd) def _ensure_session(self) -> None: """Find or create the named acpx session.""" if self._session_ready: return acpx = self._resolve_acpx() if not acpx: raise RuntimeError("acpx not found") # Use 'ensure' which finds existing or creates new result = subprocess.run( [acpx, "--ttl", "0", "--cwd", self._abs_cwd(), self.config.agent, "sessions", "ensure", "--name", self.config.session_name], capture_output=True, text=True, 
encoding="utf-8", errors="replace", timeout=30, ) if result.returncode != 0: # Fall back to 'new' result = subprocess.run( [acpx, "--ttl", "0", "--cwd", self._abs_cwd(), self.config.agent, "sessions", "new", "--name", self.config.session_name], capture_output=True, text=True, encoding="utf-8", errors="replace", timeout=30, ) if result.returncode != 0: raise RuntimeError( f"Failed to create ACP session: {result.stderr.strip()}" ) self._session_ready = True logger.info("ACP session '%s' ready (%s)", self.config.session_name, self.config.agent) # Linux MAX_ARG_STRLEN is 128 KB; Windows CreateProcess limit is ~32 KB. _MAX_CLI_PROMPT_BYTES = 30_000 if sys.platform == "win32" else 100_000 # Localized error snippets for "command line too long" (may be in any OS language) _CMD_TOO_LONG_HINTS = ( "too long", # English Windows "trop long", # French Windows "zu lang", # German Windows "demasiado larg", # Spanish Windows "e2big", # POSIX ) # Error patterns that indicate a dead/stale session (retryable) _RECONNECT_ERRORS = ( "agent needs reconnect", "session not found", "Query closed", ) _MAX_RECONNECT_ATTEMPTS = 2 def _send_prompt(self, prompt: str) -> str: """Send a prompt via acpx and return the response text. For large prompts that would exceed the OS argument-length limit (``E2BIG``), the prompt is written to a temp file and the agent is asked to read it. If the session has died (common after long-running stages), retries up to ``_MAX_RECONNECT_ATTEMPTS`` times with automatic reconnection. """ acpx = self._resolve_acpx() if not acpx: raise RuntimeError("acpx not found") prompt_bytes = len(prompt.encode("utf-8")) use_file = prompt_bytes > self._MAX_CLI_PROMPT_BYTES if use_file: logger.info( "Prompt too large for CLI arg (%d bytes). 
Using temp file.", prompt_bytes, ) last_exc: RuntimeError | None = None for attempt in range(1 + self._MAX_RECONNECT_ATTEMPTS): self._ensure_session() try: if use_file: return self._send_prompt_via_file(acpx, prompt) return self._send_prompt_cli(acpx, prompt) except OSError as os_exc: # OS-level failure (e.g., Windows CreateProcess arg limit). # Fall back to temp-file transport automatically. if not use_file: logger.warning( "CLI subprocess raised OSError, " "falling back to temp file: %s", os_exc, ) use_file = True return self._send_prompt_via_file(acpx, prompt) raise RuntimeError( f"ACP prompt failed: {os_exc}" ) from os_exc except RuntimeError as exc: # Detect localized "command line too long" from subprocess stderr exc_lower = str(exc).lower() if not use_file and any( h in exc_lower for h in self._CMD_TOO_LONG_HINTS ): logger.warning( "CLI prompt too long for OS, " "falling back to temp file: %s", exc, ) use_file = True return self._send_prompt_via_file(acpx, prompt) if not any(pat in str(exc) for pat in self._RECONNECT_ERRORS): raise last_exc = exc if attempt < self._MAX_RECONNECT_ATTEMPTS: logger.warning( "ACP session died (%s), reconnecting (attempt %d/%d)...", exc, attempt + 1, self._MAX_RECONNECT_ATTEMPTS, ) self._force_reconnect() raise last_exc # type: ignore[misc] def _force_reconnect(self) -> None: """Close the stale session and reset so _ensure_session creates a new one.""" try: self.close() except Exception: # noqa: BLE001 pass self._session_ready = False def _send_prompt_cli(self, acpx: str, prompt: str) -> str: """Send prompt as a CLI argument (original path).""" try: result = subprocess.run( [acpx, "--approve-all", "--ttl", "0", "--cwd", self._abs_cwd(), self.config.agent, "-s", self.config.session_name, prompt], capture_output=True, text=True, encoding="utf-8", errors="replace", timeout=self.config.timeout_sec, ) except subprocess.TimeoutExpired as exc: raise RuntimeError( f"ACP prompt timed out after {self.config.timeout_sec}s" ) from exc if 
result.returncode != 0: stderr = (result.stderr or "").strip() raise RuntimeError(f"ACP prompt failed (exit {result.returncode}): {stderr}") return self._extract_response(result.stdout) def _send_prompt_via_file(self, acpx: str, prompt: str) -> str: """Write prompt to a temp file, ask the agent to read and respond.""" fd, prompt_path = tempfile.mkstemp( suffix=".md", prefix="rc_prompt_", ) try: with os.fdopen(fd, "w", encoding="utf-8") as f: f.write(prompt) short_prompt = ( f"Read the file at {prompt_path} in its entirety. " f"Follow ALL instructions contained in that file and " f"respond exactly as requested. Do NOT summarize, " f"just produce the requested output." ) try: result = subprocess.run( [acpx, "--approve-all", "--ttl", "0", "--cwd", self._abs_cwd(), self.config.agent, "-s", self.config.session_name, short_prompt], capture_output=True, text=True, encoding="utf-8", errors="replace", timeout=self.config.timeout_sec, ) except subprocess.TimeoutExpired as exc: raise RuntimeError( f"ACP prompt timed out after {self.config.timeout_sec}s" ) from exc if result.returncode != 0: stderr = (result.stderr or "").strip() raise RuntimeError( f"ACP prompt failed (exit {result.returncode}): {stderr}" ) return self._extract_response(result.stdout) finally: try: os.unlink(prompt_path) except OSError: pass @staticmethod def _extract_response(raw_output: str | None) -> str: """Extract the agent's actual response from acpx output. Strips acpx metadata lines ([client], [acpx], [tool], [done]) and their continuation lines (indented or sub-field lines like ``input:``, ``output:``, ``files:``, ``kind:``). 
""" if not raw_output: return "" lines: list[str] = [] in_tool_block = False for line in raw_output.splitlines(): # Skip acpx control lines if _DONE_RE.match(line) or _CLIENT_RE.match(line) or _ACPX_RE.match(line): in_tool_block = False continue if _TOOL_RE.match(line): in_tool_block = True continue # Tool blocks have indented continuation lines if in_tool_block: if line.startswith(" ") or not line.strip(): continue # Non-indented, non-empty line = end of tool block in_tool_block = False # Skip empty lines at start if not lines and not line.strip(): continue lines.append(line) # Trim trailing empty lines while lines and not lines[-1].strip(): lines.pop() return "\n".join(lines) @staticmethod def _messages_to_prompt( messages: list[dict[str, str]], *, system: str | None = None, ) -> str: """Flatten a chat-messages list into a single text prompt. Preserves role labels so the agent can distinguish context. """ parts: list[str] = [] if system: parts.append(f"[System]\n{system}") for msg in messages: role = msg.get("role", "user") content = msg.get("content", "") if role == "system": parts.append(f"[System]\n{content}") elif role == "assistant": parts.append(f"[Previous Response]\n{content}") else: parts.append(content) return "\n\n".join(parts) ================================================ FILE: researchclaw/llm/anthropic_adapter.py ================================================ """Anthropic Messages API adapter for ResearchClaw.""" import json import logging import urllib.error from typing import Any try: import httpx HAS_HTTPX = True except ImportError: HAS_HTTPX = False logger = logging.getLogger(__name__) _JSON_MODE_INSTRUCTION = ( "You MUST respond with valid JSON only. " "Do not include any text outside the JSON object." 
) # Map Anthropic stop_reason → OpenAI finish_reason _STOP_REASON_MAP = { "end_turn": "stop", "max_tokens": "length", "stop_sequence": "stop", "tool_use": "tool_calls", } class AnthropicAdapter: """Adapter to call Anthropic Messages API and return OpenAI-compatible response.""" def __init__(self, base_url: str, api_key: str, timeout_sec: int = 300): if not HAS_HTTPX: raise ImportError( "httpx is required for Anthropic adapter. Install: pip install httpx" ) self.base_url = base_url.rstrip("/") self.api_key = api_key self.timeout_sec = timeout_sec self._client: httpx.Client | None = None def close(self) -> None: """BUG-DA8-09: Close the httpx connection pool to prevent fd leaks.""" if self._client is not None: try: self._client.close() except Exception: # noqa: BLE001 pass self._client = None def chat_completion( self, model: str, messages: list[dict[str, str]], max_tokens: int, temperature: float, json_mode: bool = False, ) -> dict[str, Any]: """Call Anthropic Messages API and return OpenAI-compatible response. Raises urllib.error.HTTPError on API errors so the upstream retry logic in LLMClient._call_with_retry works unchanged. 
""" # Extract and concatenate all system messages system_parts: list[str] = [] user_messages: list[dict[str, str]] = [] for msg in messages: if msg["role"] == "system": system_parts.append(msg["content"]) else: user_messages.append(msg) system_msg = "\n\n".join(system_parts) if system_parts else None # Merge consecutive messages with the same role (Anthropic # requires strict user/assistant alternation) merged: list[dict[str, str]] = [] for msg in user_messages: if merged and merged[-1]["role"] == msg["role"]: merged[-1] = { "role": msg["role"], "content": merged[-1]["content"] + "\n\n" + msg["content"], } else: merged.append(dict(msg)) user_messages = merged # Ensure at least one user message and that it starts with "user" if not user_messages: user_messages = [{"role": "user", "content": "Hello."}] elif user_messages[0]["role"] != "user": user_messages.insert(0, {"role": "user", "content": "Continue."}) # Prepend JSON instruction when json_mode is requested if json_mode: system_msg = ( f"{_JSON_MODE_INSTRUCTION}\n\n{system_msg}" if system_msg else _JSON_MODE_INSTRUCTION ) # BUG-DA8-05: Thinking-enabled Claude models require temperature=1.0 # and do not accept other temperature values. _THINKING_MODELS = ("claude-3-7", "claude-4") _is_thinking = any(model.startswith(p) for p in _THINKING_MODELS) if _is_thinking: temperature = 1.0 # Build Anthropic request body: dict[str, Any] = { "model": model, "messages": user_messages, "max_tokens": max_tokens, "temperature": temperature, } if system_msg: body["system"] = system_msg url = f"{self.base_url}/v1/messages" headers = { "x-api-key": self.api_key, "anthropic-version": "2023-06-01", "content-type": "application/json", } try: if self._client is None: self._client = httpx.Client(timeout=self.timeout_sec) response = self._client.post(url, headers=headers, json=body) response.raise_for_status() data = response.json() except httpx.HTTPStatusError as exc: # Convert to urllib.error.HTTPError for upstream retry logic. 
# Include Anthropic's error body so upstream logs show the # actual reason (e.g. "Prefilling not supported"). detail = "" try: detail = exc.response.text[:500] except Exception: # noqa: BLE001 pass msg = f"{exc}: {detail}" if detail else str(exc) raise urllib.error.HTTPError( url, exc.response.status_code, msg, dict(exc.response.headers), None, ) from exc except httpx.HTTPError as exc: # Catch all transport errors (ConnectError, TimeoutException, # ReadError, RemoteProtocolError, PoolTimeout, etc.) raise urllib.error.URLError(str(exc)) from exc # Check for Anthropic error responses if data.get("type") == "error" or "error" in data: error_info = data.get("error", {}) raise urllib.error.HTTPError( url, 500, f"{error_info.get('type', 'api_error')}: {error_info.get('message', str(data))}", {}, None, ) # Extract ALL text content blocks (not just the first) content = "" if "content" in data and data["content"]: text_parts = [ block.get("text", "") for block in data["content"] if block.get("type") == "text" ] content = "\n".join(text_parts) # Map Anthropic stop_reason to OpenAI finish_reason raw_stop_reason = data.get("stop_reason", "end_turn") finish_reason = _STOP_REASON_MAP.get(raw_stop_reason, "stop") return { "choices": [ { "message": {"role": "assistant", "content": content}, "finish_reason": finish_reason, } ], "usage": { "prompt_tokens": data.get("usage", {}).get("input_tokens", 0), "completion_tokens": data.get("usage", {}).get("output_tokens", 0), "total_tokens": ( data.get("usage", {}).get("input_tokens", 0) + data.get("usage", {}).get("output_tokens", 0) ), }, "model": data.get("model", model), } ================================================ FILE: researchclaw/llm/client.py ================================================ """Lightweight OpenAI-compatible LLM client — stdlib only. 
Features: - Model fallback chain (gpt-5.2 → gpt-5.1 → gpt-4.1 → gpt-4o) - Auto-detect max_tokens vs max_completion_tokens per model - Cloudflare User-Agent bypass - Exponential backoff retry with jitter - JSON mode support - Streaming disabled (sync only) """ from __future__ import annotations import json import logging import os import time import urllib.error import urllib.request from dataclasses import dataclass, field from typing import Any logger = logging.getLogger(__name__) # Models that require max_completion_tokens instead of max_tokens _NEW_PARAM_MODELS = frozenset( { "o3", "o3-mini", "o4-mini", "gpt-5", "gpt-5.1", "gpt-5.2", "gpt-5.4", } ) _DEFAULT_USER_AGENT = ( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36" ) @dataclass class LLMResponse: """Parsed response from the LLM API.""" content: str model: str prompt_tokens: int = 0 completion_tokens: int = 0 total_tokens: int = 0 finish_reason: str = "" truncated: bool = False raw: dict[str, Any] = field(default_factory=dict) @dataclass class LLMConfig: """Configuration for the LLM client.""" base_url: str api_key: str primary_model: str = "gpt-4o" fallback_models: list[str] = field( default_factory=lambda: ["gpt-4.1", "gpt-4o-mini"] ) max_tokens: int = 4096 temperature: float = 0.7 max_retries: int = 3 retry_base_delay: float = 2.0 timeout_sec: int = 300 user_agent: str = _DEFAULT_USER_AGENT # MetaClaw bridge: extra headers for proxy requests extra_headers: dict[str, str] = field(default_factory=dict) # MetaClaw bridge: fallback URL if primary (proxy) is unreachable fallback_url: str = "" fallback_api_key: str = "" class LLMClient: """Stateless OpenAI-compatible chat completion client.""" def __init__(self, config: LLMConfig) -> None: self.config = config self._model_chain = [config.primary_model] + list(config.fallback_models) self._anthropic = None # Will be set by from_rc_config if needed @classmethod def from_rc_config(cls, 
rc_config: Any) -> LLMClient: from researchclaw.llm import PROVIDER_PRESETS provider = getattr(rc_config.llm, "provider", "openai") preset = PROVIDER_PRESETS.get(provider, {}) preset_base_url = preset.get("base_url") api_key = str( rc_config.llm.api_key or os.environ.get(rc_config.llm.api_key_env, "") or "" ) # Use preset base_url if available and config doesn't override base_url = rc_config.llm.base_url or preset_base_url or "" # Preserve original URL/key before MetaClaw bridge override # (needed for Anthropic adapter which should always talk directly # to the Anthropic API, not through the OpenAI-compatible proxy). original_base_url = base_url original_api_key = api_key # MetaClaw bridge: if enabled, point to proxy and set up fallback bridge = getattr(rc_config, "metaclaw_bridge", None) fallback_url = "" fallback_api_key = "" if bridge and getattr(bridge, "enabled", False): fallback_url = base_url fallback_api_key = api_key base_url = bridge.proxy_url if bridge.fallback_url: fallback_url = bridge.fallback_url if bridge.fallback_api_key: fallback_api_key = bridge.fallback_api_key config = LLMConfig( base_url=base_url, api_key=api_key, primary_model=rc_config.llm.primary_model or "gpt-4o", fallback_models=list(rc_config.llm.fallback_models or []), fallback_url=fallback_url, fallback_api_key=fallback_api_key, ) client = cls(config) # Detect Anthropic or Kimi-Anthropic provider — use original URL/key (not the # MetaClaw proxy URL which is OpenAI-compatible only). if provider in ("anthropic", "kimi-anthropic"): from .anthropic_adapter import AnthropicAdapter client._anthropic = AnthropicAdapter( original_base_url, original_api_key, config.timeout_sec ) return client def chat( self, messages: list[dict[str, str]], *, model: str | None = None, max_tokens: int | None = None, temperature: float | None = None, json_mode: bool = False, system: str | None = None, strip_thinking: bool = False, ) -> LLMResponse: """Send a chat completion request with retry and fallback. 
Args: messages: List of {role, content} dicts. model: Override model (skips fallback chain). max_tokens: Override max token count. temperature: Override temperature. json_mode: Request JSON response format. system: Prepend a system message. strip_thinking: If True, strip reasoning tags from the response content. Use this when the output will be written to paper/script artifacts but NOT for general chat calls (to avoid corrupting legitimate content). Returns: LLMResponse with content and metadata. """ if system: messages = [{"role": "system", "content": system}] + messages models = [model] if model else self._model_chain max_tok = max_tokens or self.config.max_tokens temp = temperature if temperature is not None else self.config.temperature last_error: Exception | None = None for m in models: try: resp = self._call_with_retry(m, messages, max_tok, temp, json_mode) if strip_thinking: from researchclaw.utils.thinking_tags import strip_thinking_tags resp = LLMResponse( content=strip_thinking_tags(resp.content), model=resp.model, prompt_tokens=resp.prompt_tokens, completion_tokens=resp.completion_tokens, total_tokens=resp.total_tokens, finish_reason=resp.finish_reason, truncated=resp.truncated, raw=resp.raw, ) return resp except Exception as exc: # noqa: BLE001 logger.warning("Model %s failed: %s. Trying next.", m, exc) last_error = exc raise RuntimeError( f"All models failed. Last error: {last_error}" ) from last_error def preflight(self) -> tuple[bool, str]: """Quick connectivity check - one minimal chat call. Returns (success, message). Distinguishes: 401 (bad key), 403 (model forbidden), 404 (bad endpoint), 429 (rate limited), timeout. 
""" is_reasoning = any( self.config.primary_model.startswith(p) for p in _NEW_PARAM_MODELS ) min_tokens = 64 if is_reasoning else 1 try: _ = self.chat( [{"role": "user", "content": "ping"}], max_tokens=min_tokens, temperature=0, ) return True, f"OK - model {self.config.primary_model} responding" except urllib.error.HTTPError as e: status_map = { 401: "Invalid API key", 403: f"Model {self.config.primary_model} not allowed for this key", 404: f"Endpoint not found: {self.config.base_url}", 429: "Rate limited - try again in a moment", } msg = status_map.get(e.code, f"HTTP {e.code}") return False, msg except (urllib.error.URLError, OSError) as e: return False, f"Connection failed: {e}" except RuntimeError as e: # chat() wraps errors in RuntimeError; extract original HTTPError cause = e.__cause__ if isinstance(cause, urllib.error.HTTPError): status_map = { 401: "Invalid API key", 403: f"Model {self.config.primary_model} not allowed for this key", 404: f"Endpoint not found: {self.config.base_url}", 429: "Rate limited - try again in a moment", } msg = status_map.get(cause.code, f"HTTP {cause.code}") return False, msg return False, f"All models failed: {e}" def _call_with_retry( self, model: str, messages: list[dict[str, str]], max_tokens: int, temperature: float, json_mode: bool, ) -> LLMResponse: """Call with exponential backoff retry.""" for attempt in range(self.config.max_retries): try: return self._raw_call( model, messages, max_tokens, temperature, json_mode ) except urllib.error.HTTPError as e: status = e.code body = "" try: body = e.read().decode()[:500] except Exception: # noqa: BLE001 pass # Non-retryable errors if status == 403 and "not allowed to use model" in body: raise # Model not available — let fallback handle # 400 is normally non-retryable, but some providers # (Azure OpenAI) return 400 during overload / rate-limit. # Retry if the body hints at a transient issue. 
if status == 400: _transient_400 = any( kw in body.lower() for kw in ("rate limit", "ratelimit", "overloaded", "temporarily", "capacity", "throttl", "too many", "retry") ) if not _transient_400: raise # Genuine bad request — don't retry # Retryable: 429 (rate limit), transient 400, 500, 502, 503, 504, # 529 (Anthropic overloaded) if status in (400, 429, 500, 502, 503, 504, 529): delay = self.config.retry_base_delay * (2**attempt) # Add jitter import random delay += random.uniform(0, delay * 0.3) logger.info( "Retry %d/%d for %s (HTTP %d). Waiting %.1fs.", attempt + 1, self.config.max_retries, model, status, delay, ) time.sleep(delay) continue raise # Other HTTP errors except urllib.error.URLError: if attempt < self.config.max_retries - 1: delay = self.config.retry_base_delay * (2**attempt) time.sleep(delay) continue raise # All retries exhausted raise RuntimeError( f"LLM call failed after {self.config.max_retries} retries for model {model}" ) def _raw_call( self, model: str, messages: list[dict[str, str]], max_tokens: int, temperature: float, json_mode: bool, ) -> LLMResponse: """Make a single API call.""" # Use Anthropic adapter if configured if self._anthropic: data = self._anthropic.chat_completion(model, messages, max_tokens, temperature, json_mode) else: # Original OpenAI logic # Copy messages to avoid mutating the caller's list (important for # retries and model-fallback — each attempt must start from the # original, un-modified messages). 
msgs = [dict(m) for m in messages] # MiniMax API requires temperature in [0, 1.0] _temp = temperature if "api.minimax.io" in self.config.base_url: _temp = max(0.0, min(_temp, 1.0)) body: dict[str, Any] = { "model": model, "messages": msgs, "temperature": _temp, } # Use correct token parameter based on model if any(model.startswith(prefix) for prefix in _NEW_PARAM_MODELS): reasoning_min = 32768 body["max_completion_tokens"] = max(max_tokens, reasoning_min) else: body["max_tokens"] = max_tokens if json_mode: # Many OpenAI-compatible proxies serving Claude models don't # support the response_format parameter and return HTTP 400. # Fall back to a system-prompt injection for non-OpenAI models. if model.startswith("claude"): _json_hint = ( "You MUST respond with valid JSON only. " "Do not include any text outside the JSON object." ) # Prepend to existing system message or add as new one if msgs and msgs[0]["role"] == "system": msgs[0]["content"] = ( _json_hint + "\n\n" + msgs[0]["content"] ) else: msgs.insert( 0, {"role": "system", "content": _json_hint} ) else: body["response_format"] = {"type": "json_object"} payload = json.dumps(body).encode("utf-8") url = f"{self.config.base_url.rstrip('/')}/chat/completions" headers = { "Authorization": f"Bearer {self.config.api_key}", "Content-Type": "application/json", "User-Agent": self.config.user_agent, } # MetaClaw bridge: inject extra headers (session ID, stage info, etc.) 
headers.update(self.config.extra_headers) req = urllib.request.Request(url, data=payload, headers=headers) try: with urllib.request.urlopen(req, timeout=self.config.timeout_sec) as resp: data = json.loads(resp.read()) except (urllib.error.URLError, OSError) as exc: # MetaClaw bridge: fallback to direct LLM if proxy unreachable if self.config.fallback_url: logger.warning( "Primary endpoint unreachable, falling back to %s: %s", self.config.fallback_url, exc, ) fallback_url = ( f"{self.config.fallback_url.rstrip('/')}/chat/completions" ) fallback_key = self.config.fallback_api_key or self.config.api_key fallback_headers = { "Authorization": f"Bearer {fallback_key}", "Content-Type": "application/json", "User-Agent": self.config.user_agent, } fallback_req = urllib.request.Request( fallback_url, data=payload, headers=fallback_headers ) with urllib.request.urlopen( fallback_req, timeout=self.config.timeout_sec ) as resp: data = json.loads(resp.read()) else: raise # Handle API error responses if "error" in data: error_info = data["error"] error_msg = error_info.get("message", str(error_info)) error_type = error_info.get("type", "api_error") import io raise urllib.error.HTTPError( "", 500, f"{error_type}: {error_msg}", {}, io.BytesIO(error_msg.encode()), ) # Validate response structure if "choices" not in data or not data["choices"]: raise ValueError(f"Malformed API response: missing choices. 
Got: {data}") choice = data["choices"][0] usage = data.get("usage", {}) message = choice.get("message", {}) content = message.get("content") or "" return LLMResponse( content=content, model=data.get("model", model), prompt_tokens=usage.get("prompt_tokens", 0), completion_tokens=usage.get("completion_tokens", 0), total_tokens=usage.get("total_tokens", 0), finish_reason=choice.get("finish_reason", ""), truncated=(choice.get("finish_reason", "") == "length"), raw=data, ) def create_client_from_yaml(yaml_path: str | None = None) -> LLMClient: """Create an LLMClient from the ARC config file. Reads base_url and api_key from config.arc.yaml's llm section. """ import yaml as _yaml if yaml_path is None: yaml_path = "config.yaml" with open(yaml_path, encoding="utf-8") as f: raw = _yaml.safe_load(f) llm_section = raw.get("llm", {}) api_key = str( os.environ.get( llm_section.get("api_key_env", "OPENAI_API_KEY"), llm_section.get("api_key", ""), ) or "" ) return LLMClient( LLMConfig( base_url=llm_section.get("base_url", "https://api.openai.com/v1"), api_key=api_key, primary_model=llm_section.get("primary_model", "gpt-4o"), fallback_models=llm_section.get( "fallback_models", ["gpt-4.1", "gpt-4o-mini"] ), ) ) ================================================ FILE: researchclaw/mcp/__init__.py ================================================ """MCP (Model Context Protocol) standardized integration for AutoResearchClaw.""" from researchclaw.mcp.server import ResearchClawMCPServer from researchclaw.mcp.client import MCPClient from researchclaw.mcp.registry import MCPServerRegistry __all__ = ["ResearchClawMCPServer", "MCPClient", "MCPServerRegistry"] ================================================ FILE: researchclaw/mcp/client.py ================================================ """MCP Client: connect to external MCP servers for enhanced capabilities.""" from __future__ import annotations import json import logging from typing import Any logger = logging.getLogger(__name__) class 
MCPClient: """Connect to an external MCP server and invoke its tools. Supports stdio and SSE transports. The actual protocol I/O is abstracted so we can add more transports later. """ def __init__(self, server_uri: str, transport: str = "stdio") -> None: self.uri = server_uri self.transport = transport self._connected = False self._tools_cache: list[dict[str, Any]] | None = None # ── connection ──────────────────────────────────────────────── async def connect(self) -> None: """Establish connection to the MCP server.""" logger.info("Connecting to MCP server: %s (transport=%s)", self.uri, self.transport) self._connected = True async def disconnect(self) -> None: """Close the connection.""" self._connected = False self._tools_cache = None @property def is_connected(self) -> bool: return self._connected # ── tool discovery ──────────────────────────────────────────── async def list_tools(self) -> list[dict[str, Any]]: """List tools available on the remote MCP server.""" if not self._connected: raise ConnectionError("Not connected to MCP server") if self._tools_cache is not None: return self._tools_cache response = await self._send_request("tools/list", {}) tools = response.get("tools", []) self._tools_cache = tools return tools async def call_tool(self, name: str, arguments: dict[str, Any]) -> dict[str, Any]: """Call a tool on the remote MCP server.""" if not self._connected: raise ConnectionError("Not connected to MCP server") return await self._send_request("tools/call", {"name": name, "arguments": arguments}) # ── resource access ─────────────────────────────────────────── async def list_resources(self) -> list[dict[str, Any]]: """List resources available on the remote MCP server.""" if not self._connected: raise ConnectionError("Not connected to MCP server") response = await self._send_request("resources/list", {}) return response.get("resources", []) async def read_resource(self, uri: str) -> str: """Read a resource from the remote MCP server.""" if not 
self._connected: raise ConnectionError("Not connected to MCP server") response = await self._send_request("resources/read", {"uri": uri}) contents = response.get("contents", []) if contents: return contents[0].get("text", "") return "" # ── transport layer ─────────────────────────────────────────── async def _send_request(self, method: str, params: dict[str, Any]) -> dict[str, Any]: """Send a JSON-RPC request to the MCP server. This is a stub — real implementation delegates to transport.py. """ message = { "jsonrpc": "2.0", "id": 1, "method": method, "params": params, } logger.debug("MCP request: %s", json.dumps(message, default=str)[:200]) # Stub: return empty result return {"result": {}} ================================================ FILE: researchclaw/mcp/registry.py ================================================ """Registry of connected MCP servers.""" from __future__ import annotations import logging from typing import Any from researchclaw.mcp.client import MCPClient logger = logging.getLogger(__name__) class MCPServerRegistry: """Track connected external MCP servers.""" def __init__(self) -> None: self._servers: dict[str, MCPClient] = {} async def register(self, name: str, uri: str, transport: str = "stdio") -> MCPClient: """Register and connect to an external MCP server.""" client = MCPClient(uri, transport=transport) await client.connect() self._servers[name] = client logger.info("Registered MCP server: %s -> %s", name, uri) return client async def unregister(self, name: str) -> None: """Disconnect and remove an MCP server.""" client = self._servers.pop(name, None) if client: await client.disconnect() def get(self, name: str) -> MCPClient | None: """Get a connected MCP client by name.""" return self._servers.get(name) def list_all(self) -> list[dict[str, Any]]: """List all registered MCP servers.""" return [ {"name": name, "uri": client.uri, "connected": client.is_connected} for name, client in self._servers.items() ] async def close_all(self) -> 
None: """Disconnect from all servers.""" for name in list(self._servers): await self.unregister(name) @property def count(self) -> int: return len(self._servers) ================================================ FILE: researchclaw/mcp/server.py ================================================ """ResearchClaw MCP Server: expose pipeline capabilities to external agents.""" from __future__ import annotations import json import logging from pathlib import Path from typing import Any from researchclaw.mcp.tools import TOOL_DEFINITIONS, list_tool_names logger = logging.getLogger(__name__) class ResearchClawMCPServer: """MCP Server that exposes AutoResearchClaw capabilities as tools. External agents (e.g., Claude, OpenClaw) can connect to this server and invoke pipeline operations via the MCP protocol. """ def __init__(self, config: Any = None) -> None: self.config = config self._handlers: dict[str, Any] = {} self._running = False def get_tools(self) -> list[dict[str, Any]]: """Return the list of available MCP tools.""" return TOOL_DEFINITIONS async def handle_tool_call(self, name: str, arguments: dict[str, Any]) -> dict[str, Any]: """Handle an incoming MCP tool call.""" if name not in list_tool_names(): return {"error": f"Unknown tool: {name}", "success": False} logger.info("MCP tool call: %s(%s)", name, json.dumps(arguments, default=str)[:200]) try: if name == "run_pipeline": return await self._handle_run_pipeline(arguments) elif name == "get_pipeline_status": return await self._handle_get_status(arguments) elif name == "get_experiment_results": return await self._handle_get_results(arguments) elif name == "search_literature": return await self._handle_search_literature(arguments) elif name == "review_paper": return await self._handle_review_paper(arguments) elif name == "get_paper": return await self._handle_get_paper(arguments) else: return {"error": f"Handler not implemented: {name}", "success": False} except Exception as exc: logger.error("MCP tool call %s failed: 
%s", name, exc) return {"error": str(exc), "success": False} async def _handle_run_pipeline(self, args: dict[str, Any]) -> dict[str, Any]: """Start a pipeline run.""" topic = args["topic"] # In production, this would invoke the full pipeline asynchronously return { "success": True, "message": f"Pipeline started for topic: {topic}", "run_id": f"mcp-stub-{topic[:20]}", } async def _handle_get_status(self, args: dict[str, Any]) -> dict[str, Any]: """Get pipeline status.""" run_id = args["run_id"] run_dir = Path(f"artifacts/{run_id}") if not run_dir.exists(): return {"success": False, "error": f"Run not found: {run_id}"} # Read checkpoint if available checkpoint_file = run_dir / "checkpoint.json" if checkpoint_file.exists(): data = json.loads(checkpoint_file.read_text(encoding="utf-8")) return {"success": True, "run_id": run_id, "checkpoint": data} return {"success": True, "run_id": run_id, "status": "no_checkpoint"} async def _handle_get_results(self, args: dict[str, Any]) -> dict[str, Any]: """Get experiment results.""" run_id = args["run_id"] run_dir = Path(f"artifacts/{run_id}") results_file = run_dir / "experiment_results.json" if results_file.exists(): data = json.loads(results_file.read_text(encoding="utf-8")) return {"success": True, "results": data} return {"success": False, "error": "No results found"} async def _handle_search_literature(self, args: dict[str, Any]) -> dict[str, Any]: """Search literature (stub — real implementation would use literature/ module).""" return { "success": True, "query": args["query"], "results": [], "message": "Literature search stub", } async def _handle_review_paper(self, args: dict[str, Any]) -> dict[str, Any]: """Review paper (stub).""" return { "success": True, "paper_path": args["paper_path"], "review": "Stub review — not yet implemented", } async def _handle_get_paper(self, args: dict[str, Any]) -> dict[str, Any]: """Get generated paper.""" run_id = args["run_id"] fmt = args.get("format", "markdown") run_dir = 
Path(f"artifacts/{run_id}") if fmt == "latex": paper_file = run_dir / "paper.tex" else: paper_file = run_dir / "paper_draft.md" if paper_file.exists(): return {"success": True, "content": paper_file.read_text(encoding="utf-8")} return {"success": False, "error": f"Paper not found in {run_dir}"} # ── server lifecycle ────────────────────────────────────────── async def start(self, transport: str = "stdio") -> None: """Start the MCP server (stdio or SSE transport).""" self._running = True logger.info("MCP server started (transport=%s)", transport) async def stop(self) -> None: """Stop the MCP server.""" self._running = False logger.info("MCP server stopped") @property def is_running(self) -> bool: return self._running ================================================ FILE: researchclaw/mcp/tools.py ================================================ """MCP tool definitions for ResearchClaw capabilities.""" from __future__ import annotations from typing import Any # Tool schemas exposed by the ResearchClaw MCP Server TOOL_DEFINITIONS: list[dict[str, Any]] = [ { "name": "run_pipeline", "description": "Start an autonomous research pipeline run on a given topic.", "inputSchema": { "type": "object", "properties": { "topic": {"type": "string", "description": "The research topic"}, "config_path": {"type": "string", "description": "Path to config YAML (optional)"}, "auto_approve": {"type": "boolean", "description": "Auto-approve gate stages"}, }, "required": ["topic"], }, }, { "name": "get_pipeline_status", "description": "Get the current status of a pipeline run.", "inputSchema": { "type": "object", "properties": { "run_id": {"type": "string", "description": "The pipeline run ID"}, }, "required": ["run_id"], }, }, { "name": "get_experiment_results", "description": "Get experiment results from a completed or running pipeline.", "inputSchema": { "type": "object", "properties": { "run_id": {"type": "string", "description": "The pipeline run ID"}, "stage": {"type": "string", 
"description": "Specific stage name (optional)"}, }, "required": ["run_id"], }, }, { "name": "search_literature", "description": "Search academic papers on a topic using Semantic Scholar and arXiv.", "inputSchema": { "type": "object", "properties": { "query": {"type": "string", "description": "Search query"}, "max_results": {"type": "integer", "description": "Max results (default 10)"}, }, "required": ["query"], }, }, { "name": "review_paper", "description": "Run AI peer review on a paper draft.", "inputSchema": { "type": "object", "properties": { "paper_path": {"type": "string", "description": "Path to paper markdown or LaTeX file"}, "run_id": {"type": "string", "description": "Associated run ID (optional)"}, }, "required": ["paper_path"], }, }, { "name": "get_paper", "description": "Get the generated paper from a pipeline run.", "inputSchema": { "type": "object", "properties": { "run_id": {"type": "string", "description": "The pipeline run ID"}, "format": {"type": "string", "enum": ["markdown", "latex"], "description": "Output format"}, }, "required": ["run_id"], }, }, ] def get_tool_schema(name: str) -> dict[str, Any] | None: """Get the schema for a specific tool by name.""" for tool in TOOL_DEFINITIONS: if tool["name"] == name: return tool return None def list_tool_names() -> list[str]: """Return all available tool names.""" return [t["name"] for t in TOOL_DEFINITIONS] ================================================ FILE: researchclaw/mcp/transport.py ================================================ """MCP transport layer: stdio and SSE implementations.""" from __future__ import annotations import asyncio import json import logging import sys from typing import Any, Protocol logger = logging.getLogger(__name__) class MCPTransport(Protocol): """Protocol for MCP message transport.""" async def send(self, message: dict[str, Any]) -> None: ... async def receive(self) -> dict[str, Any]: ... async def close(self) -> None: ... 
class StdioTransport: """MCP transport over stdin/stdout (for CLI integration).""" def __init__(self) -> None: self._reader: asyncio.StreamReader | None = None self._writer: asyncio.StreamWriter | None = None async def start(self) -> None: """Initialize stdin/stdout streams for async I/O.""" loop = asyncio.get_event_loop() self._reader = asyncio.StreamReader() protocol = asyncio.StreamReaderProtocol(self._reader) await loop.connect_read_pipe(lambda: protocol, sys.stdin) w_transport, w_protocol = await loop.connect_write_pipe( asyncio.streams.FlowControlMixin, sys.stdout ) self._writer = asyncio.StreamWriter(w_transport, w_protocol, self._reader, loop) async def send(self, message: dict[str, Any]) -> None: """Write a JSON-RPC message to stdout.""" if self._writer is None: raise RuntimeError("Transport not started") data = json.dumps(message, ensure_ascii=False) header = f"Content-Length: {len(data.encode())}\r\n\r\n" self._writer.write(header.encode() + data.encode()) await self._writer.drain() async def receive(self) -> dict[str, Any]: """Read a JSON-RPC message from stdin.""" if self._reader is None: raise RuntimeError("Transport not started") # Read headers content_length = 0 while True: line = await self._reader.readline() decoded = line.decode().strip() if not decoded: break if decoded.lower().startswith("content-length:"): content_length = int(decoded.split(":")[1].strip()) if content_length == 0: raise EOFError("No content-length header received") body = await self._reader.readexactly(content_length) return json.loads(body) async def close(self) -> None: """Close the transport.""" if self._writer: self._writer.close() class SSETransport: """MCP transport over Server-Sent Events (for web integration). This is a stub — a full implementation would use aiohttp or similar. 
""" def __init__(self, host: str = "0.0.0.0", port: int = 3000) -> None: self.host = host self.port = port self._running = False async def start(self) -> None: """Start the SSE server.""" self._running = True logger.info("SSE transport started on %s:%d", self.host, self.port) async def send(self, message: dict[str, Any]) -> None: """Send an SSE event (stub).""" logger.debug("SSE send: %s", json.dumps(message, default=str)[:200]) async def receive(self) -> dict[str, Any]: """Receive from SSE (stub).""" raise NotImplementedError("SSE receive not yet implemented") async def close(self) -> None: """Stop the SSE server.""" self._running = False ================================================ FILE: researchclaw/memory/__init__.py ================================================ """Persistent evolutionary memory system for AutoResearchClaw. Provides three categories of memory: - **Ideation**: Research topics, hypotheses, and their outcomes. - **Experiment**: Hyperparameters, architectures, and training tricks. - **Writing**: Review feedback, paper structure patterns. Each category supports semantic retrieval via embeddings, time-decay weighting, and confidence scoring. """ from researchclaw.memory.store import MemoryEntry, MemoryStore from researchclaw.memory.retriever import MemoryRetriever from researchclaw.memory.decay import time_decay_weight, confidence_update __all__ = [ "MemoryEntry", "MemoryStore", "MemoryRetriever", "time_decay_weight", "confidence_update", ] ================================================ FILE: researchclaw/memory/decay.py ================================================ """Time-decay and confidence scoring for memory entries.""" from __future__ import annotations import math from datetime import datetime, timezone def time_decay_weight( created_at: datetime, half_life_days: float = 90.0, max_age_days: float = 365.0, *, now: datetime | None = None, ) -> float: """Compute exponential decay weight based on entry age. 
Args: created_at: When the memory was created. half_life_days: Half-life in days (weight = 0.5 after this many days). max_age_days: Entries older than this get weight 0.0. now: Current time (defaults to UTC now). Returns: Weight in [0.0, 1.0]. """ if now is None: now = datetime.now(timezone.utc) if created_at.tzinfo is None: created_at = created_at.replace(tzinfo=timezone.utc) if now.tzinfo is None: now = now.replace(tzinfo=timezone.utc) age_seconds = (now - created_at).total_seconds() age_days = age_seconds / 86400.0 if age_days < 0: return 1.0 if age_days > max_age_days: return 0.0 return math.exp(-age_days * math.log(2) / half_life_days) def confidence_update( current: float, delta: float, floor: float = 0.0, ceiling: float = 1.0, ) -> float: """Update confidence score with clamping. Args: current: Current confidence value. delta: Change amount (positive for success, negative for failure). floor: Minimum allowed value. ceiling: Maximum allowed value. Returns: Updated confidence clamped to [floor, ceiling]. """ return max(floor, min(ceiling, current + delta)) ================================================ FILE: researchclaw/memory/embeddings.py ================================================ """Vector embedding management for memory retrieval. Supports three backends (auto-fallback): 1. OpenAI-compatible API embeddings (text-embedding-3-small) 2. sentence-transformers local model 3. 
TF-IDF bag-of-words (zero-dependency fallback) """ from __future__ import annotations import hashlib import json import logging import math import re from pathlib import Path from typing import Any logger = logging.getLogger(__name__) # Dimension for TF-IDF fallback _TFIDF_DIM = 256 def _tokenize(text: str) -> list[str]: """Simple whitespace + punctuation tokenizer.""" return re.findall(r"[a-z0-9]+", text.lower()) def _hash_token(token: str, dim: int = _TFIDF_DIM) -> int: """Hash a token to a dimension index.""" h = hashlib.md5(token.encode(), usedforsecurity=False).hexdigest() return int(h, 16) % dim class EmbeddingProvider: """Manages embedding generation with automatic fallback.""" def __init__( self, model: str = "text-embedding-3-small", api_base_url: str = "", api_key: str = "", cache_dir: Path | None = None, ) -> None: self._model = model self._api_base_url = api_base_url self._api_key = api_key self._cache_dir = cache_dir self._backend: str | None = None self._dim: int = _TFIDF_DIM self._local_model: Any = None @property def backend(self) -> str: """Return the active backend name.""" if self._backend is None: self._detect_backend() return self._backend # type: ignore[return-value] @property def dimension(self) -> int: """Return embedding dimensionality.""" if self._backend is None: self._detect_backend() return self._dim def _detect_backend(self) -> None: """Auto-detect the best available embedding backend.""" # 1. Try OpenAI API if self._api_base_url and self._api_key: self._backend = "api" self._dim = 1536 # text-embedding-3-small default logger.info("Embedding backend: OpenAI API (%s)", self._model) return # 2. Try sentence-transformers try: from sentence_transformers import SentenceTransformer # type: ignore[import-untyped] self._local_model = SentenceTransformer("all-MiniLM-L6-v2") self._backend = "sentence_transformers" self._dim = 384 logger.info("Embedding backend: sentence-transformers (all-MiniLM-L6-v2)") return except ImportError: pass # 3. 
Fallback to TF-IDF self._backend = "tfidf" self._dim = _TFIDF_DIM logger.info("Embedding backend: TF-IDF fallback (dim=%d)", self._dim) def embed(self, text: str) -> list[float]: """Generate embedding vector for a text string. Args: text: Input text to embed. Returns: List of floats representing the embedding vector. """ if self._backend is None: self._detect_backend() if self._backend == "api": return self._embed_api(text) elif self._backend == "sentence_transformers": return self._embed_local(text) else: return self._embed_tfidf(text) def embed_batch(self, texts: list[str]) -> list[list[float]]: """Generate embeddings for multiple texts. Args: texts: List of input texts. Returns: List of embedding vectors. """ if self._backend is None: self._detect_backend() if self._backend == "sentence_transformers" and self._local_model is not None: embeddings = self._local_model.encode(texts) return [e.tolist() for e in embeddings] return [self.embed(t) for t in texts] def _embed_api(self, text: str) -> list[float]: """Get embedding from OpenAI-compatible API.""" import urllib.request url = f"{self._api_base_url.rstrip('/')}/embeddings" payload = json.dumps({ "input": text[:8000], "model": self._model, }).encode() req = urllib.request.Request( url, data=payload, headers={ "Content-Type": "application/json", "Authorization": f"Bearer {self._api_key}", }, ) try: with urllib.request.urlopen(req, timeout=30) as resp: data = json.loads(resp.read()) return data["data"][0]["embedding"] except Exception as exc: logger.warning("API embedding failed, falling back to TF-IDF: %s", exc) return self._embed_tfidf(text) def _embed_local(self, text: str) -> list[float]: """Get embedding from local sentence-transformers model.""" if self._local_model is None: return self._embed_tfidf(text) embedding = self._local_model.encode(text) return embedding.tolist() def _embed_tfidf(self, text: str) -> list[float]: """Generate TF-IDF-style bag-of-words embedding (zero-dependency fallback).""" tokens = 
_tokenize(text) if not tokens: return [0.0] * self._dim vec = [0.0] * _TFIDF_DIM for token in tokens: idx = _hash_token(token, _TFIDF_DIM) vec[idx] += 1.0 # L2 normalize norm = math.sqrt(sum(v * v for v in vec)) if norm > 0: vec = [v / norm for v in vec] return vec ================================================ FILE: researchclaw/memory/experiment_memory.py ================================================ """Experiment memory — records and retrieves experiment experiences.""" from __future__ import annotations import json import logging from typing import Any from researchclaw.memory.retriever import MemoryRetriever from researchclaw.memory.store import MemoryStore logger = logging.getLogger(__name__) CATEGORY = "experiment" class ExperimentMemory: """Records and retrieves experiment experiences. Tracks hyperparameter configurations, model architectures, and training tricks that worked (or failed) in past runs. """ def __init__( self, store: MemoryStore, retriever: MemoryRetriever, embed_fn: Any = None, ) -> None: self._store = store self._retriever = retriever self._embed_fn = embed_fn def record_hyperparams( self, task_type: str, hyperparams: dict[str, Any], metric: float, metric_name: str = "primary_metric", run_id: str = "", ) -> str: """Record an effective hyperparameter configuration. Args: task_type: Type of task (e.g., "image_classification"). hyperparams: Dict of hyperparameter values. metric: Achieved metric value. metric_name: Name of the metric. run_id: Pipeline run identifier. Returns: The generated memory entry ID. 
""" hp_str = json.dumps(hyperparams, indent=2, default=str) content = ( f"Task: {task_type}\n" f"Hyperparameters:\n{hp_str}\n" f"Result: {metric_name}={metric:.4f}" ) metadata = { "type": "hyperparams", "task_type": task_type, "hyperparams": hyperparams, "metric": metric, "metric_name": metric_name, "run_id": run_id, } # Higher metric → higher confidence (assuming maximize) confidence = min(1.0, 0.4 + metric * 0.5) embedding = self._embed_fn(content) if self._embed_fn else [] return self._store.add( CATEGORY, content, metadata, embedding, confidence ) def record_architecture( self, task_type: str, architecture: str, metric: float, run_id: str = "", ) -> str: """Record a successful model architecture. Args: task_type: Type of task. architecture: Architecture description. metric: Achieved metric value. run_id: Pipeline run identifier. Returns: The generated memory entry ID. """ content = ( f"Task: {task_type}\n" f"Architecture: {architecture}\n" f"Performance: {metric:.4f}" ) metadata = { "type": "architecture", "task_type": task_type, "architecture": architecture, "metric": metric, "run_id": run_id, } confidence = min(1.0, 0.4 + metric * 0.5) embedding = self._embed_fn(content) if self._embed_fn else [] return self._store.add( CATEGORY, content, metadata, embedding, confidence ) def record_training_trick( self, trick: str, improvement: float, context: str, run_id: str = "", ) -> str: """Record an effective training trick. Args: trick: Description of the trick. improvement: Relative improvement (e.g., 0.05 for 5%). context: When/where the trick was applied. run_id: Pipeline run identifier. Returns: The generated memory entry ID. 
""" content = ( f"Trick: {trick}\n" f"Improvement: {improvement:+.1%}\n" f"Context: {context}" ) metadata = { "type": "training_trick", "trick": trick, "improvement": improvement, "context": context, "run_id": run_id, } confidence = 0.6 if improvement > 0 else 0.3 embedding = self._embed_fn(content) if self._embed_fn else [] return self._store.add( CATEGORY, content, metadata, embedding, confidence ) def recall_best_configs( self, task_type: str, top_k: int = 3, ) -> str: """Retrieve best configurations for a task type. Args: task_type: Description of the current task. top_k: Number of results. Returns: Formatted string of best configurations. """ query = f"best hyperparameters and architecture for {task_type}" results = self._retriever.recall_by_text( query, category=CATEGORY, top_k=top_k, embed_fn=self._embed_fn ) if not results: return "" parts = ["### Best Experiment Configurations (from memory)"] for i, (entry, score) in enumerate(results, 1): metric = entry.metadata.get("metric", "?") parts.append( f"{i}. {entry.content.splitlines()[0]} " f"(metric: {metric}, relevance: {score:.2f})" ) # Include hyperparams detail if available hp = entry.metadata.get("hyperparams") if hp: parts.append(f" Config: {json.dumps(hp, default=str)}") return "\n".join(parts) ================================================ FILE: researchclaw/memory/ideation_memory.py ================================================ """Ideation memory — records and retrieves research direction experiences.""" from __future__ import annotations import logging from typing import Any from researchclaw.memory.retriever import MemoryRetriever from researchclaw.memory.store import MemoryStore logger = logging.getLogger(__name__) CATEGORY = "ideation" class IdeationMemory: """Records and retrieves research direction experiences. Tracks which topics succeeded or failed, which hypotheses were feasible, and builds up anti-patterns to avoid in the future. 
""" def __init__( self, store: MemoryStore, retriever: MemoryRetriever, embed_fn: Any = None, ) -> None: self._store = store self._retriever = retriever self._embed_fn = embed_fn def record_topic_outcome( self, topic: str, outcome: str, quality_score: float, run_id: str = "", ) -> str: """Record the outcome of a research topic. Args: topic: The research topic description. outcome: One of "success", "failure", "abandoned". quality_score: Quality score (0-10). run_id: Pipeline run identifier. Returns: The generated memory entry ID. """ content = ( f"Topic: {topic}\n" f"Outcome: {outcome}\n" f"Quality: {quality_score:.1f}/10" ) metadata = { "type": "topic_outcome", "outcome": outcome, "quality_score": quality_score, "run_id": run_id, } # Higher quality → higher confidence confidence = min(1.0, 0.3 + quality_score / 15.0) if outcome == "failure": confidence = max(0.5, confidence) # failures are valuable too embedding = self._embed_fn(content) if self._embed_fn else [] return self._store.add( CATEGORY, content, metadata, embedding, confidence ) def record_hypothesis( self, hypothesis: str, feasible: bool, reason: str, run_id: str = "", ) -> str: """Record a hypothesis feasibility assessment. Args: hypothesis: The hypothesis text. feasible: Whether it was feasible. reason: Reason for the assessment. run_id: Pipeline run identifier. Returns: The generated memory entry ID. """ outcome = "feasible" if feasible else "infeasible" content = ( f"Hypothesis: {hypothesis}\n" f"Assessment: {outcome}\n" f"Reason: {reason}" ) metadata = { "type": "hypothesis", "feasible": feasible, "run_id": run_id, } confidence = 0.6 if feasible else 0.7 # infeasible is more informative embedding = self._embed_fn(content) if self._embed_fn else [] return self._store.add( CATEGORY, content, metadata, embedding, confidence ) def recall_similar_topics( self, query: str, top_k: int = 5, ) -> str: """Retrieve similar historical research directions with outcomes. 
Args: query: Current research topic or query. top_k: Number of results to return. Returns: Formatted string of similar past topics and their outcomes. """ results = self._retriever.recall_by_text( query, category=CATEGORY, top_k=top_k, embed_fn=self._embed_fn ) if not results: return "" parts = ["### Past Research Directions (from memory)"] for i, (entry, score) in enumerate(results, 1): outcome = entry.metadata.get("outcome", "unknown") quality = entry.metadata.get("quality_score", "?") icon = {"success": "+", "failure": "-", "abandoned": "~"}.get( outcome, "?" ) parts.append( f"{i}. [{icon}] {entry.content.splitlines()[0]} " f"(score: {quality}, relevance: {score:.2f})" ) return "\n".join(parts) def get_anti_patterns(self) -> list[str]: """Get known failure patterns to avoid. Returns: List of topic descriptions that previously failed. """ entries = self._store.get_all(CATEGORY) failures: list[str] = [] for entry in entries: if entry.metadata.get("outcome") == "failure": topic_line = entry.content.splitlines()[0] reason = entry.metadata.get("reason", "") msg = topic_line if reason: msg += f" — {reason}" failures.append(msg) return failures ================================================ FILE: researchclaw/memory/retriever.py ================================================ """Similarity-based memory retrieval engine. Combines cosine similarity with time-decay and confidence weighting to return the most relevant memory entries for a given query. """ from __future__ import annotations import logging import math from datetime import datetime, timezone from typing import Any from researchclaw.memory.decay import time_decay_weight from researchclaw.memory.store import MemoryEntry, MemoryStore logger = logging.getLogger(__name__) def cosine_similarity(a: list[float], b: list[float]) -> float: """Compute cosine similarity between two vectors. Args: a: First vector. b: Second vector. Returns: Cosine similarity in [-1, 1], or 0.0 if either vector is zero. 
""" if len(a) != len(b) or not a: return 0.0 dot = sum(x * y for x, y in zip(a, b)) norm_a = math.sqrt(sum(x * x for x in a)) norm_b = math.sqrt(sum(x * x for x in b)) if norm_a == 0.0 or norm_b == 0.0: return 0.0 return dot / (norm_a * norm_b) class MemoryRetriever: """Retrieves relevant memories using semantic similarity. Scoring formula: score = sim_weight * cosine_sim + decay_weight * time_decay + conf_weight * confidence + access_weight * normalized_access_count """ def __init__( self, store: MemoryStore, half_life_days: float = 90.0, sim_weight: float = 0.5, decay_weight: float = 0.2, conf_weight: float = 0.2, access_weight: float = 0.1, ) -> None: self._store = store self._half_life_days = half_life_days self._sim_weight = sim_weight self._decay_weight = decay_weight self._conf_weight = conf_weight self._access_weight = access_weight def recall( self, query_embedding: list[float], category: str | None = None, top_k: int = 5, min_score: float = 0.0, ) -> list[tuple[MemoryEntry, float]]: """Retrieve most relevant memories for a query embedding. Args: query_embedding: Query vector. category: Filter by category (None for all). top_k: Maximum number of results. min_score: Minimum composite score threshold. Returns: List of (entry, score) tuples sorted by relevance. 
""" entries = self._store.get_all(category) if not entries: return [] # Find max access count for normalization max_access = max((e.access_count for e in entries), default=1) if max_access == 0: max_access = 1 scored: list[tuple[MemoryEntry, float]] = [] now = datetime.now(timezone.utc) for entry in entries: # Cosine similarity sim = cosine_similarity(query_embedding, entry.embedding) # Time decay try: created = datetime.fromisoformat(entry.created_at) except (ValueError, TypeError): created = now decay = time_decay_weight( created, half_life_days=self._half_life_days, now=now ) # Normalized access frequency norm_access = entry.access_count / max_access # Composite score score = ( self._sim_weight * sim + self._decay_weight * decay + self._conf_weight * entry.confidence + self._access_weight * norm_access ) if score >= min_score: scored.append((entry, score)) scored.sort(key=lambda x: x[1], reverse=True) # Mark top results as accessed for entry, _ in scored[:top_k]: self._store.mark_accessed(entry.id) return scored[:top_k] def recall_by_text( self, query: str, category: str | None = None, top_k: int = 5, embed_fn: Any = None, ) -> list[tuple[MemoryEntry, float]]: """Retrieve memories using text query (requires embed function). Args: query: Text query string. category: Filter by category. top_k: Maximum results. embed_fn: Callable that converts text to embedding vector. Returns: List of (entry, score) tuples. """ if embed_fn is None: logger.warning("No embedding function provided for text recall") return [] query_embedding = embed_fn(query) return self.recall(query_embedding, category=category, top_k=top_k) def format_for_prompt( self, results: list[tuple[MemoryEntry, float]], max_chars: int = 3000, ) -> str: """Format retrieval results as prompt injection text. Args: results: List of (entry, score) tuples from recall(). max_chars: Maximum character count for output. Returns: Formatted string suitable for LLM prompt injection. 
""" if not results: return "" parts: list[str] = [] total_len = 0 for i, (entry, score) in enumerate(results, 1): line = f"{i}. [{entry.category}] (relevance: {score:.2f}) {entry.content}" if total_len + len(line) > max_chars: break parts.append(line) total_len += len(line) return "\n".join(parts) ================================================ FILE: researchclaw/memory/store.py ================================================ """Unified memory storage engine. Manages three categories of memory (ideation, experiment, writing) with JSONL persistence, vector embeddings for semantic retrieval, time-decay weighting, and confidence scoring. """ from __future__ import annotations import json import logging import uuid from dataclasses import asdict, dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import Any logger = logging.getLogger(__name__) @dataclass class MemoryEntry: """A single memory record.""" id: str category: str # "ideation" | "experiment" | "writing" content: str metadata: dict[str, Any] embedding: list[float] confidence: float created_at: str # ISO 8601 last_accessed: str # ISO 8601 access_count: int def to_dict(self) -> dict[str, Any]: """Serialize to dictionary.""" return asdict(self) @classmethod def from_dict(cls, data: dict[str, Any]) -> MemoryEntry: """Deserialize from dictionary.""" return cls( id=str(data.get("id", "")), category=str(data.get("category", "")), content=str(data.get("content", "")), metadata=data.get("metadata") or {}, embedding=data.get("embedding") or [], confidence=float(data.get("confidence", 0.5)), created_at=str(data.get("created_at", "")), last_accessed=str(data.get("last_accessed", "")), access_count=int(data.get("access_count", 0)), ) VALID_CATEGORIES = ("ideation", "experiment", "writing") class MemoryStore: """JSONL-backed persistent memory storage. Stores MemoryEntry records organized by category, supporting add/recall/update/prune/save/load operations. 
""" def __init__( self, store_dir: str | Path, max_entries_per_category: int = 500, confidence_threshold: float = 0.3, ) -> None: self._store_dir = Path(store_dir) self._max_per_category = max_entries_per_category self._confidence_threshold = confidence_threshold self._entries: dict[str, list[MemoryEntry]] = { cat: [] for cat in VALID_CATEGORIES } self._dirty = False @property def store_dir(self) -> Path: """Return the storage directory path.""" return self._store_dir def add( self, category: str, content: str, metadata: dict[str, Any] | None = None, embedding: list[float] | None = None, confidence: float = 0.5, ) -> str: """Add a new memory entry. Args: category: One of "ideation", "experiment", "writing". content: The memory content text. metadata: Optional metadata dict (run_id, stage, topic, etc.). embedding: Pre-computed embedding vector (or empty). confidence: Initial confidence score (0-1). Returns: The generated entry ID. Raises: ValueError: If category is invalid. """ if category not in VALID_CATEGORIES: raise ValueError( f"Invalid category '{category}'. 
Must be one of {VALID_CATEGORIES}" ) now = datetime.now(timezone.utc).isoformat(timespec="seconds") entry_id = uuid.uuid4().hex[:12] entry = MemoryEntry( id=entry_id, category=category, content=content, metadata=metadata or {}, embedding=embedding or [], confidence=confidence, created_at=now, last_accessed=now, access_count=0, ) self._entries[category].append(entry) self._dirty = True # Enforce capacity limit (remove lowest confidence) entries = self._entries[category] if len(entries) > self._max_per_category: entries.sort(key=lambda e: e.confidence, reverse=True) self._entries[category] = entries[: self._max_per_category] return entry_id def get(self, entry_id: str) -> MemoryEntry | None: """Retrieve a single entry by ID.""" for entries in self._entries.values(): for entry in entries: if entry.id == entry_id: return entry return None def get_all(self, category: str | None = None) -> list[MemoryEntry]: """Return all entries, optionally filtered by category.""" if category: return list(self._entries.get(category, [])) result: list[MemoryEntry] = [] for entries in self._entries.values(): result.extend(entries) return result def update_confidence(self, entry_id: str, delta: float) -> bool: """Update the confidence score of an entry. Args: entry_id: The entry to update. delta: Change amount (+0.1 for success, -0.2 for failure). Returns: True if entry was found and updated, False otherwise. 
""" for entries in self._entries.values(): for i, entry in enumerate(entries): if entry.id == entry_id: new_conf = max(0.0, min(1.0, entry.confidence + delta)) # Replace with updated entry (frozen-like pattern) entries[i] = MemoryEntry( id=entry.id, category=entry.category, content=entry.content, metadata=entry.metadata, embedding=entry.embedding, confidence=new_conf, created_at=entry.created_at, last_accessed=entry.last_accessed, access_count=entry.access_count, ) self._dirty = True return True return False def mark_accessed(self, entry_id: str) -> bool: """Update last_accessed timestamp and increment access count.""" now = datetime.now(timezone.utc).isoformat(timespec="seconds") for entries in self._entries.values(): for i, entry in enumerate(entries): if entry.id == entry_id: entries[i] = MemoryEntry( id=entry.id, category=entry.category, content=entry.content, metadata=entry.metadata, embedding=entry.embedding, confidence=entry.confidence, created_at=entry.created_at, last_accessed=now, access_count=entry.access_count + 1, ) self._dirty = True return True return False def prune( self, confidence_threshold: float | None = None, max_age_days: float = 365.0, ) -> int: """Remove expired and low-confidence entries. Args: confidence_threshold: Minimum confidence (default from init). max_age_days: Maximum age in days. Returns: Number of entries removed. 
""" threshold = confidence_threshold if confidence_threshold is not None else self._confidence_threshold now = datetime.now(timezone.utc) removed = 0 for category in VALID_CATEGORIES: before = len(self._entries[category]) kept: list[MemoryEntry] = [] for entry in self._entries[category]: try: created = datetime.fromisoformat(entry.created_at) if created.tzinfo is None: created = created.replace(tzinfo=timezone.utc) age_days = (now - created).total_seconds() / 86400.0 except (ValueError, TypeError): age_days = 0.0 if entry.confidence >= threshold and age_days <= max_age_days: kept.append(entry) self._entries[category] = kept removed += before - len(kept) if removed > 0: self._dirty = True logger.info("Pruned %d memory entries", removed) return removed def save(self) -> None: """Persist all entries to disk in JSONL format.""" self._store_dir.mkdir(parents=True, exist_ok=True) for category in VALID_CATEGORIES: path = self._store_dir / f"{category}.jsonl" with path.open("w", encoding="utf-8") as f: for entry in self._entries[category]: f.write(json.dumps(entry.to_dict(), ensure_ascii=False) + "\n") self._dirty = False total = sum(len(v) for v in self._entries.values()) logger.info("Saved %d memory entries to %s", total, self._store_dir) def load(self) -> int: """Load entries from disk. Returns: Total number of entries loaded. 
""" total = 0 for category in VALID_CATEGORIES: path = self._store_dir / f"{category}.jsonl" if not path.exists(): continue entries: list[MemoryEntry] = [] for line in path.read_text(encoding="utf-8").splitlines(): line = line.strip() if not line: continue try: data = json.loads(line) entries.append(MemoryEntry.from_dict(data)) except (json.JSONDecodeError, TypeError) as exc: logger.warning("Skipping malformed memory entry: %s", exc) continue self._entries[category] = entries total += len(entries) logger.info("Loaded %d memory entries from %s", total, self._store_dir) return total def count(self, category: str | None = None) -> int: """Return total entries, optionally filtered by category.""" if category: return len(self._entries.get(category, [])) return sum(len(v) for v in self._entries.values()) ================================================ FILE: researchclaw/memory/writing_memory.py ================================================ """Writing memory — records and retrieves writing experiences.""" from __future__ import annotations import logging from typing import Any from researchclaw.memory.retriever import MemoryRetriever from researchclaw.memory.store import MemoryStore logger = logging.getLogger(__name__) CATEGORY = "writing" class WritingMemory: """Records and retrieves writing experiences. Tracks review feedback and resolutions, successful paper structure patterns, and writing tips that improved paper quality. """ def __init__( self, store: MemoryStore, retriever: MemoryRetriever, embed_fn: Any = None, ) -> None: self._store = store self._retriever = retriever self._embed_fn = embed_fn def record_review_feedback( self, feedback_type: str, feedback: str, resolution: str, run_id: str = "", ) -> str: """Record review feedback and its resolution. Args: feedback_type: Type of feedback (e.g., "clarity", "novelty", "methodology"). feedback: The reviewer's feedback text. resolution: How the feedback was addressed. run_id: Pipeline run identifier. 
Returns: The generated memory entry ID. """ content = ( f"Feedback type: {feedback_type}\n" f"Issue: {feedback}\n" f"Resolution: {resolution}" ) metadata = { "type": "review_feedback", "feedback_type": feedback_type, "run_id": run_id, } confidence = 0.7 # review feedback is usually reliable embedding = self._embed_fn(content) if self._embed_fn else [] return self._store.add( CATEGORY, content, metadata, embedding, confidence ) def record_successful_structure( self, section: str, structure: str, score: float, run_id: str = "", ) -> str: """Record a high-scoring paper structure pattern. Args: section: Paper section (e.g., "introduction", "method"). structure: Description of the structure pattern. score: Quality score achieved. run_id: Pipeline run identifier. Returns: The generated memory entry ID. """ content = ( f"Section: {section}\n" f"Structure: {structure}\n" f"Quality score: {score:.1f}/10" ) metadata = { "type": "structure_pattern", "section": section, "score": score, "run_id": run_id, } confidence = min(1.0, 0.3 + score / 15.0) embedding = self._embed_fn(content) if self._embed_fn else [] return self._store.add( CATEGORY, content, metadata, embedding, confidence ) def recall_writing_tips( self, section: str, context: str, top_k: int = 5, ) -> str: """Retrieve writing tips relevant to the current task. Args: section: Current paper section being written. context: Additional context (topic, methodology, etc.). top_k: Number of results. Returns: Formatted string of writing tips. """ query = f"writing tips for {section} section: {context}" results = self._retriever.recall_by_text( query, category=CATEGORY, top_k=top_k, embed_fn=self._embed_fn ) if not results: return "" parts = ["### Writing Experience (from memory)"] for i, (entry, score) in enumerate(results, 1): entry_type = entry.metadata.get("type", "tip") parts.append( f"{i}. 
[{entry_type}] {entry.content.splitlines()[0]} " f"(relevance: {score:.2f})" ) # Show resolution for review feedback if entry_type == "review_feedback": lines = entry.content.splitlines() for line in lines: if line.startswith("Resolution:"): parts.append(f" {line}") break return "\n".join(parts) ================================================ FILE: researchclaw/metaclaw_bridge/__init__.py ================================================ """MetaClaw integration bridge for AutoResearchClaw. Provides skill injection, evolution bridging, PRM quality gates, and session lifecycle management via the MetaClaw proxy. """ from researchclaw.metaclaw_bridge.config import MetaClawBridgeConfig __all__ = ["MetaClawBridgeConfig"] ================================================ FILE: researchclaw/metaclaw_bridge/config.py ================================================ """Configuration for the MetaClaw integration bridge.""" from __future__ import annotations from dataclasses import dataclass, field @dataclass(frozen=True) class PRMConfig: """PRM (Process Reward Model) quality gate settings.""" enabled: bool = False api_base: str = "" api_key_env: str = "" api_key: str = "" model: str = "gpt-5.4" votes: int = 3 temperature: float = 0.6 gate_stages: tuple[int, ...] 
= (5, 9, 15, 20) @dataclass(frozen=True) class LessonToSkillConfig: """Settings for converting AutoResearchClaw lessons into MetaClaw skills.""" enabled: bool = True min_severity: str = "error" max_skills_per_run: int = 3 @dataclass(frozen=True) class MetaClawBridgeConfig: """Top-level MetaClaw bridge configuration.""" enabled: bool = False proxy_url: str = "http://localhost:30000" skills_dir: str = "~/.metaclaw/skills" fallback_url: str = "" # Direct LLM URL if MetaClaw proxy is down fallback_api_key: str = "" prm: PRMConfig = field(default_factory=PRMConfig) lesson_to_skill: LessonToSkillConfig = field(default_factory=LessonToSkillConfig) ================================================ FILE: researchclaw/metaclaw_bridge/lesson_to_skill.py ================================================ """Convert AutoResearchClaw failure lessons into MetaClaw skills. Analyses high-severity lessons from the evolution store and uses an LLM to generate actionable MetaClaw skill files that prevent future recurrence. """ from __future__ import annotations import json import logging import re from pathlib import Path from typing import TYPE_CHECKING from researchclaw.metaclaw_bridge.stage_skill_map import ( LESSON_CATEGORY_TO_SKILL_CATEGORY, ) if TYPE_CHECKING: from researchclaw.evolution import LessonEntry from researchclaw.llm.client import LLMClient logger = logging.getLogger(__name__) _SEVERITY_ORDER = {"info": 0, "warning": 1, "error": 2, "critical": 3} _CONVERSION_PROMPT_SYSTEM = """\ You are a skill designer for an AI agent system. Your job is to convert failure lessons from an automated research pipeline into reusable skill guides that help the agent avoid the same mistakes in the future. Each skill must include: - A descriptive name (lowercase-hyphenated, prefixed with "arc-") - A one-line description of when to use the skill - A category from: {categories} - Markdown content with numbered steps and an anti-pattern section Output a JSON array of skill objects. 
Each object has: "name": "arc-", "description": "", "category": "", "content": "" """ _CONVERSION_PROMPT_USER = """\ The following failure lessons were extracted from automated research runs. Please generate {max_skills} reusable skills to address these failures. ## Failure Lessons {lessons_text} ## Existing Skills (do not duplicate) {existing_skills} Return ONLY a JSON array. No extra text. """ def _format_lessons(lessons: list[LessonEntry]) -> str: """Format lessons into a text block for the LLM prompt.""" parts: list[str] = [] for i, lesson in enumerate(lessons, 1): parts.append( f"{i}. [{lesson.severity}] [{lesson.category}] " f"Stage {lesson.stage_name}: {lesson.description}" ) return "\n".join(parts) def _list_existing_skill_names(skills_dir: Path) -> list[str]: """List all existing skill names in the MetaClaw skills directory.""" try: if not skills_dir.exists(): return [] return [d.name for d in skills_dir.iterdir() if d.is_dir()] except OSError: return [] def _parse_skills_response(text: str) -> list[dict[str, str]]: """Parse the LLM response into a list of skill dicts.""" # Strip markdown code fences if present text = text.strip() if text.startswith("```"): text = re.sub(r"^```\w*\n?", "", text) text = re.sub(r"\n?```\s*$", "", text) try: data = json.loads(text) # Handle both bare array and {"skills": [...]} wrapper if isinstance(data, dict): for key in ("skills", "results", "data"): if key in data and isinstance(data[key], list): data = data[key] break if isinstance(data, list): return [ s for s in data if isinstance(s, dict) and all(k in s for k in ("name", "description", "category", "content")) ] except json.JSONDecodeError: logger.warning("Failed to parse skill evolution response as JSON") return [] def _write_skill(skills_dir: Path, skill: dict[str, str]) -> Path | None: """Write a single skill to disk as a SKILL.md file.""" name = skill["name"] # Sanitize name name = re.sub(r"[^a-z0-9-]", "-", name.lower()).strip("-") if not name: return None 
skill_dir = skills_dir / name skill_dir.mkdir(parents=True, exist_ok=True) skill_path = skill_dir / "SKILL.md" content = f"---\nname: {name}\n" content += f"description: {skill['description']}\n" content += "metadata:\n" content += f" category: {skill['category']}\n" content += f"---\n{skill['content']}\n" skill_path.write_text(content, encoding="utf-8") logger.info("Created new MetaClaw skill: %s", name) return skill_path def _severity_at_least(severity: str, min_severity: str) -> bool: """Check if severity meets or exceeds the minimum threshold.""" return _SEVERITY_ORDER.get(severity, 0) >= _SEVERITY_ORDER.get(min_severity, 0) def convert_lessons_to_skills( lessons: list[LessonEntry], llm: LLMClient, skills_dir: str | Path, *, min_severity: str = "warning", max_skills: int = 3, ) -> list[str]: """Convert failure lessons into MetaClaw skills. Args: lessons: Lessons to convert (will be filtered by severity). llm: LLM client for generating skills. skills_dir: Path to MetaClaw skills directory. min_severity: Minimum severity to include ("info", "warning", "error", "critical"). max_skills: Maximum number of skills to generate. Returns: List of created skill names. 
""" if not lessons: return [] # Filter by severity threshold (>= min_severity) filtered = [ l for l in lessons if _severity_at_least(getattr(l, "severity", ""), min_severity) ] if not filtered: logger.info( "No lessons at severity >= %s (total lessons: %d)", min_severity, len(lessons) ) return [] logger.info( "Converting %d lessons (severity >= %s) to skills", len(filtered), min_severity ) skills_path = Path(skills_dir).expanduser() skills_path.mkdir(parents=True, exist_ok=True) categories = ", ".join(sorted(set(LESSON_CATEGORY_TO_SKILL_CATEGORY.values()))) existing = _list_existing_skill_names(skills_path) system = _CONVERSION_PROMPT_SYSTEM.format(categories=categories) user = _CONVERSION_PROMPT_USER.format( max_skills=max_skills, lessons_text=_format_lessons(filtered), existing_skills=", ".join(existing[:50]) if existing else "(none)", ) try: resp = llm.chat( [{"role": "user", "content": user}], system=system, json_mode=True, max_tokens=3000, ) except Exception: logger.warning("LLM call for lesson-to-skill conversion failed", exc_info=True) return [] parsed = _parse_skills_response(resp.content) if not parsed: logger.warning("No valid skills parsed from LLM response") return [] created: list[str] = [] for skill in parsed[:max_skills]: # Map category using our mapping if needed if skill["category"] not in LESSON_CATEGORY_TO_SKILL_CATEGORY.values(): lesson_cat = skill.get("category", "pipeline") skill["category"] = LESSON_CATEGORY_TO_SKILL_CATEGORY.get( lesson_cat, "research" ) path = _write_skill(skills_path, skill) if path is not None: created.append(skill["name"]) return created ================================================ FILE: researchclaw/metaclaw_bridge/prm_gate.py ================================================ """PRM (Process Reward Model) quality gate for AutoResearchClaw. Uses an LLM-as-judge approach (compatible with MetaClaw's PRMScorer) to evaluate the quality of pipeline stage outputs at key gate stages. 
""" from __future__ import annotations import json import logging import os import re import urllib.error import urllib.request from concurrent.futures import ThreadPoolExecutor, as_completed from statistics import mode logger = logging.getLogger(__name__) # Stage-specific evaluation instructions _GATE_INSTRUCTIONS: dict[int, str] = { 5: ( "Evaluate the quality of a literature screening result for academic research. " "Check: (1) Are the selected papers relevant to the research topic? " "(2) Is there sufficient coverage of key approaches? " "(3) Are low-quality or irrelevant papers properly filtered out?" ), 9: ( "Evaluate the quality of an experiment design for academic research. " "Check: (1) Are there proper baselines for comparison? " "(2) Are ablation studies planned? " "(3) Are statistical methods and metrics well-chosen? " "(4) Is the experiment reproducible?" ), 15: ( "Evaluate whether a research PROCEED/PIVOT decision is well-justified. " "Check: (1) Is there sufficient evidence to support the decision? " "(2) Are alternative interpretations considered? " "(3) Is the rationale logically sound?" ), 20: ( "Evaluate the overall quality of an academic paper. " "Check: (1) Is the contribution novel and clearly stated? " "(2) Is the methodology sound and well-described? " "(3) Do the experiments adequately support the claims? " "(4) Is the writing clear and well-structured?" ), } _JUDGE_SYSTEM = """\ You are a quality reviewer for an automated academic research pipeline. 
Based on the evaluation criteria and the provided output, decide: +1 = clearly meets quality standards and is ready to proceed -1 = fails core requirements or has critical issues 0 = ambiguous or insufficient evidence to decide Respond with ONLY "Score: 1", "Score: -1", or "Score: 0" on the first line, followed by a brief justification.""" def _single_judge_call( api_base: str, api_key: str, model: str, instruction: str, output_text: str, temperature: float, ) -> float | None: """Make a single PRM judge call and parse the score.""" messages = [ {"role": "system", "content": _JUDGE_SYSTEM}, { "role": "user", "content": ( f"## Evaluation Criteria\n{instruction}\n\n" f"## Output to Evaluate\n{output_text[:6000]}" ), }, ] body = json.dumps({ "model": model, "messages": messages, "temperature": temperature, "max_completion_tokens": 512, }).encode("utf-8") url = f"{api_base.rstrip('/')}/chat/completions" req = urllib.request.Request( url, data=body, headers={ "Authorization": f"Bearer {api_key}", "Content-Type": "application/json", }, ) try: with urllib.request.urlopen(req, timeout=60) as resp: data = json.loads(resp.read()) content = data["choices"][0]["message"]["content"] # Parse "Score: X" match = re.search(r"Score:\s*([+-]?[01])", content) if match: return float(match.group(1)) return None except Exception: logger.debug("PRM judge call failed", exc_info=True) return None class ResearchPRMGate: """PRM quality gate using majority-vote LLM-as-judge scoring.""" def __init__( self, api_base: str, api_key: str, model: str = "gpt-5.4", votes: int = 3, temperature: float = 0.6, ) -> None: self.api_base = api_base self.api_key = api_key self.model = model self.votes = votes self.temperature = temperature @classmethod def from_bridge_config(cls, prm_config: object) -> ResearchPRMGate | None: """Create from MetaClawBridgeConfig.prm section. Returns None if PRM is not enabled or not configured. 
""" if not getattr(prm_config, "enabled", False): return None api_key = str( getattr(prm_config, "api_key", "") or os.environ.get(getattr(prm_config, "api_key_env", ""), "") or "" ) api_base = getattr(prm_config, "api_base", "") if not api_base or not api_key: return None return cls( api_base=api_base, api_key=api_key, model=getattr(prm_config, "model", "gpt-5.4"), votes=getattr(prm_config, "votes", 3), temperature=getattr(prm_config, "temperature", 0.6), ) def evaluate_stage( self, stage_num: int, output_text: str, *, custom_instruction: str | None = None, ) -> float: """Evaluate a stage output using majority-vote PRM scoring. Args: stage_num: Pipeline stage number (5, 9, 15, or 20). output_text: The stage output text to evaluate. custom_instruction: Override the default evaluation instruction. Returns: -1.0 (fail), 0.0 (ambiguous), or 1.0 (pass). """ instruction = custom_instruction or _GATE_INSTRUCTIONS.get( stage_num, "Evaluate the quality and correctness of this research output.", ) # Parallel judge calls scores: list[float] = [] with ThreadPoolExecutor(max_workers=self.votes) as pool: futures = [ pool.submit( _single_judge_call, self.api_base, self.api_key, self.model, instruction, output_text, self.temperature, ) for _ in range(self.votes) ] for future in as_completed(futures): result = future.result() if result is not None: scores.append(result) if not scores: logger.warning("All PRM judge calls failed for stage %d", stage_num) return 0.0 try: return float(mode(scores)) except Exception: # Tie — return 0.0 (ambiguous) return 0.0 def should_gate(self, stage_num: int) -> bool: """Check if PRM gating is configured for this stage.""" return stage_num in _GATE_INSTRUCTIONS ================================================ FILE: researchclaw/metaclaw_bridge/session.py ================================================ """MetaClaw session lifecycle management for AutoResearchClaw. 
Manages MetaClaw proxy session headers and lifecycle signals to enable
proper skill evolution and RL training data collection.
"""

from __future__ import annotations

import logging
import uuid  # NOTE(review): unused in this module's visible code — confirm before removing

logger = logging.getLogger(__name__)


class MetaClawSession:
    """Manages a MetaClaw session spanning an AutoResearchClaw pipeline run."""

    def __init__(self, run_id: str) -> None:
        # Session IDs are namespaced with an "arc-" prefix so MetaClaw can
        # distinguish AutoResearchClaw traffic.
        self.session_id = f"arc-{run_id}"
        self._active = True
        logger.info("MetaClaw session started: %s", self.session_id)

    def get_headers(self, stage_name: str = "") -> dict[str, str]:
        """Return HTTP headers for MetaClaw proxy requests.

        Args:
            stage_name: Current pipeline stage (for logging/tracking).

        Returns:
            Dict of headers to include in LLM API requests.
        """
        headers: dict[str, str] = {
            "X-Session-Id": self.session_id,
            "X-Turn-Type": "main",
        }
        if stage_name:
            headers["X-AutoRC-Stage"] = stage_name
        return headers

    def end(self) -> dict[str, str]:
        """Return headers that signal session completion.

        Call this when the pipeline run finishes to trigger MetaClaw's
        post-session processing (skill evolution, etc.).
        """
        self._active = False
        logger.info("MetaClaw session ended: %s", self.session_id)
        return {
            "X-Session-Id": self.session_id,
            "X-Session-Done": "true",
            "X-Turn-Type": "main",
        }

    @property
    def is_active(self) -> bool:
        # True until end() is called.
        return self._active


================================================
FILE: researchclaw/metaclaw_bridge/skill_feedback.py
================================================
"""Track MetaClaw skill effectiveness across pipeline runs.

Records which skills were active during each stage and correlates with
stage success/failure to identify high/low-value skills.
"""

from __future__ import annotations

import json
import logging
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from pathlib import Path

logger = logging.getLogger(__name__)


@dataclass
class SkillEffectivenessRecord:
    """One record of a skill's effectiveness in a pipeline stage."""

    skill_name: str
    stage_name: str
    run_id: str
    stage_success: bool
    timestamp: str


class SkillFeedbackStore:
    """JSONL-backed store for skill effectiveness records."""

    def __init__(self, store_path: Path) -> None:
        self._path = store_path
        # Ensure the parent directory exists so appends never fail on a
        # fresh workspace.
        self._path.parent.mkdir(parents=True, exist_ok=True)

    def append(self, record: SkillEffectivenessRecord) -> None:
        # One JSON object per line (JSONL).
        with self._path.open("a", encoding="utf-8") as f:
            f.write(json.dumps(asdict(record), ensure_ascii=False) + "\n")

    def append_many(self, records: list[SkillEffectivenessRecord]) -> None:
        if not records:
            return
        with self._path.open("a", encoding="utf-8") as f:
            for rec in records:
                f.write(json.dumps(asdict(rec), ensure_ascii=False) + "\n")
        logger.info("Recorded %d skill effectiveness entries", len(records))

    def load_all(self) -> list[SkillEffectivenessRecord]:
        """Load every record from disk; malformed lines are skipped."""
        if not self._path.exists():
            return []
        records: list[SkillEffectivenessRecord] = []
        for line in self._path.read_text(encoding="utf-8").splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
                records.append(
                    SkillEffectivenessRecord(
                        skill_name=data["skill_name"],
                        stage_name=data["stage_name"],
                        run_id=data["run_id"],
                        stage_success=data["stage_success"],
                        timestamp=data["timestamp"],
                    )
                )
            except (json.JSONDecodeError, KeyError):
                # Tolerate corrupt/partial lines rather than failing the load.
                continue
        return records

    def compute_skill_stats(self) -> dict[str, dict[str, int | float]]:
        """Compute success rate per skill across all recorded runs.

        Returns:
            Dict mapping skill_name to {total, successes, success_rate}.
        """
        records = self.load_all()
        stats: dict[str, dict[str, int]] = {}
        for rec in records:
            if rec.skill_name not in stats:
                stats[rec.skill_name] = {"total": 0, "successes": 0}
            stats[rec.skill_name]["total"] += 1
            if rec.stage_success:
                stats[rec.skill_name]["successes"] += 1
        result: dict[str, dict[str, int | float]] = {}
        for name, counts in stats.items():
            total = counts["total"]
            successes = counts["successes"]
            result[name] = {
                "total": total,
                "successes": successes,
                "success_rate": successes / total if total > 0 else 0.0,
            }
        return result


def record_stage_skills(
    store: SkillFeedbackStore,
    stage_name: str,
    run_id: str,
    stage_success: bool,
    active_skills: list[str],
) -> None:
    """Record effectiveness of all active skills for a completed stage."""
    # All skills in the stage share one UTC timestamp.
    now = datetime.now(timezone.utc).isoformat(timespec="seconds")
    records = [
        SkillEffectivenessRecord(
            skill_name=skill,
            stage_name=stage_name,
            run_id=run_id,
            stage_success=stage_success,
            timestamp=now,
        )
        for skill in active_skills
    ]
    store.append_many(records)


================================================
FILE: researchclaw/metaclaw_bridge/stage_skill_map.py
================================================
"""Maps AutoResearchClaw pipeline stages to MetaClaw skill categories.
Each stage maps to:
- task_type: MetaClaw's task category for skill retrieval
- skills: Preferred research-specific skills to inject
- top_k: Number of skills to inject at this stage
"""

from __future__ import annotations

from typing import Any

# Per-stage skill injection config keyed by pipeline stage name.
STAGE_SKILL_MAP: dict[str, dict[str, Any]] = {
    "topic_init": {
        "task_type": "research",
        "skills": ["literature-search-strategy"],
        "top_k": 4,
    },
    "problem_decompose": {
        "task_type": "research",
        "skills": ["research-gap-identification"],
        "top_k": 4,
    },
    "search_strategy": {
        "task_type": "research",
        "skills": ["literature-search-strategy"],
        "top_k": 6,
    },
    "literature_collect": {
        "task_type": "research",
        "skills": ["literature-search-strategy"],
        "top_k": 4,
    },
    "literature_screen": {
        "task_type": "research",
        "skills": ["paper-relevance-screening"],
        "top_k": 6,
    },
    "knowledge_extract": {
        "task_type": "research",
        "skills": ["knowledge-card-extraction"],
        "top_k": 4,
    },
    "synthesis": {
        "task_type": "research",
        "skills": ["research-gap-identification"],
        "top_k": 6,
    },
    "hypothesis_gen": {
        "task_type": "research",
        "skills": ["hypothesis-formulation"],
        "top_k": 6,
    },
    "experiment_design": {
        "task_type": "research",
        "skills": ["experiment-design-rigor"],
        "top_k": 6,
    },
    "code_generation": {
        "task_type": "coding",
        "skills": ["hardware-aware-coding"],
        "top_k": 6,
    },
    "resource_planning": {
        "task_type": "productivity",
        "skills": [],
        "top_k": 3,
    },
    "experiment_run": {
        "task_type": "automation",
        "skills": ["experiment-debugging"],
        "top_k": 4,
    },
    "iterative_refine": {
        "task_type": "coding",
        "skills": ["experiment-debugging"],
        "top_k": 6,
    },
    "result_analysis": {
        "task_type": "data_analysis",
        "skills": ["statistical-analysis"],
        "top_k": 6,
    },
    "research_decision": {
        "task_type": "research",
        "skills": ["research-pivot-decision"],
        "top_k": 4,
    },
    "paper_outline": {
        "task_type": "communication",
        "skills": ["academic-writing-structure"],
        "top_k": 4,
    },
    "paper_draft": {
        "task_type": "communication",
        "skills": ["academic-writing-structure"],
        "top_k": 6,
    },
    "peer_review": {
        "task_type": "communication",
        "skills": ["peer-review-methodology"],
        "top_k": 6,
    },
    "paper_revision": {
        "task_type": "communication",
        "skills": ["academic-writing-structure", "peer-review-methodology"],
        "top_k": 6,
    },
    "quality_gate": {
        "task_type": "research",
        "skills": ["peer-review-methodology"],
        "top_k": 4,
    },
    "knowledge_archive": {
        "task_type": "automation",
        "skills": [],
        "top_k": 2,
    },
    "export_publish": {
        "task_type": "automation",
        "skills": [],
        "top_k": 2,
    },
    "citation_verify": {
        "task_type": "research",
        "skills": ["citation-integrity"],
        "top_k": 4,
    },
}

# Mapping from AutoResearchClaw lesson categories to skill categories.
# Uses the new taxonomy: writing, domain, experiment, tooling.
LESSON_CATEGORY_TO_SKILL_CATEGORY: dict[str, str] = {
    "system": "tooling",
    "experiment": "experiment",
    "writing": "writing",
    "analysis": "experiment",
    "literature": "experiment",
    "pipeline": "tooling",
}


def get_stage_config(stage_name: str) -> dict[str, Any]:
    """Return the MetaClaw skill config for a given pipeline stage.

    Falls back to a generic research config if the stage is unknown.
    """
    # NOTE(review): known stages return the shared dict from STAGE_SKILL_MAP,
    # not a copy — callers must not mutate the result.
    return STAGE_SKILL_MAP.get(
        stage_name,
        {"task_type": "research", "skills": [], "top_k": 4},
    )


================================================
FILE: researchclaw/overleaf/__init__.py
================================================
"""Overleaf bidirectional sync for AutoResearchClaw."""

from researchclaw.overleaf.sync import OverleafSync
from researchclaw.overleaf.conflict import ConflictResolver
from researchclaw.overleaf.watcher import FileWatcher
from researchclaw.overleaf.formatter import LatexFormatter

__all__ = ["OverleafSync", "ConflictResolver", "FileWatcher", "LatexFormatter"]


================================================
FILE: researchclaw/overleaf/conflict.py
================================================
"""Section-level conflict detection and resolution for LaTeX files."""

from __future__ import annotations

import logging
import re
import subprocess  # NOTE(review): unused in this module's visible code — confirm before removing
from pathlib import Path

logger = logging.getLogger(__name__)

# LaTeX section commands in order of depth
_SECTION_PATTERN = re.compile(
    r"^\\(part|chapter|section|subsection|subsubsection)\b",
    re.MULTILINE,
)

# Git-style conflict opener ("<<<<<<< " at line start).
_CONFLICT_MARKER = re.compile(r"^<<<<<<<\s", re.MULTILINE)


class ConflictResolver:
    """Detect and resolve merge conflicts at the LaTeX section level."""

    def has_conflicts(self, repo_dir: Path) -> bool:
        """Check if there are unresolved merge conflicts."""
        for tex in repo_dir.glob("**/*.tex"):
            content = tex.read_text(encoding="utf-8", errors="replace")
            if _CONFLICT_MARKER.search(content):
                return True
        return False

    def detect(self, repo_dir: Path) -> list[dict[str, str]]:
        """Find all conflict regions and which sections they belong to."""
        conflicts: list[dict[str, str]] = []
        for tex in repo_dir.glob("**/*.tex"):
            content = tex.read_text(encoding="utf-8", errors="replace")
            file_conflicts = _extract_conflicts(content)
            # Tag each conflict with its source file (relative path).
            for c in file_conflicts:
                c["file"] = str(tex.relative_to(repo_dir))
            conflicts.extend(file_conflicts)
        return conflicts

    def resolve(self, repo_dir: Path, strategy: str = "ours") -> list[str]:
        """Resolve all conflicts in .tex files using the given strategy.

        strategy: "ours" keeps the local (AI) version,
        "theirs" keeps the remote (human) version
        """
        resolved_files: list[str] = []
        for tex in repo_dir.glob("**/*.tex"):
            content = tex.read_text(encoding="utf-8", errors="replace")
            if not _CONFLICT_MARKER.search(content):
                continue
            resolved = _resolve_content(content, strategy)
            tex.write_text(resolved, encoding="utf-8")
            resolved_files.append(str(tex.relative_to(repo_dir)))
            logger.info("Resolved conflicts in %s (strategy=%s)", tex.name, strategy)
        return resolved_files


def _extract_conflicts(content: str) -> list[dict[str, str]]:
    """Extract conflict regions from file content.

    Returns one {"ours": ..., "theirs": ...} dict per conflict region.
    """
    conflicts: list[dict[str, str]] = []
    in_conflict = False
    ours_lines: list[str] = []
    theirs_lines: list[str] = []
    current = ours_lines
    for line in content.splitlines():
        if line.startswith("<<<<<<<"):
            # Start of a conflict: collect the "ours" side first.
            in_conflict = True
            ours_lines = []
            theirs_lines = []
            current = ours_lines
        elif line.startswith("=======") and in_conflict:
            # Switch to collecting the "theirs" side.
            current = theirs_lines
        elif line.startswith(">>>>>>>") and in_conflict:
            conflicts.append({
                "ours": "\n".join(ours_lines),
                "theirs": "\n".join(theirs_lines),
            })
            in_conflict = False
        elif in_conflict:
            current.append(line)
    return conflicts


def _resolve_content(content: str, strategy: str) -> str:
    """Replace conflict markers with the chosen side.

    Marker lines themselves and the losing side are dropped; all text
    outside conflict regions is preserved verbatim (keepends=True).
    """
    lines = content.splitlines(keepends=True)
    result: list[str] = []
    state = "normal"  # normal | ours | theirs
    for line in lines:
        if line.startswith("<<<<<<<"):
            state = "ours"
        elif line.startswith("=======") and state == "ours":
            state = "theirs"
        elif line.startswith(">>>>>>>") and state == "theirs":
            state = "normal"
        else:
            if state == "normal":
                result.append(line)
            elif state == "ours" and strategy == "ours":
                result.append(line)
            elif state == "theirs" and strategy == "theirs":
                result.append(line)
    return "".join(result)


================================================
FILE: researchclaw/overleaf/formatter.py
================================================
"""LaTeX formatting adapter for Overleaf compatibility."""

from __future__ import annotations

import re
from pathlib import Path


class LatexFormatter:
    """Adapt pipeline LaTeX output for Overleaf conventions."""

    @staticmethod
    def normalize_paths(content: str, figures_prefix: str = "figures/") -> str:
        """Normalize figure paths to use Overleaf-style relative paths."""
        # Replace absolute or deep-nested paths with flat figures/ prefix
        # (keeps only the basename of the graphics file, preserving any
        # optional [..] argument to \includegraphics).
        content = re.sub(
            r"\\includegraphics(\[.*?\])?\{[^}]*?([^/}]+\.(?:png|pdf|jpg|eps))\}",
            lambda m: f"\\includegraphics{m.group(1) or ''}{{{figures_prefix}{m.group(2)}}}",
            content,
        )
        return content

    @staticmethod
    def ensure_document_class(content: str) -> str:
        """Ensure the file has a \\documentclass declaration."""
        if "\\documentclass" not in content:
            content = "\\documentclass{article}\n" + content
        return content

    @staticmethod
    def strip_local_comments(content: str) -> str:
        """Remove AutoResearchClaw-internal comments from LaTeX."""
        lines = content.splitlines(keepends=True)
        return "".join(
            line for line in lines
            if not line.strip().startswith("% RESEARCHCLAW:")
        )

    @staticmethod
    def fix_encoding(content: str) -> str:
        """Ensure UTF-8 input encoding package is declared."""
        if "\\usepackage[utf8]{inputenc}" not in content and "\\usepackage{inputenc}" not in content:
            # Insert after documentclass
            content = re.sub(
                r"(\\documentclass.*?\n)",
                r"\1\\usepackage[utf8]{inputenc}\n",
                content,
                count=1,
            )
        return content

    def format_for_overleaf(self, tex_path: Path) -> str:
        """Apply all formatting steps to a LaTeX file."""
        content = tex_path.read_text(encoding="utf-8")
        content = self.ensure_document_class(content)
        content = self.fix_encoding(content)
        content = self.normalize_paths(content)
        content = self.strip_local_comments(content)
        return content


================================================
FILE: researchclaw/overleaf/sync.py
================================================
"""Overleaf Git-based bidirectional sync engine."""

from __future__ import annotations

import logging
import shutil
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

from researchclaw.overleaf.conflict import ConflictResolver

logger = logging.getLogger(__name__)


class OverleafSync:
    """Bidirectional sync between pipeline output and Overleaf via Git."""

    def __init__(
        self,
        git_url: str,
        branch: str = "main",
        auto_push: bool = True,
        auto_pull: bool = False,
    ) -> None:
        self.git_url = git_url
        self.branch = branch
        self.auto_push = auto_push
        self.auto_pull = auto_pull
        # Set by setup(); most operations require it.
        self.local_dir: Path | None = None
        self._last_sync: datetime | None = None
        self._conflict_resolver = ConflictResolver()

    def setup(self, run_dir: Path) -> Path:
        """Clone or update the Overleaf repo into the run directory."""
        self.local_dir = run_dir / "overleaf_repo"
        if self.local_dir.exists() and (self.local_dir / ".git").exists():
            logger.info("Pulling latest from Overleaf...")
            self._git("pull", "origin", self.branch)
        else:
            logger.info("Cloning Overleaf repo: %s", self.git_url)
            self.local_dir.mkdir(parents=True, exist_ok=True)
            self._git_clone()
        return self.local_dir

    def push_paper(
        self,
        paper_tex: Path,
        bib_file: Path | None = None,
        figures_dir: Path | None = None,
    ) -> bool:
        """Push pipeline-generated paper to Overleaf.

        Copies .tex, .bib, and figures into the local clone,
        then commits and pushes.

        Returns:
            True if a commit was pushed, False when there was nothing to sync.

        Raises:
            RuntimeError: if setup() has not been called, or a git command fails.
        """
        if not self.local_dir:
            raise RuntimeError("Call setup() before push_paper()")
        # Copy main tex file
        dst_tex = self.local_dir / paper_tex.name
        shutil.copy2(paper_tex, dst_tex)
        logger.info("Copied %s -> %s", paper_tex, dst_tex)
        # Copy bib file
        if bib_file and bib_file.exists():
            dst_bib = self.local_dir / bib_file.name
            shutil.copy2(bib_file, dst_bib)
        # Copy figures (replace the whole figures/ dir to drop stale files)
        if figures_dir and figures_dir.is_dir():
            dst_figs = self.local_dir / "figures"
            if dst_figs.exists():
                shutil.rmtree(dst_figs)
            shutil.copytree(figures_dir, dst_figs)
        # Git add, commit, push
        self._git("add", "-A")
        status = self._git("status", "--porcelain")
        if not status.strip():
            logger.info("No changes to push")
            return False
        ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
        self._git("commit", "-m", f"AutoResearchClaw sync: {ts}")
        self._git("push", "origin", self.branch)
        self._last_sync = datetime.now(timezone.utc)
        logger.info("Pushed paper to Overleaf")
        return True

    def pull_changes(self) -> list[str]:
        """Pull human edits from Overleaf and return changed file names."""
        if not self.local_dir:
            raise RuntimeError("Call setup() before pull_changes()")
        # Record current HEAD
        old_head = self._git("rev-parse", "HEAD").strip()
        self._git("pull", "origin", self.branch)
        new_head = self._git("rev-parse", "HEAD").strip()
        if old_head == new_head:
            return []
        # Get list of changed files
        diff_output = self._git("diff", "--name-only", old_head, new_head)
        changed = [f.strip() for f in diff_output.splitlines() if f.strip()]
        self._last_sync = datetime.now(timezone.utc)
        return changed

    def get_status(self) -> dict[str, Any]:
        """Return sync status information."""
        status: dict[str, Any] = {
            "git_url": self.git_url,
            "branch": self.branch,
            "local_dir": str(self.local_dir) if self.local_dir else None,
            "last_sync": self._last_sync.isoformat() if self._last_sync else None,
            "auto_push": self.auto_push,
            "auto_pull": self.auto_pull,
        }
        if self.local_dir and (self.local_dir / ".git").exists():
            pending = self._git("status", "--porcelain").strip()
            status["pending_changes"] = len(pending.splitlines()) if pending else 0
        return status

    def resolve_conflicts(self, strategy: str = "ours") -> list[str]:
        """Resolve merge conflicts using the given strategy."""
        if not self.local_dir:
            raise RuntimeError("Call setup() before resolve_conflicts()")
        return self._conflict_resolver.resolve(self.local_dir, strategy)

    # ── git helpers ───────────────────────────────────────────────

    def _git(self, *args: str) -> str:
        """Run a git command in the local repo directory.

        Raises RuntimeError on failure, except when stderr mentions a
        conflict — merge conflicts are handled by resolve_conflicts().
        """
        cmd = ["git", "-C", str(self.local_dir), *args]
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=60,
        )
        if result.returncode != 0 and "conflict" not in result.stderr.lower():
            raise RuntimeError(f"git {' '.join(args)} failed: {result.stderr.strip()}")
        return result.stdout

    def _git_clone(self) -> None:
        """Clone the Overleaf repo."""
        result = subprocess.run(
            ["git", "clone", "-b", self.branch, self.git_url, str(self.local_dir)],
            capture_output=True,
            text=True,
            timeout=120,
        )
        if result.returncode != 0:
            raise RuntimeError(f"git clone failed: {result.stderr.strip()}")


================================================
FILE: researchclaw/overleaf/watcher.py
================================================
"""File change watcher for Overleaf sync polling."""

from __future__ import annotations

import logging
import time
from pathlib import Path
from typing import Any

logger = logging.getLogger(__name__)


class FileWatcher:
    """Watch a directory for file changes (poll-based)."""

    def __init__(self, watch_dir: Path, extensions: tuple[str, ...] = (".tex", ".bib")) -> None:
        self.watch_dir = watch_dir
        self.extensions = extensions
        # Map of relative path -> last seen mtime.
        self._snapshot: dict[str, float] = {}
        self._take_snapshot()

    def _take_snapshot(self) -> None:
        """Record modification times of all tracked files."""
        self._snapshot = {}
        if not self.watch_dir.exists():
            return
        for ext in self.extensions:
            for f in self.watch_dir.rglob(f"*{ext}"):
                self._snapshot[str(f.relative_to(self.watch_dir))] = f.stat().st_mtime

    def check_changes(self) -> list[str]:
        """Return files that have changed since the last snapshot.

        New and modified files are detected via mtime; deleted files are
        also reported. The snapshot is refreshed as a side effect.
        """
        changed: list[str] = []
        current: dict[str, float] = {}
        if not self.watch_dir.exists():
            return changed
        for ext in self.extensions:
            for f in self.watch_dir.rglob(f"*{ext}"):
                rel = str(f.relative_to(self.watch_dir))
                mtime = f.stat().st_mtime
                current[rel] = mtime
                old_mtime = self._snapshot.get(rel)
                if old_mtime is None or mtime > old_mtime:
                    changed.append(rel)
        # Check for deleted files
        for rel in self._snapshot:
            if rel not in current:
                changed.append(rel)
        self._snapshot = current
        return changed

    def poll_loop(self, interval_sec: int = 300, callback: Any = None) -> None:
        """Blocking poll loop that calls callback on changes.

        This is meant to be run in a background thread.
""" logger.info("Starting file watcher on %s (interval=%ds)", self.watch_dir, interval_sec) while True: time.sleep(interval_sec) changes = self.check_changes() if changes and callback: callback(changes) ================================================ FILE: researchclaw/pipeline/__init__.py ================================================ """Pipeline core — 23-stage research pipeline.""" ================================================ FILE: researchclaw/pipeline/_domain.py ================================================ """Domain detection — maps research topic to academic domain & venue context.""" from __future__ import annotations _DOMAIN_KEYWORDS: dict[str, tuple[list[str], str, str]] = { # domain_id: (keywords, display_name, top_venues) "ml": ( ["machine learning", "deep learning", "neural network", "transformer", "reinforcement learning", "GAN", "diffusion model", "LLM", "language model", "computer vision", "NLP", "representation learning", "self-supervised", "federated learning", "meta-learning", "continual learning", "few-shot", "knowledge distillation", "attention mechanism", "fine-tuning", "RLHF", "vision transformer", "ViT", "BERT", "GPT", "autoencoder"], "machine learning", "NeurIPS, ICML, ICLR", ), "physics": ( ["quantum", "thermodynamic", "electrodynamic", "particle physics", "condensed matter", "statistical mechanics", "cosmology", "astrophysics", "plasma", "optics", "photonics", "relativity", "gravitational", "PDE", "PINN", "physics-informed", "Burgers", "Navier-Stokes", "Darcy flow", "Schrödinger", "scientific computing", "operator learning", "neural operator", "Fourier neural", "DeepONet"], "physics", "Physical Review Letters, Nature Physics, JHEP", ), "chemistry": ( ["molecular", "catalysis", "polymer", "organic chemistry", "inorganic", "electrochemistry", "spectroscopy", "crystallography", "drug discovery", "protein folding", "computational chemistry", "DFT", "force field"], "chemistry", "JACS, Nature Chemistry, Angewandte Chemie", ), 
"economics": ( ["econometric", "macroeconomic", "microeconomic", "game theory", "market", "fiscal policy", "monetary", "behavioral economics", "causal inference", "panel data", "regression discontinuity", "instrumental variable", "supply chain", "auction"], "economics", "AER, Econometrica, QJE, Review of Economic Studies", ), "mathematics": ( ["theorem", "proof", "prove", "conjecture", "topology", "algebra", "number theory", "combinatorics", "differential equation", "stochastic process", "functional analysis", "manifold", "Riemannian", "category theory", "graph theory", "neural ODE", "dynamical system", "Lorenz", "chaotic", "Lyapunov", "attractor", "ODE solver", "trajectory prediction", "mathematical formulation", "mathematical proof", "derivation", "Brownian motion", "branching process", "Galton-Watson", "Markov chain", "martingale", "ergodic", "convergence theorem", "marginal distribution", "extinction probability", "Feynman-Kac", "measure theory", "Hilbert space", "Banach space", "operator theory", "variational", "Euler-Lagrange", "calculus of variations"], "mathematics", "Annals of Mathematics, Inventiones Mathematicae, JAMS", ), "engineering": ( ["robotics", "control system", "signal processing", "FPGA", "embedded system", "VLSI", "antenna", "fluid dynamics", "CFD", "finite element", "structural", "mechatronics", "autonomous"], "engineering", "IEEE Transactions, ASME journals, AIAA", ), "biology": ( ["genomics", "proteomics", "transcriptomics", "CRISPR", "single-cell", "phylogenetic", "ecology", "neuroscience", "bioinformatics", "sequencing", "gene expression", "epigenetic"], "biology", "Nature, Science, Cell, PNAS", ), } def _detect_domain(topic: str, domains: tuple[str, ...] = ()) -> tuple[str, str, str]: """Detect research domain from topic string and config domains. Returns ``(domain_id, display_name, top_venues)``. Falls back to ``("ml", "machine learning", "NeurIPS, ICML, ICLR")``. 
""" # If user explicitly specified domains, check them first for d in domains: d_lower = d.lower().strip() for did, (kws, dname, venues) in _DOMAIN_KEYWORDS.items(): if d_lower in (did, dname) or any(k in d_lower for k in kws[:3]): return did, dname, venues # Auto-detect from topic text topic_lower = topic.lower() best_did, best_score = "ml", 0 # BUG-101: Explicit theoretical intent words boost non-empirical domain scores. # Topics like "derive the mathematical formulation of X diffusion model" # should classify as math, not ML, even if "diffusion model" is an ML keyword. _theoretical_intent = any( w in topic_lower for w in ("derive", "prove", "mathematical formulation", "mathematical proof", "formal proof", "formalism") ) for did, (kws, dname, venues) in _DOMAIN_KEYWORDS.items(): score = sum(1 for k in kws if k.lower() in topic_lower) # Boost non-empirical domains when theoretical intent is detected if _theoretical_intent and did in ("mathematics", "physics", "economics"): score += 1 if score > best_score: best_score = score best_did = did did = best_did _, dname, venues = _DOMAIN_KEYWORDS[did] return did, dname, venues def _is_ml_domain(domain_id: str) -> bool: """Check if the detected domain is ML/AI.""" return domain_id == "ml" ================================================ FILE: researchclaw/pipeline/_helpers.py ================================================ """Shared constants, data classes, and utility functions for the pipeline executor.""" from __future__ import annotations import json import logging import math import re from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path from typing import Any import yaml from researchclaw.config import RCConfig from researchclaw.hardware import HardwareProfile, is_metric_name from researchclaw.llm.client import LLMClient from researchclaw.pipeline.stages import ( NEXT_STAGE, Stage, StageStatus, ) from researchclaw.prompts import PromptManager logger = 
logging.getLogger(__name__) # --------------------------------------------------------------------------- # Data classes # --------------------------------------------------------------------------- @dataclass(frozen=True) class StageResult: """Outcome of executing a single stage.""" stage: Stage status: StageStatus artifacts: tuple[str, ...] error: str | None = None decision: str = "proceed" evidence_refs: tuple[str, ...] = () # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- _SANDBOX_SAFE_PACKAGES = { "numpy", "scipy", "torch", "sklearn", "matplotlib", "pandas", "seaborn", "tqdm", "gymnasium", "gym", } _METACLAW_SKILLS_DIR = str(Path.home() / ".metaclaw" / "skills") # --- P1-1: Topic keyword extraction for domain pre-filter --- _STOP_WORDS = frozenset( { "a", "an", "the", "and", "or", "but", "in", "on", "of", "for", "to", "with", "by", "at", "from", "as", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does", "did", "will", "would", "could", "should", "may", "might", "can", "shall", "not", "no", "nor", "so", "yet", "both", "each", "every", "all", "any", "few", "more", "most", "other", "some", "such", "than", "too", "very", "just", "about", "above", "after", "again", "between", "into", "through", "during", "before", "after", "under", "over", "using", "based", "via", "toward", "towards", "new", "novel", "approach", "method", "study", "research", "paper", "work", "propose", "proposed", } ) # --------------------------------------------------------------------------- # Timestamp utility # --------------------------------------------------------------------------- def _utcnow_iso() -> str: return datetime.now(timezone.utc).isoformat(timespec="seconds") # --------------------------------------------------------------------------- # Fallback query builder # 
--------------------------------------------------------------------------- def _build_fallback_queries(topic: str) -> list[str]: """Extract meaningful search queries from a long topic string. Instead of using the raw topic as a query (which is often 200+ chars and returns garbage from search engines), extract noun phrases and domain keywords. Returns 5-10 targeted queries. """ # Split on common delimiters and extract meaningful chunks chunks = re.split(r"[,:;()\[\]]+", topic) chunks = [c.strip() for c in chunks if len(c.strip()) > 8] cleaned_chunks = [] for c in chunks: c = re.sub( r"^(and|or|the|a|an|in|of|for|with|across|multiple|three|various)\s+", "", c, flags=re.IGNORECASE, ) c = c.strip() if len(c) > 8: cleaned_chunks.append(c) chunks = cleaned_chunks # Extract key terms (words that look like domain terms, not stopwords) _stop = { "the", "and", "for", "with", "from", "that", "this", "into", "over", "across", "multiple", "three", "result", "comprehensive", "using", "based", "between", "various", "different", "several", "parameter", "parameters", "analysis", "approach", "method", "framework", "frameworks", } words = topic.lower().split() key_terms = [w for w in words if len(w) > 3 and w not in _stop] queries: list[str] = [] # Strategy 1: Use meaningful chunks (up to 60 chars each) for chunk in chunks[:4]: if len(chunk) > 60: chunk = " ".join(chunk.split()[:6]) if chunk and chunk not in queries: queries.append(chunk) # Strategy 2: Bigrams of key terms clean_terms = [t for t in key_terms if re.match(r"^[a-z]", t) and ":" not in t] for i in range(min(len(clean_terms) - 1, 4)): bigram = f"{clean_terms[i]} {clean_terms[i + 1]}" if bigram not in queries: queries.append(bigram) # Deduplicate preserving order seen: set[str] = set() unique: list[str] = [] for q in queries: q_lower = q.strip().lower() if q_lower and q_lower not in seen: seen.add(q_lower) unique.append(q.strip()) # Ensure we have at least a few useful queries topic_short = topic[:60].strip() for suffix 
in ("survey", "review", "benchmark", "state of the art", "recent advances"): if len(unique) >= 5: break candidate = f"{topic_short} {suffix}".strip() if candidate.lower() not in seen: seen.add(candidate.lower()) unique.append(candidate) return unique[:10] # --------------------------------------------------------------------------- # Stage metadata I/O # --------------------------------------------------------------------------- def _write_stage_meta( stage_dir: Path, stage: Stage, run_id: str, result: "StageResult" ) -> None: next_stage = NEXT_STAGE[stage] meta = { "stage_id": f"{int(stage):02d}-{stage.name.lower()}", "run_id": run_id, "status": result.status.value, "decision": result.decision, "output_artifacts": list(result.artifacts), "evidence_refs": list(result.evidence_refs), "error": result.error, "ts": _utcnow_iso(), "next_stage": int(next_stage) if next_stage is not None else None, } (stage_dir / "decision.json").write_text( json.dumps(meta, indent=2), encoding="utf-8" ) # --------------------------------------------------------------------------- # Sandbox dependency helper # --------------------------------------------------------------------------- def _ensure_sandbox_deps(code: str, python_path: str) -> list[str]: """P7: Scan code imports and auto-install missing common packages.""" import subprocess as _sp imports: set[str] = set() for line in code.splitlines(): m = re.match(r"^(?:from|import)\s+(\w+)", line.strip()) if m: imports.add(m.group(1)) to_check = imports & _SANDBOX_SAFE_PACKAGES if not to_check: return [] py = python_path py_path = Path(py) if not py_path.is_absolute(): py_path = Path.cwd() / py_path installed: list[str] = [] for pkg in sorted(to_check): try: r = _sp.run( [str(py_path), "-c", f"import {pkg}"], capture_output=True, timeout=10, ) if r.returncode != 0: pip_name = "scikit-learn" if pkg == "sklearn" else pkg logger.info("Sandbox: installing missing dependency '%s'", pip_name) _sp.run( [str(py_path), "-m", "pip", "install", 
pip_name, "--quiet"], capture_output=True, timeout=120, ) installed.append(pip_name) except Exception as exc: logger.warning("Sandbox: failed to check/install '%s': %s", pkg, exc) if installed: logger.info("Sandbox: auto-installed packages: %s", ", ".join(installed)) return installed # --------------------------------------------------------------------------- # Prior artifact I/O # --------------------------------------------------------------------------- def _read_best_analysis(run_dir: Path) -> str: """BUG-225: Read analysis.md from the best Stage 14 iteration. Prefers ``analysis_best.md`` at run root (written by ``_promote_best_stage14``) over ``_read_prior_artifact("analysis.md")`` which may pick a degenerate non-versioned stage-14 directory. """ best = run_dir / "analysis_best.md" if best.exists(): return best.read_text(encoding="utf-8") return _read_prior_artifact(run_dir, "analysis.md") or "" def _read_prior_artifact(run_dir: Path, filename: str) -> str | None: # R14-2: Sort so non-versioned dirs (stage-13) come before versioned (stage-13_v1). # Within the same stage number, prefer the latest (non-versioned) copy. 
def _stage_sort_key(p: Path) -> tuple[str, int]: name = p.name # Extract base stage name and version if "_v" in name: base, _, ver = name.rpartition("_v") try: return (base, -int(ver)) # Versioned: lower priority (negative version) except ValueError: return (name, -999) return (name, 0) # Non-versioned: highest priority for stage_subdir in sorted(run_dir.glob("stage-*"), key=_stage_sort_key, reverse=True): candidate = stage_subdir / filename if candidate.is_file(): return candidate.read_text(encoding="utf-8") if filename.endswith("/") and (stage_subdir / filename.rstrip("/")).is_dir(): return str(stage_subdir / filename.rstrip("/")) return None def _find_prior_file(run_dir: Path, filename: str) -> Path | None: """Like ``_read_prior_artifact`` but returns the *Path* instead of content.""" def _stage_sort_key(p: Path) -> tuple[str, int]: name = p.name if "_v" in name: base, _, ver = name.rpartition("_v") try: return (base, -int(ver)) except ValueError: return (name, -999) return (name, 0) for stage_subdir in sorted(run_dir.glob("stage-*"), key=_stage_sort_key, reverse=True): candidate = stage_subdir / filename if candidate.is_file(): return candidate return None def _load_hardware_profile(run_dir: Path) -> dict[str, Any] | None: """Load hardware_profile.json from a prior stage (usually stage-01).""" raw = _read_prior_artifact(run_dir, "hardware_profile.json") if raw is None: return None try: data = json.loads(raw) return data if isinstance(data, dict) else None except (json.JSONDecodeError, ValueError): return None # --------------------------------------------------------------------------- # Parsing utilities # --------------------------------------------------------------------------- def _extract_yaml_block(text: str) -> str: """Extract YAML from text that may contain ACP noise. Strips [thinking] blocks, insight blocks, and other ACP artifacts before looking for YAML in markdown fences or raw text. """ # Strip ACP noise: [thinking]..., insight blocks, [plan]... 
cleaned = re.sub( r"\[thinking\].*?(?=\n```|\n[A-Z]|\Z)", "", text, flags=re.DOTALL, ) cleaned = re.sub(r"\[plan\].*?\n\n", "", cleaned, flags=re.DOTALL) # Try markdown fences first (most reliable) — on cleaned text if "```yaml" in cleaned: return cleaned.split("```yaml", 1)[1].split("```", 1)[0].strip() if "```yml" in cleaned: return cleaned.split("```yml", 1)[1].split("```", 1)[0].strip() if "```" in cleaned: block = cleaned.split("```", 1)[1].split("```", 1)[0].strip() if block: return block # Try the original text too (in case cleaning removed too much) if "```yaml" in text: return text.split("```yaml", 1)[1].split("```", 1)[0].strip() if "```yml" in text: return text.split("```yml", 1)[1].split("```", 1)[0].strip() if "```" in text: block = text.split("```", 1)[1].split("```", 1)[0].strip() if block: return block # Last resort: try to find YAML-like content (lines starting with key:) yaml_lines: list[str] = [] in_yaml = False for line in cleaned.splitlines(): stripped = line.strip() if not in_yaml and re.match(r"^[a-z_]+:", stripped): in_yaml = True if in_yaml: if stripped and not stripped.startswith("#"): yaml_lines.append(line) elif not stripped and yaml_lines: yaml_lines.append(line) if yaml_lines: return "\n".join(yaml_lines).strip() return text.strip() def _safe_json_loads(text: str, default: Any) -> Any: """Parse JSON from text, handling noisy ACP output. Tries multiple strategies: direct parse, markdown fence extraction, balanced brace matching (largest dict wins), and array brackets. 
""" if not text or not text.strip(): return default # Strategy 1: Direct parse try: return json.loads(text) except (json.JSONDecodeError, ValueError, RecursionError): pass # Strategy 2: Find JSON in markdown code fences fence_pattern = re.compile(r"```(?:json)?\s*\n(.*?)```", re.DOTALL) for match in fence_pattern.finditer(text): candidate = match.group(1).strip() try: return json.loads(candidate) except (json.JSONDecodeError, ValueError): continue # Strategy 3: Find outermost balanced braces brace_depth = 0 start = -1 candidates: list[str] = [] for i, ch in enumerate(text): if ch == "{": if brace_depth == 0: start = i brace_depth += 1 elif ch == "}": brace_depth -= 1 if brace_depth == 0 and start >= 0: candidates.append(text[start : i + 1]) start = -1 # Try candidates from largest to smallest candidates.sort(key=len, reverse=True) for candidate in candidates: try: parsed = json.loads(candidate) if isinstance(parsed, dict): return parsed except (json.JSONDecodeError, ValueError): continue # Strategy 4: Same for array [ ] bracket_depth = 0 start = -1 for i, ch in enumerate(text): if ch == "[": if bracket_depth == 0: start = i bracket_depth += 1 elif ch == "]": bracket_depth -= 1 if bracket_depth == 0 and start >= 0: try: parsed = json.loads(text[start : i + 1]) if isinstance(parsed, list): return parsed except (json.JSONDecodeError, ValueError): pass start = -1 return default def _extract_code_block(content: str) -> str: match = re.search(r"```(?:python)?\s*(.*?)\s*```", content, flags=re.DOTALL) if match is not None: return match.group(1).strip() return content.strip() def _extract_multi_file_blocks(content: str) -> dict[str, str]: """Parse LLM response containing multiple files with filename markers. Expected format:: ```filename:main.py import model ... ``` ```filename:model.py class MyModel: ... 
``` Also handles common LLM format variations: - ````` ```python filename:main.py````` (space before filename) - ````` ``` filename:main.py````` (space after backticks) - ``filename:main.py`` on next line after backticks - ``# FILE: main.py`` comment markers inside code blocks Falls back to treating the entire code block as ``main.py`` if no ``filename:`` markers are found. Returns a dict mapping filename → code content. """ # R13-2: Multiple patterns to handle LLM format variations patterns = [ # Original: ```filename:xxx.py or ```python filename:xxx.py re.compile( r"```(?:python\s+)?filename:(\S+)\s*\n(.*?)```", flags=re.DOTALL, ), # Variation: ``` filename:xxx.py (space after backticks) re.compile( r"```\s+filename:(\S+)\s*\n(.*?)```", flags=re.DOTALL, ), # Variation: ```python\nfilename:xxx.py (filename on next line) re.compile( r"```(?:python)?\s*\nfilename:(\S+)\s*\n(.*?)```", flags=re.DOTALL, ), # Variation: ```python\n# filename: xxx.py (comment marker) re.compile( r"```(?:python)?\s*\n#\s*(?:FILE|filename)\s*:\s*(\S+\.py)\s*\n(.*?)```", flags=re.DOTALL, ), ] matches: list[tuple[str, str]] = [] for pattern in patterns: matches = pattern.findall(content) if matches: break if matches: files: dict[str, str] = {} for fname, code in matches: fname = fname.strip() # Security: prevent path traversal if ".." 
in fname or fname.startswith("/"): continue # Normalise to flat filenames (strip leading ./ or subdirs for safety) fname = fname.replace("\\", "/").split("/")[-1] if fname and fname.endswith(".py"): files[fname] = code.strip() if files: # Ensure there is a main.py entry point if "main.py" not in files: # Pick the first file as main.py first_key = next(iter(files)) files["main.py"] = files.pop(first_key) return files # Fallback: single code block → main.py code = _extract_code_block(content) if code.strip(): return {"main.py": code} return {} def _parse_jsonl_rows(text: str) -> list[dict[str, Any]]: rows: list[dict[str, Any]] = [] for line in text.splitlines(): line = line.strip() if not line: continue parsed = _safe_json_loads(line, {}) if isinstance(parsed, dict): rows.append(parsed) return rows def _write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None: with path.open("w", encoding="utf-8") as handle: for row in rows: handle.write(json.dumps(row, ensure_ascii=False) + "\n") def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any]: """Parse metric lines from experiment stdout. Handles multiple formats: - ``name: value`` (e.g. ``loss: 0.0042``) - ``UCB (Stochastic) cumulative_regret: 361.9233`` - ``condition=name metric=value`` (per-condition output) - ``condition=name/metric_name metric=value`` Returns a flat dict of metric_name -> value. Filters out log/status lines using :func:`is_metric_name`. 
""" # BUG-173: regex for condition=name metric=value format _CONDITION_RE = re.compile( r"^condition=(\S+)\s+metric=([0-9eE.+-]+)\s*$" ) metrics: dict[str, Any] = {} for line in stdout.splitlines(): line = line.strip() # --- Format 2: condition=xxx metric=yyy --- m = _CONDITION_RE.match(line) if m: cond_name = m.group(1) try: fval = float(m.group(2)) metrics[cond_name] = fval except (ValueError, TypeError): pass continue # --- Format 1: name: value --- if ":" not in line: continue # Split on the LAST colon to handle names with colons parts = line.rsplit(":", 1) if len(parts) != 2: continue name_part = parts[0].strip() value_part = parts[1].strip() # Filter out log lines that look like status messages if not is_metric_name(name_part): continue try: fval = float(value_part) # Use the full name (e.g. "UCB (Stochastic) cumulative_regret") metrics[name_part] = fval except (ValueError, TypeError): pass return metrics # --------------------------------------------------------------------------- # LLM helpers # --------------------------------------------------------------------------- def _chat_with_prompt( llm: LLMClient, system: str, user: str, *, json_mode: bool = False, max_tokens: int | None = None, retries: int = 0, strip_thinking: bool = True, ) -> Any: """Send a chat request with optional retry on timeout/transient errors. Parameters ---------- retries: Number of extra attempts after the first failure (0 = no retry). Uses exponential backoff: 2s, 4s, 8s, ... strip_thinking: If True (default for pipeline usage), strip ```` tags from the LLM response. This prevents chain-of-thought leakage from breaking YAML / JSON / LaTeX parsers downstream. 
""" import time messages = [{"role": "user", "content": user}] last_exc: Exception | None = None for attempt in range(1 + retries): try: if json_mode and max_tokens is not None: return llm.chat(messages, system=system, json_mode=True, max_tokens=max_tokens, strip_thinking=strip_thinking) if json_mode: return llm.chat(messages, system=system, json_mode=True, strip_thinking=strip_thinking) if max_tokens is not None: return llm.chat(messages, system=system, max_tokens=max_tokens, strip_thinking=strip_thinking) return llm.chat(messages, system=system, strip_thinking=strip_thinking) except Exception as exc: # noqa: BLE001 last_exc = exc if attempt < retries: delay = 2 ** (attempt + 1) logger.warning( "LLM call failed (attempt %d/%d): %s. Retrying in %ds...", attempt + 1, 1 + retries, exc, delay, ) time.sleep(delay) else: raise last_exc from None raise last_exc # type: ignore[misc] # unreachable but satisfies type checker def _get_evolution_overlay(run_dir: Path | None, stage_name: str) -> str: """Load evolution lessons + MetaClaw skills for prompt injection. Combines intra-run lessons (from current run's evolution dir) with cross-run arc-* skills (from ~/.metaclaw/skills/). Returns empty string if no relevant lessons/skills exist or on any error. """ if run_dir is None: return "" try: from researchclaw.evolution import EvolutionStore store = EvolutionStore(run_dir / "evolution") return store.build_overlay( stage_name, max_lessons=5, skills_dir=_METACLAW_SKILLS_DIR ) except Exception: # noqa: BLE001 return "" # --------------------------------------------------------------------------- # Context builders # --------------------------------------------------------------------------- def _collect_json_context( directory: Path, *, max_files: int = 30, max_total_chars: int = 50_000, ) -> str: """Collect JSON context from a directory, with size limits. 
Large fields like ``stderr`` and ``stdout`` are stripped to avoid exceeding LLM token limits (the raw experiment output can be 5 MB+). """ chunks: list[str] = [] total = 0 for file_path in sorted(directory.glob("*.json"))[:max_files]: try: data = json.loads(file_path.read_text(encoding="utf-8")) except (json.JSONDecodeError, OSError): continue # Strip verbose fields that bloat the context if isinstance(data, dict): for key in ("stderr", "stdout", "raw_output", "traceback"): if key in data and isinstance(data[key], str) and len(data[key]) > 500: data[key] = data[key][:500] + f"\n... [truncated, {len(data[key])} chars total]" chunk = json.dumps(data, indent=2, ensure_ascii=False) if total + len(chunk) > max_total_chars: remaining = max_total_chars - total if remaining > 200: chunks.append(chunk[:remaining] + "\n... [truncated]") break chunks.append(chunk) total += len(chunk) return "\n\n".join(chunks) def _collect_experiment_results( run_dir: Path, metric_key: str = "", metric_direction: str = "maximize", ) -> dict[str, Any]: """Aggregate experiment metrics from runs/ directory across prior stages. Returns a dict with ``runs``, ``metrics_summary``, ``best_run``, ``latex_table``, and optionally ``structured_results``. 
""" runs_data: list[dict[str, Any]] = [] structured_results: Any = None # Scan all stage dirs for runs/ subdirectory for stage_subdir in sorted(run_dir.glob("stage-*/runs")): # Check for structured results.json first results_json = stage_subdir / "results.json" if results_json.exists() and structured_results is None: try: structured_results = json.loads( results_json.read_text(encoding="utf-8") ) except (json.JSONDecodeError, OSError): pass for run_file in sorted(stage_subdir.glob("*.json")): if run_file.name == "results.json": continue # Already handled above parsed = _safe_json_loads(run_file.read_text(encoding="utf-8"), {}) if isinstance(parsed, dict) and "metrics" in parsed: # Also check for structured_results inside run payload if "structured_results" in parsed and structured_results is None: structured_results = parsed["structured_results"] runs_data.append(parsed) elif isinstance(parsed, dict) and "key_metrics" in parsed: # Simulated mode uses key_metrics parsed["metrics"] = parsed.pop("key_metrics") runs_data.append(parsed) if not runs_data: result: dict[str, Any] = {"runs": [], "metrics_summary": {}, "best_run": None, "latex_table": ""} if structured_results is not None: result["structured_results"] = structured_results return result # Aggregate metrics across runs all_metric_keys: set[str] = set() for r in runs_data: m = r.get("metrics") or {} if isinstance(m, dict): all_metric_keys.update(m.keys()) metrics_summary: dict[str, dict[str, float | None]] = {} for key in sorted(all_metric_keys): values = [] for r in runs_data: m = r.get("metrics") or {} if isinstance(m, dict) and key in m: try: _fv = float(m[key]) if _fv == _fv and abs(_fv) != float("inf"): # filter NaN/Inf values.append(_fv) except (ValueError, TypeError): pass if values: metrics_summary[key] = { "min": round(min(values), 6), "max": round(max(values), 6), "mean": round(sum(values) / len(values), 6), "count": len(values), } # Find best run using metric_key and metric_direction best_run: 
dict[str, Any] | None = None if runs_data: def _primary_metric(r: dict[str, Any]) -> float: m = r.get("metrics") or {} if isinstance(m, dict): # Try specific metric_key first if metric_key and metric_key in m: try: return float(m[metric_key]) except (ValueError, TypeError): pass # Fallback to first metric for v in m.values(): try: return float(v) except (ValueError, TypeError): pass return 0.0 _cmp = min if metric_direction == "minimize" else max best_run = _cmp(runs_data, key=_primary_metric) # Build LaTeX table latex_lines = [ r"\begin{table}[h]", r"\centering", r"\caption{Experiment Results}", ] if metrics_summary: cols = sorted(metrics_summary.keys()) header = "Metric & Min & Max & Mean & N \\\\" latex_lines.append(r"\begin{tabular}{l" + "r" * 4 + "}") latex_lines.append(r"\hline") latex_lines.append(header) latex_lines.append(r"\hline") for col in cols: s = metrics_summary[col] row = f"{col} & {s['min']:.4f} & {s['max']:.4f} & {s['mean']:.4f} & {s['count']} \\\\" latex_lines.append(row) latex_lines.append(r"\hline") latex_lines.append(r"\end{tabular}") else: latex_lines.append(r"\begin{tabular}{l}") latex_lines.append("No experiment data available \\\\") latex_lines.append(r"\end{tabular}") latex_lines.append(r"\end{table}") # R18-1: Extract paired statistical comparisons from stdout from researchclaw.experiment.sandbox import extract_paired_comparisons paired_comparisons: list[dict[str, object]] = [] for r in runs_data: stdout = r.get("stdout", "") if stdout: paired_comparisons.extend(extract_paired_comparisons(stdout)) collected: dict[str, Any] = { "runs": runs_data, "metrics_summary": metrics_summary, "best_run": best_run, "latex_table": "\n".join(latex_lines), } if paired_comparisons: collected["paired_comparisons"] = paired_comparisons if structured_results is not None: collected["structured_results"] = structured_results return collected def _build_context_preamble( config: RCConfig, run_dir: Path, *, include_goal: bool = False, include_hypotheses: bool 
= False, include_synthesis: bool = False, include_exp_plan: bool = False, include_analysis: bool = False, include_decision: bool = False, include_experiment_data: bool = False, ) -> str: parts = [ "## Research Context", f"**Topic**: {config.research.topic}", f"**Domains**: {', '.join(config.research.domains) if config.research.domains else 'general'}", ] if include_goal: goal = _read_prior_artifact(run_dir, "goal.md") if goal: parts.append(f"\n### Goal\n{goal[:2200]}") if include_hypotheses: hyp = _read_prior_artifact(run_dir, "hypotheses.md") if hyp: parts.append(f"\n### Hypotheses\n{hyp[:2200]}") if include_synthesis: synthesis = _read_prior_artifact(run_dir, "synthesis.md") if synthesis: parts.append(f"\n### Synthesis\n{synthesis[:2200]}") if include_exp_plan: plan = _read_prior_artifact(run_dir, "exp_plan.yaml") if plan: parts.append(f"\n### Experiment Plan\n{plan[:2000]}") if include_analysis: analysis = _read_best_analysis(run_dir) if analysis: parts.append(f"\n### Result Analysis\n{analysis[:2500]}") if include_decision: decision = _read_prior_artifact(run_dir, "decision.md") if decision: parts.append(f"\n### Research Decision\n{decision[:1500]}") if include_experiment_data: hw_profile = _load_hardware_profile(run_dir) if hw_profile: hw_lines = ["### Hardware Environment"] for hk, hv in hw_profile.items(): hw_lines.append(f"- **{hk}**: {hv}") parts.append("\n" + "\n".join(hw_lines)) exp_summary = _read_prior_artifact(run_dir, "experiment_summary.json") if exp_summary: summary = _safe_json_loads(exp_summary, {}) if isinstance(summary, dict) and summary.get("metrics_summary"): parts.append("\n### Experiment Results (Quantitative)") ms = summary["metrics_summary"] for mk, mv in ms.items(): if isinstance(mv, dict): parts.append( f"- **{mk}**: mean={mv.get('mean', '?')}, " f"min={mv.get('min', '?')}, max={mv.get('max', '?')}, n={mv.get('count', '?')}" ) if summary.get("latex_table"): parts.append( f"\n### LaTeX Table\n```latex\n{summary['latex_table']}\n```" ) 
return "\n".join(parts) # --------------------------------------------------------------------------- # Topic keywords and constraints # --------------------------------------------------------------------------- def _extract_topic_keywords( topic: str, domains: tuple[str, ...] | list[str] = () ) -> list[str]: """Extract meaningful keywords from the research topic + domain list. Returns lowercased keyword list (2+ chars, no stop words). Used by the domain pre-filter to drop obviously irrelevant papers. """ tokens = re.findall(r"[a-zA-Z][a-zA-Z0-9_-]+", topic.lower()) keywords = [t for t in tokens if t not in _STOP_WORDS and len(t) >= 3] # Add domain names as keywords for d in domains: for part in re.findall(r"[a-zA-Z][a-zA-Z0-9_-]+", d.lower()): if part not in _STOP_WORDS and len(part) >= 2: keywords.append(part) # Deduplicate while preserving order seen: set[str] = set() unique: list[str] = [] for kw in keywords: if kw not in seen: seen.add(kw) unique.append(kw) return unique # --- P1-2: Topic constraint block for paper generation stages --- def _topic_constraint_block(topic: str) -> str: """Return a hard constraint instruction that anchors paper content to the topic. Prevents the common LLM failure mode of drifting off-topic or presenting environmental/infrastructure issues as research contributions. 
""" return ( "\n\n=== HARD TOPIC CONSTRAINT ===\n" f"The paper MUST be about: {topic}\n" "PROHIBITED content (unless user explicitly specifies case-study mode):\n" "- Do NOT treat environment setup, dependency installation, or infrastructure " "failures as a research contribution.\n" "- Do NOT present debugging logs, system errors, or configuration issues " "as experimental findings.\n" "- Do NOT drift to tangential topics not directly related to the stated topic.\n" "- Every section MUST connect back to the core research question.\n" "- The Abstract and Introduction MUST clearly state the research problem " f"derived from: {topic}\n" "- The Method section MUST describe a technical approach, not a workflow.\n" "- The Results section MUST report quantitative outcomes of experiments, " "not environment status.\n" "=== END CONSTRAINT ===\n" ) # --------------------------------------------------------------------------- # Runtime issue detection # --------------------------------------------------------------------------- def _detect_runtime_issues(sandbox_result: Any) -> str: """Detect NaN/Inf in metrics and extract stderr warnings from sandbox run. Returns a formatted string describing all runtime issues, or empty string if no issues are found. """ issues: list[str] = [] # Check metrics for NaN/Inf metrics = getattr(sandbox_result, "metrics", {}) or {} for key, val in metrics.items(): try: fval = float(val) if math.isnan(fval): issues.append(f"METRIC NaN: '{key}' returned NaN — likely a division by zero or invalid computation in code") elif math.isinf(fval): issues.append(f"METRIC Inf: '{key}' returned Infinity — likely overflow or unbounded computation") except (TypeError, ValueError): pass # Check stdout for NaN values (word boundary to avoid matching "Nanotechnology" etc.) 
stdout = getattr(sandbox_result, "stdout", "") or "" _nan_re = re.compile(r"\bnan\b", re.IGNORECASE) if _nan_re.search(stdout): nan_lines = [ line.strip() for line in stdout.splitlines() if _nan_re.search(line) ] if nan_lines: issues.append( f"NaN values detected in output:\n" + "\n".join(nan_lines[:10]) ) # Extract meaningful warnings from stderr stderr = getattr(sandbox_result, "stderr", "") or "" if stderr.strip(): warning_lines = [] for line in stderr.splitlines(): line_stripped = line.strip() if not line_stripped: continue # Keep RuntimeWarning, ValueError, ZeroDivisionError, etc. if any( kw in line_stripped for kw in ( "Warning", "Error", "Traceback", "Exception", "divide", "overflow", "invalid value", "NaN", "inf", ) ): warning_lines.append(line_stripped) if warning_lines: issues.append( "Runtime warnings/errors from stderr:\n" + "\n".join(warning_lines[:15]) ) # Check for identical metric values across all entries in stdout # (e.g., all algorithms reporting convergence_rate=1.0) stdout = getattr(sandbox_result, "stdout", "") or "" if stdout: from collections import Counter metric_values_by_name: dict[str, list[float]] = {} for line in stdout.splitlines(): line = line.strip() if ":" not in line: continue parts = line.rsplit(":", 1) if len(parts) != 2: continue try: fval = float(parts[1].strip()) except (ValueError, TypeError): continue # Extract metric suffix (e.g. "convergence_rate" from "UCB (Stochastic) convergence_rate") name = parts[0].strip() metric_suffix = name.split()[-1] if name.split() else name metric_values_by_name.setdefault(metric_suffix, []).append(fval) for metric_name, vals in metric_values_by_name.items(): if len(vals) >= 3: unique = set(vals) if len(unique) <= 2: issues.append( f"DUMMY METRIC: '{metric_name}' has only {len(unique)} unique value(s) " f"across {len(vals)} entries ({unique}) — likely a placeholder. " f"Implement real measurement logic (e.g., track iterations to convergence)." 
) # R5-3: Check for diverging loss values (fast-fail indicator) for key, val in metrics.items(): try: fval = float(val) if "loss" in key.lower() and fval > 100: issues.append( f"DIVERGING LOSS: '{key}' = {fval} (>100) — the optimization is " f"diverging. Reduce learning rate, check gradient computation, " f"or add gradient clipping." ) except (TypeError, ValueError): pass if not issues: return "" return ( "## Runtime Issues Detected\n\n" "The experiment code ran but produced problematic results. " "Fix the ROOT CAUSE of these issues in the code:\n\n" + "\n\n".join(f"- {issue}" for issue in issues) ) # --------------------------------------------------------------------------- # NeurIPS checklist # --------------------------------------------------------------------------- def _generate_neurips_checklist( has_experiments: bool = True, has_theory: bool = False, has_code: bool = True, ) -> str: """Generate a NeurIPS-style paper checklist appendix in markdown. This checklist is based on the NeurIPS 2025 submission requirements. It is appended to the paper before LaTeX conversion. 
""" items = [ ("Claims", "Do the main claims accurately reflect the paper's contributions and scope?", "Yes"), ("Limitations", "Does the paper discuss limitations of the work?", "Yes"), ] if has_theory: items.append( ("Theory", "Are all assumptions stated and proofs included?", "Yes") ) items.extend([ ("Experiments reproducibility", "Does the paper fully disclose experimental settings?", "Yes" if has_experiments else "NA"), ("Code and data", "Is code or data provided for reproducibility?", "Yes" if has_code else "No"), ("Experimental details", "Are training details and hyperparameters specified?", "Yes" if has_experiments else "NA"), ("Error bars", "Are error bars or confidence intervals reported?", "Yes" if has_experiments else "NA"), ("Compute resources", "Are compute requirements documented?", "Yes" if has_experiments else "NA"), ("Code of ethics", "Does the work comply with the code of ethics?", "Yes"), ("Broader impacts", "Are potential negative societal impacts discussed?", "Yes"), ("Licenses", "Are licenses for used assets respected?", "Yes"), ("New assets", "Are newly released assets documented?", "NA"), ("Human subjects", "Were IRB approvals obtained if applicable?", "NA"), ]) lines = [ "## NeurIPS Paper Checklist", "", ] for label, question, answer in items: lines.append(f"**{label}**: {question}") lines.append(f"Answer: [{answer}]") lines.append("") return "\n".join(lines) # --------------------------------------------------------------------------- # Paper title extraction # --------------------------------------------------------------------------- def _extract_paper_title(md_text: str) -> str: """Extract paper title from markdown text for LaTeX generation. Prioritises H1 headings that appear *before* the abstract section and look like real titles (>= 4 words, starts with uppercase). This avoids picking up pseudocode comments or algorithm step labels. 
Also handles the common LLM pattern where a ``# Title`` heading is followed by the actual title as a plain text line (possibly bold): # Title NORM-PPO: Observation Normalization and Reward Scaling Effects """ import re as _re # Strip outer markdown fence (LLMs sometimes wrap entire paper) _text = md_text _fence_m = _re.match(r"^\s*```(?:markdown|md|latex|tex)?\s*\n", _text) if _fence_m: _text = _text[_fence_m.end():] # Also strip trailing fence _text = _re.sub(r"\n\s*```\s*$", "", _text) # Limit search to content before Abstract heading abstract_pos = _re.search( r"^#{1,2}\s+(Abstract|ABSTRACT)", _text, _re.MULTILINE ) search_region = _text[: abstract_pos.start()] if abstract_pos else _text[:3000] _SKIP = {"title", "abstract", "references", "appendix"} candidates: list[str] = [] _saw_title_heading = False lines = search_region.splitlines() for i, raw_line in enumerate(lines): line = raw_line.strip() # BUG-171: When we see a "# Title" or "## Title" heading, the actual # title is often on the next non-empty line as plain text or bold text. if _saw_title_heading and line: # Strip bold markers: **Title Text** → Title Text candidate = _re.sub(r"\*\*(.+?)\*\*", r"\1", line).strip() # Make sure it's not another heading or a skip heading if not line.startswith("#") and candidate: candidates.insert(0, candidate) # highest priority _saw_title_heading = False # Match H1 or H2 headings hm = _re.match(r"^(#{1,2})\s+(.+)$", line) if hm: heading = hm.group(2).strip() heading_lower = heading.lower() # Handle "## Title Actual Paper Title" pattern if heading_lower.startswith("title ") and len(heading) > 6: heading = heading[6:].strip() heading_lower = heading.lower() if heading_lower in _SKIP: # Mark that we saw a "# Title" heading — next non-empty line # is likely the actual title text if heading_lower == "title": _saw_title_heading = True continue candidates.append(heading) continue # Bold title line (e.g. 
**My Paper Title**) m = _re.match(r"\*\*(.+?)\*\*$", line) if m and len(m.group(1).split()) >= 3: candidates.append(m.group(1)) # Prefer candidates that look like real titles (>= 4 words, capitalised) for c in candidates: words = c.split() if len(words) >= 4 and c[0].isupper(): return c # Fallback: any candidate if candidates: return candidates[0] return "Untitled Paper" # --------------------------------------------------------------------------- # Framework diagram prompt # --------------------------------------------------------------------------- def _generate_framework_diagram_prompt( paper_text: str, config: "RCConfig", *, llm: "LLMClient | None" = None, ) -> str: """Generate a text-to-image prompt for a methodology framework diagram. Reads the paper's method section and produces a detailed prompt suitable for AI image generators (DALL-E, Midjourney, etc.). The prompt describes an academic-style architecture/framework overview figure. Returns the prompt as a Markdown string, or empty string on failure. """ import re as _re # Extract method/approach section from paper _method_section = "" _method_patterns = [ r"(?:^#{1,3}\s+(?:Method(?:ology)?|Approach|Proposed\s+(?:Method|Framework|Approach)|Our\s+Method|Technical\s+Approach|Model\s+Architecture).*?)(?=^#{1,3}\s+|\Z)", ] for _pat in _method_patterns: _match = _re.search(_pat, paper_text, _re.MULTILINE | _re.DOTALL | _re.IGNORECASE) if _match: _method_section = _match.group(0)[:3000] break if not _method_section: # Fallback: use abstract + first 1500 chars _abs_match = _re.search( r"(?:^#{1,2}\s+Abstract\s*\n)(.*?)(?=^#{1,2}\s+|\Z)", paper_text, _re.MULTILINE | _re.DOTALL | _re.IGNORECASE, ) _method_section = (_abs_match.group(1)[:1500] if _abs_match else paper_text[:2000]) title = _extract_paper_title(paper_text) topic = config.research.topic # Use LLM to generate the prompt if available if llm is not None: _system = ( "You are an expert academic figure designer. 
Generate a detailed text-to-image " "prompt for creating a methodology framework/architecture overview diagram.\n\n" "Requirements:\n" "- Academic style: clean, professional, suitable for a top-tier ML conference paper\n" "- Color palette: sophisticated and harmonious (suggest specific hex colors, " "prefer muted blues #4477AA, teals #44AA99, warm accents #CCBB44, soft purples #AA3377)\n" "- Layout: left-to-right or top-to-bottom data flow, with clearly labeled components\n" "- Components: boxes/modules with rounded corners, directional arrows, clear labels\n" "- Information density: high but not cluttered — each box should have a short label\n" "- Text on figure: minimal, only component names and key annotations\n" "- Background: white or very light grey\n" "- Style: vector-art look, flat design with subtle shadows, NO photorealism\n\n" "Output ONLY the prompt text (no markdown headers, no explanations). " "The prompt should be 150-300 words, highly specific and actionable." ) _user = ( f"Paper title: {title}\n" f"Research topic: {topic}\n\n" f"Method section excerpt:\n{_method_section}\n\n" "Generate a detailed text-to-image prompt for the methodology framework diagram." ) try: resp = _chat_with_prompt(llm, _system, _user, max_tokens=1024) _llm_prompt = resp.content.strip() if len(_llm_prompt) > 50: return ( f"# Framework Diagram Prompt\n\n" f"**Paper**: {title}\n\n" f"## Image Generation Prompt\n\n" f"{_llm_prompt}\n\n" f"## Usage Instructions\n\n" f"1. Copy the prompt above into an AI image generator " f"(DALL-E 3, Midjourney, Ideogram, etc.)\n" f"2. Generate the image at high resolution (2048x1024 or similar landscape)\n" f"3. Save as `framework_diagram.png` in the same `charts/` folder\n" f"4. 
Insert into the paper's Method section using:\n" f" - LaTeX: `\\includegraphics[width=\\textwidth]{{charts/framework_diagram.png}}`\n" f" - Markdown: `![Framework Overview](charts/framework_diagram.png)`\n" ) except Exception: logger.debug("Framework prompt LLM generation failed, using template") # Fallback: template-based prompt without LLM _components = [] _component_patterns = [ (r"(?:encoder|decoder|transformer|attention|convolution|MLP|GNN|ResNet|ViT)", "Neural Network Module"), (r"(?:loss|objective|criterion|training|optimization)", "Training/Optimization"), (r"(?:data|dataset|input|preprocessing|augmentation)", "Data Pipeline"), (r"(?:output|prediction|inference|evaluation)", "Output/Evaluation"), ] _method_lower = _method_section.lower() for pat, label in _component_patterns: if _re.search(pat, _method_lower): _components.append(label) if not _components: _components = ["Input Processing", "Core Model", "Training Loop", "Evaluation"] return ( f"# Framework Diagram Prompt\n\n" f"**Paper**: {title}\n\n" f"## Image Generation Prompt\n\n" f"Create a clean, academic-style methodology framework diagram for a research paper " f"titled \"{title}\". " f"The diagram should show a left-to-right data flow pipeline with these main components: " f"{', '.join(_components)}. " f"Use a professional color palette with muted blues (#4477AA), teals (#44AA99), " f"warm yellows (#CCBB44), and soft purples (#AA3377) on a white background. " f"Each component should be a rounded rectangle with a short label inside. " f"Connect components with clean directional arrows. " f"Add subtle shadows for depth. Flat vector-art style, no photorealism. " f"High information density but visually clean. " f"Suitable for a top-tier machine learning conference paper (ICML/NeurIPS/ICLR). " f"Landscape orientation, 2048x1024 resolution.\n\n" f"## Usage Instructions\n\n" f"1. Copy the prompt above into an AI image generator " f"(DALL-E 3, Midjourney, Ideogram, etc.)\n" f"2. 
Generate the image at high resolution (2048x1024 or similar landscape)\n" f"3. Save as `framework_diagram.png` in the same `charts/` folder\n" f"4. Insert into the paper's Method section using:\n" f" - LaTeX: `\\includegraphics[width=\\textwidth]{{charts/framework_diagram.png}}`\n" f" - Markdown: `![Framework Overview](charts/framework_diagram.png)`\n" ) # --------------------------------------------------------------------------- # Filename and data helpers # --------------------------------------------------------------------------- def _safe_filename(name: str) -> str: name = name.replace("/", "_").replace("\\", "_").replace("..", "_") name = re.sub(r"[^a-zA-Z0-9_\-.]", "_", name) return name[:100] or "unnamed" # --------------------------------------------------------------------------- # Default fallbacks # --------------------------------------------------------------------------- def _default_hypotheses(topic: str) -> str: return f"""# Hypotheses ## H1 Increasing protocol control for {topic} improves metric stability across random seeds. ## H2 Adding robustness-aware objectives for {topic} improves out-of-domain performance without major in-domain regression. ## H3 The combined approach outperforms either component under fixed compute budget. ## Generated {_utcnow_iso()} """ def _default_paper_outline(topic: str) -> str: return f"""# Paper Outline ## 1. Title Focused title on {topic} ## 2. Abstract - Problem framing - Method overview - Key quantitative result ## 3. Introduction - Motivation - Gap statement - Contributions ## 4. Related Work - Method families - Evaluation practices ## 5. Method - Problem setup - Model/algorithm - Complexity and constraints ## 6. Experiments - Datasets and metrics - Baselines and ablations - Reproducibility protocol ## 7. Results - Main table - Robustness analysis - Failure cases ## 8. Discussion - Practical implications - Limitations ## 9. 
Conclusion - Findings and next steps Generated: {_utcnow_iso()} """ def _default_quality_report(threshold: float) -> dict[str, Any]: # When LLM fails, return below-threshold score to force revision score = max(1.0, float(threshold) - 2.0) if threshold > 0 else 5.0 score = max(1.0, min(10.0, score)) verdict = "revise" return { "score_1_to_10": round(score, 2), "verdict": verdict, "criteria": { "novelty": round(min(10.0, score + 0.3), 2), "methodological_rigor": round(score, 2), "clarity": round(max(1.0, score - 0.2), 2), "reproducibility": round(min(10.0, score + 0.1), 2), }, "strengths": [ "Stage-by-stage evidence chain preserved", "Experiment artifacts are generated and archived", ], "weaknesses": [ "Statistical significance may need stronger reporting", "Broader external validity remains partially evaluated", ], "required_actions": [ "Report confidence intervals and seed variance", "Include at least one stronger external baseline", ], "generated": _utcnow_iso(), } # --------------------------------------------------------------------------- # Multi-perspective generation # --------------------------------------------------------------------------- def _multi_perspective_generate( llm: LLMClient, roles: dict[str, dict[str, str]], variables: dict[str, str], perspectives_dir: Path, ) -> dict[str, str]: """Generate outputs from multiple debate perspectives. Each role has its own system/user prompt. Outputs are saved to *perspectives_dir* and returned as ``{role_name: response_text}``. 
""" from researchclaw.prompts import _render # noqa: PLC0415 perspectives_dir.mkdir(parents=True, exist_ok=True) results: dict[str, str] = {} for role_name, role_prompts in roles.items(): try: system = _render(role_prompts["system"], variables) user = _render(role_prompts["user"], variables) resp = llm.chat( [{"role": "user", "content": user}], system=system, ) results[role_name] = resp.content (perspectives_dir / f"{role_name}.md").write_text( resp.content, encoding="utf-8" ) logger.info("Debate perspective '%s' generated (%d chars)", role_name, len(resp.content)) except Exception as exc: # noqa: BLE001 logger.warning("Debate perspective '%s' failed: %s", role_name, exc) if len(results) < 2: logger.error("Multi-perspective debate: only %d/%d roles succeeded", len(results), len(roles)) return results def _synthesize_perspectives( llm: LLMClient, perspectives: dict[str, str], sub_prompt_name: str, prompts: PromptManager, ) -> str: """Synthesize multiple perspective outputs into a unified result.""" parts = [] for role_name, text in perspectives.items(): parts.append(f"### Perspective: {role_name}\n{text}") combined = "\n\n---\n\n".join(parts) sp = prompts.sub_prompt(sub_prompt_name, perspectives=combined) resp = llm.chat( [{"role": "user", "content": sp.user}], system=sp.system, ) return resp.content def reconcile_figure_refs( tex_path: Path, charts_dir: Path, ) -> dict[str, str]: """Fix ``\\includegraphics`` paths in *tex_path* that don't match files in *charts_dir*. Three-tier matching strategy: 1. **Exact stem** — e.g. ``accuracy_plot`` matches ``accuracy_plot.png`` 2. **Normalized keyword overlap** — tokenize on ``[-_]``, apply singular/plural normalization, require Jaccard similarity >= 0.4 3. **Substring containment** — one stem is a substring of the other Returns a ``{old_path: new_path}`` dict of fixes applied (empty if none needed). 
""" if not tex_path.exists(): return {} tex_text = tex_path.read_text(encoding="utf-8") fig_refs = re.findall( r"\\includegraphics(?:\[[^\]]*\])?\{([^}]+)\}", tex_text ) if not fig_refs: return {} # Build map of actual chart files: lowered-stem -> charts/filename actual_files: dict[str, str] = {} if charts_dir.is_dir(): for af in charts_dir.iterdir(): if af.is_file() and af.suffix.lower() in ( ".png", ".jpg", ".jpeg", ".pdf", ".svg", ): actual_files[af.stem.lower()] = f"charts/{af.name}" if not actual_files: return {} def _singularize(word: str) -> str: """Cheap singular/plural normalization.""" if word.endswith("ies") and len(word) > 4: return word[:-3] + "y" if word.endswith("ses") and len(word) > 4: return word[:-2] if word.endswith("s") and not word.endswith("ss") and len(word) > 2: return word[:-1] return word def _tokenize(stem: str) -> set[str]: return {_singularize(w) for w in stem.replace("-", "_").split("_") if w} def _jaccard(a: set[str], b: set[str]) -> float: if not a or not b: return 0.0 return len(a & b) / len(a | b) fixes: dict[str, str] = {} for ref in fig_refs: ref_resolved = tex_path.parent / ref if ref_resolved.exists(): continue ref_stem = Path(ref).stem.lower() # Tier 1: exact stem match if ref_stem in actual_files: fixes[ref] = actual_files[ref_stem] continue # Tier 2: keyword overlap with Jaccard >= 0.4 ref_tokens = _tokenize(ref_stem) best_match, best_score = "", 0.0 for stem, apath in actual_files.items(): score = _jaccard(ref_tokens, _tokenize(stem)) if score > best_score: best_score = score best_match = apath if best_score >= 0.4 and best_match: fixes[ref] = best_match continue # Tier 3: substring containment for stem, apath in actual_files.items(): if ref_stem in stem or stem in ref_stem: fixes[ref] = apath break if fixes: for old_path, new_path in fixes.items(): tex_text = tex_text.replace(f"{{{old_path}}}", f"{{{new_path}}}") tex_path.write_text(tex_text, encoding="utf-8") logger.warning( "reconcile_figure_refs: Fixed %d figure path 
mismatch(es): %s", len(fixes), ", ".join(f"{k} → {v}" for k, v in fixes.items()), ) return fixes ================================================ FILE: researchclaw/pipeline/code_agent.py ================================================ """Advanced multi-phase code generation agent. Phases ------ 1. **Blueprint Planning** — produce deep implementation blueprint with per-file pseudocode, tensor shapes, and generation ordering. 2. **Sequential File Generation** — generate files one-by-one following the dependency order from the blueprint, with CodeMem summaries. Falls back to single-shot generation if blueprint parsing fails. 3. **Execution-in-the-Loop** — run in sandbox, feed errors back for repair. 4. **Solution Tree Search** — explore multiple candidate implementations, evaluate via sandbox, select the best (optional, higher cost). 5. **Multi-Agent Review** — coder-reviewer dialog for quality assurance. Integration ----------- ``CodeAgent`` is instantiated inside ``_execute_code_generation`` in ``executor.py`` when ``config.experiment.code_agent.enabled`` is True. It receives the same inputs (topic, exp_plan, metric, pkg_hint) and returns ``CodeAgentResult`` with the generated files. """ from __future__ import annotations import ast import json import logging import re import time from dataclasses import dataclass, field from pathlib import Path from typing import Any, Protocol logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- @dataclass(frozen=True) class CodeAgentConfig: """Configuration for the advanced code generation agent. All phases are independently toggleable. The default profile enables Phases 1 (blueprint), 2 (sequential generation + exec-fix), and 5 (review), which gives a large quality boost at moderate extra cost. Phase 4 (tree search) is opt-in because it multiplies both LLM and sandbox usage. 
""" enabled: bool = True # Phase 1: Blueprint planning (deep implementation blueprint) architecture_planning: bool = True # Phase 2: Sequential file generation (generate files one-by-one # following dependency order from blueprint, with CodeMem summaries) sequential_generation: bool = True # Phase 2.5: Hard validation gates (AST-based) hard_validation: bool = True hard_validation_max_repairs: int = 4 # Phase 3: Execution-in-the-loop exec_fix_max_iterations: int = 3 exec_fix_timeout_sec: int = 60 # Phase 4: Solution tree search (off by default) tree_search_enabled: bool = False tree_search_candidates: int = 3 tree_search_max_depth: int = 2 tree_search_eval_timeout_sec: int = 120 # Phase 5: Multi-agent review dialog review_max_rounds: int = 2 # --------------------------------------------------------------------------- # Data structures # --------------------------------------------------------------------------- @dataclass class SolutionNode: """One candidate solution in the search tree.""" node_id: str files: dict[str, str] parent_id: str | None = None depth: int = 0 # Evaluation runs_ok: bool = False returncode: int = -1 evaluated: bool = False stdout: str = "" stderr: str = "" metrics: dict[str, Any] = field(default_factory=dict) score: float = 0.0 generation_method: str = "initial" @dataclass class CodeAgentResult: """Final output from the code agent.""" files: dict[str, str] architecture_spec: str = "" validation_log: list[str] = field(default_factory=list) total_llm_calls: int = 0 total_sandbox_runs: int = 0 best_score: float = 0.0 tree_nodes_explored: int = 0 review_rounds: int = 0 # --------------------------------------------------------------------------- # Sandbox protocol (structural typing — no import dependency) # --------------------------------------------------------------------------- class _SandboxResult(Protocol): # pragma: no cover returncode: int stdout: str stderr: str elapsed_sec: float metrics: dict[str, object] timed_out: bool class 
_SandboxLike(Protocol): # pragma: no cover def run_project( self, project_dir: Path, *, entry_point: str = "main.py", timeout_sec: int = 300, ) -> Any: ... # --------------------------------------------------------------------------- # CodeAgent # --------------------------------------------------------------------------- class CodeAgent: """Multi-phase code generation agent. Parameters ---------- llm : LLMClient The language model client to use for code generation. prompts : PromptManager Manages prompt templates. config : CodeAgentConfig Agent configuration (toggles, limits, timeouts). stage_dir : Path Working directory for this stage (e.g. ``run_dir/stage-10``). sandbox_factory : callable, optional ``(ExperimentConfig, Path) -> SandboxLike``. Required for Phases 2 and 3. experiment_config : ExperimentConfig, optional Passed to ``sandbox_factory`` when creating sandboxes. """ def __init__( self, llm: Any, prompts: Any, config: CodeAgentConfig, stage_dir: Path, sandbox_factory: Any | None = None, experiment_config: Any | None = None, domain_profile: Any | None = None, code_search_result: Any | None = None, ) -> None: self._llm = llm self._pm = prompts self._cfg = config self._stage_dir = stage_dir self._sandbox_factory = sandbox_factory self._exp_config = experiment_config self._domain_profile = domain_profile self._code_search_result = code_search_result self._calls = 0 self._runs = 0 self._log: list[str] = [] self._sandbox: _SandboxLike | None = None # ── Public API ──────────────────────────────────────────────────────── def generate( self, topic: str, exp_plan: str, metric: str, pkg_hint: str, max_tokens: int = 8192, ) -> CodeAgentResult: """Execute all enabled phases and return generated files.""" t0 = time.time() self._log_event("CodeAgent.generate() started") # Phase 1: Blueprint planning arch_spec = "" blueprint = None if self._cfg.architecture_planning: arch_spec, blueprint = self._phase1_blueprint( topic, exp_plan, metric, ) # Phase 2: Code generation 
nodes_explored = 0 if self._cfg.tree_search_enabled and self._sandbox_factory: best, nodes_explored = self._phase3_tree_search( topic, exp_plan, metric, pkg_hint, arch_spec, max_tokens, ) elif ( self._cfg.sequential_generation and blueprint is not None and self._is_valid_blueprint(blueprint) ): # Sequential file generation following blueprint files = self._phase2_sequential_generate( topic, exp_plan, metric, pkg_hint, arch_spec, blueprint, ) # Hard validation gates (E-03) if self._cfg.hard_validation: files = self._hard_validate_and_repair( files, topic, exp_plan, metric, pkg_hint, arch_spec, ) # Exec-fix loop files = self._exec_fix_loop(files) best = SolutionNode( node_id="sequential", files=files, runs_ok=True, score=1.0, ) else: # Fallback: single-shot generation if self._cfg.sequential_generation and blueprint is None: self._log_event( " Sequential generation requested but blueprint " "invalid — falling back to single-shot" ) files = self._phase2_generate_and_fix( topic, exp_plan, metric, pkg_hint, arch_spec, max_tokens, ) # Hard validation gates (E-03) for single-shot too if self._cfg.hard_validation and files: files = self._hard_validate_and_repair( files, topic, exp_plan, metric, pkg_hint, arch_spec, ) best = SolutionNode( node_id="single", files=files, runs_ok=bool(files), score=1.0 if files else 0.0, ) # Phase 5: Review dialog review_rounds = 0 if self._cfg.review_max_rounds > 0: best.files, review_rounds = self._phase4_review( best.files, topic, exp_plan, metric, ) elapsed = time.time() - t0 self._log_event( f"CodeAgent.generate() done in {elapsed:.1f}s — " f"{self._calls} LLM calls, {self._runs} sandbox runs" ) return CodeAgentResult( files=best.files, architecture_spec=arch_spec, validation_log=list(self._log), total_llm_calls=self._calls, total_sandbox_runs=self._runs, best_score=best.score, tree_nodes_explored=nodes_explored, review_rounds=review_rounds, ) # ── Phase 1: Blueprint Planning ────────────────────────────────────── def _phase1_blueprint( 
self, topic: str, exp_plan: str, metric: str, ) -> tuple[str, dict[str, Any] | None]: """Generate a deep implementation blueprint. Returns (raw_yaml_str, parsed_blueprint_dict_or_None). """ self._log_event("Phase 1: Blueprint planning") sp = self._pm.sub_prompt( "architecture_planning", topic=topic, exp_plan=exp_plan, metric=metric, ) # Inject domain context and code search results into blueprint prompt domain_context = self._build_domain_context() if domain_context: sp = type(sp)( system=sp.system, user=sp.user + "\n\n" + domain_context, ) self._log_event(" Injected domain context into blueprint prompt") resp = self._chat(sp.system, sp.user, max_tokens=8192) # Extract YAML block from response arch_spec = resp.content yaml_match = re.search(r"```ya?ml\s*\n(.*?)```", arch_spec, re.DOTALL) if yaml_match: arch_spec = yaml_match.group(1).strip() self._log_event(f" Blueprint spec: {len(arch_spec)} chars") # Parse YAML into structured blueprint blueprint = self._parse_blueprint(arch_spec) if blueprint: n_files = len(blueprint.get("files", [])) self._log_event(f" Parsed blueprint: {n_files} files") else: self._log_event(" WARNING: Could not parse blueprint YAML") return arch_spec, blueprint def _build_domain_context(self) -> str: """Build domain-specific context for injection into prompts. 
Includes: - Domain profile hints (file structure, libraries, evaluation) - Code search results (API patterns, reference code) """ parts: list[str] = [] # Domain profile context if self._domain_profile is not None: try: from researchclaw.domains.prompt_adapter import get_adapter adapter = get_adapter(self._domain_profile) blueprint_ctx = adapter.get_blueprint_context() if blueprint_ctx: parts.append( "# Domain-Specific Guidance\n" + blueprint_ctx ) except Exception: logger.debug("Failed to get domain context", exc_info=True) # Code search results if self._code_search_result is not None: try: prompt_ctx = self._code_search_result.to_prompt_context() if prompt_ctx: parts.append( "# Reference Code from GitHub\n" "The following patterns were found in relevant open-source projects. " "Use them as reference for API usage and project structure.\n\n" + prompt_ctx ) except Exception: logger.debug("Failed to get code search context", exc_info=True) return "\n\n".join(parts) def _parse_blueprint(self, yaml_text: str) -> dict[str, Any] | None: """Parse blueprint YAML into a structured dict. BUG-178: LLM often includes Python type annotations in signature values (e.g. ``signature: (self, name: str) -> Config``). The bare ``:`` breaks YAML parsing. We quote unquoted signature values before parsing. """ import yaml # Pre-process: sanitize values that contain Python type annotations, # unclosed quotes, or other patterns that break YAML parsing. 
        import re as _bp_re

        # Line-by-line sanitizer: quote YAML scalar values that contain
        # Python type annotations (bare `:` / `->`) so yaml.safe_load does
        # not choke on them.  Each branch either rewrites the line or
        # appends it unchanged.
        sanitized_lines = []
        for line in yaml_text.split("\n"):
            stripped = line.lstrip()
            # Blank lines and comments pass through untouched.
            if not stripped or stripped.startswith("#"):
                sanitized_lines.append(line)
                continue
            # Skip lines that are pure list markers or block scalars
            if stripped.startswith(("- ", "---", "...")):
                # For list items like `- key: value`, extract after `- `
                if stripped.startswith("- ") and ":" in stripped[2:]:
                    inner = stripped[2:]
                else:
                    sanitized_lines.append(line)
                    continue
            elif ":" in stripped:
                inner = stripped
            else:
                sanitized_lines.append(line)
                continue
            # Find the YAML key separator (first `:` followed by space or EOL)
            m = _bp_re.search(r":\s", inner)
            if not m:
                sanitized_lines.append(line)
                continue
            val_part = inner[m.end():].strip()
            if not val_part:
                # `key:` with no inline value (block mapping follows).
                sanitized_lines.append(line)
                continue
            # Already properly quoted — skip
            if val_part.startswith(("'", "|", ">")):
                sanitized_lines.append(line)
                continue
            # Check if value needs quoting:
            # 1) Contains `:` or `->` (type annotations)
            # 2) Starts with `"` but doesn't end with `"` (unclosed quote)
            # 3) Contains `[` with `:` (e.g. dict[str, float])
            needs_quoting = False
            if val_part.startswith('"'):
                # Already quoted — check if properly closed
                if not val_part.endswith('"') or val_part.count('"') % 2 != 0:
                    needs_quoting = True  # unclosed or malformed quote
                else:
                    sanitized_lines.append(line)
                    continue
            elif ":" in val_part or "->" in val_part:
                needs_quoting = True
            if needs_quoting:
                # Strip any existing partial quotes, escape internal quotes
                clean = val_part.strip('"').replace('"', '\\"')
                # Remove inline comments (# ...) to avoid YAML issues
                comment_idx = clean.find(" #")
                if comment_idx >= 0:
                    clean = clean[:comment_idx].rstrip()
                # Rebuild the line: indent + optional `- ` prefix + `key: `
                # separator + the now-quoted value.
                indent = line[:len(line) - len(stripped)]
                prefix = stripped[:len(stripped) - len(inner)]  # e.g. "- "
                key_sep = inner[:m.end()]
                sanitized_lines.append(
                    f'{indent}{prefix}{key_sep}"{clean}"'
                )
            else:
                sanitized_lines.append(line)
        sanitized = "\n".join(sanitized_lines)
        # Prefer the sanitized text; fall back to the raw LLM output in case
        # sanitization itself broke an otherwise-valid document.
        for attempt_text in (sanitized, yaml_text):
            try:
                data = yaml.safe_load(attempt_text)
                # Only accept a mapping that actually carries a `files` list.
                if isinstance(data, dict) and "files" in data:
                    return data
            except Exception as exc:
                self._log_event(f" Blueprint YAML parse error: {exc}")
        return None

    @staticmethod
    def _is_valid_blueprint(blueprint: dict[str, Any]) -> bool:
        """Check if a blueprint has the minimum required structure.

        A usable blueprint must have a non-empty ``files`` list in which at
        least two entries declare an explicit ``generation_order``.
        """
        files = blueprint.get("files", [])
        if not files or not isinstance(files, list):
            return False
        # Need at least 2 files with generation_order
        has_order = sum(
            1 for f in files
            if isinstance(f, dict) and "generation_order" in f
        )
        return has_order >= 2

    # ── Phase 2a: Sequential File Generation ─────────────────────────────

    def _phase2_sequential_generate(
        self,
        topic: str,
        exp_plan: str,
        metric: str,
        pkg_hint: str,
        arch_spec: str,
        blueprint: dict[str, Any],
    ) -> dict[str, str]:
        """Generate files one-by-one following blueprint dependency order.

        Each file is generated with a context of its dependencies' full code
        plus compressed AST summaries ("CodeMem") of previously generated
        files.  Returns a mapping of filename -> source code; guarantees a
        ``main.py`` key when any file was generated at all.
        """
        self._log_event("Phase 2: Sequential generation (blueprint-guided)")
        generated_files: dict[str, str] = {}
        code_memory: dict[str, dict[str, Any]] = {}  # CodeMem summaries
        # Sort files by generation_order
        file_specs = blueprint.get("files", [])
        file_specs = [f for f in file_specs if isinstance(f, dict)]
        # Ensure generation_order exists; default to list position
        for i, fs in enumerate(file_specs):
            if "generation_order" not in fs:
                fs["generation_order"] = i + 1
        file_specs.sort(key=lambda f: f.get("generation_order", 99))
        for file_spec in file_specs:
            file_name = file_spec.get("name", "")
            if not file_name:
                continue
            self._log_event(
                f" Generating {file_name} "
                f"(order={file_spec.get('generation_order')})"
            )
            # Build dependency context
            deps = file_spec.get("dependencies", [])
            dep_summaries = ""
            dep_code = ""
            for dep in deps:
                if isinstance(dep, str):
                    if dep in code_memory:
                        dep_summaries += (
                            f"\n### {dep} (summary)\n"
                            + json.dumps(code_memory[dep], indent=2)
                            + "\n"
                        )
                    if dep in generated_files:
                        dep_code += (
                            f"\n### {dep}\n```python\n"
                            + generated_files[dep]
                            + "\n```\n"
                        )
            if not dep_summaries:
                dep_summaries = "(no dependencies yet)"
            if not dep_code:
                dep_code = "(no dependencies yet)"
            # Generate this file via LLM
            file_spec_str = json.dumps(file_spec, indent=2, default=str)
            sp = self._pm.sub_prompt(
                "generate_single_file",
                file_name=file_name,
                file_spec=file_spec_str,
                blueprint=arch_spec,
                dependency_summaries=dep_summaries,
                dependency_code=dep_code,
                topic=topic,
                exp_plan=exp_plan[:4000],  # Truncate to avoid token overflow
                pkg_hint=pkg_hint,
            )
            resp = self._chat(sp.system, sp.user, max_tokens=8192)
            # Extract code from response
            code = self._extract_single_file_code(resp.content, file_name)
            if not code:
                # Skip rather than store an empty file; later validation
                # would flag it anyway.
                self._log_event(f" WARNING: Empty code for {file_name}")
                continue
            generated_files[file_name] = code
            # Build CodeMem summary via AST
            code_memory[file_name] = self._build_code_summary(
                file_name,
                code,
            )
            self._log_event(
                f" {file_name}: {len(code.split(chr(10)))} lines, "
                f"{len(code_memory[file_name].get('classes', []))} classes"
            )
        # Verify we have main.py
        if "main.py" not in generated_files:
            # NOTE(review): promotion renames the first-generated file to
            # main.py; other files importing it by its old module name would
            # break — presumably caught by the later hard-validation pass.
            self._log_event(" WARNING: No main.py generated, promoting first file")
            if generated_files:
                first_key = next(iter(generated_files))
                generated_files["main.py"] = generated_files.pop(first_key)
        self._log_event(
            f" Sequential generation complete: {len(generated_files)} files"
        )
        return generated_files

    @staticmethod
    def _extract_single_file_code(content: str, expected_name: str) -> str:
        """Extract Python code from LLM response for a single file.

        Tries, in order: a generic ```python fence, a fence tagged with the
        expected filename, then the raw response if it already looks like
        Python source.  Returns "" when nothing usable is found.
        """
        # Try to extract from ```python``` block
        m = re.search(r"```python\s*\n(.*?)```", content, re.DOTALL)
        if m:
            return m.group(1).strip()
        # Try ```filename:xxx.py block
        m = re.search(
            rf"```(?:filename:)?{re.escape(expected_name)}\s*\n(.*?)```",
            content,
            re.DOTALL,
        )
        if m:
            return m.group(1).strip()
        # If content looks like raw Python (starts with import/from/# or def)
        stripped = content.strip()
        if stripped and (
            stripped.startswith("import ")
            or stripped.startswith("from ")
            or stripped.startswith("#")
            or stripped.startswith("def ")
            or stripped.startswith("class ")
            or stripped.startswith('"""')
        ):
            return stripped
        return ""

    @staticmethod
    def _build_code_summary(
        filename: str,
        code: str,
    ) -> dict[str, Any]:
        """Build a CodeMem-style compressed summary via AST analysis.

        The summary lists top-level classes (with method names/args), top
        level functions, and import statements.  On a syntax error the
        summary carries ``parse_error: True`` instead.
        """
        summary: dict[str, Any] = {
            "filename": filename,
            "classes": [],
            "functions": [],
            "imports": [],
        }
        try:
            tree = ast.parse(code)
        except SyntaxError:
            summary["parse_error"] = True
            return summary
        for node in ast.walk(tree):
            if isinstance(node, ast.ClassDef):
                methods = []
                for n in node.body:
                    if isinstance(n, ast.FunctionDef):
                        # Drop `self` — callers care about the call signature.
                        args = [a.arg for a in n.args.args if a.arg != "self"]
                        methods.append({
                            "name": n.name,
                            "args": args,
                        })
                summary["classes"].append({
                    "name": node.name,
                    "bases": [ast.unparse(b) for b in node.bases],
                    "methods": methods,
                })
            elif isinstance(node, ast.FunctionDef) and node.col_offset == 0:
                # col_offset == 0 restricts this to module-level functions.
                args = [a.arg for a in node.args.args]
                summary["functions"].append({
                    "name": node.name,
                    "args": args,
                })
            elif isinstance(node, (ast.Import, ast.ImportFrom)):
                try:
                    summary["imports"].append(ast.unparse(node))
                except Exception:
                    pass
        return summary

    # ── Phase 2.5: Hard Validation Gates (E-03) ─────────────────────────

    def _hard_validate_and_repair(
        self,
        files: dict[str, str],
        topic: str,
        exp_plan: str,
        metric: str,
        pkg_hint: str,
        arch_spec: str,
    ) -> dict[str, str]:
        """Run AST-based hard validation and repair critical issues.

        Critical issues trigger targeted file regeneration.
        Non-critical issues are logged as warnings only.
""" self._log_event("Phase 2.5: Hard validation gates") for attempt in range(self._cfg.hard_validation_max_repairs + 1): critical, warnings = self._hard_validate(files) # Log warnings for w in warnings: self._log_event(f" WARNING: {w}") if not critical: self._log_event( f" Hard validation passed " f"({len(warnings)} warning(s), attempt {attempt})" ) return files self._log_event( f" Hard validation found {len(critical)} CRITICAL issue(s) " f"(attempt {attempt}/{self._cfg.hard_validation_max_repairs})" ) for c in critical: self._log_event(f" CRITICAL: {c}") if attempt >= self._cfg.hard_validation_max_repairs: self._log_event( " Max repair attempts reached — proceeding with warnings" ) return files # Targeted repair: ask LLM to fix specific critical issues files = self._repair_critical_issues( files, critical, topic, exp_plan, metric, arch_spec, ) return files def _hard_validate( self, files: dict[str, str], ) -> tuple[list[str], list[str]]: """Run AST-based checks and classify as CRITICAL or WARNING. Returns (critical_issues, warning_issues). """ critical: list[str] = [] warnings: list[str] = [] from researchclaw.experiment.validator import ( check_class_quality, check_code_complexity, check_api_correctness, check_variable_scoping, validate_syntax, ) # 1. Syntax check — always critical for fname, code in files.items(): if not fname.endswith(".py"): continue syn = validate_syntax(code) if not syn.ok: for issue in syn.errors: critical.append( f"[{fname}] Syntax error: {issue.message} " f"(line {issue.line})" ) # 2. Class quality — some are critical class_warns = check_class_quality(files) for w in class_warns: if "identical AST to parent" in w: critical.append(w) elif "NOT a real ablation" in w: critical.append(w) elif "creates nn.Module" in w and "inside forward()" in w: critical.append(w) elif "empty or trivial subclass" in w: # Critical: ablation classes must have real implementations critical.append(w) else: warnings.append(w) # 3. 
Code complexity — hardcoded metrics are critical for fname, code in files.items(): if not fname.endswith(".py"): continue complexity_warns = check_code_complexity(code) for w in complexity_warns: if "hardcoded metric" in w.lower(): critical.append(f"[{fname}] {w}") elif "trivial computation" in w.lower(): critical.append(f"[{fname}] {w}") else: warnings.append(f"[{fname}] {w}") # 4. API correctness — NameError-causing issues are critical for fname, code in files.items(): if not fname.endswith(".py"): continue api_warns = check_api_correctness(code, fname) for w in api_warns: if "NameError" in w or "Import-usage mismatch" in w: critical.append(w) elif "does not exist" in w: critical.append(w) else: warnings.append(w) # 5. Variable scoping — UnboundLocalError is critical for fname, code in files.items(): if not fname.endswith(".py"): continue scope_warns = check_variable_scoping(code, fname) for w in scope_warns: if "UnboundLocalError" in w: critical.append(w) else: warnings.append(w) # 6. 
Cross-file import consistency — check local imports resolve known_modules = { fname.replace(".py", "") for fname in files if fname.endswith(".py") } for fname, code in files.items(): if not fname.endswith(".py"): continue try: tree = ast.parse(code) except SyntaxError: continue for node in ast.walk(tree): if isinstance(node, ast.ImportFrom) and node.module: mod_top = node.module.split(".")[0] # Check if importing from a local module that exists if mod_top in known_modules: # Verify imported names exist in target file target_file = f"{mod_top}.py" if target_file in files and node.names: target_code = files[target_file] try: target_tree = ast.parse(target_code) except SyntaxError: continue exported = set() for tnode in ast.walk(target_tree): if isinstance(tnode, ast.ClassDef): exported.add(tnode.name) elif isinstance(tnode, ast.FunctionDef): exported.add(tnode.name) elif isinstance(tnode, ast.Assign): for t in tnode.targets: if isinstance(t, ast.Name): exported.add(t.id) for alias in node.names: name = alias.name if name != "*" and name not in exported: critical.append( f"[{fname}] ImportError: " f"'{name}' not defined in " f"'{target_file}' — will crash" ) # 7. BUG-R41-04: main.py MUST have an `if __name__ == "__main__"` block # and must call a training/experiment function — otherwise Docker runs # the file and exits 0 with no output. 
        main_code = files.get("main.py", "")
        if main_code:
            try:
                main_tree = ast.parse(main_code)
                has_main_guard = False
                for node in ast.walk(main_tree):
                    if isinstance(node, ast.If):
                        # Check for `if __name__ == "__main__"` pattern
                        test = node.test
                        if isinstance(test, ast.Compare):
                            left = test.left
                            if (
                                isinstance(left, ast.Name)
                                and left.id == "__name__"
                                and len(test.comparators) == 1
                            ):
                                comp = test.comparators[0]
                                if (
                                    isinstance(comp, ast.Constant)
                                    and comp.value == "__main__"
                                ):
                                    has_main_guard = True
                                    break
                if not has_main_guard:
                    critical.append(
                        "[main.py] Missing `if __name__ == \"__main__\":` block — "
                        "script will define functions/classes but never execute "
                        "training. Add a main guard that calls the experiment entry "
                        "point."
                    )
            except SyntaxError:
                pass  # Already caught by syntax check above
        return critical, warnings

    def _repair_critical_issues(
        self,
        files: dict[str, str],
        critical_issues: list[str],
        topic: str,
        exp_plan: str,
        metric: str,
        arch_spec: str,
    ) -> dict[str, str]:
        """Ask LLM to fix critical validation issues.

        Sends the full project plus the list of critical issues to the LLM
        and merges whatever files it returns over the current set.  Returns
        the original ``files`` unchanged when nothing extractable comes back.
        """
        self._log_event(" Targeted repair for critical issues")
        # Identify which files need repair
        affected_files: set[str] = set()
        for issue in critical_issues:
            # Extract filename from issue string: [filename.py] ...
            m = re.match(r"\[([^\]]+\.py)\]", issue)
            if m:
                affected_files.add(m.group(1))
            else:
                # If no filename found, assume all files affected
                affected_files.update(
                    f for f in files if f.endswith(".py")
                )
        if not affected_files:
            affected_files.update(f for f in files if f.endswith(".py"))
        # NOTE(review): affected_files is computed but never used below —
        # the repair prompt always includes the whole project. Confirm
        # whether per-file scoping was intended.
        files_ctx = self._format_files(files)
        issues_text = "\n".join(f"- {issue}" for issue in critical_issues)
        prompt = (
            "Your generated code has CRITICAL issues that will cause "
            "runtime failures or produce invalid results. Fix ALL of them.\n\n"
            "## Critical Issues Found\n"
            f"{issues_text}\n\n"
            "## Architecture Blueprint\n"
            f"{arch_spec[:4000]}\n\n"
            "## Current Code\n"
            f"{files_ctx}\n\n"
            "## Rules\n"
            "1. Fix every critical issue listed above\n"
            "2. Ablation/variant classes MUST have different implementations "
            "from their parent — change the forward() or core method\n"
            "3. Never hardcode metric values — compute them from actual data\n"
            "4. nn.Module layers must be created in __init__(), not forward()\n"
            "5. All cross-file imports must reference names that actually exist\n"
            "6. Output ALL files in ```filename:xxx.py``` format\n"
        )
        sys_prompt = self._pm.system("code_generation")
        resp = self._chat(sys_prompt, prompt, max_tokens=16384)
        fixed = self._extract_files(resp.content)
        if fixed:
            # Merge: returned files overwrite, untouched files survive.
            merged = dict(files)
            merged.update(fixed)
            self._log_event(
                f" Repair updated {len(fixed)} file(s): "
                f"{', '.join(sorted(fixed))}"
            )
            return merged
        self._log_event(" WARNING: Repair produced no extractable files")
        return files

    # ── Phase 2b: Single-Shot Generate + Exec-Fix (legacy) ───────────────

    def _phase2_generate_and_fix(
        self,
        topic: str,
        exp_plan: str,
        metric: str,
        pkg_hint: str,
        arch_spec: str,
        max_tokens: int,
    ) -> dict[str, str]:
        """Generate code in single shot, then iteratively fix via sandbox."""
        self._log_event("Phase 2: Single-shot generate + exec-fix")
        # Initial generation (uses the existing code_generation prompt)
        files = self._generate_code(
            topic, exp_plan, metric, pkg_hint, arch_spec, max_tokens,
        )
        if not files:
            self._log_event(" WARNING: empty generation, returning fallback")
            return files
        return self._exec_fix_loop(files)

    def _exec_fix_loop(self, files: dict[str, str]) -> dict[str, str]:
        """Run exec-fix loop if sandbox is available.

        Repeatedly runs the project in the sandbox and asks the LLM to fix
        crashes, up to ``exec_fix_max_iterations`` times.
        """
        if not self._sandbox_factory or self._cfg.exec_fix_max_iterations <= 0:
            return files
        for i in range(self._cfg.exec_fix_max_iterations):
            result = self._run_in_sandbox(files)
            if result.returncode == 0:
                self._log_event(f" Exec-fix iter {i}: code runs OK")
                break
            self._log_event(
                f" Exec-fix iter {i}: crashed (rc={result.returncode}), "
                f"stderr={len(result.stderr or '')} chars"
            )
            # NOTE(review): the fix produced on the final iteration is
            # returned without being re-run — confirm that is acceptable.
            files = self._fix_runtime_error(files, result)
        return files

    def _generate_code(
        self,
        topic: str,
        exp_plan: str,
        metric: str,
        pkg_hint: str,
        arch_spec: str,
        max_tokens: int,
    ) -> dict[str, str]:
        """Single code generation call with architecture spec injected."""
        # Inject architecture specification into the pkg_hint slot
        hint = pkg_hint
        if arch_spec:
            hint = (
                f"{pkg_hint}\n\n"
                "## ARCHITECTURE SPECIFICATION (follow this file and class structure)\n"
                f"{arch_spec}\n"
            )
        # BUG-004: Inject numerical stability requirements
        hint += (
            "\n\n## NUMERICAL STABILITY (MANDATORY)\n"
            "- Add gradient clipping: `torch.nn.utils.clip_grad_norm_(params, 1.0)`\n"
            "- After each optimizer step, check for NaN loss:\n"
            " `if torch.isnan(loss): print('FAIL: NaN detected'); break`\n"
            "- When logging metrics, guard against NaN/Inf:\n"
            " `v = float(val); v = 0.0 if (math.isnan(v) or math.isinf(v)) else v`\n"
            "- For RL: clip rewards to [-10, 10], use reward normalization\n"
        )
        sp = self._pm.for_stage(
            "code_generation",
            topic=topic,
            metric=metric,
            pkg_hint=hint,
            exp_plan=exp_plan,
        )
        resp = self._chat(sp.system, sp.user, max_tokens=max_tokens)
        files = self._extract_files(resp.content)
        if not files and resp.content.strip():
            # Retry with higher token budget
            self._log_event(" Empty extraction, retrying with 32768 tokens")
            resp = self._chat(sp.system, sp.user, max_tokens=32768)
            files = self._extract_files(resp.content)
        return files

    def _fix_runtime_error(
        self,
        files: dict[str, str],
        result: Any,
    ) -> dict[str, str]:
        """Fix a runtime error using targeted or full-file repair.

        E-05: Parse the error traceback to identify the failing file and
        line, then send only the affected file with a focused context
        window. Falls back to full-file repair if parsing fails.
        """
        # Bound the amount of crash output forwarded to the LLM.
        stderr_tail = (result.stderr or "")[-3000:]
        stdout_tail = "\n".join(
            (result.stdout or "").split("\n")[-50:]
        )
        # Try targeted repair first (E-05)
        error_loc = self._parse_error_location(stderr_tail, files)
        if error_loc:
            fname, lineno, error_msg = error_loc
            self._log_event(
                f" Targeted repair: {fname}:{lineno} — {error_msg[:80]}"
            )
            fixed = self._targeted_file_repair(
                files, fname, lineno, error_msg, stderr_tail,
            )
            if fixed:
                return fixed
        # Fallback: full-file repair
        files_ctx = self._format_files(files)
        sp = self._pm.sub_prompt(
            "code_exec_fix",
            stderr=stderr_tail or "(empty)",
            stdout_tail=stdout_tail or "(empty)",
            returncode=str(result.returncode),
            files_context=files_ctx,
        )
        resp = self._chat(sp.system, sp.user, max_tokens=16384)
        fixed = self._extract_files(resp.content)
        if fixed:
            merged = dict(files)
            merged.update(fixed)
            return merged
        return files

    @staticmethod
    def _parse_error_location(
        stderr: str,
        files: dict[str, str],
    ) -> tuple[str, int, str] | None:
        """Parse Python traceback to find failing file and line.

        Returns (filename, line_number, error_message) or None.
        """
        known_files = set(files.keys())
        # Parse traceback lines: File "xxx.py", line NNN
        tb_pattern = re.compile(
            r'File "(?:[^"]*[/\\])?([^"]+\.py)", line (\d+)'
        )
        matches = list(tb_pattern.finditer(stderr))
        if not matches:
            return None
        # Find the last match that references one of our files
        # (the deepest frame in our code, where the error actually surfaced).
        for m in reversed(matches):
            fname = m.group(1)
            lineno = int(m.group(2))
            if fname in known_files:
                # Extract error message (last line of stderr)
                lines = stderr.strip().split("\n")
                error_msg = lines[-1] if lines else "Unknown error"
                return fname, lineno, error_msg
        return None

    def _targeted_file_repair(
        self,
        files: dict[str, str],
        target_file: str,
        error_line: int,
        error_msg: str,
        full_stderr: str,
    ) -> dict[str, str] | None:
        """Repair a single file with focused context around the error.

        Returns the merged file set on success, or None so the caller can
        fall back to full-file repair.
        """
        if target_file not in files:
            return None
        code = files[target_file]
        code_lines = code.split("\n")
        total_lines = len(code_lines)
        # Extract context window: ±30 lines around error
        window = 30
        start = max(0, error_line - window - 1)
        end = min(total_lines, error_line + window)
        context_lines = code_lines[start:end]
        # Number the lines for the LLM
        numbered = "\n".join(
            f"{start + i + 1:4d} | {line}"
            for i, line in enumerate(context_lines)
        )
        # Build compact dependency context (summaries only)
        dep_summaries = ""
        for fname, fcode in files.items():
            if fname != target_file and fname.endswith(".py"):
                summary = self._build_code_summary(fname, fcode)
                dep_summaries += (
                    f"\n### {fname}: "
                    f"{len(summary.get('classes', []))} classes, "
                    f"{len(summary.get('functions', []))} functions\n"
                )
                for cls in summary.get("classes", []):
                    methods = ", ".join(
                        m["name"] for m in cls.get("methods", [])
                    )
                    dep_summaries += (
                        f" class {cls['name']}"
                        f"({', '.join(cls.get('bases', []))})"
                        f": [{methods}]\n"
                    )
        prompt = (
            f"Fix the runtime error in `{target_file}` at line {error_line}.\n\n"
            f"## Error\n```\n{error_msg}\n```\n\n"
            f"## Full Traceback (last 1500 chars)\n"
            f"```\n{full_stderr[-1500:]}\n```\n\n"
            f"## {target_file} (lines {start + 1}-{end})\n"
            f"```python\n{numbered}\n```\n\n"
            f"## Other Files in Project\n{dep_summaries}\n\n"
            f"## Full File ({target_file}, {total_lines} lines)\n"
            f"```python\n{code}\n```\n\n"
            f"Output the COMPLETE fixed `{target_file}` in "
            f"```filename:{target_file}``` format. Fix the root cause, "
            f"not just the symptom."
        )
        sys_prompt = (
            "You are a debugging expert. Fix the specific runtime error "
            "shown. Preserve experiment design and scientific methodology. "
            "Output the COMPLETE fixed file."
        )
        resp = self._chat(sys_prompt, prompt, max_tokens=16384)
        fixed = self._extract_files(resp.content)
        if not fixed:
            # Try extracting as single file
            code_match = re.search(
                r"```(?:python|filename:\S+)\s*\n(.*?)```",
                resp.content,
                re.DOTALL,
            )
            if code_match:
                fixed = {target_file: code_match.group(1).strip()}
        if fixed and target_file in fixed:
            merged = dict(files)
            merged.update(fixed)
            self._log_event(
                f" Targeted repair applied to {target_file} "
                f"({len(fixed[target_file].split(chr(10)))} lines)"
            )
            return merged
        return None

    # ── Phase 3: Solution Tree Search ─────────────────────────────────────

    def _phase3_tree_search(
        self,
        topic: str,
        exp_plan: str,
        metric: str,
        pkg_hint: str,
        arch_spec: str,
        max_tokens: int,
    ) -> tuple[SolutionNode, int]:
        """Explore multiple candidate solutions via tree search."""
        self._log_event("Phase 3: Solution tree search")
        all_nodes: list[SolutionNode] = []
        # Generate initial candidates
        n_cand = max(self._cfg.tree_search_candidates, 1)
        for k in range(n_cand):
            self._log_event(f" Generating candidate {k + 1}/{n_cand}")
            files = self._generate_code(
                topic, exp_plan, metric, pkg_hint, arch_spec, max_tokens,
            )
            node = SolutionNode(
                node_id=f"gen-{k}",
                files=files,
                depth=0,
                generation_method="initial",
            )
            all_nodes.append(node)
        # Iterative evaluate-fix-branch loop
        for depth in range(self._cfg.tree_search_max_depth):
            # Evaluate unevaluated nodes
            for node in all_nodes:
                if not node.evaluated:
                    self._evaluate_node(node, metric)
            # Sort by score
all_nodes.sort(key=lambda n: n.score, reverse=True) self._log_event( f" Depth {depth}: {len(all_nodes)} nodes, " f"best={all_nodes[0].node_id} score={all_nodes[0].score:.2f}" ) # If best runs OK, we're done if all_nodes[0].runs_ok: break # Generate fix variants for top-2 crashing candidates new_nodes: list[SolutionNode] = [] for node in all_nodes[:2]: if not node.runs_ok: fixed_files = self._fix_runtime_error( node.files, _SimpleResult( returncode=node.returncode, stdout=node.stdout, stderr=node.stderr, ), ) new_node = SolutionNode( node_id=f"{node.node_id}-fix{depth}", files=fixed_files, parent_id=node.node_id, depth=depth + 1, generation_method="fix", ) new_nodes.append(new_node) all_nodes.extend(new_nodes) # Final evaluation of any remaining unevaluated nodes for node in all_nodes: if node.returncode == -1: self._evaluate_node(node, metric) all_nodes.sort(key=lambda n: n.score, reverse=True) best = all_nodes[0] self._log_event( f" Tree search complete: best={best.node_id} " f"score={best.score:.2f}, explored {len(all_nodes)} nodes" ) return best, len(all_nodes) def _evaluate_node(self, node: SolutionNode, metric_key: str) -> None: """Run a node's code in sandbox and update its score.""" if not node.files: node.score = 0.0 return result = self._run_in_sandbox( node.files, timeout_sec=self._cfg.tree_search_eval_timeout_sec, ) node.evaluated = True node.returncode = result.returncode node.stdout = result.stdout node.stderr = result.stderr node.runs_ok = result.returncode == 0 node.metrics = dict(result.metrics) if result.metrics else {} node.score = self._score_node(node, metric_key) @staticmethod def _score_node(node: SolutionNode, metric_key: str) -> float: """Score a solution node based on execution results.""" score = 0.0 if node.runs_ok: score += 1.0 if node.stdout and len(node.stdout) > 100: score += 0.3 # produces meaningful output if node.metrics: score += 0.5 if metric_key in node.metrics: score += 0.5 if node.stderr and "Error" in node.stderr: score -= 
0.2 return max(score, 0.0) # ── Phase 5: Multi-Agent Review Dialog ──────────────────────────────── def _phase4_review( self, files: dict[str, str], topic: str, exp_plan: str, metric: str, ) -> tuple[dict[str, str], int]: """Reviewer agent examines code; coder fixes critical issues.""" self._log_event("Phase 4: Review dialog") rounds = 0 for r in range(self._cfg.review_max_rounds): rounds += 1 files_ctx = self._format_files(files) sp = self._pm.sub_prompt( "code_reviewer", topic=topic, exp_plan=exp_plan, metric=metric, files_context=files_ctx, ) resp = self._chat(sp.system, sp.user, max_tokens=4096) review = self._parse_json(resp.content) if not isinstance(review, dict) or not review: self._log_event( f" Review round {r + 1}: could not parse JSON, skipping" ) break verdict = review.get("verdict", "APPROVE") score = review.get("score", 10) critical = review.get("critical_issues", []) self._log_event( f" Review round {r + 1}: verdict={verdict}, score={score}, " f"critical_issues={len(critical)}" ) if verdict == "APPROVE" or not critical: break # Fix critical issues using the code_generation system prompt fix_prompt = ( "A code reviewer found these critical issues in your experiment code.\n" "Fix ALL of them while preserving the experiment design.\n\n" "## Critical Issues\n" + "\n".join(f"- {issue}" for issue in critical) + f"\n\n## Current Code\n{files_ctx}\n\n" "Output ALL files in ```filename:xxx.py``` format, " "including unchanged files." 
) sys_prompt = self._pm.system("code_generation") fix_resp = self._chat(sys_prompt, fix_prompt, max_tokens=16384) fixed = self._extract_files(fix_resp.content) if fixed: files = dict(files) files.update(fixed) return files, rounds # ── Helpers ──────────────────────────────────────────────────────────── def _chat(self, system: str, user: str, max_tokens: int = 8192) -> Any: """Make an LLM call and track count.""" self._calls += 1 messages = [{"role": "user", "content": user}] return self._llm.chat( messages=messages, system=system, max_tokens=max_tokens, ) def _get_or_create_sandbox(self) -> _SandboxLike: """Lazily create a single sandbox instance for all validation runs.""" if self._sandbox is None: sandbox_dir = self._stage_dir / "agent_sandbox" sandbox_dir.mkdir(parents=True, exist_ok=True) self._sandbox = self._sandbox_factory( self._exp_config, sandbox_dir, ) return self._sandbox def _run_in_sandbox( self, files: dict[str, str], timeout_sec: int | None = None, ) -> Any: """Write files to a temp directory and run in sandbox.""" if not self._sandbox_factory: raise RuntimeError("No sandbox factory configured") self._runs += 1 timeout = timeout_sec or self._cfg.exec_fix_timeout_sec # Write files to a numbered attempt directory run_dir = self._stage_dir / "agent_runs" / f"attempt_{self._runs:03d}" run_dir.mkdir(parents=True, exist_ok=True) for fname, code in files.items(): fpath = (run_dir / fname).resolve() # BUG-CA-10: Prevent path traversal from LLM-generated filenames if not fpath.is_relative_to(run_dir.resolve()): self._log_event(f" WARNING: Skipping path-traversal filename: {fname}") continue fpath.parent.mkdir(parents=True, exist_ok=True) fpath.write_text(code, encoding="utf-8") # Run using the sandbox sandbox = self._get_or_create_sandbox() try: result = sandbox.run_project(run_dir, timeout_sec=timeout) except Exception as exc: self._log_event(f" Sandbox run failed: {exc}") result = _SimpleResult( returncode=1, stdout="", stderr=f"Sandbox exception: {exc}", 
) return result def _extract_files(self, content: str) -> dict[str, str]: """Extract multi-file code blocks from LLM output.""" # Local import to avoid circular dependency with executor.py from researchclaw.pipeline.executor import _extract_multi_file_blocks return _extract_multi_file_blocks(content) @staticmethod def _format_files(files: dict[str, str]) -> str: """Format files for inclusion in a prompt.""" parts = [] for fname in sorted(files): parts.append(f"```filename:{fname}\n{files[fname]}\n```") return "\n\n".join(parts) @staticmethod def _parse_json(text: str) -> dict[str, Any] | None: """Best-effort JSON extraction from LLM response. BUG-17: Always returns ``dict | None`` — never a bare string or list, which would cause ``.get()`` crashes in callers. """ def _as_dict(val: Any) -> dict[str, Any] | None: return val if isinstance(val, dict) else None # Direct parse try: return _as_dict(json.loads(text)) except (json.JSONDecodeError, ValueError): pass # ```json``` fenced block m = re.search(r"```json\s*\n(.*?)```", text, re.DOTALL) if m: try: return _as_dict(json.loads(m.group(1))) except (json.JSONDecodeError, ValueError): pass # First {...} object (supports up to 2 levels of nesting) m = re.search( r"\{[^{}]*(?:\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}[^{}]*)*\}", text, re.DOTALL, ) if m: try: return _as_dict(json.loads(m.group(0))) except (json.JSONDecodeError, ValueError): pass return None def _log_event(self, msg: str) -> None: """Log to both Python logger and the internal validation log.""" logger.info("[CodeAgent] %s", msg) self._log.append(msg) # --------------------------------------------------------------------------- # Lightweight result stand-in for error plumbing # --------------------------------------------------------------------------- @dataclass class _SimpleResult: """Minimal sandbox result for internal error plumbing.""" returncode: int = 1 stdout: str = "" stderr: str = "" elapsed_sec: float = 0.0 metrics: dict[str, object] = 
field(default_factory=dict) timed_out: bool = False


================================================
FILE: researchclaw/pipeline/contracts.py
================================================
"""Stage I/O contracts for the 23-stage ResearchClaw pipeline.

Each StageContract declares:
- input_files: artifacts this stage reads (produced by prior stages)
- output_files: artifacts this stage must produce
- dod: Definition of Done — human-readable acceptance criterion
- error_code: unique error identifier for diagnostics
- max_retries: how many times the stage may be retried on failure
"""
from __future__ import annotations

from dataclasses import dataclass

from researchclaw.pipeline.stages import Stage


@dataclass(frozen=True)
class StageContract:
    """Declarative I/O contract for a single pipeline stage."""

    # The stage this contract governs.
    stage: Stage
    # Artifacts read from prior stages' output directories.
    input_files: tuple[str, ...]
    # Artifacts this stage must produce; a trailing "/" denotes a directory
    # (see the output-validation loop in executor.execute_stage).
    output_files: tuple[str, ...]
    # Definition of Done — human-readable acceptance criterion.
    dod: str
    # Unique error identifier surfaced in diagnostics.
    error_code: str
    # Retry budget on failure; 0 for human-gated stages.
    max_retries: int = 1


# One contract per Stage; executor.execute_stage looks stages up here to
# validate inputs before running and outputs afterwards.
CONTRACTS: dict[Stage, StageContract] = {
    # Phase A: Research Scoping
    Stage.TOPIC_INIT: StageContract(
        stage=Stage.TOPIC_INIT,
        input_files=(),
        output_files=("goal.md", "hardware_profile.json"),
        dod="SMART goal statement with topic, scope, and constraints",
        error_code="E01_INVALID_GOAL",
        max_retries=0,
    ),
    Stage.PROBLEM_DECOMPOSE: StageContract(
        stage=Stage.PROBLEM_DECOMPOSE,
        input_files=("goal.md",),
        output_files=("problem_tree.md",),
        dod=">=3 prioritized sub-questions identified",
        error_code="E02_DECOMP_FAIL",
    ),
    # Phase B: Literature Discovery
    Stage.SEARCH_STRATEGY: StageContract(
        stage=Stage.SEARCH_STRATEGY,
        input_files=("problem_tree.md",),
        output_files=("search_plan.yaml", "sources.json", "queries.json"),
        dod=">=2 search strategies defined with verified data sources",
        error_code="E03_STRATEGY_BAD",
    ),
    Stage.LITERATURE_COLLECT: StageContract(
        stage=Stage.LITERATURE_COLLECT,
        input_files=("search_plan.yaml",),
        output_files=("candidates.jsonl",),
        dod=">=N candidate papers collected from specified sources",
        error_code="E04_COLLECT_EMPTY",
        max_retries=2,
    ),
    Stage.LITERATURE_SCREEN: StageContract(
        stage=Stage.LITERATURE_SCREEN,
        input_files=("candidates.jsonl",),
        output_files=("shortlist.jsonl",),
        dod="Relevance + quality dual screening completed and approved",
        error_code="E05_GATE_REJECT",
        max_retries=0,
    ),
    Stage.KNOWLEDGE_EXTRACT: StageContract(
        stage=Stage.KNOWLEDGE_EXTRACT,
        input_files=("shortlist.jsonl",),
        output_files=("cards/",),
        dod="Structured knowledge card per shortlisted paper",
        error_code="E06_EXTRACT_FAIL",
    ),
    # Phase C: Knowledge Synthesis
    Stage.SYNTHESIS: StageContract(
        stage=Stage.SYNTHESIS,
        input_files=("cards/",),
        output_files=("synthesis.md",),
        dod="Topic clusters + >=2 research gaps identified",
        error_code="E07_SYNTHESIS_WEAK",
    ),
    Stage.HYPOTHESIS_GEN: StageContract(
        stage=Stage.HYPOTHESIS_GEN,
        input_files=("synthesis.md",),
        output_files=("hypotheses.md",),
        dod=">=2 falsifiable research hypotheses",
        error_code="E08_HYP_INVALID",
    ),
    # Phase D: Experiment Design
    Stage.EXPERIMENT_DESIGN: StageContract(
        stage=Stage.EXPERIMENT_DESIGN,
        input_files=("hypotheses.md",),
        output_files=("exp_plan.yaml",),
        dod="Experiment plan with baselines, ablations, metrics approved",
        error_code="E09_GATE_REJECT",
        max_retries=0,
    ),
    Stage.CODE_GENERATION: StageContract(
        stage=Stage.CODE_GENERATION,
        input_files=("exp_plan.yaml",),
        output_files=("experiment/", "experiment_spec.md"),
        dod="Multi-file experiment project + spec document",
        error_code="E10_CODEGEN_FAIL",
        max_retries=2,
    ),
    Stage.RESOURCE_PLANNING: StageContract(
        stage=Stage.RESOURCE_PLANNING,
        input_files=("exp_plan.yaml",),
        output_files=("schedule.json",),
        dod="Resource schedule with GPU/time estimates",
        error_code="E11_SCHED_CONFLICT",
    ),
    # Phase E: Experiment Execution
    Stage.EXPERIMENT_RUN: StageContract(
        stage=Stage.EXPERIMENT_RUN,
        input_files=("schedule.json", "experiment/"),
        output_files=("runs/",),
        dod="All scheduled experiment runs completed with artifacts",
        error_code="E12_RUN_FAIL",
        max_retries=2,
    ),
    Stage.ITERATIVE_REFINE: StageContract(
        stage=Stage.ITERATIVE_REFINE,
        input_files=("runs/",),
        output_files=("refinement_log.json", "experiment_final/"),
        dod="Edit-run-eval loop converged or max iterations reached",
        error_code="E13_REFINE_FAIL",
        max_retries=2,
    ),
    # Phase F: Analysis & Decision
    Stage.RESULT_ANALYSIS: StageContract(
        stage=Stage.RESULT_ANALYSIS,
        input_files=("runs/",),
        output_files=("analysis.md",),
        dod="Metrics analyzed with statistical tests and conclusions",
        error_code="E14_ANALYSIS_ERR",
    ),
    Stage.RESEARCH_DECISION: StageContract(
        stage=Stage.RESEARCH_DECISION,
        input_files=("analysis.md",),
        output_files=("decision.md",),
        dod="PROCEED/PIVOT decision with evidence-based justification",
        error_code="E15_DECISION_FAIL",
    ),
    # Phase G: Paper Writing
    Stage.PAPER_OUTLINE: StageContract(
        stage=Stage.PAPER_OUTLINE,
        input_files=("analysis.md", "decision.md"),
        output_files=("outline.md",),
        dod="Complete paper outline with section-level detail",
        error_code="E16_OUTLINE_FAIL",
    ),
    Stage.PAPER_DRAFT: StageContract(
        stage=Stage.PAPER_DRAFT,
        input_files=("outline.md",),
        output_files=("paper_draft.md",),
        dod="Full paper draft with all sections written",
        error_code="E17_DRAFT_FAIL",
    ),
    Stage.PEER_REVIEW: StageContract(
        stage=Stage.PEER_REVIEW,
        input_files=("paper_draft.md",),
        output_files=("reviews.md",),
        dod=">=2 simulated review perspectives with actionable feedback",
        error_code="E18_REVIEW_FAIL",
    ),
    Stage.PAPER_REVISION: StageContract(
        stage=Stage.PAPER_REVISION,
        input_files=("paper_draft.md", "reviews.md"),
        output_files=("paper_revised.md",),
        dod="All review comments addressed with tracked changes",
        error_code="E19_REVISION_FAIL",
    ),
    # Phase H: Finalization
    Stage.QUALITY_GATE: StageContract(
        stage=Stage.QUALITY_GATE,
        input_files=("paper_revised.md",),
        output_files=("quality_report.json",),
        dod="Quality score meets threshold and approved",
        error_code="E20_GATE_REJECT",
        max_retries=0,
    ),
    Stage.KNOWLEDGE_ARCHIVE: StageContract(
        stage=Stage.KNOWLEDGE_ARCHIVE,
        input_files=(),
        output_files=("archive.md", "bundle_index.json"),
        dod="Retrospective + reproducibility bundle archived",
        error_code="E21_ARCHIVE_FAIL",
    ),
    Stage.EXPORT_PUBLISH: StageContract(
        stage=Stage.EXPORT_PUBLISH,
        input_files=("paper_revised.md",),
        output_files=("paper_final.md", "code/"),
        dod="Final paper exported in target format",
        error_code="E22_EXPORT_FAIL",
    ),
    Stage.CITATION_VERIFY: StageContract(
        stage=Stage.CITATION_VERIFY,
        input_files=("paper_final.md",),  # references.bib is optional (BUG-50)
        output_files=("verification_report.json", "references_verified.bib"),
        dod="All citations verified against real APIs; hallucinated refs flagged",
        error_code="E23_VERIFY_FAIL",
    ),
}


================================================
FILE: researchclaw/pipeline/executor.py
================================================
from __future__ import annotations

import json
import logging
import math
import re
import time as _time
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Callable

import yaml

from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.hardware import HardwareProfile, detect_hardware, ensure_torch_available, is_metric_name
from researchclaw.llm import create_llm_client
from researchclaw.llm.client import LLMClient
from researchclaw.prompts import PromptManager
from researchclaw.pipeline.stages import (
    NEXT_STAGE,
    Stage,
    StageStatus,
    TransitionEvent,
    TransitionOutcome,
    advance,
    gate_required,
)
from researchclaw.pipeline.contracts import CONTRACTS, StageContract
from researchclaw.experiment.validator import (
    CodeValidation,
    format_issues_for_llm,
    validate_code,
)

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Domain detection (extracted to _domain.py)
# ---------------------------------------------------------------------------
from researchclaw.pipeline._domain import (  # noqa: E402
    _DOMAIN_KEYWORDS,
    _detect_domain,
    _is_ml_domain,
)

#
# ---------------------------------------------------------------------------
# Shared helpers (extracted to _helpers.py)
# ---------------------------------------------------------------------------
from researchclaw.pipeline._helpers import (  # noqa: E402
    StageResult,
    _METACLAW_SKILLS_DIR,
    _SANDBOX_SAFE_PACKAGES,
    _STOP_WORDS,
    _build_context_preamble,
    _build_fallback_queries,
    _chat_with_prompt,
    _collect_experiment_results,
    _collect_json_context,
    _default_hypotheses,
    _default_paper_outline,
    _default_quality_report,
    _detect_runtime_issues,
    _ensure_sandbox_deps,
    _extract_code_block,
    _extract_multi_file_blocks,
    _extract_paper_title,
    _extract_topic_keywords,
    _extract_yaml_block,
    _find_prior_file,
    _generate_framework_diagram_prompt,
    _generate_neurips_checklist,
    _get_evolution_overlay,
    _load_hardware_profile,
    _multi_perspective_generate,
    _parse_jsonl_rows,
    _parse_metrics_from_stdout,
    _read_prior_artifact,
    _safe_filename,
    _safe_json_loads,
    _synthesize_perspectives,
    _topic_constraint_block,
    _utcnow_iso,
    _write_jsonl,
    _write_stage_meta,
    reconcile_figure_refs,
)

# ---------------------------------------------------------------------------
# Stages 1-2 (extracted to stage_impls/_topic.py)
# ---------------------------------------------------------------------------
from researchclaw.pipeline.stage_impls._topic import (  # noqa: E402
    _execute_topic_init,
    _execute_problem_decompose,
)

# ---------------------------------------------------------------------------
# Stages 3-6 (extracted to stage_impls/_literature.py)
# ---------------------------------------------------------------------------
from researchclaw.pipeline.stage_impls._literature import (  # noqa: E402
    _execute_search_strategy,
    _execute_literature_collect,
    _execute_literature_screen,
    _execute_knowledge_extract,
    _expand_search_queries,
)

# ---------------------------------------------------------------------------
# Stages 7-8 (extracted to stage_impls/_synthesis.py)
# ---------------------------------------------------------------------------
from researchclaw.pipeline.stage_impls._synthesis import (  # noqa: E402
    _execute_synthesis,
    _execute_hypothesis_gen,
)

# ---------------------------------------------------------------------------
# Stage 9 (extracted to stage_impls/_experiment_design.py)
# ---------------------------------------------------------------------------
from researchclaw.pipeline.stage_impls._experiment_design import (  # noqa: E402
    _execute_experiment_design,
)

# ---------------------------------------------------------------------------
# Stage 10 (extracted to stage_impls/_code_generation.py)
# ---------------------------------------------------------------------------
from researchclaw.pipeline.stage_impls._code_generation import (  # noqa: E402
    _execute_code_generation,
)

# ---------------------------------------------------------------------------
# Stages 11-13 (extracted to stage_impls/_execution.py)
# ---------------------------------------------------------------------------
from researchclaw.pipeline.stage_impls._execution import (  # noqa: E402
    _execute_resource_planning,
    _execute_experiment_run,
    _execute_iterative_refine,
)

# ---------------------------------------------------------------------------
# Stages 14-15 (extracted to stage_impls/_analysis.py)
# ---------------------------------------------------------------------------
from researchclaw.pipeline.stage_impls._analysis import (  # noqa: E402
    _execute_result_analysis,
    _parse_decision,
    _execute_research_decision,
)

# ---------------------------------------------------------------------------
# Stages 16-17 (extracted to stage_impls/_paper_writing.py)
# ---------------------------------------------------------------------------
from researchclaw.pipeline.stage_impls._paper_writing import (  # noqa: E402
    _execute_paper_outline,
    _execute_paper_draft,
    _collect_raw_experiment_metrics,
    _write_paper_sections,
    _validate_draft_quality,
    _review_compiled_pdf,
    _check_ablation_effectiveness,
    _detect_result_contradictions,
    _BULLET_LENIENT_SECTIONS,
    _BALANCE_SECTIONS,
)

# ---------------------------------------------------------------------------
# Stages 18-23 (extracted to stage_impls/_review_publish.py)
# ---------------------------------------------------------------------------
from researchclaw.pipeline.stage_impls._review_publish import (  # noqa: E402
    _execute_peer_review,
    _execute_paper_revision,
    _execute_quality_gate,
    _execute_knowledge_archive,
    _execute_export_publish,
    _execute_citation_verify,
    _sanitize_fabricated_data,
    _collect_experiment_evidence,
    _check_citation_relevance,
    _remove_bibtex_entries,
    _remove_citations_from_text,
)

# Dispatch table: one executor callable per pipeline stage.
_STAGE_EXECUTORS: dict[Stage, Callable[..., StageResult]] = {
    Stage.TOPIC_INIT: _execute_topic_init,
    Stage.PROBLEM_DECOMPOSE: _execute_problem_decompose,
    Stage.SEARCH_STRATEGY: _execute_search_strategy,
    Stage.LITERATURE_COLLECT: _execute_literature_collect,
    Stage.LITERATURE_SCREEN: _execute_literature_screen,
    Stage.KNOWLEDGE_EXTRACT: _execute_knowledge_extract,
    Stage.SYNTHESIS: _execute_synthesis,
    Stage.HYPOTHESIS_GEN: _execute_hypothesis_gen,
    Stage.EXPERIMENT_DESIGN: _execute_experiment_design,
    Stage.CODE_GENERATION: _execute_code_generation,
    Stage.RESOURCE_PLANNING: _execute_resource_planning,
    Stage.EXPERIMENT_RUN: _execute_experiment_run,
    Stage.ITERATIVE_REFINE: _execute_iterative_refine,
    Stage.RESULT_ANALYSIS: _execute_result_analysis,
    Stage.RESEARCH_DECISION: _execute_research_decision,
    Stage.PAPER_OUTLINE: _execute_paper_outline,
    Stage.PAPER_DRAFT: _execute_paper_draft,
    Stage.PEER_REVIEW: _execute_peer_review,
    Stage.PAPER_REVISION: _execute_paper_revision,
    Stage.QUALITY_GATE: _execute_quality_gate,
    Stage.KNOWLEDGE_ARCHIVE: _execute_knowledge_archive,
    Stage.EXPORT_PUBLISH: _execute_export_publish,
    Stage.CITATION_VERIFY: _execute_citation_verify,
}


def execute_stage(
    stage: Stage,
    *,
    run_dir: Path,
    run_id: str,
    config: RCConfig,
    adapters: AdapterBundle,
    auto_approve_gates: bool = False,
) -> StageResult:
    """Execute one pipeline stage, validate outputs, and apply gate logic.

    Sequence: validate contract inputs -> notify/record start -> build an
    LLM client (best-effort) -> run the stage executor -> validate contract
    outputs -> optional MetaClaw PRM quality gate -> HITL approval gate ->
    persist stage metadata and a stage_health.json timing record.
    Returns the final StageResult (possibly FAILED or BLOCKED_APPROVAL).
    """
    stage_dir = run_dir / f"stage-{int(stage):02d}"
    stage_dir.mkdir(parents=True, exist_ok=True)
    _t_health_start = _time.monotonic()
    contract: StageContract = CONTRACTS[stage]
    # Fail fast if any contract-declared input artifact is missing.
    if contract.input_files:
        for input_file in contract.input_files:
            found = _read_prior_artifact(run_dir, input_file)
            if found is None:
                result = StageResult(
                    stage=stage,
                    status=StageStatus.FAILED,
                    artifacts=(),
                    error=f"Missing input: {input_file} (required by {stage.name})",
                    decision="retry",
                )
                _write_stage_meta(stage_dir, stage, run_id, result)
                return result
    bridge = config.openclaw_bridge
    if bridge.use_message and config.notifications.on_stage_start:
        adapters.message.notify(
            config.notifications.channel,
            f"stage-{int(stage):02d}-start",
            f"Starting {stage.name}",
        )
    if bridge.use_memory:
        adapters.memory.append("stages", f"{run_id}:{int(stage)}:running")
    # Best-effort LLM client construction; stages must tolerate llm=None.
    llm = None
    try:
        if config.llm.provider == "acp":
            llm = create_llm_client(config)
        else:
            candidate = LLMClient.from_rc_config(config)
            # Only use the client when it is actually configured.
            if candidate.config.base_url and candidate.config.api_key:
                llm = candidate
    except Exception as _llm_exc:  # noqa: BLE001
        logger.warning("LLM client creation failed: %s", _llm_exc)
        llm = None
    try:
        _ = advance(stage, StageStatus.PENDING, TransitionEvent.START)
        executor = _STAGE_EXECUTORS[stage]
        prompts = PromptManager(config.prompts.custom_file or None)  # type: ignore[attr-defined]
        try:
            result = executor(
                stage_dir, run_dir, config, adapters, llm=llm, prompts=prompts
            )
        except TypeError as exc:
            # Backward compatibility: older executors do not accept `prompts`.
            if "unexpected keyword argument 'prompts'" not in str(exc):
                raise
            result = executor(stage_dir, run_dir, config, adapters, llm=llm)
    except Exception as exc:  # noqa: BLE001
        logger.exception("Stage %s failed", stage.name)
        result = StageResult(
            stage=stage,
            status=StageStatus.FAILED,
            artifacts=(),
            error=str(exc),
            decision="retry",
        )
    # Contract output validation: a trailing "/" marks a directory that must
    # exist and be non-empty; plain names must exist with non-zero size.
    if result.status == StageStatus.DONE:
        for output_file in contract.output_files:
            if output_file.endswith("/"):
                path = stage_dir / output_file.rstrip("/")
                if not path.is_dir() or not any(path.iterdir()):
                    result = StageResult(
                        stage=stage,
                        status=StageStatus.FAILED,
                        artifacts=result.artifacts,
                        error=f"Missing output directory: {output_file}",
                        decision="retry",
                        evidence_refs=result.evidence_refs,
                    )
                    break
            else:
                path = stage_dir / output_file
                if not path.exists() or path.stat().st_size == 0:
                    result = StageResult(
                        stage=stage,
                        status=StageStatus.FAILED,
                        artifacts=result.artifacts,
                        error=f"Missing or empty output: {output_file}",
                        decision="retry",
                        evidence_refs=result.evidence_refs,
                    )
                    break
    # --- MetaClaw PRM quality gate evaluation ---
    try:
        mc_bridge = getattr(config, "metaclaw_bridge", None)
        if (
            mc_bridge
            and getattr(mc_bridge, "enabled", False)
            and result.status == StageStatus.DONE
        ):
            mc_prm = getattr(mc_bridge, "prm", None)
            if mc_prm and getattr(mc_prm, "enabled", False):
                prm_stages = getattr(mc_prm, "gate_stages", (5, 9, 15, 20))
                if int(stage) in prm_stages:
                    from researchclaw.metaclaw_bridge.prm_gate import ResearchPRMGate
                    prm_gate = ResearchPRMGate.from_bridge_config(mc_prm)
                    if prm_gate is not None:
                        # Read stage output for PRM evaluation
                        # (first 4000 chars of each readable file artifact).
                        output_text = ""
                        for art in result.artifacts:
                            art_path = stage_dir / art
                            if art_path.exists() and art_path.is_file():
                                try:
                                    output_text += art_path.read_text(encoding="utf-8")[:4000]
                                except (UnicodeDecodeError, OSError):
                                    pass
                        if output_text:
                            prm_score = prm_gate.evaluate_stage(int(stage), output_text)
                            logger.info(
                                "MetaClaw PRM score for stage %d: %.1f",
                                int(stage),
                                prm_score,
                            )
                            # Write PRM score to stage health
                            import json as _prm_json
                            prm_report = {
                                "stage": int(stage),
                                "prm_score": prm_score,
                                "model": prm_gate.model,
                                "votes": prm_gate.votes,
                            }
                            (stage_dir / "prm_score.json").write_text(
                                _prm_json.dumps(prm_report, indent=2),
                                encoding="utf-8",
                            )
                            # If PRM score is -1 (fail), mark stage as failed
                            if prm_score == -1.0:
                                logger.warning(
                                    "MetaClaw PRM rejected stage %d output",
                                    int(stage),
                                )
                                result = StageResult(
                                    stage=result.stage,
                                    status=StageStatus.FAILED,
                                    artifacts=result.artifacts,
                                    error="PRM quality gate: output below quality threshold",
                                    decision="retry",
                                    evidence_refs=result.evidence_refs,
                                )
    except Exception:  # noqa: BLE001
        # PRM gating is advisory; never let it take down the stage itself.
        logger.warning("MetaClaw PRM evaluation failed (non-blocking)")
    # Human-in-the-loop gate: either auto-approve or block for approval.
    if gate_required(stage, config.security.hitl_required_stages):
        if auto_approve_gates:
            if bridge.use_memory:
                adapters.memory.append("gates", f"{run_id}:{int(stage)}:auto-approved")
        else:
            result = StageResult(
                stage=result.stage,
                status=StageStatus.BLOCKED_APPROVAL,
                artifacts=result.artifacts,
                error=result.error,
                decision="block",
                evidence_refs=result.evidence_refs,
            )
            if bridge.use_message and config.notifications.on_gate_required:
                adapters.message.notify(
                    config.notifications.channel,
                    f"gate-{int(stage):02d}",
                    f"Approval required for {stage.name}",
                )
    if bridge.use_memory:
        adapters.memory.append("stages", f"{run_id}:{int(stage)}:{result.status.value}")
    _write_stage_meta(stage_dir, stage, run_id, result)
    # Record wall-clock duration and outcome for monitoring (best-effort).
    _t_health_end = _time.monotonic()
    stage_health = {
        "stage_id": f"{int(stage):02d}-{stage.name.lower()}",
        "run_id": run_id,
        "duration_sec": round(_t_health_end - _t_health_start, 2),
        "status": result.status.value,
        "artifacts_count": len(result.artifacts),
        "error": result.error,
        "timestamp": _utcnow_iso(),
    }
    try:
        (stage_dir / "stage_health.json").write_text(
            json.dumps(stage_health, indent=2), encoding="utf-8"
        )
    except OSError:
        pass
    return result


================================================
FILE: researchclaw/pipeline/experiment_diagnosis.py
================================================
"""Experiment Diagnosis Agent — analyzes WHY experiments failed.

Parses experiment artifacts (stdout, stderr, metrics, experiment plan)
to produce a structured failure diagnosis with root cause classification
and concrete repair instructions.

Used by the experiment repair loop (``experiment_repair.py``) to generate
targeted fixes via OpenCode.
""" from __future__ import annotations import enum import logging import math import re from dataclasses import dataclass, field from typing import Any logger = logging.getLogger(__name__) class DeficiencyType(enum.Enum): """Classification of experiment failure modes.""" NO_CONDITIONS_COMPLETED = "no_conditions" TOO_FEW_CONDITIONS = "few_conditions" MISSING_BASELINE = "no_baseline" MISSING_PROPOSED = "no_proposed" INSUFFICIENT_SEEDS = "few_seeds" TIME_GUARD_DOMINANT = "time_guard" SYNTHETIC_DATA_FALLBACK = "synthetic_data" CODE_CRASH = "code_crash" MISSING_DEPENDENCY = "missing_dep" HYPERPARAMETER_ISSUE = "bad_hyperparams" IDENTICAL_CONDITIONS = "identical_conditions" PERMISSION_ERROR = "permission_error" DATASET_UNAVAILABLE = "dataset_unavailable" GPU_OOM = "gpu_oom" @dataclass class Deficiency: """A single identified deficiency in the experiment.""" type: DeficiencyType severity: str # "critical" | "major" | "minor" description: str affected_conditions: list[str] = field(default_factory=list) suggested_fix: str = "" error_message: str = "" # The actual error text from logs @dataclass class ExperimentDiagnosis: """Structured diagnosis of an experiment run.""" deficiencies: list[Deficiency] = field(default_factory=list) repairable: bool = True reason: str = "" # Why not repairable (if applicable) summary: str = "" conditions_completed: list[str] = field(default_factory=list) conditions_failed: list[str] = field(default_factory=list) total_planned: int = 0 completion_rate: float = 0.0 def has_critical(self) -> bool: return any(d.severity == "critical" for d in self.deficiencies) def to_repair_prompt(self) -> str: """Generate a structured repair prompt for OpenCode.""" lines = ["## EXPERIMENT DIAGNOSIS\n"] lines.append(f"Completion rate: {self.completion_rate:.0%} " f"({len(self.conditions_completed)}/{self.total_planned} conditions)\n") if self.conditions_completed: lines.append(f"Completed: {', '.join(self.conditions_completed)}") if self.conditions_failed: 
lines.append(f"Failed: {', '.join(self.conditions_failed)}\n") lines.append("## DEFICIENCIES (ordered by severity)\n") for d in sorted(self.deficiencies, key=lambda x: {"critical": 0, "major": 1, "minor": 2}.get(x.severity, 3)): lines.append(f"### [{d.severity.upper()}] {d.type.value}") lines.append(f"**Problem**: {d.description}") if d.error_message: lines.append(f"**Error**: ```{d.error_message[:500]}```") if d.affected_conditions: lines.append(f"**Affected conditions**: {', '.join(d.affected_conditions)}") lines.append(f"**Fix**: {d.suggested_fix}\n") return "\n".join(lines) def to_dict(self) -> dict: """Serialize for persistence.""" return { "deficiencies": [ { "type": d.type.value, "severity": d.severity, "description": d.description, "affected_conditions": d.affected_conditions, "suggested_fix": d.suggested_fix, "error_message": d.error_message, } for d in self.deficiencies ], "repairable": self.repairable, "reason": self.reason, "summary": self.summary, "conditions_completed": self.conditions_completed, "conditions_failed": self.conditions_failed, "total_planned": self.total_planned, "completion_rate": self.completion_rate, } # --------------------------------------------------------------------------- # Experiment Quality Assessment # --------------------------------------------------------------------------- class PaperMode(enum.Enum): """Paper writing mode based on experiment quality.""" FULL_PAPER = "full_paper" PRELIMINARY_STUDY = "preliminary_study" NEGATIVE_RESULT = "negative_result" TECHNICAL_REPORT = "technical_report" @dataclass class ExperimentQualityAssessment: """Deterministic assessment of experiment readiness for paper writing.""" sufficient: bool mode: PaperMode deficiencies: list[Deficiency] = field(default_factory=list) repair_possible: bool = True diagnosis: ExperimentDiagnosis | None = None def assess_experiment_quality( experiment_summary: dict, refinement_log: dict | None = None, experiment_plan: dict | None = None, *, min_conditions: 
int = 3, min_seeds: int = 2, ) -> ExperimentQualityAssessment: """Deterministic quality assessment of experiment data. Parameters ---------- experiment_summary: Parsed ``experiment_summary.json``. refinement_log: Parsed ``refinement_log.json``. experiment_plan: Parsed experiment plan (conditions list). min_conditions: Minimum conditions required for ``full_paper`` mode. min_seeds: Minimum seeds per condition for ``full_paper`` mode. """ # Run full diagnosis stdout = _extract_stdout(experiment_summary, refinement_log) stderr = _extract_stderr(experiment_summary, refinement_log) diagnosis = diagnose_experiment( experiment_summary=experiment_summary, refinement_log=refinement_log, stdout=stdout, stderr=stderr, experiment_plan=experiment_plan, ) # Determine paper mode mode = _select_paper_mode(experiment_summary, diagnosis, min_conditions, min_seeds) sufficient = mode == PaperMode.FULL_PAPER repair_possible = not diagnosis.has_critical() or diagnosis.repairable return ExperimentQualityAssessment( sufficient=sufficient, mode=mode, deficiencies=diagnosis.deficiencies, repair_possible=repair_possible, diagnosis=diagnosis, ) def _select_paper_mode( experiment_summary: dict, diagnosis: ExperimentDiagnosis, min_conditions: int, min_seeds: int, ) -> PaperMode: """Select paper mode based on experiment quality.""" # Check for synthetic data if any(d.type == DeficiencyType.SYNTHETIC_DATA_FALLBACK for d in diagnosis.deficiencies): return PaperMode.TECHNICAL_REPORT # Check for no conditions if not diagnosis.conditions_completed: return PaperMode.TECHNICAL_REPORT # Check if only 1 condition if len(diagnosis.conditions_completed) <= 1: return PaperMode.PRELIMINARY_STUDY # Check for sufficient conditions and seeds cond_summaries = experiment_summary.get("condition_summaries", {}) conditions_with_enough_seeds = 0 for cond_name in diagnosis.conditions_completed: cond_data = cond_summaries.get(cond_name, {}) n_seeds = cond_data.get("n_seeds", 1) # Heuristic: count per-seed keys in 
best_run metrics metrics = experiment_summary.get("best_run", {}).get("metrics", {}) seed_keys = [k for k in metrics if k.startswith(f"{cond_name}/") and re.match(r".*/\d+/", k)] # BUG-R6-05: Guard against re.match returning None actual_seeds = len(set( m.group(1) for k in seed_keys if (m := re.match(r".*/(\d+)/", k)) is not None )) if seed_keys else 1 if actual_seeds >= min_seeds: conditions_with_enough_seeds += 1 if len(diagnosis.conditions_completed) >= min_conditions and conditions_with_enough_seeds >= min_conditions: # Check for negative result best_run = experiment_summary.get("best_run", {}) metrics = best_run.get("metrics", {}) # Simple heuristic: if primary_metric is very low, might be negative result # This is refined by checking baseline vs proposed in the full pipeline return PaperMode.FULL_PAPER if len(diagnosis.conditions_completed) >= 2: return PaperMode.PRELIMINARY_STUDY return PaperMode.TECHNICAL_REPORT # --------------------------------------------------------------------------- # Core diagnosis logic # --------------------------------------------------------------------------- def diagnose_experiment( experiment_summary: dict, refinement_log: dict | None = None, stdout: str = "", stderr: str = "", experiment_plan: dict | None = None, *, prior_diagnoses: list[dict] | None = None, ) -> ExperimentDiagnosis: """Analyze experiment failures and produce structured diagnosis. Parameters ---------- experiment_summary: Parsed ``experiment_summary.json``. refinement_log: Parsed ``refinement_log.json``. stdout: Combined stdout from experiment execution. stderr: Combined stderr from experiment execution. experiment_plan: The designed experiment plan (planned conditions). prior_diagnoses: Previous diagnosis results (to avoid recommending same fix twice). 
""" diag = ExperimentDiagnosis() # Determine planned vs completed conditions planned_conditions = _get_planned_conditions(experiment_plan, experiment_summary) completed_conditions = _get_completed_conditions(experiment_summary) diag.total_planned = len(planned_conditions) diag.conditions_completed = sorted(completed_conditions) diag.conditions_failed = sorted(set(planned_conditions) - completed_conditions) diag.completion_rate = len(completed_conditions) / max(len(planned_conditions), 1) combined_output = stdout + "\n" + stderr # --- Pattern-based checks --- # 1. Missing dependencies _check_missing_deps(diag, combined_output) # 2. Permission errors _check_permission_errors(diag, combined_output) # 3. GPU OOM _check_gpu_oom(diag, combined_output) # 4. Time guard dominance _check_time_guard(diag, combined_output, planned_conditions, completed_conditions) # 5. Synthetic data fallback _check_synthetic_data(diag, combined_output) # 6. Dataset unavailability _check_dataset_issues(diag, combined_output) # 7. Code crashes _check_code_crashes(diag, stderr, combined_output) # 8. Hyperparameter issues _check_hyperparams(diag, combined_output, experiment_summary) # 9. Identical conditions _check_identical_conditions(diag, experiment_summary) # 10. Insufficient seeds _check_insufficient_seeds(diag, experiment_summary) # 11. Near-random accuracy (BUG-204) _check_near_random_accuracy(diag, experiment_summary) # 12. No conditions at all if not completed_conditions: diag.deficiencies.append(Deficiency( type=DeficiencyType.NO_CONDITIONS_COMPLETED, severity="critical", description="No experimental conditions completed successfully.", suggested_fix="Fix the root cause errors above, then re-run.", )) # Determine repairability _assess_repairability(diag, prior_diagnoses) # Build summary diag.summary = ( f"{len(diag.deficiencies)} deficiency(ies) found. " f"{len(completed_conditions)}/{len(planned_conditions)} conditions completed. " f"Repairable: {diag.repairable}." 
) return diag # --------------------------------------------------------------------------- # Individual check functions # --------------------------------------------------------------------------- def _check_missing_deps(diag: ExperimentDiagnosis, output: str) -> None: """Detect missing Python package errors.""" pattern = re.compile(r"ModuleNotFoundError: No module named '([^']+)'") for m in pattern.finditer(output): module = m.group(1) diag.deficiencies.append(Deficiency( type=DeficiencyType.MISSING_DEPENDENCY, severity="critical", description=f"Missing Python package: {module}", error_message=m.group(0), suggested_fix=f"Add '{module}' to requirements.txt and re-run.", )) # Also check for missing system libraries (Box2D, etc.) if "box2d" in output.lower() or "Box2D" in output: if "not available" in output.lower() or "not installed" in output.lower(): diag.deficiencies.append(Deficiency( type=DeficiencyType.MISSING_DEPENDENCY, severity="critical", description="Box2D library not available — LunarLander environments will fail.", suggested_fix="Add 'box2d-py' and 'gymnasium[box2d]' to requirements.txt.", )) def _check_permission_errors(diag: ExperimentDiagnosis, output: str) -> None: """Detect file/network permission errors.""" patterns = [ (r"PermissionError.*?(?:huggingface|hf|model|download)", "HuggingFace model download blocked"), (r"PermissionError", "File permission error"), (r"403.*?Forbidden.*?(?:huggingface|hf)", "HuggingFace API access denied"), ] for pat, desc in patterns: if re.search(pat, output, re.IGNORECASE): diag.deficiencies.append(Deficiency( type=DeficiencyType.PERMISSION_ERROR, severity="critical", description=desc, error_message=_extract_context(output, pat), suggested_fix=( "Pre-cache the model in setup.py, or switch to a smaller " "model (e.g., distilgpt2 instead of gpt2). Ensure HF_TOKEN " "is set if using gated models." 
), )) break # One permission error is enough def _check_gpu_oom(diag: ExperimentDiagnosis, output: str) -> None: """Detect GPU out-of-memory errors.""" if re.search(r"CUDA out of memory|RuntimeError.*?OOM|torch\.cuda\.OutOfMemoryError", output): diag.deficiencies.append(Deficiency( type=DeficiencyType.GPU_OOM, severity="major", description="GPU out of memory during training.", error_message=_extract_context(output, r"CUDA out of memory"), suggested_fix=( "Reduce batch size by 50%. If still OOM, reduce model size " "or use gradient checkpointing." ), )) def _check_time_guard( diag: ExperimentDiagnosis, output: str, planned: set[str], completed: set[str], ) -> None: """Detect time guard killing too many conditions.""" # Count TIME_GUARD mentions time_guard_hits = len(re.findall(r"TIME_GUARD|time.guard|time guard", output, re.IGNORECASE)) skipped_conditions = planned - completed skipped_pct = len(skipped_conditions) / max(len(planned), 1) if skipped_pct > 0.5 and len(skipped_conditions) > 1: diag.deficiencies.append(Deficiency( type=DeficiencyType.TIME_GUARD_DOMINANT, severity="major", description=( f"Time guard killed {len(skipped_conditions)}/{len(planned)} conditions " f"({skipped_pct:.0%}). Too many conditions for the time budget." ), affected_conditions=sorted(skipped_conditions), suggested_fix=( f"Reduce from {len(planned)} to {min(5, len(planned))} conditions " f"(keep baseline + proposed + 1 ablation). " f"Reduce epochs by 50%. Reduce seeds from 3 to 2." 
), )) def _check_synthetic_data(diag: ExperimentDiagnosis, output: str) -> None: """Detect synthetic/fake data fallback.""" patterns = [ r"using synthetic data", r"synthetic.*?fallback", r"random.*?tokens", r"WARNING.*?load failed.*?using", ] for pat in patterns: if re.search(pat, output, re.IGNORECASE): diag.deficiencies.append(Deficiency( type=DeficiencyType.SYNTHETIC_DATA_FALLBACK, severity="critical", description="Experiment fell back to synthetic/random data instead of real dataset.", error_message=_extract_context(output, pat), suggested_fix=( "Fix dataset loading. Use a pre-cached dataset " "(CIFAR-10/MNIST are available at /opt/datasets). " "Ensure download happens in setup.py, not main.py." ), )) break def _check_dataset_issues(diag: ExperimentDiagnosis, output: str) -> None: """Detect dataset loading failures.""" patterns = [ (r"FileNotFoundError.*?(?:dataset|data|csv|json)", "Dataset file not found"), (r"No such file.*?(?:dataset|data|train|test)", "Dataset path does not exist"), # BUG-203: HuggingFace DatasetNotFoundError (e.g. cifar10_corrupted) (r"DatasetNotFoundError.*?doesn't exist", "HuggingFace dataset not found on Hub"), ] for pat, desc in patterns: if re.search(pat, output, re.IGNORECASE): diag.deficiencies.append(Deficiency( type=DeficiencyType.DATASET_UNAVAILABLE, severity="critical", description=desc, error_message=_extract_context(output, pat), suggested_fix=( "The dataset does not exist on HuggingFace Hub. " "Use ONLY pre-cached datasets: CIFAR-10, CIFAR-100, MNIST, " "FashionMNIST, STL-10 (available at /opt/datasets). " "Remove the failing download from setup.py and use " "torchvision.datasets with root='/opt/datasets' instead." ), )) def _check_code_crashes(diag: ExperimentDiagnosis, stderr: str, output: str) -> None: """Detect Python runtime crashes.""" # Look for tracebacks — use MULTILINE, not DOTALL, so each traceback # is matched independently (DOTALL would eat all tracebacks into one). 
    tb_pattern = re.compile(
        r"(?:Error|Exception):\s*(.+)$",
        re.MULTILINE,
    )
    # Deduplicate by message so a repeated crash yields one deficiency.
    seen_errors: set[str] = set()
    for m in tb_pattern.finditer(output):
        error_msg = m.group(1).strip()[:200]
        # Skip if already handled by more specific checks
        if "ModuleNotFoundError" in error_msg:
            continue
        if "PermissionError" in error_msg:
            continue
        if "CUDA out of memory" in error_msg:
            continue
        if "DatasetNotFoundError" in error_msg:
            continue
        if error_msg in seen_errors:
            continue
        seen_errors.add(error_msg)
        diag.deficiencies.append(Deficiency(
            type=DeficiencyType.CODE_CRASH,
            severity="major",
            description=f"Runtime error: {error_msg}",
            error_message=m.group(0)[:500],
            suggested_fix="Fix the code error. See traceback for details.",
        ))


def _check_hyperparams(diag: ExperimentDiagnosis, output: str, summary: dict) -> None:
    """Detect hyperparameter issues (diverging loss, NaN gradients).

    Note: ``summary`` is currently unused here; it is kept for signature
    parity with the other check functions called from diagnose_experiment.
    """
    # NaN in training — use word boundary to avoid matching "Shannan" etc.
    if re.search(r"loss.*?\bnan\b|\bnan\b.*?loss|gradient.*?\bnan\b", output, re.IGNORECASE):
        diag.deficiencies.append(Deficiency(
            type=DeficiencyType.HYPERPARAMETER_ISSUE,
            severity="major",
            description="NaN detected in loss or gradients — likely learning rate too high.",
            suggested_fix="Reduce learning rate by 10×. Add gradient clipping (max_norm=1.0).",
        ))
    # Diverging loss — parse each value individually so one malformed value
    # doesn't silence the entire check (EDGE-1 fix)
    loss_values = re.findall(r"loss[=:]\s*([\d.]+)", output, re.IGNORECASE)
    if loss_values:
        # Only the last 10 reported values matter for divergence.
        losses: list[float] = []
        for v in loss_values[-10:]:
            try:
                losses.append(float(v))
            except (ValueError, TypeError):
                continue
        if losses and any(l > 100 for l in losses):
            diag.deficiencies.append(Deficiency(
                type=DeficiencyType.HYPERPARAMETER_ISSUE,
                severity="major",
                description=f"Loss diverging (max={max(losses):.1f}). Training is unstable.",
                suggested_fix="Reduce learning rate. Add gradient clipping. Check data normalization.",
            ))


def _check_near_random_accuracy(diag: ExperimentDiagnosis, summary: dict) -> None:
    """BUG-204: Detect when all conditions produce near-random accuracy.

    If the metric name suggests accuracy/top-1 and the best value is below
    15%, the model likely isn't learning (wrong LR, broken forward pass, etc.).

    NOTE(review): the 0 < best_acc < 15.0 threshold assumes accuracies are
    reported on a 0-100 percent scale; a 0-1 fractional accuracy (e.g. 0.92)
    would be flagged as near-random — confirm the metrics convention.
    """
    ms = summary.get("metrics_summary", {})
    if not ms:
        return
    # Find accuracy-like metrics
    _ACC_KEYS = {"accuracy", "acc", "top1", "top1_accuracy", "val_acc", "test_acc"}
    best_acc: float | None = None
    acc_key: str = ""
    for key, val in ms.items():
        key_lower = key.lower().split("/")[-1]  # strip condition prefix
        if key_lower in _ACC_KEYS or "accuracy" in key_lower or "top1" in key_lower:
            # Summary entries may be aggregate dicts ({"max": ...}) or scalars.
            v = val.get("max", val) if isinstance(val, dict) else val
            try:
                fv = float(v)
            except (TypeError, ValueError):
                continue
            if best_acc is None or fv > best_acc:
                best_acc = fv
                acc_key = key
    if best_acc is not None and 0 < best_acc < 15.0:
        diag.deficiencies.append(Deficiency(
            type=DeficiencyType.HYPERPARAMETER_ISSUE,
            severity="critical",
            description=(
                f"Best accuracy is {best_acc:.1f}% ({acc_key}), near random chance. "
                f"The model is not learning."
            ),
            suggested_fix=(
                "Check: (1) Learning rate too high/low — try 0.001 for Adam, 0.1 for SGD. "
                "(2) Data preprocessing — normalize to [0,1] or ImageNet stats. "
                "(3) Forward pass — ensure loss backward reaches all parameters. "
                "(4) KD — ensure teacher is loaded with correct pretrained weights."
            ),
        ))


def _check_identical_conditions(diag: ExperimentDiagnosis, summary: dict) -> None:
    """Detect ablation conditions producing identical results."""
    warnings = summary.get("ablation_warnings", [])
    if warnings:
        # Pull the condition-name pairs out of each warning string.
        affected = []
        for w in warnings:
            m = re.search(r"Conditions '([^']+)' and '([^']+)'", w)
            if m:
                affected.extend([m.group(1), m.group(2)])
        diag.deficiencies.append(Deficiency(
            type=DeficiencyType.IDENTICAL_CONDITIONS,
            severity="major",
            description=(
                f"{len(warnings)} ablation pair(s) produce identical outputs.
" "The differentiating parameter is likely not wired into the code." ), affected_conditions=sorted(set(affected)), suggested_fix=( "Check that each ablation condition actually modifies the model/training. " "The condition parameter must affect the forward pass, not just be logged." ), )) def _check_insufficient_seeds(diag: ExperimentDiagnosis, summary: dict) -> None: """Check if completed conditions have too few seeds.""" metrics = summary.get("best_run", {}).get("metrics", {}) seed_pattern = re.compile(r"^(.+)/(\d+)/(.+)$") cond_seeds: dict[str, set[int]] = {} for key in metrics: m = seed_pattern.match(key) if m: cond_name, seed_str = m.group(1), m.group(2) cond_seeds.setdefault(cond_name, set()).add(int(seed_str)) single_seed_conds = [c for c, seeds in cond_seeds.items() if len(seeds) < 2] if single_seed_conds: diag.deficiencies.append(Deficiency( type=DeficiencyType.INSUFFICIENT_SEEDS, severity="minor", description=f"{len(single_seed_conds)} condition(s) have only 1 seed (no variance estimate).", affected_conditions=single_seed_conds, suggested_fix="Increase seeds to at least 2 per condition, or reduce epoch count to fit time budget.", )) # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _get_planned_conditions(plan: dict | None, summary: dict) -> set[str]: """Extract planned condition names from experiment plan or summary.""" if plan: conditions = plan.get("conditions", []) if isinstance(conditions, list): return {c.get("name", str(c)) if isinstance(c, dict) else str(c) for c in conditions} # Fallback: use condition_summaries from summary return set(summary.get("condition_summaries", {}).keys()) def _get_completed_conditions(summary: dict) -> set[str]: """Extract conditions that actually produced metrics.""" completed = set() cond_summaries = summary.get("condition_summaries", {}) for cond_name, data in cond_summaries.items(): metrics = 
data.get("metrics", {}) if metrics and any( isinstance(v, (int, float)) and math.isfinite(v) for v in metrics.values() ): completed.add(cond_name) return completed def _extract_stdout(summary: dict, ref_log: dict | None) -> str: """Extract combined stdout from experiment artifacts.""" parts: list[str] = [] # From best_run stdout = summary.get("best_run", {}).get("stdout", "") if stdout: parts.append(stdout) # From refinement log iterations if ref_log: for it in ref_log.get("iterations", []): for key in ("sandbox", "sandbox_after_fix"): sb = it.get(key, {}) if isinstance(sb, dict): out = sb.get("stdout", "") if out: parts.append(out) return "\n".join(parts) def _extract_stderr(summary: dict, ref_log: dict | None) -> str: """Extract combined stderr from experiment artifacts.""" parts: list[str] = [] stderr = summary.get("best_run", {}).get("stderr", "") if stderr: parts.append(stderr) if ref_log: for it in ref_log.get("iterations", []): for key in ("sandbox", "sandbox_after_fix"): sb = it.get(key, {}) if isinstance(sb, dict): err = sb.get("stderr", "") if err: parts.append(err) return "\n".join(parts) def _extract_context(text: str, pattern: str, context_chars: int = 200) -> str: """Extract surrounding context for an error pattern match.""" m = re.search(pattern, text, re.IGNORECASE) if not m: return "" start = max(0, m.start() - context_chars // 2) end = min(len(text), m.end() + context_chars // 2) return text[start:end].strip() def _assess_repairability(diag: ExperimentDiagnosis, prior: list[dict] | None) -> None: """Determine if the experiment can be repaired.""" if not diag.deficiencies: diag.repairable = True return # Count how many times we've tried to fix the same issues if prior: prior_types = set() for pd in prior: for d in pd.get("deficiencies", []): prior_types.add(d.get("type", "")) current_types = {d.type.value for d in diag.deficiencies} repeated = current_types & prior_types if len(repeated) >= 3: diag.repairable = False diag.reason = f"Same 
deficiencies recur after {len(prior)} repair cycles: {repeated}" return # All types are potentially repairable diag.repairable = True ================================================ FILE: researchclaw/pipeline/experiment_repair.py ================================================ """Experiment Repair Loop — diagnose, fix, and re-run experiments. Orchestrates the cycle: 1. Diagnose failures (``experiment_diagnosis.py``) 2. Generate fixes via OpenCode or LLM 3. Re-run experiment in sandbox/Docker 4. Re-assess quality 5. Repeat until sufficient or max cycles reached Integrates between Stage 14 (result_analysis) and Stage 15 (research_decision). """ from __future__ import annotations import json import logging import re import time as _time from dataclasses import dataclass, field from pathlib import Path from typing import Any from researchclaw.pipeline.experiment_diagnosis import ( DeficiencyType, ExperimentDiagnosis, ExperimentQualityAssessment, PaperMode, assess_experiment_quality, diagnose_experiment, ) logger = logging.getLogger(__name__) MAX_REPAIR_CYCLES = 3 # Regex for extracting ```python filename.py\n...\n``` blocks from LLM output _CODE_BLOCK_RE = re.compile( r"```(?:python)?\s*([\w./\\-]+\.(?:py|txt))\s*\n(.*?)```", re.DOTALL, ) # Fallback: unnamed python blocks _UNNAMED_BLOCK_RE = re.compile( r"```python\s*\n(.*?)```", re.DOTALL, ) @dataclass class RepairCycleResult: """Result of one repair cycle.""" cycle: int diagnosis: ExperimentDiagnosis repair_applied: bool = False repair_description: str = "" new_assessment: ExperimentQualityAssessment | None = None error: str = "" @dataclass class ExperimentRepairResult: """Final result of the entire repair loop.""" success: bool # True if experiment is now sufficient for full_paper total_cycles: int final_mode: PaperMode final_assessment: ExperimentQualityAssessment | None = None cycle_history: list[RepairCycleResult] = field(default_factory=list) best_experiment_summary: dict | None = None def to_dict(self) -> 
# ---------------------------------------------------------------------------
# Repair prompt generation
# ---------------------------------------------------------------------------


def build_repair_prompt(
    diagnosis: ExperimentDiagnosis,
    original_code: dict[str, str],
    experiment_plan: dict | None = None,
    time_budget_sec: int = 2400,
) -> str:
    """Build a structured repair prompt for OpenCode or LLM.

    Parameters
    ----------
    diagnosis:
        The structured diagnosis from ``diagnose_experiment()``.
    original_code:
        Mapping of filename → source code for the current experiment.
    experiment_plan:
        The experiment design plan (not consumed here; kept for interface
        stability).
    time_budget_sec:
        Available time budget for the experiment.

    Returns
    -------
    str
        A formatted prompt suitable for OpenCode or code-generation LLM.
    """
    sections: list[str] = []
    sections.append("# EXPERIMENT REPAIR TASK\n")
    sections.append(
        "The previous experiment run had failures. Your job is to fix "
        "the specific issues identified below. Do NOT rewrite from scratch — "
        "fix ONLY the identified problems.\n"
    )

    # Diagnosis section
    sections.append(diagnosis.to_repair_prompt())

    # Scope reduction guidance
    if any(d.type == DeficiencyType.TIME_GUARD_DOMINANT for d in diagnosis.deficiencies):
        n_planned = diagnosis.total_planned
        n_completed = len(diagnosis.conditions_completed)
        max_conditions = max(3, n_completed + 1)
        sections.append(
            f"\n## SCOPE REDUCTION REQUIRED\n"
            f"The experiment had {n_planned} conditions but only {n_completed} "
            f"completed within the time budget of {time_budget_sec}s.\n"
            f"**Reduce to at most {max_conditions} conditions:**\n"
            f"1. Keep the BASELINE condition (no modification)\n"
            f"2. Keep the PROPOSED method (paper's main contribution)\n"
            f"3. Keep 1 ablation (remove most impactful component)\n"
            f"4. Remove all other conditions\n"
            f"5. Reduce epochs by 30-50% if still tight on time\n"
            f"6. Reduce seeds from 3 to 2 if needed\n"
        )

    # Dependency fixes
    dep_issues = [d for d in diagnosis.deficiencies if d.type == DeficiencyType.MISSING_DEPENDENCY]
    if dep_issues:
        sections.append("\n## DEPENDENCY FIXES\n")
        sections.append("Add these to requirements.txt:\n")
        for d in dep_issues:
            # Extract package name from description
            sections.append(f"- {d.description}")

    # Original code
    sections.append("\n## CURRENT CODE (fix in-place)\n")
    for filename, content in sorted(original_code.items()):
        # Truncate very long files
        if len(content) > 5000:
            content = content[:5000] + "\n... (truncated)"
        # FIX: the header previously emitted a literal placeholder instead of
        # the actual filename, leaving the loop variable unused and making it
        # impossible for the repair agent to tell the files apart.
        sections.append(f"### {filename}\n```python\n{content}\n```\n")

    # Constraints
    sections.append(
        f"\n## CONSTRAINTS\n"
        f"- Time budget: {time_budget_sec} seconds total\n"
        f"- Pre-cached datasets: CIFAR-10, CIFAR-100, MNIST, FashionMNIST, STL-10 at /opt/datasets\n"
        f"- Every condition MUST output: condition=CONDNAME metric=VALUE\n"
        f"- The code must run without errors for at least 1 seed per condition\n"
    )

    # Output format instruction
    sections.append(
        "\n## OUTPUT FORMAT\n"
        "Output each fixed file using this format:\n"
        "```python filename.py\n"
        "\n"
        "```\n"
        "Include ALL files (main.py, requirements.txt, setup.py if needed).\n"
        "For requirements.txt, use:\n"
        "```python requirements.txt\n"
        "\n"
        "```\n"
    )

    return "\n".join(sections)


# ---------------------------------------------------------------------------
# Best results selection
# ---------------------------------------------------------------------------


def select_best_results(
    run_dir: Path,
    cycle_history: list[RepairCycleResult],
) -> dict | None:
    """Select the best experiment_summary across all repair cycles.

    Looks for experiment_summary.json files in versioned stage directories
    and returns the one with the best primary metric / most conditions.
    ``cycle_history`` is accepted for interface stability but the selection
    is driven purely by the on-disk summaries.

    Returns None if no valid summary found.
    """
    candidates: list[tuple[float, int, dict]] = []

    # Check main stage-14
    main_summary = _try_load_summary(run_dir / "stage-14" / "experiment_summary.json")
    if main_summary:
        score = _summary_quality_score(main_summary)
        candidates.append((score, 0, main_summary))

    # Check repair versions
    for i in range(1, MAX_REPAIR_CYCLES + 1):
        path = run_dir / f"stage-14_repair_v{i}" / "experiment_summary.json"
        summary = _try_load_summary(path)
        if summary:
            score = _summary_quality_score(summary)
            candidates.append((score, i, summary))

    if not candidates:
        return None

    # Sort by quality score (descending)
    candidates.sort(key=lambda x: x[0], reverse=True)
    best_score, best_cycle, best_summary = candidates[0]
    logger.info(
        "Best experiment results from cycle %d (score=%.2f)", best_cycle, best_score
    )
    return best_summary
Looks for experiment_summary.json files in versioned stage directories and returns the one with the best primary metric / most conditions. Returns None if no valid summary found. """ candidates: list[tuple[float, int, dict]] = [] # Check main stage-14 main_summary = _try_load_summary(run_dir / "stage-14" / "experiment_summary.json") if main_summary: score = _summary_quality_score(main_summary) candidates.append((score, 0, main_summary)) # Check repair versions for i in range(1, MAX_REPAIR_CYCLES + 1): path = run_dir / f"stage-14_repair_v{i}" / "experiment_summary.json" summary = _try_load_summary(path) if summary: score = _summary_quality_score(summary) candidates.append((score, i, summary)) if not candidates: return None # Sort by quality score (descending) candidates.sort(key=lambda x: x[0], reverse=True) best_score, best_cycle, best_summary = candidates[0] logger.info( "Best experiment results from cycle %d (score=%.2f)", best_cycle, best_score ) return best_summary def _try_load_summary(path: Path) -> dict | None: """Try to load and parse an experiment_summary.json.""" if not path.exists(): return None try: return json.loads(path.read_text(encoding="utf-8")) except (json.JSONDecodeError, OSError): return None def _summary_quality_score(summary: dict) -> float: """Compute a simple quality score for ranking summaries. Higher = better. 
Considers: - Number of completed conditions (×10) - Whether primary_metric is non-NaN (×5) - Number of metric keys (×1) """ import math score = 0.0 n_conditions = len(summary.get("condition_summaries", {})) score += n_conditions * 10.0 pm = summary.get("best_run", {}).get("metrics", {}).get("primary_metric") if isinstance(pm, (int, float)) and math.isfinite(pm): score += 5.0 n_keys = summary.get("total_metric_keys", 0) score += n_keys * 1.0 return score # --------------------------------------------------------------------------- # Full repair loop # --------------------------------------------------------------------------- def run_repair_loop( run_dir: Path, config: Any, run_id: str = "", ) -> ExperimentRepairResult: """Execute the full experiment repair loop. After Stage 14 diagnosis finds quality issues: 1. Load current experiment code 2. For each cycle: diagnose → LLM/OpenCode fix → re-run in sandbox → re-assess 3. Select best results across all cycles 4. Return structured result Parameters ---------- run_dir: Path to the pipeline run directory (contains stage-* subdirs). config: RCConfig instance with experiment and LLM settings. run_id: Pipeline run ID for logging. 
Returns ------- ExperimentRepairResult """ repair_cfg = config.experiment.repair # Load initial experiment summary summary = _load_experiment_summary(run_dir) if not summary: logger.warning("[%s] Repair loop: no experiment_summary.json found", run_id) return ExperimentRepairResult( success=False, total_cycles=0, final_mode=PaperMode.TECHNICAL_REPORT, ) # Initial quality assessment — pass user-configured thresholds ref_log = _load_refinement_log(run_dir) _min_cond = getattr(repair_cfg, "min_conditions", 3) qa = assess_experiment_quality(summary, ref_log, min_conditions=_min_cond) if qa.sufficient: logger.info("[%s] Repair loop: experiment already sufficient (%s)", run_id, qa.mode.value) return ExperimentRepairResult( success=True, total_cycles=0, final_mode=qa.mode, final_assessment=qa, best_experiment_summary=summary, ) # Load experiment code code = _load_experiment_code(run_dir) if not code: logger.warning("[%s] Repair loop: no experiment code found", run_id) return ExperimentRepairResult( success=False, total_cycles=0, final_mode=qa.mode, ) # Collect stdout/stderr for diagnosis stdout, stderr = _collect_experiment_output(run_dir) # Load experiment plan plan = _load_experiment_plan(run_dir) # Create LLM client try: from researchclaw.llm import create_llm_client llm = create_llm_client(config) except Exception as exc: logger.error("[%s] Repair loop: cannot create LLM client: %s", run_id, exc) return ExperimentRepairResult( success=False, total_cycles=0, final_mode=qa.mode, ) cycle_history: list[RepairCycleResult] = [] best_summary = summary best_mode = qa.mode best_updated = False max_cycles = min(repair_cfg.max_cycles, MAX_REPAIR_CYCLES) loop_start = _time.monotonic() prior_diagnoses: list[dict] = [] for cycle in range(1, max_cycles + 1): logger.info("[%s] Repair cycle %d/%d starting...", run_id, cycle, max_cycles) print(f"[{run_id}] Repair cycle {cycle}/{max_cycles}...") # 1. 
Diagnose current state diag = diagnose_experiment( experiment_summary=summary, experiment_plan=plan, refinement_log=ref_log, stdout=stdout, stderr=stderr, prior_diagnoses=prior_diagnoses or None, ) prior_diagnoses.append(diag.to_dict() if hasattr(diag, "to_dict") else {}) # 2. Build repair prompt repair_prompt = build_repair_prompt( diag, code, experiment_plan=plan, time_budget_sec=config.experiment.time_budget_sec, ) # 3. Get fixed code via LLM (with OpenCode fallback) fixed_code = _get_repaired_code( repair_prompt, code, llm, config, run_dir, cycle, ) if not fixed_code: cycle_result = RepairCycleResult( cycle=cycle, diagnosis=diag, repair_applied=False, error="Failed to generate repaired code", ) cycle_history.append(cycle_result) logger.warning("[%s] Repair cycle %d: code generation failed", run_id, cycle) break # 4. Save fixed code to versioned directory repair_dir = run_dir / f"stage-14_repair_v{cycle}" repair_dir.mkdir(parents=True, exist_ok=True) exp_dir = repair_dir / "experiment" exp_dir.mkdir(parents=True, exist_ok=True) for fname, content in fixed_code.items(): (exp_dir / fname).write_text(content, encoding="utf-8") logger.info( "[%s] Repair cycle %d: saved %d files to %s", run_id, cycle, len(fixed_code), exp_dir, ) # 5. Re-run experiment in sandbox sandbox_result = _run_experiment_in_sandbox( exp_dir, config, repair_dir, timeout_sec=repair_cfg.timeout_sec_per_cycle, ) if sandbox_result is None: cycle_result = RepairCycleResult( cycle=cycle, diagnosis=diag, repair_applied=True, repair_description=f"Fixed {len(fixed_code)} files", error="Sandbox execution failed", ) cycle_history.append(cycle_result) logger.warning("[%s] Repair cycle %d: sandbox execution failed", run_id, cycle) continue # 6. Build new experiment summary from sandbox results new_summary = _build_experiment_summary_from_run(sandbox_result, fixed_code) (repair_dir / "experiment_summary.json").write_text( json.dumps(new_summary, indent=2), encoding="utf-8" ) # 7. 
Re-assess quality new_qa = assess_experiment_quality(new_summary, min_conditions=_min_cond) new_score = _summary_quality_score(new_summary) old_score = _summary_quality_score(best_summary) cycle_result = RepairCycleResult( cycle=cycle, diagnosis=diag, repair_applied=True, repair_description=( f"Fixed {len(fixed_code)} files; " f"score {old_score:.1f} → {new_score:.1f}; " f"mode: {new_qa.mode.value}" ), new_assessment=new_qa, ) cycle_history.append(cycle_result) # Track best if new_score > _summary_quality_score(best_summary): best_summary = new_summary best_mode = new_qa.mode best_updated = True logger.info( "[%s] Repair cycle %d: score %.1f → %.1f, mode=%s, sufficient=%s", run_id, cycle, old_score, new_score, new_qa.mode.value, new_qa.sufficient, ) print( f"[{run_id}] Repair cycle {cycle}: " f"score {old_score:.1f} → {new_score:.1f}, " f"mode={new_qa.mode.value}" ) if new_qa.sufficient: logger.info("[%s] Repair successful after %d cycles!", run_id, cycle) print(f"[{run_id}] Experiment repair successful! 
Mode: {new_qa.mode.value}") return ExperimentRepairResult( success=True, total_cycles=cycle, final_mode=new_qa.mode, final_assessment=new_qa, cycle_history=cycle_history, best_experiment_summary=best_summary, ) # Update for next cycle code = fixed_code summary = new_summary stdout = sandbox_result.get("stdout", "") stderr = sandbox_result.get("stderr", "") # Exhausted all cycles — use best available elapsed = _time.monotonic() - loop_start logger.info( "[%s] Repair loop completed: %d cycles in %.1fs, best mode=%s", run_id, len(cycle_history), elapsed, best_mode.value, ) # Promote best summary only if a repair cycle actually improved it if best_updated and best_summary is not summary: best_path = run_dir / "experiment_summary_best.json" best_path.write_text(json.dumps(best_summary, indent=2), encoding="utf-8") return ExperimentRepairResult( success=False, total_cycles=len(cycle_history), final_mode=best_mode, cycle_history=cycle_history, best_experiment_summary=best_summary, ) # --------------------------------------------------------------------------- # Helper: load experiment artifacts # --------------------------------------------------------------------------- def _load_experiment_summary(run_dir: Path) -> dict | None: """Load the most recent experiment_summary.json.""" for candidate in sorted(run_dir.glob("stage-14*/experiment_summary.json"), reverse=True): try: return json.loads(candidate.read_text(encoding="utf-8")) except (json.JSONDecodeError, OSError): continue return None def _load_refinement_log(run_dir: Path) -> dict | None: """Load the most recent refinement_log.json.""" for candidate in sorted(run_dir.glob("stage-13*/refinement_log.json"), reverse=True): try: return json.loads(candidate.read_text(encoding="utf-8")) except (json.JSONDecodeError, OSError): continue return None def _load_experiment_code(run_dir: Path) -> dict[str, str]: """Load experiment code from the most recent stage directory. 
Prefers: stage-13/experiment_final/ → stage-10/experiment/ → stage-10/*.py """ code: dict[str, str] = {} # Try refined code first for refine_dir in sorted(run_dir.glob("stage-13*/experiment_final"), reverse=True): if refine_dir.is_dir(): for py_file in sorted(refine_dir.glob("*.py")): try: code[py_file.name] = py_file.read_text(encoding="utf-8") except (OSError, UnicodeDecodeError): pass # Also grab requirements.txt, setup.py for extra in ("requirements.txt", "setup.py"): extra_path = refine_dir / extra if extra_path.exists(): try: code[extra] = extra_path.read_text(encoding="utf-8") except (OSError, UnicodeDecodeError): pass if code: return code # Fall back to stage-10 experiment directory for exp_dir in sorted(run_dir.glob("stage-10*/experiment"), reverse=True): if exp_dir.is_dir(): for py_file in sorted(exp_dir.glob("*.py")): try: code[py_file.name] = py_file.read_text(encoding="utf-8") except (OSError, UnicodeDecodeError): pass for extra in ("requirements.txt", "setup.py"): extra_path = exp_dir / extra if extra_path.exists(): try: code[extra] = extra_path.read_text(encoding="utf-8") except (OSError, UnicodeDecodeError): pass if code: return code # Last resort: any .py files in stage-10* for stage_dir in sorted(run_dir.glob("stage-10*"), reverse=True): for py_file in sorted(stage_dir.glob("*.py")): try: code[py_file.name] = py_file.read_text(encoding="utf-8") except (OSError, UnicodeDecodeError): pass if code: return code return code def _load_experiment_plan(run_dir: Path) -> dict | None: """Load experiment plan from stage-09.""" for candidate in sorted(run_dir.glob("stage-09*/experiment_design.json"), reverse=True): try: return json.loads(candidate.read_text(encoding="utf-8")) except (json.JSONDecodeError, OSError): continue return None def _collect_experiment_output(run_dir: Path) -> tuple[str, str]: """Collect stdout/stderr from experiment runs.""" stdout_parts: list[str] = [] stderr_parts: list[str] = [] for stage_dir in sorted(run_dir.glob("stage-14*")): 
runs_dir = stage_dir / "runs" if not runs_dir.is_dir(): continue for run_file in sorted(runs_dir.glob("*.json"))[:5]: try: data = json.loads(run_file.read_text(encoding="utf-8")) if isinstance(data, dict): stdout_parts.append(data.get("stdout", "")) stderr_parts.append(data.get("stderr", "")) except (json.JSONDecodeError, OSError): continue return "\n".join(stdout_parts).strip(), "\n".join(stderr_parts).strip() # --------------------------------------------------------------------------- # Helper: get repaired code from LLM or OpenCode # --------------------------------------------------------------------------- def _get_repaired_code( repair_prompt: str, current_code: dict[str, str], llm: Any, config: Any, run_dir: Path, cycle: int, ) -> dict[str, str] | None: """Get repaired code via OpenCode (if available) or LLM fallback. Returns merged code dict (current + repaired files) or None on failure. """ repair_cfg = config.experiment.repair # Try OpenCode first if enabled if repair_cfg.use_opencode and config.experiment.opencode.enabled: result = _repair_via_opencode(repair_prompt, current_code, config, run_dir, cycle) if result: return result logger.info("OpenCode repair unavailable, falling back to LLM") # LLM repair return _repair_via_llm(repair_prompt, current_code, llm) def _repair_via_opencode( repair_prompt: str, current_code: dict[str, str], config: Any, run_dir: Path, cycle: int, ) -> dict[str, str] | None: """Attempt repair via OpenCode agent.""" try: from researchclaw.pipeline.opencode_bridge import OpenCodeBridge _oc_cfg = config.experiment.opencode bridge = OpenCodeBridge( model=getattr(_oc_cfg, "model", "") or "", llm_base_url=getattr(config.llm, "base_url", "") or "", api_key_env=getattr(config.llm, "api_key_env", "") or "", llm_provider=getattr(config.llm, "provider", "openai-compatible") or "openai-compatible", timeout_sec=getattr(_oc_cfg, "timeout_sec", 600), max_retries=getattr(_oc_cfg, "max_retries", 1), workspace_cleanup=getattr(_oc_cfg, 
"workspace_cleanup", True), ) workspace = run_dir / f"_repair_opencode_v{cycle}" workspace.mkdir(parents=True, exist_ok=True) result = bridge.generate( stage_dir=workspace, topic="experiment repair", exp_plan=repair_prompt, metric=getattr(config.experiment, "metric_key", "primary_metric"), time_budget_sec=getattr(config.experiment, "time_budget_sec", 2400), ) if result.success and result.files: # Merge with current code merged = dict(current_code) merged.update(result.files) logger.info( "OpenCode repair: %d files generated (%d total after merge)", len(result.files), len(merged), ) return merged except Exception as exc: logger.warning("OpenCode repair failed: %s", exc) return None def _repair_via_llm( repair_prompt: str, current_code: dict[str, str], llm: Any, ) -> dict[str, str] | None: """Repair experiment code via LLM chat.""" system = ( "You are an expert experiment repair assistant. " "Fix the experiment code based on the diagnosis below. " "Output ONLY the fixed files. For each file, use this exact format:\n\n" "```python filename.py\n" "\n" "```\n\n" "Include ALL files that need changes (main.py, requirements.txt, etc.). " "Output the COMPLETE file content, not just the changed parts." 
) try: resp = llm.chat( [{"role": "user", "content": repair_prompt}], system=system, ) content = resp.content except Exception as exc: logger.warning("LLM repair call failed: %s", exc) return None if not content or not content.strip(): logger.warning("LLM repair returned empty response") return None # Extract code blocks from response files = _extract_code_blocks(content) if not files: logger.warning("LLM repair: no code blocks found in response") return None # Merge with current code (only update files that were fixed) merged = dict(current_code) merged.update(files) logger.info( "LLM repair: extracted %d files (%d total after merge)", len(files), len(merged), ) return merged def _extract_code_blocks(text: str) -> dict[str, str]: """Extract named code blocks from LLM response. Matches patterns like: ```python main.py ``` """ files: dict[str, str] = {} # Try named blocks first for match in _CODE_BLOCK_RE.finditer(text): fname = match.group(1).strip() code = match.group(2).strip() if fname and code: # Normalize filename — strip path prefixes fname = Path(fname).name files[fname] = code # If no named blocks, try unnamed and assume main.py if not files: for match in _UNNAMED_BLOCK_RE.finditer(text): code = match.group(1).strip() if code and len(code) > 50: # Skip tiny snippets files["main.py"] = code break return files # --------------------------------------------------------------------------- # Helper: run experiment in sandbox # --------------------------------------------------------------------------- def _run_experiment_in_sandbox( exp_dir: Path, config: Any, work_dir: Path, timeout_sec: int = 600, ) -> dict | None: """Run experiment code in Docker/sandbox and return results dict. Returns a dict with keys: stdout, stderr, returncode, metrics, elapsed_sec, timed_out. Returns None if sandbox creation fails. 
""" try: from researchclaw.experiment.factory import create_sandbox sandbox_dir = work_dir / "sandbox" sandbox_dir.mkdir(parents=True, exist_ok=True) sandbox = create_sandbox(config.experiment, sandbox_dir) result = sandbox.run_project( exp_dir, timeout_sec=timeout_sec, ) return { "stdout": result.stdout, "stderr": result.stderr, "returncode": result.returncode, "metrics": dict(result.metrics) if result.metrics else {}, "elapsed_sec": result.elapsed_sec, "timed_out": result.timed_out, } except Exception as exc: logger.warning("Sandbox execution failed: %s", exc) return None def _build_experiment_summary_from_run( run_result: dict, code: dict[str, str], ) -> dict: """Build an experiment_summary.json from a single sandbox run. Parses condition-level metrics from stdout and builds the standard summary format expected by ``assess_experiment_quality()``. """ metrics = run_result.get("metrics", {}) stdout = run_result.get("stdout", "") # Also parse metrics from stdout if sandbox didn't capture them if not metrics and stdout: try: from researchclaw.experiment.sandbox import parse_metrics metrics = parse_metrics(stdout) except ImportError: pass # Group metrics by condition condition_summaries: dict[str, dict] = {} for key, value in metrics.items(): if not isinstance(value, (int, float)): continue parts = key.split("/") if len(parts) >= 3: # Format: condition_name/seed/metric_name cond_name = parts[0] metric_name = parts[-1] if cond_name not in condition_summaries: condition_summaries[cond_name] = {"metrics": {}, "seeds": {}} condition_summaries[cond_name]["metrics"][metric_name] = value seed_key = "/".join(parts[1:-1]) condition_summaries[cond_name]["seeds"].setdefault(seed_key, {})[metric_name] = value elif len(parts) == 2: # BUG-199: Stage 13 refinement produces 2-part keys # (condition_name/metric_name) without a seed component. # Treat as a single-seed result. 
================================================
FILE: researchclaw/pipeline/opencode_bridge.py
================================================
"""OpenCode 'Beast Mode' bridge — routes complex code generation to OpenCode CLI.

OpenCode (https://github.com/anomalyco/opencode) is an external AI coding agent
invoked via ``opencode run --format json "prompt"``. This module provides:

1. **ComplexityScore / score_complexity()** — analyses an experiment plan to
   decide whether beast mode is warranted.
2. **OpenCodeBridge** — manages workspace creation, OpenCode invocation,
   file collection, and cleanup.
"""

from __future__ import annotations

import ast
import json
import logging
import os
import re
import shutil
import subprocess
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Complexity scoring
# ---------------------------------------------------------------------------

# Keywords that indicate multi-component architectures
_COMPONENT_KEYWORDS: tuple[str, ...] = (
    "encoder", "decoder", "discriminator", "generator", "critic", "actor",
    "teacher", "student", "backbone", "head", "neck", "classifier",
    "embedder", "attention", "transformer", "tokenizer", "vae", "autoencoder",
)

# Indicators that multi-file generation is needed
_FILE_HINT_KEYWORDS: tuple[str, ...] = (
    "model.py", "trainer.py", "dataset.py", "utils.py", "config.py",
    "multiple files", "modular", "separate module", "multi-file",
)

# Domain-complexity keywords
_DOMAIN_COMPLEX_KEYWORDS: tuple[str, ...] = (
    "multi-modal", "multimodal", "distributed", "gan", "diffusion", "nerf",
    "mixture of experts", "moe", "meta-learning", "meta learning", "maml",
    "neural ode", "neural sde", "physics-informed", "pinn", "graph neural",
    "gnn", "reinforcement learning", "multi-agent", "world model",
    "vision-language", "text-to-image", "image-to-text",
)

# Patterns suggesting deep dependency chains
_DEPENDENCY_KEYWORDS: tuple[str, ...] = (
    "custom layer", "custom loss", "wrapper", "registry", "hook", "callback",
    "scheduler", "custom optimizer", "custom dataset", "custom sampler",
    "custom transform",
)
= ( "custom layer", "custom loss", "wrapper", "registry", "hook", "callback", "scheduler", "custom optimizer", "custom dataset", "custom sampler", "custom transform", ) @dataclass class ComplexityScore: """Result of complexity analysis on an experiment plan.""" score: float # 0.0-1.0 signals: dict[str, float] = field(default_factory=dict) recommendation: str = "" # "beast_mode" | "code_agent" | "legacy" reason: str = "" def _count_keyword_hits(text: str, keywords: tuple[str, ...]) -> int: text_lower = text.lower() return sum(1 for kw in keywords if kw in text_lower) def score_complexity( exp_plan: str, topic: str = "", *, historical_failures: int = 0, threshold: float = 0.6, ) -> ComplexityScore: """Score the complexity of an experiment to determine if beast mode is warranted. Returns a ComplexityScore with score in [0.0, 1.0]. """ if not exp_plan and not topic: return ComplexityScore( score=0.0, signals={}, recommendation="legacy", reason="Empty plan", ) combined = f"{topic}\n{exp_plan}" # Signal 1: Component count (weight 0.25) comp_hits = _count_keyword_hits(combined, _COMPONENT_KEYWORDS) component_score = min(comp_hits / 5.0, 1.0) # Signal 2: File count hint (weight 0.20) file_hits = _count_keyword_hits(combined, _FILE_HINT_KEYWORDS) file_score = min(file_hits / 3.0, 1.0) # Signal 3: Domain complexity (weight 0.20) domain_hits = _count_keyword_hits(combined, _DOMAIN_COMPLEX_KEYWORDS) domain_score = min(domain_hits / 3.0, 1.0) # Signal 4: Condition count (weight 0.15) # Look for numbered conditions, ablation mentions, variant mentions condition_pattern = re.compile( r"(?:condition|ablation|variant|experiment)\s*[\-_:]?\s*\d+", re.IGNORECASE, ) condition_matches = len(condition_pattern.findall(combined)) # Also count bullet points in conditions/ablations sections condition_matches += combined.lower().count("baseline") condition_score = min(condition_matches / 8.0, 1.0) # Signal 5: Historical failures (weight 0.10) failure_score = min(historical_failures / 3.0, 
1.0) # Signal 6: Dependency depth (weight 0.10) dep_hits = _count_keyword_hits(combined, _DEPENDENCY_KEYWORDS) dep_score = min(dep_hits / 3.0, 1.0) # Weighted sum weighted = ( 0.25 * component_score + 0.20 * file_score + 0.20 * domain_score + 0.15 * condition_score + 0.10 * failure_score + 0.10 * dep_score ) final_score = min(max(weighted, 0.0), 1.0) signals = { "component_count": round(component_score, 3), "file_count_hint": round(file_score, 3), "domain_complexity": round(domain_score, 3), "condition_count": round(condition_score, 3), "historical_failure": round(failure_score, 3), "dependency_depth": round(dep_score, 3), } if final_score >= threshold: recommendation = "beast_mode" reason = ( f"Complexity {final_score:.2f} >= threshold {threshold:.2f}: " f"top signals: " + ", ".join( f"{k}={v:.2f}" for k, v in sorted(signals.items(), key=lambda x: -x[1])[:3] ) ) else: recommendation = "code_agent" reason = f"Complexity {final_score:.2f} < threshold {threshold:.2f}" return ComplexityScore( score=round(final_score, 4), signals=signals, recommendation=recommendation, reason=reason, ) # --------------------------------------------------------------------------- # OpenCode bridge # --------------------------------------------------------------------------- @dataclass class OpenCodeResult: """Result from an OpenCode invocation.""" success: bool files: dict[str, str] = field(default_factory=dict) opencode_log: str = "" elapsed_sec: float = 0.0 error: str = "" _MEGA_PROMPT_TEMPLATE = """\ You are implementing a complete, runnable ML/science experiment. Read the files in the current workspace: - EXPERIMENT_PLAN.yaml — the full experiment design - GUIDANCE.md — topic, metric, environment constraints, domain-specific guidance Your task: 1. Design the file structure (main.py is the required entry point). 2. Implement ALL files with complete, runnable code. No placeholders or TODOs. 3. main.py must be the entry point and print the primary metric as: {metric}: 4. 
Include numerical stability guards (gradient clipping, NaN detection, etc.). 5. Use multi-seed evaluation (seeds 0, 1, 2) and report mean ± std. 6. Each ablation/condition MUST be genuinely different — not copy-paste with a renamed variable. 7. Implement a time guard: stop gracefully at 80% of the time budget ({time_budget_sec} seconds). 8. Write requirements.txt listing any extra pip packages needed. 9. If the experiment needs dataset downloads, write a setup.py that handles them. IMPORTANT CONSTRAINTS: - The code will run in an isolated Docker container with PyTorch, torchvision, and common ML packages pre-installed. - Do NOT use argparse or CLI arguments — hardcode all configuration. - All output must go to stdout (print statements). - Keep the experiment feasible within {time_budget_sec} seconds total. """ class OpenCodeBridge: """Manages OpenCode CLI invocations for beast mode code generation.""" def __init__( self, *, model: str = "", llm_base_url: str = "", api_key_env: str = "", llm_provider: str = "openai-compatible", timeout_sec: int = 600, max_retries: int = 1, workspace_cleanup: bool = True, ) -> None: self._model = model self._llm_base_url = llm_base_url self._api_key_env = api_key_env self._llm_provider = llm_provider self._timeout_sec = timeout_sec self._max_retries = max_retries self._workspace_cleanup = workspace_cleanup # -- availability check --------------------------------------------------- @staticmethod def check_available() -> bool: """Return True if the ``opencode`` CLI is installed and callable.""" opencode_cmd = shutil.which("opencode") if not opencode_cmd: return False try: result = subprocess.run( [opencode_cmd, "--version"], capture_output=True, text=True, timeout=15, ) return result.returncode == 0 except FileNotFoundError: return False except subprocess.TimeoutExpired: return False except Exception: # noqa: BLE001 return False # -- workspace preparation ------------------------------------------------ def _prepare_workspace( self, 
stage_dir: Path, topic: str, exp_plan: str, metric: str, pkg_hint: str, extra_guidance: str, time_budget_sec: int, ) -> Path: """Create a temporary workspace directory with context files.""" ws = stage_dir / f"opencode_beast_{int(time.time())}_{time.monotonic_ns() % 100000}" ws.mkdir(parents=True, exist_ok=True) # Write experiment plan (ws / "EXPERIMENT_PLAN.yaml").write_text( exp_plan or "# No experiment plan provided\n", encoding="utf-8", ) # Write guidance document guidance_parts = [ f"# Experiment Guidance\n", f"## Topic\n{topic}\n", f"## Primary Metric\n{metric}\n", f"## Time Budget\n{time_budget_sec} seconds\n", ] if pkg_hint: guidance_parts.append(f"## Environment\n{pkg_hint}\n") if extra_guidance: guidance_parts.append(f"## Additional Guidance\n{extra_guidance}\n") (ws / "GUIDANCE.md").write_text( "\n".join(guidance_parts), encoding="utf-8", ) # Write opencode.json config opencode_cfg = self._build_opencode_config() (ws / "opencode.json").write_text( json.dumps(opencode_cfg, indent=2), encoding="utf-8", ) # OpenCode requires a git repository — initialise one with # a single commit so that ``opencode run`` doesn't hang. # BUG-OB-01/OB-02: Check return codes and catch TimeoutExpired. 
try: r = subprocess.run( ["git", "init"], cwd=str(ws), capture_output=True, timeout=10, ) if r.returncode != 0: raise OSError(f"git init failed: {r.stderr}") subprocess.run( ["git", "add", "-A"], cwd=str(ws), capture_output=True, timeout=10, ) subprocess.run( ["git", "-c", "user.email=beast@researchclaw", "-c", "user.name=BeastMode", "commit", "-m", "init workspace"], cwd=str(ws), capture_output=True, timeout=10, ) except subprocess.TimeoutExpired as exc: raise OSError(f"git workspace init timed out: {exc}") from exc return ws def _is_azure(self) -> bool: """Detect Azure OpenAI from base URL or provider string.""" return ( "azure" in (self._llm_base_url or "").lower() or "azure" in (self._llm_provider or "").lower() ) def _build_opencode_config(self) -> dict[str, Any]: """Build the opencode.json configuration. Always uses the "openai" provider — this works for both standard OpenAI endpoints and Azure OpenAI (which accepts Bearer token auth on the ``/openai/v1`` path and now supports the Responses API). """ cfg: dict[str, Any] = { "$schema": "https://opencode.ai/config.json", } if self._llm_base_url: if self._model: cfg["model"] = ( self._model if "/" in self._model else f"openai/{self._model}" ) cfg["provider"] = { "openai": { "options": { "baseURL": self._llm_base_url, "apiKey": f"{{env:{self._api_key_env}}}" if self._api_key_env else "", }, "models": {}, } } # Register the model so OpenCode knows it exists if self._model: model_name = self._model.split("/")[-1] cfg["provider"]["openai"]["models"] = { model_name: { "name": model_name, "modalities": { "input": ["text"], "output": ["text"], }, } } elif self._model: cfg["model"] = ( self._model if "/" in self._model else f"openai/{self._model}" ) return cfg # -- model resolution ------------------------------------------------------- def _resolve_opencode_model(self) -> str: """Resolve the model identifier for OpenCode CLI's ``-m`` flag. Resolution order: 1. If model already contains "/" (e.g. 
"anthropic/claude-sonnet-4-6") → use as-is 2. Otherwise → "openai/{model}" (works for both Azure and standard OpenAI) Note: Azure AI Services now supports the Responses API with Bearer token auth via the OpenAI-compatible endpoint, so we use the "openai" provider universally — no Anthropic fallback needed. """ if not self._model: return "anthropic/claude-sonnet-4-6" if "/" in self._model: return self._model return f"openai/{self._model}" # -- invocation ------------------------------------------------------------ def _invoke_opencode( self, workspace: Path, prompt: str, ) -> tuple[bool, str, float]: """Run ``opencode run`` in the workspace. Returns (success, log, elapsed).""" env = os.environ.copy() # Pass API key via environment if configured if self._api_key_env: api_key = os.environ.get(self._api_key_env, "") if api_key: # We always use the "openai" provider for OpenCode now, # which reads OPENAI_API_KEY (works for Azure too via # Bearer token auth on the OpenAI-compatible endpoint). 
env["OPENAI_API_KEY"] = api_key # Use -m flag to specify model (more reliable than opencode.json) resolved_model = self._resolve_opencode_model() opencode_cmd = shutil.which("opencode") or "opencode" cmd = [opencode_cmd, "run", "-m", resolved_model, "--format", "json", prompt] t0 = time.monotonic() try: result = subprocess.run( cmd, cwd=str(workspace), capture_output=True, text=True, timeout=self._timeout_sec, env=env, ) elapsed = time.monotonic() - t0 log = result.stdout + "\n" + result.stderr return result.returncode == 0, log, elapsed except subprocess.TimeoutExpired as exc: elapsed = time.monotonic() - t0 log = f"TIMEOUT after {elapsed:.1f}s" if exc.stdout: log += f"\nstdout: {exc.stdout[:2000] if isinstance(exc.stdout, str) else exc.stdout.decode(errors='replace')[:2000]}" return False, log, elapsed except FileNotFoundError: return False, "opencode CLI not found", 0.0 except Exception as exc: # noqa: BLE001 elapsed = time.monotonic() - t0 return False, f"Unexpected error: {exc}", elapsed # -- file collection ------------------------------------------------------- @staticmethod def _collect_files(workspace: Path) -> dict[str, str]: """Collect generated Python files, requirements.txt, and setup.py. File names are flattened to basenames (e.g. ``src/main.py`` → ``main.py``) because the downstream executor expects a flat file dict. If two files share the same basename, the one closer to the workspace root wins. 
""" files: dict[str, str] = {} # Sort by depth (fewer parts first) so root-level files take priority py_files = sorted( workspace.rglob("*.py"), key=lambda p: len(p.relative_to(workspace).parts), ) for py_file in py_files: rel = py_file.relative_to(workspace) parts = rel.parts if any(p.startswith("__pycache__") or p.startswith(".") for p in parts): continue # Flatten to basename — executor expects flat structure basename = rel.name if basename not in files: try: files[basename] = py_file.read_text(encoding="utf-8", errors="replace") except OSError as exc: logger.warning("Beast mode: failed to read %s: %s", py_file, exc) # Also collect requirements.txt and setup.py at root for extra in ("requirements.txt", "setup.py"): p = workspace / extra if p.exists() and extra not in files: files[extra] = p.read_text(encoding="utf-8", errors="replace") return files # -- entry-point validation ------------------------------------------------ @staticmethod def _has_main_guard(source: str) -> bool: """Return True if *source* contains ``if __name__ == "__main__":``.""" try: tree = ast.parse(source) except SyntaxError: return False for node in ast.walk(tree): if isinstance(node, ast.If): test = node.test if isinstance(test, ast.Compare) and isinstance(test.left, ast.Name): if test.left.id == "__name__" and len(test.comparators) == 1: comp = test.comparators[0] if isinstance(comp, ast.Constant) and comp.value == "__main__": return True return False @staticmethod def _ensure_main_entry_point(files: dict[str, str]) -> dict[str, str]: """Ensure ``main.py`` has an ``if __name__ == "__main__"`` guard. Beast Mode often generates multi-file projects where ``main.py`` is a library module and the real entry point lives in another file (e.g. ``run_experiment.py``). Since the Docker sandbox always executes ``python3 main.py``, a library-only ``main.py`` exits immediately with no output. Strategy: 1. If ``main.py`` already has the guard → return unchanged. 2. 
Find the first other ``.py`` file that **does** have the guard. 3. Swap: rename that file to ``main.py`` and the old ``main.py`` to a helper module (its original basename, or ``_lib.py``). 4. If no file has a guard, append a minimal stub to ``main.py`` that calls the most likely entry function (``main()``, ``run()``, etc.). """ main_code = files.get("main.py", "") if not main_code: return files if OpenCodeBridge._has_main_guard(main_code): return files # -- Strategy 2/3: find another file with the guard and swap ----------- for fname, code in files.items(): if fname == "main.py" or not fname.endswith(".py"): continue if OpenCodeBridge._has_main_guard(code): logger.info( "Beast mode: main.py lacks __main__ guard; swapping " "entry point with %s", fname, ) new_files = dict(files) # Rename original main.py → helper module helper_name = fname # reuse the other file's name for old main new_files[helper_name] = main_code new_files["main.py"] = code return new_files # -- Strategy 4: inject a minimal entry point into main.py ------------- # Look for common entry functions defined in main.py entry_func: str | None = None try: tree = ast.parse(main_code) candidates = [ n.name for n in ast.walk(tree) if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef)) and n.name in ("main", "run", "run_experiment", "train", "run_experiments", "experiment", "run_all") ] if candidates: entry_func = candidates[0] except SyntaxError: pass if entry_func: logger.info( "Beast mode: main.py lacks __main__ guard; injecting call " "to %s()", entry_func, ) new_files = dict(files) new_files["main.py"] = ( main_code.rstrip() + "\n\n\nif __name__ == \"__main__\":\n" + f" {entry_func}()\n" ) return new_files logger.warning( "Beast mode: main.py lacks __main__ guard and no known entry " "function found — experiment may exit without producing output", ) return files # -- main entry point ------------------------------------------------------ def generate( self, stage_dir: Path, topic: str, exp_plan: 
str, metric: str, pkg_hint: str = "", extra_guidance: str = "", time_budget_sec: int = 300, ) -> OpenCodeResult: """Run OpenCode to generate experiment code. Returns an OpenCodeResult with success status and generated files. """ # Check availability first if not self.check_available(): return OpenCodeResult( success=False, error="OpenCode CLI not installed or not callable", ) workspace: Path | None = None last_error = "" for attempt in range(1 + self._max_retries): # Prepare workspace try: workspace = self._prepare_workspace( stage_dir=stage_dir, topic=topic, exp_plan=exp_plan, metric=metric, pkg_hint=pkg_hint, extra_guidance=extra_guidance, time_budget_sec=time_budget_sec, ) except OSError as exc: last_error = f"Failed to prepare workspace: {exc}" logger.warning("Beast mode: %s", last_error) continue # Build the mega-prompt (use replace instead of .format() to # avoid KeyError when metric contains curly braces like "F{1}") prompt = _MEGA_PROMPT_TEMPLATE.replace( "{metric}", metric ).replace( "{time_budget_sec}", str(time_budget_sec) ) logger.info( "Beast mode: invoking OpenCode (attempt %d/%d, timeout=%ds)", attempt + 1, 1 + self._max_retries, self._timeout_sec, ) success, log, elapsed = self._invoke_opencode(workspace, prompt) if success: files = self._collect_files(workspace) if "main.py" not in files: logger.warning( "Beast mode: OpenCode succeeded but no main.py found " "(files: %s)", list(files.keys()), ) last_error = "No main.py in OpenCode output" # Cleanup failed workspace if self._workspace_cleanup and workspace.exists(): shutil.rmtree(workspace, ignore_errors=True) continue # BUG-R52-01: Ensure main.py has an entry point files = self._ensure_main_entry_point(files) # Write log try: (stage_dir / "opencode_log.txt").write_text( log or "", encoding="utf-8", ) except OSError as _wexc: logger.warning("Beast mode: failed to write log: %s", _wexc) # Cleanup workspace if configured if self._workspace_cleanup and workspace.exists(): shutil.rmtree(workspace, 
ignore_errors=True) return OpenCodeResult( success=True, files=files, opencode_log=log, elapsed_sec=elapsed, ) last_error = log logger.warning( "Beast mode: OpenCode attempt %d failed (%.1fs): %s", attempt + 1, elapsed, log[:500], ) # Cleanup failed workspace if self._workspace_cleanup and workspace and workspace.exists(): shutil.rmtree(workspace, ignore_errors=True) # All attempts failed return OpenCodeResult( success=False, opencode_log=last_error, error=f"OpenCode failed after {1 + self._max_retries} attempt(s)", ) # --------------------------------------------------------------------------- # Helper: count historical failures # --------------------------------------------------------------------------- def count_historical_failures(run_dir: Path, stage_name: str = "stage-10") -> int: """Count past Stage 10 failures from stage directories and logs. Each stage directory is counted at most once, even if multiple failure indicators are present. """ failures = 0 for d in run_dir.glob(f"{stage_name}*"): failed = False # Check for beast_mode_log.json bm_log = d / "beast_mode_log.json" if bm_log.exists(): try: data = json.loads(bm_log.read_text(encoding="utf-8")) if not data.get("success", True): failed = True except Exception: # noqa: BLE001 pass # Check for stage health failures if not failed: health = d / "stage_health.json" if health.exists(): try: data = json.loads(health.read_text(encoding="utf-8")) if data.get("status") == "FAILED": failed = True except Exception: # noqa: BLE001 pass # Check for validation report with FAILED status if not failed: vr = d / "validation_report.md" if vr.exists(): try: content = vr.read_text(encoding="utf-8") if "BLOCKED" in content or "FAILED" in content: failed = True except Exception: # noqa: BLE001 pass if failed: failures += 1 return failures ================================================ FILE: researchclaw/pipeline/paper_verifier.py ================================================ """Post-generation paper verification gate. 
Extracts all numeric values from a generated LaTeX paper, compares them against the ``VerifiedRegistry``, and rejects the paper if unverified numbers appear in strict sections (Results, Experiments, Tables). This is the **hard, deterministic** defense against fabrication. """ from __future__ import annotations import logging import math import re from dataclasses import dataclass, field from pathlib import Path from researchclaw.pipeline.verified_registry import VerifiedRegistry logger = logging.getLogger(__name__) # Numbers that are always allowed (years, common constants, etc.) _ALWAYS_ALLOWED: set[float] = { 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0, 0.5, 0.01, 0.001, 0.0001, 0.1, 0.05, 0.95, 0.99, 2024.0, 2025.0, 2026.0, 2027.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0, 2048.0, 224.0, 299.0, 384.0, # Common image sizes # BUG-192: Common hyperparameter values 0.0003, 3e-4, 0.0005, 5e-4, 0.002, 2e-3, # learning rates 0.2, 0.3, 0.25, 0.7, 0.6, 0.8, # clip epsilon, dropout, gradient clip, GCE q, common HP 0.9, 0.999, 0.9999, # Adam betas, momentum 0.02, 0.03, # weight init std 1e-5, 1e-6, 1e-8, # epsilon, weight decay 300.0, 400.0, 500.0, # epochs 4096.0, 8192.0, # larger batch sizes / hidden dims } # Regex for extracting decimal numbers (including negative, scientific notation) # NOTE: lookbehind/lookahead must NOT exclude { } — numbers inside \textbf{91.5} # must still be extracted. We only exclude letters, underscore, and backslash. _NUMBER_RE = re.compile( r"(? float: """Fraction of numbers that are unverified.""" if self.total_numbers_checked == 0: return 0.0 return len(self.unverified_numbers) / self.total_numbers_checked def verify_paper( tex_text: str, registry: VerifiedRegistry, *, tolerance: float = 0.01, strict_sections: set[str] | None = None, lenient_sections: set[str] | None = None, ) -> VerificationResult: """Verify that all numbers in the paper are grounded in experiment data. 
Parameters ---------- tex_text: The full LaTeX source of the paper. registry: The verified value registry built from experiment data. tolerance: Relative tolerance for number matching (default 1%). strict_sections: Section names where unverified numbers cause REJECT. lenient_sections: Section names where unverified numbers cause WARNING only. Returns ------- VerificationResult Contains pass/fail status, list of unverified numbers, and summary. """ if strict_sections is None: strict_sections = _STRICT_SECTIONS if lenient_sections is None: lenient_sections = _LENIENT_SECTIONS result = VerificationResult(passed=True, severity="PASS") # 1. Parse sections sections = _parse_sections(tex_text) # 2. Find all tables (for in_table flag) table_ranges = _find_table_ranges(tex_text) # 3. Create skip mask (positions to ignore) skip_mask = _build_skip_mask(tex_text) # 4. Extract and verify numbers lines = tex_text.split("\n") for line_idx, line in enumerate(lines): line_num = line_idx + 1 section = _section_at_line(sections, line_idx) section_lower = section.lower() if section else "" in_table = any( start <= line_idx <= end and is_results for start, end, is_results in table_ranges ) for m in _NUMBER_RE.finditer(line): num_str = m.group(1) char_pos = _line_offset(lines, line_idx) + m.start() # Skip if inside a skip zone if skip_mask[char_pos]: continue try: value = float(num_str) except ValueError: continue if not math.isfinite(value): continue result.total_numbers_checked += 1 # Always-allowed numbers if value in _ALWAYS_ALLOWED: result.total_numbers_verified += 1 continue # Integer-like small numbers (likely indices, counts, etc.) # BUG-23 P1: In strict sections or tables, only auto-pass very small # integers (≤5) — larger counts (e.g. "20 datasets") could be fabricated. 
is_strict_ctx = _is_strict_section(section_lower, strict_sections) or in_table _int_limit = 5 if is_strict_ctx else 20 if value == int(value) and abs(value) <= _int_limit: result.total_numbers_verified += 1 continue # Check against registry if registry.is_verified(value, tolerance=tolerance): result.total_numbers_verified += 1 continue # UNVERIFIED — classify severity by section ctx = line.strip()[:120] unv = UnverifiedNumber( value=value, line_number=line_num, context=ctx, section=section or "(preamble)", in_table=in_table, ) result.unverified_numbers.append(unv) is_strict = _is_strict_section(section_lower, strict_sections) if is_strict or in_table: result.strict_violations += 1 else: result.lenient_violations += 1 # 5. Check for fabricated conditions result.fabricated_conditions = _check_condition_names(tex_text, registry, lines) # 5b. BUG-23 P2: Check training config claims (epochs, dataset, etc.) result.config_warnings = _check_training_config(tex_text, registry) # 6. Determine severity if result.strict_violations > 0 or len(result.fabricated_conditions) > 0: result.passed = False result.severity = "REJECT" elif result.lenient_violations > 0: result.passed = True result.severity = "WARN" else: result.passed = True result.severity = "PASS" # 7. Build summary result.summary = _build_summary(result) logger.info("Paper verification: %s", result.summary) return result def verify_paper_file( tex_path: Path, registry: VerifiedRegistry, **kwargs, ) -> VerificationResult: """Convenience: verify from a file path.""" tex_text = tex_path.read_text(encoding="utf-8") return verify_paper(tex_text, registry, **kwargs) # --------------------------------------------------------------------------- # Internal helpers # --------------------------------------------------------------------------- def _parse_sections(tex_text: str) -> list[tuple[int, str]]: """Parse section headings and their line positions. Returns list of (line_index, section_name) sorted by line_index. 
""" sections: list[tuple[int, str]] = [] lines = tex_text.split("\n") for i, line in enumerate(lines): m = _SECTION_RE.search(line) if m: sections.append((i, m.group(1).strip())) return sections def _section_at_line(sections: list[tuple[int, str]], line_idx: int) -> str | None: """Return the section name that contains the given line.""" current = None for sec_line, sec_name in sections: if sec_line <= line_idx: current = sec_name else: break return current _STRICT_EXEMPT_KEYWORDS: set[str] = { "dataset", "setup", "protocol", "hyperparameter", "implementation", "hardware", "infrastructure", "notation", "preliminaries", } def _is_strict_section(section_lower: str, strict_set: set[str]) -> bool: """Check if a section name matches any strict section pattern. BUG-R49-02: Sections like "Datasets and Evaluation Protocol" contain the keyword "evaluation" but describe protocol parameters, not results. Such sections are exempted when they also contain a setup/protocol keyword. """ if not section_lower: return False for strict_name in strict_set: if strict_name in section_lower: # Check for exemption: if the section also contains a # setup/protocol keyword, it's not a results section. if any(kw in section_lower for kw in _STRICT_EXEMPT_KEYWORDS): return False return True return False def _find_table_ranges(tex_text: str) -> list[tuple[int, int, bool]]: """Find line ranges of table environments. Returns ``(start_line, end_line, is_results_table)`` tuples. Hyperparameter / configuration tables (detected by ``\\caption`` keywords) are marked ``is_results_table=False`` so the verifier skips strict checks on their numeric content (BUG-192). 
""" _HP_CAPTION_KW = { "hyperparameter", "hyper-parameter", "configuration", "config", "setting", "training detail", "implementation detail", } ranges: list[tuple[int, int, bool]] = [] lines = tex_text.split("\n") in_table = False start = 0 for i, line in enumerate(lines): if r"\begin{table" in line: in_table = True start = i elif r"\end{table" in line and in_table: # Scan table block for \caption to determine type table_block = "\n".join(lines[start : i + 1]).lower() is_hp = any(kw in table_block for kw in _HP_CAPTION_KW) ranges.append((start, i, not is_hp)) in_table = False return ranges def _build_skip_mask(tex_text: str) -> list[bool]: """Build a per-character boolean mask of positions to skip.""" mask = [False] * len(tex_text) for pattern in _SKIP_PATTERNS: for m in pattern.finditer(tex_text): for pos in range(m.start(), m.end()): if pos < len(mask): mask[pos] = True return mask def _line_offset(lines: list[str], line_idx: int) -> int: """Return the character offset of the start of a line.""" offset = 0 for i in range(line_idx): offset += len(lines[i]) + 1 # +1 for newline return offset def _check_condition_names( tex_text: str, registry: VerifiedRegistry, lines: list[str], ) -> list[FabricatedCondition]: """Check if the paper mentions condition names that never ran.""" fabricated: list[FabricatedCondition] = [] # Only check if we have known conditions if not registry.condition_names: return fabricated # Build pattern of known condition names (exact match in text) # Look for condition-like names that appear in tables or bold text # This is heuristic — we look for unknown names that look like conditions known_lower = {name.lower() for name in registry.condition_names} # Common generic terms that should NOT be flagged as fabricated conditions _GENERIC_TERMS = { "method", "metric", "condition", "---", "", "model", "approach", "variant", "architecture", "ours", "average", "mean", "std", "total", "baseline", "proposed", "ablation", "default", "results", "table", 
"figure", "section", } def _is_candidate(name: str) -> bool: """Check if a cleaned name looks like a real condition name.""" return bool( name and name.lower() not in known_lower and name.lower() not in _GENERIC_TERMS and not name.startswith("\\") and len(name) > 1 and not name.isdigit() # BUG-DA8-15: Reject numeric-looking strings (e.g. "91.5" from \textbf{91.5}) and not re.match(r'^[\d.eE+\-]+$', name) ) def _clean_latex(s: str) -> str: s = re.sub(r"\\textbf\{([^}]*)\}", r"\1", s) s = re.sub(r"\\textit\{([^}]*)\}", r"\1", s) return s.replace("\\_", "_").strip() _seen_names: set[str] = set() # 1. Extract potential condition names from TABLE ROWS for i, line in enumerate(lines): if "&" in line and "\\\\" in line: cells = line.split("&") if cells: cand_clean = _clean_latex(cells[0].strip().rstrip("\\").strip()) if _is_candidate(cand_clean) and cand_clean.lower() not in _seen_names: _seen_names.add(cand_clean.lower()) fabricated.append( FabricatedCondition( name=cand_clean, line_number=i + 1, context=line.strip()[:120], ) ) # 2. BUG-23 P2: Also check PROSE — bold/italic condition mentions in # Results/Experiments sections that don't match known conditions. 
    _strict_sections_lower = {
        "results",
        "experiments",
        "experimental results",
        "evaluation",
        "ablation",
        "comparison",
    }
    sections = _parse_sections(tex_text)
    for i, line in enumerate(lines):
        # Only scan prose inside results-style sections; intro/related-work
        # sections legitimately bold method names that are not conditions.
        section = _section_at_line(sections, i)
        if not section or section.lower() not in _strict_sections_lower:
            continue
        # Find \textbf{CondName} or \textit{CondName} in prose
        for m in re.finditer(r"\\text(?:bf|it)\{([^}]+)\}", line):
            cand_clean = _clean_latex(m.group(1)).strip()
            # Only flag multi-word or snake_case names that look like conditions
            if (
                _is_candidate(cand_clean)
                and ("_" in cand_clean or cand_clean[0].isupper())
                and cand_clean.lower() not in _seen_names
            ):
                _seen_names.add(cand_clean.lower())
                fabricated.append(
                    FabricatedCondition(
                        name=cand_clean,
                        line_number=i + 1,
                        context=line.strip()[:120],
                    )
                )
    return fabricated


def _check_training_config(
    tex_text: str,
    registry: VerifiedRegistry,
) -> list[str]:
    """BUG-23 P2: Check if paper claims about training config match reality.

    Extracts epoch counts from paper text and compares against known
    training_config from the registry. Returns list of warning strings.
    """
    warnings: list[str] = []
    # Extract "trained for N epochs" or "N epochs" claims
    epoch_claims = re.findall(
        r"(?:trained?\s+(?:for\s+)?|over\s+|(?:for|with)\s+)(\d+)\s+epoch",
        tex_text,
        re.IGNORECASE,
    )
    if epoch_claims and registry.training_config:
        actual_steps = registry.training_config.get("TRAINING_STEPS")
        actual_epochs = registry.training_config.get("epochs")
        if actual_epochs is not None:
            for claim in epoch_claims:
                claimed = int(claim)
                # Tolerate drift of up to 5 epochs or 30%, whichever is larger.
                if abs(claimed - actual_epochs) > max(5, actual_epochs * 0.3):
                    warnings.append(
                        f"Paper claims {claimed} epochs but experiment ran {int(actual_epochs)} epochs"
                    )
        elif actual_steps is not None:
            # Can't compare epochs to steps directly, but flag very large claims
            for claim in epoch_claims:
                claimed = int(claim)
                if claimed > 500:
                    warnings.append(
                        f"Paper claims {claimed} epochs — verify against actual training steps ({int(actual_steps)})"
                    )
    # Check condition count claims ("N conditions" / "N methods" / "N baselines")
    count_claims = re.findall(
        r"(\d+)\s+(?:condition|method|baseline|approach|variant)s?\b",
        tex_text,
        re.IGNORECASE,
    )
    if count_claims and registry.condition_names:
        actual_count = len(registry.condition_names)
        for claim in count_claims:
            claimed = int(claim)
            # Allow one extra: papers often count the proposed method itself.
            if claimed > actual_count + 1:
                warnings.append(
                    f"Paper claims {claimed} conditions/methods but only {actual_count} ran"
                )
    if warnings:
        logger.warning("Training config validation: %s", warnings)
    return warnings


def _build_summary(result: VerificationResult) -> str:
    """Build human-readable summary."""
    parts = [f"severity={result.severity}"]
    parts.append(
        f"checked={result.total_numbers_checked}, "
        f"verified={result.total_numbers_verified}, "
        f"unverified={len(result.unverified_numbers)}"
    )
    if result.strict_violations:
        parts.append(f"strict_violations={result.strict_violations}")
    if result.fabricated_conditions:
        # Show at most three names to keep the summary line short.
        names = [fc.name for fc in result.fabricated_conditions[:3]]
        parts.append(f"fabricated_conditions={names}")
    if result.config_warnings:
        parts.append(f"config_warnings={len(result.config_warnings)}")
    return "; ".join(parts)


================================================
FILE: researchclaw/pipeline/runner.py
================================================
from __future__ import annotations

import json
import importlib
import logging
import os
import shutil
import tempfile
import time as _time
from pathlib import Path

from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.evolution import EvolutionStore, extract_lessons
from researchclaw.knowledge.base import write_stage_to_kb
from researchclaw.pipeline.executor import StageResult, execute_stage
from researchclaw.pipeline.stages import (
    DECISION_ROLLBACK,
    MAX_DECISION_PIVOTS,
    NONCRITICAL_STAGES,
    STAGE_SEQUENCE,
    Stage,
    StageStatus,
)


def _utcnow_iso() -> str:
    """Return the current UTC time as an ISO-8601 string (second precision)."""
    from datetime import datetime, timezone

    return datetime.now(timezone.utc).isoformat(timespec="seconds")


def _should_start(stage: Stage, from_stage: Stage, started: bool) -> bool:
    # Once execution has started, every later stage runs; before that,
    # only reaching `from_stage` flips the latch on.
    if started:
        return True
    return stage == from_stage


def _build_pipeline_summary(
    *,
    run_id: str,
    results: list[StageResult],
    from_stage: Stage,
    run_dir: Path | None = None,
) -> dict[str, object]:
    """Aggregate per-stage results into the pipeline_summary.json payload."""
    summary: dict[str, object] = {
        "run_id": run_id,
        "stages_executed": len(results),
        "stages_done": sum(1 for item in results if item.status == StageStatus.DONE),
        "stages_blocked": sum(
            1 for item in results if item.status == StageStatus.BLOCKED_APPROVAL
        ),
        "stages_failed": sum(
            1 for item in results if item.status == StageStatus.FAILED
        ),
        "degraded": any(r.decision == "degraded" for r in results),
        "from_stage": int(from_stage),
        "final_stage": int(results[-1].stage) if results else int(from_stage),
        "final_status": results[-1].status.value if results else "no_stages",
        "generated": _utcnow_iso(),
        "content_metrics": _collect_content_metrics(run_dir),
    }
    return summary


def _write_pipeline_summary(run_dir: Path, summary: dict[str, object]) -> None:
    (run_dir /
"pipeline_summary.json").write_text( json.dumps(summary, indent=2), encoding="utf-8", ) def _write_checkpoint(run_dir: Path, stage: Stage, run_id: str) -> None: """Write checkpoint atomically via temp file + rename to prevent corruption.""" checkpoint = { "last_completed_stage": int(stage), "last_completed_name": stage.name, "run_id": run_id, "timestamp": _utcnow_iso(), } target = run_dir / "checkpoint.json" fd, tmp_path = tempfile.mkstemp(dir=run_dir, suffix=".tmp", prefix="checkpoint_") os.close(fd) try: with open(tmp_path, "w", encoding="utf-8") as fh: fh.write(json.dumps(checkpoint, indent=2)) Path(tmp_path).replace(target) except BaseException: Path(tmp_path).unlink(missing_ok=True) raise def _write_heartbeat(run_dir: Path, stage: Stage, run_id: str) -> None: """Write heartbeat file for sentinel watchdog monitoring.""" import os heartbeat = { "pid": os.getpid(), "last_stage": int(stage), "last_stage_name": stage.name, "run_id": run_id, "timestamp": _utcnow_iso(), } (run_dir / "heartbeat.json").write_text( json.dumps(heartbeat, indent=2), encoding="utf-8" ) def read_checkpoint(run_dir: Path) -> Stage | None: """Read checkpoint and return the NEXT stage to execute, or None if no checkpoint.""" cp_path = run_dir / "checkpoint.json" if not cp_path.exists(): return None try: data = json.loads(cp_path.read_text(encoding="utf-8")) last_num = data.get("last_completed_stage") if last_num is None: return None for i, stage in enumerate(STAGE_SEQUENCE): if int(stage) == last_num: if i + 1 < len(STAGE_SEQUENCE): return STAGE_SEQUENCE[i + 1] return None return None except (json.JSONDecodeError, TypeError, ValueError): return None def resume_from_checkpoint( run_dir: Path, default_stage: Stage = Stage.TOPIC_INIT ) -> Stage: """Resolve the stage to resume from using checkpoint metadata.""" next_stage = read_checkpoint(run_dir) return next_stage if next_stage is not None else default_stage def _collect_content_metrics(run_dir: Path | None) -> dict[str, object]: """Collect 
content authenticity metrics from stage outputs.""" metrics: dict[str, object] = { "template_ratio": None, "citation_verify_score": None, "total_citations": None, "verified_citations": None, "degraded_sources": [], } if run_dir is None: return metrics draft_path = run_dir / "stage-17" / "paper_draft.md" if draft_path.exists(): try: quality_module = importlib.import_module("researchclaw.quality") compute_template_ratio = quality_module.compute_template_ratio text = draft_path.read_text(encoding="utf-8") metrics["template_ratio"] = round(compute_template_ratio(text), 4) except ( AttributeError, ModuleNotFoundError, UnicodeDecodeError, OSError, ValueError, TypeError, ): pass verify_path = run_dir / "stage-23" / "verification_report.json" if verify_path.exists(): try: vdata = json.loads(verify_path.read_text(encoding="utf-8")) if isinstance(vdata, dict): summary = vdata.get("summary", vdata) total = summary.get("total", 0) if isinstance(summary, dict) else None verified = summary.get("verified", 0) if isinstance(summary, dict) else None if isinstance(total, int | float) and isinstance(verified, int | float): total_num = int(total) verified_num = int(verified) metrics["total_citations"] = total_num metrics["verified_citations"] = verified_num if total_num > 0: metrics["citation_verify_score"] = round( verified_num / total_num, 4 ) except (json.JSONDecodeError, OSError, TypeError, ValueError): pass return metrics logger = logging.getLogger(__name__) def _run_experiment_diagnosis(run_dir: Path, config: RCConfig, run_id: str) -> None: """Run experiment diagnosis after Stage 14 and save reports. 
    Produces:
    - ``run_dir/experiment_diagnosis.json`` — structured diagnosis + quality assessment
    - ``run_dir/repair_prompt.txt`` — repair instructions (if quality is insufficient)
    """
    try:
        from researchclaw.pipeline.experiment_diagnosis import (
            diagnose_experiment,
            assess_experiment_quality,
        )

        # Find the most recent stage-14 experiment_summary.json
        summary_path = None
        for candidate in sorted(run_dir.glob("stage-14*/experiment_summary.json")):
            summary_path = candidate
        if not summary_path or not summary_path.exists():
            return
        summary = json.loads(summary_path.read_text(encoding="utf-8"))
        # Collect stdout/stderr from experiment runs
        stdout, stderr = "", ""
        runs_dir = summary_path.parent / "runs"
        if runs_dir.is_dir():
            # Cap at 5 run files to bound the prompt size.
            for run_file in sorted(runs_dir.glob("*.json"))[:5]:
                try:
                    run_data = json.loads(run_file.read_text(encoding="utf-8"))
                    if isinstance(run_data, dict):
                        stdout += run_data.get("stdout", "") + "\n"
                        stderr += run_data.get("stderr", "") + "\n"
                except (json.JSONDecodeError, OSError):
                    continue
        # Load experiment plan from stage-09
        plan = None
        for candidate in sorted(run_dir.glob("stage-09*/exp_plan.yaml")):
            try:
                import yaml as _yaml_diag

                plan = _yaml_diag.safe_load(candidate.read_text(encoding="utf-8"))
            except Exception:
                pass
        if plan is None:
            # Fall back to the JSON experiment design.
            for candidate in sorted(run_dir.glob("stage-09*/experiment_design.json")):
                try:
                    plan = json.loads(candidate.read_text(encoding="utf-8"))
                except (json.JSONDecodeError, OSError):
                    pass
        # Load refinement log if available
        ref_log = None
        for candidate in sorted(run_dir.glob("stage-13*/refinement_log.json")):
            try:
                ref_log = json.loads(candidate.read_text(encoding="utf-8"))
            except (json.JSONDecodeError, OSError):
                pass
        # Run diagnosis
        diag = diagnose_experiment(
            experiment_summary=summary,
            experiment_plan=plan,
            refinement_log=ref_log,
            stdout=stdout.strip(),
            stderr=stderr.strip(),
        )
        # Run quality assessment
        qa = assess_experiment_quality(summary, ref_log)
        # Save diagnosis report
        diag_report = {
            "diagnosis": diag.to_dict(),
            "quality_assessment": {
                "mode": qa.mode.value,
                "sufficient": qa.sufficient,
                "repair_possible": qa.repair_possible,
                "deficiency_types": [d.type.value for d in qa.deficiencies],
            },
            "repair_needed": not qa.sufficient,
            "generated": _utcnow_iso(),
        }
        (run_dir / "experiment_diagnosis.json").write_text(
            json.dumps(diag_report, indent=2), encoding="utf-8"
        )
        if not qa.sufficient:
            # Generate repair prompt for the REFINE loop
            from researchclaw.pipeline.experiment_repair import build_repair_prompt

            code: dict[str, str] = {}
            # Try refined code first, then stage-10 experiment dir, then raw stage-10
            for _glob_pat in (
                "stage-13*/experiment_final/*.py",
                "stage-10*/experiment/*.py",
                "stage-10*/*.py",
            ):
                for candidate in sorted(run_dir.glob(_glob_pat)):
                    try:
                        code[candidate.name] = candidate.read_text(encoding="utf-8")
                    except (OSError, UnicodeDecodeError):
                        pass
                if code:
                    break
            repair_prompt = build_repair_prompt(
                diag, code, time_budget_sec=config.experiment.time_budget_sec
            )
            (run_dir / "repair_prompt.txt").write_text(
                repair_prompt, encoding="utf-8"
            )
            logger.info(
                "[%s] Experiment diagnosis: mode=%s, deficiencies=%d — repair prompt saved",
                run_id,
                qa.mode.value,
                len(diag.deficiencies),
            )
            print(
                f"[{run_id}] Experiment diagnosis: {qa.mode.value} "
                f"({len(diag.deficiencies)} issues found, repair needed)"
            )
        else:
            logger.info(
                "[%s] Experiment diagnosis: mode=%s, sufficient=True — quality OK",
                run_id,
                qa.mode.value,
            )
            print(f"[{run_id}] Experiment diagnosis: {qa.mode.value} — quality OK")
    except Exception as exc:
        # Diagnosis is advisory — never let it break the pipeline.
        logger.warning("Experiment diagnosis failed: %s", exc)


def _run_experiment_repair(run_dir: Path, config: RCConfig, run_id: str) -> None:
    """Execute the experiment repair loop when diagnosis finds quality issues.

    Calls the repair loop from ``experiment_repair.py`` which:
    1. Loads experiment code and diagnosis
    2. Gets fixes from LLM or OpenCode
    3. Re-runs experiment in sandbox
    4. Re-assesses quality
    5.
Repeats up to max_cycles
    """
    try:
        from researchclaw.pipeline.experiment_repair import run_repair_loop

        repair_result = run_repair_loop(
            run_dir=run_dir,
            config=config,
            run_id=run_id,
        )
        # Save repair result
        (run_dir / "experiment_repair_result.json").write_text(
            json.dumps(repair_result.to_dict(), indent=2), encoding="utf-8"
        )
        # BUG-186: Promote best experiment summary to stage-14/ so
        # downstream stages (sanitizer, paper_verifier) see it.
        # BUG-198: Only promote if the repair summary is RICHER than
        # the existing stage-14 summary. The repair loop can produce
        # empty summaries (metrics: {}, 0 conditions) which would
        # overwrite enriched data from the analysis stage.
        if repair_result.best_experiment_summary:
            from researchclaw.pipeline.experiment_repair import (
                _summary_quality_score,
            )

            best_path = run_dir / "stage-14" / "experiment_summary.json"
            existing_score = 0.0
            if best_path.exists():
                try:
                    existing = json.loads(
                        best_path.read_text(encoding="utf-8")
                    )
                    existing_score = _summary_quality_score(existing)
                except (json.JSONDecodeError, OSError):
                    pass
            repair_score = _summary_quality_score(
                repair_result.best_experiment_summary
            )
            if repair_score > existing_score:
                best_path.write_text(
                    json.dumps(
                        repair_result.best_experiment_summary, indent=2
                    ),
                    encoding="utf-8",
                )
                logger.info(
                    "[%s] Promoted repair results to stage-14 "
                    "(score %.1f > %.1f, success=%s)",
                    run_id,
                    repair_score,
                    existing_score,
                    repair_result.success,
                )
            else:
                logger.info(
                    "[%s] Kept existing stage-14 summary (score %.1f >= "
                    "repair score %.1f)",
                    run_id,
                    existing_score,
                    repair_score,
                )
        if repair_result.success:
            # Re-run diagnosis with updated results
            _run_experiment_diagnosis(run_dir, config, run_id)
        else:
            logger.info(
                "[%s] Repair loop completed without reaching full_paper quality "
                "(best mode: %s, %d cycles)",
                run_id,
                repair_result.final_mode.value,
                repair_result.total_cycles,
            )
    except Exception as exc:
        # Repair is best-effort; the pipeline continues with existing results.
        logger.warning("[%s] Experiment repair failed: %s", run_id, exc)
        print(f"[{run_id}] Experiment repair failed: {exc}")


def execute_pipeline(
    *,
    run_dir: Path,
    run_id: str,
    config: RCConfig,
    adapters: AdapterBundle,
    from_stage: Stage = Stage.TOPIC_INIT,
    auto_approve_gates: bool = False,
    stop_on_gate: bool = False,
    skip_noncritical: bool = False,
    kb_root: Path | None = None,
) -> list[StageResult]:
    """Execute pipeline stages sequentially from `from_stage` and write summary."""
    results: list[StageResult] = []
    started = False
    total_stages = len(STAGE_SEQUENCE)
    for stage in STAGE_SEQUENCE:
        started = _should_start(stage, from_stage, started)
        if not started:
            continue
        stage_num = int(stage)
        prefix = f"[{run_id}] Stage {stage_num:02d}/{total_stages}"
        print(f"{prefix} {stage.name} — running...")
        # BUG-218: Ensure the best stage-14 experiment data is promoted
        # BEFORE paper writing begins. Without this, the recursive REFINE
        # path writes the paper using the latest (potentially worse)
        # iteration's data, because the post-recursion promotion at line
        # ~547 runs only after the recursive call—i.e. after the paper
        # has already been written.
        if stage == Stage.PAPER_OUTLINE:
            _promote_best_stage14(run_dir, config)
        t0 = _time.monotonic()
        result = execute_stage(
            stage,
            run_dir=run_dir,
            run_id=run_id,
            config=config,
            adapters=adapters,
            auto_approve_gates=auto_approve_gates,
        )
        elapsed = _time.monotonic() - t0
        if result.status == StageStatus.DONE:
            arts = ", ".join(result.artifacts) if result.artifacts else "none"
            if result.decision == "degraded":
                print(
                    f"{prefix} {stage.name} — DEGRADED ({elapsed:.1f}s) "
                    f"— continuing with sanitization → {arts}"
                )
            else:
                print(f"{prefix} {stage.name} — done ({elapsed:.1f}s) → {arts}")
        elif result.status == StageStatus.FAILED:
            err = result.error or "unknown error"
            print(f"{prefix} {stage.name} — FAILED ({elapsed:.1f}s) — {err}")
        elif result.status == StageStatus.BLOCKED_APPROVAL:
            print(f"{prefix} {stage.name} — blocked (awaiting approval)")
        results.append(result)
        if kb_root is not None and result.status == StageStatus.DONE:
            # Knowledge-base write is best-effort; failures never stop the run.
            try:
                stage_dir = run_dir / f"stage-{int(stage):02d}"
                write_stage_to_kb(
                    kb_root,
                    stage_id=int(stage),
                    stage_name=stage.name.lower(),
                    run_id=run_id,
                    artifacts=list(result.artifacts),
                    stage_dir=stage_dir,
                    backend=config.knowledge_base.backend,
                    topic=config.research.topic,
                )
            except Exception:  # noqa: BLE001
                pass
        if result.status == StageStatus.DONE:
            _write_checkpoint(run_dir, stage, run_id)
        # --- Experiment diagnosis + repair after Stage 14 (result_analysis) ---
        if (
            stage == Stage.RESULT_ANALYSIS
            and result.status == StageStatus.DONE
            and config.experiment.repair.enabled
        ):
            _run_experiment_diagnosis(run_dir, config, run_id)
            # Check if repair loop should run
            _diag_path = run_dir / "experiment_diagnosis.json"
            if _diag_path.exists():
                try:
                    _diag_data = json.loads(_diag_path.read_text(encoding="utf-8"))
                    if _diag_data.get("repair_needed"):
                        _run_experiment_repair(run_dir, config, run_id)
                except (json.JSONDecodeError, OSError):
                    pass
        # --- Heartbeat for sentinel watchdog ---
        _write_heartbeat(run_dir, stage, run_id)
        # --- PIVOT/REFINE decision handling ---
        if (
            stage == Stage.RESEARCH_DECISION
            and result.status == StageStatus.DONE
            and result.decision in DECISION_ROLLBACK
        ):
            pivot_count = _read_pivot_count(run_dir)
            # R6-4: Skip REFINE if experiment metrics are empty for consecutive cycles
            if pivot_count > 0 and _consecutive_empty_metrics(run_dir, pivot_count):
                logger.warning(
                    "Consecutive REFINE cycles produced empty metrics — forcing PROCEED"
                )
                print(
                    f"[{run_id}] Consecutive empty metrics across REFINE cycles — forcing PROCEED"
                )
                # BUG-211: Promote best stage-14 before proceeding with
                # empty data — an earlier iteration may have real metrics.
                _promote_best_stage14(run_dir, config)
            elif pivot_count < MAX_DECISION_PIVOTS:
                rollback_target = DECISION_ROLLBACK[result.decision]
                _record_decision_history(
                    run_dir, result.decision, rollback_target, pivot_count + 1
                )
                logger.info(
                    "Decision %s: rolling back to %s (attempt %d/%d)",
                    result.decision.upper(),
                    rollback_target.name,
                    pivot_count + 1,
                    MAX_DECISION_PIVOTS,
                )
                print(
                    f"[{run_id}] Decision: {result.decision.upper()} → "
                    f"rollback to {rollback_target.name} "
                    f"(attempt {pivot_count + 1}/{MAX_DECISION_PIVOTS})"
                )
                # Version existing stage directories before overwriting
                _version_rollback_stages(
                    run_dir, rollback_target, pivot_count + 1
                )
                # Recurse from rollback target
                pivot_results = execute_pipeline(
                    run_dir=run_dir,
                    run_id=run_id,
                    config=config,
                    adapters=adapters,
                    from_stage=rollback_target,
                    auto_approve_gates=auto_approve_gates,
                    stop_on_gate=stop_on_gate,
                    skip_noncritical=skip_noncritical,
                    kb_root=kb_root,
                )
                results.extend(pivot_results)
                # BUG-211: Promote best stage-14 after REFINE completes so
                # downstream stages use the best data, not just the latest.
                _promote_best_stage14(run_dir, config)
                break  # Exit current loop; recursive call handles the rest
            else:
                # Quality gate: check if experiment results are actually usable
                _quality_ok, _quality_msg = _check_experiment_quality(
                    run_dir, pivot_count
                )
                if not _quality_ok:
                    logger.warning(
                        "Max pivot attempts (%d) reached — forcing PROCEED "
                        "with quality warning: %s",
                        MAX_DECISION_PIVOTS,
                        _quality_msg,
                    )
                    print(
                        f"[{run_id}] QUALITY WARNING: {_quality_msg}"
                    )
                    # Write quality warning to run directory
                    _qw_path = run_dir / "quality_warning.txt"
                    _qw_path.write_text(
                        f"Max pivots ({MAX_DECISION_PIVOTS}) reached.\n"
                        f"Quality gate failed: {_quality_msg}\n"
                        f"Paper will be written but may have significant issues.\n",
                        encoding="utf-8",
                    )
                else:
                    logger.warning(
                        "Max pivot attempts (%d) reached — forcing PROCEED",
                        MAX_DECISION_PIVOTS,
                    )
                    print(
                        f"[{run_id}] Max pivot attempts reached — forcing PROCEED"
                    )
                # BUG-205: After forced PROCEED, promote the BEST stage-14
                # experiment summary across all REFINE iterations.
                _promote_best_stage14(run_dir, config)
        if result.status == StageStatus.FAILED:
            if skip_noncritical and stage in NONCRITICAL_STAGES:
                logger.warning("Noncritical stage %s failed - skipping", stage.name)
            else:
                break
        if result.status == StageStatus.BLOCKED_APPROVAL and stop_on_gate:
            break
    summary = _build_pipeline_summary(
        run_id=run_id,
        results=results,
        from_stage=from_stage,
        run_dir=run_dir,
    )
    _write_pipeline_summary(run_dir, summary)
    # --- Evolution: extract and store lessons ---
    lessons: list[object] = []
    try:
        lessons = extract_lessons(results, run_id=run_id, run_dir=run_dir)
        if lessons:
            store = EvolutionStore(run_dir / "evolution")
            store.append_many(lessons)
            logger.info("Extracted %d lessons from pipeline run", len(lessons))
    except Exception:  # noqa: BLE001
        logger.warning("Evolution lesson extraction failed (non-blocking)")
    # --- MetaClaw bridge: convert high-severity lessons to skills ---
    try:
        _metaclaw_post_pipeline(config, results, lessons, run_id, run_dir)
    except Exception:  # noqa: BLE001
        logger.warning("MetaClaw post-pipeline hook failed (non-blocking)")
    # --- Package deliverables into a single folder ---
    try:
        deliverables_dir = _package_deliverables(run_dir, run_id, config)
        if deliverables_dir is not None:
            print(f"[{run_id}] Deliverables packaged → {deliverables_dir}")
    except Exception:  # noqa: BLE001
        logger.warning("Deliverables packaging failed (non-blocking)")
    return results


def _package_deliverables(
    run_dir: Path,
    run_id: str,
    config: RCConfig,
) -> Path | None:
    """Collect all final user-facing deliverables into a single ``deliverables/`` folder.

    Returns the deliverables directory path, or None if nothing was packaged.
Packaged artifacts (best-available version selected automatically): - paper_final.md — Final paper (Markdown) - paper.tex — Conference-ready LaTeX - references.bib — BibTeX bibliography - code/ — Experiment code package - verification_report.json — Citation verification report (if available) """ dest = run_dir / "deliverables" dest.mkdir(parents=True, exist_ok=True) packaged: list[str] = [] # --- 1. Final paper (Markdown) --- # Prefer verified version (stage 23) over base version (stage 22) paper_md = None for candidate in [ run_dir / "stage-23" / "paper_final_verified.md", run_dir / "stage-22" / "paper_final.md", ]: if candidate.exists() and candidate.stat().st_size > 0: paper_md = candidate break if paper_md is not None: shutil.copy2(paper_md, dest / "paper_final.md") packaged.append("paper_final.md") # --- 2. LaTeX paper --- # BUG-183: Stage 22's paper.tex has been sanitized (fabricated numbers # replaced with ---). Regenerating from Markdown would undo this because # the Markdown was never sanitized. Prefer Stage-22 paper.tex when a # sanitization report exists. Only regenerate from verified Markdown if # no sanitization was performed (i.e., the run was clean). 
    tex_regenerated = False
    _sanitization_report = run_dir / "stage-22" / "sanitization_report.json"
    _was_sanitized = _sanitization_report.exists()
    verified_md = run_dir / "stage-23" / "paper_final_verified.md"
    if (
        not _was_sanitized
        and paper_md is not None
        and paper_md == verified_md
        and verified_md.exists()
        and verified_md.stat().st_size > 0
    ):
        try:
            from researchclaw.templates import get_template, markdown_to_latex
            from researchclaw.pipeline.executor import _extract_paper_title

            tpl = get_template(config.export.target_conference)
            v_text = verified_md.read_text(encoding="utf-8")
            tex_content = markdown_to_latex(
                v_text,
                tpl,
                title=_extract_paper_title(v_text),
                authors=config.export.authors,
                bib_file=config.export.bib_file,
            )
            # IMP-17: Quality check — ensure regenerated LaTeX has
            # proper structure (abstract, multiple sections)
            _has_abstract = (
                "\\begin{abstract}" in tex_content
                and tex_content.split("\\begin{abstract}")[1]
                .split("\\end{abstract}")[0]
                .strip()
            )
            _section_count = tex_content.count("\\section{")
            if _has_abstract and _section_count >= 3:
                (dest / "paper.tex").write_text(tex_content, encoding="utf-8")
                packaged.append("paper.tex")
                tex_regenerated = True
                logger.info(
                    "Deliverables: regenerated paper.tex from verified markdown"
                )
            else:
                logger.warning(
                    "Regenerated paper.tex has poor structure "
                    "(abstract=%s, sections=%d) — using Stage 22 version",
                    bool(_has_abstract),
                    _section_count,
                )
        except Exception:  # noqa: BLE001
            logger.debug("paper.tex regeneration from verified md failed")
    elif _was_sanitized:
        logger.info(
            "Deliverables: using Stage 22 paper.tex (sanitized) — "
            "skipping markdown regeneration to preserve sanitization"
        )
    if not tex_regenerated:
        # Fall back to the Stage-22 LaTeX as-is.
        tex_src = run_dir / "stage-22" / "paper.tex"
        if tex_src.exists() and tex_src.stat().st_size > 0:
            shutil.copy2(tex_src, dest / "paper.tex")
            packaged.append("paper.tex")
    # --- 3. References (BibTeX) ---
    # Prefer verified bib (stage 23) over base bib (stage 22)
    bib_src = None
    for candidate in [
        run_dir / "stage-23" / "references_verified.bib",
        run_dir / "stage-22" / "references.bib",
    ]:
        if candidate.exists() and candidate.stat().st_size > 0:
            bib_src = candidate
            break
    if bib_src is not None:
        shutil.copy2(bib_src, dest / "references.bib")
        packaged.append("references.bib")
    # --- 4. Experiment code package ---
    code_src = run_dir / "stage-22" / "code"
    if code_src.is_dir():
        code_dest = dest / "code"
        if code_dest.exists():
            shutil.rmtree(code_dest)
        shutil.copytree(code_src, code_dest)
        packaged.append("code/")
    # --- 5. Verification report (optional) ---
    verify_src = run_dir / "stage-23" / "verification_report.json"
    if verify_src.exists() and verify_src.stat().st_size > 0:
        shutil.copy2(verify_src, dest / "verification_report.json")
        packaged.append("verification_report.json")
    # --- 5b. Sanitization report (degraded mode) ---
    san_src = run_dir / "stage-22" / "sanitization_report.json"
    if san_src.exists() and san_src.stat().st_size > 0:
        shutil.copy2(san_src, dest / "sanitization_report.json")
        packaged.append("sanitization_report.json")
    # --- 6. Charts (optional) ---
    charts_src = run_dir / "stage-22" / "charts"
    if charts_src.is_dir() and any(charts_src.iterdir()):
        charts_dest = dest / "charts"
        if charts_dest.exists():
            shutil.rmtree(charts_dest)
        shutil.copytree(charts_src, charts_dest)
        packaged.append("charts/")
    # --- 7. Conference style files (.sty, .bst) ---
    try:
        from researchclaw.templates import get_template

        tpl = get_template(config.export.target_conference)
        style_files = tpl.get_style_files()
        for sf in style_files:
            shutil.copy2(sf, dest / sf.name)
            packaged.append(sf.name)
        if style_files:
            logger.info(
                "Deliverables: bundled %d style files for %s",
                len(style_files),
                tpl.display_name,
            )
    except Exception:  # noqa: BLE001
        logger.debug("Style file bundling skipped (template lookup failed)")
    # --- 8. Verify & repair cite key coverage (IMP-12 + IMP-14) ---
    tex_path = dest / "paper.tex"
    bib_path = dest / "references.bib"
    if tex_path.exists() and bib_path.exists():
        try:
            tex_text = tex_path.read_text(encoding="utf-8")
            bib_text = bib_path.read_text(encoding="utf-8")
            import re as _re

            # IMP-15: Deduplicate .bib entries
            _seen_bib_keys: set[str] = set()
            _deduped_entries: list[str] = []
            for _bm in _re.finditer(
                r"(@\w+\{([^,]+),.*?\n\})", bib_text, _re.DOTALL
            ):
                _bkey = _bm.group(2).strip()
                if _bkey not in _seen_bib_keys:
                    _seen_bib_keys.add(_bkey)
                    _deduped_entries.append(_bm.group(1))
            if len(_deduped_entries) < len(
                list(_re.finditer(r"@\w+\{", bib_text))
            ):
                bib_text = "\n\n".join(_deduped_entries) + "\n"
                bib_path.write_text(bib_text, encoding="utf-8")
                logger.info(
                    "Deliverables: deduplicated .bib → %d entries",
                    len(_deduped_entries),
                )
            # Collect all cite keys from \cite{key1, key2}
            all_cite_keys: set[str] = set()
            for cm in _re.finditer(r"\\cite\{([^}]+)\}", tex_text):
                all_cite_keys.update(k.strip() for k in cm.group(1).split(","))
            bib_keys = set(_re.findall(r"@\w+\{([^,]+),", bib_text))
            missing = all_cite_keys - bib_keys
            # IMP-14: Strip orphaned \cite{key} from paper.tex
            if missing:
                logger.warning(
                    "Deliverables: stripping %d orphaned cite keys from "
                    "paper.tex: %s",
                    len(missing),
                    sorted(missing)[:10],
                )

                def _filter_cite(m: _re.Match[str]) -> str:
                    # Keep only cite keys that exist in the .bib file.
                    keys = [k.strip() for k in m.group(1).split(",")]
                    kept = [k for k in keys if k not in missing]
                    if not kept:
                        return ""
                    return "\\cite{" + ", ".join(kept) + "}"

                tex_text = _re.sub(r"\\cite\{([^}]+)\}", _filter_cite, tex_text)
                # Clean up whitespace artifacts: double spaces, space before period
                tex_text = _re.sub(r" +", " ", tex_text)
                tex_text = _re.sub(r" ([.,;:)])", r"\1", tex_text)
                tex_path.write_text(tex_text, encoding="utf-8")
                logger.info(
                    "Deliverables: paper.tex repaired — all remaining cite "
                    "keys verified"
                )
            else:
                logger.info(
                    "Deliverables: all %d cite keys verified in references.bib",
                    len(all_cite_keys),
                )
        except Exception:  # noqa: BLE001
            logger.debug("Cite key verification/repair skipped")
    # --- 9. IMP-18: Compile LaTeX to verify paper.tex ---
    if tex_path.exists() and bib_path.exists():
        try:
            from researchclaw.templates.compiler import compile_latex

            compile_result = compile_latex(tex_path, max_attempts=3, timeout=120)
            if compile_result.success:
                logger.info("IMP-18: paper.tex compiles successfully")
                # Keep the generated PDF
                # NOTE(review): pdf_path is assigned but unused — pdf_file
                # below performs the actual existence check.
                pdf_path = dest / tex_path.stem
                pdf_file = dest / (tex_path.stem + ".pdf")
                if pdf_file.exists():
                    packaged.append(f"{tex_path.stem}.pdf")
            else:
                logger.warning(
                    "IMP-18: paper.tex compilation failed after %d attempts: %s",
                    compile_result.attempts,
                    compile_result.errors[:3],
                )
                if compile_result.fixes_applied:
                    logger.info(
                        "IMP-18: Applied %d auto-fixes: %s",
                        len(compile_result.fixes_applied),
                        compile_result.fixes_applied,
                    )
        except Exception:  # noqa: BLE001
            logger.debug("IMP-18: LaTeX compilation skipped (non-blocking)")
    if not packaged:
        # Nothing to package — remove empty dir
        dest.rmdir()
        return None
    # --- Write manifest ---
    manifest = {
        "run_id": run_id,
        "target_conference": config.export.target_conference,
        "files": packaged,
        "generated": _utcnow_iso(),
        "notes": {
            "paper_final.md": "Final paper in Markdown format",
            "paper.tex": f"Conference-ready LaTeX ({config.export.target_conference})",
            "references.bib": "BibTeX bibliography (verified citations only)",
            "code/": "Experiment source code with requirements.txt",
            "verification_report.json": "Citation integrity & relevance verification",
            "charts/": "Result visualizations",
        },
    }
    (dest / "manifest.json").write_text(
        json.dumps(manifest, indent=2), encoding="utf-8"
    )
    logger.info(
        "Deliverables packaged: %s (%d items)",
        dest,
        len(packaged),
    )
    return dest


def _version_rollback_stages(
    run_dir: Path, rollback_target: Stage, attempt: int
) -> None:
    """Rename stage directories that will be overwritten by a PIVOT/REFINE.
For example, if rolling back to Stage 8 (attempt 2), renames: stage-08/ → stage-08_v1/ stage-09/ → stage-09_v1/ ... up to stage-15/ """ import shutil rollback_num = int(rollback_target) # Stages from rollback target up to RESEARCH_DECISION (15) will be rerun decision_num = int(Stage.RESEARCH_DECISION) for stage_num in range(rollback_num, decision_num + 1): stage_dir = run_dir / f"stage-{stage_num:02d}" if stage_dir.exists(): version_dir = run_dir / f"stage-{stage_num:02d}_v{attempt}" if version_dir.exists(): shutil.rmtree(version_dir) stage_dir.rename(version_dir) logger.debug( "Versioned %s → %s", stage_dir.name, version_dir.name ) def _consecutive_empty_metrics(run_dir: Path, pivot_count: int) -> bool: """R6-4: Check if the current and previous REFINE cycles both produced empty metrics.""" # Check the most recent experiment_summary.json (stage-14) and its versioned predecessor. # BUG-215: When stage-14/ doesn't exist (renamed to stage-14_v{N} without # promotion), fall back to the latest versioned directory as "current". 
    # (Tail of a helper defined above this chunk: decides whether BOTH the
    # current and previous pivot cycles produced no usable metrics.)
    current = run_dir / "stage-14" / "experiment_summary.json"
    if not current.exists():
        # Try the latest versioned directory
        # (stage-14_vN directories are created per pivot; scan newest-first)
        for _v in range(pivot_count + 1, 0, -1):
            alt = run_dir / f"stage-14_v{_v}" / "experiment_summary.json"
            if alt.exists():
                current = alt
                break
    prev = run_dir / f"stage-14_v{pivot_count}" / "experiment_summary.json"
    for path in (current, prev):
        if not path.exists():
            # Missing summary on either side -> cannot claim "no metrics twice"
            return False
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
            # Check all possible metric locations: a non-empty
            # metrics_summary dict OR best_run.metrics counts as real metrics.
            has_metrics = False
            ms = data.get("metrics_summary", {})
            if isinstance(ms, dict) and ms:
                has_metrics = True
            br = data.get("best_run", {})
            if isinstance(br, dict) and br.get("metrics"):
                has_metrics = True
            if has_metrics:
                return False  # At least one cycle had real metrics
        except (json.JSONDecodeError, OSError, AttributeError):
            # Unreadable/malformed summary -> treat conservatively as "has data"
            return False
    return True  # Both cycles had empty metrics


def _promote_best_stage14(run_dir: Path, config: RCConfig) -> None:
    """BUG-205: After forced PROCEED, promote the best stage-14 experiment.

    Scans all ``stage-14*`` directories, scores them by primary metric, and
    copies the best experiment_summary.json into ``stage-14/`` if the current
    ``stage-14/`` is not already the best.

    Side effects (in order):
      1. BUG-223: always writes ``experiment_summary_best.json`` (and, per
         BUG-225, ``analysis_best.md``) at the run root before any early
         return, so downstream stages can rely on their existence.
      2. If a versioned directory beats the unversioned ``stage-14/``, copies
         its summary, analysis, LaTeX table, figure plans, and ``charts/``
         directory into ``stage-14/`` (BUG-213).
    """
    import shutil

    metric_key = config.experiment.metric_key or "primary_metric"
    metric_dir = config.experiment.metric_direction or "maximize"
    candidates: list[tuple[float, Path]] = []
    for d in sorted(run_dir.glob("stage-14*")):
        summary_path = d / "experiment_summary.json"
        if not summary_path.exists():
            continue
        try:
            data = json.loads(summary_path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            continue
        ms = data.get("metrics_summary", {})
        pm_val: float | None = None
        # BUG-DA8-03: Exact match first, then substring fallback
        # (avoids "accuracy" matching "balanced_accuracy")
        if metric_key in ms:
            _v = ms[metric_key]
            # metrics_summary values may be plain numbers or {min,max,mean,count}
            try:
                pm_val = float(_v["mean"] if isinstance(_v, dict) else _v)
            except (TypeError, ValueError, KeyError):
                pass
        if pm_val is None:
            for k, v in ms.items():
                if metric_key in k:
                    try:
                        pm_val = float(v["mean"] if isinstance(v, dict) else v)
                    except (TypeError, ValueError, KeyError):
                        pass
                    # NOTE: stops at the first substring match even if the
                    # conversion failed (pm_val stays None in that case).
                    break
        if pm_val is not None:
            candidates.append((pm_val, d))
    if not candidates:
        return  # nothing to promote

    current_dir = run_dir / "stage-14"
    # Sort: best first (descending when maximizing, ascending when minimizing)
    candidates.sort(key=lambda x: x[0], reverse=(metric_dir == "maximize"))

    # BUG-226: Detect degenerate near-zero metrics (broken normalization or
    # collapsed training). When minimising, a value >1000x smaller than the
    # second-best almost certainly comes from a degenerate iteration.
    if metric_dir == "minimize" and len(candidates) > 1:
        _bv, _bd = candidates[0]
        _sv = candidates[1][0]
        if 0 < _bv < _sv * 1e-3:
            logger.warning(
                "BUG-226: Degenerate best value %.6g is >1000× smaller than "
                "second-best %.6g — skipping degenerate iteration %s",
                _bv, _sv, _bd.name,
            )
            candidates.pop(0)

    best_val, best_dir = candidates[0]

    # BUG-223: Always write canonical best summary at run root BEFORE any
    # early return, so downstream consumers (Stage 17, Stage 20, Stage 22,
    # VerifiedRegistry) always find experiment_summary_best.json.
    _best_src = best_dir / "experiment_summary.json"
    if _best_src.exists():
        shutil.copy2(_best_src, run_dir / "experiment_summary_best.json")
        logger.info(
            "BUG-223: Wrote experiment_summary_best.json from %s (%.4f)",
            best_dir.name, best_val,
        )
    # BUG-225: Also copy analysis.md from the best iteration so Stage 17
    # doesn't read stale analysis from a degenerate non-versioned stage-14.
    _best_analysis = best_dir / "analysis.md"
    if _best_analysis.exists():
        shutil.copy2(_best_analysis, run_dir / "analysis_best.md")

    if best_dir == current_dir:
        logger.info("BUG-205: stage-14/ already has the best result (%.4f)", best_val)
        return

    # Promote: copy best summary into stage-14/
    current_summary = current_dir / "experiment_summary.json"
    best_summary = best_dir / "experiment_summary.json"
    # BUG-213: Also promote when stage-14/ is missing or empty
    if best_summary.exists():
        current_dir.mkdir(parents=True, exist_ok=True)
        logger.warning(
            "BUG-205: Promoting %s (%.4f) over stage-14/",
            best_dir.name, best_val,
        )
        shutil.copy2(best_summary, current_summary)
        # Also copy charts, analysis, and figure plans if they exist
        for fname in [
            "analysis.md",
            "results_table.tex",
            "figure_plan.json",  # BUG-213: must travel with metrics
            "figure_plan_final.json",  # BUG-213: ditto
        ]:
            src = best_dir / fname
            if src.exists():
                shutil.copy2(src, current_dir / fname)
        # Copy charts directory (replace any stale one wholesale)
        best_charts = best_dir / "charts"
        current_charts = current_dir / "charts"
        if best_charts.is_dir():
            if current_charts.is_dir():
                shutil.rmtree(current_charts)
            shutil.copytree(best_charts, current_charts)


def _check_experiment_quality(
    run_dir: Path, pivot_count: int
) -> tuple[bool, str]:
    """Quality gate before forced PROCEED.

    Returns (ok, message). ok=False means experiment results have critical
    quality issues and the forced-PROCEED paper will likely be poor.
""" # BUG-DA8-18: Check experiment_summary_best.json first (repair-promoted) summary_path = run_dir / "experiment_summary_best.json" if not summary_path.exists(): summary_path = run_dir / "stage-14" / "experiment_summary.json" if not summary_path.exists(): for v in range(pivot_count, 0, -1): alt = run_dir / f"stage-14_v{v}" / "experiment_summary.json" if alt.exists(): summary_path = alt break if not summary_path.exists(): return False, "No experiment_summary.json found — no metrics produced" try: data = json.loads(summary_path.read_text(encoding="utf-8")) except (json.JSONDecodeError, OSError): return False, "experiment_summary.json is malformed" # Check 1: Are all metrics zero? ms = data.get("metrics_summary", {}) if isinstance(ms, dict): values: list[float] = [] for k, v in ms.items(): if isinstance(v, (int, float)): values.append(float(v)) # BUG-212: metrics_summary values are often dicts {min,max,mean,count} elif isinstance(v, dict) and "mean" in v: _mv = v["mean"] if isinstance(_mv, (int, float)): values.append(float(_mv)) if values and all(v == 0.0 for v in values): return False, "All experiment metrics are zero — experiments likely failed" # Check 2: Zero variance across conditions (R13-1) # Look for ablation_warnings or condition comparison data ablation_warnings = data.get("ablation_warnings", []) # BUG-212: Key is "condition_summaries", not "conditions" conditions = data.get( "condition_summaries", data.get("condition_metrics", {}) ) if isinstance(conditions, dict) and len(conditions) >= 2: primary_values: list[float] = [] for cond_name, cond_data in conditions.items(): if isinstance(cond_data, dict): # BUG-212: Primary metric lives inside cond_data["metrics"] _metrics = cond_data.get("metrics", cond_data) pm = _metrics.get( "primary_metric", _metrics.get("primary_metric_mean"), ) if isinstance(pm, (int, float)): primary_values.append(float(pm)) if len(primary_values) >= 2 and len(set(primary_values)) == 1: return False, ( f"All {len(primary_values)} 
conditions have identical primary_metric " f"({primary_values[0]}) — condition implementations are likely broken" ) # Check 3: Too many ablation warnings if isinstance(ablation_warnings, list) and len(ablation_warnings) >= 3: return False, ( f"{len(ablation_warnings)} ablation warnings — most conditions " f"produce identical results" ) # Check 4: Analysis quality score (if available) quality = data.get("analysis_quality", data.get("quality_score")) if isinstance(quality, (int, float)) and quality < 3.0: return False, f"Analysis quality score {quality}/10 — below minimum threshold" return True, "Quality checks passed" def _read_pivot_count(run_dir: Path) -> int: """Read how many PIVOT/REFINE decisions have been made so far.""" history_path = run_dir / "decision_history.json" if not history_path.exists(): return 0 try: data = json.loads(history_path.read_text(encoding="utf-8")) if isinstance(data, list): return len(data) except (json.JSONDecodeError, OSError): pass return 0 def _record_decision_history( run_dir: Path, decision: str, rollback_target: Stage, attempt: int ) -> None: """Append a decision event to the history log.""" history_path = run_dir / "decision_history.json" history: list[dict[str, object]] = [] if history_path.exists(): try: data = json.loads(history_path.read_text(encoding="utf-8")) if isinstance(data, list): history = data except (json.JSONDecodeError, OSError): pass history.append({ "decision": decision, "rollback_target": rollback_target.name, "rollback_stage_num": int(rollback_target), "attempt": attempt, "timestamp": _utcnow_iso(), }) history_path.write_text( json.dumps(history, indent=2), encoding="utf-8" ) logger = logging.getLogger(__name__) def _read_quality_score(run_dir: Path) -> float | None: """Extract quality score from the most recent quality_report.json.""" report_path = run_dir / "stage-20" / "quality_report.json" if not report_path.exists(): return None try: data = json.loads(report_path.read_text(encoding="utf-8")) if 
isinstance(data, dict): # Try common keys: score_1_to_10, score, quality_score for key in ("score_1_to_10", "score", "quality_score", "overall_score"): if key in data: return float(data[key]) except (json.JSONDecodeError, ValueError, TypeError): pass return None def _write_iteration_context( run_dir: Path, iteration: int, reviews: str, quality_score: float | None ) -> None: """Write iteration feedback file so next round can read it.""" ctx = { "iteration": iteration, "quality_score": quality_score, "reviews_excerpt": reviews[:3000] if reviews else "", "generated": _utcnow_iso(), } (run_dir / "iteration_context.json").write_text( json.dumps(ctx, indent=2), encoding="utf-8" ) def execute_iterative_pipeline( *, run_dir: Path, run_id: str, config: RCConfig, adapters: AdapterBundle, auto_approve_gates: bool = False, kb_root: Path | None = None, max_iterations: int = 3, quality_threshold: float = 7.0, convergence_rounds: int = 2, ) -> dict[str, object]: """Run the full pipeline with iterative quality improvement. After the first full pass (stages 1-22), if the quality gate score is below *quality_threshold*, re-run stages 16-22 (paper writing + finalization) with review feedback injected. Stop when: - Score >= quality_threshold, OR - Score hasn't improved for *convergence_rounds* consecutive iterations, OR - *max_iterations* reached. Returns a summary dict with iteration history. 
""" iteration_scores: list[float | None] = [] all_results: list[list[StageResult]] = [] # --- First full pass --- logger.info("Iteration 1/%d: running full pipeline (stages 1-22)", max_iterations) results = execute_pipeline( run_dir=run_dir, run_id=f"{run_id}-iter1", config=config, adapters=adapters, auto_approve_gates=auto_approve_gates, kb_root=kb_root, ) all_results.append(results) score = _read_quality_score(run_dir) iteration_scores.append(score) logger.info("Iteration 1 score: %s", score) # --- Iterative improvement --- for iteration in range(2, max_iterations + 1): # Check if we've met quality threshold if score is not None and score >= quality_threshold: logger.info( "Quality threshold %.1f met (score=%.1f). Stopping.", quality_threshold, score, ) break # Check convergence (score hasn't improved) if len(iteration_scores) >= convergence_rounds: recent = iteration_scores[-convergence_rounds:] if all(s is not None for s in recent): recent_scores = [float(s) for s in recent if s is not None] if max(recent_scores) - min(recent_scores) < 0.5: logger.info( "Convergence detected: scores %s unchanged for %d rounds. 
Stopping.", recent, convergence_rounds, ) break # Write iteration context with feedback from reviews reviews_text = "" reviews_path = run_dir / "stage-18" / "reviews.md" if reviews_path.exists(): reviews_text = reviews_path.read_text(encoding="utf-8") _write_iteration_context(run_dir, iteration, reviews_text, score) # Re-run from PAPER_OUTLINE (stage 16) through EXPORT_PUBLISH (stage 22) logger.info( "Iteration %d/%d: re-running stages 16-22 with feedback", iteration, max_iterations, ) results = execute_pipeline( run_dir=run_dir, run_id=f"{run_id}-iter{iteration}", config=config, adapters=adapters, from_stage=Stage.PAPER_OUTLINE, auto_approve_gates=auto_approve_gates, kb_root=kb_root, ) all_results.append(results) score = _read_quality_score(run_dir) iteration_scores.append(score) logger.info("Iteration %d score: %s", iteration, score) # --- Build iterative summary --- converged = False if len(iteration_scores) >= convergence_rounds: recent_window = iteration_scores[-convergence_rounds:] if all(s is not None for s in recent_window): recent_scores = [float(s) for s in recent_window if s is not None] converged = max(recent_scores) - min(recent_scores) < 0.5 summary: dict[str, object] = { "run_id": run_id, "total_iterations": len(iteration_scores), "iteration_scores": iteration_scores, "quality_threshold": quality_threshold, "converged": converged, "final_score": iteration_scores[-1] if iteration_scores else None, "met_threshold": score is not None and score >= quality_threshold, "stages_per_iteration": [len(r) for r in all_results], "generated": _utcnow_iso(), } (run_dir / "iteration_summary.json").write_text( json.dumps(summary, indent=2, default=str), encoding="utf-8" ) # --- Package deliverables into a single folder --- try: deliverables_dir = _package_deliverables(run_dir, run_id, config) if deliverables_dir is not None: print(f"[{run_id}] Deliverables packaged → {deliverables_dir}") except Exception: # noqa: BLE001 logger.warning("Deliverables packaging failed 
(non-blocking)") return summary def _metaclaw_post_pipeline( config: RCConfig, results: list[StageResult], lessons: list[object], run_id: str, run_dir: Path, ) -> None: """MetaClaw bridge: post-pipeline hook. 1. Convert high-severity lessons into MetaClaw skills. 2. Record skill effectiveness feedback. 3. Signal session end to MetaClaw proxy. """ bridge = getattr(config, "metaclaw_bridge", None) if not bridge or not getattr(bridge, "enabled", False): return from researchclaw.llm.client import LLMClient # 1. Lesson-to-skill conversion l2s = getattr(bridge, "lesson_to_skill", None) if l2s and getattr(l2s, "enabled", False) and lessons: try: from researchclaw.metaclaw_bridge.lesson_to_skill import ( convert_lessons_to_skills, ) min_sev = getattr(l2s, "min_severity", "warning") llm = LLMClient.from_rc_config(config) new_skills = convert_lessons_to_skills( lessons, llm, getattr(bridge, "skills_dir", "~/.metaclaw/skills"), min_severity=min_sev, max_skills=getattr(l2s, "max_skills_per_run", 3), ) if new_skills: logger.info( "MetaClaw: generated %d new skills from lessons: %s", len(new_skills), new_skills, ) except Exception: # noqa: BLE001 logger.warning("MetaClaw lesson-to-skill conversion failed", exc_info=True) # 2. 
Skill effectiveness feedback try: from researchclaw.metaclaw_bridge.skill_feedback import ( SkillFeedbackStore, record_stage_skills, ) from researchclaw.metaclaw_bridge.stage_skill_map import get_stage_config feedback_store = SkillFeedbackStore(run_dir / "evolution" / "skill_effectiveness.jsonl") for result in results: stage_num = int(getattr(result, "stage", 0)) stage_name = { 1: "topic_init", 2: "problem_decompose", 3: "search_strategy", 4: "literature_collect", 5: "literature_screen", 6: "knowledge_extract", 7: "synthesis", 8: "hypothesis_gen", 9: "experiment_design", 10: "code_generation", 11: "resource_planning", 12: "experiment_run", 13: "iterative_refine", 14: "result_analysis", 15: "research_decision", 16: "paper_outline", 17: "paper_draft", 18: "peer_review", 19: "paper_revision", 20: "quality_gate", 21: "knowledge_archive", 22: "export_publish", 23: "citation_verify", }.get(stage_num, "") if not stage_name: continue stage_config = get_stage_config(stage_name) active_skills = stage_config.get("skills", []) status = str(getattr(result, "status", "")) success = "done" in status.lower() if active_skills: record_stage_skills( feedback_store, stage_name, run_id, success, active_skills, ) except Exception: # noqa: BLE001 logger.warning("MetaClaw skill feedback recording failed") # 3. 
Signal session end (fire-and-forget) try: from researchclaw.metaclaw_bridge.session import MetaClawSession import json as _json import urllib.request as _urllib_req session = MetaClawSession(run_id) end_headers = session.end() # Send a minimal request to signal session end proxy_url = getattr(bridge, "proxy_url", "http://localhost:30000") url = f"{proxy_url.rstrip('/')}/v1/chat/completions" body = _json.dumps({ "model": "session-end", "messages": [{"role": "user", "content": "session complete"}], "max_tokens": 1, }).encode("utf-8") headers = {"Content-Type": "application/json"} headers.update(end_headers) req = _urllib_req.Request(url, data=body, headers=headers) try: _urllib_req.urlopen(req, timeout=5) except Exception: # noqa: BLE001 pass # Best-effort signal except Exception: # noqa: BLE001 pass ================================================ FILE: researchclaw/pipeline/stage_impls/__init__.py ================================================ """Stage implementation modules for the research pipeline executor.""" ================================================ FILE: researchclaw/pipeline/stage_impls/_analysis.py ================================================ """Stages 14-15: Result analysis and research decision.""" from __future__ import annotations import json import logging import re from pathlib import Path from typing import Any from researchclaw.adapters import AdapterBundle from researchclaw.config import RCConfig from researchclaw.llm.client import LLMClient from researchclaw.pipeline._domain import _detect_domain, _is_ml_domain from researchclaw.pipeline._helpers import ( StageResult, _build_context_preamble, _chat_with_prompt, _collect_experiment_results, _collect_json_context, _get_evolution_overlay, _multi_perspective_generate, _read_prior_artifact, _safe_json_loads, _synthesize_perspectives, _utcnow_iso, ) from researchclaw.pipeline.stages import Stage, StageStatus from researchclaw.prompts import PromptManager logger = logging.getLogger(__name__) 
def _execute_result_analysis( stage_dir: Path, run_dir: Path, config: RCConfig, adapters: AdapterBundle, *, llm: LLMClient | None = None, prompts: PromptManager | None = None, ) -> StageResult: # --- Collect experiment data --- exp_data = _collect_experiment_results( run_dir, metric_key=config.experiment.metric_key, metric_direction=config.experiment.metric_direction, ) runs_dir = _read_prior_artifact(run_dir, "runs/") or "" context = "" if runs_dir: context = _collect_json_context(Path(runs_dir), max_files=30) # --- R13-1: Merge Stage 13 (ITERATIVE_REFINE) results if available --- # Stage 13 stores richer per-condition metrics in refinement_log.json # that _collect_experiment_results() misses (it only scans runs/ dirs). _refine_log_text = _read_prior_artifact(run_dir, "refinement_log.json") if _refine_log_text: try: _refine_data = json.loads(_refine_log_text) _best_iter = None _best_ver = _refine_data.get("best_version", "") def _get_best_sandbox(it: dict) -> dict: """BUG-181: Metrics may be in sandbox or sandbox_after_fix.""" sbx = it.get("sandbox", {}) if sbx.get("metrics"): return sbx sbx_fix = it.get("sandbox_after_fix", {}) if sbx_fix.get("metrics"): return sbx_fix return sbx for _it in _refine_data.get("iterations", []): _sbx = _get_best_sandbox(_it) _it_metrics = _sbx.get("metrics", {}) if _it.get("version_dir", "") == _best_ver and _it_metrics: _best_iter = _it break # If no version match, take the first iteration with metrics if _best_iter is None: for _it in _refine_data.get("iterations", []): _sbx = _get_best_sandbox(_it) if _sbx.get("metrics"): _best_iter = _it break if _best_iter is not None: _sbx = _get_best_sandbox(_best_iter) _refine_metrics = _sbx.get("metrics", {}) # BUG-165 fix: Prefer Stage 13 refinement data when it is # actually better. The old `or True` unconditionally # replaced existing data, causing catastrophic regressions # (BUG-205: v1=78.93% destroyed by v3=8.65%). 
_refine_is_better = not exp_data["metrics_summary"] if not _refine_is_better and _refine_metrics: # Compare primary_metric values to decide _mkey = config.experiment.metric_key or "primary_metric" _mdir = config.experiment.metric_direction or "maximize" _existing_pm: float | None = None _refine_pm: float | None = None # BUG-214: Use exact match first, then substring fallback # to avoid "accuracy" matching "balanced_accuracy". _ms_items = list((exp_data.get("metrics_summary") or {}).items()) for _k, _v in _ms_items: if _k == _mkey: try: _existing_pm = float(_v["mean"] if isinstance(_v, dict) else _v) except (TypeError, ValueError, KeyError): pass break else: for _k, _v in _ms_items: if _mkey in _k: try: _existing_pm = float(_v["mean"] if isinstance(_v, dict) else _v) except (TypeError, ValueError, KeyError): pass break _refine_items = list(_refine_metrics.items()) for _k, _v in _refine_items: if _k == _mkey: try: _refine_pm = float(_v) except (TypeError, ValueError): pass break else: for _k, _v in _refine_items: if _mkey in _k: try: _refine_pm = float(_v) except (TypeError, ValueError): pass break if _existing_pm is None: _refine_is_better = True # no existing data elif _refine_pm is not None: if _mdir == "maximize": _refine_is_better = _refine_pm > _existing_pm else: _refine_is_better = _refine_pm < _existing_pm logger.info( "Stage 14: Refine metric comparison: existing=%s, refine=%s, " "direction=%s → refine_is_better=%s", _existing_pm, _refine_pm, _mdir, _refine_is_better, ) if _refine_metrics and _refine_is_better: # Refinement has richer data — rebuild metrics_summary from it _new_summary: dict[str, dict[str, float | None]] = {} for _mk, _mv in _refine_metrics.items(): try: _fv = float(_mv) _new_summary[_mk] = { "min": round(_fv, 6), "max": round(_fv, 6), "mean": round(_fv, 6), "count": 1, } except (ValueError, TypeError): pass if _new_summary: exp_data["metrics_summary"] = _new_summary # Also update best_run with refinement data exp_data["best_run"] = { 
"run_id": "iterative-refine-best", "task_id": "sandbox-main", "status": "completed", "metrics": { k: v for k, v in _refine_metrics.items() }, "elapsed_sec": _sbx.get("elapsed_sec", 0), "stdout": "", # omit for brevity "stderr": _sbx.get("stderr", ""), "timed_out": _sbx.get("timed_out", False), } # Rebuild latex table _ltx = [ r"\begin{table}[h]", r"\centering", r"\caption{Experiment Results (Best Refinement Iteration)}", r"\begin{tabular}{lrrrr}", r"\hline", r"Metric & Min & Max & Mean & N \\", r"\hline", ] for _col in sorted(_new_summary.keys()): _s = _new_summary[_col] _ltx.append( f"{_col} & {_s['min']:.4f} & {_s['max']:.4f} " f"& {_s['mean']:.4f} & {_s['count']} \\\\" ) _ltx.extend([r"\hline", r"\end{tabular}", r"\end{table}"]) exp_data["latex_table"] = "\n".join(_ltx) # Count unique conditions (keys without 'seed' and not ending in _mean/_std) _conditions = { k for k in _refine_metrics if "seed" not in k and not k.endswith("_std") } exp_data["runs"] = [exp_data["best_run"]] # Store condition count for accurate reporting exp_data["best_run"]["condition_count"] = len(_conditions) if not context: context = json.dumps( {"refinement_best_metrics": _refine_metrics}, indent=2, default=str, ) _bm_val = _refine_data.get("best_metric") logger.info( "R13-1: Merged %d metrics from refinement_log (best_metric=%.4f)", len(_refine_metrics), float(_bm_val) if isinstance(_bm_val, (int, float)) else 0.0, ) except (json.JSONDecodeError, OSError, KeyError): logger.warning("R13-1: Failed to parse refinement_log.json, using Stage 12 data") # --- R19-2: Extract PAIRED comparisons from refinement stdout --- from researchclaw.experiment.sandbox import extract_paired_comparisons as _extract_paired _all_paired: list[dict[str, object]] = [] # First: from _collect_experiment_results (Stage 12 runs/) if exp_data.get("paired_comparisons"): _all_paired.extend(exp_data["paired_comparisons"]) # Second: from refinement_log iterations (Stage 13) if _refine_log_text: try: _rl = 
json.loads(_refine_log_text) for _it in _rl.get("iterations", []): for _sbx_key in ("sandbox", "sandbox_after_fix"): _sbx_stdout = (_it.get(_sbx_key) or {}).get("stdout", "") if _sbx_stdout: _all_paired.extend(_extract_paired(_sbx_stdout)) except (json.JSONDecodeError, OSError): pass # --- R19-3: Build structured condition_summaries from metrics --- _condition_summaries: dict[str, dict[str, Any]] = {} _ms = exp_data.get("metrics_summary", {}) _best_metrics = {} if exp_data.get("best_run") and isinstance(exp_data["best_run"], dict): _best_metrics = exp_data["best_run"].get("metrics", {}) # Group metrics by condition prefix (e.g., "ppo/primary_metric" → condition "ppo") for _mk, _mv in _best_metrics.items(): parts = _mk.split("/") if len(parts) >= 2: cond = parts[0] metric_name = parts[-1] if cond not in _condition_summaries: _condition_summaries[cond] = {"metrics": {}} try: _condition_summaries[cond]["metrics"][metric_name] = float(_mv) except (ValueError, TypeError): pass # BUG-09 fix: If no condition summaries were built (metrics don't use # condition/metric format), try to extract from metrics_summary or # structured_results so FigureAgent has data to work with. if not _condition_summaries and _ms: # Try to parse condition data from metrics_summary keys for _mk, _mv in _ms.items(): parts = _mk.split("/") if len(parts) >= 2: cond = parts[0] metric_name = parts[-1] if cond not in _condition_summaries: _condition_summaries[cond] = {"metrics": {}} try: # BUG-182: metrics_summary values are dicts {min,max,mean,count}, # not plain floats. Extract the mean value. 
if isinstance(_mv, dict): _val = float(_mv["mean"]) if "mean" in _mv else None else: _val = float(_mv) if _val is not None: _condition_summaries[cond]["metrics"][metric_name] = _val except (ValueError, TypeError, KeyError): pass if not _condition_summaries: # Last resort: build from structured_results condition keys _sr = exp_data.get("structured_results", {}) if isinstance(_sr, dict): for _sk, _sv in _sr.items(): if isinstance(_sv, dict) and _sk not in ("metadata", "config"): _condition_summaries[_sk] = {"metrics": {}} for _smk, _smv in _sv.items(): try: _condition_summaries[_sk]["metrics"][_smk] = float(_smv) except (ValueError, TypeError): pass # R33: Build per-seed data structure (needed for CIs and paired tests below) _seed_data: dict[str, dict[int, float]] = {} # {condition: {seed: value}} for _mk, _mv in _best_metrics.items(): parts = _mk.split("/") # Pattern: condition/regime/seed_id/primary_metric if len(parts) >= 4 and parts[-1] == config.experiment.metric_key: cond = parts[0] try: seed_id = int(parts[2]) val = float(_mv) _seed_data.setdefault(cond, {})[seed_id] = val except (ValueError, TypeError): pass # Enrich condition summaries with seed counts, success rates, and CIs for _ck, _cv in _condition_summaries.items(): # Look for success_rate in metrics sr_key = f"{_ck}/success_rate" if sr_key in _best_metrics: try: _cv["success_rate"] = float(_best_metrics[sr_key]) except (ValueError, TypeError): pass # Count seed-level entries to estimate n_seeds _seed_count = 0 for _mk in _best_metrics: if _mk.startswith(f"{_ck}/") and "seed" in _mk.lower(): _seed_count += 1 if _seed_count > 0: _cv["n_seed_metrics"] = _seed_count # R33: Compute mean ± std and bootstrap 95% CI from per-seed data if _ck in _seed_data and len(_seed_data[_ck]) >= 3: _vals = list(_seed_data[_ck].values()) import statistics as _stats_mod _mean = _stats_mod.mean(_vals) _std = _stats_mod.stdev(_vals) _cv["metrics"][f"{config.experiment.metric_key}_mean"] = round(_mean, 6) 
_cv["metrics"][f"{config.experiment.metric_key}_std"] = round(_std, 6) _cv["n_seeds"] = len(_vals) # Bootstrap 95% CI (use local RNG to avoid corrupting global state) import random as _rng_mod _rng_local = _rng_mod.Random(42) _boot_means = [] for _ in range(1000): _sample = [_rng_local.choice(_vals) for _ in range(len(_vals))] _boot_means.append(_stats_mod.mean(_sample)) _boot_means.sort() _ci_low = round(_boot_means[int(0.025 * len(_boot_means))], 6) _ci_high = round(_boot_means[int(0.975 * len(_boot_means))], 6) # IMP-16: Sanity check — CI must contain the mean if _ci_low > _mean or _ci_high < _mean: logger.warning( "Bootstrap CI [%.4f, %.4f] does not contain mean %.4f " "for condition %s — replacing CI with mean ± 1.96*SE", _ci_low, _ci_high, _mean, _ck, ) _se = _std / (len(_vals) ** 0.5) _ci_low = round(_mean - 1.96 * _se, 6) _ci_high = round(_mean + 1.96 * _se, 6) _cv["ci95_low"] = _ci_low _cv["ci95_high"] = _ci_high # Count totals _total_conditions = len(_condition_summaries) if _condition_summaries else None _total_metrics = len(_best_metrics) if _best_metrics else None # --- R33: Pipeline-level paired computation as fallback --- # If the experiment code's PAIRED lines are sparse or suspicious (e.g., # all identical t-stats), compute fresh paired tests from per-seed data. 
# (_seed_data was built above before condition summary enrichment) if len(_seed_data) >= 2: # Find common seeds across conditions _all_seeds_sets = [set(v.keys()) for v in _seed_data.values()] _common_seeds = set.intersection(*_all_seeds_sets) if _all_seeds_sets else set() if len(_common_seeds) >= 3: _cond_names_sorted = sorted(_seed_data.keys()) _pipeline_paired: list[dict[str, object]] = [] # Compare each condition against the first baseline (alphabetically) _baseline_cond = _cond_names_sorted[0] for _other_cond in _cond_names_sorted[1:]: _diffs = [] for _sid in sorted(_common_seeds): _diffs.append( _seed_data[_other_cond][_sid] - _seed_data[_baseline_cond][_sid] ) if _diffs: import statistics _n = len(_diffs) _mean_d = statistics.mean(_diffs) _std_d = statistics.stdev(_diffs) if _n > 1 else 0.0 _t = (_mean_d / (_std_d / (_n ** 0.5))) if _std_d > 0 else 0.0 _df = _n - 1 # Two-tailed p-value using t-distribution import math try: from scipy.stats import t as _t_dist _p = float(2 * _t_dist.sf(abs(_t), _df)) except ImportError: _p = 2 * (1 - 0.5 * (1 + math.erf(abs(_t) / (2 ** 0.5)))) if _df < 30: _p = min(1.0, _p * (1 + 2.5 / max(_df, 1))) _pipeline_paired.append({ "method": _other_cond, "baseline": _baseline_cond, "mean_diff": round(_mean_d, 6), "std_diff": round(_std_d, 6), "t_stat": round(_t, 4), "p_value": round(_p, 6), "n_seeds": _n, "source": "pipeline_computed", }) # Use pipeline-computed if experiment code's are suspicious _exp_t_stats = {round(p.get("t_stat", 0), 4) for p in _all_paired} _all_identical = len(_exp_t_stats) <= 1 and len(_all_paired) > 1 if _pipeline_paired and (_all_identical or len(_all_paired) < len(_pipeline_paired)): logger.info( "R33: Using %d pipeline-computed paired tests (experiment code had %d, identical=%s)", len(_pipeline_paired), len(_all_paired), _all_identical, ) _all_paired = _pipeline_paired # --- P8: Detect identical conditions (broken ablations) --- _ablation_warnings: list[str] = [] if _condition_summaries and 
len(_condition_summaries) >= 2: _cond_names = sorted(_condition_summaries.keys()) for _i in range(len(_cond_names)): for _j in range(_i + 1, len(_cond_names)): _c1, _c2 = _cond_names[_i], _cond_names[_j] _s1_raw = _condition_summaries[_c1] _s2_raw = _condition_summaries[_c2] # BUG-133 fix: compare inner metrics dicts, not top-level keys _s1_m = _s1_raw.get("metrics", {}) if isinstance(_s1_raw, dict) else {} _s2_m = _s2_raw.get("metrics", {}) if isinstance(_s2_raw, dict) else {} if not isinstance(_s1_m, dict): _s1_m = {} if not isinstance(_s2_m, dict): _s2_m = {} _shared_keys = set(_s1_m.keys()) & set(_s2_m.keys()) if not _shared_keys: continue _all_equal = True for _sk in _shared_keys: _v1 = _s1_m[_sk] _v2 = _s2_m[_sk] if _v1 != _v2: _all_equal = False break if _all_equal and _shared_keys: _warn = ( f"ABLATION FAILURE: Conditions '{_c1}' and '{_c2}' produce " f"identical outputs across all {len(_shared_keys)} metrics. " f"The ablation is invalid — the differentiating parameter " f"is likely not used in the code." ) _ablation_warnings.append(_warn) logger.warning("P8: %s", _warn) elif _shared_keys: # R5-BUG-03: Also flag near-identical conditions (< 1% relative diff) _near_identical = True for _sk in _shared_keys: _v1 = _s1_m[_sk] _v2 = _s2_m[_sk] try: _v1f, _v2f = float(_v1), float(_v2) _denom = max(abs(_v1f), abs(_v2f), 1e-12) if abs(_v1f - _v2f) / _denom > 0.01: _near_identical = False break except (TypeError, ValueError): _near_identical = False break if _near_identical: _warn = ( f"ABLATION WARNING: Conditions '{_c1}' and '{_c2}' produce " f"near-identical outputs (<1% relative difference) across " f"all {len(_shared_keys)} metrics. The ablation may be trivial." 
) _ablation_warnings.append(_warn) logger.warning("P8: %s", _warn) # --- Improvement B: Validate seed counts --- _seed_insufficiency_warnings: list[str] = [] for _sc_name, _sc_seeds in _seed_data.items(): _n_seeds = len(_sc_seeds) if 0 < _n_seeds < 3: _warn = ( f"SEED_INSUFFICIENCY: Condition '{_sc_name}' has only " f"{_n_seeds} seed(s) (minimum 3 required for statistical validity)" ) _seed_insufficiency_warnings.append(_warn) logger.warning("B: %s", _warn) # --- Write structured experiment summary --- summary_payload = { "metrics_summary": exp_data["metrics_summary"], "total_runs": len(exp_data["runs"]), "best_run": exp_data["best_run"], "latex_table": exp_data["latex_table"], "generated": _utcnow_iso(), } if _seed_insufficiency_warnings: summary_payload["seed_insufficiency_warnings"] = _seed_insufficiency_warnings # R13-1: Detect zero-variance across conditions (all conditions identical primary metric) if _condition_summaries and len(_condition_summaries) >= 2: _primary_vals = [] for _cs in _condition_summaries.values(): if isinstance(_cs, dict): # Try 'metrics' dict first (actual structure), then 'primary_metric' fallback _metrics = _cs.get("metrics", {}) if isinstance(_metrics, dict) and _metrics: _pv_candidate = next(iter(_metrics.values()), None) if isinstance(_pv_candidate, dict): _pv_candidate = _pv_candidate.get("mean") if isinstance(_pv_candidate, (int, float)): _primary_vals.append(_pv_candidate) continue _pm = _cs.get("primary_metric", {}) _pv = _pm.get("mean") if isinstance(_pm, dict) else _pm if isinstance(_pv, (int, float)): _primary_vals.append(_pv) if len(_primary_vals) >= 2 and len(set(_primary_vals)) == 1: _zv_warn = ( f"ZERO VARIANCE: All {len(_primary_vals)} conditions have " f"identical primary_metric ({_primary_vals[0]}). " f"Experiment condition wiring is likely broken." 
) _ablation_warnings.append(_zv_warn) logger.warning("R13-1: %s", _zv_warn) if _ablation_warnings: summary_payload["ablation_warnings"] = _ablation_warnings if _all_paired: summary_payload["paired_comparisons"] = _all_paired if _condition_summaries: summary_payload["condition_summaries"] = _condition_summaries summary_payload["condition_metrics"] = _condition_summaries # alias for quality gate summary_payload["total_conditions"] = _total_conditions if _total_metrics: summary_payload["total_metric_keys"] = _total_metrics (stage_dir / "experiment_summary.json").write_text( json.dumps(summary_payload, indent=2, default=str), encoding="utf-8" ) if exp_data["latex_table"]: (stage_dir / "results_table.tex").write_text( exp_data["latex_table"], encoding="utf-8" ) # --- Build data-augmented prompt --- preamble = _build_context_preamble( config, run_dir, include_goal=True, include_hypotheses=True ) data_context = "" if exp_data["metrics_summary"]: lines = ["\n## Quantitative Results"] for mk, mv in exp_data["metrics_summary"].items(): if isinstance(mv, dict): lines.append( f"- {mk}: mean={mv.get('mean', '?')}, min={mv.get('min', '?')}, " f"max={mv.get('max', '?')}, n={mv.get('count', '?')}" ) data_context = "\n".join(lines) # Append structured results if available if exp_data.get("structured_results"): structured_text = json.dumps( exp_data["structured_results"], indent=2, default=str ) # Truncate to avoid blowing up context if len(structured_text) > 6000: structured_text = structured_text[:6000] + "\n... (truncated)" data_context += ( f"\n\n## Structured Experiment Results (from results.json)\n" f"```json\n{structured_text}\n```" ) # P8: Inject ablation warnings into data context if _ablation_warnings: data_context += "\n\nCRITICAL ABLATION WARNINGS:\n" for _aw in _ablation_warnings: data_context += f"- {_aw}\n" data_context += ( "\nYou MUST address these in your analysis. 
Identical conditions " "mean the ablation design is broken and the comparison is meaningless.\n" ) if llm is not None: _pm = prompts or PromptManager() from researchclaw.prompts import DEBATE_ROLES_ANALYSIS # noqa: PLC0415 # --- Multi-perspective debate --- perspectives_dir = stage_dir / "perspectives" variables = { "preamble": preamble, "data_context": data_context, "context": context, } perspectives = _multi_perspective_generate( llm, DEBATE_ROLES_ANALYSIS, variables, perspectives_dir ) # --- Synthesize into unified analysis --- analysis = _synthesize_perspectives( llm, perspectives, "analysis_synthesize", _pm ) else: # Template with real data if available ms = exp_data["metrics_summary"] metrics_block = "" if ms: for mk, mv in ms.items(): if isinstance(mv, dict): metrics_block += ( f"- **{mk}**: mean={mv.get('mean')}, " f"min={mv.get('min')}, max={mv.get('max')}, n={mv.get('count')}\n" ) else: metrics_block = f"- Primary metric key: `{config.experiment.metric_key}`\n- No quantitative data yet.\n" analysis = f"""# Result Analysis ## Metrics Summary {metrics_block} ## Comparative Findings - Proposed approach results from {len(exp_data["runs"])} run(s) collected. ## Statistical Checks - Recommend confidence interval and seed-wise variance reporting. ## Limitations - Limited runs and synthetic constraints. ## Conclusion - Proceed to decision stage with moderate confidence. 
Generated: {_utcnow_iso()} """ (stage_dir / "analysis.md").write_text(analysis, encoding="utf-8") artifacts = ["analysis.md", "experiment_summary.json"] if (stage_dir / "results_table.tex").exists(): artifacts.append("results_table.tex") # IMP-6 + FA: Generate charts early (Stage 14) so paper draft can reference them # Try FigureAgent first (multi-agent intelligent charts), fall back to visualize.py _figure_plan_saved = False if config.experiment.figure_agent.enabled and llm is not None: try: from researchclaw.agents.figure_agent import FigureOrchestrator from researchclaw.agents.figure_agent.orchestrator import FigureAgentConfig as _FACfg _fa_cfg = _FACfg( enabled=True, min_figures=config.experiment.figure_agent.min_figures, max_figures=config.experiment.figure_agent.max_figures, max_iterations=config.experiment.figure_agent.max_iterations, render_timeout_sec=config.experiment.figure_agent.render_timeout_sec, use_docker=config.experiment.figure_agent.use_docker, docker_image=config.experiment.figure_agent.docker_image, output_format=config.experiment.figure_agent.output_format, gemini_api_key=config.experiment.figure_agent.gemini_api_key, gemini_model=config.experiment.figure_agent.gemini_model, nano_banana_enabled=config.experiment.figure_agent.nano_banana_enabled, strict_mode=config.experiment.figure_agent.strict_mode, dpi=config.experiment.figure_agent.dpi, ) _fa = FigureOrchestrator(llm, _fa_cfg, stage_dir=stage_dir) # Build conditions list from condition_summaries _fa_conditions = list(_condition_summaries.keys()) if _condition_summaries else [] # BUG-09 fix: pass best_run metrics as fallback data if # structured_results is empty, so Planner has some data to chart _fa_exp_results = exp_data.get("structured_results", {}) if not _fa_exp_results and _best_metrics: _fa_exp_results = {"best_run_metrics": _best_metrics} # Read paper draft for Decision Agent analysis _paper_draft = ( _read_prior_artifact(run_dir, "paper_draft.md") or _read_prior_artifact(run_dir, 
"outline.md") or "" ) _fa_plan = _fa.orchestrate({ "experiment_results": _fa_exp_results, "condition_summaries": _condition_summaries, "metrics_summary": exp_data.get("metrics_summary", {}), "metric_key": config.experiment.metric_key, "conditions": _fa_conditions, "topic": _read_prior_artifact(run_dir, "topic.md") or config.research.topic, "hypothesis": _read_prior_artifact(run_dir, "hypotheses.md") or "", "paper_draft": _paper_draft, "output_dir": str(stage_dir / "charts"), }) if _fa_plan.figure_count > 0: # Save figure plan for Stage 17 to read (stage_dir / "figure_plan.json").write_text( json.dumps(_fa_plan.to_dict(), indent=2, default=str), encoding="utf-8", ) _figure_plan_saved = True for _cf_name in _fa_plan.get_chart_files(): artifacts.append(f"charts/{_cf_name}") logger.info( "Stage 14: FigureAgent generated %d charts (%d passed review, %.1fs)", _fa_plan.figure_count, _fa_plan.passed_count, _fa_plan.elapsed_sec, ) else: logger.warning("Stage 14: FigureAgent produced no charts, falling back") except Exception as _fa_exc: logger.warning("Stage 14: FigureAgent failed (%s), falling back to visualize.py", _fa_exc) # Fallback: legacy visualize.py chart generation if not _figure_plan_saved: try: from researchclaw.experiment.visualize import ( generate_all_charts as _gen_charts_early, ) _charts_dir = stage_dir / "charts" _early_charts = _gen_charts_early( run_dir, _charts_dir, metric_key=config.experiment.metric_key, ) if _early_charts: for _cp in _early_charts: artifacts.append(f"charts/{_cp.name}") logger.info( "Stage 14: Generated %d early charts (legacy) for paper embedding", len(_early_charts), ) except Exception as _chart_exc: logger.warning("Stage 14: Early chart generation failed: %s", _chart_exc) return StageResult( stage=Stage.RESULT_ANALYSIS, status=StageStatus.DONE, artifacts=tuple(artifacts), evidence_refs=tuple(f"stage-14/{a}" for a in artifacts), ) def _parse_decision(text: str) -> str: """Extract PROCEED/PIVOT/REFINE from decision text. 
    Looks for the first standalone keyword on its own line after a
    ``## Decision`` heading. Falls back to a keyword scan of the first few
    lines after the heading, but only matches the keyword itself (not
    mentions inside explanatory prose like "PIVOT is not warranted").

    Returns lowercase ``"proceed"`` / ``"pivot"`` / ``"refine"``. Defaults
    to ``"proceed"`` if nothing matches.
    """
    # Local import of the stdlib regex module (used by the fallback scan below).
    import re as _re

    text_upper = text.upper()
    # Narrow the search to a 200-char window starting at the first
    # "## Decision" heading, if one exists.
    # Look in the first occurrence after "## Decision" heading
    decision_section = ""
    for keyword in ("## DECISION", "## Decision", "## decision"):
        # NOTE(review): keyword.upper() collapses all three variants to the
        # same string, so this loop effectively tests one value — kept as-is.
        if keyword.upper() in text_upper:
            idx = text_upper.index(keyword.upper())
            decision_section = text[idx : idx + 200]
            break
    # No heading found: fall back to scanning the first 500 chars of the text.
    search_text = decision_section or text[:500]
    # First try: look for a line that is just the keyword (possibly with
    # whitespace / markdown bold / trailing punctuation).
    for line in search_text.splitlines():
        # Strip surrounding whitespace, markdown bold markers, and heading
        # hashes so "**PROCEED**" or "# PIVOT" still match exactly.
        stripped = line.strip().strip("*").strip("#").strip()
        if stripped.upper() in ("PROCEED", "PIVOT", "REFINE"):
            return stripped.lower()
    # Fallback: regex for standalone word boundaries so that
    # "PIVOT is not warranted" does NOT match as a decision.
    for kw in ("PIVOT", "REFINE", "PROCEED"):
        # Only match if the keyword appears as the FIRST keyword-class token
        # on its own (not embedded in a sentence saying "not PIVOT").
        # Keyword must follow a line start (or a "## Decision" heading) to
        # count as an actual decision token rather than prose.
        pattern = _re.compile(
            r"(?:^|##\s*Decision\s*\n\s*)" + kw, _re.IGNORECASE | _re.MULTILINE
        )
        if pattern.search(search_text):
            return kw.lower()
    # Last resort: position-based — prefer whichever keyword appears LAST
    # (the final conclusion after deliberation is more reliable than early mentions)
    # BUG-DA8-08: Old code always returned "refine" when both keywords present
    search_upper = search_text.upper()
    last_refine = search_upper.rfind("REFINE")
    last_pivot = search_upper.rfind("PIVOT")
    if last_refine >= 0 and (last_pivot < 0 or last_refine > last_pivot):
        return "refine"
    if last_pivot >= 0 and (last_refine < 0 or last_pivot > last_refine):
        return "pivot"
    return "proceed"


def _execute_research_decision(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 15: decide PROCEED / PIVOT / REFINE from the Stage 14 analysis.

    Reads ``analysis.md`` (plus optional refinement-log, diagnosis, and
    ablation-summary artifacts) from ``run_dir``, asks the LLM for a decision
    (or emits a PROCEED template when no LLM is available), writes
    ``decision.md`` and ``decision_structured.json`` into ``stage_dir``, and
    returns a :class:`StageResult` carrying the parsed decision string.
    """
    analysis = _read_prior_artifact(run_dir, "analysis.md") or ""
    # P6: Detect degenerate REFINE cycles — inject warning if metrics stagnate
    _degenerate_hint = ""
    _refine_log = _read_prior_artifact(run_dir, "refinement_log.json")
    if _refine_log:
        try:
            _rl = json.loads(_refine_log)
            _iters = _rl.get("iterations", [])
            _metrics = [it.get("metric") for it in _iters if isinstance(it, dict)]
            _valid = [m for m in _metrics if m is not None]
            # "Saturated" = every iteration pinned near 0.0 or 1.0; "identical"
            # = no variation at all across 2+ iterations.
            _all_saturated = _valid and all(m <= 0.001 or m >= 0.999 for m in _valid)
            _all_identical = len(set(_valid)) <= 1 and len(_valid) >= 2
            if _all_saturated or _all_identical:
                _degenerate_hint = (
                    "\n\nSYSTEM WARNING — DEGENERATE REFINE CYCLE DETECTED:\n"
                    f"Metrics across {len(_valid)} iterations: {_valid}\n"
                    "All iterations produce identical/saturated results. Further REFINE "
                    "cycles CANNOT fix this — the underlying benchmark design is too "
                    "easy/hard. You SHOULD choose PROCEED with a quality caveat rather "
                    "than REFINE again.\n"
                )
                logger.warning("P6: Degenerate refine cycle detected, injecting PROCEED hint")
        except (json.JSONDecodeError, OSError):
            # Best-effort hint: a malformed/unreadable log simply skips it.
            pass
    # Phase 2: Inject experiment diagnosis into decision prompt
    _diagnosis_hint = ""
    _diag_path = run_dir / "experiment_diagnosis.json"
    if _diag_path.exists():
        try:
            _diag_data = json.loads(_diag_path.read_text(encoding="utf-8"))
            _qa = _diag_data.get("quality_assessment", {})
            _mode = _qa.get("mode", "unknown")
            _sufficient = _qa.get("sufficient", False)
            _deficiency_types = _qa.get("deficiency_types", [])
            if not _sufficient:
                _diagnosis_hint = (
                    "\n\n## EXPERIMENT DIAGNOSIS (from automated analysis)\n"
                    f"Quality mode: {_mode}\n"
                    f"Sufficient for full paper: NO\n"
                    f"Issues found: {', '.join(_deficiency_types)}\n\n"
                    "IMPORTANT: The experiment has significant issues. "
                    "If REFINE is chosen, a structured repair prompt is available "
                    "at repair_prompt.txt with specific fixes for identified issues.\n"
                    "If the same issues persist after 2+ REFINE cycles, choose PROCEED "
                    "with appropriate quality caveats.\n"
                )
                logger.info(
                    "Stage 15: Injected experiment diagnosis — mode=%s, issues=%s",
                    _mode,
                    _deficiency_types,
                )
        except (json.JSONDecodeError, OSError):
            pass
    # Improvement C: Check ablation quality — if >50% trivial, push REFINE
    _ablation_refine_hint = ""
    # BUG-DA8-16: Prefer experiment_summary_best.json (promoted best) over
    # alphabetically-last stage-14* (which could be a stale versioned dir)
    _exp_sum_path = run_dir / "experiment_summary_best.json"
    if not _exp_sum_path.is_file():
        _exp_sum_path = None
        for _s14 in sorted(run_dir.glob("stage-14*/experiment_summary.json"), reverse=True):
            _exp_sum_path = _s14
            break
    if _exp_sum_path and _exp_sum_path.is_file():
        try:
            from researchclaw.pipeline.stage_impls._paper_writing import _check_ablation_effectiveness
            _abl_exp = json.loads(_exp_sum_path.read_text(encoding="utf-8"))
            _abl_warnings = _check_ablation_effectiveness(_abl_exp, threshold=0.02)
            if _abl_warnings:
                # Count warnings flagged as trivially-similar ablations; if the
                # majority are trivial, push a strong REFINE recommendation.
                _trivial_count = sum(1 for w in _abl_warnings if "ineffective" in w.lower() or "trivial" in w.lower())
                _total_abl = max(1, len(_abl_warnings))
                if _trivial_count / _total_abl > 0.5:
                    _ablation_refine_hint = (
                        "\n\n## ABLATION QUALITY ASSESSMENT (CRITICAL)\n"
                        f"STRONG RECOMMENDATION: Choose REFINE.\n"
                        f"{_trivial_count}/{_total_abl} ablations show <2% difference from baseline "
                        f"(trivially similar). This means the ablation design is broken.\n"
                        "Warnings:\n" + "\n".join(f"- {w}" for w in _abl_warnings) + "\n"
                    )
                    logger.warning("C: %d/%d ablations trivial → recommending REFINE", _trivial_count, _total_abl)
        except Exception:  # noqa: BLE001
            # Ablation check is advisory only — never block the decision stage.
            pass
    if llm is not None:
        _pm = prompts or PromptManager()
        _overlay = _get_evolution_overlay(run_dir, "research_decision")
        sp = _pm.for_stage("research_decision", evolution_overlay=_overlay, analysis=analysis)
        # All advisory hints are appended to the user prompt, not the system prompt.
        _user = sp.user + _degenerate_hint + _diagnosis_hint + _ablation_refine_hint
        resp = _chat_with_prompt(llm, sp.system, _user)
        decision_md = resp.content
    else:
        # No LLM available: emit a fixed PROCEED template.
        decision_md = f"""# Research Decision

## Decision
PROCEED

## Justification
Current evidence suggests measurable progress with actionable limitations.

## Next Actions
- Build detailed paper outline
- Expand ablation and uncertainty analysis in writing

Generated: {_utcnow_iso()}
"""
    (stage_dir / "decision.md").write_text(decision_md, encoding="utf-8")
    # --- Extract structured decision ---
    decision = _parse_decision(decision_md)
    # T3.1: Validate decision quality — check for minimum experiment rigor
    _quality_warnings: list[str] = []
    _dec_lower = decision_md.lower()
    if "baseline" not in _dec_lower and "control" not in _dec_lower:
        _quality_warnings.append("Decision text does not mention baselines")
    if "seed" not in _dec_lower and "replicat" not in _dec_lower and "run" not in _dec_lower:
        _quality_warnings.append("Decision text does not mention multi-seed/replicate runs")
    if "metric" not in _dec_lower and "accuracy" not in _dec_lower and "loss" not in _dec_lower:
        _quality_warnings.append("Decision text does not mention evaluation metrics")
    if _quality_warnings:
        logger.warning("T3.1: Decision quality warnings: %s", _quality_warnings)
    decision_payload = {
        "decision": decision,
        "raw_text_excerpt": decision_md[:500],
        "quality_warnings": _quality_warnings,
        "generated": _utcnow_iso(),
    }
    (stage_dir / "decision_structured.json").write_text(
        json.dumps(decision_payload, indent=2), encoding="utf-8"
    )
    logger.info("Research decision: %s", decision)
    return StageResult(
        stage=Stage.RESEARCH_DECISION,
        status=StageStatus.DONE,
        artifacts=("decision.md", "decision_structured.json"),
        evidence_refs=("stage-15/decision.md",),
        decision=decision,
    )


================================================
FILE: researchclaw/pipeline/stage_impls/_code_generation.py
================================================
"""Stage 10: Code generation."""

from __future__ import annotations

import json
import logging
import re
from pathlib import Path
from typing import Any

from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.experiment.validator import (
    CodeValidation,
    format_issues_for_llm,
    validate_code,
)
from researchclaw.llm.client import LLMClient from researchclaw.pipeline._domain import _detect_domain from researchclaw.pipeline._helpers import ( StageResult, _chat_with_prompt, _ensure_sandbox_deps, _extract_code_block, _extract_multi_file_blocks, _extract_yaml_block, _get_evolution_overlay, _load_hardware_profile, _read_prior_artifact, _safe_json_loads, _utcnow_iso, ) from researchclaw.pipeline.stages import Stage, StageStatus from researchclaw.prompts import PromptManager logger = logging.getLogger(__name__) # Improvement G: Continuous-action environments that are incompatible with DQN _CONTINUOUS_ENVS = { "pendulum", "halfcheetah", "hopper", "walker2d", "ant", "humanoid", "swimmer", "reacher", "invertedpendulum", "inverteddoublependulum", "mountaincarcontinuous", "lunarlander-continuous", } def _check_rl_compatibility(code: str) -> list[str]: """Detect DQN + continuous-action environment mismatches. Returns a list of error strings if incompatible combinations are found. """ errors: list[str] = [] code_lower = code.lower() has_dqn = "dqn" in code_lower if not has_dqn: return errors for env_name in _CONTINUOUS_ENVS: if env_name in code_lower: errors.append( f"RL COMPATIBILITY ERROR: DQN is used with continuous-action " f"environment '{env_name}'. DQN only works with DISCRETE action " f"spaces. Use SAC, TD3, or PPO instead." 
) return errors def _execute_code_generation( stage_dir: Path, run_dir: Path, config: RCConfig, adapters: AdapterBundle, *, llm: LLMClient | None = None, prompts: PromptManager | None = None, ) -> StageResult: exp_plan = _read_prior_artifact(run_dir, "exp_plan.yaml") or "" metric = config.experiment.metric_key max_repair = 5 # BUG-14: Increased from 3 to give more chances for critical bugs files: dict[str, str] = {} validation_log: list[str] = [] # --- Detect available packages for sandbox --- _pm = prompts or PromptManager() # --- Hardware-aware package hint --- hw_profile = _load_hardware_profile(run_dir) if config.experiment.mode in ("sandbox", "docker"): if config.experiment.mode == "docker": pkg_prefix = "docker mode" _net_policy = config.experiment.docker.network_policy _base_pkgs = ( ", torchvision, torchaudio, matplotlib, seaborn, scipy, " "tqdm, torchdiffeq, gymnasium, networkx, PyYAML, Pillow, " "transformers, datasets, accelerate, peft, bitsandbytes, " "timm, einops, torchmetrics, h5py" ) if _net_policy == "none": pkg_extras = _base_pkgs + " (ONLY pre-installed packages — NO pip install available)" elif _net_policy in ("setup_only", "pip_only"): pkg_extras = _base_pkgs + ", and additional pip-installable packages via requirements.txt" else: pkg_extras = _base_pkgs + ", and additional pip-installable packages (auto-detected from imports)" else: pkg_prefix = "sandbox mode" pkg_extras = "" if hw_profile and hw_profile.get("has_gpu"): gpu_type = hw_profile.get("gpu_type", "cuda") gpu_name = hw_profile.get("gpu_name", "GPU") tier = hw_profile.get("tier", "limited") if tier == "high": device_hint = f"torch.device('{gpu_type}')" pkg_hint = ( f"\nAVAILABLE PACKAGES ({pkg_prefix}): Python stdlib, numpy, torch, sklearn, scipy, pandas{pkg_extras}.\n" f"GPU: {gpu_name} ({gpu_type}). 
You MAY use PyTorch with GPU acceleration.\n" f"Use `device = {device_hint}` for tensor operations.\n" ) else: # limited (low VRAM NVIDIA or MPS) device_hint = f"torch.device('{gpu_type}')" pkg_hint = ( f"\nAVAILABLE PACKAGES ({pkg_prefix}): Python stdlib, numpy, torch, sklearn, scipy, pandas{pkg_extras}.\n" f"GPU: {gpu_name} ({gpu_type}) — LIMITED performance.\n" f"Use `device = {device_hint}` but design LIGHTWEIGHT experiments:\n" f"- Small models (<1M parameters)\n" f"- Few epochs (<=20)\n" f"- Small datasets (<=10K samples)\n" f"- Avoid large batch sizes\n" ) else: pkg_hint = _pm.block("pkg_hint_sandbox") else: pkg_hint = "" # --- Compute budget hint --- time_budget_sec = config.experiment.time_budget_sec try: compute_budget = _pm.block("compute_budget").replace( "{time_budget_sec}", str(time_budget_sec) ) except Exception: # noqa: BLE001 compute_budget = ( f"\n## Compute Budget Constraint\n" f"- Total execution time limit: {time_budget_sec} seconds\n" f"- Design experiments that complete within this budget\n" f"- Implement a time guard: stop gracefully at 80% of budget\n" ) # --- Dataset guidance + setup script + HP reporting (docker/sandbox modes) --- extra_guidance = "" _net_policy = getattr(getattr(config, "docker", None), "network_policy", "setup_only") if config.experiment.mode in ("sandbox", "docker"): _net_policy = ( config.experiment.docker.network_policy if config.experiment.mode == "docker" else "none" # sandbox mode has no network ) if _net_policy == "none": # Network disabled: inject strict offline-only guidance try: extra_guidance += _pm.block("network_disabled_guidance") except Exception: # noqa: BLE001 pass elif _net_policy == "full": try: extra_guidance += _pm.block("dataset_guidance") extra_guidance += _pm.block("network_full_guidance") except Exception: # noqa: BLE001 pass else: # setup_only or pip_only — existing behavior try: extra_guidance += _pm.block("dataset_guidance") except Exception: # noqa: BLE001 pass if config.experiment.mode == 
"docker": try: extra_guidance += _pm.block("setup_script_guidance") except Exception: # noqa: BLE001 pass try: extra_guidance += _pm.block("hp_reporting") except Exception: # noqa: BLE001 pass # I-06: Multi-seed enforcement for all experiments try: extra_guidance += _pm.block("multi_seed_enforcement") except Exception: # noqa: BLE001 pass # --- BA: Inject BenchmarkAgent plan from Stage 9 --- _bp_path = None for _s9_dir in sorted(run_dir.glob("stage-09*"), reverse=True): _candidate = _s9_dir / "benchmark_plan.json" if _candidate.exists(): _bp_path = _candidate break if _bp_path is not None: try: import json as _json_bp _bp_data = _json_bp.loads(_bp_path.read_text(encoding="utf-8")) # Reconstruct the prompt block from researchclaw.agents.benchmark_agent.orchestrator import BenchmarkPlan _bp = BenchmarkPlan( selected_benchmarks=_bp_data.get("selected_benchmarks", []), selected_baselines=_bp_data.get("selected_baselines", []), data_loader_code=_bp_data.get("data_loader_code", ""), baseline_code=_bp_data.get("baseline_code", ""), experiment_notes=_bp_data.get("experiment_notes", ""), ) _bp_block = _bp.to_prompt_block() if _bp_block: extra_guidance += ( "\n\n## BenchmarkAgent Selections (USE THESE)\n" "The following datasets, baselines, and code snippets were " "automatically selected and validated by the BenchmarkAgent. 
" "You MUST use these selections in your experiment code.\n\n" + _bp_block ) logger.info( "BA: Injected benchmark plan (%d benchmarks, %d baselines)", len(_bp.selected_benchmarks), len(_bp.selected_baselines), ) except Exception as _bp_exc: logger.debug("BA: Failed to load benchmark plan: %s", _bp_exc) # --- P2.2+P2.3: LLM training topic detection and guidance --- _llm_keywords = ( "language model", "llm", "fine-tun", "lora", "qlora", "peft", "instruction tun", "rlhf", "dpo", "sft", "alignment", "transformer train", "causal lm", "chat model", "qwen", "llama", "mistral", "phi-", "gemma", "pretraining", "tokeniz", ) topic_lower = config.research.topic.lower() is_llm_topic = any(kw in topic_lower for kw in _llm_keywords) # --- I-08: RL topic detection and step guidance --- _rl_keywords = ( "reinforcement learning", "policy gradient", "ppo", "sac", "td3", "ddpg", "dqn", "a2c", "a3c", "mujoco", "locomotion", "continuous control", "reward shaping", "exploration", "multi-agent rl", "marl", "curriculum rl", "imitation learning", "inverse rl", "offline rl", "model-based rl", "actor-critic", "reinforce", "gym", "gymnasium", ) is_rl_topic = any(kw in topic_lower for kw in _rl_keywords) if is_rl_topic: try: extra_guidance += _pm.block("rl_step_guidance") except Exception: # noqa: BLE001 pass # --- F-01: Framework API doc injection (auto-detected) --- try: from researchclaw.data import detect_frameworks, load_framework_docs _hypothesis_text = _read_prior_artifact(run_dir, "hypotheses.md") or "" _fw_ids = detect_frameworks( config.research.topic, _hypothesis_text, exp_plan or "" ) if _fw_ids: _fw_docs = load_framework_docs(_fw_ids, max_chars=8000) if _fw_docs: extra_guidance += _fw_docs logger.info("F-01: Injected framework docs for: %s", _fw_ids) except Exception: # noqa: BLE001 logger.debug("F-01: Framework doc injection skipped", exc_info=True) if is_llm_topic and config.experiment.mode == "docker": try: extra_guidance += _pm.block("llm_training_guidance") except Exception: # 
noqa: BLE001 pass try: extra_guidance += _pm.block("llm_eval_guidance") except Exception: # noqa: BLE001 pass # P2.3: Warn if time budget is too short for LLM training if time_budget_sec < 3600: extra_guidance += ( "\n## COMPUTE BUDGET WARNING\n" f"Current time_budget_sec={time_budget_sec} is likely TOO SHORT " f"for LLM fine-tuning. Typical LoRA training needs 1-4 hours. " f"Design a LIGHTWEIGHT experiment:\n" f"- Use a small dataset (<=5000 samples)\n" f"- Train for 1-3 epochs only\n" f"- Use small batch size (1-2) with gradient accumulation\n" f"- Use 4-bit quantization (QLoRA) to minimize memory\n" f"- Limit max_seq_length to 512-1024\n" f"- If possible, use a smaller model (<=7B parameters)\n" ) # --- Domain-specific guidance injection for non-ML domains --- try: from researchclaw.domains.detector import detect_domain as _dd_s10, is_ml_domain as _is_ml_s10 _dp = _dd_s10(topic=config.research.topic) if not _is_ml_s10(_dp): from researchclaw.domains.prompt_adapter import get_adapter as _ga _adapter = _ga(_dp) _blocks = _adapter.get_code_generation_blocks({}) if _blocks.compute_budget: compute_budget = _blocks.compute_budget if _blocks.dataset_guidance: extra_guidance = _blocks.dataset_guidance + "\n" + extra_guidance if _blocks.code_generation_hints: extra_guidance += "\n" + _blocks.code_generation_hints if _blocks.output_format_guidance: extra_guidance += "\n" + _blocks.output_format_guidance logger.info("Injected domain-specific guidance for %s", _dp.domain_id) except Exception: # noqa: BLE001 logger.debug("Domain guidance injection skipped", exc_info=True) # BUG-R6-01: Add explicit implementation constraints to prevent LLM # from substituting unrelated DL models for lightweight algorithms. extra_guidance += ( "\n\nIMPLEMENTATION CONSTRAINTS (MUST FOLLOW):\n" "- Implement EXACTLY the algorithm/method described in the topic.\n" "- Do NOT replace the stated method with a deep-learning proxy " "(e.g. 
ResNet, BERT, GPT, Gymnasium+SB3) unless the topic " "EXPLICITLY requires deep learning.\n" "- Prefer lightweight CPU-friendly libraries (numpy, scipy, " "sklearn, pandas) unless deep learning is inherent to the topic.\n" "- The experiment MUST be self-contained and runnable without GPU.\n" ) # --- Code generation: Beast Mode → CodeAgent → Legacy single-shot --- _code_agent_active = False _beast_mode_used = False _code_max_tokens = 8192 # ── Beast Mode: OpenCode external agent (optional) ───────────────── _oc_cfg = config.experiment.opencode if _oc_cfg.enabled: from researchclaw.pipeline.opencode_bridge import ( OpenCodeBridge, OpenCodeResult, count_historical_failures, score_complexity, ) _hist_failures = count_historical_failures(run_dir) _cplx = score_complexity( exp_plan=exp_plan, topic=config.research.topic, historical_failures=_hist_failures, threshold=_oc_cfg.complexity_threshold, ) # Persist complexity analysis (stage_dir / "complexity_analysis.json").write_text( json.dumps( { "score": _cplx.score, "signals": _cplx.signals, "recommendation": _cplx.recommendation, "reason": _cplx.reason, "threshold": _oc_cfg.complexity_threshold, "historical_failures": _hist_failures, }, indent=2, ), encoding="utf-8", ) if _cplx.recommendation == "beast_mode": _proceed = _oc_cfg.auto if not _proceed: # Non-auto mode: check for HITL adapter if adapters.hitl is not None: try: _proceed = adapters.hitl.confirm( f"Beast Mode: complexity={_cplx.score:.2f} " f"(threshold={_oc_cfg.complexity_threshold}). " f"Route to OpenCode?" 
) except Exception: # noqa: BLE001 logger.info( "Beast mode: HITL adapter unavailable, skipping " "(set opencode.auto=true for non-interactive runs)" ) else: logger.info( "Beast mode: no HITL adapter, skipping " "(set opencode.auto=true for non-interactive runs)" ) if _proceed: _oc_model = _oc_cfg.model or config.llm.primary_model _bridge = OpenCodeBridge( model=_oc_model, llm_base_url=config.llm.base_url, api_key_env=config.llm.api_key_env, llm_provider=config.llm.provider, timeout_sec=_oc_cfg.timeout_sec, max_retries=_oc_cfg.max_retries, workspace_cleanup=_oc_cfg.workspace_cleanup, ) logger.info( "Beast mode: ENGAGED (complexity=%.2f, model=%s)", _cplx.score, _oc_model, ) _oc_result: OpenCodeResult = _bridge.generate( stage_dir=stage_dir, topic=config.research.topic, exp_plan=exp_plan, metric=metric, pkg_hint=pkg_hint + "\n" + compute_budget, extra_guidance=extra_guidance, time_budget_sec=config.experiment.time_budget_sec, ) # Persist beast mode log (stage_dir / "beast_mode_log.json").write_text( json.dumps( { "success": _oc_result.success, "elapsed_sec": _oc_result.elapsed_sec, "files": list(_oc_result.files.keys()), "error": _oc_result.error, "complexity_score": _cplx.score, "model": _oc_model, }, indent=2, ), encoding="utf-8", ) if _oc_result.success and _oc_result.files: files = _oc_result.files _beast_mode_used = True _code_agent_active = True # skip legacy path logger.info( "Beast mode: SUCCESS — %d files in %.1fs", len(files), _oc_result.elapsed_sec, ) else: logger.warning( "Beast mode: FAILED (%s) — falling back to CodeAgent", _oc_result.error or "unknown error", ) else: logger.info( "Beast mode: complexity=%.2f (threshold=%.2f), not triggered", _cplx.score, _oc_cfg.complexity_threshold, ) if not _beast_mode_used and config.experiment.code_agent.enabled and llm is not None: # ── F-02: Advanced Code Agent path ──────────────────────────────── from researchclaw.pipeline.code_agent import CodeAgent as _CodeAgent _ca_cfg = config.experiment.code_agent # 
Ensure we have a proper config object if not hasattr(_ca_cfg, "enabled"): from researchclaw.pipeline.code_agent import ( CodeAgentConfig as _CAConfig, ) _ca_cfg = _CAConfig() # Sandbox factory (only for sandbox/docker modes) _sandbox_factory = None if config.experiment.mode in ("sandbox", "docker"): from researchclaw.experiment.factory import ( create_sandbox as _csb, ) _sandbox_factory = _csb if any( config.llm.primary_model.startswith(p) for p in ("gpt-5", "o3", "o4") ): _code_max_tokens = 16384 # ── Domain detection + Code Search for non-ML domains ────────── _domain_profile = None _code_search_result = None try: from researchclaw.domains.detector import detect_domain as _dd from researchclaw.domains.detector import is_ml_domain as _is_ml _domain_profile = _dd(topic=config.research.topic) logger.info( "CodeAgent: domain=%s (%s)", _domain_profile.display_name, _domain_profile.domain_id, ) # Run code search for non-ML domains (ML has enough built-in knowledge) if not _is_ml(_domain_profile): try: from researchclaw.agents.code_searcher import CodeSearchAgent _cs_agent = CodeSearchAgent(llm=llm) _code_search_result = _cs_agent.search( topic=config.research.topic, domain=_domain_profile, ) if _code_search_result and _code_search_result.patterns.has_content: logger.info( "Code search: %d patterns, %d repos found", len(_code_search_result.patterns.api_patterns), len(_code_search_result.repos_found), ) except Exception: # noqa: BLE001 logger.debug("Code search unavailable", exc_info=True) except Exception: # noqa: BLE001 logger.debug("Domain detection unavailable", exc_info=True) _agent = _CodeAgent( llm=llm, prompts=_pm, config=_ca_cfg, stage_dir=stage_dir, sandbox_factory=_sandbox_factory, experiment_config=config.experiment, domain_profile=_domain_profile, code_search_result=_code_search_result, ) _agent_result = _agent.generate( topic=config.research.topic, exp_plan=exp_plan, metric=metric, pkg_hint=pkg_hint + "\n" + compute_budget + "\n" + extra_guidance, 
max_tokens=_code_max_tokens, ) files = _agent_result.files _code_agent_active = True # Write agent artifacts (stage_dir / "code_agent_log.json").write_text( json.dumps( { "log": _agent_result.validation_log, "llm_calls": _agent_result.total_llm_calls, "sandbox_runs": _agent_result.total_sandbox_runs, "best_score": _agent_result.best_score, "tree_nodes_explored": _agent_result.tree_nodes_explored, "review_rounds": _agent_result.review_rounds, }, indent=2, ), encoding="utf-8", ) if _agent_result.architecture_spec: (stage_dir / "architecture_spec.yaml").write_text( _agent_result.architecture_spec, encoding="utf-8", ) logger.info( "CodeAgent: %d LLM calls, %d sandbox runs, score=%.2f", _agent_result.total_llm_calls, _agent_result.total_sandbox_runs, _agent_result.best_score, ) elif not _beast_mode_used and llm is not None: # ── Legacy single-shot generation ───────────────────────────────── topic = config.research.topic _md = config.experiment.metric_direction _md_hint = ( f"`{_md}` — use direction={'lower' if _md == 'minimize' else 'higher'} " f"in METRIC_DEF. You MUST NOT use the opposite direction." ) _overlay = _get_evolution_overlay(run_dir, "code_generation") sp = _pm.for_stage( "code_generation", evolution_overlay=_overlay, topic=topic, metric=metric, pkg_hint=pkg_hint + "\n" + compute_budget + "\n" + extra_guidance, exp_plan=exp_plan, metric_direction_hint=_md_hint, ) # R13-3: Use higher max_tokens for reasoning models (they consume tokens # for internal chain-of-thought). Retry once with even higher limit on empty. 
_code_max_tokens = sp.max_tokens or 8192 if any(config.llm.primary_model.startswith(p) for p in ("gpt-5", "o3", "o4")): _code_max_tokens = max(_code_max_tokens, 16384) resp = _chat_with_prompt( llm, sp.system, sp.user, json_mode=sp.json_mode, max_tokens=_code_max_tokens, ) files = _extract_multi_file_blocks(resp.content) if not files and not resp.content.strip(): # Empty response — retry with higher token limit logger.warning( "R13-3: Empty LLM response for code_generation (len=%d, " "finish_reason=%s, tokens=%d). Retrying with 32768 tokens.", len(resp.content), resp.finish_reason, resp.total_tokens, ) resp = _chat_with_prompt( llm, sp.system, sp.user, json_mode=sp.json_mode, max_tokens=32768, ) files = _extract_multi_file_blocks(resp.content) if not files: logger.warning( "R13-2: _extract_multi_file_blocks returned empty. " "LLM response length=%d, first 300 chars: %s", len(resp.content), resp.content[:300], ) # --- Fallback: generic numerical experiment --- if not files: files = { "main.py": ( "import numpy as np\n" "\n" "np.random.seed(42)\n" "\n" "# Fallback experiment: parameter sweep on a synthetic objective\n" "# This runs when LLM code generation fails to produce valid code.\n" "dim = 10\n" "n_conditions = 3\n" "results = {}\n" "\n" "for cond_idx in range(n_conditions):\n" " cond_name = f'condition_{cond_idx}'\n" " scores = []\n" " for seed in range(3):\n" " rng = np.random.RandomState(seed + cond_idx * 100)\n" " x = rng.randn(dim)\n" " score = float(1.0 / (1.0 + np.sum(x ** 2)))\n" " scores.append(score)\n" " mean_score = float(np.mean(scores))\n" " results[cond_name] = mean_score\n" f" print(f'condition={{cond_name}} {metric}: {{mean_score:.6f}}')\n" "\n" "best = max(results, key=results.get)\n" f"print(f'{metric}: {{results[best]:.6f}}')\n" ) } # --- Validate each file + auto-repair loop --- all_valid = True attempt = 0 for fname, code in list(files.items()): # Skip non-Python files (requirements.txt, setup.py, etc.) 
if not fname.endswith(".py"): continue validation = validate_code(code) repair_attempt = 0 while not validation.ok and llm is not None and repair_attempt < max_repair: repair_attempt += 1 attempt += 1 # Only send errors to the LLM — warnings don't block validation # and confuse the LLM into over-correcting (e.g. removing runtime imports) errors_only = type(validation)( issues=[i for i in validation.issues if i.severity == "error"] ) issues_text = format_issues_for_llm(errors_only) validation_log.append( f"File {fname} attempt {repair_attempt}: {validation.summary()}" ) logger.info( "Code validation failed for %s (attempt %d/%d): %s", fname, repair_attempt, max_repair, validation.summary(), ) all_files_ctx = "\n\n".join( f"```filename:{f}\n{c}\n```" for f, c in files.items() ) rp = _pm.sub_prompt( "code_repair", fname=fname, issues_text=issues_text, all_files_ctx=all_files_ctx, ) resp = _chat_with_prompt(llm, rp.system, rp.user) _repaired = _extract_code_block(resp.content) if _repaired.strip(): files[fname] = _repaired else: logger.warning("Repair attempt returned empty code, keeping original") validation = validate_code(files[fname]) if not validation.ok: all_valid = False # BUG-14: Log remaining issues prominently logger.warning( "Code validation FAILED for %s after %d repair attempts: %s", fname, max_repair, validation.summary(), ) # Improvement G: RL algorithm-environment compatibility check for fname, code in list(files.items()): if not fname.endswith(".py"): continue _rl_errors = _check_rl_compatibility(code) if _rl_errors: for _rl_err in _rl_errors: logger.error("Stage 10: %s (in %s)", _rl_err, fname) validation_log.append(f"RL_COMPAT: {fname}: {_rl_err}") all_valid = False # BUG-14: Block on critical validation failures (syntax/import errors) if not all_valid: _has_critical = False for fname, code in files.items(): _v = validate_code(code) if not _v.ok: for issue in _v.issues: if issue.severity == "error" and issue.category in ( "syntax", "import", ): 
_has_critical = True if _has_critical: logger.error( "Stage 10: CRITICAL validation issues remain after %d repair " "attempts. Blocking stage.", max_repair, ) (stage_dir / "validation_report.md").write_text( "# Code Validation Report\n\n" f"**Status**: BLOCKED — critical issues remain after {max_repair} repairs\n\n" + "\n".join(f"- {e}" for e in validation_log), encoding="utf-8", ) return StageResult( stage=Stage.CODE_GENERATION, status=StageStatus.FAILED, artifacts=("validation_report.md",), evidence_refs=(), ) # --- BUG-184: Cross-import validation — warn if a .py file imports a # local module that doesn't exist in the files dict. This catches the # case where Beast Mode/CodeAgent produced an intermediate file that # got lost during repair iterations. _known_modules = { f.replace(".py", "") for f in files if f.endswith(".py") } _stdlib_and_common = { "os", "sys", "json", "math", "time", "copy", "re", "random", "pathlib", "argparse", "logging", "collections", "functools", "itertools", "abc", "typing", "dataclasses", "enum", "io", "csv", "pickle", "glob", "shutil", "subprocess", "datetime", "numpy", "np", "torch", "torchvision", "gymnasium", "gym", "sklearn", "scipy", "pandas", "matplotlib", "PIL", "tqdm", "einops", "timm", "transformers", "datasets", "peft", "stable_baselines3", } for fname, code in list(files.items()): if not fname.endswith(".py"): continue for _m in re.findall( r"^(?:from|import)\s+([a-zA-Z_][a-zA-Z0-9_]*)", code, re.MULTILINE, ): if (_m not in _known_modules and _m not in _stdlib_and_common and not _m.startswith("_")): logger.warning( "BUG-184: %s imports '%s' which is not in generated " "files — experiment may crash on import", fname, _m, ) # --- Write experiment directory --- exp_dir = stage_dir / "experiment" exp_dir.mkdir(parents=True, exist_ok=True) for fname, code in files.items(): (exp_dir / fname).write_text(code, encoding="utf-8") # --- Write validation report --- if validation_log or not all_valid: report_lines = ["# Code Validation 
Report\n"] if all_valid: report_lines.append(f"**Status**: PASSED after {attempt} total repair(s)\n") else: report_lines.append( f"**Status**: FAILED after {attempt} total repair attempt(s)\n" ) for entry in validation_log: report_lines.append(f"- {entry}") (stage_dir / "validation_report.md").write_text( "\n".join(report_lines), encoding="utf-8" ) # --- R10-Fix6: Code complexity and quality check --- from researchclaw.experiment.validator import ( auto_fix_unbound_locals, check_code_complexity, deep_validate_files, ) # --- BUG-3 fix: Programmatic auto-fix for UnboundLocalError patterns --- _total_ub_fixes = 0 for fname, code in list(files.items()): if fname.endswith(".py"): fixed_code, n_fixes = auto_fix_unbound_locals(code) if n_fixes > 0: files[fname] = fixed_code (exp_dir / fname).write_text(fixed_code, encoding="utf-8") _total_ub_fixes += n_fixes logger.info( "Stage 10: auto-fixed %d UnboundLocalError risk(s) in %s", n_fixes, fname, ) if _total_ub_fixes: logger.info( "Stage 10: auto-fixed %d total UnboundLocalError risks", _total_ub_fixes ) complexity_warnings: list[str] = [] for fname, code in files.items(): if fname.endswith(".py"): cw = check_code_complexity(code) for w in cw: complexity_warnings.append(f"[{fname}] {w}") logger.warning("Stage 10 code quality: [%s] %s", fname, w) # --- P1.1+P1.2: Deep quality analysis (class quality, scoping, API) --- deep_warnings = deep_validate_files(files) for w in deep_warnings: logger.warning("Stage 10 deep quality: %s", w) complexity_warnings.extend(deep_warnings) # --- P1.2: If critical deep issues found, attempt one repair cycle --- critical_deep = [w for w in deep_warnings if any( kw in w for kw in ("UnboundLocalError", "unregistered", "does not exist", "empty or trivial subclass", "does NOT override", "Import-usage mismatch", "NameError", "was removed", "ptp()", "copy-paste", "identical method signatures", "identical AST", "NOT a real ablation", "shadows stdlib/pip") )] if critical_deep and llm is not None: 
logger.info( "Stage 10: %d critical code issues found — triggering repair cycle", len(critical_deep), ) repair_issues = "\n".join(f"- {w}" for w in critical_deep) all_code_ctx = "\n\n".join( f"```filename:{f}\n{c}\n```" for f, c in files.items() ) repair_prompt = ( f"CRITICAL CODE QUALITY ISSUES FOUND:\n{repair_issues}\n\n" f"Fix ALL these issues in the code below. Return the complete " f"corrected files using ```filename:xxx.py format.\n\n" f"RULES:\n" f"- nn.Linear/nn.Conv must be created in __init__(), not forward()\n" f"- Variables used after if/else must be defined before the branch\n" f"- Use scipy.special.erf, not np.erf\n" f"- Ablation/variant classes must have genuinely different logic\n" f"- Every class must have a real implementation, not just `pass`\n" f"- Ablation classes MUST override the parent method that implements " f"the component being ablated (e.g., if ablating attention, override " f"the attention method with a simpler alternative like mean pooling)\n" f"- IMPORT CONSISTENCY: if you write `from X import Y`, call `Y()` " f"directly — NOT `X.Y()`. 
Mixing styles causes NameError.\n" f"- NumPy 2.0: ndarray.ptp() was removed — use arr.max()-arr.min()\n" f"- NumPy 2.0: np.bool/np.int/np.float removed — use builtins\n" f"- Pretrained models (EfficientNet, ResNet, ViT) expect 224×224 input " f"— add `transforms.Resize(224)` when using CIFAR (32×32) or similar\n" f"- Copy-paste ablation: if two classes have identical bodies, REWRITE " f"the ablation to genuinely remove/reduce a component (e.g., zero out " f"attention weights, halve hidden dimensions, remove a loss term)\n" f"- KD: teacher must be frozen, add projection layers if teacher_dim != " f"student_dim, use temperature T=4 for soft targets\n" f"- FILENAME COLLISIONS: If a file like config.py shadows a pip/stdlib " f"package, rename it (e.g., config.py → experiment_config.py) and update " f"ALL imports referencing it\n\n" f"Current code:\n{all_code_ctx}\n" ) try: repair_resp = _chat_with_prompt( llm, _pm.system("code_generation"), repair_prompt, max_tokens=_code_max_tokens, ) repaired = _extract_multi_file_blocks(repair_resp.content) if repaired and "main.py" in repaired: files = repaired for fname, code in files.items(): (exp_dir / fname).write_text(code, encoding="utf-8") # Re-check after repair deep_warnings_after = deep_validate_files(files) fixed = len(critical_deep) - len([ w for w in deep_warnings_after if any(kw in w for kw in ( "UnboundLocalError", "unregistered", "does not exist", "empty or trivial subclass", "does NOT override", "Import-usage mismatch", "NameError", "was removed", "ptp()", "copy-paste", "identical method signatures", "identical AST", "NOT a real ablation", "shadows stdlib/pip", )) ]) logger.info( "Stage 10: Deep repair fixed %d/%d critical issues", fixed, len(critical_deep), ) complexity_warnings.append( f"[REPAIR] Deep repair fixed {fixed}/{len(critical_deep)} " f"critical issues" ) except Exception as exc: logger.debug("Deep repair failed: %s", exc) if complexity_warnings: health: dict[str, Any] = {} 
health["code_complexity_warnings"] = complexity_warnings (stage_dir / "code_complexity.json").write_text( json.dumps(health, indent=2), encoding="utf-8" ) # --- P1.4: LLM Code Review (Stage 10.5) --- # Skip when CodeAgent is active — Phase 4 review already covers this. if llm is not None and not _code_agent_active: all_code_review = "\n\n".join( f"# --- {fname} ---\n{code}" for fname, code in files.items() ) if len(all_code_review) > 12000: all_code_review = all_code_review[:12000] + "\n... [truncated]" review_prompt = ( f"You are a senior researcher reviewing experiment code for a " f"research submission.\n\n" f"TOPIC: {config.research.topic}\n" f"EXPERIMENT PLAN:\n{exp_plan[:3000]}\n\n" f"CODE:\n```python\n{all_code_review}\n```\n\n" f"Review the code and return JSON with this EXACT structure:\n" f'{{"score": <1-10>, "issues": [' f'{{"severity": "critical|major|minor", ' f'"description": "...", "fix": "..."}}], ' f'"verdict": "pass|needs_fix"}}\n\n' f"Check specifically:\n" f"1. Does each algorithm/method have a DISTINCT implementation? " f"(Not just renamed copies)\n" f"2. Are ablation conditions genuinely different from the main method?\n" f"3. Are loss functions / training loops mathematically correct?\n" f"4. Will the code actually run without errors? Check variable scoping, " f"API usage, tensor shape compatibility.\n" f"5. Is the code complex enough for a research paper? (Not trivial)\n" f"6. Are experimental conditions fairly compared (same seeds, data)?\n" f"7. If using pretrained models (EfficientNet, ResNet, ViT), are input " f"images resized to the model's expected size (e.g., 224x224)? CIFAR " f"images are 32x32 and MUST be resized for pretrained models.\n" f"8. Are imports consistent? `from X import Y` must use `Y()`, not `X.Y()`.\n" ) try: review_resp = llm.chat( [{"role": "user", "content": review_prompt}], system="You are a meticulous ML code reviewer. 
Be strict.", max_tokens=2048, ) # Extract JSON from LLM response (may be wrapped in markdown fences) _review_text = review_resp.content if hasattr(review_resp, "content") else str(review_resp) # Strip markdown JSON fences if present _review_text = _review_text.strip() if _review_text.startswith("```"): _lines = _review_text.splitlines() _start = 1 if _lines[0].strip().startswith("```") else 0 _end = len(_lines) - 1 if _lines[-1].strip() == "```" else len(_lines) _review_text = "\n".join(_lines[_start:_end]) review_data = _safe_json_loads(_review_text, {}) if isinstance(review_data, dict): review_score = review_data.get("score", 0) review_verdict = review_data.get("verdict", "unknown") review_issues = review_data.get("issues", []) # Write review report review_report = { "score": review_score, "verdict": review_verdict, "issues": review_issues, "timestamp": _utcnow_iso(), } (stage_dir / "code_review.json").write_text( json.dumps(review_report, indent=2), encoding="utf-8" ) # If critical issues found and score low, attempt fix critical_issues = [ i for i in review_issues if isinstance(i, dict) and i.get("severity") == "critical" ] if critical_issues and review_score <= 4: logger.warning( "Stage 10 code review: score=%d, %d critical issues — " "attempting fix", review_score, len(critical_issues), ) fix_descriptions = "\n".join( f"- [{i.get('severity', '?')}] {i.get('description', '?')}: " f"{i.get('fix', 'no fix suggested')}" for i in critical_issues ) fix_prompt = ( f"Code review found {len(critical_issues)} CRITICAL issues " f"(score: {review_score}/10):\n{fix_descriptions}\n\n" f"Fix ALL critical issues. 
Return complete corrected files " f"using ```filename:xxx.py format.\n\n" f"Current code:\n" + "\n\n".join( f"```filename:{f}\n{c}\n```" for f, c in files.items() ) ) try: fix_resp = _chat_with_prompt( llm, _pm.system("code_generation"), fix_prompt, max_tokens=_code_max_tokens, ) fixed_files = _extract_multi_file_blocks(fix_resp.content) if fixed_files and "main.py" in fixed_files: files = fixed_files for fname, code in files.items(): (exp_dir / fname).write_text(code, encoding="utf-8") logger.info( "Stage 10: Code fixed after review " "(was %d/10, %d critical issues)", review_score, len(critical_issues), ) except Exception as exc: logger.debug("Review-fix failed: %s", exc) except Exception as exc: logger.debug("Code review failed: %s", exc) # --- FIX-3: Topic-experiment alignment check --- # BUG-171: Previous 8000-char truncation caused false-positive misalignment # for multi-file experiments (30-90K chars). LLM saw "[truncated]" and # concluded code was incomplete. Fix: build a structured summary that # includes file inventory + full main.py + per-file function/class headers. alignment_ok = True alignment_note = "" if llm is not None: # Build structured code summary for alignment check _file_inventory = [] for _fn, _cd in files.items(): _lines = _cd.count("\n") + 1 _file_inventory.append(f" {_fn}: {_lines} lines, {len(_cd)} chars") _inventory_block = "FILES GENERATED:\n" + "\n".join(_file_inventory) # BUG-179: Beast Mode may use a different entry point (e.g. # run_experiment.py). Detect the actual entry point by scanning # for ``if __name__ == "__main__"`` in all files, preferring main.py. 
_entry_file = "main.py" if "main.py" not in files or not files.get("main.py", "").strip(): for _fn, _cd in files.items(): if 'if __name__' in _cd and '__main__' in _cd: _entry_file = _fn break elif files.get("main.py", ""): # main.py exists but may be a stub — if another file has the # real orchestration (more lines + __main__ guard), prefer it _main_lines = files["main.py"].count("\n") for _fn, _cd in files.items(): if _fn == "main.py": continue if ('if __name__' in _cd and '__main__' in _cd and _cd.count("\n") > _main_lines * 1.5): _entry_file = _fn break _main_code = files.get(_entry_file, files.get("main.py", "")) _main_block = f"# --- {_entry_file} (FULL — entry point) ---\n{_main_code}" # Cap main.py at 12000 chars to stay within token budget if len(_main_block) > 12000: _main_block = _main_block[:12000] + "\n... [main.py truncated at 12000 chars]" # For other files, include imports + function/class signatures _other_summaries = [] for _fn, _cd in files.items(): if _fn == _entry_file: continue _sig_lines = [] for _line in _cd.split("\n"): _stripped = _line.strip() if (_stripped.startswith("def ") or _stripped.startswith("class ") or _stripped.startswith("async def ") # BUG-209: Include import lines — they reveal which # techniques/libraries are used (e.g. CosineAnnealingLR) or _stripped.startswith("import ") or _stripped.startswith("from ")): _sig_lines.append(_line) if _sig_lines: _other_summaries.append( f"# --- {_fn} (imports + signatures) ---\n" + "\n".join(_sig_lines) ) else: # Small file — include first 800 chars _preview = _cd[:800] if len(_cd) > 800: _preview += f"\n... [{len(_cd) - 800} more chars]" _other_summaries.append(f"# --- {_fn} (preview) ---\n{_preview}") _other_block = "\n\n".join(_other_summaries) # Cap other summaries if len(_other_block) > 6000: _other_block = _other_block[:6000] + "\n... 
[other files truncated]" all_code_for_check = ( f"{_inventory_block}\n\n{_main_block}\n\n{_other_block}" ) align_prompt = ( f"Research topic: {config.research.topic}\n\n" f"Experiment code:\n```python\n{all_code_for_check}\n```\n\n" "TASK: Evaluate whether this experiment code actually tests the " "stated research topic. Answer with JSON:\n" '{"aligned": true/false, "reason": "...", "suggestions": "..."}\n\n' "IMPORTANT: The code spans MULTIPLE files. The file inventory above " "shows ALL generated files. Only main.py is shown in full; other " "files show function/class signatures. Do NOT mark as misaligned " "just because helper files are summarized — they contain full " "implementations.\n\n" "Check specifically:\n" "- Does main.py orchestrate an experiment matching the topic?\n" "- Do the helper file signatures indicate relevant models/methods?\n" "- If the topic mentions a specific technique, is there evidence of " "its implementation (function names, class names, imports)?\n" "- Are the experimental conditions meaningfully different from each other?\n" ) try: align_resp = llm.chat( [{"role": "user", "content": align_prompt}], system="You are a scientific code reviewer checking topic-experiment alignment.", max_tokens=1024, ) align_data = _safe_json_loads(align_resp.content, {}) if isinstance(align_data, dict) and not align_data.get("aligned", True): alignment_ok = False alignment_note = align_data.get("reason", "Misaligned") suggestions = align_data.get("suggestions", "") logger.warning( "Stage 10: Topic-experiment MISALIGNMENT detected: %s", alignment_note, ) # BUG-R6-01: Allow up to 2 regeneration attempts with re-check. 
_max_regen = 2 for _regen_attempt in range(1, _max_regen + 1): logger.info( "Stage 10: Alignment regen attempt %d/%d", _regen_attempt, _max_regen, ) regen_prompt = ( f"The experiment code you previously generated does NOT align " f"with the research topic.\n\n" f"TOPIC: {config.research.topic}\n" f"MISALIGNMENT: {alignment_note}\n" f"SUGGESTIONS: {suggestions}\n\n" f"REGENERATE the experiment code to DIRECTLY test the stated " f"topic. The code MUST implement the core technique described " f"in the topic, not a generic proxy.\n\n" f"CRITICAL CONSTRAINTS:\n" f"- You MUST implement the EXACT algorithm/method from the topic.\n" f"- Do NOT substitute a deep-learning proxy (ResNet, BERT, etc.) " f"when the topic describes a tabular, bandit, or game-theoretic method.\n" f"- Use ONLY lightweight CPU-friendly libraries (numpy, scipy, " f"sklearn) unless the topic EXPLICITLY requires deep learning.\n" f"- The experiment must be self-contained and runnable without GPU.\n\n" f"{pkg_hint}\n{compute_budget}\n" f"PLAN:\n{exp_plan}\n\n" f"Return multiple files using ```filename:xxx.py format." ) regen_resp = _chat_with_prompt( llm, system=_pm.system("code_generation"), user=regen_prompt, max_tokens=_code_max_tokens, ) regen_files = _extract_multi_file_blocks(regen_resp.content) if not regen_files or "main.py" not in regen_files: logger.warning( "Stage 10: Regen attempt %d produced no main.py", _regen_attempt, ) continue files = regen_files for fname, code in files.items(): (exp_dir / fname).write_text(code, encoding="utf-8") # Re-check alignment on regenerated code (BUG-171 fix) _rc_inv = [] for _fn, _cd in files.items(): _rc_inv.append(f" {_fn}: {_cd.count(chr(10))+1} lines") _rc_main = files.get("main.py", "") if len(_rc_main) > 12000: _rc_main = _rc_main[:12000] + "\n... 
[truncated]" _rc_sigs = [] for _fn, _cd in files.items(): if _fn == "main.py": continue # BUG-209: Include imports alongside signatures _slines = [l for l in _cd.split("\n") if l.strip().startswith(( "def ", "class ", "async def ", "import ", "from ", ))] if _slines: _rc_sigs.append(f"# {_fn} imports+signatures:\n" + "\n".join(_slines)) recheck_code = ( "FILES:\n" + "\n".join(_rc_inv) + "\n\n" f"# main.py (FULL):\n{_rc_main}\n\n" + "\n".join(_rc_sigs) ) recheck_resp = llm.chat( [{"role": "user", "content": ( f"Research topic: {config.research.topic}\n\n" f"Experiment code:\n```python\n{recheck_code}\n```\n\n" "TASK: Evaluate whether this experiment code actually tests " "the stated research topic. Only main.py is shown in full; " "other files show signatures only. Answer with JSON:\n" '{"aligned": true/false, "reason": "...", "suggestions": "..."}\n' )}], system="You are a scientific code reviewer checking topic-experiment alignment.", max_tokens=1024, ) recheck_data = _safe_json_loads(recheck_resp.content, {}) if isinstance(recheck_data, dict) and recheck_data.get("aligned", False): alignment_ok = True alignment_note = f"Regenerated after alignment check (attempt {_regen_attempt})" logger.info( "Stage 10: Code aligned after regen attempt %d", _regen_attempt, ) break else: alignment_note = recheck_data.get("reason", alignment_note) suggestions = recheck_data.get("suggestions", suggestions) logger.warning( "Stage 10: Regen attempt %d still misaligned: %s", _regen_attempt, alignment_note, ) except Exception as exc: logger.debug("Alignment check failed: %s", exc) # --- FIX-7: Ablation distinctness check --- main_code = files.get("main.py", "") if llm is not None and main_code and "condition" in main_code.lower(): try: ablation_prompt = ( f"Examine this experiment code:\n```python\n{main_code[:6000]}\n```\n\n" "Check if any experimental conditions (methods/ablations) have " "IDENTICAL configurations (same hyperparameters, same code paths). 
" "Answer JSON: " '{"has_duplicates": true/false, "details": "which conditions are identical"}' ) abl_resp = llm.chat( [{"role": "user", "content": ablation_prompt}], system="You are a code reviewer checking experimental conditions.", max_tokens=512, ) abl_data = _safe_json_loads(abl_resp.content, {}) if isinstance(abl_data, dict) and abl_data.get("has_duplicates"): logger.warning( "Stage 10: Duplicate ablation conditions detected: %s", abl_data.get("details", ""), ) (stage_dir / "ablation_warning.json").write_text( json.dumps(abl_data, indent=2), encoding="utf-8" ) # --- Attempt ablation repair --- all_code_ctx = "\n\n".join( f"```filename:{f}\n{c}\n```" for f, c in files.items() ) dup_details = abl_data.get("details", "unknown") abl_repair_prompt = ( f"ABLATION REPAIR REQUIRED — duplicate conditions detected:\n" f"{dup_details}\n\n" f"Rewrite the ablation/variant conditions so each one is " f"GENUINELY DIFFERENT. Concrete strategies:\n" f"- 'no_': REMOVE the component entirely " f"(e.g., replace attention with mean pooling, remove a loss term)\n" f"- 'reduced_capacity': HALVE hidden dimensions or layers\n" f"- Different conditions MUST produce different outputs on the " f"same input. 
Add a startup assertion that runs one forward pass " f"per condition on identical input and prints:\n" f" ABLATION_CHECK: vs outputs_differ=True\n\n" f"Return ALL files using ```filename:xxx.py format.\n\n" f"Current code:\n{all_code_ctx}\n" ) try: abl_repair_resp = _chat_with_prompt( llm, _pm.system("code_generation"), abl_repair_prompt, max_tokens=_code_max_tokens, ) repaired_files = _extract_multi_file_blocks( abl_repair_resp.content ) if repaired_files and "main.py" in repaired_files: files = repaired_files for fname, code in files.items(): (exp_dir / fname).write_text(code, encoding="utf-8") logger.info( "Stage 10: Ablation repair applied — " "rewrote duplicate conditions" ) except Exception as exc: logger.debug("Ablation repair failed: %s", exc) except Exception as exc: logger.debug("Ablation validation skipped: %s", exc) # --- Write spec --- file_list = ", ".join(f"`{f}`" for f in sorted(files.keys())) main_validation = validate_code(files.get("main.py", "")) _align_status = "ALIGNED" if alignment_ok else f"MISALIGNED: {alignment_note}" spec = f"""# Experiment Specification ## Topic {config.research.topic} ## Project Structure Multi-file experiment project with {len(files)} file(s): {file_list} ## Entry Point `main.py` \u2014 executed directly via sandbox ## Outputs - `main.py` emits metric lines in `name: value` format - Primary metric key: `{metric}` ## Topic-Experiment Alignment {_align_status} ## Constraints - Time budget per run: {config.experiment.time_budget_sec}s - Max iterations: {config.experiment.max_iterations} - Self-contained execution (no external data, no network) - Validated: {main_validation.summary()} ## Generated {_utcnow_iso()} """ (stage_dir / "experiment_spec.md").write_text(spec, encoding="utf-8") artifacts = ["experiment/", "experiment_spec.md"] if (stage_dir / "validation_report.md").exists(): artifacts.append("validation_report.md") # BUG-R6-01: Fail stage if alignment check detected persistent mismatch # after all regen attempts, 
instead of silently proceeding.
    if not alignment_ok:
        logger.error(
            "Stage 10: Persistent topic-experiment misalignment after all "
            "regen attempts. Failing stage. Reason: %s",
            alignment_note,
        )
        # Surface the misalignment reason to the caller via StageResult.error.
        return StageResult(
            stage=Stage.CODE_GENERATION,
            status=StageStatus.FAILED,
            artifacts=tuple(artifacts),
            evidence_refs=tuple(f"stage-10/{a}" for a in artifacts),
            error=f"Topic-experiment misalignment: {alignment_note}",
        )

    # Happy path: code generated, validated, and aligned with the topic.
    return StageResult(
        stage=Stage.CODE_GENERATION,
        status=StageStatus.DONE,
        artifacts=tuple(artifacts),
        evidence_refs=tuple(f"stage-10/{a}" for a in artifacts),
    )


================================================
FILE: researchclaw/pipeline/stage_impls/_execution.py
================================================
"""Stages 11-13: Resource planning, experiment execution, and iterative refinement."""

from __future__ import annotations

import json
import logging
import math
import re
import time as _time
from pathlib import Path
from typing import Any

from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.experiment.validator import (
    CodeValidation,
    format_issues_for_llm,
    validate_code,
)
from researchclaw.llm.client import LLMClient
from researchclaw.pipeline._domain import _detect_domain
from researchclaw.pipeline._helpers import (
    StageResult,
    _chat_with_prompt,
    _detect_runtime_issues,
    _ensure_sandbox_deps,
    _extract_code_block,
    _extract_multi_file_blocks,
    _get_evolution_overlay,
    _load_hardware_profile,
    _parse_metrics_from_stdout,
    _read_prior_artifact,
    _safe_filename,
    _safe_json_loads,
    _utcnow_iso,
    _write_stage_meta,
)
from researchclaw.pipeline.stages import Stage, StageStatus
from researchclaw.prompts import PromptManager

logger = logging.getLogger(__name__)


def _execute_resource_planning(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 11: produce a task schedule (``schedule.json``) for the experiment.

    Asks the LLM for a JSON schedule based on the prior stage's experiment
    plan (``exp_plan.yaml``); falls back to a hard-coded two-task
    baseline→proposed schedule when no LLM is available or the LLM response
    is not a JSON object.

    Returns a DONE StageResult whose single artifact is ``schedule.json``.
    """
    # Experiment plan from the earlier planning stage (empty string if absent).
    exp_plan = _read_prior_artifact(run_dir, "exp_plan.yaml") or ""
    schedule: dict[str, Any] | None = None
    if llm is not None:
        _pm = prompts or PromptManager()
        _overlay = _get_evolution_overlay(run_dir, "resource_planning")
        sp = _pm.for_stage("resource_planning", evolution_overlay=_overlay, exp_plan=exp_plan)
        resp = _chat_with_prompt(
            llm,
            sp.system,
            sp.user,
            json_mode=sp.json_mode,
            max_tokens=sp.max_tokens,
        )
        parsed = _safe_json_loads(resp.content, {})
        # Only a top-level JSON object counts as a usable schedule; anything
        # else (list, string, parse failure) falls through to the default.
        if isinstance(parsed, dict):
            schedule = parsed
    if schedule is None:
        # Fallback schedule: run the baseline first, then the proposed method.
        schedule = {
            "tasks": [
                {
                    "id": "baseline",
                    "name": "Run baseline",
                    "depends_on": [],
                    "gpu_count": 1,
                    "estimated_minutes": 20,
                    "priority": "high",
                },
                {
                    "id": "proposed",
                    "name": "Run proposed method",
                    "depends_on": ["baseline"],
                    "gpu_count": 1,
                    "estimated_minutes": 30,
                    "priority": "high",
                },
            ],
            "total_gpu_budget": 1,
            "generated": _utcnow_iso(),
        }
    # Ensure LLM-produced schedules also carry a generation timestamp.
    schedule.setdefault("generated", _utcnow_iso())
    (stage_dir / "schedule.json").write_text(
        json.dumps(schedule, indent=2), encoding="utf-8"
    )
    return StageResult(
        stage=Stage.RESOURCE_PLANNING,
        status=StageStatus.DONE,
        artifacts=("schedule.json",),
        evidence_refs=("stage-11/schedule.json",),
    )


def _execute_experiment_run(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 12: execute the generated experiment and write per-run JSON records.

    Dispatches on ``config.experiment.mode``: sandbox/docker runs the code for
    real, "simulated" fabricates placeholder results from the schedule, and any
    other mode uses the iterative ExperimentRunner loop.
    """
    from researchclaw.experiment.factory import create_sandbox
    from researchclaw.experiment.runner import ExperimentRunner

    schedule_text = _read_prior_artifact(run_dir, "schedule.json") or "{}"
    # Try multi-file experiment directory first, fall back to single file
    exp_dir_path = _read_prior_artifact(run_dir, "experiment/")
    code_text = ""
    if exp_dir_path and Path(exp_dir_path).is_dir():
        main_path = Path(exp_dir_path) / "main.py"
        if main_path.exists():
            code_text = main_path.read_text(encoding="utf-8")
    if not code_text:
        code_text = _read_prior_artifact(run_dir, "experiment.py") or ""
    runs_dir = stage_dir / "runs"
    runs_dir.mkdir(parents=True, exist_ok=True)
    mode = config.experiment.mode
    if mode in
("sandbox", "docker"): # P7: Auto-install missing dependencies before subprocess sandbox if mode == "sandbox": _all_code = code_text if exp_dir_path and Path(exp_dir_path).is_dir(): for _pyf in Path(exp_dir_path).glob("*.py"): try: _all_code += "\n" + _pyf.read_text(encoding="utf-8") except (OSError, UnicodeDecodeError): pass _ensure_sandbox_deps(_all_code, config.experiment.sandbox.python_path) sandbox = create_sandbox(config.experiment, runs_dir / "sandbox") # Use run_project for multi-file, run for single-file if exp_dir_path and Path(exp_dir_path).is_dir(): result = sandbox.run_project( Path(exp_dir_path), timeout_sec=config.experiment.time_budget_sec ) else: result = sandbox.run( code_text, timeout_sec=config.experiment.time_budget_sec ) # Try to read structured results.json from sandbox working dir structured_results: dict[str, Any] | None = None sandbox_project = runs_dir / "sandbox" / "_project" results_json_path = sandbox_project / "results.json" if results_json_path.exists(): try: structured_results = json.loads( results_json_path.read_text(encoding="utf-8") ) # Copy results.json to runs dir for easy access (runs_dir / "results.json").write_text( results_json_path.read_text(encoding="utf-8"), encoding="utf-8", ) except (json.JSONDecodeError, OSError): structured_results = None # If sandbox metrics are empty, try to parse from stdout effective_metrics = result.metrics if not effective_metrics and result.stdout: effective_metrics = _parse_metrics_from_stdout(result.stdout) # Determine run status: completed / partial (timed out with data) / failed # R6-2: Detect stdout failure signals even when exit code is 0 _stdout_has_failure = bool( result.stdout and not effective_metrics and any( sig in result.stdout for sig in ("FAIL:", "NaN/divergence", "Traceback (most recent") ) ) if result.returncode == 0 and not result.timed_out and not _stdout_has_failure: run_status = "completed" elif result.timed_out and effective_metrics: run_status = "partial" logger.warning( 
"Experiment timed out but captured %d partial metrics", len(effective_metrics), ) else: run_status = "failed" if _stdout_has_failure: logger.warning( "Experiment exited cleanly but stdout contains failure signals" ) # P1: Warn if experiment completed suspiciously fast (trivially easy benchmark) if run_status == "completed" and result.elapsed_sec and result.elapsed_sec < 5.0: logger.warning( "Stage 12: Experiment completed in %.2fs — benchmark may be trivially easy. " "Consider increasing task difficulty.", result.elapsed_sec, ) run_payload: dict[str, Any] = { "run_id": "run-1", "task_id": "sandbox-main", "status": run_status, "metrics": effective_metrics, "elapsed_sec": result.elapsed_sec, "stdout": result.stdout, "stderr": result.stderr, "timed_out": result.timed_out, "completed_at": _utcnow_iso(), } if structured_results is not None: run_payload["structured_results"] = structured_results # Auto-generate results.json from parsed metrics if sandbox didn't produce one if structured_results is None and effective_metrics: auto_results = {"source": "stdout_parsed", "metrics": effective_metrics} (runs_dir / "results.json").write_text( json.dumps(auto_results, indent=2), encoding="utf-8" ) logger.info("Stage 12: Auto-generated results.json from stdout metrics (%d keys)", len(effective_metrics)) (runs_dir / "run-1.json").write_text( json.dumps(run_payload, indent=2), encoding="utf-8" ) # R11-6: Time budget adequacy check if result.timed_out or (result.elapsed_sec and result.elapsed_sec > config.experiment.time_budget_sec * 0.9): # Parse stdout to estimate how many conditions/seeds completed _stdout = result.stdout or "" _completed_conditions = set() _completed_seeds = 0 for _line in _stdout.splitlines(): if "condition=" in _line and "seed=" in _line: _completed_seeds += 1 _cond_match = re.match(r".*condition=(\S+)", _line) if _cond_match: _completed_conditions.add(_cond_match.group(1)) _time_budget_warning = { "timed_out": result.timed_out, "elapsed_sec": 
result.elapsed_sec, "budget_sec": config.experiment.time_budget_sec, "conditions_completed": sorted(_completed_conditions), "total_seed_runs": _completed_seeds, "warning": ( f"Experiment used {result.elapsed_sec:.0f}s of " f"{config.experiment.time_budget_sec}s budget. " f"Only {len(_completed_conditions)} conditions completed " f"({_completed_seeds} seed-runs). Consider increasing " f"time_budget_sec for more complete results." ), } logger.warning( "Stage 12: %s", _time_budget_warning["warning"] ) (stage_dir / "time_budget_warning.json").write_text( json.dumps(_time_budget_warning, indent=2), encoding="utf-8" ) # FIX-8: Validate seed count from structured results if structured_results and isinstance(structured_results, dict): _sr_conditions = structured_results.get("conditions", structured_results.get("per_condition", {})) if isinstance(_sr_conditions, dict): for _cname, _cdata in _sr_conditions.items(): if isinstance(_cdata, dict): _seeds_run = _cdata.get("seeds_run", _cdata.get("n_seeds", 0)) if isinstance(_seeds_run, (int, float)) and 0 < _seeds_run < 3: logger.warning( "Stage 12: Condition '%s' ran only %d seed(s) — " "minimum 3 required for statistical validity", _cname, int(_seeds_run), ) elif mode == "simulated": schedule = _safe_json_loads(schedule_text, {}) tasks = schedule.get("tasks", []) if isinstance(schedule, dict) else [] if not isinstance(tasks, list): tasks = [] for idx, task in enumerate(tasks or [{"id": "task-1", "name": "simulated"}]): task_id = ( str(task.get("id", f"task-{idx + 1}")) if isinstance(task, dict) else f"task-{idx + 1}" ) payload = { "run_id": f"run-{idx + 1}", "task_id": task_id, "status": "simulated", "key_metrics": { config.experiment.metric_key: round(0.3 + idx * 0.03, 4), "secondary_metric": round(0.6 - idx * 0.04, 4), }, "notes": "Simulated run result", "completed_at": _utcnow_iso(), } run_id = str(payload["run_id"]) (runs_dir / f"{_safe_filename(run_id)}.json").write_text( json.dumps(payload, indent=2), encoding="utf-8" ) 
    else:
        runner = ExperimentRunner(config.experiment, runs_dir / "workspace")
        history = runner.run_loop(code_text, run_id=f"exp-{run_dir.name}", llm=llm)
        runner.save_history(stage_dir / "experiment_history.json")
        for item in history.results:
            payload = {
                "run_id": f"run-{item.iteration}",
                "task_id": item.run_id,
                "status": "completed" if item.error is None else "failed",
                "metrics": item.metrics,
                "primary_metric": item.primary_metric,
                "improved": item.improved,
                "kept": item.kept,
                "elapsed_sec": item.elapsed_sec,
                "error": item.error,
                "completed_at": _utcnow_iso(),
            }
            run_id = str(payload["run_id"])
            (runs_dir / f"{_safe_filename(run_id)}.json").write_text(
                json.dumps(payload, indent=2), encoding="utf-8"
            )
    return StageResult(
        stage=Stage.EXPERIMENT_RUN,
        status=StageStatus.DONE,
        artifacts=("runs/",),
        evidence_refs=("stage-12/runs/",),
    )


def _execute_iterative_refine(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 13: iteratively refine the experiment code toward a better metric.

    Loads the prior experiment project (multi-file or single experiment.py),
    collects baseline metrics from the Stage-12 run payloads, then runs up to
    ``max_iterations`` LLM improve → validate → (sandbox re-run) cycles,
    keeping the best-scoring version. Writes ``experiment_final/`` and
    ``refinement_log.json`` into ``stage_dir`` and returns a DONE StageResult.

    In "simulated" mode (or when ``llm`` is None) the stage short-circuits:
    the prior code is copied through as final and the log records the skip.
    """
    from researchclaw.experiment.factory import create_sandbox
    from researchclaw.experiment.validator import format_issues_for_llm, validate_code

    def _to_float(value: Any) -> float | None:
        """Best-effort float conversion; returns None for non-numeric input."""
        try:
            if value is None:
                return None
            f = float(value)
            # BUG-EX-01: NaN/Inf block all future improvement detection
            if math.isnan(f) or math.isinf(f):
                return None
            return f
        except (TypeError, ValueError):
            return None

    # R10-Fix3: Skip iterative refinement in simulated mode (no real execution)
    if config.experiment.mode == "simulated":
        logger.info(
            "Stage 13: Skipping iterative refinement in simulated mode "
            "(no real code execution available)"
        )
        import shutil

        final_dir = stage_dir / "experiment_final"
        # Copy latest experiment code as final (directory or single file)
        copied = False
        for stage_num in (12, 10):
            src_dir = run_dir / f"stage-{stage_num:02d}" / "experiment"
            if src_dir.is_dir():
                if final_dir.exists():
                    shutil.rmtree(final_dir)
                shutil.copytree(src_dir, final_dir)
                copied = True
                break
            # Also check for single experiment.py
            src_file = run_dir / f"stage-{stage_num:02d}" / "experiment.py"
            if src_file.is_file():
                (stage_dir / "experiment_final.py").write_text(
                    src_file.read_text(encoding="utf-8"), encoding="utf-8"
                )
                copied = True
                break
        log: dict[str, Any] = {
            "generated": _utcnow_iso(),
            "mode": "simulated",
            "skipped": True,
            "skip_reason": "Iterative refinement not meaningful in simulated mode",
            "metric_key": config.experiment.metric_key,
        }
        (stage_dir / "refinement_log.json").write_text(
            json.dumps(log, indent=2), encoding="utf-8"
        )
        return StageResult(
            stage=Stage.ITERATIVE_REFINE,
            status=StageStatus.DONE,
            artifacts=("refinement_log.json",),
            evidence_refs=(),
        )

    metric_key = config.experiment.metric_key
    metric_direction = config.experiment.metric_direction

    # P9: Detect metric direction mismatch between config and experiment code.
    # The code-gen stage instructs experiments to print a line like:
    #     METRIC_DEF: primary_metric | direction=higher | desc=...
    # Log a warning if mismatch is detected, but trust the config value
    # (BUG-06 fix: no longer auto-override, since Stage 9 and 12 now
    # explicitly enforce config.metric_direction in prompts).
    _runs_dir_detect = _read_prior_artifact(run_dir, "runs/")
    if _runs_dir_detect and Path(_runs_dir_detect).is_dir():
        import re as _re_detect

        # Only scan the first few run payloads; one METRIC_DEF line suffices.
        for _rf in sorted(Path(_runs_dir_detect).glob("*.json"))[:5]:
            try:
                _rp = _safe_json_loads(_rf.read_text(encoding="utf-8"), {})
                _stdout = _rp.get("stdout", "") if isinstance(_rp, dict) else ""
                _match = _re_detect.search(
                    r"METRIC_DEF:.*direction\s*=\s*(higher|lower)", _stdout
                )
                if _match:
                    _detected = _match.group(1)
                    _detected_dir = "maximize" if _detected == "higher" else "minimize"
                    if _detected_dir != metric_direction:
                        logger.warning(
                            "P9: Metric direction mismatch — config says '%s' but "
                            "experiment code declares 'direction=%s'. "
                            "Keeping config value '%s'. Code will be "
                            "corrected in next refinement cycle.",
                            metric_direction,
                            _detected,
                            metric_direction,
                        )
                    break
            except OSError:
                pass
    maximize = metric_direction == "maximize"

    def _is_better(candidate: float | None, current: float | None) -> bool:
        """True if candidate beats current under the configured direction.

        A non-None candidate always beats a None current (first real value).
        """
        if candidate is None:
            return False
        if current is None:
            return True
        return candidate > current if maximize else candidate < current

    def _find_metric(metrics: dict[str, object], key: str) -> float | None:
        """R13-4: Find metric value with fuzzy key matching.

        Tries exact match first, then looks for aggregate keys that
        contain the metric name (e.g. 'primary_metric_mean' when
        key='primary_metric').
        """
        # Exact match
        val = _to_float(metrics.get(key))
        if val is not None:
            return val
        # Try aggregate/mean keys containing the metric name
        # Prefer keys ending with the metric name or containing '_mean'
        candidates: list[tuple[str, float]] = []
        for mk, mv in metrics.items():
            fv = _to_float(mv)
            if fv is None:
                continue
            if mk == key or mk.endswith(f"/{key}"):
                return fv  # Exact match via condition prefix
            if key in mk and ("mean" in mk or "avg" in mk):
                candidates.append((mk, fv))
            elif mk.endswith(f"_{key}") or mk.endswith(f"/{key}_mean"):
                candidates.append((mk, fv))
        if candidates:
            # Take the aggregate mean if available, otherwise first match
            for ck, cv in candidates:
                if "mean" in ck:
                    return cv
            return candidates[0][1]
        # Last resort: if there's an "overall" or root-level aggregate
        for mk, mv in metrics.items():
            fv = _to_float(mv)
            if fv is not None and key in mk and "/" not in mk and "seed" not in mk:
                return fv
        return None

    requested_iterations = int(getattr(config.experiment, "max_iterations", 10) or 10)
    max_iterations = max(1, min(requested_iterations, 10))

    # BUG-57: Wall-clock time cap for the entire refinement stage.
    # Default: 1.5× the per-iteration time budget (e.g., 2400s → 3600s),
    # unless max_refine_duration_sec is set explicitly on the config.
    import time as _time_bug57

    _refine_start_time = _time_bug57.monotonic()
    _per_iter_budget = int(getattr(config.experiment, "time_budget_sec", 2400) or 2400)
    _max_refine_wall_sec = int(
        getattr(config.experiment, "max_refine_duration_sec", 0) or 0
    ) or int(_per_iter_budget * 1.5)

    # --- Collect baseline metrics from prior runs ---
    runs_dir_path: Path | None = None
    runs_dir_text = _read_prior_artifact(run_dir, "runs/")
    if runs_dir_text:
        runs_dir_path = Path(runs_dir_text)
    run_summaries: list[str] = []
    baseline_metric: float | None = None
    if runs_dir_path is not None:
        for run_file in sorted(runs_dir_path.glob("*.json"))[:40]:
            payload = _safe_json_loads(run_file.read_text(encoding="utf-8"), {})
            if not isinstance(payload, dict):
                continue
            # R5-5: Truncate stdout/stderr for context efficiency
            summary = dict(payload)
            if "stdout" in summary and isinstance(summary["stdout"], str):
                lines = summary["stdout"].splitlines()
                if len(lines) > 30:
                    summary["stdout"] = (
                        f"[...truncated {len(lines) - 30} lines...]\n"
                        + "\n".join(lines[-30:])
                    )
                if len(summary["stdout"]) > 2000:
                    summary["stdout"] = summary["stdout"][-2000:]
            if "stderr" in summary and isinstance(summary["stderr"], str):
                lines = summary["stderr"].splitlines()
                if len(lines) > 50:
                    summary["stderr"] = "\n".join(lines[-50:])
                if len(summary["stderr"]) > 2000:
                    summary["stderr"] = summary["stderr"][-2000:]
            run_summaries.append(json.dumps(summary, ensure_ascii=False))
            metrics = payload.get("metrics")
            if not isinstance(metrics, dict):
                metrics = (
                    payload.get("key_metrics")
                    if isinstance(payload.get("key_metrics"), dict)
                    else {}
                )
            metric_val = (
                _find_metric(metrics, metric_key) if isinstance(metrics, dict) else None
            )
            if metric_val is None:
                metric_val = _to_float(payload.get("primary_metric"))
            if _is_better(metric_val, baseline_metric):
                baseline_metric = metric_val

    # --- Read experiment project (multi-file or single-file) ---
    # BUG-58: When PIVOT rolls back to Stage 13, prefer the best refined code
    # from a previous cycle (stage-13_vX/experiment_final/) over the original
    # unrefined code (stage-12/experiment/ or stage-10/experiment/).
    # Enhanced: try ALL versioned directories (latest first) with fallback chain.
    exp_dir_text: str | None = None
    _prev_refine_dirs = sorted(
        run_dir.glob("stage-13_v*/experiment_final"),
        key=lambda p: p.parent.name,
        reverse=True,  # latest version first
    )
    # BUG-58 fix: Find the best version across ALL cycles (not just latest)
    _best_prev_metric: float | None = None
    _best_prev_dir: Path | None = None
    for _prd in _prev_refine_dirs:
        if not _prd.is_dir():
            continue
        _prd_log = _prd.parent / "refinement_log.json"
        if _prd_log.is_file():
            _prd_data = _safe_json_loads(
                _prd_log.read_text(encoding="utf-8"), {}
            )
            _prd_metric = _prd_data.get("best_metric") if isinstance(_prd_data, dict) else None
            if isinstance(_prd_metric, (int, float)) and _is_better(_prd_metric, _best_prev_metric):
                _best_prev_metric = _prd_metric
                _best_prev_dir = _prd
        elif _best_prev_dir is None:
            # No log but directory exists — use as fallback
            _best_prev_dir = _prd
    if _best_prev_dir is not None:
        exp_dir_text = str(_best_prev_dir)
        logger.info(
            "BUG-58: Recovered best refined code from PIVOT cycle: %s (metric=%s)",
            _best_prev_dir.parent.name,
            f"{_best_prev_metric:.4f}" if _best_prev_metric is not None else "N/A",
        )
    if not exp_dir_text:
        exp_dir_text = _read_prior_artifact(run_dir, "experiment/")
    best_files: dict[str, str] = {}
    if exp_dir_text and Path(exp_dir_text).is_dir():
        # BUG-EX-02: Load ALL text files (not just .py) — requirements.txt,
        # setup.py, config files are needed for Docker sandbox phases.
        for src_file in sorted(Path(exp_dir_text).iterdir()):
            if src_file.is_file() and src_file.suffix in (
                ".py", ".txt", ".yaml", ".yml", ".json", ".cfg", ".ini", ".sh",
            ):
                try:
                    best_files[src_file.name] = src_file.read_text(encoding="utf-8")
                except UnicodeDecodeError:
                    pass  # skip binary files
    if not best_files:
        # Backward compat: single experiment.py
        original_code = _read_prior_artifact(run_dir, "experiment.py") or ""
        if original_code:
            best_files = {"main.py": original_code}

    # --- Detect if prior experiment timed out ---
    prior_timed_out = False
    prior_time_budget = config.experiment.time_budget_sec
    if runs_dir_path is not None:
        for run_file in sorted(runs_dir_path.glob("*.json"))[:5]:
            try:
                payload = _safe_json_loads(run_file.read_text(encoding="utf-8"), {})
                if isinstance(payload, dict) and payload.get("timed_out"):
                    prior_timed_out = True
                    break
            except OSError:
                pass

    best_metric = baseline_metric
    best_version = "experiment/"
    # BUG-58: Recover best_metric from best previous PIVOT cycle
    if _best_prev_metric is not None and _is_better(_best_prev_metric, best_metric):
        best_metric = _best_prev_metric
        logger.info(
            "BUG-58: Recovered best_metric=%.4f from previous PIVOT",
            best_metric,
        )
    no_improve_streak = 0
    consecutive_no_metrics = 0
    log: dict[str, Any] = {
        "generated": _utcnow_iso(),
        "mode": config.experiment.mode,
        "metric_key": metric_key,
        "metric_direction": metric_direction,
        "max_iterations_requested": requested_iterations,
        "max_iterations_executed": max_iterations,
        "baseline_metric": baseline_metric,
        "project_files": list(best_files.keys()),
        "iterations": [],
        "converged": False,
        "stop_reason": "max_iterations_reached",
    }

    # --- Helper: write files to a directory ---
    def _write_project(target_dir: Path, project_files: dict[str, str]) -> None:
        """Write every {filename: code} entry into target_dir (created if needed)."""
        target_dir.mkdir(parents=True, exist_ok=True)
        for fname, code in project_files.items():
            (target_dir / fname).write_text(code, encoding="utf-8")

    # --- Helper: format all files for LLM context ---
    def _files_to_context(project_files: dict[str, str]) -> str:
        """Render the project as fenced ```filename:...``` blocks for prompts."""
        parts = []
        for fname, code in sorted(project_files.items()):
            parts.append(f"```filename:{fname}\n{code}\n```")
        return "\n\n".join(parts)

    if llm is None:
        logger.info("Stage 13: LLM unavailable, saving original experiment as final")
        final_dir = stage_dir / "experiment_final"
        _write_project(final_dir, best_files)
        # Backward compat
        if "main.py" in best_files:
            (stage_dir / "experiment_final.py").write_text(
                best_files["main.py"], encoding="utf-8"
            )
        log.update(
            {
                "converged": True,
                "stop_reason": "llm_unavailable",
                "best_metric": best_metric,
                "best_version": "experiment_final/",
                "iterations": [
                    {
                        "iteration": 0,
                        "version_dir": "experiment_final/",
                        "source": "fallback_original",
                        "metric": best_metric,
                    }
                ],
            }
        )
        (stage_dir / "refinement_log.json").write_text(
            json.dumps(log, indent=2), encoding="utf-8"
        )
        artifacts = ("refinement_log.json", "experiment_final/")
        return StageResult(
            stage=Stage.ITERATIVE_REFINE,
            status=StageStatus.DONE,
            artifacts=artifacts,
            evidence_refs=tuple(f"stage-13/{a}" for a in artifacts),
        )

    _pm = prompts or PromptManager()
    timeout_refine_attempts = 0

    # R7-3: Read experiment plan to detect condition coverage gaps
    _exp_plan_text = _read_prior_artifact(run_dir, "exp_plan.yaml") or ""
    _condition_coverage_hint = ""
    if _exp_plan_text and run_summaries:
        # Check if stdout contains condition labels
        _all_stdout = " ".join(run_summaries)
        _has_condition_labels = "condition=" in _all_stdout
        if not _has_condition_labels and _exp_plan_text.strip():
            _condition_coverage_hint = (
                "\nCONDITION COVERAGE GAP DETECTED:\n"
                "The experiment plan specifies multiple conditions/treatments, "
                "but the output contains NO condition labels (no 'condition=...' in stdout).\n"
                "You MUST:\n"
                "1. Run ALL conditions/treatments from the experiment plan independently\n"
                "2. Label each metric output: `condition= {metric_key}: `\n"
                "3. Print a SUMMARY line comparing all conditions after completion\n"
                "This is the MOST IMPORTANT improvement — a single unlabeled metric stream "
                "cannot support any comparative conclusions.\n\n"
            )
            logger.info(
                "Stage 13: condition coverage gap detected, injecting multi-condition hint"
            )

    # P1: Track metrics history for saturation detection
    _metrics_history: list[float | None] = [baseline_metric]

    for iteration in range(1, max_iterations + 1):
        # BUG-57: Check wall-clock time before starting a new iteration
        _elapsed = _time_bug57.monotonic() - _refine_start_time
        if _elapsed > _max_refine_wall_sec:
            logger.warning(
                "Stage 13: Wall-clock time cap reached (%.0fs > %ds). "
                "Stopping refinement after %d iterations.",
                _elapsed,
                _max_refine_wall_sec,
                iteration - 1,
            )
            log["stop_reason"] = "wall_clock_time_cap"
            break
        logger.info("Stage 13: refinement iteration %d/%d (%.0fs elapsed, cap %ds)",
                    iteration, max_iterations, _elapsed, _max_refine_wall_sec)

        # P1: Detect metric saturation and inject difficulty upgrade hint
        _saturation_hint = ""
        _valid_metrics = [m for m in _metrics_history if m is not None]
        if len(_valid_metrics) >= 2:
            _last_two = _valid_metrics[-2:]
            _saturated = False
            # Use relative change rate instead of hard-coded thresholds
            _change_rate = abs(_last_two[-1] - _last_two[-2]) / max(abs(_last_two[-2]), 1e-8)
            if metric_direction == "minimize":
                _saturated = all(m <= 0.001 for m in _last_two) or (
                    _change_rate < 0.001 and _last_two[-1] < 0.01
                )
            else:
                _saturated = all(m >= 0.999 for m in _last_two) or (
                    _change_rate < 0.001 and _last_two[-1] > 0.99
                )
            if _saturated:
                _saturation_hint = (
                    "\n\nWARNING — BENCHMARK SATURATION DETECTED:\n"
                    "All methods achieve near-perfect scores, making the task too easy "
                    "to discriminate between methods.\n"
                    "YOU MUST increase benchmark difficulty in this iteration:\n"
                    "1. Increase the number of actions/decisions from 8 to at least 20\n"
                    "2. Increase the horizon from 12-18 to at least 50-100 steps\n"
                    "3. Increase noise level to at least 0.3-0.5\n"
                    "4. Add partial observability (agent cannot see full state)\n"
                    "5. Add delayed rewards (reward only at episode end)\n"
                    "6. Ensure random search achieves < 50% success rate\n"
                    "Without this change, the experiment produces meaningless results.\n"
                )
                logger.warning("Stage 13: metric saturation detected, injecting difficulty upgrade hint")

        files_context = _files_to_context(best_files)
        # BUG-10 fix: anchor refinement to original experiment plan
        _exp_plan_anchor = ""
        if _exp_plan_text.strip():
            _exp_plan_anchor = (
                "Original experiment plan (exp_plan.yaml):\n"
                "```yaml\n" + _exp_plan_text[:4000] + "\n```\n"
                "You MUST preserve ALL condition names from this plan.\n\n"
            )
        ip = _pm.sub_prompt(
            "iterative_improve",
            metric_key=metric_key,
            metric_direction=metric_direction,
            files_context=files_context,
            run_summaries=chr(10).join(run_summaries[:20]),
            condition_coverage_hint=_condition_coverage_hint,
            topic=config.research.topic,
            exp_plan_anchor=_exp_plan_anchor,
        )

        # --- Timeout-aware prompt injection ---
        user_prompt = ip.user + _saturation_hint
        if prior_timed_out and baseline_metric is None:
            timeout_refine_attempts += 1
            timeout_hint = (
                f"\n\nCRITICAL: The experiment TIMED OUT after {prior_time_budget}s "
                f"with NO results. You MUST drastically reduce the experiment scale:\n"
                f"- Reduce total runs to ≤50\n"
                f"- Reduce steps per run to ≤2000\n"
                f"- Remove conditions that are not essential\n"
                f"- Add time.time() checks to stop gracefully before timeout\n"
                f"- Print intermediate metrics frequently so partial data is captured\n"
                f"- Time budget is {prior_time_budget}s — design for ≤{int(prior_time_budget * 0.7)}s\n"
            )
            user_prompt = user_prompt + timeout_hint
            logger.warning(
                "Stage 13: injecting timeout-aware prompt (attempt %d)",
                timeout_refine_attempts,
            )
        response = _chat_with_prompt(
            llm,
            ip.system,
            user_prompt,
            max_tokens=ip.max_tokens or 8192,
        )
        extracted_files = _extract_multi_file_blocks(response.content)
        # If LLM returns only single block, treat as main.py update
        if not extracted_files:
            single_code = _extract_code_block(response.content)
            if single_code.strip():
                extracted_files = {"main.py": single_code}
        # R8-2: Merge with best_files to preserve supporting modules
        # (e.g., graphs.py, game.py) that the LLM didn't rewrite
        candidate_files = dict(best_files)
        if extracted_files:
            candidate_files.update(extracted_files)
        # If LLM returned nothing at all, candidate_files == best_files (unchanged)

        # BUG-R6-02: Preserve entry point when LLM strips main() function.
        # The LLM often returns only class/function improvements without the
        # main() entry point, causing the script to exit with no output.
        _new_main = candidate_files.get("main.py", "")
        _old_main = best_files.get("main.py", "")
        if (
            _new_main
            and _old_main
            and "if __name__" not in _new_main
            and "if __name__" in _old_main
        ):
            # Extract the entry-point block from original main.py
            _ep_idx = _old_main.rfind("\ndef main(")
            if _ep_idx == -1:
                _ep_idx = _old_main.rfind("\nif __name__")
            if _ep_idx != -1:
                _entry_block = _old_main[_ep_idx:]
                candidate_files["main.py"] = _new_main.rstrip() + "\n\n" + _entry_block
                logger.info(
                    "Stage 13 iter %d: restored entry point stripped by LLM "
                    "(%d chars appended from original main.py)",
                    iteration,
                    len(_entry_block),
                )

        # Validate main.py
        main_code = candidate_files.get("main.py", "")
        validation = validate_code(main_code)
        issue_text = ""
        repaired = False
        if not validation.ok:
            issue_text = format_issues_for_llm(validation)
            logger.info(
                "Stage 13 iteration %d validation failed: %s",
                iteration,
                validation.summary(),
            )
            irp = _pm.sub_prompt(
                "iterative_repair",
                issue_text=issue_text,
                all_files_ctx=_files_to_context(candidate_files),
            )
            repair_response = _chat_with_prompt(llm, irp.system, irp.user)
            candidate_files["main.py"] = _extract_code_block(repair_response.content)
            validation = validate_code(candidate_files["main.py"])
            repaired = True

        # Save version directory
        version_dir = stage_dir / f"experiment_v{iteration}"
        _write_project(version_dir, candidate_files)

        iter_record: dict[str, Any] = {
            "iteration": iteration,
            "version_dir": f"experiment_v{iteration}/",
            "files": list(candidate_files.keys()),
            "validation_ok": validation.ok,
            "validation_summary": validation.summary(),
            "repaired": repaired,
            "metric": None,
            "improved": False,
        }
        if issue_text:
            iter_record["validation_issues"] = issue_text

        metric_val = None  # R6-3: initialize before conditional block
        if validation.ok and config.experiment.mode in ("sandbox", "docker"):
            # P7: Ensure deps for refined code (subprocess sandbox only)
            if config.experiment.mode == "sandbox":
                _refine_code = "\n".join(candidate_files.values())
                _ensure_sandbox_deps(_refine_code, config.experiment.sandbox.python_path)
            sandbox = create_sandbox(
                config.experiment,
                stage_dir / f"refine_sandbox_v{iteration}",
            )
            rerun = sandbox.run_project(
                version_dir,
                timeout_sec=config.experiment.time_budget_sec,
            )
            metric_val = _find_metric(rerun.metrics, metric_key)
            # R19-1: Store stdout (capped) so PAIRED lines survive for Stage 14
            _stdout_cap = rerun.stdout[:50000] if rerun.stdout else ""
            iter_record["sandbox"] = {
                "returncode": rerun.returncode,
                "metrics": rerun.metrics,
                "elapsed_sec": rerun.elapsed_sec,
                "timed_out": rerun.timed_out,
                "stderr": rerun.stderr[:2000] if rerun.stderr else "",
                "stdout": _stdout_cap,
            }
            iter_record["metric"] = metric_val

            # BUG-110: Parse ABLATION_CHECK lines from stdout
            if rerun.stdout:
                import re as _re_ablation

                _ablation_checks = _re_ablation.findall(
                    r"ABLATION_CHECK:\s*(\S+)\s+vs\s+(\S+)\s+outputs_differ=(True|False)",
                    rerun.stdout,
                )
                if _ablation_checks:
                    _identical_pairs = [
                        (c1, c2)
                        for c1, c2, diff in _ablation_checks
                        if diff == "False"
                    ]
                    iter_record["ablation_checks"] = [
                        {"cond1": c1, "cond2": c2, "differ": diff == "True"}
                        for c1, c2, diff in _ablation_checks
                    ]
                    if _identical_pairs:
                        _pairs_str = ", ".join(f"{c1} vs {c2}" for c1, c2 in _identical_pairs)
                        logger.warning(
                            "BUG-110: Identical ablation outputs detected: %s. "
                            "Ablation conditions may not be wired correctly.",
                            _pairs_str,
                        )
                        iter_record["ablation_identical"] = True

            # --- Track timeout in refine sandbox ---
            if rerun.timed_out:
                prior_timed_out = True
                timeout_refine_attempts += 1
                logger.warning(
                    "Stage 13 iteration %d: sandbox timed out after %.1fs",
                    iteration,
                    rerun.elapsed_sec,
                )
                # If still no metrics after timeout, use partial stdout metrics
                if not rerun.metrics and rerun.stdout:
                    from researchclaw.experiment.sandbox import parse_metrics as _parse_sb_metrics

                    partial = _parse_sb_metrics(rerun.stdout)
                    if partial:
                        iter_record["sandbox"]["metrics"] = partial
                        metric_val = _find_metric(partial, metric_key)
                        iter_record["metric"] = metric_val
                        logger.info(
                            "Stage 13 iteration %d: recovered %d partial metrics from timeout stdout",
                            iteration,
                            len(partial),
                        )

            # --- Detect runtime issues (NaN/Inf, stderr warnings) ---
            runtime_issues = _detect_runtime_issues(rerun)
            if runtime_issues:
                iter_record["runtime_issues"] = runtime_issues
                logger.info(
                    "Stage 13 iteration %d: runtime issues detected: %s",
                    iteration,
                    runtime_issues[:200],
                )
                # Attempt LLM repair with runtime context
                rrp = _pm.sub_prompt(
                    "iterative_repair",
                    issue_text=runtime_issues,
                    all_files_ctx=_files_to_context(candidate_files),
                )
                repair_resp = _chat_with_prompt(llm, rrp.system, rrp.user)
                repaired_files = _extract_multi_file_blocks(repair_resp.content)
                if not repaired_files:
                    single = _extract_code_block(repair_resp.content)
                    if single.strip():
                        repaired_files = dict(candidate_files)
                        repaired_files["main.py"] = single
                if repaired_files:
                    # BUG-106 fix: merge instead of replace to preserve
                    # supporting modules (trainers.py, utils.py, etc.)
                    merged = dict(candidate_files)
                    merged.update(repaired_files)
                    candidate_files = merged
                    _write_project(version_dir, candidate_files)
                    # Re-run after runtime fix
                    sandbox2 = create_sandbox(
                        config.experiment,
                        stage_dir / f"refine_sandbox_v{iteration}_fix",
                    )
                    rerun2 = sandbox2.run_project(
                        version_dir,
                        timeout_sec=config.experiment.time_budget_sec,
                    )
                    metric_val = _find_metric(rerun2.metrics, metric_key)
                    iter_record["sandbox_after_fix"] = {
                        "returncode": rerun2.returncode,
                        "metrics": rerun2.metrics,
                        "elapsed_sec": rerun2.elapsed_sec,
                        "timed_out": rerun2.timed_out,
                    }
                    iter_record["metric"] = metric_val
                    iter_record["runtime_repaired"] = True

            if metric_val is not None:
                consecutive_no_metrics = 0
                # R6-1: Only count toward no_improve_streak when we have real metrics
                if _is_better(metric_val, best_metric):
                    best_metric = metric_val
                    best_files = dict(candidate_files)
                    best_version = f"experiment_v{iteration}/"
                    iter_record["improved"] = True
                    no_improve_streak = 0
                else:
                    no_improve_streak += 1
            else:
                consecutive_no_metrics += 1
        elif validation.ok and best_version == "experiment/":
            # Non-sandbox modes: no metric to compare, so adopt the first
            # valid candidate as the working version.
            best_files = dict(candidate_files)
            best_version = f"experiment_v{iteration}/"

        # P1: Track metric for saturation detection
        _metrics_history.append(metric_val)

        log["iterations"].append(iter_record)
        if consecutive_no_metrics >= 3:
            log["stop_reason"] = "consecutive_no_metrics"
            logger.warning("Stage 13: Aborting after %d consecutive iterations without metrics", consecutive_no_metrics)
            break
        if no_improve_streak >= 2:
            log["converged"] = True
            log["stop_reason"] = "no_improvement_for_2_iterations"
            logger.info(
                "Stage 13 converged after %d iterations (no improvement streak=%d)",
                iteration,
                no_improve_streak,
            )
            break

    # Write final experiment directory
    final_dir = stage_dir / "experiment_final"
    _write_project(final_dir, best_files)
    # Backward compat: also write experiment_final.py (copy of main.py)
    if "main.py" in best_files:
        (stage_dir / "experiment_final.py").write_text(
            best_files["main.py"], encoding="utf-8"
        )
    log["best_metric"] = best_metric
    log["best_version"] = best_version
    log["final_version"] = "experiment_final/"
    # BUG-110: Aggregate ablation check results across iterations
    _all_ablation_identical = any(
        iter_rec.get("ablation_identical", False)
        for iter_rec in log.get("iterations", [])
        if isinstance(iter_rec, dict)
    )
    if _all_ablation_identical:
        log["ablation_identical_warning"] = True
    (stage_dir / "refinement_log.json").write_text(
        json.dumps(log, indent=2), encoding="utf-8"
    )
    artifacts = ["refinement_log.json", "experiment_final/"]
    artifacts.extend(
        entry["version_dir"]
        for entry in log["iterations"]
        if isinstance(entry, dict) and isinstance(entry.get("version_dir"), str)
    )
    return StageResult(
        stage=Stage.ITERATIVE_REFINE,
        status=StageStatus.DONE,
        artifacts=tuple(artifacts),
        evidence_refs=tuple(f"stage-13/{a}" for a in artifacts),
    )



================================================
FILE: researchclaw/pipeline/stage_impls/_experiment_design.py
================================================
"""Stage 9: Experiment design."""

from __future__ import annotations

import json
import logging
import re
from pathlib import Path
from typing import Any

import yaml

from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.llm.client import LLMClient
from researchclaw.pipeline._domain import _detect_domain
from researchclaw.pipeline._helpers import (
    StageResult,
    _build_context_preamble,
    _chat_with_prompt,
    _extract_yaml_block,
    _get_evolution_overlay,
    _load_hardware_profile,
    _read_prior_artifact,
    _safe_json_loads,
    _utcnow_iso,
)
from researchclaw.pipeline.stages import Stage, StageStatus
from researchclaw.prompts import PromptManager

logger = logging.getLogger(__name__)


def _execute_experiment_design(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    hypotheses = _read_prior_artifact(run_dir, "hypotheses.md") or ""
preamble = _build_context_preamble( config, run_dir, include_goal=True, include_hypotheses=True ) plan: dict[str, Any] | None = None # ── Domain detection ────────────────────────────────────────────────── # Detect the research domain early so we can adapt experiment design # and code generation. For ML domains, existing behavior is unchanged. _domain_profile = None try: from researchclaw.domains.detector import detect_domain as _detect_domain_adv _domain_profile = _detect_domain_adv( topic=config.research.topic, hypotheses=hypotheses, ) logger.info( "Domain detected: %s (%s)", _domain_profile.display_name, _domain_profile.domain_id, ) # Persist domain profile for Stage 10 import json as _json_dd (stage_dir / "domain_profile.json").write_text( _json_dd.dumps({ "domain_id": _domain_profile.domain_id, "display_name": _domain_profile.display_name, "experiment_paradigm": _domain_profile.experiment_paradigm, "core_libraries": _domain_profile.core_libraries, "gpu_required": _domain_profile.gpu_required, }, indent=2), encoding="utf-8", ) except Exception: # noqa: BLE001 logger.debug("Domain detection unavailable", exc_info=True) if llm is not None: _pm = prompts or PromptManager() # Pass dataset_guidance block for experiment design try: _dg_block = _pm.block("dataset_guidance") except (KeyError, Exception): # noqa: BLE001 _dg_block = "" # I-08: Inject RL step guidance for RL topics _rl_kws = ("reinforcement learning", "ppo", "sac", "td3", "ddpg", "dqn", "mujoco", "continuous control", "actor-critic", "policy gradient", "exploration bonus") _is_rl_topic = any(kw in config.research.topic.lower() for kw in _rl_kws) if _is_rl_topic: try: _dg_block += _pm.block("rl_step_guidance") except Exception: # noqa: BLE001 pass # Improvement G: For RL with short budget, constrain to classic control if config.experiment.time_budget_sec <= 3600: _dg_block += ( "\n\n## RL TIME CONSTRAINT (MANDATORY):\n" f"Your time budget is {config.experiment.time_budget_sec}s (≤ 3600s).\n" "You MUST use 
ONLY classic control environments: " "CartPole-v1, Pendulum-v1, MountainCar-v0, Acrobot-v1, LunarLander-v3.\n" "Do NOT use MuJoCo (HalfCheetah, Hopper, Walker2d, Ant, Humanoid) — " "they require >5000s for meaningful training.\n" ) if config.experiment.time_budget_sec <= 1800: _dg_block += ( "Time budget ≤ 1800s: use ONLY CartPole-v1 or Pendulum-v1 " "(the simplest environments).\n" ) # F-01: Inject framework docs for experiment design try: from researchclaw.data import detect_frameworks, load_framework_docs _fw_ids = detect_frameworks(config.research.topic, hypotheses) if _fw_ids: _fw_docs = load_framework_docs(_fw_ids, max_chars=4000) if _fw_docs: _dg_block += _fw_docs except Exception: # noqa: BLE001 pass # Improvement A: Compute hardware profile + per-condition budget _hw_profile_str = ( "- GPU: NVIDIA RTX 6000 Ada (49140 MB VRAM)\n" "- GPU count: 1\n" "- CPU: shared server" ) _per_condition_sec = int(config.experiment.time_budget_sec * 0.7 / 6) _tier1 = "CIFAR-10, CIFAR-100, MNIST, FashionMNIST, STL-10, SVHN" _overlay = _get_evolution_overlay(run_dir, "experiment_design") sp = _pm.for_stage( "experiment_design", evolution_overlay=_overlay, preamble=preamble, hypotheses=hypotheses, dataset_guidance=_dg_block, time_budget_sec=config.experiment.time_budget_sec, metric_key=config.experiment.metric_key, metric_direction=config.experiment.metric_direction, hardware_profile=_hw_profile_str, per_condition_budget_sec=_per_condition_sec, available_tier1_datasets=_tier1, ) resp = _chat_with_prompt( llm, sp.system, sp.user, json_mode=sp.json_mode, max_tokens=sp.max_tokens, ) raw_yaml = _extract_yaml_block(resp.content) try: parsed = yaml.safe_load(raw_yaml) except yaml.YAMLError: parsed = None # Fallback: reasoning models sometimes emit the YAML without fences # or wrapped in prose. Try parsing the whole response as YAML. 
if not isinstance(parsed, dict): try: parsed = yaml.safe_load(resp.content) except yaml.YAMLError: pass # Last fallback: try to find any YAML-like dict in the response if not isinstance(parsed, dict): import re as _re_yaml # Look for lines starting with known keys _yaml_lines = [] _capturing = False for line in resp.content.splitlines(): if _re_yaml.match( r"^(baselines|proposed_methods|ablations|datasets|" r"metrics|objectives|risks|compute_budget)\s*:", line, ): _capturing = True if _capturing: if line.strip() == "" or line.startswith("```"): continue if line.startswith("#") or line.startswith("**"): continue _yaml_lines.append(line) if _yaml_lines: try: parsed = yaml.safe_load("\n".join(_yaml_lines)) except yaml.YAMLError: pass if isinstance(parsed, dict): plan = parsed else: logger.warning( "Stage 09: LLM response could not be parsed as YAML " "(len=%d, first 200 chars: %s). Content extraction method " "returned: %s", len(resp.content), resp.content[:200], raw_yaml[:200] if raw_yaml else "", ) # BUG-12: Retry with a stricter, shorter prompt if llm is not None: logger.info("Stage 09: Retrying with strict YAML-only prompt...") _retry_prompt = ( "Output ONLY valid YAML. No prose, no markdown fences, no explanation.\n" f"Topic: {config.research.topic}\n" "Required keys: baselines, proposed_methods, ablations, " "datasets, metrics, objectives, risks, compute_budget.\n" "Each key maps to a list of strings." ) _retry_resp = _chat_with_prompt( llm, "You output ONLY valid YAML. 
Nothing else.", _retry_prompt, max_tokens=4096, ) try: _retry_parsed = yaml.safe_load(_retry_resp.content) if isinstance(_retry_parsed, dict): plan = _retry_parsed logger.info("Stage 09: Strict YAML retry succeeded.") except yaml.YAMLError: pass # BUG-12: Fallback 4 — extract method/baseline names from Stage 8 hypotheses if plan is None: _hyp_text = _read_prior_artifact(run_dir, "hypotheses.md") or "" if _hyp_text: import re as _re_hyp # Extract method-like names from hypothesis text _method_candidates = _re_hyp.findall( r"(?:proposed|our|novel|new)\s+(?:method|approach|algorithm|framework|model)[:\s]+[\"']?([A-Za-z][\w-]+)", _hyp_text, _re_hyp.IGNORECASE, ) _baseline_candidates = _re_hyp.findall( r"(?:baseline|compare|existing|standard|traditional)\s+(?:method|approach|model)?[:\s]+[\"']?([A-Za-z][\w-]+)", _hyp_text, _re_hyp.IGNORECASE, ) if _method_candidates or _baseline_candidates: logger.info( "Stage 09: Extracted names from hypotheses: methods=%s, baselines=%s", _method_candidates[:3], _baseline_candidates[:3], ) plan = { "topic": config.research.topic, "generated": _utcnow_iso(), "objectives": ["Evaluate hypotheses with controlled experiments"], "datasets": ["primary_dataset"], "baselines": _baseline_candidates[:3] or ["baseline_1", "baseline_2"], "proposed_methods": _method_candidates[:3] or ["proposed_method"], "ablations": ["without_key_component", "simplified_version"], "metrics": [config.experiment.metric_key, "secondary_metric"], "risks": ["validity threats", "confounding variables"], "compute_budget": {"max_gpu": 1, "max_hours": 4}, } if plan is None: # BUG-12: Use domain-aware names instead of fully generic placeholders _topic_prefix = config.research.topic.split()[0] if config.research.topic else "method" logger.warning( "Stage 09: LLM failed to produce valid experiment plan YAML. " "Using topic-derived fallback." 
) plan = { "topic": config.research.topic, "generated": _utcnow_iso(), "objectives": ["Evaluate hypotheses with controlled experiments"], "datasets": ["primary_dataset", "secondary_dataset"], "baselines": [f"{_topic_prefix}_baseline_1", f"{_topic_prefix}_baseline_2"], "proposed_methods": [f"{_topic_prefix}_proposed", f"{_topic_prefix}_variant"], "ablations": ["without_key_component", "simplified_version"], "metrics": [config.experiment.metric_key, "secondary_metric"], "risks": ["validity threats", "confounding variables"], "compute_budget": {"max_gpu": 1, "max_hours": 4}, } # ── BA: BenchmarkAgent — intelligent dataset/baseline selection ────── _benchmark_plan = None # BUG-40: Skip BenchmarkAgent for non-ML domains — it has no relevant # benchmarks for physics/chemistry/mathematics/etc. and would inject # wrong datasets (e.g., CIFAR-10 for PDE topics). _ba_domain_id, _, _ = _detect_domain( config.research.topic, tuple(config.research.domains) if config.research.domains else (), ) _ba_domain_ok = _ba_domain_id == "ml" if not _ba_domain_ok: logger.info( "BenchmarkAgent skipped: domain '%s' is not ML (topic: %s)", _ba_domain_id, config.research.topic[:80], ) if ( _ba_domain_ok and config.experiment.benchmark_agent.enabled and config.experiment.mode in ("sandbox", "docker") and llm is not None ): try: from researchclaw.agents.benchmark_agent import BenchmarkOrchestrator from researchclaw.agents.benchmark_agent.orchestrator import ( BenchmarkAgentConfig as _BACfg, ) _ba_cfg_raw = config.experiment.benchmark_agent _ba_cfg = _BACfg( enabled=_ba_cfg_raw.enabled, enable_hf_search=_ba_cfg_raw.enable_hf_search, max_hf_results=_ba_cfg_raw.max_hf_results, enable_web_search=_ba_cfg_raw.enable_web_search, max_web_results=_ba_cfg_raw.max_web_results, web_search_min_local=_ba_cfg_raw.web_search_min_local, tier_limit=_ba_cfg_raw.tier_limit, min_benchmarks=_ba_cfg_raw.min_benchmarks, min_baselines=_ba_cfg_raw.min_baselines, prefer_cached=_ba_cfg_raw.prefer_cached, 
max_iterations=_ba_cfg_raw.max_iterations, ) _hw = _load_hardware_profile(run_dir) _ba = BenchmarkOrchestrator( llm, config=_ba_cfg, gpu_memory_mb=( _hw.get("gpu_memory_mb", 49000) if _hw else 49000 ), time_budget_sec=config.experiment.time_budget_sec, network_policy=( config.experiment.docker.network_policy if config.experiment.mode == "docker" else "full" ), stage_dir=stage_dir / "benchmark_agent", ) _benchmark_plan = _ba.orchestrate({ "topic": config.research.topic, "hypothesis": hypotheses, "experiment_plan": plan.get("objectives", "") if isinstance(plan, dict) else "", }) # Inject BenchmarkAgent selections into experiment plan if isinstance(plan, dict) and _benchmark_plan.selected_benchmarks: plan["datasets"] = [ b.get("name", "Unknown") for b in _benchmark_plan.selected_benchmarks ] # Normalize existing baselines to list of strings # BUG-35: LLM may emit baselines as dict, list of dicts, # or list of strings — normalize all to list[str]. _baselines_from_plan = plan.get("baselines", []) if isinstance(_baselines_from_plan, dict): _baselines_from_plan = list(_baselines_from_plan.keys()) elif isinstance(_baselines_from_plan, list): _baselines_from_plan = [ item["name"] if isinstance(item, dict) else str(item) for item in _baselines_from_plan ] else: _baselines_from_plan = [] plan["baselines"] = [ bl.get("name", "Unknown") for bl in _benchmark_plan.selected_baselines ] + _baselines_from_plan # Deduplicate baselines plan["baselines"] = list(dict.fromkeys(plan["baselines"])) logger.info( "BenchmarkAgent: %d benchmarks, %d baselines selected (%d LLM calls, %.1fs)", len(_benchmark_plan.selected_benchmarks), len(_benchmark_plan.selected_baselines), _benchmark_plan.total_llm_calls, _benchmark_plan.elapsed_sec, ) except Exception as _ba_exc: logger.warning("BenchmarkAgent failed (non-fatal): %s", _ba_exc) # Save benchmark plan for code_generation stage if _benchmark_plan is not None: try: (stage_dir / "benchmark_plan.json").write_text( 
json.dumps(_benchmark_plan.to_dict(), indent=2, ensure_ascii=False), encoding="utf-8", ) except Exception: # noqa: BLE001 pass plan.setdefault("topic", config.research.topic) # BUG-R41-09: Enforce condition count limit based on time budget. # Too many conditions (30+) guarantee timeouts and wasted compute. _time_budget = getattr( getattr(config, "experiment", None), "time_budget_sec", 3600 ) _max_conditions = 8 # default for budgets ≤ 3600s if _time_budget > 3600: _max_conditions = 12 if _time_budget > 7200: _max_conditions = 20 _baselines = plan.get("baselines", []) if isinstance(_baselines, dict): _baselines = list(_baselines.values()) _proposed = plan.get("proposed_methods", []) if isinstance(_proposed, dict): _proposed = list(_proposed.values()) _ablations = plan.get("ablations", []) if isinstance(_ablations, dict): _ablations = list(_ablations.values()) _total = len(_baselines) + len(_proposed) + len(_ablations) if _total > _max_conditions: logger.warning( "Stage 9: Plan has %d conditions (limit %d for %ds budget). 
" "Trimming to fit.", _total, _max_conditions, _time_budget, ) # Keep all proposed methods (up to max), trim baselines and ablations _proposed_count = min(len(_proposed), max(1, _max_conditions - 4)) _remaining = max(0, _max_conditions - _proposed_count) _baseline_budget = max(1, _remaining // 2) _ablation_budget = max(0, _remaining - _baseline_budget) if len(_proposed) > _proposed_count: plan["proposed_methods"] = _proposed[:_proposed_count] logger.info( "Stage 9: Trimmed proposed methods %d → %d", len(_proposed), _proposed_count, ) if len(_baselines) > _baseline_budget: plan["baselines"] = _baselines[:_baseline_budget] logger.info( "Stage 9: Trimmed baselines %d → %d", len(_baselines), _baseline_budget, ) if len(_ablations) > _ablation_budget: plan["ablations"] = _ablations[:_ablation_budget] logger.info( "Stage 9: Trimmed ablations %d → %d", len(_ablations), _ablation_budget, ) (stage_dir / "exp_plan.yaml").write_text( yaml.dump(plan, default_flow_style=False, allow_unicode=True), encoding="utf-8", ) return StageResult( stage=Stage.EXPERIMENT_DESIGN, status=StageStatus.DONE, artifacts=("exp_plan.yaml",), evidence_refs=("stage-09/exp_plan.yaml",), ) ================================================ FILE: researchclaw/pipeline/stage_impls/_literature.py ================================================ """Stages 3-6: Search strategy, literature collection, screening, and knowledge extraction.""" from __future__ import annotations import json import logging import re from pathlib import Path from typing import Any import yaml from researchclaw.adapters import AdapterBundle from researchclaw.config import RCConfig from researchclaw.llm.client import LLMClient from researchclaw.pipeline._helpers import ( StageResult, _build_fallback_queries, _chat_with_prompt, _extract_topic_keywords, _extract_yaml_block, _get_evolution_overlay, _parse_jsonl_rows, _read_prior_artifact, _safe_filename, _safe_json_loads, _utcnow_iso, _write_jsonl, ) from researchclaw.pipeline.stages 
import Stage, StageStatus from researchclaw.prompts import PromptManager logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Local helpers # --------------------------------------------------------------------------- def _expand_search_queries(queries: list[str], topic: str) -> list[str]: """Expand search queries for broader literature coverage. Generates additional queries by extracting key phrases from the topic and creating focused sub-queries. This ensures we find papers even when the original queries are too narrow or specific for arXiv. """ expanded = list(queries) # keep originals seen = {q.lower().strip() for q in queries} # Extract key phrases from topic by splitting on common delimiters # e.g. "Comparing A, B, and C on X with Y" → ["A", "B", "C", "X", "Y"] topic_words = topic.split() # Generate shorter, broader queries from the topic if len(topic_words) > 5: # First 5 words as a broader query broad = " ".join(topic_words[:5]) if broad.lower().strip() not in seen: expanded.append(broad) seen.add(broad.lower().strip()) # Last 5 words as another perspective tail = " ".join(topic_words[-5:]) if tail.lower().strip() not in seen: expanded.append(tail) seen.add(tail.lower().strip()) # Add "survey" and "benchmark" variants of the topic for suffix in ("survey", "benchmark", "comparison"): # Take first 4 content words + suffix short_topic = " ".join(topic_words[:4]) variant = f"{short_topic} {suffix}" if variant.lower().strip() not in seen: expanded.append(variant) seen.add(variant.lower().strip()) return expanded # --------------------------------------------------------------------------- # Stage executors # --------------------------------------------------------------------------- def _execute_search_strategy( stage_dir: Path, run_dir: Path, config: RCConfig, adapters: AdapterBundle, *, llm: LLMClient | None = None, prompts: PromptManager | None = None, ) -> StageResult: problem_tree = 
def _execute_search_strategy(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 3: Produce a literature-search plan and concrete queries.

    Asks the LLM (when one is supplied) for a YAML search plan plus a
    source list; otherwise builds a template plan from fallback keyword
    queries. Always writes three artifacts into *stage_dir*:
    ``search_plan.yaml``, ``sources.json`` and ``queries.json`` — the
    last of which is read by Stage 4 (literature collection).

    Parameters: *stage_dir* is where artifacts are written, *run_dir* is
    the run root used to look up prior-stage artifacts, *config* carries
    the research topic and bridge flags, *adapters* provides the
    ``web_fetch`` adapter used to probe source URLs.
    """
    # Problem tree from an earlier stage gives the LLM context for query design.
    problem_tree = _read_prior_artifact(run_dir, "problem_tree.md") or ""
    topic = config.research.topic
    plan: dict[str, Any] | None = None
    sources: list[dict[str, Any]] | None = None
    if llm is not None:
        _pm = prompts or PromptManager()
        _overlay = _get_evolution_overlay(run_dir, "search_strategy")
        sp = _pm.for_stage("search_strategy", evolution_overlay=_overlay, topic=topic, problem_tree=problem_tree)
        resp = _chat_with_prompt(
            llm,
            sp.system,
            sp.user,
            json_mode=sp.json_mode,
            max_tokens=sp.max_tokens,
        )
        payload = _safe_json_loads(resp.content, {})
        if isinstance(payload, dict):
            # The plan arrives as embedded YAML text inside the JSON payload;
            # parse it defensively — a YAML error simply leaves ``plan`` unset.
            yaml_text = str(payload.get("search_plan_yaml", "")).strip()
            if yaml_text:
                try:
                    parsed = yaml.safe_load(_extract_yaml_block(yaml_text))
                except yaml.YAMLError:
                    parsed = None
                if isinstance(parsed, dict):
                    plan = parsed
            # Only dict-shaped source entries are kept.
            src = payload.get("sources", [])
            if isinstance(src, list):
                sources = [item for item in src if isinstance(item, dict)]
    if plan is None:
        # Build smart fallback queries by extracting key terms from topic
        # instead of using the raw (often very long) topic string.
        _fallback_queries = _build_fallback_queries(topic)
        plan = {
            "topic": topic,
            "generated": _utcnow_iso(),
            "search_strategies": [
                {
                    "name": "keyword_core",
                    "queries": _fallback_queries[:5],
                    "sources": ["arxiv", "semantic_scholar", "openreview"],
                    "max_results_per_query": 60,
                },
                {
                    # Citation chasing; falls back to the first three
                    # queries when fewer than six fallbacks exist.
                    "name": "backward_forward_citation",
                    "queries": _fallback_queries[5:10] or _fallback_queries[:3],
                    "sources": ["semantic_scholar", "google_scholar"],
                    "depth": 1,
                },
            ],
            "filters": {
                "min_year": 2020,
                "language": ["en"],
                "peer_review_preferred": True,
            },
            "deduplication": {"method": "title_doi_hash", "fuzzy_threshold": 0.9},
        }
    if not sources:
        # Default to the two API sources the pipeline knows how to query.
        sources = [
            {
                "id": "arxiv",
                "name": "arXiv",
                "type": "api",
                "url": "https://export.arxiv.org/api/query",
                "status": "available",
                "query": topic,
                "verified_at": _utcnow_iso(),
            },
            {
                "id": "semantic_scholar",
                "name": "Semantic Scholar",
                "type": "api",
                "url": "https://api.semanticscholar.org/graph/v1/paper/search",
                "status": "available",
                "query": topic,
                "verified_at": _utcnow_iso(),
            },
        ]
    if config.openclaw_bridge.use_web_fetch:
        # Probe each source URL; 405 counts as reachable because some APIs
        # reject the probe method while still being available.
        for src in sources:
            try:
                response = adapters.web_fetch.fetch(str(src.get("url", "")))
                src["status"] = (
                    "verified"
                    if response.status_code in (200, 301, 302, 405)
                    else "unreachable"
                )
                src["http_status"] = response.status_code
            except Exception:  # noqa: BLE001
                src["status"] = "unknown"
    (stage_dir / "search_plan.yaml").write_text(
        yaml.dump(plan, default_flow_style=False, allow_unicode=True),
        encoding="utf-8",
    )
    (stage_dir / "sources.json").write_text(
        json.dumps(
            {"sources": sources, "count": len(sources), "generated": _utcnow_iso()},
            indent=2,
        ),
        encoding="utf-8",
    )
    # F1.5: Extract queries from plan for Stage 4 real literature search
    queries_list: list[str] = []
    year_min = 2020
    if isinstance(plan, dict):
        strategies = plan.get("search_strategies", [])
        if isinstance(strategies, list):
            for strat in strategies:
                if isinstance(strat, dict):
                    qs = strat.get("queries", [])
                    if isinstance(qs, list):
                        queries_list.extend(str(q) for q in qs if q)
        filters = plan.get("filters", {})
        if isinstance(filters, dict) and filters.get("min_year"):
            try:
                year_min = int(filters["min_year"])
            except (ValueError, TypeError):
                pass
    # --- Sanitize queries: shorten overly long queries ---
    # LLMs often produce the full topic title as a query, which is too long for
    # arXiv and Semantic Scholar (they work best with 3-8 keyword queries).
    _stop = {
        "a", "an", "the", "of", "for", "in", "on", "and", "or", "with", "to",
        "by", "from", "its", "is", "are", "was", "be", "as", "at", "via",
        "using", "based", "study", "analysis", "empirical", "towards",
        "toward", "into", "exploring", "comparison", "tasks",
        "effectiveness", "investigation", "comprehensive", "novel",
    }

    def _extract_keywords(text: str) -> list[str]:
        """Extract meaningful keywords from text, removing stop words."""
        return [
            w
            for w in re.split(r"[^a-zA-Z0-9]+", text)
            if w.lower() not in _stop and len(w) > 1
        ]

    _MAX_QUERY_LEN = 60  # characters — beyond this, shorten to keywords
    _SEARCH_SUFFIXES = ["benchmark", "survey", "seminal", "state of the art"]

    def _shorten_query(q: str, max_kw: int = 6) -> str:
        """Shorten a query to *max_kw* keywords, preserving any trailing suffix."""
        q_stripped = q.strip()
        # Check if query ends with a known search suffix
        suffix = ""
        q_core = q_stripped
        for sfx in _SEARCH_SUFFIXES:
            if q_stripped.lower().endswith(sfx):
                suffix = sfx
                q_core = q_stripped[: -len(sfx)].strip()
                break
        # Extract keywords from the core part
        kws = _extract_keywords(q_core)
        shortened = " ".join(kws[:max_kw])
        if suffix:
            shortened = f"{shortened} {suffix}"
        return shortened

    if queries_list:
        sanitized: list[str] = []
        for q in queries_list:
            if len(q) > _MAX_QUERY_LEN:
                # NOTE(review): a query whose keywords are all stop words
                # shortens to "" and is silently dropped here.
                shortened = _shorten_query(q)
                if shortened.strip():
                    sanitized.append(shortened)
            else:
                sanitized.append(q)
        queries_list = sanitized
    if not queries_list:
        # Build diverse keyword queries from the topic
        _words = _extract_keywords(topic)
        kw_primary = " ".join(_words[:6])
        kw_short = " ".join(_words[:4])
        queries_list = [
            kw_primary,
            f"{kw_short} benchmark",
            f"{kw_short} survey",
        ]
    # Ensure minimum query diversity — if dedup leaves too few, add variants
    _all_kw = _extract_topic_keywords if False else _extract_keywords(topic)  # noqa: E501 — see NOTE below
    _seen_q: set[str] = set()
    unique_queries: list[str] = []
    for q in queries_list:
        q_lower = q.strip().lower()
        if q_lower and q_lower not in _seen_q:
            _seen_q.add(q_lower)
            unique_queries.append(q.strip())
    # If we have fewer than 5 unique queries, generate supplemental keyword variants
    if len(unique_queries) < 5 and len(_all_kw) >= 3:
        supplements = [
            " ".join(_all_kw[:4]) + " survey",
            " ".join(_all_kw[:4]) + " benchmark",
            " ".join(_all_kw[1:5]),  # shifted window for diversity
            " ".join(_all_kw[:3]) + " comparison",
            " ".join(_all_kw[:3]) + " deep learning",
            " ".join(_all_kw[2:6]),  # another shifted window
        ]
        for s in supplements:
            s_lower = s.strip().lower()
            if s_lower not in _seen_q:
                _seen_q.add(s_lower)
                unique_queries.append(s.strip())
                # Cap the supplemented list at 8 queries.
                if len(unique_queries) >= 8:
                    break
    queries_list = unique_queries
    (stage_dir / "queries.json").write_text(
        json.dumps({"queries": queries_list, "year_min": year_min}, indent=2),
        encoding="utf-8",
    )
    return StageResult(
        stage=Stage.SEARCH_STRATEGY,
        status=StageStatus.DONE,
        artifacts=("search_plan.yaml", "sources.json", "queries.json"),
        evidence_refs=(
            "stage-03/search_plan.yaml",
            "stage-03/sources.json",
            "stage-03/queries.json",
        ),
    )
def _execute_literature_collect(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 4: Collect literature — prefer real APIs, fallback to LLM.

    Collection order: (1) real academic-API search over the Stage 3
    queries; (2) injection of seminal papers from the seed library;
    (3) LLM-generated candidates when nothing was found; (4) optional
    web-search augmentation; (5) placeholder papers as the last resort.
    Writes ``candidates.jsonl``, ``search_meta.json`` and, when
    available, ``references.bib``, ``web_context.md`` and
    ``web_search_result.json`` into *stage_dir*.
    """
    topic = config.research.topic
    # Read queries.json from Stage 3 (F1.5 output)
    queries_text = _read_prior_artifact(run_dir, "queries.json")
    queries_data = _safe_json_loads(queries_text or "{}", {})
    queries: list[str] = queries_data.get("queries", [topic])
    year_min: int = queries_data.get("year_min", 2020)
    # --- Try real API search first ---
    candidates: list[dict[str, Any]] = []
    bibtex_entries: list[str] = []
    real_search_succeeded = False
    try:
        # NOTE(review): ``papers_to_bibtex`` is imported but never used in
        # this function — each paper's ``to_bibtex()`` is called instead.
        from researchclaw.literature.search import (
            search_papers_multi_query,
            papers_to_bibtex,
        )
        # Expand queries for broader coverage
        expanded_queries = _expand_search_queries(queries, config.research.topic)
        logger.info(
            "[literature] Searching %d queries (expanded from %d) "
            "across OpenAlex → S2 → arXiv…",
            len(expanded_queries),
            len(queries),
        )
        papers = search_papers_multi_query(
            expanded_queries,
            limit_per_query=40,
            year_min=year_min,
            s2_api_key=config.llm.s2_api_key,
        )
        if papers:
            real_search_succeeded = True
            # Count by source
            src_counts: dict[str, int] = {}
            for p in papers:
                src_counts[p.source] = src_counts.get(p.source, 0) + 1
                d = p.to_dict()
                d["collected_at"] = _utcnow_iso()
                candidates.append(d)
                bibtex_entries.append(p.to_bibtex())
            src_str = ", ".join(f"{s}: {n}" for s, n in src_counts.items())
            logger.info(
                "[literature] Found %d papers (%s)", len(papers), src_str
            )
    except Exception:  # noqa: BLE001
        # Any failure (network, import, API) falls through to the LLM path.
        logger.warning(
            "[rate-limit] Literature search failed — falling back to LLM",
            exc_info=True,
        )
    # --- Inject foundational/seminal papers ---
    try:
        from researchclaw.data import load_seminal_papers
        seminal = load_seminal_papers(topic)
        if seminal:
            # Dedup against already-collected papers by lowercased title.
            _existing_titles = {c.get("title", "").lower() for c in candidates}
            _injected = 0
            for sp in seminal:
                if sp.get("title", "").lower() not in _existing_titles:
                    candidates.append({
                        "id": f"seminal-{sp.get('cite_key', '')}",
                        "title": sp.get("title", ""),
                        "source": "seminal_library",
                        "url": "",
                        "year": sp.get("year", 2020),
                        "abstract": f"Foundational paper on {', '.join(sp.get('keywords', [])[:3])}.",
                        "authors": [{"name": sp.get("authors", "")}],
                        "cite_key": sp.get("cite_key", ""),
                        "venue": sp.get("venue", ""),
                        "collected_at": _utcnow_iso(),
                    })
                    _injected += 1
            if _injected:
                logger.info("Stage 4: Injected %d seminal papers from seed library", _injected)
    except Exception:  # noqa: BLE001
        # Seminal injection is best-effort; skipping it is not an error.
        logger.debug("Seminal paper injection skipped", exc_info=True)
    # --- Fallback: LLM-generated candidates ---
    if not candidates and llm is not None:
        plan_text = _read_prior_artifact(run_dir, "search_plan.yaml") or ""
        _pm = prompts or PromptManager()
        _overlay = _get_evolution_overlay(run_dir, "literature_collect")
        sp = _pm.for_stage("literature_collect", evolution_overlay=_overlay, topic=topic, plan_text=plan_text)
        resp = _chat_with_prompt(
            llm,
            sp.system,
            sp.user,
            json_mode=sp.json_mode,
            max_tokens=sp.max_tokens,
        )
        payload = _safe_json_loads(resp.content, {})
        if isinstance(payload, dict) and isinstance(payload.get("candidates"), list):
            candidates = [row for row in payload["candidates"] if isinstance(row, dict)]
    # --- Web search augmentation (Tavily/DDG + Google Scholar + Crawl4AI) ---
    web_context_parts: list[str] = []
    if config.web_search.enabled:
        try:
            from researchclaw.web.agent import WebSearchAgent
            import os
            tavily_key = config.web_search.tavily_api_key or os.environ.get(
                config.web_search.tavily_api_key_env, ""
            )
            web_agent = WebSearchAgent(
                tavily_api_key=tavily_key,
                enable_scholar=config.web_search.enable_scholar,
                enable_crawling=config.web_search.enable_crawling,
                enable_pdf=config.web_search.enable_pdf_extraction,
                max_web_results=config.web_search.max_web_results,
                max_scholar_results=config.web_search.max_scholar_results,
                max_crawl_urls=config.web_search.max_crawl_urls,
            )
            web_result = web_agent.search_and_extract(
                topic,
                search_queries=queries,
            )
            # Convert Google Scholar papers into candidates
            # NOTE(review): ``sp`` here shadows the stage-prompt variable used
            # in the LLM fallback above; the flows never overlap, but the
            # reuse is confusing.
            for sp in web_result.scholar_papers:
                # Title set is rebuilt each iteration so freshly appended
                # papers also count as duplicates.
                _existing_titles = {
                    str(c.get("title", "")).lower().strip() for c in candidates
                }
                if sp.title.lower().strip() not in _existing_titles:
                    lit_paper = sp.to_literature_paper()
                    d = lit_paper.to_dict()
                    d["collected_at"] = _utcnow_iso()
                    candidates.append(d)
                    bibtex_entries.append(lit_paper.to_bibtex())
            # Save web search context for downstream stages
            web_context = web_result.to_context_string(max_length=20_000)
            if web_context.strip():
                (stage_dir / "web_context.md").write_text(
                    web_context, encoding="utf-8"
                )
                web_context_parts.append(web_context)
            # Save full web search metadata
            (stage_dir / "web_search_result.json").write_text(
                json.dumps(web_result.to_dict(), indent=2, default=str),
                encoding="utf-8",
            )
            logger.info(
                "[web-search] Added %d scholar papers, %d web results, %d crawled pages",
                len(web_result.scholar_papers),
                len(web_result.web_results),
                len(web_result.crawled_pages),
            )
        except Exception:  # noqa: BLE001
            logger.warning(
                "[web-search] Web search augmentation failed — continuing with academic APIs only",
                exc_info=True,
            )
    # --- Ultimate fallback: placeholder data ---
    # BUG-L2: Do NOT overwrite real_search_succeeded here — it was already
    # set correctly in the search block above. Overwriting would mislabel
    # LLM-hallucinated or seminal papers as "real search" results.
    if not candidates:
        logger.warning("Stage 4: All literature searches failed — using placeholder papers")
        candidates = [
            {
                "id": f"candidate-{idx + 1}",
                "title": f"[Placeholder] Study {idx + 1} on {topic}",
                "source": "arxiv" if idx % 2 == 0 else "semantic_scholar",
                "url": f"https://example.org/{_safe_filename(topic.lower())}/{idx + 1}",
                "year": 2024,
                "abstract": f"This candidate investigates {topic} and reports preliminary findings.",
                "collected_at": _utcnow_iso(),
                "is_placeholder": True,
            }
            for idx in range(max(20, config.research.daily_paper_count or 20))
        ]
    # Write candidates
    out = stage_dir / "candidates.jsonl"
    _write_jsonl(out, candidates)
    # BUG-50 fix: Generate BibTeX from candidates when real search failed
    # (LLM/placeholder fallback paths don't populate bibtex_entries)
    if not bibtex_entries and candidates:
        for c in candidates:
            if c.get("is_placeholder"):
                continue
            _ck = c.get("cite_key", "")
            if not _ck:
                # Derive cite_key from first author surname + year
                _authors = c.get("authors", [])
                _surname = "unknown"
                if isinstance(_authors, list) and _authors:
                    _a0 = _authors[0] if isinstance(_authors[0], str) else (_authors[0].get("name", "") if isinstance(_authors[0], dict) else "")
                    _surname = _a0.split()[-1].lower() if _a0.strip() else "unknown"
                _yr = c.get("year", 2024)
                # First letter of the first three title words, lowercased.
                _title_word = "".join(
                    w[0] for w in str(c.get("title", "study")).split()[:3]
                ).lower()
                _ck = f"{_surname}{_yr}{_title_word}"
            _title = c.get("title", "Untitled")
            _year = c.get("year", 2024)
            _author_str = ""
            _raw_authors = c.get("authors", [])
            if isinstance(_raw_authors, list):
                # Authors may be plain strings or {"name": ...} dicts.
                _names = []
                for _a in _raw_authors:
                    if isinstance(_a, str):
                        _names.append(_a)
                    elif isinstance(_a, dict):
                        _names.append(_a.get("name", ""))
                _author_str = " and ".join(n for n in _names if n)
            bibtex_entries.append(
                f"@article{{{_ck},\n"
                f"  title={{{_title}}},\n"
                f"  author={{{_author_str or 'Unknown'}}},\n"
                f"  year={{{_year}}},\n"
                f"  url={{{c.get('url', '')}}},\n"
                f"}}"
            )
        logger.info(
            "Stage 4: Generated %d BibTeX entries from candidates (fallback)",
            len(bibtex_entries),
        )
    # Write references.bib (F2.4)
    artifacts = ["candidates.jsonl"]
    if web_context_parts:
        artifacts.append("web_context.md")
    if (stage_dir / "web_search_result.json").exists():
        artifacts.append("web_search_result.json")
    if bibtex_entries:
        bib_content = "\n\n".join(bibtex_entries) + "\n"
        (stage_dir / "references.bib").write_text(bib_content, encoding="utf-8")
        artifacts.append("references.bib")
        logger.info(
            "Stage 4: Wrote %d BibTeX entries to references.bib", len(bibtex_entries)
        )
    # Write search metadata
    (stage_dir / "search_meta.json").write_text(
        json.dumps(
            {
                "real_search": real_search_succeeded,
                "queries_used": queries,
                "year_min": year_min,
                "total_candidates": len(candidates),
                "bibtex_entries": len(bibtex_entries),
                "ts": _utcnow_iso(),
            },
            indent=2,
        ),
        encoding="utf-8",
    )
    artifacts.append("search_meta.json")
    return StageResult(
        stage=Stage.LITERATURE_COLLECT,
        status=StageStatus.DONE,
        artifacts=tuple(artifacts),
        evidence_refs=tuple(f"stage-04/{a}" for a in artifacts),
    )
def _execute_literature_screen(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 5: Screen collected candidates into a shortlist.

    First applies a cheap keyword-overlap pre-filter against the topic,
    then asks the LLM (when available) to produce a scored shortlist.
    Guarantees at least ``_MIN_SHORTLIST`` entries by supplementing from
    the filtered candidates or template-scoring them. Writes
    ``shortlist.jsonl`` into *stage_dir*.
    """
    candidates_text = _read_prior_artifact(run_dir, "candidates.jsonl") or ""
    # --- P1-1: keyword relevance pre-filter ---
    # Before LLM screening, drop papers whose title+abstract share no keywords
    # with the research topic. This catches cross-domain noise cheaply.
    topic_keywords = _extract_topic_keywords(
        config.research.topic, config.research.domains
    )
    filtered_rows: list[dict[str, Any]] = []
    dropped_count = 0
    for raw_line in candidates_text.strip().splitlines():
        row = _safe_json_loads(raw_line, {})
        if not isinstance(row, dict):
            continue
        title = str(row.get("title", "")).lower()
        abstract = str(row.get("abstract", "")).lower()
        text_blob = f"{title} {abstract}"
        overlap = sum(1 for kw in topic_keywords if kw in text_blob)
        # T2.2: Relaxed from ≥2 to ≥1 keyword hit — previous threshold was
        # too aggressive (94% rejection rate). Single-keyword matches are
        # still screened by the LLM in the next step.
        if overlap >= 1:
            row["keyword_overlap"] = overlap
            filtered_rows.append(row)
        else:
            dropped_count += 1
    # If pre-filter dropped everything, fall back to original (safety valve)
    if not filtered_rows:
        filtered_rows = _parse_jsonl_rows(candidates_text)
    # Rebuild candidates_text from filtered rows
    candidates_text = "\n".join(
        json.dumps(r, ensure_ascii=False) for r in filtered_rows
    )
    logger.info(
        "Domain pre-filter: kept %d, dropped %d (keywords: %s)",
        len(filtered_rows),
        dropped_count,
        topic_keywords[:8],
    )
    shortlist: list[dict[str, Any]] = []
    if llm is not None:
        _pm = prompts or PromptManager()
        _overlay = _get_evolution_overlay(run_dir, "literature_screen")
        sp = _pm.for_stage(
            "literature_screen",
            evolution_overlay=_overlay,
            topic=config.research.topic,
            domains=", ".join(config.research.domains) if config.research.domains else "general",
            quality_threshold=config.research.quality_threshold,
            candidates_text=candidates_text,
        )
        resp = _chat_with_prompt(
            llm,
            sp.system,
            sp.user,
            json_mode=sp.json_mode,
            max_tokens=sp.max_tokens,
        )
        payload = _safe_json_loads(resp.content, {})
        if isinstance(payload, dict) and isinstance(payload.get("shortlist"), list):
            # Keep only dict-shaped rows from the LLM payload.
            shortlist = [row for row in payload["shortlist"] if isinstance(row, dict)]
    # T2.2: Ensure minimum shortlist size of 15 for adequate related work
    _MIN_SHORTLIST = 15
    if not shortlist:
        # No LLM output at all: template-score the first candidates with
        # decreasing relevance/quality scores.
        rows = (
            filtered_rows[:_MIN_SHORTLIST]
            if filtered_rows
            else _parse_jsonl_rows(candidates_text)[:_MIN_SHORTLIST]
        )
        for idx, item in enumerate(rows):
            item["relevance_score"] = round(0.75 - idx * 0.02, 3)
            item["quality_score"] = round(0.72 - idx * 0.015, 3)
            item["keep_reason"] = "Template screened entry"
            shortlist.append(item)
    elif len(shortlist) < _MIN_SHORTLIST:
        # T2.2: LLM returned too few — supplement from filtered candidates
        existing_titles = {
            str(s.get("title", "")).lower().strip() for s in shortlist
        }
        for row in filtered_rows:
            if len(shortlist) >= _MIN_SHORTLIST:
                break
            title_lower = str(row.get("title", "")).lower().strip()
            if title_lower and title_lower not in existing_titles:
                row.setdefault("relevance_score", 0.5)
                row.setdefault("quality_score", 0.5)
                row.setdefault("keep_reason", "Supplemented to meet minimum shortlist")
                shortlist.append(row)
                existing_titles.add(title_lower)
        logger.info(
            "Stage 5: Supplemented shortlist to %d papers (minimum: %d)",
            len(shortlist),
            _MIN_SHORTLIST,
        )
    out = stage_dir / "shortlist.jsonl"
    _write_jsonl(out, shortlist)
    return StageResult(
        stage=Stage.LITERATURE_SCREEN,
        status=StageStatus.DONE,
        artifacts=("shortlist.jsonl",),
        evidence_refs=("stage-05/shortlist.jsonl",),
    )
def _execute_knowledge_extract(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 6: Extract structured knowledge cards from the shortlist.

    Feeds the Stage 5 shortlist (plus any Stage 4 web context) to the
    LLM and expects a list of "cards"; when the LLM is unavailable or
    returns nothing, builds template cards from the first six shortlist
    rows. Each card is rendered as a markdown file under
    ``stage_dir/cards/``.
    """
    shortlist = _read_prior_artifact(run_dir, "shortlist.jsonl") or ""
    # Inject web context from Stage 4 if available
    web_context = _read_prior_artifact(run_dir, "web_context.md") or ""
    if web_context:
        shortlist = shortlist + "\n\n--- Web Search Context ---\n" + web_context[:10_000]
    cards_dir = stage_dir / "cards"
    cards_dir.mkdir(parents=True, exist_ok=True)
    cards: list[dict[str, Any]] = []
    if llm is not None:
        _pm = prompts or PromptManager()
        _overlay = _get_evolution_overlay(run_dir, "knowledge_extract")
        sp = _pm.for_stage("knowledge_extract", evolution_overlay=_overlay, shortlist=shortlist)
        resp = _chat_with_prompt(
            llm,
            sp.system,
            sp.user,
            json_mode=sp.json_mode,
            max_tokens=sp.max_tokens,
        )
        payload = _safe_json_loads(resp.content, {})
        if isinstance(payload, dict) and isinstance(payload.get("cards"), list):
            # Keep only dict-shaped cards from the LLM payload.
            cards = [item for item in payload["cards"] if isinstance(item, dict)]
    if not cards:
        # Fallback: template cards from the first six shortlist rows.
        rows = _parse_jsonl_rows(shortlist)
        for idx, paper in enumerate(rows[:6]):
            title = str(paper.get("title", f"Paper {idx + 1}"))
            cards.append(
                {
                    "card_id": f"card-{idx + 1}",
                    "title": title,
                    "problem": f"How to improve {config.research.topic}",
                    "method": "Template method summary",
                    "data": "Template dataset",
                    "metrics": "Template metric",
                    "findings": "Template key finding",
                    "limitations": "Template limitation",
                    "citation": str(paper.get("url", "")),
                    "cite_key": str(paper.get("cite_key", "")),
                }
            )
    # Render each card as a markdown file with one section per field.
    for idx, card in enumerate(cards):
        card_id = _safe_filename(str(card.get("card_id", f"card-{idx + 1}")))
        parts = [f"# {card.get('title', card_id)}", ""]
        for key in (
            "cite_key",
            "problem",
            "method",
            "data",
            "metrics",
            "findings",
            "limitations",
            "citation",
        ):
            # NOTE(review): str.title() keeps underscores, so "cite_key"
            # renders as the heading "Cite_Key".
            parts.append(f"## {key.title()}")
            parts.append(str(card.get(key, "")))
            parts.append("")
        (cards_dir / f"{card_id}.md").write_text("\n".join(parts), encoding="utf-8")
    return StageResult(
        stage=Stage.KNOWLEDGE_EXTRACT,
        status=StageStatus.DONE,
        artifacts=("cards/",),
        evidence_refs=("stage-06/cards/",),
    )
from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.llm.client import LLMClient
from researchclaw.pipeline._domain import _detect_domain, _is_ml_domain
from researchclaw.pipeline._helpers import (
    StageResult,
    _build_context_preamble,
    _chat_with_prompt,
    _collect_experiment_results,
    _default_paper_outline,
    _extract_paper_title,
    _generate_framework_diagram_prompt,
    _generate_neurips_checklist,
    _get_evolution_overlay,
    _read_best_analysis,
    _read_prior_artifact,
    _safe_json_loads,
    _topic_constraint_block,
    _utcnow_iso,
)
from researchclaw.pipeline.stages import Stage, StageStatus
from researchclaw.prompts import PromptManager

logger = logging.getLogger(__name__)


def _execute_paper_outline(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 16: generate the paper outline (``outline.md``).

    Builds a context preamble from prior stage artifacts, injects reviewer
    feedback from ``iteration_context.json`` when this is a multi-round
    iteration (WS-5.2), and asks the LLM for an outline. Falls back to a
    template outline when no LLM is available or the LLM returns nothing.
    """
    analysis = _read_best_analysis(run_dir)
    decision = _read_prior_artifact(run_dir, "decision.md") or ""
    preamble = _build_context_preamble(
        config,
        run_dir,
        include_analysis=True,
        include_decision=True,
        include_experiment_data=True,
    )

    # WS-5.2: Read iteration feedback if available (multi-round iteration)
    feedback = ""
    iter_ctx_path = run_dir / "iteration_context.json"
    if iter_ctx_path.exists():
        try:
            ctx = json.loads(iter_ctx_path.read_text(encoding="utf-8"))
            iteration = ctx.get("iteration", 1)
            prev_score = ctx.get("quality_score")
            reviews_excerpt = ctx.get("reviews_excerpt", "")
            # Only inject feedback after round 1 and when reviewers left text.
            if iteration > 1 and reviews_excerpt:
                feedback = (
                    f"\n\n## Iteration {iteration} Feedback\n"
                    f"Previous quality score: {prev_score}/10\n"
                    f"Reviewer feedback to address:\n{reviews_excerpt[:2000]}\n"
                    f"\nYou MUST address these reviewer concerns in this revision.\n"
                )
        except (json.JSONDecodeError, KeyError):
            # Best-effort: a malformed context file simply means no feedback.
            pass

    if llm is not None:
        _pm = prompts or PromptManager()
        # IMP-20: Pass academic style guide block for outline stage
        try:
            _asg = _pm.block("academic_style_guide")
        except Exception:  # noqa: BLE001 — missing prompt block is non-fatal
            _asg = ""
        _overlay = _get_evolution_overlay(run_dir, "paper_outline")
        sp = _pm.for_stage(
            "paper_outline",
            evolution_overlay=_overlay,
            preamble=preamble,
            topic_constraint=_pm.block("topic_constraint", topic=config.research.topic),
            feedback=feedback,
            analysis=analysis,
            decision=decision,
            academic_style_guide=_asg,
        )
        resp = _chat_with_prompt(
            llm,
            sp.system,
            sp.user,
            json_mode=sp.json_mode,
            max_tokens=sp.max_tokens,
        )
        outline = resp.content
        # Reasoning models may consume all tokens on CoT — retry with more
        if not outline.strip() and sp.max_tokens:
            logger.warning("Empty outline from LLM — retrying with 2x tokens")
            resp = _chat_with_prompt(
                llm,
                sp.system,
                sp.user,
                json_mode=sp.json_mode,
                max_tokens=sp.max_tokens * 2,
            )
            outline = resp.content
        if not outline.strip():
            logger.warning("LLM returned empty outline — using default")
            outline = _default_paper_outline(config.research.topic)
    else:
        outline = _default_paper_outline(config.research.topic)

    (stage_dir / "outline.md").write_text(outline, encoding="utf-8")
    return StageResult(
        stage=Stage.PAPER_OUTLINE,
        status=StageStatus.DONE,
        artifacts=("outline.md",),
        evidence_refs=("stage-16/outline.md",),
    )
""" metric_lines: list[str] = [] run_count = 0 has_parsed_metrics = False for stage_subdir in sorted(run_dir.glob("stage-*/runs")): for run_file in sorted(stage_subdir.glob("*.json")): if run_file.name == "results.json": continue try: payload = json.loads(run_file.read_text(encoding="utf-8")) except (json.JSONDecodeError, OSError): continue if not isinstance(payload, dict): continue # R10: Skip simulated data — only collect real experiment results if payload.get("status") == "simulated": continue run_count += 1 # Extract from parsed metrics (check both 'metrics' and 'key_metrics') metrics = payload.get("metrics", {}) or payload.get("key_metrics", {}) if isinstance(metrics, dict) and metrics: has_parsed_metrics = True for k, v in metrics.items(): metric_lines.append(f" {k}: {v}") # Also extract from stdout for full detail # BUG-23: Filter out infrastructure lines that are NOT experiment results _INFRA_KEYS = { "SEED_COUNT", "TIME_ESTIMATE", "TRAINING_STEPS", "REGISTERED_CONDITIONS", "METRIC_DEF", "GPU_MEMORY", "BATCH_SIZE", "NUM_WORKERS", "TOTAL_PARAMS", "time_budget_sec", "max_epochs", "num_seeds", } stdout = payload.get("stdout", "") if stdout: for line in stdout.splitlines(): line = line.strip() if ":" in line: parts = line.rsplit(":", 1) try: float(parts[1].strip()) key_part = parts[0].strip().split("/")[-1] # last segment if key_part in _INFRA_KEYS: continue # skip infrastructure lines metric_lines.append(f" {line}") except (ValueError, TypeError, IndexError): pass # R19-4 + R23-1: Collect metrics from refinement_log.json (Stage 13). # If refinement has richer data than Stage 12 runs/, REPLACE Stage 12 data # to avoid confusing the paper writer with conflicting sources. _refine_lines: list[str] = [] _refine_run_count = 0 # Scan ALL refinement logs across versions, pick by quality (primary # metric) then richness (metric count). BUG-207: Previous logic picked # the sandbox entry with the most metric keys regardless of whether it # represented a regression (e.g. 
sandbox_after_fix with 1.29% accuracy # winning over sandbox with 78.93% because it had 6 more keys). _best_refine_metrics: dict[str, Any] = {} _best_refine_stdout = "" _best_refine_primary: float | None = None for _rl_path in sorted(run_dir.glob("stage-13*/refinement_log.json")): try: _rlog = json.loads(_rl_path.read_text(encoding="utf-8")) for _it in _rlog.get("iterations", []): for _sbx_key in ("sandbox", "sandbox_after_fix"): _sbx = _it.get(_sbx_key, {}) if not isinstance(_sbx, dict): continue _sbx_metrics = _sbx.get("metrics", {}) if not isinstance(_sbx_metrics, dict) or not _sbx_metrics: continue # Extract primary metric value for quality comparison _sbx_primary: float | None = None for _pm_key in ("primary_metric", "best_metric"): if _pm_key in _sbx_metrics: try: _sbx_primary = float(_sbx_metrics[_pm_key]) except (ValueError, TypeError): pass break # Prefer higher primary metric; fall back to count _dominated = False if _best_refine_primary is not None and _sbx_primary is not None: if _sbx_primary > _best_refine_primary: _dominated = True # new is better elif _sbx_primary < _best_refine_primary * 0.5: continue # skip: regression (>50% worse) # Accept if quality-dominant or richer-with-no-regression if _dominated or len(_sbx_metrics) > len(_best_refine_metrics): _best_refine_metrics = _sbx_metrics _best_refine_stdout = _sbx.get("stdout", "") _best_refine_primary = _sbx_primary except (json.JSONDecodeError, OSError): pass if _best_refine_metrics and len(_best_refine_metrics) > len(metric_lines) // 2: # Refinement has richer data — REPLACE Stage 12 data to avoid conflicts metric_lines = [] run_count = 1 for k, v in _best_refine_metrics.items(): metric_lines.append(f" {k}: {v}") # Also extract PAIRED and metric lines from stdout if _best_refine_stdout: for _line in _best_refine_stdout.splitlines(): _line = _line.strip() if _line.startswith("PAIRED:"): metric_lines.append(f" {_line}") elif ":" in _line: parts = _line.rsplit(":", 1) try: float(parts[1].strip()) 
metric_lines.append(f" {_line}") except (ValueError, TypeError, IndexError): pass elif _best_refine_metrics: # Refinement has some data but not richer — append to existing run_count += 1 for k, v in _best_refine_metrics.items(): metric_lines.append(f" {k}: {v}") if _best_refine_stdout: for _line in _best_refine_stdout.splitlines(): _line = _line.strip() if _line.startswith("PAIRED:"): metric_lines.append(f" {_line}") if not metric_lines: return "", has_parsed_metrics # Deduplicate while preserving order seen: set[str] = set() unique: list[str] = [] for line in metric_lines: if line not in seen: seen.add(line) unique.append(line) # BUG-29: Reformat raw metric lines into human-readable condition summaries # to prevent LLM from pasting raw path-style lines into the paper _grouped: dict[str, list[str]] = {} _ungrouped: list[str] = [] for line in unique[:200]: stripped = line.strip() # Match pattern: condition/env/step/metric: value parts = stripped.split("/") if len(parts) >= 3 and ":" in parts[-1]: cond = parts[0] detail = "/".join(parts[1:]) _grouped.setdefault(cond, []).append(f" - {detail}") else: _ungrouped.append(stripped) formatted_lines: list[str] = [] if _grouped: for cond, details in sorted(_grouped.items()): formatted_lines.append(f"## Condition: {cond}") formatted_lines.extend(details[:30]) if _ungrouped: formatted_lines.extend(_ungrouped) return ( f"\n\nACTUAL EXPERIMENT DATA (from {run_count} run(s) — use ONLY these numbers):\n" "```\n" + "\n".join(formatted_lines[:200]) + "\n```\n" "CRITICAL: Every number in the Results table MUST come from the data above. " "Do NOT round excessively, do NOT invent numbers, do NOT change values. " f"The experiment ran {run_count} time(s) — state this accurately in the methodology.\n" "NEVER paste raw metric paths (like 'condition/env/step/metric: value') " "into the paper. 
def _write_paper_sections(
    *,
    llm: LLMClient,
    pm: PromptManager,
    run_dir: Path | None = None,
    preamble: str,
    topic_constraint: str,
    exp_metrics_instruction: str,
    citation_instruction: str,
    outline: str,
    model_name: str = "",
) -> str:
    """Write a conference-grade paper in 3 sequential LLM calls.

    Call 1: Title + Abstract + Introduction + Related Work
    Call 2: Method + Experiments (with full experiment data)
    Call 3: Results + Discussion + Limitations + Conclusion

    Each call receives prior sections for coherence. Each call is retried
    once (T3.5) and falls back to a placeholder section on failure so the
    pipeline can continue. Returns the combined markdown draft.
    """

    def _fetch_block(name: str) -> str:
        # Prompt blocks are optional; a missing block degrades to "".
        try:
            return pm.block(name)
        except Exception:  # noqa: BLE001
            return ""

    # Render writing_structure block for injection
    _writing_structure = _fetch_block("writing_structure")
    _overlay = _get_evolution_overlay(run_dir, "paper_draft")
    system = pm.for_stage(
        "paper_draft",
        evolution_overlay=_overlay,
        preamble=preamble,
        topic_constraint=topic_constraint,
        exp_metrics_instruction=exp_metrics_instruction,
        citation_instruction=citation_instruction,
        writing_structure=_writing_structure,
        outline=outline,
    ).system
    sections: list[str] = []

    # --- R4-3: Title guidelines and abstract structure ---
    title_guidelines = _fetch_block("title_guidelines")
    abstract_structure = _fetch_block("abstract_structure")
    # IMP-20/25/31/24: Academic style, narrative, anti-hedging, anti-repetition
    academic_style_guide = _fetch_block("academic_style_guide")
    narrative_writing_rules = _fetch_block("narrative_writing_rules")
    anti_hedging_rules = _fetch_block("anti_hedging_rules")
    anti_repetition_rules = _fetch_block("anti_repetition_rules")

    # --- Call 1: Title + Abstract + Introduction + Related Work ---
    call1_user = (
        f"{preamble}\n\n"
        f"{topic_constraint}"
        f"{citation_instruction}\n\n"
        f"{title_guidelines}\n\n"
        f"{academic_style_guide}\n"
        f"{narrative_writing_rules}\n"
        f"{anti_hedging_rules}\n"
        f"{anti_repetition_rules}\n\n"
        "Write the following sections of a NeurIPS/ICML-quality paper in markdown. "
        "Follow the LENGTH REQUIREMENTS strictly:\n\n"
        "1. **Title** (HARD RULE: MUST be 14 words or fewer. Create a catchy method name "
        "first, then build the title: 'MethodName: Subtitle'. If your title exceeds 14 words, "
        "it will be automatically rejected. NEVER use 'Untitled Paper'.)\n"
        f"2. **Abstract** (150-220 words — HARD LIMIT. Do NOT exceed 220 words. "
        f"Do NOT include raw metric paths or 16-digit decimals.){abstract_structure}\n"
        "3. **Introduction** (800-1000 words): real-world motivation, problem statement, "
        "research gap analysis with citations, method overview, 3-4 contributions as bullet points, "
        "paper organization paragraph. MUST cite 8-12 references.\n"
        "4. **Related Work** (600-800 words): organized into 3-4 thematic subsections, each discussing "
        "4-5 papers with proper citations. Compare approaches, identify limitations, position this work.\n\n"
        f"Outline:\n{outline}\n\n"
        "Output markdown with ## headers. Do NOT include a References section.\n"
        "IMPORTANT: Start DIRECTLY with '## Title'. Do NOT include any preamble, "
        "data verification, condition listing, or metric enumeration before the title. "
        "The paper should read like a published manuscript, not a data report."
    )
    # R14-1: Higher token limit for reasoning models
    _paper_max_tokens = 12000
    if any(model_name.startswith(p) for p in ("gpt-5", "o3", "o4")):
        _paper_max_tokens = 24000
    # T3.5: Retry once on failure, use placeholder if still fails
    try:
        resp1 = _chat_with_prompt(llm, system, call1_user, max_tokens=_paper_max_tokens, retries=1)
        part1 = resp1.content.strip()
    except Exception:  # noqa: BLE001
        logger.error("Stage 17: Part 1 LLM call failed after retry — using placeholder")
        part1 = (
            "## Title\n[PLACEHOLDER — LLM call failed]\n\n"
            "## Abstract\n[This section could not be generated due to an LLM error. "
            "Please regenerate this stage.]\n\n"
            "## Introduction\n[PLACEHOLDER]\n\n"
            "## Related Work\n[PLACEHOLDER]"
        )
    sections.append(part1)
    logger.info("Stage 17: Part 1 (Title+Abstract+Intro+Related Work) — %d chars", len(part1))

    # --- Call 2: Method + Experiments ---
    call2_user = (
        f"{preamble}\n\n"
        f"{topic_constraint}"
        f"{exp_metrics_instruction}\n\n"
        f"{narrative_writing_rules}\n"
        f"{anti_hedging_rules}\n\n"
        # IMP-21: Citation instruction for Method + Experiments
        "CITATION REQUIREMENT: The Method section MUST cite at least 3-5 related "
        "technical papers (foundations your method builds on). The Experiments section "
        "MUST cite baseline method papers. Use [cite_key] syntax.\n"
        f"{citation_instruction}\n\n"
        "You are continuing a paper. The sections written so far are:\n\n"
        f"---\n{part1}\n---\n\n"
        "Now write the next sections, maintaining consistency with the above:\n\n"
        "5. **Method** (1000-1500 words): formal problem definition with mathematical notation "
        "($x$, $\\theta$, etc.), detailed algorithm description with equations, step-by-step procedure, "
        "complexity analysis, design rationale for key choices. Include algorithm pseudocode if applicable. "
        "Write as FLOWING PROSE — do NOT use bullet-point lists for method components.\n"
        "6. **Experiments** (800-1200 words): detailed experimental setup, datasets with statistics "
        "(size, splits, features), all baselines and their implementations, hyperparameter settings "
        "in a markdown table, evaluation metrics with mathematical definitions, hardware and runtime info.\n"
        "METHOD NAMES IN TABLES: Use SHORT abbreviations (4-8 chars) for method names "
        "in tables. Define abbreviation mappings in a footnote. "
        "NEVER put method names longer than 20 characters in table cells.\n\n"
        f"Outline:\n{outline}\n\n"
        "Output markdown with ## headers. Continue from where Part 1 ended."
    )
    try:
        resp2 = _chat_with_prompt(llm, system, call2_user, max_tokens=_paper_max_tokens, retries=1)
        part2 = resp2.content.strip()
    except Exception:  # noqa: BLE001
        logger.error("Stage 17: Part 2 LLM call failed after retry — using placeholder")
        part2 = (
            "## Method\n[PLACEHOLDER — LLM call failed. Please regenerate this stage.]\n\n"
            "## Experiments\n[PLACEHOLDER]"
        )
    sections.append(part2)
    logger.info("Stage 17: Part 2 (Method+Experiments) — %d chars", len(part2))

    # --- Call 3: Results + Discussion + Limitations + Conclusion ---
    call3_user = (
        f"{preamble}\n\n"
        f"{topic_constraint}"
        f"{exp_metrics_instruction}\n\n"
        f"{narrative_writing_rules}\n"
        f"{anti_hedging_rules}\n"
        f"{anti_repetition_rules}\n\n"
        # IMP-21: Citation instruction for Results + Discussion + Conclusion
        "CITATION REQUIREMENT: The Discussion section MUST cite at least 3-5 papers "
        "when comparing findings with prior work. The Conclusion may cite 1-2 "
        "foundational references.\n"
        f"{citation_instruction}\n\n"
        "You are completing a paper. The sections written so far are:\n\n"
        f"---\n{part1}\n\n{part2}\n---\n\n"
        "Now write the final sections, maintaining consistency:\n\n"
        "7. **Results** (600-800 words):\n"
        " - START with an AGGREGATED results table (Table 1): rows = methods, columns = metrics.\n"
        " Each cell = mean \u00b1 std across seeds. Bold the best value per column.\n"
        " EVERY table MUST have a descriptive caption that allows understanding without "
        " reading the main text. NEVER use just 'Table 1' as a caption.\n"
        " - Follow with a PER-REGIME table (Table 2) breaking down by easy/hard regimes.\n"
        " - Include a STATISTICAL COMPARISON table (Table 3): paired t-tests between key methods.\n"
        " - NEVER dump raw per-seed numbers in the main text. Aggregate first, then discuss.\n"
        " - MUST include at least 2 figures using markdown image syntax: ![Caption](charts/filename.png)\n"
        " One figure MUST be a performance comparison chart. Figures MUST be referenced "
        " in text: 'As shown in Figure 1, ...'\n"
        "8. **Discussion** (400-600 words): interpretation of key findings, unexpected results, "
        "comparison with prior work (CITE 3-5 papers here!), practical implications.\n"
        "9. **Limitations** (200-300 words): honest assessment of scope, dataset, methodology. "
        "ALL caveats consolidated HERE — nowhere else in the paper.\n"
        "10. **Conclusion** (100-200 words MAXIMUM — this is a HARD LIMIT): "
        "Summarize contributions in 2-3 sentences. State main finding in 1 sentence. "
        "Suggest 2-3 concrete future directions in 1-2 sentences. "
        "Do NOT repeat any specific numbers from Results. Do NOT restate the abstract. "
        "A good conclusion is SHORT and forward-looking.\n\n"
        "CRITICAL FORMATTING RULES FOR ALL SECTIONS:\n"
        "- Write as FLOWING PROSE paragraphs, NOT bullet-point lists\n"
        "- NEVER dump raw metric paths like 'config/method_name/seed_3/primary_metric'\n"
        "- All numbers must be rounded to 4 decimal places maximum\n"
        "- Every table MUST have a descriptive caption (not just 'Table 1')\n"
        "- Use \\begin{algorithm} or pseudocode notation, NOT \\begin{verbatim}\n\n"
        "Output markdown with ## headers. Do NOT include a References section."
    )
    try:
        resp3 = _chat_with_prompt(llm, system, call3_user, max_tokens=_paper_max_tokens, retries=1)
        part3 = resp3.content.strip()
    except Exception:  # noqa: BLE001
        logger.error("Stage 17: Part 3 LLM call failed after retry — using placeholder")
        part3 = (
            "## Results\n[PLACEHOLDER — LLM call failed. Please regenerate this stage.]\n\n"
            "## Discussion\n[PLACEHOLDER]\n\n"
            "## Limitations\n[PLACEHOLDER]\n\n"
            "## Conclusion\n[PLACEHOLDER]"
        )
    sections.append(part3)
    logger.info("Stage 17: Part 3 (Results+Discussion+Limitations+Conclusion) — %d chars", len(part3))

    # Combine all sections
    draft = "\n\n".join(sections)

    # R32: Strip data verification preamble that LLMs sometimes emit before
    # the actual paper. The preamble typically starts with "## Tested Conditions"
    # or similar headings and ends before "## Title". Uses the module-level
    # ``re`` import (the previous local ``import re as _re_strip`` was redundant).
    _title_match = re.search(r"^## Title\b", draft, re.MULTILINE)
    if _title_match and _title_match.start() > 200:
        _stripped = draft[_title_match.start():]
        logger.info(
            "R32: Stripped %d-char preamble before '## Title'",
            _title_match.start(),
        )
        draft = _stripped

    total_words = len(draft.split())
    logger.info("Stage 17: Full draft — %d chars, ~%d words", len(draft), total_words)
    return draft
# ---------------------------------------------------------------------------
# Draft quality validation (section balance + bullet-point density)
# ---------------------------------------------------------------------------

# Sections where bullets/numbered lists are acceptable.
_BULLET_LENIENT_SECTIONS = frozenset({
    "introduction",
    "limitations",
    "limitation",
    "limitations and future work",
    "abstract",
})

# Main body sections used for balance ratio check.
_BALANCE_SECTIONS = frozenset({
    "introduction",
    "related work",
    "method",
    "experiments",
    "results",
    "discussion",
})


def _validate_draft_quality(
    draft: str,
    stage_dir: Path | None = None,
) -> dict[str, Any]:
    """Validate a paper draft for section balance and prose quality.

    Checks:
    1. Per-section word count vs ``SECTION_WORD_TARGETS``.
    2. Bullet-point / numbered-list density per section.
    3. Largest-to-smallest main-section word-count ratio.
    Plus: citation count/recency, abstract/conclusion length, raw metric
    path dumps, weasel words, AI boilerplate, related-work depth, and
    statistical rigor of the results sections.

    Returns a dict with ``section_analysis``, ``overall_warnings``, and
    ``revision_directives``. Optionally writes ``draft_quality.json`` to
    *stage_dir*.
    """
    from researchclaw.prompts import SECTION_WORD_TARGETS, _SECTION_TARGET_ALIASES

    # Parse markdown headings (H1-H4) into (heading, level, body) records.
    _heading_re = re.compile(r"^(#{1,4})\s+(.+)$", re.MULTILINE)
    matches = list(_heading_re.finditer(draft))
    sections_data: list[dict[str, Any]] = []
    for i, m in enumerate(matches):
        level = len(m.group(1))
        heading = m.group(2).strip()
        start = m.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(draft)
        body = draft[start:end].strip()
        sections_data.append({
            "heading": heading,
            "heading_lower": heading.strip().lower(),
            "level": level,
            "body": body,
        })

    section_analysis: list[dict[str, Any]] = []
    overall_warnings: list[str] = []
    revision_directives: list[str] = []
    main_section_words: dict[str, int] = {}
    _bullet_re = re.compile(r"^\s*[-*]\s+", re.MULTILINE)
    _numbered_re = re.compile(r"^\s*\d+\.\s+", re.MULTILINE)

    # BUG-24: Accumulate subsection (H3+) word counts into parent H2 sections
    _subsection_words: dict[str, int] = {}
    _current_parent = ""
    for sec in sections_data:
        if sec["level"] <= 2:
            _current_parent = sec["heading_lower"]
            _subsection_words.setdefault(_current_parent, 0)
        else:
            # Add subsection words to parent
            _subsection_words[_current_parent] = (
                _subsection_words.get(_current_parent, 0) + len(sec["body"].split())
            )

    for sec in sections_data:
        if sec["level"] > 2:
            continue
        heading_lower: str = sec["heading_lower"]
        body: str = sec["body"]
        # BUG-24: Include subsection words in the parent's word count
        word_count = len(body.split()) + _subsection_words.get(heading_lower, 0)
        canon = heading_lower
        if canon not in SECTION_WORD_TARGETS:
            canon = _SECTION_TARGET_ALIASES.get(heading_lower, "")
        entry: dict[str, Any] = {
            "heading": sec["heading"],
            "word_count": word_count,
            "canonical": canon,
        }
        if canon and canon in SECTION_WORD_TARGETS:
            lo, hi = SECTION_WORD_TARGETS[canon]
            entry["target"] = [lo, hi]
            if word_count < int(lo * 0.7):
                overall_warnings.append(
                    f"{sec['heading']} is severely under target "
                    f"({word_count} words, target {lo}-{hi})"
                )
                revision_directives.append(
                    f"EXPAND {sec['heading']} from {word_count} to {lo}+ words. "
                    f"Add substantive content \u2014 do NOT pad with filler."
                )
                entry["status"] = "severely_short"
            elif word_count < lo:
                overall_warnings.append(
                    f"{sec['heading']} is under target "
                    f"({word_count} words, target {lo}-{hi})"
                )
                revision_directives.append(
                    f"Expand {sec['heading']} from {word_count} to {lo}+ words."
                )
                entry["status"] = "short"
            elif word_count > int(hi * 1.3):
                overall_warnings.append(
                    f"{sec['heading']} exceeds target "
                    f"({word_count} words, target {lo}-{hi})"
                )
                revision_directives.append(
                    f"Compress {sec['heading']} from {word_count} to {hi} words or fewer."
                )
                entry["status"] = "long"
            else:
                entry["status"] = "ok"
        if body:
            total_lines = len([ln for ln in body.splitlines() if ln.strip()])
            bullet_lines = len(_bullet_re.findall(body)) + len(_numbered_re.findall(body))
            density = bullet_lines / total_lines if total_lines > 0 else 0.0
            entry["bullet_density"] = round(density, 2)
            # Intro/limitations/abstract legitimately use lists — higher bar.
            threshold = 0.50 if heading_lower in _BULLET_LENIENT_SECTIONS else 0.25
            if density > threshold and total_lines >= 4:
                overall_warnings.append(
                    f"{sec['heading']} has {bullet_lines}/{total_lines} "
                    f"bullet/numbered lines ({density:.0%} density, "
                    f"threshold {threshold:.0%})"
                )
                revision_directives.append(
                    f"REWRITE {sec['heading']} as flowing academic prose. "
                    f"Convert bullet points to narrative paragraphs."
                )
                entry["bullet_status"] = "high"
            else:
                entry["bullet_status"] = "ok"
        canon_balance = canon or heading_lower
        if canon_balance in _BALANCE_SECTIONS:
            main_section_words[canon_balance] = word_count
        section_analysis.append(entry)

    # Largest-to-smallest main-section ratio > 3x triggers a rebalance note.
    if len(main_section_words) >= 2:
        wc_values = list(main_section_words.values())
        max_wc = max(wc_values)
        min_wc = min(wc_values)
        if min_wc > 0 and max_wc / min_wc > 3.0:
            largest = max(main_section_words, key=main_section_words.get)  # type: ignore[arg-type]
            smallest = min(main_section_words, key=main_section_words.get)  # type: ignore[arg-type]
            overall_warnings.append(
                f"Section imbalance: {largest} ({max_wc} words) vs "
                f"{smallest} ({min_wc} words) \u2014 ratio {max_wc / min_wc:.1f}x"
            )
            revision_directives.append(
                f"Rebalance sections: expand {smallest} and/or compress {largest} "
                f"to achieve more even section lengths."
            )

    # --- C-4/C-5: Citation count and recency checks ---
    _cite_pattern = re.compile(r"\[([a-zA-Z][a-zA-Z0-9_-]*\d{4}[a-zA-Z0-9]*)\]")
    cited_keys = set(_cite_pattern.findall(draft))
    if cited_keys:
        n_citations = len(cited_keys)
        if n_citations < 15:
            overall_warnings.append(
                f"Only {n_citations} unique citations found (target: >=15 for a full paper)"
            )
            revision_directives.append(
                f"Add more references — a top-venue paper typically cites 25-40 works. "
                f"Currently only {n_citations} unique citations."
            )
        # Check recency: count citations with year >= current_year - 2
        _year_pat = re.compile(r"(\d{4})")
        import datetime as _dt_cit
        _cur_year = _dt_cit.datetime.now().year
        recent_count = sum(
            1
            for k in cited_keys
            for m in [_year_pat.search(k)]
            if m and int(m.group(1)) >= _cur_year - 2
        )
        recency_ratio = recent_count / n_citations if n_citations > 0 else 0.0
        if recency_ratio < 0.3 and n_citations >= 10:
            # BUG FIX: this is an f-string, where "%%" is NOT an escape and
            # rendered literally as ">=30%%"; a single "%" is correct here.
            overall_warnings.append(
                f"Citation recency low: only {recent_count}/{n_citations} "
                f"({recency_ratio:.0%}) from last 3 years (target: >=30%)"
            )

    # --- Abstract and Conclusion length enforcement ---
    for sec in sections_data:
        hl = sec["heading_lower"]
        body_text: str = sec["body"]
        wc = len(body_text.split())
        if hl == "abstract" and wc > 250:
            overall_warnings.append(
                f"Abstract is too long: {wc} words (target: 150-220 words)"
            )
            revision_directives.append(
                f"COMPRESS the Abstract from {wc} to 150-220 words. "
                f"Remove raw metric values, redundant context, and self-references."
            )
        if hl in ("conclusion", "conclusions", "conclusion and future work"):
            if wc > 300:
                overall_warnings.append(
                    f"Conclusion is too long: {wc} words (target: 100-200 words)"
                )
                revision_directives.append(
                    f"COMPRESS the Conclusion from {wc} to 100-200 words. "
                    f"Do NOT repeat specific metric values from Results. "
                    f"Summarize findings in 2-3 sentences, then 2-3 future directions."
                )

    # --- Raw metric path detection (log dumps in prose) ---
    _raw_path_re = re.compile(
        r"\\texttt\{[a-zA-Z0-9_/.-]+(?:/[a-zA-Z0-9_/.-]+){2,}",
    )
    raw_path_count = len(_raw_path_re.findall(draft))
    if raw_path_count > 3:
        overall_warnings.append(
            f"Raw metric paths in prose: {raw_path_count} instances of "
            f"\\texttt{{config/path/metric}} style dumps"
        )
        revision_directives.append(
            "REMOVE raw experiment log paths from prose. Replace "
            "\\texttt{config/metric/path} with human-readable metric names "
            "and summarize values in tables, not inline text."
        )

    # --- Writing quality lint ---
    _weasel_words = re.compile(
        r"\b(various|many|several|quite|fairly|really|very|rather|"
        r"somewhat|relatively|arguably|interestingly|importantly|"
        r"it is well known that|it is obvious that|clearly)\b",
        re.IGNORECASE,
    )
    _duplicate_words = re.compile(r"\b(\w+)\s+\1\b", re.IGNORECASE)
    weasel_count = len(_weasel_words.findall(draft))
    dup_matches = _duplicate_words.findall(draft)
    # "that that" / "had had" are grammatical English — excluded from the count.
    dup_count = len([d for d in dup_matches if d.lower() not in ("that", "had")])
    if weasel_count > 20:
        overall_warnings.append(
            f"High weasel-word count: {weasel_count} instances "
            f"(consider replacing vague words with precise language)"
        )
        revision_directives.append(
            "Replace vague hedging words (various, several, quite, fairly, "
            "rather, somewhat) with precise quantities or remove them."
        )
    if dup_count > 0:
        overall_warnings.append(
            f"Duplicate adjacent words found: {dup_count} instance(s) "
            f"(e.g., 'the the', 'is is')"
        )
        revision_directives.append(
            "Fix duplicate adjacent words (likely typos)."
        )

    # --- AI-slop / boilerplate detection ---
    _BOILERPLATE_PHRASES = [
        "delves into", "delve into", "it is worth noting", "it should be noted",
        "it is important to note", "leverage the power of", "leverages the power of",
        "in this paper, we propose", "in this work, we propose",
        "to the best of our knowledge", "in the realm of", "in the landscape of",
        "plays a crucial role", "plays a pivotal role", "groundbreaking",
        "cutting-edge", "state-of-the-art", "game-changing", "paradigm shift",
        "a myriad of", "a plethora of", "aims to bridge the gap", "bridge the gap",
        "shed light on", "sheds light on", "pave the way", "paves the way",
        "the advent of", "with the advent of", "in recent years", "in recent times",
        "has gained significant attention", "has attracted considerable interest",
        "has emerged as a promising", "a comprehensive overview",
        "a holistic approach", "holistic understanding", "showcasing the efficacy",
        "demonstrate the efficacy", "multifaceted", "underscores the importance",
        "navigate the complexities", "harness the potential", "harnessing the power",
        "it is imperative to", "it is crucial to", "a nuanced understanding",
        "nuanced approach", "robust and scalable", "seamlessly integrates",
        "the intricacies of", "intricate interplay",
        "facilitate a deeper understanding", "a testament to",
    ]
    draft_lower = draft.lower()
    boilerplate_hits: list[str] = []
    for phrase in _BOILERPLATE_PHRASES:
        count = draft_lower.count(phrase)
        if count > 0:
            boilerplate_hits.extend([phrase] * count)
    if len(boilerplate_hits) > 5:
        unique_phrases = sorted(set(boilerplate_hits))[:5]
        overall_warnings.append(
            f"AI boilerplate detected: {len(boilerplate_hits)} instances "
            f"of generic LLM phrases (e.g., {', '.join(repr(p) for p in unique_phrases[:3])})"
        )
        revision_directives.append(
            "REWRITE sentences containing AI-generated boilerplate phrases. "
            "Replace generic language (e.g., 'delves into', 'it is worth noting', "
            "'leverages the power of', 'plays a crucial role', 'paves the way') "
            "with precise, specific academic language."
        )

    # --- Related work depth check ---
    _rw_headings = {"related work", "related works", "background", "literature review"}
    rw_body = ""
    for sec in sections_data:
        if sec["heading_lower"] in _rw_headings and sec["level"] <= 2:
            rw_body = sec["body"]
            break
    if rw_body and len(rw_body.split()) > 50:
        _comparative_pats = re.compile(
            r"\b(unlike|in contrast|whereas|while .+ focus|"
            r"however|differ(?:s|ent)|our (?:method|approach) .+ instead|"
            r"we (?:instead|differ)|compared to|as opposed to|"
            r"goes beyond|extends|improves upon|addresses the limitation)\b",
            re.IGNORECASE,
        )
        sentences = [s.strip() for s in re.split(r"[.!?]+", rw_body) if s.strip()]
        comparative_sents = sum(1 for s in sentences if _comparative_pats.search(s))
        ratio = comparative_sents / len(sentences) if sentences else 0.0
        if ratio < 0.15 and len(sentences) >= 5:
            overall_warnings.append(
                f"Related Work is purely descriptive: only {comparative_sents}/{len(sentences)} "
                f"sentences ({ratio:.0%}) contain comparative language (target: >=15%)"
            )
            revision_directives.append(
                "REWRITE Related Work to critically compare with prior methods. "
                "Use phrases like 'unlike X, our approach...', 'in contrast to...', "
                "'while X focuses on... we address...' for at least 20% of sentences."
            )

    # --- Statistical rigor check (result sections) ---
    _results_headings = {"results", "experiments", "experimental results", "evaluation"}
    results_body = ""
    for sec in sections_data:
        if sec["heading_lower"] in _results_headings and sec["level"] <= 2:
            results_body += sec["body"] + "\n"
    if results_body and len(results_body.split()) > 100:
        has_std = bool(re.search(r"\u00b1|\\pm|\bstd\b|\\std\b|standard deviation", results_body, re.IGNORECASE))
        has_ci = bool(re.search(r"confidence interval|\bCI\b|95%|p-value|p\s*<", results_body, re.IGNORECASE))
        has_seeds = bool(re.search(r"(?:seed|run|trial)s?\s*[:=]\s*\d|averaged?\s+over\s+\d+\s+(?:seed|run|trial)", results_body, re.IGNORECASE))
        if not has_std and not has_ci and not has_seeds:
            overall_warnings.append(
                "No statistical measures found in results (no std, CI, p-values, or multi-seed reporting)"
            )
            revision_directives.append(
                "ADD error bars (\u00b1std), confidence intervals, or note the number of "
                "random seeds used. Single-run results without variance reporting "
                "are insufficient for top venues."
            )

    result: dict[str, Any] = {
        "section_analysis": section_analysis,
        "overall_warnings": overall_warnings,
        "revision_directives": revision_directives,
    }
    if stage_dir is not None:
        (stage_dir / "draft_quality.json").write_text(
            json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8"
        )
    if overall_warnings:
        logger.warning(
            "Draft quality: %d warning(s) \u2014 %s",
            len(overall_warnings),
            "; ".join(overall_warnings[:3]),
        )
    else:
        logger.info("Draft quality: all checks passed")
    return result
) # --- Statistical rigor check (result sections) --- _results_headings = {"results", "experiments", "experimental results", "evaluation"} results_body = "" for sec in sections_data: if sec["heading_lower"] in _results_headings and sec["level"] <= 2: results_body += sec["body"] + "\n" if results_body and len(results_body.split()) > 100: has_std = bool(re.search(r"\u00b1|\\pm|\bstd\b|\\std\b|standard deviation", results_body, re.IGNORECASE)) has_ci = bool(re.search(r"confidence interval|\bCI\b|95%|p-value|p\s*<", results_body, re.IGNORECASE)) has_seeds = bool(re.search(r"(?:seed|run|trial)s?\s*[:=]\s*\d|averaged?\s+over\s+\d+\s+(?:seed|run|trial)", results_body, re.IGNORECASE)) if not has_std and not has_ci and not has_seeds: overall_warnings.append( "No statistical measures found in results (no std, CI, p-values, or multi-seed reporting)" ) revision_directives.append( "ADD error bars (\u00b1std), confidence intervals, or note the number of " "random seeds used. Single-run results without variance reporting " "are insufficient for top venues." ) result: dict[str, Any] = { "section_analysis": section_analysis, "overall_warnings": overall_warnings, "revision_directives": revision_directives, } if stage_dir is not None: (stage_dir / "draft_quality.json").write_text( json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8" ) if overall_warnings: logger.warning( "Draft quality: %d warning(s) \u2014 %s", len(overall_warnings), "; ".join(overall_warnings[:3]), ) else: logger.info("Draft quality: all checks passed") return result def _review_compiled_pdf( pdf_path: Path, llm: LLMClient, topic: str, ) -> dict[str, Any]: """Multi-dimensional LLM review of compiled paper (AI-Scientist style). Scores the paper on 7 academic review dimensions (1-10 each), identifies specific strengths/weaknesses, and provides an overall accept/reject recommendation with confidence. Returns a dict with dimensional scores, issues, and decision. 
""" if not pdf_path.exists(): return {} # Use source-based review since not all models support vision tex_path = pdf_path.with_suffix(".tex") if not tex_path.exists(): return {} tex_content = tex_path.read_text(encoding="utf-8")[:12000] review_prompt = ( "You are a senior Area Chair at a top AI conference (NeurIPS/ICML/ICLR) " "reviewing a paper submission. Provide a rigorous, structured review.\n\n" f"PAPER TOPIC: {topic}\n\n" f"LaTeX source:\n```latex\n{tex_content}\n```\n\n" "REVIEW INSTRUCTIONS:\n" "Score each dimension 1-10 (1=unacceptable, 5=borderline, 8=strong accept, " "10=best paper candidate). Be critical but fair.\n\n" "DIMENSIONS:\n" "1. SOUNDNESS: Are claims well-supported? Is methodology correct? " "Are there logical gaps or unsupported claims?\n" "2. PRESENTATION: Is the writing clear, flowing, and professional? " "Are there grammar errors, bullet lists in prose sections, or " "boilerplate phrases? Is it free of AI-generated slop?\n" "3. CONTRIBUTION: Is the contribution significant? Does it advance " "the field beyond incremental improvement?\n" "4. ORIGINALITY: Is the approach novel? Does it differentiate clearly " "from prior work?\n" "5. CLARITY: Are the method and results easy to understand? Are figures " "and tables well-designed with descriptive captions?\n" "6. SIGNIFICANCE: Would the community benefit from this work? Does it " "open new research directions?\n" "7. REPRODUCIBILITY: Are experimental details sufficient to reproduce " "results? 
Are hyperparameters, datasets, and metrics clearly stated?\n\n" "Also evaluate:\n" "- Are all figures referenced in the text?\n" "- Are tables properly formatted (booktabs style, no vertical rules)?\n" "- Does the related work critically compare, not just list papers?\n" "- Are statistical measures (std, CI, multiple seeds) reported?\n" "- Is there a clear limitations section?\n\n" "Return a JSON object:\n" "{\n" ' "soundness": N,\n' ' "presentation": N,\n' ' "contribution": N,\n' ' "originality": N,\n' ' "clarity": N,\n' ' "significance": N,\n' ' "reproducibility": N,\n' ' "overall_score": N,\n' ' "confidence": N,\n' ' "decision": "accept" or "reject",\n' ' "strengths": ["strength1", "strength2", ...],\n' ' "weaknesses": ["weakness1", "weakness2", ...],\n' ' "critical_issues": ["issue requiring revision", ...],\n' ' "minor_issues": ["formatting/typo issues", ...],\n' ' "summary": "2-3 sentence overall assessment"\n' "}\n" ) try: resp = llm.chat( messages=[{"role": "user", "content": review_prompt}], system=( "You are a meticulous, critical academic reviewer. " "You have reviewed 100+ papers at top venues. " "Score honestly — most papers deserve 4-6, not 7-9. " "Flag any sign of AI-generated boilerplate." 
), ) review_data = _safe_json_loads(resp.content, {}) if isinstance(review_data, dict) and "overall_score" in review_data: # Compute weighted aggregate if individual scores present dim_scores = { k: review_data.get(k, 0) for k in ( "soundness", "presentation", "contribution", "originality", "clarity", "significance", "reproducibility", ) } valid = {k: v for k, v in dim_scores.items() if isinstance(v, (int, float)) and v > 0} if valid: review_data["mean_score"] = round(sum(valid.values()) / len(valid), 2) return review_data except Exception as exc: # noqa: BLE001 logger.debug("PDF review LLM call failed: %s", exc) return {} def _check_ablation_effectiveness( exp_summary: dict[str, Any], threshold: float = 0.02, ) -> list[str]: """P7: Check if ablation results are within *threshold* of baseline. Returns a list of warning strings for ineffective ablations. Threshold tightened from 5% to 2% (Improvement C) — ablations with < 2% relative difference AND < 1pp absolute difference are flagged as TRIVIAL. 
""" warnings: list[str] = [] cond_summaries = exp_summary.get("condition_summaries", {}) if not isinstance(cond_summaries, dict) or not cond_summaries: return warnings # Find baseline/control condition baseline_name = None baseline_mean = None for name, data in cond_summaries.items(): if not isinstance(data, dict): continue name_lower = name.lower() if any(tag in name_lower for tag in ("baseline", "control", "vanilla", "standard")): metrics = data.get("metrics") or {} if not isinstance(metrics, dict): metrics = {} # Use the first metric that has a _mean suffix or the first available for mk, mv in metrics.items(): if mk.endswith("_mean"): baseline_name = name baseline_mean = float(mv) break if baseline_mean is None: for mk, mv in metrics.items(): try: baseline_name = name baseline_mean = float(mv) break except (TypeError, ValueError): continue if baseline_name: break if baseline_name is None or baseline_mean is None: return warnings # Check each ablation condition for name, data in cond_summaries.items(): if not isinstance(data, dict): continue name_lower = name.lower() if name == baseline_name: continue if not any(tag in name_lower for tag in ("ablation", "no_", "without", "reduced")): continue metrics = data.get("metrics") or {} if not isinstance(metrics, dict): metrics = {} for mk, mv in metrics.items(): if not mk.endswith("_mean"): continue try: abl_val = float(mv) except (TypeError, ValueError): continue if baseline_mean != 0: rel_diff = abs(abl_val - baseline_mean) / abs(baseline_mean) else: rel_diff = abs(abl_val - baseline_mean) abs_diff = abs(abl_val - baseline_mean) # Improvement C: Tighter check — both relative < threshold # AND absolute < 1pp → TRIVIAL if rel_diff < threshold and abs_diff < 1.0: warnings.append( f"TRIVIAL: Ablation '{name}' {mk}={abl_val:.4f} is within " f"{rel_diff:.1%} (abs {abs_diff:.4f}pp) of baseline " f"'{baseline_name}' {mk}={baseline_mean:.4f} — " f"ablation is ineffective" ) elif rel_diff < threshold: warnings.append( f"Ablation 
'{name}' {mk}={abl_val:.4f} is within " f"{rel_diff:.1%} of baseline '{baseline_name}' " f"{mk}={baseline_mean:.4f} — ablation may be ineffective" ) break # Only check the first _mean metric per condition # Improvement C: Prepend CRITICAL summary if >50% trivial trivial_count = sum(1 for w in warnings if w.startswith("TRIVIAL:")) if trivial_count > 0 and len(warnings) > 0 and trivial_count / len(warnings) > 0.5: warnings.insert(0, ( f"CRITICAL: {trivial_count}/{len(warnings)} ablations are trivially " f"similar to baseline (<{threshold:.0%} relative, <1pp absolute). " f"The ablation design is likely broken — components are not effectively removed." )) return warnings def _detect_result_contradictions( exp_summary: dict[str, Any], metric_direction: str = "maximize", ) -> list[str]: """P10: Detect contradictions in experiment results before paper writing. Returns a list of advisory strings to inject into paper writing prompt. """ advisories: list[str] = [] cond_summaries = exp_summary.get("condition_summaries", {}) if not isinstance(cond_summaries, dict) or not cond_summaries: return advisories # Collect primary metric means per condition means: dict[str, float] = {} for name, data in cond_summaries.items(): if not isinstance(data, dict): continue metrics = data.get("metrics", {}) for mk, mv in metrics.items(): if mk.endswith("_mean"): try: means[name] = float(mv) except (TypeError, ValueError): pass break if len(means) < 2: return advisories # Check 1: All methods within noise margin (2% relative spread) vals = list(means.values()) val_range = max(vals) - min(vals) val_mean = sum(vals) / len(vals) if val_mean != 0 and (val_range / abs(val_mean)) < 0.02: advisories.append( "NULL RESULT: All methods produce nearly identical primary metric values " f"(range={val_range:.4f}, mean={val_mean:.4f}). Frame this as a null result — " "the methods are statistically indistinguishable. Do NOT claim any method " "is superior. 
Discuss possible explanations (task too easy/hard, metric " "insensitive, insufficient differentiation in methods)." ) # Check 2: Control/simple baseline outperforms proposed method # BUG-P1: Respect metric_direction — "higher is better" vs "lower is better" _maximize = metric_direction == "maximize" baseline_val = None baseline_name = None proposed_val = None proposed_name = None for name, val in means.items(): name_lower = name.lower() if any(tag in name_lower for tag in ("baseline", "control", "random", "vanilla")): if baseline_val is None or (_maximize and val > baseline_val) or (not _maximize and val < baseline_val): baseline_val = val baseline_name = name elif any(tag in name_lower for tag in ("proposed", "our", "novel", "method")): if proposed_val is None or (_maximize and val > proposed_val) or (not _maximize and val < proposed_val): proposed_val = val proposed_name = name if baseline_val is not None and proposed_val is not None: _baseline_wins = (baseline_val > proposed_val) if _maximize else (baseline_val < proposed_val) if _baseline_wins: advisories.append( f"NEGATIVE RESULT: Baseline '{baseline_name}' ({baseline_val:.4f}) " f"outperforms proposed method '{proposed_name}' ({proposed_val:.4f}). " "This is a NEGATIVE result. Do NOT claim the proposed method is superior. " "Frame as 'An Empirical Study of...' or 'When X Falls Short'. " "Discuss why the baseline won and what this implies for future work." ) return advisories def _execute_paper_draft( stage_dir: Path, run_dir: Path, config: RCConfig, adapters: AdapterBundle, *, llm: LLMClient | None = None, prompts: PromptManager | None = None, ) -> StageResult: outline = _read_prior_artifact(run_dir, "outline.md") or "" preamble = _build_context_preamble( config, run_dir, include_goal=True, include_hypotheses=True, include_analysis=True, include_experiment_data=True, # WS-5.1: inject real experiment data ) # BUG-222: Read PROMOTED BEST experiment_summary for the paper prompt. 
# Previous code (R21-1) picked the "richest" experiment_summary across # all stage-14* dirs. After REFINE regression, a later iteration with # more conditions but worse quality could win, feeding the LLM regressed # data. Now: prefer experiment_summary_best.json (written by # _promote_best_stage14()), fall back to richest stage-14* for # non-REFINE runs. exp_summary_text = None _best_path = run_dir / "experiment_summary_best.json" if _best_path.is_file(): try: _text = _best_path.read_text(encoding="utf-8") _parsed = _safe_json_loads(_text, {}) if isinstance(_parsed, dict) and ( _parsed.get("condition_summaries") or _parsed.get("metrics_summary") ): exp_summary_text = _text logger.info("BUG-222: Using promoted experiment_summary_best.json") except OSError: pass if exp_summary_text is None: # Fallback: pick richest stage-14* (pre-BUG-222 behavior) _best_metric_count = 0 for _s14_dir in sorted(run_dir.glob("stage-14*")): _candidate = _s14_dir / "experiment_summary.json" if _candidate.is_file(): _text = _candidate.read_text(encoding="utf-8") _parsed = _safe_json_loads(_text, {}) if isinstance(_parsed, dict): _mcount = _parsed.get("total_metric_keys", 0) or len( _parsed.get("metrics_summary", {}) ) _paired_count = len(_parsed.get("paired_comparisons", [])) _cond_count = len(_parsed.get("condition_summaries", {})) _score = _mcount + _paired_count * 10 + _cond_count * 5 if _score > _best_metric_count: _best_metric_count = _score exp_summary_text = _text logger.info( "R21-1 fallback: Selected %s (score=%d)", _s14_dir.name, _score, ) if exp_summary_text is None: exp_summary_text = _read_prior_artifact(run_dir, "experiment_summary.json") exp_metrics_instruction = "" has_real_metrics = False _verified_registry = None # Phase 1: anti-fabrication verified data registry # BUG-108: Load refinement_log so VerifiedRegistry has per-iteration metrics _refinement_log_for_vr: dict | None = None _rl_candidates = sorted(run_dir.glob("stage-13*/refinement_log.json"), reverse=True) 
_rl_path = _rl_candidates[0] if _rl_candidates else None if _rl_path and _rl_path.is_file(): try: _refinement_log_for_vr = json.loads(_rl_path.read_text(encoding="utf-8")) except (json.JSONDecodeError, OSError): pass if exp_summary_text: exp_summary = _safe_json_loads(exp_summary_text, {}) # Phase 1: Build VerifiedRegistry from experiment data if isinstance(exp_summary, dict): try: from researchclaw.pipeline.verified_registry import VerifiedRegistry # BUG-222: Use best_only=True to ensure paper tables reflect # only the promoted best iteration, not regressed data _verified_registry = VerifiedRegistry.from_run_dir( run_dir, metric_direction=config.experiment.metric_direction, best_only=True, ) logger.info( "Stage 17: VerifiedRegistry — %d verified values, %d conditions", len(_verified_registry.values), len(_verified_registry.condition_names), ) except Exception as _vr_exc: logger.warning("Stage 17: Failed to build VerifiedRegistry: %s", _vr_exc) if isinstance(exp_summary, dict) and exp_summary.get("metrics_summary"): has_real_metrics = True exp_metrics_instruction = ( "\n\nIMPORTANT: Use the ACTUAL experiment results provided in the context. " "All numbers in the Results and Experiments sections MUST reference real data. " "Do NOT write 'no quantitative results yet' or use placeholder numbers. 
" "Cite specific metrics with their actual values.\n" ) # Collect raw experiment stdout metrics as hard constraint for the paper raw_metrics_block, _has_parsed_metrics = _collect_raw_experiment_metrics(run_dir) if raw_metrics_block: # BUG-23: Raw stdout alone is not sufficient — require either # metrics_summary data, parsed metrics from run JSONs, # OR at least 3 condition= patterns in raw block _has_condition_pattern = len(re.findall( r"condition[=:]", raw_metrics_block, re.IGNORECASE )) >= 3 if has_real_metrics or _has_parsed_metrics or _has_condition_pattern: has_real_metrics = True exp_metrics_instruction += raw_metrics_block # R18-1 + R19-6: Inject paired statistical comparisons AND condition summaries if exp_summary_text: exp_summary_parsed = _safe_json_loads(exp_summary_text, {}) if isinstance(exp_summary_parsed, dict): # R19-6: Inject experiment scale header so LLM knows the data richness _total_conds = exp_summary_parsed.get("total_conditions") _total_mkeys = exp_summary_parsed.get("total_metric_keys") if _total_conds or _total_mkeys: scale_block = "\n\n## EXPERIMENT SCALE\n" if _total_conds: scale_block += f"- Total conditions tested: {_total_conds}\n" if _total_mkeys: scale_block += f"- Total metric keys collected: {_total_mkeys}\n" scale_block += ( "- This is a MULTI-SEED experiment. Report mean +/- std across seeds.\n" "- Do NOT describe results as 'single run' or 'preliminary'.\n" ) exp_metrics_instruction += scale_block # Improvement B: Inject seed insufficiency warnings _seed_warns = exp_summary_parsed.get("seed_insufficiency_warnings", []) if _seed_warns: _sw_block = ( "\n\n## SEED INSUFFICIENCY WARNINGS\n" "Some conditions were run with fewer than 3 seeds. " "Results for these conditions MUST be footnoted as preliminary.\n" "All tables MUST show mean ± std format. 
Single-run values " "MUST be footnoted with '†single seed — interpret with caution'.\n" ) for _sw in _seed_warns: _sw_block += f"- {_sw}\n" exp_metrics_instruction += _sw_block # R19-6 + R33: Inject condition summaries with CIs cond_summaries = exp_summary_parsed.get("condition_summaries", {}) if isinstance(cond_summaries, dict) and cond_summaries: cond_block = "\n\n## PER-CONDITION SUMMARY (use in Results tables)\n" for cname, cdata in sorted(cond_summaries.items()): cond_block += f"\n### {cname}\n" if not isinstance(cdata, dict): continue sr = cdata.get("success_rate") if sr is not None: try: cond_block += f"- Success rate: {float(sr):.1%}\n" except (ValueError, TypeError): cond_block += f"- Success rate: {sr}\n" ns = cdata.get("n_seeds") or cdata.get("n_seed_metrics") if ns: cond_block += f"- Seeds: {ns}\n" ci_lo = cdata.get("ci95_low") ci_hi = cdata.get("ci95_high") if ci_lo is not None and ci_hi is not None: try: cond_block += f"- Bootstrap 95% CI: [{float(ci_lo):.4f}, {float(ci_hi):.4f}]\n" except (ValueError, TypeError): cond_block += f"- Bootstrap 95% CI: [{ci_lo}, {ci_hi}]\n" cm = cdata.get("metrics") or {} if isinstance(cm, dict) and cm: for mk, mv in sorted(cm.items()): if isinstance(mv, (int, float)): cond_block += f"- {mk}: {mv:.4f}\n" else: cond_block += f"- {mk}: {mv}\n" exp_metrics_instruction += cond_block # R18-1: Inject paired statistical comparisons paired = exp_summary_parsed.get("paired_comparisons", []) if paired: paired_block = "\n\n## PAIRED STATISTICAL COMPARISONS (use these in Results)\n" paired_block += f"Total: {len(paired)} paired tests computed.\n" for pc in paired: if not isinstance(pc, dict): continue method = pc.get("method", "?") baseline = pc.get("baseline", "?") regime = pc.get("regime", "all") md = pc.get("mean_diff", "?") sd = pc.get("std_diff", "?") ts = pc.get("t_stat", "?") pv = pc.get("p_value", "?") ci_lo = pc.get("ci95_low") ci_hi = pc.get("ci95_high") ci_str = "" if ci_lo is not None and ci_hi is not None: try: ci_str = 
f", 95% CI [{float(ci_lo):.3f}, {float(ci_hi):.3f}]" except (ValueError, TypeError): ci_str = f", 95% CI [{ci_lo}, {ci_hi}]" paired_block += ( f"- {method} vs {baseline} (regime={regime}): " f"mean_diff={md}, std_diff={sd}, " f"t={ts}, p={pv}{ci_str}\n" ) exp_metrics_instruction += paired_block # R24: Method naming map — translate generic condition labels _cond_names = list(cond_summaries.keys()) if isinstance(cond_summaries, dict) and cond_summaries else [] if _cond_names: naming_block = ( "\n\n## METHOD NAMING (CRITICAL — do NOT use generic labels in the paper)\n" "The condition labels below come from the experiment code. In the paper, " "you MUST use DESCRIPTIVE algorithm names, not generic labels.\n" "- If a condition name is already descriptive (e.g., 'random_search', " "'bayesian_optimization', 'ppo_policy'), use it directly as a proper name.\n" "- If a condition name is generic (e.g., 'baseline_1', 'method_variant_1'), " "you MUST infer the algorithm from the experiment code/context and use the " "real algorithm name (e.g., 'Random Search', 'Bayesian Optimization', " "'PPO', 'Curiosity-Driven RL').\n" "- NEVER write `baseline_1` or `method_variant_1` in the paper text.\n" f"- Conditions to name: {_cond_names}\n" ) exp_metrics_instruction += naming_block # IMP-8: Inject broken ablation warnings abl_warnings = exp_summary_parsed.get("ablation_warnings", []) if abl_warnings: broken_block = ( "\n\n## BROKEN ABLATIONS (DO NOT discuss as valid results)\n" "The following ablation conditions produced IDENTICAL outputs, " "indicating implementation bugs. Do NOT present their differences " "as findings. 
Mention them ONLY in a 'Limitations' sub-section " "as known implementation issues:\n" ) for _aw in abl_warnings: broken_block += f"- {_aw}\n" broken_block += ( "\nIf you reference these conditions, state explicitly: " "'Due to an implementation defect, conditions X and Y produced " "identical outputs; their comparison is therefore uninformative.'\n" ) exp_metrics_instruction += broken_block # R25: Statistical table format requirement if paired: stat_table_block = ( "\n\n## STATISTICAL TABLE REQUIREMENT (MANDATORY in Results section)\n" "The Results section MUST include a statistical comparison table with columns:\n" "| Comparison | Mean Diff | Std Diff | t-statistic | p-value | Significance |\n" "Use the PAIRED STATISTICAL COMPARISONS data above to fill this table.\n" "Mark significance: *** (p<0.001), ** (p<0.01), * (p<0.05), n.s.\n" "This is non-negotiable — a top-venue paper MUST have statistical tests.\n" ) exp_metrics_instruction += stat_table_block # R26: Metric definition requirement exp_metrics_instruction += ( "\n\n## METRIC DEFINITIONS (MANDATORY in Experiments section)\n" "The Experiments section MUST define each metric:\n" "- **Primary metric**: what it measures, how it is computed, range, direction " "(higher/lower is better), and units if applicable.\n" "- **Secondary metric**: same details.\n" "- For time-to-event metrics: explain the horizon, what constitutes success, " "and how failures are handled (e.g., set to max horizon).\n" "- These definitions MUST appear BEFORE any results tables.\n" ) # R27: Multi-seed framing enforcement _any_seeds = any( (cond_summaries.get(c) or {}).get("n_seed_metrics", 0) > 1 for c in _cond_names ) if _cond_names else False if _any_seeds: exp_metrics_instruction += ( "\n\n## MULTI-SEED EXPERIMENT FRAMING (CRITICAL)\n" "This experiment uses MULTIPLE independent random seeds per condition.\n" "- Report mean +/- std (or SE) for all metrics.\n" "- NEVER describe this as 'a single run' or '1 benchmark-artifact run'.\n" "- 
Frame as: 'We evaluate each method across N seeds per regime.'\n" "- The seed-level data IS the evidence base — it is NOT a single observation.\n" "- Include per-regime breakdowns (easy vs hard) as separate rows in tables.\n" ) # BUG-003: Inject actual evaluated datasets as a hard constraint if exp_summary_text: _ds_parsed = _safe_json_loads(exp_summary_text, {}) if isinstance(_ds_parsed, dict): _datasets: set[str] = set() # Extract from condition names (often contain dataset info) for _cname in (_ds_parsed.get("condition_summaries") or {}).keys(): _datasets.add(str(_cname)) # Extract from explicit "datasets" field if present for _ds in (_ds_parsed.get("datasets") or []): if isinstance(_ds, str): _datasets.add(_ds) # Extract from "benchmark" or "dataset" fields for _key in ("benchmark", "dataset", "dataset_name"): _dv = _ds_parsed.get(_key) if isinstance(_dv, str) and _dv: _datasets.add(_dv) if _datasets: exp_metrics_instruction += ( "\n\n## ACTUAL EVALUATED DATASETS (HARD CONSTRAINT)\n" "The following datasets/conditions were ACTUALLY tested in experiments:\n" + "".join(f"- {d}\n" for d in sorted(_datasets)) + "\nCRITICAL: Do NOT claim evaluation on any dataset not listed above.\n" "Do NOT fabricate results for datasets you did not run experiments on.\n" "If you reference other datasets, clearly state they are 'not evaluated " "in this work' or are 'left for future work'.\n" ) # P7: Ablation effectiveness check if exp_summary_text: _exp_parsed_p7 = _safe_json_loads(exp_summary_text, {}) if isinstance(_exp_parsed_p7, dict): _abl_warnings = _check_ablation_effectiveness(_exp_parsed_p7) if _abl_warnings: _abl_block = ( "\n\n## ABLATION EFFECTIVENESS WARNINGS\n" "The following ablations showed minimal effect (within 5% of baseline). 
" "Discuss this honestly — it may indicate the ablated component is not " "important, or the ablation was not properly implemented:\n" ) for _aw in _abl_warnings: _abl_block += f"- {_aw}\n" exp_metrics_instruction += _abl_block logger.warning("P7: Ablation effectiveness warnings: %s", _abl_warnings) # P10: Contradiction detection if exp_summary_text: _exp_parsed_p10 = _safe_json_loads(exp_summary_text, {}) if isinstance(_exp_parsed_p10, dict): _contradictions = _detect_result_contradictions( _exp_parsed_p10, metric_direction=config.experiment.metric_direction ) if _contradictions: _contra_block = ( "\n\n## RESULT INTERPRETATION ADVISORIES (CRITICAL — read before writing)\n" ) for _ca in _contradictions: _contra_block += f"- {_ca}\n" exp_metrics_instruction += _contra_block logger.warning("P10: Contradiction advisories: %s", _contradictions) # R10: HARD BLOCK — refuse to write paper when all data is simulated all_simulated = True for stage_subdir in sorted(run_dir.glob("stage-*/runs")): for run_file in sorted(stage_subdir.glob("*.json")): if run_file.name == "results.json": continue try: _payload = json.loads(run_file.read_text(encoding="utf-8")) except (json.JSONDecodeError, OSError): continue if isinstance(_payload, dict) and _payload.get("status") != "simulated": all_simulated = False break if not all_simulated: break if all_simulated: logger.error( "BLOCKED: All experiment data is simulated (mode='simulated'). " "Cannot write a paper based on formulaic fake data. " "Switch to experiment.mode='sandbox' and re-run." ) (stage_dir / "paper_draft.md").write_text( "# Paper Draft Blocked\n\n" "**Reason**: All experiment results are from simulated mode " "(formulaic data: `0.3 + idx * 0.03`). 
" "These are not real experimental results.\n\n" "**Action Required**: Set `experiment.mode: 'sandbox'` in " "config.arc.yaml and re-run the pipeline.", encoding="utf-8", ) return StageResult( stage=Stage.PAPER_DRAFT, status=StageStatus.FAILED, artifacts=("paper_draft.md",), evidence_refs=(), ) # R4-2: HARD BLOCK — refuse to write paper with no real data (ML/empirical domains) # For non-empirical domains (math proofs, theoretical economics), allow proceeding _domain_id, _domain_name, _domain_venues = _detect_domain( config.research.topic, config.research.domains ) _empirical_domains = {"ml", "engineering", "biology", "chemistry"} if not has_real_metrics: if _domain_id in _empirical_domains: logger.error( "BLOCKED: Cannot write paper — experiment produced NO metrics. " "The pipeline will not fabricate results." ) (stage_dir / "paper_draft.md").write_text( "# Paper Draft Blocked\n\n" "**Reason**: Experiment stage produced no metrics (status: failed/timeout). " "Cannot write a paper without real experimental data.\n\n" "**Action Required**: Fix experiment execution or increase time_budget_sec.", encoding="utf-8", ) return StageResult( stage=Stage.PAPER_DRAFT, status=StageStatus.FAILED, artifacts=("paper_draft.md",), evidence_refs=(), ) else: logger.warning( "No experiment metrics found, but domain '%s' may be non-empirical " "(theoretical/mathematical). Proceeding with paper draft.", _domain_name, ) # R11-5: Experiment quality minimum threshold before paper writing # Parse analysis.md for quality rating and condition completeness analysis_text = _read_best_analysis(run_dir) _quality_warnings: list[str] = [] # Check 1: Was the analysis quality rating very low? 
import re as _re_q _rating_match = _re_q.search( r"(?:quality\s+rating|result\s+quality)[:\s]*\**(\d+)\s*/\s*10", analysis_text, _re_q.IGNORECASE, ) if _rating_match: _analysis_rating = int(_rating_match.group(1)) if _analysis_rating <= 3: _quality_warnings.append( f"Analysis rated experiment quality {_analysis_rating}/10" ) # BUG-23: If quality rating is ≤ 2, force has_real_metrics = False # to prevent fabricated results even if stdout had stray numbers. # R5-BUG-05: Skip override when _has_parsed_metrics is True — the # analysis.md may be stale (from pre-refinement Stage 14) while # Stage 13 refinement produced real parsed metrics. if _analysis_rating <= 2 and has_real_metrics and not _has_parsed_metrics: logger.warning( "BUG-23 guard: Analysis quality %d/10 \u2264 2 — " "overriding has_real_metrics to False (experiment likely failed)", _analysis_rating, ) has_real_metrics = False # Check 2: Are baselines missing? _analysis_lower = analysis_text.lower() if "no" in _analysis_lower and "baseline" in _analysis_lower: if any(phrase in _analysis_lower for phrase in [ "no baseline", "no bo", "no random", "baselines are missing", "missing baselines", "baseline coverage is missing", ]): _quality_warnings.append("Baselines appear to be missing from results") # Check 3: Is the metric undefined? 
if any(phrase in _analysis_lower for phrase in [ "metric is undefined", "primary_metric is undefined", "undefined metric", "metric undefined", ]): _quality_warnings.append("Primary metric is undefined (direction/units/formula unknown)") # Check 4: Very few conditions completed _condition_count = len(_re_q.findall( r"condition[=:\s]+\w+.*?(?:mean|primary_metric)", raw_metrics_block or "", _re_q.IGNORECASE, )) if _quality_warnings: _warning_block = "\n".join(f" - {w}" for w in _quality_warnings) logger.warning( "Stage 17: Experiment quality concerns detected before paper writing:\n%s", _warning_block, ) # Inject quality warnings into the paper writing prompt so the LLM # writes an appropriately hedged paper exp_metrics_instruction += ( "\n\n## EXPERIMENT QUALITY WARNINGS (address these honestly in the paper)\n" + "\n".join(f"- {w}" for w in _quality_warnings) + "\n\nBecause of these issues, the paper MUST:\n" "- Use hedged language ('preliminary', 'pilot', 'initial exploration')\n" "- NOT claim definitive comparisons between methods\n" "- Dedicate a substantial Limitations section to these gaps\n" "- Frame the contribution as methodology/framework, not empirical findings\n" ) # Save warnings for tracking (stage_dir / "quality_warnings.json").write_text( json.dumps(_quality_warnings, indent=2), encoding="utf-8" ) # Phase 1: Inject pre-built results tables from VerifiedRegistry if _verified_registry is not None: try: from researchclaw.templates.results_table_builder import ( build_results_tables, build_condition_whitelist, ) _prebuilt_tables = build_results_tables( _verified_registry, metric_direction=_verified_registry.metric_direction, ) _condition_whitelist = build_condition_whitelist(_verified_registry) if _prebuilt_tables: _tables_block = "\n\n".join(t.latex_code for t in _prebuilt_tables) exp_metrics_instruction += ( "\n\n## PRE-BUILT RESULTS TABLES (MANDATORY — copy verbatim)\n" "The tables below were AUTO-GENERATED from verified experiment data.\n" "You MUST 
include these tables in the Results section EXACTLY as shown.\n" "Do NOT modify any numbers. Do NOT add rows with fabricated data.\n" "You MAY adjust formatting (bold, alignment) but NOT numerical values.\n\n" + _tables_block ) logger.info("Stage 17: Injected pre-built results tables into prompt") if _condition_whitelist: exp_metrics_instruction += ( "\n\n## VERIFIED CONDITIONS (ONLY mention these in the paper)\n" + _condition_whitelist + "\nDo NOT discuss conditions not in this list. Do NOT invent new conditions.\n" ) except Exception as _tb_exc: logger.warning("Stage 17: Failed to build pre-built tables: %s", _tb_exc) # R4-2: Anti-fabrication data integrity instruction exp_metrics_instruction += ( "\n\n## CRITICAL: Data Integrity Rules\n" "- You may ONLY report numbers that appear in the experiment data above\n" "- If the experiment data is incomplete (fewer conditions than planned), report\n" " ONLY the conditions that were actually run\n" "- Do NOT extrapolate, interpolate, or 'fill in' missing cells in tables\n" "- Do NOT invent confidence intervals, p-values, or statistical tests unless\n" " the actual data supports them\n" "- If only N conditions completed, simply report results for those N conditions\n" " without repeating apologies or disclaimers about missing conditions\n" "- Any table cell without real data must show '\u2014' (not a plausible number)\n" "- FORBIDDEN: generating numbers that 'look right' based on your training data\n" ) # IMP-6 + FA: Inject chart references into paper draft prompt # Prefer FigureAgent's figure_plan.json (rich descriptions) over raw file scan # BUG-FIX: figure_plan.json may be a list (from FigureAgent planner) or a dict # (from executor overwrite). The orchestrator writes a list at planning time; # the executor overwrites with a dict only when figure_count > 0. If the # FigureAgent renders 0 charts the list persists, and calling .get() on it # raises AttributeError. 
_fa_descriptions = "" # BUG-178: Iterate in reverse order so we read the LATEST stage-14 # iteration's figure plan, matching Stage 22 which copies charts # from the newest iteration. for _s14_dir in sorted(run_dir.glob("stage-14*"), reverse=True): # Prefer the final plan (dict with figure_descriptions) if it exists for _fp_name in ("figure_plan_final.json", "figure_plan.json"): _fp_path = _s14_dir / _fp_name if not _fp_path.exists(): continue try: _fp_data = json.loads(_fp_path.read_text(encoding="utf-8")) if isinstance(_fp_data, dict): _fa_descriptions = _fp_data.get("figure_descriptions", "") elif isinstance(_fp_data, list) and _fp_data: # List format from FigureAgent planner — synthesize descriptions _desc_parts = ["## PLANNED FIGURES (from figure plan)\n"] for _fig in _fp_data: if isinstance(_fig, dict): _fid = _fig.get("figure_id", "unnamed") _ftitle = _fig.get("title", "") _fcap = _fig.get("caption", "") _fsec = _fig.get("section", "results") _desc_parts.append( f"- **{_fid}** ({_fsec}): {_ftitle}\n {_fcap}" ) if len(_desc_parts) > 1: _fa_descriptions = "\n".join(_desc_parts) except (json.JSONDecodeError, OSError): pass if _fa_descriptions: break if _fa_descriptions: break if _fa_descriptions: exp_metrics_instruction += "\n\n" + _fa_descriptions logger.info("Stage 17: Injected FigureAgent figure descriptions into paper draft prompt") else: # Fallback: scan for chart files from the LATEST stage-14 iteration # BUG-178: Must use reverse order to match Stage 22 chart copy behavior _chart_files: list[str] = [] for _s14_dir in sorted(run_dir.glob("stage-14*"), reverse=True): _charts_path = _s14_dir / "charts" if _charts_path.is_dir(): _found = sorted(_charts_path.glob("*.png")) if _found: _chart_files = [f.name for f in _found] break # Use only the latest iteration's charts if _chart_files: _chart_block = ( "\n\n## AVAILABLE FIGURES (embed in the paper)\n" "The following figures were generated from actual experiment data. 
" "You MUST reference at least 1-2 of these in the Results section " "using markdown image syntax: `![Caption](charts/filename.png)`\n\n" ) for _cf_name in _chart_files: _label = _cf_name.replace("_", " ").replace(".png", "").title() _chart_block += f"- `charts/{_cf_name}` \u2014 {_label}\n" _chart_block += ( "\nFor each figure referenced, write a descriptive caption and " "discuss what the figure shows in 2-3 sentences.\n" ) exp_metrics_instruction += _chart_block logger.info( "Stage 17: Injected %d chart references into paper draft prompt", len(_chart_files), ) # WS-5.5: Framework diagram placeholder instruction exp_metrics_instruction += ( "\n\n## FRAMEWORK DIAGRAM PLACEHOLDER\n" "In the Method/Approach section, include a placeholder for the methodology " "framework overview figure. Insert this exactly:\n\n" "```\n" "![Framework Overview](charts/framework_diagram.png)\n" "**Figure N.** Overview of the proposed methodology. " "[A detailed framework diagram will be generated separately and inserted here.]\n" "```\n\n" "This figure should be referenced in the text as 'Figure N' and discussed briefly " "(1-2 sentences describing the overall pipeline/architecture flow). " "The actual image will be generated post-hoc using a text-to-image model.\n" ) # P5: Extract hyperparameters from results.json for paper Method section _hp_table = "" for _s14_dir in sorted(run_dir.glob("stage-14*")): for _run_file in sorted(_s14_dir.glob("runs/*.json")): try: _run_data = json.loads(_run_file.read_text(encoding="utf-8")) if isinstance(_run_data, dict) and _run_data.get("hyperparameters"): _hp = _run_data["hyperparameters"] if isinstance(_hp, dict) and _hp: _hp_table = "\n\n## HYPERPARAMETERS (include as a table in the Method section)\n" _hp_table += "| Hyperparameter | Value |\n|---|---|\n" for _hk, _hv in sorted(_hp.items()): _hp_table += f"| {_hk} | {_hv} |\n" _hp_table += ( "\nThis table MUST appear in the Method/Experiments section. 
" "Include ALL hyperparameters used, with justification for key choices.\n" ) break except (json.JSONDecodeError, OSError): continue if _hp_table: break # Also check staging dirs for results.json if not _hp_table: for _staging_dir in sorted(run_dir.glob("stage-*/runs/_docker_*")): _rjson = _staging_dir / "results.json" if _rjson.is_file(): try: _rdata = json.loads(_rjson.read_text(encoding="utf-8")) if isinstance(_rdata, dict) and _rdata.get("hyperparameters"): _hp = _rdata["hyperparameters"] if isinstance(_hp, dict) and _hp: _hp_table = "\n\n## HYPERPARAMETERS (include as a table in the Method section)\n" _hp_table += "| Hyperparameter | Value |\n|---|---|\n" for _hk, _hv in sorted(_hp.items()): _hp_table += f"| {_hk} | {_hv} |\n" _hp_table += ( "\nThis table MUST appear in the Method/Experiments section. " "Include ALL hyperparameters used, with justification for key choices.\n" ) break except (json.JSONDecodeError, OSError): continue if _hp_table: exp_metrics_instruction += _hp_table # F2.6: Build citation list from references.bib / candidates with cite_keys citation_instruction = "" bib_text = _read_prior_artifact(run_dir, "references.bib") # P3: Pre-verify citations before paper draft — remove hallucinated refs if bib_text and bib_text.strip(): from researchclaw.literature.verify import ( filter_verified_bibtex, verify_citations as _verify_cit, ) try: _pre_report = _verify_cit(bib_text, inter_verify_delay=0.5) _kept = _pre_report.verified + _pre_report.suspicious _removed = _pre_report.hallucinated if _removed > 0: bib_text = filter_verified_bibtex( bib_text, _pre_report, include_suspicious=True ) (stage_dir / "references_preverified.bib").write_text( bib_text, encoding="utf-8" ) logger.info( "P3: Pre-verification kept %d/%d citations (removed %d hallucinated)", _kept, _pre_report.total, _removed, ) except Exception as exc: logger.warning("P3: Pre-verification failed, using original bib: %s", exc) candidates_text = _read_prior_artifact(run_dir, 
"candidates.jsonl") if candidates_text: cite_lines: list[str] = [] for row_text in candidates_text.strip().splitlines(): row = _safe_json_loads(row_text, {}) if isinstance(row, dict) and row.get("cite_key"): authors_info = "" if isinstance(row.get("authors"), list) and row["authors"]: first_author = row["authors"][0] if isinstance(first_author, dict): # BUG-38: name may be non-str (tuple/list) — force str _name = first_author.get("name", "") authors_info = _name if isinstance(_name, str) else str(_name) elif isinstance(first_author, str): authors_info = first_author if len(row["authors"]) > 1: authors_info += " et al." title = row.get("title", "") cite_lines.append( f"- [{row['cite_key']}] \u2192 TITLE: \"{title}\" " f"| {authors_info} " f"({row.get('venue', '')}, {row.get('year', '')}, " f"cited {row.get('citation_count', 0)} times) " f"| ONLY cite this key when discussing: {title}" ) if cite_lines: citation_instruction = ( "\n\nAVAILABLE REFERENCES (use [cite_key] to cite in the text):\n" + "\n".join(cite_lines) + "\n\nCRITICAL CITATION RULES:\n" "- In the body text, cite using [cite_key] format, e.g. [smith2024transformer].\n" "- Do NOT write a References section \u2014 it will be auto-generated from the bibliography file.\n" "- Do NOT invent any references or arXiv IDs not in the above list.\n" "- You may cite a subset, but NEVER fabricate citations or change arXiv IDs.\n" "- SEMANTIC MATCHING: Before citing a reference, verify that its TITLE matches\n" " the concept you are discussing. Do NOT use an unrelated cite_key just\n" " because it sounds similar.\n" "- If no reference in the list matches the concept you want to cite,\n" " write 'prior work has shown...' WITHOUT a citation, rather than using\n" " a mismatched reference.\n" "- Each [cite_key] MUST correspond to the paper whose title is shown\n" " next to that key in the list above. 
Cross-check before citing.\n" "\nCITATION QUANTITY & QUALITY CONSTRAINTS:\n" "- Cite 25-40 unique references in the paper body. The Related Work\n" " section alone should cite at least 15 references.\n" "- Every citation MUST be directly relevant to the paper's topic.\n" "- DO NOT cite papers from unrelated domains (wireless communication, " "manufacturing, UAV, etc.).\n" "- Prefer well-known, highly-cited papers over obscure ones.\n" "- If unsure whether a paper exists or is relevant, DO NOT cite it.\n" ) if llm is not None: _pm = prompts or PromptManager() topic_constraint = _pm.block("topic_constraint", topic=config.research.topic) # --- Section-by-section writing (3 calls) for conference-grade depth --- draft = _write_paper_sections( llm=llm, pm=_pm, run_dir=run_dir, preamble=preamble, topic_constraint=topic_constraint, exp_metrics_instruction=exp_metrics_instruction, citation_instruction=citation_instruction, outline=outline, model_name=config.llm.primary_model, ) # R7: Strip LLM-generated References section — it often fabricates arXiv IDs. import re as _re_r7 ref_pattern = _re_r7.compile( r'^(#{1,2}\s*References.*)', _re_r7.MULTILINE | _re_r7.DOTALL ) ref_match = ref_pattern.search(draft) if ref_match: draft = draft[:ref_match.start()].rstrip() logger.info("Stage 17: Stripped LLM-generated References section (R7 fix)") else: # Build template with real data if available results_section = "Template results summary." if exp_summary_text: exp_summary = _safe_json_loads(exp_summary_text, {}) if isinstance(exp_summary, dict) and exp_summary.get("metrics_summary"): lines = ["Experiment results:"] for mk, mv in exp_summary["metrics_summary"].items(): if isinstance(mv, dict): lines.append( f"- {mk}: mean={mv.get('mean')}, min={mv.get('min')}, " f"max={mv.get('max')}, n={mv.get('count')}" ) results_section = "\n".join(lines) draft = f"""# Draft Title ## Abstract Template draft abstract. ## Introduction Template introduction for {config.research.topic}. 
## Related Work Template related work. ## Method Template method description. ## Experiments Template experimental setup. ## Results {results_section} ## Limitations Template limitations. ## Conclusion Template conclusion. ## References Template references. Generated: {_utcnow_iso()} """ (stage_dir / "paper_draft.md").write_text(draft, encoding="utf-8") # Validate draft quality (section balance + bullet density) _validate_draft_quality(draft, stage_dir=stage_dir) return StageResult( stage=Stage.PAPER_DRAFT, status=StageStatus.DONE, artifacts=("paper_draft.md",), evidence_refs=("stage-17/paper_draft.md",), ) ================================================ FILE: researchclaw/pipeline/stage_impls/_review_publish.py ================================================ """Stages 18-23: Peer review, paper revision, quality gate, knowledge archive, export/publish, and citation verify.""" from __future__ import annotations import json import logging import math import re from collections import Counter from pathlib import Path from typing import Any import yaml # noqa: F401 — available for downstream use from researchclaw.adapters import AdapterBundle from researchclaw.config import RCConfig from researchclaw.llm.client import LLMClient from researchclaw.pipeline._domain import _detect_domain # noqa: F401 from researchclaw.pipeline._helpers import ( StageResult, _build_context_preamble, _chat_with_prompt, _collect_experiment_results, # noqa: F401 _default_quality_report, _extract_paper_title, _find_prior_file, _generate_framework_diagram_prompt, _generate_neurips_checklist, _get_evolution_overlay, _read_best_analysis, _read_prior_artifact, _safe_json_loads, _topic_constraint_block, # noqa: F401 _utcnow_iso, reconcile_figure_refs, ) from researchclaw.pipeline.stages import Stage, StageStatus from researchclaw.prompts import PromptManager logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Helpers imported from 
def _get_collect_raw_experiment_metrics():
    """Lazily resolve ``_collect_raw_experiment_metrics`` from the paper-writing
    module (imported inside the function to avoid a circular import at load time)."""
    from researchclaw.pipeline.stage_impls import _paper_writing
    return _paper_writing._collect_raw_experiment_metrics


def _get_review_compiled_pdf():
    """Lazily resolve ``_review_compiled_pdf`` from the paper-writing module
    (function-scope import avoids the executor.py circular-import problem)."""
    from researchclaw.pipeline.stage_impls import _paper_writing
    return _paper_writing._review_compiled_pdf


# ---------------------------------------------------------------------------
# _collect_experiment_evidence
# ---------------------------------------------------------------------------


def _collect_experiment_evidence(run_dir: Path) -> str:
    """Assemble a markdown evidence dossier (code excerpt, run metrics,
    refinement summary, trial count) so the peer-review stage can cross-check
    the paper's methodology claims against what actually ran.

    Returns an empty string when no evidence is found.
    """
    parts: list[str] = []

    # 1. Experiment code — lets the reviewer see the actual methods/trials.
    # NOTE(review): _read_prior_artifact("experiment/") is treated here as a
    # directory *path*, not file content — presumably the helper special-cases
    # trailing-slash lookups; confirm against its implementation.
    exp_dir = _read_prior_artifact(run_dir, "experiment/")
    if exp_dir and Path(exp_dir).is_dir():
        entry_script = Path(exp_dir) / "main.py"
        if entry_script.exists():
            source_text = entry_script.read_text(encoding="utf-8")
            # Truncate to 3000 chars to keep the review prompt bounded.
            parts.append(f"### Actual Experiment Code (main.py)\n```python\n{source_text[:3000]}\n```")

    # 2. Sandbox run results — real metrics, runtime, and stderr excerpts.
    runs_dir = _read_prior_artifact(run_dir, "runs/")
    if runs_dir and Path(runs_dir).is_dir():
        # Cap at five run files to bound prompt size.
        for run_file in sorted(Path(runs_dir).glob("*.json"))[:5]:
            payload = _safe_json_loads(run_file.read_text(encoding="utf-8"), {})
            if not isinstance(payload, dict):
                continue
            # Keys are included even when absent (value None) — matches the
            # JSON the reviewer prompt expects.
            summary = {key: payload.get(key) for key in ("metrics", "elapsed_sec", "timed_out")}
            stderr = payload.get("stderr", "")
            if stderr:
                summary["stderr_excerpt"] = stderr[:500]
            parts.append(
                f"### Run Result: {run_file.name}\n```json\n{json.dumps(summary, indent=2)}\n```"
            )

    # 3. Refinement log — actual iteration count and convergence outcome.
    refine_log_text = _read_prior_artifact(run_dir, "refinement_log.json")
    if refine_log_text:
        try:
            refine_log = json.loads(refine_log_text)
            summary = {
                "iterations_executed": len(refine_log.get("iterations", [])),
                "converged": refine_log.get("converged"),
                "stop_reason": refine_log.get("stop_reason"),
                "best_metric": refine_log.get("best_metric"),
            }
            parts.append(
                f"### Refinement Summary\n```json\n{json.dumps(summary, indent=2)}\n```"
            )
        except (json.JSONDecodeError, TypeError):
            # Best-effort: a malformed log simply contributes no evidence.
            pass

    # 4. Count real experiment runs ("results.json" is an aggregate, not a run).
    actual_run_count = sum(
        1
        for stage_subdir in sorted(run_dir.glob("stage-*/runs"))
        for rf in stage_subdir.glob("*.json")
        if rf.name != "results.json"
    )
    if actual_run_count > 0:
        parts.append(
            f"### Actual Trial Count\n"
            f"**The experiment was executed {actual_run_count} time(s).** "
            f"If the paper claims a different number of trials, this is a CRITICAL discrepancy."
        )

    if not parts:
        return ""
    header = (
        "\n\n## Actual Experiment Evidence\n"
        "Use the evidence below to verify the paper's methodology claims.\n\n"
    )
    return header + "\n\n".join(parts)
def _execute_peer_review(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 18: produce peer reviews of the Stage-17 paper draft.

    Reads ``paper_draft.md``, attaches real experiment evidence and any
    Stage-17 draft-quality warnings to the review prompt, then writes the
    reviews to ``stage-18/reviews.md``.  Without an LLM, a fixed two-reviewer
    template is written instead.  Always returns a DONE StageResult.
    """
    draft = _read_prior_artifact(run_dir, "paper_draft.md") or ""
    # Real run metrics/code excerpts so the reviewer can detect fabrication.
    experiment_evidence = _collect_experiment_evidence(run_dir)

    # Load draft quality warnings from Stage 17 (if available)
    _quality_suffix = ""
    _quality_json_path = _find_prior_file(run_dir, "draft_quality.json")
    if _quality_json_path and _quality_json_path.exists():
        try:
            _dq = json.loads(_quality_json_path.read_text(encoding="utf-8"))
            _dq_warnings = _dq.get("overall_warnings", [])
            if _dq_warnings:
                # Appended to the user prompt so reviewers flag known issues.
                _quality_suffix = (
                    "\n\nAUTOMATED QUALITY ISSUES (flag these in your review):\n"
                    + "\n".join(f"- {w}" for w in _dq_warnings)
                    + "\n"
                )
        except Exception:  # noqa: BLE001
            # Best-effort: quality warnings are optional context, never fatal.
            pass

    if llm is not None:
        _pm = prompts or PromptManager()
        # Self-evolution overlay may augment the stage prompt for this run.
        _overlay = _get_evolution_overlay(run_dir, "peer_review")
        sp = _pm.for_stage(
            "peer_review",
            evolution_overlay=_overlay,
            topic=config.research.topic,
            draft=draft,
            experiment_evidence=experiment_evidence,
        )
        _review_user = sp.user + _quality_suffix
        resp = _chat_with_prompt(
            llm,
            sp.system,
            _review_user,
            json_mode=sp.json_mode,
            max_tokens=sp.max_tokens,
        )
        reviews = resp.content
    else:
        # Offline fallback: deterministic template so downstream stages
        # (revision, quality gate) still have something to consume.
        reviews = """# Reviews

## Reviewer A
- Strengths: Clear problem statement.
- Weaknesses: Limited ablation details.
- Actionable revisions: Add uncertainty analysis and stronger baselines.

## Reviewer B
- Strengths: Reproducibility focus.
- Weaknesses: Discussion underdeveloped.
- Actionable revisions: Expand limitations and broader impact.
"""

    (stage_dir / "reviews.md").write_text(reviews, encoding="utf-8")
    return StageResult(
        stage=Stage.PEER_REVIEW,
        status=StageStatus.DONE,
        artifacts=("reviews.md",),
        evidence_refs=("stage-18/reviews.md",),
    )
def _execute_paper_revision(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 19: revise the paper draft to address the Stage-18 reviews.

    Guards against two failure modes: fabricated numbers (a data-integrity
    instruction built from real experiment metrics is appended to the
    reviews) and content loss (if the revision comes back shorter than 80%
    of the draft, retry once with an explicit length requirement; if both
    attempts are short, keep the full original draft and save the short
    output as internal revision notes).  Writes ``stage-19/paper_revised.md``
    and always returns a DONE StageResult.
    """
    draft = _read_prior_artifact(run_dir, "paper_draft.md") or ""
    reviews = _read_prior_artifact(run_dir, "reviews.md") or ""
    draft_word_count = len(draft.split())

    # R4-2: Collect real metrics for anti-fabrication guard in revision
    # BUG-47: _collect_raw_experiment_metrics returns tuple[str, bool], must unpack
    _raw_metrics_tuple = _get_collect_raw_experiment_metrics()(run_dir)
    raw_metrics_revision = (
        _raw_metrics_tuple[0]
        if isinstance(_raw_metrics_tuple, tuple)
        else (_raw_metrics_tuple or "")
    )
    data_integrity_revision = ""
    if raw_metrics_revision:
        data_integrity_revision = (
            raw_metrics_revision
            + "\nDATA INTEGRITY: Do NOT add new numbers that are not in the "
            "experiment data above. If a reviewer asks for additional results "
            "you do not have, state 'Due to computational constraints, "
            "this analysis was not conducted' instead of fabricating data.\n"
        )

    if llm is not None:
        _pm = prompts or PromptManager()
        # Optional prompt blocks: absence of any block is non-fatal.
        # (Was `except (KeyError, Exception)` — Exception already covers KeyError.)
        try:
            _ws_revision = _pm.block("writing_structure")
        except Exception:  # noqa: BLE001
            _ws_revision = ""
        # IMP-20/25/31/24: Load style blocks for revision prompt
        _rev_blocks: dict[str, str] = {}
        for _bname in (
            "academic_style_guide",
            "narrative_writing_rules",
            "anti_hedging_rules",
            "anti_repetition_rules",
        ):
            try:
                _rev_blocks[_bname] = _pm.block(_bname)
            except Exception:  # noqa: BLE001
                _rev_blocks[_bname] = ""

        # Load draft quality directives from Stage 17
        _quality_prefix = ""
        _quality_json_path = _find_prior_file(run_dir, "draft_quality.json")
        if _quality_json_path and _quality_json_path.exists():
            try:
                _dq = json.loads(_quality_json_path.read_text(encoding="utf-8"))
                _dq_directives = _dq.get("revision_directives", [])
                if _dq_directives:
                    _quality_prefix = (
                        "MANDATORY QUALITY FIXES (address ALL of these):\n"
                        + "\n".join(f"- {d}" for d in _dq_directives)
                        + "\n\n"
                    )
            except Exception:  # noqa: BLE001
                pass

        _overlay = _get_evolution_overlay(run_dir, "paper_revision")
        sp = _pm.for_stage(
            "paper_revision",
            evolution_overlay=_overlay,
            topic_constraint=_pm.block("topic_constraint", topic=config.research.topic),
            writing_structure=_ws_revision,
            draft=draft,
            reviews=_quality_prefix + reviews + data_integrity_revision,
            **_rev_blocks,
        )
        # R10-Fix2: Ensure max_tokens is sufficient for full paper revision
        revision_max_tokens = sp.max_tokens
        if revision_max_tokens and draft_word_count > 0:
            # ~1.5 tokens per word, 20% headroom
            min_tokens_needed = int(draft_word_count * 1.5 * 1.2)
            if revision_max_tokens < min_tokens_needed:
                revision_max_tokens = min_tokens_needed
                logger.info(
                    "Stage 19: Increased max_tokens from %d to %d to fit full paper revision",
                    sp.max_tokens,
                    revision_max_tokens,
                )
        # R10-Fix4: Retry on timeout for paper revision (critical stage)
        resp = _chat_with_prompt(
            llm,
            sp.system,
            sp.user,
            json_mode=sp.json_mode,
            max_tokens=revision_max_tokens,
            retries=2,
        )
        revised = resp.content
        revised_word_count = len(revised.split())

        # Length guard: if revision is shorter than 80% of draft, retry once
        if draft_word_count > 500 and revised_word_count < int(draft_word_count * 0.8):
            logger.warning(
                "Paper revision (%d words) is shorter than draft (%d words). "
                "Retrying with stronger length enforcement.",
                revised_word_count,
                draft_word_count,
            )
            retry_user = (
                f"CRITICAL LENGTH REQUIREMENT: The draft is {draft_word_count} words. "
                f"Your revision MUST be at least {draft_word_count} words — ideally longer. "
                f"Do NOT summarize or condense ANY section. Copy each section verbatim "
                f"and ONLY make targeted improvements to address reviewer comments. "
                f"If a section has no reviewer comments, include it UNCHANGED.\n\n"
                + sp.user
            )
            resp2 = _chat_with_prompt(
                llm,
                sp.system,
                retry_user,
                json_mode=sp.json_mode,
                max_tokens=revision_max_tokens,
            )
            revised2 = resp2.content
            revised2_word_count = len(revised2.split())
            if revised2_word_count >= int(draft_word_count * 0.8):
                revised = revised2
            elif revised2_word_count > revised_word_count:
                # Retry improved but still not enough — use the longer version
                revised = revised2
                logger.warning(
                    "Retry improved (%d → %d words) but still shorter than draft (%d).",
                    revised_word_count,
                    revised2_word_count,
                    draft_word_count,
                )
            else:
                # Both attempts produced short output — preserve full original draft
                logger.warning(
                    "Retry also produced short output (%d words). "
                    "Falling back to FULL ORIGINAL DRAFT to prevent content loss.",
                    revised2_word_count,
                )
                # Extract useful revision points as appendix
                revision_words = revised.split()
                revision_summary = (
                    " ".join(revision_words[:500]) + "\n\n*(Revision summary truncated)*"
                    if len(revision_words) > 500
                    else revised
                )
                if revision_summary.strip():
                    # Save revision notes to internal file, not paper body
                    (stage_dir / "revision_notes_internal.md").write_text(
                        revision_summary, encoding="utf-8"
                    )
                # FIX: restore the draft unconditionally in this branch.
                # Previously the restore was tied to the notes being non-empty,
                # so a whitespace-only revision could leak through as the
                # final paper.
                revised = draft
    else:
        # Offline fallback: no LLM — pass the draft through unchanged.
        revised = draft

    (stage_dir / "paper_revised.md").write_text(revised, encoding="utf-8")
    return StageResult(
        stage=Stage.PAPER_REVISION,
        status=StageStatus.DONE,
        artifacts=("paper_revised.md",),
        evidence_refs=("stage-19/paper_revised.md",),
    )
def _execute_quality_gate(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 20: score the revised paper and gate export on quality.

    Cross-checks the paper against the richest available experiment summary
    (BUG-25/BUG-180), caps the score at 3.0 when the experiment failed with
    no metrics, writes ``quality_report.json`` plus ``fabrication_flags.json``
    (real metric values Stage 22 uses for sanitization), and returns FAILED —
    or DONE with decision="degraded" when graceful degradation is enabled —
    if the score falls below ``config.research.quality_threshold``.
    """
    revised = _read_prior_artifact(run_dir, "paper_revised.md") or ""
    report: dict[str, Any] | None = None

    # BUG-25 + BUG-180: Load the RICHEST experiment summary for cross-checking.
    # _read_prior_artifact returns the first match in reverse-sorted order,
    # which may be a repair stage with 0 conditions. Instead, scan all
    # stage-14* experiment summaries and pick the one with the most data.
    _exp_summary: dict[str, Any] = {}
    _exp_summary_text = ""
    _best_richness = -1  # -1 so even an empty dict summary beats "nothing found"
    for _es_path in sorted(run_dir.glob("stage-14*/experiment_summary.json")):
        try:
            _es_text = _es_path.read_text(encoding="utf-8")
            _es_data = _safe_json_loads(_es_text, {})
            if not isinstance(_es_data, dict):
                continue
            # Richness = number of completed condition summaries.
            _richness = len(_es_data.get("condition_summaries", {}))
            if _richness > _best_richness:
                _best_richness = _richness
                _exp_summary = _es_data
                _exp_summary_text = _es_text
        except OSError:
            continue
    # Also check experiment_summary_best.json at run root
    _root_best = run_dir / "experiment_summary_best.json"
    if _root_best.is_file():
        try:
            _rb_text = _root_best.read_text(encoding="utf-8")
            _rb_data = _safe_json_loads(_rb_text, {})
            if isinstance(_rb_data, dict):
                _rb_rich = len(_rb_data.get("condition_summaries", {}))
                if _rb_rich > _best_richness:
                    # FIX: _best_richness was not updated here, so condition
                    # data coming only from the root summary never triggered
                    # the BUG-180 "don't mark as failed" guard below.
                    _best_richness = _rb_rich
                    _exp_summary = _rb_data
                    _exp_summary_text = _rb_text
        except OSError:
            pass
    # Fallback to _read_prior_artifact if nothing found above
    if not _exp_summary:
        _exp_summary_text = _read_prior_artifact(run_dir, "experiment_summary.json") or ""
        _exp_summary = _safe_json_loads(_exp_summary_text, {}) if _exp_summary_text else {}

    # An experiment counts as failed when the best run failed with no metrics
    # or the summary has no metrics at all — unless real condition data exists.
    _exp_failed = False
    if isinstance(_exp_summary, dict):
        _best_run = _exp_summary.get("best_run", {})
        if isinstance(_best_run, dict):
            _exp_failed = (
                _best_run.get("status") == "failed" and not _best_run.get("metrics")
            )
        # Also check if metrics_summary is empty
        if not _exp_summary.get("metrics_summary"):
            _exp_failed = True
        # BUG-180: If we found real condition data, don't mark as failed
        if _best_richness > 0:
            _exp_failed = False

    if llm is not None:
        _pm = prompts or PromptManager()
        # IMP-33: Evaluate the full paper instead of truncating to 12K chars.
        # Split into chunks if very long, but prefer sending the full text.
        paper_for_eval = revised[:40000] if len(revised) > 40000 else revised
        # BUG-25: Inject experiment status into quality gate prompt
        _exp_context = ""
        if _exp_summary and isinstance(_exp_summary, dict):
            _exp_status_keys = {
                k: _exp_summary.get(k)
                for k in ("total_conditions", "total_metric_keys", "metrics_summary")
                if _exp_summary.get(k) is not None
            }
            # BUG-180: Include condition count from condition_summaries
            _cond_summ = _exp_summary.get("condition_summaries", {})
            if isinstance(_cond_summ, dict) and _cond_summ:
                _exp_status_keys["completed_conditions"] = len(_cond_summ)
                _exp_status_keys["condition_names"] = list(_cond_summ.keys())[:20]
            if _best_run := _exp_summary.get("best_run"):
                _exp_status_keys["best_run_status"] = (
                    _best_run.get("status") if isinstance(_best_run, dict) else str(_best_run)
                )
            _exp_context = (
                "\n\nExperiment summary (for cross-checking reported numbers):\n"
                + json.dumps(_exp_status_keys, indent=2, default=str)[:4000]
                + "\n\nCross-check: If the experiment status is 'failed' with "
                "empty metrics, any numerical results in tables constitute "
                "fabrication. Penalize severely.\n"
            )
        _overlay = _get_evolution_overlay(run_dir, "quality_gate")
        sp = _pm.for_stage(
            "quality_gate",
            evolution_overlay=_overlay,
            quality_threshold=str(config.research.quality_threshold),
            revised=paper_for_eval + _exp_context,
        )
        resp = _chat_with_prompt(
            llm, sp.system, sp.user, json_mode=sp.json_mode, max_tokens=sp.max_tokens,
        )
        parsed = _safe_json_loads(resp.content, {})
        if isinstance(parsed, dict):
            report = parsed
        # BUG-25: If experiment failed with no metrics, cap the quality score
        if report is not None and _exp_failed:
            _orig_score = report.get("score_1_to_10", 5)
            if isinstance(_orig_score, (int, float)) and _orig_score > 3:
                report["score_1_to_10"] = min(_orig_score, 3.0)
                report.setdefault("weaknesses", []).append(
                    "Experiment failed with no metrics — any reported numerical "
                    "results are unsupported and likely fabricated."
                )
                logger.warning(
                    "BUG-25: Experiment failed — capping quality score from %.1f to 3.0",
                    _orig_score,
                )

    if report is None:
        report = _default_quality_report(config.research.quality_threshold)
    report.setdefault("generated", _utcnow_iso())
    (stage_dir / "quality_report.json").write_text(
        json.dumps(report, indent=2), encoding="utf-8"
    )

    # T2.1: Enforce quality gate — fail if score below threshold
    score = report.get("score_1_to_10", 0)
    # BUG-R5-01: score can be string from LLM JSON — coerce to float
    if not isinstance(score, (int, float)):
        try:
            score = float(score)
        except (TypeError, ValueError):
            score = 0
    verdict = report.get("verdict", "proceed")
    threshold = config.research.quality_threshold or 5.0

    # --- Fabrication flag: collect real metrics for Stage 22 sanitization ---
    _fabrication_info: dict[str, Any] = {
        "experiment_failed": _exp_failed,
        "quality_score": score,
        "real_metric_values": [],
    }
    if isinstance(_exp_summary, dict):
        # Collect ALL real numeric values from experiment_summary.json
        _cond_summaries = _exp_summary.get("condition_summaries", {})
        if isinstance(_cond_summaries, dict):
            for cond_name, cond_data in _cond_summaries.items():
                if not isinstance(cond_data, dict):
                    continue
                cond_status = cond_data.get("status", "")
                if cond_status == "failed":
                    continue  # skip failed conditions
                for k, v in cond_data.items():
                    # Bookkeeping counters are not metric values.
                    if isinstance(v, (int, float)) and k not in (
                        "seed_count",
                        "total_steps",
                        "training_steps",
                    ):
                        _fabrication_info["real_metric_values"].append(
                            round(float(v), 4)
                        )
        _ms = _exp_summary.get("metrics_summary", {})
        if isinstance(_ms, dict):
            for _mk, _mv in _ms.items():
                if isinstance(_mv, dict):
                    for _stat in ("mean", "min", "max"):
                        _sv = _mv.get(_stat)
                        if isinstance(_sv, (int, float)):
                            _fabrication_info["real_metric_values"].append(
                                round(float(_sv), 4)
                            )
    _fabrication_info["has_real_data"] = bool(_fabrication_info["real_metric_values"])
    _fabrication_info["fabrication_suspected"] = (
        _exp_failed and not _fabrication_info["has_real_data"]
    )

    # Phase 1: Enhanced fabrication detection via VerifiedRegistry
    # BUG-108: Also pass refinement_log so NaN best_metric is properly handled
    # NOTE(review): _rl20 is loaded but never passed to VerifiedRegistry below —
    # either dead code or the BUG-108 wiring is incomplete; confirm intent.
    _rl20_candidates = sorted(run_dir.glob("stage-13*/refinement_log.json"), reverse=True)
    _rl20_path = _rl20_candidates[0] if _rl20_candidates else None
    _rl20: dict | None = None
    if _rl20_path and _rl20_path.is_file():
        try:
            _rl20 = json.loads(_rl20_path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            pass
    try:
        from researchclaw.pipeline.verified_registry import VerifiedRegistry as _VR20

        _vr20 = (
            _VR20.from_run_dir(
                run_dir,
                metric_direction=config.experiment.metric_direction,
                best_only=True,
            )
            if isinstance(_exp_summary, dict)
            else None
        )
        if _vr20:
            _fabrication_info["verified_values_count"] = len(_vr20.values)
            _fabrication_info["verified_conditions"] = sorted(_vr20.condition_names)
    except Exception:
        # Registry is an enhancement — never block the gate on it.
        pass
    (stage_dir / "fabrication_flags.json").write_text(
        json.dumps(_fabrication_info, indent=2), encoding="utf-8"
    )

    if isinstance(score, (int, float)) and score < threshold:
        if config.research.graceful_degradation:
            logger.warning(
                "Quality gate DEGRADED: score %.1f < threshold %.1f — "
                "continuing with sanitization (graceful_degradation=True)",
                score,
                threshold,
            )
            # Write degradation signal for downstream stages
            signal = {
                "score": score,
                "threshold": threshold,
                "verdict": verdict,
                "weaknesses": report.get("weaknesses", []),
                "generated": _utcnow_iso(),
            }
            (run_dir / "degradation_signal.json").write_text(
                json.dumps(signal, indent=2), encoding="utf-8"
            )
            return StageResult(
                stage=Stage.QUALITY_GATE,
                status=StageStatus.DONE,
                artifacts=("quality_report.json",),
                evidence_refs=("stage-20/quality_report.json",),
                decision="degraded",
            )
        logger.warning(
            "Quality gate FAILED: score %.1f < threshold %.1f (verdict=%s)",
            score,
            threshold,
            verdict,
        )
        return StageResult(
            stage=Stage.QUALITY_GATE,
            status=StageStatus.FAILED,
            artifacts=("quality_report.json", "fabrication_flags.json"),
            evidence_refs=("stage-20/quality_report.json",),
            error=f"Quality score {score:.1f}/10 below threshold {threshold:.1f}. "
            f"Paper needs revision before export.",
        )

    logger.info(
        "Quality gate PASSED: score %.1f >= threshold %.1f",
        score,
        threshold,
    )
    return StageResult(
        stage=Stage.QUALITY_GATE,
        status=StageStatus.DONE,
        artifacts=("quality_report.json", "fabrication_flags.json"),
        evidence_refs=("stage-20/quality_report.json",),
    )
Generated: {_utcnow_iso()}
"""
    # Persist the archive text regardless of whether it came from the LLM
    # or the static fallback template above.
    (stage_dir / "archive.md").write_text(archive, encoding="utf-8")

    # Build an index of every artifact produced by every stage so far.
    # bundle_index.json itself is excluded so the index does not list the
    # file it is being written into.
    files: list[str] = []
    for stage_subdir in sorted(run_dir.glob("stage-*")):
        for artifact in sorted(stage_subdir.rglob("*")):
            if artifact.is_file() and artifact != (stage_dir / "bundle_index.json"):
                files.append(str(artifact.relative_to(run_dir)))
    index = {
        "run_id": run_dir.name,
        "generated": _utcnow_iso(),
        "artifact_count": len(files),
        "artifacts": files,
    }
    (stage_dir / "bundle_index.json").write_text(
        json.dumps(index, indent=2), encoding="utf-8"
    )
    return StageResult(
        stage=Stage.KNOWLEDGE_ARCHIVE,
        status=StageStatus.DONE,
        artifacts=("archive.md", "bundle_index.json"),
        evidence_refs=("stage-21/archive.md", "stage-21/bundle_index.json"),
    )


# ---------------------------------------------------------------------------
# _sanitize_fabricated_data helper
# ---------------------------------------------------------------------------


def _sanitize_fabricated_data(
    paper: str,
    run_dir: Path,
) -> tuple[str, dict[str, Any]]:
    """Replace unverified numerical data in markdown tables with '---'.

    Loads experiment_summary.json as ground truth, extracts all verified
    metric values, then scans markdown tables in Results/Experiment
    sections. Numbers not matching any verified value (within 1% relative
    tolerance) are replaced with ``---``.

    Returns (sanitized_paper, sanitization_report) — the sanitization
    report is a JSON-serializable dict of counters describing what was
    replaced and what was kept.
    """
    import re as _re_san

    # --- 1. Build verified values set from experiment_summary.json ---
    # BUG-222: After REFINE cycles, merging ALL stage-14* data creates a
    # permissive registry that validates fabricated numbers from regressed
    # iterations. Use ONLY the promoted best data as ground truth.
    # experiment_summary_best.json is written by _promote_best_stage14() and
    # contains the single best iteration's data.
    verified_values: set[float] = set()

    def _richness(path: Path) -> int:
        """Score an experiment_summary.json by how many conditions it has.

        Returns -1 when the file is unreadable or not a JSON object, so
        unparseable candidates always lose to any valid one in max().
        """
        try:
            d = json.loads(path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            return -1
        if not isinstance(d, dict):
            return -1
        conds = d.get("condition_summaries", {})
        metrics = d.get("metrics_summary", {})
        # NOTE(review): assumes both values support len() when present —
        # a scalar here would raise TypeError; confirm against the writer.
        return len(conds) + len(metrics)

    # BUG-222: Prefer experiment_summary_best.json (promoted best iteration).
    # Only fall back to "richest stage-14*" scanning if best.json is missing
    # (single-iteration runs without REFINE).
    _root_best = run_dir / "experiment_summary_best.json"
    if _root_best.exists() and _richness(_root_best) > 0:
        exp_path = _root_best
    else:
        _candidates = list(run_dir.glob("stage-14*/experiment_summary.json"))
        exp_path = max(_candidates, key=_richness) if _candidates else run_dir / "stage-14" / "experiment_summary.json"

    if exp_path.exists():
        try:
            exp_data = json.loads(exp_path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, OSError):
            exp_data = {}

        def _collect_numbers(obj: Any, depth: int = 0) -> None:
            # Recursively harvest every finite numeric leaf into
            # verified_values. Depth cap guards against pathological
            # nesting; bool is excluded because bool is a subclass of int.
            if depth > 10:
                return
            if isinstance(obj, (int, float)) and not isinstance(obj, bool):
                import math as _math_vv
                if _math_vv.isfinite(float(obj)):
                    verified_values.add(float(obj))
            elif isinstance(obj, dict):
                for v in obj.values():
                    _collect_numbers(v, depth + 1)
            elif isinstance(obj, list):
                for v in obj:
                    _collect_numbers(v, depth + 1)

        # Extract from well-known keys only (not the whole document), so
        # bookkeeping fields elsewhere in the summary don't whitelist numbers.
        for key in (
            "metrics_summary",
            "condition_summaries",
            "best_run",
            "condition_metrics",
            "conditions",
            "ablation_results",
        ):
            if key in exp_data:
                _collect_numbers(exp_data[key])

    # BUG-222: Removed BUG-206 refinement_log scanning. The original BUG-206
    # rationale was "Stage 17 injects sandbox metrics, so the sanitizer must
    # recognise them". But that created a loophole: after REFINE regression,
    # the LLM would cite regressed iteration numbers and the sanitizer would
    # pass them because they were in the refinement log. Now that Stage 17
    # also uses only the promoted best data (BUG-222), there is no need to
    # whitelist all sandbox metrics here.

    # Nothing to verify against: bail out with an explanatory report rather
    # than blanking every number in the paper.
    if not verified_values:
        report: dict[str, Any] = {
            "sanitized": False,
            "reason": "no verified values found in experiment_summary.json",
            "tables_processed": 0,
            "numbers_replaced": 0,
        }
        return paper, report

    def _is_verified(num: float) -> bool:
        """Check if num matches any verified value within 1% relative tolerance.

        BUG-R5-20: Also checks percentage/decimal cross-matching (e.g.,
        73.42 in paper vs 0.7342 in experiment, or vice versa).
        """
        for v in verified_values:
            if v == 0.0:
                # Zero has no relative tolerance — use a tiny absolute one.
                if abs(num) < 1e-9:
                    return True
            elif abs(num - v) / abs(v) <= 0.01:
                return True
            # Cross-match: num might be percentage form of v (or vice versa).
            # The `v != 0.0` guards below are redundant (this branch is only
            # reached when v != 0.0) but kept for explicitness.
            elif v != 0.0 and abs(num / 100.0 - v) / abs(v) <= 0.01:
                return True
            elif v != 0.0 and abs(num - v * 100.0) / abs(v * 100.0) <= 0.01:
                return True
        return False

    # --- 2. Find and sanitize markdown tables ---
    # BUG-175: Always-allowed set — common constants, hyperparameters, and
    # structural values that should never be sanitized (matches paper_verifier.py).
_SANITIZER_ALWAYS_ALLOWED: set[float] = { 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0, 0.5, 0.01, 0.001, 0.0001, 0.1, 0.05, 0.95, 0.99, 2024.0, 2025.0, 2026.0, 2027.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0, 2048.0, 224.0, 299.0, 384.0, # Common image sizes # BUG-192: Common hyperparameter values 0.0003, 3e-4, 0.0005, 5e-4, 0.002, 2e-3, # learning rates 0.2, 0.3, 0.25, 0.7, 0.6, 0.8, # clip epsilon, dropout, gradient clip, GCE q, common HP 0.9, 0.999, 0.9999, # Adam betas, momentum 0.02, 0.03, # weight init std 1e-5, 1e-6, 1e-8, # epsilon, weight decay 300.0, 400.0, 500.0, # epochs 4096.0, 8192.0, # larger batch sizes / hidden dims } # Match markdown table blocks (header + separator + data rows) table_pat = _re_san.compile( r"((?:^[ \t]*\|.+\|[ \t]*\n)+" # one or more pipe-delimited lines r")", _re_san.MULTILINE, ) # Match numbers in table cells (integers, decimals, percentages, scientific) # BUG-175: Also exclude hyphen in lookaround to protect method names like # "Cos-200", "StepLR-100" from partial number extraction. # BUG-206: Include Unicode hyphens (U+2010 hyphen, U+2011 non-breaking # hyphen, U+2013 en-dash) — LLMs frequently emit these instead of ASCII # hyphens in model names like "ResNet‑34". # BUG-206: Unicode hyphens placed before escaped ASCII hyphen (\\-) # to avoid creating unintended character ranges in the class. _HYPH = "\u2010\u2011\u2013\\-" # U+2010 + U+2011 + U+2013 + ASCII hyphen num_pat = _re_san.compile( f"(? str: nonlocal numbers_replaced, numbers_kept num_str = m.group(1) pct = m.group(2) try: val = float(num_str) except ValueError: return m.group(0) # BUG-175: Always allow common constants / hyperparameters if val in _SANITIZER_ALWAYS_ALLOWED: numbers_kept += 1 return m.group(0) # BUG-175: Small integer exemption — counts, indices, # epoch numbers, etc. 
(≤ 20 auto-pass) if val == int(val) and abs(val) <= 20: numbers_kept += 1 return m.group(0) if _is_verified(val): numbers_kept += 1 return m.group(0) numbers_replaced += 1 replaced_values.append(num_str + pct) return "---" def _sanitize_table(match: _re_san.Match[str]) -> str: nonlocal numbers_replaced, numbers_kept, tables_processed table_text = match.group(0) lines = table_text.split("\n") # Check if this looks like a results/experiment table # (heuristic: has a separator row with dashes) has_separator = any( _re_san.match(r"^[ \t]*\|[\s:|-]+\|[ \t]*$", line) for line in lines ) if not has_separator: return table_text # BUG-192: Detect hyperparameter/config tables and SKIP sanitization. # These tables contain design choices, not experimental results. _HP_TABLE_KW = { "hyperparameter", "hyper-parameter", "configuration", "config", "setting", "parameter", "learning rate", "lr", "batch size", "optimizer", "architecture", "schedule", "warmup", "decay", "dropout", "weight decay", "momentum", "epsilon", "clip", } # BUG-224: Statistical analysis tables contain derived values # (t-statistics, p-values, effect sizes) that are computed from # the experiment data but never appear in experiment_summary.json. # These tables should NOT be sanitized. 
_STAT_TABLE_KW = { "t-statistic", "t-stat", "t statistic", "p-value", "p value", "paired", "cohen", "effect size", "wilcoxon", "mann-whitney", "statistical", "significance", "confidence interval", } _RESULT_TABLE_KW = { "accuracy", "acc", "loss", "f1", "auroc", "auc", "precision", "recall", "bleu", "rouge", "reward", "return", "rmse", "mae", "mse", "error", "score", "metric", "performance", "improvement", "top-1", "top1", "top-5", "top5", } _header_lower = lines[0].lower() if lines else "" _is_hp_table = any(kw in _header_lower for kw in _HP_TABLE_KW) _is_result_table = any(kw in _header_lower for kw in _RESULT_TABLE_KW) # BUG-224: Statistical analysis tables (t-tests, p-values) contain # derived values that are never in experiment_summary.json. _is_stat_table = any(kw in _header_lower for kw in _STAT_TABLE_KW) if _is_hp_table and not _is_result_table: return table_text # Skip sanitization for HP/config tables if _is_stat_table: return table_text # Skip sanitization for statistical test tables # BUG-184: Per-column HP detection — classify each column header # as HP-type (skip sanitization) or result-type (sanitize). # This handles mixed tables like "| Method | LR | Acc | F1 |" # where LR should be preserved but Acc/F1 are verified. 
        _HP_COL_KW = {
            "lr", "learning rate", "batch", "epoch", "optimizer",
            "schedule", "warmup", "decay", "dropout", "momentum",
            "clip", "epsilon", "eps", "beta", "alpha", "gamma",
            "lambda", "weight decay", "wd", "temperature", "temp",
            "hidden", "dim", "layers", "heads", "steps", "iterations",
            "seed", "patience", "#param", "params", "size", "depth",
            "width", "channels", "kernel", "stride", "padding",
            # BUG-224: Statistical test columns (derived, not in experiment data)
            "t-stat", "t stat", "p-value", "p value", "p-val",
            "cohen", "effect", "ci lower", "ci upper", "difference",
        }
        _hp_cols: set[int] = set()  # column indices that are HP columns
        if lines:
            # Classify columns by the header row's cell text; any header
            # containing an HP keyword marks that column as do-not-sanitize.
            _hdr_cells = lines[0].split("|")
            for _ci, _hc in enumerate(_hdr_cells):
                _hc_low = _hc.strip().lower()
                if any(kw in _hc_low for kw in _HP_COL_KW):
                    _hp_cols.add(_ci)
        tables_processed += 1
        sanitized_lines: list[str] = []
        for i, line in enumerate(lines):
            # Skip header row and separator row — neither carries results.
            is_separator = bool(
                _re_san.match(r"^[ \t]*\|[\s:|-]+\|[ \t]*$", line)
            )
            is_header = i == 0  # first line is typically the header
            if is_separator or is_header:
                sanitized_lines.append(line)
                continue
            # BUG-175: Split by pipe and only sanitize cells after
            # the first data column (which typically contains method
            # names, condition labels, etc.)
            cells = line.split("|")
            sanitized_cells: list[str] = []
            for ci, cell in enumerate(cells):
                # Skip first non-empty cell (method/label column),
                # empty edge cells, and BUG-184 HP-classified columns.
                if ci <= 1 or not cell.strip() or ci in _hp_cols:
                    sanitized_cells.append(cell)
                else:
                    sanitized_cells.append(
                        num_pat.sub(_replace_num, cell)
                    )
            sanitized_lines.append("|".join(sanitized_cells))
        return "\n".join(sanitized_lines)

    sanitized = table_pat.sub(_sanitize_table, paper)

    # --- BUG-211: LaTeX tabular sanitization ---
    # LLMs sometimes write results in LaTeX \begin{tabular} format inside
    # the markdown paper (often within ```latex fences). The markdown
    # table regex above misses these entirely, allowing fabricated numbers
    # to pass through unchecked.
    latex_tab_pat = _re_san.compile(
        r"(\\begin\{tabular\}.*?\\end\{tabular\})",
        _re_san.DOTALL,
    )
    # Keywords for HP-table vs result-table classification (reuse from above)
    _LTX_HP_KW = {
        "hyperparameter", "hyper-parameter", "configuration", "config",
        "setting", "learning rate", "lr", "batch size", "optimizer",
    }
    _LTX_RESULT_KW = {
        "accuracy", "acc", "loss", "f1", "auroc", "auc", "precision",
        "recall", "reward", "score", "metric", "performance", "result",
    }
    # BUG-224: Statistical analysis LaTeX tables — derived values
    _LTX_STAT_KW = {
        "t-statistic", "t-stat", "t statistic", "p-value", "p value",
        "paired", "cohen", "effect size", "statistical", "significance",
    }

    def _sanitize_latex_table(match: _re_san.Match[str]) -> str:
        """Sanitize one ``\\begin{tabular}…\\end{tabular}`` block in place.

        Skips HP/config and statistical-test tables (classified from the
        surrounding ±300 chars of `sanitized`, which at this point is the
        markdown-table-sanitized text); otherwise replaces unverified
        numbers in data rows after the first ``&``-separated cell.
        """
        nonlocal tables_processed
        block = match.group(0)
        # Heuristic: look at the first ~300 chars (column spec + header row)
        # to decide HP vs result table. Also check preceding \caption if
        # the match is part of a \begin{table} environment — we can look
        # backwards a bit in the full text for the caption.
        _start = match.start()
        _context = sanitized[max(0, _start - 300):_start + 300].lower()
        _is_hp = any(kw in _context for kw in _LTX_HP_KW)
        _is_res = any(kw in _context for kw in _LTX_RESULT_KW)
        # BUG-224: Statistical test tables — derived values not in experiment data
        _is_stat = any(kw in _context for kw in _LTX_STAT_KW)
        if _is_hp and not _is_res:
            return block  # HP/config table — skip
        if _is_stat:
            return block  # Statistical analysis table — skip
        tables_processed += 1
        # Split into rows by \\ (LaTeX row separator).
        # We split on \\ but keep the delimiter so we can reconstruct.
        parts = _re_san.split(r"(\\\\)", block)
        result_parts: list[str] = []
        _seen_midrule = False
        for part in parts:
            # Preserve row separators as-is
            if part == "\\\\":
                result_parts.append(part)
                continue
            _stripped = part.strip()
            # Rule lines — no numbers to sanitize
            if _re_san.search(
                r"\\(hline|toprule|midrule|bottomrule|cline|cmidrule)",
                _stripped,
            ):
                if "midrule" in _stripped or "hline" in _stripped:
                    _seen_midrule = True
                result_parts.append(part)
                continue
            # Column spec line (contains \begin{tabular}{...})
            if r"\begin{tabular}" in part:
                result_parts.append(part)
                continue
            # End line
            if r"\end{tabular}" in part:
                result_parts.append(part)
                continue
            # Header row: rows before the first \midrule/\hline
            if not _seen_midrule:
                result_parts.append(part)
                continue
            # Data row — split by & and sanitize cells after the first
            cells = part.split("&")
            sanitized_cells: list[str] = []
            for ci, cell in enumerate(cells):
                if ci == 0:
                    # First cell is method/condition name — preserve
                    sanitized_cells.append(cell)
                else:
                    sanitized_cells.append(num_pat.sub(_replace_num, cell))
            result_parts.append("&".join(sanitized_cells))
        return "".join(result_parts)

    sanitized = latex_tab_pat.sub(_sanitize_latex_table, sanitized)

    # --- Improvement F: Prose-level anti-fabrication ---
    # Scan Results/Experiments sections for inline numeric claims like
    # "achieved 94.2% accuracy" or "obtained an AUROC of 0.87".
    # Replace unverified numbers with "[value removed]".
    prose_numbers_replaced = 0
    _prose_pattern = _re_san.compile(
        r"(?:achiev|obtain|reach|attain|yield|report|record|produc|demonstrat|show|observ)"
        r"(?:ed|es|ing|s)?\s+"
        r"(?:an?\s+)?(?:\w+\s+)?(?:of\s+)?"
        r"(\d+\.?\d*)\s*"
        r"(%|\\%)?",
        _re_san.IGNORECASE,
    )
    # Only process lines in Results/Experiments sections
    _in_results_section = False
    _results_headers = _re_san.compile(
        r"^#{1,3}\s*(Results|Experiments|Experimental|Evaluation|Ablation)",
        _re_san.IGNORECASE,
    )
    _any_header = _re_san.compile(r"^#{1,3}\s+")
    _sanitized_lines = []
    for _line in sanitized.split("\n"):
        # Track whether the current line sits in a Results-like section.
        if _results_headers.match(_line):
            _in_results_section = True
        elif _any_header.match(_line) and _in_results_section:
            # Check if we're leaving Results for a different top-level section
            _header_text = _line.lstrip("#").strip().lower()
            if _header_text and not any(kw in _header_text for kw in ("result", "experiment", "ablation", "evaluation", "comparison")):
                _in_results_section = False
        if _in_results_section and "|" not in _line:  # skip table rows
            # NOTE: redefined per line — behavior-identical, just a closure
            # over prose_numbers_replaced for the current line's sub().
            def _replace_prose_num(m: _re_san.Match[str]) -> str:
                nonlocal prose_numbers_replaced
                num_str = m.group(1)
                try:
                    val = float(num_str)
                except ValueError:
                    return m.group(0)
                # Skip common constants / small integers
                if val in _SANITIZER_ALWAYS_ALLOWED:
                    return m.group(0)
                if val == int(val) and abs(val) <= 20:
                    return m.group(0)
                if _is_verified(val):
                    return m.group(0)
                prose_numbers_replaced += 1
                return m.group(0).replace(num_str + (m.group(2) or ""), "[value removed]")
            _line = _prose_pattern.sub(_replace_prose_num, _line)
        _sanitized_lines.append(_line)
    sanitized = "\n".join(_sanitized_lines)

    # Final machine-readable summary of everything this pass changed.
    report = {
        "sanitized": numbers_replaced > 0 or prose_numbers_replaced > 0,
        "tables_processed": tables_processed,
        "numbers_replaced": numbers_replaced,
        "numbers_kept": numbers_kept,
        "prose_numbers_replaced": prose_numbers_replaced,
        "verified_values_count": len(verified_values),
        "replaced_samples": replaced_values[:20],
        "generated": _utcnow_iso(),
    }
    return sanitized, report


# ---------------------------------------------------------------------------
# BUG-176: Missing citation resolution
# BUG-194: Validate search results to avoid replacing correct entries with
# garbage. Previous code searched by cite-key fragments (e.g.
# "he 2016 deep") which returned completely unrelated papers.
# Fix: (1) consult seminal_papers.yaml first, (2) require title-
# similarity validation for API results, (3) build better queries.
# ---------------------------------------------------------------------------

# Minimum title-similarity between search result and expected title/query
# for a result to be accepted. Prevents "Jokowi and the New Developmentalism"
# from replacing "Deep Residual Learning for Image Recognition".
_CITATION_RESOLVE_MIN_SIMILARITY = 0.30


def _load_seminal_papers_by_key() -> dict[str, dict]:
    """Load seminal_papers.yaml and index by cite_key.

    Returns dict like::

        {"he2016deep": {"title": "Deep Residual Learning...",
                        "authors": "He et al.", ...}, ...}

    Returns empty dict on any failure (missing file, bad YAML, etc.).
    """
    try:
        from researchclaw.data import _load_all as _load_seminal_all
        all_papers = _load_seminal_all()
        # Entries without a cite_key cannot be looked up — drop them.
        return {p["cite_key"]: p for p in all_papers if "cite_key" in p}
    except Exception:  # noqa: BLE001
        return {}


def _seminal_to_bibtex(paper: dict, cite_key: str) -> str:
    """Convert a seminal_papers.yaml entry dict to a BibTeX string.

    Emits @inproceedings when the venue looks like a conference (keyword
    match below), otherwise @article with the venue as the journal.
    Missing title/authors fall back to "Unknown"; missing year/venue to "".
    """
    title = paper.get("title", "Unknown")
    authors = paper.get("authors", "Unknown")
    year = paper.get("year", "")
    venue = paper.get("venue", "")
    # Decide entry type
    venue_lower = (venue or "").lower()
    is_conf = any(kw in venue_lower for kw in (
        "neurips", "nips", "icml", "iclr", "cvpr", "eccv", "iccv",
        "aaai", "acl", "emnlp", "naacl", "sigir", "kdd", "www",
        "ijcai", "conference", "proc", "workshop",
    ))
    if is_conf:
        return (
            f"@inproceedings{{{cite_key},\n"
            f" title = {{{title}}},\n"
            f" author = {{{authors}}},\n"
            f" year = {{{year}}},\n"
            f" booktitle = {{{venue}}},\n"
            f"}}"
        )
    return (
        f"@article{{{cite_key},\n"
        f" title = {{{title}}},\n"
        f" author = {{{authors}}},\n"
        f" year = {{{year}}},\n"
        f" journal = {{{venue}}},\n"
        f"}}"
    )


def _resolve_missing_citations(
    missing_keys: set[str],
    existing_bib:
    str,
) -> tuple[set[str], list[str]]:
    """Try to find BibTeX entries for citation keys not in references.bib.

    Parses each cite_key (e.g. ``hendrycks2017baseline``) into an author
    name and year, then searches academic APIs.

    Returns ``(resolved_keys, new_bib_entries)`` where each entry is a
    complete BibTeX string.

    BUG-194 fix: Three-layer resolution strategy:

    1. **Seminal lookup** — check seminal_papers.yaml (zero API calls,
       exact match)
    2. **API search with validation** — search Semantic Scholar / arXiv,
       but ONLY accept results whose title has ≥ 30% word overlap with
       query terms. Previously any year-matching result was blindly
       accepted, causing foundational papers to be replaced with garbage.
    3. **Skip** — if no confident match, leave the citation unresolved
       rather than inject a wrong paper.

    Gracefully returns empty results on any network failure.
    """
    import re as _re176
    import time as _time176

    resolved: set[str] = set()
    new_entries: list[str] = []

    def _parse_cite_key(key: str) -> tuple[str, str, str]:
        """Extract (author, year, keyword_hint) from a citation key.

        Common patterns:
            ``he2016deep`` → ("he", "2016", "deep")
            ``vaswani2017attention`` → ("vaswani", "2017", "attention")
            ``goodfellow2014generative`` → ("goodfellow", "2014", "generative")

        Keys without a 4-digit year return ``(key, "", "")``.
        """
        m = _re176.match(r"([a-zA-Z]+?)(\d{4})(.*)", key)
        if m:
            return m.group(1), m.group(2), m.group(3)
        return key, "", ""

    def _title_word_overlap(title: str, query_words: list[str]) -> float:
        """Word-overlap score between a paper title and query keywords.

        Returns fraction of query words found in the title (0.0–1.0).
        Used to validate that a search result is actually relevant.
        Comparison is case-insensitive over alphanumeric words only.
        """
        if not query_words:
            return 0.0
        title_lower = set(
            _re176.sub(r"[^a-z0-9\s]", "", title.lower()).split()
        ) - {""}
        if not title_lower:
            return 0.0
        matched = sum(1 for w in query_words if w.lower() in title_lower)
        return matched / len(query_words)

    # --- Layer 1: Seminal papers lookup (no API calls) ---
    seminal_by_key = _load_seminal_papers_by_key()
    for key in sorted(missing_keys):
        # Only add an entry if the key is known AND not already in the bib
        # (substring check against the raw bib text).
        if key in seminal_by_key and key not in existing_bib:
            sp = seminal_by_key[key]
            bib_entry = _seminal_to_bibtex(sp, key)
            new_entries.append(bib_entry)
            resolved.add(key)
            logger.info(
                "BUG-194: Resolved %r via seminal_papers.yaml → %r (%s)",
                key,
                sp.get("title", "")[:60],
                sp.get("year", ""),
            )

    # Remaining keys that weren't in the seminal database AND aren't already
    # present in the existing bib (no point re-resolving keys we already have).
    remaining = sorted(
        k for k in (missing_keys - resolved) if k not in existing_bib
    )
    if not remaining:
        return resolved, new_entries

    # --- Layer 2: API search with title-similarity validation ---
    try:
        from researchclaw.literature.search import search_papers
    except ImportError:
        logger.debug("BUG-176: literature.search not available, skipping resolution")
        return resolved, new_entries

    for key in remaining:
        author, year, hint = _parse_cite_key(key)
        if not author or not year:
            continue
        # BUG-194: Build a better search query.
        # Instead of "he 2016 deep", use "he deep residual learning 2016" or
        # at minimum, split camelCase hints into separate words.
        # Split hint on word boundaries (camelCase or underscore).
        hint_words = _re176.findall(r"[a-zA-Z]+", hint) if hint else []
        # The query words used for validation
        query_words = [author] + hint_words
        # Build search query: author + hint words + year (year helps but isn't
        # the primary discriminator anymore)
        query_parts = [author] + hint_words + [year]
        query = " ".join(query_parts)
        try:
            results = search_papers(query, limit=5, deduplicate=True)
        except Exception as exc:
            logger.debug("BUG-176: Search failed for %r: %s", key, exc)
            continue
        if not results:
            logger.debug(
                "BUG-194: No search results for %r (query=%r), skipping",
                key,
                query,
            )
            continue
        # BUG-194: Find best match by title-word-overlap AND year match.
        # Previously the code just took the first year-matching result.
        best = None
        best_score = -1.0
        for paper in results:
            overlap = _title_word_overlap(paper.title, query_words)
            year_bonus = 0.2 if str(paper.year) == year else 0.0
            # Also give bonus for author name appearing in paper.authors
            author_bonus = 0.0
            if any(author.lower() in a.name.lower() for a in paper.authors):
                author_bonus = 0.2
            score = overlap + year_bonus + author_bonus
            if score > best_score:
                best_score = score
                best = paper
        if best is None:
            continue
        # BUG-194: Validate the result — require minimum similarity.
        # This is the KEY fix: previously ANY result was accepted blindly.
overlap = _title_word_overlap(best.title, query_words) if overlap < _CITATION_RESOLVE_MIN_SIMILARITY: logger.info( "BUG-194: Rejecting search result for %r — title %r has " "too-low overlap (%.2f < %.2f) with query words %r", key, best.title[:60], overlap, _CITATION_RESOLVE_MIN_SIMILARITY, query_words, ) continue # Year must also match (or be within 1 year — sometimes conferences # vs arXiv preprint have different years) if year and best.year: year_diff = abs(int(year) - int(best.year)) if year_diff > 1: logger.info( "BUG-194: Rejecting search result for %r — year mismatch " "(%s vs %s, diff=%d)", key, year, best.year, year_diff, ) continue # Generate BibTeX with the ORIGINAL cite_key (so \cite{key} works) bib_entry = best.to_bibtex() # Replace the auto-generated cite_key with the one used in the paper orig_key_match = _re176.match(r"@(\w+)\{([^,]+),", bib_entry) if orig_key_match: bib_entry = bib_entry.replace( f"@{orig_key_match.group(1)}{{{orig_key_match.group(2)},", f"@{orig_key_match.group(1)}{{{key},", 1, ) # Verify entry doesn't duplicate an existing key if key not in existing_bib: new_entries.append(bib_entry) resolved.add(key) logger.info( "BUG-194: Resolved %r via API → %r (%s, overlap=%.2f)", key, best.title[:60], best.year, overlap, ) else: logger.debug( "BUG-194: Key %r already in bib, skipping API result", key, ) # Rate limit: 0.5s between API calls _time176.sleep(0.5) return resolved, new_entries # --------------------------------------------------------------------------- # Stage 22: Export & Publish # --------------------------------------------------------------------------- def _execute_export_publish( stage_dir: Path, run_dir: Path, config: RCConfig, adapters: AdapterBundle, *, llm: LLMClient | None = None, prompts: PromptManager | None = None, ) -> StageResult: revised = _read_prior_artifact(run_dir, "paper_revised.md") or "" if llm is not None: _pm = prompts or PromptManager() _overlay = _get_evolution_overlay(run_dir, "export_publish") sp = 
_pm.for_stage("export_publish", evolution_overlay=_overlay, revised=revised) resp = _chat_with_prompt( llm, sp.system, sp.user, json_mode=sp.json_mode, max_tokens=sp.max_tokens, ) final_paper = resp.content # Content guard: reject LLM output that truncates the paper if revised and len(final_paper) < 0.6 * len(revised): logger.warning( "Stage 22: LLM output is %.0f%% of input length — using original", 100 * len(final_paper) / max(len(revised), 1), ) final_paper = revised else: final_paper = revised if not final_paper.strip(): final_paper = "# Final Paper\n\nNo content generated." # --- Always-on fabrication sanitization (Phase 1 anti-fabrication) --- # Back up pre-sanitized version (stage_dir / "paper_presanitized.md").write_text( final_paper, encoding="utf-8" ) # Sanitize unverified data in tables — always-on, not just degraded mode final_paper, _san_report = _sanitize_fabricated_data( final_paper, run_dir ) (stage_dir / "sanitization_report.json").write_text( json.dumps(_san_report, indent=2), encoding="utf-8" ) if _san_report.get("numbers_replaced", 0) > 0: logger.info( "Stage 22: Fabrication sanitization — %d numbers replaced, %d kept", _san_report.get("numbers_replaced", 0), _san_report.get("numbers_kept", 0), ) # Graceful degradation: insert notice only when quality gate was degraded _degradation_signal_path = run_dir / "degradation_signal.json" if _degradation_signal_path.exists(): try: _deg_signal = json.loads( _degradation_signal_path.read_text(encoding="utf-8") ) except (json.JSONDecodeError, OSError): _deg_signal = {} # Insert degradation notice after abstract _deg_score = _deg_signal.get("score", "N/A") _deg_threshold = _deg_signal.get("threshold", "N/A") _deg_notice = ( "\n\n> **Note:** This paper was produced in degraded mode. " f"Quality gate score ({_deg_score}/{_deg_threshold}) was below " "threshold. 
Unverified numerical results in tables have been " "replaced with `---` and require independent verification.\n\n" ) # Try to insert after ## Abstract section _abstract_markers = ["## Abstract\n", "# Abstract\n"] _notice_inserted = False for _marker in _abstract_markers: if _marker in final_paper: _marker_end = final_paper.index(_marker) + len(_marker) # Find the end of the abstract paragraph _next_section = final_paper.find("\n## ", _marker_end) _next_heading = final_paper.find("\n# ", _marker_end) _insert_pos = min( p for p in (_next_section, _next_heading) if p > 0 ) if any(p > 0 for p in (_next_section, _next_heading)) else len(final_paper) final_paper = ( final_paper[:_insert_pos] + _deg_notice + final_paper[_insert_pos:] ) _notice_inserted = True break if not _notice_inserted: # Fallback: prepend to paper final_paper = _deg_notice + final_paper logger.info( "Stage 22: Applied degraded-mode notice (score=%s, threshold=%s)", _deg_score, _deg_threshold, ) # IMP-3: Deduplicate "due to computational constraints" — keep at most 1 import re as _re_imp3 _CONSTRAINT_PAT = _re_imp3.compile( r"[Dd]ue to computational constraints", _re_imp3.IGNORECASE ) _matches = list(_CONSTRAINT_PAT.finditer(final_paper)) if len(_matches) > 1: # Keep only the first occurrence; remove subsequent ones by # deleting the enclosing sentence. 
        # Iterate in reverse so earlier match offsets stay valid while we
        # delete text from final_paper.
        for m in reversed(_matches[1:]):
            # Find sentence boundaries around the match
            start = final_paper.rfind(".", 0, m.start())
            start = start + 1 if start >= 0 else m.start()
            end = final_paper.find(".", m.end())
            end = end + 1 if end >= 0 else m.end()
            sentence = final_paper[start:end].strip()
            if sentence:
                final_paper = final_paper[:start] + final_paper[end:]
        # Collapse runs of horizontal whitespace left behind by the
        # sentence deletions (newlines preserved).
        # NOTE(review): uses module-level `re`, not `_re_imp3` — confirm
        # `re` is imported at file top.
        final_paper = re.sub(r"[^\S\n]{2,}", " ", final_paper)
        logger.info(
            "Stage 22: Removed %d duplicate 'computational constraints' "
            "disclaimers",
            len(_matches) - 1,
        )

    # IMP-19 Layer 2: Ensure at least figures are referenced in the paper
    import re as _re_fig
    chart_files = []
    # BUG-215: Also search stage-14* versioned dirs (stage-14_v1, etc.)
    # in case stage-14/ was renamed and never recreated.
    _chart_search_dirs = [stage_dir / "charts", run_dir / "stage-14" / "charts"]
    for _s14_charts in sorted(run_dir.glob("stage-14*/charts"), reverse=True):
        if _s14_charts not in _chart_search_dirs:
            _chart_search_dirs.append(_s14_charts)
    for _chart_src_dir in _chart_search_dirs:
        if _chart_src_dir.is_dir():
            chart_files.extend(sorted(_chart_src_dir.glob("*.png")))
    # BUG-190: Also inject charts not already referenced in the paper.
    # The old condition only fired when NO figures were present. Now we
    # filter to only unreferenced charts, so partially-illustrated papers
    # also get the remaining charts injected.
_already_referenced = set() for _cf in chart_files: if _cf.name in final_paper: _already_referenced.add(_cf.name) chart_files = [cf for cf in chart_files if cf.name not in _already_referenced] if chart_files: # Distribute figures to relevant sections based on filename keywords _fig_placement: dict[str, list[str]] = { "method": [], # architecture, method, model, pipeline diagrams "result": [], # experiment, comparison, ablation charts "intro": [], # concept, overview, illustration } _fig_counter = len(_already_referenced) # start numbering after existing figs for cf in chart_files[:6]: _fig_counter += 1 stem_lower = cf.stem.lower() label = cf.stem.replace("_", " ").title() fig_md = f"![Figure {_fig_counter}: {label}](charts/{cf.name})" if any(k in stem_lower for k in ("architecture", "model", "pipeline", "method", "flowchart")): _fig_placement["method"].append(fig_md) elif any(k in stem_lower for k in ("experiment", "comparison", "ablation", "result", "metric")): _fig_placement["result"].append(fig_md) elif any(k in stem_lower for k in ("concept", "overview", "illustration", "threat", "attack")): _fig_placement["intro"].append(fig_md) else: _fig_placement["result"].append(fig_md) # default to results # Insert figures at relevant section boundaries. # BUG-200: Match both H1 (#) and H2 (##) headings — LLMs generate # either level depending on the writing_structure prompt. _section_markers = { "method": ["# Method", "## Method", "# Methodology", "## Methodology", "# Approach", "## Approach", "# Framework", "## Framework", "## 3. Method", "## 3 Method"], "result": ["# Results", "## Results", "# Experiments", "## Experiments", "# Evaluation", "## Evaluation", "## 5. Results", "## 4. Experiments", "## 5 Results"], "intro": ["# Related Work", "## Related Work", "# Background", "## Background", "## 2. 
Related", "## 2 Related Work"], } _total_inserted = 0 for category, figs in _fig_placement.items(): if not figs: continue fig_block = "\n\n" + "\n\n".join(figs) + "\n\n" inserted = False for marker in _section_markers.get(category, []): if marker in final_paper: # Insert BEFORE the marker section (so figure appears at end of previous section) final_paper = final_paper.replace(marker, fig_block + marker, 1) inserted = True _total_inserted += len(figs) break if not inserted: # Fallback: insert before Conclusion/Limitations/Discussion for fallback in ["# Conclusion", "## Conclusion", "# Limitations", "## Limitations", "# Discussion", "## Discussion"]: if fallback in final_paper: final_paper = final_paper.replace(fallback, fig_block + fallback, 1) inserted = True _total_inserted += len(figs) break if not inserted: # BUG-200: Last resort — insert before closing fence marker # rather than appending after it (which puts content outside # the markdown fence and gets dropped by converter). _fence_end = final_paper.rfind("\n```") if _fence_end > 0: final_paper = ( final_paper[:_fence_end] + fig_block + final_paper[_fence_end:] ) else: final_paper += fig_block _total_inserted += len(figs) logger.info( "IMP-19: Injected %d figure references into paper_final.md (distributed across sections)", _total_inserted, ) # IMP-24: Detect excessive number repetition _numbers_found = _re_fig.findall(r"\b\d+\.\d{2,}\b", final_paper) _num_counts = Counter(_numbers_found) _repeated = {n: c for n, c in _num_counts.items() if c > 3} if _repeated: logger.warning( "IMP-24: Numbers repeated >3 times: %s", _repeated, ) (stage_dir / "paper_final.md").write_text(final_paper, encoding="utf-8") # --- Legacy fabrication sanitization (disabled — superseded by Phase 1 _sanitize_fabricated_data above) --- # Kept but guarded: Phase 1 always-on sanitization handles this now. # Only run if Phase 1 was somehow skipped (should never happen). 
_fab_flags_text = _read_prior_artifact(run_dir, "fabrication_flags.json") or "" _fab_flags = _safe_json_loads(_fab_flags_text, {}) if _fab_flags_text else {} if ( isinstance(_fab_flags, dict) and _fab_flags.get("fabrication_suspected") and _san_report.get("numbers_replaced", 0) == 0 # Phase 1 didn't run/replace ): import re as _re_fab _real_vals = set() for rv in _fab_flags.get("real_metric_values", []): if isinstance(rv, (int, float)) and math.isfinite(rv): _real_vals.add(str(round(rv, 4))) _real_vals.add(str(round(rv, 2))) _real_vals.add(str(round(rv, 1))) if rv == int(rv): _real_vals.add(str(int(rv))) def _sanitize_number(m: _re_fab.Match) -> str: # type: ignore[name-defined] """Replace fabricated numbers with '--' but keep real ones.""" num_str = m.group(0) # Keep the number if it matches any known real metric value try: num_val = float(num_str) if not math.isfinite(num_val): return "--" rounded_strs = { str(round(num_val, 4)), str(round(num_val, 2)), str(round(num_val, 1)), *( [str(int(num_val))] if num_val == int(num_val) else [] ), } if rounded_strs & _real_vals: return num_str # real value — keep it except (ValueError, OverflowError): return num_str return "--" # Only sanitize numbers in Results/Experiments/Evaluation/Ablation sections _result_section_pat = _re_fab.compile( r"(##\s*(?:\d+\.?\s*)?(?:Results|Experiments|Evaluation|Ablation" r"|Experimental Results|Quantitative).*?)(?=\n##\s|\Z)", _re_fab.DOTALL | _re_fab.IGNORECASE, ) _sanitized_count = 0 def _sanitize_section(sec_match: _re_fab.Match) -> str: # type: ignore[name-defined] nonlocal _sanitized_count section_text = sec_match.group(0) # Replace decimal numbers (e.g., 73.42, 0.891) but NOT integers # that are likely structural (year, section number, figure number) def _replace_in_section(m: _re_fab.Match) -> str: # type: ignore[name-defined] nonlocal _sanitized_count result = _sanitize_number(m) if result == "--": _sanitized_count += 1 return result return _re_fab.sub( r"\b\d+\.\d{1,6}\b", 
_replace_in_section, section_text ) final_paper = _result_section_pat.sub(_sanitize_section, final_paper) if _sanitized_count > 0: logger.warning( "Stage 22: Fabrication sanitization — blanked %d unsupported " "numbers in Results sections (experiment had no real metrics)", _sanitized_count, ) # Rewrite the sanitized paper (stage_dir / "paper_final.md").write_text( final_paper, encoding="utf-8" ) # Initialize artifacts list artifacts = ["paper_final.md"] # F2.7: Post-process citations — [cite_key] → \cite{cite_key} # and copy final references.bib to export stage _ay_map: dict[str, str] = {} # BUG-102: author-year → cite_key map bib_text = _read_prior_artifact(run_dir, "references.bib") if bib_text: # Replace [cite_key] patterns in the final paper with \cite{cite_key} # Collect all valid cite_keys from the bib file import re as _re valid_keys = set(_re.findall(r"@\w+\{([^,]+),", bib_text)) # BUG-102: Recover author-year citations → [cite_key] format. # When Stage 19 (paper_revision) converts [cite_key] to [Author et al., 2024], # the downstream regex can't match them. Build a reverse map from bib entries. def _build_author_year_map(bib: str, keys: set[str]) -> dict[str, str]: """Build mapping from author-year patterns to cite_keys. 
Returns dict like: "Raissi et al., 2019" → "raissi2019physicsinformed" "Tavella and Randall, 2000" → "tavella2000pricing" """ mapping: dict[str, str] = {} # Parse each bib entry for author + year # BUG-DA8-17: Allow newline OR whitespace before closing brace # Use \n} or just } at start-of-line to avoid greedy cross-entry match entry_pat = _re.compile( r"@\w+\{([^,]+),\s*(.*?)(?:\n\}|^[ \t]*\})", _re.DOTALL | _re.MULTILINE ) for m in entry_pat.finditer(bib): key = m.group(1).strip() if key not in keys: continue body = m.group(2) # Extract author field author_m = _re.search( r"author\s*=\s*[\{\"](.*?)[\}\"]", body, _re.IGNORECASE ) year_m = _re.search( r"year\s*=\s*[\{\"]?(\d{4})[\}\"]?", body, _re.IGNORECASE ) if not author_m or not year_m: continue author_raw = author_m.group(1).strip() year = year_m.group(1) # Parse author names (split on " and ") authors = [a.strip() for a in _re.split(r"\s+and\s+", author_raw)] # Extract last names last_names = [] for a in authors: if "," in a: last_names.append(a.split(",")[0].strip()) else: parts = a.split() last_names.append(parts[-1] if parts else a) if not last_names: continue # Generate author-year patterns: # 1 author: "Smith, 2024" # 2 authors: "Smith and Jones, 2024" # 3+ authors: "Smith et al., 2024" if len(last_names) == 1: patterns = [f"{last_names[0]}, {year}"] elif len(last_names) == 2: patterns = [ f"{last_names[0]} and {last_names[1]}, {year}", f"{last_names[0]} \\& {last_names[1]}, {year}", ] else: patterns = [ f"{last_names[0]} et al., {year}", f"{last_names[0]} et al. 
{year}", ] # Also add "Smith and Jones, 2024" for first two authors patterns.append( f"{last_names[0]} and {last_names[1]}, {year}" ) for pat in patterns: mapping[pat] = key return mapping _ay_map = _build_author_year_map(bib_text, valid_keys) if _ay_map: # Count how many author-year citations exist in the paper _ay_found = 0 for _ay_pat in _ay_map: if _ay_pat in final_paper: _ay_found += 1 if _ay_found > 0: logger.info( "Stage 22: Found %d author-year citation patterns — " "converting back to [cite_key] format.", _ay_found, ) # Sort by longest pattern first to avoid partial matches for _ay_pat in sorted(_ay_map, key=len, reverse=True): _ay_key = _ay_map[_ay_pat] # Match [Author et al., 2024] or [Author and Jones, 2024; ...] # Handle single-citation brackets final_paper = final_paper.replace( f"[{_ay_pat}]", f"[{_ay_key}]" ) # Handle within multi-citation brackets [A et al., 2020; B et al., 2021] # Replace the author-year segment only inside [...] brackets final_paper = _re.sub( r'\[([^\]]*?)' + _re.escape(_ay_pat) + r'([^\]]*?)\]', lambda _m: '[' + _m.group(1) + _ay_key + _m.group(2) + ']', final_paper, ) # Fix multi-key brackets: [key1; key2] → [key1, key2] # (author-year uses semicolons, cite-keys use commas) def _fix_semicolon_cites(m_sc: _re.Match[str]) -> str: inner = m_sc.group(1) # Only convert if ALL segments look like cite keys parts = [p.strip() for p in inner.split(";")] _ck = r"[a-zA-Z][a-zA-Z0-9_-]*\d{4}[a-zA-Z0-9_]*" if all(_re.fullmatch(_ck, p) for p in parts): return "[" + ", ".join(parts) + "]" return m_sc.group(0) final_paper = _re.sub( r"\[([^\]]+;[^\]]+)\]", _fix_semicolon_cites, final_paper ) (stage_dir / "paper_final.md").write_text( final_paper, encoding="utf-8" ) # R10-Fix4: Citation cross-validation # BUG-187: Also parse multi-key brackets like [key1, key2, key3]. # The old regex only matched single-key brackets [key2020word]. 
_cite_key_pat = r"[a-zA-Z]+\d{4}[a-zA-Z0-9_-]*" cited_keys_in_paper: set[str] = set() # Single-key brackets for m in _re.finditer(rf"\[({_cite_key_pat})\]", final_paper): cited_keys_in_paper.add(m.group(1)) # Multi-key brackets [key1, key2] or [key1; key2] for m in _re.finditer(r"\[([^\]]{10,300})\]", final_paper): inner = m.group(1) # Only parse if it looks like citation keys (has year-like digits) parts = _re.split(r"[,;]\s*", inner) if all(_re.fullmatch(_cite_key_pat, p.strip()) for p in parts if p.strip()): for p in parts: if p.strip(): cited_keys_in_paper.add(p.strip()) if valid_keys and cited_keys_in_paper: invalid_keys = cited_keys_in_paper - valid_keys if invalid_keys: logger.warning( "Stage 22: Found %d citation keys in paper not in references.bib: %s", len(invalid_keys), ", ".join(sorted(invalid_keys)[:20]), ) # BUG-176: Try to resolve missing citations before removing them. # Parse cite_key → search query, look up via academic APIs, # and add found entries to references.bib. resolved_keys: set[str] = set() new_bib_entries: list[str] = [] if len(invalid_keys) <= 30: # Sanity: don't flood APIs resolved_keys, new_bib_entries = _resolve_missing_citations( invalid_keys, bib_text ) if resolved_keys: valid_keys.update(resolved_keys) bib_text += "\n" + "\n\n".join(new_bib_entries) + "\n" logger.info( "Stage 22: Resolved %d/%d missing citations via API lookup", len(resolved_keys), len(invalid_keys), ) still_invalid = invalid_keys - resolved_keys if still_invalid: # IMP-29: Remove remaining unresolvable citations from # BOTH single-key and multi-key brackets. 
import re as _re_imp29 for bad_key in still_invalid: # Remove single-key brackets final_paper = final_paper.replace(f"[{bad_key}]", "") # Remove from multi-key brackets: [good, BAD, good] → [good, good] def _remove_from_multi(m: _re.Match) -> str: inner = m.group(1) parts = [p.strip() for p in _re.split(r"[,;]\s*", inner)] filtered = [p for p in parts if p != bad_key] if not filtered: return "" return "[" + ", ".join(filtered) + "]" final_paper = _re_imp29.sub( r"\[([^\]]*\b" + _re.escape(bad_key) + r"\b[^\]]*)\]", _remove_from_multi, final_paper, ) # Clean up whitespace artifacts from removed citations final_paper = _re_imp29.sub(r" +", " ", final_paper) final_paper = _re_imp29.sub(r" ([.,;:)])", r"\1", final_paper) (stage_dir / "paper_final.md").write_text(final_paper, encoding="utf-8") if still_invalid: (stage_dir / "invalid_citations.json").write_text( json.dumps(sorted(still_invalid), indent=2), encoding="utf-8" ) artifacts.append("invalid_citations.json") if resolved_keys: (stage_dir / "resolved_citations.json").write_text( json.dumps(sorted(resolved_keys), indent=2), encoding="utf-8" ) artifacts.append("resolved_citations.json") final_paper_latex = final_paper # default: no citation conversion if valid_keys: _CITE_KEY_PAT = r"[a-zA-Z][a-zA-Z0-9_-]*\d{4}[a-zA-Z0-9]*" # Step 1: Convert multi-key brackets [key1, key2] → \cite{key1, key2} def _replace_multi_cite(m: _re.Match[str]) -> str: keys = [k.strip() for k in m.group(1).split(",")] matched = [k for k in keys if k in valid_keys] if matched: return "\\cite{" + ", ".join(matched) + "}" return m.group(0) final_paper_latex = _re.sub( rf"\[({_CITE_KEY_PAT}(?:\s*,\s*{_CITE_KEY_PAT})+)\]", _replace_multi_cite, final_paper, ) # Step 2: Convert single-key brackets [key] → \cite{key} def _replace_cite(m: _re.Match[str]) -> str: key = m.group(1) if key in valid_keys: return f"\\cite{{{key}}}" return m.group(0) final_paper_latex = _re.sub( rf"\[({_CITE_KEY_PAT})\]", _replace_cite, final_paper_latex ) # Step 3: Merge 
adjacent \cite{a} \cite{b} → \cite{a, b} def _merge_adjacent_cites(m: _re.Match[str]) -> str: keys = _re.findall(r"\\cite\{([^}]+)\}", m.group(0)) return "\\cite{" + ", ".join(keys) + "}" final_paper_latex = _re.sub( r"\\cite\{[^}]+\}(?:\s*\\cite\{[^}]+\})+", _merge_adjacent_cites, final_paper_latex, ) (stage_dir / "paper_final_latex.md").write_text( final_paper_latex, encoding="utf-8" ) artifacts.append("paper_final_latex.md") # IMP-1: Prune uncited bibliography entries — keep only keys # that actually appear in the paper text (bracket or \cite form). if valid_keys: _all_cited: set[str] = set() # Bracket-format citations [key] _all_cited.update( _re.findall(r"\[([a-zA-Z]+\d{4}[a-zA-Z0-9_-]*)\]", final_paper) ) # \cite{key, key2} format (original + latex-converted) for _src in ( final_paper, final_paper_latex, ): for _cm in _re.finditer(r"\\cite\{([^}]+)\}", _src): _all_cited.update( k.strip() for k in _cm.group(1).split(",") ) uncited_keys = valid_keys - _all_cited if uncited_keys: bib_text = _remove_bibtex_entries(bib_text, uncited_keys) logger.info( "Stage 22: Pruned %d uncited bibliography entries " "(kept %d)", len(uncited_keys), len(valid_keys) - len(uncited_keys), ) # Write final references.bib (stage_dir / "references.bib").write_text(bib_text, encoding="utf-8") artifacts.append("references.bib") logger.info( "Stage 22: Exported references.bib with %d entries", len(valid_keys) if valid_keys else 0, ) # Conference template: generate .tex file try: from researchclaw.templates import get_template, markdown_to_latex tpl = get_template(config.export.target_conference) # Use the latex-citation-processed version if available tex_source = final_paper_latex # Append NeurIPS-style checklist if target is a ML conference if tpl.name in ("neurips_2024", "neurips_2025", "icml_2025", "icml_2026", "iclr_2025", "iclr_2026"): _has_exp = bool(_read_prior_artifact(run_dir, "experiment_summary.json")) _checklist = _generate_neurips_checklist( has_experiments=_has_exp, 
has_code=True, ) if "NeurIPS Paper Checklist" not in tex_source: tex_source = tex_source.rstrip() + "\n\n" + _checklist _t = _extract_paper_title(tex_source) tex_content = markdown_to_latex( tex_source, tpl, title=_t if _t != "Untitled Paper" else "", authors=config.export.authors, bib_file=config.export.bib_file, bib_entries=_ay_map or None, ) (stage_dir / "paper.tex").write_text(tex_content, encoding="utf-8") artifacts.append("paper.tex") logger.info( "Stage 22: Generated paper.tex for %s (%d chars)", tpl.display_name, len(tex_content), ) # --- Phase 1 anti-fabrication: verify paper against VerifiedRegistry --- _vresult = None # BUG-DA8-04: Initialize before try to avoid fragile dir() check try: from researchclaw.pipeline.paper_verifier import verify_paper as _verify_paper # BUG-222: Use best_only=True to validate against promoted best data only from researchclaw.pipeline.verified_registry import ( VerifiedRegistry as _VR22, ) _vr22 = _VR22.from_run_dir( run_dir, metric_direction=config.experiment.metric_direction, best_only=True, ) if _vr22.values: _vresult = _verify_paper(tex_content, _vr22) (stage_dir / "paper_verification.json").write_text( json.dumps({ "passed": _vresult.passed, "severity": _vresult.severity, "total_checked": _vresult.total_numbers_checked, "total_verified": _vresult.total_numbers_verified, "strict_violations": _vresult.strict_violations, "lenient_violations": _vresult.lenient_violations, "fabrication_rate": round(_vresult.fabrication_rate, 4), "unverified_numbers": [ {"value": u.value, "line": u.line_number, "section": u.section, "in_table": u.in_table} for u in _vresult.unverified_numbers[:20] ], "fabricated_conditions": [ {"name": fc.name, "line": fc.line_number} for fc in _vresult.fabricated_conditions ], "config_warnings": getattr(_vresult, "config_warnings", []), "summary": _vresult.summary, }, indent=2), encoding="utf-8", ) logger.info( "Stage 22: Paper verification — %s (%d checked, %d verified, " "%d strict violations, 
fabrication_rate=%.1f%%)", _vresult.severity, _vresult.total_numbers_checked, _vresult.total_numbers_verified, _vresult.strict_violations, _vresult.fabrication_rate * 100, ) except Exception as _pv_exc: logger.debug("Stage 22: Paper verification skipped: %s", _pv_exc) # BUG-23 P1: Enforce REJECT verdict — sanitize unverified numbers if _vresult is not None and getattr(_vresult, "severity", None) == "REJECT": logger.warning( "Stage 22: Paper REJECTED by verifier (fabrication_rate=%.1f%%, " "%d strict violations). Sanitizing unverified numbers.", _vresult.fabrication_rate * 100, _vresult.strict_violations, ) # Replace unverified numbers in strict sections/tables with "---" import re as _re_san2 # BUG-R49-02: Section names that sound like results but are # actually protocol/setup sections should NOT trigger strict # sanitization. Exempt sections containing "dataset", "setup", # "protocol", "hyperparameter", or "implementation". _STRICT_EXEMPT_KW = {"dataset", "setup", "protocol", "hyperparameter", "implementation", "hardware", "infrastructure"} _sanitized_tex = tex_content _san2_count = 0 for _uv in sorted(_vresult.unverified_numbers, key=lambda u: -u.line_number): # Only sanitize strict-section / in-table numbers _uv_section_lower = (_uv.section or "").lower() _uv_is_strict = any( s in _uv_section_lower for s in ("results", "experiment", "evaluation", "ablation", "comparison", "analysis") ) # BUG-R49-02: Exempt protocol/setup sections from strict mode if _uv_is_strict and any( kw in _uv_section_lower for kw in _STRICT_EXEMPT_KW ): _uv_is_strict = False if _uv_is_strict or _uv.in_table: _lines = _sanitized_tex.split("\n") if 0 < _uv.line_number <= len(_lines): _orig_line = _lines[_uv.line_number - 1] # BUG-R49-01: Use word-boundary regex instead of # naive substring matching to avoid replacing numbers # inside identifiers (e.g. "18" in "ResNet18"). 
# BUG-206: Include ASCII hyphen and Unicode hyphens # (U+2010 hyphen, U+2011 non-breaking hyphen, # U+2013 en-dash) so that model variant numbers # like "34" in "ResNet-34" or "ResNet‑34" are not # mistaken for unverified experimental values. # BUG-210: Include period (.) so that fractional # parts of decimals in condition names like # "ema_decay_0.9" are not treated as standalone # numbers (prevents "0.9" → "0.---"). _BOUNDARY = "A-Za-z0-9_\u2010\u2011\u2013\\-." for _rep in ( f"{_uv.value:.4f}".rstrip("0").rstrip("."), f"{_uv.value:.3f}", f"{_uv.value:.2f}", f"{_uv.value:.1f}", f"{_uv.value:g}", str(_uv.value), ): # Word boundary: number must NOT be adjacent to # alphanumeric, underscore, or hyphen on either side. _pat = ( rf"(? _page_limit: logger.warning( "BUG-27: Paper is %d pages (limit %d). " "Consider tightening content in revision.", _qc.page_count, _page_limit, ) except Exception as _qc_exc: # noqa: BLE001 logger.debug("Stage 22: Quality checks skipped: %s", _qc_exc) else: logger.warning("Stage 22: LaTeX compilation verification FAILED: %s", _compile_result.errors[:3]) # Add compilation failure comment to .tex _tex_path = stage_dir / "paper.tex" if _tex_path.exists(): _tex_content = _tex_path.read_text(encoding="utf-8") if "% WARNING: Compilation failed" not in _tex_content: _tex_content = ( "% WARNING: Compilation failed. Errors:\n" + "".join(f"% {e}\n" for e in _compile_result.errors[:5]) + _tex_content ) _tex_path.write_text(_tex_content, encoding="utf-8") except Exception as _compile_exc: # noqa: BLE001 logger.debug("Stage 22: Compile verification skipped: %s", _compile_exc) except Exception as exc: # noqa: BLE001 logger.error("LaTeX generation failed: %s", exc, exc_info=True) # (Charts, BUG-99 path fix, and remove_missing_figures are now handled # BEFORE compile_latex() — see "Pre-compilation" block above.) 
# --- Code packaging: multi-file directory or single file --- exp_final_dir_path = _read_prior_artifact(run_dir, "experiment_final/") if exp_final_dir_path and Path(exp_final_dir_path).is_dir(): import ast code_dir = stage_dir / "code" code_dir.mkdir(parents=True, exist_ok=True) all_code_combined = "" code_file_names: list[str] = [] for src in sorted(Path(exp_final_dir_path).glob("*.py")): (code_dir / src.name).write_bytes(src.read_bytes()) all_code_combined += src.read_text(encoding="utf-8") + "\n" code_file_names.append(src.name) # Detect dependencies from all files detected: set[str] = set() known_packages = { "numpy": "numpy", "torch": "torch", "tensorflow": "tensorflow", "sklearn": "scikit-learn", "scikit-learn": "scikit-learn", "scipy": "scipy", "pandas": "pandas", "matplotlib": "matplotlib", "seaborn": "seaborn", "transformers": "transformers", "datasets": "datasets", "jax": "jax", } try: tree = ast.parse(all_code_combined) for node in ast.walk(tree): if isinstance(node, ast.Import): for alias in node.names: top = alias.name.split(".")[0] if top in known_packages: detected.add(known_packages[top]) elif isinstance(node, ast.ImportFrom) and node.module: top = node.module.split(".")[0] if top in known_packages: detected.add(known_packages[top]) except SyntaxError: pass requirements = sorted(detected) (code_dir / "requirements.txt").write_text( "\n".join(requirements) + ("\n" if requirements else ""), encoding="utf-8", ) paper_title = _extract_paper_title(final_paper) file_list_md = "\n".join(f"- `{f}`" for f in code_file_names) readme = ( f"# Code Package for {paper_title}\n\n" "## Description\n" "This directory contains the experiment project used for the paper.\n\n" "## Project Files\n" f"{file_list_md}\n\n" "## How to Run\n" "`python main.py`\n\n" "## Dependencies\n" "Install dependencies with `pip install -r requirements.txt` if needed.\n" ) (code_dir / "README.md").write_text(readme, encoding="utf-8") artifacts.append("code/") logger.info( "Stage 22: 
Packaged multi-file code release (%d files, %d deps)", len(code_file_names), len(requirements), ) else: # Backward compat: single-file packaging code_payload = _read_prior_artifact(run_dir, "experiment_final.py") if not code_payload: code_payload = _read_prior_artifact(run_dir, "experiment.py") if code_payload: import ast code_dir = stage_dir / "code" code_dir.mkdir(parents=True, exist_ok=True) (code_dir / "experiment.py").write_text(code_payload, encoding="utf-8") detected_single: set[str] = set() known_packages_single = { "numpy": "numpy", "torch": "torch", "tensorflow": "tensorflow", "sklearn": "scikit-learn", "scikit-learn": "scikit-learn", "scipy": "scipy", "pandas": "pandas", "matplotlib": "matplotlib", "seaborn": "seaborn", "transformers": "transformers", "datasets": "datasets", "jax": "jax", } try: tree = ast.parse(code_payload) for node in ast.walk(tree): if isinstance(node, ast.Import): for alias in node.names: top = alias.name.split(".")[0] if top in known_packages_single: detected_single.add(known_packages_single[top]) elif isinstance(node, ast.ImportFrom) and node.module: top = node.module.split(".")[0] if top in known_packages_single: detected_single.add(known_packages_single[top]) except SyntaxError: pass requirements = sorted(detected_single) (code_dir / "requirements.txt").write_text( "\n".join(requirements) + ("\n" if requirements else ""), encoding="utf-8", ) paper_title = _extract_paper_title(final_paper) readme = ( f"# Code Package for {paper_title}\n\n" "## Description\n" "This directory contains the final experiment script used for the paper.\n\n" "## How to Run\n" "`python experiment.py`\n\n" "## Dependencies\n" "Install dependencies with `pip install -r requirements.txt` if needed.\n" ) (code_dir / "README.md").write_text(readme, encoding="utf-8") artifacts.append("code/") logger.info( "Stage 22: Packaged single-file code release with %d deps", len(requirements), ) # WS-5.5: Generate framework diagram prompt for methodology section try: 
        _framework_prompt = _generate_framework_diagram_prompt(
            final_paper, config, llm=llm
        )
        if _framework_prompt:
            _chart_dir = stage_dir / "charts"
            _chart_dir.mkdir(parents=True, exist_ok=True)
            (_chart_dir / "framework_diagram_prompt.md").write_text(
                _framework_prompt, encoding="utf-8"
            )
            logger.info("Stage 22: Generated framework diagram prompt → charts/framework_diagram_prompt.md")
    except Exception as exc:  # noqa: BLE001
        # Best-effort feature: a failure here must never fail the export stage.
        logger.debug("Stage 22: Framework diagram prompt generation skipped: %s", exc)

    return StageResult(
        stage=Stage.EXPORT_PUBLISH,
        status=StageStatus.DONE,
        artifacts=tuple(artifacts),
        evidence_refs=tuple(f"stage-22/{a}" for a in artifacts),
    )


# ---------------------------------------------------------------------------
# Citation helpers
# ---------------------------------------------------------------------------


def _check_citation_relevance(
    llm: Any,
    topic: str,
    results: list[Any],
) -> dict[str, float]:
    """Use LLM to assess relevance of each citation to the research topic.

    Returns a dict mapping cite_key → relevance score (0.0–1.0).
    Processes citations in batches of 30 to handle large bibliographies.
    Keys from a batch whose LLM call fails or returns unparsable JSON are
    simply absent from the result (callers must treat missing as unscored).
    """
    # Each element of *results* is expected to expose .cite_key and .title
    # (CitationResult-like objects from the verify module) — the only
    # attributes read here.
    citation_lines = []
    for cr in results:
        citation_lines.append(f"- [{cr.cite_key}] \"{cr.title}\"")
    if not citation_lines:
        return {}

    all_scores: dict[str, float] = {}
    _BATCH_SIZE = 30
    for batch_start in range(0, len(citation_lines), _BATCH_SIZE):
        batch = citation_lines[batch_start:batch_start + _BATCH_SIZE]
        citations_text = "\n".join(batch)
        prompt = (
            f"Research topic: {topic}\n\n"
            f"Rate the relevance of each citation to the research topic "
            f"on a scale of 0.0 to 1.0.\n"
            f"Return ONLY a JSON object mapping cite_key to relevance score.\n"
            f"Example: {{\"smith2020\": 0.9, \"jones2019\": 0.2}}\n\n"
            f"Citations:\n{citations_text}"
        )
        try:
            resp = llm.chat(
                [{"role": "user", "content": prompt}],
                system="You assess citation relevance. Return only valid JSON.",
                json_mode=True,
            )
            parsed = _safe_json_loads(resp.content, {})
            if isinstance(parsed, dict):
                for k, v in parsed.items():
                    if isinstance(v, (int, float)):
                        # Clamp model output into the documented [0.0, 1.0] range.
                        all_scores[k] = max(0.0, min(1.0, float(v)))
        except Exception:  # noqa: BLE001
            # Best-effort: a failed batch is skipped, not retried, so partial
            # score maps are expected under transient LLM errors.
            logger.debug(
                "Citation relevance check failed for batch %d–%d, skipping",
                batch_start,
                batch_start + len(batch),
            )
    return all_scores


def _remove_bibtex_entries(bib_text: str, keys_to_remove: set[str]) -> str:
    """Remove BibTeX entries whose keys are in *keys_to_remove*.

    Entries are located by their ``@type{key,`` header and extended to the
    matching closing brace via depth counting, so braces inside field values
    are handled.  Any text *between* entries (e.g. ``%`` comments) is dropped
    from the output, which is rebuilt from the kept entries only.
    """
    kept: list[str] = []
    for m in re.finditer(r"@\w+\{([^,]+),", bib_text):
        key = m.group(1).strip()
        if key in keys_to_remove:
            continue
        # Find the full entry (from @ to the next @ or end)
        start = m.start()
        # Find balanced braces
        depth = 0
        end = start
        for i in range(start, len(bib_text)):
            if bib_text[i] == "{":
                depth += 1
            elif bib_text[i] == "}":
                depth -= 1
                if depth == 0:
                    # Closing brace of the entry itself — slice inclusively.
                    end = i + 1
                    break
        if end > start:
            kept.append(bib_text[start:end])
    return "\n\n".join(kept) + "\n" if kept else ""


def _remove_citations_from_text(text: str, keys_to_remove: set[str]) -> str:
    """Remove \\cite{key} and [key] references for specified citation keys."""

    # Handle multi-key LaTeX cites: \cite{a,b,c} → filter keys inside braces.
    # If every key in a group is removed, the whole \cite{...} disappears.
    def _filter_cite(m: re.Match[str]) -> str:
        keys = [k.strip() for k in m.group(1).split(",")]
        kept = [k for k in keys if k not in keys_to_remove]
        if not kept:
            return ""
        return f"\\cite{{{','.join(kept)}}}"

    text = re.sub(r"\\cite\{([^}]+)\}", _filter_cite, text)
    # Markdown: [key]
    for key in keys_to_remove:
        text = re.sub(rf"\[{re.escape(key)}\]", "", text)
    return text


# ---------------------------------------------------------------------------
# Stage 23: Citation Verify
# ---------------------------------------------------------------------------


def _execute_citation_verify(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Verify the paper's bibliography against academic APIs and prune it.

    Writes ``verification_report.json`` and ``references_verified.bib`` to
    *stage_dir*, plus ``paper_final_verified.md`` when a final paper exists.
    When *llm* is provided, verified citations are additionally scored for
    topical relevance and low-relevance ones are filtered out.

    NOTE(review): *adapters* and *prompts* are not used in this body —
    presumably accepted for signature parity with other stage executors;
    confirm before removing.
    """
    # Local import — presumably to keep the verify module off the import
    # path of runs that never reach this stage (TODO confirm intent).
    from researchclaw.literature.verify import (
        VerifyStatus,
        annotate_paper_hallucinations,
        filter_verified_bibtex,
        verify_citations,
    )

    bib_text = _read_prior_artifact(run_dir, "references.bib") or ""
    paper_text = _read_prior_artifact(run_dir, "paper_final.md") or ""

    if not bib_text.strip():
        # Nothing to verify: emit a well-formed empty report and a
        # placeholder .bib so downstream stages find non-empty artifacts.
        report_data = {
            "summary": {
                "total": 0,
                "verified": 0,
                "suspicious": 0,
                "hallucinated": 0,
                "skipped": 0,
                "integrity_score": 1.0,
            },
            "results": [],
            "note": "No references.bib found — nothing to verify.",
        }
        (stage_dir / "verification_report.json").write_text(
            json.dumps(report_data, indent=2), encoding="utf-8"
        )
        (stage_dir / "references_verified.bib").write_text(
            "% No references to verify\n", encoding="utf-8"
        )
        return StageResult(
            stage=Stage.CITATION_VERIFY,
            status=StageStatus.DONE,
            artifacts=("verification_report.json", "references_verified.bib"),
            evidence_refs=(
                "stage-23/verification_report.json",
                "stage-23/references_verified.bib",
            ),
        )

    s2_api_key = getattr(config.llm, "s2_api_key", "") or ""
    from researchclaw.literature.verify import parse_bibtex_entries
    _n_entries = len(parse_bibtex_entries(bib_text))
    logger.info(
        "[citation-verify] Verifying %d references "
        "(DOI→CrossRef > OpenAlex > arXiv > S2)…",
        _n_entries,
    )
    report = verify_citations(bib_text, s2_api_key=s2_api_key)
    logger.info(
        "[citation-verify] Done: %d verified, %d suspicious, "
        "%d hallucinated, %d skipped (integrity: %.0f%%)",
        report.verified,
        report.suspicious,
        report.hallucinated,
        report.skipped,
        report.integrity_score * 100,
    )

    # --- Relevance check: assess topical relevance of verified citations ---
    if llm is not None and report.results:
        relevance_scores = _check_citation_relevance(
            llm, config.research.topic, report.results
        )
        for cr in report.results:
            score = relevance_scores.get(cr.cite_key)
            if score is not None:
                cr.relevance_score = score

    # FIX-5: Filter low-relevance citations and enforce hard cap
    RELEVANCE_THRESHOLD = 0.5
    MAX_CITATIONS = 60
    low_relevance_keys: set[str] = set()
    for cr in report.results:
        if cr.relevance_score is not None and cr.relevance_score < RELEVANCE_THRESHOLD:
            low_relevance_keys.add(cr.cite_key)

    # Hard cap: if still above MAX_CITATIONS after relevance filter, drop lowest
    # BUG-07 fix: Unscored citations (relevance_score=None) default to 0.7
    # because they passed API verification and are likely relevant.
    # Previously they defaulted to 0.0 which caused mass-deletion.
    _DEFAULT_RELEVANCE = 0.7
    remaining = [
        cr for cr in report.results
        if cr.cite_key not in low_relevance_keys
        and cr.status != VerifyStatus.HALLUCINATED
    ]
    if len(remaining) > MAX_CITATIONS:
        # Ascending sort by relevance → the head of the list is the overflow
        # (lowest-scored entries) to be dropped.
        remaining.sort(
            key=lambda c: c.relevance_score
            if c.relevance_score is not None
            else _DEFAULT_RELEVANCE,
        )
        overflow = remaining[:len(remaining) - MAX_CITATIONS]
        for cr in overflow:
            low_relevance_keys.add(cr.cite_key)
        logger.info(
            "Stage 23: Hard cap applied, dropping %d additional low-relevance citations",
            len(overflow),
        )

    if low_relevance_keys:
        logger.info(
            "Stage 23: Filtering %d low-relevance citations (threshold=%.1f, cap=%d): %s",
            len(low_relevance_keys),
            RELEVANCE_THRESHOLD,
            MAX_CITATIONS,
            ", ".join(sorted(list(low_relevance_keys))[:20]) if False else ", ".join(sorted(list(low_relevance_keys))[:20]),
        ) if False else logger.info(
            "Stage 23: Filtering %d low-relevance citations (threshold=%.1f, cap=%d): %s",
            len(low_relevance_keys),
            RELEVANCE_THRESHOLD,
            MAX_CITATIONS,
            ", ".join(sorted(list(low_relevance_keys)[:20])),
        )

    (stage_dir / "verification_report.json").write_text(
        json.dumps(report.to_dict(), indent=2), encoding="utf-8"
    )

    verified_bib = filter_verified_bibtex(bib_text, report, include_suspicious=True)
    # Remove low-relevance entries from BibTeX
    if low_relevance_keys:
        verified_bib = _remove_bibtex_entries(verified_bib, low_relevance_keys)

    # BUG-26: If verification stripped >50% of entries (e.g. due to rate limiting),
    # fall back to the original bib to avoid breaking the paper's references
    original_count = len(re.findall(r"@\w+\{", bib_text))
    verified_count = len(re.findall(r"@\w+\{", verified_bib))
    if original_count > 0 and verified_count < original_count * 0.5:
        logger.warning(
            "Stage 23: Verification stripped %d→%d entries (>50%% loss). "
            "Keeping original bib to avoid breaking references.",
            original_count,
            verified_count,
        )
        verified_bib = bib_text

    # IMP-1: Also prune uncited entries from verified bib
    # BUG-182: Also scan LaTeX paper.tex (not just Markdown) for \cite{} keys.
    # The Markdown version may use [key] notation while LaTeX uses \cite{key}.
    if paper_text.strip():
        _vbib_keys = set(re.findall(r"@\w+\{([^,]+),", verified_bib))
        _cited_in_paper: set[str] = set()
        _cited_in_paper.update(
            re.findall(r"\[([a-zA-Z]+\d{4}[a-zA-Z0-9_-]*)\]", paper_text)
        )
        for _cm in re.finditer(r"\\cite\{([^}]+)\}", paper_text):
            _cited_in_paper.update(
                k.strip() for k in _cm.group(1).split(",")
            )
        # BUG-182: Also read stage-22/paper.tex for \cite{} keys
        _latex_paper = stage_dir.parent / "stage-22" / "paper.tex"
        if _latex_paper.exists():
            try:
                _latex_text = _latex_paper.read_text(encoding="utf-8")
                # \cite, \citep, \citet all count as citations.
                for _cm in re.finditer(r"\\cite[pt]?\{([^}]+)\}", _latex_text):
                    _cited_in_paper.update(
                        k.strip() for k in _cm.group(1).split(",")
                    )
            except OSError:
                pass
        _uncited_vbib = _vbib_keys - _cited_in_paper
        if _uncited_vbib:
            verified_bib = _remove_bibtex_entries(verified_bib, _uncited_vbib)
            logger.info(
                "Stage 23: Pruned %d uncited entries from verified bib "
                "(kept %d)",
                len(_uncited_vbib),
                len(_vbib_keys) - len(_uncited_vbib),
            )

    # BUG-100: If all entries were filtered out (low-relevance + uncited pruning),
    # write a comment instead of an empty file to avoid "Missing or empty output" error.
    if not verified_bib.strip():
        verified_bib = "% All citations were filtered out during verification\n"
        logger.warning(
            "Stage 23: All BibTeX entries filtered out — writing placeholder"
        )

    (stage_dir / "references_verified.bib").write_text(verified_bib, encoding="utf-8")

    artifacts = ["verification_report.json", "references_verified.bib"]

    if paper_text.strip():
        annotated = annotate_paper_hallucinations(paper_text, report)
        # Remove \cite{} and [cite_key] references for low-relevance entries
        if low_relevance_keys:
            annotated = _remove_citations_from_text(annotated, low_relevance_keys)
        (stage_dir / "paper_final_verified.md").write_text(annotated, encoding="utf-8")
        artifacts.append("paper_final_verified.md")

    logger.info(
        "Stage 23 citation verify: %d total, %d verified, %d suspicious, "
        "%d hallucinated, %d skipped (integrity=%.1f%%)",
        report.total,
        report.verified,
        report.suspicious,
        report.hallucinated,
        report.skipped,
        report.integrity_score * 100,
    )

    return StageResult(
        stage=Stage.CITATION_VERIFY,
        status=StageStatus.DONE,
        artifacts=tuple(artifacts),
        evidence_refs=tuple(f"stage-23/{a}" for a in artifacts),
    )


================================================
FILE: researchclaw/pipeline/stage_impls/_synthesis.py
================================================
"""Stages 7-8: Synthesis and hypothesis generation."""

from __future__ import annotations

import json
import logging
from pathlib import Path
from typing import Any

from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.llm.client import LLMClient
from researchclaw.pipeline._helpers import (
    StageResult,
    _default_hypotheses,
    _get_evolution_overlay,
    _multi_perspective_generate,
    _parse_jsonl_rows,
    _read_prior_artifact,
    _synthesize_perspectives,
    _utcnow_iso,
)
from researchclaw.pipeline.stages import Stage, StageStatus
from researchclaw.prompts import PromptManager

logger = logging.getLogger(__name__)


def _execute_synthesis(
    stage_dir: Path,
    run_dir: Path,
    config:
RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 7 (SYNTHESIS): condense knowledge cards into a synthesis report.

    Reads up to 24 Markdown knowledge cards from the prior ``cards/`` artifact,
    asks the LLM to produce ``synthesis.md`` (clusters, gaps, prioritized
    opportunities), and falls back to a deterministic template when no LLM
    client is available.
    """
    cards_path = _read_prior_artifact(run_dir, "cards/") or ""
    cards_context = ""
    if cards_path:
        # Concatenate at most 24 card files (sorted for determinism) as prompt context.
        snippets: list[str] = []
        for path in sorted(Path(cards_path).glob("*.md"))[:24]:
            snippets.append(path.read_text(encoding="utf-8"))
        cards_context = "\n\n".join(snippets)
    if llm is not None:
        _pm = prompts or PromptManager()
        _overlay = _get_evolution_overlay(run_dir, "synthesis")
        sp = _pm.for_stage(
            "synthesis",
            evolution_overlay=_overlay,
            topic=config.research.topic,
            cards_context=cards_context,
        )
        resp = llm.chat(
            [{"role": "user", "content": sp.user}],
            system=sp.system,
            max_tokens=sp.max_tokens or 8192,
        )
        synthesis_md = resp.content
    else:
        # No LLM configured — emit a static placeholder synthesis.
        synthesis_md = f"""# Synthesis

## Cluster Overview
- Cluster A: Representation methods
- Cluster B: Training strategies
- Cluster C: Evaluation robustness

## Gap 1
Limited consistency across benchmark protocols.

## Gap 2
Under-reported failure behavior under distribution shift.

## Prioritized Opportunities
1. Unified experimental protocol
2. Robustness-aware evaluation suite

## Generated
{_utcnow_iso()}
"""
    (stage_dir / "synthesis.md").write_text(synthesis_md, encoding="utf-8")
    return StageResult(
        stage=Stage.SYNTHESIS,
        status=StageStatus.DONE,
        artifacts=("synthesis.md",),
        evidence_refs=("stage-07/synthesis.md",),
    )


def _execute_hypothesis_gen(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 8 (HYPOTHESIS_GEN): derive hypotheses via multi-role debate.

    Runs a multi-perspective debate over the synthesis report, merges the
    surviving perspectives into ``hypotheses.md``, and then performs a
    non-blocking novelty check against previously seen papers.
    """
    synthesis = _read_prior_artifact(run_dir, "synthesis.md") or ""
    if llm is not None:
        _pm = prompts or PromptManager()
        from researchclaw.prompts import DEBATE_ROLES_HYPOTHESIS  # noqa: PLC0415

        # --- Multi-perspective debate ---
        perspectives_dir = stage_dir / "perspectives"
        variables = {"topic": config.research.topic, "synthesis": synthesis}
        perspectives = _multi_perspective_generate(
            llm, DEBATE_ROLES_HYPOTHESIS, variables, perspectives_dir
        )
        # BUG-S2: If all debate perspectives failed, fall back to defaults
        # instead of sending empty context to the LLM (pure hallucination).
        if not perspectives:
            logger.warning("All debate perspectives failed; using default hypotheses")
            hypotheses_md = _default_hypotheses(config.research.topic)
        else:
            # --- Synthesize into final hypotheses ---
            hypotheses_md = _synthesize_perspectives(
                llm, perspectives, "hypothesis_synthesize", _pm
            )
    else:
        hypotheses_md = _default_hypotheses(config.research.topic)
    (stage_dir / "hypotheses.md").write_text(hypotheses_md, encoding="utf-8")

    # --- Novelty check (non-blocking) ---
    novelty_artifacts: tuple[str, ...]
= ()
    try:
        from researchclaw.literature.novelty import check_novelty  # noqa: PLC0415

        candidates_text = _read_prior_artifact(run_dir, "candidates.jsonl") or ""
        papers_seen = _parse_jsonl_rows(candidates_text) if candidates_text else []
        novelty_report = check_novelty(
            topic=config.research.topic,
            hypotheses_text=hypotheses_md,
            papers_already_seen=papers_seen,
            s2_api_key=getattr(config.llm, "s2_api_key", ""),
        )
        (stage_dir / "novelty_report.json").write_text(
            json.dumps(novelty_report, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        novelty_artifacts = ("novelty_report.json",)
        logger.info(
            "Novelty check: score=%.3f assessment=%s recommendation=%s",
            novelty_report["novelty_score"],
            novelty_report["assessment"],
            novelty_report["recommendation"],
        )
    except Exception:  # noqa: BLE001
        # The novelty check is advisory — it must never fail the stage.
        logger.warning("Novelty check failed (non-blocking)", exc_info=True)
    return StageResult(
        stage=Stage.HYPOTHESIS_GEN,
        status=StageStatus.DONE,
        artifacts=("hypotheses.md",) + novelty_artifacts,
        # NOTE(review): evidence_refs omits novelty_report.json even when it is
        # produced — confirm whether it should mirror `artifacts`.
        evidence_refs=("stage-08/hypotheses.md",),
    )


================================================
FILE: researchclaw/pipeline/stage_impls/_topic.py
================================================
"""Stages 1-2: Topic initialization and problem decomposition."""

from __future__ import annotations

import json
import logging
from pathlib import Path
from typing import TYPE_CHECKING

from researchclaw.adapters import AdapterBundle
from researchclaw.config import RCConfig
from researchclaw.hardware import detect_hardware, ensure_torch_available
from researchclaw.llm.client import LLMClient
from researchclaw.pipeline._domain import _detect_domain
from researchclaw.pipeline._helpers import (
    StageResult,
    _get_evolution_overlay,
    _read_prior_artifact,
    _safe_json_loads,
    _utcnow_iso,
)
from researchclaw.pipeline.stages import Stage, StageStatus
from researchclaw.prompts import PromptManager

logger = logging.getLogger(__name__)


def _execute_topic_init(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 1 (TOPIC_INIT): write ``goal.md`` and a hardware profile.

    Produces a SMART research goal (LLM-generated, or a deterministic
    template when no LLM is configured), detects available hardware, and
    optionally ensures PyTorch is installed for sandbox experiments.
    """
    topic = config.research.topic
    domains = (
        ", ".join(config.research.domains) if config.research.domains else "general"
    )
    if llm is not None:
        _pm = prompts or PromptManager()
        _overlay = _get_evolution_overlay(run_dir, "topic_init")
        sp = _pm.for_stage(
            "topic_init",
            evolution_overlay=_overlay,
            topic=topic,
            domains=domains,
            project_name=config.project.name,
            quality_threshold=config.research.quality_threshold,
        )
        resp = llm.chat(
            [{"role": "user", "content": sp.user}],
            system=sp.system,
        )
        goal_md = resp.content
    else:
        # No LLM — deterministic template so downstream stages still work.
        goal_md = f"""# Research Goal

## Topic
{topic}

## Scope
Investigate the topic with emphasis on reproducible methods and measurable outcomes.

## SMART Goal
- Specific: Build a focused research plan for {topic}
- Measurable: Produce literature shortlist, hypotheses, experiment plan, and final paper
- Achievable: Complete through staged pipeline with gate checks
- Relevant: Aligned with project {config.project.name}
- Time-bound: Constrained by pipeline execution budget

## Constraints
- Quality threshold: {config.research.quality_threshold}
- Daily paper target: {config.research.daily_paper_count}

## Success Criteria
- At least 2 falsifiable hypotheses
- Executable experiment code and results analysis
- Revised paper passing quality gate

## Generated
{_utcnow_iso()}
"""
    (stage_dir / "goal.md").write_text(goal_md, encoding="utf-8")

    # --- Hardware detection (GPU / MPS / CPU) ---
    hw = detect_hardware()
    (stage_dir / "hardware_profile.json").write_text(
        json.dumps(hw.to_dict(), indent=2), encoding="utf-8"
    )
    if hw.warning:
        logger.warning("Hardware advisory: %s", hw.warning)
    else:
        logger.info("Hardware detected: %s (%s, %s MB VRAM)", hw.gpu_name, hw.gpu_type, hw.vram_mb)

    # --- Optionally ensure PyTorch is available ---
    if hw.has_gpu and config.experiment.mode == "sandbox":
        torch_ok = ensure_torch_available(config.experiment.sandbox.python_path, hw.gpu_type)
        if torch_ok:
            logger.info("PyTorch is available for sandbox experiments")
        else:
            logger.warning("PyTorch could not be installed; sandbox will use CPU-only packages")
    elif hw.has_gpu and config.experiment.mode == "docker":
        logger.info("Docker sandbox: PyTorch pre-installed in container image")
    return StageResult(
        stage=Stage.TOPIC_INIT,
        status=StageStatus.DONE,
        artifacts=("goal.md", "hardware_profile.json"),
        evidence_refs=("stage-01/goal.md", "stage-01/hardware_profile.json"),
    )


def _execute_problem_decompose(
    stage_dir: Path,
    run_dir: Path,
    config: RCConfig,
    adapters: AdapterBundle,
    *,
    llm: LLMClient | None = None,
    prompts: PromptManager | None = None,
) -> StageResult:
    """Stage 2 (PROBLEM_DECOMPOSE): break the goal into sub-questions.

    Writes ``problem_tree.md`` from the LLM (or a static template), then
    runs an advisory topic-quality pre-evaluation (IMP-35).
    """
    goal_text = _read_prior_artifact(run_dir, "goal.md") or ""
    if llm is not None:
        _pm = prompts or PromptManager()
        _overlay = _get_evolution_overlay(run_dir, "problem_decompose")
        sp = _pm.for_stage(
            "problem_decompose",
            evolution_overlay=_overlay,
            topic=config.research.topic,
            goal_text=goal_text,
        )
        resp = llm.chat(
            [{"role": "user", "content": sp.user}],
            system=sp.system,
        )
        body = resp.content
    else:
        body = f"""# Problem Decomposition

## Source
Derived from `goal.md` for topic: {config.research.topic}

## Sub-questions
1. Which problem settings and benchmarks define current SOTA?
2. Which methodological gaps remain unresolved?
3. Which hypotheses are testable under realistic constraints?
4. Which datasets and metrics best discriminate method quality?
5. Which failure modes can invalidate expected gains?

## Priority Ranking
1. Problem framing and benchmark setup
2. Gap identification and hypothesis formulation
3. Experiment and metric design
4. Failure analysis and robustness checks

## Risks
- Ambiguous task definition
- Dataset leakage or metric mismatch

## Generated
{_utcnow_iso()}
"""
    (stage_dir / "problem_tree.md").write_text(body, encoding="utf-8")

    # IMP-35: Topic/title quality pre-evaluation
    # Quick LLM check: is the topic well-scoped for a conference paper?
if llm is not None: try: _eval_resp = llm.chat( [ { "role": "user", "content": ( "Evaluate this research topic for a top ML conference paper. " "Score 1-10 on: (a) novelty, (b) specificity, (c) feasibility. " "If overall score < 5, suggest a refined topic.\n\n" f"Topic: {config.research.topic}\n\n" "Reply as JSON: {\"novelty\": N, \"specificity\": N, " "\"feasibility\": N, \"overall\": N, \"suggestion\": \"...\"}" ), } ], system=( f"You are a senior {_detect_domain(config.research.topic, config.research.domains)[1]} " f"researcher evaluating research topic quality." ), ) _eval_data = _safe_json_loads(_eval_resp.content, {}) if isinstance(_eval_data, dict): overall = _eval_data.get("overall", 10) if isinstance(overall, (int, float)) and overall < 5: logger.warning( "IMP-35: Topic quality score %s/10 — consider refining: %s", overall, _eval_data.get("suggestion", ""), ) else: logger.info("IMP-35: Topic quality score %s/10", overall) (stage_dir / "topic_evaluation.json").write_text( json.dumps(_eval_data, indent=2), encoding="utf-8" ) except Exception: # noqa: BLE001 logger.debug("IMP-35: Topic evaluation skipped (non-blocking)") return StageResult( stage=Stage.PROBLEM_DECOMPOSE, status=StageStatus.DONE, artifacts=("problem_tree.md",), evidence_refs=("stage-02/problem_tree.md",), ) ================================================ FILE: researchclaw/pipeline/stages.py ================================================ """23-stage ResearchClaw pipeline state machine. Defines the stage sequence, status transitions, gate logic, and rollback rules. 
Migrated from arc/state_machine.py (19 stages) with the following changes: - SEARCH_PLAN + SOURCE_CONNECT → SEARCH_STRATEGY - RELEVANCE_SCREEN + QUALITY_SCREEN → LITERATURE_SCREEN - CLUSTER_TOPICS + GAP_ANALYSIS → SYNTHESIS - EXPERIMENT_DESIGN split → EXPERIMENT_DESIGN + CODE_GENERATION - EXECUTE split → EXPERIMENT_RUN + ITERATIVE_REFINE - WRITE_DRAFT split → PAPER_OUTLINE + PAPER_DRAFT - Added PAPER_REVISION, QUALITY_GATE, EXPORT_PUBLISH - RETROSPECTIVE_ARCHIVE split → KNOWLEDGE_ARCHIVE (+ QUALITY_GATE + EXPORT_PUBLISH) """ from __future__ import annotations from dataclasses import dataclass from enum import Enum, IntEnum from typing import Iterable class Stage(IntEnum): """23-stage research pipeline.""" # Phase A: Research Scoping TOPIC_INIT = 1 PROBLEM_DECOMPOSE = 2 # Phase B: Literature Discovery SEARCH_STRATEGY = 3 LITERATURE_COLLECT = 4 LITERATURE_SCREEN = 5 # GATE KNOWLEDGE_EXTRACT = 6 # Phase C: Knowledge Synthesis SYNTHESIS = 7 HYPOTHESIS_GEN = 8 # Phase D: Experiment Design EXPERIMENT_DESIGN = 9 # GATE CODE_GENERATION = 10 # NEW RESOURCE_PLANNING = 11 # Phase E: Experiment Execution EXPERIMENT_RUN = 12 ITERATIVE_REFINE = 13 # NEW # Phase F: Analysis & Decision RESULT_ANALYSIS = 14 RESEARCH_DECISION = 15 # Phase G: Paper Writing PAPER_OUTLINE = 16 PAPER_DRAFT = 17 PEER_REVIEW = 18 PAPER_REVISION = 19 # NEW # Phase H: Finalization QUALITY_GATE = 20 # GATE KNOWLEDGE_ARCHIVE = 21 EXPORT_PUBLISH = 22 CITATION_VERIFY = 23 class StageStatus(str, Enum): PENDING = "pending" RUNNING = "running" BLOCKED_APPROVAL = "blocked_approval" APPROVED = "approved" REJECTED = "rejected" PAUSED = "paused" RETRYING = "retrying" FAILED = "failed" DONE = "done" class TransitionEvent(str, Enum): START = "start" SUCCEED = "succeed" APPROVE = "approve" REJECT = "reject" TIMEOUT = "timeout" FAIL = "fail" RETRY = "retry" RESUME = "resume" PAUSE = "pause" # --------------------------------------------------------------------------- # Stage navigation # 
# ---------------------------------------------------------------------------

STAGE_SEQUENCE: tuple[Stage, ...] = tuple(Stage)

# Successor of each stage in declaration order (None for the final stage).
NEXT_STAGE: dict[Stage, Stage | None] = {
    stage: STAGE_SEQUENCE[idx + 1] if idx + 1 < len(STAGE_SEQUENCE) else None
    for idx, stage in enumerate(STAGE_SEQUENCE)
}
# Predecessor of each stage (None for the first stage).
PREVIOUS_STAGE: dict[Stage, Stage | None] = {
    stage: STAGE_SEQUENCE[idx - 1] if idx > 0 else None
    for idx, stage in enumerate(STAGE_SEQUENCE)
}

# ---------------------------------------------------------------------------
# Gate stages — require approval before proceeding
# ---------------------------------------------------------------------------
GATE_STAGES: frozenset[Stage] = frozenset(
    {
        Stage.LITERATURE_SCREEN,
        Stage.EXPERIMENT_DESIGN,
        Stage.QUALITY_GATE,
    }
)

# Gate rollback targets: when a gate rejects, where to roll back
GATE_ROLLBACK: dict[Stage, Stage] = {
    Stage.LITERATURE_SCREEN: Stage.LITERATURE_COLLECT,  # reject → re-collect
    Stage.EXPERIMENT_DESIGN: Stage.HYPOTHESIS_GEN,  # reject → re-hypothesize
    Stage.QUALITY_GATE: Stage.PAPER_OUTLINE,  # reject → rewrite paper
}

# ---------------------------------------------------------------------------
# Research decision rollback targets (PIVOT/REFINE from Stage 15)
# ---------------------------------------------------------------------------
DECISION_ROLLBACK: dict[str, Stage] = {
    "pivot": Stage.HYPOTHESIS_GEN,  # Discard hypotheses, re-generate
    "refine": Stage.ITERATIVE_REFINE,  # Keep hypotheses, re-run experiments
}
MAX_DECISION_PIVOTS: int = 2  # Prevent infinite loops

# ---------------------------------------------------------------------------
# Noncritical stages — can be skipped on failure without aborting pipeline
# ---------------------------------------------------------------------------
NONCRITICAL_STAGES: frozenset[Stage] = frozenset(
    {
        Stage.QUALITY_GATE,  # 20: low quality should warn, not block deliverables
        Stage.KNOWLEDGE_ARCHIVE,  # 21: archival doesn't affect paper output
        # T3.4: CITATION_VERIFY removed — hallucinated citations MUST block export
    }
)

# ---------------------------------------------------------------------------
# Phase groupings (for UI and reporting)
# ---------------------------------------------------------------------------
PHASE_MAP: dict[str, tuple[Stage, ...]] = {
    "A: Research Scoping": (Stage.TOPIC_INIT, Stage.PROBLEM_DECOMPOSE),
    "B: Literature Discovery": (
        Stage.SEARCH_STRATEGY,
        Stage.LITERATURE_COLLECT,
        Stage.LITERATURE_SCREEN,
        Stage.KNOWLEDGE_EXTRACT,
    ),
    "C: Knowledge Synthesis": (Stage.SYNTHESIS, Stage.HYPOTHESIS_GEN),
    "D: Experiment Design": (
        Stage.EXPERIMENT_DESIGN,
        Stage.CODE_GENERATION,
        Stage.RESOURCE_PLANNING,
    ),
    "E: Experiment Execution": (Stage.EXPERIMENT_RUN, Stage.ITERATIVE_REFINE),
    "F: Analysis & Decision": (Stage.RESULT_ANALYSIS, Stage.RESEARCH_DECISION),
    "G: Paper Writing": (
        Stage.PAPER_OUTLINE,
        Stage.PAPER_DRAFT,
        Stage.PEER_REVIEW,
        Stage.PAPER_REVISION,
    ),
    "H: Finalization": (
        Stage.QUALITY_GATE,
        Stage.KNOWLEDGE_ARCHIVE,
        Stage.EXPORT_PUBLISH,
        Stage.CITATION_VERIFY,
    ),
}

# ---------------------------------------------------------------------------
# Transition logic
# ---------------------------------------------------------------------------
# Legal status transitions: current status → set of statuses reachable from it.
TRANSITION_MAP: dict[StageStatus, frozenset[StageStatus]] = {
    StageStatus.PENDING: frozenset({StageStatus.RUNNING}),
    StageStatus.RUNNING: frozenset(
        {StageStatus.DONE, StageStatus.BLOCKED_APPROVAL, StageStatus.FAILED}
    ),
    StageStatus.BLOCKED_APPROVAL: frozenset(
        {StageStatus.APPROVED, StageStatus.REJECTED, StageStatus.PAUSED}
    ),
    StageStatus.APPROVED: frozenset({StageStatus.DONE}),
    StageStatus.REJECTED: frozenset({StageStatus.PENDING}),
    StageStatus.PAUSED: frozenset({StageStatus.RUNNING}),
    StageStatus.RETRYING: frozenset({StageStatus.RUNNING}),
    StageStatus.FAILED: frozenset({StageStatus.RETRYING, StageStatus.PAUSED}),
    StageStatus.DONE: frozenset(),
}


@dataclass(frozen=True)
class TransitionOutcome:
    """Result of advancing the state machine by one event."""

    stage: Stage  # stage the machine is now at (changes on rollback)
    status: StageStatus
    next_stage: Stage | None
    rollback_stage: Stage | None = None  # set only when a rollback occurred
    checkpoint_required: bool = False  # caller should persist state now
    decision: str = "proceed"  # "proceed" | "block" | "pivot" | "retry"


def gate_required(
    stage: Stage,
    hitl_required_stages: Iterable[int] | None = None,
) -> bool:
    """Check whether a stage requires human-in-the-loop approval."""
    if stage not in GATE_STAGES:
        return False
    if hitl_required_stages is not None:
        # Caller-supplied override: only the listed gate stages block.
        return int(stage) in frozenset(hitl_required_stages)
    return True  # Default: all gate stages require approval


def default_rollback_stage(stage: Stage) -> Stage:
    """Return the configured rollback target, or the previous stage."""
    return GATE_ROLLBACK.get(stage) or PREVIOUS_STAGE.get(stage) or stage


def advance(
    stage: Stage,
    status: StageStatus,
    event: TransitionEvent | str,
    *,
    hitl_required_stages: Iterable[int] | None = None,
    rollback_stage: Stage | None = None,
) -> TransitionOutcome:
    """Compute the next state given current stage, status, and event.

    Raises ValueError on unsupported transitions.
    """
    event = TransitionEvent(event)
    target_rollback = rollback_stage or default_rollback_stage(stage)
    # START → RUNNING
    if event is TransitionEvent.START and status in {
        StageStatus.PENDING,
        StageStatus.RETRYING,
        StageStatus.PAUSED,
    }:
        return TransitionOutcome(
            stage=stage, status=StageStatus.RUNNING, next_stage=stage
        )
    # SUCCEED while RUNNING
    if event is TransitionEvent.SUCCEED and status is StageStatus.RUNNING:
        if gate_required(stage, hitl_required_stages):
            return TransitionOutcome(
                stage=stage,
                status=StageStatus.BLOCKED_APPROVAL,
                next_stage=stage,
                checkpoint_required=False,
                decision="block",
            )
        return TransitionOutcome(
            stage=stage,
            status=StageStatus.DONE,
            next_stage=NEXT_STAGE[stage],
            checkpoint_required=True,
        )
    # APPROVE while BLOCKED
    if event is TransitionEvent.APPROVE and status is StageStatus.BLOCKED_APPROVAL:
        return TransitionOutcome(
            stage=stage,
            status=StageStatus.DONE,
            next_stage=NEXT_STAGE[stage],
            checkpoint_required=True,
        )
    # REJECT while BLOCKED → rollback
    if event is TransitionEvent.REJECT and status is StageStatus.BLOCKED_APPROVAL:
        return TransitionOutcome(
            stage=target_rollback,
            status=StageStatus.PENDING,
            next_stage=target_rollback,
            rollback_stage=target_rollback,
            checkpoint_required=True,
            decision="pivot",
        )
    # TIMEOUT while BLOCKED → pause
    if event is TransitionEvent.TIMEOUT and status is StageStatus.BLOCKED_APPROVAL:
        return TransitionOutcome(
            stage=stage,
            status=StageStatus.PAUSED,
            next_stage=stage,
            checkpoint_required=True,
            decision="block",
        )
    # FAIL while RUNNING
    if event is TransitionEvent.FAIL and status is StageStatus.RUNNING:
        return TransitionOutcome(
            stage=stage,
            status=StageStatus.FAILED,
            next_stage=stage,
            checkpoint_required=True,
            decision="retry",
        )
    # RETRY while FAILED
    if event is TransitionEvent.RETRY and status is StageStatus.FAILED:
        return TransitionOutcome(
            stage=stage, status=StageStatus.RETRYING, next_stage=stage, decision="retry"
        )
    # RESUME while PAUSED
    if event is TransitionEvent.RESUME and status is StageStatus.PAUSED:
        return TransitionOutcome(
            stage=stage, status=StageStatus.RUNNING, next_stage=stage
        )
    # PAUSE while FAILED
    if event is TransitionEvent.PAUSE and status is StageStatus.FAILED:
        return TransitionOutcome(
            stage=stage,
            status=StageStatus.PAUSED,
            next_stage=stage,
            checkpoint_required=True,
            decision="block",
        )
    raise ValueError(
        f"Unsupported transition: {status.value} + {event.value} for stage {int(stage)}"
    )


================================================
FILE: researchclaw/pipeline/verified_registry.py
================================================
"""Verified Value Registry — ground truth for all experiment-sourced numbers.

Builds a whitelist of numeric values, condition names, and training config
from ``experiment_summary.json`` and ``refinement_log.json``. Used by
``paper_verifier.py`` and ``results_table_builder.py`` to ensure that
generated papers contain ONLY numbers grounded in real experiment data.
""" from __future__ import annotations import logging import math import re from dataclasses import dataclass, field from pathlib import Path from typing import Any logger = logging.getLogger(__name__) # Infrastructure metric keys — allowed in paper without verification _INFRA_KEYS: set[str] = { "elapsed_sec", "total_elapsed_seconds", "TIME_ESTIMATE", "SEED_COUNT", "time_budget_sec", "condition_count", "total_runs", "total_conditions", "total_metric_keys", "stopped_early", } # Metric key patterns for per-seed results (e.g. "DQN/0/metric") _PER_SEED_PATTERN = re.compile(r"^(.+)/(\d+)/(.+)$") @dataclass class ConditionResult: """Aggregated results for one experimental condition.""" name: str per_seed_values: dict[int, float] = field(default_factory=dict) mean: float | None = None std: float | None = None n_seeds: int = 0 aggregate_metric: float | None = None # The condition-level metric def compute_stats(self) -> None: """Compute mean and std from per-seed values.""" vals = [v for v in self.per_seed_values.values() if _is_finite(v)] self.n_seeds = len(vals) if not vals: return self.mean = sum(vals) / len(vals) if len(vals) >= 2: variance = sum((v - self.mean) ** 2 for v in vals) / (len(vals) - 1) self.std = math.sqrt(variance) else: self.std = 0.0 @dataclass class VerifiedRegistry: """Registry of all numbers grounded in experiment data.""" values: dict[float, str] = field(default_factory=dict) condition_names: set[str] = field(default_factory=set) conditions: dict[str, ConditionResult] = field(default_factory=dict) primary_metric: float | None = None primary_metric_std: float | None = None metric_direction: str = "maximize" # "maximize" or "minimize" training_config: dict[str, Any] = field(default_factory=dict) def add_value(self, value: float, source: str) -> None: """Register a verified numeric value with its provenance.""" if not _is_finite(value): return self.values[value] = source # Also register common transformations self._add_variants(value, source) def 
_add_variants(self, value: float, source: str) -> None: """Register rounding variants and percentage conversions.""" # Rounded variants (2, 3, 4 decimal places) for dp in (1, 2, 3, 4): rounded = round(value, dp) if rounded != value and rounded not in self.values: self.values[rounded] = f"{source} (rounded to {dp}dp)" # Percentage conversion: if value is in [0, 1], also register value*100 if 0.0 < abs(value) <= 1.0: pct = value * 100.0 if pct not in self.values: self.values[pct] = f"{source} (×100)" for dp in (1, 2, 3, 4): pct_r = round(pct, dp) if pct_r not in self.values: self.values[pct_r] = f"{source} (×100, {dp}dp)" # If value > 1 and could be a percentage, also register value/100 if abs(value) > 1.0: frac = value / 100.0 if frac not in self.values: self.values[frac] = f"{source} (÷100)" def is_verified(self, number: float, tolerance: float = 0.01) -> bool: """Check if *number* matches any verified value within relative tolerance.""" if not _is_finite(number): return False for v in self.values: if v == 0.0: if abs(number) < 1e-6: return True elif abs(number - v) / max(abs(v), 1e-9) <= tolerance: return True return False def lookup(self, number: float, tolerance: float = 0.01) -> str | None: """Return the source description if *number* is verified, else None.""" if not _is_finite(number): return None for v, src in self.values.items(): if v == 0.0: if abs(number) < 1e-6: return src elif abs(number - v) / max(abs(v), 1e-9) <= tolerance: return src return None def verify_condition(self, name: str) -> bool: """Check if condition name was actually run.""" return name in self.condition_names @classmethod def from_experiment( cls, experiment_summary: dict, refinement_log: dict | None = None, *, metric_direction: str = "maximize", ) -> VerifiedRegistry: """Build registry from experiment artifacts. Parameters ---------- experiment_summary: Parsed ``experiment_summary.json``. refinement_log: Parsed ``refinement_log.json`` (optional, provides richer per-seed data). 
metric_direction:
            ``"maximize"`` or ``"minimize"`` — used for best-result detection.
        """
        reg = cls(metric_direction=metric_direction)

        # --- 1. Extract condition-level and per-seed metrics ---
        best_run = experiment_summary.get("best_run", {})
        metrics = best_run.get("metrics", {})
        # Parse per-seed structure: "CondName/seed/metric_key" → value
        for key, value in metrics.items():
            if not isinstance(value, (int, float)) or not _is_finite(value):
                continue
            if key in _INFRA_KEYS:
                # Infrastructure numbers are config, not results.
                reg.training_config[key] = value
                continue
            reg.add_value(value, f"best_run.metrics.{key}")
            m = _PER_SEED_PATTERN.match(key)
            if m:
                cond_name, seed_str, _metric_name = m.group(1), m.group(2), m.group(3)
                seed_idx = int(seed_str)
                if cond_name not in reg.conditions:
                    reg.conditions[cond_name] = ConditionResult(name=cond_name)
                reg.conditions[cond_name].per_seed_values[seed_idx] = value
                reg.condition_names.add(cond_name)

        # --- 2. Extract condition_summaries ---
        for cond_name, cond_data in experiment_summary.get("condition_summaries", {}).items():
            reg.condition_names.add(cond_name)
            if cond_name not in reg.conditions:
                reg.conditions[cond_name] = ConditionResult(name=cond_name)
            cond_metrics = cond_data.get("metrics", {})
            for mk, mv in cond_metrics.items():
                if isinstance(mv, (int, float)) and _is_finite(mv):
                    reg.add_value(mv, f"condition_summaries.{cond_name}.{mk}")
                    # NOTE(review): aggregate_metric is overwritten per metric
                    # key, so the last key in iteration order wins — confirm
                    # this is the intended condition-level metric.
                    reg.conditions[cond_name].aggregate_metric = mv

        # --- 3. Extract metrics_summary (min/max/mean per key) ---
        for key, stats in experiment_summary.get("metrics_summary", {}).items():
            if key in _INFRA_KEYS:
                continue
            for stat_name in ("min", "max", "mean"):
                v = stats.get(stat_name)
                if isinstance(v, (int, float)) and _is_finite(v):
                    reg.add_value(v, f"metrics_summary.{key}.{stat_name}")

        # --- 4. Extract primary_metric ---
        pm = _extract_primary_metric(metrics)
        if pm is not None:
            reg.primary_metric = pm
            reg.add_value(pm, "primary_metric")
        pm_std = metrics.get("primary_metric_std")
        if isinstance(pm_std, (int, float)) and _is_finite(pm_std):
            reg.primary_metric_std = pm_std
            reg.add_value(pm_std, "primary_metric_std")

        # --- 5. Compute per-condition stats ---
        for cond in reg.conditions.values():
            cond.compute_stats()
            if cond.mean is not None:
                reg.add_value(cond.mean, f"{cond.name}.mean")
            if cond.std is not None and cond.std > 0:
                reg.add_value(cond.std, f"{cond.name}.std")

        # --- 6. Compute pairwise differences (for comparative claims) ---
        cond_list = [c for c in reg.conditions.values() if c.mean is not None]
        for i, c1 in enumerate(cond_list):
            for c2 in cond_list[i + 1 :]:
                diff = c1.mean - c2.mean  # type: ignore[operator]
                if _is_finite(diff):
                    reg.add_value(diff, f"diff({c1.name}-{c2.name})")
                    reg.add_value(abs(diff), f"|diff({c1.name},{c2.name})|")
                # Relative improvement
                if c2.mean and abs(c2.mean) > 1e-9:  # type: ignore[operator]
                    rel = (c1.mean - c2.mean) / abs(c2.mean) * 100.0  # type: ignore[operator]
                    if _is_finite(rel):
                        reg.add_value(rel, f"rel_improve({c1.name} vs {c2.name})")
                        reg.add_value(abs(rel), f"|rel_improve({c1.name},{c2.name})|")

        # --- 7. Enrich from refinement_log (best iteration only) ---
        if refinement_log:
            _enrich_from_refinement_log(reg, refinement_log)

        logger.info(
            "VerifiedRegistry: %d values, %d conditions (%s), primary_metric=%s",
            len(reg.values),
            len(reg.condition_names),
            ", ".join(sorted(reg.condition_names)),
            reg.primary_metric,
        )
        return reg

    @classmethod
    def from_run_dir(
        cls,
        run_dir: Path,
        *,
        metric_direction: str = "maximize",
        best_only: bool = False,
    ) -> VerifiedRegistry:
        """Build registry from experiment data sources in *run_dir*.

        Parameters
        ----------
        best_only:
            BUG-222: When True, use ONLY ``experiment_summary_best.json``
            (the promoted best iteration) as the ground truth. This prevents
            regressed REFINE iterations from polluting the verified value
            set. When False (default), merges all ``stage-14*`` data for
            backward compatibility (e.g., pre-built table generation that
            needs all condition names).

        Scans (when ``best_only=False``):
        1. All ``stage-14*/experiment_summary.json`` (sorted, every version)
        2. ``experiment_summary_best.json`` at run root (repair cycle output)
        3. All ``stage-13*/refinement_log.json`` for enrichment
        """
        import json as _json_rd

        target = cls(metric_direction=metric_direction)
        if best_only:
            # BUG-222: Only use promoted best data
            best_path = run_dir / "experiment_summary_best.json"
            if best_path.is_file():
                try:
                    best_data = _json_rd.loads(best_path.read_text(encoding="utf-8"))
                    if isinstance(best_data, dict):
                        sub = cls.from_experiment(best_data, metric_direction=metric_direction)
                        _merge_into(target, sub)
                        logger.debug("from_run_dir(best_only): using experiment_summary_best.json (%d values)", len(sub.values))
                except (OSError, _json_rd.JSONDecodeError, Exception):  # noqa: BLE001
                    logger.debug("from_run_dir(best_only): failed to load experiment_summary_best.json", exc_info=True)
            if not target.values:
                # Fallback: no best.json or it was empty — use stage-14/ (non-versioned)
                s14_path = run_dir / "stage-14" / "experiment_summary.json"
                if s14_path.is_file():
                    try:
                        es_data = _json_rd.loads(s14_path.read_text(encoding="utf-8"))
                        if isinstance(es_data, dict):
                            sub = cls.from_experiment(es_data, metric_direction=metric_direction)
                            _merge_into(target, sub)
                    except (OSError, _json_rd.JSONDecodeError, Exception):  # noqa: BLE001
                        pass
        else:
            # --- 1. All stage-14* experiment summaries ---
            for es_path in sorted(run_dir.glob("stage-14*/experiment_summary.json")):
                try:
                    es_data = _json_rd.loads(es_path.read_text(encoding="utf-8"))
                    if not isinstance(es_data, dict):
                        continue
                    sub = cls.from_experiment(es_data, metric_direction=metric_direction)
                    _merge_into(target, sub)
                    logger.debug("from_run_dir: merged %s (%d values)", es_path.name, len(sub.values))
                except (OSError, _json_rd.JSONDecodeError, Exception):  # noqa: BLE001
                    logger.debug("from_run_dir: skipping %s", es_path, exc_info=True)
            # --- 2. experiment_summary_best.json (repair cycle output) ---
            best_path = run_dir / "experiment_summary_best.json"
            if best_path.is_file():
                try:
                    best_data = _json_rd.loads(best_path.read_text(encoding="utf-8"))
                    if isinstance(best_data, dict):
                        sub = cls.from_experiment(best_data, metric_direction=metric_direction)
                        _merge_into(target, sub)
                        logger.debug("from_run_dir: merged experiment_summary_best.json (%d values)", len(sub.values))
                except (OSError, _json_rd.JSONDecodeError, Exception):  # noqa: BLE001
                    logger.debug("from_run_dir: skipping experiment_summary_best.json", exc_info=True)
            # --- 3.
All refinement logs (enrichment) --- for rl_path in sorted(run_dir.glob("stage-13*/refinement_log.json")): try: rl_data = _json_rd.loads(rl_path.read_text(encoding="utf-8")) if isinstance(rl_data, dict): _enrich_from_refinement_log(target, rl_data) logger.debug("from_run_dir: enriched from %s", rl_path.name) except (OSError, _json_rd.JSONDecodeError, Exception): # noqa: BLE001 logger.debug("from_run_dir: skipping %s", rl_path, exc_info=True) # Recompute per-condition stats after merging for cond in target.conditions.values(): cond.compute_stats() if cond.mean is not None: target.add_value(cond.mean, f"{cond.name}.mean") if cond.std is not None and cond.std > 0: target.add_value(cond.std, f"{cond.name}.std") logger.info( "VerifiedRegistry.from_run_dir(%s): %d values, %d conditions (%s)", "best_only" if best_only else "all", len(target.values), len(target.condition_names), ", ".join(sorted(target.condition_names)) if target.condition_names else "none", ) return target @classmethod def from_files( cls, experiment_summary_path: Path, refinement_log_path: Path | None = None, *, metric_direction: str = "maximize", ) -> VerifiedRegistry: """Convenience: build registry from file paths.""" import json exp_data = json.loads(experiment_summary_path.read_text(encoding="utf-8")) ref_data = None if refinement_log_path and refinement_log_path.exists(): ref_data = json.loads(refinement_log_path.read_text(encoding="utf-8")) return cls.from_experiment(exp_data, ref_data, metric_direction=metric_direction) def _merge_into(target: VerifiedRegistry, source: VerifiedRegistry) -> None: """Merge *source* values, conditions, and condition_names into *target*.""" for v, desc in source.values.items(): if v not in target.values: target.values[v] = desc target.condition_names |= source.condition_names for cname, cresult in source.conditions.items(): if cname not in target.conditions: target.conditions[cname] = ConditionResult(name=cname) existing = target.conditions[cname] # Merge per-seed 
values (source wins on conflict — later data is better) existing.per_seed_values.update(cresult.per_seed_values) if cresult.aggregate_metric is not None: existing.aggregate_metric = cresult.aggregate_metric # Keep the best primary metric if source.primary_metric is not None: if target.primary_metric is None: target.primary_metric = source.primary_metric elif target.metric_direction == "maximize": target.primary_metric = max(target.primary_metric, source.primary_metric) else: target.primary_metric = min(target.primary_metric, source.primary_metric) if source.primary_metric_std is not None: # Only update std if the source's primary_metric actually won if target.primary_metric == source.primary_metric: target.primary_metric_std = source.primary_metric_std target.training_config.update(source.training_config) def _enrich_from_refinement_log(reg: VerifiedRegistry, refinement_log: dict) -> None: """Add values from the best refinement iteration.""" best_metric = refinement_log.get("best_metric") if isinstance(best_metric, (int, float)) and _is_finite(best_metric): reg.add_value(best_metric, "refinement_log.best_metric") best_version = refinement_log.get("best_version", "") iterations = refinement_log.get("iterations", []) for it in iterations: ver = it.get("version_dir", "") metric = it.get("metric") if isinstance(metric, (int, float)) and _is_finite(metric): reg.add_value(metric, f"refinement_log.iteration.{ver}") # Extract per-seed values from sandbox stdout if available for sandbox_key in ("sandbox", "sandbox_after_fix"): sandbox = it.get(sandbox_key, {}) if not isinstance(sandbox, dict): continue sb_metrics = sandbox.get("metrics", {}) if isinstance(sb_metrics, dict): for mk, mv in sb_metrics.items(): if isinstance(mv, (int, float)) and _is_finite(mv) and mk not in _INFRA_KEYS: reg.add_value(mv, f"refinement.{ver}.{sandbox_key}.{mk}") # Parse per-seed keys here too m = _PER_SEED_PATTERN.match(mk) if m: cond_name = m.group(1) seed_idx = int(m.group(2)) 
reg.condition_names.add(cond_name) if cond_name not in reg.conditions: reg.conditions[cond_name] = ConditionResult(name=cond_name) # Only update per_seed if this is the best version if ver == best_version or best_version in ver: reg.conditions[cond_name].per_seed_values[seed_idx] = mv def _extract_primary_metric(metrics: dict) -> float | None: """Extract primary_metric from metrics dict.""" pm = metrics.get("primary_metric") if isinstance(pm, (int, float)) and _is_finite(pm): return float(pm) return None def _is_finite(value: Any) -> bool: """Check if value is a finite number (not NaN, not Inf, not bool).""" if isinstance(value, bool): return False if not isinstance(value, (int, float)): return False return math.isfinite(value) ================================================ FILE: researchclaw/project/__init__.py ================================================ """Multi-project management for AutoResearchClaw.""" from researchclaw.project.models import Idea, Project from researchclaw.project.manager import ProjectManager from researchclaw.project.scheduler import ProjectScheduler from researchclaw.project.idea_pool import IdeaPool __all__ = ["Idea", "Project", "ProjectManager", "ProjectScheduler", "IdeaPool"] ================================================ FILE: researchclaw/project/idea_pool.py ================================================ """Idea pool: collect, evaluate, rank, and convert research ideas to projects.""" from __future__ import annotations import json import logging import uuid from pathlib import Path from typing import Any from researchclaw.project.models import Idea, Project logger = logging.getLogger(__name__) class IdeaPool: """Manage a pool of research ideas with evaluation and ranking.""" def __init__(self, pool_path: str | Path) -> None: self.pool_path = Path(pool_path).expanduser().resolve() self.ideas: dict[str, Idea] = {} self._load() # ── persistence ─────────────────────────────────────────────── def _load(self) -> None: if not 
self.pool_path.exists(): return try: data = json.loads(self.pool_path.read_text(encoding="utf-8")) for entry in data.get("ideas", []): idea = Idea.from_dict(entry) self.ideas[idea.id] = idea except (json.JSONDecodeError, KeyError) as exc: logger.warning("Failed to load idea pool: %s", exc) def _save(self) -> None: self.pool_path.parent.mkdir(parents=True, exist_ok=True) data = {"ideas": [idea.to_dict() for idea in self.ideas.values()]} self.pool_path.write_text( json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8" ) # ── CRUD ────────────────────────────────────────────────────── def add(self, title: str, description: str, domains: list[str] | None = None) -> Idea: """Add a new idea to the pool.""" idea_id = uuid.uuid4().hex[:8] idea = Idea( id=idea_id, title=title, description=description, domains=domains or [], ) self.ideas[idea_id] = idea self._save() logger.info("Added idea %s: %s", idea_id, title) return idea def remove(self, idea_id: str) -> None: """Remove an idea from the pool.""" if idea_id not in self.ideas: raise KeyError(f"Unknown idea: {idea_id}") del self.ideas[idea_id] self._save() def get(self, idea_id: str) -> Idea: """Get an idea by ID.""" if idea_id not in self.ideas: raise KeyError(f"Unknown idea: {idea_id}") return self.ideas[idea_id] # ── evaluation ──────────────────────────────────────────────── def evaluate(self, idea_id: str, feasibility: float, novelty: float) -> dict[str, Any]: """Set feasibility and novelty scores for an idea.""" idea = self.get(idea_id) idea.feasibility = max(0.0, min(1.0, feasibility)) idea.novelty = max(0.0, min(1.0, novelty)) idea.status = "evaluated" self._save() return { "id": idea.id, "feasibility": idea.feasibility, "novelty": idea.novelty, "score": idea.score, } def rank(self) -> list[Idea]: """Return all ideas sorted by composite score (descending).""" return sorted(self.ideas.values(), key=lambda i: i.score, reverse=True) # ── conversion ──────────────────────────────────────────────── def 
to_project(self, idea_id: str, config_path: str, projects_dir: str | Path) -> Project: """Convert an idea into a project skeleton.""" idea = self.get(idea_id) from researchclaw.project.manager import ProjectManager manager = ProjectManager(projects_dir) project = manager.create( name=idea.title.lower().replace(" ", "_")[:40], config_path=config_path, topic=idea.description, ) idea.status = "planned" self._save() return project def list_all(self) -> list[Idea]: """Return all ideas sorted by creation time.""" return sorted(self.ideas.values(), key=lambda i: i.created_at) ================================================ FILE: researchclaw/project/manager.py ================================================ """Project manager: CRUD operations and status tracking for research projects.""" from __future__ import annotations import json import logging import shutil from pathlib import Path from typing import Any from researchclaw.project.models import Project logger = logging.getLogger(__name__) _REGISTRY_FILE = "registry.json" class ProjectManager: """Manage multiple research projects with independent directories and configs.""" def __init__(self, projects_dir: str | Path) -> None: self.projects_dir = Path(projects_dir).expanduser().resolve() self.projects: dict[str, Project] = {} self._active: str | None = None self._load_registry() # ── persistence ─────────────────────────────────────────────── def _registry_path(self) -> Path: return self.projects_dir / _REGISTRY_FILE def _load_registry(self) -> None: """Load project registry from disk.""" path = self._registry_path() if not path.exists(): return try: data = json.loads(path.read_text(encoding="utf-8")) for entry in data.get("projects", []): proj = Project.from_dict(entry) self.projects[proj.name] = proj self._active = data.get("active") except (json.JSONDecodeError, KeyError) as exc: logger.warning("Failed to load project registry: %s", exc) def _save_registry(self) -> None: """Persist project registry to disk.""" 
self.projects_dir.mkdir(parents=True, exist_ok=True) data = { "active": self._active, "projects": [p.to_dict() for p in self.projects.values()], } self._registry_path().write_text( json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8" ) # ── CRUD ────────────────────────────────────────────────────── def create( self, name: str, config_path: str, topic: str | None = None, ) -> Project: """Create a new project with an independent directory and config copy.""" if name in self.projects: raise ValueError(f"Project already exists: {name}") project_dir = self.projects_dir / name project_dir.mkdir(parents=True, exist_ok=True) # Copy config to project directory src = Path(config_path).expanduser().resolve() if src.exists(): dst = project_dir / "config.yaml" shutil.copy2(src, dst) stored_config = str(dst) else: stored_config = config_path run_dir = str(project_dir / "artifacts") Path(run_dir).mkdir(parents=True, exist_ok=True) project = Project( name=name, config_path=stored_config, run_dir=run_dir, topic=topic or "", ) self.projects[name] = project if self._active is None: self._active = name self._save_registry() logger.info("Created project: %s", name) return project def delete(self, name: str) -> None: """Remove project from registry. 
Does NOT delete artifacts on disk.""" if name not in self.projects: raise KeyError(f"Unknown project: {name}") del self.projects[name] if self._active == name: self._active = next(iter(self.projects), None) self._save_registry() logger.info("Deleted project (registry only): %s", name) def get(self, name: str) -> Project: """Get a single project by name.""" if name not in self.projects: raise KeyError(f"Unknown project: {name}") return self.projects[name] def list_all(self) -> list[Project]: """Return all projects sorted by creation time.""" return sorted(self.projects.values(), key=lambda p: p.created_at) def get_status(self) -> dict[str, Any]: """Summary of all project statuses.""" projects = self.list_all() return { "total": len(projects), "active": self._active, "by_status": _count_by(projects, "status"), "projects": [ {"name": p.name, "status": p.status, "topic": p.topic} for p in projects ], } # ── project switching ───────────────────────────────────────── def switch(self, name: str) -> Project: """Set the active project.""" if name not in self.projects: raise KeyError(f"Unknown project: {name}") self._active = name self._save_registry() return self.projects[name] @property def active(self) -> Project | None: """Currently active project.""" if self._active and self._active in self.projects: return self.projects[self._active] return None # ── comparison ──────────────────────────────────────────────── def compare(self, name_a: str, name_b: str) -> dict[str, Any]: """Compare metrics and status of two projects.""" a = self.get(name_a) b = self.get(name_b) return { "project_a": {"name": a.name, "status": a.status, "topic": a.topic, "metrics": a.metrics}, "project_b": {"name": b.name, "status": b.status, "topic": b.topic, "metrics": b.metrics}, "metric_diff": _metric_diff(a.metrics, b.metrics), } # ── run lifecycle ───────────────────────────────────────────── def start_run(self, name: str, run_id: str) -> str: """Mark a project as running with a new run ID.""" 
proj = self.get(name) proj.status = "running" proj.last_run_id = run_id self._save_registry() return run_id def finish_run(self, name: str, status: str, metrics: dict[str, Any] | None = None) -> None: """Mark a project run as completed or failed.""" proj = self.get(name) proj.status = status if metrics: proj.metrics = metrics self._save_registry() def _count_by(projects: list[Project], attr: str) -> dict[str, int]: counts: dict[str, int] = {} for p in projects: val = getattr(p, attr, "unknown") counts[val] = counts.get(val, 0) + 1 return counts def _metric_diff(a: dict[str, Any], b: dict[str, Any]) -> dict[str, Any]: all_keys = set(a) | set(b) diff: dict[str, Any] = {} for key in sorted(all_keys): va, vb = a.get(key), b.get(key) if isinstance(va, (int, float)) and isinstance(vb, (int, float)): diff[key] = {"a": va, "b": vb, "delta": round(vb - va, 6)} else: diff[key] = {"a": va, "b": vb} return diff ================================================ FILE: researchclaw/project/models.py ================================================ """Data models for multi-project management.""" from __future__ import annotations from dataclasses import dataclass, field from datetime import datetime, timezone from typing import Any @dataclass class Project: """A research project managed by AutoResearchClaw.""" name: str config_path: str run_dir: str status: str = "idle" # idle | running | completed | failed created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) last_run_id: str | None = None topic: str = "" metrics: dict[str, Any] = field(default_factory=dict) def to_dict(self) -> dict[str, Any]: """Serialize project to a dictionary.""" return { "name": self.name, "config_path": self.config_path, "run_dir": self.run_dir, "status": self.status, "created_at": self.created_at.isoformat(), "last_run_id": self.last_run_id, "topic": self.topic, "metrics": self.metrics, } @classmethod def from_dict(cls, data: dict[str, Any]) -> Project: """Deserialize project 
from a dictionary.""" created_at = data.get("created_at") if isinstance(created_at, str): created_at = datetime.fromisoformat(created_at) elif created_at is None: created_at = datetime.now(timezone.utc) return cls( name=data["name"], config_path=data["config_path"], run_dir=data["run_dir"], status=data.get("status", "idle"), created_at=created_at, last_run_id=data.get("last_run_id"), topic=data.get("topic", ""), metrics=data.get("metrics", {}), ) @dataclass class Idea: """A research idea that can be evaluated and converted to a project.""" id: str title: str description: str status: str = "draft" # draft | evaluated | planned | running | completed feasibility: float = 0.0 # 0-1 novelty: float = 0.0 # 0-1 domains: list[str] = field(default_factory=list) created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) @property def score(self) -> float: """Composite score: weighted average of feasibility and novelty.""" return 0.4 * self.feasibility + 0.6 * self.novelty def to_dict(self) -> dict[str, Any]: """Serialize idea to a dictionary.""" return { "id": self.id, "title": self.title, "description": self.description, "status": self.status, "feasibility": self.feasibility, "novelty": self.novelty, "domains": self.domains, "created_at": self.created_at.isoformat(), } @classmethod def from_dict(cls, data: dict[str, Any]) -> Idea: """Deserialize idea from a dictionary.""" created_at = data.get("created_at") if isinstance(created_at, str): created_at = datetime.fromisoformat(created_at) elif created_at is None: created_at = datetime.now(timezone.utc) return cls( id=data["id"], title=data["title"], description=data["description"], status=data.get("status", "draft"), feasibility=float(data.get("feasibility", 0.0)), novelty=float(data.get("novelty", 0.0)), domains=data.get("domains", []), created_at=created_at, ) ================================================ FILE: researchclaw/project/scheduler.py ================================================ 
"""Project scheduler: priority queue and concurrency control for pipeline runs."""

from __future__ import annotations

import heapq
import logging
from dataclasses import dataclass, field
from typing import Any

from researchclaw.project.manager import ProjectManager

logger = logging.getLogger(__name__)


@dataclass(order=True)
class _QueueEntry:
    """Priority queue entry (lower priority number = higher priority)."""

    priority: int
    # Excluded from ordering so heap comparisons use only ``priority``;
    # entries with equal priority compare equal.
    project_name: str = field(compare=False)


class ProjectScheduler:
    """Schedule project pipeline runs with priority and concurrency limits."""

    def __init__(self, manager: ProjectManager, max_concurrent: int = 2) -> None:
        self.manager = manager
        self.max_concurrent = max_concurrent
        # Min-heap of _QueueEntry; names of in-flight projects.
        self._queue: list[_QueueEntry] = []
        self._running: set[str] = set()

    def enqueue(self, project_name: str, priority: int = 0) -> None:
        """Add a project to the run queue.

        Raises ``KeyError`` if the project is not registered in the manager.
        Silently ignores projects already queued or already running.
        """
        if project_name not in self.manager.projects:
            raise KeyError(f"Unknown project: {project_name}")
        # Avoid duplicate enqueue
        for entry in self._queue:
            if entry.project_name == project_name:
                logger.info("Project %s already in queue", project_name)
                return
        if project_name in self._running:
            logger.info("Project %s already running", project_name)
            return
        heapq.heappush(self._queue, _QueueEntry(priority=priority, project_name=project_name))
        logger.info("Enqueued project %s with priority %d", project_name, priority)

    def dequeue(self) -> str | None:
        """Remove and return the highest-priority project from the queue."""
        if not self._queue:
            return None
        entry = heapq.heappop(self._queue)
        return entry.project_name

    def next(self) -> str | None:
        """Get the next project that should run, if a slot is available."""
        if not self.can_start():
            return None
        name = self.dequeue()
        if name is not None:
            self._running.add(name)
        return name

    def can_start(self) -> bool:
        """Check whether there is capacity to start another run."""
        # NOTE: also requires at least one queued entry, not just a free slot.
        return len(self._running) < self.max_concurrent and len(self._queue) > 0

    def mark_done(self, project_name: str) -> None:
        """Mark a running project as finished (frees a concurrency slot)."""
        # discard() is idempotent: unknown names are ignored.
        self._running.discard(project_name)

    @property
    def queue_size(self) -> int:
        """Number of projects waiting in the queue."""
        return len(self._queue)

    @property
    def running_count(self) -> int:
        """Number of projects currently running."""
        return len(self._running)

    def get_status(self) -> dict[str, Any]:
        """Scheduler status overview."""
        return {
            "max_concurrent": self.max_concurrent,
            "running": sorted(self._running),
            "running_count": len(self._running),
            # sorted() orders entries by priority (dataclass order=True).
            "queued": [e.project_name for e in sorted(self._queue)],
            "queue_size": len(self._queue),
        }


================================================
FILE: researchclaw/prompts.py
================================================
"""Prompt externalization for the ResearchClaw pipeline.

All 23 stage prompts are defined here as defaults and can be overridden via
a user-provided YAML file. Users customize prompts without touching Python
source code.

Architecture
------------
* ``_DEFAULT_STAGES`` — every LLM-facing prompt, keyed by stage name.
* ``_DEFAULT_BLOCKS`` — reusable prompt fragments (topic constraint, etc.).
* ``_DEFAULT_SUB_PROMPTS`` — secondary prompts (code repair, etc.).
* ``PromptManager`` — loads defaults → merges user overrides → renders templates.
* ``_render()`` — safe ``{variable}`` substitution that leaves unmatched
  patterns (JSON schemas, curly-brace literals) untouched.

Usage
-----
::

    from researchclaw.prompts import PromptManager

    pm = PromptManager()                   # defaults only
    pm = PromptManager("my_prompts.yaml")  # with user overrides

    sp = pm.for_stage("topic_init", topic="RL for drug discovery", domains="ml, bio")
    resp = llm.chat(
        [{"role": "user", "content": sp.user}],
        system=sp.system,
        json_mode=sp.json_mode,
        max_tokens=sp.max_tokens,
    )
"""

from __future__ import annotations

import logging
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import yaml

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Template rendering
# ---------------------------------------------------------------------------
def _render(template: str, variables: dict[str, str]) -> str:
    """Replace ``{var_name}`` placeholders with *variables* values.

    Only bare ``{word_chars}`` tokens are substituted — JSON schema examples
    like ``{candidates:[...]}`` or ``{score_1_to_10:number}`` are left
    untouched because the regex requires the closing ``}`` immediately after
    the identifier.
    """

    def _replacer(match: re.Match[str]) -> str:
        key = match.group(1)
        # Unknown keys are left as-is (match.group(0)) rather than raising.
        return str(variables[key]) if key in variables else match.group(0)

    return re.sub(r"\{(\w+)\}", _replacer, template)


# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass(frozen=True)
class RenderedPrompt:
    """Fully rendered prompt ready for ``llm.chat()``."""

    system: str
    user: str
    json_mode: bool = False
    max_tokens: int | None = None


# ---------------------------------------------------------------------------
# PromptManager
# ---------------------------------------------------------------------------
class PromptManager:
    """Central registry for pipeline prompts with optional YAML overrides."""

    def __init__(self, overrides_path: str | Path | None = None) -> None:
        # Deep-copy defaults so mutations don't leak across instances
        self._stages: dict[str, dict[str, Any]] = {
            k: dict(v) for k, v in _DEFAULT_STAGES.items()
        }
        self._blocks: dict[str, str] = dict(_DEFAULT_BLOCKS)
        self._sub_prompts: dict[str, dict[str, Any]] = {
            k: dict(v) for k, v in _DEFAULT_SUB_PROMPTS.items()
        }
        if overrides_path:
            self._load_overrides(Path(overrides_path))

    # -- loading ----------------------------------------------------------
    def _load_overrides(self, path: Path) -> None:
        # Missing or malformed override files are non-fatal: defaults remain.
        if not path.exists():
            logger.warning("Prompts file not found: %s — using defaults", path)
            return
        try:
            data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
        except yaml.YAMLError as exc:
            logger.warning("Bad prompts YAML %s: %s — using defaults", path, exc)
            return
        # Stage overrides must match a known stage name.
        for stage_name, stage_data in (data.get("stages") or {}).items():
            if stage_name in self._stages and isinstance(stage_data, dict):
                self._stages[stage_name].update(stage_data)
            else:
                logger.warning("Unknown stage in prompts file: %s", stage_name)
        # Blocks may introduce NEW names; stages and sub_prompts may not.
        for block_name, block_text in (data.get("blocks") or {}).items():
            if isinstance(block_text, str):
                self._blocks[block_name] = block_text
        for sub_name, sub_data in (data.get("sub_prompts") or {}).items():
            if sub_name in self._sub_prompts and isinstance(sub_data, dict):
                self._sub_prompts[sub_name].update(sub_data)
        logger.info("Loaded prompt overrides from %s", path)

    # -- primary API ------------------------------------------------------
    def for_stage(
        self,
        stage: str,
        *,
        evolution_overlay: str = "",
        **kwargs: Any,
    ) -> RenderedPrompt:
        """Return a fully rendered prompt for *stage* with variables filled.

        If *evolution_overlay* is provided, it is appended to the user prompt
        so the LLM can learn from prior run lessons.

        Raises ``KeyError`` for an unknown stage name.
        """
        entry = self._stages[stage]
        kw = {k: str(v) for k, v in kwargs.items()}
        user_text = _render(entry["user"], kw)
        if evolution_overlay:
            user_text = f"{user_text}\n\n{evolution_overlay}"
        return RenderedPrompt(
            system=_render(entry["system"], kw),
            user=user_text,
            json_mode=entry.get("json_mode", False),
            max_tokens=entry.get("max_tokens"),
        )

    def system(self, stage: str) -> str:
        """Return the raw system prompt template for *stage*."""
        return self._stages[stage]["system"]

    def user(self, stage: str, **kwargs: Any) -> str:
        """Return the rendered user prompt for *stage*."""
        return _render(
            self._stages[stage]["user"],
            {k: str(v) for k, v in kwargs.items()},
        )

    def json_mode(self, stage: str) -> bool:
        """Whether *stage* requests JSON-mode output (default False)."""
        return self._stages[stage].get("json_mode", False)

    def max_tokens(self, stage: str) -> int | None:
        """Per-stage max_tokens limit, or None when unset."""
        return self._stages[stage].get("max_tokens")

    # -- blocks -----------------------------------------------------------
    def block(self, name: str, **kwargs: Any) -> str:
        """Render a reusable prompt block."""
        return _render(
            self._blocks[name],
            {k: str(v) for k, v in kwargs.items()},
        )

    # -- sub-prompts (code repair, etc.) ----------------------------------
    def sub_prompt(self, name: str, **kwargs: Any) -> RenderedPrompt:
        """Return a rendered sub-prompt (e.g. code_repair)."""
        entry = self._sub_prompts[name]
        kw = {k: str(v) for k, v in kwargs.items()}
        return RenderedPrompt(
            system=_render(entry["system"], kw),
            user=_render(entry["user"], kw),
        )

    # -- introspection ----------------------------------------------------
    def stage_names(self) -> list[str]:
        """All known stage names."""
        return list(self._stages.keys())

    def has_stage(self, stage: str) -> bool:
        """True when *stage* is a known stage name."""
        return stage in self._stages

    def export_yaml(self, path: Path) -> None:
        """Write current prompts (defaults + overrides) to a YAML file."""
        data: dict[str, Any] = {
            "version": "1.0",
            "blocks": dict(self._blocks),
            "stages": {k: dict(v) for k, v in self._stages.items()},
            "sub_prompts": {k: dict(v) for k, v in self._sub_prompts.items()},
        }
        path.write_text(
            yaml.dump(data, default_flow_style=False, allow_unicode=True, width=120),
            encoding="utf-8",
        )


# ========================================================================
# DEFAULT PROMPTS — edit prompts.yaml to override; do NOT edit these.
# ========================================================================

# -- Canonical section word-count targets ----------------------------------
# Single source of truth for per-section word-count ranges.
# Used by executor._validate_draft_quality() and converter.check_paper_completeness().
SECTION_WORD_TARGETS: dict[str, tuple[int, int]] = {
    "abstract": (180, 220),
    "introduction": (800, 1000),
    "related work": (600, 800),
    "method": (1000, 1500),
    "experiments": (800, 1200),
    "results": (600, 800),
    "discussion": (400, 600),
    "limitations": (200, 300),
    "conclusion": (200, 300),
    "broader impact": (200, 400),
}

# Aliases mapping heading variants to canonical names in SECTION_WORD_TARGETS.
_SECTION_TARGET_ALIASES: dict[str, str] = { "methods": "method", "methodology": "method", "proposed method": "method", "approach": "method", "experimental setup": "experiments", "experimental results": "results", "results and discussion": "results", "results and analysis": "results", "conclusions": "conclusion", "conclusion and future work": "conclusion", "summary": "conclusion", "background": "related work", "literature review": "related work", "prior work": "related work", "limitation": "limitations", "limitations and future work": "limitations", "broader impacts": "broader impact", "societal impact": "broader impact", "ethical considerations": "broader impact", } # -- Reusable blocks ----------------------------------------------------- _DEFAULT_BLOCKS: dict[str, str] = { "title_guidelines": ( "\n## TITLE RULES (Hard Constraints)\n" "1. MAXIMUM 14 words. Ideal: 8-12 words. NEVER exceed 14.\n" "2. Preferred structure: 'MethodName: Descriptive Phrase' (colon format)\n" " - Create a catchy 1-3 word method name (acronym, portmanteau, or evocative word)\n" " - Subtitle explains what it does: 'for X' / 'via Y' / 'in Z'\n" " - Examples: 'AlphaEdit: Null-Space Knowledge Editing for LMs' (8 words)\n" " - Examples: 'VAR: Visual Autoregressive Modeling via Next-Scale Prediction' (8 words)\n" "3. Alternative: Bold declarative claim that surprises the reader\n" " - 'Not All Tokens Are What You Need for Pretraining' (9 words)\n" " - 'Vision Transformers Need Registers' (4 words)\n" "4. FORBIDDEN patterns:\n" " - 'Investigating...', 'An Empirical Study of...', 'Towards...'\n" " - 'A Novel Approach to...', 'On the...' (generic academic filler)\n" " - Repeating the full method description as title\n" " - Weakness qualifiers: 'in Two Runs', 'Under Limited Data'\n" "5. MUST define a short method name (2-5 chars) that serves as memorable handle.\n" " The reader should be able to say 'Have you read the X paper?'\n" "6. 
No abbreviations unless universally known (LLM, RL, GAN, NLP are OK).\n" ), "abstract_structure": ( "\n## ABSTRACT (Hard Rules — 180-220 words, 5-7 sentences)\n" "STRUCTURE (PMR+ format):\n" "S1-S2: PROBLEM — What gap exists? Why does it matter? (NO method names yet)\n" "S3-S4: METHOD — Name your system. One-sentence description of key insight.\n" "S5-S6: RESULTS — At most 3 specific numbers. Use relative improvements\n" " ('X% over baseline') not raw values ('0.7667'). Bold the single most\n" " important result.\n" "S7 (optional): IMPACT — What does this enable?\n\n" "HARD CONSTRAINTS:\n" "- NO \\texttt{{}} in abstract\n" "- NO more than 3 numeric values in the entire abstract\n" "- NO per-seed breakdowns or confidence intervals\n" "- NO method names longer than 3 words (use the short system name)\n" "- The abstract must be readable by a researcher who skimmed only the title\n" "- First sentence must NOT start with 'We' or 'This paper'\n" ), "compute_budget": ( "\n## Compute Budget Constraint\n" "- Total execution time limit: {time_budget_sec} seconds\n" "- You MUST design experiments that complete within this budget\n" "- Estimate: a simple numpy loop runs ~10M iterations/sec; a nested loop over\n" " conditions runs proportionally slower\n" "- SCALING RULES (mandatory):\n" " - If total conditions > 100: reduce seeds to 3-5 (not 20)\n" " - If total conditions > 500: reduce to 2-3 representative conditions per factor\n" " - If time_budget < 300s: limit total optimization steps to ≤5,000 per run\n" " - If time_budget < 120s: limit total optimization steps to ≤1,000 per run\n" " - Always print intermediate results so partial data is captured on timeout\n" "- MANDATORY: print a 'TIME_ESTIMATE: Xs' line before the main loop,\n" " estimating total runtime based on a small pilot (run 1 condition, extrapolate)\n" "- MANDATORY: implement a time guard — check elapsed time periodically and\n" " stop gracefully if approaching 80% of budget, saving all results collected so 
far\n" "- MANDATORY: add NaN/divergence fast-fail guard:\n" " - After each optimization step, check if loss is NaN or > 100\n" " - If detected, print 'FAIL: NaN/divergence detected', save partial results, and exit\n" " - Do NOT waste compute on a diverging run\n" "- MINIMUM TRAINING EPOCHS (CRITICAL for meaningful results):\n" " - CIFAR-10/100 with ResNet/CNN: minimum 50 epochs (200 recommended)\n" " - FashionMNIST with small CNN: minimum 20 epochs\n" " - RL environments: follow the RL STEP BUDGET below (CRITICAL)\n" " - If time_budget is too short for minimum epochs, REDUCE model complexity\n" " or dataset size INSTEAD of reducing epochs. 8 epochs on CIFAR-10 will\n" " produce random-chance accuracy (~10%), making all comparisons meaningless.\n" " - Use a SMALL model (simple CNN, few layers) to fit enough epochs into the budget.\n" " - A converged small model is worth infinitely more than a diverged large model.\n" "- MANDATORY: use the experiment_harness module (pre-installed in sandbox):\n" " ```\n" " from experiment_harness import ExperimentHarness\n" " harness = ExperimentHarness(time_budget={time_budget_sec})\n" " # In your experiment loop:\n" " if harness.should_stop():\n" " break # graceful stop at 80% of budget\n" " if not harness.check_value(value, 'metric_name'):\n" " print('SKIP: NaN/Inf detected') # skip invalid values\n" " continue\n" " harness.report_metric('metric_name', value) # validated output\n" " # At the end of ALL experiments:\n" " harness.finalize() # writes results.json — MUST be called\n" " ```\n" " The harness provides: time budget enforcement, NaN/Inf detection,\n" " validated metric reporting, and results.json output. 
NOT using it\n" " means your metrics may be lost or malformed.\n" ), "topic_constraint": ( "\n\n=== HARD TOPIC CONSTRAINT ===\n" "The paper MUST be about: {topic}\n" "PROHIBITED content (unless user explicitly specifies case-study mode):\n" "- Do NOT treat environment setup, dependency installation, or infrastructure " "failures as a research contribution.\n" "- Do NOT present debugging logs, system errors, or configuration issues " "as experimental findings.\n" "- Do NOT drift to tangential topics not directly related to the stated topic.\n" "- Every section MUST connect back to the core research question.\n" "- The Abstract and Introduction MUST clearly state the research problem " "derived from: {topic}\n" "- The Method section MUST describe a technical approach, not a workflow.\n" "- The Results section MUST report quantitative outcomes of experiments, " "not environment status.\n" "=== END CONSTRAINT ===\n" ), "pkg_hint_sandbox": ( "\nAVAILABLE PACKAGES (sandbox mode): Python stdlib, numpy, math, random, " "statistics, json.\n" "Do NOT use: torch, tensorflow, jax, sklearn, pandas, scipy, matplotlib, " "or any deep learning framework.\n" "Write the experiment using ONLY numpy and stdlib.\n" ), "dataset_guidance": ( "\n## Standard Datasets & Real Baselines (MANDATORY when applicable)\n" "You MUST use real benchmark datasets — NEVER synthetic torch.randn() data.\n\n" "### Tier 1: Pre-cached (ALWAYS available, use download=False)\n" "These datasets are already in the Docker image. 
Use download=False:\n" "- `torchvision.datasets.CIFAR10(root='/opt/datasets', train=True/False, download=False)`\n" "- `torchvision.datasets.CIFAR100(root='/opt/datasets', train=True/False, download=False)`\n" "- `torchvision.datasets.MNIST(root='/opt/datasets', train=True/False, download=False)`\n" "- `torchvision.datasets.FashionMNIST(root='/opt/datasets', train=True/False, download=False)`\n" "- `torchvision.datasets.STL10(root='/opt/datasets', split='train'/'test', download=False)`\n" "- `torchvision.datasets.SVHN(root='/opt/datasets', split='train'/'test', download=False)`\n\n" "### Tier 2: Downloadable (use setup.py to download before main.py runs)\n" "For any dataset NOT in Tier 1, create a `setup.py` file that downloads it.\n" "setup.py runs WITH network access; main.py runs WITHOUT network.\n" "- Any torchvision dataset (Caltech-101, Flowers102, etc.)\n" "- HuggingFace datasets: `from datasets import load_dataset`\n" " Examples: IMDB, AG News, WikiText, SST-2, SQuAD, MMLU\n" "- OGB benchmarks: ogbg-molhiv, ogbn-arxiv, etc.\n" "- Tiny-ImageNet (237MB, 200 classes) — good ImageNet proxy\n\n" "### Tier 3: Too large for download (use alternatives)\n" "These datasets are TOO LARGE to download within experiment time limits:\n" "- ImageNet-1K (168GB) → use Tiny-ImageNet or CIFAR-100 as proxy\n" "- LAION (>1TB) → use smaller HuggingFace image-text datasets\n" "- Common Crawl, The Pile → use WikiText-103 or pre-tokenized subsets\n" "NEVER generate 'ImageNet-like' synthetic data — always use a real alternative.\n\n" "### ANTI-PATTERNS (NEVER DO THESE):\n" "- `torch.randn(N, 3, 224, 224)` as dataset → use real datasets\n" "- `download=True` in main.py → put downloads in setup.py\n" "- `download=False` for non-cached datasets → will FileNotFoundError\n" "- Random train/test splits → use official splits from dataset\n" "- `os.makedirs('/opt/datasets/...')` → /opt/datasets is READ-ONLY\n\n" "DATA PATH: For Tier 1 pre-cached datasets, use `/opt/datasets` as root.\n" "For 
Tier 2 datasets downloaded by setup.py, use `/workspace/data` as root.\n" "WARNING: `/opt/datasets` is READ-ONLY. NEVER call os.makedirs() on it.\n" "Just pass `root='/opt/datasets'` directly to torchvision dataset constructors.\n\n" "DISTRIBUTION SHIFT — use torchvision corruption transforms:\n" "- Gaussian noise: `transforms.Lambda(lambda x: x + torch.randn_like(x) * sigma)`\n" "- Brightness shift: `transforms.ColorJitter(brightness=0.5)`\n" "- Contrast shift: `transforms.ColorJitter(contrast=0.5)`\n" "- Blur: `transforms.GaussianBlur(kernel_size=5, sigma=(0.1, 2.0))`\n" "- For CIFAR-10-C style corruptions, apply transforms to test set only.\n\n" "REAL BASELINES & MODERN BENCHMARKS (CRITICAL):\n" "- Use proper train/test splits from the dataset (never split randomly in code)\n" "- Use standard architectures (ResNet-18/50, ViT, ConvNeXt) — not toy 2-layer MLPs\n" "- CIFAR INPUT SIZE (IMPORTANT): CIFAR images are 32×32. Two valid approaches:\n" " 1. PRETRAINED models (ImageNet weights): Use `transforms.Resize(224)` — " "pretrained models require 224×224 inputs.\n" " 2. TRAINING FROM SCRATCH (most experiments): Modify the model for 32×32 " "inputs instead of resizing. For ResNet: use `nn.Conv2d(3,64,3,1,1)` as " "first conv (not 7×7/stride-2) and REMOVE the initial MaxPool. This is 49× " "more memory-efficient and trains faster than Resize(224). Use the `timm` " "library's CIFAR variants or build a custom `get_resnet18_cifar()` helper.\n" "- Report standard metrics (top-1 accuracy for classification tasks)\n" "- Compare against published baselines where available\n" "- BASELINES MUST BE CURRENT: Use baselines from recent top-venue papers " "(2023-2026). 
Do NOT use outdated methods as the primary comparison.\n" " * AlexNet, VGG-16 → use ResNet-50, ViT, ConvNeXt instead\n" " * Vanilla SGD → use AdamW, SGD+momentum+cosine LR\n" " * Simple RNN/LSTM for NLP → use Transformer-based models\n" "- Include at LEAST one strong, modern baseline (near-SOTA).\n" "- BENCHMARKS MUST BE STANDARD and actively used in the community.\n\n" "WHEN TO USE SYNTHETIC DATA (required for these domains):\n" "- **PDE / Scientific computing**: Generate synthetic PDE data (Burgers " "equation, Darcy flow, heat equation, Navier-Stokes). Use numerical solvers " "(scipy.integrate, finite differences) to create ground truth.\n" "- **Combinatorial optimization** (TSP, graph coloring, scheduling): Generate " "random problem instances (random TSP cities, Erdos-Renyi graphs).\n" "- **Theoretical analysis**: Synthetic optimization landscapes, toy problems.\n" "- **Domain with no standard dataset**: Novel combinatorial or mathematical domains.\n" "For these domains, do NOT use CIFAR/MNIST/ImageNet — they are irrelevant. 
" "Generate problem-specific synthetic data in main.py.\n\n" "DOMAIN-DATASET MATCHING (CRITICAL):\n" "- Image classification → CIFAR-10/100, MNIST, ImageNet variants\n" "- NLP → IMDB, AG News, SST-2, WikiText\n" "- Graph learning → Cora, CiteSeer, ogbn-arxiv\n" "- PDE/Physics → SYNTHETIC (Burgers, Darcy, Navier-Stokes)\n" "- Combinatorial optimization → SYNTHETIC (random TSP, graph instances)\n" "- RL → Gymnasium environments (CartPole, LunarLander, HalfCheetah)\n" "NEVER use image datasets for non-image problems.\n" ), "setup_script_guidance": ( "\n## Setup Script (setup.py) — Dataset Download & Preparation\n" "If your experiment needs datasets NOT in the pre-cached list, generate " "a SEPARATE file called `setup.py` that downloads and prepares them.\n" "The setup.py runs WITH NETWORK ACCESS before main.py (which runs WITHOUT network).\n\n" "IMPORTANT: All download logic MUST be in setup.py, NOT in main.py.\n" "main.py should only load pre-cached data from /opt/datasets (download=False) " "or downloaded data from /workspace/data.\n\n" "Example setup.py:\n" "```python\n" "import os\n" "DATA_DIR = '/workspace/data'\n" "os.makedirs(DATA_DIR, exist_ok=True)\n\n" "# Download torchvision datasets\n" "import torchvision\n" "torchvision.datasets.Caltech101(root=DATA_DIR, download=True)\n\n" "# Download HuggingFace datasets\n" "from datasets import load_dataset\n" "ds = load_dataset('imdb', cache_dir=os.path.join(DATA_DIR, 'hf'))\n\n" "# Download OGB benchmarks\n" "# from ogb.graphproppred import PygGraphPropPredDataset\n" "# dataset = PygGraphPropPredDataset(name='ogbg-molhiv', root=DATA_DIR)\n\n" "print('[setup] Dataset download complete.')\n" "```\n\n" "IMPORT ANTI-PATTERN (NEVER DO THIS):\n" "```python\n" "from datasets import load_dataset\n" "datasets.load_dataset('imdb', ...) 
# WRONG — NameError!\n" "```\n" "If you write `from datasets import load_dataset`, call `load_dataset(...)` directly.\n" "If you write `import datasets`, call `datasets.load_dataset(...)` with module prefix.\n" "NEVER mix the two styles.\n\n" "If ALL your datasets are pre-cached (CIFAR-10/100, MNIST, FashionMNIST, " "STL-10, SVHN), you do NOT need setup.py — just use download=False in main.py.\n\n" "You may also include a `requirements.txt` file listing any additional " "pip packages your experiment needs beyond the pre-installed set.\n" ), "network_disabled_guidance": ( "\n## ⚠️ NO NETWORK ACCESS — CRITICAL CONSTRAINT ⚠️\n" "This experiment runs with network_policy='none'. There is NO network access\n" "at ANY phase (no pip install, no dataset downloads, no HTTP requests).\n\n" "### ONLY these pre-cached datasets are available:\n" "- `torchvision.datasets.CIFAR10(root='/opt/datasets', train=True/False, download=False)`\n" "- `torchvision.datasets.CIFAR100(root='/opt/datasets', train=True/False, download=False)`\n" "- `torchvision.datasets.MNIST(root='/opt/datasets', train=True/False, download=False)`\n" "- `torchvision.datasets.FashionMNIST(root='/opt/datasets', train=True/False, download=False)`\n" "- `torchvision.datasets.STL10(root='/opt/datasets', split='train'/'test', download=False)`\n" "- `torchvision.datasets.SVHN(root='/opt/datasets', split='train'/'test', download=False)`\n\n" "### FORBIDDEN (will cause runtime failure):\n" "- Do NOT create setup.py (it cannot run without network)\n" "- Do NOT create requirements.txt (pip install is unavailable)\n" "- Do NOT use `download=True` on any dataset\n" "- Do NOT use `urllib`, `requests`, `httpx`, or any HTTP library\n" "- Do NOT use `datasets.load_dataset()` from HuggingFace (requires download)\n" "- Do NOT import packages not pre-installed in the Docker image\n\n" "### Available pre-installed packages:\n" "torch, torchvision, torchaudio, numpy, scipy, sklearn, matplotlib, seaborn,\n" "pandas, tqdm, gymnasium, 
networkx, PyYAML, Pillow, timm, einops, torchmetrics,\n" "h5py, transformers, datasets, accelerate, peft, bitsandbytes.\n\n" "If your research topic requires a dataset NOT in the pre-cached list,\n" "you MUST adapt to use one of the 6 pre-cached datasets instead.\n" ), "network_full_guidance": ( "\n## Network Access: Full\n" "This experiment runs with network_policy='full'. Network access is available\n" "throughout ALL execution phases (setup, pip install, and main experiment).\n" "You may download datasets, install packages, and make HTTP requests at any time.\n" ), "hp_reporting": ( "\n## Hyperparameter Reporting (MANDATORY)\n" "At the TOP of main.py, define a HYPERPARAMETERS dictionary containing ALL " "tunable hyperparameters used in your experiment:\n" "```python\n" "HYPERPARAMETERS = {\n" " 'learning_rate': 0.001,\n" " 'batch_size': 64,\n" " 'num_epochs': 50,\n" " 'hidden_dim': 256,\n" " # ... all other hyperparameters\n" "}\n" "```\n" "At the end of main.py, save hyperparameters to results.json:\n" "```python\n" "import json\n" "results = {'hyperparameters': HYPERPARAMETERS, 'metrics': collected_metrics}\n" "with open('results.json', 'w') as f:\n" " json.dump(results, f, indent=2)\n" "```\n" "EVERY hyperparameter must be used in the code — no dead parameters.\n" "The paper MUST include a hyperparameter table — this data feeds into it.\n" ), "rl_step_guidance": ( "\n## RL Training Step Budget (MANDATORY for RL experiments)\n" "Reinforcement learning requires MANY more training steps than supervised learning.\n" "Under-trained RL agents produce random-chance performance, making ALL comparisons\n" "meaningless and the paper unpublishable.\n\n" "### Environment Availability:\n" "#### Always available (classic control — no extra dependencies):\n" "- CartPole-v1, Pendulum-v1, MountainCar-v0, MountainCarContinuous-v0,\n" " Acrobot-v1, LunarLander-v3\n" "- These are lightweight and fast — PREFER these unless MuJoCo is specifically required.\n\n" "#### MuJoCo 
environments (pre-installed in Docker image):\n" "- HalfCheetah-v5, Hopper-v5, Walker2d-v5, Ant-v5, Humanoid-v5,\n" " Swimmer-v5, Reacher-v5, InvertedPendulum-v5, InvertedDoublePendulum-v5\n" "- Require MuJoCo runtime — available in Docker but NOT in basic sandbox mode.\n\n" "#### RULE: If the research topic says 'MuJoCo-free', 'without MuJoCo',\n" " or 'classic control only' → you MUST use classic control environments ONLY.\n" " Do NOT import or reference MuJoCo in any way.\n\n" "#### DEFAULT RECOMMENDATION: Prefer classic control environments unless the\n" " research topic specifically requires MuJoCo locomotion tasks.\n\n" "### ALGORITHM-ENVIRONMENT COMPATIBILITY (HARD RULE — violation = crash):\n" "- DQN is ONLY for DISCRETE action spaces (CartPole, LunarLander, Acrobot, Atari).\n" " DQN will CRASH on Pendulum, HalfCheetah, Hopper, Walker2d, etc.\n" "- For CONTINUOUS action spaces: use SAC, TD3, or PPO.\n" "- PPO works for both discrete and continuous.\n" "- NEVER combine DQN + any continuous environment.\n\n" "### TIME BUDGET RULES FOR RL:\n" "- If time_budget ≤ 3600s → ONLY classic control " "(CartPole, Pendulum, MountainCar, Acrobot, LunarLander)\n" "- If time_budget ≤ 1800s → ONLY CartPole or Pendulum (simplest)\n" "- MuJoCo requires >5000s for meaningful results.\n\n" "### Minimum Steps by Algorithm Family:\n" "| Algorithm | Environment | Min Steps | Recommended |\n" "|-----------|-------------|-----------|-------------|\n" "| PPO | MuJoCo (Ant, HalfCheetah, Humanoid) | 500K | 1M-3M |\n" "| PPO | Simple control (CartPole, Pendulum) | 100K | 500K |\n" "| SAC/TD3 | MuJoCo locomotion | 300K | 1M |\n" "| SAC/TD3 | Simple control | 50K | 200K |\n" "| DQN/Rainbow | Atari | 1M | 10M |\n" "| A2C/A3C | Any continuous | 500K | 2M |\n" "| REINFORCE | Any | 200K | 1M |\n\n" "### Step Budget Allocation Strategy:\n" "1. Compute pilot_time = time for 1000 steps of 1 condition.\n" "2. steps_per_sec = 1000 / pilot_time.\n" "3. 
max_steps_per_condition = (time_budget * 0.7) / num_conditions * steps_per_sec.\n" "4. If max_steps < min_steps for the algorithm, REDUCE num_seeds to 3 (not steps).\n" "5. If STILL under min_steps, use a simpler environment (e.g., Pendulum instead of Ant).\n" "6. NEVER reduce steps below the minimum — it wastes compute on meaningless results.\n\n" "### Evaluation Protocol for RL:\n" "- Evaluate every N_eval steps (e.g., every 10K steps) using deterministic policy.\n" "- Run 10 evaluation episodes per checkpoint.\n" "- Report: mean return, std return, success rate (if applicable).\n" "- Plot learning curves (return vs steps) — this is EXPECTED by reviewers.\n" "- Final metric = mean over last 10 evaluation checkpoints (NOT last episode).\n\n" "### Gymnasium Environment Version (CRITICAL):\n" "- Use v5 environments (NOT v4): `gym.make('HalfCheetah-v5')`, `gym.make('Hopper-v5')`\n" "- v4 environments are deprecated and will produce warnings.\n" "- Available MuJoCo v5 envs: HalfCheetah-v5, Hopper-v5, Walker2d-v5, Ant-v5,\n" " Humanoid-v5, Swimmer-v5, Reacher-v5, InvertedPendulum-v5, InvertedDoublePendulum-v5\n" "- For simple/fast experiments: use Pendulum-v1, CartPole-v1, MountainCarContinuous-v0\n\n" "### Gymnasium API (CRITICAL — common crash source):\n" "- `env.reset()` returns `(obs, info)` — ALWAYS unpack both:\n" " `obs, info = env.reset(seed=seed)`\n" "- `env.step(action)` returns `(obs, reward, terminated, truncated, info)` — 5 values:\n" " `obs, reward, terminated, truncated, info = env.step(action)`\n" " `done = terminated or truncated`\n" "- DO NOT use old `done = env.step(action)[2]` — this is the Gym (v0.26-) API.\n" "- `reward` is a scalar float, NOT an array. 
Do NOT index it: use `reward` directly.\n" "- `obs` shape depends on env: discrete envs give 1D array, image envs give 3D.\n" " Always check `env.observation_space.shape` and handle accordingly.\n\n" "### Learning Curve Logging (MANDATORY for RL papers):\n" "- Print evaluation metrics at regular intervals: every N_eval steps\n" " `EVAL: step= condition= seed= return=`\n" "- This enables plotting learning curves (return vs training steps)\n" "- Learning curves are EXPECTED by RL reviewers — a paper without them\n" " will be rejected regardless of final performance.\n" "- At the end, print the full curve:\n" " `LEARNING_CURVE: condition= seed= steps=[...] returns=[...]`\n" ), "multi_seed_enforcement": ( "\n## Multi-Seed Experiment Requirement (MANDATORY — NO EXCEPTIONS)\n" "Running each condition with only 1 seed is NEVER acceptable. Results from\n" "a single seed cannot distinguish signal from noise and reviewers will reject.\n\n" "### HARD REQUIREMENT:\n" "- You MUST use exactly seeds = [0, 1, 2] (3 seeds minimum).\n" "- Each condition MUST loop over ALL seeds.\n" "- Print per-seed: `condition=X seed=S {metric_key}: V`\n" "- Print aggregated: `condition=X {metric_key}_mean: M {metric_key}_std: S`\n" "- Tables MUST show mean ± std, NEVER single-run values.\n\n" "### Implementation Pattern (copy this structure):\n" "```python\n" "SEEDS = [0, 1, 2] # EXACTLY 3 seeds — mandatory minimum\n" "all_results = {} # {condition_name: {seed: metric_value}}\n\n" "for condition_name, ConditionClass in conditions.items():\n" " all_results[condition_name] = {}\n" " for seed in SEEDS:\n" " set_all_seeds(seed) # torch, numpy, random\n" " result = run_single(ConditionClass, seed=seed)\n" " all_results[condition_name][seed] = result\n" " print(f'condition={condition_name} seed={seed} metric: {result}')\n" " values = list(all_results[condition_name].values())\n" " print(f'condition={condition_name} metric_mean: {np.mean(values):.4f} '\n" " f'metric_std: {np.std(values):.4f}')\n" 
"```\n\n" "### Reporting Requirements:\n" "- Print per-seed results: `condition=X seed=S metric: V`\n" "- Print aggregated: `condition=X metric_mean: M metric_std: S`\n" "- Tables in the paper MUST show mean ± std, NEVER single-run values.\n" "- If time budget forces < 5 seeds, use EXACTLY 3 seeds (minimum).\n" " Print: `SEED_WARNING: only 3 seeds used due to time budget`.\n" ), "writing_structure": ( "\n## Paper Section Writing Rules\n" "MARKDOWN FORMATTING (CRITICAL):\n" "- Use `# Title` (H1) for the paper title\n" "- Use `# Abstract`, `# Introduction`, `# Method`, etc. (H1) for MAIN sections\n" "- Use `## Subsection Name` (H2) for subsections WITHIN a main section\n" "- NEVER use `##` for main sections — that produces wrong LaTeX heading levels\n" "- Each main section (H1) MUST contain subsections (H2) when it exceeds 3 paragraphs\n" "- NEVER place sub-topics (e.g., 'Knowledge Distillation for Compact Models') " "at the same heading level as main sections (e.g., 'Related Work')\n" "- NEVER wrap the paper in ```markdown fences\n" "- NEVER use raw variable names (e.g., `method_name/metric_key = 0.85`) — " "always use human-readable text\n\n" "ABSTRACT (150-200 words, 5-sentence structure):\n" "- (1) Problem and significance (2) Prior approaches and gaps\n" "- (3) Your approach and novelty (4) Key results with 2-3 specific numbers\n" "- (5) Implication/takeaway\n" "- Do NOT list per-seed ranges (e.g., '0.71-0.73 across seeds') — use mean +/- std\n" "- Do NOT repeat numbers that appear in the Results section — pick the 2-3 most impactful\n\n" "INTRODUCTION (4 paragraphs, 800-1000 words, cite 8-12 references):\n" "Paragraph 1: Problem motivation (why this matters). " "Paragraph 2: What exists and why it falls short. " "Paragraph 3: Your approach and key insight. " "Paragraph 4: Contributions (2-3 bullet points allowed here ONLY).\n\n" "RELATED WORK:\n" "Organize by sub-topic, not chronologically. " "End each paragraph with how YOUR work differs from the cited work. 
" "Cite at least 15 references, all directly relevant.\n\n" "METHOD:\n" "Write as flowing narrative prose (NOT bullet points). " "Include full algorithm description with pseudocode or step-by-step. " "State all hyperparameters with values and justification. " "Provide architecture details sufficient for reproduction.\n\n" "RESULTS:\n" "- Do NOT repeat the same number more than twice across the paper\n" "- Each number in a table should be discussed AT MOST once in text\n" "- Tables: mean +/- std with 95% CI in parentheses\n" "- Bold the best result in each column\n" "- Every comparison claim must cite a p-value or note multiple seeds\n" "- Report the number of random seeds/runs used\n\n" "FIGURES AND TABLES:\n" "- Every figure MUST be referenced in the text (e.g., 'As shown in Figure 1')\n" "- Every table MUST be referenced in the text (e.g., 'Table 2 summarizes')\n" "- Figure captions: 1-2 descriptive sentences (not just 'Results comparison')\n" "- Table captions go ABOVE the table; figure captions go BELOW the figure\n" "- Axis labels must include units where applicable\n" "- Use consistent font sizes across all figures\n\n" "DISCUSSION (if applicable, can be merged into Results):\n" "- Paragraph 1: Summarize key findings and their significance\n" "- Paragraph 2: Compare with prior work — explain WHY results differ\n" "- Paragraph 3: Discuss unexpected or negative results honestly\n" "- Paragraph 4: Broader implications and practical applications\n\n" "LIMITATIONS (3-5 points):\n" "- State each limitation ONCE, here only — not scattered throughout\n" "- No disclaimers like 'due to computational constraints'\n" "- Include compute resources used (GPU type, training time)\n\n" "CONCLUSION:\n" "- Summarize findings (match actual results, no aspirational claims)\n" "- 2-3 sentences of future work\n\n" "PROSE QUALITY (CRITICAL — violation = desk reject):\n" "- Write FLOWING ACADEMIC PARAGRAPHS, not bullet-point lists.\n" "- Each paragraph must have 4-8 sentences with 
smooth transitions.\n" "- Introduction, Related Work, and Method must each be >=3 paragraphs.\n" "- FORBIDDEN: starting 3+ consecutive paragraphs with the same word.\n" "- FORBIDDEN: bullet-point lists in Introduction or Related Work sections.\n" "- Use varied sentence structures: mix simple, compound, and complex sentences.\n" "- Connect paragraphs with transition phrases: 'Building on this insight...', " "'In contrast to prior work...', 'To address this limitation...'.\n" "- Each Related Work paragraph must COMPARE your approach to cited work, " "not merely summarize what each paper does.\n" "- FORBIDDEN AI-BOILERPLATE phrases (instant credibility loss):\n" " 'delves into', 'it is worth noting', 'plays a crucial role',\n" " 'leverages the power of', 'paves the way', 'a myriad of',\n" " 'paradigm shift', 'groundbreaking', 'in the realm of',\n" " 'holistic approach', 'multifaceted', 'navigate the complexities'.\n" " Replace ALL such phrases with precise, specific academic language.\n" ), "llm_training_guidance": ( "\n## LLM Fine-Tuning Guidance (when topic involves language model training)\n" "AVAILABLE FRAMEWORKS (pre-installed in Docker):\n" "- transformers (AutoModelForCausalLM, AutoTokenizer, Trainer)\n" "- peft (LoraConfig, get_peft_model, PeftModel)\n" "- trl (SFTTrainer, DPOTrainer, GRPOTrainer)\n" "- datasets (load_dataset, Dataset)\n" "- accelerate (Accelerator)\n" "- bitsandbytes (4-bit/8-bit quantization)\n\n" "GPU MEMORY GUIDELINES (RTX 6000 Ada, 49GB VRAM):\n" "- Full fine-tune: <=3B parameters\n" "- LoRA (16-bit): <=14B parameters\n" "- QLoRA (4-bit): <=72B parameters (practical limit ~14B for training)\n" "- Optimal: 7B-14B model with QLoRA (rank 16-64)\n\n" "RECOMMENDED TRAINING PATTERN:\n" "```python\n" "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n" "from peft import LoraConfig, get_peft_model, TaskType\n" "from trl import SFTTrainer, SFTConfig\n" "from datasets import load_dataset\n\n" "# 4-bit quantization 
for memory efficiency\n" "bnb_config = BitsAndBytesConfig(\n" " load_in_4bit=True,\n" " bnb_4bit_quant_type='nf4',\n" " bnb_4bit_compute_dtype=torch.bfloat16,\n" ")\n" "model = AutoModelForCausalLM.from_pretrained(\n" " model_name, quantization_config=bnb_config, device_map='auto'\n" ")\n" "lora_config = LoraConfig(\n" " r=16, lora_alpha=32, target_modules='all-linear',\n" " lora_dropout=0.05, task_type=TaskType.CAUSAL_LM,\n" ")\n" "model = get_peft_model(model, lora_config)\n" "```\n\n" "KEY HYPERPARAMETERS:\n" "- learning_rate: 1e-4 to 2e-4 (LoRA), 5e-5 to 1e-4 (full FT)\n" "- lora_r: 8 (minimal) to 64 (high-capacity)\n" "- lora_alpha: typically 2x lora_r\n" "- batch_size: 1-4 per device (use gradient_accumulation_steps for effective batch)\n" "- gradient_accumulation_steps: 4-16 (effective_batch = per_device * accum)\n" "- max_seq_length: 512 (short), 1024-2048 (standard), 4096 (long)\n" "- warmup_ratio: 0.03-0.1\n" "- weight_decay: 0.01-0.1\n\n" "DATA FORMAT (use datasets library):\n" "- Instruction tuning: {'instruction': '...', 'output': '...'}\n" "- Chat format: {'messages': [{'role': 'user', 'content': '...'}, ...]}\n" "- DPO: {'prompt': '...', 'chosen': '...', 'rejected': '...'}\n" "- Use load_dataset('json', data_files='train.json') for local data\n" "- Use load_dataset('HuggingFace/dataset_name') for HF Hub datasets\n\n" "EVALUATION:\n" "- Use evaluate library for standard metrics\n" "- Common: perplexity, ROUGE (summarization), BLEU (translation), accuracy\n" "- LLM benchmarks: MMLU, ARC, HellaSwag, TruthfulQA\n" "- Generate sample outputs for qualitative comparison\n\n" "MODEL DOWNLOAD:\n" "- Models will be downloaded from HuggingFace Hub at runtime\n" "- Use 'trust_remote_code=True' for custom model architectures\n" "- Cache directory: default HF cache (~/.cache/huggingface)\n" "- Common models: Qwen/Qwen2.5-7B, meta-llama/Llama-3.1-8B, " "microsoft/Phi-4, google/gemma-2-9b\n\n" "CRITICAL — NO SIMULATION:\n" "- You MUST load and train a REAL model 
from HuggingFace Hub.\n" "- NEVER simulate training with synthetic utility functions or random scores.\n" "- NEVER replace model training with np.random/torch.randn mock results.\n" "- A real experiment loads a model, tokenizes data, runs optimizer steps, " "and measures real loss/perplexity/accuracy on held-out data.\n" "- If compute budget is tight, use a SMALLER model (Qwen2.5-0.5B or 1.5B) " "with fewer training steps rather than simulating.\n" ), "llm_eval_guidance": ( "\n## LLM Evaluation Guidance\n" "STANDARD BENCHMARKS:\n" "- Reasoning: MMLU, ARC-Challenge, HellaSwag, WinoGrande\n" "- Math: GSM8K, MATH, MathVista\n" "- Coding: HumanEval, MBPP, LiveCodeBench\n" "- Safety: TruthfulQA, BBQ, CrowS-Pairs\n" "- Instruction following: MT-Bench, AlpacaEval, IFEval\n" "- Multimodal: MMBench, POPE, MathVista, MMMU\n\n" "EVALUATION FRAMEWORKS:\n" "- lm-eval-harness: Standard eval framework, run via CLI or Python API\n" "- vllm: Fast inference engine for throughput-focused evaluation\n" "- lighteval: HuggingFace's lightweight eval framework\n\n" "EVALUATION PROTOCOL:\n" "- Report on at least 3 benchmarks relevant to the task\n" "- Compare with published baselines from model cards/leaderboards\n" "- Report both zero-shot and few-shot results where applicable\n" "- Include perplexity on held-out test set\n" ), # IMP-20: Academic writing style guide (from NeurIPS/ICLR/ICML 2024-2025 best papers) "academic_style_guide": ( "\n## ACADEMIC WRITING STANDARDS (from NeurIPS/ICLR/ICML 2024-2025 best papers)\n\n" "### Title Standards\n" "- Target 8-14 words. Median of award-winning papers: ~10 words.\n" "- Preferred format: 'SystemName: Descriptive Subtitle' (35% of best papers)\n" " e.g., 'AlphaEdit: Null-Space Constrained Knowledge Editing for Language Models'\n" "- Alternative: Declarative statement that surprises\n" " e.g., 'Not All Tokens Are What You Need for Pretraining'\n" "- Give your method a memorable, catchy name (VAR, Genie, PRISM, SEDD).\n" "- NEVER exceed 18 words. 
NEVER use 'A Novel Approach to...' or 'Investigating...'\n\n" "### Abstract Standards (PMR+ Structure, 180-220 words)\n" "S1-S2: PROBLEM — State the gap. Open with a challenge or status-quo critique.\n" "S3-S4: METHOD — Name your system by sentence 3. Describe the key insight.\n" "S5-S6: RESULTS — At least 2-3 concrete quantitative claims:\n" " - One relative improvement ('36.7% boost over baseline')\n" " - One absolute benchmark score ('FID of 1.01 on ImageNet')\n" "AVOID: Per-seed ranges, excessive texttt, defensive hedging.\n\n" "### Section Writing Standards\n" "INTRODUCTION (800-1000 words, 4 paragraphs):\n" " - Para 1: Motivation; Para 2: Gap (cite 3-5 papers); Para 3: Your approach;\n" " Para 4: Contributions (bullet list of 3-4 specific contributions)\n" " - MUST cite 8-12 references throughout Introduction\n\n" "RELATED WORK (600-800 words):\n" " - Organize by sub-topic (2-3 subsections), NOT as a flat list\n" " - End each subsection with how YOUR work differs\n" " - Target >= 15 unique references in this section alone\n\n" "METHOD (1000-1500 words):\n" " - Start with problem formulation (notation, objective function)\n" " - Use algorithm environment for pseudocode (not verbatim)\n" " - Write as a flowing narrative, NOT bullet points\n\n" "EXPERIMENTS (800-1200 words):\n" " - Experimental setup as subsection (datasets, baselines, metrics, hardware)\n" " - Hyperparameter table (Table 1 always)\n" " - MUST reference figures: 'As shown in Figure 1, our method...'\n" " - MUST cite baseline method papers (not just name them)\n\n" "RESULTS (600-800 words):\n" " - Main results table with descriptive caption\n" " - Ablation study table\n" " - Analysis paragraphs connecting numbers to insights\n" " - DO NOT repeat the same numbers from Experiments section\n" " - Reference figures for visual evidence\n\n" "DISCUSSION (400-600 words):\n" " - Compare findings with prior work (cite papers here!)\n" " - Explain surprising results; broader implications\n\n" "LIMITATIONS 
(200-300 words): 3-5 specific, concrete limitations. ALL caveats go HERE.\n\n" "CONCLUSION: Summarize in 2-3 sentences, future work in 2-3 sentences.\n\n" "### Writing Quality Rules\n" "- Write as FLOWING PROSE, not bullet points or enumerated lists\n" "- Each paragraph: topic sentence, evidence, analysis, transition\n" "- Use transitions: 'Building on this insight...', 'In contrast to...'\n" "- Academic tone: confident but precise\n" "- Vary sentence structure: mix short declarative with longer analytical\n" "- AVOID: Starting 3+ consecutive sentences with 'We', 'The', 'Our'\n" "- AVOID: 'It is worth noting that', 'It should be mentioned that' (filler)\n" "- Citations belong in EVERY section, not just Introduction and Related Work\n" ), # IMP-25: Narrative writing requirements "narrative_writing_rules": ( "\n## NARRATIVE WRITING REQUIREMENTS\n\n" "You are writing a paper for human reviewers at a top AI conference. The paper\n" "must read like a cohesive academic story, NOT a technical report or bullet list.\n\n" "### Structure of Each Paragraph\n" "Every paragraph MUST follow this pattern:\n" "1. TOPIC SENTENCE — states the main claim or finding\n" "2. EVIDENCE — data, citations, or reasoning that supports the claim\n" "3. ANALYSIS — what the evidence means, why it matters\n" "4. TRANSITION — connects to the next paragraph's topic\n\n" "### FORBIDDEN Writing Patterns\n" "- Bullet-point lists in the main body (ONLY allowed in Contributions paragraph\n" " of Introduction and Limitations section)\n" "- Numbered lists of findings or results\n" "- Starting a paragraph with 'Table X shows...' 
without context first\n" "- Consecutive short sentences without analysis between them\n" "- Repeating the same sentence structure 3+ times in a row\n\n" "### REQUIRED Writing Patterns\n" "- Transition phrases: 'Building on this observation...', 'In contrast to prior work...'\n" "- Vary sentence length: alternate between short impactful and longer analytical\n" "- Ground every claim in evidence: '[Result] because [mechanism] (cite)'\n" "- Discuss implications: 'This X% improvement indicates that [mechanism Y]\n" " is more effective than [mechanism Z] for [context]'\n" "- For temporal data: describe trends in prose rather than bullet-point lists\n\n" "### Example: BAD vs GOOD Method Description\n" "BAD (bullet-list style):\n" " 'Our method has three components:\n" " - Component A\n" " - Component B\n" " - Component C'\n\n" "GOOD (narrative style):\n" " 'Our method builds on the insight that [core problem] stems from\n" " [root cause identified in Section 2]. To address this, we introduce\n" " [MethodName], a [N]-stage framework. First, [Stage 1] maps inputs\n" " to [representation]. These representations feed into [Stage 2],\n" " enabling [benefit] without [drawback of prior approaches].\n" " Crucially, we augment this with [Stage 3] based on [technical\n" " foundation] (cite original paper), triggering [mechanism] when\n" " [condition is met].'\n" " NOTE: Replace all [placeholders] with YOUR actual method details.\n" " Do NOT copy this template verbatim.\n" ), # IMP-31: Anti-hedging rules "anti_hedging_rules": ( "\n## ANTI-HEDGING RULES (MANDATORY)\n" "1. The following phrases are BANNED from the paper body:\n" " - 'we do not claim' / 'we cannot claim'\n" " - 'we intentionally frame this conservatively'\n" " - 'the evidence does not support' (unless followed by what it DOES support)\n" " - 'only N seeds/runs' (belongs ONLY in Limitations, stated ONCE)\n" " - 'this paper is not' / 'we do not' as paragraph openers\n" "2. 
Limitations and caveats MUST be consolidated in the Limitations section.\n" " They may NOT appear in Introduction, Method, Results, or Conclusion.\n" "3. Confidence framing: Instead of 'we cannot prove X', write 'our results\n" " provide evidence for X' or 'X is supported by [metrics]'.\n" "4. If you have a negative result, frame it as an INSIGHT:\n" " BAD: 'Our method failed to outperform the baseline, we do not claim...'\n" " GOOD: 'Surprisingly, the standard baseline proved competitive, suggesting\n" " that [insight about why] — an observation with practical implications for...'\n" ), # IMP-24: Anti-repetition rules "anti_repetition_rules": ( "\n## ANTI-REPETITION RULE\n" "Each specific number (e.g., '0.7667', '36.7%') may appear in AT MOST 2 sections:\n" " - Once in Results/Experiments (where it is first reported)\n" " - Once in Abstract (as a summary highlight)\n" "The Introduction, Discussion, and Conclusion MUST refer to results qualitatively\n" "('significantly outperformed', 'X% improvement') WITHOUT repeating exact numbers\n" "from the Results section. Violation of this rule will result in desk rejection.\n" ), } # -- Debate role prompts (multi-perspective generation) ------------------- DEBATE_ROLES_HYPOTHESIS: dict[str, dict[str, str]] = { "innovator": { "system": ( "You are a bold, creative researcher who thinks outside the box. " "You pursue high-risk high-reward ideas, draw cross-domain analogies, " "and propose counter-intuitive hypotheses that challenge mainstream thinking." ), "user": ( "Generate at least 2 novel, unconventional hypotheses from the synthesis below.\n" "CRITICAL REQUIREMENTS for EVERY hypothesis:\n" "1. NOVELTY: Must go beyond incremental combination of existing methods.\n" "2. FEASIBILITY: Must be testable within 30 minutes of compute on a single GPU.\n" "3. 
FALSIFIABILITY: Must define a specific metric threshold that would reject it.\n" "For each hypothesis provide:\n" "- A bold claim that pushes boundaries\n" "- Cross-domain inspiration (if applicable)\n" "- Rationale grounded in the literature gaps\n" "- Measurable prediction and failure condition\n" "- Estimated risk level (low/medium/high)\n\n" "Topic: {topic}\n" "Synthesis:\n{synthesis}" ), }, "pragmatist": { "system": ( "You are a practical ML engineer focused on what actually works. " "You prioritize computational feasibility, engineering simplicity, " "reliable baselines, and incremental but solid improvements." ), "user": ( "Generate at least 2 feasible, well-grounded hypotheses from the synthesis below.\n" "For each hypothesis provide:\n" "- A concrete, testable claim with clear methodology\n" "- Why this is achievable with limited compute\n" "- Rationale based on proven techniques\n" "- Measurable prediction and failure condition\n" "- Resource requirements estimate\n\n" "Topic: {topic}\n" "Synthesis:\n{synthesis}" ), }, "contrarian": { "system": ( "You are a rigorous devil's advocate who challenges assumptions. " "You find blind spots, hidden failure modes, and counter-evidence. " "Your value is in finding problems others ignore. Be provocative " "but always grounded in evidence." ), "user": ( "Critically examine the synthesis and generate at least 2 contrarian hypotheses.\n" "For each hypothesis provide:\n" "- A challenge to a widely-held assumption in this area\n" "- Evidence or reasoning for why the mainstream view may be wrong\n" "- An alternative hypothesis that accounts for overlooked factors\n" "- Measurable prediction and failure condition\n" "- Potential negative results that would be informative\n\n" "Topic: {topic}\n" "Synthesis:\n{synthesis}" ), }, } DEBATE_ROLES_ANALYSIS: dict[str, dict[str, str]] = { "optimist": { "system": ( "You highlight positive findings, promising extensions, and silver linings " "in experimental results. 
You identify what worked well and why, " "and suggest how to build on successes." ), "user": ( "Analyze the experiment results from an optimistic perspective.\n" "Cover:\n" "- What worked well and why\n" "- Unexpected positive findings\n" "- Promising extensions and next steps\n" "- Silver linings in any negative results\n\n" "{preamble}\n{data_context}\n" "Run context:\n{context}" ), }, "skeptic": { "system": ( "You question the significance of results with maximum rigor. " "You check statistical validity, identify confounds, and demand " "stronger evidence. Every claim must earn its place." ), "user": ( "Critically scrutinize the experiment results.\n" "Cover:\n" "- Statistical concerns (significance, sample size, multiple comparisons)\n" "- Potential confounds and alternative explanations\n" "- Missing evidence or controls\n" "- Whether metrics truly capture the intended phenomenon\n\n" "{preamble}\n{data_context}\n" "Run context:\n{context}" ), }, "methodologist": { "system": ( "You scrutinize HOW experiments were conducted. You audit " "internal/external validity, reproducibility, baseline fairness, " "and evaluation protocols." ), "user": ( "Audit the experimental methodology.\n" "Cover:\n" "- Baseline fairness and completeness\n" "- Metric appropriateness for the research question\n" "- Evaluation protocol (data leakage, contamination risks)\n" "- Ablation completeness\n" "- Reproducibility assessment\n" "- Specific methodology improvements needed\n\n" "{preamble}\n{data_context}\n" "Run context:\n{context}" ), }, } # -- Sub-prompts (secondary LLM calls within a stage) -------------------- _DEFAULT_SUB_PROMPTS: dict[str, dict[str, Any]] = { "hypothesis_synthesize": { "system": ( "You are a senior research director synthesizing multiple perspectives " "into a decisive research proposal. The best synthesis is not a " "compromise but takes the strongest elements from each viewpoint. " "Preserve genuine disagreements — do not flatten controversy." 
), "user": ( "Below are hypotheses generated from three different research perspectives.\n" "Synthesize them into a final set of 2-4 hypotheses that:\n" "1. Take the strongest, most novel ideas\n" "2. Address critical concerns raised by the contrarian\n" "3. Ensure feasibility (pragmatist's input)\n" "4. Note unresolved disagreements between perspectives\n" "5. For each final hypothesis: rationale, measurable prediction, " "failure condition\n\n" "{perspectives}" ), }, "analysis_synthesize": { "system": ( "You are a senior research director synthesizing multiple analytical " "perspectives into a comprehensive assessment. Find the truth — if " "the skeptic or methodologist raise valid concerns, acknowledge them. " "Do not suppress criticism." ), "user": ( "Below are analyses from three different perspectives (optimist, " "skeptic, methodologist).\n" "Produce a unified analysis that:\n" "1. Identifies consensus points (high-confidence conclusions)\n" "2. Resolves conflicts with evidence-based judgment\n" "3. Rates result quality (1-10 with justification)\n" "4. Lists 3-5 key findings\n" "5. Notes methodology gaps that need addressing\n" "6. Gives a clear PROCEED/PIVOT/REFINE recommendation\n\n" "Required sections: Metrics Summary, Consensus Findings, " "Contested Points, Statistical Checks, Methodology Audit, " "Limitations, Conclusion.\n\n" "{perspectives}" ), "max_tokens": 8192, }, "code_repair": { "system": "You fix Python code validation errors while preserving functionality.", "user": ( "The file `{fname}` in the experiment project has validation errors. " "Fix ALL issues and return ONLY the corrected file.\n\n" "## Validation Issues in {fname}\n{issues_text}\n\n" "## All Project Files\n{all_files_ctx}\n\n" "IMPORTANT: Do NOT use subprocess, os.system, eval, exec, or any " "network/shell calls.\n" "NUMPY 2.x: np.trapz→np.trapezoid, np.erfinv→scipy.special.erfinv, " "np.bool/int/float→Python builtins.\n" "Return ONLY the corrected code for `{fname}`." 
), }, "iterative_improve": { "system": ( "You improve experiment projects and return valid executable Python code. " "Use ```filename:xxx.py format for each file." ), "user": ( "Improve the experiment code based on prior run results.\n" "Return the improved files using ```filename:xxx.py format for each file.\n" "Primary metric key: {metric_key}\n" "Metric direction: {metric_direction}\n" "Do not use subprocess, os.system, eval, exec, or any network/shell calls.\n" "NUMPY 2.x: np.trapz→np.trapezoid, np.erfinv→scipy.special.erfinv, " "np.bool/int/float→Python builtins, np.math→math.\n\n" "EXPERIMENT PLAN ANCHOR (CRITICAL — read before making changes):\n" "The research topic is: {topic}\n" "{exp_plan_anchor}" "RULES FOR REFINEMENT:\n" "- NEVER rename, remove, or replace existing condition names. " "The condition names in the code MUST match the experiment plan.\n" "- NEVER add new conditions that are not in the experiment plan.\n" "- ONLY improve the IMPLEMENTATION of existing conditions " "(fix bugs, tune hyperparameters, improve training loops).\n" "- If the code has fundamental issues (wrong algorithm, missing " "components), fix the implementation but keep the same condition " "names and class hierarchy.\n\n" "{condition_coverage_hint}" "SEED ENFORCEMENT (MANDATORY — BUG-183):\n" "- You MUST use exactly seeds = [0, 1, 2] (3 seeds minimum).\n" "- Each condition MUST loop over ALL seeds.\n" "- Print per-seed: condition=X seed=S {metric_key}: V\n" "- Print aggregated: condition=X {metric_key}_mean: M {metric_key}_std: S\n" "- If 3 seeds × all conditions exceeds the time budget, REDUCE training " "epochs or conditions — NEVER reduce seed count below 3.\n\n" "CONDITION COUNT LIMIT (HARD RULE):\n" "- MAXIMUM 8 total conditions (baselines + methods + ablations).\n" "- If the previous code had >8 conditions, consolidate ablations to 2-3 values.\n\n" "DOCKER MOUNT TOPOLOGY (for fixing PermissionError/path issues):\n" "- WRITABLE: /workspace/ (project files), /tmp/, 
/workspace/data/\n" "- READ-ONLY: /opt/datasets/ (pre-cached CIFAR-10/100, MNIST, etc)\n" "- If you see PermissionError on /opt/datasets, do NOT call " "os.makedirs() there. Use root='/opt/datasets' with download=False.\n" "- For new data downloads, use /workspace/data/ as root.\n\n" "Current project files:\n{files_context}\n" "Run summaries (JSON):\n{run_summaries}" ), "max_tokens": 8192, }, "iterative_repair": { "system": "You fix Python validation issues without adding unsafe behavior.", "user": ( "Fix all validation issues in main.py and return corrected Python code only.\n\n" "## Validation Issues\n{issue_text}\n\n" "## Common RL Stability Fixes (apply if NaN/divergence detected):\n" "- Add gradient clipping: `torch.nn.utils.clip_grad_norm_(params, 1.0)`\n" "- Lower learning rate to 1e-4 or 3e-4\n" "- Add reward normalization/clipping: `reward = np.clip(reward, -10, 10)`\n" "- Add NaN guard: `if torch.isnan(loss): continue`\n" "- Use float32 (not float16) for RL value functions\n" "- NUMPY 2.x: np.trapz→np.trapezoid, np.erfinv→scipy.special.erfinv, " "np.bool/int/float→Python builtins\n\n" "## All Project Files\n{all_files_ctx}" ), }, # ── Advanced Code Agent sub-prompts ────────────────────────────────── "architecture_planning": { "system": ( "You are a senior software architect who designs implementation " "blueprints for scientific experiment codebases. You produce detailed, " "directly-implementable specifications with pseudocode for every " "class method and explicit tensor shape annotations. You emphasize " "separation of concerns: data loading, model definition, training " "loop, and evaluation are distinct components. 
You understand ML " "training deeply and design for correctness: proper .detach(), " "consistent tensor shapes, and correct gradient flow.\n\n" "NUMPY 2.x COMPATIBILITY (CRITICAL):\n" "- np.trapz is REMOVED → use np.trapezoid\n" "- np.erfinv does NOT exist → use scipy.special.erfinv\n" "- np.bool, np.int, np.float, np.complex are REMOVED → use Python builtins\n" "- np.str, np.object are REMOVED → use str, object\n" "- np.math is REMOVED → use math module" ), "user": ( "Create a detailed IMPLEMENTATION BLUEPRINT for an experiment codebase.\n\n" "## Research Context\n" "TOPIC: {topic}\n" "PRIMARY METRIC: {metric}\n\n" "## Experiment Plan\n{exp_plan}\n\n" "## Requirements\n" "1. `main.py` MUST be the entry point — runs ALL conditions sequentially.\n" "2. Each condition MUST be a SEPARATE class with DISTINCT implementation.\n" "3. Data loading and model definitions in separate modules.\n" "4. No more than 5 Python files total.\n" "5. Every class must have at least 20 lines of effective code.\n" "6. Child classes MUST override at least one core method with DIFFERENT logic.\n" "7. NEVER override nn.Module.train/eval with different signatures.\n" "8. 
Design child classes as STRATEGY variants, not PARAMETER variants.\n\n" "## Blueprint Format (YAML)\n" "The blueprint MUST include ALL of the following for EACH file:\n" "- `generation_order`: integer (1=first to generate, higher=later)\n" "- `dependencies`: list of other files this file imports from\n" "- `classes` or `functions`: with pseudocode for each method\n" "- For neural network classes: input/output tensor shapes\n\n" "```yaml\n" "files:\n" " - name: config.py\n" " generation_order: 1\n" " dependencies: []\n" " purpose: Hyperparameter configuration\n" " classes:\n" " - name: Config\n" " fields:\n" " - lr: 0.01\n" " - batch_size: 128\n" " - epochs: 20\n" " - hidden_dim: 128\n\n" " - name: data.py\n" " generation_order: 2\n" " dependencies: [config.py]\n" " purpose: Dataset loading and preprocessing\n" " functions:\n" " - name: get_dataloaders\n" " signature: (config) -> (train_loader, val_loader, test_loader)\n" " pseudocode: |\n" " 1. Load dataset from torchvision/disk\n" " 2. Apply standard transforms (normalize, augment)\n" " 3. Split train into train/val (90/10)\n" " 4. Return DataLoaders with config.batch_size\n\n" " - name: models.py\n" " generation_order: 3\n" " dependencies: [config.py]\n" " purpose: All model implementations\n" " classes:\n" " - name: BaseModel(nn.Module)\n" " input_shape: [B, 3, 32, 32]\n" " output_shape: [B, 10]\n" " methods:\n" " - name: __init__\n" " pseudocode: Define layers (conv/linear/attention)\n" " - name: forward\n" " pseudocode: |\n" " 1. x = self.encoder(x) # [B,3,32,32] -> [B, hidden]\n" " 2. logits = self.classifier(x) # [B, hidden] -> [B, 10]\n" " 3. return logits\n" " - name: ProposedMethod(BaseModel)\n" " differentiator: Uses novel component X\n" " overrides: [forward]\n" " methods:\n" " - name: forward\n" " pseudocode: |\n" " 1. x = self.encoder(x)\n" " 2. x = self.novel_component(x) # KEY DIFFERENCE\n" " 3. logits = self.classifier(x)\n" " 4. 
return logits\n" " - name: compute_special_loss\n" " pseudocode: |\n" " 1. Compute task loss: CE(logits, labels)\n" " 2. Compute novel regularizer\n" " 3. return task_loss + lambda * reg\n\n" " - name: training.py\n" " generation_order: 4\n" " dependencies: [config.py, data.py, models.py]\n" " purpose: Training loop and evaluation\n" " functions:\n" " - name: train_one_epoch\n" " signature: (model, loader, optimizer, device) -> float\n" " pseudocode: |\n" " 1. model.train()\n" " 2. For each batch: forward, loss, backward, step\n" " 3. Return average loss\n" " - name: evaluate\n" " signature: (model, loader, device) -> dict\n" " pseudocode: |\n" " 1. model.eval() with torch.no_grad()\n" " 2. For each batch: forward, argmax predictions\n" " 3. Return {accuracy, loss}\n\n" " - name: main.py\n" " generation_order: 5\n" " dependencies: [config.py, data.py, models.py, training.py]\n" " purpose: Entry point — runs ALL conditions\n" " contract:\n" " prints_metric_def: true\n" " prints_registered_conditions: true\n" " runs_all_conditions: true\n" " per_seed_reporting: true\n" " time_budget_guard: true\n" " functions:\n" " - name: main\n" " pseudocode: |\n" " 1. Print METRIC_DEF line\n" " 2. Print REGISTERED_CONDITIONS\n" " 3. Setup time budget guard\n" " 4. For each condition:\n" " a. Create model instance\n" " b. For each seed:\n" " - Set random seed\n" " - Train model\n" " - Evaluate and print per-seed metrics\n" " c. Print mean/std across seeds\n" " 5. 
Print SUMMARY comparison\n\n" "verification_criteria:\n" " - All condition classes have DIFFERENT forward/step implementations\n" " - Input/output tensor shapes are consistent across data->model->loss\n" " - Time budget guard exists in main training loop\n" " - Per-seed random state isolation\n" " - All .detach() calls present for values used across iterations\n\n" "conditions:\n" " - name: ConditionName\n" " class: ClassName\n" " description: What makes it different\n" "```\n\n" "Output ONLY the YAML specification wrapped in ```yaml``` fences.\n" "Be SPECIFIC in pseudocode — include tensor shapes, loss formulas, " "and algorithmic details from the experiment plan.\n" "Every class must have detailed pseudocode showing HOW it differs " "from others, not just THAT it differs." ), "max_tokens": 8192, }, "generate_single_file": { "system": ( "You are an expert ML engineer who writes production-quality Python code " "for scientific experiments. You follow implementation blueprints precisely, " "ensuring tensor shapes match, gradients flow correctly, and all imports " "resolve. You write complete, runnable code — never stubs or placeholders." ), "user": ( "Generate the Python file `{file_name}` for an ML experiment project.\n\n" "## File Specification\n{file_spec}\n\n" "## Full Project Blueprint\n{blueprint}\n\n" "## Already Generated Files (summaries)\n{dependency_summaries}\n\n" "## Already Generated Files (full code of direct dependencies)\n" "{dependency_code}\n\n" "## Research Topic\n{topic}\n\n" "## Experiment Plan\n{exp_plan}\n\n" "## Environment\n{pkg_hint}\n\n" "## CRITICAL Rules\n" "1. Follow the blueprint specification EXACTLY — implement every class " "and function listed for this file.\n" "2. Tensor shapes MUST match the blueprint annotations.\n" "3. Imports from dependency files MUST use the exact class/function names " "from the already-generated code.\n" "4. 
Every method must have a REAL implementation — no `pass`, no `...`, " "no `raise NotImplementedError`.\n" "5. NEVER use random numbers as fake metrics.\n" "6. For RL code: .detach() ALL values from previous iterations before " "using in current loss.\n" "7. For neural networks: create layers in __init__, not in forward().\n" "8. METHOD RICHNESS: Every non-trivial method should be >=5 lines of " "real logic. If a method only calls super() or returns a constant, " "add the actual computation it should perform. Training methods should " "include proper gradient handling, metric logging, and error checks.\n" "9. ABLATION DIFFERENTIATION: If this file contains ablation/variant " "classes, each MUST differ in actual algorithm logic — not just in " "parameter values or by removing a line. Ablations should clearly " "implement a different computational path.\n" "10. NO CLI CONDITION ARGS: If this is main.py, NEVER add argparse " "arguments like --condition or --method. All conditions must be " "iterated inside main.py with a for-loop. The harness runs " "`python main.py` with no arguments.\n" "11. NUMPY 2.x COMPATIBILITY: np.trapz→np.trapezoid, " "np.erfinv→scipy.special.erfinv, np.bool/np.int/np.float→Python builtins, " "np.str/np.object→str/object, np.math→math.\n\n" "Output ONLY the Python code for `{file_name}` — no markdown fences, " "no explanations, just the code." ), "max_tokens": 8192, }, "code_exec_fix": { "system": ( "You are a debugging expert who fixes runtime errors in Python " "experiment code. You preserve the original experiment design and " "scientific methodology while fixing the specific error. You fix " "the ROOT CAUSE, not just the symptom." 
), "user": ( "The following experiment code crashed during execution.\n\n" "## Error Output (stderr, last 3000 chars)\n" "```\n{stderr}\n```\n\n" "## Standard Output (last 50 lines)\n" "```\n{stdout_tail}\n```\n\n" "## Return Code: {returncode}\n\n" "## Current Code Files\n{files_context}\n\n" "## Instructions\n" "1. Identify the ROOT CAUSE of the error.\n" "2. Fix it while preserving the experiment design.\n" "3. Check for similar potential issues in ALL files.\n" "4. Do NOT simplify or remove experiment logic — fix the bug.\n" "5. Do NOT add subprocess, os.system, eval, exec, or network calls.\n" "6. COMMON BUG: If error is about `train()` missing arguments, it means " "a class overrode nn.Module.train() with a custom signature. Fix by " "renaming the custom method to `fit()` or `run_training()` and updating " "all callers. Never override nn.Module.train/eval with extra args.\n" "7. NUMPY 2.x: np.trapz→np.trapezoid, np.erfinv→scipy.special.erfinv, " "np.bool/int/float/complex→Python builtins, np.str/object→str/object.\n\n" "Output ALL files in ```filename:xxx.py``` format, including files " "that don't need changes." ), "max_tokens": 16384, }, "code_reviewer": { "system": ( "You are a meticulous experiment code reviewer focused on " "scientific correctness, statistical rigor, and code quality. " "You catch bugs that static analysis cannot: incorrect algorithm " "implementations, missing controls, wrong metric computation, " "and experimental design flaws." ), "user": ( "Review this experiment code for correctness and quality.\n\n" "## Research Context\n" "TOPIC: {topic}\n" "PRIMARY METRIC: {metric}\n\n" "## Experiment Plan\n{exp_plan}\n\n" "## Code Files\n{files_context}\n\n" "## Review Criteria\n" "1. **CORRECTNESS**: Does the code correctly implement the " "experiment plan? Are algorithms implemented properly?\n" "2. **COMPLETENESS**: Are all conditions/ablations implemented " "with DISTINCT logic? (Not just renamed copies of baseline.)\n" "3. 
**STATISTICAL RIGOR**: Multiple seeds? Results averaged and " "reported with std? Paired comparisons?\n" "4. **METRIC REPORTING**: Is {metric} correctly computed and " "printed in the required format?\n" "5. **ROBUSTNESS**: Shape mismatches? Missing imports? Type " "errors? Division by zero? GPU/CPU device conflicts?\n" "6. **CLASS DEPTH**: Each experimental condition class must have " "at least 20 lines of effective code with distinct logic. Classes " "that only override __init__ to change parameters are CRITICAL " "issues — they indicate the condition is not truly different.\n\n" "## Output Format (JSON)\n" "```json\n" '{{\n' ' "verdict": "APPROVE or REVISE",\n' ' "score": 1-10,\n' ' "critical_issues": ["issue1", "issue2"],\n' ' "suggestions": ["suggestion1", "suggestion2"]\n' '}}\n' "```\n\n" "Only use verdict REVISE if there are critical issues that would " "cause the code to crash or produce scientifically invalid results." ), "json_mode": True, "max_tokens": 4096, }, } # -- Stage prompts (one entry per LLM-calling stage) --------------------- _DEFAULT_STAGES: dict[str, dict[str, Any]] = { # ── Phase A: Research Scoping ──────────────────────────────────────── "topic_init": { "system": ( "You are a rigorous research planner who identifies NOVEL, TIMELY " "research angles. You follow recent trends from top venues in the " "relevant domain and propose research that advances " "the frontier rather than repeating known results.\n\n" "NOVELTY PRINCIPLES:\n" "- A good research angle addresses a GAP not yet covered by existing work.\n" "- Avoid pure benchmark/comparison studies unless the methodology is novel.\n" "- Prefer angles that combine existing techniques in new ways, apply methods " "to underexplored domains, or challenge common assumptions.\n" "- The research must be FEASIBLE with limited compute (single GPU, hours not days).\n" "- Check: would a reviewer say 'this is already well-known'? If so, find a sharper angle." 
), "user": ( "Create a SMART research goal in markdown.\n" "Topic: {topic}\n" "Domains: {domains}\n" "Project: {project_name}\n" "Quality threshold: {quality_threshold}\n\n" "Required sections:\n" "- **Topic**: The broad area\n" "- **Novel Angle**: What specific aspect has NOT been well-studied? " "Why is this timely NOW (2024-2026)? What recent development creates " "an opportunity? How does this differ from standard approaches?\n" "- **Scope**: Focused enough for a single paper\n" "- **SMART Goal**: Specific, Measurable, Achievable, Relevant, Time-bound\n" "- **Constraints**: Compute budget, available tools, data access\n" "- **Success Criteria**: What results would make this publishable?\n" "- **Generated**: Timestamp\n\n" "IMPORTANT: The 'Novel Angle' section must convincingly argue why this " "specific research direction is NOT already covered by existing work. " "If the topic is well-studied (e.g., 'comparing optimizers'), you MUST " "find a specific unexplored aspect (e.g., 'under distribution shift with " "noisy gradients', 'in the few-shot regime', 'with modern architectures').\n\n" "TREND VALIDATION (MANDATORY):\n" "- Identify 2-3 recent papers (2024-2026) that establish the relevance " "of this research direction.\n" "- Name the specific benchmark/dataset that will be used for evaluation.\n" "- If no standard benchmark exists, explain how results will be measured.\n" "- State whether SOTA results exist on this benchmark and what they are.\n" "- Add a 'Benchmark' subsection listing: name, source, metrics, " "current SOTA (if known)." 
), }, "problem_decompose": { "system": "You are a senior research strategist.", "user": ( "Decompose this research problem into at least 4 prioritized " "sub-questions.\n" "Topic: {topic}\n" "Output markdown with sections: Source, Sub-questions, Priority " "Ranking, Risks.\n" "Goal context:\n{goal_text}" ), }, # ── Phase B: Literature Discovery ──────────────────────────────────── "search_strategy": { "system": ( "You design literature retrieval strategies and source verification plans." ), "user": ( "Create a merged search strategy package.\n" "Return a JSON object with keys: search_plan_yaml, sources.\n" "search_plan_yaml must be valid YAML text.\n" "sources must include id,name,type,url,status,query,verified_at.\n" "Topic: {topic}\n" "Problem tree:\n{problem_tree}" ), "json_mode": True, }, "literature_collect": { "system": "You are a literature mining assistant.", "user": ( "Generate candidate papers from the search plan.\n" "Return JSON: {candidates:[...]} with >=8 rows.\n" "Each candidate must include id,title,source,url,year,abstract," "collected_at.\n" "Topic: {topic}\n" "Search plan:\n{plan_text}" ), "json_mode": True, }, "literature_screen": { "system": ( "You are a strict domain-aware reviewer with zero tolerance for " "cross-domain false positives. You MUST reject papers that are " "from unrelated fields, even if they share superficial keyword " "overlap. A paper about 'normalization in database systems' is " "NOT relevant to 'normalization in deep learning'. A paper about " "'graph theory in social networks' is NOT relevant to 'graph " "neural networks for molecular property prediction'." ), "user": ( "Perform merged relevance+quality screening and return shortlist.\n" "Return JSON: {shortlist:[...]} each with title, cite_key " "(if present), relevance_score (0-1), quality_score (0-1), " "keep_reason.\n" "Preserve all original fields (paper_id, doi, arxiv_id, cite_key, " "etc.) 
from the input.\n" "Topic: {topic}\n" "Domains: {domains}\n" "Threshold: {quality_threshold}\n\n" "SCREENING RULES (apply strictly):\n" "1. DOMAIN MATCH: The paper's actual research domain must match " "the topic's domain. Shared keywords across domains do NOT count.\n" "2. METHOD RELEVANCE: The paper must discuss methods, benchmarks, " "or findings directly applicable to the research topic.\n" "3. CROSS-DOMAIN REJECTION: Reject papers from unrelated fields " "(e.g., wireless communications, database systems, social science) " "even if they use similar terminology.\n" "4. RECENCY PREFERENCE: Prefer papers from 2020+ for methodology, " "but accept foundational papers (pre-2020) if they introduced key " "techniques still in use today.\n" "5. SEMINAL PAPERS: Papers marked as source='seminal_library' are " "pre-vetted foundational references — keep them if their keywords " "match the topic (relevance_score >= 0.7).\n" "6. QUALITY FLOOR: Reject papers with no abstract, no venue, and " "no citation count (likely not real papers).\n" "Candidates JSONL:\n{candidates_text}" ), "json_mode": True, }, "knowledge_extract": { "system": "You extract high-signal evidence cards from papers.", "user": ( "Extract structured knowledge cards from shortlist.\n" "Return JSON: {cards:[{card_id,title,cite_key,problem,method," "data,metrics,findings,limitations,citation}]}.\n" "IMPORTANT: If the input contains cite_key fields, preserve them " "exactly in the output.\n" "Shortlist:\n{shortlist}" ), "json_mode": True, }, # ── Phase C: Knowledge Synthesis ───────────────────────────────────── "synthesis": { "system": "You are a synthesis specialist for literature reviews.", "user": ( "Produce merged synthesis output (topic clusters + research gaps).\n" "Output markdown with sections: Cluster Overview, Cluster 1..N, " "Gap 1..N, Prioritized Opportunities.\n" "Topic: {topic}\n" "Cards context:\n{cards_context}" ), "max_tokens": 8192, }, "hypothesis_gen": { "system": ( "You formulate testable 
scientific hypotheses that address gaps " "NOT covered by existing literature. Your hypotheses must be:\n" "1. NOVEL: Not simply replicating known results or testing obvious things.\n" "2. GAP-FILLING: Address specific weaknesses or blind spots identified " "in the literature synthesis.\n" "3. FEASIBLE: Testable with limited compute (single GPU, <1 day runtime).\n" "4. FALSIFIABLE: Have clear failure conditions that would definitively " "reject the hypothesis.\n" "5. SURPRISING: At least one hypothesis should challenge conventional " "wisdom or test a counter-intuitive prediction." ), "user": ( "Generate at least 2 falsifiable hypotheses from the synthesis below.\n" "For each hypothesis provide:\n" "- **Hypothesis statement**: A clear, testable claim\n" "- **Novelty argument**: Why this has NOT been tested before, citing " "specific gaps from the synthesis\n" "- **Rationale**: Theoretical or empirical basis for expecting this result\n" "- **Measurable prediction**: Specific quantitative outcome expected\n" "- **Failure condition**: What result would reject this hypothesis?\n" "- **Required baselines**: What modern, state-of-the-art methods must be " "compared against to make the finding meaningful?\n\n" "AVOID:\n" "- Hypotheses that are trivially obvious (e.g., 'more data improves accuracy')\n" "- Hypotheses that replicate well-known results already in the literature\n" "- Hypotheses that cannot be tested within the compute budget\n\n" "Synthesis:\n{synthesis}" ), }, # ── Phase D: Experiment Design ─────────────────────────────────────── "experiment_design": { "system": "You are a principal investigator designing rigorous research experiments.", "user": ( "{preamble}\n\n" "Design an experiment plan as YAML.\n" "Required keys: objectives,datasets,baselines,proposed_methods," "ablations,metrics,risks,compute_budget.\n\n" "NAMING REQUIREMENT (CRITICAL for paper quality):\n" "- Every condition name in baselines, proposed_methods, and ablations MUST be " "a DESCRIPTIVE 
algorithm name DERIVED FROM THE HYPOTHESES ABOVE, NOT a generic label.\n" "- WRONG: baseline_1, baseline_2, method_variant_1, method_variant_2\n" "- WRONG: random_search, bayesian_optimization, ppo_policy, curiosity_driven_rl " "(these are generic defaults — NEVER use them unless they are actually what " "the hypotheses call for)\n" "- RIGHT: names that reflect the specific methods/architectures/algorithms in " "the hypotheses (e.g., rim_agent, monolithic_gru, ewc_baseline, sleep_consolidation, " "no_sleep_ablation, coarse_routing, fine_routing)\n" "- The name should immediately tell a reader WHAT algorithm or strategy is used.\n" "- This is critical because these names appear directly in the paper.\n\n" "BASELINE & BENCHMARK MODERNITY (CRITICAL for acceptance):\n" "- Baselines MUST be modern, widely-adopted methods from recent top-venue " "papers (2023-2026). Beating only outdated or weak baselines is NOT a valid " "contribution and will result in desk rejection.\n" "- Include at LEAST one strong baseline that represents current SOTA or " "near-SOTA in the specific sub-area. Check recent NeurIPS/ICML/ICLR papers " "to identify appropriate baselines.\n" "- Benchmarks MUST be standard and actively used. 
If a benchmark has been " "superseded, use the newer version.\n" "- For each baseline, cite the original paper and note why it is a fair " "and competitive comparison.\n\n" "HYPOTHESIS ALIGNMENT (CRITICAL — most common failure mode):\n" "- Your experiment plan MUST directly test the hypotheses listed above.\n" "- Each hypothesis should map to at least one comparison between conditions.\n" "- Baselines must be the specific alternatives named in the hypotheses, NOT " "generic optimization methods like random_search or bayesian_optimization.\n" "- If a hypothesis says 'X outperforms Y', then X must be a proposed_method " "and Y must be a baseline.\n" "- Ablations must isolate the specific components claimed to matter in the " "hypotheses (e.g., if hypothesis claims routing helps, ablate routing).\n\n" "STABILITY & REPRODUCIBILITY (CRITICAL for RL-based methods):\n" "- Under `proposed_methods`, specify key hyperparameters (learning rate, " "gradient clip threshold, entropy coefficient, etc.).\n" "- Under `risks`, explicitly list numerical stability concerns " "(NaN/divergence, reward explosion, policy collapse) and mitigations " "(gradient clipping, reward normalization, early stopping on NaN).\n" "- Under `metrics`, include:\n" " * Primary metric: `{metric_key}` with direction: `{metric_direction}` " "and units\n" " * IMPORTANT: The metric direction MUST be `{metric_direction}` — do " "NOT use a different direction. If {metric_direction}=='minimize', lower " "is better. If {metric_direction}=='maximize', higher is better.\n" " * `success_rate`: fraction of seeds that complete without NaN/crash\n" " * At least ONE discovery-aligned endpoint (e.g., identification " "accuracy, time-to-discovery, final posterior mass on true hypothesis) " "in addition to any proxy metric\n" "{dataset_guidance}\n\n" "- Under `datasets`, specify AT LEAST 2 regime factors to stratify by " "(e.g., noise_level: [low, high], hypothesis_space_size: [small, large]). 
" "Results MUST be reported per-regime. A single-regime experiment cannot " "support generality claims and will be rejected by reviewers.\n" "- FACTORIAL DESIGN PREFERRED: If you vary multiple factors (e.g., scale AND " "noise), design a factorial grid (e.g., small+low, small+high, large+low, " "large+high) so each factor's effect can be isolated. Bundling factors " "(e.g., easy=small+low, hard=large+high) is a confounder and reviewers will " "flag it. If computational budget limits the grid, at minimum acknowledge " "that factors are bundled and limit claims accordingly.\n" "- Under `compute_budget`, plan for minimum 10 seeds per condition to " "ensure valid statistical comparisons.\n\n" "STATISTICAL POWER REQUIREMENTS (CRITICAL for publishability):\n" "- Use AT LEAST 5 random seeds per condition (10 preferred)\n" "- Use AT LEAST 30 episodes per seed for RL methods\n" "- Report: mean ± std, 95% bootstrap CI, per-seed raw values\n" "- For method comparisons: use paired bootstrap or Wilcoxon signed-rank test " "(NOT paired t-test with n < 10)\n" "- Report effect sizes (Cohen's d or rank-biserial correlation)\n" "- 3 seeds is INSUFFICIENT — reviewers will reject papers with n=3\n\n" "HARDWARE ENVIRONMENT (your experiments run on THIS exact machine):\n" "{hardware_profile}\n" "- You have exactly ONE GPU. No distributed training. No multi-GPU. 
No multi-node.\n" "- Design experiments that fit this single GPU.\n\n" "COMPUTE BUDGET CONSTRAINT (CRITICAL — experiments MUST fit time budget):\n" "- Total experiment time budget: {time_budget_sec} seconds.\n" "- Per-condition budget: ~{per_condition_budget_sec} seconds " "(= time_budget × 0.7 / 6 conditions).\n" "- Pre-cached datasets (instant, no download): {available_tier1_datasets}\n" "- DO NOT plan experiments requiring multiple GPUs or more than " "{time_budget_sec}s.\n" "- HARD CONDITION LIMIT: The total number of conditions (baselines + " "proposed_methods + ablations) MUST NOT exceed 8 for budgets ≤ 3600s.\n" " * Recommended: 2-3 baselines + 1-2 proposed methods + 2-3 ablations = 5-8 total.\n" " * Generating 10+ conditions guarantees most will time out and data will be wasted.\n" " * Quality over quantity: 6 well-run conditions with 5 seeds each >> 20 conditions " "with 1 seed each.\n" "- Each run needs AT LEAST 60 seconds for RL (environment setup + " "training + evaluation). For deep learning with GPU, at least 120 seconds.\n" "- HARD CAP: total_conditions × num_seeds × seconds_per_run MUST be < " "{time_budget_sec} × 0.8 (leave 20% margin for overhead).\n" "- If total would exceed the budget, you MUST reduce by:\n" " 1. First: reduce conditions (merge similar ablations, keep strongest baselines)\n" " 2. Then: reduce seeds to 5 (minimum for statistical validity)\n" " 3. 
Then: reduce regimes/environments to 1\n" "- Example: {time_budget_sec}s budget with 120s/condition/seed, 5 seeds → " "max {time_budget_sec} / (120 * 5) ≈ 4 conditions.\n\n" "IMPLEMENTATION SPECIFICATION (CRITICAL for code generation):\n" "For each proposed method AND each baseline, you MUST include an " "'implementation_spec' key with:\n" " - class_name: the Python class name for this method\n" " - key_methods: list of methods the class must implement " "(e.g., [__init__, forward, train_step, predict])\n" " - algorithm_steps: pseudocode-level description of the core algorithm " "(3-10 steps), e.g.:\n" " 1. Encode input via encoder network (MLP: input_dim -> hidden_dim)\n" " 2. Compute attention weights over memory buffer\n" " 3. Aggregate attended features with learned gate\n" " 4. Decode to output via decoder network\n" " - loss_function: the mathematical formula for the training loss " "(e.g., 'L = CE(y_pred, y_true) + lambda * KL(q||p)')\n" " - key_hyperparameters: dict of hyperparameter name -> default value\n" " - differentiator: what makes THIS method different from others " "(must be an algorithmic difference, not just a hyperparameter change)\n\n" "For each ablation, you MUST specify:\n" " - what_is_removed: the specific component being ablated\n" " - how_it_differs: concrete code-level description of the change " "(e.g., 'replace attention layer with mean pooling', 'set routing " "weight to uniform 1/N', 'remove skip connection in block 3')\n" " - expected_effect: why removing this should change results\n\n" "This specification is MANDATORY — without it, the code generation " "stage cannot produce correct implementations.\n\n" "Hypotheses:\n{hypotheses}" ), }, "code_generation": { "system": ( "You are a computational scientist who writes real, runnable " "experiments. Your code implements actual algorithms with real " "mathematical operations. You NEVER fake results with random number " "generators. Always use the ```filename:xxx.py format for each file. 
" "Use numpy for numerical computation. Keep code self-contained " "and deterministic." ), "user": ( "Generate a Python experiment project for the following research " "topic:\n" "TOPIC: {topic}\n\n" "CRITICAL REQUIREMENTS — your code MUST satisfy ALL of these:\n" "1. Implement the ACTUAL experiment described in the topic and " "plan below.\n" " If the topic is about simulation (e.g., multi-agent systems, " "network dynamics),\n" " write simulation code. If about optimization, write " "optimization code.\n" " Match the code to the topic — do NOT default to generic " "gradient descent.\n" "2. Use proper mathematical models appropriate to the research " "question.\n" " Examples: agent-based simulation, graph algorithms, " "statistical analysis,\n" " optimization, Monte Carlo methods — whatever fits the topic.\n" "3. Run REAL computational experiments with meaningful " "parameters.\n" "4. Collect REAL metrics that directly answer the research " "question.\n" "5. The code must be scientifically meaningful — a reviewer should " "see\n" " actual implementations relevant to the TOPIC, not a generic " "optimizer.\n\n" "OUTPUT FORMAT — return multiple files using this exact format:\n" "```filename:main.py\n" "# entry point code\n" "```\n\n" "```filename:models.py\n" "# model/algorithm implementations\n" "```\n\n" "Only create additional files (optimizers.py, data_utils.py, etc.) " "if they contain substantial logic (>20 lines). 
Do NOT create stub " "files with only imports or pass statements.\n\n" "CODE STRUCTURE:\n" "- main.py: entry point that runs experiments and prints metrics\n" "- main.py MUST begin with a docstring specifying:\n" " (a) Dataset used and how it is loaded\n" " (b) Distribution shift / corruption definition (if applicable)\n" " (c) Model architecture (layers, dimensions, activation)\n" " (d) Training protocol (optimizer, epochs, batch size, LR schedule)\n" " (e) Evaluation protocol (train/test split, metrics computed)\n" "- Additional modules for algorithms, objective functions, " "utilities\n" "- Primary metric key: {metric}\n" "- main.py must print metric lines as `name: value` (one per " "line)\n" "- Use deterministic seeds (numpy.random.seed or random.seed)\n" "- No external data files, no network calls, no GPU required\n" "- FORBIDDEN: subprocess, os.system, eval, exec, shutil, socket\n" "{pkg_hint}\n" "ANTI-PATTERNS (do NOT do these):\n" "- Do NOT generate random numbers and pretend they are experiment " "results\n" "- Do NOT use `random.uniform()` to simulate a decreasing loss " "curve\n" "- Do NOT hardcode metric values or use trivial arithmetic as " "metrics\n\n" "MULTI-CONDITION REQUIREMENT (CRITICAL):\n" "The experiment plan below specifies multiple conditions, treatments, " "or strategies to compare. Your code MUST:\n" "1. Implement ALL conditions/treatments listed in the experiment plan " "— not just one baseline.\n" "2. Run each condition independently with the same controlled setup " "(same seeds, same initialization, same budget).\n" " IMPORTANT: All conditions MUST be iterated INSIDE main.py using a " "for-loop or dispatch table. NEVER use argparse --condition or any CLI " "argument to select a condition. The harness calls `python main.py` " "with NO arguments — if you add a required --condition arg it will crash.\n" "3. Print metrics with condition labels: " "`condition= {metric}: ` for EACH condition.\n" "4. 
After all conditions, print a summary comparison line: " "`SUMMARY: condition1=, condition2=, ...`\n" "5. If the plan has N conditions, the output MUST contain N separate " "labeled metric streams. Running only one condition is NOT acceptable.\n" "6. BREADTH-FIRST ORDERING: Run ONE representative configuration per " "condition FIRST (e.g., default parameters), so that ALL conditions " "produce at least one result. Only AFTER all conditions have results, " "run additional parameter sweeps if time remains. This prevents the " "time budget from being exhausted on condition 1's parameter sweep " "while conditions 2..N never execute.\n" "7. CONDITION COMPLETENESS: After code generation, mentally verify that " "EVERY condition in the experiment plan below has a corresponding code " "path. If the plan lists conditions A, B, C, D — your code must handle " "all four, not just A, B, C. Missing conditions invalidate the experiment.\n" "8. CRASH RESILIENCE: Wrap each condition's execution in a try/except " "block so that if one condition crashes (e.g., NaN, timeout, config error), " "the remaining conditions still execute. Print `CONDITION_FAILED: " "` on failure and continue to the next condition. A partial result " "set is far more valuable than a complete crash.\n" "9. CONDITION REGISTRY VALIDATION: At startup (before running experiments), " "enumerate all condition names and verify each has a valid code path. Print " "`REGISTERED_CONDITIONS: , , ...` at the top of output. If " "any condition is unrecognized, print `MISSING_CONDITION: ` and skip " "it gracefully rather than raising an exception.\n" "10. TOTAL CONDITIONS LIMIT (HARD RULE): Your code MUST NOT register more " "than 8 total conditions. If the experiment plan lists ablations with many " "parameter values (e.g., 'test decay rates 0.9, 0.99, 0.995, 0.999, 0.9999'), " "pick the 2-3 most informative values — do NOT create a separate condition for " "each value. 
8 conditions × 3 seeds × budget ÷ conditions = tight timing. " "Quality of each condition matters more than quantity.\n\n" "METRIC DEFINITION REQUIREMENT (CRITICAL):\n" "- At the top of main.py, include a docstring or comment block that defines:\n" " * METRIC NAME: the exact key printed as `{metric}: `\n" " * DIRECTION: {metric_direction_hint}\n" " * UNITS/SCALE: what the number represents (e.g., MSE in log scale, " "accuracy 0-1, discovery rate per episode)\n" " * FORMULA: how the metric is computed from raw experiment outputs\n" " * AGGREGATION: how per-step/per-episode values are reduced to a scalar\n" "- Print this definition at runtime: `METRIC_DEF: {metric} | direction= " "| desc=`\n" "- Without this definition, the metric is UNINTERPRETABLE and the paper cannot " "make any claims about which method is better.\n\n" "STATISTICAL RIGOR REQUIREMENT:\n" "- Run each condition with at least 5 different random seeds (10+ preferred " "if time budget allows). Minimum 3 seeds is MANDATORY.\n" "- Print per-seed results: `condition= seed= {metric}: `\n" "- Print mean and std across seeds: " "`condition= {metric}_mean: {metric}_std: `\n" "- If time budget is tight, reduce per-seed iterations rather than " "reducing seed count. Minimum 3 seeds is non-negotiable.\n" "- SEED COUNT IS FIXED AT 3 MINIMUM. Do NOT compute seed count dynamically.\n" " Hardcode `SEEDS = [0, 1, 2]`. If 3 seeds × all conditions exceeds the time " "budget, REDUCE the number of conditions or training epochs — NEVER reduce seeds.\n" " Print: `SEED_COUNT: 3 (fixed minimum, budget={time_budget}s, conditions=N)`.\n" "- Report bootstrap 95% confidence intervals when n >= 5.\n\n" "FAILURE-AWARE REPORTING (CRITICAL for RL/unstable methods):\n" "- Track how many seeds succeed vs fail (NaN, divergence, crash) per " "condition. Print: `condition= success_rate: /`\n" "- Compute UNCONDITIONAL metrics: treat failed seeds as worst-case " "(e.g., metric=0 or metric=worst_baseline). 
Print: " "`condition= unconditional_{metric}_mean: `\n" "- This prevents survivorship bias where a method looks good only " "because failed runs are excluded.\n" "- For RL methods, add STABILITY SAFEGUARDS in the code:\n" " * Gradient clipping (max norm 1.0)\n" " * Reward normalization/clipping to [-10, 10]\n" " * NaN checks on loss/gradients with graceful early stop (not crash)\n" " * Learning rate warmup or conservative initial learning rate\n" " These safeguards should PREVENT most NaN/divergence, not just catch " "them after the fact.\n\n" "PYTORCH RL IMPLEMENTATION BUGS (CRITICAL — these cause 100% crash rate):\n" "- 'Trying to backward through the graph a second time' is the #1 crash.\n" " CAUSE: reusing a computed tensor across multiple backward() calls.\n" " FIX: Always .detach() values used in the next iteration:\n" " ```\n" " # WRONG:\n" " old_log_prob = policy.log_prob(action) # still attached to graph\n" " # ... later in update loop:\n" " ratio = new_log_prob / old_log_prob # backward crashes\n" " \n" " # CORRECT:\n" " old_log_prob = policy.log_prob(action).detach() # detach!\n" " # ... 
later in update loop:\n" " ratio = new_log_prob / old_log_prob.detach() # safe\n" " ```\n" "- For PPO: old_log_probs MUST be .detach()ed when stored for later ratio computation.\n" "- For value functions: target values MUST be .detach()ed (don't backprop through targets).\n" "- For curiosity/intrinsic reward: prediction errors used as reward MUST be .detach()ed.\n" "- General rule: any tensor from a PREVIOUS forward pass that is used in the CURRENT " "loss computation MUST be .detach()ed.\n" "- When in doubt, add .detach() — it never causes crashes, but missing it always does.\n\n" "NEURAL NETWORK DIMENSION CONSISTENCY (CRITICAL — #2 crash cause):\n" "- 'input and weight.T shapes cannot be multiplied' means obs_dim != network input_dim.\n" "- When the environment observation size VARIES across regimes (e.g., easy=6, hard=8), " "the neural network's input layer MUST match EACH regime's obs_dim.\n" "- FIX: Create the network INSIDE the per-regime loop, or parameterize input_dim:\n" " ```\n" " # WRONG: fixed input_dim for all regimes\n" " policy = PolicyNet(input_dim=10) # breaks if obs_dim != 10\n" " for regime in regimes:\n" " obs = env.reset() # obs.shape may vary!\n" " \n" " # CORRECT: dynamic input_dim per regime\n" " for regime in regimes:\n" " obs = env.reset()\n" " obs_dim = obs.shape[-1] # or len(obs)\n" " policy = PolicyNet(input_dim=obs_dim) # fresh network per regime\n" " ```\n" "- ALWAYS initialize neural networks AFTER knowing the observation dimension.\n\n" "KNOWLEDGE DISTILLATION (KD) STABILITY (if applicable):\n" "- Teacher network MUST be frozen: `teacher.eval()` and " "`for p in teacher.parameters(): p.requires_grad = False`\n" "- Temperature parameter T: typical range 1-20. Use T=4 as default. " "NEVER use T<1 (causes sharp distributions → NaN gradients).\n" "- Loss balance: `loss = alpha * kd_loss + (1-alpha) * task_loss` — " "set alpha=0.5-0.9. 
If kd_loss scale >> task_loss, val_loss becomes NaN.\n" "- PROJECTION LAYERS: If teacher and student have different intermediate " "dimensions (e.g., teacher_dim=768, student_dim=256), you MUST add " "`nn.Linear(student_dim, teacher_dim)` to align features before computing " "distillation loss. Without projection layers, tensor shape mismatch WILL crash.\n" "- Common KD NaN causes: (1) no temperature scaling on logits, " "(2) missing gradient clipping, (3) learning rate too high (use ≤1e-3), " "(4) teacher not frozen → unstable targets.\n\n" "PAIRED STATISTICAL ANALYSIS (CRITICAL for publishable results):\n" "- Use the SAME random seeds across all conditions so results are paired.\n" "- After collecting per-seed results for all conditions, compute paired " "differences: for each seed s, diff(s) = method(s) - baseline(s).\n" "- Print paired analysis: " "`PAIRED: vs mean_diff= std_diff= " "t_stat= p_value=`\n" "- Also print bootstrap 95% CI of the paired difference.\n" "- This is FAR more powerful than independent comparisons because it " "controls for seed-to-seed variance.\n\n" "MULTI-REGIME REQUIREMENT (CRITICAL for generality claims):\n" "- The experiment MUST test at least 2 different difficulty/noise regimes " "(e.g., low noise vs high noise, small hypothesis space vs large).\n" "- Report results per-regime, not just aggregated across regimes.\n" "- Print regime labels: " "`condition= regime= {metric}: `\n" "- This prevents conclusions that only hold in one setting from being " "presented as general findings.\n\n" "DIMENSION CONSISTENCY CHECK (CRITICAL for RL/neural methods):\n" "- Before passing observations/states to neural networks or policy " "parameters, VERIFY that dimensions match. Common bug: environment " "state has dimension D1 but network expects D2.\n" "- At the start of each condition, print the state/observation " "dimension and the network input dimension. 
If they mismatch, " "reshape or adjust the network before proceeding.\n" "- Test EVERY condition with a single dry-run step before the full " "loop to catch shape mismatches early.\n\n" "TIME-TO-EVENT METRIC BUG PREVENTION (CRITICAL — common silent bug):\n" "- If the primary metric is a 'time-to-X' measure (e.g., time-to-discovery, " "steps-to-convergence, episodes-to-threshold), you MUST check the success " "criterion at EVERY step inside the loop, not only at the end.\n" "- WRONG pattern (produces degenerate ceiling data):\n" " ```\n" " for t in range(horizon):\n" " obs, r, done, info = env.step(a)\n" " success = check(info) # only checked ONCE at end\n" " time_to_X = horizon if not success else t + 1 # t+1 = horizon always!\n" " ```\n" "- CORRECT pattern (captures actual first-success time):\n" " ```\n" " time_to_X = horizon # default: never succeeded\n" " for t in range(horizon):\n" " obs, r, done, info = env.step(a)\n" " if check(info) and time_to_X == horizon: # first success\n" " time_to_X = t + 1\n" " if done: break\n" " ```\n" "- This bug causes ALL methods to return the same ceiling value, making " "the entire experiment useless. Every method looks identical at the cap.\n" "- APPLY THIS TO ALL CONDITIONS: RandomSearch, BO, RL — every single " "condition must check at every step. If even one condition uses the wrong " "pattern, the comparison is invalid.\n\n" "METRIC DISCRIMINATION VALIDATION (CRITICAL):\n" "- After running all conditions, check if all conditions produce the SAME " "mean metric value. 
If they do, the metric is NOT discriminative and the " "experiment is scientifically useless.\n" "- Common causes: ceiling/floor effects, too-easy or too-hard tasks, " "time-to-event bug above, metric that doesn't capture real differences.\n" "- If all conditions have identical means, print " "`WARNING: DEGENERATE_METRICS all conditions have same mean=` " "and you MUST take corrective action:\n" " (a) If all means = 1.0 or max: increase task difficulty (reduce budget, " "increase noise, enlarge hypothesis space)\n" " (b) If all means = 0.0: decrease difficulty\n" " (c) Re-run after adjustment and verify means now differ\n" " (d) If adjustments don't help, switch to a different primary metric\n" "- A degenerate experiment CANNOT produce a publishable paper. Fix it.\n\n" "DIFFICULTY CALIBRATION (CRITICAL for meaningful results):\n" "- After running a pilot (3-5 seeds, 2 conditions: random_search + one RL), " "check BOTH success rate AND metric discrimination.\n" "- TWO things must be true for the experiment to be informative:\n" " 1. Success rate between 30-80% (not too hard, not too easy)\n" " 2. Primary metric varies across conditions (not all methods score the same)\n" "- CEILING DETECTION (CRITICAL): If primary_metric is 1.0 (or max possible) " "for ALL pilot seeds in ALL pilot conditions, the task is TRIVIALLY EASY. " "You MUST increase difficulty until the metric varies. Options:\n" " * Reduce experiment budget/horizon (fewer steps to find solution)\n" " * Increase hypothesis space size\n" " * Increase observation noise\n" " * Tighten the success criterion (e.g., require closer match)\n" " * Reduce the number of allowed experiments per episode\n" "- FLOOR DETECTION: If primary_metric is 0.0 for all conditions, task is " "too hard. 
Reduce noise, enlarge budget, simplify.\n" "- Print `CALIBRATION: regime= pilot_success_rate= " "pilot_primary_metric_std=` after calibration.\n" "- If std=0, the metric is NOT discriminative — adjust until std > 0.\n" "- Run a calibration loop: pilot → check → adjust → re-pilot (max 3 iterations).\n\n" "ALGORITHM IMPLEMENTATION INTEGRITY (CRITICAL — mismatch = academic fraud):\n" "1. If you name a method 'Bayesian Optimization', you MUST implement:\n" " - A surrogate model (e.g., Gaussian Process or random forest)\n" " - An acquisition function (e.g., Expected Improvement, UCB)\n" " - Surrogate model updates after each observation\n" " DO NOT implement UCB1 bandit and call it 'Bayesian Optimization'.\n" "2. If you name a method 'PPO', you MUST implement:\n" " - A clipped surrogate objective: min(r_t * A_t, clip(r_t, 1-eps, 1+eps) * A_t)\n" " - A learned value function baseline\n" " - The clip_eps parameter MUST be used in the policy update\n" " DO NOT implement vanilla REINFORCE and call it 'PPO'.\n" "3. Every declared hyperparameter MUST be used in the algorithm:\n" " - If you declare clip_eps, it must appear in the loss computation\n" " - If you declare entropy_coef, it must be added to the policy loss\n" " - Dead parameters (declared but never used) are strictly forbidden\n" "4. Ablation conditions MUST produce different behavior:\n" " - Two conditions that differ only in a parameter that is never read are IDENTICAL\n" " - Verify: if two conditions produce identical outputs on the same seed, " "the ablation is broken and MUST be fixed\n" " ABLATION DESIGN PATTERN (CRITICAL — #1 cause of broken ablations):\n" " - 'no_key_component': Must REMOVE a core algorithmic component " "(e.g., disable the graph structure by zeroing the adjacency, or remove " "the contrastive loss, or disable the RL policy and use random actions). 
" "The removal MUST change the forward() / step() computation.\n" " - 'reduced_capacity': Must REDUCE model capacity by at least 2x " "(e.g., halve hidden dimensions, reduce layers, shrink embedding size). " "This MUST create a new model with different architecture, NOT just " "rename a parameter with the same value.\n" " - SELF-TEST: After implementing ablations, add a startup check that " "runs one forward pass per condition on the SAME input and asserts outputs " "differ. Print: `ABLATION_CHECK: vs outputs_differ=True`.\n" " - If outputs are identical, the ablation is BROKEN — do not proceed.\n\n" "CODE IMPLEMENTATION DEPTH (CRITICAL — shallow code = reject):\n" "- Each algorithm/method MUST be a separate Python class with genuine logic.\n" "- Each class MUST have at least: __init__(), and one core method " "(forward/predict/train_step/step) with non-trivial implementation.\n" "- The core method of the MAIN proposed method MUST be at least 20 lines " "of effective code (excluding comments, blanks, imports).\n" "- FORBIDDEN patterns that will be detected and rejected:\n" " * `class MethodB(MethodA): pass` — empty subclass\n" " * Two classes with identical method bodies but different names\n" " * nn.Linear/nn.Conv2d created inside forward() instead of __init__()\n" " * Variables defined only inside an if-branch but used after the branch\n" " * Using np.erf() (doesn't exist — use scipy.special.erf or math.erf)\n" " * Using ndarray.ptp() (removed in NumPy 2.0 — use np.ptp(arr) or arr.max()-arr.min())\n" " * Using np.bool, np.int, np.float, np.complex (removed in NumPy 2.0 — use np.bool_, np.int64, etc.)\n" " * Replacing real model training with synthetic utility functions or random scores\n" " * Using dict[key] without ensuring key exists — use dict.get(key, default) " "or verify key is in dict before access\n" "- If the experiment plan includes 'implementation_spec', you MUST follow " "the pseudocode steps exactly. 
Each algorithm_step should correspond to " "1-3 lines of code in the class.\n" "- Ablation variants MUST modify the forward() or step() logic, not just " "change a hyperparameter value.\n\n" "MINIMUM SEED COUNT (CRITICAL — 3 seeds = unpublishable):\n" "- Use AT LEAST 5 random seeds per condition (10 preferred if time permits)\n" "- Use AT LEAST 30 episodes per seed for RL methods\n" "- When computing bootstrap CIs, use at least 1000 bootstrap samples\n" "- For method comparisons: use paired bootstrap or Wilcoxon signed-rank test\n" "- Report effect sizes (Cohen's d) alongside p-values\n\n" "Experiment plan:\n{exp_plan}" ), "max_tokens": 8192, }, "resource_planning": { "system": "You are an experiment scheduler.", "user": ( "Create schedule JSON with GPU/time estimates.\n" "Schema: {tasks:[{id,name,depends_on,gpu_count,estimated_minutes," "priority}], total_gpu_budget, generated}.\n" "Experiment plan:\n{exp_plan}" ), "json_mode": True, }, # ── Phase F: Analysis & Decision ───────────────────────────────────── "result_analysis": { "system": ( "You are a quantitative research analyst. Always cite exact numbers " "from the provided data." ), "user": ( "{preamble}\n\n" "{data_context}\n\n" "Analyze run metrics and produce markdown report with statistical " "interpretation.\n" "Use the ACTUAL quantitative values provided above — do NOT invent " "numbers.\n\n" "SANITY CHECKS (perform BEFORE interpreting results):\n" "1. MONOTONICITY: If a condition scales a parameter (e.g., N agents, " "model size), check whether metrics move in the expected direction. " "If accuracy *decreases* when adding more agents under majority voting, " "flag this as a likely implementation bug (vote parsing, normalization, " "or aggregation issue).\n" "2. BASELINE PLAUSIBILITY: Random-chance baselines should match " "theoretical expectations (e.g., 1/K for K-class classification).\n" "3. 
CROSS-CONDITION CONSISTENCY: Results across datasets or conditions " "should be internally coherent — wildly different patterns may indicate " "confounds or bugs.\n" "4. REPLICATION: If results are from a single seed (n=1), explicitly " "note that no statistical significance claims can be made.\n" "5. ABLATION ISOLATION: Compare per-seed values across conditions. If " "two conditions produce IDENTICAL values for the same seed, this is a " "RED FLAG — the ablation/variant may not have actually changed the code " "path (e.g., config not applied, caching, shared state). Flag this " "explicitly and recommend a config/registry audit.\n" "6. METRIC DEFINITION CHECK: Look for a `METRIC_DEF:` line in the output. " "If absent, flag that the primary metric is UNDEFINED — direction, units, " "and formula are unknown, making all comparisons uninterpretable. This is " "a critical methodology gap.\n" "7. CONDITION COMPLETENESS CHECK: Look for `REGISTERED_CONDITIONS:` in " "the output. Compare against the experiment plan. If conditions are missing " "or failed (look for `CONDITION_FAILED:`), list them explicitly and assess " "whether the remaining conditions can still answer the research question.\n" "8. DEGENERATE METRICS CHECK: If ALL conditions (or all but one) produce " "the SAME mean primary metric value, flag this as DEGENERATE — the metric " "is NOT discriminative. Common causes: (a) time-to-event metric that only " "checks success at the final step (returns horizon for all methods), " "(b) ceiling/floor effects from too-easy or too-hard tasks, " "(c) metric capped at a budget value. This makes the experiment " "scientifically useless — recommend REFINE with a note to fix the metric " "computation or task difficulty. Look for `WARNING: DEGENERATE_METRICS` " "in stdout. 
Even if not printed, check the numbers yourself.\n\n" "Required sections: Metrics Summary (with real values), " "Consensus Findings (high confidence), " "Contested Points (with evidence-based resolution), " "Statistical Checks, Methodology Audit, Limitations, Conclusion.\n" "In the Conclusion, include:\n" "- Result quality rating (1-10)\n" "- Key findings (3-5)\n" "- Methodology gaps to address next\n" "- Recommendation: PROCEED / REFINE / PIVOT\n\n" "Run context:\n{context}" ), "max_tokens": 8192, }, "research_decision": { "system": "You are a research program lead making go/no-go decisions.", "user": ( "Based on the analysis, make one of three decisions:\n" "- **PROCEED** — results are sufficient, move to paper writing\n" "- **PIVOT** — hypotheses are fundamentally flawed, generate new ones\n" "- **REFINE** — hypotheses are sound but experiments need re-tuning\n\n" "MINIMUM QUALITY CRITERIA for PROCEED (ALL must be met):\n" "1. At least 2 baselines AND the proposed method have results\n" "2. The primary metric is defined (direction, units known)\n" "3. Each condition has results from ≥3 seeds\n" "4. No identical per-seed values across different conditions (ablation integrity)\n" "5. 
The analysis quality rating is ≥4/10\n" "If ANY criterion is not met, you MUST choose REFINE (not PROCEED).\n\n" "Output markdown with sections:\n" "## Decision\n" "State exactly one of: PROCEED, PIVOT, or REFINE\n\n" "## Justification\n" "Why this decision is warranted based on evidence.\n\n" "## Evidence\n" "Key data points supporting the decision.\n\n" "## Next Actions\n" "Concrete steps for the chosen path.\n\n" "Analysis:\n{analysis}" ), }, # ── Phase G: Paper Writing ─────────────────────────────────────────── "paper_outline": { "system": "You are an academic writing planner for top-tier AI conferences.", "user": ( "{preamble}\n\n" "{academic_style_guide}\n\n" "Create a detailed paper outline in markdown.\n" "Include per-section goals, word count targets, and evidence links.\n" "The outline MUST include a catchy method name (2-5 chars) for the paper title.\n" "Propose 3 candidate titles following the 'MethodName: Subtitle' format " "(each <= 14 words). Rate each on memorability (1-5), specificity (1-5), " "and novelty signal (1-5).\n" "{topic_constraint}" "{feedback}" "Analysis:\n{analysis}\n\nDecision:\n{decision}" ), "max_tokens": 8192, }, "paper_draft": { "system": ( "You are a top-tier academic paper author writing for leading venues.\n\n" "KEY PRINCIPLES (from accepted paper analyses):\n" "1. NOVELTY: A good paper has 1-2 key ideas and keeps the rest simple.\n" "2. NARRATIVE: A short, rigorous, evidence-based technical story with a takeaway.\n" "3. STRONG BASELINES: Invest real effort in making baselines competitive.\n" "4. ABLATIONS: Remove one component at a time and measure the effect.\n" "5. HONESTY: Acknowledge limitations explicitly.\n" "6. REPRODUCIBILITY: Include all details needed to reproduce results.\n\n" "EVIDENCE-BOUNDING RULES (CRITICAL — violation = reject):\n" "7. EVERY claim in the title, abstract, and conclusion MUST be directly " "supported by specific experimental metrics provided below.\n" "8. 
If the experiment only covers partial conditions, the title MUST NOT " "make global causal claims. Use 'Toward...', 'Investigating...', or " "'An Empirical Study of...' instead of 'X Dominates Y'.\n" "9. BEFORE writing the title, list the conditions actually tested and " "their metric values. The title must only claim what those numbers show.\n" "10. If a metric is a single scalar without condition labels, do NOT " "claim comparative results between strategies/methods.\n" "11. Distinguish between 'we propose and validate' (has full results) vs " "'we propose and present preliminary evidence' (partial results).\n\n" "You ONLY use real experimental data — never fabricate or approximate numbers.\n\n" "METHOD SECTION REQUIREMENTS:\n" "12. The Method section MUST include ALL implementation details needed " "for reproduction: algorithm pseudocode or step-by-step description, " "hyperparameters (learning rate, clipping, discount factor, etc.), " "state/observation representation, reward definition, and baseline " "configurations.\n" "13. For learning-based methods: specify model architecture, training procedure " "(iterations, epochs, batch handling), and any stability " "mechanisms (regularization, normalization).\n" "14. For baselines: specify the exact algorithm/method configuration " "and any tuning performed to make baselines competitive.\n\n" "FAILURE-AWARE REPORTING REQUIREMENTS:\n" "15. If any method has a success rate < 100%, the Results section " "MUST report success rates per method and explain inclusion/exclusion " "criteria.\n" "16. Report BOTH conditional metrics (successful runs only) AND " "unconditional metrics (treating failures as worst-case). Without " "both, comparative claims are biased by survivorship.\n" "17. The Limitations section MUST discuss stability/reliability " "if any method showed NaN/divergence/crashes.\n\n" "BENCHMARK & ENVIRONMENT SPECIFICATION:\n" "18. 
The Experiments section MUST fully specify the evaluation " "environment: state/observation space, action space, hypothesis space, " "noise model, episode length, and any randomization procedures.\n" "19. Report results PER REGIME (e.g., per noise level, per problem " "size) with separate tables or sub-sections. Aggregated-only results " "cannot support claims about robustness or generality.\n" "20. Include a table comparing all methods across all regimes with " "paired statistical tests (bootstrap CI of paired differences, or " "paired t-test p-values). Without this, comparative claims lack " "statistical grounding.\n\n" "METHOD NAMING RULES:\n" "21. NEVER use generic labels like 'baseline_1', 'method_variant_1', " "'method_variant_2' in the paper. Use descriptive algorithm/method names " "that reflect what the method actually does. Generic labels make the paper " "scientifically uninterpretable.\n" "22. Each method MUST have a full description: architecture, " "training procedure, key hyperparameters, and implementation details. " "A reader should be able to reimplement every method from the paper alone.\n\n" "STATISTICAL REPORTING (MANDATORY for acceptance):\n" "23. EVERY result table MUST include 95% confidence intervals " "(mean +/- CI or [low, high]).\n" "24. EVERY comparison claim ('A outperforms B') MUST cite p-value. " "If p >= 0.05, write: 'The difference is not statistically significant.'\n" "25. If the proposed method does NOT statistically significantly " "outperform a baseline, do NOT claim superiority. Reframe as " "'comparable', 'competitive', or 'negative result'.\n\n" "WRITING STYLE RULES:\n" "26. DO NOT repeat disclaimers like 'due to computational constraints, " "this analysis was not conducted' more than once. State each limitation " "ONCE in the Limitations section.\n" "27. The Limitations section should be concise (200-400 words) listing " "3-5 key limitations. Do NOT scatter limitation disclaimers throughout " "every section.\n" "28. 
Focus 80% of the paper on WHAT YOU DID and WHAT YOU FOUND, not " "on what you could not do. Positive scientific contribution should " "dominate the paper.\n" "29. Cite 25-40 unique references in the paper body. The Related Work " "section alone should cite at least 15 references. Cite only directly " "relevant work — do NOT pad with tangentially related papers.\n" "30. CITE ORIGINAL PAPERS: When discussing a technique (e.g., Batch " "Normalization, ResNet, Adam, PPO), ALWAYS cite the original paper that " "introduced it. Do NOT cite a survey or follow-up instead of the original. " "The available references list includes foundational papers — use them.\n" "31. BASELINE MODERNITY: When discussing baselines and comparisons, ensure " "the paper acknowledges whether the baselines represent current practice. " "If baselines are older methods, explicitly discuss why they were chosen " "and acknowledge stronger modern alternatives exist." ), "user": ( "{preamble}\n\n" "{academic_style_guide}\n" "{narrative_writing_rules}\n" "{anti_hedging_rules}\n" "{anti_repetition_rules}\n" "Write a full paper draft section by section in markdown.\n" "Required sections: Title, Abstract, Introduction, Related Work, " "Method, Experiments, Results, Discussion, Limitations, Broader Impact, " "Conclusion, References.\n" "The Broader Impact section (2-3 paragraphs) MUST discuss: " "(1) potential positive societal impacts of this work, " "(2) potential negative societal impacts or risks, " "(3) ethical considerations specific to this research area. " "This section is MANDATORY for top ML venues and recommended for all research papers.\n" "{writing_structure}\n" "{topic_constraint}" "{exp_metrics_instruction}" "{citation_instruction}" "All experimental results MUST be presented in LaTeX tables or inline prose. " "Raw metric path formats like 'method/env/step/metric: value' are FORBIDDEN " "in the paper text. 
Convert all data to clean, formatted presentation.\n" "The paper MUST fit within 10 pages (excluding references and appendix). " "Aim for 8-9 pages of main content. Be concise.\n" "FIGURE RULES: When referencing figures, use ONLY \\ref{fig:label} cross-references. " "NEVER add bold standalone paragraphs like '**Figure 1.**' after figure environments. " "Do NOT add \\clearpage before or after figures/tables unless absolutely necessary.\n" "TABLE RULES: Tables MUST use standard LaTeX tabular syntax with bare braces: " "\\begin{tabular}{lcc}, NOT \\begin{tabular}\\{lcc\\}. " "NEVER use '--' as placeholder values in table cells. " "If a metric is unavailable, write 'N/A' or omit the row entirely.\n" "Outline:\n{outline}" ), "max_tokens": 16384, }, "peer_review": { "system": "You are a balanced conference reviewer.", "user": ( "Simulate peer review from at least 3 reviewer perspectives.\n" "Output markdown with Reviewer A (methodology expert), " "Reviewer B (domain expert), and Reviewer C (statistics/rigor expert), " "each including strengths, weaknesses, and actionable revisions.\n\n" "Check specifically:\n" "1. TOPIC ALIGNMENT: Does the paper stay on topic ({topic})? " "Flag any sections where the paper drifts to unrelated topics or " "presents environment issues as contributions.\n" "2. CLAIM-EVIDENCE ALIGNMENT: For EACH claim in the title, abstract, " "and conclusion, verify there is a specific metric/table/figure in " "the Results section supporting it. Flag unsupported claims.\n" "3. STATISTICAL VALIDITY: Are confidence intervals or error bars " "reported? Is n>1 (multiple seeds)? Are significance tests appropriate?\n" "4. COMPLETENESS: Does the paper have all required sections with " "sufficient depth? A NeurIPS paper body should be 5,000-6,500 words.\n" "5. REPRODUCIBILITY: Are hyperparameters, random seeds, compute " "resources, and dataset details fully specified?\n" "6. WRITING QUALITY: Is the paper written in flowing prose or bullet lists? 
" "Flag any bullet-point lists in Method/Results/Discussion. Check for " "excessive hedging ('we do not claim'). Verify title is <= 14 words.\n" "7. FIGURES: Does the paper include at least 2 figures? Zero figures = desk reject.\n" "8. CITATION DISTRIBUTION: Are citations only in Intro/Related Work? " "Method, Experiments, and Discussion MUST also cite relevant papers.\n\n" "Paper draft:\n{draft}\n\n" "Experiment evidence for verification:\n{experiment_evidence}" ), "max_tokens": 8192, }, "paper_revision": { "system": ( "You are a paper revision expert.\n\n" "TITLE AND ABSTRACT ALIGNMENT (CRITICAL):\n" "- After reviewing experimental evidence, UPDATE the title if results " "do not support the original claim.\n" "- If the proposed method does NOT beat baselines, use a title like " "'An Empirical Study of...', 'When X Falls Short: ...', or " "'Investigating ... : Negative Results and Insights'.\n" "- Rewrite the abstract to accurately reflect what was FOUND, not " "what was hoped. The abstract must match actual numbers.\n" "- The conclusion MUST match actual results — no aspirational claims.\n\n" "IMPORTANT WRITING RULES:\n" "- Do NOT add disclaimers like 'due to computational constraints' " "or 'this analysis was not conducted'. If a limitation exists, " "mention it ONCE in the Limitations section only.\n" "- Focus 80% of the paper on what was DONE and what was FOUND.\n" "- Do NOT add hedging language that was not in the original draft.\n" "- Keep Limitations to 200-400 words with 3-5 concise points.\n" "- Ensure every comparison claim cites a p-value or states that " "the difference is not statistically significant.\n" ), "user": ( "{academic_style_guide}\n" "{narrative_writing_rules}\n" "{anti_hedging_rules}\n" "{anti_repetition_rules}\n" "Revise the paper draft to address all review comments.\n" "Return revised markdown only.\n\n" "CRITICAL REVISION RULES:\n" "- Transform any remaining bullet-point lists in the body into flowing " "prose paragraphs. 
The only allowed lists are in the Introduction's contribution " "paragraph and the Limitations section.\n" "- The title MUST be <= 14 words with a catchy method name.\n" "- MANDATORY: The revised paper MUST contain at least 2 markdown image references\n" " (![Caption](charts/...)). If the draft has zero figures, ADD them in the Results\n" " section using the chart files. A paper with zero figures will be desk-rejected.\n" "- Consolidate ALL hedging/caveats into Limitations section only.\n" "- The final paper body MUST be <= 6,500 words (standard 9-page conference limit).\n" " If the current draft exceeds this, compress by removing redundant restatements.\n" "- If the paper exceeds 10 pages, aggressively cut redundant content, " "merge similar sections, and tighten prose. Target 8-9 pages of main content.\n" "- Do NOT add '**Figure N.**' bold paragraphs after figure environments — " "use only \\ref{fig:label} cross-references. Do NOT add \\clearpage " "before figures or tables.\n" "- NEVER use '--' placeholder values in tables. Replace with actual values or 'N/A'.\n" "- CITATION FORMAT (CRITICAL): All citations MUST remain in [cite_key] bracket " "format exactly as they appear in the draft, e.g. [smith2024transformer]. " "Do NOT convert them to author-year format like [Smith et al., 2024] or " "(Smith et al., 2024). The downstream LaTeX converter relies on the " "[cite_key] format to generate \\cite{{}} commands. Changing the format " "will break all references in the final PDF.\n" "- CITATION KEYS (CRITICAL): Do NOT invent or add new citation keys that " "are not already present in the draft. If you want to reference additional " "prior work, describe it in prose WITHOUT a citation bracket. Every " "[cite_key] you write MUST already exist in the bibliography. Adding " "hallucinated keys like [smith2020method] creates broken [?] 
references " "in the final PDF.\n" "{writing_structure}\n" "{topic_constraint}" "Draft:\n{draft}\n\nReviews:\n{reviews}" ), "max_tokens": 16384, }, # ── Phase H: Finalization ──────────────────────────────────────────── "quality_gate": { "system": "You are a final quality gate evaluator.", "user": ( "Evaluate revised paper quality and return JSON.\n" "Schema: {score_1_to_10:number, verdict:string, strengths:[...], " "weaknesses:[...], required_actions:[...]}.\n" "Threshold: {quality_threshold}\n" "Paper:\n{revised}" ), "json_mode": True, }, "knowledge_archive": { "system": "You produce reproducibility-focused research retrospectives.", "user": ( "{preamble}\n\n" "Write retrospective archive markdown with lessons, " "reproducibility notes, and future work.\n" "Decision:\n{decision}\n\nAnalysis:\n{analysis}\n\n" "Revised paper:\n{revised}" ), "max_tokens": 8192, }, "export_publish": { "system": "You are a publication formatting editor.", "user": ( "Format revised paper into clean final markdown for publication " "export.\n" "Preserve content quality and readability.\n" "CITATION FORMAT (CRITICAL): All citations MUST remain in [cite_key] bracket " "format, e.g. [smith2024transformer]. Do NOT convert to author-year " "format like [Smith et al., 2024]. The [cite_key] format is required " "for downstream LaTeX \\cite{{}} generation.\n" "Input paper:\n{revised}" ), "max_tokens": 16384, }, } ================================================ FILE: researchclaw/quality.py ================================================ """Content quality assessment — template detection and metrics. Detects placeholder/template content in LLM-generated text and provides quality metrics for pipeline outputs. 
"""

from __future__ import annotations

import logging
import re
from dataclasses import dataclass

logger = logging.getLogger(__name__)

# Regex patterns that indicate template/placeholder text, as
# (pattern, human-readable description) pairs.  All patterns are
# case-insensitive via the inline (?i) flag and are matched against
# individual stripped lines (see detect_template_content and
# compute_template_ratio below).
_TEMPLATE_PATTERNS: list[tuple[str, str]] = [
    (
        r"(?i)template\s+(abstract|introduction|method|methodology|conclusion|discussion|results|related\s+work)",
        "Template section header",
    ),
    (r"(?i)\[INSERT\s+.*?\]", "Insert placeholder"),
    (r"(?i)\[TODO\s*:?\s*.*?\]", "TODO placeholder"),
    (r"(?i)\[PLACEHOLDER\s*:?\s*.*?\]", "Explicit placeholder"),
    (r"(?i)lorem\s+ipsum", "Lorem ipsum filler"),
    (
        r"(?i)this\s+section\s+will\s+(describe|discuss|present|outline|explain)",
        "Future-tense placeholder",
    ),
    (
        r"(?i)we\s+will\s+(describe|discuss|present|outline|explain)\s+in\s+this\s+section",
        "Future-tense placeholder",
    ),
    (
        r"(?i)add\s+(your|the)\s+(content|text|description)\s+here",
        "Add content placeholder",
    ),
    (r"(?i)replace\s+this\s+(text|content|section)", "Replace placeholder"),
    # ^/$ anchors are safe here because matching is done per stripped line.
    (r"(?i)^#+\s*section\s+\d+\s*$", "Generic section header"),
    (
        r"(?i)your\s+(abstract|introduction|method|results)\s+goes?\s+here",
        "Content placeholder",
    ),
    (r"(?i)sample\s+(abstract|introduction|text|content)", "Sample content marker"),
]


@dataclass(frozen=True)
class TemplateMatch:
    """A single template/placeholder detection."""

    # Human-readable description of the matched pattern (second tuple
    # element of _TEMPLATE_PATTERNS).
    pattern_desc: str
    # 1-based line number within the scanned text.
    line_number: int
    # Matched text, truncated to at most 100 characters.
    excerpt: str


@dataclass(frozen=True)
class QualityReport:
    """Quality assessment for a text document."""

    # Total number of lines in the document (including blank lines).
    total_lines: int
    # Total number of characters in the raw document text.
    total_chars: int
    # All template/placeholder detections, in scan order.
    template_matches: tuple[TemplateMatch, ...] = ()
    # Fraction of non-blank character mass on template-matching lines,
    # in [0.0, 1.0].
    template_ratio: float = 0.0

    @property
    def has_template_content(self) -> bool:
        # True when at least one template pattern matched.
        return len(self.template_matches) > 0

    @property
    def match_count(self) -> int:
        # Number of individual pattern matches; a single line may
        # contribute several (see detect_template_content).
        return len(self.template_matches)

    def to_dict(self) -> dict[str, object]:
        """Serialize the report to a JSON-friendly dictionary."""
        match_rows: list[dict[str, object]] = [
            {
                "pattern": m.pattern_desc,
                "line": m.line_number,
                "excerpt": m.excerpt,
            }
            for m in self.template_matches
        ]
        return {
            "total_lines": self.total_lines,
            "total_chars": self.total_chars,
            "template_matches": match_rows,
            # Rounded for compact, stable serialization.
            "template_ratio": round(self.template_ratio, 4),
            "has_template_content": self.has_template_content,
            "match_count": self.match_count,
        }


def detect_template_content(text: str) -> list[TemplateMatch]:
    """Scan text for template/placeholder patterns.

    Returns list of TemplateMatch objects for each detected pattern.
    Line numbers are 1-based; blank lines are skipped.  Every pattern is
    tried against every line (there is no early break), so one line can
    yield multiple matches.
    """
    matches: list[TemplateMatch] = []
    lines = text.split("\n")
    for line_num, line in enumerate(lines, start=1):
        stripped = line.strip()
        if not stripped:
            continue
        for pattern, desc in _TEMPLATE_PATTERNS:
            for m in re.finditer(pattern, stripped):
                # Cap the stored excerpt so reports stay readable.
                excerpt = m.group(0)[:100]
                matches.append(
                    TemplateMatch(
                        pattern_desc=desc,
                        line_number=line_num,
                        excerpt=excerpt,
                    )
                )
    return matches


def compute_template_ratio(text: str) -> float:
    """Estimate what fraction of the text is template/placeholder content.

    Returns 0.0 (fully original) to 1.0 (fully template).

    Simple heuristic: count characters in matched lines vs total.
""" if not text.strip(): return 0.0 lines = text.split("\n") total_chars = sum(len(line.strip()) for line in lines if line.strip()) if total_chars == 0: return 0.0 template_chars = 0 for line in lines: stripped = line.strip() if not stripped: continue for pattern, _ in _TEMPLATE_PATTERNS: if re.search(pattern, stripped): template_chars += len(stripped) break return min(template_chars / total_chars, 1.0) def assess_quality(text: str) -> QualityReport: """Full quality assessment of a text document.""" lines = text.split("\n") matches = detect_template_content(text) ratio = compute_template_ratio(text) report = QualityReport( total_lines=len(lines), total_chars=len(text), template_matches=tuple(matches), template_ratio=ratio, ) logger.debug( "quality assessed lines=%d chars=%d matches=%d ratio=%.4f", report.total_lines, report.total_chars, report.match_count, report.template_ratio, ) return report def check_strict_quality(text: str, *, threshold: float = 0.05) -> tuple[bool, str]: """Check if text passes strict quality gate. Returns (passed, message). Fails if template_ratio > threshold. """ report = assess_quality(text) if report.template_ratio > threshold: details = "; ".join( f"L{m.line_number}: {m.excerpt}" for m in report.template_matches[:5] ) return False, ( f"Template content detected: ratio={report.template_ratio:.2%}, " f"{report.match_count} matches. Examples: {details}" ) return True, f"Quality check passed: template_ratio={report.template_ratio:.2%}" ================================================ FILE: researchclaw/report.py ================================================ """Generate human-readable run reports from pipeline artifacts.""" # pyright: basic from __future__ import annotations import json import logging import re from pathlib import Path from typing import Any logger = logging.getLogger(__name__) def generate_report(run_dir: Path) -> str: """Generate a Markdown report from a pipeline run directory. 
Args: run_dir: Path to the run artifacts directory (e.g., artifacts/rc-xxx/) Returns: Markdown string with the report content. Raises: FileNotFoundError: If run_dir doesn't exist. ValueError: If run_dir has no pipeline_summary.json. """ if not run_dir.exists(): raise FileNotFoundError(f"Run directory not found: {run_dir}") summary_path = run_dir / "pipeline_summary.json" if not summary_path.exists(): raise ValueError(f"No pipeline_summary.json found in {run_dir}") loaded = json.loads(summary_path.read_text(encoding="utf-8")) summary = loaded if isinstance(loaded, dict) else {} sections = [] sections.append(_header(summary, run_dir)) sections.append(_paper_section(run_dir)) sections.append(_experiment_section(run_dir)) sections.append(_citation_section(run_dir)) sections.append(_warnings_section(summary)) return "\n\n".join(section for section in sections if section) def _header(summary: dict[str, Any], run_dir: Path) -> str: run_id = summary.get("run_id", "unknown") stages_done = summary.get("stages_done", 0) stages_total = summary.get("stages_executed", 0) status = summary.get("final_status", "unknown") generated = summary.get("generated", "unknown") status_icon = "✅" if status == "done" else "❌" if status == "failed" else "⚠️" lines = [ "# ResearchClaw Run Report", "", f"**Run ID**: {run_id}", f"**Date**: {generated}", f"**Status**: {status_icon} {status} ({stages_done}/{stages_total} stages done)", f"**Artifacts**: `{run_dir}`", ] return "\n".join(lines) def _paper_section(run_dir: Path) -> str: lines = ["## Paper"] draft_path = run_dir / "stage-17" / "paper_draft.md" if draft_path.exists(): text = draft_path.read_text(encoding="utf-8") word_count = len(text.split()) lines.append( f"- Draft: `{draft_path.relative_to(run_dir)}` (~{word_count} words)" ) else: lines.append("- Draft: not generated") final_path = run_dir / "stage-22" / "paper_final.md" if final_path.exists(): lines.append(f"- Final: `{final_path.relative_to(run_dir)}`") tex_path = run_dir / 
"stage-22" / "paper.tex" if tex_path.exists(): lines.append(f"- LaTeX: `{tex_path.relative_to(run_dir)}`") rev_path = run_dir / "stage-19" / "paper_revised.md" if rev_path.exists(): lines.append(f"- Revised: `{rev_path.relative_to(run_dir)}`") return "\n".join(lines) def _experiment_section(run_dir: Path) -> str: lines = ["## Experiments"] code_path = run_dir / "stage-10" / "experiment_code.py" if code_path.exists(): lines.append(f"- Code: `{code_path.relative_to(run_dir)}`") results_path = run_dir / "stage-12" / "experiment_results.json" if results_path.exists(): try: loaded = json.loads(results_path.read_text(encoding="utf-8")) if isinstance(loaded, dict): data = loaded runs_default: list[Any] = [] iterations = data.get("iterations", data.get("runs", runs_default)) if isinstance(iterations, list): lines.append(f"- Runs: {len(iterations)} iterations") best = data.get("best_metric") or data.get("best_result") if best is not None: lines.append(f"- Best metric: {best}") except (json.JSONDecodeError, TypeError): lines.append("- Results: present (parse error)") else: lines.append("- Results: not available") # BUG-215: Also search stage-14* versioned dirs when stage-14/ is missing. 
    # sorted(..., reverse=True) prefers the highest-versioned stage-14* dir.
    analysis_path = run_dir / "stage-14" / "analysis.md"
    if not analysis_path.exists():
        for _s14 in sorted(run_dir.glob("stage-14*"), reverse=True):
            _alt = _s14 / "analysis.md"
            if _alt.exists():
                analysis_path = _alt
                break
    if analysis_path.exists():
        lines.append(f"- Analysis: `{analysis_path.relative_to(run_dir)}`")
    return "\n".join(lines)


def _citation_section(run_dir: Path) -> str:
    """Build the '## Citations' section of the run report.

    Counts BibTeX entries (stage-22 preferred, stage-04 as fallback) and
    summarizes the stage-23 verification report when present.
    """
    lines = ["## Citations"]
    bib_path = run_dir / "stage-22" / "references.bib"
    if not bib_path.exists():
        # Fall back to the earlier literature-survey bibliography.
        bib_path = run_dir / "stage-04" / "references.bib"
    if bib_path.exists():
        text = bib_path.read_text(encoding="utf-8")
        # Each BibTeX entry opens with "@<type>{"; counting those counts entries.
        entries = re.findall(r"@\w+\{", text)
        lines.append(f"- References: {len(entries)} BibTeX entries")
    else:
        lines.append("- References: not available")
    verify_path = run_dir / "stage-23" / "verification_report.json"
    if verify_path.exists():
        try:
            loaded = json.loads(verify_path.read_text(encoding="utf-8"))
            # Non-dict JSON (e.g. a list) degrades to zero counts rather than raising.
            vdata = loaded if isinstance(loaded, dict) else {}
            total = int(vdata.get("total_references", 0))
            verified = int(vdata.get("verified_count", 0))
            suspicious = int(vdata.get("suspicious_count", 0))
            hallucinated = int(vdata.get("hallucinated_count", 0))
            # Guard against division by zero when the report has no references.
            pct = f"{verified / total * 100:.1f}%" if total > 0 else "N/A"
            lines.append(f"- Verified: {verified}/{total} ({pct})")
            if suspicious:
                lines.append(f"- Suspicious: {suspicious}")
            if hallucinated:
                lines.append(f"- Hallucinated: {hallucinated}")
        except (json.JSONDecodeError, TypeError, ZeroDivisionError):
            lines.append("- Verification: present (parse error)")
    else:
        lines.append("- Verification: not run")
    return "\n".join(lines)


def _warnings_section(summary: dict[str, Any]) -> str:
    """Build the '## Warnings' section from a run summary dict.

    Returns an empty string when there is nothing to warn about, so callers
    can drop the section entirely.
    """
    warnings: list[str] = []
    stages_failed = summary.get("stages_failed", 0)
    if stages_failed:
        warnings.append(f"- ⚠️ {stages_failed} stage(s) failed during execution")
    content_metrics = summary.get("content_metrics", {})
    if isinstance(content_metrics, dict):
        template_ratio = content_metrics.get("template_ratio")
        # Warn only above a 10% template-text threshold.
        if isinstance(template_ratio, (int, float)) and template_ratio > 0.1:
            warnings.append(
                f"- ⚠️ Template content detected: {template_ratio:.1%} of paper may be template text"
            )
        degraded = content_metrics.get("degraded_sources", [])
        if isinstance(degraded, list) and degraded:
            warnings.append(f"- ⚠️ Degraded sources: {', '.join(degraded)}")
    if not warnings:
        return ""
    return "## Warnings\n" + "\n".join(warnings)


def print_report(run_dir: Path) -> None:
    """Print the full run report to stdout."""
    print(generate_report(run_dir))


def write_report(run_dir: Path, output_path: Path) -> None:
    """Write the full run report to *output_path* as UTF-8 text."""
    report = generate_report(run_dir)
    _ = output_path.write_text(report, encoding="utf-8")


================================================
FILE: researchclaw/server/__init__.py
================================================
"""ResearchClaw Web server package."""


================================================
FILE: researchclaw/server/app.py
================================================
"""FastAPI application factory."""
from __future__ import annotations

import asyncio
import logging
from pathlib import Path
from typing import Any

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles

from researchclaw.config import RCConfig
from researchclaw.server.middleware.auth import TokenAuthMiddleware
from researchclaw.server.websocket.manager import ConnectionManager
from researchclaw.server.websocket.events import Event, EventType

logger = logging.getLogger(__name__)

# Shared application state accessible by routes.
# NOTE(review): module-level mutable state makes this app single-instance;
# routes import it directly (see routes/pipeline.py, routes/voice.py).
_app_state: dict[str, Any] = {}


def create_app(
    config: RCConfig,
    *,
    dashboard_only: bool = False,
    monitor_dir: str | None = None,
) -> FastAPI:
    """Create and configure the FastAPI application.

    Args:
        config: ResearchClaw configuration.
        dashboard_only: If True, only mount dashboard routes.
        monitor_dir: Specific run directory to monitor.
""" app = FastAPI( title="ResearchClaw", description="Autonomous Research Pipeline — Web Interface", version="0.5.0", ) # Store config in shared state _app_state["config"] = config _app_state["monitor_dir"] = monitor_dir # --- CORS --- app.add_middleware( CORSMiddleware, allow_origins=list(config.server.cors_origins), allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) # --- Token auth --- if config.server.auth_token: app.add_middleware(TokenAuthMiddleware, token=config.server.auth_token) # --- WebSocket manager --- event_manager = ConnectionManager() _app_state["event_manager"] = event_manager # --- Health endpoint --- @app.get("/api/health") async def health() -> dict[str, Any]: return { "status": "ok", "version": "0.5.0", "active_connections": event_manager.active_count, } @app.get("/api/config") async def config_summary() -> dict[str, Any]: return { "project": config.project.name, "topic": config.research.topic, "mode": config.experiment.mode, "server": { "voice_enabled": config.server.voice_enabled, "dashboard_enabled": config.dashboard.enabled, }, } # --- Routes --- from researchclaw.server.routes.pipeline import router as pipeline_router from researchclaw.server.routes.projects import router as projects_router app.include_router(pipeline_router) app.include_router(projects_router) if not dashboard_only: from researchclaw.server.routes.chat import router as chat_router, set_chat_manager set_chat_manager(event_manager) app.include_router(chat_router) if config.server.voice_enabled: from researchclaw.server.routes.voice import router as voice_router app.include_router(voice_router) # --- WebSocket events endpoint --- from fastapi import WebSocket, WebSocketDisconnect import uuid @app.websocket("/ws/events") async def events_ws(websocket: WebSocket) -> None: """Real-time event stream for dashboard.""" client_id = f"evt-{uuid.uuid4().hex[:8]}" await event_manager.connect(websocket, client_id) try: while True: # Keep connection alive; client can 
send pings await websocket.receive_text() except WebSocketDisconnect: event_manager.disconnect(client_id) # --- Static files (frontend) --- frontend_dir = Path(__file__).resolve().parent.parent.parent / "frontend" if frontend_dir.is_dir(): app.mount("/static", StaticFiles(directory=str(frontend_dir)), name="static") # Serve index.html at root from fastapi.responses import FileResponse @app.get("/") async def index() -> FileResponse: return FileResponse(str(frontend_dir / "index.html")) # --- Background tasks --- @app.on_event("startup") async def startup() -> None: asyncio.create_task(event_manager.heartbeat_loop(interval=15.0)) if config.dashboard.enabled: from researchclaw.dashboard.broadcaster import start_dashboard_loop asyncio.create_task( start_dashboard_loop( event_manager, interval=config.dashboard.refresh_interval_sec, monitor_dir=monitor_dir, ) ) logger.info("ResearchClaw Web server started") return app ================================================ FILE: researchclaw/server/dialog/__init__.py ================================================ """Dialog / conversational research modules.""" ================================================ FILE: researchclaw/server/dialog/intents.py ================================================ """Intent classification for conversational research.""" from __future__ import annotations import re from enum import Enum from typing import Any class Intent(str, Enum): """Research chat intents.""" TOPIC_SELECTION = "topic_selection" START_PIPELINE = "start_pipeline" CHECK_STATUS = "check_status" MODIFY_CONFIG = "modify_config" DISCUSS_RESULTS = "discuss_results" EDIT_PAPER = "edit_paper" GENERAL_CHAT = "general_chat" HELP = "help" # Keyword patterns for fast classification _INTENT_PATTERNS: list[tuple[Intent, re.Pattern[str]]] = [ (Intent.HELP, re.compile( r"(?:^\s*help\s*$|\bhow\s+to\b|\busage\b|帮助|怎么用)", re.IGNORECASE )), (Intent.START_PIPELINE, re.compile( r"(?:\b(?:start|run|begin|launch)\b|开始|启动|跑|运行)", re.IGNORECASE, 
)), (Intent.CHECK_STATUS, re.compile( r"(?:\b(?:status|progress|stage|current)\b|阶段|进度|到哪|第几|哪一步)", re.IGNORECASE )), (Intent.TOPIC_SELECTION, re.compile( r"(?:\b(?:topic|idea|direction)\b|research\s+direction|研究方向|选题|研究主题|想法)", re.IGNORECASE, )), (Intent.MODIFY_CONFIG, re.compile( r"(?:\b(?:config|setting|parameter|batch|epoch)\b|learning\s+rate|学习率|修改|设置)", re.IGNORECASE, )), (Intent.DISCUSS_RESULTS, re.compile( r"(?:\b(?:results?|metrics?|accuracy|loss|performance)\b|结果|指标|效果|怎么样)", re.IGNORECASE, )), (Intent.EDIT_PAPER, re.compile( r"(?:\b(?:paper|abstract|introduction|draft)\b|论文|摘要|改一下|写)", re.IGNORECASE, )), ] def classify_intent(message: str) -> tuple[Intent, float]: """Classify user intent from message text. Returns (intent, confidence) where confidence is 0-1. Uses keyword matching for speed; can be replaced with LLM. """ message_lower = message.strip().lower() if not message_lower: return Intent.GENERAL_CHAT, 0.0 for intent, pattern in _INTENT_PATTERNS: if pattern.search(message_lower): return intent, 0.8 return Intent.GENERAL_CHAT, 0.5 ================================================ FILE: researchclaw/server/dialog/router.py ================================================ """Dialog router — routes messages to appropriate handlers.""" from __future__ import annotations import json import logging from pathlib import Path from typing import Any from researchclaw.server.dialog.intents import Intent, classify_intent from researchclaw.server.dialog.session import ChatSession, SessionManager logger = logging.getLogger(__name__) _session_manager = SessionManager() async def route_message(raw_message: str, client_id: str) -> str: """Route incoming chat message and return response.""" # Parse message (could be plain text or JSON) try: msg_data = json.loads(raw_message) text = msg_data.get("message", msg_data.get("text", raw_message)) except (json.JSONDecodeError, TypeError): text = raw_message session = _session_manager.get_or_create(client_id) 
    session.add_message("user", text)

    # Keyword-based classification (see intents.py); confidence is currently
    # only logged, not used for routing decisions.
    intent, confidence = classify_intent(text)
    logger.debug("Intent: %s (%.2f) for: %s", intent.value, confidence, text[:50])

    handler = _HANDLERS.get(intent, _handle_general)
    response = await handler(text, session)
    session.add_message("assistant", response)
    return response


# The handlers below return canned guidance text; `text` and `session` are
# accepted for a uniform handler signature but are not yet used by most of them.

async def _handle_help(text: str, session: ChatSession) -> str:
    """Respond to HELP intent with a capability overview."""
    return (
        "I can help you with:\n"
        "- **Select a research topic**: describe your area of interest\n"
        "- **Start a pipeline run**: say 'start experiment' or 'run pipeline'\n"
        "- **Check progress**: ask 'what stage are we at?'\n"
        "- **View results**: ask about metrics, accuracy, or results\n"
        "- **Modify settings**: change learning rate, epochs, etc.\n"
        "- **Edit paper**: suggest changes to abstract, introduction, etc.\n\n"
        "Just type naturally — I'll figure out what you need!"
    )


async def _handle_status(text: str, session: ChatSession) -> str:
    """Respond to CHECK_STATUS intent by summarizing pipeline runs.

    Prefers the first active run; otherwise reports the most recent run.
    """
    # Imported lazily so the dialog module does not hard-depend on the dashboard.
    from researchclaw.dashboard.collector import DashboardCollector

    collector = DashboardCollector()
    runs = collector.collect_all()
    if not runs:
        return "No pipeline runs found. Start one with 'start pipeline'."
    active = [r for r in runs if r.is_active]
    if active:
        r = active[0]
        return (
            f"**Active run**: {r.run_id}\n"
            f"- Stage: {r.current_stage}/23 ({r.current_stage_name})\n"
            f"- Status: {r.status}\n"
            f"- Topic: {r.topic or '(not set)'}"
        )
    # NOTE(review): assumes collect_all() returns newest-first — confirm.
    latest = runs[0]
    return (
        f"**Latest run**: {latest.run_id}\n"
        f"- Stage: {latest.current_stage}/23\n"
        f"- Status: {latest.status}\n"
        f"- Stages completed: {len(latest.stages_completed)}"
    )


async def _handle_start(text: str, session: ChatSession) -> str:
    """Respond to START_PIPELINE intent with launch instructions."""
    return (
        "To start a pipeline run, use the dashboard or API:\n"
        "```\n"
        "POST /api/pipeline/start\n"
        '{"topic": "your research topic", "auto_approve": true}\n'
        "```\n"
        "Or run from CLI: `researchclaw run -c config.yaml`\n\n"
        "Would you like me to help you set up the configuration?"
    )


async def _handle_topic(text: str, session: ChatSession) -> str:
    """Respond to TOPIC_SELECTION intent with a topic-elicitation prompt."""
    return (
        "Let me help you find a research direction!\n\n"
        "Please tell me:\n"
        "1. Your research **domain** (e.g., CV, NLP, RL, AI4Science)\n"
        "2. Any **specific interests** (e.g., robustness, efficiency, fairness)\n"
        "3. Your **target venue** (e.g., NeurIPS, ICML, ICLR)\n\n"
        "I'll suggest novel, timely research angles based on recent trends."
    )


async def _handle_config(text: str, session: ChatSession) -> str:
    """Respond to MODIFY_CONFIG intent with configuration options."""
    return (
        "You can modify the configuration through:\n"
        "1. Edit `config.yaml` directly\n"
        "2. Use the wizard: `researchclaw wizard`\n"
        "3. Pass overrides when starting: "
        '`POST /api/pipeline/start {"config_overrides": {...}}`\n\n'
        "What setting would you like to change?"
    )


async def _handle_results(text: str, session: ChatSession) -> str:
    """Respond to DISCUSS_RESULTS intent with metrics from the latest run."""
    from researchclaw.dashboard.collector import DashboardCollector

    collector = DashboardCollector()
    runs = collector.collect_all()
    if not runs:
        return "No runs found yet. Start a pipeline first."
    latest = runs[0]
    if not latest.metrics:
        return f"Run {latest.run_id} has no metrics yet (stage {latest.current_stage}/23)."
    lines = [f"**Results for {latest.run_id}**:\n"]
    for key, value in latest.metrics.items():
        # Only scalar metrics are formatted as bullets; anything else falls
        # through to the raw-dict fallback below.
        if isinstance(value, (int, float)):
            lines.append(f"- {key}: {value}")
    return "\n".join(lines) if len(lines) > 1 else f"Metrics: {latest.metrics}"


async def _handle_paper(text: str, session: ChatSession) -> str:
    """Respond to EDIT_PAPER intent with paper-editing options."""
    return (
        "Paper editing is available after Stage 17 (Paper Draft).\n\n"
        "I can help with:\n"
        "- Review and suggest improvements to the abstract\n"
        "- Check the introduction structure\n"
        "- Verify experiment descriptions match actual results\n"
        "- Improve related work coverage\n\n"
        "Which section would you like to work on?"
    )


async def _handle_general(text: str, session: ChatSession) -> str:
    """Fallback handler for GENERAL_CHAT / unclassified messages."""
    return (
        "I'm your ResearchClaw assistant. I can help with:\n"
        "- Selecting research topics\n"
        "- Running experiments\n"
        "- Monitoring progress\n"
        "- Analyzing results\n"
        "- Editing papers\n\n"
        "What would you like to do?"
    )


# Intent → handler dispatch table used by route_message().
_HANDLERS = {
    Intent.HELP: _handle_help,
    Intent.CHECK_STATUS: _handle_status,
    Intent.START_PIPELINE: _handle_start,
    Intent.TOPIC_SELECTION: _handle_topic,
    Intent.MODIFY_CONFIG: _handle_config,
    Intent.DISCUSS_RESULTS: _handle_results,
    Intent.EDIT_PAPER: _handle_paper,
    Intent.GENERAL_CHAT: _handle_general,
}


================================================
FILE: researchclaw/server/dialog/session.py
================================================
"""Conversation session management."""
from __future__ import annotations

import json
import logging
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

logger = logging.getLogger(__name__)


@dataclass
class ChatMessage:
    """A single chat message."""

    role: str  # "user" or "assistant"
    content: str
    timestamp: float = field(default_factory=time.time)  # Unix epoch seconds

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-compatible dict (role/content/timestamp)."""
        return {"role": self.role, "content": self.content, "timestamp": self.timestamp}


@dataclass
class ChatSession:
    """Per-client chat session state."""

    client_id: str
    history: list[ChatMessage] = field(default_factory=list)
    current_project: str = ""
    current_run: str = ""
    created_at: float = field(default_factory=time.time)
    # NOTE(review): as a dataclass class-level default this becomes a per-
    # instance field named MAX_HISTORY, not a class constant — confirm intended.
    MAX_HISTORY: int = 50

    def add_message(self, role: str, content: str) -> ChatMessage:
        """Append a message to the history and return it."""
        msg = ChatMessage(role=role, content=content)
        self.history.append(msg)
        # Trim to prevent unbounded growth
        if len(self.history) > self.MAX_HISTORY:
            self.history = self.history[-self.MAX_HISTORY:]
        return msg

    def get_context(self, last_n: int = 10) -> list[dict[str, str]]:
        """Get recent messages for LLM context."""
        return [
            {"role": m.role, "content": m.content}
            for m in self.history[-last_n:]
        ]

    def to_dict(self) -> dict[str, Any]:
        """Serialize the full session (including history) for persistence."""
        return {
            "client_id": self.client_id,
            "current_project": self.current_project,
            "current_run": self.current_run,
            "history": [m.to_dict() for m in self.history],
            "created_at": self.created_at,
        }


class SessionManager:
    """Manage chat sessions.

    Sessions live in memory keyed by client_id; save()/load() optionally
    persist them as JSON files under *persist_dir*.
    """

    def __init__(self, persist_dir: str = ".researchclaw/sessions") -> None:
        self._sessions: dict[str, ChatSession] = {}
        self._persist_dir = Path(persist_dir)

    def get_or_create(self, client_id: str) -> ChatSession:
        """Get existing session or create new one."""
        if client_id not in self._sessions:
            self._sessions[client_id] = ChatSession(client_id=client_id)
        return self._sessions[client_id]

    def remove(self, client_id: str) -> None:
        """Remove a session."""
        self._sessions.pop(client_id, None)

    def save(self, client_id: str) -> None:
        """Persist session to disk."""
        session = self._sessions.get(client_id)
        if not session:
            return
        self._persist_dir.mkdir(parents=True, exist_ok=True)
        path = self._persist_dir / f"{client_id}.json"
        try:
            with path.open("w", encoding="utf-8") as f:
                json.dump(session.to_dict(), f, ensure_ascii=False, indent=2)
        except Exception:
            # Best-effort persistence: failures are logged at debug and ignored.
            logger.debug("Failed to persist session %s", client_id)

    def load(self, client_id: str) -> ChatSession | None:
        """Load session from disk.

        Returns None when no file exists or parsing fails; on success the
        session is also registered in the in-memory map.
        """
        path = self._persist_dir / f"{client_id}.json"
        if not path.exists():
            return None
        try:
            with path.open() as f:
                data = json.load(f)
            session = ChatSession(
                client_id=data["client_id"],
                current_project=data.get("current_project", ""),
                current_run=data.get("current_run", ""),
                created_at=data.get("created_at", time.time()),
            )
            for m in data.get("history", []):
                session.history.append(
                    ChatMessage(
                        role=m["role"],
                        content=m["content"],
                        timestamp=m.get("timestamp", 0),
                    )
                )
            self._sessions[client_id] = session
            return session
        except Exception:
            # Corrupt or unreadable file: treat as no saved session.
            logger.debug("Failed to load session %s", client_id)
            return None


================================================
FILE: researchclaw/server/middleware/__init__.py
================================================
"""Server middleware modules."""


================================================
FILE:
researchclaw/server/middleware/auth.py ================================================ """Basic token authentication middleware.""" from __future__ import annotations from typing import Callable, Awaitable from starlette.middleware.base import BaseHTTPMiddleware from starlette.requests import Request from starlette.responses import JSONResponse, Response class TokenAuthMiddleware(BaseHTTPMiddleware): """Optional bearer-token authentication. If *token* is empty, all requests are allowed (no-op). """ # Paths that never require auth EXEMPT_PATHS = frozenset({"/api/health", "/docs", "/openapi.json"}) def __init__(self, app: object, token: str = "") -> None: super().__init__(app) # type: ignore[arg-type] self._token = token async def dispatch( self, request: Request, call_next: Callable[[Request], Awaitable[Response]], ) -> Response: # No-op when token is unset if not self._token: return await call_next(request) # Skip auth for exempt paths and static files path = request.url.path if path in self.EXEMPT_PATHS or path.startswith("/static"): return await call_next(request) # WebSocket connections carry token as query param if path.startswith("/ws"): token = request.query_params.get("token", "") else: auth_header = request.headers.get("authorization", "") token = auth_header.removeprefix("Bearer ").strip() if token != self._token: return JSONResponse( {"detail": "Unauthorized"}, status_code=401 ) return await call_next(request) ================================================ FILE: researchclaw/server/routes/__init__.py ================================================ """API route modules.""" ================================================ FILE: researchclaw/server/routes/chat.py ================================================ """Chat WebSocket endpoint for conversational research.""" from __future__ import annotations import logging import uuid from fastapi import APIRouter, WebSocket, WebSocketDisconnect from researchclaw.server.websocket.events import Event, 
EventType from researchclaw.server.websocket.manager import ConnectionManager logger = logging.getLogger(__name__) router = APIRouter(tags=["chat"]) # Global connection manager (initialized by app.py) _chat_manager: ConnectionManager | None = None def set_chat_manager(manager: ConnectionManager) -> None: """Set the shared connection manager.""" global _chat_manager _chat_manager = manager def get_chat_manager() -> ConnectionManager: """Get the shared connection manager.""" if _chat_manager is None: raise RuntimeError("Chat manager not initialized") return _chat_manager @router.websocket("/ws/chat") async def chat_websocket(websocket: WebSocket) -> None: """WebSocket endpoint for conversational research chat.""" manager = get_chat_manager() client_id = str(uuid.uuid4())[:8] await manager.connect(websocket, client_id) try: while True: raw = await websocket.receive_text() try: from researchclaw.server.dialog.router import route_message response = await route_message(raw, client_id) await manager.send_to( client_id, Event( type=EventType.CHAT_RESPONSE, data={"message": response, "client_id": client_id}, ), ) except Exception as exc: logger.exception("Chat error for %s", client_id) await manager.send_to( client_id, Event( type=EventType.ERROR, data={"error": str(exc), "client_id": client_id}, ), ) except WebSocketDisconnect: manager.disconnect(client_id) ================================================ FILE: researchclaw/server/routes/pipeline.py ================================================ """Pipeline control API routes.""" from __future__ import annotations import asyncio import json import logging from pathlib import Path from typing import Any from fastapi import APIRouter, HTTPException from pydantic import BaseModel logger = logging.getLogger(__name__) import re as _re _RUN_ID_RE = _re.compile(r"^rc-\d{8}-\d{6}-[a-f0-9]+$") def _validated_run_dir(run_id: str) -> Path: """Validate run_id format and return the run directory path.""" if not 
_RUN_ID_RE.match(run_id): raise HTTPException(status_code=400, detail=f"Invalid run_id format: {run_id}") run_dir = Path("artifacts") / run_id # Ensure resolved path is under artifacts/ if not run_dir.resolve().is_relative_to(Path("artifacts").resolve()): raise HTTPException(status_code=400, detail=f"Invalid run_id: {run_id}") return run_dir router = APIRouter(prefix="/api", tags=["pipeline"]) class PipelineStartRequest(BaseModel): """Request body for starting a pipeline run.""" topic: str | None = None config_overrides: dict[str, Any] | None = None auto_approve: bool = True class PipelineStartResponse(BaseModel): """Response after starting a pipeline.""" run_id: str status: str output_dir: str # In-memory tracking of the active run (single-tenant MVP) _active_run: dict[str, Any] | None = None _run_task: asyncio.Task[Any] | None = None def _get_app_state() -> dict[str, Any]: """Get shared application state (set by app.py).""" from researchclaw.server.app import _app_state return _app_state @router.post("/pipeline/start", response_model=PipelineStartResponse) async def start_pipeline(req: PipelineStartRequest) -> PipelineStartResponse: """Start a new pipeline run.""" global _active_run, _run_task if _active_run and _active_run.get("status") == "running": raise HTTPException(status_code=409, detail="A pipeline is already running") state = _get_app_state() config = state["config"] if req.topic: import dataclasses new_research = dataclasses.replace(config.research, topic=req.topic) config = dataclasses.replace(config, research=new_research) import hashlib from datetime import datetime, timezone ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S") topic_hash = hashlib.sha256(config.research.topic.encode()).hexdigest()[:6] run_id = f"rc-{ts}-{topic_hash}" run_dir = _validated_run_dir(run_id) run_dir.mkdir(parents=True, exist_ok=True) _active_run = { "run_id": run_id, "status": "running", "output_dir": str(run_dir), "topic": config.research.topic, } async def 
_run_in_background() -> None: global _active_run try: from researchclaw.adapters import AdapterBundle from researchclaw.pipeline.runner import execute_pipeline kb_root = Path(config.knowledge_base.root) if config.knowledge_base.root else None if kb_root: kb_root.mkdir(parents=True, exist_ok=True) loop = asyncio.get_event_loop() results = await loop.run_in_executor( None, lambda: execute_pipeline( run_dir=run_dir, run_id=run_id, config=config, adapters=AdapterBundle(), auto_approve_gates=req.auto_approve, skip_noncritical=True, kb_root=kb_root, ), ) done = sum(1 for r in results if r.status.value == "done") failed = sum(1 for r in results if r.status.value == "failed") if _active_run: _active_run["status"] = "completed" if failed == 0 else "failed" _active_run["stages_done"] = done _active_run["stages_failed"] = failed except Exception as exc: logger.exception("Pipeline run failed") if _active_run: _active_run["status"] = "failed" _active_run["error"] = str(exc) _run_task = asyncio.create_task(_run_in_background()) return PipelineStartResponse( run_id=run_id, status="running", output_dir=str(run_dir), ) @router.post("/pipeline/stop") async def stop_pipeline() -> dict[str, str]: """Stop the currently running pipeline.""" global _active_run, _run_task if not _run_task or not _active_run: raise HTTPException(status_code=404, detail="No pipeline is running") _run_task.cancel() _active_run["status"] = "stopped" return {"status": "stopped"} @router.get("/pipeline/status") async def pipeline_status() -> dict[str, Any]: """Get current pipeline run status.""" if not _active_run: return {"status": "idle"} return _active_run @router.get("/pipeline/stages") async def pipeline_stages() -> dict[str, Any]: """Get the 23-stage pipeline definition.""" from researchclaw.pipeline.stages import Stage stages = [] for s in Stage: stages.append({ "number": int(s), "name": s.name, "label": getattr(s, "label", s.name.replace("_", " ").title()), "phase": getattr(s, "phase", ""), }) return 
{"stages": stages} @router.get("/runs") async def list_runs() -> dict[str, Any]: """List historical pipeline runs from artifacts/ directory.""" artifacts = Path("artifacts") runs: list[dict[str, Any]] = [] if artifacts.exists(): for d in sorted(artifacts.iterdir(), reverse=True): if d.is_dir() and d.name.startswith("rc-"): info: dict[str, Any] = {"run_id": d.name, "path": str(d)} # Try reading checkpoint ckpt = d / "checkpoint.json" if ckpt.exists(): try: with ckpt.open() as f: info["checkpoint"] = json.load(f) except Exception: pass runs.append(info) return {"runs": runs[:50]} # limit to 50 most recent @router.get("/runs/{run_id}") async def get_run(run_id: str) -> dict[str, Any]: """Get details for a specific run.""" run_dir = _validated_run_dir(run_id) if not run_dir.exists(): raise HTTPException(status_code=404, detail=f"Run not found: {run_id}") info: dict[str, Any] = {"run_id": run_id, "path": str(run_dir)} ckpt = run_dir / "checkpoint.json" if ckpt.exists(): try: with ckpt.open() as f: info["checkpoint"] = json.load(f) except Exception: pass # List stage directories stage_dirs = sorted( [d.name for d in run_dir.iterdir() if d.is_dir() and d.name.startswith("stage-")] ) info["stages_completed"] = stage_dirs # Check for paper for pattern in ["paper.md", "paper.tex", "paper.pdf"]: found = list(run_dir.rglob(pattern)) if found: info[f"has_{pattern.split('.')[1]}"] = True return info @router.get("/runs/{run_id}/metrics") async def get_run_metrics(run_id: str) -> dict[str, Any]: """Get experiment metrics for a run.""" run_dir = _validated_run_dir(run_id) if not run_dir.exists(): raise HTTPException(status_code=404, detail=f"Run not found: {run_id}") metrics: dict[str, Any] = {} results_file = run_dir / "results.json" if results_file.exists(): try: with results_file.open() as f: metrics = json.load(f) except Exception: pass return {"run_id": run_id, "metrics": metrics} ================================================ FILE: researchclaw/server/routes/projects.py 
================================================ """Project listing / status API routes.""" from __future__ import annotations import json from pathlib import Path from typing import Any from fastapi import APIRouter router = APIRouter(prefix="/api", tags=["projects"]) @router.get("/projects") async def list_projects() -> dict[str, Any]: """List all project directories (artifacts/rc-*).""" artifacts = Path("artifacts") projects: list[dict[str, Any]] = [] if artifacts.exists(): for d in sorted(artifacts.iterdir(), reverse=True): if d.is_dir() and d.name.startswith("rc-"): proj: dict[str, Any] = { "id": d.name, "path": str(d), } ckpt = d / "checkpoint.json" if ckpt.exists(): try: with ckpt.open() as f: ckpt_data = json.load(f) proj["current_stage"] = ckpt_data.get("stage") proj["status"] = ckpt_data.get("status", "unknown") except Exception: proj["status"] = "unknown" else: proj["status"] = "no_checkpoint" projects.append(proj) return {"projects": projects} ================================================ FILE: researchclaw/server/routes/voice.py ================================================ """Voice upload / transcription API routes.""" from __future__ import annotations import logging from typing import Any from fastapi import APIRouter, HTTPException, UploadFile, File logger = logging.getLogger(__name__) router = APIRouter(prefix="/api/voice", tags=["voice"]) @router.post("/transcribe") async def transcribe_audio( file: UploadFile = File(...), language: str = "zh", ) -> dict[str, Any]: """Transcribe uploaded audio using Whisper API.""" try: from researchclaw.voice.transcriber import VoiceTranscriber except ImportError: raise HTTPException( status_code=501, detail="Voice dependencies not installed. 
Run: pip install researchclaw[voice]", ) from researchclaw.server.app import _app_state config = _app_state.get("config") if not config or not config.server.voice_enabled: raise HTTPException(status_code=403, detail="Voice is not enabled in config") audio_bytes = await file.read() transcriber = VoiceTranscriber(config.server) text = await transcriber.transcribe(audio_bytes, language=language) return {"text": text, "language": language} ================================================ FILE: researchclaw/server/websocket/__init__.py ================================================ """WebSocket modules.""" ================================================ FILE: researchclaw/server/websocket/events.py ================================================ """WebSocket event type definitions.""" from __future__ import annotations from dataclasses import asdict, dataclass, field from enum import Enum from typing import Any import json import time class EventType(str, Enum): """All WebSocket event types.""" # Lifecycle CONNECTED = "connected" HEARTBEAT = "heartbeat" ERROR = "error" # Pipeline PIPELINE_STARTED = "pipeline_started" PIPELINE_COMPLETED = "pipeline_completed" STAGE_START = "stage_start" STAGE_COMPLETE = "stage_complete" STAGE_FAIL = "stage_fail" METRIC_UPDATE = "metric_update" LOG_LINE = "log_line" PAPER_READY = "paper_ready" # Chat CHAT_RESPONSE = "chat_response" CHAT_TYPING = "chat_typing" CHAT_SUGGESTION = "chat_suggestion" # System RUN_DISCOVERED = "run_discovered" RUN_STATUS_CHANGED = "run_status_changed" @dataclass class Event: """A WebSocket event.""" type: EventType data: dict[str, Any] = field(default_factory=dict) timestamp: float = field(default_factory=time.time) def to_json(self) -> str: """Serialize to JSON string.""" return json.dumps( { "type": self.type.value, "data": self.data, "timestamp": self.timestamp, } ) @classmethod def from_json(cls, raw: str) -> Event: """Deserialize from JSON string.""" obj = json.loads(raw) return cls( 
type=EventType(obj["type"]), data=obj.get("data", {}), timestamp=obj.get("timestamp", time.time()), ) ================================================ FILE: researchclaw/server/websocket/manager.py ================================================ """WebSocket connection manager.""" from __future__ import annotations import asyncio import logging import time from typing import Any from fastapi import WebSocket from .events import Event, EventType logger = logging.getLogger(__name__) class ConnectionManager: """Manage WebSocket connections and broadcast events.""" def __init__(self) -> None: self._connections: dict[str, WebSocket] = {} self._event_queue: asyncio.Queue[Event] = asyncio.Queue() @property def active_count(self) -> int: return len(self._connections) async def connect(self, websocket: WebSocket, client_id: str) -> None: """Accept and register a WebSocket connection.""" await websocket.accept() self._connections[client_id] = websocket logger.info("WebSocket connected: %s (total: %d)", client_id, self.active_count) await self._send( websocket, Event(type=EventType.CONNECTED, data={"client_id": client_id}), ) def disconnect(self, client_id: str) -> None: """Remove a disconnected client.""" self._connections.pop(client_id, None) logger.info("WebSocket disconnected: %s (total: %d)", client_id, self.active_count) async def broadcast(self, event: Event) -> None: """Send event to all connected clients.""" dead: list[str] = [] for cid, ws in self._connections.items(): try: await self._send(ws, event) except Exception: dead.append(cid) for cid in dead: self.disconnect(cid) async def send_to(self, client_id: str, event: Event) -> None: """Send event to a specific client.""" ws = self._connections.get(client_id) if ws: try: await self._send(ws, event) except Exception: self.disconnect(client_id) async def _send(self, ws: WebSocket, event: Event) -> None: await ws.send_text(event.to_json()) def publish(self, event: Event) -> None: """Non-async publish for use from 
sync code (thread-safe queue).""" try: self._event_queue.put_nowait(event) except asyncio.QueueFull: logger.warning("Event queue full, dropping event: %s", event.type) async def drain_queue(self) -> None: """Process queued events and broadcast them.""" while not self._event_queue.empty(): event = self._event_queue.get_nowait() await self.broadcast(event) async def heartbeat_loop(self, interval: float = 15.0) -> None: """Send periodic heartbeat to all clients.""" while True: await asyncio.sleep(interval) await self.broadcast( Event( type=EventType.HEARTBEAT, data={"active_clients": self.active_count}, ) ) await self.drain_queue() ================================================ FILE: researchclaw/servers/__init__.py ================================================ """Multi-server resource scheduling for AutoResearchClaw.""" from researchclaw.servers.registry import ServerRegistry from researchclaw.servers.monitor import ServerMonitor from researchclaw.servers.dispatcher import TaskDispatcher __all__ = ["ServerRegistry", "ServerMonitor", "TaskDispatcher"] ================================================ FILE: researchclaw/servers/cloud_executor.py ================================================ """Cloud executor: stub for AWS/GCP/Azure GPU instance management.""" from __future__ import annotations import logging from typing import Any from researchclaw.servers.registry import ServerEntry logger = logging.getLogger(__name__) class CloudExecutor: """Manage cloud GPU instances for experiment execution. This is a stub implementation. Actual cloud provider APIs (boto3, google-cloud, azure-mgmt) are imported lazily to avoid hard dependencies. 
""" def __init__(self, server: ServerEntry) -> None: if server.server_type != "cloud": raise ValueError(f"Server {server.name} is not a cloud server") self.server = server self.provider = server.cloud_provider async def launch_instance(self) -> dict[str, Any]: """Launch a cloud GPU instance.""" logger.info( "Launching %s instance (%s) for %s", self.provider, self.server.cloud_instance_type, self.server.name, ) # Stub: actual implementation would call provider SDK return { "provider": self.provider, "instance_type": self.server.cloud_instance_type, "status": "stub_launched", "instance_id": f"stub-{self.server.name}", "cost_per_hour": self.server.cost_per_hour, } async def terminate_instance(self, instance_id: str) -> None: """Terminate a cloud instance.""" logger.info("Terminating instance %s on %s", instance_id, self.provider) async def get_instance_status(self, instance_id: str) -> dict[str, Any]: """Check instance status.""" return {"instance_id": instance_id, "status": "stub_unknown"} ================================================ FILE: researchclaw/servers/dispatcher.py ================================================ """Task dispatcher: route experiment tasks to the best available server.""" from __future__ import annotations import asyncio import logging import uuid from typing import Any from researchclaw.servers.registry import ServerEntry, ServerRegistry from researchclaw.servers.monitor import ServerMonitor from researchclaw.servers.ssh_executor import SSHExecutor from researchclaw.servers.slurm_executor import SlurmExecutor logger = logging.getLogger(__name__) class TaskDispatcher: """Dispatch experiment tasks to the best available server.""" def __init__( self, registry: ServerRegistry, monitor: ServerMonitor, prefer_free: bool = True, failover: bool = True, ) -> None: self.registry = registry self.monitor = monitor self.prefer_free = prefer_free self.failover = failover self._tasks: dict[str, dict[str, Any]] = {} self._busy_servers: set[str] = set() 
async def dispatch(self, task: dict[str, Any]) -> str: """Dispatch a task to the best available server. Args: task: dict with keys: command, local_dir, requirements (optional) Returns: task_id for tracking """ task_id = uuid.uuid4().hex[:12] requirements = task.get("requirements", {}) # Find best server server = self.registry.get_best_match( requirements=requirements, prefer_free=self.prefer_free, ) if server is None: self._tasks[task_id] = {"status": "queued", "task": task, "error": "No matching server"} logger.warning("No server available for task %s, queued", task_id) return task_id # Dispatch based on server type self._tasks[task_id] = { "status": "dispatched", "server": server.name, "task": task, } self._busy_servers.add(server.name) logger.info("Dispatched task %s to %s (%s)", task_id, server.name, server.server_type) return task_id async def execute_task(self, task_id: str) -> dict[str, Any]: """Execute a dispatched task on its assigned server.""" info = self._tasks.get(task_id) if not info or info["status"] != "dispatched": return {"success": False, "error": "Task not dispatched"} server = self.registry.get(info["server"]) task = info["task"] remote_dir = f"/tmp/researchclaw_{task_id}" try: if server.server_type == "slurm": executor = SlurmExecutor(server) job_id = await executor.submit_job( command=task["command"], remote_dir=remote_dir, resources=task.get("requirements"), ) info["status"] = "running" info["job_id"] = job_id return {"success": True, "job_id": job_id} else: # Default: SSH executor executor = SSHExecutor(server) # type: ignore[assignment] result = await executor.run_experiment( remote_dir=remote_dir, command=task["command"], timeout=task.get("timeout", 3600), ) info["status"] = "completed" if result["success"] else "failed" info["result"] = result return result except Exception as exc: logger.error("Task %s failed: %s", task_id, exc) info["status"] = "failed" info["error"] = str(exc) # Failover: try another server (non-recursive, single 
attempt) if self.failover: tried = {server.name} alt = self.registry.get_best_match( requirements=task.get("requirements"), prefer_free=self.prefer_free, ) if alt and alt.name not in tried: logger.info("Failing over task %s to %s", task_id, alt.name) info["server"] = alt.name info["status"] = "dispatched" try: alt_server = self.registry.get(alt.name) result = await alt_server.run_experiment( remote_dir=task.get("remote_dir", ""), command=task.get("command", ""), timeout=task.get("timeout", 3600), ) info["status"] = "completed" return result except Exception as alt_exc: logger.error("Failover also failed: %s", alt_exc) return {"success": False, "error": str(exc)} finally: self._busy_servers.discard(server.name) def get_task_status(self, task_id: str) -> dict[str, Any]: """Get the status of a task.""" info = self._tasks.get(task_id) if not info: return {"task_id": task_id, "status": "unknown"} return { "task_id": task_id, "status": info["status"], "server": info.get("server"), "error": info.get("error"), } ================================================ FILE: researchclaw/servers/monitor.py ================================================ """Server monitor: check health and resource usage of registered servers.""" from __future__ import annotations import asyncio import logging from typing import Any from researchclaw.servers.registry import ServerEntry, ServerRegistry logger = logging.getLogger(__name__) class ServerMonitor: """Monitor health and resource usage of registered servers.""" def __init__(self, registry: ServerRegistry) -> None: self.registry = registry self._status_cache: dict[str, dict[str, Any]] = {} async def check_status(self, server: ServerEntry) -> dict[str, Any]: """Check a single server's status via SSH (nvidia-smi, free, uptime).""" try: result = await _ssh_command(server.host, "nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total --format=csv,noheader,nounits 2>/dev/null; echo '---'; free -m | head -2; echo '---'; uptime") status = 
_parse_status_output(result, server) status["reachable"] = True except Exception as exc: logger.warning("Cannot reach server %s: %s", server.name, exc) status = {"reachable": False, "error": str(exc)} self._status_cache[server.name] = status return status async def check_all(self) -> dict[str, dict[str, Any]]: """Check all servers concurrently.""" servers = self.registry.list_all() tasks = [self.check_status(s) for s in servers] results = await asyncio.gather(*tasks, return_exceptions=True) out: dict[str, dict[str, Any]] = {} for server, result in zip(servers, results): if isinstance(result, Exception): out[server.name] = {"reachable": False, "error": str(result)} else: out[server.name] = result return out def get_cached(self, name: str) -> dict[str, Any] | None: """Return cached status for a server.""" return self._status_cache.get(name) def get_gpu_usage(self, server: ServerEntry) -> dict[str, Any]: """Return cached GPU usage for a server (sync convenience).""" cached = self._status_cache.get(server.name, {}) return cached.get("gpu", {}) async def _ssh_command(host: str, command: str) -> str: """Run a command on a remote host via SSH.""" proc = await asyncio.create_subprocess_exec( "ssh", "-o", "ConnectTimeout=5", "-o", "StrictHostKeyChecking=no", host, command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) stdout, stderr = await proc.communicate() if proc.returncode != 0: raise RuntimeError(f"SSH command failed (rc={proc.returncode}): {stderr.decode().strip()}") return stdout.decode() def _parse_status_output(raw: str, server: ServerEntry) -> dict[str, Any]: """Parse combined nvidia-smi + free + uptime output.""" sections = raw.split("---") status: dict[str, Any] = {"server": server.name, "host": server.host} # GPU section if len(sections) >= 1: gpu_lines = [l.strip() for l in sections[0].strip().splitlines() if l.strip()] gpus = [] for line in gpu_lines: parts = [p.strip() for p in line.split(",")] if len(parts) >= 3: gpus.append({ 
"utilization_pct": int(parts[0]), "memory_used_mb": int(parts[1]), "memory_total_mb": int(parts[2]), }) status["gpu"] = {"count": len(gpus), "devices": gpus} # Memory section if len(sections) >= 2: mem_lines = sections[1].strip().splitlines() if len(mem_lines) >= 2: parts = mem_lines[1].split() if len(parts) >= 4: status["memory"] = { "total_mb": int(parts[1]), "used_mb": int(parts[2]), "free_mb": int(parts[3]), } # Uptime section if len(sections) >= 3: status["uptime"] = sections[2].strip() return status ================================================ FILE: researchclaw/servers/registry.py ================================================ """Server registry: manage available compute servers.""" from __future__ import annotations import logging from typing import Any logger = logging.getLogger(__name__) class ServerEntry: """A compute server that can run experiments.""" def __init__( self, name: str, host: str, server_type: str = "ssh", gpu: str = "", vram_gb: int = 0, priority: int = 1, scheduler: str = "", cloud_provider: str = "", cloud_instance_type: str = "", cost_per_hour: float = 0.0, ) -> None: self.name = name self.host = host self.server_type = server_type # ssh | slurm | cloud self.gpu = gpu self.vram_gb = vram_gb self.priority = priority self.scheduler = scheduler # slurm | pbs | lsf self.cloud_provider = cloud_provider # aws | gcp | azure self.cloud_instance_type = cloud_instance_type self.cost_per_hour = cost_per_hour def to_dict(self) -> dict[str, Any]: return { "name": self.name, "host": self.host, "server_type": self.server_type, "gpu": self.gpu, "vram_gb": self.vram_gb, "priority": self.priority, "scheduler": self.scheduler, "cloud_provider": self.cloud_provider, "cloud_instance_type": self.cloud_instance_type, "cost_per_hour": self.cost_per_hour, } @classmethod def from_dict(cls, data: dict[str, Any]) -> ServerEntry: return cls( name=data["name"], host=data.get("host", ""), server_type=data.get("server_type", "ssh"), gpu=data.get("gpu", ""), 
vram_gb=int(data.get("vram_gb", 0)), priority=int(data.get("priority", 1)), scheduler=data.get("scheduler", ""), cloud_provider=data.get("cloud_provider", ""), cloud_instance_type=data.get("cloud_instance_type", ""), cost_per_hour=float(data.get("cost_per_hour", 0.0)), ) class ServerRegistry: """Registry of available compute servers.""" def __init__(self, servers: list[ServerEntry] | None = None) -> None: self._servers: dict[str, ServerEntry] = {} for s in (servers or []): self._servers[s.name] = s def add(self, server: ServerEntry) -> None: """Register a new server.""" self._servers[server.name] = server logger.info("Registered server: %s (%s)", server.name, server.host) def remove(self, name: str) -> None: """Remove a server from the registry.""" if name not in self._servers: raise KeyError(f"Unknown server: {name}") del self._servers[name] def get(self, name: str) -> ServerEntry: """Get a server by name.""" if name not in self._servers: raise KeyError(f"Unknown server: {name}") return self._servers[name] def list_all(self) -> list[ServerEntry]: """Return all registered servers sorted by priority (lower = higher priority).""" return sorted(self._servers.values(), key=lambda s: s.priority) def get_available(self, exclude: set[str] | None = None) -> list[ServerEntry]: """Return servers not in the exclude set, sorted by priority.""" excluded = exclude or set() return [s for s in self.list_all() if s.name not in excluded] def get_best_match( self, requirements: dict[str, Any] | None = None, prefer_free: bool = True, ) -> ServerEntry | None: """Find the best server matching resource requirements. 
Args: requirements: dict with optional keys: min_vram_gb, server_type, gpu prefer_free: prefer servers with cost_per_hour == 0 """ reqs = requirements or {} candidates = self.list_all() # Filter by minimum VRAM min_vram = reqs.get("min_vram_gb", 0) if min_vram: candidates = [s for s in candidates if s.vram_gb >= min_vram] # Filter by server type stype = reqs.get("server_type") if stype: candidates = [s for s in candidates if s.server_type == stype] # Filter by GPU model substring gpu_req = reqs.get("gpu") if gpu_req: candidates = [s for s in candidates if gpu_req.lower() in s.gpu.lower()] if not candidates: return None # Sort: prefer free servers, then by priority if prefer_free: candidates.sort(key=lambda s: (s.cost_per_hour > 0, s.priority)) return candidates[0] @property def count(self) -> int: return len(self._servers) ================================================ FILE: researchclaw/servers/slurm_executor.py ================================================ """Slurm HPC executor: submit, monitor, and cancel batch jobs.""" from __future__ import annotations import asyncio import logging import textwrap from typing import Any from researchclaw.servers.registry import ServerEntry logger = logging.getLogger(__name__) class SlurmExecutor: """Submit and manage Slurm batch jobs via SSH.""" def __init__(self, server: ServerEntry) -> None: if server.server_type != "slurm": raise ValueError(f"Server {server.name} is not a slurm server") self.server = server self.host = server.host def _generate_sbatch_script( self, command: str, job_name: str = "researchclaw", resources: dict[str, Any] | None = None, ) -> str: """Generate an sbatch submission script.""" res = resources or {} gpus = res.get("gpus", 1) mem = res.get("mem_gb", 16) time_limit = res.get("time", "01:00:00") partition = res.get("partition", "") lines = [ "#!/bin/bash", f"#SBATCH --job-name={job_name}", f"#SBATCH --gres=gpu:{gpus}", f"#SBATCH --mem={mem}G", f"#SBATCH --time={time_limit}", "#SBATCH 
--output=slurm-%j.out", "#SBATCH --error=slurm-%j.err", ] if partition: lines.append(f"#SBATCH --partition={partition}") lines.append("") lines.append(command) return "\n".join(lines) async def submit_job( self, command: str, remote_dir: str, job_name: str = "researchclaw", resources: dict[str, Any] | None = None, ) -> str: """Submit a Slurm job and return the job ID.""" script = self._generate_sbatch_script(command, job_name, resources) # Write script and submit via SSH import shlex as _shlex ssh_cmd = ( f"cd {_shlex.quote(remote_dir)} && " f"cat <<'EOFSCRIPT' > _job.sh\n{script}\nEOFSCRIPT\n" f"&& sbatch _job.sh" ) proc = await asyncio.create_subprocess_exec( "ssh", "-o", "ConnectTimeout=10", "-o", "StrictHostKeyChecking=no", self.host, ssh_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) stdout, stderr = await proc.communicate() if proc.returncode != 0: raise RuntimeError(f"sbatch failed: {stderr.decode().strip()}") # Parse "Submitted batch job 12345" output = stdout.decode().strip() parts = output.split() if len(parts) >= 4 and parts[-1].isdigit(): job_id = parts[-1] logger.info("Submitted Slurm job %s on %s", job_id, self.server.name) return job_id raise RuntimeError(f"Could not parse sbatch output: {output}") async def check_job(self, job_id: str) -> dict[str, Any]: """Check job status via squeue/sacct.""" proc = await asyncio.create_subprocess_exec( "ssh", "-o", "ConnectTimeout=10", "-o", "StrictHostKeyChecking=no", self.host, f"squeue -j {job_id} -h -o '%T' 2>/dev/null || sacct -j {job_id} -n -o State -P 2>/dev/null", stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) stdout, _ = await proc.communicate() state = stdout.decode().strip().split("\n")[0].strip() if stdout else "UNKNOWN" return {"job_id": job_id, "state": state} async def cancel_job(self, job_id: str) -> None: """Cancel a running job.""" proc = await asyncio.create_subprocess_exec( "ssh", "-o", "ConnectTimeout=10", "-o", "StrictHostKeyChecking=no", self.host, 
f"scancel {job_id}", stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) await proc.communicate() logger.info("Cancelled Slurm job %s on %s", job_id, self.server.name) ================================================ FILE: researchclaw/servers/ssh_executor.py ================================================ """SSH remote executor: upload code, run experiments, download results.""" from __future__ import annotations import asyncio import logging import shlex from pathlib import Path from typing import Any from researchclaw.servers.registry import ServerEntry logger = logging.getLogger(__name__) class SSHExecutor: """Execute experiments on remote servers via SSH/rsync.""" def __init__(self, server: ServerEntry) -> None: self.server = server self.host = server.host async def upload_code(self, local_dir: Path, remote_dir: str) -> None: """Upload experiment code via rsync.""" local = str(local_dir.resolve()) + "/" remote = f"{self.host}:{remote_dir}/" logger.info("Uploading %s -> %s", local, remote) proc = await asyncio.create_subprocess_exec( "rsync", "-az", "--delete", "-e", "ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no", local, remote, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) _, stderr = await proc.communicate() if proc.returncode != 0: raise RuntimeError(f"rsync upload failed: {stderr.decode().strip()}") async def run_experiment( self, remote_dir: str, command: str, timeout: int = 3600, ) -> dict[str, Any]: """Run an experiment command on the remote server.""" full_cmd = f"cd {shlex.quote(remote_dir)} && {command}" logger.info("Running on %s: %s", self.host, full_cmd) proc = await asyncio.create_subprocess_exec( "ssh", "-o", "ConnectTimeout=10", "-o", "StrictHostKeyChecking=no", self.host, full_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) try: stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout) except asyncio.TimeoutError: proc.kill() await proc.wait() return 
{"success": False, "error": f"Timeout after {timeout}s", "returncode": -1} return { "success": proc.returncode == 0, "stdout": stdout.decode(), "stderr": stderr.decode(), "returncode": proc.returncode, } async def download_results(self, remote_dir: str, local_dir: Path) -> None: """Download experiment results via rsync.""" local_dir.mkdir(parents=True, exist_ok=True) remote = f"{self.host}:{remote_dir}/" local = str(local_dir.resolve()) + "/" logger.info("Downloading %s -> %s", remote, local) proc = await asyncio.create_subprocess_exec( "rsync", "-az", "-e", "ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no", remote, local, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) _, stderr = await proc.communicate() if proc.returncode != 0: raise RuntimeError(f"rsync download failed: {stderr.decode().strip()}") async def cleanup(self, remote_dir: str) -> None: """Remove remote experiment directory.""" logger.info("Cleaning up %s:%s", self.host, remote_dir) proc = await asyncio.create_subprocess_exec( "ssh", "-o", "ConnectTimeout=10", "-o", "StrictHostKeyChecking=no", self.host, f"rm -rf {shlex.quote(remote_dir)}", stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) await proc.communicate() ================================================ FILE: researchclaw/skills/__init__.py ================================================ """Dynamic skills library for AutoResearchClaw. Provides a registry of reusable research/engineering/writing skills that can be automatically matched to pipeline stages and injected into LLM prompts. 
""" from researchclaw.skills.schema import Skill from researchclaw.skills.registry import SkillRegistry __all__ = ["Skill", "SkillRegistry"] ================================================ FILE: researchclaw/skills/builtin/__init__.py ================================================ ================================================ FILE: researchclaw/skills/builtin/domain/cv-classification/SKILL.md ================================================ --- name: cv-classification description: Best practices for image classification tasks. Use when working on CIFAR, ImageNet, or other classification benchmarks. metadata: category: domain trigger-keywords: "classification,image,cifar,imagenet,resnet,vision,cnn,vit" applicable-stages: "9,10" priority: "3" version: "1.0" author: researchclaw references: "He et al., Deep Residual Learning, CVPR 2016; Dosovitskiy et al., An Image is Worth 16x16 Words, ICLR 2021" --- ## Image Classification Best Practice Architecture selection: - Small scale (CIFAR-10/100): ResNet-18/34, WideResNet, Simple ViT - Medium scale: ResNet-50, EfficientNet-B0/B1, DeiT-Small - Large scale: ViT-B/16, ConvNeXt, Swin Transformer Training recipe: - Optimizer: AdamW (lr=1e-3 to 3e-4) or SGD (lr=0.1 with cosine decay) - Weight decay: 0.01-0.1 for AdamW, 5e-4 for SGD - Data augmentation: RandomCrop, RandomHorizontalFlip, Cutout/CutMix - Warmup: 5-10 epochs linear warmup for transformers - Batch size: 128-256 for CNNs, 512-1024 for ViTs (if memory allows) Standard benchmarks: - CIFAR-10: ~96% (ResNet-18), ~97% (WideResNet) - CIFAR-100: ~80% (ResNet-18), ~84% (WideResNet) - ImageNet: ~76% (ResNet-50), ~81% (ViT-B/16) ================================================ FILE: researchclaw/skills/builtin/domain/cv-detection/SKILL.md ================================================ --- name: cv-detection description: Best practices for object detection tasks. Use when working on COCO, VOC, or detection architectures like YOLO and DETR. 
metadata: category: domain trigger-keywords: "detection,object,bbox,yolo,coco,anchor,faster rcnn" applicable-stages: "9,10" priority: "5" version: "1.0" author: researchclaw references: "Ren et al., Faster R-CNN, NeurIPS 2015; Carion et al., End-to-End Object Detection with Transformers, ECCV 2020" --- ## Object Detection Best Practice Architecture families: - One-stage: YOLO (v5/v8), SSD, RetinaNet, FCOS - Two-stage: Faster R-CNN, Cascade R-CNN - Transformer: DETR, DINO, RT-DETR Training recipe: - Use pre-trained backbone (ImageNet) - Multi-scale training and testing - IoU threshold: 0.5 for mAP50, 0.5:0.95 for mAP - Use FPN for multi-scale feature extraction - Focal loss for class imbalance in one-stage detectors Standard benchmarks: - COCO val2017: ~37 mAP (Faster R-CNN R50), ~51 mAP (DINO Swin-L) - Pascal VOC: ~80 mAP50 (Faster R-CNN) ================================================ FILE: researchclaw/skills/builtin/domain/nlp-alignment/SKILL.md ================================================ --- name: nlp-alignment description: Best practices for LLM alignment techniques including RLHF, DPO, and instruction tuning. Use when working on alignment or safety. 
metadata: category: domain trigger-keywords: "alignment,rlhf,dpo,reward model,preference,instruction tuning,safety" applicable-stages: "9,10" priority: "4" version: "1.0" author: researchclaw references: "Ouyang et al., Training language models to follow instructions, NeurIPS 2022; Rafailov et al., DPO, NeurIPS 2023" --- ## LLM Alignment Best Practice Methods: - RLHF: Train reward model → PPO fine-tuning (complex but powerful) - DPO: Direct preference optimization (simpler, no reward model needed) - GRPO: Group relative policy optimization - SFT: Supervised fine-tuning as alignment baseline Training recipe: - Start with SFT on high-quality instruction data - DPO: lr=5e-7, beta=0.1, batch_size=64 - PPO: lr=1e-6, clip=0.2, KL coeff=0.02 - Use reference model for KL penalty - Evaluate on safety benchmarks (TruthfulQA, BBQ, etc.) Common pitfalls: - Reward hacking: model finds shortcuts to high reward - Mode collapse: model generates repetitive outputs - Catastrophic forgetting: loses general capabilities ================================================ FILE: researchclaw/skills/builtin/domain/nlp-pretraining/SKILL.md ================================================ --- name: nlp-pretraining description: Best practices for language model pretraining and fine-tuning. Use when generating or reviewing NLP training code. 
metadata: category: domain trigger-keywords: "language model,pretraining,fine-tuning,bert,gpt,llm,transformer,nlp,text" applicable-stages: "9,10" priority: "3" version: "1.0" author: researchclaw references: "Devlin et al., BERT, NAACL 2019; Hu et al., LoRA, ICLR 2022" --- ## NLP Pretraining/Fine-tuning Best Practice Fine-tuning recipe: - Use pre-trained checkpoints (HuggingFace hub) - AdamW optimizer, lr=2e-5 to 5e-5 - Linear warmup (6% of total steps) + linear decay - Batch size: 16-32 (use gradient accumulation for larger effective batch) - 3-5 epochs for classification, 1-2 for generation - Weight decay: 0.01 Parameter-efficient methods: - LoRA: r=8-64, alpha=16-128, apply to q/v projections - Prefix tuning: 10-20 prefix tokens - Adapters: bottleneck dimension 64-256 Evaluation: - Classification: accuracy, F1 (macro for imbalanced) - Generation: perplexity, BLEU/ROUGE, human evaluation - Use multiple seeds and report mean +/- std ================================================ FILE: researchclaw/skills/builtin/domain/rl-policy-optimization/SKILL.md ================================================ --- name: rl-policy-optimization description: Best practices for reinforcement learning policy optimization. Use when working on RL agents, PPO, SAC, or reward design. 
metadata: category: domain trigger-keywords: "reinforcement learning,rl,policy,reward,agent,environment,ppo,sac" applicable-stages: "9,10" priority: "3" version: "1.0" author: researchclaw references: "Schulman et al., Proximal Policy Optimization, 2017; Haarnoja et al., Soft Actor-Critic, ICML 2018" --- ## RL Policy Optimization Best Practice Algorithm selection: - Discrete actions: PPO, DQN, A2C - Continuous actions: SAC, TD3, PPO - Multi-agent: MAPPO, QMIX - Offline: CQL, IQL, Decision Transformer Training recipe: - PPO: clip=0.2, lr=3e-4, gamma=0.99, GAE lambda=0.95 - SAC: lr=3e-4, tau=0.005, auto-tune alpha - Use vectorized environments (e.g., gymnasium.vector) - Normalize observations and rewards - Log episode return, episode length, value loss, policy entropy Evaluation: - Report mean +/- std over 10+ evaluation episodes - Use deterministic policy for evaluation - Compare against random policy and simple baselines - Report sample efficiency (return vs. env steps) Common pitfalls: - Reward shaping can introduce bias - Seed sensitivity is HIGH — use 5+ seeds - Hyperparameter sensitivity — do a small sweep ================================================ FILE: researchclaw/skills/builtin/experiment/experimental-design/SKILL.md ================================================ --- name: experimental-design description: Best practices for designing reproducible ML experiments. Use when planning ablations, baselines, or controlled experiments. metadata: category: experiment trigger-keywords: "experiment,ablation,baseline,control,hypothesis,reproducib" applicable-stages: "9,10,12" priority: "2" version: "1.0" author: researchclaw references: "Bouthillier et al., Accounting for Variance in ML Benchmarks, MLSys 2021" --- ## Experimental Design Best Practice 1. ALWAYS include meaningful baselines (not just random): - At least one classical method baseline - At least one recent SOTA method baseline - A simple-but-strong baseline (e.g., linear probe, k-NN) 2. 
Use MULTIPLE random seeds (minimum 3, ideally 5) 3. Report mean +/- std across seeds 4. Design ablations that isolate EACH key component: - Remove one component at a time - Each ablation must be meaningfully different from baseline 5. Control variables: change only ONE thing per comparison 6. Use standard splits (train/val/test) — never test on training data 7. Report wall-clock time and memory usage alongside accuracy ================================================ FILE: researchclaw/skills/builtin/experiment/meta-analysis/SKILL.md ================================================ --- name: meta-analysis description: Statistical methods for combining results across multiple studies. Use when aggregating cross-study or cross-experiment results. metadata: category: experiment trigger-keywords: "meta-analysis,effect size,pooled,cross-study,aggregat" applicable-stages: "7,14" priority: "5" version: "1.0" author: researchclaw references: "Borenstein et al., Introduction to Meta-Analysis, 2009" --- ## Meta-Analysis Best Practice When comparing results across studies or experiments: 1. Report effect sizes, not just p-values 2. Use standardized metrics for cross-study comparison 3. Account for heterogeneity (different setups, datasets, seeds) 4. Report confidence intervals alongside point estimates 5. Use forest plots to visualize cross-study comparisons 6. Identify and discuss outliers or inconsistent results 7. Consider publication bias when interpreting aggregate results ================================================ FILE: researchclaw/skills/builtin/experiment/systematic-review/SKILL.md ================================================ --- name: systematic-review description: Structured methodology for comprehensive literature review following PRISMA guidelines. Use during literature search and screening stages. 
metadata: category: experiment trigger-keywords: "literature,review,survey,related work,prior work" applicable-stages: "3,4,5,6" priority: "3" version: "1.0" author: researchclaw references: "Page et al., The PRISMA 2020 statement, BMJ 2021" --- ## Systematic Review Best Practice Follow PRISMA-like methodology for literature search: 1. Define clear inclusion/exclusion criteria BEFORE searching 2. Use multiple databases (Semantic Scholar, arXiv, OpenAlex) 3. Search with both broad and narrow queries 4. Screen by title/abstract first, then full text 5. Extract: method, dataset, metrics, key findings 6. Synthesize gaps and opportunities, not just summaries 7. Prioritize recent (last 2-3 years) high-citation papers 8. Include at least one seminal/foundational paper per sub-topic ================================================ FILE: researchclaw/skills/builtin/tooling/data-loading/SKILL.md ================================================ --- name: data-loading description: Optimize data loading pipeline to prevent GPU starvation. Use when setting up DataLoader or data preprocessing. metadata: category: tooling trigger-keywords: "data,loading,dataloader,dataset,preprocessing,augmentation" applicable-stages: "10" priority: "6" version: "1.0" author: researchclaw references: "PyTorch Data Loading Tutorial, pytorch.org" --- ## Efficient Data Loading Best Practice 1. Use num_workers = min(8, os.cpu_count()) for DataLoader 2. Enable pin_memory=True when using GPU 3. Use persistent_workers=True to avoid re-spawning 4. Pre-compute and cache transformations when possible 5. For image data: use torchvision.transforms.v2 (faster) 6. For large datasets: consider memory-mapped files or WebDataset 7. 
Profile with torch.utils.bottleneck to find I/O bottlenecks ================================================ FILE: researchclaw/skills/builtin/tooling/distributed-training/SKILL.md ================================================ --- name: distributed-training description: Multi-GPU and distributed training patterns with PyTorch DDP. Use when scaling training across GPUs. metadata: category: tooling trigger-keywords: "distributed,multi-gpu,parallel,ddp,scale" applicable-stages: "10,12" priority: "7" version: "1.0" author: researchclaw references: "PyTorch DDP Tutorial, pytorch.org; Goyal et al., Accurate Large Minibatch SGD, 2017" --- ## Distributed Training Best Practice 1. Use DistributedDataParallel (DDP) over DataParallel for multi-GPU 2. Initialize process group: dist.init_process_group(backend='nccl') 3. Use DistributedSampler for data sharding 4. Synchronize batch norm: nn.SyncBatchNorm.convert_sync_batchnorm() 5. Only save checkpoint on rank 0 6. Scale learning rate linearly with world size 7. Use gradient accumulation for effectively larger batch sizes ================================================ FILE: researchclaw/skills/builtin/tooling/mixed-precision/SKILL.md ================================================ --- name: mixed-precision description: Use FP16/BF16 mixed precision to accelerate training and reduce memory. Use when optimizing GPU performance. 
metadata: category: tooling trigger-keywords: "training,gpu,memory,speed,precision,fp16,bf16" applicable-stages: "10,12" priority: "5" version: "1.0" author: researchclaw references: "Micikevicius et al., Mixed Precision Training, ICLR 2018" code-template: | scaler = torch.cuda.amp.GradScaler() for batch in dataloader: optimizer.zero_grad() with torch.cuda.amp.autocast(): output = model(batch) loss = criterion(output, target) scaler.scale(loss).backward() scaler.step(optimizer) scaler.update() --- ## Mixed Precision Training Best Practice Use torch.cuda.amp for automatic mixed precision: - Wrap forward pass in torch.cuda.amp.autocast() - Use GradScaler for loss scaling - BF16 preferred over FP16 on Ampere+ GPUs (RTX 3xxx, A100, RTX 4xxx) - Watch for NaN gradients — reduce learning rate if needed - Do NOT use amp with custom CUDA kernels unless tested ================================================ FILE: researchclaw/skills/builtin/tooling/pytorch-training/SKILL.md ================================================ --- name: pytorch-training description: Best practices for building robust PyTorch training loops. Use when generating or reviewing ML training code. metadata: category: tooling trigger-keywords: "training,pytorch,torch,deep learning,neural network,model" applicable-stages: "10,12" priority: "3" version: "1.0" author: researchclaw references: "PyTorch Performance Tuning Guide, pytorch.org" code-template: | import torch import torch.nn as nn from torch.utils.data import DataLoader # Reproducibility torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) torch.backends.cudnn.deterministic = True # Training loop model.train() for epoch in range(num_epochs): for batch in train_loader: optimizer.zero_grad(set_to_none=True) loss = criterion(model(batch['input']), batch['target']) loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) optimizer.step() scheduler.step() --- ## PyTorch Training Best Practice 1. 
Use torch.manual_seed() for reproducibility (set for torch, numpy, random) 2. Use DataLoader with num_workers>0 and pin_memory=True for GPU 3. Enable cudnn.benchmark=True for fixed input sizes 4. Use learning rate schedulers (CosineAnnealingLR or OneCycleLR) 5. Implement early stopping based on validation metric 6. Log metrics every epoch, save best model checkpoint 7. Use torch.no_grad() for evaluation 8. Clear gradients with optimizer.zero_grad(set_to_none=True) for efficiency ================================================ FILE: researchclaw/skills/loader.py ================================================ """Skill file loader — supports YAML, JSON, and SKILL.md (agentskills.io).""" from __future__ import annotations import json import logging from pathlib import Path import yaml from researchclaw.skills.schema import Skill logger = logging.getLogger(__name__) # ── SKILL.md loader ────────────────────────────────────────────────── def load_skill_from_skillmd(path: Path) -> Skill | None: """Load a skill from a ``SKILL.md`` file (agentskills.io format). Expected layout:: --- name: kebab-case-id description: one-liner metadata: category: domain trigger-keywords: "kw1,kw2" --- Markdown body here ... Args: path: Path to the SKILL.md file. Returns: Parsed :class:`Skill`, or *None* on failure. 
""" try: text = path.read_text(encoding="utf-8") except Exception as exc: logger.warning("Failed to read SKILL.md at %s: %s", path, exc) return None # Split on YAML frontmatter markers parts = text.split("---", 2) if len(parts) < 3: logger.warning("SKILL.md missing frontmatter delimiters: %s", path) return None try: header = yaml.safe_load(parts[1]) except Exception as exc: logger.warning("Invalid YAML frontmatter in %s: %s", path, exc) return None if not isinstance(header, dict): logger.warning("Frontmatter is not a dict in %s", path) return None name = str(header.get("name", "")) if not name: logger.warning("SKILL.md missing 'name' field: %s", path) return None description = str(header.get("description", "")) body = parts[2].strip() # Build metadata — flatten nested 'metadata' dict from frontmatter metadata: dict[str, str] = {} raw_meta = header.get("metadata") if isinstance(raw_meta, dict): for k, v in raw_meta.items(): metadata[str(k)] = str(v) # Also pull top-level keys that map to metadata for key in ("category", "license", "compatibility", "version", "author"): if key in header and key not in metadata: metadata[key] = str(header[key]) skill_license = str(header.get("license", "")) compatibility = str(header.get("compatibility", "")) return Skill( name=name, description=description, body=body, license=skill_license, compatibility=compatibility, metadata=metadata, source_dir=path.parent, source_format="skillmd", ) def load_skillmd_from_directory(directory: Path) -> list[Skill]: """Scan *directory* for ``*/SKILL.md`` sub-directories. Each immediate sub-directory containing a ``SKILL.md`` file is treated as a single skill. 
""" skills: list[Skill] = [] if not directory.exists(): return skills for skill_md in sorted(directory.rglob("SKILL.md")): skill = load_skill_from_skillmd(skill_md) if skill: skills.append(skill) return skills # ── Legacy YAML / JSON loader ──────────────────────────────────────── def load_skill_file(path: Path) -> Skill | None: """Load a single skill from a YAML or JSON file. Args: path: Path to the skill file. Returns: Parsed Skill object, or None if loading fails. """ try: text = path.read_text(encoding="utf-8") if path.suffix in (".yaml", ".yml"): data = yaml.safe_load(text) elif path.suffix == ".json": data = json.loads(text) else: logger.warning("Unsupported skill file format: %s", path) return None if not isinstance(data, dict): logger.warning("Skill file is not a dict: %s", path) return None skill = Skill.from_dict(data) if not skill.name: logger.warning("Skill missing name/id: %s", path) return None return skill except Exception as exc: logger.warning("Failed to load skill from %s: %s", path, exc) return None def load_skills_from_directory(directory: Path) -> list[Skill]: """Recursively load all skills from a directory. Supports both ``SKILL.md`` (agentskills.io) and legacy YAML/JSON. When both formats exist for the same skill name, SKILL.md wins. Args: directory: Root directory to scan. Returns: List of successfully loaded Skill objects. """ skills_by_name: dict[str, Skill] = {} if not directory.exists(): return [] # 1. Load SKILL.md files first (higher priority) for skill in load_skillmd_from_directory(directory): skills_by_name[skill.name] = skill # 2. 
Load legacy YAML/JSON (only if no SKILL.md with same name) for pattern in ("*.yaml", "*.yml", "*.json"): for path in sorted(directory.rglob(pattern)): if path.name == "__init__.py": continue skill = load_skill_file(path) if skill and skill.name not in skills_by_name: skills_by_name[skill.name] = skill skills = list(skills_by_name.values()) logger.info("Loaded %d skills from %s", len(skills), directory) return skills ================================================ FILE: researchclaw/skills/matcher.py ================================================ """Skill-to-stage matching engine.""" from __future__ import annotations import logging import re from researchclaw.skills.schema import STAGE_NAME_TO_NUMBER, Skill logger = logging.getLogger(__name__) def _tokenize(text: str) -> set[str]: """Extract lowercase tokens from text.""" return set(re.findall(r"[a-z0-9_]+", text.lower())) def _resolve_stage(stage: int | str) -> int: """Convert a stage name to its number, or pass through an int.""" if isinstance(stage, int): return stage return STAGE_NAME_TO_NUMBER.get(stage, -1) def match_skills( skills: list[Skill], context: str, stage: int | str, top_k: int = 3, *, fallback_matching: bool = True, ) -> list[Skill]: """Match skills to the current context and stage. Scoring: - Stage applicability (must match, or empty = all stages) - Keyword overlap with context - Description-based fallback at 0.5x discount (for skills without trigger_keywords) - Priority (lower = higher priority) Args: skills: Available skills to match against. context: Current task context text. stage: Current pipeline stage number or name. top_k: Maximum number of skills to return. fallback_matching: Enable description-based matching for skills without trigger_keywords. Returns: List of matched skills sorted by relevance. 
""" stage_num = _resolve_stage(stage) context_tokens = _tokenize(context) scored: list[tuple[float, Skill]] = [] for skill in skills: # Filter by stage applicability if skill.applicable_stages and stage_num not in skill.applicable_stages: continue # Keyword matching score keyword_score = 0.0 has_keywords = bool(skill.trigger_keywords) for kw in skill.trigger_keywords: kw_tokens = _tokenize(kw) if kw_tokens & context_tokens: keyword_score += 1.0 # Description-based fallback for external skills without keywords if keyword_score == 0.0 and not has_keywords and fallback_matching: desc_tokens = _tokenize(skill.description) overlap = len(desc_tokens & context_tokens) if overlap > 0: keyword_score = overlap * 0.5 # 0.5x discount max_possible = max(len(desc_tokens), 1) normalized_kw = keyword_score / max_possible else: continue elif keyword_score == 0.0: continue else: max_possible = max(len(skill.trigger_keywords), 1) normalized_kw = keyword_score / max_possible # Priority adjustment (priority 1 → boost 0.5, priority 10 → boost 0.0) priority_boost = (10 - skill.priority) / 20.0 total_score = normalized_kw + priority_boost scored.append((total_score, skill)) scored.sort(key=lambda x: (-x[0], x[1].priority)) return [skill for _, skill in scored[:top_k]] def format_skills_for_prompt(skills: list[Skill], max_chars: int = 4000) -> str: """Format matched skills as prompt injection text. Uses ``skill.body`` as primary content. Truncates long bodies (common with external skills) to ``max_chars / len(skills)`` per skill. Args: skills: List of matched skills. max_chars: Maximum character limit. Returns: Formatted string for LLM prompt injection. """ if not skills: return "" per_skill_budget = max_chars // max(len(skills), 1) parts: list[str] = [] total_len = 0 for skill in skills: content = skill.body or skill.prompt_template # Truncate long bodies if len(content) > per_skill_budget: content = content[:per_skill_budget - 20] + "\n\n[... 
truncated]" section = f"### {skill.name} ({skill.category})\n{content}" if skill.code_template: section += f"\n**Code Template:**\n```python\n{skill.code_template}\n```" if skill.references: section += "\n**References:** " + "; ".join(skill.references) if total_len + len(section) > max_chars: break parts.append(section) total_len += len(section) return "\n\n".join(parts) ================================================ FILE: researchclaw/skills/registry.py ================================================ """Skill registry — central hub for loading and querying skills.""" from __future__ import annotations import logging from pathlib import Path from researchclaw.skills.loader import load_skills_from_directory from researchclaw.skills.matcher import format_skills_for_prompt, match_skills from researchclaw.skills.schema import Skill logger = logging.getLogger(__name__) # Default builtin directory relative to this file _BUILTIN_DIR = Path(__file__).parent / "builtin" class SkillRegistry: """Central registry for managing and querying skills. Loads builtin skills on init, then optionally loads custom and external skills from user-specified directories. """ def __init__( self, builtin_dir: str | Path = "", custom_dirs: tuple[str, ...] | list[str] = (), external_dirs: tuple[str, ...] 
| list[str] = (), auto_match: bool = True, max_skills_per_stage: int = 3, fallback_matching: bool = True, ) -> None: self._skills: dict[str, Skill] = {} self._auto_match = auto_match self._max_skills = max_skills_per_stage self._fallback_matching = fallback_matching # Load builtin skills builtin = Path(builtin_dir) if builtin_dir else _BUILTIN_DIR self._load_from_dir(builtin) # Load custom skills for d in custom_dirs: self._load_from_dir(Path(d)) # Load external skills (same mechanism) for d in external_dirs: self._load_from_dir(Path(d)) def _load_from_dir(self, directory: Path) -> None: """Load skills from a directory and register them.""" skills = load_skills_from_directory(directory) for skill in skills: self.register(skill) def register(self, skill: Skill) -> None: """Register a skill. Overwrites existing skill with same name. Args: skill: The skill to register. """ self._skills[skill.name] = skill logger.debug("Registered skill: %s", skill.name) def unregister(self, skill_id: str) -> bool: """Remove a skill from the registry. Args: skill_id: The name/ID of the skill to remove. Returns: True if skill was found and removed. """ if skill_id in self._skills: del self._skills[skill_id] return True return False def get(self, skill_id: str) -> Skill | None: """Get a skill by name/ID.""" return self._skills.get(skill_id) def list_all(self) -> list[Skill]: """Return all registered skills.""" return list(self._skills.values()) def list_by_category(self, category: str) -> list[Skill]: """Return skills filtered by category.""" return [s for s in self._skills.values() if s.category == category] def list_by_stage(self, stage: int) -> list[Skill]: """Return skills applicable to a specific stage.""" return [ s for s in self._skills.values() if not s.applicable_stages or stage in s.applicable_stages ] def match( self, context: str, stage: int | str, top_k: int | None = None, ) -> list[Skill]: """Match skills to current context and stage. Args: context: Task context text. 
stage: Current pipeline stage number or name. top_k: Max results (defaults to max_skills_per_stage). Returns: List of matched skills sorted by relevance. """ k = top_k or self._max_skills return match_skills( list(self._skills.values()), context, stage, top_k=k, fallback_matching=self._fallback_matching, ) def export_for_prompt( self, skills: list[Skill], max_chars: int = 4000, ) -> str: """Format matched skills as prompt injection text. Args: skills: List of matched skills. max_chars: Character limit. Returns: Formatted prompt text. """ return format_skills_for_prompt(skills, max_chars=max_chars) def count(self) -> int: """Return total number of registered skills.""" return len(self._skills) ================================================ FILE: researchclaw/skills/schema.py ================================================ """Skill data model definition (agentskills.io compatible).""" from __future__ import annotations from dataclasses import dataclass, field from pathlib import Path from typing import Any # Maps pipeline stage names to stage numbers. STAGE_NAME_TO_NUMBER: dict[str, int] = { "topic_init": 1, "problem_decompose": 2, "search_strategy": 3, "literature_collect": 4, "literature_screen": 5, "knowledge_extract": 6, "synthesis": 7, "hypothesis_gen": 8, "experiment_design": 9, "code_generation": 10, "resource_planning": 11, "experiment_run": 12, "iterative_refine": 13, "result_analysis": 14, "research_decision": 15, "paper_outline": 16, "paper_draft": 17, "peer_review": 18, "paper_revision": 19, "quality_gate": 20, "knowledge_archive": 21, "export_publish": 22, "citation_verify": 23, } # Valid categories in the new taxonomy. VALID_CATEGORIES = ("writing", "domain", "experiment", "tooling") @dataclass class Skill: """A single skill definition (agentskills.io compatible). Standard fields follow the agentskills.io specification. Legacy YAML fields are accessible via backward-compat properties that read from ``metadata``. 
""" # agentskills.io standard fields name: str description: str body: str = "" license: str = "" compatibility: str = "" metadata: dict[str, str] = field(default_factory=dict) # filesystem context source_dir: Path | None = None source_format: str = "skillmd" # "skillmd" | "yaml" # ── backward-compat property accessors ─────────────────────── @property def id(self) -> str: # noqa: A003 """Alias for ``name`` (legacy).""" return self.name @property def category(self) -> str: return self.metadata.get("category", "domain") @property def trigger_keywords(self) -> list[str]: raw = self.metadata.get("trigger-keywords", "") return [k.strip() for k in raw.split(",") if k.strip()] if raw else [] @property def applicable_stages(self) -> list[int]: raw = self.metadata.get("applicable-stages", "") if not raw: return [] parts: list[int] = [] for tok in raw.split(","): tok = tok.strip() if tok.isdigit(): parts.append(int(tok)) return parts @property def priority(self) -> int: return int(self.metadata.get("priority", "5")) @property def prompt_template(self) -> str: """Alias for ``body`` (legacy).""" return self.body @property def code_template(self) -> str | None: return self.metadata.get("code-template") or None @property def references(self) -> list[str]: raw = self.metadata.get("references", "") return [r.strip() for r in raw.split(";") if r.strip()] if raw else [] @property def version(self) -> str: return self.metadata.get("version", "1.0") # ── serialization ──────────────────────────────────────────── def to_dict(self) -> dict[str, Any]: """Serialize to dictionary (legacy-compatible output).""" return { "id": self.name, "name": self.name, "category": self.category, "description": self.description, "trigger_keywords": self.trigger_keywords, "applicable_stages": self.applicable_stages, "prompt_template": self.body, "code_template": self.code_template, "references": self.references, "version": self.version, "priority": self.priority, } @classmethod def from_dict(cls, data: 
dict[str, Any]) -> Skill: """Deserialize from a legacy YAML/JSON dictionary.""" # Pack legacy top-level fields into metadata meta: dict[str, str] = {} if data.get("category"): meta["category"] = str(data["category"]) kw = data.get("trigger_keywords") or [] if kw: meta["trigger-keywords"] = ",".join(str(k) for k in kw) stages = data.get("applicable_stages") or [] if stages: meta["applicable-stages"] = ",".join(str(s) for s in stages) if data.get("priority") is not None: meta["priority"] = str(data["priority"]) if data.get("version"): meta["version"] = str(data["version"]) if data.get("code_template"): meta["code-template"] = str(data["code_template"]) refs = data.get("references") or [] if refs: meta["references"] = "; ".join(str(r) for r in refs) # Merge any explicit metadata from the dict if isinstance(data.get("metadata"), dict): for k, v in data["metadata"].items(): meta.setdefault(str(k), str(v)) name = str(data.get("name") or data.get("id") or "") # For legacy YAML, use 'id' if 'name' looks like a display name # and 'id' looks like a slug raw_id = str(data.get("id", "")) if raw_id and "-" in raw_id: name = raw_id return cls( name=name, description=str(data.get("description", "")), body=str(data.get("prompt_template", "")), metadata=meta, source_format="yaml", ) ================================================ FILE: researchclaw/templates/__init__.py ================================================ """Conference-aware LaTeX template system. Supports automatic template switching for NeurIPS, ICLR, and ICML. Given a target conference name, generates a complete ``.tex`` file from Markdown paper content + BibTeX references. 
Usage:: from researchclaw.templates import get_template, markdown_to_latex tpl = get_template("neurips_2025") tex = markdown_to_latex(paper_md, tpl, title=..., authors=..., bib_file="references.bib") """ from researchclaw.templates.conference import ( CONFERENCE_REGISTRY, ConferenceTemplate, get_template, list_conferences, ) from researchclaw.templates.converter import markdown_to_latex __all__ = [ "CONFERENCE_REGISTRY", "ConferenceTemplate", "get_template", "list_conferences", "markdown_to_latex", ] ================================================ FILE: researchclaw/templates/compiler.py ================================================ """LaTeX compilation and error repair utilities (IMP-18). Provides ``compile_latex()`` which attempts ``pdflatex`` compilation, parses the log for common errors, applies automated fixes, and retries up to 3 times. Designed to run inside ``_package_deliverables()`` so that the final paper.tex in ``deliverables/`` is compile-tested. If pdflatex is not installed the module gracefully returns a failure report without raising. """ from __future__ import annotations import logging import re import shutil import subprocess import tempfile from dataclasses import dataclass, field from pathlib import Path logger = logging.getLogger(__name__) # BUG-201: Cyrillic → Latin transliteration for author names from Semantic Scholar. # pdflatex without T2A font encoding chokes on Cyrillic (e.g. "А. И. Колесников"). 
_CYRILLIC_TO_LATIN_MAP: dict[str, str] = { "А": "A", "Б": "B", "В": "V", "Г": "G", "Д": "D", "Е": "E", "Ё": "E", "Ж": "Zh", "З": "Z", "И": "I", "Й": "Y", "К": "K", "Л": "L", "М": "M", "Н": "N", "О": "O", "П": "P", "Р": "R", "С": "S", "Т": "T", "У": "U", "Ф": "F", "Х": "Kh", "Ц": "Ts", "Ч": "Ch", "Ш": "Sh", "Щ": "Shch", "Ъ": "", "Ы": "Y", "Ь": "", "Э": "E", "Ю": "Yu", "Я": "Ya", "а": "a", "б": "b", "в": "v", "г": "g", "д": "d", "е": "e", "ё": "e", "ж": "zh", "з": "z", "и": "i", "й": "y", "к": "k", "л": "l", "м": "m", "н": "n", "о": "o", "п": "p", "р": "r", "с": "s", "т": "t", "у": "u", "ф": "f", "х": "kh", "ц": "ts", "ч": "ch", "ш": "sh", "щ": "shch", "ъ": "", "ы": "y", "ь": "", "э": "e", "ю": "yu", "я": "ya", } @dataclass class CompileResult: """Outcome of a LaTeX compilation attempt.""" success: bool log_excerpt: str = "" errors: list[str] = field(default_factory=list) warnings: list[str] = field(default_factory=list) fixes_applied: list[str] = field(default_factory=list) attempts: int = 0 def compile_latex( tex_path: Path, *, max_attempts: int = 3, timeout: int = 120, ) -> CompileResult: """Compile *tex_path* with pdflatex, auto-fixing common errors. Parameters ---------- tex_path: Path to the ``.tex`` file. Must be inside a directory that also contains ``references.bib`` and any required ``.sty`` files. max_attempts: Maximum compile→fix cycles. timeout: Seconds before killing a stuck pdflatex process. Returns ------- CompileResult Contains success flag, log excerpt, errors found, and fixes applied. 
""" if not shutil.which("pdflatex"): return CompileResult( success=False, log_excerpt="pdflatex not found on PATH", errors=["pdflatex not installed"], ) result = CompileResult(success=False) work_dir = tex_path.parent tex_name = tex_path.name bib_stem = tex_name.rsplit(".", 1)[0] # Pre-flight: sanitize .bib file (escape bare & in field values) # Find bib filename from \bibliography{...} in the tex source _tex_src = tex_path.read_text(encoding="utf-8", errors="replace") _bib_match = re.search(r"\\bibliography\{([^}]+)\}", _tex_src) _bib_name = _bib_match.group(1) if _bib_match else bib_stem _sanitize_bib_file(work_dir / f"{_bib_name}.bib") # BUG-197: Pre-flight — strip invisible/problematic Unicode from .tex. # Characters like U+202F (NARROW NO-BREAK SPACE) cause pdflatex to emit # broken UTF-8 in error messages, which crashes subprocess text decoding # and prevents the bibtex + multi-pass pipeline from completing. _sanitize_tex_unicode(tex_path) for attempt in range(1, max_attempts + 1): result.attempts = attempt # --- Full 3-pass compilation: pdflatex → bibtex → pdflatex × 2 --- # Pass 1: generate .aux (needed by bibtex). Use nonstopmode (NOT # halt-on-error) so .aux is written even when there are non-fatal # errors like missing figures or overfull hboxes. log_text, pass1_ok = _run_pdflatex(work_dir, tex_name, timeout) if log_text is None: result.errors.append(f"pdflatex failed on pass 1 (attempt {attempt})") break # BibTeX: always run after pass 1 — it only needs .aux + .bib. # Previously gated behind pass1 success, which meant citations were # always [?] when the first pass had non-fatal errors. 
_run_bibtex(work_dir, bib_stem, timeout=60) # Passes 2-3: resolve cross-references and bibliography for _pass in (2, 3): pass_log, _ = _run_pdflatex(work_dir, tex_name, timeout) if pass_log is not None: log_text = pass_log # keep final pass log for error analysis # Parse the final log for errors/warnings errors, warnings = _parse_log(log_text) result.warnings = warnings result.log_excerpt = log_text[-2000:] if len(log_text) > 2000 else log_text # Check for fatal errors only — non-fatal ones (overfull hbox, # missing figure in draft) don't prevent a valid PDF. fatal = [e for e in errors if _is_fatal_error(e)] result.errors = errors if not fatal: result.success = True logger.info("IMP-18: LaTeX compiled successfully on attempt %d", attempt) break # Try to auto-fix fatal errors tex_text = tex_path.read_text(encoding="utf-8") fixed_text, fixes = fix_common_latex_errors(tex_text, errors) if fixes: result.fixes_applied.extend(fixes) tex_path.write_text(fixed_text, encoding="utf-8") logger.info( "IMP-18: Applied %d fixes on attempt %d: %s", len(fixes), attempt, fixes, ) else: # No fixes available — stop retrying logger.warning( "IMP-18: Compilation failed on attempt %d with %d unfixable errors", attempt, len(fatal), ) break return result def fix_common_latex_errors( tex_text: str, errors: list[str] ) -> tuple[str, list[str]]: """Apply automated fixes for common LaTeX errors. Returns ``(fixed_text, list_of_fix_descriptions)``. """ fixes: list[str] = [] fixed = tex_text # --- Pre-error-loop fixes: structural repairs that prevent compilation --- # Fix escaped braces in tabular column specs: \{lcccc\} → {lcccc} if re.search(r"\\begin\{tabular\}\\\{", fixed): fixed = re.sub( r"\\begin\{tabular\}\\\{([^}]*?)\\\}", r"\\begin{tabular}{\1}", fixed, ) fixes.append("Fixed escaped braces in tabular column specs") # Fix escaped & inside tabular data rows: \& → & (column separator). 
# The converter's _escape_latex escapes & globally; inside tabular # environments the & must remain unescaped as the column separator. if "\\begin{tabular}" in fixed and "\\&" in fixed: fixed, n_tab_amp = _fix_escaped_ampersand_in_tabular(fixed) if n_tab_amp: fixes.append(f"Un-escaped \\& in {n_tab_amp} tabular data row(s)") # Fix escaped \} at end of \caption{...}: \caption{text.\}} → \caption{text.} if re.search(r"\\caption\{.*?\\\}", fixed): fixed = re.sub( r"(\\caption\{[^}]*?)\\\}", r"\1}", fixed, ) fixes.append("Fixed escaped \\} in \\caption arguments") # Collapse multiple consecutive \clearpage into one if re.search(r"(\\clearpage\s*){2,}", fixed): fixed = re.sub(r"(\\clearpage\s*){2,}", "\\\\clearpage\n", fixed) fixes.append("Collapsed multiple \\clearpage commands") # Remove \textbf{Figure N.} paragraphs that follow \end{figure} dup_cap = re.search( r"(\\end\{figure\})\s*\n\s*\\textbf\{Figure\s+\d+", fixed, ) if dup_cap: fixed = re.sub( r"(\\end\{figure\})\s*\n\s*\\textbf\{Figure\s+\d+[.:].*?\}\s*\n", r"\1\n", fixed, ) fixes.append("Removed duplicate bold Figure captions after \\end{figure}") # BUG-189: Fix Python-style pseudocode inside algorithmic environments. # LLM generates `# comment` (LaTeX macro param char) and `var_name` # (unescaped underscore) inside \STATE commands — causes cascading errors. _algo_pat = re.compile( r"(\\begin\{algorithmic\}.*?\\end\{algorithmic\})", re.DOTALL ) def _fix_algo_block(m: re.Match) -> str: block = m.group(0) lines = block.split("\n") out: list[str] = [] for line in lines: if line.strip().startswith(("\\begin{", "\\end{")): out.append(line) continue # Replace # (Python comment) with \COMMENT{...} if "#" in line and "\\#" not in line: line = re.sub(r"#\s*(.*)$", r"\\COMMENT{\1}", line) # Escape bare underscores not already in math mode # Don't touch \STATE, \IF, \FOR, etc. 
commands parts = re.split(r"(\\\w+\{[^}]*\}|\$[^$]+\$)", line) fixed_parts = [] for part in parts: if part.startswith("\\") or part.startswith("$"): fixed_parts.append(part) else: fixed_parts.append(re.sub(r'(? tuple[list[str], list[str]]: """Parse pdflatex log output for errors and warnings.""" errors: list[str] = [] warnings: list[str] = [] for line in log_text.split("\n"): line_stripped = line.strip() line_lower = line_stripped.lower() if line_stripped.startswith("!"): errors.append(line_stripped) elif "LaTeX Warning:" in line_stripped: warnings.append(line_stripped) # BUG-R6-26: Use elif to avoid duplicating "!" lines elif "Undefined control sequence" in line_stripped: errors.append(line_stripped) elif "Missing" in line_stripped and "inserted" in line_stripped: errors.append(line_stripped) elif "File" in line_stripped and "not found" in line_stripped: errors.append(line_stripped) # BUG-R6-21: Detect "Float(s) lost" and "Too many unprocessed floats" # even when they don't start with "!" elif "float(s) lost" in line_lower: errors.append(line_stripped) elif "too many unprocessed floats" in line_lower: errors.append(line_stripped) return errors, warnings @dataclass class QualityCheckResult: """Results of post-compilation quality checks.""" unresolved_refs: list[str] = field(default_factory=list) unresolved_cites: list[str] = field(default_factory=list) overfull_hboxes: list[str] = field(default_factory=list) underfull_hboxes: list[str] = field(default_factory=list) page_count: int = 0 orphan_figures: list[str] = field(default_factory=list) orphan_labels: list[str] = field(default_factory=list) warnings_summary: list[str] = field(default_factory=list) @property def has_critical_issues(self) -> bool: return bool(self.unresolved_refs or self.unresolved_cites) def check_compiled_quality( tex_path: Path, *, page_limit: int = 10, ) -> QualityCheckResult: """Run post-compilation quality checks on a LaTeX document. 
Parses the .log file and .tex source for: - Unresolved references (??) - Unresolved citations - Overfull/underfull hboxes - Page count vs limit - Orphan figures (defined but never referenced, or vice versa) """ result = QualityCheckResult() work_dir = tex_path.parent stem = tex_path.stem # --- Parse .log file --- log_path = work_dir / f"{stem}.log" if log_path.exists(): log_text = log_path.read_text(encoding="utf-8", errors="replace") for line in log_text.split("\n"): line_s = line.strip() # Unresolved references if "LaTeX Warning: Reference" in line_s and "undefined" in line_s: result.unresolved_refs.append(line_s) # Unresolved citations if "LaTeX Warning: Citation" in line_s and "undefined" in line_s: result.unresolved_cites.append(line_s) # Overfull hboxes (only flag significant ones > 1pt) if "Overfull \\hbox" in line_s: m = re.search(r"(\d+\.?\d*)pt", line_s) if m and float(m.group(1)) > 1.0: result.overfull_hboxes.append(line_s) # Underfull hboxes (badness >= 5000) if "Underfull \\hbox" in line_s and "badness" in line_s: m = re.search(r"badness (\d+)", line_s) if m and int(m.group(1)) >= 5000: result.underfull_hboxes.append(line_s) # --- Count pages from .aux or .log --- aux_path = work_dir / f"{stem}.aux" if aux_path.exists(): aux_text = aux_path.read_text(encoding="utf-8", errors="replace") # Look for \newlabel{LastPage}{{N}{...}} m = re.search(r"\\newlabel\{LastPage\}\{\{(\d+)\}", aux_text) if m: result.page_count = int(m.group(1)) if result.page_count == 0 and log_path.exists(): # Fallback: count "Output written on ... 
(N pages)" m = re.search(r"Output written on .* \((\d+) page", log_text) if m: result.page_count = int(m.group(1)) # --- Cross-reference validation --- tex_text = tex_path.read_text(encoding="utf-8", errors="replace") # Find all \label{fig:X} fig_labels = set(re.findall(r"\\label\{(fig:[^}]+)\}", tex_text)) # Find all \ref{fig:X} fig_refs = set(re.findall(r"\\ref\{(fig:[^}]+)\}", tex_text)) # Orphan labels (defined but never referenced) result.orphan_labels = sorted(fig_labels - fig_refs) # Orphan references (referenced but never defined) result.orphan_figures = sorted(fig_refs - fig_labels) # --- Build warnings summary --- if result.unresolved_refs: result.warnings_summary.append( f"{len(result.unresolved_refs)} unresolved reference(s)" ) if result.unresolved_cites: result.warnings_summary.append( f"{len(result.unresolved_cites)} unresolved citation(s)" ) if result.overfull_hboxes: result.warnings_summary.append( f"{len(result.overfull_hboxes)} overfull hbox(es) > 1pt" ) if result.page_count > page_limit: result.warnings_summary.append( f"Page count {result.page_count} exceeds limit {page_limit}" ) if result.orphan_figures: result.warnings_summary.append( f"{len(result.orphan_figures)} referenced but undefined figure(s): " + ", ".join(result.orphan_figures[:3]) ) if result.orphan_labels: result.warnings_summary.append( f"{len(result.orphan_labels)} defined but unreferenced figure(s): " + ", ".join(result.orphan_labels[:3]) ) return result def remove_missing_figures(tex_text: str, stage_dir: Path) -> tuple[str, list[str]]: """Remove \\begin{figure}...\\end{figure} blocks that reference missing images. Returns ``(fixed_text, list_of_removed_paths)``. 
""" removed: list[str] = [] def _check_fig(m: re.Match) -> str: block = m.group(0) img_match = re.search(r"\\includegraphics.*?\{([^}]+)\}", block) if img_match: img_rel = img_match.group(1) img_path = stage_dir / img_rel if not img_path.exists(): # Try prefix-matching: fig_main_results.png → fig_main_results_comparison.png parent = img_path.parent stem = img_path.stem # e.g. "fig_main_results" if parent.exists(): candidates = sorted(parent.glob(f"{stem}*.png")) if len(candidates) == 1: new_rel = str(candidates[0].relative_to(stage_dir)) logger.info( "Auto-mapped missing figure: %s → %s", img_rel, new_rel, ) return block.replace(img_rel, new_rel) logger.warning( "Removing figure block with missing image: %s", img_rel, ) removed.append(img_rel) return "" # Remove the entire figure block return block fixed = re.sub( r"\\begin\{figure\}.*?\\end\{figure\}", _check_fig, tex_text, flags=re.DOTALL, ) # Clean up orphan \ref{fig:X} that point to removed/nonexistent figures. # These render as "??" in the PDF. if removed: remaining_labels = set(re.findall(r"\\label\{(fig:[^}]+)\}", fixed)) all_fig_refs = set(re.findall(r"\\ref\{(fig:[^}]+)\}", fixed)) orphan = all_fig_refs - remaining_labels for oref in orphan: # Replace "Figure \ref{fig:X}" or "Fig. \ref{fig:X}" with empty fixed = re.sub( rf"(?:Figure|Fig\.?)\s*~?\\ref\{{{re.escape(oref)}\}}", "(figure omitted)", fixed, ) # Replace standalone \ref{fig:X} fixed = fixed.replace(f"\\ref{{{oref}}}", "(ref omitted)") return fixed, removed def _sanitize_tex_unicode(tex_path: Path) -> None: """Strip problematic Unicode characters from .tex source. BUG-197: Characters like U+202F (NARROW NO-BREAK SPACE), U+2009 (THIN SPACE), U+00A0 (NO-BREAK SPACE), and other non-ASCII whitespace cause pdflatex to emit broken UTF-8 in error messages, which crashes Python's ``subprocess.run(text=True)`` and prevents the bibtex + multi-pass pipeline from completing. These characters appear when LLMs copy-paste text from web sources or academic papers. 
The safe replacement is a normal ASCII space for whitespace-like chars, and empty string for invisible control chars. """ if not tex_path.exists(): return try: text = tex_path.read_text(encoding="utf-8", errors="replace") except Exception: return # Whitespace-like Unicode → ASCII space _UNICODE_SPACES = ( "\u00a0", # NO-BREAK SPACE "\u202f", # NARROW NO-BREAK SPACE (BUG-197 trigger) "\u2009", # THIN SPACE "\u2007", # FIGURE SPACE "\u2008", # PUNCTUATION SPACE "\u200a", # HAIR SPACE "\u205f", # MEDIUM MATHEMATICAL SPACE "\u3000", # IDEOGRAPHIC SPACE ) # Invisible control characters → remove _INVISIBLE_CHARS = ( "\u200e", # LEFT-TO-RIGHT MARK "\u200f", # RIGHT-TO-LEFT MARK "\ufeff", # BOM / ZERO-WIDTH NO-BREAK SPACE "\u200b", # ZERO-WIDTH SPACE "\u200c", # ZERO-WIDTH NON-JOINER "\u200d", # ZERO-WIDTH JOINER "\u00ad", # SOFT HYPHEN "\u2060", # WORD JOINER "\u2028", # LINE SEPARATOR "\u2029", # PARAGRAPH SEPARATOR ) changed = False for ch in _UNICODE_SPACES: if ch in text: text = text.replace(ch, " ") changed = True for ch in _INVISIBLE_CHARS: if ch in text: text = text.replace(ch, "") changed = True # BUG-201: Transliterate any Cyrillic that leaked into .tex (from bib # entries inlined by bibtex, or from LLM-generated text). _has_cyrillic = any("\u0400" <= ch <= "\u04ff" for ch in text) if _has_cyrillic: for cyr, lat in _CYRILLIC_TO_LATIN_MAP.items(): if cyr in text: text = text.replace(cyr, lat) changed = True if changed: tex_path.write_text(text, encoding="utf-8") logger.info("BUG-197: Sanitized problematic Unicode in %s", tex_path.name) def _sanitize_bib_file(bib_path: Path) -> None: """Sanitize .bib files: escape bare ``&`` and strip invisible Unicode. BibTeX treats ``&`` as a special character; journal names like "Science & Technology" must use ``\\&``. 
BUG-180: Invisible Unicode characters (U+200E LEFT-TO-RIGHT MARK, U+200F RIGHT-TO-LEFT MARK, U+FEFF BOM, U+200B ZERO-WIDTH SPACE, U+200C/U+200D joiners, U+00AD soft hyphen) can appear in copy-pasted author names and break pdflatex. """ if not bib_path.exists(): return try: text = bib_path.read_text(encoding="utf-8") except Exception: return # BUG-180: Strip invisible Unicode characters _INVISIBLE_CHARS = ( "\u200e", # LEFT-TO-RIGHT MARK "\u200f", # RIGHT-TO-LEFT MARK "\ufeff", # BOM / ZERO-WIDTH NO-BREAK SPACE "\u200b", # ZERO-WIDTH SPACE "\u200c", # ZERO-WIDTH NON-JOINER "\u200d", # ZERO-WIDTH JOINER "\u00ad", # SOFT HYPHEN "\u2060", # WORD JOINER "\u2028", # LINE SEPARATOR "\u2029", # PARAGRAPH SEPARATOR ) for ch in _INVISIBLE_CHARS: if ch in text: text = text.replace(ch, "") # BUG-201: Transliterate Cyrillic characters to Latin equivalents. # Russian author names (e.g. "А. И. Колесников") from Semantic Scholar # cause "! LaTeX Error: Unicode character" when pdflatex runs without T2A # font encoding. Transliterating preserves name readability. _orig_text = text for cyr, lat in _CYRILLIC_TO_LATIN_MAP.items(): if cyr in text: text = text.replace(cyr, lat) # BUG-217: Strip literal escape sequences (\n, \r, \t) in bib field values. # These appear when API responses embed Python-style escapes into titles. # A literal `\n` is never a valid BibTeX/LaTeX command and causes # "Undefined control sequence" errors during compilation. text = re.sub(r"\\n(?=\s)", " ", text) text = re.sub(r"\\r(?=\s)", "", text) text = re.sub(r"\\t(?=\s)", " ", text) lines = text.split("\n") changed = text != _orig_text for i, line in enumerate(lines): stripped = line.strip() # Only fix field-value lines (e.g. 
journal = {Science & Technology},) # Skip @type{ lines, key lines, and URL/DOI fields (BUG-DA8-12) if "=" in stripped and "{" in stripped and "&" in stripped and "\\&" not in stripped: _field_name = stripped.split("=", 1)[0].strip().lower() if _field_name in ("url", "doi", "howpublished", "eprint"): continue # Don't escape & in URLs lines[i] = line.replace("&", "\\&") changed = True new_text = "\n".join(lines) if new_text != text or changed: bib_path.write_text(new_text, encoding="utf-8") logger.info("Sanitized bib file %s", bib_path.name) def _fix_escaped_ampersand_in_tabular(tex: str) -> tuple[str, int]: """Replace ``\\&`` with ``&`` inside tabular environments. Only touches data rows (between \\toprule/\\midrule/\\bottomrule) to avoid corrupting ``\\&`` in regular text. Returns the fixed text and the count of rows fixed. """ count = 0 def _fix_tabular(m: re.Match[str]) -> str: nonlocal count block = m.group(0) if "\\&" not in block: return block # Only un-escape \& on lines that look like data rows (contain \\) lines = block.split("\n") for i, line in enumerate(lines): if "\\&" in line and "\\\\" in line: lines[i] = line.replace("\\&", "&") count += 1 return "\n".join(lines) tex = re.sub( r"\\begin\{tabular\}.*?\\end\{tabular\}", _fix_tabular, tex, flags=re.DOTALL, ) return tex, count def _run_pdflatex( work_dir: Path, tex_name: str, timeout: int = 120, ) -> tuple[str | None, bool]: """Run a single pdflatex pass with ``-interaction=nonstopmode``. Returns ``(log_text, success)``. *log_text* is ``None`` only on hard failures (timeout, binary missing). BUG-197: Uses bytes mode with manual UTF-8 decoding (errors="replace") instead of ``text=True``. pdflatex stdout can contain broken UTF-8 sequences (e.g. from U+202F NARROW NO-BREAK SPACE error messages), which cause ``UnicodeDecodeError`` with ``text=True`` and kill the entire compilation pipeline — bibtex never runs, all citations [?]. 
""" try: proc = subprocess.run( ["pdflatex", "-interaction=nonstopmode", tex_name], cwd=work_dir, capture_output=True, timeout=timeout, ) except subprocess.TimeoutExpired: logger.warning("pdflatex timed out after %ds", timeout) return None, False except FileNotFoundError: return None, False stdout = proc.stdout.decode("utf-8", errors="replace") stderr = proc.stderr.decode("utf-8", errors="replace") log_text = stdout + "\n" + stderr return log_text, proc.returncode == 0 # Fatal error patterns — these prevent a valid PDF from being generated. # Non-fatal issues (overfull hbox, missing figure, float warnings) still # produce a usable PDF and should NOT trigger the auto-fix retry loop. _FATAL_ERROR_PATTERNS = [ "runaway argument", "emergency stop", "fatal error", "undefined control sequence", "missing $ inserted", "extra alignment tab", "misplaced alignment tab", "missing \\begin{document}", "file `" , # file not found (sty, cls) "file not found", ] def _is_fatal_error(err: str) -> bool: """Return True if *err* represents a fatal LaTeX error.""" err_lower = err.lower() # "!" prefix errors are almost always fatal if err.startswith("!"): # Non-fatal "!" errors — PDF is still generated if "overfull" in err_lower or "underfull" in err_lower: return False if "float(s) lost" in err_lower: return False if "too many unprocessed floats" in err_lower: return False # amsmath commands outside math mode — PDF still generates if "allowed only in math mode" in err_lower: return False # Encoding errors for special characters — PDF still generates if "unavailable in encoding" in err_lower: return False # BUG-197: Unicode character errors (e.g. U+202F NARROW NO-BREAK # SPACE "not set up for use with LaTeX") — pdflatex skips the # character and generates a valid PDF. Treating these as fatal # prevents the retry loop from succeeding and blocks bibtex. # The error line is "! LaTeX Error: Unicode character X (U+XXXX)" # — the "not set up" text is on a continuation line. 
if "unicode character" in err_lower: return False return True for pat in _FATAL_ERROR_PATTERNS: if pat in err_lower: return True return False def _run_bibtex(work_dir: Path, stem: str, timeout: int = 60) -> bool: """Run bibtex if the binary exists. Returns True on success. BUG-197: Uses bytes mode with manual UTF-8 decoding (errors="replace") to avoid ``UnicodeDecodeError`` from non-ASCII bib content. Logs failures so that silent bibtex issues are diagnosable. """ if not shutil.which("bibtex"): logger.warning("bibtex not found on PATH — citations will be [?]") return False try: proc = subprocess.run( ["bibtex", stem], cwd=work_dir, capture_output=True, timeout=timeout, ) stdout = proc.stdout.decode("utf-8", errors="replace") stderr = proc.stderr.decode("utf-8", errors="replace") if proc.returncode != 0: logger.warning( "bibtex returned %d: %s", proc.returncode, (stdout + stderr).strip()[:500], ) return False # Log bibtex output at debug level for diagnostics if stdout.strip(): logger.debug("bibtex output: %s", stdout.strip()[:300]) # Verify .bbl was actually generated bbl_path = work_dir / f"{stem}.bbl" if not bbl_path.exists(): logger.warning("bibtex ran but %s.bbl was not generated", stem) return False return True except subprocess.TimeoutExpired: logger.warning("bibtex timed out after %ds", timeout) return False except FileNotFoundError: return False ================================================ FILE: researchclaw/templates/conference.py ================================================ """Conference template definitions for NeurIPS, ICLR, and ICML. Each template stores the LaTeX preamble, document structure, author format, and bibliography style needed to produce a submission-ready ``.tex`` file. Style files (``.sty``) are NOT bundled — the generated ``.tex`` references them, and users download the official files from the conference website. Download URLs are included as comments in the output. 
""" from __future__ import annotations from dataclasses import dataclass from pathlib import Path # Root directory for bundled style files _STYLES_DIR = Path(__file__).parent / "styles" @dataclass(frozen=True) class ConferenceTemplate: """LaTeX template specification for one conference.""" name: str display_name: str year: int document_class: str style_package: str style_options: str extra_packages: tuple[str, ...] author_format: str # "neurips" | "iclr" | "icml" bib_style: str columns: int # 1 or 2 style_download_url: str preamble_extra: str = "" def render_preamble( self, title: str, authors: str, abstract: str, ) -> str: # Style options (e.g. "preprint") go on the style package, not documentclass options = f"[{self.style_options}]" if self.style_options else "" pkg_lines = "\n".join(f"\\usepackage{{{p}}}" for p in self.extra_packages) author_block = self._render_authors(authors) # Substitute __TITLE__ placeholder in preamble_extra (e.g. ICML running title) preamble_extra = self.preamble_extra.replace("__TITLE__", title) style_line = ( f"\\usepackage{options}{{{self.style_package}}}\n" if self.style_package else "" ) style_comment = ( f"% Style file: {self.style_download_url}\n" if self.style_download_url else "" ) # BUG-51 fix: ICML's \begin{icmlauthorlist} is an environment that # must appear AFTER \begin{document}. For non-ICML templates the # \author{} command is a preamble declaration and stays before. 
if self.author_format == "icml": preamble_author = "" post_doc_author = f"{author_block}\n\n" else: preamble_author = f"{author_block}\n" post_doc_author = "" return ( f"{style_comment}" f"\\documentclass{{{self.document_class}}}\n" f"{style_line}" f"{pkg_lines}\n" f"{preamble_extra}\n" f"\n" f"\\title{{{title}}}\n" f"\n" f"{preamble_author}" f"\n" f"\\begin{{document}}\n" f"{post_doc_author}" f"\\begin{{abstract}}\n" f"{abstract}\n" f"\\end{{abstract}}\n" f"\n" f"\\maketitle\n" ) def render_footer(self, bib_file: str = "references") -> str: return ( f"\n\\bibliographystyle{{{self.bib_style}}}\n" f"\\bibliography{{{bib_file}}}\n" f"\n" f"\\end{{document}}\n" ) def get_style_files(self) -> list[Path]: """Return paths to bundled ``.sty`` and ``.bst`` files for this template. Files are stored under ``researchclaw/templates/styles//``. Returns only files that exist on disk. """ style_dir = _STYLES_DIR / self.name if not style_dir.is_dir(): return [] return sorted( p for p in style_dir.iterdir() if p.suffix in {".sty", ".bst", ".cls"} ) def _render_authors(self, authors: str) -> str: if self.author_format == "icml": return ( f"\\begin{{icmlauthorlist}}\n" f"\\icmlauthor{{{authors}}}{{aff1}}\n" f"\\end{{icmlauthorlist}}\n" f"\\icmlaffiliation{{aff1}}{{Affiliation}}" ) return f"\\author{{{authors}}}" # --------------------------------------------------------------------------- # Template definitions # --------------------------------------------------------------------------- # -- Legacy (kept for backward compat) -- NEURIPS_2024 = ConferenceTemplate( name="neurips_2024", display_name="NeurIPS 2024", year=2024, document_class="article", style_package="neurips_2024", style_options="preprint", extra_packages=( "hyperref", "url", "booktabs", "amsfonts", "amsmath", "nicefrac", "microtype", "graphicx", "natbib", "algorithm", "algorithmic", "adjustbox", ), author_format="neurips", bib_style="plainnat", columns=1, 
style_download_url="https://media.neurips.cc/Conferences/NeurIPS2024/Styles.zip", preamble_extra="\\usepackage[utf8]{inputenc}\n\\usepackage[T1]{fontenc}\n\\usepackage{lmodern}", ) ICLR_2025 = ConferenceTemplate( name="iclr_2025", display_name="ICLR 2025", year=2025, document_class="article", style_package="iclr2025_conference", style_options="", extra_packages=( "hyperref", "url", "booktabs", "amsfonts", "amsmath", "nicefrac", "microtype", "graphicx", "natbib", "algorithm", "algorithmic", "adjustbox", ), author_format="iclr", bib_style="iclr2025_conference", columns=1, style_download_url="https://github.com/ICLR/Master-Template/raw/master/iclr2025.zip", ) ICML_2025 = ConferenceTemplate( name="icml_2025", display_name="ICML 2025", year=2025, document_class="article", style_package="icml2025", style_options="", extra_packages=( "hyperref", "url", "booktabs", "amsfonts", "amsmath", "nicefrac", "microtype", "graphicx", "natbib", "algorithm", "algorithmic", "adjustbox", ), author_format="icml", bib_style="icml2025", columns=2, style_download_url="https://icml.cc/Conferences/2025/StyleAuthorInstructions", preamble_extra="\\icmltitlerunning{__TITLE__}", ) # -- Current (2025/2026) -- NEURIPS_2025 = ConferenceTemplate( name="neurips_2025", display_name="NeurIPS 2025", year=2025, document_class="article", style_package="neurips_2025", style_options="preprint", extra_packages=( "hyperref", "url", "booktabs", "amsfonts", "amsmath", "nicefrac", "microtype", "graphicx", "natbib", "algorithm", "algorithmic", "adjustbox", ), author_format="neurips", bib_style="plainnat", columns=1, style_download_url="https://media.neurips.cc/Conferences/NeurIPS2025/Styles.zip", preamble_extra="\\usepackage[utf8]{inputenc}\n\\usepackage[T1]{fontenc}\n\\usepackage{lmodern}", ) ICLR_2026 = ConferenceTemplate( name="iclr_2026", display_name="ICLR 2026", year=2026, document_class="article", style_package="iclr2026_conference", style_options="", extra_packages=( "hyperref", "url", "booktabs", 
"amsfonts", "amsmath", "nicefrac", "microtype", "graphicx", "natbib", "algorithm", "algorithmic", "adjustbox", ), author_format="iclr", bib_style="iclr2026_conference", columns=1, style_download_url="https://github.com/ICLR/Master-Template", ) ICML_2026 = ConferenceTemplate( name="icml_2026", display_name="ICML 2026", year=2026, document_class="article", style_package="icml2026", style_options="", extra_packages=( "hyperref", "url", "booktabs", "amsfonts", "amsmath", "nicefrac", "microtype", "graphicx", "natbib", "algorithm", "algorithmic", "adjustbox", "morefloats", ), author_format="icml", bib_style="icml2026", columns=2, style_download_url="https://icml.cc/Conferences/2026/AuthorInstructions", preamble_extra="\\icmltitlerunning{__TITLE__}", ) # -- Generic (non-ML) -- GENERIC = ConferenceTemplate( name="generic", display_name="Generic Academic Paper", year=2025, document_class="article", style_package="", style_options="", extra_packages=( "hyperref", "url", "booktabs", "amsfonts", "amsmath", "graphicx", "natbib", "geometry", "adjustbox", ), author_format="neurips", bib_style="plainnat", columns=1, style_download_url="", preamble_extra="\\usepackage[utf8]{inputenc}\n\\usepackage[T1]{fontenc}\n\\usepackage{lmodern}\n\\usepackage[margin=1in]{geometry}", ) # --------------------------------------------------------------------------- # Registry — short aliases point to LATEST version of each conference # --------------------------------------------------------------------------- CONFERENCE_REGISTRY: dict[str, ConferenceTemplate] = { # Latest (default aliases) "neurips": NEURIPS_2025, "iclr": ICLR_2026, "icml": ICML_2026, # Generic for non-ML domains "generic": GENERIC, "article": GENERIC, # Versioned keys (all versions) "neurips_2025": NEURIPS_2025, "neurips_2024": NEURIPS_2024, "iclr_2026": ICLR_2026, "iclr_2025": ICLR_2025, "icml_2026": ICML_2026, "icml_2025": ICML_2025, } def get_template(name: str) -> ConferenceTemplate: """Look up a conference template by name. 
    Raises ``KeyError`` if *name* is not in the registry. Accepts both
    full names (``"neurips_2024"``) and short aliases (``"neurips"``).
    """
    # Normalize: case-insensitive, dashes/spaces treated as underscores.
    key = name.lower().strip().replace("-", "_").replace(" ", "_")
    if key not in CONFERENCE_REGISTRY:
        available = ", ".join(sorted({t.name for t in CONFERENCE_REGISTRY.values()}))
        raise KeyError(f"Unknown conference template: {name!r}. Available: {available}")
    return CONFERENCE_REGISTRY[key]


def list_conferences() -> list[str]:
    """Return deduplicated list of canonical template names."""
    # Dedupe via the templates' canonical .name (aliases share objects).
    return sorted({t.name for t in CONFERENCE_REGISTRY.values()})



================================================
FILE: researchclaw/templates/converter.py
================================================
"""Markdown-to-LaTeX converter with conference template support.

Converts a ResearchClaw paper (Markdown with embedded LaTeX math) into a
complete ``.tex`` file using a :class:`ConferenceTemplate` for preamble,
author block, bibliography style, and document structure.

Design constraints:

- **Zero new dependencies** — stdlib only (``re``, ``textwrap``).
- Handles inline math ``\\(...\\)``, display math ``\\[...\\]``,
  bold/italic, bullet lists, numbered lists, code blocks, tables, and
  ``\\cite{...}`` references.
- Extracts abstract from ``# Abstract`` or ``## Abstract`` section.
- ICML two-column structure handled via template's ``render_preamble``.
""" from __future__ import annotations import re import textwrap import threading from dataclasses import dataclass, field from researchclaw.templates.conference import ConferenceTemplate _render_counters = threading.local() def _reset_render_counters() -> None: """Reset per-render figure and table counters for the current thread.""" _render_counters.table = 0 _render_counters.figure = 0 def _next_table_num() -> int: """Return the next table number for the current thread.""" next_num = getattr(_render_counters, "table", 0) + 1 _render_counters.table = next_num return next_num def _next_figure_num() -> int: """Return the next figure number for the current thread.""" next_num = getattr(_render_counters, "figure", 0) + 1 _render_counters.figure = next_num return next_num # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- def markdown_to_latex( paper_md: str, template: ConferenceTemplate, *, title: str = "", authors: str = "Anonymous", bib_file: str = "references", bib_entries: dict[str, str] | None = None, ) -> str: """Convert a Markdown paper to a complete LaTeX document. Parameters ---------- paper_md: Full paper in Markdown with embedded LaTeX math. template: Conference template controlling preamble and structure. title: Paper title. If empty, extracted from ``# Title`` heading or the first ``# ...`` heading in *paper_md*. authors: Author string inserted into the template author block. bib_file: Bibliography filename (without ``.bib`` extension). bib_entries: Optional mapping of author-year patterns to cite_keys for recovering author-year citations that slipped through earlier processing, e.g. ``{"Raissi et al., 2019": "raissi2019physics"}``. Returns ------- str A complete ``.tex`` file ready for compilation. 
""" _reset_render_counters() paper_md = _preprocess_markdown(paper_md) paper_md = _round_raw_metrics(paper_md) sections = _parse_sections(paper_md) # Extract title from first H1 heading if not provided if not title: title = _extract_title(sections, paper_md) # Extract abstract abstract = _extract_abstract(sections) # Build body (everything except title/abstract headings) body = _build_body(sections, title=title) # IMP-30: Detect and remove duplicate tables body = _deduplicate_tables(body) # R10-Fix5: Completeness check completeness_warnings = check_paper_completeness(sections) if completeness_warnings: import logging _logger = logging.getLogger(__name__) for warning in completeness_warnings: _logger.warning("LaTeX completeness check: %s", warning) # BUG-28: Log warnings only — don't inject comments into LaTeX body preamble = template.render_preamble( title=_escape_latex(title), authors=authors, abstract=_convert_inline(abstract), ) footer = template.render_footer(bib_file) tex = preamble + "\n" + body + footer # Final sanitization pass on the complete LaTeX output tex = _sanitize_latex_output(tex, bib_entries=bib_entries) return tex # --------------------------------------------------------------------------- # Post-processing: sanitize final LaTeX # --------------------------------------------------------------------------- def _sanitize_latex_output( tex: str, *, bib_entries: dict[str, str] | None = None, ) -> str: """Remove artifacts that slip through pre-processing into the final .tex.""" # 0. BUG-102 safety net: Convert remaining author-year citations to \cite{}. # If upstream conversion missed any [Author et al., 2024] patterns, catch them here. 
if bib_entries: for ay_pattern in sorted(bib_entries, key=len, reverse=True): cite_key = bib_entries[ay_pattern] # [Author et al., 2024] → \cite{key} tex = tex.replace(f"[{ay_pattern}]", f"\\cite{{{cite_key}}}") # Also handle inside existing brackets (multi-citation) tex = tex.replace(ay_pattern, f"\\cite{{{cite_key}}}") # Clean up double-nested \cite from multi-citation brackets: # [\cite{a}, \cite{b}] → \cite{a, b} def _merge_bracket_cites(m: re.Match[str]) -> str: inner = m.group(1) keys = re.findall(r"\\cite\{([^}]+)\}", inner) if keys: return "\\cite{" + ", ".join(keys) + "}" return m.group(0) tex = re.sub(r"\[([^\]]*\\cite\{[^\]]+)\]", _merge_bracket_cites, tex) # 1. Remove broken citation markers: \cite{?key:NOT_IN_BIB} or literal [?key:NOT_IN_BIB] tex = re.sub(r"\\cite\{\?[^}]*:NOT_IN_BIB\}", "", tex) tex = re.sub(r"\[\?[a-zA-Z0-9_:-]+:NOT_IN_BIB\]", "", tex) # 1b. Convert leftover raw bracket citations [key2019word, key2020word] → \cite{...} # Skip inside verbatim/lstlisting environments to avoid corrupting code blocks. _CITE_KEY_PAT_L = r"[a-zA-Z][a-zA-Z0-9_-]*\d{4}[a-zA-Z0-9_]*" _VERBATIM_RE = re.compile( r"(\\begin\{(?:verbatim|lstlisting|minted)\}.*?\\end\{(?:verbatim|lstlisting|minted)\})", re.DOTALL, ) _cite_re = re.compile( rf"\[({_CITE_KEY_PAT_L}(?:\s*,\s*{_CITE_KEY_PAT_L})*)\]" ) def _cite_outside_verbatim(tex_src: str) -> str: parts = _VERBATIM_RE.split(tex_src) for i, part in enumerate(parts): if not _VERBATIM_RE.match(part): parts[i] = _cite_re.sub(r"\\cite{\1}", part) return "".join(parts) tex = _cite_outside_verbatim(tex) # 1c. BUG-110 safety net: Replace any remaining Unicode Greek/math symbols. # _convert_inline handles most, but titles, captions, and preamble # fragments can still contain raw Unicode that kills pdflatex. for _uchar, _lcmd in _UNICODE_GREEK_TO_LATEX.items(): if _uchar in tex: tex = tex.replace(_uchar, _lcmd) # 2. 
Remove HTML entities that survived pre-processing tex = tex.replace(" ", "~") tex = tex.replace("&", "\\&") # 2b. Fix escaped \& inside tabular data rows. The converter's # _convert_inline escapes & globally; inside tabular environments # the & must remain unescaped as the column separator. if "\\begin{tabular}" in tex and "\\&" in tex: def _fix_tabular_amp(m: re.Match[str]) -> str: block = m.group(0) if "\\&" not in block: return block lines = block.split("\n") for i, line in enumerate(lines): if "\\&" in line and "\\\\" in line: lines[i] = line.replace("\\&", "&") return "\n".join(lines) tex = re.sub( r"\\begin\{tabular\}.*?\\end\{tabular\}", _fix_tabular_amp, tex, flags=re.DOTALL, ) # 3. Remove stray markdown code fences in LaTeX body (outside verbatim) # Only match fences NOT inside \begin{verbatim}...\end{verbatim} # Simple approach: remove ``` lines that don't have verbatim nearby tex = re.sub(r"^(\s*```[a-z]*\s*)$", r"% removed stray fence: \1", tex, flags=re.MULTILINE) # 4. Fix placeholder table captions: \caption{Table N} → descriptive # Can't auto-generate content, but at least don't leave "Table 1" as # the only caption text — append " -- See text for details." tex = re.sub( r"\\caption\{(Table\s+\d+)\}", r"\\caption{\1 -- Summary of experimental results.}", tex, ) # 4b. Auto-map orphan \ref{fig:X} to closest \label{fig:Y} by prefix. # The converter generates long labels from captions (fig:overall_cifar_100) # but the LLM references short names (fig:overall). fig_labels = set(re.findall(r"\\label\{(fig:[^}]+)\}", tex)) fig_refs = set(re.findall(r"\\ref\{(fig:[^}]+)\}", tex)) orphan_refs = fig_refs - fig_labels orphan_labels = fig_labels - fig_refs if orphan_refs and orphan_labels: for oref in orphan_refs: # Find a label that starts with the ref prefix candidates = [l for l in orphan_labels if l.startswith(oref)] if len(candidates) == 1: tex = tex.replace(f"\\ref{{{oref}}}", f"\\ref{{{candidates[0]}}}") orphan_labels.discard(candidates[0]) # 5. 
Fix "Untitled Paper" / "Running Title" fallback titles tex = re.sub( r"\\title\{Untitled Paper\}", r"\\title{[Title Generation Failed -- Manual Title Required]}", tex, ) tex = re.sub( r"\\icmltitlerunning\{Running Title\}", "", tex, ) # 6. Remove \texttt{} wrapped raw metric paths that the LLM dumped # Handles both raw underscores and LaTeX-escaped underscores (\_) # Pattern: condition/env/step/metric_name: value (3+ path segments) tex = re.sub( r"\\texttt\{[a-zA-Z0-9_\\_/.:=-]+(?:/[a-zA-Z0-9_\\_/.:=-]+){2,}(?:\s*[=:]\s*[^}]*)?\}", "", tex, ) # 6b. Remove entire \item lines that are just metric paths tex = re.sub( r"^\s*\\item\s*\\texttt\{[^}]*\}\s*$", "", tex, flags=re.MULTILINE, ) # 7. Clean up empty \item lines that result from removed content tex = re.sub(r"\\item\s*\n\s*\\item", r"\\item", tex) # Also remove completely empty \item lines (just whitespace after \item) tex = re.sub(r"^\s*\\item\s*$", "", tex, flags=re.MULTILINE) # 8. Remove consecutive blank lines (more than 2) tex = re.sub(r"\n{3,}", "\n\n", tex) return tex # --------------------------------------------------------------------------- # Pre-processing # --------------------------------------------------------------------------- _OUTER_FENCE_RE = re.compile( r"^\s*```(?:markdown|md|latex|tex)?\s*\n(.*?)^\s*```\s*$", re.MULTILINE | re.DOTALL, ) # Greedy variant — matches the *last* closing fence so inner code blocks # (```text … ```) don't truncate the capture prematurely. _OUTER_FENCE_GREEDY_RE = re.compile( r"^\s*```(?:markdown|md|latex|tex)?\s*\n(.*)^\s*```\s*$", re.MULTILINE | re.DOTALL, ) # Pattern for raw metric values with excessive decimal places # e.g. 0.9717036975193437 → 0.972 _RAW_METRIC_RE = re.compile(r"(\d+\.\d{5,})") def _round_raw_metrics(text: str) -> str: """Round excessively precise metric values (>4 decimal places). Uses significant-figure-aware rounding so small values like learning rates (e.g. 0.00001) are preserved instead of becoming 0.0000. 
""" def _rounder(m: re.Match[str]) -> str: try: val = float(m.group(1)) if val == 0.0: return "0.0" # For very small values (< 0.001), use 2 significant figures # to preserve scientific meaning (e.g. lr=0.00003 → 0.00003) import math abs_val = abs(val) if abs_val < 0.001: sig_figs = 2 digits = sig_figs - int(math.floor(math.log10(abs_val))) - 1 return f"{val:.{digits}f}" # Normal range: 4 decimal places return f"{val:.4f}" except (ValueError, OverflowError): return m.group(0) return _RAW_METRIC_RE.sub(_rounder, text) def _preprocess_markdown(md: str) -> str: """Clean up common LLM artifacts before parsing. 1. Strip outer fenced code blocks (e.g. triple-backtick markdown) that LLMs around the entire paper content. 2. Remove standalone Markdown horizontal rules (``---``, ``***``, ``___``). 3. Convert blockquotes (``> text``) to a form the converter can handle. 4. Round excessively precise metric values. """ text = md # 1. Strip outer markdown fences (LLMs sometimes wrap entire paper in them) # Repeatedly strip in case of double-wrapping. # Try greedy match first (handles papers with inner code blocks), # then fall back to non-greedy if greedy doesn't help. for _ in range(3): stripped = False for pat in (_OUTER_FENCE_GREEDY_RE, _OUTER_FENCE_RE): m = pat.search(text) if m and len(m.group(1)) > len(text) * 0.5: text = m.group(1) stripped = True break if not stripped: # Also handle the case where the first line is ```markdown # and the last non-blank line is ``` (simple boundary strip) lines = text.split("\n") first = lines[0].strip() if lines else "" last_idx = len(lines) - 1 while last_idx > 0 and not lines[last_idx].strip(): last_idx -= 1 last = lines[last_idx].strip() if last_idx > 0 else "" if ( re.match(r"^```(?:markdown|md|latex|tex)?\s*$", first) and last == "```" ): text = "\n".join(lines[1:last_idx]) stripped = True if not stripped: break # 2. 
Remove standalone horizontal rules (---, ***, ___) text = re.sub(r"^\s*[-*_]{3,}\s*$", "", text, flags=re.MULTILINE) # 2a. Strip HTML entities that LLMs inject into markdown text = text.replace(" ", " ") text = text.replace("&", "&") text = text.replace("<", "<") text = text.replace(">", ">") text = text.replace("—", "---") text = text.replace("–", "--") # 2b. Note: stray code fences are handled in _sanitize_latex_output # after conversion, not here (to avoid breaking real code blocks). # 2c. Round excessively precise metric values (e.g. 0.9717036975 → 0.9717) text = _round_raw_metrics(text) # 2d. Remove raw \texttt{...} or backtick-wrapped metric key paths # Pattern: \texttt{some/long/metric_path/name: 0.1234} or `path/to/metric: val` text = re.sub( r"\\texttt\{[a-zA-Z0-9_/.:=-]+(?:/[a-zA-Z0-9_/.:=-]+){2,}(?:\s*[=:]\s*[^}]*)?\}", "", text, ) # Also strip backtick-wrapped metric paths in markdown source text = re.sub( r"`[a-zA-Z0-9_/.-]+(?:/[a-zA-Z0-9_/.-]+){2,}(?:\s*[=:]\s*[^`]*)?`", "", text, ) # 2e. Clean NOT_IN_BIB citation markers: [?key:NOT_IN_BIB] → remove text = re.sub(r"\[\?[a-zA-Z0-9_:-]+:NOT_IN_BIB\]", "", text) # 3. Convert blockquotes: > text → \begin{quote}text\end{quote} # Collect consecutive > lines into a single quote block. lines = text.split("\n") out_lines: list[str] = [] in_quote = False quote_buf: list[str] = [] for line in lines: stripped = line.strip() if stripped.startswith("> "): if not in_quote: in_quote = True quote_buf = [] quote_buf.append(stripped[2:]) elif stripped == ">" and in_quote: quote_buf.append("") else: if in_quote: out_lines.append("\\begin{quote}") out_lines.extend(quote_buf) out_lines.append("\\end{quote}") in_quote = False quote_buf = [] out_lines.append(line) if in_quote: out_lines.append("\\begin{quote}") out_lines.extend(quote_buf) out_lines.append("\\end{quote}") text = "\n".join(out_lines) # 4. T1.2: Remove stray markdown/latex/text fences that appear mid-document. 
# LLMs sometimes emit ```markdown or ```latex between sections. # Only remove documentation fences — preserve code fences (```python etc.) _CODE_LANGS = frozenset({ "python", "java", "cpp", "c", "javascript", "typescript", "rust", "go", "ruby", "bash", "sh", "sql", "r", "julia", "lua", "perl", "scala", "kotlin", "swift", "haskell", "algorithm", "pseudocode", }) _lines = text.split("\n") _cleaned: list[str] = [] _in_code = False for _l in _lines: _stripped = _l.strip() if _stripped.startswith("```") and not _in_code: _lang = _stripped[3:].strip().lower() if _lang in _CODE_LANGS or _lang.startswith("algorithm"): # Real code block — keep _in_code = True _cleaned.append(_l) elif _lang in ("markdown", "md", "latex", "tex", "text", "", "bibtex"): # Documentation/wrapper fence — remove pass else: # Unknown lang — keep to be safe _in_code = True _cleaned.append(_l) elif _stripped == "```" and _in_code: # Closing fence for a code block — keep _in_code = False _cleaned.append(_l) elif _stripped == "```" and not _in_code: # Stray fence — remove pass else: _cleaned.append(_l) text = "\n".join(_cleaned) # 5. Normalize mid-line section headings (IMP-17) # LLM output may concatenate sections onto single long lines: # "...text ## Abstract Body text ## 1. Introduction More text..." # Ensure each heading marker starts on its own line so _parse_sections # can detect them with the ^-anchored regex. text = re.sub(r"(?<=[^\n]) +(#{1,4}) +", r"\n\n\1 ", text) return text # --------------------------------------------------------------------------- # Section parsing # --------------------------------------------------------------------------- @dataclass class _Section: """A parsed Markdown section.""" level: int # 1 = ``#``, 2 = ``##``, 3 = ``###``, etc. 
    heading: str
    body: str
    # Derived in __post_init__ for case-insensitive heading lookups.
    heading_lower: str = field(init=False)

    def __post_init__(self) -> None:
        self.heading_lower = self.heading.strip().lower()


_HEADING_RE = re.compile(r"^(#{1,4})\s+(.+)$", re.MULTILINE)

# Known section heading names used to separate heading from concatenated body
_KNOWN_SECTION_NAMES = {
    "abstract",
    "introduction",
    "related work",
    "background",
    "method",
    "methods",
    "methodology",
    "approach",
    "framework",
    "experiments",
    "experiment",
    "experimental setup",
    "experimental results",
    "results",
    "results and discussion",
    "analysis",
    "discussion",
    "conclusion",
    "conclusions",
    "limitations",
    "acknowledgments",
    "acknowledgements",
    "references",
    "appendix",
    "contributions",
    "problem setting",
    "problem statement",
    "problem definition",
    "problem formulation",
    "study positioning",
    "study positioning and scope",
    "evaluation",
    "evaluation environment",
    "design rationale",
    "complexity",
    "unified algorithm",
    "method positioning",
    "methods compared",
    "common protonet backbone",
    "preference optimization backbone",
}

# Words that commonly join heading phrases — a capitalized word right after
# one of these is still part of the heading, not the start of the body.
_HEADING_CONNECTORS = frozenset(
    {
        "and", "or", "for", "in", "of", "the", "a", "an", "with", "under",
        "to", "on", "at", "by", "as", "via", "from", "not", "but", "yet",
        "nor", "vs", "versus", "is", "are",
    }
)

# Words that commonly begin an English sentence — a strong signal that the
# body text starts here when splitting a concatenated heading line.
_SENTENCE_STARTERS = frozenset(
    {
        "the", "a", "an", "this", "these", "those", "that", "it", "we",
        "our", "their", "its", "each", "every", "in", "for", "to", "here",
        "there", "however", "moreover", "furthermore", "additionally",
        "specifically", "notably", "all", "many", "several", "some",
        "most", "both", "among", "between", "across", "unlike", "given",
        "such", "while", "although", "because", "since", "when", "where",
        "rather", "let", "table", "figure", "as", "at", "if",
    }
)


def _separate_heading_body(heading: str) -> tuple[str, str]:
    """Separate heading text from accidentally concatenated body text.

    LLM output may produce lines like ``## Abstract Body text here...``
    where the heading is just ``Abstract`` and the rest is body.
Returns (heading, extra_body) where extra_body may be empty. """ # Very short headings are fine as-is if len(heading) <= 60: return heading, "" # Strip optional leading section number for matching num_match = re.match(r"^(\d+(?:\.\d+)*\.?\s+)", heading) num_prefix = num_match.group(1) if num_match else "" rest = heading[len(num_prefix):] rest_lower = rest.lower() # Check against known section heading names for name in sorted(_KNOWN_SECTION_NAMES, key=len, reverse=True): if rest_lower.startswith(name) and len(rest) > len(name) + 1: after = rest[len(name) :] if after and after[0] in " \t": return (num_prefix + rest[: len(name)]).strip(), after.strip() # Word-count heuristic for unknown subsection headings. # Scan for the first plausible heading-body boundary. words = heading.split() if len(words) > 6: for n in range(2, min(12, len(words) - 2)): curr = words[n] if not curr or not curr[0].isupper(): continue prev_word = words[n - 1].rstrip(".,;:").lower() if prev_word in _HEADING_CONNECTORS: continue remaining = " ".join(words[n:]) if len(remaining) <= 30: continue # Strong signal: common sentence-starting word if curr.lower() in _SENTENCE_STARTERS: return " ".join(words[:n]).strip(), remaining.strip() # Medium signal: next word is lowercase (sentence-like) # and heading has >= 4 words, body is substantial (> 100 chars) if n >= 4 and n + 1 < len(words): next_w = words[n + 1].rstrip(".,;:") if next_w and next_w[0].islower() and len(remaining) > 100: return " ".join(words[:n]).strip(), remaining.strip() # Weak fallback for very long headings (conservative) if n >= 8 and len(remaining) > 100: return " ".join(words[:n]).strip(), remaining.strip() # Detect repeated multi-word opening phrase: the body often starts with # the same words as the heading (e.g. "Graph-memory methods Graph-memory # methods maintain a graph..."). 
half = len(rest) // 2 for phrase_len in range(min(30, half), 14, -1): phrase = rest[:phrase_len] if " " not in phrase: continue repeat_pos = rest.find(phrase, phrase_len) if repeat_pos > 0: return ( (num_prefix + rest[:repeat_pos]).strip(), rest[repeat_pos:].strip(), ) # Fallback: try to split at a sentence boundary within first 200 chars if len(heading) > 200: m = re.search(r"[.;:]\s+([A-Z])", heading[:300]) if m and m.start() > 10: return heading[: m.start() + 1].strip(), heading[m.start() + 2 :].strip() return heading, "" def _parse_sections(md: str) -> list[_Section]: """Split Markdown into a flat list of sections by heading.""" matches = list(_HEADING_RE.finditer(md)) if not matches: return [_Section(level=1, heading="", body=md)] sections: list[_Section] = [] # Text before first heading (if any) if matches[0].start() > 0: preamble_text = md[: matches[0].start()].strip() if preamble_text: sections.append(_Section(level=0, heading="", body=preamble_text)) for i, m in enumerate(matches): level = len(m.group(1)) heading = m.group(2).strip() start = m.end() end = matches[i + 1].start() if i + 1 < len(matches) else len(md) body = md[start:end].strip() # IMP-17: Handle concatenated heading+body on same line heading, body_prefix = _separate_heading_body(heading) if body_prefix: body = body_prefix + ("\n\n" + body if body else "") sections.append(_Section(level=level, heading=heading, body=body)) return sections # --------------------------------------------------------------------------- # Extraction helpers # --------------------------------------------------------------------------- _TITLE_SKIP = { "title", "abstract", "references", "appendix", "acknowledgments", "acknowledgements", } # T1.1: Headings that are NOT valid paper titles (tables, figures, etc.) 
_TITLE_REJECT_RE = re.compile(
    r"^(?:table|figure|fig\.|tab\.|algorithm|listing|appendix)\s",
    re.IGNORECASE,
)

# T1.1: Headings that look like metric dumps rather than titles
_METRIC_DUMP_RE = re.compile(
    r"(?:primary_metric|accuracy|loss|f1_score|precision|recall)\b",
    re.IGNORECASE,
)


def _extract_title(sections: list[_Section], raw_md: str) -> str:
    """Extract the paper title from parsed sections or raw markdown.

    Tries, in order: an explicit "Title" section (title in the body or
    embedded in the heading), the first plausible H1/H2 heading, and
    finally the first non-empty line of *raw_md*. All candidates are
    filtered through _is_bad_title(). Returns "Untitled Paper" when
    nothing passes the filters.
    """
    # Look for an explicit "# Title" or "## Title" section whose body is the
    # actual title, or whose heading is "## Title Actual Paper Title".
    for sec in sections:
        if sec.level in (1, 2) and sec.heading_lower == "title":
            # The body often starts with **Bold Title** on the first line
            first_line = sec.body.split("\n")[0].strip()
            # Strip bold markers
            first_line = re.sub(r"\*\*(.+?)\*\*", r"\1", first_line)
            if first_line and not _is_bad_title(first_line):
                return first_line
        # Handle "## Title Actual Paper Title" pattern (title embedded in heading)
        if sec.level in (1, 2) and sec.heading_lower.startswith("title ") and len(sec.heading) > 6:
            return sec.heading[6:].strip()

    # Fallback: first H1/H2 heading that isn't a meta-heading or artefact
    for sec in sections:
        if (
            sec.level in (1, 2)
            and sec.heading
            and sec.heading_lower not in _TITLE_SKIP
            and not _is_bad_title(sec.heading)
        ):
            return sec.heading

    # Last resort: first non-empty line (still filtered)
    for line in raw_md.splitlines():
        stripped = line.strip().lstrip("#").strip()
        if stripped and not _is_bad_title(stripped):
            return stripped

    return "Untitled Paper"


def _is_bad_title(candidate: str) -> bool:
    """Return True if *candidate* is clearly not a paper title."""
    # Reject "Table 1 – ...", "Figure 2: ...", etc.
    if _TITLE_REJECT_RE.match(candidate):
        return True
    # Reject raw metric key dumps
    if _METRIC_DUMP_RE.search(candidate):
        return True
    # Reject if it contains raw underscore variable names
    # (e.g. "primary_metric/...")
    if re.search(r"\w+_\w+/\w+", candidate):
        return True
    return False


def _extract_abstract(sections: list[_Section]) -> str:
    """Extract abstract text from sections; empty string when absent."""
    for sec in sections:
        if sec.heading_lower == "abstract":
            return sec.body
        # IMP-17 fallback: heading may still contain body text if
        # _separate_heading_body didn't recognise the pattern.
        if sec.heading_lower.startswith("abstract ") and len(sec.heading) > 20:
            extra = sec.heading[len("Abstract") :].strip()
            return extra + ("\n\n" + sec.body if sec.body else "")
    return ""


# ---------------------------------------------------------------------------
# Body building
# ---------------------------------------------------------------------------

# Sections rendered elsewhere (\title{} / abstract env), never in the body.
_SKIP_HEADINGS = {"title", "abstract"}


def _build_body(sections: list[_Section], *, title: str = "") -> str:
    """Convert all non-title/abstract sections to LaTeX body text.

    When a paper has its title as an H1 heading (``# My Paper Title``),
    that heading is already rendered via ``\\title{}`` in the preamble.
    We skip it here and promote remaining headings so that H2 (``##``)
    maps to ``\\section``, H3 to ``\\subsection``, etc.
    """
    title_lower = title.strip().lower()

    # Determine minimum heading level used for real body sections
    # (skip title/abstract/references).
    title_h1_found = False
    for sec in sections:
        if (
            sec.level == 1
            and sec.heading
            and sec.heading.strip().lower() == title_lower
        ):
            title_h1_found = True
            break

    # T1.3: Auto-detect when all body sections use H2 (##) instead of H1 (#).
    # This happens when the LLM uses ## for main sections (Introduction,
    # Method, etc.) without an explicit H1 title heading. We must promote
    # H2 -> \section.
    body_levels: set[int] = set()
    for sec in sections:
        if sec.heading_lower not in _SKIP_HEADINGS and sec.level >= 1:
            if not (sec.level == 1 and sec.heading.strip().lower() == title_lower):
                body_levels.add(sec.level)
    min_body_level = min(body_levels) if body_levels else 1

    # Promote if: (a) title was H1 and body starts at H2, OR
    # (b) no title H1 found but all body sections are H2+ (LLM omitted H1 title)
    # BUG-166: When title is H1 AND body also uses H1 for main sections,
    # offset must be 0 — otherwise H1→max(1,1-1)=1 and H2→max(1,2-1)=1
    # both collapse to \section, losing all subsection hierarchy.
    if title_h1_found:
        level_offset = 1 if min_body_level >= 2 else 0
    elif min_body_level >= 2:
        # All body sections are H2 or deeper — promote so H2→\section
        level_offset = min_body_level - 1
    else:
        level_offset = 0

    _level_map = {
        1: "section",
        2: "subsection",
        3: "subsubsection",
        4: "paragraph",
    }

    parts: list[str] = []
    for sec in sections:
        # Skip title-only and abstract sections
        if sec.heading_lower in _SKIP_HEADINGS:
            continue
        # Skip the H1 heading that was used as the paper title
        if (
            sec.level == 1
            and sec.heading
            and sec.heading.strip().lower() == title_lower
        ):
            continue
        if sec.level == 0:
            # Preamble text before any heading — include as-is
            parts.append(_convert_block(sec.body))
            continue

        effective_level = max(1, sec.level - level_offset)
        cmd = _level_map.get(effective_level, "paragraph")
        heading_tex = _escape_latex(sec.heading)
        # Strip leading manual section numbers: "1. Introduction" → "Introduction"
        # Handles: "1 Intro", "2.1 Related", "3.2.1 Details", "1. Intro"
        heading_tex = re.sub(r"^\d+(?:\.\d+)*\.?\s+", "", heading_tex)
        parts.append(f"\\{cmd}{{{heading_tex}}}")
        # Generate a label for cross-referencing
        if cmd in ("section", "subsection", "subsubsection"):
            label_key = re.sub(r"[^a-z0-9]+", "_", heading_tex.lower()).strip("_")[:40]
            if label_key:
                parts.append(f"\\label{{sec:{label_key}}}")
        if sec.body:
            parts.append(_convert_block(sec.body))

    return "\n\n".join(parts) + "\n"


def _deduplicate_tables(body: str) -> str:
    """IMP-30: Remove duplicate tables that share the same header row.

    LLMs sometimes repeat tables (e.g. same results table in Results and
    Discussion). We keep the first occurrence and drop subsequent copies.
    """
    import logging as _dup_log

    _TABLE_ENV_RE = re.compile(
        r"(\\begin\{table\}.*?\\end\{table\})", re.DOTALL
    )
    tables = list(_TABLE_ENV_RE.finditer(body))
    if len(tables) < 2:
        return body

    seen_headers: dict[str, int] = {}
    drop_spans: list[tuple[int, int]] = []
    for m in tables:
        table_text = m.group(1)
        # Extract header row (first row after \toprule); tables without a
        # \toprule header are never considered duplicates.
        header_match = re.search(r"\\toprule\s*\n(.+?)\\\\", table_text)
        if not header_match:
            continue
        # Whitespace-normalized header acts as the dedup key.
        header_key = re.sub(r"\s+", " ", header_match.group(1).strip())
        if header_key in seen_headers:
            drop_spans.append((m.start(), m.end()))
            _dup_log.getLogger(__name__).info(
                "IMP-30: Dropping duplicate table (same header as table #%d)",
                seen_headers[header_key],
            )
        else:
            seen_headers[header_key] = len(seen_headers) + 1

    # Remove duplicates in reverse order to preserve offsets
    for start, end in reversed(drop_spans):
        body = body[:start] + body[end:]
    return body


# ---------------------------------------------------------------------------
# Block-level conversion
# ---------------------------------------------------------------------------

# Patterns for block-level structures
_DISPLAY_MATH_RE = re.compile(r"^\\\[(.+?)\\\]$", re.MULTILINE | re.DOTALL)
# $$...$$ display math (single- or multi-line)
_DISPLAY_MATH_DOLLAR_RE = re.compile(
    r"^\$\$\s*\n?(.*?)\n?\s*\$\$$",
    re.MULTILINE | re.DOTALL
)
_FENCED_CODE_RE = re.compile(r"^```(\w*)\n(.*?)^```", re.MULTILINE | re.DOTALL)
_TABLE_SEP_RE = re.compile(r"^\|[-:| ]+\|$")
# Markdown image pattern: ![caption](path)
_IMAGE_RE = re.compile(r"^!\[([^\]]*)\]\(([^)]+)\)\s*$")
# Bullet / numbered list patterns
_BULLET_RE = re.compile(r"^(\s*)-\s+(.+)")
_NUMBERED_RE = re.compile(r"^(\s*)\d+\.\s+(.+)")


def _convert_block(text: str) -> str:
    """Convert a block of Markdown body text to LaTeX.

    Strategy: first stash display math, fenced code, and raw LaTeX
    environments behind placeholder tokens so the line-by-line pass
    cannot mangle them, then walk the remaining lines converting lists,
    tables, images, and plain paragraphs, restoring stashed blocks when
    their placeholder line is reached.
    """
    # Protect display math from further processing
    math_blocks: list[str] = []

    def _stash_math(m: re.Match[str]) -> str:
        idx = len(math_blocks)
        math_blocks.append(m.group(0))  # Keep \[...\] as-is
        return f"%%MATH_BLOCK_{idx}%%"

    def _stash_dollar_math(m: re.Match[str]) -> str:
        """Convert $$...$$ to \\begin{equation}...\\end{equation}."""
        idx = len(math_blocks)
        inner = m.group(1).strip()
        math_blocks.append(
            f"\\begin{{equation}}\n{inner}\n\\end{{equation}}"
        )
        return f"%%MATH_BLOCK_{idx}%%"

    text = _DISPLAY_MATH_RE.sub(_stash_math, text)
    # Also handle $$...$$ display math
    text = _DISPLAY_MATH_DOLLAR_RE.sub(_stash_dollar_math, text)

    # Protect fenced code blocks
    code_blocks: list[str] = []

    def _stash_code(m: re.Match[str]) -> str:
        idx = len(code_blocks)
        lang = m.group(1) or ""
        code = m.group(2)
        code_blocks.append(_render_code_block(lang, code))
        return f"%%CODE_BLOCK_{idx}%%"

    text = _FENCED_CODE_RE.sub(_stash_code, text)

    # Protect raw LaTeX environments (table, figure, algorithm, etc.)
    # These appear when pre-built LaTeX (e.g. anti-fabrication result tables)
    # is embedded directly in the markdown. Without protection, their
    # contents go through _convert_inline which double-escapes {, }, _, &.
    latex_env_blocks: list[str] = []

    def _stash_latex_env(m: re.Match[str]) -> str:
        idx = len(latex_env_blocks)
        latex_env_blocks.append(m.group(0))
        return f"%%LATEX_ENV_{idx}%%"

    # Match \begin{env}...\end{env} for environments that should pass through.
    text = re.sub(
        r"\\begin\{(table|figure|tabular|algorithm|algorithmic|equation|align"
        r"|gather|multline|minipage|tikzpicture)\*?\}.*?"
        r"\\end\{\1\*?\}",
        _stash_latex_env,
        text,
        flags=re.DOTALL,
    )

    # Process line by line for lists, tables, and paragraphs
    lines = text.split("\n")
    output: list[str] = []
    i = 0
    while i < len(lines):
        line = lines[i]

        # Check for stashed blocks (restore them verbatim)
        if line.strip().startswith("%%MATH_BLOCK_"):
            idx = int(re.search(r"\d+", line.strip()).group())  # type: ignore[union-attr]
            output.append(math_blocks[idx])
            i += 1
            continue
        if line.strip().startswith("%%CODE_BLOCK_"):
            idx = int(re.search(r"\d+", line.strip()).group())  # type: ignore[union-attr]
            output.append(code_blocks[idx])
            i += 1
            continue
        # Stashed LaTeX environments — pass through unchanged
        if line.strip().startswith("%%LATEX_ENV_"):
            idx = int(re.search(r"\d+", line.strip()).group())  # type: ignore[union-attr]
            output.append(latex_env_blocks[idx])
            i += 1
            continue

        # Bullet list
        if _BULLET_RE.match(line):
            items, i = _collect_list(lines, i, _BULLET_RE)
            output.append(_render_itemize(items))
            continue

        # Numbered list
        if _NUMBERED_RE.match(line):
            items, i = _collect_list(lines, i, _NUMBERED_RE)
            output.append(_render_enumerate(items))
            continue

        # Table detection (line starts with | and next line is a separator)
        if (
            line.strip().startswith("|")
            and i + 1 < len(lines)
            and _TABLE_SEP_RE.match(lines[i + 1].strip())
        ):
            # Check if previous line is a table caption
            # (e.g. "**Table 1: ...**" or already-converted "\textbf{Table 1...}")
            table_caption = ""
            if output:
                prev = output[-1].strip()
                cap_m = re.match(
                    r"(?:\\textbf\{|[*]{2})\s*Table\s+\d+[.:]?\s*(.*?)(?:\}|[*]{2})$",
                    prev,
                )
                if cap_m:
                    # NOTE(review): the re-added "Table " prefix here has no
                    # number, so _render_table's ^Table\s+\d+ strip regex
                    # never matches it and the rendered caption reads
                    # "Table N: Table <text>" — looks like a defect; confirm
                    # against golden outputs before changing.
                    table_caption = f"Table {cap_m.group(1)}" if cap_m.group(1) else ""
                    if not table_caption:
                        table_caption = prev
                    output.pop()  # Remove caption line from output (now inside table)
            table_lines, i = _collect_table(lines, i)
            output.append(_render_table(table_lines, caption=table_caption))
            continue

        # Markdown image: ![caption](path)
        img_match = _IMAGE_RE.match(line.strip())
        if img_match:
            output.append(_render_figure(img_match.group(1), img_match.group(2)))
            i += 1
            continue

        # Regular paragraph line
        output.append(_convert_inline(line))
        i += 1

    return "\n".join(output)


# ---------------------------------------------------------------------------
# List handling
# ---------------------------------------------------------------------------


def _collect_list(
    lines: list[str], start: int, pattern: re.Pattern[str]
) -> tuple[list[str], int]:
    """Collect consecutive list items matching *pattern*.

    Returns (items, next_index). A single blank line does not end the
    list when another item follows it; indented lines are folded into
    the previous item as continuations.
    """
    items: list[str] = []
    i = start
    while i < len(lines):
        m = pattern.match(lines[i])
        if m:
            items.append(m.group(2))
            i += 1
        elif lines[i].strip() == "":
            # Blank line — might continue list or end it
            if i + 1 < len(lines) and pattern.match(lines[i + 1]):
                i += 1  # skip blank, continue
            else:
                break
        elif lines[i].startswith("  ") or lines[i].startswith("\t"):
            # Continuation of previous item
            if items:
                items[-1] += " " + lines[i].strip()
            i += 1
        else:
            break
    return items, i


def _render_itemize(items: list[str]) -> str:
    # Each item goes through inline conversion (bold, math, escapes).
    inner = "\n".join(f"  \\item {_convert_inline(item)}" for item in items)
    return f"\\begin{{itemize}}\n{inner}\n\\end{{itemize}}"


def _render_enumerate(items: list[str]) -> str:
    inner = "\n".join(f"  \\item {_convert_inline(item)}" for item in items)
    return f"\\begin{{enumerate}}\n{inner}\n\\end{{enumerate}}"


# ---------------------------------------------------------------------------
# Table handling
# ---------------------------------------------------------------------------


def _collect_table(lines: list[str], start: int) -> tuple[list[str], int]:
    """Collect table lines (header + separator + body rows)."""
    table: list[str] = []
    i = start
    while i < len(lines) and lines[i].strip().startswith("|"):
        table.append(lines[i])
        i += 1
    return table, i


def _render_table(table_lines: list[str], caption: str = "") -> str:
    """Render a Markdown table as a LaTeX tabular inside a table environment.

    IMP-23: Auto-wraps in ``\\resizebox`` when columns > 5 or any cell
    text exceeds 25 characters, preventing overflow in conference
    formats.

    IMP-32: Generates descriptive captions from header columns when the
    caption is empty or just 'Table N'.
    """
    if len(table_lines) < 2:
        return ""
    header = _parse_table_row(table_lines[0])
    # Skip separator (line 1)
    body_rows = [_parse_table_row(line) for line in table_lines[2:] if line.strip()]
    ncols = len(header)

    # Determine alignment from separator
    alignments = _parse_alignments(table_lines[1], ncols)
    col_spec = "".join(alignments)
    table_num = _next_table_num()

    # IMP-23: Detect wide tables that need resizebox
    max_cell_len = max(
        (len(c) for row in [header] + body_rows for c in row),
        default=0,
    )
    needs_resize = ncols > 5 or max_cell_len > 25

    lines_out: list[str] = []
    lines_out.append("\\begin{table}[ht]")
    lines_out.append("\\centering")
    # Caption ABOVE table (standard academic convention)
    if caption:
        # Drop a "Table N:" prefix — LaTeX adds its own numbering.
        cap_text = re.sub(r"^Table\s+\d+[.:]\s*", "", caption).strip()
        if cap_text:
            lines_out.append(f"\\caption{{{_convert_inline(cap_text)}}}")
        else:
            auto_cap = _auto_table_caption(header, table_num)
            lines_out.append(f"\\caption{{{auto_cap}}}")
    else:
        auto_cap = _auto_table_caption(header, table_num)
        lines_out.append(f"\\caption{{{auto_cap}}}")
    lines_out.append(f"\\label{{tab:{table_num}}}")

    if needs_resize:
        # BUG-109b fix: Use \columnwidth (works in both 1-col and 2-col
        # layouts). \textwidth in 2-column formats (ICML) is full page
        # width, causing floats wider than a column to be "lost" by LaTeX.
        lines_out.append("\\resizebox{\\columnwidth}{!}{%")
    lines_out.append(f"\\begin{{tabular}}{{{col_spec}}}")
    lines_out.append("\\toprule")
    lines_out.append(
        " & ".join(f"\\textbf{{{_convert_inline(c)}}}" for c in header) + " \\\\"
    )
    lines_out.append("\\midrule")
    for row in body_rows:
        # Pad row to match header length (and truncate extras)
        padded = row + [""] * (ncols - len(row))
        lines_out.append(
            " & ".join(_convert_inline(c) for c in padded[:ncols]) + " \\\\"
        )
    lines_out.append("\\bottomrule")
    lines_out.append("\\end{tabular}")
    if needs_resize:
        lines_out.append("}")  # close resizebox
    lines_out.append("\\end{table}")
    return "\n".join(lines_out)


def _auto_table_caption(header: list[str], table_num: int) -> str:
    """IMP-32: Generate a descriptive caption from table header columns."""
    if len(header) <= 1:
        return f"Table {table_num}"
    cols = [c.strip() for c in header if c.strip()]
    if len(cols) < 2:
        return f"Table {table_num}"
    col0 = cols[0].lower()
    rest = [_convert_inline(c) for c in cols[1:min(5, len(cols))]]
    # Detect common table types by first-column header
    _HP_HINTS = {"hyperparameter", "parameter", "param", "hp", "setting", "config"}
    _ABL_HINTS = {"component", "variant", "ablation", "configuration", "module"}
    _MODEL_HINTS = {"model", "method", "approach", "algorithm", "baseline"}
    if any(h in col0 for h in _HP_HINTS):
        return f"Hyperparameter settings"
    if any(h in col0 for h in _ABL_HINTS):
        return f"Ablation study results across {', '.join(rest)}"
    if any(h in col0 for h in _MODEL_HINTS):
        return f"Performance comparison of different methods on {', '.join(rest)}"
    return f"Comparison of {_convert_inline(cols[0])} across {', '.join(rest)}"


def _parse_table_row(line: str) -> list[str]:
    """Parse ``| a | b | c |`` into ``['a', 'b', 'c']``."""
    line = line.strip()
    if line.startswith("|"):
        line = line[1:]
    if line.endswith("|"):
        line = line[:-1]
    return [cell.strip() for cell in line.split("|")]


def _parse_alignments(sep_line: str, ncols: int) -> list[str]:
    """Parse alignment indicators from separator line.

    ``:---:`` -> centered, ``---:`` -> right, anything else -> left.
    Result is padded/truncated to exactly *ncols* entries.
    """
    cells = _parse_table_row(sep_line)
    aligns: list[str] = []
    for cell in cells:
        raw = cell.strip()
        left = raw.startswith(":")
        right = raw.endswith(":")
        if left and right:
            aligns.append("c")
        elif right:
            aligns.append("r")
        else:
            aligns.append("l")
    # Pad to ncols
    while len(aligns) < ncols:
        aligns.append("l")
    return aligns[:ncols]


# ---------------------------------------------------------------------------
# Code block rendering
# ---------------------------------------------------------------------------

# Unicode -> plain-ASCII replacements applied inside verbatim/algorithm
# code blocks (pdflatex cannot handle these characters in verbatim).
_UNICODE_TO_ASCII: dict[str, str] = {
    "\u2190": "<-", "\u2192": "->", "\u21d0": "<=", "\u21d2": "=>",
    "\u2264": "<=", "\u2265": ">=", "\u2260": "!=", "\u2248": "~=",
    "\u2208": " in ", "\u2209": " not in ", "\u2200": "forall ",
    "\u2203": "exists ", "\u2207": "nabla", "\u221e": "inf",
    "\u00b1": "+/-", "\u00d7": "x", "\u00b7": "*", "\u2026": "...",
    "\u03b1": "alpha", "\u03b2": "beta", "\u03b3": "gamma",
    "\u03b4": "delta", "\u03b5": "epsilon", "\u03b6": "zeta",
    "\u03b7": "eta", "\u03b8": "theta", "\u03b9": "iota",
    "\u03ba": "kappa", "\u03bb": "lambda", "\u03bc": "mu",
    "\u03bd": "nu", "\u03be": "xi", "\u03c0": "pi",
    "\u03c1": "rho", "\u03c3": "sigma", "\u03c4": "tau",
    "\u03c5": "upsilon", "\u03c6": "phi", "\u03c7": "chi",
    "\u03c8": "psi", "\u03c9": "omega",
    "\u0394": "Delta", "\u0398": "Theta", "\u039b": "Lambda",
    "\u03a3": "Sigma", "\u03a6": "Phi", "\u03a8": "Psi",
    "\u03a9": "Omega", "\u2113": "ell", "\u2202": "d", "\u222b": "int",
}

# BUG-110: Unicode Greek → LaTeX math replacements for inline text.
# Used in _convert_inline() and _sanitize_latex_output().
_UNICODE_GREEK_TO_LATEX: dict[str, str] = { # Lowercase "\u03b1": "$\\alpha$", "\u03b2": "$\\beta$", "\u03b3": "$\\gamma$", "\u03b4": "$\\delta$", "\u03b5": "$\\epsilon$", "\u03b6": "$\\zeta$", "\u03b7": "$\\eta$", "\u03b8": "$\\theta$", "\u03b9": "$\\iota$", "\u03ba": "$\\kappa$", "\u03bb": "$\\lambda$", "\u03bc": "$\\mu$", "\u03bd": "$\\nu$", "\u03be": "$\\xi$", "\u03c0": "$\\pi$", "\u03c1": "$\\rho$", "\u03c3": "$\\sigma$", "\u03c4": "$\\tau$", "\u03c5": "$\\upsilon$", "\u03c6": "$\\phi$", "\u03c7": "$\\chi$", "\u03c8": "$\\psi$", "\u03c9": "$\\omega$", # Uppercase "\u0393": "$\\Gamma$", "\u0394": "$\\Delta$", "\u0398": "$\\Theta$", "\u039b": "$\\Lambda$", "\u039e": "$\\Xi$", "\u03a0": "$\\Pi$", "\u03a3": "$\\Sigma$", "\u03a6": "$\\Phi$", "\u03a8": "$\\Psi$", "\u03a9": "$\\Omega$", # Common math symbols not already handled "\u2200": "$\\forall$", "\u2203": "$\\exists$", "\u2207": "$\\nabla$", "\u2202": "$\\partial$", "\u2026": "\\ldots{}", "\u22c5": "$\\cdot$", "\u2113": "$\\ell$", "\u222b": "$\\int$", "\u2209": "$\\notin$", # Common symbols that cause null-byte corruption if not converted "\u00b1": "$\\pm$", # ± "\u00d7": "$\\times$", # × "\u2248": "$\\approx$", # ≈ "\u2264": "$\\leq$", # ≤ "\u2265": "$\\geq$", # ≥ "\u2260": "$\\neq$", # ≠ "\u221e": "$\\infty$", # ∞ # Additional symbols found in Runs 49-52 "\u2212": "$-$", # − (minus sign, distinct from hyphen) "\u2282": "$\\subset$", # ⊂ "\u222a": "$\\cup$", # ∪ "\u211d": "$\\mathbb{R}$", # ℝ "\u0302": "\\^{}", # ̂ (combining circumflex) "\u0303": "\\~{}", # ̃ (combining tilde — Run 61 pseudocode) "\u221d": "$\\propto$", # ∝ (proportional to) "\u2208": "$\\in$", # ∈ } _ALGO_KEYWORDS = re.compile( r"\b(Input|Output|Return|While|For|If|Else|Repeat|Until|Function|Procedure|Algorithm)\b", re.IGNORECASE, ) def _escape_algo_line(line: str) -> str: """Escape LaTeX special characters in an algorithmic pseudocode line. 
BUG-177: Raw pseudocode lines contain Python/math syntax that breaks pdflatex: ``#`` (comment char), ``_`` (subscript), ``%`` (comment), ``&`` (alignment), ``{}``, ``~``, ``^``. Strategy: 1. Convert ``# comment`` at end of line → ``\\COMMENT{comment}`` 2. Protect existing LaTeX commands and math delimiters 3. Escape remaining special characters """ # Step 1: Convert Python-style end-of-line comments → \COMMENT{...} # Match `# comment` that isn't at the start of the line (those are full-line comments) _comment_match = re.search(r"(?<=\s)#\s*(.+)$", line) comment_suffix = "" if _comment_match: comment_text = _comment_match.group(1).strip() line = line[: _comment_match.start()].rstrip() comment_suffix = f" \\COMMENT{{{comment_text}}}" elif line.strip().startswith("#"): # Full-line comment comment_text = line.strip().lstrip("#").strip() return f"\\COMMENT{{{comment_text}}}" # Step 2: Protect existing LaTeX commands and math mode from escaping protected: list[str] = [] def _protect(m: re.Match[str]) -> str: idx = len(protected) protected.append(m.group(0)) return f"\x00ALG{idx}\x00" # Protect: \command{...}, $...$, \(...\) line = re.sub(r"\\[a-zA-Z]+\{[^}]*\}", _protect, line) line = re.sub(r"\$[^$]+\$", _protect, line) line = re.sub(r"\\\(.+?\\\)", _protect, line) # Step 3: Escape special characters line = line.replace("&", "\\&") line = line.replace("%", "\\%") line = line.replace("#", "\\#") line = line.replace("_", "\\_") line = line.replace("{", "\\{") line = line.replace("}", "\\}") line = line.replace("~", "\\textasciitilde{}") line = line.replace("^", "\\textasciicircum{}") # Step 4: Restore protected regions for idx, val in enumerate(protected): line = line.replace(f"\x00ALG{idx}\x00", val) return line + comment_suffix def _render_code_block(lang: str, code: str) -> str: """Render a fenced code block as a LaTeX environment. 
IMP-28: Detects pseudocode blocks (language hint 'algorithm' / 'pseudocode', or 3+ algorithm keywords) and renders them inside an ``algorithm`` + ``algorithmic`` environment instead of verbatim. Replaces Unicode characters (Greek letters, arrows, math symbols) with ASCII equivalents so pdflatex can compile the block. """ import unicodedata escaped = code.rstrip("\n") for uni, ascii_eq in _UNICODE_TO_ASCII.items(): escaped = escaped.replace(uni, ascii_eq) # Strip combining characters (tildes, hats, etc.) that break pdflatex escaped = "".join( c for c in escaped if not unicodedata.combining(c) ) # IMP-28: Detect pseudocode and use algorithm environment lang_lower = lang.lower().strip() is_algo = lang_lower in ("algorithm", "pseudocode", "algo") if not is_algo: # Heuristic: ≥3 algorithm keywords → treat as pseudocode is_algo = len(_ALGO_KEYWORDS.findall(escaped)) >= 3 if is_algo: # Extract caption from first comment line if present algo_lines = escaped.split("\n") caption = "Algorithm" if algo_lines and algo_lines[0].strip().startswith("//"): caption = algo_lines[0].strip().lstrip("/ ").strip() algo_lines = algo_lines[1:] # Wrap raw lines in \STATE unless they already use algorithmic commands _algo_cmds = {"\\STATE", "\\IF", "\\ELSE", "\\ELSIF", "\\ENDIF", "\\FOR", "\\ENDFOR", "\\WHILE", "\\ENDWHILE", "\\REPEAT", "\\UNTIL", "\\RETURN", "\\REQUIRE", "\\ENSURE"} wrapped_lines = [] for al in algo_lines: stripped = al.strip() if not stripped: continue if any(stripped.startswith(cmd) for cmd in _algo_cmds): wrapped_lines.append(stripped) else: # BUG-177: Escape LaTeX special chars in pseudocode lines wrapped_lines.append(f"\\STATE {_escape_algo_line(stripped)}") body = "\n".join(wrapped_lines) return ( "\\begin{algorithm}[ht]\n" f"\\caption{{{_convert_inline(caption)}}}\n" "\\begin{algorithmic}[1]\n" f"{body}\n" "\\end{algorithmic}\n" "\\end{algorithm}" ) return f"\\begin{{verbatim}}\n{escaped}\n\\end{{verbatim}}" # 
--------------------------------------------------------------------------- # Figure rendering # --------------------------------------------------------------------------- def _render_figure(caption: str, path: str) -> str: """Render a markdown image as a LaTeX figure environment.""" fig_num = _next_figure_num() # Sanitize path for LaTeX: replace spaces, keep underscores path = path.replace(" ", "_") cap_tex = _convert_inline(caption) if caption else f"Figure {fig_num}" label_key = re.sub(r"[^a-z0-9]+", "_", caption.lower()).strip("_")[:30] if not label_key: label_key = str(fig_num) return ( "\\begin{figure}[t]\n" "\\centering\n" f"\\includegraphics[width=0.95\\columnwidth]{{{path}}}\n" f"\\caption{{{cap_tex}}}\n" f"\\label{{fig:{label_key}}}\n" "\\end{figure}" ) # --------------------------------------------------------------------------- # Inline conversion # --------------------------------------------------------------------------- # Order matters: process bold before italic to avoid conflicts. _BOLD_RE = re.compile(r"\*\*(.+?)\*\*") _ITALIC_RE = re.compile(r"(? str: """Convert inline Markdown formatting to LaTeX. 
Preserves: - Inline math ``\\(...\\)`` and ``$...$`` - ``\\cite{...}`` references - Display math markers (already handled at block level) """ # Normalize Unicode punctuation to LaTeX equivalents text = text.replace("\u2014", "---") # em-dash — text = text.replace("\u2013", "--") # en-dash – text = text.replace("\u201c", "``") # left double quote " text = text.replace("\u201d", "''") # right double quote " text = text.replace("\u2018", "`") # left single quote ' text = text.replace("\u2019", "'") # right single quote ' text = text.replace("\u00b1", "$\\pm$") # ± text = text.replace("\u2248", "$\\approx$") # ≈ text = text.replace("\u2264", "$\\leq$") # ≤ text = text.replace("\u2265", "$\\geq$") # ≥ text = text.replace("\u2192", "$\\rightarrow$") # → text = text.replace("\u2190", "$\\leftarrow$") # ← text = text.replace("\u00d7", "$\\times$") # × text = text.replace("\u2260", "$\\neq$") # ≠ text = text.replace("\u2208", "$\\in$") # ∈ text = text.replace("\u221e", "$\\infty$") # ∞ # BUG-110: Replace Unicode Greek letters with LaTeX math equivalents. # These appear when LLMs emit raw Unicode (e.g. "ε-greedy" instead of # "$\epsilon$-greedy") and cause fatal pdflatex errors. for _uchar, _lcmd in _UNICODE_GREEK_TO_LATEX.items(): if _uchar in text: text = text.replace(_uchar, _lcmd) # Protect math and cite from escaping protected: list[str] = [] def _protect(m: re.Match[str]) -> str: idx = len(protected) protected.append(m.group(0)) return f"\x00PROT{idx}\x00" # Protect inline math: \(...\) and $...$ text = re.sub(r"\\\(.+?\\\)", _protect, text) text = re.sub(r"(? 
str: href = f"\\href{{{m.group(2)}}}{{{m.group(1)}}}" idx = len(protected) protected.append(href) return f"\x00PROT{idx}\x00" text = _LINK_RE.sub(_convert_and_protect_link, text) # Escape special LaTeX characters text = _LATEX_SPECIAL.sub(r"\\\1", text) text = _LATEX_TILDE.sub(r"\\textasciitilde{}", text) text = _LATEX_CARET.sub(r"\\textasciicircum{}", text) text = _LATEX_DOLLAR.sub(r"\\$", text) # Convert bold **text** → \textbf{text} text = _BOLD_RE.sub(r"\\textbf{\1}", text) # Convert italic *text* → \textit{text} text = _ITALIC_RE.sub(r"\\textit{\1}", text) # Convert inline code `text` → \texttt{text} text = _INLINE_CODE_RE.sub(r"\\texttt{\1}", text) # Links and images were already converted+protected before escaping. # Fallback: convert any remaining [cite_key] patterns to \cite{key} # This catches citations that were not converted upstream. # BUG-32 fix: key pattern must also match author2017keyword style keys # (e.g., roijers2017multiobjective, abels2019dynamic) _CITE_KEY_PAT = r"[a-zA-Z][a-zA-Z0-9_-]*\d{4}[a-zA-Z0-9_]*" text = re.sub( rf"\[({_CITE_KEY_PAT}(?:\s*,\s*{_CITE_KEY_PAT})*)\]", r"\\cite{\1}", text, ) # Restore protected segments in reverse order so that nested # markers (e.g. PROT0 inside PROT1's value) are resolved correctly. 
    # Restore in reverse index order so nested markers resolve correctly
    # (a PROT0 marker embedded inside PROT1's value is expanded after PROT1).
    for idx in range(len(protected) - 1, -1, -1):
        text = text.replace(f"\x00PROT{idx}\x00", protected[idx])
    return text


# ---------------------------------------------------------------------------
# Completeness checking (R10-Fix5)
# ---------------------------------------------------------------------------

# Canonical section names a conference paper is expected to contain.
_EXPECTED_SECTIONS = {
    "introduction",
    "related work",
    "method",
    "experiment",
    "result",
    "discussion",
    "conclusion",
}

# Maps common heading variants to their canonical name in _EXPECTED_SECTIONS.
_SECTION_ALIASES: dict[str, str] = {
    "methodology": "method",
    "methods": "method",
    "proposed method": "method",
    "approach": "method",
    "experiments": "experiment",
    "experimental setup": "experiment",
    "experimental results": "result",
    "results": "result",
    "results and discussion": "result",
    "results and analysis": "result",
    "discussion and results": "result",
    "conclusions": "conclusion",
    "conclusion and future work": "conclusion",
    "summary": "conclusion",
    "background": "related work",
    "literature review": "related work",
    "prior work": "related work",
}


def check_paper_completeness(sections: list[_Section]) -> list[str]:
    """Check whether a paper contains all expected sections.

    Runs a battery of structural checks over the parsed sections: title
    presence, expected/required section coverage, abstract length and
    quality, truncation markers, and total / per-section word counts.

    Parameters
    ----------
    sections:
        Parsed paper sections; each exposes ``level``, ``heading``,
        ``heading_lower`` and ``body``.

    Returns
    -------
    list[str]
        Human-readable warning strings. An empty list means the paper
        structure looks complete.
    """
    warnings: list[str] = []

    # Check for valid title — look for any H1/H2 heading that could be a title
    # (i.e. a heading that is not one of the standard section names).
    _has_title = any(
        sec.level in (1, 2)
        and sec.heading_lower
        not in ("abstract", "introduction", "related work", "method", "methods",
                "methodology", "experiments", "results", "discussion",
                "conclusion", "limitations", "references")
        for sec in sections
    )
    if not _has_title:
        warnings.append(
            "No valid title found in paper. The output may lack proper heading structure."
        )

    # Collect which expected sections are present: exact match first, then
    # alias lookup, then substring match (e.g. "experimental evaluation").
    found_sections: set[str] = set()
    section_headings: list[str] = []
    for sec in sections:
        if sec.level in (1, 2) and sec.heading:
            heading_lower = sec.heading.strip().lower()
            section_headings.append(heading_lower)
            if heading_lower in _EXPECTED_SECTIONS:
                found_sections.add(heading_lower)
            elif heading_lower in _SECTION_ALIASES:
                found_sections.add(_SECTION_ALIASES[heading_lower])
            else:
                for expected in _EXPECTED_SECTIONS:
                    if expected in heading_lower:
                        found_sections.add(expected)
                        break
    missing = _EXPECTED_SECTIONS - found_sections
    if missing:
        warnings.append(
            f"Missing sections: {', '.join(sorted(missing))}. "
            f"Found: {', '.join(section_headings)}"
        )

    # T2.5: Check for required conference sections (NeurIPS/ICLR mandate Limitations)
    _required_extras = {"limitations"}
    _extra_aliases = {
        "limitation": "limitations",
        "limitations and future work": "limitations",
        "limitations and broader impact": "limitations",
    }
    found_extras: set[str] = set()
    for sec in sections:
        if sec.level in (1, 2) and sec.heading:
            hl = sec.heading.strip().lower()
            if hl in _required_extras:
                found_extras.add(hl)
            elif hl in _extra_aliases:
                found_extras.add(_extra_aliases[hl])
            elif "limitation" in hl:
                # Catch-all: any heading mentioning "limitation" counts.
                found_extras.add("limitations")
    missing_extras = _required_extras - found_extras
    if missing_extras:
        warnings.append(
            f"Missing required sections for NeurIPS/ICLR: "
            f"{', '.join(sorted(missing_extras))}."
        )

    # T1.5: Abstract length and quality checks
    abstract_text = ""
    for sec in sections:
        if sec.heading_lower == "abstract":
            abstract_text = sec.body
            break
    if abstract_text:
        word_count = len(abstract_text.split())
        if word_count > 300:
            warnings.append(
                f"Abstract is {word_count} words (conference limit: 150-250). "
                f"Must be shortened."
            )
        elif word_count < 150:
            warnings.append(
                f"Abstract is only {word_count} words (expected 150-250 for conferences)."
            )
        # Detect raw variable names / metric key dumps (e.g. "eval/loss_total=")
        raw_vars = re.findall(r"\b\w+_\w+/\w+(?:_\w+)*\s*=", abstract_text)
        if raw_vars:
            warnings.append(
                f"Abstract contains raw variable names: {raw_vars[:3]}. "
                f"Replace with human-readable descriptions."
            )

    # Detect truncation markers left behind by an LLM that stopped early.
    all_body = " ".join(sec.body for sec in sections)
    truncation_markers = [
        "further sections continue",
        "remaining sections unchanged",
        "sections continue unchanged",
        "content continues",
        "[to be continued]",
        "[remaining content]",
    ]
    for marker in truncation_markers:
        if marker in all_body.lower():
            warnings.append(
                f"Truncation marker detected: '{marker}'. "
                f"Paper content may be incomplete."
            )

    # Word count check
    total_words = sum(len(sec.body.split()) for sec in sections)
    if total_words < 2000:
        warnings.append(
            f"Paper body is only {total_words} words "
            f"(expected 5,000-6,500 for conference paper). "
            f"Content may be severely truncated."
        )

    # Per-section word count check (safety net during LaTeX conversion)
    # Local import to avoid a module-level import cycle with prompts —
    # NOTE(review): presumed reason; confirm against researchclaw.prompts.
    from researchclaw.prompts import SECTION_WORD_TARGETS, _SECTION_TARGET_ALIASES
    for sec in sections:
        if sec.level not in (1, 2) or not sec.heading:
            continue
        canon = sec.heading_lower
        if canon not in SECTION_WORD_TARGETS:
            canon = _SECTION_TARGET_ALIASES.get(sec.heading_lower, "")
        if not canon or canon not in SECTION_WORD_TARGETS:
            continue
        lo, hi = SECTION_WORD_TARGETS[canon]
        wc = len(sec.body.split())
        # Flag only sections far outside the target band (60% low / 150% high).
        if wc < int(lo * 0.6):
            warnings.append(
                f"Section '{sec.heading}' is only {wc} words "
                f"(expected {lo}-{hi}). Content may be severely truncated."
            )
        elif wc > int(hi * 1.5):
            warnings.append(
                f"Section '{sec.heading}' is {wc} words "
                f"(expected {lo}-{hi}). Consider trimming."
) # Bullet density check for body sections _bullet_re_cc = re.compile(r"^\s*[-*]\s+", re.MULTILINE) _numbered_re_cc = re.compile(r"^\s*\d+\.\s+", re.MULTILINE) _bullet_ok_sections = {"introduction", "limitations", "limitation", "abstract"} for sec in sections: if sec.level not in (1, 2) or not sec.heading: continue hl = sec.heading_lower if hl in _bullet_ok_sections: continue if not sec.body: continue total_lines = len([ln for ln in sec.body.splitlines() if ln.strip()]) if total_lines < 4: continue bullet_count = ( len(_bullet_re_cc.findall(sec.body)) + len(_numbered_re_cc.findall(sec.body)) ) density = bullet_count / total_lines if density > 0.30: warnings.append( f"Section '{sec.heading}' has high bullet-point density " f"({bullet_count}/{total_lines} lines = {density:.0%}). " f"Conference papers should use flowing prose." ) return warnings def _escape_latex(text: str) -> str: """Escape LaTeX special characters in plain text (titles, headings). Does NOT escape inside math delimiters or \\commands. """ # Protect math first protected: list[str] = [] def _protect(m: re.Match[str]) -> str: idx = len(protected) protected.append(m.group(0)) return f"\x00PROT{idx}\x00" text = re.sub(r"\\\(.+?\\\)", _protect, text) text = re.sub(r"(? list[LatexTable]: """Generate LaTeX tables from a VerifiedRegistry. Parameters ---------- registry: The verified registry built from experiment data. metric_name: Human-readable name for the primary metric column. metric_direction: ``"maximize"`` or ``"minimize"`` — determines which result is bolded. two_column: If True, use ``table*`` environment (for 2-column formats like ICML). Returns ------- list[LatexTable] One or more tables. Usually just one main results table. 
""" tables: list[LatexTable] = [] # --- Main results table --- conditions = _get_reportable_conditions(registry) if not conditions: logger.warning("No reportable conditions — skipping table generation") return tables main_table = _build_main_table( conditions, metric_name=metric_name, metric_direction=metric_direction, two_column=two_column, ) tables.append(main_table) # --- Per-seed breakdown table (if seeds > 1 for any condition) --- has_multi_seed = any(c.n_seeds >= 2 for c in conditions) if has_multi_seed: seed_table = _build_per_seed_table( conditions, metric_name=metric_name, two_column=two_column, ) tables.append(seed_table) return tables def _get_reportable_conditions(registry: VerifiedRegistry) -> list[ConditionResult]: """Filter conditions to only those with at least 1 valid seed.""" results = [] for cond in registry.conditions.values(): if cond.n_seeds >= 1 and cond.mean is not None and math.isfinite(cond.mean): results.append(cond) # Sort alphabetically for consistency results.sort(key=lambda c: c.name) return results def _build_main_table( conditions: list[ConditionResult], *, metric_name: str, metric_direction: str, two_column: bool, ) -> LatexTable: """Build the main results table with mean ± std per condition.""" verified: set[float] = set() # Find best condition for bolding best_idx = _find_best(conditions, metric_direction) # Build rows rows: list[str] = [] for i, cond in enumerate(conditions): mean_str = _fmt(cond.mean) if cond.mean is not None: verified.add(round(cond.mean, 4)) if cond.std is not None and cond.std > 0 and cond.n_seeds >= 2: std_str = _fmt(cond.std) val_str = f"{mean_str} $\\pm$ {std_str}" verified.add(round(cond.std, 4)) elif cond.n_seeds == 1: val_str = f"{mean_str}$^{{\\ddagger}}$" else: val_str = mean_str if i == best_idx: val_str = f"\\textbf{{{val_str}}}" n_str = str(cond.n_seeds) name_escaped = _escape_latex(cond.name) rows.append(f"{name_escaped} & {val_str} & {n_str} \\\\") # Compose table table_env = "table*" if 
two_column else "table" col_spec = "l c r" body = "\n".join(rows) note_lines = [] if any(c.n_seeds == 1 for c in conditions): note_lines.append( "$^{\\ddagger}$Single seed; no standard deviation available." ) notes = "\n".join(note_lines) if notes: notes = f"\n\\vspace{{2pt}}\\par\\footnotesize {notes}\n" latex = ( f"\\begin{{{table_env}}}[htbp]\n" f"\\centering\n" f"\\caption{{Experimental results. " f"{len(conditions)} conditions evaluated.}}\n" f"\\label{{tab:main_results}}\n" f"% AUTO-GENERATED FROM EXPERIMENT DATA — DO NOT MODIFY NUMBERS\n" f"\\begin{{tabular}}{{{col_spec}}}\n" f"\\toprule\n" f"Method & {metric_name} & $n$ \\\\\n" f"\\midrule\n" f"{body}\n" f"\\bottomrule\n" f"\\end{{tabular}}{notes}\n" f"\\end{{{table_env}}}" ) return LatexTable( label="tab:main_results", caption=f"Experimental results. {len(conditions)} conditions evaluated.", latex_code=latex, verified_values=verified, n_conditions=len(conditions), n_total_seeds=sum(c.n_seeds for c in conditions), ) def _build_per_seed_table( conditions: list[ConditionResult], *, metric_name: str, two_column: bool, ) -> LatexTable: """Build per-seed breakdown table.""" verified: set[float] = set() # Determine max seeds across conditions max_seeds = max(c.n_seeds for c in conditions) # Build header seed_cols = " & ".join(f"Seed {i}" for i in range(max_seeds)) col_spec = "l " + " ".join("r" for _ in range(max_seeds)) + " r" # Build rows rows: list[str] = [] for cond in conditions: name_escaped = _escape_latex(cond.name) cells = [] for seed_idx in range(max_seeds): val = cond.per_seed_values.get(seed_idx) if val is not None and math.isfinite(val): cells.append(_fmt(val)) verified.add(round(val, 4)) else: cells.append("---") mean_str = _fmt(cond.mean) if cond.mean is not None else "---" cells_str = " & ".join(cells) rows.append(f"{name_escaped} & {cells_str} & {mean_str} \\\\") body = "\n".join(rows) table_env = "table*" if two_column else "table" latex = ( f"\\begin{{{table_env}}}[htbp]\n" f"\\centering\n" 
f"\\caption{{Per-seed results breakdown.}}\n" f"\\label{{tab:per_seed}}\n" f"% AUTO-GENERATED FROM EXPERIMENT DATA — DO NOT MODIFY NUMBERS\n" f"\\begin{{tabular}}{{{col_spec}}}\n" f"\\toprule\n" f"Method & {seed_cols} & Mean \\\\\n" f"\\midrule\n" f"{body}\n" f"\\bottomrule\n" f"\\end{{tabular}}\n" f"\\end{{{table_env}}}" ) return LatexTable( label="tab:per_seed", caption="Per-seed results breakdown.", latex_code=latex, verified_values=verified, n_conditions=len(conditions), n_total_seeds=sum(c.n_seeds for c in conditions), ) def build_condition_whitelist(registry: VerifiedRegistry) -> str: """Generate a human-readable condition whitelist for the LLM prompt. Example output:: CONDITION WHITELIST (you may ONLY discuss these conditions): - DQN (3 seeds, mean=206.10) - DQN+Abstraction (3 seeds, mean=278.93) - DQN+RawCount (3 seeds, mean=180.80) """ lines = ["CONDITION WHITELIST (you may ONLY discuss these conditions):"] for cond in sorted(registry.conditions.values(), key=lambda c: c.name): if cond.n_seeds == 0 or cond.mean is None or not math.isfinite(cond.mean): continue mean_str = f"{cond.mean:.4f}" lines.append(f"- {cond.name} ({cond.n_seeds} seed(s), mean={mean_str})") if len(lines) == 1: lines.append("- (no conditions completed)") return "\n".join(lines) def _find_best(conditions: list[ConditionResult], direction: str) -> int | None: """Return index of best condition, or None if empty.""" if not conditions: return None best_idx = 0 for i, c in enumerate(conditions): if c.mean is None: continue if conditions[best_idx].mean is None: best_idx = i continue if direction == "maximize" and c.mean > conditions[best_idx].mean: best_idx = i elif direction == "minimize" and c.mean < conditions[best_idx].mean: best_idx = i return best_idx def _fmt(value: float | None) -> str: """Format a number for LaTeX tables with sig-fig-aware rounding.""" if value is None or not math.isfinite(value): return "---" # Sig-fig-aware formatting (same approach as BUG-83 fix) av = abs(value) if 
av >= 100: return f"{value:.2f}" elif av >= 1: return f"{value:.4f}" elif av >= 0.001: return f"{value:.4f}" elif av > 0: # Very small values: use 2 significant figures import decimal d = decimal.Decimal(str(value)).normalize() # Count leading zeros after decimal point exp = d.adjusted() sig_digits = max(2, -exp + 1) return f"{value:.{sig_digits}f}" else: return "0.0000" def _escape_latex(text: str) -> str: """Escape special LaTeX characters in condition names.""" # Backslash must be first to avoid double-escaping replacements = [ ("\\", "\\textbackslash{}"), ("&", "\\&"), ("%", "\\%"), ("#", "\\#"), ("_", "\\_"), ("$", "\\$"), ("{", "\\{"), ("}", "\\}"), ("~", "\\textasciitilde{}"), ("^", "\\textasciicircum{}"), ] for old, new in replacements: text = text.replace(old, new) return text ================================================ FILE: researchclaw/templates/styles/iclr_2025/iclr2025_conference.bst ================================================ %% iclr2025_conference.bst — ICLR 2025 bibliography style %% Symlink-equivalent to iclr2026_conference.bst (same format). %% Bundled by AutoResearchClaw for offline compilation. 
ENTRY { author title journal booktitle year volume number pages doi url note publisher address edition eprint archiveprefix primaryclass } {} { label } INTEGERS { output.state before.all mid.sentence after.sentence after.block } FUNCTION {init.state.consts} { #0 'before.all := #1 'mid.sentence := #2 'after.sentence := #3 'after.block := } STRINGS { s t } FUNCTION {output.nonnull} { 's := output.state mid.sentence = { ", " * write$ } { output.state after.block = { add.period$ write$ newline$ "\newblock " write$ } { output.state before.all = 'write$ { add.period$ " " * write$ } if$ } if$ mid.sentence 'output.state := } if$ s } FUNCTION {output} { duplicate$ empty$ 'pop$ 'output.nonnull if$ } FUNCTION {output.check} { 't := duplicate$ empty$ { pop$ "empty " t * " in " * cite$ * warning$ } 'output.nonnull if$ } FUNCTION {fin.entry} { add.period$ write$ newline$ } FUNCTION {new.block} { output.state before.all = 'skip$ { after.block 'output.state := } if$ } FUNCTION {not} { { #0 } { #1 } if$ } FUNCTION {and} { 'skip$ { pop$ #0 } if$ } FUNCTION {or} { { pop$ #1 } 'skip$ if$ } FUNCTION {field.or.null} { duplicate$ empty$ { pop$ "" } 'skip$ if$ } FUNCTION {emphasize} { duplicate$ empty$ { pop$ "" } { "\emph{" swap$ * "}" * } if$ } INTEGERS { nameptr namesleft numnames } FUNCTION {format.names} { 's := #1 'nameptr := s num.names$ 'numnames := numnames 'namesleft := { namesleft #0 > } { s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't := nameptr #1 > { namesleft #1 > { ", " * t * } { numnames #2 > { "," * } 'skip$ if$ t "others" = { " et~al." 
* } { " and " * t * } if$ } if$ } 't if$ nameptr #1 + 'nameptr := namesleft #1 - 'namesleft := } while$ } FUNCTION {format.authors} { author empty$ { "" } { author format.names } if$ } FUNCTION {format.title} { title empty$ { "" } { title } if$ } FUNCTION {format.btitle} { title emphasize } FUNCTION {format.date} { year empty$ { "" } { year } if$ } FUNCTION {format.bvolume} { volume empty$ { "" } { "volume " volume * } if$ } FUNCTION {format.pages} { pages empty$ { "" } { "pp. " pages * } if$ } FUNCTION {format.url} { url empty$ { "" } { "\url{" url * "}" * } if$ } FUNCTION {output.bibitem} { newline$ "\bibitem{" write$ cite$ write$ "}" write$ newline$ "" before.all 'output.state := } FUNCTION {article} { output.bibitem format.authors "author" output.check new.block format.title "title" output.check new.block journal emphasize "journal" output.check format.bvolume output format.pages output format.date "year" output.check format.url output fin.entry } FUNCTION {inproceedings} { output.bibitem format.authors "author" output.check new.block format.title "title" output.check new.block "In " booktitle emphasize * output format.pages output format.date "year" output.check format.url output fin.entry } FUNCTION {conference} { inproceedings } FUNCTION {book} { output.bibitem format.authors "author" output.check new.block format.btitle "title" output.check publisher output format.date "year" output.check fin.entry } FUNCTION {misc} { output.bibitem format.authors output new.block format.title output new.block note output format.date output format.url output fin.entry } FUNCTION {techreport} { misc } FUNCTION {phdthesis} { misc } FUNCTION {mastersthesis} { misc } FUNCTION {unpublished} { misc } FUNCTION {default.type} { misc } READ FUNCTION {sortify} { purify$ "l" change.case$ } FUNCTION {presort} { cite$ 'label := label sortify " " * #1 entry.max$ substring$ 'sort.key$ := } ITERATE {presort} SORT FUNCTION {begin.bib} { preamble$ empty$ 'skip$ { preamble$ write$ newline$ } 
if$ "\begin{thebibliography}{99}" write$ newline$ } FUNCTION {end.bib} { newline$ "\end{thebibliography}" write$ newline$ } EXECUTE {begin.bib} EXECUTE {init.state.consts} ITERATE {call.type$} EXECUTE {end.bib} ================================================ FILE: researchclaw/templates/styles/iclr_2025/iclr2025_conference.sty ================================================ % iclr2025_conference.sty — ICLR 2025 conference style file % Based on the official ICLR submission template structure. % Bundled by AutoResearchClaw for offline compilation. % Official source: https://github.com/ICLR/Master-Template/raw/master/iclr2025.zip \NeedsTeXFormat{LaTeX2e} \ProvidesPackage{iclr2025_conference}[2025/01/15 ICLR 2025 conference style] \newif\if@iclr@final \@iclr@finalfalse \newif\if@iclr@preprint \@iclr@preprintfalse \DeclareOption{final}{\@iclr@finaltrue} \DeclareOption{preprint}{\@iclr@preprinttrue} \ProcessOptions\relax \RequirePackage{geometry} \geometry{textwidth=5.5in,textheight=9.0in,top=1.0in,headheight=12pt,headsep=25pt,footskip=30pt} \RequirePackage{times} \renewcommand{\baselinestretch}{1.0} \setlength{\parskip}{0pt} \setlength{\parindent}{1em} \renewcommand{\section}{\@startsection{section}{1}{0mm}{-2.0ex plus -0.5ex minus -.2ex}{1.0ex plus .2ex}{\normalfont\large\bfseries}} \renewcommand{\subsection}{\@startsection{subsection}{2}{0mm}{-1.5ex plus -0.5ex minus -.2ex}{0.8ex plus .2ex}{\normalfont\normalsize\bfseries}} \renewcommand{\subsubsection}{\@startsection{subsubsection}{3}{0mm}{-1.0ex plus -0.5ex minus -.2ex}{0.5ex plus .2ex}{\normalfont\normalsize\bfseries}} \def\@maketitle{% \vbox to 0pt{}\vskip -0.5in \begin{center}% {\LARGE\bfseries \@title \par}\vskip 0.3in \if@iclr@final {\large\lineskip .5em\begin{tabular}[t]{c}\@author\end{tabular}\par}% \else {\large Anonymous authors\par}{\normalsize Paper under double-blind review\par}% \fi \vskip 0.3in \end{center}\par\vskip 0.5em } \renewenvironment{abstract}{\centerline{\large\bfseries 
Abstract}\vspace{0.5ex}\begin{quote}}{\par\end{quote}\vskip 1ex} \RequirePackage{fancyhdr} \pagestyle{fancy}\fancyhf{} \fancyfoot[C]{\thepage} \renewcommand{\headrulewidth}{0pt} \RequirePackage[numbers,sort&compress]{natbib} \endinput ================================================ FILE: researchclaw/templates/styles/iclr_2026/iclr2026_conference.bst ================================================ %% iclr2026_conference.bst — ICLR 2026 bibliography style %% Bundled by AutoResearchClaw for offline compilation. %% This is a minimal numeric bibliography style compatible with natbib. %% For full-fidelity formatting, download from https://github.com/ICLR/Master-Template ENTRY { author title journal booktitle year volume number pages doi url note publisher address edition eprint archiveprefix primaryclass } {} { label } INTEGERS { output.state before.all mid.sentence after.sentence after.block } FUNCTION {init.state.consts} { #0 'before.all := #1 'mid.sentence := #2 'after.sentence := #3 'after.block := } STRINGS { s t } FUNCTION {output.nonnull} { 's := output.state mid.sentence = { ", " * write$ } { output.state after.block = { add.period$ write$ newline$ "\newblock " write$ } { output.state before.all = 'write$ { add.period$ " " * write$ } if$ } if$ mid.sentence 'output.state := } if$ s } FUNCTION {output} { duplicate$ empty$ 'pop$ 'output.nonnull if$ } FUNCTION {output.check} { 't := duplicate$ empty$ { pop$ "empty " t * " in " * cite$ * warning$ } 'output.nonnull if$ } FUNCTION {fin.entry} { add.period$ write$ newline$ } FUNCTION {new.block} { output.state before.all = 'skip$ { after.block 'output.state := } if$ } FUNCTION {not} { { #0 } { #1 } if$ } FUNCTION {and} { 'skip$ { pop$ #0 } if$ } FUNCTION {or} { { pop$ #1 } 'skip$ if$ } FUNCTION {field.or.null} { duplicate$ empty$ { pop$ "" } 'skip$ if$ } FUNCTION {emphasize} { duplicate$ empty$ { pop$ "" } { "\emph{" swap$ * "}" * } if$ } INTEGERS { nameptr namesleft numnames } FUNCTION {format.names} { 's := #1 
'nameptr := s num.names$ 'numnames := numnames 'namesleft := { namesleft #0 > } { s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't := nameptr #1 > { namesleft #1 > { ", " * t * } { numnames #2 > { "," * } 'skip$ if$ t "others" = { " et~al." * } { " and " * t * } if$ } if$ } 't if$ nameptr #1 + 'nameptr := namesleft #1 - 'namesleft := } while$ } FUNCTION {format.authors} { author empty$ { "" } { author format.names } if$ } FUNCTION {format.title} { title empty$ { "" } { title } if$ } FUNCTION {format.btitle} { title emphasize } FUNCTION {format.date} { year empty$ { "" } { year } if$ } FUNCTION {format.bvolume} { volume empty$ { "" } { "volume " volume * } if$ } FUNCTION {format.pages} { pages empty$ { "" } { "pp. " pages * } if$ } FUNCTION {format.url} { url empty$ { "" } { "\url{" url * "}" * } if$ } FUNCTION {output.bibitem} { newline$ "\bibitem{" write$ cite$ write$ "}" write$ newline$ "" before.all 'output.state := } FUNCTION {article} { output.bibitem format.authors "author" output.check new.block format.title "title" output.check new.block journal emphasize "journal" output.check format.bvolume output format.pages output format.date "year" output.check format.url output fin.entry } FUNCTION {inproceedings} { output.bibitem format.authors "author" output.check new.block format.title "title" output.check new.block "In " booktitle emphasize * output format.pages output format.date "year" output.check format.url output fin.entry } FUNCTION {conference} { inproceedings } FUNCTION {book} { output.bibitem format.authors "author" output.check new.block format.btitle "title" output.check publisher output format.date "year" output.check fin.entry } FUNCTION {misc} { output.bibitem format.authors output new.block format.title output new.block note output format.date output format.url output fin.entry } FUNCTION {techreport} { misc } FUNCTION {phdthesis} { misc } FUNCTION {mastersthesis} { misc } FUNCTION {unpublished} { misc } FUNCTION {default.type} { misc } READ 
FUNCTION {sortify} { purify$ "l" change.case$ } FUNCTION {presort} { cite$ 'label := label sortify " " * #1 entry.max$ substring$ 'sort.key$ := } ITERATE {presort} SORT FUNCTION {begin.bib} { preamble$ empty$ 'skip$ { preamble$ write$ newline$ } if$ "\begin{thebibliography}{99}" write$ newline$ } FUNCTION {end.bib} { newline$ "\end{thebibliography}" write$ newline$ } EXECUTE {begin.bib} EXECUTE {init.state.consts} ITERATE {call.type$} EXECUTE {end.bib} ================================================ FILE: researchclaw/templates/styles/iclr_2026/iclr2026_conference.sty ================================================ % iclr2026_conference.sty — ICLR 2026 conference style file % Based on the official ICLR submission template structure. % Bundled by AutoResearchClaw for offline compilation. % Official source: https://github.com/ICLR/Master-Template \NeedsTeXFormat{LaTeX2e} \ProvidesPackage{iclr2026_conference}[2026/01/15 ICLR 2026 conference style] % ── Options ────────────────────────────────────────────────────────── \newif\if@iclr@final \@iclr@finalfalse \newif\if@iclr@preprint \@iclr@preprintfalse \DeclareOption{final}{\@iclr@finaltrue} \DeclareOption{preprint}{\@iclr@preprinttrue} \ProcessOptions\relax % ── Page geometry ──────────────────────────────────────────────────── \RequirePackage{geometry} \geometry{ textwidth=5.5in, textheight=9.0in, top=1.0in, headheight=12pt, headsep=25pt, footskip=30pt, } % ── Fonts ──────────────────────────────────────────────────────────── \RequirePackage{times} % ── Spacing ────────────────────────────────────────────────────────── \renewcommand{\baselinestretch}{1.0} \setlength{\parskip}{0pt} \setlength{\parindent}{1em} % ── Section formatting ─────────────────────────────────────────────── \renewcommand{\section}{\@startsection {section}{1}{0mm}{-2.0ex plus -0.5ex minus -.2ex}% {1.0ex plus .2ex}{\normalfont\large\bfseries}} \renewcommand{\subsection}{\@startsection {subsection}{2}{0mm}{-1.5ex plus -0.5ex minus -.2ex}% {0.8ex 
plus .2ex}{\normalfont\normalsize\bfseries}} \renewcommand{\subsubsection}{\@startsection {subsubsection}{3}{0mm}{-1.0ex plus -0.5ex minus -.2ex}% {0.5ex plus .2ex}{\normalfont\normalsize\bfseries}} % ── Title formatting ──────────────────────────────────────────────── \def\@maketitle{% \vbox to 0pt{}% \vskip -0.5in \begin{center}% {\LARGE\bfseries \@title \par}% \vskip 0.3in \if@iclr@final {\large \lineskip .5em \begin{tabular}[t]{c}% \@author \end{tabular}\par}% \else {\large Anonymous authors\par}% {\normalsize Paper under double-blind review\par}% \fi \vskip 0.3in \end{center}% \par \vskip 0.5em } % ── Abstract ───────────────────────────────────────────────────────── \renewenvironment{abstract}{% \centerline{\large\bfseries Abstract}% \vspace{0.5ex}% \begin{quote}% }{% \par \end{quote}% \vskip 1ex } % ── Headers ────────────────────────────────────────────────────────── \RequirePackage{fancyhdr} \pagestyle{fancy} \fancyhf{} \if@iclr@final \fancyhead[C]{Published as a conference paper at ICLR 2026} \else \fancyhead[C]{} \fi \fancyfoot[C]{\thepage} \renewcommand{\headrulewidth}{0pt} % ── Natbib ─────────────────────────────────────────────────────────── \RequirePackage[numbers,sort&compress]{natbib} \endinput ================================================ FILE: researchclaw/templates/styles/icml_2025/icml2025.bst ================================================ %% icml2025.bst — ICML 2025 bibliography style %% Bundled by AutoResearchClaw for offline compilation. %% Identical format to icml2026.bst. 
ENTRY { author title journal booktitle year volume number pages doi url note publisher address edition eprint archiveprefix primaryclass } {} { label } INTEGERS { output.state before.all mid.sentence after.sentence after.block } FUNCTION {init.state.consts} { #0 'before.all := #1 'mid.sentence := #2 'after.sentence := #3 'after.block := } STRINGS { s t } FUNCTION {output.nonnull} { 's := output.state mid.sentence = { ", " * write$ } { output.state after.block = { add.period$ write$ newline$ "\newblock " write$ } { output.state before.all = 'write$ { add.period$ " " * write$ } if$ } if$ mid.sentence 'output.state := } if$ s } FUNCTION {output} { duplicate$ empty$ 'pop$ 'output.nonnull if$ } FUNCTION {output.check} { 't := duplicate$ empty$ { pop$ "empty " t * " in " * cite$ * warning$ } 'output.nonnull if$ } FUNCTION {fin.entry} { add.period$ write$ newline$ } FUNCTION {new.block} { output.state before.all = 'skip$ { after.block 'output.state := } if$ } FUNCTION {not} { { #0 } { #1 } if$ } FUNCTION {and} { 'skip$ { pop$ #0 } if$ } FUNCTION {or} { { pop$ #1 } 'skip$ if$ } FUNCTION {field.or.null} { duplicate$ empty$ { pop$ "" } 'skip$ if$ } FUNCTION {emphasize} { duplicate$ empty$ { pop$ "" } { "\emph{" swap$ * "}" * } if$ } INTEGERS { nameptr namesleft numnames } FUNCTION {format.names} { 's := #1 'nameptr := s num.names$ 'numnames := numnames 'namesleft := { namesleft #0 > } { s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't := nameptr #1 > { namesleft #1 > { ", " * t * } { numnames #2 > { "," * } 'skip$ if$ t "others" = { " et~al." 
* } { " and " * t * } if$ } if$ } 't if$ nameptr #1 + 'nameptr := namesleft #1 - 'namesleft := } while$ } FUNCTION {format.authors} { author empty$ { "" } { author format.names } if$ } FUNCTION {format.title} { title empty$ { "" } { title } if$ } FUNCTION {format.btitle} { title emphasize } FUNCTION {format.date} { year empty$ { "" } { year } if$ } FUNCTION {format.bvolume} { volume empty$ { "" } { "volume " volume * } if$ } FUNCTION {format.pages} { pages empty$ { "" } { "pp. " pages * } if$ } FUNCTION {format.url} { url empty$ { "" } { "\url{" url * "}" * } if$ } FUNCTION {output.bibitem} { newline$ "\bibitem{" write$ cite$ write$ "}" write$ newline$ "" before.all 'output.state := } FUNCTION {article} { output.bibitem format.authors "author" output.check new.block format.title "title" output.check new.block journal emphasize "journal" output.check format.bvolume output format.pages output format.date "year" output.check format.url output fin.entry } FUNCTION {inproceedings} { output.bibitem format.authors "author" output.check new.block format.title "title" output.check new.block "In " booktitle emphasize * output format.pages output format.date "year" output.check format.url output fin.entry } FUNCTION {conference} { inproceedings } FUNCTION {book} { output.bibitem format.authors "author" output.check new.block format.btitle "title" output.check publisher output format.date "year" output.check fin.entry } FUNCTION {misc} { output.bibitem format.authors output new.block format.title output new.block note output format.date output format.url output fin.entry } FUNCTION {techreport} { misc } FUNCTION {phdthesis} { misc } FUNCTION {mastersthesis} { misc } FUNCTION {unpublished} { misc } FUNCTION {default.type} { misc } READ FUNCTION {sortify} { purify$ "l" change.case$ } FUNCTION {presort} { cite$ 'label := label sortify " " * #1 entry.max$ substring$ 'sort.key$ := } ITERATE {presort} SORT FUNCTION {begin.bib} { preamble$ empty$ 'skip$ { preamble$ write$ newline$ } 
if$ "\begin{thebibliography}{99}" write$ newline$ } FUNCTION {end.bib} { newline$ "\end{thebibliography}" write$ newline$ } EXECUTE {begin.bib} EXECUTE {init.state.consts} ITERATE {call.type$} EXECUTE {end.bib} ================================================ FILE: researchclaw/templates/styles/icml_2025/icml2025.sty ================================================ % icml2025.sty — ICML 2025 style file % Based on the official ICML submission template structure. % Bundled by AutoResearchClaw for offline compilation. % Official source: https://icml.cc/Conferences/2025/StyleAuthorInstructions \NeedsTeXFormat{LaTeX2e} \ProvidesPackage{icml2025}[2025/01/15 ICML 2025 submission style] \newif\if@icml@accepted \@icml@acceptedfalse \newif\if@icml@preprint \@icml@preprintfalse \DeclareOption{accepted}{\@icml@acceptedtrue} \DeclareOption{preprint}{\@icml@preprinttrue} \ProcessOptions\relax \RequirePackage{geometry} \geometry{textwidth=6.875in,textheight=9.25in,columnsep=0.25in,top=0.75in,headheight=12pt,headsep=12pt,footskip=20pt} \twocolumn \RequirePackage{times} \renewcommand{\baselinestretch}{1.0} \setlength{\parskip}{0pt} \setlength{\parindent}{1em} \renewcommand{\section}{\@startsection{section}{1}{0mm}{-2.0ex plus -0.5ex minus -.2ex}{0.8ex plus .2ex}{\normalfont\large\bfseries}} \renewcommand{\subsection}{\@startsection{subsection}{2}{0mm}{-1.5ex plus -0.5ex minus -.2ex}{0.5ex plus .2ex}{\normalfont\normalsize\bfseries}} \renewcommand{\subsubsection}{\@startsection{subsubsection}{3}{0mm}{-1.0ex plus -0.5ex minus -.2ex}{0.3ex plus .2ex}{\normalfont\normalsize\bfseries}} \newenvironment{icmlauthorlist}{\begin{center}\large}{\end{center}} \newcommand{\icmlauthor}[2]{#1\textsuperscript{#2}} \newcommand{\icmlaffiliation}[2]{\par\normalsize\textsuperscript{#1}#2} \newcommand{\icmltitlerunning}[1]{\def\@icml@runningtitle{#1}} \def\@icml@runningtitle{} \def\@maketitle{% \twocolumn[% \vskip -0.3in \begin{center}% {\LARGE\bfseries \@title \par}\vskip 0.2in \if@icml@accepted 
{\large\lineskip .5em\begin{tabular}[t]{c}\@author\end{tabular}\par}% \else\if@icml@preprint {\large\lineskip .5em\begin{tabular}[t]{c}\@author\end{tabular}\par}% \else {\large Anonymous submission\par}% \fi\fi \vskip 0.2in \end{center}% ]% } \renewenvironment{abstract}{\centerline{\bfseries Abstract}\vspace{0.5ex}\begin{quote}\small}{\par\end{quote}\vskip 1ex} \RequirePackage{fancyhdr} \pagestyle{fancy}\fancyhf{} \if@icml@accepted \fancyhead[C]{\small Proceedings of the $42^{nd}$ International Conference on Machine Learning, 2025} \else \fancyhead[C]{\small\@icml@runningtitle} \fi \fancyfoot[C]{\thepage} \renewcommand{\headrulewidth}{0pt} \RequirePackage[numbers,sort&compress]{natbib} \endinput ================================================ FILE: researchclaw/templates/styles/icml_2026/icml2026.bst ================================================ %% icml2026.bst — ICML 2026 bibliography style %% Bundled by AutoResearchClaw for offline compilation. %% Minimal numeric bibliography style compatible with natbib. 
ENTRY { author title journal booktitle year volume number pages doi url note publisher address edition eprint archiveprefix primaryclass } {} { label } INTEGERS { output.state before.all mid.sentence after.sentence after.block } FUNCTION {init.state.consts} { #0 'before.all := #1 'mid.sentence := #2 'after.sentence := #3 'after.block := } STRINGS { s t } FUNCTION {output.nonnull} { 's := output.state mid.sentence = { ", " * write$ } { output.state after.block = { add.period$ write$ newline$ "\newblock " write$ } { output.state before.all = 'write$ { add.period$ " " * write$ } if$ } if$ mid.sentence 'output.state := } if$ s } FUNCTION {output} { duplicate$ empty$ 'pop$ 'output.nonnull if$ } FUNCTION {output.check} { 't := duplicate$ empty$ { pop$ "empty " t * " in " * cite$ * warning$ } 'output.nonnull if$ } FUNCTION {fin.entry} { add.period$ write$ newline$ } FUNCTION {new.block} { output.state before.all = 'skip$ { after.block 'output.state := } if$ } FUNCTION {not} { { #0 } { #1 } if$ } FUNCTION {and} { 'skip$ { pop$ #0 } if$ } FUNCTION {or} { { pop$ #1 } 'skip$ if$ } FUNCTION {field.or.null} { duplicate$ empty$ { pop$ "" } 'skip$ if$ } FUNCTION {emphasize} { duplicate$ empty$ { pop$ "" } { "\emph{" swap$ * "}" * } if$ } INTEGERS { nameptr namesleft numnames } FUNCTION {format.names} { 's := #1 'nameptr := s num.names$ 'numnames := numnames 'namesleft := { namesleft #0 > } { s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't := nameptr #1 > { namesleft #1 > { ", " * t * } { numnames #2 > { "," * } 'skip$ if$ t "others" = { " et~al." 
* } { " and " * t * } if$ } if$ } 't if$ nameptr #1 + 'nameptr := namesleft #1 - 'namesleft := } while$ } FUNCTION {format.authors} { author empty$ { "" } { author format.names } if$ } FUNCTION {format.title} { title empty$ { "" } { title } if$ } FUNCTION {format.btitle} { title emphasize } FUNCTION {format.date} { year empty$ { "" } { year } if$ } FUNCTION {format.bvolume} { volume empty$ { "" } { "volume " volume * } if$ } FUNCTION {format.pages} { pages empty$ { "" } { "pp. " pages * } if$ } FUNCTION {format.url} { url empty$ { "" } { "\url{" url * "}" * } if$ } FUNCTION {output.bibitem} { newline$ "\bibitem{" write$ cite$ write$ "}" write$ newline$ "" before.all 'output.state := } FUNCTION {article} { output.bibitem format.authors "author" output.check new.block format.title "title" output.check new.block journal emphasize "journal" output.check format.bvolume output format.pages output format.date "year" output.check format.url output fin.entry } FUNCTION {inproceedings} { output.bibitem format.authors "author" output.check new.block format.title "title" output.check new.block "In " booktitle emphasize * output format.pages output format.date "year" output.check format.url output fin.entry } FUNCTION {conference} { inproceedings } FUNCTION {book} { output.bibitem format.authors "author" output.check new.block format.btitle "title" output.check publisher output format.date "year" output.check fin.entry } FUNCTION {misc} { output.bibitem format.authors output new.block format.title output new.block note output format.date output format.url output fin.entry } FUNCTION {techreport} { misc } FUNCTION {phdthesis} { misc } FUNCTION {mastersthesis} { misc } FUNCTION {unpublished} { misc } FUNCTION {default.type} { misc } READ FUNCTION {sortify} { purify$ "l" change.case$ } FUNCTION {presort} { cite$ 'label := label sortify " " * #1 entry.max$ substring$ 'sort.key$ := } ITERATE {presort} SORT FUNCTION {begin.bib} { preamble$ empty$ 'skip$ { preamble$ write$ newline$ } 
if$ "\begin{thebibliography}{99}" write$ newline$ } FUNCTION {end.bib} { newline$ "\end{thebibliography}" write$ newline$ } EXECUTE {begin.bib} EXECUTE {init.state.consts} ITERATE {call.type$} EXECUTE {end.bib} ================================================ FILE: researchclaw/templates/styles/icml_2026/icml2026.sty ================================================ % icml2026.sty — ICML 2026 style file % Based on the official ICML submission template structure. % Bundled by AutoResearchClaw for offline compilation. % Official source: https://icml.cc/Conferences/2026/AuthorInstructions \NeedsTeXFormat{LaTeX2e} \ProvidesPackage{icml2026}[2026/01/15 ICML 2026 submission style] % ── Options ────────────────────────────────────────────────────────── \newif\if@icml@accepted \@icml@acceptedfalse \newif\if@icml@preprint \@icml@preprintfalse \DeclareOption{accepted}{\@icml@acceptedtrue} \DeclareOption{preprint}{\@icml@preprinttrue} \ProcessOptions\relax % ── Page geometry (2-column) ───────────────────────────────────────── \RequirePackage{geometry} \geometry{ textwidth=6.875in, textheight=9.25in, columnsep=0.25in, top=0.75in, headheight=12pt, headsep=12pt, footskip=20pt, } \twocolumn % ── Fonts ──────────────────────────────────────────────────────────── \RequirePackage{times} % ── Spacing ────────────────────────────────────────────────────────── \renewcommand{\baselinestretch}{1.0} \setlength{\parskip}{0pt} \setlength{\parindent}{1em} % ── Section formatting ─────────────────────────────────────────────── \renewcommand{\section}{\@startsection {section}{1}{0mm}{-2.0ex plus -0.5ex minus -.2ex}% {0.8ex plus .2ex}{\normalfont\large\bfseries}} \renewcommand{\subsection}{\@startsection {subsection}{2}{0mm}{-1.5ex plus -0.5ex minus -.2ex}% {0.5ex plus .2ex}{\normalfont\normalsize\bfseries}} \renewcommand{\subsubsection}{\@startsection {subsubsection}{3}{0mm}{-1.0ex plus -0.5ex minus -.2ex}% {0.3ex plus .2ex}{\normalfont\normalsize\bfseries}} % ── Title formatting 
──────────────────────────────────────────────── % ICML-specific author macros \newenvironment{icmlauthorlist}{\begin{center}\large}{\end{center}} \newcommand{\icmlauthor}[2]{#1\textsuperscript{#2}} \newcommand{\icmlaffiliation}[2]{\par\normalsize\textsuperscript{#1}#2} \newcommand{\icmltitlerunning}[1]{\def\@icml@runningtitle{#1}} \def\@icml@runningtitle{} \def\@maketitle{% \twocolumn[% \vskip -0.3in \begin{center}% {\LARGE\bfseries \@title \par}% \vskip 0.2in \if@icml@accepted {\large \lineskip .5em \begin{tabular}[t]{c}% \@author \end{tabular}\par}% \else \if@icml@preprint {\large \lineskip .5em \begin{tabular}[t]{c}% \@author \end{tabular}\par}% \else {\large Anonymous submission\par}% \fi \fi \vskip 0.2in \end{center}% ]% } % ── Abstract ───────────────────────────────────────────────────────── \renewenvironment{abstract}{% \centerline{\bfseries Abstract}% \vspace{0.5ex}% \begin{quote}\small% }{% \par \end{quote}% \vskip 1ex } % ── Headers ────────────────────────────────────────────────────────── \RequirePackage{fancyhdr} \pagestyle{fancy} \fancyhf{} \if@icml@accepted \fancyhead[C]{\small Proceedings of the $43^{rd}$ International Conference on Machine Learning, 2026} \else \fancyhead[C]{\small\@icml@runningtitle} \fi \fancyfoot[C]{\thepage} \renewcommand{\headrulewidth}{0pt} % ── Natbib ─────────────────────────────────────────────────────────── \RequirePackage[numbers,sort&compress]{natbib} \endinput ================================================ FILE: researchclaw/templates/styles/neurips_2024/neurips_2024.sty ================================================ % neurips_2024.sty — NeurIPS 2024 style file % Based on the official NeurIPS submission template structure. % Bundled by AutoResearchClaw for offline compilation. 
% Official source: https://media.neurips.cc/Conferences/NeurIPS2024/Styles.zip \NeedsTeXFormat{LaTeX2e} \ProvidesPackage{neurips_2024}[2024/01/15 NeurIPS 2024 submission style] % ── Options ────────────────────────────────────────────────────────── \newif\if@neurips@preprint \@neurips@preprinttrue \newif\if@neurips@final \@neurips@finalfalse \newif\if@neurips@nonatbib \@neurips@nonatbibfalse \DeclareOption{preprint}{\@neurips@preprinttrue\@neurips@finalfalse} \DeclareOption{final}{\@neurips@finaltrue\@neurips@preprintfalse} \DeclareOption{nonatbib}{\@neurips@nonatbibtrue} \ProcessOptions\relax % ── Page geometry ──────────────────────────────────────────────────── \RequirePackage{geometry} \geometry{ textwidth=6.0in, textheight=9.0in, top=1.0in, headheight=12pt, headsep=25pt, footskip=30pt, } % ── Fonts ──────────────────────────────────────────────────────────── \RequirePackage{times} % ── Spacing ────────────────────────────────────────────────────────── \renewcommand{\baselinestretch}{1.0} \setlength{\parskip}{0pt} \setlength{\parindent}{1em} % ── Section formatting ─────────────────────────────────────────────── \renewcommand{\section}{\@startsection {section}{1}{0mm}{-2.0ex plus -0.5ex minus -.2ex}% {1.0ex plus .2ex}{\normalfont\large\bfseries}} \renewcommand{\subsection}{\@startsection {subsection}{2}{0mm}{-1.5ex plus -0.5ex minus -.2ex}% {0.8ex plus .2ex}{\normalfont\normalsize\bfseries}} \renewcommand{\subsubsection}{\@startsection {subsubsection}{3}{0mm}{-1.0ex plus -0.5ex minus -.2ex}% {0.5ex plus .2ex}{\normalfont\normalsize\bfseries}} % ── Title formatting ──────────────────────────────────────────────── \def\@maketitle{% \vbox to 0pt{}% \vskip -0.5in \begin{center}% {\LARGE\bfseries \@title \par}% \vskip 0.3in \if@neurips@preprint {\large\textit{Preprint. 
Under review.}\par}% \vskip 0.1in \fi {\large \lineskip .5em \begin{tabular}[t]{c}% \@author \end{tabular}\par}% \vskip 0.3in \end{center}% \par \vskip 0.5em } % ── Abstract ───────────────────────────────────────────────────────── \renewenvironment{abstract}{% \centerline{\large\bfseries Abstract}% \vspace{0.5ex}% \begin{quote}% }{% \par \end{quote}% \vskip 1ex } % ── Headers ────────────────────────────────────────────────────────── \RequirePackage{fancyhdr} \pagestyle{fancy} \fancyhf{} \fancyhead[C]{} \fancyfoot[C]{\thepage} \renewcommand{\headrulewidth}{0pt} % ── Natbib ─────────────────────────────────────────────────────────── \if@neurips@nonatbib\else \RequirePackage[numbers,sort&compress]{natbib} \fi \endinput ================================================ FILE: researchclaw/templates/styles/neurips_2025/neurips_2025.sty ================================================ % neurips_2025.sty — NeurIPS 2025 style file % Based on the official NeurIPS submission template structure. % Bundled by AutoResearchClaw for offline compilation. 
% Official source: https://media.neurips.cc/Conferences/NeurIPS2025/Styles.zip \NeedsTeXFormat{LaTeX2e} \ProvidesPackage{neurips_2025}[2025/01/15 NeurIPS 2025 submission style] % ── Options ────────────────────────────────────────────────────────── \newif\if@neurips@preprint \@neurips@preprinttrue \newif\if@neurips@final \@neurips@finalfalse \newif\if@neurips@nonatbib \@neurips@nonatbibfalse \DeclareOption{preprint}{\@neurips@preprinttrue\@neurips@finalfalse} \DeclareOption{final}{\@neurips@finaltrue\@neurips@preprintfalse} \DeclareOption{nonatbib}{\@neurips@nonatbibtrue} \ProcessOptions\relax % ── Page geometry ──────────────────────────────────────────────────── \RequirePackage{geometry} \geometry{ textwidth=6.0in, textheight=9.0in, top=1.0in, headheight=12pt, headsep=25pt, footskip=30pt, } % ── Fonts ──────────────────────────────────────────────────────────── \RequirePackage{times} % ── Spacing ────────────────────────────────────────────────────────── \renewcommand{\baselinestretch}{1.0} \setlength{\parskip}{0pt} \setlength{\parindent}{1em} % ── Section formatting ─────────────────────────────────────────────── \renewcommand{\section}{\@startsection {section}{1}{0mm}{-2.0ex plus -0.5ex minus -.2ex}% {1.0ex plus .2ex}{\normalfont\large\bfseries}} \renewcommand{\subsection}{\@startsection {subsection}{2}{0mm}{-1.5ex plus -0.5ex minus -.2ex}% {0.8ex plus .2ex}{\normalfont\normalsize\bfseries}} \renewcommand{\subsubsection}{\@startsection {subsubsection}{3}{0mm}{-1.0ex plus -0.5ex minus -.2ex}% {0.5ex plus .2ex}{\normalfont\normalsize\bfseries}} % ── Title formatting ──────────────────────────────────────────────── \def\@maketitle{% \vbox to 0pt{}% \vskip -0.5in \begin{center}% {\LARGE\bfseries \@title \par}% \vskip 0.3in \if@neurips@preprint {\large\textit{Preprint. 
Under review.}\par}% \vskip 0.1in \fi {\large \lineskip .5em \begin{tabular}[t]{c}% \@author \end{tabular}\par}% \vskip 0.3in \end{center}% \par \vskip 0.5em } % ── Abstract ───────────────────────────────────────────────────────── \renewenvironment{abstract}{% \centerline{\large\bfseries Abstract}% \vspace{0.5ex}% \begin{quote}% }{% \par \end{quote}% \vskip 1ex } % ── Headers ────────────────────────────────────────────────────────── \RequirePackage{fancyhdr} \pagestyle{fancy} \fancyhf{} \fancyhead[C]{} \fancyfoot[C]{\thepage} \renewcommand{\headrulewidth}{0pt} % ── Natbib ─────────────────────────────────────────────────────────── \if@neurips@nonatbib\else \RequirePackage[numbers,sort&compress]{natbib} \fi % ── Hyperref-friendly ──────────────────────────────────────────────── \AtBeginDocument{% \@ifpackageloaded{hyperref}{% \hypersetup{colorlinks=true,linkcolor=red,citecolor=green,urlcolor=blue}% }{}% } \endinput ================================================ FILE: researchclaw/trends/__init__.py ================================================ """Research trend tracking and automatic topic generation.""" from researchclaw.trends.daily_digest import DailyDigest from researchclaw.trends.trend_analyzer import TrendAnalyzer from researchclaw.trends.opportunity_finder import OpportunityFinder from researchclaw.trends.auto_topic import AutoTopicGenerator from researchclaw.trends.feeds import FeedManager __all__ = [ "AutoTopicGenerator", "DailyDigest", "FeedManager", "OpportunityFinder", "TrendAnalyzer", ] ================================================ FILE: researchclaw/trends/auto_topic.py ================================================ """Automatic research topic generation (ClawZero mode).""" from __future__ import annotations import logging from typing import Any from researchclaw.trends.opportunity_finder import OpportunityFinder from researchclaw.trends.trend_analyzer import TrendAnalyzer logger = logging.getLogger(__name__) class AutoTopicGenerator: 
"""Generate and rank candidate research topics automatically.""" def __init__( self, trend_analyzer: TrendAnalyzer, opportunity_finder: OpportunityFinder, llm_client: Any = None, ): self.trend_analyzer = trend_analyzer self.opportunity_finder = opportunity_finder self.llm = llm_client async def generate_candidates( self, domains: list[str], papers: list[dict[str, Any]] | None = None, count: int = 5, ) -> list[dict[str, Any]]: """Generate ranked candidate research topics.""" # 1. Analyze trends trend_analysis = self.trend_analyzer.analyze(papers or []) # 2. Find opportunities opportunities = await self.opportunity_finder.find_opportunities( trend_analysis, domains ) # 3. Score and rank candidates candidates = [] for opp in opportunities[:count]: score = self._score_candidate(opp, trend_analysis) candidates.append({ "topic": opp["topic"], "rationale": opp.get("rationale", ""), "feasibility": opp.get("feasibility", "medium"), "novelty_score": score["novelty"], "feasibility_score": score["feasibility"], "impact_score": score["impact"], "overall_score": score["overall"], "source": opp.get("source", "unknown"), }) candidates.sort(key=lambda c: -c["overall_score"]) return candidates[:count] async def auto_select( self, domains: list[str], papers: list[dict[str, Any]] | None = None, ) -> dict[str, Any]: """Fully automatic topic selection (Zero-Touch mode).""" candidates = await self.generate_candidates(domains, papers, count=5) if not candidates: return { "topic": f"Novel approaches in {domains[0] if domains else 'ML'}", "rationale": "Default topic (no trends data available)", "overall_score": 0.0, "source": "default", } return candidates[0] @staticmethod def _score_candidate( opportunity: dict[str, Any], trend_analysis: dict[str, Any], ) -> dict[str, float]: """Score a candidate topic on novelty, feasibility, and impact.""" feasibility_map = {"high": 0.9, "medium": 0.6, "low": 0.3} feasibility = feasibility_map.get( opportunity.get("feasibility", "medium"), 0.6 ) # 
Novelty: inverse of how much it's already been studied topic_words = set(opportunity.get("topic", "").lower().split()) keyword_overlap = 0 for kw in trend_analysis.get("rising_keywords", []): kw_words = set(kw.get("keyword", "").lower().split()) if topic_words & kw_words: keyword_overlap += 1 novelty = max(0.3, 1.0 - keyword_overlap * 0.15) # Impact: based on trend momentum paper_count = trend_analysis.get("paper_count", 0) impact = min(1.0, paper_count / 50) if paper_count > 0 else 0.5 overall = round( 0.4 * novelty + 0.3 * feasibility + 0.3 * impact, 3 ) return { "novelty": round(novelty, 3), "feasibility": round(feasibility, 3), "impact": round(impact, 3), "overall": overall, } def format_candidates( self, candidates: list[dict[str, Any]], ) -> str: """Format candidates as a readable string.""" if not candidates: return "No candidate topics generated." lines = ["Candidate Research Topics:", "=" * 40, ""] for i, c in enumerate(candidates, 1): lines.extend([ f"{i}. {c['topic']}", f" Score: {c['overall_score']:.2f} " f"(novelty={c['novelty_score']:.2f}, " f"feasibility={c['feasibility_score']:.2f}, " f"impact={c['impact_score']:.2f})", f" Rationale: {c.get('rationale', 'N/A')}", "", ]) return "\n".join(lines) ================================================ FILE: researchclaw/trends/daily_digest.py ================================================ """Daily paper digest generation.""" from __future__ import annotations import logging from datetime import date from pathlib import Path from typing import Any from researchclaw.trends.feeds import FeedManager logger = logging.getLogger(__name__) class DailyDigest: """Generate daily paper digest reports.""" def __init__( self, feed_manager: FeedManager, llm_client: Any = None, ): self.feeds = feed_manager self.llm = llm_client async def generate( self, domains: list[str] | None = None, max_papers: int = 20, target_date: date | None = None, ) -> str: """Generate a daily paper digest as Markdown.""" effective_domains = 
domains or ["machine learning"] today = target_date or date.today() papers = self.feeds.fetch_recent_papers( domains=effective_domains, max_papers=max_papers, since_date=today, ) if not papers: return ( f"## Daily Paper Digest ({today})\n\n" f"No new papers found for domains: {', '.join(effective_domains)}\n" ) if self.llm is not None: return await self._generate_with_llm(papers, effective_domains, today) return self._generate_basic(papers, effective_domains, today) async def _generate_with_llm( self, papers: list[dict[str, Any]], domains: list[str], today: date, ) -> str: """Generate digest with LLM-enhanced summaries.""" lines = [ f"## Daily Paper Digest ({today})", f"Domains: {', '.join(domains)}", f"Papers found: {len(papers)}", "", ] for i, paper in enumerate(papers, 1): title = paper.get("title", "Untitled") url = paper.get("url", "") abstract = paper.get("abstract", "")[:500] authors = paper.get("authors", []) if isinstance(authors, list): author_str = ", ".join( a if isinstance(a, str) else a.get("name", "") for a in authors[:3] ) if len(authors) > 3: author_str += " et al." else: author_str = str(authors) # Get LLM summary try: prompt = ( f"Summarize this paper in 2 sentences and rate its relevance " f"to {', '.join(domains)} on a scale of 1-5 stars.\n\n" f"Title: {title}\nAbstract: {abstract}\n\n" f"Format: SUMMARY: | RELEVANCE: <1-5>" ) response = await self.llm.chat_async(prompt) summary, relevance = self._parse_summary(response) except Exception: summary = abstract[:200] + "..." if len(abstract) > 200 else abstract relevance = 3 stars = "*" * relevance link = f"[{title}]({url})" if url else title lines.extend([ f"### {i}. 
{link}", f"**Authors**: {author_str}", f"**Relevance**: {stars}", f"**Summary**: {summary}", "", ]) return "\n".join(lines) def _generate_basic( self, papers: list[dict[str, Any]], domains: list[str], today: date, ) -> str: """Generate basic digest without LLM.""" lines = [ f"## Daily Paper Digest ({today})", f"Domains: {', '.join(domains)}", f"Papers found: {len(papers)}", "", ] for i, paper in enumerate(papers, 1): title = paper.get("title", "Untitled") url = paper.get("url", "") abstract = paper.get("abstract", "") authors = paper.get("authors", []) if isinstance(authors, list): author_str = ", ".join( a if isinstance(a, str) else a.get("name", "") for a in authors[:3] ) if len(authors) > 3: author_str += " et al." else: author_str = str(authors) short_abstract = ( abstract[:200] + "..." if len(abstract) > 200 else abstract ) link = f"[{title}]({url})" if url else title lines.extend([ f"### {i}. {link}", f"**Authors**: {author_str}", f"**Abstract**: {short_abstract}", "", ]) return "\n".join(lines) @staticmethod def _parse_summary(response: str) -> tuple[str, int]: """Parse LLM summary response.""" summary = response relevance = 3 if "SUMMARY:" in response: parts = response.split("|") summary = parts[0].split("SUMMARY:", 1)[-1].strip() if len(parts) > 1 and "RELEVANCE:" in parts[1]: try: rel_str = parts[1].split("RELEVANCE:", 1)[-1].strip() relevance = int(rel_str.strip("* ")) relevance = max(1, min(5, relevance)) except (ValueError, IndexError): pass return summary, relevance async def generate_and_save( self, output_dir: Path, domains: list[str] | None = None, max_papers: int = 20, ) -> Path: """Generate digest and save to a file.""" today = date.today() content = await self.generate(domains, max_papers, today) output_dir.mkdir(parents=True, exist_ok=True) output_file = output_dir / f"digest_{today}.md" output_file.write_text(content, encoding="utf-8") return output_file ================================================ FILE: researchclaw/trends/feeds.py 
================================================ """ArXiv / Semantic Scholar / OpenAlex feed management.""" from __future__ import annotations import logging from datetime import date, datetime from typing import Any logger = logging.getLogger(__name__) class FeedManager: """Manage paper feeds from multiple sources.""" SUPPORTED_SOURCES = ("arxiv", "semantic_scholar", "openalex") def __init__( self, sources: tuple[str, ...] = ("arxiv", "semantic_scholar"), s2_api_key: str = "", ): self.sources = tuple( s for s in sources if s in self.SUPPORTED_SOURCES ) self.s2_api_key = s2_api_key def fetch_recent_papers( self, domains: list[str], max_papers: int = 20, since_date: date | None = None, ) -> list[dict[str, Any]]: """Fetch recent papers from configured sources. Returns a list of paper dicts with: title, authors, abstract, url, source, published_date, domains. """ all_papers: list[dict[str, Any]] = [] target_date = since_date or date.today() for source in self.sources: try: if source == "arxiv": papers = self._fetch_arxiv(domains, max_papers, target_date) elif source == "semantic_scholar": papers = self._fetch_s2(domains, max_papers, target_date) elif source == "openalex": papers = self._fetch_openalex(domains, max_papers, target_date) else: continue all_papers.extend(papers) except Exception as exc: logger.warning("Feed fetch failed for %s: %s", source, exc) # Deduplicate by title similarity seen_titles: set[str] = set() deduped: list[dict[str, Any]] = [] for paper in all_papers: norm_title = paper.get("title", "").lower().strip() if norm_title and norm_title not in seen_titles: seen_titles.add(norm_title) deduped.append(paper) return deduped[:max_papers] def _fetch_arxiv( self, domains: list[str], max_papers: int, since_date: date, ) -> list[dict[str, Any]]: """Fetch papers from arXiv API.""" try: from researchclaw.literature.arxiv_client import search_arxiv except ImportError: logger.debug("arxiv_client not available") return [] query = " OR ".join(domains) if 
domains else "machine learning" try: results = search_arxiv(query, limit=max_papers) return [ { "title": r.get("title", ""), "authors": r.get("authors", []), "abstract": r.get("abstract", ""), "url": r.get("url", ""), "source": "arxiv", "published_date": r.get("published", since_date.isoformat()), "arxiv_id": r.get("arxiv_id", ""), } for r in results ] except Exception as exc: logger.warning("ArXiv fetch failed: %s", exc) return [] def _fetch_s2( self, domains: list[str], max_papers: int, since_date: date, ) -> list[dict[str, Any]]: """Fetch papers from Semantic Scholar API.""" try: from researchclaw.literature.semantic_scholar import search_s2 except ImportError: logger.debug("semantic_scholar client not available") return [] query = " ".join(domains) if domains else "machine learning" try: results = search_s2( query, limit=max_papers, year_min=since_date.year, api_key=self.s2_api_key, ) return [ { "title": r.get("title", ""), "authors": [ a.get("name", "") for a in r.get("authors", []) ], "abstract": r.get("abstract", ""), "url": r.get("url", ""), "source": "semantic_scholar", "published_date": str(r.get("year", since_date.year)), "citation_count": r.get("citationCount", 0), } for r in results ] except Exception as exc: logger.warning("S2 fetch failed: %s", exc) return [] def _fetch_openalex( self, domains: list[str], max_papers: int, since_date: date, ) -> list[dict[str, Any]]: """Fetch papers from OpenAlex API.""" try: from researchclaw.literature.openalex_client import search_openalex except ImportError: logger.debug("openalex_client not available") return [] query = " ".join(domains) if domains else "machine learning" try: results = search_openalex(query, limit=max_papers) return [ { "title": r.get("title", ""), "authors": r.get("authors", []), "abstract": r.get("abstract", ""), "url": r.get("url", ""), "source": "openalex", "published_date": r.get("publication_date", ""), "citation_count": r.get("cited_by_count", 0), } for r in results ] except Exception as 
exc: logger.warning("OpenAlex fetch failed: %s", exc) return [] ================================================ FILE: researchclaw/trends/opportunity_finder.py ================================================ """Research opportunity discovery.""" from __future__ import annotations import logging from typing import Any logger = logging.getLogger(__name__) class OpportunityFinder: """Identify research opportunities from trend analysis.""" def __init__(self, llm_client: Any = None): self.llm = llm_client async def find_opportunities( self, trend_analysis: dict[str, Any], domains: list[str], ) -> list[dict[str, Any]]: """Identify research gaps and opportunities.""" if self.llm is not None: return await self._llm_find_opportunities(trend_analysis, domains) return self._heuristic_find_opportunities(trend_analysis, domains) async def _llm_find_opportunities( self, trend_analysis: dict[str, Any], domains: list[str], ) -> list[dict[str, Any]]: """Use LLM to identify research opportunities.""" keywords = trend_analysis.get("rising_keywords", [])[:10] methods = trend_analysis.get("method_trends", [])[:5] prompt = ( "Based on the following research trends, identify 5 promising " "research opportunities:\n\n" f"Domains: {', '.join(domains)}\n" f"Trending keywords: {[k['keyword'] for k in keywords]}\n" f"Popular methods: {[m['method'] for m in methods]}\n\n" "For each opportunity, provide:\n" "1. A concise research question\n" "2. Why it's promising (1 sentence)\n" "3. Feasibility estimate (high/medium/low)\n\n" "Format each as: TOPIC: ... | WHY: ... | FEASIBILITY: ..." 
) try: response = await self.llm.chat_async(prompt) return self._parse_opportunities(response) except Exception as exc: logger.warning("LLM opportunity finding failed: %s", exc) return self._heuristic_find_opportunities(trend_analysis, domains) @staticmethod def _parse_opportunities(response: str) -> list[dict[str, Any]]: """Parse LLM response into structured opportunities.""" opportunities = [] for line in response.strip().split("\n"): line = line.strip() if not line or not any( marker in line for marker in ("TOPIC:", "topic:", "1.", "2.", "3.") ): continue parts = line.split("|") topic = parts[0].split(":", 1)[-1].strip() if parts else line why = parts[1].split(":", 1)[-1].strip() if len(parts) > 1 else "" feasibility = ( parts[2].split(":", 1)[-1].strip().lower() if len(parts) > 2 else "medium" ) if topic: opportunities.append({ "topic": topic, "rationale": why, "feasibility": feasibility, "source": "llm", }) return opportunities[:5] @staticmethod def _heuristic_find_opportunities( trend_analysis: dict[str, Any], domains: list[str], ) -> list[dict[str, Any]]: """Simple heuristic-based opportunity finding.""" opportunities: list[dict[str, Any]] = [] keywords = trend_analysis.get("rising_keywords", []) methods = trend_analysis.get("method_trends", []) # Combine trending keywords with methods for opportunity generation for i, kw in enumerate(keywords[:3]): for j, method in enumerate(methods[:2]): topic = ( f"Applying {method['method']} to " f"{kw['keyword']} in {domains[0] if domains else 'ML'}" ) opportunities.append({ "topic": topic, "rationale": ( f"'{kw['keyword']}' is trending ({kw['count']} mentions) " f"and '{method['method']}' is a popular method" ), "feasibility": "medium", "source": "heuristic", }) if len(opportunities) >= 5: break if len(opportunities) >= 5: break return opportunities ================================================ FILE: researchclaw/trends/trend_analyzer.py ================================================ """Research trend analysis 
engine.""" from __future__ import annotations import re import logging from collections import Counter from typing import Any logger = logging.getLogger(__name__) # Common stopwords to exclude from keyword analysis _STOPWORDS = frozenset({ "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "from", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does", "did", "will", "would", "could", "should", "may", "might", "shall", "can", "need", "must", "that", "this", "these", "those", "it", "its", "we", "our", "their", "which", "what", "how", "when", "where", "who", "whom", "why", "not", "no", "nor", "as", "if", "then", "than", "both", "each", "all", "any", "few", "more", "most", "some", "such", "only", "very", "also", "about", "up", "out", "so", "into", "over", "after", "before", "between", "under", "through", "during", "using", "based", "via", "paper", "propose", "proposed", "method", "approach", "results", "show", "new", "novel", "model", "models", "data", "dataset", "task", "tasks", "performance", "learning", "training", }) class TrendAnalyzer: """Analyze research trends from paper collections.""" def __init__(self, min_keyword_length: int = 3): self.min_keyword_length = min_keyword_length def analyze( self, papers: list[dict[str, Any]], window_days: int = 30, ) -> dict[str, Any]: """Analyze trends in a collection of papers.""" if not papers: return { "rising_keywords": [], "hot_authors": [], "popular_datasets": [], "method_trends": [], "paper_count": 0, } keywords = self._extract_keywords(papers) authors = self._extract_authors(papers) datasets = self._extract_datasets(papers) methods = self._extract_methods(papers) return { "rising_keywords": keywords[:20], "hot_authors": authors[:10], "popular_datasets": datasets[:10], "method_trends": methods[:10], "paper_count": len(papers), "source_distribution": self._source_distribution(papers), } def _extract_keywords( self, papers: list[dict[str, Any]], ) -> 
list[dict[str, Any]]: """Extract and rank keywords from paper titles and abstracts.""" word_counts: Counter[str] = Counter() bigram_counts: Counter[str] = Counter() for paper in papers: text = f"{paper.get('title', '')} {paper.get('abstract', '')}" words = self._tokenize(text) for w in words: if w not in _STOPWORDS and len(w) >= self.min_keyword_length: word_counts[w] += 1 for i in range(len(words) - 1): w1, w2 = words[i], words[i + 1] if ( w1 not in _STOPWORDS and w2 not in _STOPWORDS and len(w1) >= self.min_keyword_length ): bigram_counts[f"{w1} {w2}"] += 1 results = [] for keyword, count in bigram_counts.most_common(30): if count >= 2: results.append({"keyword": keyword, "count": count, "type": "bigram"}) for keyword, count in word_counts.most_common(30): if count >= 2: results.append({"keyword": keyword, "count": count, "type": "unigram"}) results.sort(key=lambda x: -x["count"]) return results[:20] def _extract_authors( self, papers: list[dict[str, Any]], ) -> list[dict[str, Any]]: """Extract most prolific authors.""" author_counts: Counter[str] = Counter() for paper in papers: authors = paper.get("authors", []) if isinstance(authors, list): for author in authors: name = author if isinstance(author, str) else author.get("name", "") if name: author_counts[name] += 1 return [ {"author": name, "paper_count": count} for name, count in author_counts.most_common(10) if count >= 2 ] def _extract_datasets( self, papers: list[dict[str, Any]], ) -> list[dict[str, Any]]: """Extract commonly mentioned datasets.""" dataset_patterns = [ "ImageNet", "CIFAR", "MNIST", "COCO", "SQuAD", "GLUE", "SuperGLUE", "WikiText", "Penn Treebank", "WMT", "OpenWebText", "Common Crawl", "BookCorpus", "MMLU", "HumanEval", "GSM8K", "ARC", "HellaSwag", ] dataset_counts: Counter[str] = Counter() for paper in papers: text = f"{paper.get('title', '')} {paper.get('abstract', '')}" for ds in dataset_patterns: if ds.lower() in text.lower(): dataset_counts[ds] += 1 return [ {"dataset": ds, 
"mention_count": count} for ds, count in dataset_counts.most_common(10) if count >= 1 ] def _extract_methods( self, papers: list[dict[str, Any]], ) -> list[dict[str, Any]]: """Extract commonly mentioned methods/architectures.""" method_patterns = [ "transformer", "attention", "diffusion", "GAN", "VAE", "reinforcement learning", "contrastive learning", "self-supervised", "few-shot", "zero-shot", "in-context", "fine-tuning", "pre-training", "RLHF", "DPO", "chain-of-thought", "retrieval-augmented", "RAG", "mixture of experts", "MoE", "LoRA", "quantization", "knowledge distillation", "pruning", "graph neural", ] method_counts: Counter[str] = Counter() for paper in papers: text = f"{paper.get('title', '')} {paper.get('abstract', '')}" for method in method_patterns: if method.lower() in text.lower(): method_counts[method] += 1 return [ {"method": method, "mention_count": count} for method, count in method_counts.most_common(10) if count >= 1 ] @staticmethod def _source_distribution( papers: list[dict[str, Any]], ) -> dict[str, int]: """Count papers by source.""" dist: Counter[str] = Counter() for paper in papers: dist[paper.get("source", "unknown")] += 1 return dict(dist) @staticmethod def _tokenize(text: str) -> list[str]: """Simple word tokenization.""" return [w.lower() for w in re.findall(r"[a-zA-Z]+(?:[-'][a-zA-Z]+)*", text)] def generate_trend_report( self, analysis: dict[str, Any], ) -> str: """Format trend analysis as a readable report.""" lines = [ f"Research Trend Analysis ({analysis.get('paper_count', 0)} papers)", "=" * 50, "", ] keywords = analysis.get("rising_keywords", []) if keywords: lines.append("Top Keywords:") for kw in keywords[:10]: lines.append(f" - {kw['keyword']} ({kw['count']} mentions)") lines.append("") authors = analysis.get("hot_authors", []) if authors: lines.append("Most Active Authors:") for a in authors[:5]: lines.append(f" - {a['author']} ({a['paper_count']} papers)") lines.append("") methods = analysis.get("method_trends", []) if 
methods: lines.append("Method Trends:") for m in methods[:5]: lines.append(f" - {m['method']} ({m['mention_count']} mentions)") lines.append("") return "\n".join(lines) ================================================ FILE: researchclaw/utils/__init__.py ================================================ """ResearchClaw utility functions.""" from researchclaw.utils.sanitize import sanitize_figure_id __all__ = ["sanitize_figure_id"] ================================================ FILE: researchclaw/utils/sanitize.py ================================================ """Input sanitization utilities for untrusted LLM-generated values.""" from __future__ import annotations import re def sanitize_figure_id(raw_id: str, *, fallback: str = "figure") -> str: """Sanitize a figure ID for safe use in file paths and Docker names. Strips path separators, dotdot sequences, and shell metacharacters. Returns *fallback* if the sanitized result is empty. >>> sanitize_figure_id("../../etc/evil") 'etc_evil' >>> sanitize_figure_id("fig test (v2)") 'fig_test_v2' >>> sanitize_figure_id("") 'figure' """ # Replace path separators and dangerous sequences cleaned = raw_id.replace("..", "").replace("/", "_").replace("\\", "_") # Keep only safe characters: alphanumeric, hyphen, underscore, dot cleaned = re.sub(r"[^a-zA-Z0-9_.-]", "_", cleaned) # Collapse multiple underscores cleaned = re.sub(r"_+", "_", cleaned).strip("_.") return cleaned or fallback ================================================ FILE: researchclaw/utils/thinking_tags.py ================================================ """Strip reasoning artifacts from LLM output before they leak into papers. 
Handles ALL known thinking/reasoning formats: - ``...`` -- DeepSeek-R1, QwQ, Gemini 2.5 format - ``[thinking] ...`` -- Claude Code / ACP output format (bracket-style) - Insight blocks -- Claude Code explanatory mode decorators - ``[plan] ...`` -- Claude Code plan mode markers - ``[tool] ...`` -- ACP tool invocation output - ``[client] ...``, ``[acpx] ...``, ``[done] ...`` -- acpx metadata Without this stripping, these artifacts contaminate: - Paper drafts (LaTeX / Markdown) - Generated experiment code - YAML/JSON responses (search plans, experiment plans) - Citation references Usage:: from researchclaw.utils.thinking_tags import strip_thinking_tags clean = strip_thinking_tags(raw_llm_output) """ from __future__ import annotations import re # --------------------------------------------------------------------------- # Pattern 1: XML-style ... (DeepSeek-R1, QwQ, Gemini) # --------------------------------------------------------------------------- _THINK_BLOCK_RE = re.compile( r".*?", re.DOTALL | re.IGNORECASE, ) _THINK_UNCLOSED_RE = re.compile( r".*", re.DOTALL | re.IGNORECASE, ) _THINK_STRAY_CLOSE_RE = re.compile( r"", re.IGNORECASE, ) # --------------------------------------------------------------------------- # Pattern 2: [thinking] blocks (Claude Code / ACP) # --------------------------------------------------------------------------- _BRACKET_THINKING_RE = re.compile( r"\[thinking\].*?(?=\n\n(?!\[thinking\])|\n(?:#{1,3}\s)|\n```|\Z)", re.DOTALL | re.IGNORECASE, ) # --------------------------------------------------------------------------- # Pattern 3: Insight blocks (Claude Code explanatory style) # --------------------------------------------------------------------------- _INSIGHT_BLOCK_RE = re.compile( r"`[*\u2605]\s*Insight[^`]*`\s*\n.*?`[\u2500-]+`", re.DOTALL, ) _INSIGHT_ASCII_RE = re.compile( r"`\*\s*Insight[-]+`\s*\n.*?`[-]+`", re.DOTALL, ) # --------------------------------------------------------------------------- # Pattern 4: [plan] blocks (Claude 
Code plan mode) # --------------------------------------------------------------------------- _PLAN_BLOCK_RE = re.compile( r"\[plan\].*?(?=\n\n|\Z)", re.DOTALL, ) # --------------------------------------------------------------------------- # Pattern 5: ACP/acpx metadata lines # --------------------------------------------------------------------------- _ACPX_LINE_RE = re.compile( r"^\[(client|acpx|tool|done)\](?!\().*$", re.MULTILINE | re.IGNORECASE, ) def strip_thinking_tags(text: str) -> str: """Remove all reasoning artifacts from LLM output. Handles XML tags, bracket [thinking] blocks, insight decorators, plan markers, and acpx metadata. Returns cleaned text suitable for paper drafts, code, or YAML/JSON. """ if not text: return text result = text # Phase 1: XML ... blocks if "think" in result.lower(): result = _THINK_BLOCK_RE.sub("", result) result = _THINK_UNCLOSED_RE.sub("", result) result = _THINK_STRAY_CLOSE_RE.sub("", result) # Phase 2: [thinking] blocks (ACP/Claude Code) if "[thinking]" in result.lower(): result = _BRACKET_THINKING_RE.sub("", result) result = re.sub( r"^\[thinking\].*$", "", result, flags=re.MULTILINE | re.IGNORECASE, ) # Phase 3: Insight blocks result = _INSIGHT_BLOCK_RE.sub("", result) result = _INSIGHT_ASCII_RE.sub("", result) # Phase 4: [plan] blocks if "[plan]" in result.lower(): result = _PLAN_BLOCK_RE.sub("", result) # Phase 5: acpx metadata lines result = _ACPX_LINE_RE.sub("", result) # Phase 6: Clean up artifacts result = re.sub(r"^`[\u2500-]+`\s*$", "", result, flags=re.MULTILINE) result = re.sub(r"^`[-]{20,}`\s*$", "", result, flags=re.MULTILINE) # Collapse excessive blank lines result = re.sub(r"\n{3,}", "\n\n", result) return result.strip() ================================================ FILE: researchclaw/voice/__init__.py ================================================ """Voice interaction modules.""" ================================================ FILE: researchclaw/voice/commands.py 
================================================ """Voice command parsing.""" from __future__ import annotations import re from dataclasses import dataclass from enum import Enum class VoiceCommand(str, Enum): """Recognized voice commands.""" START = "start" STOP = "stop" PAUSE = "pause" RESUME = "resume" STATUS = "status" NONE = "none" # Not a command, forward to chat @dataclass class ParsedVoiceInput: """Result of parsing voice input.""" command: VoiceCommand text: str # original or remaining text # Command patterns (Chinese + English) _COMMAND_PATTERNS: list[tuple[VoiceCommand, re.Pattern[str]]] = [ (VoiceCommand.START, re.compile(r"^(?:start|run|开始|启动)", re.IGNORECASE)), (VoiceCommand.STOP, re.compile(r"^(?:stop|停止|结束|终止)", re.IGNORECASE)), (VoiceCommand.PAUSE, re.compile(r"^(?:pause|暂停|等一下)", re.IGNORECASE)), (VoiceCommand.RESUME, re.compile(r"^(?:resume|continue|继续|恢复)", re.IGNORECASE)), (VoiceCommand.STATUS, re.compile(r"^(?:status|progress|进度|到哪了|查看)", re.IGNORECASE)), ] def parse_voice_input(text: str) -> ParsedVoiceInput: """Parse transcribed voice input into command + text.""" stripped = text.strip() for cmd, pattern in _COMMAND_PATTERNS: if pattern.search(stripped): return ParsedVoiceInput(command=cmd, text=stripped) return ParsedVoiceInput(command=VoiceCommand.NONE, text=stripped) ================================================ FILE: researchclaw/voice/synthesizer.py ================================================ """Text-to-speech synthesis.""" from __future__ import annotations import logging from typing import Any logger = logging.getLogger(__name__) class VoiceSynthesizer: """Convert text to speech audio.""" def __init__(self, server_config: Any) -> None: self._config = server_config async def synthesize( self, text: str, voice: str = "alloy", speed: float = 1.0, ) -> bytes: """Synthesize text to audio bytes using OpenAI TTS API.""" try: import httpx except ImportError: raise RuntimeError("httpx required for TTS") import os api_key = 
class VoiceTranscriber:
    """Transcribe audio to text using Whisper API.

    Thin async client around the OpenAI Whisper HTTP endpoint (or a
    compatible server configured via ``server_config.whisper_api_url``).
    """

    def __init__(self, server_config: Any) -> None:
        # server_config must expose .whisper_model and .whisper_api_url —
        # presumably the project's server settings object; verify at caller.
        self._model = server_config.whisper_model
        self._api_url = server_config.whisper_api_url

    async def transcribe(
        self,
        audio_bytes: bytes,
        language: str = "zh",
    ) -> str:
        """Transcribe audio bytes to text.

        Uses OpenAI Whisper API or compatible endpoint.

        Raises ``RuntimeError`` when httpx is not installed or when the
        ``OPENAI_API_KEY`` environment variable is unset; HTTP errors are
        propagated via ``raise_for_status``.
        """
        # httpx is an optional dependency — imported lazily so the module
        # loads without it.
        try:
            import httpx
        except ImportError:
            raise RuntimeError(
                "httpx is required for voice transcription. "
                "Install with: pip install httpx"
            )
        # Fall back to the public OpenAI endpoint when no custom URL is set.
        url = self._api_url or "https://api.openai.com/v1/audio/transcriptions"
        import os
        api_key = os.environ.get("OPENAI_API_KEY", "")
        if not api_key:
            raise RuntimeError("OPENAI_API_KEY not set for Whisper API")
        async with httpx.AsyncClient(timeout=60.0) as client:
            # Multipart upload: audio is always sent as webm — NOTE(review):
            # callers supplying other formats should confirm the server accepts
            # a mismatched filename/MIME type.
            response = await client.post(
                url,
                headers={"Authorization": f"Bearer {api_key}"},
                files={"file": ("audio.webm", audio_bytes, "audio/webm")},
                data={
                    "model": self._model,
                    "language": language,
                },
            )
            response.raise_for_status()
            result = response.json()
            # Missing "text" key yields an empty string rather than a KeyError.
            return result.get("text", "")

    async def transcribe_stream(
        self,
        audio_stream: AsyncIterator[bytes],
        language: str = "zh",
    ) -> AsyncIterator[str]:
        """Stream transcription (collects chunks then transcribes).

        Not真 streaming: buffers the entire audio stream in memory, then
        yields a single transcription result (nothing when the stream is
        empty).
        """
        chunks: list[bytes] = []
        async for chunk in audio_stream:
            chunks.append(chunk)
        if chunks:
            full_audio = b"".join(chunks)
            text = await self.transcribe(full_audio, language=language)
            yield text
Provides unified access to: - **Crawl4AI**: Web page → Markdown extraction - **Tavily**: AI-native web search API - **scholarly**: Google Scholar search - **PDF extraction**: Full-text from PDF files Public API ---------- - ``WebSearchAgent`` — orchestrates all web capabilities - ``WebCrawler`` — Crawl4AI wrapper - ``WebSearchClient`` — Tavily search wrapper - ``GoogleScholarClient`` — scholarly wrapper - ``PDFExtractor`` — PDF text extraction - ``check_url_ssrf`` — SSRF validation for URLs """ from researchclaw.web._ssrf import check_url_ssrf from researchclaw.web.crawler import WebCrawler from researchclaw.web.search import WebSearchClient from researchclaw.web.scholar import GoogleScholarClient from researchclaw.web.pdf_extractor import PDFExtractor from researchclaw.web.agent import WebSearchAgent __all__ = [ "check_url_ssrf", "WebCrawler", "WebSearchClient", "GoogleScholarClient", "PDFExtractor", "WebSearchAgent", ] ================================================ FILE: researchclaw/web/_ssrf.py ================================================ """SSRF validation for URLs fetched by the web layer.""" from __future__ import annotations import ipaddress import socket from urllib.parse import urlparse def check_url_ssrf(url: str) -> str | None: """Return an error message if *url* targets a private/internal host. Validates scheme (http/https only) and resolves the hostname to check against all RFC 1918, loopback, link-local, and reserved IP ranges using :func:`ipaddress.ip_address`. Returns ``None`` if the URL is safe to fetch. 
""" parsed = urlparse(url) if parsed.scheme not in ("http", "https"): return f"Unsupported URL scheme: {parsed.scheme}" hostname = parsed.hostname or "" if not hostname: return "URL has no hostname" # Try parsing hostname as a literal IP address first try: addr = ipaddress.ip_address(hostname) except ValueError: # It's a domain name — resolve to IP try: info = socket.getaddrinfo(hostname, None, socket.AF_UNSPEC, socket.SOCK_STREAM) addr = ipaddress.ip_address(info[0][4][0]) except (socket.gaierror, OSError, IndexError): # Can't resolve — let the actual request fail naturally return None if addr.is_private or addr.is_loopback or addr.is_link_local or addr.is_reserved: return f"Blocked internal/private URL: {hostname}" return None ================================================ FILE: researchclaw/web/agent.py ================================================ """Unified Web Search Agent. Orchestrates all web capabilities (Tavily, Google Scholar, Crawl4AI, PDF extraction) into a single search-and-extract pipeline. 
@dataclass
class WebSearchAgentResult:
    """Combined result from all web search sources."""

    topic: str
    web_results: list[SearchResult] = field(default_factory=list)
    scholar_papers: list[ScholarPaper] = field(default_factory=list)
    crawled_pages: list[CrawlResult] = field(default_factory=list)
    pdf_extractions: list[PDFContent] = field(default_factory=list)
    search_answer: str = ""  # Tavily AI answer if available
    elapsed_seconds: float = 0.0

    @property
    def total_results(self) -> int:
        """Total item count across every source list."""
        source_lists = (
            self.web_results,
            self.scholar_papers,
            self.crawled_pages,
            self.pdf_extractions,
        )
        return sum(len(lst) for lst in source_lists)

    def to_context_string(self, *, max_length: int = 30_000) -> str:
        """Render all results as one Markdown string for LLM prompt injection.

        Output is truncated (with a marker) once it exceeds *max_length*
        characters.
        """
        sections: list[str] = []

        # Tavily AI answer
        if self.search_answer:
            sections += ["## AI Search Summary", self.search_answer, ""]

        # Web search results (capped at 15 entries)
        if self.web_results:
            sections.append("## Web Search Results")
            for idx, hit in enumerate(self.web_results[:15], 1):
                sections.append(f"### [{idx}] {hit.title}")
                sections.append(f"URL: {hit.url}")
                if hit.snippet:
                    sections.append(hit.snippet)
            sections.append("")

        # Google Scholar papers (capped at 10; abstracts clipped to 200 chars)
        if self.scholar_papers:
            sections.append("## Google Scholar Papers")
            for paper in self.scholar_papers[:10]:
                byline = ", ".join(paper.authors[:3])
                if len(paper.authors) > 3:
                    byline += " et al."
                sections.append(
                    f"- **{paper.title}** ({byline}, {paper.year}) "
                    f"[{paper.citation_count} citations]"
                )
                if paper.abstract:
                    sections.append(f" {paper.abstract[:200]}...")
            sections.append("")

        # Crawled page content (only pages with real content; clipped)
        if self.crawled_pages:
            sections.append("## Crawled Page Content")
            for page in self.crawled_pages:
                if page.has_content:
                    sections.append(f"### {page.title or page.url}")
                    sections.append(page.markdown[:3000])
            sections.append("")

        # PDF extractions (only successful ones; clipped)
        if self.pdf_extractions:
            sections.append("## PDF Full-Text Extractions")
            for pdf in self.pdf_extractions:
                if pdf.has_content:
                    sections.append(f"### {pdf.title or pdf.path}")
                    if pdf.abstract:
                        sections.append(f"**Abstract:** {pdf.abstract}")
                    sections.append(pdf.text[:3000])
            sections.append("")

        rendered = "\n".join(sections)
        if len(rendered) > max_length:
            rendered = rendered[:max_length] + "\n\n[... truncated]"
        return rendered

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly dict (detail lists capped at 20)."""
        return {
            "topic": self.topic,
            "web_results_count": len(self.web_results),
            "scholar_papers_count": len(self.scholar_papers),
            "crawled_pages_count": len(self.crawled_pages),
            "pdf_extractions_count": len(self.pdf_extractions),
            "has_search_answer": bool(self.search_answer),
            "elapsed_seconds": self.elapsed_seconds,
            "web_results": [r.to_dict() for r in self.web_results[:20]],
            "scholar_papers": [p.to_dict() for p in self.scholar_papers[:20]],
        }
    # ------------------------------------------------------------------
    # Pipeline steps
    # ------------------------------------------------------------------
    def _run_web_search(
        self, result: WebSearchAgentResult, queries: list[str]
    ) -> None:
        """Run web search across all queries, mutating *result* in place.

        All failures are logged and swallowed — a dead search backend must
        not abort the rest of the pipeline.
        """
        try:
            responses = self.web_client.search_multi(
                queries, max_results=self.max_web_results
            )
            for resp in responses:
                result.web_results.extend(resp.results)
                # Keep only the FIRST non-empty AI answer across queries.
                if resp.answer and not result.search_answer:
                    result.search_answer = resp.answer
        except Exception as exc:  # noqa: BLE001
            logger.warning("Web search failed: %s", exc)

    def _run_scholar_search(
        self, result: WebSearchAgentResult, topic: str
    ) -> None:
        """Run Google Scholar search, appending papers to *result*.

        Best-effort: failures are logged and swallowed.
        """
        try:
            papers = self.scholar_client.search(
                topic, limit=self.max_scholar_results
            )
            result.scholar_papers.extend(papers)
        except Exception as exc:  # noqa: BLE001
            logger.warning("Scholar search failed: %s", exc)

    def _run_crawling(
        self, result: WebSearchAgentResult, urls: list[str]
    ) -> None:
        """Crawl URLs for full content; keeps only pages with real content.

        Chooses the crawl strategy based on the current asyncio state:
        inside a running event loop it cannot call ``asyncio.run``, so it
        falls back to per-URL synchronous crawling.
        """
        try:
            loop = None
            try:
                loop = asyncio.get_running_loop()
            except RuntimeError:
                # No running loop — safe to drive the async crawler below.
                pass
            if loop and loop.is_running():
                # We're inside an async context — use sync fallback
                for url in urls[: self.max_crawl_urls]:
                    cr = self.crawler.crawl_sync(url)
                    if cr.has_content:
                        result.crawled_pages.append(cr)
            else:
                crawl_results = asyncio.run(
                    self.crawler.crawl_many(urls[: self.max_crawl_urls])
                )
                result.crawled_pages.extend(
                    cr for cr in crawl_results if cr.has_content
                )
        except Exception as exc:  # noqa: BLE001
            logger.warning("Crawling failed: %s", exc)

    def _run_pdf_extraction(
        self, result: WebSearchAgentResult, urls: list[str]
    ) -> None:
        """Extract text from up to 5 PDF URLs; per-URL failures are logged."""
        for url in urls[:5]:
            try:
                pdf = self.pdf_extractor.extract_from_url(url)
                if pdf.has_content:
                    result.pdf_extractions.append(pdf)
            except Exception as exc:  # noqa: BLE001
                logger.warning("PDF extraction failed for %s: %s", url, exc)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _generate_queries(topic: str) -> list[str]:
        """Generate default search queries (topic, survey, SOTA) from a topic."""
        queries = [
            topic,
            f"{topic} survey",
            f"{topic} benchmark state of the art",
        ]
        return queries

    def _select_urls_to_crawl(self, result: WebSearchAgentResult) -> list[str]:
        """Select up to ``max_crawl_urls`` unique non-PDF URLs from web results."""
        urls = []
        seen = set()
        for r in result.web_results:
            if r.url and r.url not in seen:
                # Skip PDF URLs (handled separately) and common non-content sites
                if r.url.endswith(".pdf"):
                    continue
                seen.add(r.url)
                urls.append(r.url)
                if len(urls) >= self.max_crawl_urls:
                    break
        return urls

    @staticmethod
    def _find_pdf_urls(result: WebSearchAgentResult) -> list[str]:
        """Find up to 3 unique PDF URLs among the web search results."""
        pdf_urls = []
        seen = set()
        for r in result.web_results:
            if r.url and r.url.endswith(".pdf") and r.url not in seen:
                seen.add(r.url)
                pdf_urls.append(r.url)
                if len(pdf_urls) >= 3:
                    break
        return pdf_urls
    def __init__(
        self,
        *,
        timeout: int = 30,
        max_content_length: int = 50_000,
        user_agent: str = "ResearchClaw/0.5 (Academic Research Bot)",
    ) -> None:
        # Request timeout in seconds (used by the urllib fallback).
        self.timeout = timeout
        # Extracted markdown beyond this length is truncated with a marker.
        self.max_content_length = max_content_length
        # Sent as the User-Agent header by the urllib fallback.
        self.user_agent = user_agent

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    async def crawl(self, url: str) -> CrawlResult:
        """Crawl a URL and return Markdown content (async).

        Order of attempts: SSRF check → Crawl4AI → urllib fallback.
        Never raises: all failures are captured in ``CrawlResult.error``.
        """
        err = check_url_ssrf(url)
        if err:
            return CrawlResult(url=url, success=False, error=err, elapsed_seconds=0.0)
        t0 = time.monotonic()
        try:
            return await self._crawl_with_crawl4ai(url, t0)
        except Exception as exc:  # noqa: BLE001
            logger.debug("Crawl4AI failed for %s (%s), trying urllib fallback", url, exc)
            try:
                return self._crawl_with_urllib(url, t0)
            except Exception as exc2:  # noqa: BLE001
                elapsed = time.monotonic() - t0
                logger.warning("All crawl backends failed for %s: %s", url, exc2)
                return CrawlResult(url=url, success=False, error=str(exc2), elapsed_seconds=elapsed)

    def crawl_sync(self, url: str) -> CrawlResult:
        """Synchronous crawl — tries Crawl4AI via asyncio.run, falls back to urllib.

        Must NOT be called from inside a running event loop (asyncio.run
        would raise; the except clause then routes to the urllib fallback).
        """
        err = check_url_ssrf(url)
        if err:
            return CrawlResult(url=url, success=False, error=err, elapsed_seconds=0.0)
        t0 = time.monotonic()
        try:
            return asyncio.run(self._crawl_with_crawl4ai(url, t0))
        except Exception:  # noqa: BLE001
            try:
                return self._crawl_with_urllib(url, t0)
            except Exception as exc:  # noqa: BLE001
                elapsed = time.monotonic() - t0
                return CrawlResult(url=url, success=False, error=str(exc), elapsed_seconds=elapsed)

    async def crawl_many(self, urls: list[str]) -> list[CrawlResult]:
        """Crawl multiple URLs using Crawl4AI's async engine.

        One browser session is shared across all URLs; each URL is
        SSRF-checked and failures are recorded per-URL. If Crawl4AI is not
        importable, every URL is fetched via the urllib fallback instead.
        """
        results = []
        try:
            from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig

            browser_config = BrowserConfig(headless=True)
            run_config = CrawlerRunConfig(
                word_count_threshold=10,
                excluded_tags=["nav", "footer", "header", "sidebar"],
                remove_overlay_elements=True,
            )
            async with AsyncWebCrawler(config=browser_config) as crawler:
                for url in urls:
                    err = check_url_ssrf(url)
                    if err:
                        results.append(CrawlResult(url=url, success=False, error=err, elapsed_seconds=0.0))
                        continue
                    t0 = time.monotonic()
                    try:
                        raw = await crawler.arun(url=url, config=run_config)
                        elapsed = time.monotonic() - t0
                        if raw.success:
                            md = self._extract_markdown(raw)
                            results.append(CrawlResult(
                                url=url,
                                markdown=md,
                                title=getattr(raw, "title", "") or "",
                                success=True,
                                elapsed_seconds=elapsed,
                                metadata=raw.metadata if hasattr(raw, "metadata") and raw.metadata else {},
                            ))
                        else:
                            results.append(CrawlResult(
                                url=url,
                                success=False,
                                error=getattr(raw, "error_message", "crawl failed"),
                                elapsed_seconds=elapsed,
                            ))
                    except Exception as exc:  # noqa: BLE001
                        elapsed = time.monotonic() - t0
                        results.append(CrawlResult(url=url, success=False, error=str(exc), elapsed_seconds=elapsed))
        except ImportError:
            # Crawl4AI browser not set up — use urllib for each
            for url in urls:
                err = check_url_ssrf(url)
                if err:
                    results.append(CrawlResult(url=url, success=False, error=err, elapsed_seconds=0.0))
                    continue
                t0 = time.monotonic()
                try:
                    results.append(self._crawl_with_urllib(url, t0))
                except Exception as exc:  # noqa: BLE001
                    elapsed = time.monotonic() - t0
                    results.append(CrawlResult(url=url, success=False, error=str(exc), elapsed_seconds=elapsed))
        return results
await crawler.arun(url=url, config=run_config) elapsed = time.monotonic() - t0 if raw.success: md = self._extract_markdown(raw) return CrawlResult( url=url, markdown=md, title=getattr(raw, "title", "") or "", success=True, elapsed_seconds=elapsed, metadata=raw.metadata if hasattr(raw, "metadata") and raw.metadata else {}, ) return CrawlResult( url=url, success=False, error=getattr(raw, "error_message", "Unknown crawl4ai error"), elapsed_seconds=elapsed, ) def _extract_markdown(self, raw: Any) -> str: """Extract markdown from a Crawl4AI result object.""" # Crawl4AI v0.8+ uses markdown_v2.raw_markdown md = "" if hasattr(raw, "markdown_v2") and raw.markdown_v2: md = getattr(raw.markdown_v2, "raw_markdown", "") or "" if not md and hasattr(raw, "markdown"): md = raw.markdown or "" if len(md) > self.max_content_length: md = md[: self.max_content_length] + "\n\n[... truncated]" return md # ------------------------------------------------------------------ # urllib fallback (lightweight, no browser needed) # ------------------------------------------------------------------ def _crawl_with_urllib(self, url: str, t0: float) -> CrawlResult: """Lightweight fallback: fetch HTML and strip tags.""" req = Request(url, headers={"User-Agent": self.user_agent}) resp = urlopen(req, timeout=self.timeout) # noqa: S310 content_type = resp.headers.get("Content-Type", "") raw = resp.read() encoding = "utf-8" if "charset=" in content_type: encoding = content_type.split("charset=")[-1].split(";")[0].strip() html = raw.decode(encoding, errors="replace") title_match = re.search(r"]*>(.*?)", html, re.DOTALL | re.IGNORECASE) title = title_match.group(1).strip() if title_match else "" markdown = self._html_to_markdown(html) if len(markdown) > self.max_content_length: markdown = markdown[: self.max_content_length] + "\n\n[... 
truncated]" elapsed = time.monotonic() - t0 return CrawlResult( url=url, markdown=markdown, title=title, success=bool(markdown.strip()), elapsed_seconds=elapsed, ) @staticmethod def _html_to_markdown(html: str) -> str: """Best-effort HTML → Markdown conversion via regex.""" text = re.sub(r"<(script|style|noscript)[^>]*>.*?", "", html, flags=re.DOTALL | re.IGNORECASE) text = re.sub(r"]*>(.*?)", r"\n# \1\n", text, flags=re.DOTALL | re.IGNORECASE) text = re.sub(r"]*>(.*?)", r"\n## \1\n", text, flags=re.DOTALL | re.IGNORECASE) text = re.sub(r"]*>(.*?)", r"\n### \1\n", text, flags=re.DOTALL | re.IGNORECASE) text = re.sub(r"]*>(.*?)", r"\n- \1", text, flags=re.DOTALL | re.IGNORECASE) text = re.sub(r"]*>(.*?)

", r"\n\1\n", text, flags=re.DOTALL | re.IGNORECASE) text = re.sub(r"", "\n", text, flags=re.IGNORECASE) text = re.sub(r"]*href=[\"']([^\"']*)[\"'][^>]*>(.*?)", r"[\2](\1)", text, flags=re.DOTALL | re.IGNORECASE) text = re.sub(r"<[^>]+>", "", text) import html as _html text = _html.unescape(text) text = re.sub(r"\n{3,}", "\n\n", text) text = re.sub(r" {2,}", " ", text) return text.strip() ================================================ FILE: researchclaw/web/pdf_extractor.py ================================================ """PDF full-text extraction powered by PyMuPDF (fitz). PyMuPDF is installed as a dependency and provides fast, high-quality PDF text extraction with metadata, section detection, and table support. Usage:: extractor = PDFExtractor() result = extractor.extract("/path/to/paper.pdf") print(result.text[:1000]) """ from __future__ import annotations import logging import re import tempfile from dataclasses import dataclass, field from pathlib import Path from typing import Any from urllib.request import Request, urlopen from researchclaw.web._ssrf import check_url_ssrf try: import fitz # PyMuPDF HAS_FITZ = True except ImportError: fitz = None # type: ignore[assignment] HAS_FITZ = False logger = logging.getLogger(__name__) @dataclass class PDFContent: """Extracted content from a PDF file.""" path: str text: str = "" title: str = "" authors: list[str] = field(default_factory=list) abstract: str = "" sections: list[dict[str, str]] = field(default_factory=list) page_count: int = 0 success: bool = False error: str = "" backend: str = "pymupdf" metadata: dict[str, Any] = field(default_factory=dict) @property def has_content(self) -> bool: return bool(self.text and len(self.text.strip()) > 100) class PDFExtractor: """PDF text extraction using PyMuPDF. Parameters ---------- max_pages: Maximum pages to extract (0 = all). extract_sections: Whether to attempt section boundary detection. 
class PDFExtractor:
    """PDF text extraction using PyMuPDF.

    Parameters
    ----------
    max_pages:
        Maximum pages to extract (0 = all).
    extract_sections:
        Whether to attempt section boundary detection.
    """

    def __init__(
        self,
        *,
        max_pages: int = 0,
        extract_sections: bool = True,
    ) -> None:
        self.max_pages = max_pages
        self.extract_sections = extract_sections

    @property
    def backend(self) -> str:
        # Only one backend is supported; reported for diagnostics.
        return "pymupdf"

    def extract(self, path: str | Path) -> PDFContent:
        """Extract text from a local PDF file using PyMuPDF.

        Returns a PDFContent; on any failure `success` stays False and
        `error` carries the reason (never raises).
        """
        if not HAS_FITZ:
            return PDFContent(
                path=str(path),
                error="PyMuPDF not installed. Install: pip install 'researchclaw[pdf]'",
            )
        path = Path(path)
        # exists() itself can raise on unreadable mounts — treat as missing.
        try:
            _exists = path.exists()
        except (PermissionError, OSError):
            _exists = False
        if not _exists:
            return PDFContent(path=str(path), error=f"File not found: {path}")
        try:
            with fitz.open(str(path)) as doc:
                pages_to_read = doc.page_count
                if self.max_pages > 0:
                    pages_to_read = min(pages_to_read, self.max_pages)
                all_text = []
                for i in range(pages_to_read):
                    page = doc[i]
                    all_text.append(page.get_text())
                full_text = "\n".join(all_text)
                meta = doc.metadata or {}
                title = meta.get("title", "")
                author = meta.get("author", "")
                # PDF metadata packs all authors into one comma-separated string.
                authors = [a.strip() for a in author.split(",")] if author else []
                abstract = self._extract_abstract(full_text)
                sections = self._detect_sections(full_text) if self.extract_sections else []
                page_count = doc.page_count
            return PDFContent(
                path=str(path),
                text=full_text,
                title=title,
                authors=authors,
                abstract=abstract,
                sections=sections,
                page_count=page_count,
                success=True,
                metadata=meta,
            )
        except Exception as exc:  # noqa: BLE001
            logger.warning("PDF extraction failed for %s: %s", path, exc)
            return PDFContent(path=str(path), error=str(exc))

    def extract_from_url(self, url: str) -> PDFContent:
        """Download a PDF from URL and extract text.

        The URL is SSRF-checked first; the download lands in a temp file
        that is always removed afterwards.
        """
        err = check_url_ssrf(url)
        if err:
            return PDFContent(path=url, error=err)
        tmp_path = None
        try:
            req = Request(url, headers={
                "User-Agent": "ResearchClaw/0.5 (Academic Research Bot)"
            })
            # BUG FIX: close the HTTP response deterministically (it was
            # previously left open, leaking the socket).
            with urlopen(req, timeout=30) as resp:  # noqa: S310
                data = resp.read()
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
                f.write(data)
                tmp_path = f.name
            result = self.extract(tmp_path)
            # Report the original URL rather than the throwaway temp path.
            result.path = url
            return result
        except Exception as exc:  # noqa: BLE001
            logger.warning("PDF download failed for %s: %s", url, exc)
            return PDFContent(path=url, error=str(exc))
        finally:
            if tmp_path:
                Path(tmp_path).unlink(missing_ok=True)

    # ------------------------------------------------------------------
    # Section detection
    # ------------------------------------------------------------------

    @staticmethod
    def _extract_abstract(text: str) -> str:
        """Extract abstract from paper text.

        Tries the common layout 'Abstract ... 1 Introduction' first, then
        falls back to the first blank-line-terminated block after 'Abstract'.
        """
        match = re.search(
            r"(?:^|\n)\s*Abstract\s*\n(.*?)(?=\n\s*(?:\d+\.?\s+)?(?:Introduction|1\s))",
            text,
            re.DOTALL | re.IGNORECASE,
        )
        if match:
            return match.group(1).strip()
        match = re.search(
            r"(?:^|\n)\s*Abstract[:\s]*\n?(.*?)(?:\n\n|\n\s*\n)",
            text,
            re.DOTALL | re.IGNORECASE,
        )
        if match:
            return match.group(1).strip()
        return ""

    @staticmethod
    def _detect_sections(text: str) -> list[dict[str, str]]:
        """Detect section boundaries in paper text.

        Headings are numbered lines like '1 Introduction'; each section body
        runs to the next heading (truncated to 5000 chars).
        """
        sections: list[dict[str, str]] = []
        pattern = re.compile(r"(?:^|\n)\s*(\d+\.?\s+[A-Z][^\n]{2,50})\s*\n", re.MULTILINE)
        matches = list(pattern.finditer(text))
        for i, match in enumerate(matches):
            heading = match.group(1).strip()
            start = match.end()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
            body = text[start:end].strip()
            sections.append({"heading": heading, "text": body[:5000]})
        return sections
try:
    from scholarly import scholarly, ProxyGenerator
    HAS_SCHOLARLY = True
except ImportError:
    scholarly = None  # type: ignore[assignment]
    ProxyGenerator = None  # type: ignore[assignment,misc]
    HAS_SCHOLARLY = False

logger = logging.getLogger(__name__)


@dataclass
class ScholarPaper:
    """A paper result from Google Scholar."""

    title: str
    authors: list[str] = field(default_factory=list)
    year: int = 0
    abstract: str = ""
    citation_count: int = 0
    url: str = ""
    scholar_id: str = ""
    venue: str = ""
    source: str = "google_scholar"

    def to_dict(self) -> dict[str, Any]:
        # Serialise every field in declaration order.
        return dict(
            title=self.title,
            authors=self.authors,
            year=self.year,
            abstract=self.abstract,
            citation_count=self.citation_count,
            url=self.url,
            scholar_id=self.scholar_id,
            venue=self.venue,
            source=self.source,
        )

    def to_literature_paper(self) -> Any:
        """Convert to researchclaw.literature.models.Paper."""
        from researchclaw.literature.models import Author, Paper

        # Stable fallback id derived from the title when Scholar gave none.
        fallback_id = f"gs-{hashlib.sha256(self.title.encode()).hexdigest()[:8]}"
        return Paper(
            paper_id=self.scholar_id or fallback_id,
            title=self.title,
            authors=tuple(Author(name=name) for name in self.authors),
            year=self.year,
            abstract=self.abstract,
            venue=self.venue,
            citation_count=self.citation_count,
            url=self.url,
            source="google_scholar",
        )


class GoogleScholarClient:
    """Google Scholar search client using the ``scholarly`` library.

    Parameters
    ----------
    inter_request_delay:
        Seconds between requests to avoid rate limiting.
    use_proxy:
        Whether to set up a free proxy to reduce blocking risk.
    """

    def __init__(
        self,
        *,
        inter_request_delay: float = 2.0,
        use_proxy: bool = False,
    ) -> None:
        if not HAS_SCHOLARLY:
            raise ImportError(
                "scholarly is required for Google Scholar search. "
                "Install: pip install 'researchclaw[web]'"
            )
        self.delay = inter_request_delay
        self._last_request_time: float = 0.0
        if use_proxy:
            # Proxy setup is best-effort: failure only means a higher
            # chance of Scholar blocking us, not a hard error.
            try:
                proxy_gen = ProxyGenerator()
                proxy_gen.FreeProxies()
                scholarly.use_proxy(proxy_gen)
                logger.info("Google Scholar: proxy enabled")
            except Exception as exc:  # noqa: BLE001
                logger.warning("Failed to set up proxy: %s", exc)

    @property
    def available(self) -> bool:
        """Always True — scholarly is installed as a dependency."""
        return True

    def search(self, query: str, *, limit: int = 10) -> list[ScholarPaper]:
        """Search Google Scholar for papers matching query."""
        self._rate_limit()
        papers: list[ScholarPaper] = []
        try:
            for index, pub in enumerate(scholarly.search_pubs(query)):
                if index >= limit:
                    break
                papers.append(self._parse_pub(pub))
                # Pause between result fetches, but not after the last one.
                if index < limit - 1:
                    self._rate_limit()
            logger.info("Google Scholar: found %d papers for %r", len(papers), query)
        except Exception as exc:  # noqa: BLE001
            logger.warning("Google Scholar search failed: %s", exc)
        return papers

    def get_citations(self, scholar_id: str, *, limit: int = 20) -> list[ScholarPaper]:
        """Get papers that cite the given paper (citation graph traversal)."""
        self._rate_limit()
        papers: list[ScholarPaper] = []
        try:
            pub = scholarly.search_single_pub(scholar_id)
            if pub:
                for index, citing in enumerate(scholarly.citedby(pub)):
                    if index >= limit:
                        break
                    papers.append(self._parse_pub(citing))
                    if index < limit - 1:
                        self._rate_limit()
            logger.info("Google Scholar: found %d citations for %s", len(papers), scholar_id)
        except Exception as exc:  # noqa: BLE001
            logger.warning("Citation retrieval failed for %s: %s", scholar_id, exc)
        return papers

    def search_author(self, name: str) -> list[dict[str, Any]]:
        """Search for an author on Google Scholar (at most 5 matches)."""
        self._rate_limit()
        try:
            found: list[dict[str, Any]] = []
            for author in scholarly.search_author(name):
                found.append({
                    "name": author.get("name", ""),
                    "affiliation": author.get("affiliation", ""),
                    "scholar_id": author.get("scholar_id", ""),
                    "citedby": author.get("citedby", 0),
                    "interests": author.get("interests", []),
                })
                if len(found) >= 5:
                    break
            return found
        except Exception as exc:  # noqa: BLE001
            logger.warning("Author search failed for %s: %s", name, exc)
            return []

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    def _rate_limit(self) -> None:
        # Sleep just long enough to keep `self.delay` seconds between calls.
        remaining = self.delay - (time.monotonic() - self._last_request_time)
        if remaining > 0:
            time.sleep(remaining)
        self._last_request_time = time.monotonic()

    @staticmethod
    def _parse_pub(pub: Any) -> ScholarPaper:
        """Parse a scholarly publication object into ScholarPaper."""
        # scholarly may hand back plain dicts or objects with a `bib` attr.
        if isinstance(pub, dict):
            bib = pub.get("bib", {})
            info = pub
        else:
            bib = getattr(pub, "bib", {})
            info = pub.__dict__ if hasattr(pub, "__dict__") else {}
        authors = bib.get("author", [])
        if isinstance(authors, str):
            # BibTeX-style author string: "A One and B Two".
            authors = [part.strip() for part in authors.split(" and ")]
        try:
            year = int(bib.get("pub_year", bib.get("year", 0)))
        except (ValueError, TypeError):
            year = 0
        cites_id = info.get("cites_id", [])
        fallback_id = cites_id[0] if isinstance(cites_id, list) and cites_id else ""
        scholar_id = info.get("author_pub_id", "") or fallback_id
        return ScholarPaper(
            title=bib.get("title", ""),
            authors=authors,
            year=year,
            abstract=bib.get("abstract", ""),
            citation_count=info.get("num_citations", 0),
            url=info.get("pub_url", info.get("eprint_url", "")),
            scholar_id=scholar_id,
            venue=bib.get("venue", bib.get("journal", "")),
        )
logger = logging.getLogger(__name__)


@dataclass
class SearchResult:
    """A single web search result."""

    title: str
    url: str
    snippet: str = ""
    content: str = ""
    score: float = 0.0
    source: str = ""  # "tavily" | "duckduckgo"

    def to_dict(self) -> dict[str, Any]:
        return {
            "title": self.title,
            "url": self.url,
            "snippet": self.snippet,
            "content": self.content,
            "score": self.score,
            "source": self.source,
        }


@dataclass
class WebSearchResponse:
    """Response from a web search query."""

    query: str
    results: list[SearchResult] = field(default_factory=list)
    answer: str = ""  # Tavily can provide a direct AI answer
    elapsed_seconds: float = 0.0
    source: str = ""  # "tavily" | "duckduckgo"

    @property
    def has_results(self) -> bool:
        return len(self.results) > 0


class WebSearchClient:
    """General-purpose web search client.

    Uses Tavily (installed) as primary engine. Falls back to DuckDuckGo
    HTML scraping only if no Tavily API key is available.

    Parameters
    ----------
    api_key:
        Tavily API key. Falls back to ``TAVILY_API_KEY`` env var.
    max_results:
        Default number of results per query.
    search_depth:
        Tavily search depth: "basic" or "advanced".
    include_answer:
        Whether to request Tavily's AI-generated answer.
    """

    def __init__(
        self,
        *,
        api_key: str = "",
        max_results: int = 10,
        search_depth: str = "advanced",
        include_answer: bool = True,
    ) -> None:
        self.api_key = api_key or os.environ.get("TAVILY_API_KEY", "")
        self.max_results = max_results
        self.search_depth = search_depth
        self.include_answer = include_answer

    def search(
        self,
        query: str,
        *,
        max_results: int | None = None,
        include_domains: list[str] | None = None,
        exclude_domains: list[str] | None = None,
    ) -> WebSearchResponse:
        """Search the web for a query.

        Tries Tavily when a key is configured; any Tavily failure (or no
        key) falls through to the DuckDuckGo scrape.
        """
        limit = max_results or self.max_results
        t0 = time.monotonic()
        # Tavily is the primary engine
        if self.api_key:
            try:
                return self._search_tavily(query, limit, include_domains, exclude_domains, t0)
            except Exception as exc:  # noqa: BLE001
                logger.warning("Tavily search failed, falling back to DuckDuckGo: %s", exc)
        return self._search_duckduckgo(query, limit, t0)

    def search_multi(
        self,
        queries: list[str],
        *,
        max_results: int | None = None,
        inter_query_delay: float = 1.0,
    ) -> list[WebSearchResponse]:
        """Run multiple search queries with cross-query deduplication."""
        responses = []
        seen_urls: set[str] = set()
        for i, query in enumerate(queries):
            if i > 0:
                time.sleep(inter_query_delay)
            resp = self.search(query, max_results=max_results)
            # Drop any URL already returned by an earlier query.
            unique_results = [r for r in resp.results if r.url not in seen_urls]
            seen_urls.update(r.url for r in unique_results)
            resp.results = unique_results
            responses.append(resp)
        return responses

    # ------------------------------------------------------------------
    # Tavily backend (primary — uses installed tavily-python SDK)
    # ------------------------------------------------------------------

    def _search_tavily(
        self,
        query: str,
        limit: int,
        include_domains: list[str] | None,
        exclude_domains: list[str] | None,
        t0: float,
    ) -> WebSearchResponse:
        """Search using Tavily API (installed SDK)."""
        from tavily import TavilyClient

        client = TavilyClient(api_key=self.api_key)
        kwargs: dict[str, Any] = {
            "query": query,
            "max_results": limit,
            "search_depth": self.search_depth,
            "include_answer": self.include_answer,
        }
        if include_domains:
            kwargs["include_domains"] = include_domains
        if exclude_domains:
            kwargs["exclude_domains"] = exclude_domains
        response = client.search(**kwargs)
        elapsed = time.monotonic() - t0
        results = []
        for item in response.get("results", []):
            results.append(SearchResult(
                title=item.get("title", ""),
                url=item.get("url", ""),
                snippet=item.get("content", "")[:500],
                content=item.get("content", ""),
                score=item.get("score", 0.0),
                source="tavily",
            ))
        return WebSearchResponse(
            query=query,
            results=results,
            answer=response.get("answer", ""),
            elapsed_seconds=elapsed,
            source="tavily",
        )

    # ------------------------------------------------------------------
    # DuckDuckGo fallback (no API key needed)
    # ------------------------------------------------------------------

    def _search_duckduckgo(
        self, query: str, limit: int, t0: float
    ) -> WebSearchResponse:
        """Fallback: scrape DuckDuckGo HTML search results."""
        encoded = quote_plus(query)
        url = f"https://html.duckduckgo.com/html/?q={encoded}"
        req = Request(url, headers={
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/120.0.0.0 Safari/537.36",
        })
        try:
            # BUG FIX: close the HTTP response deterministically (it was
            # previously left open, leaking the socket).
            with urlopen(req, timeout=15) as resp:  # noqa: S310
                html = resp.read().decode("utf-8", errors="replace")
        except Exception as exc:  # noqa: BLE001
            elapsed = time.monotonic() - t0
            logger.warning("DuckDuckGo search failed: %s", exc)
            return WebSearchResponse(query=query, elapsed_seconds=elapsed, source="duckduckgo")
        results = self._parse_ddg_html(html, limit)
        elapsed = time.monotonic() - t0
        return WebSearchResponse(query=query, results=results, elapsed_seconds=elapsed, source="duckduckgo")

    @staticmethod
    def _parse_ddg_html(html: str, limit: int) -> list[SearchResult]:
        """Parse DuckDuckGo HTML results page.

        BUG FIX: the previous patterns were missing the ``<a`` opener and
        ``</a>`` closer and therefore could never match a result anchor;
        the intended patterns are reconstructed below.
        """
        results = []
        link_pattern = re.compile(
            r'<a[^>]*class="result__a"[^>]*href="([^"]*)"[^>]*>(.*?)</a>',
            re.DOTALL,
        )
        snippet_pattern = re.compile(
            r'<a[^>]*class="result__snippet"[^>]*>(.*?)</a>',
            re.DOTALL,
        )
        links = link_pattern.findall(html)
        snippets = snippet_pattern.findall(html)
        for i, (url, title_html) in enumerate(links[:limit]):
            title = re.sub(r"<[^>]+>", "", title_html).strip()
            snippet = re.sub(r"<[^>]+>", "", snippets[i]).strip() if i < len(snippets) else ""
            if "duckduckgo.com" in url:
                # Extract actual URL from DDG redirect: //duckduckgo.com/l/?uddg=https%3A...
                from urllib.parse import urlparse as _urlparse, parse_qs as _parse_qs, unquote as _unquote
                _parsed_ddg = _urlparse(url)
                _uddg = _parse_qs(_parsed_ddg.query).get("uddg")
                if _uddg:
                    url = _unquote(_uddg[0])
                else:
                    # Redirect without a target — nothing usable.
                    continue
            results.append(SearchResult(title=title, url=url, snippet=snippet, source="duckduckgo"))
        return results
Research domain domains_str = self._ask( "Research domains (comma-separated: cv, nlp, rl, ml, ai4science)", default="ml", ) config["research"]["domains"] = [ d.strip() for d in domains_str.split(",") if d.strip() ] # 4. Experiment mode mode = self._choose( "Experiment mode", ["simulated", "docker", "sandbox"], default="docker", ) config["experiment"] = {"mode": mode} if mode == "docker": gpu = self._ask_yn("Enable GPU?", default=True) config["experiment"]["docker"] = { "gpu_enabled": gpu, "network_policy": "setup_only", } budget = self._ask("Time budget (seconds)", default="600") config["experiment"]["time_budget_sec"] = int(budget) # 5. LLM provider print("\n--- LLM Configuration ---") provider = self._choose( "LLM provider", ["openai-compatible", "acp"], default="openai-compatible", ) config["llm"] = {"provider": provider} if provider == "openai-compatible": base_url = self._ask("API base URL", default="https://api.openai.com/v1") api_key_env = self._ask("API key env var", default="OPENAI_API_KEY") model = self._ask("Model name", default="gpt-4o") config["llm"].update({ "base_url": base_url, "api_key_env": api_key_env, "primary_model": model, }) # 6. Output format conference = self._choose( "Target conference format", ["neurips_2025", "iclr_2025", "icml_2025", "arxiv"], default="neurips_2025", ) config["export"] = {"target_conference": conference} # 7. 
def run_web(self, steps: list[dict[str, Any]]) -> dict[str, Any]:
    """Process wizard steps from web interface.

    Each step is a dict with "key"/"value"; only the four known keys are
    mapped into the nested config, unknown keys are silently ignored.
    """
    config: dict[str, Any] = {}
    for step in steps:
        key = step.get("key", "")
        value = step.get("value", "")
        if key == "project_name":
            config.setdefault("project", {})["name"] = value
        elif key == "topic":
            config.setdefault("research", {})["topic"] = value
        elif key == "mode":
            config.setdefault("experiment", {})["mode"] = value
        elif key == "model":
            config.setdefault("llm", {})["primary_model"] = value
    return config

def _apply_template(self, name: str) -> dict[str, Any]:
    """Apply a preset template.

    Accepts either a short alias (quick/standard/advanced) or a full
    template name from TEMPLATES; returns {} when unknown.
    """
    mapping = {
        "quick": "quick-demo",
        "standard": "standard-cv",
        "advanced": "deep-nlp",
    }
    tpl_name = mapping.get(name, name)
    tpl = TEMPLATES.get(tpl_name)
    if not tpl:
        print(f"Unknown template: {name}")
        return {}
    config = self._template_to_config(tpl)
    print(f"Applied template: {tpl_name}")
    print(f" Description: {tpl.get('description', '')}")
    self._print_summary(config)
    return config

def _template_to_config(self, tpl: dict[str, Any]) -> dict[str, Any]:
    """Convert a flat template to nested config dict.

    Flat dotted keys ("experiment.docker.gpu_enabled") are expanded into
    nested dicts on top of a fixed set of defaults; the template's
    "description" entry is metadata and is skipped.
    """
    config: dict[str, Any] = {
        "project": {"name": "wizard-project", "mode": "full-auto"},
        "runtime": {"timezone": "UTC"},
        "notifications": {"channel": "console"},
        "knowledge_base": {"backend": "markdown", "root": "knowledge"},
        "research": {"topic": "Generated by wizard"},
        "llm": {"provider": "openai-compatible", "api_key_env": "OPENAI_API_KEY"},
    }
    for key, value in tpl.items():
        if key == "description":
            continue
        parts = key.split(".")
        d = config
        # Walk/create intermediate dicts, then set the leaf value.
        for p in parts[:-1]:
            d = d.setdefault(p, {})
        d[parts[-1]] = value
    return config

def _ask(self, prompt: str, default: str = "") -> str:
    # Free-text question; returns `default` on empty answer or Ctrl-C/EOF.
    suffix = f" [{default}]" if default else ""
    try:
        answer = input(f" {prompt}{suffix}: ").strip()
    except (EOFError, KeyboardInterrupt):
        print()
        return default
    return answer or default

def _ask_yn(self, prompt: str, default: bool = True) -> bool:
    # Yes/no question; anything other than y/yes/1/true counts as "no".
    suffix = " [Y/n]" if default else " [y/N]"
    try:
        answer = input(f" {prompt}{suffix}: ").strip().lower()
    except (EOFError, KeyboardInterrupt):
        print()
        return default
    if not answer:
        return default
    return answer in ("y", "yes", "1", "true")

def _choose(
    self,
    prompt: str,
    options: list[str],
    default: str = "",
) -> str:
    """Numbered-menu question: accepts a 1-based index or the literal
    option text; anything else falls back to `default`."""
    print(f" {prompt}:")
    for i, opt in enumerate(options, 1):
        marker = " *" if opt == default else ""
        print(f" {i}. {opt}{marker}")
    try:
        answer = input(f" Choice [default={default}]: ").strip()
    except (EOFError, KeyboardInterrupt):
        print()
        return default
    if not answer:
        return default
    try:
        idx = int(answer) - 1
        if 0 <= idx < len(options):
            return options[idx]
    except ValueError:
        # Non-numeric input: allow typing the option name itself.
        if answer in options:
            return answer
    return default

def _print_summary(self, config: dict[str, Any], indent: int = 2) -> None:
    # Render the config as YAML for human review.
    # NOTE(review): `indent` is currently unused — confirm intent.
    import yaml
    print(yaml.dump(config, default_flow_style=False, allow_unicode=True))
# Preset research configurations, keyed by template name. Values use flat
# dotted keys that the wizard expands into a nested config dict; the
# "description" entry is human-facing metadata only.
TEMPLATES: dict[str, dict[str, Any]] = {
    "quick-demo": {
        "description": "5-minute quick demo (simulated mode, no GPU needed)",
        "experiment.mode": "simulated",
        "experiment.time_budget_sec": 60,
        "experiment.max_iterations": 3,
    },
    "standard-cv": {
        "description": "Standard Computer Vision paper (Docker + CIFAR-10)",
        "research.domains": ["computer-vision"],
        "experiment.mode": "docker",
        "experiment.time_budget_sec": 600,
        "experiment.docker.gpu_enabled": True,
        "experiment.docker.network_policy": "setup_only",
    },
    "deep-nlp": {
        "description": "Deep NLP research (Docker + GPU + transformers)",
        "research.domains": ["nlp", "transformers"],
        "experiment.mode": "docker",
        "experiment.time_budget_sec": 1200,
        "experiment.docker.gpu_enabled": True,
        "experiment.docker.memory_limit_mb": 16384,
    },
    "rl-research": {
        "description": "Reinforcement Learning research (Docker + custom env)",
        "research.domains": ["reinforcement-learning"],
        "experiment.mode": "docker",
        "experiment.time_budget_sec": 900,
        "experiment.docker.gpu_enabled": True,
    },
    "ai4science": {
        "description": "AI for Science (large compute budget)",
        "research.domains": ["ai4science"],
        "experiment.mode": "docker",
        "experiment.time_budget_sec": 1800,
        "experiment.docker.gpu_enabled": True,
        "experiment.docker.memory_limit_mb": 32768,
    },
}


def get_template(name: str) -> dict[str, Any] | None:
    """Get a template by name; None when the name is unknown."""
    return TEMPLATES[name] if name in TEMPLATES else None


def list_templates() -> list[dict[str, str]]:
    """List all available templates with descriptions."""
    catalogue: list[dict[str, str]] = []
    for template_name, template in TEMPLATES.items():
        catalogue.append({
            "name": template_name,
            "description": template.get("description", ""),
        })
    return catalogue
@dataclass
class EnvironmentReport:
    """Report of detected environment capabilities."""

    has_gpu: bool = False
    gpu_name: str = ""
    gpu_vram_gb: float = 0.0
    has_docker: bool = False
    docker_version: str = ""
    has_python: bool = True
    python_version: str = ""
    has_latex: bool = False
    available_memory_gb: float = 0.0
    recommendations: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        return {
            "has_gpu": self.has_gpu,
            "gpu_name": self.gpu_name,
            "gpu_vram_gb": self.gpu_vram_gb,
            "has_docker": self.has_docker,
            "docker_version": self.docker_version,
            "has_python": self.has_python,
            "python_version": self.python_version,
            "has_latex": self.has_latex,
            "available_memory_gb": round(self.available_memory_gb, 1),
            "recommendations": self.recommendations,
        }


def detect_environment() -> EnvironmentReport:
    """Detect local environment and generate recommendations.

    Every probe is best-effort: a failure in one detector leaves its
    fields at their defaults and never aborts detection.
    """
    import sys
    import subprocess

    report = EnvironmentReport()
    report.python_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"

    # Docker
    if shutil.which("docker"):
        report.has_docker = True
        try:
            result = subprocess.run(
                ["docker", "--version"], capture_output=True, text=True, timeout=5
            )
            report.docker_version = result.stdout.strip()
        except Exception:
            pass

    # GPU — BUG FIX: importing torch (or initialising CUDA) on a broken
    # install can raise OSError/RuntimeError, not just ImportError, so
    # catch broadly to keep detection from crashing.
    try:
        import torch
        if torch.cuda.is_available():
            report.has_gpu = True
            report.gpu_name = torch.cuda.get_device_name(0)
            report.gpu_vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
    except Exception:  # noqa: BLE001
        pass

    # LaTeX
    report.has_latex = shutil.which("pdflatex") is not None

    # Memory — same broad guard: psutil may be absent or fail at runtime.
    try:
        import psutil
        report.available_memory_gb = psutil.virtual_memory().available / (1024**3)
    except Exception:  # noqa: BLE001
        pass

    # Recommendations
    if not report.has_docker:
        report.recommendations.append(
            "Install Docker for experiment isolation (recommended)"
        )
    if not report.has_gpu:
        report.recommendations.append(
            "No GPU detected — use 'simulated' mode or remote GPU server"
        )
    if not report.has_latex:
        report.recommendations.append(
            "Install LaTeX (texlive) for PDF paper export"
        )
    if report.has_gpu and report.has_docker:
        report.recommendations.append(
            "Environment ready for full Docker GPU experiments"
        )
    return report
# Curated writing guidance, keyed by paper section / topic. Consumed by
# format_writing_tips() below for injection into prompts.
CONFERENCE_WRITING_TIPS: dict[str, list[str]] = {
    "title": [
        "Signal novelty — title should hint at what is new",
        "Be specific and concrete, under 15 words",
        "No abbreviations unless universally known",
        "Pattern: '[Finding]: [Evidence]' or '[Method]: [What it does]'",
        "Memeability test: would a reader enjoy telling a colleague about this?",
    ],
    "abstract": [
        "5-sentence structure: (1) problem, (2) prior approaches + limitations, "
        "(3) your approach + novelty, (4) key results with numbers, (5) implication",
        "150-250 words for ML conferences",
        "Include at least 2 specific quantitative results",
    ],
    "figure_1": [
        "Most important figure in the paper — many readers look at Figure 1 first",
        "Should convey the key idea or main result at a glance",
        "Invest significant time in this figure",
    ],
    "introduction": [
        "State contributions clearly as bullet points",
        "Many reviewers stop reading carefully after the intro",
        "Include paper organization paragraph at the end",
    ],
    "experiments": [
        "Strong baselines: tune baselines with the same effort as your method",
        "Ablations: remove one component at a time and measure the effect",
        "Reproducibility: include hyperparameters, seeds, hardware specs",
        "Statistical rigor: report variance, run multiple seeds",
    ],
    "common_rejections": [
        "Weak baselines (79% of rejected papers)",
        "Missing ablations",
        "Overclaiming beyond evidence",
        "Poor reproducibility details",
        "Ignoring limitations",
    ],
    "rebuttal": [
        "Start with positives reviewers identified",
        "Quote reviewers directly, then respond",
        "Provide new data/experiments rather than arguing",
        "Do not promise — deliver",
    ],
}


def format_writing_tips(categories: list[str] | None = None) -> str:
    """Format writing tips as a prompt-injectable string.

    Parameters
    ----------
    categories:
        Subset of tip categories to include. If *None*, include all.

    Returns
    -------
    str
        Formatted markdown-style tips block.
    """
    selected = categories if categories else list(CONFERENCE_WRITING_TIPS.keys())
    rendered: list[str] = ["## Conference Writing Best Practices"]
    for category in selected:
        tips = CONFERENCE_WRITING_TIPS.get(category, [])
        # Unknown categories contribute nothing rather than an empty heading.
        if not tips:
            continue
        heading = category.replace("_", " ").title()
        rendered.append(f"\n### {heading}")
        rendered.extend(f"- {tip}" for tip in tips)
    return "\n".join(rendered)
""" import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt import matplotlib.patches as mpatches import numpy as np from pathlib import Path # ── Styling ────────────────────────────────────────────────────────────────── plt.rcParams.update({ "font.family": "serif", "font.size": 11, "axes.titlesize": 13, "axes.labelsize": 11, "figure.facecolor": "white", "axes.facecolor": "#FAFAFA", "axes.grid": True, "grid.alpha": 0.3, "grid.linestyle": "--", }) BLUE = "#1565C0" GREEN = "#2E7D32" RED = "#C62828" ORANGE = "#E65100" PURPLE = "#6A1B9A" GRAY = "#757575" # ── Data ───────────────────────────────────────────────────────────────────── # Case 1: Continual Meta-Learning for Few-Shot Adaptation case1_iters = [0, 1, 2, 3, 4] case1_labels = [ "Baseline\n(Initial Code)", "Iter 1\n(Deep Encoder\n+ Meta-SGD)", "Iter 2\n(Prototype Net\n— Regression)", "Iter 3\n(Linear Clf\n+ L2 Anchor)", "Iter 4\n(Converged)", ] case1_error = [0.7411, 0.1883, 0.2249, 0.0663, 0.0656] case1_accuracy = [100 * (1 - e) for e in case1_error] # Marker styles: green=improved, red=regressed, gray=no change case1_colors = [GRAY, GREEN, RED, GREEN, GRAY] case1_improved = [None, True, False, True, None] # Case 2: RLHF + Curriculum-Based Reward Shaping case2_iters = [0, 1, 2, 3, 4] case2_labels = [ "Baseline\n(Vanilla PPO)", "Iter 1\n(No Change)", "Iter 2\n(+Reward Model\n+Curriculum)", "Iter 3\n(+Rank-Norm\n+Policy EMA)", "Iter 4\n(+Confidence\nGating)", ] case2_error = [0.6443, 0.6443, 0.3843, 0.3696, 0.3344] case2_alignment = [100 * (1 - e) for e in case2_error] case2_colors = [GRAY, GRAY, GREEN, GREEN, GREEN] # ── Figure ─────────────────────────────────────────────────────────────────── fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7)) # ── Case 1: Meta-Learning ─────────────────────────────────────────────────── # Main line ax1.plot(case1_iters, case1_accuracy, "o-", color=BLUE, linewidth=2.5, markersize=10, zorder=5, label="Few-Shot Accuracy") # Colored markers for improvement 
status for i, (x, y, c) in enumerate(zip(case1_iters, case1_accuracy, case1_colors)): ax1.scatter(x, y, s=120, color=c, zorder=6, edgecolors="white", linewidths=1.5) # Annotate key improvements ax1.annotate( "+55.3 pts\nDeep encoder\n+ context-gated replay", xy=(1, case1_accuracy[1]), xytext=(1.3, 55), fontsize=8.5, color=GREEN, fontweight="bold", arrowprops=dict(arrowstyle="->", color=GREEN, lw=1.5), ha="left", ) ax1.annotate( "Prototype net\ntoo simple", xy=(2, case1_accuracy[2]), xytext=(2.25, 65), fontsize=8, color=RED, fontstyle="italic", arrowprops=dict(arrowstyle="->", color=RED, lw=1.2), ha="left", ) ax1.annotate( "+15.9 pts\nLinear clf + L2 anchor\n+ cosine gating", xy=(3, case1_accuracy[3]), xytext=(2.5, 98), fontsize=8.5, color=GREEN, fontweight="bold", arrowprops=dict(arrowstyle="->", color=GREEN, lw=1.5), ha="left", ) # Reference line for "ideal" performance ax1.axhline(y=100, color=ORANGE, linestyle=":", alpha=0.6, linewidth=1.5) ax1.text(4.3, 99, "Oracle (100%)", fontsize=8, color=ORANGE, ha="right", fontstyle="italic", va="top") # Shaded improvement region ax1.fill_between(case1_iters, case1_accuracy, case1_accuracy[0], where=[a >= case1_accuracy[0] for a in case1_accuracy], alpha=0.08, color=BLUE) ax1.set_xlabel("Self-Iteration Round", fontsize=12) ax1.set_ylabel("Few-Shot Accuracy (%)", fontsize=12) ax1.set_title("Case A: Continual Meta-Learning\nfor Few-Shot Adaptation", fontsize=13, fontweight="bold", pad=12) ax1.set_ylim(15, 105) ax1.set_xticks(case1_iters) ax1.set_xticklabels(case1_labels, fontsize=7.5, ha="center") # Summary box summary1 = f"Baseline: {case1_accuracy[0]:.1f}% → Best: {case1_accuracy[3]:.1f}%\nImprovement: +{case1_accuracy[3]-case1_accuracy[0]:.1f} pts ({(case1_accuracy[3]-case1_accuracy[0])/case1_accuracy[0]*100:.0f}% rel.)" ax1.text(0.02, 0.97, summary1, transform=ax1.transAxes, fontsize=9, verticalalignment="top", fontfamily="monospace", bbox=dict(boxstyle="round,pad=0.5", facecolor="#E3F2FD", alpha=0.9, edgecolor=BLUE, 
linewidth=1.2)) # ── Case 2: RLHF ──────────────────────────────────────────────────────────── ax2.plot(case2_iters, case2_alignment, "s-", color=PURPLE, linewidth=2.5, markersize=10, zorder=5, label="Alignment Score") for i, (x, y, c) in enumerate(zip(case2_iters, case2_alignment, case2_colors)): ax2.scatter(x, y, s=120, color=c, zorder=6, edgecolors="white", linewidths=1.5, marker="s") # Annotate ax2.annotate( "No improvement\n(minor code fix)", xy=(1, case2_alignment[1]), xytext=(1.3, 30), fontsize=8, color=GRAY, fontstyle="italic", arrowprops=dict(arrowstyle="->", color=GRAY, lw=1.2), ha="left", ) ax2.annotate( "+26.0 pts\n+Learned reward model\n+Curriculum scheduling", xy=(2, case2_alignment[2]), xytext=(1.8, 75), fontsize=8.5, color=GREEN, fontweight="bold", arrowprops=dict(arrowstyle="->", color=GREEN, lw=1.5), ha="left", ) ax2.annotate( "+1.4 pts\n+Rank-norm\n+Policy EMA", xy=(3, case2_alignment[3]), xytext=(3.2, 73), fontsize=8, color=GREEN, arrowprops=dict(arrowstyle="->", color=GREEN, lw=1.2), ha="left", ) ax2.annotate( "+3.6 pts\n+Confidence gating\n+Mini-batch RM", xy=(4, case2_alignment[4]), xytext=(3.5, 80), fontsize=8.5, color=GREEN, fontweight="bold", arrowprops=dict(arrowstyle="->", color=GREEN, lw=1.5), ha="left", ) # Shaded improvement ax2.fill_between(case2_iters, case2_alignment, case2_alignment[0], where=[a >= case2_alignment[0] for a in case2_alignment], alpha=0.08, color=PURPLE) ax2.set_xlabel("Self-Iteration Round", fontsize=12) ax2.set_ylabel("LLM Alignment Score (%)", fontsize=12) ax2.set_title("Case B: RLHF with Curriculum-Based\nReward Shaping for LLM Alignment", fontsize=13, fontweight="bold", pad=12) ax2.set_ylim(15, 105) ax2.set_xticks(case2_iters) ax2.set_xticklabels(case2_labels, fontsize=7.5, ha="center") summary2 = f"Baseline: {case2_alignment[0]:.1f}% → Best: {case2_alignment[4]:.1f}%\nImprovement: +{case2_alignment[4]-case2_alignment[0]:.1f} pts ({(case2_alignment[4]-case2_alignment[0])/case2_alignment[0]*100:.0f}% rel.)" 
ax2.text(0.02, 0.97, summary2, transform=ax2.transAxes, fontsize=9, verticalalignment="top", fontfamily="monospace", bbox=dict(boxstyle="round,pad=0.5", facecolor="#F3E5F5", alpha=0.9, edgecolor=PURPLE, linewidth=1.2)) # ── Legend ─────────────────────────────────────────────────────────────────── legend_elements = [ mpatches.Patch(facecolor=GREEN, edgecolor="white", label="Improved"), mpatches.Patch(facecolor=RED, edgecolor="white", label="Regressed (auto-recovered)"), mpatches.Patch(facecolor=GRAY, edgecolor="white", label="No change / Baseline"), ] fig.legend(handles=legend_elements, loc="lower center", ncol=3, fontsize=10, frameon=True, fancybox=True, framealpha=0.9, bbox_to_anchor=(0.5, -0.02)) # ── Suptitle ───────────────────────────────────────────────────────────────── fig.suptitle( "AutoResearchClaw: Autonomous Self-Iterating Experiment Optimization", fontsize=15, fontweight="bold", y=1.02, ) fig.tight_layout(rect=[0, 0.04, 1, 0.98]) # ── Save ───────────────────────────────────────────────────────────────────── out_dir = Path(__file__).resolve().parent.parent / "docs" / "figures" out_dir.mkdir(parents=True, exist_ok=True) out_path = out_dir / "iteration_improvement_showcase.png" fig.savefig(out_path, dpi=200, bbox_inches="tight", facecolor="white") print(f"Saved: {out_path}") # Also save a PDF version for papers pdf_path = out_dir / "iteration_improvement_showcase.pdf" fig.savefig(pdf_path, bbox_inches="tight", facecolor="white") print(f"Saved: {pdf_path}") plt.close(fig) ================================================ FILE: scripts/test_beast_mode_e2e.py ================================================ #!/usr/bin/env python3 """End-to-end integration test for OpenCode Beast Mode. Simulates Pipeline stages 1-9 artifacts, then invokes Beast Mode to generate experiment code via OpenCode CLI. 
Usage: python scripts/test_beast_mode_e2e.py """ from __future__ import annotations import json import sys import textwrap import time from pathlib import Path # Add project root to path PROJECT_ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(PROJECT_ROOT)) from researchclaw.pipeline.opencode_bridge import ( OpenCodeBridge, count_historical_failures, score_complexity, ) # ============================================================ # Simulated Pipeline Artifacts # ============================================================ TOPIC = ( "Adaptive Mixtures of Local Experts for Image Classification: " "Dynamic Gating with Load-Balanced Sparse Routing on CIFAR-10" ) # Simulated Stage 9 output: exp_plan.yaml content EXP_PLAN = textwrap.dedent("""\ topic: > Adaptive Mixtures of Local Experts for Image Classification: Dynamic Gating with Load-Balanced Sparse Routing on CIFAR-10 objectives: - Investigate whether sparse Mixture-of-Experts (MoE) routing improves accuracy over dense baselines under a fixed parameter budget - Compare top-k routing vs soft routing vs hash-based routing - Ablate the load-balancing auxiliary loss datasets: - CIFAR-10 (pre-cached at /opt/datasets/cifar10) baselines: - name: dense_resnet18 description: Standard ResNet-18 with all parameters active implementation_spec: class_name: DenseResNet18Trainer key_hyperparameters: batch_size: 128 learning_rate: 0.1 epochs: 20 weight_decay: 5e-4 - name: dense_wider_resnet description: Wider ResNet with ~same FLOPs as MoE model implementation_spec: class_name: DenseWiderResNetTrainer key_hyperparameters: batch_size: 128 learning_rate: 0.1 epochs: 20 proposed_methods: - name: topk_sparse_moe description: > Sparse MoE with top-2 gating. Each MoE layer has 4 expert MLPs, a gating network selects top-2 per token. Load-balancing loss ensures even expert utilization. 
implementation_spec: class_name: TopKSparseMoETrainer algorithm_steps: - Build backbone CNN (first 3 ResNet blocks) - Replace final block with MoE layer (4 experts, top-2 gating) - Gating network: linear projection → softmax → top-k selection - Load-balance loss: CV of expert load across batch - Total loss = CE + lambda_lb * load_balance_loss key_hyperparameters: batch_size: 128 learning_rate: 0.05 epochs: 20 num_experts: 4 top_k: 2 lambda_lb: 0.01 - name: soft_routing_moe description: > Soft MoE where all experts contribute with learned weights (no hard top-k). Softer gradient flow but higher compute. implementation_spec: class_name: SoftRoutingMoETrainer key_hyperparameters: batch_size: 128 learning_rate: 0.05 epochs: 20 num_experts: 4 ablations: - name: topk_moe_no_load_balance description: TopK MoE without load-balancing loss (lambda_lb=0) what_is_removed: Load-balancing auxiliary loss expected_effect: Expert collapse — one expert dominates, accuracy drops how_it_differs: - Set lambda_lb = 0 - Everything else identical to topk_sparse_moe - name: topk_moe_single_expert description: TopK MoE with top_k=1 (only one expert per sample) what_is_removed: Multi-expert routing (reduced to single expert) expected_effect: Reduced model capacity per sample, likely lower accuracy how_it_differs: - Set top_k = 1 instead of 2 - Keep load-balancing loss active metrics: primary_metric: name: test_accuracy direction: maximize description: Classification accuracy on CIFAR-10 test set secondary_metrics: - name: expert_utilization_cv description: Coefficient of variation of expert usage (lower = more balanced) - name: training_time_sec description: Wall-clock training time compute_budget: effective_time_seconds: 240 estimated_seconds_per_run: 40 seeds_per_condition: 3 total_conditions: 6 notes: - Use small models (< 5M params) to fit within budget - Use 20 epochs max - Early stopping if no improvement for 5 epochs """) PKG_HINT = textwrap.dedent("""\ AVAILABLE PACKAGES (docker 
mode): Python stdlib, numpy, torch, sklearn, scipy, pandas, torchvision, torchaudio, matplotlib, seaborn, scipy, tqdm, transformers, datasets, timm, einops, torchmetrics, and additional pip-installable packages via requirements.txt. GPU: NVIDIA RTX 6000 Ada (cuda). You MAY use PyTorch with GPU acceleration. Use `device = torch.device('cuda')` for tensor operations. ## Compute Budget Constraint - Total execution time limit: 240 seconds - Design experiments that complete within this budget - Implement a time guard: stop gracefully at 80% of budget (192 seconds) """) EXTRA_GUIDANCE = textwrap.dedent("""\ ## Dataset Guidance CIFAR-10 is pre-cached at /opt/datasets/cifar10. Use: torchvision.datasets.CIFAR10(root='/opt/datasets/cifar10', download=False) ## Multi-Seed Enforcement Run each condition with seeds [0, 1, 2]. Report mean ± std for all metrics. ## Hyperparameter Reporting Print all hyperparameters at the start of each condition run. """) def main() -> None: print("=" * 70) print("OpenCode Beast Mode — End-to-End Integration Test") print("=" * 70) # Step 1: Complexity scoring print("\n[Step 1] Complexity scoring...") cplx = score_complexity( exp_plan=EXP_PLAN, topic=TOPIC, historical_failures=0, threshold=0.4, # Lower threshold to ensure trigger for this test ) print(f" Score: {cplx.score:.4f}") print(f" Signals: {json.dumps(cplx.signals, indent=4)}") print(f" Recommendation: {cplx.recommendation}") print(f" Reason: {cplx.reason}") if cplx.recommendation != "beast_mode": print("\n [!] Score below threshold. Forcing beast mode for test purposes.\n") # Step 2: Check OpenCode availability print("\n[Step 2] Checking OpenCode availability...") available = OpenCodeBridge.check_available() if not available: print(" [FATAL] OpenCode CLI not installed. 
Cannot proceed.") sys.exit(1) print(" OpenCode CLI: OK") # Step 3: Create test workspace and invoke print("\n[Step 3] Invoking OpenCode beast mode...") test_dir = PROJECT_ROOT / "test_outputs_beast_mode" test_dir.mkdir(parents=True, exist_ok=True) stage_dir = test_dir / f"stage-10_{int(time.time())}" stage_dir.mkdir(parents=True, exist_ok=True) # Write complexity analysis (stage_dir / "complexity_analysis.json").write_text( json.dumps({ "score": cplx.score, "signals": cplx.signals, "recommendation": cplx.recommendation, "reason": cplx.reason, }, indent=2), encoding="utf-8", ) # NOTE: Azure AI Services endpoints don't support OpenCode's Responses API. # The bridge auto-detects Azure and falls back to Anthropic provider. bridge = OpenCodeBridge( model="anthropic/claude-sonnet-4-6", # Direct Anthropic model llm_base_url="https://huaxi-mlg4x1rk-eastus2.services.ai.azure.com/openai/v1", api_key_env="AZURE_OPENAI_API_KEY", llm_provider="azure", timeout_sec=300, max_retries=1, workspace_cleanup=False, # Keep workspace for inspection ) t0 = time.time() result = bridge.generate( stage_dir=stage_dir, topic=TOPIC, exp_plan=EXP_PLAN, metric="test_accuracy", pkg_hint=PKG_HINT, extra_guidance=EXTRA_GUIDANCE, time_budget_sec=240, ) elapsed = time.time() - t0 # Step 4: Evaluate results print(f"\n[Step 4] Results (elapsed: {elapsed:.1f}s)") print(f" Success: {result.success}") print(f" Error: {result.error or 'None'}") print(f" Files: {list(result.files.keys())}") print(f" OpenCode elapsed: {result.elapsed_sec:.1f}s") if not result.success: print(f"\n [FAILED] Beast mode failed: {result.error}") print(f" Log (last 1000 chars):\n{result.opencode_log[-1000:]}") # Write log for debugging (stage_dir / "opencode_log.txt").write_text( result.opencode_log, encoding="utf-8", ) (stage_dir / "beast_mode_log.json").write_text( json.dumps({ "success": False, "error": result.error, "elapsed_sec": result.elapsed_sec, }, indent=2), encoding="utf-8", ) sys.exit(1) # Write generated files exp_dir = 
stage_dir / "experiment" exp_dir.mkdir(parents=True, exist_ok=True) for fname, code in result.files.items(): fpath = exp_dir / fname fpath.parent.mkdir(parents=True, exist_ok=True) fpath.write_text(code, encoding="utf-8") print(f"\n Files written to: {exp_dir}") # Write beast mode log (stage_dir / "beast_mode_log.json").write_text( json.dumps({ "success": True, "elapsed_sec": result.elapsed_sec, "files": list(result.files.keys()), }, indent=2), encoding="utf-8", ) # Step 5: Quality evaluation print("\n[Step 5] Quality evaluation...") checks = { "main.py exists": "main.py" in result.files, "main.py is non-empty": len(result.files.get("main.py", "")) > 100, "Has metric print": "test_accuracy" in result.files.get("main.py", ""), "Has seed loop": "seed" in result.files.get("main.py", "").lower(), "Has CIFAR-10": "cifar" in result.files.get("main.py", "").lower(), "Has torch import": "import torch" in result.files.get("main.py", ""), "No argparse": "argparse" not in result.files.get("main.py", ""), "Has multiple conditions": any( kw in result.files.get("main.py", "").lower() for kw in ["baseline", "dense", "moe", "expert", "condition"] ), "Has time guard": any( kw in result.files.get("main.py", "") for kw in ["time.time", "time.monotonic", "time_budget", "time_limit"] ), } all_pass = True for check_name, passed in checks.items(): status = "PASS" if passed else "FAIL" if not passed: all_pass = False print(f" [{status}] {check_name}") # Count lines of code total_loc = sum(len(code.splitlines()) for code in result.files.values()) py_files = [f for f in result.files if f.endswith(".py")] print(f"\n Total files: {len(result.files)}") print(f" Python files: {len(py_files)}") print(f" Total lines of code: {total_loc}") # Try AST parsing main.py import ast try: ast.parse(result.files["main.py"]) print(" [PASS] main.py AST parse: valid Python") except SyntaxError as e: print(f" [FAIL] main.py AST parse error: {e}") all_pass = False # Print first 50 lines of main.py for manual 
    # inspection (continuation of the preview section of main())
    main_lines = result.files.get("main.py", "").splitlines()
    print(f"\n --- main.py preview (first 50 of {len(main_lines)} lines) ---")
    for i, line in enumerate(main_lines[:50], 1):
        print(f" {i:4d} | {line}")
    if len(main_lines) > 50:
        print(f" ... ({len(main_lines) - 50} more lines)")

    # Final verdict — aggregate the boolean checks computed above.
    print("\n" + "=" * 70)
    pass_count = sum(1 for v in checks.values() if v)
    total = len(checks)
    if all_pass:
        print(f"VERDICT: ALL CHECKS PASSED ({pass_count}/{total})")
    else:
        print(f"VERDICT: {pass_count}/{total} checks passed")
    print(f"Stage dir: {stage_dir}")
    print("=" * 70)


if __name__ == "__main__":
    main()

================================================
FILE: scripts/test_code_agent_live.py
================================================
#!/usr/bin/env python3
"""Live test of CodeAgent with real LLM — evaluates code generation quality.

This script directly invokes the CodeAgent with real experiment plans
and evaluates the quality of generated code. No full pipeline needed.

Usage:
    python scripts/test_code_agent_live.py [--model gpt-4.1] [--test-id 1]
"""

from __future__ import annotations

import argparse
import ast
import json
import os
import sys
import time
from pathlib import Path

# Add project root to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from researchclaw.llm.client import LLMClient, LLMConfig
from researchclaw.pipeline.code_agent import CodeAgent, CodeAgentConfig
from researchclaw.prompts import PromptManager

# ---------------------------------------------------------------------------
# Test cases — progressively harder experiment scenarios
# ---------------------------------------------------------------------------

# Maps an integer test id to a dict with the topic, an exp_plan YAML string,
# the primary metric name, and minimum quality thresholds (min_files,
# min_classes, required_imports) consumed by analyze_code_quality().
TEST_CASES = {
    1: {
        "name": "Vision Transformer on CIFAR-10",
        "topic": (
            "Comparing Vision Transformer (ViT) variants for image classification: "
            "investigate how patch size, number of attention heads, and positional "
            "encoding strategies affect classification accuracy on CIFAR-10"
        ),
        "exp_plan": """
objectives:
  - 
Compare ViT-Tiny variants with different patch sizes (4, 8, 16) - Evaluate multi-head self-attention with different head counts (2, 4, 8) - Compare learnable vs sinusoidal positional encodings datasets: - name: CIFAR-10 source: torchvision.datasets.CIFAR10 train_size: 50000 test_size: 10000 baselines: - name: SimpleViT-P16 description: Standard ViT with patch_size=16, 4 heads, learnable pos encoding proposed_methods: - name: SmallPatch-ViT implementation_spec: class_name: SmallPatchViT key_methods: [forward, _create_patches, _attention] differentiator: Uses patch_size=4 for finer-grained spatial features - name: ManyHead-ViT implementation_spec: class_name: ManyHeadViT key_methods: [forward, _multi_head_attention] differentiator: Uses 8 attention heads instead of 4 ablations: - name: SinusoidalPos-ViT description: Replace learnable positional encoding with sinusoidal metrics: - accuracy (higher is better) - training_loss compute_budget: time_limit_sec: 300 epochs: 10 """, "metric": "accuracy", "min_files": 2, "min_classes": 3, "required_imports": ["torch", "torchvision"], }, 2: { "name": "Distribution Shift Detection via Uncertainty", "topic": ( "Detecting distribution shift in deployed ML models using " "uncertainty estimation: comparing Monte Carlo Dropout, " "Deep Ensembles, and Spectral-Normalized Neural GP (SNGP) " "for out-of-distribution detection on corrupted CIFAR-10" ), "exp_plan": """ objectives: - Implement 3 uncertainty estimation methods for OOD detection - Evaluate on CIFAR-10 vs CIFAR-10-C (corrupted) as OOD - Compare AUROC for separating in-distribution from OOD samples datasets: - name: CIFAR-10 source: torchvision.datasets.CIFAR10 role: in-distribution - name: CIFAR-10-C source: Generated via Gaussian noise corruption role: out-of-distribution baselines: - name: MCDropout description: Monte Carlo Dropout with 30 forward passes, mean+std of softmax implementation_spec: class_name: MCDropoutDetector key_methods: [predict_with_uncertainty, 
_mc_forward, compute_auroc] differentiator: Standard MC Dropout baseline proposed_methods: - name: DeepEnsemble implementation_spec: class_name: DeepEnsembleDetector key_methods: [train_ensemble, predict_with_uncertainty, _member_forward] differentiator: Trains 3 independent models, uses prediction disagreement - name: SNGP implementation_spec: class_name: SNGPDetector key_methods: [forward, _spectral_norm_layer, _gp_output_layer] differentiator: Spectral normalization + GP output layer for distance-aware uncertainty ablations: - name: MCDropout-10passes description: MC Dropout with only 10 forward passes (reduced compute) metrics: - auroc (higher is better) - ece (expected calibration error, lower is better) compute_budget: time_limit_sec: 300 epochs: 5 """, "metric": "auroc", "min_files": 2, "min_classes": 4, "required_imports": ["torch", "numpy"], }, 3: { "name": "Meta-Learning Few-Shot with MAML", "topic": ( "Few-shot learning with gradient-based meta-learning: comparing " "MAML, Reptile, and Prototypical Networks on Omniglot-style " "synthetic tasks with 5-way 1-shot and 5-way 5-shot settings" ), "exp_plan": """ objectives: - Implement 3 few-shot learning algorithms from scratch - Evaluate on synthetic few-shot tasks (5-way, 1-shot and 5-shot) - Compare accuracy and convergence speed datasets: - name: SyntheticFewShot source: Generated in-code (random linear classification tasks) n_classes: 20 samples_per_class: 20 baselines: - name: ProtoNet description: Prototypical Networks — learn embedding, classify by nearest class prototype implementation_spec: class_name: PrototypicalNetwork key_methods: [embed, compute_prototypes, classify, meta_train_step] differentiator: Non-gradient meta-learning baseline using metric space proposed_methods: - name: MAML implementation_spec: class_name: MAMLLearner key_methods: [inner_loop, outer_loop, meta_train_step, adapt] differentiator: Second-order gradient-based meta-learning with inner loop adaptation - name: Reptile 
    implementation_spec:
      class_name: ReptileLearner
      key_methods: [inner_loop, meta_update, meta_train_step]
    differentiator: First-order approximation — SGD on tasks, move toward task-optimal weights
ablations:
  - name: MAML-FirstOrder
    description: MAML with first-order approximation (no second derivatives)
metrics:
  - accuracy (higher is better)
  - meta_train_loss
compute_budget:
  time_limit_sec: 300
  meta_epochs: 200
  inner_steps: 5
  inner_lr: 0.01
""",
        "metric": "accuracy",
        "min_files": 2,
        "min_classes": 3,
        "required_imports": ["torch"],
    },
}


# ---------------------------------------------------------------------------
# Code quality analysis
# ---------------------------------------------------------------------------

def analyze_code_quality(files: dict[str, str], test_case: dict) -> dict:
    """Analyze the quality of generated code.

    Walks every file with ``ast`` to collect classes, top-level functions and
    imports, then computes six 0-10 heuristic sub-scores (file structure,
    class coverage, code depth, method richness, import coverage, syntax
    validity) plus their average as ``overall_score``.

    Args:
        files: Mapping of filename -> source code text.
        test_case: A TEST_CASES entry; reads ``name``, ``min_files``,
            ``min_classes`` and (optionally) ``required_imports``.

    Returns:
        A report dict with counts, discovered symbols, per-category scores,
        ``overall_score``, and a list of human-readable ``issues``.
    """
    report = {
        "test_name": test_case["name"],
        "num_files": len(files),
        "file_names": list(files.keys()),
        "total_lines": 0,
        "effective_lines": 0,
        "classes_found": [],
        "functions_found": [],
        "imports_found": [],
        "issues": [],
        "scores": {},
    }
    all_code = ""
    for fname, code in files.items():
        all_code += code + "\n"
        lines = code.split("\n")
        report["total_lines"] += len(lines)
        # "Effective" lines exclude blanks, comments, and import statements.
        effective = [
            l for l in lines
            if l.strip()
            and not l.strip().startswith("#")
            and not l.strip().startswith("import")
            and not l.strip().startswith("from")
        ]
        report["effective_lines"] += len(effective)
        # AST analysis — gather classes, top-level functions, and imports.
        try:
            tree = ast.parse(code)
            for node in ast.walk(tree):
                if isinstance(node, ast.ClassDef):
                    methods = [
                        n.name for n in node.body
                        if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef))
                    ]
                    # Sum each method's line span; guarded on end_lineno being set.
                    method_lines = sum(
                        n.end_lineno - n.lineno + 1
                        for n in node.body
                        if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef)) and n.end_lineno
                    )
                    report["classes_found"].append({
                        "name": node.name,
                        "file": fname,
                        "methods": methods,
                        "method_count": len(methods),
                        "total_method_lines": method_lines,
                    })
                # col_offset == 0 restricts this to module-level functions.
                elif isinstance(node, ast.FunctionDef) and node.col_offset == 0:
                    report["functions_found"].append({
                        "name": node.name,
                        "file": fname,
                        "lines": (node.end_lineno or node.lineno) - node.lineno + 1,
                    })
                elif isinstance(node, (ast.Import, ast.ImportFrom)):
                    # Record only the top-level package name (e.g. "torch").
                    if isinstance(node, ast.Import):
                        for alias in node.names:
                            report["imports_found"].append(alias.name.split(".")[0])
                    else:
                        if node.module:
                            report["imports_found"].append(node.module.split(".")[0])
        except SyntaxError as e:
            report["issues"].append(f"SyntaxError in {fname}: {e}")
    report["imports_found"] = sorted(set(report["imports_found"]))

    # Scoring — each sub-score is clamped to a 0-10 range.
    # 1. File count (target: min_files)
    file_score = min(10, (len(files) / test_case["min_files"]) * 10)
    report["scores"]["file_structure"] = round(file_score, 1)
    # 2. Class count (target: min_classes)
    class_score = min(10, (len(report["classes_found"]) / test_case["min_classes"]) * 10)
    report["scores"]["class_coverage"] = round(class_score, 1)
    # 3. Code depth (effective lines)
    depth_score = min(10, report["effective_lines"] / 30)  # 300 lines = 10
    report["scores"]["code_depth"] = round(depth_score, 1)
    # 4. Method richness (average methods per class)
    if report["classes_found"]:
        avg_methods = sum(c["method_count"] for c in report["classes_found"]) / len(report["classes_found"])
        method_score = min(10, avg_methods / 0.5)  # 5 methods/class = 10
        report["scores"]["method_richness"] = round(method_score, 1)
    else:
        # No classes at all — richness is zero by definition.
        report["scores"]["method_richness"] = 0
    # 5. Import coverage
    required = set(test_case.get("required_imports", []))
    found = set(report["imports_found"])
    if required:
        import_score = len(required & found) / len(required) * 10
    else:
        # Nothing required — full marks.
        import_score = 10
    report["scores"]["import_coverage"] = round(import_score, 1)
    # 6.
    # Syntax validity — all-or-nothing: any recorded SyntaxError zeroes it.
    syntax_score = 10 if not any("SyntaxError" in i for i in report["issues"]) else 0
    report["scores"]["syntax_valid"] = syntax_score

    # Overall score — unweighted mean of the six sub-scores.
    scores = report["scores"]
    report["overall_score"] = round(
        sum(scores.values()) / len(scores), 1
    )

    # Quality checks — append human-readable issues for threshold violations.
    if len(files) < test_case["min_files"]:
        report["issues"].append(
            f"Too few files: {len(files)} < {test_case['min_files']}"
        )
    if len(report["classes_found"]) < test_case["min_classes"]:
        report["issues"].append(
            f"Too few classes: {len(report['classes_found'])} < {test_case['min_classes']}"
        )
    for cls in report["classes_found"]:
        if cls["total_method_lines"] < 10:
            report["issues"].append(
                f"Class {cls['name']} has only {cls['total_method_lines']} method lines (too thin)"
            )
    return report


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    # CLI flags: model selection, which test case (0 = all), sandbox/tree-search
    # toggles, and where to write outputs.
    parser = argparse.ArgumentParser(description="Live test CodeAgent quality")
    parser.add_argument("--model", default="gpt-4.1", help="Model to use")
    parser.add_argument("--test-id", type=int, default=0, help="Test case ID (0=all)")
    parser.add_argument("--no-sandbox", action="store_true", help="Skip sandbox exec-fix")
    parser.add_argument("--tree-search", action="store_true", help="Enable tree search")
    parser.add_argument("--output-dir", default="test_outputs", help="Output directory")
    args = parser.parse_args()

    # Setup LLM client — credentials come from the environment only.
    base_url = os.environ.get("OPENAI_BASE_URL", "")
    api_key = os.environ.get("OPENAI_API_KEY", "")
    if not base_url or not api_key:
        print("ERROR: Set OPENAI_BASE_URL and OPENAI_API_KEY environment variables")
        sys.exit(1)
    llm_config = LLMConfig(
        base_url=base_url,
        api_key=api_key,
        primary_model=args.model,
        fallback_models=[],
        max_tokens=16384,
        temperature=0.7,
        timeout_sec=300,
    )
    llm = LLMClient(llm_config)

    # Quick connectivity test
    print(f"Testing LLM connectivity ({args.model})... 
          ", end="", flush=True)
    ok, msg = llm.preflight()
    if not ok:
        print(f"FAILED: {msg}")
        sys.exit(1)
    print("OK")
    pm = PromptManager()

    # Select test cases — a single id, or the whole suite when --test-id is 0.
    if args.test_id > 0:
        if args.test_id not in TEST_CASES:
            print(f"ERROR: Unknown test ID {args.test_id}. Available: {list(TEST_CASES.keys())}")
            sys.exit(1)
        cases = {args.test_id: TEST_CASES[args.test_id]}
    else:
        cases = TEST_CASES

    # Output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    all_reports = []
    for test_id, tc in cases.items():
        print(f"\n{'='*60}")
        print(f"Test {test_id}: {tc['name']}")
        print(f"{'='*60}")
        stage_dir = output_dir / f"test_{test_id}"
        stage_dir.mkdir(parents=True, exist_ok=True)
        # exec-fix iterations are disabled entirely when --no-sandbox is set.
        config = CodeAgentConfig(
            architecture_planning=True,
            exec_fix_max_iterations=0 if args.no_sandbox else 3,
            tree_search_enabled=args.tree_search,
            review_max_rounds=2,
        )
        agent = CodeAgent(
            llm=llm,
            prompts=pm,
            config=config,
            stage_dir=stage_dir,
            sandbox_factory=None,  # No sandbox for quick test
        )
        t0 = time.time()
        result = agent.generate(
            topic=tc["topic"],
            exp_plan=tc["exp_plan"],
            metric=tc["metric"],
            pkg_hint=(
                "\nAVAILABLE PACKAGES (docker mode): Python stdlib, numpy, "
                "torch, torchvision, sklearn, scipy, pandas, matplotlib.\n"
                "GPU: NVIDIA RTX 6000 Ada (49GB VRAM). "
                "Use `device = torch.device('cuda')` for tensor operations.\n"
            ),
            max_tokens=16384,
        )
        elapsed = time.time() - t0
        print(f"\nGeneration time: {elapsed:.1f}s")
        print(f"LLM calls: {result.total_llm_calls}")
        print(f"Review rounds: {result.review_rounds}")
        print(f"Architecture spec: {len(result.architecture_spec)} chars")

        # Write generated files
        for fname, code in result.files.items():
            fpath = stage_dir / fname
            fpath.parent.mkdir(parents=True, exist_ok=True)
            fpath.write_text(code, encoding="utf-8")
            lines = len(code.split("\n"))
            print(f" {fname}: {lines} lines")

        # Write architecture spec
        if result.architecture_spec:
            (stage_dir / "architecture_spec.yaml").write_text(
                result.architecture_spec, encoding="utf-8"
            )

        # Analyze quality — augment the heuristic report with run metadata.
        report = analyze_code_quality(result.files, tc)
        report["generation_time_sec"] = round(elapsed, 1)
        report["llm_calls"] = result.total_llm_calls
        report["review_rounds"] = result.review_rounds
        report["architecture_spec_chars"] = len(result.architecture_spec)

        # Print report
        print(f"\n--- Quality Report ---")
        print(f"Files: {report['num_files']}")
        print(f"Total lines: {report['total_lines']}")
        print(f"Effective lines: {report['effective_lines']}")
        print(f"Classes: {len(report['classes_found'])}")
        for cls in report["classes_found"]:
            print(f" - {cls['name']} ({cls['method_count']} methods, {cls['total_method_lines']} lines)")
        print(f"Imports: {', '.join(report['imports_found'])}")
        print(f"\nScores:")
        for k, v in report["scores"].items():
            print(f" {k}: {v}/10")
        print(f" OVERALL: {report['overall_score']}/10")
        if report["issues"]:
            print(f"\nIssues:")
            for issue in report["issues"]:
                print(f" - {issue}")

        # Save report
        (stage_dir / "quality_report.json").write_text(
            json.dumps(report, indent=2), encoding="utf-8"
        )
        all_reports.append(report)

    # Summary — only printed when more than one case ran.
    if len(all_reports) > 1:
        print(f"\n{'='*60}")
        print("SUMMARY")
        print(f"{'='*60}")
        for r in all_reports:
            print(f" {r['test_name']}: {r['overall_score']}/10 "
                  f"({r['effective_lines']} lines, 
{len(r['classes_found'])} classes)") avg = sum(r["overall_score"] for r in all_reports) / len(all_reports) print(f"\n Average: {avg:.1f}/10") # Save all reports (output_dir / "all_reports.json").write_text( json.dumps(all_reports, indent=2), encoding="utf-8" ) print(f"\nAll outputs saved to: {output_dir}/") if __name__ == "__main__": main() ================================================ FILE: scripts/test_code_agent_sandbox.py ================================================ #!/usr/bin/env python3 """Test CodeAgent with Docker sandbox exec-fix loop. Generates code with Phase 1-4 (architecture, exec-fix, review), runs in Docker sandbox, verifies the exec-fix loop catches and fixes errors. Usage: python scripts/test_code_agent_sandbox.py [--model gpt-5.1] [--test-id 1] """ from __future__ import annotations import argparse import json import os import sys import time from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent)) from researchclaw.config import DockerSandboxConfig, ExperimentConfig from researchclaw.experiment.docker_sandbox import DockerSandbox from researchclaw.llm.client import LLMClient, LLMConfig from researchclaw.pipeline.code_agent import CodeAgent, CodeAgentConfig from researchclaw.prompts import PromptManager # --------------------------------------------------------------------------- # Test case (simple — should run quickly in sandbox) # --------------------------------------------------------------------------- TEST_CASES = { 1: { "name": "ViT on CIFAR-10 (sandbox)", "topic": ( "Comparing Vision Transformer (ViT) variants for image classification: " "investigate how patch size and number of attention heads affect " "classification accuracy on CIFAR-10" ), "exp_plan": """ objectives: - Compare ViT-Tiny variants with different patch sizes (4, 16) - Evaluate multi-head self-attention with different head counts (4, 8) datasets: - name: CIFAR-10 source: torchvision.datasets.CIFAR10 train_size: 50000 test_size: 10000 baselines: 
- name: SimpleViT-P16 description: Standard ViT with patch_size=16, 4 heads, learnable pos encoding proposed_methods: - name: SmallPatch-ViT implementation_spec: class_name: SmallPatchViT key_methods: [forward, _create_patches, _attention] differentiator: Uses patch_size=4 for finer-grained spatial features - name: ManyHead-ViT implementation_spec: class_name: ManyHeadViT key_methods: [forward, _multi_head_attention] differentiator: Uses 8 attention heads instead of 4 ablations: - name: SinusoidalPos-ViT description: Replace learnable positional encoding with sinusoidal metrics: - accuracy (higher is better) - training_loss compute_budget: time_limit_sec: 120 epochs: 3 """, "metric": "accuracy", }, 2: { "name": "OOD Detection (sandbox)", "topic": ( "Detecting distribution shift using uncertainty estimation: " "comparing Monte Carlo Dropout and Deep Ensembles " "for out-of-distribution detection on corrupted CIFAR-10" ), "exp_plan": """ objectives: - Implement 2 uncertainty estimation methods for OOD detection - Evaluate on CIFAR-10 vs Gaussian noise corruption as OOD - Compare AUROC for separating in-distribution from OOD samples datasets: - name: CIFAR-10 source: torchvision.datasets.CIFAR10 role: in-distribution - name: CIFAR-10-C source: Generated via Gaussian noise corruption role: out-of-distribution baselines: - name: MCDropout description: Monte Carlo Dropout with 20 forward passes implementation_spec: class_name: MCDropoutDetector key_methods: [predict_with_uncertainty, _mc_forward, compute_auroc] proposed_methods: - name: DeepEnsemble implementation_spec: class_name: DeepEnsembleDetector key_methods: [train_ensemble, predict_with_uncertainty] differentiator: Trains 3 independent models, uses prediction disagreement ablations: - name: MCDropout-5passes description: MC Dropout with only 5 forward passes metrics: - auroc (higher is better) compute_budget: time_limit_sec: 120 epochs: 3 """, "metric": "auroc", }, } def make_sandbox_factory(docker_cfg: 
DockerSandboxConfig): """Return a factory function that creates DockerSandbox instances.""" def factory(exp_config, workdir: Path): return DockerSandbox(docker_cfg, workdir) return factory def main(): parser = argparse.ArgumentParser(description="Test CodeAgent with Docker sandbox") parser.add_argument("--model", default="gpt-5.1", help="Model to use") parser.add_argument("--test-id", type=int, default=1, help="Test case ID") parser.add_argument("--output-dir", default="test_outputs_sandbox", help="Output dir") parser.add_argument("--exec-fix-iters", type=int, default=3, help="Max exec-fix iterations") parser.add_argument("--timeout", type=int, default=180, help="Sandbox timeout (sec)") args = parser.parse_args() # Setup LLM base_url = os.environ.get("OPENAI_BASE_URL", "") api_key = os.environ.get("OPENAI_API_KEY", "") if not base_url or not api_key: print("ERROR: Set OPENAI_BASE_URL and OPENAI_API_KEY") sys.exit(1) llm_config = LLMConfig( base_url=base_url, api_key=api_key, primary_model=args.model, fallback_models=[], max_tokens=16384, temperature=0.7, timeout_sec=300, ) llm = LLMClient(llm_config) print(f"Testing LLM connectivity ({args.model})... 
", end="", flush=True) ok, msg = llm.preflight() if not ok: print(f"FAILED: {msg}") sys.exit(1) print("OK") # Docker sandbox setup docker_cfg = DockerSandboxConfig( image="researchclaw/experiment:latest", gpu_enabled=True, memory_limit_mb=16384, network_policy="setup_only", ) if not DockerSandbox.check_docker_available(): print("ERROR: Docker not available") sys.exit(1) if not DockerSandbox.ensure_image(docker_cfg.image): print(f"ERROR: Docker image {docker_cfg.image} not found") sys.exit(1) print(f"Docker sandbox ready: {docker_cfg.image}") # Select test case tc = TEST_CASES.get(args.test_id) if not tc: print(f"ERROR: Unknown test ID {args.test_id}") sys.exit(1) pm = PromptManager() output_dir = Path(args.output_dir) stage_dir = output_dir / f"test_{args.test_id}" stage_dir.mkdir(parents=True, exist_ok=True) # CodeAgent with sandbox enabled config = CodeAgentConfig( architecture_planning=True, exec_fix_max_iterations=args.exec_fix_iters, exec_fix_timeout_sec=args.timeout, tree_search_enabled=False, review_max_rounds=2, ) sandbox_factory = make_sandbox_factory(docker_cfg) agent = CodeAgent( llm=llm, prompts=pm, config=config, stage_dir=stage_dir, sandbox_factory=sandbox_factory, ) print(f"\n{'='*60}") print(f"Test {args.test_id}: {tc['name']}") print(f" exec_fix_max_iterations: {args.exec_fix_iters}") print(f" sandbox_timeout: {args.timeout}s") print(f"{'='*60}") t0 = time.time() result = agent.generate( topic=tc["topic"], exp_plan=tc["exp_plan"], metric=tc["metric"], pkg_hint=( "\nAVAILABLE PACKAGES (docker mode): Python stdlib, numpy, " "torch, torchvision, sklearn, scipy, pandas, matplotlib, " "tqdm, timm, einops, torchmetrics, gymnasium, networkx.\n" "GPU: NVIDIA RTX 6000 Ada (49GB VRAM). " "Use `device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')` " "for tensor operations.\n" "DATA PATH: CIFAR-10 is pre-cached at /opt/datasets/cifar-10-batches-py/. 
" "Use `torchvision.datasets.CIFAR10(root='/opt/datasets', download=False)`.\n" ), max_tokens=16384, ) elapsed = time.time() - t0 # Report print(f"\n--- Generation Report ---") print(f"Time: {elapsed:.1f}s") print(f"LLM calls: {result.total_llm_calls}") print(f"Sandbox runs: {result.total_sandbox_runs}") print(f"Review rounds: {result.review_rounds}") print(f"Best score: {result.best_score}") # Write files for fname, code in result.files.items(): fpath = stage_dir / fname fpath.parent.mkdir(parents=True, exist_ok=True) fpath.write_text(code, encoding="utf-8") lines = len(code.split("\n")) print(f" {fname}: {lines} lines") # Write arch spec if result.architecture_spec: (stage_dir / "architecture_spec.yaml").write_text( result.architecture_spec, encoding="utf-8" ) # Write validation log (stage_dir / "validation_log.json").write_text( json.dumps({ "log": result.validation_log, "total_llm_calls": result.total_llm_calls, "total_sandbox_runs": result.total_sandbox_runs, "review_rounds": result.review_rounds, "best_score": result.best_score, "elapsed_sec": round(elapsed, 1), }, indent=2), encoding="utf-8", ) # Final sandbox run for end-to-end verification print(f"\n--- Final sandbox verification ---") workdir = stage_dir / "_final_run" workdir.mkdir(parents=True, exist_ok=True) sandbox = DockerSandbox(docker_cfg, workdir) final_result = sandbox.run_project( stage_dir, entry_point="main.py", timeout_sec=args.timeout, ) print(f"Return code: {final_result.returncode}") print(f"Elapsed: {final_result.elapsed_sec:.1f}s") print(f"Timed out: {final_result.timed_out}") if final_result.metrics: print(f"Metrics: {json.dumps(dict(final_result.metrics), indent=2)}") if final_result.returncode != 0: print(f"STDERR (last 500):\n{final_result.stderr[-500:]}") else: print("SUCCESS: Code runs to completion in Docker sandbox!") stdout_lines = final_result.stdout.strip().split("\n") print(f"STDOUT (last 10 lines):") for line in stdout_lines[-10:]: print(f" {line}") # Save final run results 
(stage_dir / "final_run_result.json").write_text( json.dumps({ "returncode": final_result.returncode, "elapsed_sec": final_result.elapsed_sec, "timed_out": final_result.timed_out, "metrics": dict(final_result.metrics) if final_result.metrics else {}, "stdout_tail": "\n".join(stdout_lines[-20:]) if final_result.returncode == 0 else "", "stderr_tail": final_result.stderr[-1000:] if final_result.returncode != 0 else "", }, indent=2), encoding="utf-8", ) if __name__ == "__main__": main() ================================================ FILE: scripts/test_codegen_v2.py ================================================ #!/usr/bin/env python3 """Enhanced code generation test — generates code and runs in Docker sandbox. Tests the full code generation pipeline in isolation: 1. Load experiment plan (from previous run or built-in test case) 2. Generate code via CodeAgent 3. Validate generated code (AST, security, quality) 4. Run in Docker sandbox 5. Score results comprehensively Usage: # Run with built-in test case python scripts/test_codegen_v2.py --test-id 1 # Run with real experiment plan from a previous run python scripts/test_codegen_v2.py --from-run output/run20 # Run all built-in test cases python scripts/test_codegen_v2.py --test-id 0 # Skip sandbox (only test generation quality) python scripts/test_codegen_v2.py --test-id 1 --no-sandbox """ from __future__ import annotations import argparse import ast import json import os import re import sys import time from pathlib import Path # Add project root to path sys.path.insert(0, str(Path(__file__).parent.parent)) from researchclaw.llm.client import LLMClient, LLMConfig from researchclaw.pipeline.code_agent import CodeAgent, CodeAgentConfig from researchclaw.prompts import PromptManager # --------------------------------------------------------------------------- # Built-in test cases # --------------------------------------------------------------------------- TEST_CASES = { 1: { "name": "KD for Compact ViTs (CIFAR-10)", 
"topic": ( "Knowledge Distillation for Compact Vision Transformers: " "Attention-Guided Feature Alignment on CIFAR-10" ), "exp_plan": """ topic: "Knowledge Distillation for Compact Vision Transformers" datasets: - name: CIFAR-10 source: torchvision.datasets.CIFAR10 path: /opt/datasets/cifar10 baselines: - name: TeacherResNet18 description: Pre-trained ResNet-18 teacher model (frozen) implementation_spec: class_name: TeacherResNet18 key_methods: [__init__, forward] algorithm_steps: - Load pre-trained ResNet-18 from torchvision - Freeze all parameters - Use as teacher for distillation - name: StudentViT_Baseline description: Compact ViT trained with standard cross-entropy (no KD) implementation_spec: class_name: StudentViTBaseline key_methods: [__init__, forward, train_epoch, evaluate] algorithm_steps: - Compact ViT with patch_size=4, dim=128, depth=4, heads=4 - Train with cross-entropy loss only - Standard SGD optimizer with cosine LR schedule loss_function: "L = CrossEntropy(student_logits, labels)" key_hyperparameters: lr: 0.01 batch_size: 128 epochs: 20 proposed_methods: - name: AttentionGuidedKD description: Knowledge distillation with attention-guided feature alignment aligns_hypothesis: H1 implementation_spec: class_name: AttentionGuidedKDStudent key_methods: [__init__, forward, compute_kd_loss, compute_attention_loss, train_epoch] algorithm_steps: - Same compact ViT architecture as baseline - KD loss with temperature T=4 - Attention transfer loss between teacher and student attention maps - Combined loss = alpha * KD_loss + beta * attention_loss + (1-alpha-beta) * CE_loss loss_function: "L = 0.5*KLDiv(s/T, t/T)*T^2 + 0.3*MSE(student_attn, teacher_attn) + 0.2*CE(s, y)" key_hyperparameters: temperature: 4 alpha: 0.5 beta: 0.3 lr: 0.01 differentiator: Uses attention map alignment between teacher and student ablations: - name: KD_NoAttentionTransfer based_on: AttentionGuidedKD what_is_removed: Attention transfer loss (beta=0) how_it_differs: Only uses KD loss + 
CE loss, no attention alignment expected_effect: Lower accuracy due to missing attention guidance - name: KD_ReducedCapacity based_on: AttentionGuidedKD what_is_removed: Half the model capacity (dim=64, depth=2, heads=2) how_it_differs: Smaller ViT architecture, same training procedure expected_effect: Lower accuracy due to reduced model capacity metrics: primary_metric: name: primary_metric direction: maximize description: Top-1 accuracy on CIFAR-10 test set compute_budget: total_time_seconds: 300 conditions: [TeacherResNet18, StudentViT_Baseline, AttentionGuidedKD, KD_NoAttentionTransfer, KD_ReducedCapacity] """, "metric": "primary_metric", "metric_direction": "maximize", }, 2: { "name": "PPO with Curiosity Reward (Gymnasium)", "topic": ( "Agent-Centric Reinforcement Learning with Adaptive Reward " "Decomposition for CartPole and LunarLander" ), "exp_plan": """ topic: "Agent-Centric RL with Adaptive Reward Decomposition" datasets: - name: CartPole-v1 source: gymnasium - name: LunarLander-v3 source: gymnasium baselines: - name: VanillaPPO description: Standard PPO with clipped surrogate objective implementation_spec: class_name: VanillaPPO key_methods: [__init__, select_action, update, train_episode] algorithm_steps: - Policy network (2-layer MLP, 64 hidden) - Value network (separate 2-layer MLP) - Clipped surrogate objective with epsilon=0.2 - GAE lambda=0.95 for advantage estimation loss_function: "L_policy = -min(r*A, clip(r,1-eps,1+eps)*A); L_value = MSE(V, R)" key_hyperparameters: lr: 3e-4 gamma: 0.99 clip_eps: 0.2 gae_lambda: 0.95 differentiator: Standard PPO baseline proposed_methods: - name: CuriosityPPO description: PPO with intrinsic curiosity module implementation_spec: class_name: CuriosityPPO key_methods: [__init__, select_action, compute_intrinsic_reward, update, train_episode] algorithm_steps: - Same PPO base as VanillaPPO - Forward dynamics model predicts next state from (state, action) - Intrinsic reward = prediction error of forward model - Total 
reward = extrinsic + eta * intrinsic loss_function: "L = L_ppo + L_forward_model; r_total = r_ext + eta * ||f(s,a) - s'||^2" key_hyperparameters: eta: 0.1 forward_model_lr: 1e-3 differentiator: Adds intrinsic curiosity-driven exploration reward ablations: - name: PPO_NoCuriosity based_on: CuriosityPPO what_is_removed: Intrinsic reward (eta=0, forward model not used) how_it_differs: Same architecture but intrinsic reward zeroed out expected_effect: Should match VanillaPPO performance - name: PPO_ReducedNetwork based_on: VanillaPPO what_is_removed: Half network capacity (32 hidden units) how_it_differs: Smaller policy and value networks expected_effect: Lower performance due to limited capacity metrics: primary_metric: name: primary_metric direction: maximize description: Average episodic reward over last 10 episodes compute_budget: total_time_seconds: 300 conditions: [VanillaPPO, CuriosityPPO, PPO_NoCuriosity, PPO_ReducedNetwork] """, "metric": "primary_metric", "metric_direction": "maximize", }, 3: { "name": "Graph Neural ODE (Synthetic)", "topic": ( "Graph Neural Ordinary Differential Equations for Dynamic System " "Modeling on Synthetic Coupled Oscillator Networks" ), "exp_plan": """ topic: "Graph Neural ODE for Dynamic System Modeling" datasets: - name: SyntheticOscillators source: Generated in-code description: Coupled spring-mass system on a random graph baselines: - name: StaticGCN description: Standard GCN applied at discrete time steps implementation_spec: class_name: StaticGCN key_methods: [__init__, forward, predict_trajectory] algorithm_steps: - 2-layer GCN with message passing - Discrete time step predictions - MSE loss on next-step prediction loss_function: "L = MSE(pred_next, true_next)" key_hyperparameters: hidden_dim: 64 num_layers: 2 lr: 1e-3 proposed_methods: - name: GraphNeuralODE description: Continuous-time dynamics via Neural ODE on graph implementation_spec: class_name: GraphNeuralODE key_methods: [__init__, forward, ode_func, 
predict_trajectory] algorithm_steps: - GNN-based ODE function f(t, x, A) that defines dx/dt - Neural ODE solver (torchdiffeq.odeint) for continuous trajectory - MSE loss on trajectory prediction at observed time points loss_function: "L = MSE(odeint(f, x0, t), x_true)" key_hyperparameters: hidden_dim: 64 solver: dopri5 lr: 1e-3 differentiator: Continuous-time dynamics via ODE solver ablations: - name: GraphODE_NoMessagePassing based_on: GraphNeuralODE what_is_removed: Graph structure (treats nodes independently) how_it_differs: ODE function ignores adjacency, no message passing expected_effect: Worse prediction on coupled systems - name: GraphODE_EulerSolver based_on: GraphNeuralODE what_is_removed: Adaptive ODE solver (uses fixed-step Euler) how_it_differs: Simple Euler integration instead of dopri5 expected_effect: Less accurate trajectories metrics: primary_metric: name: primary_metric direction: minimize description: MSE between predicted and true trajectories compute_budget: total_time_seconds: 300 conditions: [StaticGCN, GraphNeuralODE, GraphODE_NoMessagePassing, GraphODE_EulerSolver] """, "metric": "primary_metric", "metric_direction": "minimize", }, } # --------------------------------------------------------------------------- # Code quality analysis (comprehensive) # --------------------------------------------------------------------------- def analyze_code_quality(files: dict[str, str], test_case: dict) -> dict: """Comprehensive code quality analysis.""" report = { "test_name": test_case["name"], "num_files": len(files), "file_names": list(files.keys()), "total_lines": 0, "effective_lines": 0, "classes_found": [], "functions_found": [], "imports_found": [], "issues": [], "scores": {}, } for fname, code in files.items(): lines = code.split("\n") report["total_lines"] += len(lines) effective = [ l for l in lines if l.strip() and not l.strip().startswith("#") and not l.strip().startswith('"""') and not l.strip().startswith("'''") ] 
report["effective_lines"] += len(effective) # AST analysis try: tree = ast.parse(code) for node in ast.walk(tree): if isinstance(node, ast.ClassDef): methods = [ n.name for n in node.body if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef)) ] method_lines = sum( (n.end_lineno or n.lineno) - n.lineno + 1 for n in node.body if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef)) ) # Check for empty methods empty_methods = [] for n in node.body: if isinstance(n, ast.FunctionDef): body_stmts = [ s for s in n.body if not isinstance(s, (ast.Pass, ast.Expr)) or (isinstance(s, ast.Expr) and not isinstance(s.value, (ast.Constant, ast.Str))) ] if len(body_stmts) <= 1: empty_methods.append(n.name) report["classes_found"].append({ "name": node.name, "file": fname, "methods": methods, "method_count": len(methods), "total_method_lines": method_lines, "bases": [ast.unparse(b) for b in node.bases], "empty_methods": empty_methods, }) elif isinstance(node, ast.FunctionDef) and node.col_offset == 0: report["functions_found"].append({ "name": node.name, "file": fname, "lines": (node.end_lineno or node.lineno) - node.lineno + 1, }) elif isinstance(node, (ast.Import, ast.ImportFrom)): if isinstance(node, ast.Import): for alias in node.names: report["imports_found"].append(alias.name.split(".")[0]) elif node.module: report["imports_found"].append(node.module.split(".")[0]) except SyntaxError as e: report["issues"].append(f"CRITICAL: SyntaxError in {fname}: {e}") report["imports_found"] = sorted(set(report["imports_found"])) # ---- Scoring ---- # 1. Syntax validity (0 or 10) syntax_ok = not any("SyntaxError" in i for i in report["issues"]) report["scores"]["syntax_valid"] = 10 if syntax_ok else 0 # 2. File structure file_score = min(10, len(files) * 5) # 2+ files = 10 report["scores"]["file_structure"] = round(file_score, 1) # 3. 
Class coverage n_classes = len(report["classes_found"]) class_score = min(10, n_classes * 2.5) # 4+ classes = 10 report["scores"]["class_coverage"] = round(class_score, 1) # 4. Code depth depth_score = min(10, report["effective_lines"] / 40) # 400+ = 10 report["scores"]["code_depth"] = round(depth_score, 1) # 5. Method richness if report["classes_found"]: avg_methods = sum(c["method_count"] for c in report["classes_found"]) / n_classes method_score = min(10, avg_methods * 2) # 5+ methods = 10 else: method_score = 0 report["scores"]["method_richness"] = round(method_score, 1) # 6. Class distinctness (check for identical/empty classes) empty_class_count = sum( 1 for c in report["classes_found"] if c["total_method_lines"] < 5 ) identical_pairs = _check_identical_classes(files) distinctness = 10 if empty_class_count > 0: distinctness -= empty_class_count * 3 report["issues"].append( f"WARNING: {empty_class_count} classes have <5 method lines (too thin)" ) if identical_pairs: distinctness -= len(identical_pairs) * 4 for p in identical_pairs: report["issues"].append(f"WARNING: Identical classes: {p}") report["scores"]["class_distinctness"] = max(0, round(distinctness, 1)) # 7. 
Import appropriateness has_torch = "torch" in report["imports_found"] has_numpy = "numpy" in report["imports_found"] import_score = 5 # base if has_torch: import_score += 3 if has_numpy: import_score += 2 report["scores"]["imports"] = min(10, import_score) # Overall score scores = report["scores"] report["overall_score"] = round(sum(scores.values()) / len(scores), 1) return report def _check_identical_classes(files: dict[str, str]) -> list[str]: """Check for classes with identical method bodies.""" identical = [] class_bodies: dict[str, str] = {} for fname, code in files.items(): try: tree = ast.parse(code) except SyntaxError: continue for node in ast.walk(tree): if isinstance(node, ast.ClassDef): # Hash the method bodies method_code = "" for n in node.body: if isinstance(n, ast.FunctionDef): try: method_code += ast.unparse(n) + "\n" except Exception: pass if method_code: key = hash(method_code) if key in class_bodies: identical.append( f"{class_bodies[key]} == {node.name}" ) else: class_bodies[key] = node.name return identical # --------------------------------------------------------------------------- # Sandbox execution # --------------------------------------------------------------------------- def run_in_sandbox( files: dict[str, str], output_dir: Path, config_path: str | None = None, timeout_sec: int = 300, ) -> dict: """Run generated code in subprocess (or Docker sandbox if available).""" # Write files code_dir = output_dir / "experiment" code_dir.mkdir(parents=True, exist_ok=True) for fname, code in files.items(): (code_dir / fname).write_text(code, encoding="utf-8") # Try to run with subprocess as fallback import subprocess main_py = code_dir / "main.py" if not main_py.exists(): return {"status": "failed", "reason": "no main.py"} print(f" Running in subprocess (timeout={timeout_sec}s)...") try: proc = subprocess.run( [sys.executable, str(main_py)], cwd=str(code_dir), capture_output=True, text=True, timeout=timeout_sec, env={**os.environ, "PYTHONPATH": 
str(code_dir)}, ) stdout = proc.stdout stderr = proc.stderr returncode = proc.returncode timed_out = False except subprocess.TimeoutExpired: stdout = "" stderr = "TIMEOUT" returncode = -1 timed_out = True # Parse results result = { "status": "success" if returncode == 0 else "failed", "returncode": returncode, "timed_out": timed_out, "stdout_lines": len(stdout.split("\n")) if stdout else 0, "stderr_lines": len(stderr.split("\n")) if stderr else 0, "conditions_found": [], "metrics_found": {}, "has_metric_def": False, "has_registered_conditions": False, } # Parse stdout for conditions and metrics if stdout: for line in stdout.split("\n"): if line.startswith("METRIC_DEF:"): result["has_metric_def"] = True elif line.startswith("REGISTERED_CONDITIONS:"): result["has_registered_conditions"] = True conds = line.split(":", 1)[1].strip() result["conditions_found"] = [c.strip() for c in conds.split(",")] elif "condition=" in line: m = re.match(r"condition=(\S+)\s+(\S+):\s+(\S+)", line) if m: cond, metric_name, value = m.groups() if cond not in result["metrics_found"]: result["metrics_found"][cond] = {} try: result["metrics_found"][cond][metric_name] = float(value) except ValueError: pass # Score execution exec_score = 0 if returncode == 0: exec_score += 3 # runs if result["has_metric_def"]: exec_score += 1 if result["has_registered_conditions"]: exec_score += 1 if result["conditions_found"]: exec_score += min(3, len(result["conditions_found"])) # up to 3 for conditions if result["metrics_found"]: exec_score += 2 # produces metrics result["exec_score"] = min(10, exec_score) # Save stdout/stderr (output_dir / "stdout.txt").write_text(stdout or "(empty)", encoding="utf-8") (output_dir / "stderr.txt").write_text(stderr or "(empty)", encoding="utf-8") return result # --------------------------------------------------------------------------- # Load experiment plan from previous run # --------------------------------------------------------------------------- def 
load_from_run(run_dir: str) -> dict: """Load experiment plan and config from a previous pipeline run.""" run_path = Path(run_dir) if not run_path.exists(): print(f"ERROR: Run directory not found: {run_dir}") sys.exit(1) # Find exp_plan.yaml plan_path = None for s9_dir in sorted(run_path.glob("stage-09*"), reverse=True): candidate = s9_dir / "exp_plan.yaml" if candidate.exists(): plan_path = candidate break if plan_path is None: print(f"ERROR: No exp_plan.yaml found in {run_dir}/stage-09*/") sys.exit(1) exp_plan = plan_path.read_text(encoding="utf-8") # Load topic from config or stage-01 topic = "" for topic_file in ["topic_evaluation.json", "topic.json"]: for s_dir in sorted(run_path.glob("stage-0[12]*"), reverse=True): tf = s_dir / topic_file if tf.exists(): try: td = json.loads(tf.read_text(encoding="utf-8")) topic = td.get("topic", "") or td.get("research_topic", "") if topic: break except Exception: pass if topic: break # Try to extract topic from exp_plan if not found elsewhere if not topic: import yaml try: plan_data = yaml.safe_load(exp_plan) topic = plan_data.get("topic", "Unknown Topic") except Exception: topic = "Unknown Topic" return { "name": f"From {run_path.name}", "topic": topic, "exp_plan": exp_plan, "metric": "primary_metric", "metric_direction": "maximize", } # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- def main(): parser = argparse.ArgumentParser( description="Test code generation quality with optional sandbox execution" ) parser.add_argument("--model", default="gpt-5.1", help="Model to use") parser.add_argument("--test-id", type=int, default=0, help="Test case ID (0=all)") parser.add_argument("--from-run", default="", help="Load exp plan from run dir") parser.add_argument("--no-sandbox", action="store_true", help="Skip sandbox execution") parser.add_argument("--sandbox-timeout", type=int, default=300, help="Sandbox timeout 
(sec)") parser.add_argument("--output-dir", default="test_outputs_codegen", help="Output dir") parser.add_argument("--config", default="config_run20.yaml", help="Config file for LLM") args = parser.parse_args() # Setup LLM client # Try loading from config file first config_path = Path(args.config) if config_path.exists(): import yaml with open(config_path) as f: cfg = yaml.safe_load(f) llm_cfg = cfg.get("llm", {}) base_url = llm_cfg.get("base_url", "") api_key = llm_cfg.get("api_key", "") or os.environ.get( llm_cfg.get("api_key_env", "OPENAI_API_KEY"), "" ) else: base_url = os.environ.get("OPENAI_BASE_URL", "") api_key = os.environ.get("OPENAI_API_KEY", "") if not base_url or not api_key: print("ERROR: Need LLM config. Provide --config or set env vars.") sys.exit(1) llm_config = LLMConfig( base_url=base_url, api_key=api_key, primary_model=args.model, fallback_models=["gpt-4.1", "gpt-4o"], max_tokens=16384, temperature=0.7, timeout_sec=300, ) llm = LLMClient(llm_config) # Connectivity test print(f"Testing LLM ({args.model})...", end=" ", flush=True) ok, msg = llm.preflight() if not ok: print(f"FAILED: {msg}") sys.exit(1) print("OK") pm = PromptManager() # Select test cases if args.from_run: cases = {99: load_from_run(args.from_run)} elif args.test_id > 0: if args.test_id not in TEST_CASES: print(f"ERROR: Unknown test ID {args.test_id}. 
Available: {list(TEST_CASES.keys())}") sys.exit(1) cases = {args.test_id: TEST_CASES[args.test_id]} else: cases = dict(TEST_CASES) output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) all_reports = [] for test_id, tc in cases.items(): print(f"\n{'='*70}") print(f" Test {test_id}: {tc['name']}") print(f"{'='*70}") stage_dir = output_dir / f"test_{test_id}" stage_dir.mkdir(parents=True, exist_ok=True) # Configure CodeAgent agent_config = CodeAgentConfig( architecture_planning=True, exec_fix_max_iterations=0, # no sandbox in generation phase tree_search_enabled=False, review_max_rounds=2, ) agent = CodeAgent( llm=llm, prompts=pm, config=agent_config, stage_dir=stage_dir, ) # Build pkg_hint pkg_hint = ( "\nAVAILABLE PACKAGES (docker mode): Python stdlib, numpy, torch, " "torchvision, torchaudio, matplotlib, seaborn, scipy, tqdm, " "torchdiffeq, gymnasium, networkx, PyYAML, Pillow, transformers, " "datasets, accelerate, peft, timm, einops, torchmetrics.\n" "GPU: NVIDIA RTX 6000 Ada (49GB VRAM). 
" "Use `device = torch.device('cuda')` for tensor operations.\n" ) metric_dir = tc.get("metric_direction", "maximize") pkg_hint += f"\nMETRIC DIRECTION: {metric_dir}\n" # Add compute budget pkg_hint += ( "\n## Compute Budget Constraint\n" "- Total execution time limit: 300 seconds\n" "- Design experiments that complete within this budget\n" "- Implement a time guard: stop gracefully at 80% of budget\n" ) # Generate t0 = time.time() result = agent.generate( topic=tc["topic"], exp_plan=tc["exp_plan"], metric=tc.get("metric", "primary_metric"), pkg_hint=pkg_hint, max_tokens=16384, ) gen_elapsed = time.time() - t0 print(f"\n Generation: {gen_elapsed:.1f}s, {result.total_llm_calls} LLM calls") print(f" Architecture spec: {len(result.architecture_spec)} chars") print(f" Review rounds: {result.review_rounds}") # Write files for fname, code in result.files.items(): fpath = stage_dir / fname fpath.parent.mkdir(parents=True, exist_ok=True) fpath.write_text(code, encoding="utf-8") print(f" -> {fname}: {len(code.split(chr(10)))} lines") if result.architecture_spec: (stage_dir / "architecture_spec.yaml").write_text( result.architecture_spec, encoding="utf-8" ) # Quality analysis report = analyze_code_quality(result.files, tc) report["generation_time_sec"] = round(gen_elapsed, 1) report["llm_calls"] = result.total_llm_calls # Sandbox execution exec_result = {"status": "skipped"} if not args.no_sandbox and result.files: exec_result = run_in_sandbox( result.files, stage_dir, timeout_sec=args.sandbox_timeout, ) report["execution"] = exec_result print(f"\n Execution: {exec_result['status']}") if exec_result.get("returncode") is not None: print(f" Return code: {exec_result['returncode']}") if exec_result.get("conditions_found"): print(f" Conditions: {', '.join(exec_result['conditions_found'])}") if exec_result.get("metrics_found"): for cond, metrics in exec_result["metrics_found"].items(): print(f" {cond}: {metrics}") # Print scores print(f"\n --- Scores ---") for k, v in 
report["scores"].items(): print(f" {k}: {v}/10") if exec_result.get("exec_score") is not None: print(f" execution: {exec_result['exec_score']}/10") print(f" OVERALL: {report['overall_score']}/10") if report["issues"]: print(f"\n Issues:") for issue in report["issues"]: print(f" - {issue}") # Save report (stage_dir / "quality_report.json").write_text( json.dumps(report, indent=2, default=str), encoding="utf-8" ) all_reports.append(report) # Summary if len(all_reports) > 1: print(f"\n{'='*70}") print(" SUMMARY") print(f"{'='*70}") for r in all_reports: exec_info = "" if "execution" in r: exec_info = f" | exec: {r['execution'].get('status', '?')}" print( f" {r['test_name']}: {r['overall_score']}/10 " f"({r['effective_lines']} lines, " f"{len(r['classes_found'])} classes{exec_info})" ) avg = sum(r["overall_score"] for r in all_reports) / len(all_reports) print(f"\n Average: {avg:.1f}/10") (output_dir / "summary.json").write_text( json.dumps(all_reports, indent=2, default=str), encoding="utf-8" ) print(f"\nAll outputs saved to: {output_dir}/") if __name__ == "__main__": main() ================================================ FILE: sentinel.sh ================================================ #!/usr/bin/env bash # sentinel.sh — Watchdog for AutoResearchClaw pipeline process. # # Monitors the pipeline heartbeat file and auto-restarts on crash. # Inspired by Sibyl's sentinel watchdog design. # # Usage: # ./sentinel.sh [--python ] # # The pipeline runner writes heartbeat.json after each stage. If the # heartbeat goes stale (>5 min) and the PID is dead, sentinel restarts. 
#
# Configuration via environment:
#   SENTINEL_CHECK_INTERVAL — seconds between checks (default: 60)
#   SENTINEL_STALE_THRESHOLD — seconds before heartbeat is stale (default: 300)
#   SENTINEL_MAX_RETRIES — max restart attempts (default: 5)
#   SENTINEL_COOLDOWN — seconds to wait after 3 consecutive failures (default: 360)

set -euo pipefail

# --- Arguments ---
# First positional arg is the run directory; it is mandatory (":?" aborts
# with the usage message if missing). Remaining args are option flags.
RUN_DIR="${1:?Usage: sentinel.sh  [--python ]}"
PYTHON_PATH="python"
shift
while [[ $# -gt 0 ]]; do
  case "$1" in
    --python)
      PYTHON_PATH="$2"
      shift 2
      ;;
    *)
      echo "Unknown argument: $1" >&2
      exit 1
      ;;
  esac
done

# --- Configuration ---
# Each knob falls back to a default when the env var is unset.
CHECK_INTERVAL="${SENTINEL_CHECK_INTERVAL:-60}"
STALE_THRESHOLD="${SENTINEL_STALE_THRESHOLD:-300}"
MAX_RETRIES="${SENTINEL_MAX_RETRIES:-5}"
COOLDOWN="${SENTINEL_COOLDOWN:-360}"

HEARTBEAT_FILE="${RUN_DIR}/heartbeat.json"
RECOVERY_LOG="${RUN_DIR}/sentinel_recovery.log"
FAILED_LOG="${RUN_DIR}/sentinel_failed.log"

# Mutable watchdog state: total restarts so far, and restarts since the
# last healthy check (the latter triggers the cooldown below).
retry_count=0
consecutive_failures=0

# Log a timestamped message to stdout and append it to the recovery log.
log() {
  local msg="[sentinel $(date '+%Y-%m-%dT%H:%M:%S')] $1"
  echo "$msg"
  echo "$msg" >> "$RECOVERY_LOG"
}

# --- Check if heartbeat is stale ---
# Returns 0 (stale) when the heartbeat file is missing or its ISO-8601
# 'timestamp' field is older than STALE_THRESHOLD seconds. Parsing failures
# print epoch 0, which always reads as stale.
is_stale() {
  if [[ ! -f "$HEARTBEAT_FILE" ]]; then
    return 0  # No heartbeat = stale
  fi
  local now
  now=$(date +%s)
  # Extract timestamp from heartbeat.json
  local hb_ts
  hb_ts=$(python3 -c "
import json, sys
try:
    data = json.load(open('${HEARTBEAT_FILE}'))
    from datetime import datetime
    ts = datetime.fromisoformat(data['timestamp'])
    print(int(ts.timestamp()))
except Exception:
    print(0)
" 2>/dev/null || echo 0)
  local age=$(( now - hb_ts ))
  [[ $age -gt $STALE_THRESHOLD ]]
}

# --- Check if PID is alive ---
# Reads RUN_DIR/pipeline.pid and probes the process with signal 0
# (no signal is delivered; kill -0 only tests existence/permission).
pid_alive() {
  local pid_file="${RUN_DIR}/pipeline.pid"
  if [[ ! -f "$pid_file" ]]; then
    return 1
  fi
  local pid
  pid=$(cat "$pid_file" 2>/dev/null || echo "")
  if [[ -z "$pid" ]]; then
    return 1
  fi
  kill -0 "$pid" 2>/dev/null
}

# --- Check for active subprocesses ---
# True when the recorded pipeline PID still has direct children
# (pgrep -P lists processes whose parent is $pid).
has_active_children() {
  local pid_file="${RUN_DIR}/pipeline.pid"
  if [[ ! -f "$pid_file" ]]; then
    return 1
  fi
  local pid
  pid=$(cat "$pid_file" 2>/dev/null || echo "")
  if [[ -z "$pid" ]]; then
    return 1
  fi
  # Check if any child processes exist
  pgrep -P "$pid" > /dev/null 2>&1
}

# --- Restart pipeline ---
# Relaunches the pipeline in --resume mode in the background, records the
# new PID (so pid_alive/has_active_children track it), and bumps retry_count.
restart_pipeline() {
  log "Attempting pipeline restart (attempt $((retry_count + 1))/${MAX_RETRIES})"
  $PYTHON_PATH -m researchclaw run --resume --output "$RUN_DIR" &
  local new_pid=$!
  echo "$new_pid" > "${RUN_DIR}/pipeline.pid"
  log "Pipeline restarted with PID ${new_pid}"
  retry_count=$((retry_count + 1))
}

# --- Main loop ---
log "Sentinel started for ${RUN_DIR}"
log "Check interval: ${CHECK_INTERVAL}s, Stale threshold: ${STALE_THRESHOLD}s"
log "Max retries: ${MAX_RETRIES}, Cooldown: ${COOLDOWN}s"

while true; do
  sleep "$CHECK_INTERVAL"

  # If PID is alive, reset failure counter
  if pid_alive; then
    consecutive_failures=0
    continue
  fi

  # PID is dead — check if heartbeat is stale
  if ! is_stale; then
    # Heartbeat is fresh but PID is gone — might have just exited normally
    continue
  fi

  # Don't interrupt active subprocesses
  if has_active_children; then
    log "Active subprocesses detected — skipping restart"
    continue
  fi

  # Check retry limit
  if [[ $retry_count -ge $MAX_RETRIES ]]; then
    log "Max retries (${MAX_RETRIES}) reached — sentinel giving up"
    echo "Sentinel failed after ${MAX_RETRIES} retries at $(date)" >> "$FAILED_LOG"
    exit 1
  fi

  # Cooldown after consecutive failures
  consecutive_failures=$((consecutive_failures + 1))
  if [[ $consecutive_failures -ge 3 ]]; then
    log "3 consecutive failures — cooling down for ${COOLDOWN}s"
    sleep "$COOLDOWN"
    consecutive_failures=0
  fi

  restart_pipeline
done



================================================
FILE: tests/__init__.py
================================================



================================================
FILE: tests/conftest.py
================================================
# conftest.py — shared pytest fixtures for researchclaw tests


================================================
FILE:
tests/e2e_docker_sandbox.py ================================================ #!/usr/bin/env python3 """End-to-end verification for Docker sandbox. Run after building the image: docker build -t researchclaw/experiment:latest researchclaw/docker/ python tests/e2e_docker_sandbox.py """ from __future__ import annotations import json import sys import tempfile from pathlib import Path # Add project root to path sys.path.insert(0, str(Path(__file__).parent.parent)) from researchclaw.config import DockerSandboxConfig, ExperimentConfig from researchclaw.experiment.docker_sandbox import DockerSandbox from researchclaw.experiment.factory import create_sandbox PASS = "\033[92mPASS\033[0m" FAIL = "\033[91mFAIL\033[0m" SKIP = "\033[93mSKIP\033[0m" results: list[tuple[str, bool, str]] = [] def check(name: str, ok: bool, detail: str = "") -> None: results.append((name, ok, detail)) tag = PASS if ok else FAIL msg = f" [{tag}] {name}" if detail: msg += f" — {detail}" print(msg) def main() -> None: print("=" * 60) print("Docker Sandbox End-to-End Verification") print("=" * 60) # ── Preflight ────────────────────────────────────────────── print("\n--- Preflight ---") docker_ok = DockerSandbox.check_docker_available() check("Docker daemon reachable", docker_ok) if not docker_ok: print("\nDocker is not available. Cannot proceed.") sys.exit(1) image_ok = DockerSandbox.ensure_image("researchclaw/experiment:latest") check("Image exists locally", image_ok) if not image_ok: print("\nImage not found. 
Build it first:") print(" docker build -t researchclaw/experiment:latest researchclaw/docker/") sys.exit(1) # ── Test 1: Basic execution + metrics ────────────────────── print("\n--- Test 1: Basic execution + metrics ---") with tempfile.TemporaryDirectory(prefix="rc_e2e_") as tmp: cfg = DockerSandboxConfig(gpu_enabled=False, network_policy="none") sandbox = DockerSandbox(cfg, Path(tmp) / "work") code = ( "import numpy as np\n" "x = np.random.randn(100)\n" "print(f'primary_metric: {float(np.mean(x**2)):.4f}')\n" "print(f'std: {float(np.std(x)):.4f}')\n" "print('Done.')\n" ) r = sandbox.run(code, timeout_sec=60) check("returncode == 0", r.returncode == 0, f"rc={r.returncode}") check("metrics parsed", "primary_metric" in r.metrics, str(r.metrics)) check("stdout non-empty", bool(r.stdout.strip()), repr(r.stdout[:100])) check("timed_out is False", r.timed_out is False) check("elapsed_sec > 0", r.elapsed_sec > 0, f"{r.elapsed_sec:.2f}s") # ── Test 2: Multi-file project ───────────────────────────── print("\n--- Test 2: Multi-file project ---") with tempfile.TemporaryDirectory(prefix="rc_e2e_") as tmp: cfg = DockerSandboxConfig(gpu_enabled=False, network_policy="none") sandbox = DockerSandbox(cfg, Path(tmp) / "work") project = Path(tmp) / "project" project.mkdir() (project / "utils.py").write_text( "def add(a, b): return a + b\n", encoding="utf-8" ) (project / "main.py").write_text( "from utils import add\n" "result = add(3, 4)\n" "print(f'primary_metric: {result}')\n", encoding="utf-8", ) r = sandbox.run_project(project, timeout_sec=60) check("project returncode == 0", r.returncode == 0, f"rc={r.returncode}") check("project metric correct", r.metrics.get("primary_metric") == 7.0, str(r.metrics)) # ── Test 3: results.json ─────────────────────────────────── print("\n--- Test 3: results.json from volume ---") with tempfile.TemporaryDirectory(prefix="rc_e2e_") as tmp: cfg = DockerSandboxConfig(gpu_enabled=False, network_policy="none") sandbox = DockerSandbox(cfg, Path(tmp) 
/ "work") code = ( "import json\n" "results = {'accuracy': 0.92, 'f1': 0.88}\n" "with open('results.json', 'w') as f:\n" " json.dump(results, f)\n" "print('primary_metric: 0.92')\n" ) r = sandbox.run(code, timeout_sec=60) check("results.json metric merged", "f1" in r.metrics, str(r.metrics)) # ── Test 4: Network isolation ────────────────────────────── print("\n--- Test 4: Network isolation ---") with tempfile.TemporaryDirectory(prefix="rc_e2e_") as tmp: cfg = DockerSandboxConfig(gpu_enabled=False, network_policy="none") sandbox = DockerSandbox(cfg, Path(tmp) / "work") code = ( "import urllib.request\n" "try:\n" " urllib.request.urlopen('http://example.com', timeout=5)\n" " print('NETWORK_ACCESS: yes')\n" "except Exception as e:\n" " print('NETWORK_ACCESS: no')\n" " print(f'primary_metric: 1.0')\n" ) r = sandbox.run(code, timeout_sec=30) network_blocked = "NETWORK_ACCESS: no" in r.stdout check("Network blocked (--network=none)", network_blocked, r.stdout.strip()[:200]) # ── Test 5: GPU visibility ───────────────────────────────── print("\n--- Test 5: GPU visibility ---") with tempfile.TemporaryDirectory(prefix="rc_e2e_") as tmp: cfg = DockerSandboxConfig(gpu_enabled=True, network_policy="none") sandbox = DockerSandbox(cfg, Path(tmp) / "work") code = ( "import torch\n" "gpu_available = torch.cuda.is_available()\n" "if gpu_available:\n" " print(f'GPU: {torch.cuda.get_device_name(0)}')\n" " print('primary_metric: 1.0')\n" "else:\n" " print('GPU: none')\n" " print('primary_metric: 0.0')\n" ) r = sandbox.run(code, timeout_sec=60) gpu_visible = "primary_metric" in r.metrics and r.metrics["primary_metric"] == 1.0 if gpu_visible: check("GPU visible in container", True, r.stdout.strip()[:200]) else: # Not a hard failure — might not have NVIDIA runtime print(f" [{SKIP}] GPU not visible (NVIDIA Container Toolkit may not be installed)") print(f" stdout: {r.stdout.strip()[:200]}") print(f" stderr: {r.stderr.strip()[:200]}") # ── Test 6: Memory limit 
────────────────────────────────── print("\n--- Test 6: Memory limit enforcement ---") with tempfile.TemporaryDirectory(prefix="rc_e2e_") as tmp: # Set a very low memory limit to trigger OOM cfg = DockerSandboxConfig( gpu_enabled=False, network_policy="none", memory_limit_mb=64 ) sandbox = DockerSandbox(cfg, Path(tmp) / "work") code = ( "import numpy as np\n" "# Allocate ~200MB to exceed 64MB limit\n" "x = np.ones((25_000_000,), dtype=np.float64)\n" "print(f'primary_metric: {x.sum()}')\n" ) r = sandbox.run(code, timeout_sec=30) oom = r.returncode != 0 check("OOM kills container (64MB limit, 200MB alloc)", oom, f"rc={r.returncode}, stderr={r.stderr.strip()[:200]}") # ── Test 7: Factory integration ──────────────────────────── print("\n--- Test 7: Factory integration ---") with tempfile.TemporaryDirectory(prefix="rc_e2e_") as tmp: config = ExperimentConfig(mode="docker", docker=DockerSandboxConfig(gpu_enabled=False)) sandbox = create_sandbox(config, Path(tmp) / "work") check("Factory returns DockerSandbox", isinstance(sandbox, DockerSandbox)) r = sandbox.run("print('primary_metric: 42.0')", timeout_sec=30) check("Factory sandbox executes", r.returncode == 0 and r.metrics.get("primary_metric") == 42.0, str(r.metrics)) # ── Summary ──────────────────────────────────────────────── print("\n" + "=" * 60) passed = sum(1 for _, ok, _ in results if ok) failed = sum(1 for _, ok, _ in results if not ok) print(f"Results: {passed} passed, {failed} failed") if failed: print("\nFailed tests:") for name, ok, detail in results: if not ok: print(f" - {name}: {detail}") sys.exit(1) else: print("All tests passed!") if __name__ == "__main__": main() ================================================ FILE: tests/e2e_real_llm.py ================================================ #!/usr/bin/env python3 """Real E2E test: run all 22 stages with actual LLM API calls. 
Usage: .venv_arc/bin/python3 tests/e2e_real_llm.py """ from __future__ import annotations import json import sys import time from pathlib import Path import yaml # Ensure project root is on path sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from researchclaw.config import RCConfig from researchclaw.adapters import AdapterBundle from researchclaw.llm.client import LLMClient from researchclaw.pipeline.stages import Stage, STAGE_SEQUENCE from researchclaw.pipeline.executor import execute_stage, StageResult from researchclaw.pipeline.runner import execute_pipeline def main() -> None: # --- Load config --- config_path = Path("config.arc.yaml") if not config_path.exists(): print("ERROR: config.arc.yaml not found") sys.exit(1) with open(config_path) as f: raw = yaml.safe_load(f) # Override for test raw["research"]["topic"] = ( "Efficient Attention Mechanisms for Long-Context Language Models" ) raw["experiment"]["mode"] = "sandbox" raw["experiment"]["time_budget_sec"] = 60 raw["experiment"]["max_iterations"] = 3 config = RCConfig.from_dict(raw, check_paths=False) adapters = AdapterBundle() # --- Create run directory --- run_dir = Path("artifacts/e2e-real-llm-run") run_dir.mkdir(parents=True, exist_ok=True) run_id = f"e2e-real-{int(time.time())}" print(f"=" * 70) print(f"ResearchClaw E2E Test — Real LLM API") print(f"Topic: {config.research.topic}") print(f"Run ID: {run_id}") print(f"Output: {run_dir}") print(f"=" * 70) # --- Run full pipeline --- start = time.time() results = execute_pipeline( run_dir=run_dir, run_id=run_id, config=config, adapters=adapters, auto_approve_gates=True, # Auto-approve all gates for E2E test kb_root=run_dir / "kb", ) total_time = time.time() - start # --- Report --- print(f"\n{'=' * 70}") print(f"RESULTS: {len(results)}/22 stages executed in {total_time:.1f}s") print(f"{'=' * 70}") passed = 0 failed = 0 for r in results: status_icon = "✅" if r.status.value == "done" else "❌" print( f" {status_icon} Stage {int(r.stage):02d} 
{r.stage.name}: {r.status.value} | artifacts: {r.artifacts}" ) if r.status.value == "done": passed += 1 else: failed += 1 print(f"\n{'=' * 70}") print(f"SUMMARY: {passed} passed, {failed} failed, {total_time:.1f}s total") print(f"{'=' * 70}") # --- Validate key artifacts --- checks = [ ("Stage 1 goal.md", "stage-01/goal.md"), ("Stage 10 experiment.py", "stage-10/experiment.py"), ("Stage 12 runs/", "stage-12/runs"), ("Stage 14 experiment_summary.json", "stage-14/experiment_summary.json"), ("Stage 17 paper_draft.md", "stage-17/paper_draft.md"), ("Stage 22 export files", "stage-22"), ] print("\nArtifact Checks:") for label, path in checks: full = run_dir / path exists = full.exists() if full.is_file(): size = full.stat().st_size print(f" {'✅' if exists else '❌'} {label}: {size} bytes") elif full.is_dir(): count = len(list(full.iterdir())) if exists else 0 print(f" {'✅' if exists else '❌'} {label}: {count} items") else: print(f" {'❌'} {label}: NOT FOUND") # --- Check experiment_summary.json has real data --- summary_path = run_dir / "stage-14" / "experiment_summary.json" if summary_path.exists(): summary = json.loads(summary_path.read_text()) has_metrics = bool(summary.get("metrics_summary")) print( f"\n 📊 Experiment summary has real metrics: {'YES' if has_metrics else 'NO'}" ) if has_metrics: for k, v in summary["metrics_summary"].items(): print(f" - {k}: {v}") # --- Check paper draft has real data (not placeholder) --- draft_path = run_dir / "stage-17" / "paper_draft.md" if draft_path.exists(): draft = draft_path.read_text() has_placeholder = "no quantitative results yet" in draft.lower() has_template = draft.count("Template") > 3 print( f" 📝 Paper draft: {len(draft)} chars, placeholder={has_placeholder}, template={has_template}" ) # --- Check validation report --- val_report = run_dir / "stage-10" / "validation_report.md" if val_report.exists(): print(f" 🔍 Code validation report: {val_report.stat().st_size} bytes") print(f" {val_report.read_text()[:200]}") # Final 
verdict if passed == 22 and failed == 0: print(f"\n🎉 ALL 22 STAGES PASSED!") sys.exit(0) else: print(f"\n⚠️ {failed} stages did not pass.") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tests/test_anthropic.py ================================================ """测试 Anthropic Messages 兼容 API 是否可用。""" from __future__ import annotations import os from typing import Any import httpx import pytest pytestmark = pytest.mark.skipif( "ANTHROPIC_API_KEY" not in os.environ, reason="ANTHROPIC_API_KEY not set", ) BASE_URL = os.environ.get("ANTHROPIC_BASE_URL", "https://api.anthropic.com") API_KEY = os.environ.get("ANTHROPIC_API_KEY", "") MODEL = os.environ.get("ANTHROPIC_MODEL", "claude-haiku-4-5-20251001") def _create_message() -> dict[str, Any]: url = f"{BASE_URL.rstrip('/')}/v1/messages" headers = { "content-type": "application/json", "anthropic-version": "2023-06-01", "x-api-key": API_KEY, } payload = { "model": MODEL, "max_tokens": 256, "messages": [{"role": "user", "content": "Say hello in one sentence."}], } with httpx.Client(timeout=30.0) as client: response = client.post(url, headers=headers, json=payload) response.raise_for_status() return response.json() def test_anthropic_api() -> None: message = _create_message() usage = message.get("usage", {}) content = message.get("content", []) text_blocks = [block.get("text", "") for block in content if block.get("type") == "text"] print(f"Status: stop_reason={message.get('stop_reason')}") print(f"Model: {message.get('model')}") print(f"Usage: input={usage.get('input_tokens')}, output={usage.get('output_tokens')}") print(f"Response: {' '.join(text_blocks)}") assert message.get("type") == "message" assert len(content) > 0 print("\n✅ API 可用!") if __name__ == "__main__": test_anthropic_api() ================================================ FILE: tests/test_assessor.py ================================================ """Tests for researchclaw.assessor — Paper Quality Assessor 
(Agent D3). 20+ tests covering rubrics, scorer, venue_recommender, and comparator. """ from __future__ import annotations import asyncio import json from pathlib import Path from typing import Any from unittest.mock import AsyncMock import pytest from researchclaw.assessor.rubrics import RUBRICS, Rubric from researchclaw.assessor.scorer import PaperScorer from researchclaw.assessor.venue_recommender import VenueRecommender from researchclaw.assessor.comparator import HistoryComparator # --------------------------------------------------------------------------- # Fixtures # --------------------------------------------------------------------------- def _sample_paper() -> str: return ( "# Novel Graph Attention Networks\n\n" "## Abstract\nWe propose a new method for graph-based learning.\n\n" "## Experiments\nWe compare against baseline on CIFAR-10.\n" "Results are shown in table 1 and figure 2.\n" "Our method achieves 95.2% accuracy.\n" ) * 5 # ~500 words def _sample_scores(overall: float = 7.5) -> dict[str, Any]: return { "scores": { "novelty": 7.0, "rigor": 8.0, "clarity": 7.0, "impact": 7.5, "experiments": 8.0, }, "overall": overall, } class MockLLM: """Minimal mock LLM client.""" def __init__(self, response: str = "SCORE: 7\nREASON: Solid contribution"): self.response = response async def chat_async(self, prompt: str) -> str: return self.response class FailingLLM: async def chat_async(self, prompt: str) -> str: raise RuntimeError("API error") # =================================================================== # Rubric tests # =================================================================== class TestRubrics: def test_all_five_dimensions_present(self): assert set(RUBRICS.keys()) == { "novelty", "rigor", "clarity", "impact", "experiments" } def test_rubric_is_frozen(self): r = RUBRICS["novelty"] with pytest.raises(AttributeError): r.name = "changed" # type: ignore[misc] def test_rubric_has_criteria_and_scale(self): for dim, rubric in RUBRICS.items(): assert 
rubric.criteria, f"{dim} missing criteria" assert rubric.scale, f"{dim} missing scale" def test_default_weight(self): r = Rubric(name="test", criteria="test criteria", scale="1-10") assert r.weight == 1.0 # =================================================================== # PaperScorer tests # =================================================================== class TestPaperScorer: def test_score_without_llm(self): scorer = PaperScorer() result = asyncio.run(scorer.score(_sample_paper())) assert "overall" in result assert "scores" in result assert isinstance(result["overall"], float) assert len(result["dimensions_evaluated"]) == 5 def test_score_with_mock_llm(self): llm = MockLLM("SCORE: 8\nREASON: Excellent work") scorer = PaperScorer(llm_client=llm) result = asyncio.run(scorer.score(_sample_paper())) assert result["overall"] == 8.0 for dim in result["scores"]: assert result["scores"][dim] == 8.0 def test_score_with_failing_llm_falls_back(self): scorer = PaperScorer(llm_client=FailingLLM()) result = asyncio.run(scorer.score(_sample_paper())) # Should still return valid scores via heuristic assert "overall" in result assert result["overall"] > 0 def test_score_subset_dimensions(self): scorer = PaperScorer(dimensions=("novelty", "clarity")) result = asyncio.run(scorer.score(_sample_paper())) assert len(result["dimensions_evaluated"]) == 2 def test_parse_score_valid(self): score, reason = PaperScorer._parse_score_response( "SCORE: 9\nREASON: Breakthrough paper", "novelty" ) assert score == 9.0 assert reason == "Breakthrough paper" def test_parse_score_clamped(self): score, _ = PaperScorer._parse_score_response("SCORE: 15", "test") assert score == 10.0 score, _ = PaperScorer._parse_score_response("SCORE: 0", "test") assert score == 1.0 def test_parse_score_missing(self): score, reason = PaperScorer._parse_score_response("No format here", "test") assert score == 5.0 # default assert reason == "No detail provided" def test_heuristic_clarity_long_paper(self): 
long_paper = "word " * 4000 score, detail = PaperScorer._heuristic_score(long_paper, RUBRICS["clarity"]) assert score == 6.0 assert "4000" in detail def test_heuristic_clarity_short_paper(self): short_paper = "word " * 500 score, _ = PaperScorer._heuristic_score(short_paper, RUBRICS["clarity"]) assert score == 3.0 def test_heuristic_experiments_with_table_and_figure(self): paper = "Results in table 1 and figure 3 show improvements." score, _ = PaperScorer._heuristic_score(paper, RUBRICS["experiments"]) assert score == 7.0 # 4.0 + 1.5 + 1.5 def test_heuristic_experiments_no_evidence(self): paper = "We discuss theoretical implications." score, _ = PaperScorer._heuristic_score(paper, RUBRICS["experiments"]) assert score == 4.0 def test_heuristic_default_dimension(self): paper = "Some paper content" score, reason = PaperScorer._heuristic_score(paper, RUBRICS["novelty"]) assert score == 5.0 assert "default" in reason.lower() # =================================================================== # VenueRecommender tests # =================================================================== class TestVenueRecommender: def test_recommend_high_score(self): rec = VenueRecommender() scores = _sample_scores(overall=9.0) results = rec.recommend(scores) # Should include tier 1 venues tier_1_venues = [r for r in results if r["tier"] == "tier_1"] assert len(tier_1_venues) > 0 def test_recommend_low_score(self): rec = VenueRecommender() scores = _sample_scores(overall=2.0) results = rec.recommend(scores) assert len(results) == 0 def test_recommend_medium_score_no_tier1(self): rec = VenueRecommender() scores = _sample_scores(overall=5.0) results = rec.recommend(scores) tier_1 = [r for r in results if r["tier"] == "tier_1"] assert len(tier_1) == 0 def test_recommend_filter_by_domain(self): rec = VenueRecommender() scores = _sample_scores(overall=9.0) results = rec.recommend(scores, domains=["cv"]) for r in results: assert "cv" in r["venue_domains"] or "deep-learning" in 
r["venue_domains"] def test_get_suggestion_weak_dimension(self): scores = {"scores": {"novelty": 3, "clarity": 8}, "overall": 5.5} suggestion = VenueRecommender._get_suggestion("ICML", scores) assert "novelty" in suggestion.lower() assert "Strengthen" in suggestion def test_get_suggestion_moderate(self): scores = {"scores": {"novelty": 6, "clarity": 8}, "overall": 7.0} suggestion = VenueRecommender._get_suggestion("ICML", scores) assert "improving" in suggestion.lower() def test_get_suggestion_strong(self): scores = {"scores": {"novelty": 8, "clarity": 9}, "overall": 8.5} suggestion = VenueRecommender._get_suggestion("ICML", scores) assert "strong" in suggestion.lower() def test_get_suggestion_no_scores(self): scores = {"overall": 5.0} suggestion = VenueRecommender._get_suggestion("ICML", scores) assert "Evaluate" in suggestion def test_format_recommendations_empty(self): rec = VenueRecommender() output = rec.format_recommendations([]) assert "No suitable venues" in output def test_format_recommendations_with_data(self): rec = VenueRecommender() results = rec.recommend(_sample_scores(overall=9.0)) output = rec.format_recommendations(results) assert "Venue Recommendations" in output # =================================================================== # HistoryComparator tests # =================================================================== class TestHistoryComparator: def test_record_and_get_history(self, tmp_path: Path): comp = HistoryComparator(history_dir=tmp_path) comp.record("run-1", "topic A", _sample_scores(7.5)) history = comp.get_history() assert len(history) == 1 assert history[0]["run_id"] == "run-1" def test_record_persists_to_disk(self, tmp_path: Path): comp = HistoryComparator(history_dir=tmp_path) comp.record("run-1", "topic A", _sample_scores(7.5)) # Reload from disk comp2 = HistoryComparator(history_dir=tmp_path) assert len(comp2.get_history()) == 1 def test_compare_no_history(self): comp = HistoryComparator() result = 
comp.compare(_sample_scores(8.0)) assert result["comparison"] == "no_history" def test_compare_with_previous(self, tmp_path: Path): comp = HistoryComparator(history_dir=tmp_path) comp.record("run-1", "topic A", _sample_scores(6.0)) result = comp.compare(_sample_scores(8.0), previous_run_id="run-1") assert result["comparison"] == "success" assert result["delta"] == 2.0 assert result["trend"] == "improved" def test_compare_stable_trend(self, tmp_path: Path): comp = HistoryComparator(history_dir=tmp_path) comp.record("run-1", "topic A", _sample_scores(7.5)) result = comp.compare(_sample_scores(7.5)) assert result["trend"] == "stable" def test_compare_declined_trend(self, tmp_path: Path): comp = HistoryComparator(history_dir=tmp_path) comp.record("run-1", "topic A", _sample_scores(9.0)) result = comp.compare(_sample_scores(7.0)) assert result["trend"] == "declined" def test_compare_not_found(self, tmp_path: Path): comp = HistoryComparator(history_dir=tmp_path) comp.record("run-1", "topic A", _sample_scores(7.0)) result = comp.compare(_sample_scores(8.0), previous_run_id="nonexistent") assert result["comparison"] == "not_found" def test_get_best_run(self, tmp_path: Path): comp = HistoryComparator(history_dir=tmp_path) comp.record("run-1", "topic A", _sample_scores(6.0)) comp.record("run-2", "topic B", _sample_scores(9.0)) comp.record("run-3", "topic C", _sample_scores(7.5)) best = comp.get_best_run() assert best is not None assert best["run_id"] == "run-2" def test_get_best_run_empty(self): comp = HistoryComparator() assert comp.get_best_run() is None def test_dimension_deltas(self, tmp_path: Path): comp = HistoryComparator(history_dir=tmp_path) scores_old = { "scores": {"novelty": 5.0, "clarity": 6.0}, "overall": 5.5, } scores_new = { "scores": {"novelty": 7.0, "clarity": 8.0}, "overall": 7.5, } comp.record("run-1", "topic A", scores_old) result = comp.compare(scores_new, previous_run_id="run-1") assert result["dimension_deltas"]["novelty"] == 2.0 assert 
result["dimension_deltas"]["clarity"] == 2.0 ================================================ FILE: tests/test_benchmark_agent.py ================================================ """Tests for the BenchmarkAgent multi-agent system.""" from __future__ import annotations import json from dataclasses import dataclass, field from pathlib import Path from typing import Any import pytest import yaml # --------------------------------------------------------------------------- # Fake LLM client (same pattern as test_code_agent.py) # --------------------------------------------------------------------------- @dataclass class FakeLLMResponse: content: str = "" model: str = "fake" prompt_tokens: int = 10 completion_tokens: int = 20 total_tokens: int = 30 finish_reason: str = "stop" truncated: bool = False raw: dict = field(default_factory=dict) class FakeLLM: """Fake LLM that returns preconfigured responses.""" def __init__(self, responses: list[str] | None = None) -> None: self._responses = list(responses or []) self._idx = 0 self.calls: list[dict[str, Any]] = [] def chat(self, messages, **kwargs) -> FakeLLMResponse: self.calls.append({"messages": messages, **kwargs}) if self._idx < len(self._responses): content = self._responses[self._idx] self._idx += 1 else: content = '{"benchmarks": [], "baselines": []}' return FakeLLMResponse(content=content) # --------------------------------------------------------------------------- # Knowledge base tests # --------------------------------------------------------------------------- class TestBenchmarkKnowledge: """Test the benchmark_knowledge.yaml file.""" def test_knowledge_file_exists(self) -> None: from researchclaw.agents.benchmark_agent.surveyor import _KNOWLEDGE_PATH assert _KNOWLEDGE_PATH.exists(), f"Knowledge file missing: {_KNOWLEDGE_PATH}" def test_knowledge_loads(self) -> None: from researchclaw.agents.benchmark_agent.surveyor import _KNOWLEDGE_PATH data = yaml.safe_load(_KNOWLEDGE_PATH.read_text(encoding="utf-8")) assert 
isinstance(data, dict) assert "domains" in data def test_knowledge_has_domains(self) -> None: from researchclaw.agents.benchmark_agent.surveyor import _KNOWLEDGE_PATH data = yaml.safe_load(_KNOWLEDGE_PATH.read_text(encoding="utf-8")) domains = data["domains"] assert len(domains) >= 10, f"Expected 10+ domains, got {len(domains)}" def test_each_domain_has_benchmarks_and_baselines(self) -> None: from researchclaw.agents.benchmark_agent.surveyor import _KNOWLEDGE_PATH data = yaml.safe_load(_KNOWLEDGE_PATH.read_text(encoding="utf-8")) for did, info in data["domains"].items(): assert "keywords" in info, f"Domain {did} missing keywords" assert "standard_benchmarks" in info, f"Domain {did} missing benchmarks" assert "common_baselines" in info, f"Domain {did} missing baselines" assert len(info["standard_benchmarks"]) > 0, f"Domain {did} has 0 benchmarks" assert len(info["common_baselines"]) > 0, f"Domain {did} has 0 baselines" def test_benchmark_entries_have_required_fields(self) -> None: from researchclaw.agents.benchmark_agent.surveyor import _KNOWLEDGE_PATH data = yaml.safe_load(_KNOWLEDGE_PATH.read_text(encoding="utf-8")) for did, info in data["domains"].items(): for b in info["standard_benchmarks"]: assert "name" in b, f"Benchmark in {did} missing name" assert "tier" in b, f"Benchmark {b.get('name')} in {did} missing tier" assert b["tier"] in (1, 2, 3), f"Invalid tier for {b.get('name')}" def test_baseline_entries_have_required_fields(self) -> None: from researchclaw.agents.benchmark_agent.surveyor import _KNOWLEDGE_PATH data = yaml.safe_load(_KNOWLEDGE_PATH.read_text(encoding="utf-8")) for did, info in data["domains"].items(): for bl in info["common_baselines"]: assert "name" in bl, f"Baseline in {did} missing name" assert "source" in bl, f"Baseline {bl.get('name')} in {did} missing source" assert "paper" in bl, f"Baseline {bl.get('name')} in {did} missing paper" # --------------------------------------------------------------------------- # Surveyor tests # 
--------------------------------------------------------------------------- class TestSurveyor: """Test SurveyorAgent domain matching and local search.""" def test_domain_matching_image_classification(self) -> None: from researchclaw.agents.benchmark_agent.surveyor import SurveyorAgent agent = SurveyorAgent(FakeLLM(), enable_hf_search=False) domains = agent._match_domains( "Image Classification with Contrastive Learning" ) assert "image_classification" in domains def test_domain_matching_rl(self) -> None: from researchclaw.agents.benchmark_agent.surveyor import SurveyorAgent agent = SurveyorAgent(FakeLLM(), enable_hf_search=False) domains = agent._match_domains( "Reinforcement Learning for Continuous Control" ) assert "reinforcement_learning" in domains def test_domain_matching_knowledge_distillation(self) -> None: from researchclaw.agents.benchmark_agent.surveyor import SurveyorAgent agent = SurveyorAgent(FakeLLM(), enable_hf_search=False) domains = agent._match_domains( "Knowledge Distillation with Feature Alignment" ) assert "knowledge_distillation" in domains def test_domain_matching_multiple(self) -> None: from researchclaw.agents.benchmark_agent.surveyor import SurveyorAgent agent = SurveyorAgent(FakeLLM(), enable_hf_search=False) domains = agent._match_domains( "Self-Supervised Contrastive Learning for Image Classification" ) assert len(domains) >= 2 def test_local_candidates_returns_benchmarks(self) -> None: from researchclaw.agents.benchmark_agent.surveyor import SurveyorAgent agent = SurveyorAgent(FakeLLM(), enable_hf_search=False) result = agent._get_local_candidates(["image_classification"]) assert len(result["benchmarks"]) > 0 assert len(result["baselines"]) > 0 def test_execute_returns_benchmarks(self) -> None: from researchclaw.agents.benchmark_agent.surveyor import SurveyorAgent agent = SurveyorAgent(FakeLLM(), enable_hf_search=False) result = agent.execute({ "topic": "Image Classification with Data Augmentation", "hypothesis": "Novel augmentation 
improves accuracy", }) assert result.success assert len(result.data["benchmarks"]) > 0 def test_execute_with_unknown_topic_uses_llm_fallback(self) -> None: from researchclaw.agents.benchmark_agent.surveyor import SurveyorAgent llm = FakeLLM([json.dumps({ "benchmarks": [{"name": "CustomDS", "tier": 2}], "baselines": [{"name": "CustomBL", "source": "custom", "paper": "X"}], "rationale": "test", })]) agent = SurveyorAgent(llm, enable_hf_search=False) result = agent.execute({ "topic": "Completely Novel Alien Technology Classification", "hypothesis": "", }) assert result.success assert result.data["llm_fallback_used"] def test_extract_search_keywords(self) -> None: from researchclaw.agents.benchmark_agent.surveyor import SurveyorAgent kws = SurveyorAgent._extract_search_keywords( "Novel Approach for Image Classification using Contrastive Learning" ) assert len(kws) >= 1 for kw in kws: assert "novel" not in kw.lower() assert "using" not in kw.lower() def test_execute_empty_topic_fails(self) -> None: from researchclaw.agents.benchmark_agent.surveyor import SurveyorAgent agent = SurveyorAgent(FakeLLM(), enable_hf_search=False) result = agent.execute({"topic": ""}) assert not result.success # --------------------------------------------------------------------------- # Selector tests # --------------------------------------------------------------------------- class TestSelector: """Test SelectorAgent filtering and ranking logic.""" @pytest.fixture() def benchmarks(self) -> list[dict]: return [ {"name": "CIFAR-10", "tier": 1, "size_mb": 170, "origin": "knowledge_base", "metrics": ["accuracy"]}, {"name": "CIFAR-100", "tier": 1, "size_mb": 170, "origin": "knowledge_base", "metrics": ["accuracy"]}, {"name": "Tiny-ImageNet", "tier": 2, "size_mb": 237, "origin": "knowledge_base", "metrics": ["top1_accuracy"]}, {"name": "ImageNet-1K", "tier": 3, "size_mb": 168000, "origin": "knowledge_base", "metrics": ["top1_accuracy"]}, {"name": "hf/custom-ds", "tier": 2, "size_mb": 500, 
"origin": "huggingface_hub", "downloads": 1000}, ] @pytest.fixture() def baselines(self) -> list[dict]: return [ {"name": "ResNet-18", "origin": "knowledge_base", "pip": [], "paper": "He et al."}, {"name": "ViT-B/16", "origin": "knowledge_base", "pip": ["timm"], "paper": "Dosovitskiy et al."}, ] def test_filter_excludes_tier3(self, benchmarks: list[dict]) -> None: from researchclaw.agents.benchmark_agent.selector import SelectorAgent agent = SelectorAgent(FakeLLM(), tier_limit=2) filtered = agent._filter_benchmarks(benchmarks) names = [b["name"] for b in filtered] assert "ImageNet-1K" not in names assert "CIFAR-10" in names def test_filter_network_none_only_tier1(self, benchmarks: list[dict]) -> None: from researchclaw.agents.benchmark_agent.selector import SelectorAgent agent = SelectorAgent(FakeLLM(), network_policy="none") filtered = agent._filter_benchmarks(benchmarks) for b in filtered: assert b["tier"] == 1 def test_ranking_prefers_tier1(self, benchmarks: list[dict]) -> None: from researchclaw.agents.benchmark_agent.selector import SelectorAgent agent = SelectorAgent(FakeLLM()) filtered = agent._filter_benchmarks(benchmarks) ranked = agent._rank_benchmarks(filtered) # Tier 1 should come first assert ranked[0]["tier"] == 1 def test_ranking_prefers_knowledge_base(self, benchmarks: list[dict]) -> None: from researchclaw.agents.benchmark_agent.selector import SelectorAgent agent = SelectorAgent(FakeLLM()) filtered = agent._filter_benchmarks(benchmarks) ranked = agent._rank_benchmarks(filtered) # Knowledge base entries should precede HF entries of same tier kb_indices = [i for i, b in enumerate(ranked) if b["origin"] == "knowledge_base"] hf_indices = [i for i, b in enumerate(ranked) if b["origin"] == "huggingface_hub"] if kb_indices and hf_indices: assert min(kb_indices) < min(hf_indices) def test_execute_selects_minimum(self, benchmarks: list[dict], baselines: list[dict]) -> None: from researchclaw.agents.benchmark_agent.selector import SelectorAgent llm = 
FakeLLM([json.dumps({ "primary_benchmark": "CIFAR-10", "secondary_benchmarks": ["CIFAR-100"], "selected_baselines": ["ResNet-18", "ViT-B/16"], "rationale": "Standard benchmarks", "experiment_notes": "", })]) agent = SelectorAgent(llm, min_benchmarks=1, min_baselines=2) result = agent.execute({ "topic": "Image Classification", "survey": {"benchmarks": benchmarks, "baselines": baselines}, }) assert result.success assert len(result.data["selected_benchmarks"]) >= 1 assert len(result.data["selected_baselines"]) >= 2 # --------------------------------------------------------------------------- # Acquirer tests # --------------------------------------------------------------------------- class TestAcquirer: """Test AcquirerAgent code generation.""" def test_generate_setup_script_tier1_only(self) -> None: from researchclaw.agents.benchmark_agent.acquirer import AcquirerAgent agent = AcquirerAgent(FakeLLM()) script = agent._generate_setup_script( [{"name": "CIFAR-10", "tier": 1, "api": "torchvision..."}], [] ) # Tier 1 datasets don't need setup scripts assert script == "" def test_generate_setup_script_tier2(self) -> None: from researchclaw.agents.benchmark_agent.acquirer import AcquirerAgent agent = AcquirerAgent(FakeLLM()) script = agent._generate_setup_script( [{"name": "IMDB", "tier": 2, "api": "datasets.load_dataset('imdb', cache_dir='/workspace/data/hf')"}], [], ) assert "download_datasets" in script assert "load_dataset" in script def test_generate_requirements_filters_builtin(self) -> None: from researchclaw.agents.benchmark_agent.acquirer import AcquirerAgent agent = AcquirerAgent(FakeLLM()) reqs = agent._generate_requirements(["torch", "numpy", "xgboost", "timm"]) assert "torch" not in reqs assert "numpy" not in reqs assert "timm" not in reqs assert "xgboost" in reqs def test_strip_fences(self) -> None: from researchclaw.agents.benchmark_agent.acquirer import AcquirerAgent code = "```python\nimport torch\n```" assert AcquirerAgent._strip_fences(code) == "import 
torch" def test_execute_generates_code(self) -> None: from researchclaw.agents.benchmark_agent.acquirer import AcquirerAgent llm = FakeLLM([ "import torchvision\ndef get_datasets(): pass", "import torch.nn as nn\ndef get_baselines(): pass", ]) agent = AcquirerAgent(llm) result = agent.execute({ "topic": "Image Classification", "selection": { "selected_benchmarks": [ {"name": "CIFAR-10", "tier": 1, "role": "primary", "api": "torchvision.datasets.CIFAR10(...)"}, ], "selected_baselines": [ {"name": "ResNet-18", "source": "torchvision.models.resnet18()", "paper": "He et al.", "pip": []}, ], "required_pip": [], }, }) assert result.success assert result.data["data_loader_code"] # --------------------------------------------------------------------------- # Validator tests # --------------------------------------------------------------------------- class TestValidator: """Test ValidatorAgent code validation.""" def test_syntax_check_valid(self) -> None: from researchclaw.agents.benchmark_agent.validator import ValidatorAgent agent = ValidatorAgent(FakeLLM()) errors = agent._check_syntax("import torch\nx = 1 + 2", "test") assert errors == [] def test_syntax_check_invalid(self) -> None: from researchclaw.agents.benchmark_agent.validator import ValidatorAgent agent = ValidatorAgent(FakeLLM()) errors = agent._check_syntax("def foo(\n x = ", "test") assert len(errors) > 0 assert "SyntaxError" in errors[0] def test_import_check_builtin_ok(self) -> None: from researchclaw.agents.benchmark_agent.validator import ValidatorAgent agent = ValidatorAgent(FakeLLM()) warnings = agent._check_imports("import torch\nimport numpy", "test", []) assert warnings == [] def test_import_check_unknown(self) -> None: from researchclaw.agents.benchmark_agent.validator import ValidatorAgent agent = ValidatorAgent(FakeLLM()) warnings = agent._check_imports("import some_obscure_lib", "test", []) assert len(warnings) > 0 def test_import_check_with_requirements(self) -> None: from 
researchclaw.agents.benchmark_agent.validator import ValidatorAgent agent = ValidatorAgent(FakeLLM()) warnings = agent._check_imports( "import xgboost", "test", ["xgboost"], ) assert warnings == [] def test_execute_passes_valid_code(self) -> None: from researchclaw.agents.benchmark_agent.validator import ValidatorAgent llm = FakeLLM([json.dumps({ "passed": True, "issues": [], "suggestions": [], "severity": "none", })]) agent = ValidatorAgent(llm) result = agent.execute({ "acquisition": { "data_loader_code": "import torch\ndef get_datasets(): pass", "baseline_code": "import torch.nn as nn\ndef get_baselines(): pass", "setup_code": "", "requirements": "", "benchmark_names": ["CIFAR-10"], "baseline_names": ["ResNet-18"], }, }) assert result.success assert result.data["passed"] def test_execute_fails_syntax_error(self) -> None: from researchclaw.agents.benchmark_agent.validator import ValidatorAgent agent = ValidatorAgent(FakeLLM()) result = agent.execute({ "acquisition": { "data_loader_code": "def foo(\n x = ", "baseline_code": "", "setup_code": "", "requirements": "", "benchmark_names": [], "baseline_names": [], }, }) assert not result.data["passed"] assert len(result.data["errors"]) > 0 # --------------------------------------------------------------------------- # Orchestrator tests # --------------------------------------------------------------------------- class TestOrchestrator: """Test BenchmarkOrchestrator end-to-end.""" def test_orchestrate_produces_plan(self, tmp_path: Path) -> None: from researchclaw.agents.benchmark_agent.orchestrator import ( BenchmarkAgentConfig, BenchmarkOrchestrator, ) responses = [ # Selector LLM response json.dumps({ "primary_benchmark": "CIFAR-10", "secondary_benchmarks": ["CIFAR-100"], "selected_baselines": ["ResNet-18", "ViT-B/16"], "rationale": "Standard CV benchmarks", "experiment_notes": "Use standard augmentation", }), # Acquirer: data_loader_code "import torchvision\ndef get_datasets(data_root='/workspace/data'):\n return 
{}", # Acquirer: baseline_code "import torch.nn as nn\ndef get_baselines(num_classes=10):\n return {}", # Validator: LLM review json.dumps({ "passed": True, "issues": [], "suggestions": ["Add transforms"], "severity": "none", }), ] cfg = BenchmarkAgentConfig(enable_hf_search=False) orchestrator = BenchmarkOrchestrator( FakeLLM(responses), config=cfg, stage_dir=tmp_path / "benchmark_agent", ) plan = orchestrator.orchestrate({ "topic": "Image Classification with Data Augmentation", "hypothesis": "Novel augmentation improves accuracy", }) assert len(plan.selected_benchmarks) >= 1 assert len(plan.selected_baselines) >= 1 assert plan.validation_passed assert plan.total_llm_calls > 0 assert plan.elapsed_sec > 0 def test_orchestrate_saves_artifacts(self, tmp_path: Path) -> None: from researchclaw.agents.benchmark_agent.orchestrator import ( BenchmarkAgentConfig, BenchmarkOrchestrator, ) responses = [ json.dumps({ "primary_benchmark": "CIFAR-10", "secondary_benchmarks": [], "selected_baselines": ["ResNet-18"], "rationale": "test", "experiment_notes": "", }), "def get_datasets(): pass", "def get_baselines(): pass", json.dumps({"passed": True, "issues": [], "suggestions": [], "severity": "none"}), ] stage_dir = tmp_path / "benchmark_agent" cfg = BenchmarkAgentConfig(enable_hf_search=False) orchestrator = BenchmarkOrchestrator( FakeLLM(responses), config=cfg, stage_dir=stage_dir, ) orchestrator.orchestrate({ "topic": "Image Classification", "hypothesis": "", }) assert (stage_dir / "survey_results.json").exists() assert (stage_dir / "selection_results.json").exists() assert (stage_dir / "benchmark_plan.json").exists() def test_plan_to_prompt_block(self) -> None: from researchclaw.agents.benchmark_agent.orchestrator import BenchmarkPlan plan = BenchmarkPlan( selected_benchmarks=[ {"name": "CIFAR-10", "role": "primary", "metrics": ["accuracy"], "api": "torchvision.datasets.CIFAR10(...)"}, ], selected_baselines=[ {"name": "ResNet-18", "source": "torchvision.models.resnet18()", 
"paper": "He et al."}, ], data_loader_code="def get_datasets(): pass", baseline_code="def get_baselines(): pass", ) block = plan.to_prompt_block() assert "CIFAR-10" in block assert "ResNet-18" in block assert "get_datasets" in block assert "get_baselines" in block def test_plan_to_dict_serializable(self) -> None: from researchclaw.agents.benchmark_agent.orchestrator import BenchmarkPlan plan = BenchmarkPlan( selected_benchmarks=[{"name": "test"}], data_loader_code="code", ) d = plan.to_dict() # Should be JSON-serializable json_str = json.dumps(d) assert "test" in json_str # --------------------------------------------------------------------------- # Config tests # --------------------------------------------------------------------------- class TestConfig: """Test BenchmarkAgentConfig in config.py.""" def test_default_config_has_benchmark_agent(self) -> None: from researchclaw.config import ExperimentConfig cfg = ExperimentConfig() assert hasattr(cfg, "benchmark_agent") assert cfg.benchmark_agent.enabled is True def test_parse_benchmark_agent_config(self) -> None: from researchclaw.config import _parse_benchmark_agent_config cfg = _parse_benchmark_agent_config({ "enabled": False, "tier_limit": 1, "min_baselines": 3, }) assert cfg.enabled is False assert cfg.tier_limit == 1 assert cfg.min_baselines == 3 def test_parse_benchmark_agent_config_empty(self) -> None: from researchclaw.config import _parse_benchmark_agent_config cfg = _parse_benchmark_agent_config({}) assert cfg.enabled is True assert cfg.tier_limit == 2 # --------------------------------------------------------------------------- # Base agent tests # --------------------------------------------------------------------------- class TestBaseAgent: """Test the base agent class.""" def test_parse_json_direct(self) -> None: from researchclaw.agents.base import BaseAgent result = BaseAgent._parse_json('{"key": "value"}') assert result == {"key": "value"} def test_parse_json_fenced(self) -> None: from 
researchclaw.agents.base import BaseAgent result = BaseAgent._parse_json('Some text\n```json\n{"key": 1}\n```\nMore text') assert result == {"key": 1} def test_parse_json_embedded(self) -> None: from researchclaw.agents.base import BaseAgent result = BaseAgent._parse_json('Here is the result: {"a": 2} end') assert result == {"a": 2} def test_parse_json_invalid(self) -> None: from researchclaw.agents.base import BaseAgent result = BaseAgent._parse_json("no json here at all") assert result is None # --------------------------------------------------------------------------- # Required baselines injection (Improvement E) # --------------------------------------------------------------------------- class TestRequiredBaselines: """Test that required baselines are injected from knowledge base.""" def test_inject_required_baselines_image_classification(self) -> None: from researchclaw.agents.benchmark_agent.selector import SelectorAgent llm = FakeLLM() agent = SelectorAgent(llm, min_baselines=1) selected: list[dict[str, Any]] = [ {"name": "EfficientNet-B0", "origin": "knowledge_base"}, ] injected = agent._inject_required_baselines( "image classification on CIFAR-10", selected, [], ) # Should inject ResNet-50 and ViT-B/16 (required for image_classification) injected_names = {b["name"] for b in injected} assert "ResNet-50" in injected_names assert "ViT-B/16" in injected_names # Already-present baselines should not be duplicated assert sum(1 for b in selected if b["name"] == "EfficientNet-B0") == 1 def test_inject_required_baselines_no_duplicates(self) -> None: from researchclaw.agents.benchmark_agent.selector import SelectorAgent llm = FakeLLM() agent = SelectorAgent(llm, min_baselines=1) selected: list[dict[str, Any]] = [ {"name": "ResNet-50", "origin": "knowledge_base"}, {"name": "ViT-B/16", "origin": "llm_suggestion"}, ] injected = agent._inject_required_baselines( "image classification on CIFAR-10", selected, [], ) # Both are already present → nothing should be injected 
assert len(injected) == 0 ================================================ FILE: tests/test_calendar.py ================================================ """Tests for researchclaw.calendar — Conference Deadline Calendar (Agent D4). 15+ tests covering deadlines, planner, and reminder modules. """ from __future__ import annotations from datetime import date, timedelta from pathlib import Path import pytest import yaml from researchclaw.calendar.deadlines import Conference, ConferenceCalendar from researchclaw.calendar.planner import SubmissionPlanner from researchclaw.calendar.reminder import Reminder, ReminderCalculator # --------------------------------------------------------------------------- # Fixtures # --------------------------------------------------------------------------- def _make_conference( name: str = "TestConf", full_name: str = "Test Conference", domains: tuple[str, ...] = ("ml",), tier: int = 1, abstract_deadline: date | None = None, paper_deadline: date | None = None, **kwargs, ) -> Conference: return Conference( name=name, full_name=full_name, domains=domains, tier=tier, abstract_deadline=abstract_deadline, paper_deadline=paper_deadline, **kwargs, ) def _future(days: int) -> date: return date.today() + timedelta(days=days) def _past(days: int) -> date: return date.today() - timedelta(days=days) # =================================================================== # Conference dataclass tests # =================================================================== class TestConference: def test_from_dict_minimal(self): data = {"name": "NeurIPS"} conf = Conference.from_dict(data) assert conf.name == "NeurIPS" assert conf.tier == 3 # default assert conf.domains == () def test_from_dict_full(self): data = { "name": "ICML", "full_name": "International Conference on Machine Learning", "domains": ["ml", "ai"], "tier": 1, "url": "https://icml.cc", "abstract_deadline": "2026-06-01", "paper_deadline": "2026-06-08", } conf = Conference.from_dict(data) assert 
conf.name == "ICML" assert conf.full_name == "International Conference on Machine Learning" assert conf.domains == ("ml", "ai") assert conf.tier == 1 assert conf.abstract_deadline == date(2026, 6, 1) assert conf.paper_deadline == date(2026, 6, 8) def test_from_dict_date_passthrough(self): """date objects in YAML are already date instances.""" data = { "name": "X", "abstract_deadline": date(2026, 12, 1), } conf = Conference.from_dict(data) assert conf.abstract_deadline == date(2026, 12, 1) def test_next_deadline_returns_earliest_future(self): conf = _make_conference( abstract_deadline=_future(10), paper_deadline=_future(20), ) assert conf.next_deadline == _future(10) def test_next_deadline_skips_past(self): conf = _make_conference( abstract_deadline=_past(5), paper_deadline=_future(15), ) assert conf.next_deadline == _future(15) def test_next_deadline_none_when_all_past(self): conf = _make_conference( abstract_deadline=_past(10), paper_deadline=_past(5), ) assert conf.next_deadline is None def test_days_until_deadline(self): conf = _make_conference(paper_deadline=_future(30)) assert conf.days_until_deadline == 30 def test_days_until_deadline_none(self): conf = _make_conference() assert conf.days_until_deadline is None # =================================================================== # ConferenceCalendar tests # =================================================================== class TestConferenceCalendar: def test_load_from_yaml(self, tmp_path: Path): data = { "conferences": [ { "name": "TestConf", "domains": ["ml"], "tier": 1, "paper_deadline": (_future(30)).isoformat(), }, { "name": "TestConf2", "domains": ["cv"], "tier": 2, "paper_deadline": (_future(60)).isoformat(), }, ] } yaml_path = tmp_path / "conferences.yaml" yaml_path.write_text(yaml.dump(data), encoding="utf-8") cal = ConferenceCalendar.load(yaml_path) assert len(cal.conferences) == 2 assert cal.conferences[0].name == "TestConf" def test_load_skips_invalid_entries(self, tmp_path: Path): data = { 
"conferences": [ {"name": "Valid", "tier": 1}, {"invalid": "no name field"}, ] } yaml_path = tmp_path / "conf.yaml" yaml_path.write_text(yaml.dump(data), encoding="utf-8") cal = ConferenceCalendar.load(yaml_path) assert len(cal.conferences) == 1 def test_get_upcoming_filters_by_days(self): confs = [ _make_conference(name="Soon", paper_deadline=_future(10)), _make_conference(name="Far", paper_deadline=_future(200)), ] cal = ConferenceCalendar(confs) upcoming = cal.get_upcoming(days=90) assert len(upcoming) == 1 assert upcoming[0].name == "Soon" def test_get_upcoming_filters_by_domain(self): confs = [ _make_conference(name="ML", domains=("ml",), paper_deadline=_future(10)), _make_conference(name="CV", domains=("cv",), paper_deadline=_future(10)), ] cal = ConferenceCalendar(confs) result = cal.get_upcoming(domains=["ml"], days=90) assert len(result) == 1 assert result[0].name == "ML" def test_get_upcoming_filters_by_tier(self): confs = [ _make_conference(name="T1", tier=1, paper_deadline=_future(10)), _make_conference(name="T3", tier=3, paper_deadline=_future(10)), ] cal = ConferenceCalendar(confs) result = cal.get_upcoming(tier=1, days=90) assert len(result) == 1 assert result[0].name == "T1" def test_get_by_name_case_insensitive(self): confs = [_make_conference(name="NeurIPS")] cal = ConferenceCalendar(confs) assert cal.get_by_name("neurips") is not None assert cal.get_by_name("NEURIPS") is not None assert cal.get_by_name("nonexistent") is None def test_get_by_domain(self): confs = [ _make_conference(name="A", domains=("ml", "ai")), _make_conference(name="B", domains=("cv",)), ] cal = ConferenceCalendar(confs) assert len(cal.get_by_domain("ml")) == 1 assert len(cal.get_by_domain("cv")) == 1 assert len(cal.get_by_domain("nlp")) == 0 def test_format_upcoming_no_deadlines(self): cal = ConferenceCalendar([]) output = cal.format_upcoming() assert "No upcoming deadlines" in output def test_format_upcoming_with_deadlines(self): confs = [_make_conference( name="ICML", 
paper_deadline=_future(15), url="https://icml.cc" )] cal = ConferenceCalendar(confs) output = cal.format_upcoming(days=90) assert "ICML" in output assert "15 days left" in output assert "https://icml.cc" in output def test_load_builtin(self): """Built-in conferences.yaml should load without error.""" cal = ConferenceCalendar.load_builtin() assert isinstance(cal.conferences, list) # =================================================================== # SubmissionPlanner tests # =================================================================== class TestSubmissionPlanner: def test_plan_basic(self): conf = _make_conference(name="TestConf", paper_deadline=_future(100)) cal = ConferenceCalendar([conf]) planner = SubmissionPlanner(cal) plan = planner.plan("TestConf", start_date=date.today()) assert plan["venue"] == "TestConf" assert plan["total_days"] == 100 assert len(plan["milestones"]) == 8 # 8 stages in STAGE_PROPORTIONS def test_plan_unknown_venue(self): cal = ConferenceCalendar([]) planner = SubmissionPlanner(cal) result = planner.plan("NonExistent") assert "error" in result def test_plan_past_deadline(self): conf = _make_conference(name="Past", paper_deadline=_past(5)) cal = ConferenceCalendar([conf]) planner = SubmissionPlanner(cal) result = planner.plan("Past", start_date=date.today()) assert "error" in result assert "passed" in result["error"] def test_format_plan(self): conf = _make_conference(name="ICML", paper_deadline=_future(60)) cal = ConferenceCalendar([conf]) planner = SubmissionPlanner(cal) output = planner.format_plan("ICML", start_date=date.today()) assert "Submission Plan for ICML" in output assert "Milestones:" in output def test_format_plan_error(self): cal = ConferenceCalendar([]) planner = SubmissionPlanner(cal) output = planner.format_plan("None") assert "Error:" in output # =================================================================== # ReminderCalculator tests # =================================================================== class 
TestReminderCalculator: def test_check_fires_on_matching_day(self): deadline = date.today() + timedelta(days=7) conf = _make_conference(name="Conf", paper_deadline=deadline) calc = ReminderCalculator(reminder_days=(7,)) reminders = calc.check([conf]) assert len(reminders) == 1 assert reminders[0].days_until == 7 def test_check_no_fire_on_non_matching_day(self): deadline = date.today() + timedelta(days=8) conf = _make_conference(name="Conf", paper_deadline=deadline) calc = ReminderCalculator(reminder_days=(7,)) reminders = calc.check([conf]) assert len(reminders) == 0 def test_check_skips_past_deadlines(self): conf = _make_conference(name="Conf", paper_deadline=_past(3)) calc = ReminderCalculator(reminder_days=(3,)) assert len(calc.check([conf])) == 0 def test_urgency_critical(self): assert ReminderCalculator._classify_urgency(1) == "critical" assert ReminderCalculator._classify_urgency(3) == "critical" def test_urgency_warning(self): assert ReminderCalculator._classify_urgency(7) == "warning" assert ReminderCalculator._classify_urgency(14) == "warning" def test_urgency_info(self): assert ReminderCalculator._classify_urgency(30) == "info" def test_get_active_reminders(self): confs = [ _make_conference(name="Soon", paper_deadline=_future(5)), _make_conference(name="Far", paper_deadline=_future(100)), ] calc = ReminderCalculator(reminder_days=(30, 14, 7, 3, 1)) active = calc.get_active_reminders(confs) assert len(active) == 1 assert active[0].conference_name == "Soon" def test_format_reminders_empty(self): calc = ReminderCalculator() assert "No upcoming" in calc.format_reminders([]) def test_format_reminders_with_data(self): r = Reminder( conference_name="ICML", deadline_type="paper", deadline_date=_future(3), days_until=3, urgency="critical", ) calc = ReminderCalculator() output = calc.format_reminders([r]) assert "ICML" in output assert "!!!" 
in output def test_reminder_frozen(self): r = Reminder("X", "paper", date.today(), 5, "info") with pytest.raises(AttributeError): r.days_until = 10 # type: ignore[misc] ================================================ FILE: tests/test_cli.py ================================================ """Tests for CLI setup helpers.""" from __future__ import annotations from unittest.mock import MagicMock, patch from researchclaw import cli def test_install_opencode_uses_which_resolved_npm_path(): mock_result = MagicMock() mock_result.returncode = 0 with patch( "researchclaw.cli.shutil.which", return_value=r"C:\Program Files\nodejs\npm.cmd", ), patch("researchclaw.cli.subprocess.run", return_value=mock_result) as run_mock: assert cli._install_opencode() is True run_mock.assert_called_once() assert run_mock.call_args.args[0][0] == r"C:\Program Files\nodejs\npm.cmd" def test_install_opencode_returns_false_when_npm_missing(): with patch("researchclaw.cli.shutil.which", return_value=None): assert cli._install_opencode() is False def test_is_opencode_installed_uses_which_resolved_path(): mock_result = MagicMock() mock_result.returncode = 0 with patch( "researchclaw.cli.shutil.which", return_value=r"C:\Users\tester\AppData\Roaming\npm\opencode.cmd", ), patch("researchclaw.cli.subprocess.run", return_value=mock_result) as run_mock: assert cli._is_opencode_installed() is True run_mock.assert_called_once() assert run_mock.call_args.args[0][0].endswith("opencode.cmd") ================================================ FILE: tests/test_code_agent.py ================================================ """Tests for the advanced multi-phase code generation agent (F-02).""" from __future__ import annotations import json from dataclasses import dataclass, field from pathlib import Path from typing import Any import pytest from researchclaw.llm.client import LLMResponse from researchclaw.pipeline.code_agent import ( CodeAgent, CodeAgentConfig, CodeAgentResult, SolutionNode, _SimpleResult, ) from 
researchclaw.prompts import PromptManager # --------------------------------------------------------------------------- # Test fixtures # --------------------------------------------------------------------------- class FakeLLM: """Fake LLM client that returns configurable responses.""" def __init__(self, responses: list[str] | None = None): self.calls: list[dict[str, Any]] = [] self._responses = list(responses or []) self._call_idx = 0 def chat(self, messages: list[dict], **kwargs: Any) -> LLMResponse: self.calls.append({"messages": messages, **kwargs}) if self._responses: text = self._responses[min(self._call_idx, len(self._responses) - 1)] else: text = '```filename:main.py\nprint("hello")\n```' self._call_idx += 1 return LLMResponse(content=text, model="fake-model") @dataclass class FakeSandboxResult: returncode: int = 0 stdout: str = "primary_metric: 0.95" stderr: str = "" elapsed_sec: float = 1.0 metrics: dict[str, object] = field(default_factory=dict) timed_out: bool = False class FakeSandbox: """Fake sandbox for testing.""" def __init__(self, results: list[FakeSandboxResult] | None = None): self.runs: list[Path] = [] self._results = list(results or [FakeSandboxResult()]) self._run_idx = 0 def run_project( self, project_dir: Path, *, entry_point: str = "main.py", timeout_sec: int = 300, ) -> FakeSandboxResult: self.runs.append(project_dir) result = self._results[min(self._run_idx, len(self._results) - 1)] self._run_idx += 1 return result @pytest.fixture() def stage_dir(tmp_path: Path) -> Path: d = tmp_path / "stage-10" d.mkdir() return d @pytest.fixture() def pm() -> PromptManager: return PromptManager() # --------------------------------------------------------------------------- # CodeAgentConfig tests # --------------------------------------------------------------------------- class TestCodeAgentConfig: def test_default_values(self) -> None: cfg = CodeAgentConfig() assert cfg.enabled is True assert cfg.architecture_planning is True assert 
cfg.exec_fix_max_iterations == 3 assert cfg.tree_search_enabled is False assert cfg.review_max_rounds == 2 def test_custom_values(self) -> None: cfg = CodeAgentConfig( enabled=False, exec_fix_max_iterations=5, tree_search_enabled=True, tree_search_candidates=5, ) assert cfg.enabled is False assert cfg.exec_fix_max_iterations == 5 assert cfg.tree_search_enabled is True assert cfg.tree_search_candidates == 5 # --------------------------------------------------------------------------- # Phase 1: Architecture Planning # --------------------------------------------------------------------------- class TestPhase1Architecture: def test_architecture_planning_produces_spec( self, stage_dir: Path, pm: PromptManager, ) -> None: arch_yaml = ( "```yaml\nfiles:\n - name: main.py\n purpose: entry point\n" " - name: models.py\n purpose: models\n```" ) code = '```filename:main.py\nprint("metric: 1.0")\n```' # reviewer approves immediately review = '{"verdict": "APPROVE", "score": 8, "critical_issues": []}' llm = FakeLLM(responses=[arch_yaml, code, review]) agent = CodeAgent( llm=llm, prompts=pm, config=CodeAgentConfig(architecture_planning=True), stage_dir=stage_dir, ) result = agent.generate( topic="test topic", exp_plan="objectives: test", metric="accuracy", pkg_hint="numpy, torch", ) assert result.architecture_spec assert "main.py" in result.architecture_spec assert result.files assert result.total_llm_calls >= 2 # arch + codegen + review def test_architecture_planning_disabled( self, stage_dir: Path, pm: PromptManager, ) -> None: code = '```filename:main.py\nprint("metric: 1.0")\n```' review = '{"verdict": "APPROVE", "score": 9, "critical_issues": []}' llm = FakeLLM(responses=[code, review]) agent = CodeAgent( llm=llm, prompts=pm, config=CodeAgentConfig(architecture_planning=False), stage_dir=stage_dir, ) result = agent.generate( topic="test", exp_plan="plan", metric="m", pkg_hint="", ) assert result.architecture_spec == "" assert result.files # First call should be 
code_generation, not the architecture planning prompt first_call_user = llm.calls[0]["messages"][0]["content"] # The architecture planning prompt has "Design the architecture" phrasing assert "design the architecture for an experiment" not in first_call_user.lower() # --------------------------------------------------------------------------- # Phase 2: Execution-in-the-Loop # --------------------------------------------------------------------------- class TestPhase2ExecFix: def test_exec_fix_loop_fixes_crashing_code( self, stage_dir: Path, pm: PromptManager, ) -> None: # Initial code crashes, then fix succeeds initial_code = '```filename:main.py\nraise RuntimeError("bug")\n```' fixed_code = '```filename:main.py\nprint("metric: 1.0")\n```' review = '{"verdict": "APPROVE", "score": 8, "critical_issues": []}' llm = FakeLLM(responses=[ initial_code, # phase 2: initial generation (no arch) fixed_code, # phase 2: exec-fix iteration review, # phase 4: review ]) sandbox_results = [ FakeSandboxResult(returncode=1, stderr="RuntimeError: bug"), FakeSandboxResult(returncode=0, stdout="metric: 1.0"), ] fake_sandbox = FakeSandbox(results=sandbox_results) agent = CodeAgent( llm=llm, prompts=pm, config=CodeAgentConfig( architecture_planning=False, exec_fix_max_iterations=3, ), stage_dir=stage_dir, sandbox_factory=lambda cfg, wd: fake_sandbox, experiment_config=None, ) result = agent.generate( topic="test", exp_plan="plan", metric="metric", pkg_hint="", ) assert result.files assert result.total_sandbox_runs >= 1 def test_exec_fix_skipped_without_sandbox( self, stage_dir: Path, pm: PromptManager, ) -> None: code = '```filename:main.py\nprint("m: 1")\n```' review = '{"verdict": "APPROVE", "score": 9, "critical_issues": []}' llm = FakeLLM(responses=[code, review]) agent = CodeAgent( llm=llm, prompts=pm, config=CodeAgentConfig(architecture_planning=False), stage_dir=stage_dir, sandbox_factory=None, ) result = agent.generate( topic="t", exp_plan="p", metric="m", pkg_hint="", ) assert 
result.total_sandbox_runs == 0  # NOTE(review): tail of an assertion cut at the chunk boundary
        assert result.files

    def test_exec_fix_max_iterations_respected(
        self,
        stage_dir: Path,
        pm: PromptManager,
    ) -> None:
        # Code that always raises; the exec-fix loop must stop at the cap.
        code = '```filename:main.py\nraise RuntimeError("persistent")\n```'
        review = '{"verdict": "APPROVE", "score": 5, "critical_issues": []}'
        # Enough responses queued for repeated fix attempts plus a final review.
        llm = FakeLLM(responses=[code, code, code, code, review])
        always_crash = FakeSandbox(
            results=[FakeSandboxResult(returncode=1, stderr="RuntimeError")]
        )
        agent = CodeAgent(
            llm=llm,
            prompts=pm,
            config=CodeAgentConfig(
                architecture_planning=False,
                exec_fix_max_iterations=2,
            ),
            stage_dir=stage_dir,
            sandbox_factory=lambda cfg, wd: always_crash,
            experiment_config=None,
        )
        result = agent.generate(
            topic="t",
            exp_plan="p",
            metric="m",
            pkg_hint="",
        )
        # Should have exactly 2 sandbox runs (max iterations)
        assert result.total_sandbox_runs == 2


# ---------------------------------------------------------------------------
# Phase 3: Solution Tree Search
# ---------------------------------------------------------------------------


class TestPhase3TreeSearch:
    """Tree search: generate several candidate solutions and explore them."""

    def test_tree_search_generates_multiple_candidates(
        self,
        stage_dir: Path,
        pm: PromptManager,
    ) -> None:
        code_a = '```filename:main.py\nprint("metric: 0.5")\n```'
        code_b = '```filename:main.py\nprint("metric: 0.9")\n```'
        review = '{"verdict": "APPROVE", "score": 9, "critical_issues": []}'
        llm = FakeLLM(responses=[code_a, code_b, review])
        # One sandbox result per candidate, with distinct metric values.
        sandbox = FakeSandbox(results=[
            FakeSandboxResult(returncode=0, stdout="metric: 0.5", metrics={"metric": 0.5}),
            FakeSandboxResult(returncode=0, stdout="metric: 0.9", metrics={"metric": 0.9}),
        ])
        agent = CodeAgent(
            llm=llm,
            prompts=pm,
            config=CodeAgentConfig(
                architecture_planning=False,
                tree_search_enabled=True,
                tree_search_candidates=2,
                tree_search_max_depth=1,
            ),
            stage_dir=stage_dir,
            sandbox_factory=lambda cfg, wd: sandbox,
            experiment_config=None,
        )
        result = agent.generate(
            topic="t",
            exp_plan="p",
            metric="metric",
            pkg_hint="",
        )
        assert result.tree_nodes_explored >= 2
        assert result.files

    def test_tree_search_fixes_crashing_candidates(
        self,
        stage_dir: Path,
        pm: PromptManager,
    ) -> None:
        crash_code = '```filename:main.py\nraise ValueError("x")\n```'
        fixed_code = '```filename:main.py\nprint("metric: 1.0")\n```'
        review = '{"verdict": "APPROVE", "score": 8, "critical_issues": []}'
        llm = FakeLLM(responses=[
            crash_code,  # candidate 0
            crash_code,  # candidate 1
            fixed_code,  # fix for candidate 0
            fixed_code,  # fix for candidate 1
            review,  # review
        ])
        # Both candidates crash first, then both fixed versions succeed.
        results_seq = [
            FakeSandboxResult(returncode=1, stderr="ValueError: x"),
            FakeSandboxResult(returncode=1, stderr="ValueError: x"),
            FakeSandboxResult(returncode=0, stdout="metric: 1.0"),
            FakeSandboxResult(returncode=0, stdout="metric: 1.0"),
        ]
        sandbox = FakeSandbox(results=results_seq)
        agent = CodeAgent(
            llm=llm,
            prompts=pm,
            config=CodeAgentConfig(
                architecture_planning=False,
                tree_search_enabled=True,
                tree_search_candidates=2,
                tree_search_max_depth=2,
            ),
            stage_dir=stage_dir,
            sandbox_factory=lambda cfg, wd: sandbox,
            experiment_config=None,
        )
        result = agent.generate(
            topic="t",
            exp_plan="p",
            metric="metric",
            pkg_hint="",
        )
        assert result.tree_nodes_explored >= 2


# ---------------------------------------------------------------------------
# Phase 4: Multi-Agent Review
# ---------------------------------------------------------------------------


class TestPhase4Review:
    """Review loop: APPROVE ends the loop, REVISE triggers a fix round."""

    def test_review_approves_on_first_round(
        self,
        stage_dir: Path,
        pm: PromptManager,
    ) -> None:
        code = '```filename:main.py\nprint("m: 1")\n```'
        review = '{"verdict": "APPROVE", "score": 9, "critical_issues": []}'
        llm = FakeLLM(responses=[code, review])
        agent = CodeAgent(
            llm=llm,
            prompts=pm,
            config=CodeAgentConfig(
                architecture_planning=False,
                review_max_rounds=2,
            ),
            stage_dir=stage_dir,
        )
        result = agent.generate(
            topic="t",
            exp_plan="p",
            metric="m",
            pkg_hint="",
        )
        assert result.review_rounds == 1

    def test_review_triggers_fix_on_critical_issues(
        self,
        stage_dir: Path,
        pm: PromptManager,
    ) -> None:
        code = '```filename:main.py\nprint("m: 1")\n```'
        # First review asks for a revision with two critical issues.
        review1 = json.dumps({
            "verdict": "REVISE",
            "score": 3,
            "critical_issues": ["Missing seed handling", "Wrong metric name"],
            "suggestions": [],
        })
        fixed = '```filename:main.py\nimport random\nrandom.seed(42)\nprint("m: 1")\n```'
        review2 = '{"verdict": "APPROVE", "score": 8, "critical_issues": []}'
        llm = FakeLLM(responses=[code, review1, fixed, review2])
        agent = CodeAgent(
            llm=llm,
            prompts=pm,
            config=CodeAgentConfig(
                architecture_planning=False,
                review_max_rounds=3,
                hard_validation=False,  # Test focuses on review, not validation
            ),
            stage_dir=stage_dir,
        )
        result = agent.generate(
            topic="t",
            exp_plan="p",
            metric="m",
            pkg_hint="",
        )
        assert result.review_rounds == 2
        assert result.total_llm_calls == 4  # codegen + review1 + fix + review2

    def test_review_disabled(
        self,
        stage_dir: Path,
        pm: PromptManager,
    ) -> None:
        code = '```filename:main.py\nprint("m: 1")\n```'
        llm = FakeLLM(responses=[code])
        agent = CodeAgent(
            llm=llm,
            prompts=pm,
            config=CodeAgentConfig(
                architecture_planning=False,
                review_max_rounds=0,
                hard_validation=False,  # Test focuses on review, not validation
            ),
            stage_dir=stage_dir,
        )
        result = agent.generate(
            topic="t",
            exp_plan="p",
            metric="m",
            pkg_hint="",
        )
        assert result.review_rounds == 0
        assert result.total_llm_calls == 1  # only codegen


# ---------------------------------------------------------------------------
# Full pipeline tests
# ---------------------------------------------------------------------------


class TestFullPipeline:
    """End-to-end runs exercising architecture planning, sandbox, and review."""

    def test_all_phases_end_to_end(
        self,
        stage_dir: Path,
        pm: PromptManager,
    ) -> None:
        arch = "```yaml\nfiles:\n - name: main.py\n```"
        code = '```filename:main.py\nprint("acc: 0.9")\n```'
        review = '{"verdict": "APPROVE", "score": 9, "critical_issues": []}'
        llm = FakeLLM(responses=[arch, code, review])
        sandbox = FakeSandbox(results=[
            FakeSandboxResult(returncode=0, stdout="acc: 0.9"),
        ])
        agent = CodeAgent(
            llm=llm,
            prompts=pm,
            config=CodeAgentConfig(
                architecture_planning=True,
                exec_fix_max_iterations=2,
                review_max_rounds=1,
            ),
            stage_dir=stage_dir,
            sandbox_factory=lambda cfg, wd: sandbox,
            experiment_config=None,
        )
        result = agent.generate(
            topic="image classification",
            exp_plan="test plan",
            metric="accuracy",
            pkg_hint="torch",
        )
        assert result.architecture_spec
        assert "main.py" in result.files
        assert result.total_llm_calls >= 3  # arch + code + review
        assert result.total_sandbox_runs >= 1
        assert result.review_rounds == 1
        assert result.validation_log

    def test_agent_writes_attempt_directories(
        self,
        stage_dir: Path,
        pm: PromptManager,
    ) -> None:
        code = '```filename:main.py\nprint("x: 1")\n```'
        review = '{"verdict": "APPROVE", "score": 9, "critical_issues": []}'
        llm = FakeLLM(responses=[code, review])
        sandbox = FakeSandbox()
        agent = CodeAgent(
            llm=llm,
            prompts=pm,
            config=CodeAgentConfig(architecture_planning=False),
            stage_dir=stage_dir,
            sandbox_factory=lambda cfg, wd: sandbox,
            experiment_config=None,
        )
        # NOTE(review): `result` is assigned but never used; the test only
        # inspects the side effect of writing the attempt directory.
        result = agent.generate(
            topic="t",
            exp_plan="p",
            metric="x",
            pkg_hint="",
        )
        attempt_dir = stage_dir / "agent_runs" / "attempt_001"
        assert attempt_dir.exists()
        assert (attempt_dir / "main.py").exists()


# ---------------------------------------------------------------------------
# SolutionNode and scoring
# ---------------------------------------------------------------------------


class TestSolutionNodeScoring:
    """Unit tests for CodeAgent._score_node over SolutionNode states."""

    def test_score_running_node(self) -> None:
        node = SolutionNode(
            node_id="test",
            files={"main.py": "x"},
            runs_ok=True,
            stdout="lots of output " * 20,
            metrics={"metric": 0.95},
        )
        score = CodeAgent._score_node(node, "metric")
        assert score >= 2.0  # runs_ok(1.0) + output(0.3) + metrics(0.5) + key(0.5)

    def test_score_crashing_node(self) -> None:
        node = SolutionNode(
            node_id="test",
            files={"main.py": "x"},
            runs_ok=False,
            stderr="Error: something broke",
        )
        score = CodeAgent._score_node(node, "metric")
        assert score == 0.0  # no runs_ok, error penalty, max(0)

    def test_score_partial_output(self) -> None:
        node = SolutionNode(
            node_id="test",
            files={"main.py": "x"},
            runs_ok=True,
            stdout="short",
            metrics={},
        )
        score = CodeAgent._score_node(node, "metric")
        assert score == 1.0  # only runs_ok


# ---------------------------------------------------------------------------
# Helper methods
# ---------------------------------------------------------------------------


class TestHelpers:
    """Static helpers: file formatting, tolerant JSON parsing, result defaults."""

    def test_format_files(self) -> None:
        files = {"main.py": "print(1)", "utils.py": "x = 2"}
        formatted = CodeAgent._format_files(files)
        assert "```filename:main.py" in formatted
        assert "```filename:utils.py" in formatted
        assert "print(1)" in formatted

    def test_parse_json_direct(self) -> None:
        result = CodeAgent._parse_json('{"score": 5}')
        assert result == {"score": 5}

    def test_parse_json_fenced(self) -> None:
        # JSON inside a ```json fenced block should still parse.
        text = 'Some text\n```json\n{"verdict": "APPROVE"}\n```\nmore text'
        result = CodeAgent._parse_json(text)
        assert result == {"verdict": "APPROVE"}

    def test_parse_json_embedded(self) -> None:
        # JSON embedded mid-sentence should still be extracted.
        text = 'The review is: {"score": 7, "verdict": "REVISE"} end'
        result = CodeAgent._parse_json(text)
        assert result is not None
        assert result["score"] == 7

    def test_parse_json_invalid(self) -> None:
        result = CodeAgent._parse_json("not json at all")
        assert result is None

    def test_simple_result_defaults(self) -> None:
        r = _SimpleResult()
        assert r.returncode == 1
        assert r.stdout == ""
        assert r.timed_out is False


# ---------------------------------------------------------------------------
# Config integration test
# ---------------------------------------------------------------------------


class TestConfigIntegration:
    """CodeAgentConfig wiring inside ExperimentConfig / RCConfig."""

    def test_code_agent_config_in_experiment_config(self) -> None:
        from researchclaw.config import CodeAgentConfig, ExperimentConfig

        exp = ExperimentConfig()
        assert hasattr(exp, "code_agent")
        assert isinstance(exp.code_agent, CodeAgentConfig)
        assert exp.code_agent.enabled is True

    def test_code_agent_config_from_dict(self, tmp_path: Path) -> None:
        from researchclaw.config import RCConfig

        # Minimal full-config dict; only experiment.code_agent is under test.
        data = {
            "project": {"name": "test", "mode": "docs-first"},
            "research": {
                "topic": "test",
                "domains": ["ml"],
                "daily_paper_count": 1,
                "quality_threshold": 7.0,
            },
            "runtime": {"timezone": "UTC"},
            "notifications": {
                "channel": "local",
                "on_stage_start": True,
                "on_stage_fail": False,
                "on_gate_required": True,
            },
            "knowledge_base": {
                "backend": "markdown",
                "root": str(tmp_path / "kb"),
            },
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {
                "provider": "openai-compatible",
                "base_url": "http://localhost:1234/v1",
                "api_key_env": "TEST",
                "api_key": "test-key",
                "primary_model": "test",
                "fallback_models": [],
            },
            "experiment": {
                "mode": "sandbox",
                "code_agent": {
                    "enabled": False,
                    "tree_search_enabled": True,
                    "tree_search_candidates": 5,
                },
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        assert cfg.experiment.code_agent.enabled is False
        assert cfg.experiment.code_agent.tree_search_enabled is True
        assert cfg.experiment.code_agent.tree_search_candidates == 5


# ---------------------------------------------------------------------------
# Prompts integration test
# ---------------------------------------------------------------------------


class TestPromptsIntegration:
    """The prompt templates used by the code agent exist and interpolate."""

    def test_architecture_planning_prompt_exists(self, pm: PromptManager) -> None:
        sp = pm.sub_prompt(
            "architecture_planning",
            topic="image classification",
            exp_plan="test plan",
            metric="accuracy",
        )
        assert "architect" in sp.system.lower()
        assert "accuracy" in sp.user
        assert "image classification" in sp.user

    def test_code_exec_fix_prompt_exists(self, pm: PromptManager) -> None:
        sp = pm.sub_prompt(
            "code_exec_fix",
            stderr="ImportError: no module named foo",
            stdout_tail="loading data...",
            returncode="1",
            files_context="```filename:main.py\nimport foo\n```",
        )
        assert "debug" in sp.system.lower() or "fix" in sp.system.lower()
        assert "ImportError" in sp.user

    def test_code_reviewer_prompt_exists(self, pm: PromptManager) -> None:
        sp = pm.sub_prompt(
            "code_reviewer",
            topic="RL",
            exp_plan="test plan",
            metric="reward",
            files_context="```filename:main.py\nprint('hi')\n```",
        )
assert "review" in sp.system.lower()
        assert "reward" in sp.user
        assert "APPROVE" in sp.user or "REVISE" in sp.user


================================================
FILE: tests/test_code_searcher.py
================================================
"""Tests for the Code Searcher agent."""

from __future__ import annotations

import json
import time

import pytest
from pathlib import Path
from unittest.mock import MagicMock, patch

from researchclaw.agents.code_searcher.agent import CodeSearchAgent, CodeSearchResult
from researchclaw.agents.code_searcher.cache import SearchCache
from researchclaw.agents.code_searcher.github_client import (
    CodeSnippet,
    GitHubClient,
    RepoAnalysis,
    RepoInfo,
)
from researchclaw.agents.code_searcher.pattern_extractor import (
    CodePatterns,
    extract_patterns,
    _heuristic_extract,
)
from researchclaw.agents.code_searcher.query_gen import (
    generate_search_queries,
    _heuristic_generate,
    _extract_key_phrases,
)
from researchclaw.domains.detector import DomainProfile, get_profile


# ---------------------------------------------------------------------------
# Query Generation tests
# ---------------------------------------------------------------------------


class TestQueryGeneration:
    """LLM-free (heuristic) query generation for GitHub search."""

    def test_heuristic_generates_queries(self):
        queries = _heuristic_generate(
            topic="finite element method for Poisson equation",
            domain_name="PDE Solvers",
            libraries=["numpy", "scipy", "fenics"],
            needs=["FEM assembly", "mesh generation"],
        )
        assert len(queries) >= 3
        assert len(queries) <= 5
        # Should include library names
        any_lib = any("numpy" in q or "scipy" in q or "fenics" in q for q in queries)
        assert any_lib

    def test_heuristic_no_duplicates(self):
        queries = _heuristic_generate(
            topic="simple test",
            domain_name="Test",
            libraries=["numpy"],
            needs=[],
        )
        # No exact duplicates
        assert len(queries) == len(set(q.lower().strip() for q in queries))

    def test_extract_key_phrases(self):
        result = _extract_key_phrases("A Novel Approach for Image Classification Using Deep Learning")
        # Should remove filler words
        assert "novel" not in result.lower()
        assert "using" not in result.lower()

    def test_generate_without_llm(self):
        # llm=None must fall back to the heuristic path.
        queries = generate_search_queries(
            topic="molecular dynamics simulation",
            domain_name="Computational Physics",
            core_libraries=["jax", "numpy"],
            llm=None,
        )
        assert isinstance(queries, list)
        assert len(queries) >= 2


# ---------------------------------------------------------------------------
# Pattern Extractor tests
# ---------------------------------------------------------------------------


class TestPatternExtractor:
    """Heuristic extraction of API/file-structure patterns from snippets."""

    def test_heuristic_extract_imports(self):
        snippets = [
            "import numpy as np\nimport scipy.sparse as sp\n\ndef solve():\n pass",
            "from pyscf import gto, scf\nmol = gto.M(atom='H 0 0 0')",
        ]
        patterns = _heuristic_extract(snippets)
        assert len(patterns.api_patterns) > 0
        assert any("numpy" in p for p in patterns.api_patterns)

    def test_heuristic_extract_functions(self):
        snippets = [
            "class Solver:\n pass\ndef solve_pde():\n pass\ndef analyze():\n pass",
        ]
        patterns = _heuristic_extract(snippets)
        assert len(patterns.file_structure) > 0

    def test_empty_snippets(self):
        patterns = extract_patterns([], topic="test", domain_name="test")
        assert not patterns.has_content

    def test_code_patterns_to_prompt(self):
        patterns = CodePatterns(
            api_patterns=["import numpy as np\nresult = np.linalg.solve(A, b)"],
            file_structure={"solver.py": "Main solver implementation"},
            evaluation_patterns=["error = np.linalg.norm(x - x_exact)"],
        )
        ctx = patterns.to_prompt_context()
        assert "numpy" in ctx
        assert "solver.py" in ctx
        assert "error" in ctx

    def test_code_patterns_has_content(self):
        empty = CodePatterns()
        assert not empty.has_content
        with_data = CodePatterns(api_patterns=["import x"])
        assert with_data.has_content


# ---------------------------------------------------------------------------
# Search Cache tests
# ---------------------------------------------------------------------------


class TestSearchCache:
    """On-disk, TTL-based cache of search results keyed by domain + topic."""

    def test_put_and_get(self, tmp_path):
        cache = SearchCache(cache_dir=tmp_path, ttl_days=30)
        data = {"api_patterns": ["import numpy"], "repos": []}
        cache.put("ml_vision", "image classification", data)
        result = cache.get("ml_vision", "image classification")
        assert result is not None
        assert result["api_patterns"] == ["import numpy"]

    def test_cache_miss(self, tmp_path):
        cache = SearchCache(cache_dir=tmp_path)
        result = cache.get("unknown", "unknown topic")
        assert result is None

    def test_cache_expiry(self, tmp_path):
        cache = SearchCache(cache_dir=tmp_path, ttl_days=0)  # immediate expiry
        data = {"test": True}
        cache.put("test", "topic", data)
        # Manually set old timestamp
        cache_path = tmp_path / "test"
        for f in cache_path.glob("*.json"):
            content = json.loads(f.read_text())
            content["_cached_at"] = time.time() - 86400  # 1 day ago
            f.write_text(json.dumps(content))
        result = cache.get("test", "topic")
        assert result is None  # expired

    def test_clear_domain(self, tmp_path):
        cache = SearchCache(cache_dir=tmp_path)
        cache.put("ml_vision", "topic1", {"data": 1})
        cache.put("ml_vision", "topic2", {"data": 2})
        cache.put("physics", "topic3", {"data": 3})
        count = cache.clear("ml_vision")
        assert count == 2
        assert cache.get("ml_vision", "topic1") is None
        assert cache.get("physics", "topic3") is not None

    def test_clear_all(self, tmp_path):
        cache = SearchCache(cache_dir=tmp_path)
        cache.put("a", "t1", {"x": 1})
        cache.put("b", "t2", {"x": 2})
        count = cache.clear()
        assert count == 2

    def test_stats(self, tmp_path):
        cache = SearchCache(cache_dir=tmp_path)
        cache.put("ml_vision", "t1", {"x": 1})
        cache.put("ml_vision", "t2", {"x": 2})
        cache.put("physics", "t3", {"x": 3})
        stats = cache.stats()
        assert stats["total"] == 3
        assert stats.get("ml_vision", 0) == 2

    def test_topic_hash_deterministic(self):
        h1 = SearchCache._topic_hash("test topic")
        h2 = SearchCache._topic_hash("test topic")
        assert h1 == h2

    def test_topic_hash_case_insensitive(self):
        h1 = SearchCache._topic_hash("Test Topic")
        h2 = SearchCache._topic_hash("test topic")
        assert h1 == h2


# ---------------------------------------------------------------------------
# GitHubClient tests (mocked)
# ---------------------------------------------------------------------------


class TestGitHubClient:
    """Token handling and request-header construction (no network)."""

    def test_has_token_false(self):
        with patch.dict("os.environ", {}, clear=True):
            client = GitHubClient(token="")
            # Can't easily clear env, but token="" means no token
            assert not client.has_token

    def test_has_token_true(self):
        client = GitHubClient(token="ghp_test123")
        assert client.has_token

    def test_headers_with_token(self):
        client = GitHubClient(token="ghp_test123")
        headers = client._headers()
        assert "Authorization" in headers
        assert "Bearer" in headers["Authorization"]

    def test_headers_without_token(self):
        client = GitHubClient(token="")
        headers = client._headers()
        assert "Authorization" not in headers


# ---------------------------------------------------------------------------
# RepoInfo / CodeSnippet data class tests
# ---------------------------------------------------------------------------


class TestDataClasses:
    """Default values of the github_client data classes."""

    def test_repo_info_defaults(self):
        repo = RepoInfo(full_name="owner/repo")
        assert repo.stars == 0
        assert repo.default_branch == "main"

    def test_code_snippet(self):
        snippet = CodeSnippet(
            repo_full_name="owner/repo",
            file_path="src/main.py",
        )
        assert snippet.content == ""

    def test_repo_analysis(self):
        analysis = RepoAnalysis(
            repo=RepoInfo(full_name="test/repo"),
            readme="# Test Repo",
            requirements=["numpy", "scipy"],
        )
        assert len(analysis.requirements) == 2


# ---------------------------------------------------------------------------
# CodeSearchResult tests
# ---------------------------------------------------------------------------


class TestCodeSearchResult:
    """Prompt-context rendering and cache-dict round-tripping."""

    def test_empty_result(self):
        result = CodeSearchResult()
        assert result.to_prompt_context() == ""
        assert not result.from_cache

    def test_result_with_patterns(self):
        result = CodeSearchResult(
            patterns=CodePatterns(
                api_patterns=["import numpy as np"],
                file_structure={"main.py": "Entry point"},
            ),
        )
        ctx = result.to_prompt_context()
        assert "numpy" in ctx

    def test_cache_roundtrip(self):
        result = CodeSearchResult(
            patterns=CodePatterns(
                api_patterns=["import numpy"],
                file_structure={"main.py": "Entry"},
                evaluation_patterns=["error = norm(diff)"],
            ),
            repos_found=[
                RepoInfo(full_name="test/repo", stars=100, html_url="https://example.com"),
            ],
            queries_used=["test query"],
        )
        cache_dict = result.to_cache_dict()
        restored = CodeSearchResult.from_cache_dict(cache_dict)
        assert restored.from_cache
        assert restored.patterns.api_patterns == ["import numpy"]
        assert len(restored.repos_found) == 1
        assert restored.queries_used == ["test query"]


# ---------------------------------------------------------------------------
# CodeSearchAgent tests (mocked GitHub)
# ---------------------------------------------------------------------------


class TestCodeSearchAgent:
    """Agent-level behavior with a MagicMock GitHub client (no network)."""

    def _mock_github(self):
        """Create a mock GitHub client."""
        mock = MagicMock(spec=GitHubClient)
        mock.search_repos.return_value = [
            RepoInfo(
                full_name="user/physics-sim",
                description="Physics simulation framework",
                stars=500,
                html_url="https://github.com/user/physics-sim",
            ),
        ]
        mock.search_code.return_value = [
            CodeSnippet(
                repo_full_name="user/physics-sim",
                file_path="main.py",
                score=10.0,
            ),
        ]
        mock.get_readme.return_value = "# Physics Simulation\nA framework for physics sims."
        mock.get_repo_tree.return_value = ["main.py", "solver.py", "requirements.txt"]
        mock.get_file_content.return_value = "import numpy as np\ndef solve(): pass"
        mock.request_count = 5
        return mock

    def test_search_uses_cache(self, tmp_path):
        # Pre-populate the cache; the agent must return it without GitHub calls.
        cache = SearchCache(cache_dir=tmp_path)
        cache.put("physics_simulation", "N-body sim", {
            "api_patterns": ["cached pattern"],
            "file_structure": {},
            "evaluation_patterns": [],
            "library_versions": {},
            "repos": [],
            "queries": ["cached query"],
        })
        agent = CodeSearchAgent(cache=cache)
        profile = DomainProfile(
            domain_id="physics_simulation",
            display_name="Physics",
            core_libraries=["numpy"],
        )
        result = agent.search("N-body sim", profile)
        assert result.from_cache
        assert result.patterns.api_patterns == ["cached pattern"]

    def test_search_with_mock_github(self, tmp_path):
        mock_github = self._mock_github()
        cache = SearchCache(cache_dir=tmp_path)
        agent = CodeSearchAgent(cache=cache)
        agent._github = mock_github
        profile = DomainProfile(
            domain_id="physics_simulation",
            display_name="Computational Physics",
            core_libraries=["numpy", "scipy"],
            github_search_terms=["physics simulation python"],
        )
        result = agent.search("molecular dynamics simulation", profile)
        assert not result.from_cache
        assert len(result.queries_used) >= 2
        mock_github.search_repos.assert_called_once()

    def test_search_graceful_failure(self, tmp_path):
        """If GitHub fails, should still return empty result without crashing."""
        mock_github = MagicMock(spec=GitHubClient)
        mock_github.search_repos.side_effect = Exception("Network error")
        mock_github.search_code.side_effect = Exception("Network error")
        mock_github.request_count = 0
        cache = SearchCache(cache_dir=tmp_path)
        agent = CodeSearchAgent(cache=cache)
        agent._github = mock_github
        profile = DomainProfile(
            domain_id="test",
            display_name="Test",
            core_libraries=["numpy"],
        )
        result = agent.search("test topic", profile)
        # Should not crash
        assert isinstance(result, CodeSearchResult)


================================================ FILE: 
tests/test_collaboration.py
================================================
"""Tests for the collaboration system (15+ tests).

Covers:
- ResearchRepository (publish, search, list)
- ArtifactPublisher (extraction from run dirs)
- ArtifactSubscriber (queries)
- Deduplication (content_hash, deduplicate_artifacts)
"""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from researchclaw.collaboration.repository import ResearchRepository
from researchclaw.collaboration.publisher import ArtifactPublisher
from researchclaw.collaboration.subscriber import ArtifactSubscriber
from researchclaw.collaboration.dedup import content_hash, deduplicate_artifacts


# ── Fixtures ─────────────────────────────────────────────────────────


@pytest.fixture
def repo(tmp_path: Path) -> ResearchRepository:
    # Fresh, empty repository per test.
    return ResearchRepository(repo_dir=tmp_path / "shared_repo")


@pytest.fixture
def populated_repo(repo: ResearchRepository) -> ResearchRepository:
    # Two published runs with literature, results, and a code template.
    repo.publish(
        run_id="run-001",
        artifacts={
            "literature_summary": {"papers": ["Paper A on transformer", "Paper B on vision"]},
            "experiment_results": {"accuracy": 0.95, "model": "ResNet50"},
        },
    )
    repo.publish(
        run_id="run-002",
        artifacts={
            "literature_summary": {"papers": ["Paper C on nlp transformer"]},
            "code_template": "import torch\nmodel = ResNet()\n# pytorch training",
        },
    )
    return repo


@pytest.fixture
def run_dir(tmp_path: Path) -> Path:
    """Create a fake pipeline run directory with stage outputs."""
    d = tmp_path / "run-test"
    d.mkdir()
    # Stage 07 — literature synthesis
    s07 = d / "stage-07-literature_synthesis"
    s07.mkdir()
    (s07 / "synthesis.json").write_text(
        json.dumps({"papers": [{"title": "Test Paper", "year": 2024}]}),
        encoding="utf-8",
    )
    # Stage 10 — code generation
    s10 = d / "stage-10-code_generation"
    s10.mkdir()
    (s10 / "main.py").write_text("print('hello')", encoding="utf-8")
    # Stage 14 — result analysis
    s14 = d / "stage-14-result_analysis"
    s14.mkdir()
    (s14 / "experiment_summary.json").write_text(
        json.dumps({"accuracy": 0.92}), encoding="utf-8"
    )
    # Stage 18 — peer review
    s18 = d / "stage-18-peer_review"
    s18.mkdir()
    (s18 / "review.md").write_text("Good paper overall.", encoding="utf-8")
    return d


# ── Repository Tests ─────────────────────────────────────────────────


class TestResearchRepository:
    """Publish, search, list, and import operations on the shared repo."""

    def test_publish(self, repo: ResearchRepository) -> None:
        count = repo.publish(
            run_id="run-001",
            artifacts={"literature_summary": {"papers": ["P1"]}},
        )
        assert count == 1

    def test_publish_creates_dirs(self, repo: ResearchRepository) -> None:
        repo.publish(
            run_id="run-new",
            artifacts={"code_template": "print('hi')"},
        )
        assert (repo.repo_dir / "run-new").is_dir()

    def test_publish_unknown_type_skipped(self, repo: ResearchRepository) -> None:
        count = repo.publish(
            run_id="run-bad",
            artifacts={"unknown_type": "data"},
        )
        assert count == 0

    def test_search_by_query(self, populated_repo: ResearchRepository) -> None:
        results = populated_repo.search("transformer")
        assert len(results) >= 2

    def test_search_by_type(self, populated_repo: ResearchRepository) -> None:
        results = populated_repo.search(
            "paper", artifact_type="literature_summary"
        )
        assert len(results) >= 1

    def test_search_no_results(self, populated_repo: ResearchRepository) -> None:
        results = populated_repo.search("quantum_nonexistent_xyz")
        assert len(results) == 0

    def test_search_empty_repo(self, repo: ResearchRepository) -> None:
        results = repo.search("anything")
        assert results == []

    def test_list_runs(self, populated_repo: ResearchRepository) -> None:
        runs = populated_repo.list_runs()
        assert "run-001" in runs
        assert "run-002" in runs

    def test_list_runs_empty(self, repo: ResearchRepository) -> None:
        runs = repo.list_runs()
        assert runs == []

    def test_get_run_artifacts(self, populated_repo: ResearchRepository) -> None:
        artifacts = populated_repo.get_run_artifacts("run-001")
        assert "literature_summary" in artifacts
        assert "experiment_results" in artifacts

    def test_get_run_artifacts_missing(self, populated_repo: ResearchRepository) -> None:
        artifacts = populated_repo.get_run_artifacts("run-999")
        assert artifacts == {}

    def test_import_literature(self, populated_repo: ResearchRepository) -> None:
        lit = populated_repo.import_literature("run-001")
        assert isinstance(lit, list)
        assert len(lit) >= 1

    def test_import_literature_missing_run(self, populated_repo: ResearchRepository) -> None:
        lit = populated_repo.import_literature("run-999")
        assert lit == []

    def test_import_code_template(self, populated_repo: ResearchRepository) -> None:
        code = populated_repo.import_code_template("run-002", "pytorch")
        assert code is not None
        assert "torch" in code

    def test_import_code_template_no_match(self, populated_repo: ResearchRepository) -> None:
        code = populated_repo.import_code_template("run-002", "tensorflow_xyz")
        assert code is None


# ── Publisher Tests ──────────────────────────────────────────────────


class TestArtifactPublisher:
    """Extraction of publishable artifacts from a pipeline run directory."""

    def test_publish_from_run_dir(self, run_dir: Path, tmp_path: Path) -> None:
        repo = ResearchRepository(repo_dir=tmp_path / "pub_repo")
        publisher = ArtifactPublisher(repo)
        count = publisher.publish_from_run_dir("test-run", run_dir)
        assert count >= 1

    def test_publish_empty_dir(self, tmp_path: Path) -> None:
        empty = tmp_path / "empty_run"
        empty.mkdir()
        repo = ResearchRepository(repo_dir=tmp_path / "pub_repo2")
        publisher = ArtifactPublisher(repo)
        count = publisher.publish_from_run_dir("empty", empty)
        assert count == 0

    def test_publish_nonexistent_dir(self, tmp_path: Path) -> None:
        repo = ResearchRepository(repo_dir=tmp_path / "pub_repo3")
        publisher = ArtifactPublisher(repo)
        count = publisher.publish_from_run_dir("missing", tmp_path / "nope")
        assert count == 0


# ── Subscriber Tests ─────────────────────────────────────────────────


class TestArtifactSubscriber:
    """Queries over a populated repository."""

    def test_find_relevant_literature(self, populated_repo: ResearchRepository) -> None:
        sub = ArtifactSubscriber(populated_repo)
        results = sub.find_relevant_literature("transformer")
        assert len(results) >= 1

    def test_find_similar_experiments(self, populated_repo: ResearchRepository) -> None:
        sub = ArtifactSubscriber(populated_repo)
        results = sub.find_similar_experiments("resnet")
        assert len(results) >= 1

    def test_find_code_templates(self, populated_repo: ResearchRepository) -> None:
        sub = ArtifactSubscriber(populated_repo)
        results = sub.find_code_templates("pytorch")
        assert len(results) >= 1

    def test_import_best_practices(self, populated_repo: ResearchRepository) -> None:
        sub = ArtifactSubscriber(populated_repo)
        practices = sub.import_best_practices("transformer")
        assert isinstance(practices, str)

    def test_import_best_practices_empty(self, repo: ResearchRepository) -> None:
        sub = ArtifactSubscriber(repo)
        practices = sub.import_best_practices("nonexistent")
        assert practices == ""


# ── Dedup Tests ──────────────────────────────────────────────────────


class TestDedup:
    """content_hash determinism and artifact deduplication."""

    def test_content_hash_deterministic(self) -> None:
        # Key order must not change the hash.
        h1 = content_hash({"a": 1, "b": 2})
        h2 = content_hash({"b": 2, "a": 1})
        assert h1 == h2

    def test_content_hash_different(self) -> None:
        h1 = content_hash({"a": 1})
        h2 = content_hash({"a": 2})
        assert h1 != h2

    def test_deduplicate_artifacts(self) -> None:
        artifacts = [
            {"content": {"x": 1}, "tags": ["a"]},
            {"content": {"x": 1}, "tags": ["b"]},  # duplicate content
            {"content": {"y": 2}, "tags": ["c"]},
        ]
        unique = deduplicate_artifacts(artifacts)
        assert len(unique) == 2

    def test_deduplicate_empty(self) -> None:
        assert deduplicate_artifacts([]) == []


================================================
FILE: tests/test_compiler.py
================================================
"""Tests for researchclaw.templates.compiler — BUG-197 and general compilation.

BUG-197: pdflatex stdout containing broken UTF-8 (from U+202F error messages)
caused UnicodeDecodeError that killed the compilation pipeline, preventing
bibtex from running and leaving all citations as [?].
""" from __future__ import annotations import re from pathlib import Path from unittest.mock import MagicMock, patch import pytest from researchclaw.templates.compiler import ( CompileResult, _is_fatal_error, _sanitize_tex_unicode, fix_common_latex_errors, ) # --------------------------------------------------------------------------- # _is_fatal_error # --------------------------------------------------------------------------- class TestIsFatalError: """Test that _is_fatal_error correctly classifies errors.""" def test_unicode_char_not_set_up_is_nonfatal(self): """BUG-197: Unicode character errors should be non-fatal. The error line captured by _parse_log is a single line: ``! LaTeX Error: Unicode character X (U+202F)`` (the "not set up" text is on a continuation line). """ err = "! LaTeX Error: Unicode character \u202f (U+202F)" assert not _is_fatal_error(err) def test_unicode_char_various_codepoints_nonfatal(self): """Various Unicode character codepoints should be non-fatal.""" for cp in ["U+00A0", "U+2009", "U+2007", "U+3000"]: err = f"! LaTeX Error: Unicode character X ({cp})" assert not _is_fatal_error(err), f"Expected non-fatal for {cp}" def test_undefined_control_sequence_is_fatal(self): err = "! Undefined control sequence." assert _is_fatal_error(err) def test_missing_dollar_is_fatal(self): err = "! Missing $ inserted." assert _is_fatal_error(err) def test_overfull_hbox_is_nonfatal(self): err = "! Overfull \\hbox (12.3pt too wide)" assert not _is_fatal_error(err) def test_float_lost_is_nonfatal(self): err = "! Float(s) lost." assert not _is_fatal_error(err) def test_unavailable_in_encoding_is_nonfatal(self): err = "! Package inputenc Error: Unicode character unavailable in encoding OT1." assert not _is_fatal_error(err) def test_emergency_stop_is_fatal(self): err = "! ==> Fatal error occurred, no output PDF file produced!" assert _is_fatal_error(err) def test_non_bang_file_not_found_is_fatal(self): err = "File `missing.sty' not found." 
        assert _is_fatal_error(err)


# ---------------------------------------------------------------------------
# _sanitize_tex_unicode
# ---------------------------------------------------------------------------


class TestSanitizeTexUnicode:
    """Test that _sanitize_tex_unicode strips problematic Unicode."""

    def test_replaces_narrow_no_break_space(self, tmp_path: Path):
        """BUG-197: U+202F should be replaced with ASCII space."""
        tex = tmp_path / "test.tex"
        tex.write_text("Hello\u202fWorld\n", encoding="utf-8")
        _sanitize_tex_unicode(tex)
        assert tex.read_text(encoding="utf-8") == "Hello World\n"

    def test_replaces_no_break_space(self, tmp_path: Path):
        """U+00A0 should be replaced with ASCII space."""
        tex = tmp_path / "test.tex"
        tex.write_text("Hello\u00a0World\n", encoding="utf-8")
        _sanitize_tex_unicode(tex)
        assert tex.read_text(encoding="utf-8") == "Hello World\n"

    def test_removes_zero_width_space(self, tmp_path: Path):
        """U+200B should be removed entirely."""
        tex = tmp_path / "test.tex"
        tex.write_text("Hello\u200bWorld\n", encoding="utf-8")
        _sanitize_tex_unicode(tex)
        assert tex.read_text(encoding="utf-8") == "HelloWorld\n"

    def test_removes_bom(self, tmp_path: Path):
        """U+FEFF BOM should be removed."""
        tex = tmp_path / "test.tex"
        tex.write_text("\ufeffHello\n", encoding="utf-8")
        _sanitize_tex_unicode(tex)
        assert tex.read_text(encoding="utf-8") == "Hello\n"

    def test_preserves_normal_text(self, tmp_path: Path):
        """Normal ASCII + standard Unicode should be untouched."""
        content = "Hello World, \\section{Intro} $x^2$\n"
        tex = tmp_path / "test.tex"
        tex.write_text(content, encoding="utf-8")
        _sanitize_tex_unicode(tex)
        assert tex.read_text(encoding="utf-8") == content

    def test_handles_multiple_types(self, tmp_path: Path):
        """Multiple types of problematic chars in one file."""
        tex = tmp_path / "test.tex"
        tex.write_text(
            "A\u202fB\u00a0C\u200bD\u200eE\n",
            encoding="utf-8",
        )
        _sanitize_tex_unicode(tex)
        result = tex.read_text(encoding="utf-8")
        # U+202F/U+00A0 become spaces; U+200B/U+200E are dropped entirely.
        assert result == "A B CDE\n"

    def test_nonexistent_file(self, tmp_path: Path):
        """Should not crash on nonexistent file."""
        _sanitize_tex_unicode(tmp_path / "nonexistent.tex")

    def test_cyrillic_transliterated_to_latin(self, tmp_path: Path):
        """BUG-201: Cyrillic author names should be transliterated."""
        tex = tmp_path / "test.tex"
        tex.write_text(
            "А. И. Колесников\n",
            encoding="utf-8",
        )
        _sanitize_tex_unicode(tex)
        result = tex.read_text(encoding="utf-8")
        assert "А" not in result  # no Cyrillic left
        assert "И" not in result
        assert "A. I. Kolesnikov" in result


# ---------------------------------------------------------------------------
# _sanitize_bib_file — Cyrillic transliteration
# ---------------------------------------------------------------------------


class TestSanitizeBibFile:
    """Test _sanitize_bib_file fixes."""

    def test_cyrillic_author_transliterated(self, tmp_path: Path):
        """BUG-201: Cyrillic in bib author names should be transliterated."""
        from researchclaw.templates.compiler import _sanitize_bib_file

        bib = tmp_path / "references.bib"
        bib.write_text(
            '@article{dehghani2023scaling,\n'
            ' author = {А. И. Колесников and J. Doe},\n'
            ' title = {Scaling Vision},\n'
            '}\n',
            encoding="utf-8",
        )
        _sanitize_bib_file(bib)
        result = bib.read_text(encoding="utf-8")
        assert "А" not in result
        assert "A. I. Kolesnikov" in result
        assert "J. Doe" in result  # Latin unchanged


# ---------------------------------------------------------------------------
# fix_common_latex_errors — Unicode handler
# ---------------------------------------------------------------------------


class TestFixUnicodeErrors:
    """Test fix_common_latex_errors for Unicode character issues."""

    def test_unicode_u202f_replaced_with_space(self):
        """BUG-197: U+202F in text should be replaced with space."""
        tex = "Hello\u202fWorld"
        errors = [
            "! LaTeX Error: Unicode character \u202f (U+202F)"
        ]
        fixed, fixes = fix_common_latex_errors(tex, errors)
        assert "\u202f" not in fixed
        assert "Hello World" in fixed
        assert any("U+202F" in f for f in fixes)

    def test_unicode_u200b_removed(self):
        """U+200B (zero-width space, category Cf) should be removed."""
        tex = "Hello\u200bWorld"
        errors = [
            "! LaTeX Error: Unicode character \u200b (U+200B)"
        ]
        fixed, fixes = fix_common_latex_errors(tex, errors)
        assert "\u200b" not in fixed
        assert "HelloWorld" in fixed

    def test_no_unicode_error_no_change(self):
        """Text without the offending char should not be modified."""
        tex = "Hello World"
        errors = [
            "! LaTeX Error: Unicode character \u202f (U+202F)"
        ]
        fixed, fixes = fix_common_latex_errors(tex, errors)
        assert fixed == tex
        # No fix should be applied since the char isn't in the text
        assert not any("U+202F" in f for f in fixes)


# ---------------------------------------------------------------------------
# _run_pdflatex — bytes mode decoding
# ---------------------------------------------------------------------------


class TestRunPdflatexByteMode:
    """Test that _run_pdflatex handles broken UTF-8 in stdout."""

    @patch("researchclaw.templates.compiler.subprocess.run")
    def test_broken_utf8_in_stdout_does_not_crash(self, mock_run):
        """BUG-197: Broken UTF-8 bytes should be decoded with replacement."""
        from researchclaw.templates.compiler import _run_pdflatex

        # Simulate pdflatex returning broken UTF-8 in stdout
        mock_proc = MagicMock()
        mock_proc.stdout = b"Normal output \xe2\x80 broken"  # Invalid UTF-8
        mock_proc.stderr = b""
        mock_proc.returncode = 1
        mock_run.return_value = mock_proc
        log_text, success = _run_pdflatex(Path("/tmp"), "test.tex", timeout=60)
        assert log_text is not None
        assert "Normal output" in log_text
        assert not success

    @patch("researchclaw.templates.compiler.subprocess.run")
    def test_valid_utf8_works(self, mock_run):
        """Normal UTF-8 output should work fine."""
        from researchclaw.templates.compiler import _run_pdflatex

        mock_proc = MagicMock()
        mock_proc.stdout = b"Output written on test.pdf (1 page)"
        mock_proc.stderr = b""
        mock_proc.returncode = 0
        mock_run.return_value = mock_proc
        log_text, success = _run_pdflatex(Path("/tmp"), "test.tex", timeout=60)
        assert log_text is not None
        assert "Output written" in log_text
        assert success


# ---------------------------------------------------------------------------
# _run_bibtex — bytes mode decoding + logging
# ---------------------------------------------------------------------------


class TestRunBibtex:
    """Test that _run_bibtex handles errors and logs properly."""

    @patch("researchclaw.templates.compiler.shutil.which", return_value="/usr/bin/bibtex")
    @patch("researchclaw.templates.compiler.subprocess.run")
    def test_bibtex_failure_logged(self, mock_run, mock_which, tmp_path):
        """Failed bibtex should log warning and return False."""
        from researchclaw.templates.compiler import _run_bibtex

        mock_proc = MagicMock()
        mock_proc.stdout = b"I couldn't open file name.aux"
        mock_proc.stderr = b""
        mock_proc.returncode = 1
        mock_run.return_value = mock_proc
        result = _run_bibtex(tmp_path, "paper", timeout=60)
        assert result is False

    @patch("researchclaw.templates.compiler.shutil.which", return_value="/usr/bin/bibtex")
    @patch("researchclaw.templates.compiler.subprocess.run")
    def test_bibtex_success_with_bbl(self, mock_run, mock_which, tmp_path):
        """Successful bibtex with .bbl creation should return True."""
        from researchclaw.templates.compiler import _run_bibtex

        # Create fake .bbl so the check passes
        (tmp_path / "paper.bbl").write_text("\\begin{thebibliography}{}")
        mock_proc = MagicMock()
        mock_proc.stdout = b"Database file #1: references.bib"
        mock_proc.stderr = b""
        mock_proc.returncode = 0
        mock_run.return_value = mock_proc
        result = _run_bibtex(tmp_path, "paper", timeout=60)
        assert result is True

    @patch("researchclaw.templates.compiler.shutil.which", return_value=None)
    def test_bibtex_not_found(self, mock_which, tmp_path):
        """Missing bibtex binary should return False."""
        from researchclaw.templates.compiler import _run_bibtex

        result = _run_bibtex(tmp_path, "paper", timeout=60)
        assert result is False

    @patch("researchclaw.templates.compiler.shutil.which", return_value="/usr/bin/bibtex")
    @patch("researchclaw.templates.compiler.subprocess.run")
    def test_bibtex_broken_utf8(self, mock_run, mock_which, tmp_path):
        """BUG-197: Broken UTF-8 in bibtex output should not crash."""
        from researchclaw.templates.compiler import _run_bibtex

        (tmp_path / "paper.bbl").write_text("\\begin{thebibliography}{}")
        mock_proc = MagicMock()
        mock_proc.stdout = b"Database file \xe2\x80 broken"
        mock_proc.stderr = b""
        mock_proc.returncode = 0
        mock_run.return_value = mock_proc
        # Should not raise
        result = _run_bibtex(tmp_path, "paper", timeout=60)
        assert result is True


================================================
FILE: tests/test_convergence_evaluator.py
================================================
"""Tests for the convergence study evaluator."""

from __future__ import annotations

import math

import pytest

from researchclaw.experiment.evaluators.convergence import (
    ConvergenceReport,
    ConvergenceResult,
    analyze_convergence,
    compute_convergence_order,
)


# ---------------------------------------------------------------------------
# compute_convergence_order tests
# ---------------------------------------------------------------------------


class TestComputeConvergenceOrder:
    def test_second_order(self):
        """h, h/2, h/4, h/8 with error ~ h^2."""
        hs = [0.1, 0.05, 0.025, 0.0125]
        errors = [h**2 for h in hs]
        order, r2 = compute_convergence_order(hs, errors)
        assert abs(order - 2.0) < 0.1
        assert r2 > 0.99

    def test_fourth_order(self):
        """Error ~ h^4."""
        hs = [0.1, 0.05, 0.025, 0.0125]
        errors = [h**4 for h in hs]
        order, r2 = compute_convergence_order(hs, errors)
        assert abs(order - 4.0) < 0.1
        assert r2 > 0.99

    def test_first_order(self):
        """Error ~ h."""
        hs = [0.1, 0.05, 0.025, 0.0125]
        errors = [h for h in hs]
        order, r2 = compute_convergence_order(hs, errors)
        assert abs(order - 1.0) < 0.1

    def test_too_few_points(self):
        # A single data point cannot define a slope, so the function
        # is expected to report zero order and zero fit quality.
        order, r2 = compute_convergence_order([0.1], [0.01])
        assert order == 0.0
        assert r2 == 0.0

    def test_empty_input(self):
        order, r2 = compute_convergence_order([], [])
        assert order == 0.0

    def test_filters_invalid(self):
        hs = [0.1, 0.0, 0.025, -0.01]  # 0 and negative should be filtered
        errors = [0.01, 0.0, 0.001, 0.0001]
        order, r2 = compute_convergence_order(hs, errors)
        # Should still work with valid points
        assert order > 0


# ---------------------------------------------------------------------------
# analyze_convergence tests
# ---------------------------------------------------------------------------


class TestAnalyzeConvergence:
    def test_single_method(self):
        data = {
            "euler": [
                {"h": 0.1, "error": 0.1},
                {"h": 0.05, "error": 0.05},
                {"h": 0.025, "error": 0.025},
            ]
        }
        report = analyze_convergence(data)
        assert len(report.methods) == 1
        assert report.methods[0].method == "euler"
        assert abs(report.methods[0].convergence_order - 1.0) < 0.2
        assert report.best_method == "euler"

    def test_multiple_methods(self):
        data = {
            "euler": [
                {"h": 0.1, "error": 0.1},
                {"h": 0.05, "error": 0.05},
                {"h": 0.025, "error": 0.025},
            ],
            "rk4": [
                {"h": 0.1, "error": 1e-4},
                {"h": 0.05, "error": 6.25e-6},
                {"h": 0.025, "error": 3.9e-7},
            ],
        }
        report = analyze_convergence(data)
        assert len(report.methods) == 2
        # RK4 should have higher order
        orders = {r.method: r.convergence_order for r in report.methods}
        assert orders["rk4"] > orders["euler"]
        assert report.best_method == "rk4"

    def test_expected_orders(self):
        data = {
            "euler": [
                {"h": 0.1, "error": 0.1},
                {"h": 0.05, "error": 0.05},
                {"h": 0.025, "error": 0.025},
            ],
        }
        report = analyze_convergence(data, expected_orders={"euler": 1.0})
        assert report.methods[0].expected_order == 1.0
        assert report.methods[0].order_matches_expected is True

    def test_non_converging(self):
        data = {
            "bad_method": [
                {"h": 0.1, "error": 0.5},
                {"h": 0.05, "error": 0.6},  # error increases
                {"h": 0.025, "error": 0.7},
            ],
        }
        report = analyze_convergence(data)
        # Negative or very low order indicates no convergence
        assert not report.methods[0].is_converging

    def test_summary_string(self):
        data = {
            "method_a": [
                {"h": 0.1, "error": 0.01},
                {"h": 0.05, "error": 0.0025},
            ],
        }
        report = analyze_convergence(data)
        assert report.summary  # should not be empty
        assert "method_a" in report.summary

    def test_l2_error_key(self):
        """Should handle l2_error as the error key."""
        data = {
            "fem": [
                {"h": 0.1, "l2_error": 0.01},
                {"h": 0.05, "l2_error": 0.0025},
                {"h": 0.025, "l2_error": 0.000625},
            ],
        }
        report = analyze_convergence(data)
        assert abs(report.methods[0].convergence_order - 2.0) < 0.2

    def test_empty_data(self):
        report = analyze_convergence({})
        assert len(report.methods) == 0
        assert report.best_method == ""


================================================
FILE: tests/test_copilot.py
================================================
"""Tests for researchclaw.copilot — Interactive Co-Pilot Mode (Agent D2).

30+ tests covering modes, feedback, branching, and controller.
""" from __future__ import annotations import json import shutil import time from datetime import date, timedelta from pathlib import Path from typing import Any from unittest.mock import patch import pytest from researchclaw.copilot.modes import ResearchMode from researchclaw.copilot.feedback import ( FEEDBACK_ACTIONS, Feedback, FeedbackHandler, ) from researchclaw.copilot.branching import BranchManager from researchclaw.copilot.controller import CoPilotController from researchclaw.config import CoPilotConfig # =================================================================== # ResearchMode tests # =================================================================== class TestResearchMode: def test_all_modes(self): assert ResearchMode.CO_PILOT.value == "co-pilot" assert ResearchMode.AUTO_PILOT.value == "auto-pilot" assert ResearchMode.ZERO_TOUCH.value == "zero-touch" def test_from_value(self): assert ResearchMode("co-pilot") == ResearchMode.CO_PILOT assert ResearchMode("auto-pilot") == ResearchMode.AUTO_PILOT assert ResearchMode("zero-touch") == ResearchMode.ZERO_TOUCH def test_invalid_mode_raises(self): with pytest.raises(ValueError): ResearchMode("invalid") def test_mode_count(self): assert len(ResearchMode) == 3 # =================================================================== # Feedback tests # =================================================================== class TestFeedback: def test_feedback_actions_defined(self): expected = {"approve", "modify", "retry", "skip", "discuss", "branch", "rollback"} assert FEEDBACK_ACTIONS == expected def test_feedback_frozen(self): fb = Feedback(action="approve", stage=5) with pytest.raises(AttributeError): fb.action = "retry" # type: ignore[misc] def test_feedback_defaults(self): fb = Feedback(action="approve", stage=1) assert fb.message == "" assert fb.modifications is None assert fb.branch_name == "" assert fb.rollback_to is None def test_feedback_with_modifications(self): fb = Feedback( action="modify", stage=5, 
message="Update hypothesis", modifications={"hypothesis": "new hypothesis"}, ) assert fb.modifications == {"hypothesis": "new hypothesis"} # =================================================================== # FeedbackHandler tests # =================================================================== class TestFeedbackHandler: def test_write_feedback_request(self, tmp_path: Path): handler = FeedbackHandler(tmp_path) request_path = handler.write_feedback_request( stage=5, stage_name="LITERATURE_SCREEN", summary="10 papers screened", ) assert request_path.exists() data = json.loads(request_path.read_text(encoding="utf-8")) assert data["stage"] == 5 assert data["stage_name"] == "LITERATURE_SCREEN" assert data["status"] == "waiting" assert isinstance(data["options"], list) def test_read_feedback_response_valid(self, tmp_path: Path): handler = FeedbackHandler(tmp_path) response = { "action": "approve", "stage": 5, "message": "Looks good", } resp_path = tmp_path / "copilot_feedback_response.json" resp_path.write_text(json.dumps(response), encoding="utf-8") fb = handler.read_feedback_response() assert fb is not None assert fb.action == "approve" assert fb.stage == 5 assert fb.message == "Looks good" def test_read_feedback_response_invalid_action(self, tmp_path: Path): handler = FeedbackHandler(tmp_path) response = {"action": "invalid_action", "stage": 5} resp_path = tmp_path / "copilot_feedback_response.json" resp_path.write_text(json.dumps(response), encoding="utf-8") fb = handler.read_feedback_response() assert fb is None def test_read_feedback_response_missing(self, tmp_path: Path): handler = FeedbackHandler(tmp_path) assert handler.read_feedback_response() is None def test_read_feedback_response_malformed(self, tmp_path: Path): handler = FeedbackHandler(tmp_path) resp_path = tmp_path / "copilot_feedback_response.json" resp_path.write_text("{invalid json", encoding="utf-8") assert handler.read_feedback_response() is None def 
test_read_feedback_response_with_rollback(self, tmp_path: Path): handler = FeedbackHandler(tmp_path) response = { "action": "rollback", "stage": 15, "rollback_to": 8, } resp_path = tmp_path / "copilot_feedback_response.json" resp_path.write_text(json.dumps(response), encoding="utf-8") fb = handler.read_feedback_response() assert fb is not None assert fb.action == "rollback" assert fb.rollback_to == 8 def test_read_feedback_response_branch(self, tmp_path: Path): handler = FeedbackHandler(tmp_path) response = { "action": "branch", "stage": 9, "branch_name": "alt_experiment", } resp_path = tmp_path / "copilot_feedback_response.json" resp_path.write_text(json.dumps(response), encoding="utf-8") fb = handler.read_feedback_response() assert fb is not None assert fb.branch_name == "alt_experiment" def test_clear_request(self, tmp_path: Path): handler = FeedbackHandler(tmp_path) handler.write_feedback_request(1, "TOPIC_INIT", "Done") handler.clear_request() assert not (tmp_path / "copilot_feedback_request.json").exists() def test_clear_request_no_file(self, tmp_path: Path): handler = FeedbackHandler(tmp_path) handler.clear_request() # should not raise def test_wait_for_feedback_timeout(self, tmp_path: Path): handler = FeedbackHandler(tmp_path) result = handler.wait_for_feedback(stage=1, timeout_sec=0, poll_interval_sec=0.01) assert result is None def test_wait_for_feedback_finds_response(self, tmp_path: Path): handler = FeedbackHandler(tmp_path) # Pre-clear any stale response (wait_for_feedback clears first) # Then write a response matching stage response = {"action": "approve", "stage": 5} resp_path = tmp_path / "copilot_feedback_response.json" def write_response(): """Simulate delayed response writing.""" time.sleep(0.05) resp_path.write_text(json.dumps(response), encoding="utf-8") import threading t = threading.Thread(target=write_response) t.start() fb = handler.wait_for_feedback(stage=5, timeout_sec=2, poll_interval_sec=0.02) t.join() assert fb is not None assert 
fb.action == "approve" # =================================================================== # BranchManager tests # =================================================================== class TestBranchManager: def test_create_branch(self, tmp_path: Path): # Create stage dirs (tmp_path / "stage-01").mkdir() (tmp_path / "stage-01" / "output.json").write_text("{}") (tmp_path / "stage-02").mkdir() (tmp_path / "stage-02" / "result.txt").write_text("ok") bm = BranchManager(tmp_path, max_branches=3) branch_path = bm.create_branch("exp_alt", from_stage=2) assert Path(branch_path).exists() assert (Path(branch_path) / "stage-01" / "output.json").exists() assert (Path(branch_path) / "stage-02" / "result.txt").exists() assert (Path(branch_path) / "branch_meta.json").exists() meta = json.loads( (Path(branch_path) / "branch_meta.json").read_text(encoding="utf-8") ) assert meta["name"] == "exp_alt" assert meta["from_stage"] == 2 def test_create_branch_max_reached(self, tmp_path: Path): bm = BranchManager(tmp_path, max_branches=1) bm.create_branch("b1", from_stage=1) with pytest.raises(ValueError, match="Maximum branches"): bm.create_branch("b2", from_stage=1) def test_create_branch_duplicate_name(self, tmp_path: Path): bm = BranchManager(tmp_path, max_branches=5) bm.create_branch("dup", from_stage=1) with pytest.raises(ValueError, match="already exists"): bm.create_branch("dup", from_stage=1) def test_list_branches_empty(self, tmp_path: Path): bm = BranchManager(tmp_path) assert bm.list_branches() == [] def test_list_branches(self, tmp_path: Path): bm = BranchManager(tmp_path, max_branches=5) bm.create_branch("alpha", from_stage=1) bm.create_branch("beta", from_stage=2) branches = bm.list_branches() assert len(branches) == 2 names = {b["name"] for b in branches} assert names == {"alpha", "beta"} def test_switch_branch(self, tmp_path: Path): bm = BranchManager(tmp_path, max_branches=3) bm.create_branch("test_branch", from_stage=1) path = bm.switch_branch("test_branch") assert 
path.exists() def test_switch_branch_nonexistent(self, tmp_path: Path): bm = BranchManager(tmp_path) with pytest.raises(ValueError, match="does not exist"): bm.switch_branch("nonexistent") def test_delete_branch(self, tmp_path: Path): bm = BranchManager(tmp_path, max_branches=3) bm.create_branch("doomed", from_stage=1) assert len(bm.list_branches()) == 1 bm.delete_branch("doomed") assert len(bm.list_branches()) == 0 def test_delete_branch_nonexistent(self, tmp_path: Path): bm = BranchManager(tmp_path) with pytest.raises(ValueError, match="does not exist"): bm.delete_branch("ghost") def test_compare_branches(self, tmp_path: Path): bm = BranchManager(tmp_path, max_branches=5) (tmp_path / "stage-01").mkdir() (tmp_path / "stage-02").mkdir() bm.create_branch("a", from_stage=2) bm.create_branch("b", from_stage=1) result = bm.compare_branches("a", "b") assert result["branch_a"] == "a" assert result["stages_a"] == 2 assert result["stages_b"] == 1 def test_compare_branches_nonexistent(self, tmp_path: Path): bm = BranchManager(tmp_path, max_branches=3) bm.create_branch("real", from_stage=1) result = bm.compare_branches("real", "fake") assert "error" in result def test_count_stages(self, tmp_path: Path): (tmp_path / "stage-01").mkdir() (tmp_path / "stage-02").mkdir() (tmp_path / "other_dir").mkdir() assert BranchManager._count_stages(tmp_path) == 2 # =================================================================== # CoPilotController tests # =================================================================== class TestCoPilotController: def _make_config(self, **overrides) -> CoPilotConfig: defaults = { "mode": "co-pilot", "pause_at_gates": True, "pause_at_every_stage": False, "feedback_timeout_sec": 3600, "allow_branching": True, "max_branches": 3, } defaults.update(overrides) return CoPilotConfig(**defaults) def test_should_pause_zero_touch(self, tmp_path: Path): config = self._make_config(mode="zero-touch") ctrl = CoPilotController(config, tmp_path) assert 
ctrl.should_pause(5, is_gate=True) is False assert ctrl.should_pause(1, is_gate=False) is False def test_should_pause_auto_pilot_gate(self, tmp_path: Path): config = self._make_config(mode="auto-pilot") ctrl = CoPilotController(config, tmp_path) assert ctrl.should_pause(5, is_gate=True) is True assert ctrl.should_pause(1, is_gate=False) is False def test_should_pause_auto_pilot_gates_disabled(self, tmp_path: Path): config = self._make_config(mode="auto-pilot", pause_at_gates=False) ctrl = CoPilotController(config, tmp_path) assert ctrl.should_pause(5, is_gate=True) is False def test_should_pause_copilot_every_stage(self, tmp_path: Path): config = self._make_config(mode="co-pilot", pause_at_every_stage=True) ctrl = CoPilotController(config, tmp_path) assert ctrl.should_pause(1, is_gate=False) is True assert ctrl.should_pause(5, is_gate=True) is True def test_should_pause_copilot_gates_only(self, tmp_path: Path): config = self._make_config(mode="co-pilot", pause_at_every_stage=False) ctrl = CoPilotController(config, tmp_path) assert ctrl.should_pause(5, is_gate=True) is True assert ctrl.should_pause(1, is_gate=False) is False def test_present_stage_result(self, tmp_path: Path): config = self._make_config() ctrl = CoPilotController(config, tmp_path) summary = ctrl.present_stage_result( stage_num=5, stage_name="LITERATURE_SCREEN", artifacts=["screen_report.json"], status="done", ) assert "Stage 5: LITERATURE_SCREEN" in summary assert "Status: done" in summary assert "screen_report.json" in summary def test_present_stage_result_with_error(self, tmp_path: Path): config = self._make_config() ctrl = CoPilotController(config, tmp_path) summary = ctrl.present_stage_result( stage_num=12, stage_name="EXPERIMENT_RUN", artifacts=[], status="failed", error="CUDA out of memory", ) assert "Error: CUDA out of memory" in summary def test_handle_feedback_approve(self, tmp_path: Path): config = self._make_config() ctrl = CoPilotController(config, tmp_path) fb = 
Feedback(action="approve", stage=5) result = ctrl.handle_feedback(fb) assert result["instruction"] == "continue" def test_handle_feedback_modify(self, tmp_path: Path): config = self._make_config() ctrl = CoPilotController(config, tmp_path) fb = Feedback( action="modify", stage=5, message="Change approach", modifications={"key": "value"}, ) result = ctrl.handle_feedback(fb) assert result["instruction"] == "apply_modifications" assert result["modifications"] == {"key": "value"} def test_handle_feedback_retry(self, tmp_path: Path): config = self._make_config() ctrl = CoPilotController(config, tmp_path) fb = Feedback(action="retry", stage=12) result = ctrl.handle_feedback(fb) assert result["instruction"] == "rerun_stage" def test_handle_feedback_skip(self, tmp_path: Path): config = self._make_config() ctrl = CoPilotController(config, tmp_path) fb = Feedback(action="skip", stage=21) result = ctrl.handle_feedback(fb) assert result["instruction"] == "skip_stage" def test_handle_feedback_branch(self, tmp_path: Path): config = self._make_config(allow_branching=True) ctrl = CoPilotController(config, tmp_path) fb = Feedback(action="branch", stage=9, branch_name="alt_design") result = ctrl.handle_feedback(fb) assert result["instruction"] == "branch_created" assert result["branch_name"] == "alt_design" def test_handle_feedback_branch_disabled(self, tmp_path: Path): config = self._make_config(allow_branching=False) ctrl = CoPilotController(config, tmp_path) fb = Feedback(action="branch", stage=9) result = ctrl.handle_feedback(fb) assert result["instruction"] == "branching_disabled" def test_handle_feedback_branch_max_reached(self, tmp_path: Path): config = self._make_config(allow_branching=True, max_branches=1) ctrl = CoPilotController(config, tmp_path) # Create first branch fb1 = Feedback(action="branch", stage=1, branch_name="b1") ctrl.handle_feedback(fb1) # Second branch should fail fb2 = Feedback(action="branch", stage=2, branch_name="b2") result = ctrl.handle_feedback(fb2) 
assert result["instruction"] == "branch_failed" def test_handle_feedback_rollback(self, tmp_path: Path): config = self._make_config() ctrl = CoPilotController(config, tmp_path) fb = Feedback(action="rollback", stage=15, rollback_to=8) result = ctrl.handle_feedback(fb) assert result["instruction"] == "rollback" assert result["rollback_to"] == 8 def test_handle_feedback_unknown_action(self, tmp_path: Path): config = self._make_config() ctrl = CoPilotController(config, tmp_path) # Construct with a technically valid action but unhandled by match fb = Feedback(action="discuss", stage=1) result = ctrl.handle_feedback(fb) assert result["instruction"] == "continue" def test_from_config_zero_touch_returns_none(self, tmp_path: Path): config = self._make_config(mode="zero-touch") ctrl = CoPilotController.from_config(config, tmp_path) assert ctrl is None def test_from_config_copilot_returns_controller(self, tmp_path: Path): config = self._make_config(mode="co-pilot") ctrl = CoPilotController.from_config(config, tmp_path) assert ctrl is not None assert isinstance(ctrl, CoPilotController) def test_from_config_auto_pilot_returns_controller(self, tmp_path: Path): config = self._make_config(mode="auto-pilot") ctrl = CoPilotController.from_config(config, tmp_path) assert ctrl is not None def test_handle_feedback_branch_default_name(self, tmp_path: Path): config = self._make_config(allow_branching=True) ctrl = CoPilotController(config, tmp_path) fb = Feedback(action="branch", stage=9) # no branch_name result = ctrl.handle_feedback(fb) assert result["instruction"] == "branch_created" assert result["branch_name"] == "branch_9" ================================================ FILE: tests/test_decision_agent.py ================================================ """Tests for FigureDecisionAgent, NanoBananaAgent, and Docker renderer. 
Covers: - FigureDecisionAgent._parse_decisions() — JSON parsing edge cases - FigureDecisionAgent._heuristic_decide() — fallback coverage - FigureDecisionAgent._infer_backend() — backend classification - FigureDecisionAgent._enforce_bounds() — min/max enforcement - NanoBananaAgent._build_prompt() — prompt construction - NanoBananaAgent._get_type_guidelines() — guideline lookup - RendererAgent._execute_in_docker() — docker command construction - strip_thinking_tags() — safety verification - End-to-end decision + orchestration with mock LLM """ from __future__ import annotations import json import os import subprocess from dataclasses import dataclass from pathlib import Path from typing import Any from unittest import mock import pytest # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- @dataclass class _FakeLLMResponse: content: str = "" model: str = "gpt-4.1" prompt_tokens: int = 100 completion_tokens: int = 200 total_tokens: int = 300 finish_reason: str = "stop" truncated: bool = False raw: dict = None # type: ignore[assignment] def __post_init__(self): if self.raw is None: self.raw = {} class _FakeLLM: """Minimal mock LLM client.""" def __init__(self, response: str = "{}"): self._response = response self.calls: list[dict[str, Any]] = [] def chat(self, messages, *, system=None, max_tokens=None, temperature=None, json_mode=False, **kwargs): self.calls.append({ "messages": messages, "system": system, "json_mode": json_mode, }) return _FakeLLMResponse(content=self._response) # ========================================================================= # FigureDecisionAgent._parse_decisions() # ========================================================================= class TestParseDecisions: """Edge cases for JSON parsing in the decision agent.""" def _agent(self): from researchclaw.agents.figure_agent.decision import FigureDecisionAgent return 
FigureDecisionAgent(_FakeLLM()) def test_valid_json_array(self): agent = self._agent() raw = json.dumps([ { "section": "Method", "figure_type": "architecture_diagram", "backend": "image", "description": "Architecture overview", "priority": 1, }, { "section": "Results", "figure_type": "bar_comparison", "backend": "code", "description": "Main results", "priority": 1, }, ]) decisions = agent._parse_decisions(raw) assert len(decisions) == 2 assert decisions[0]["backend"] == "image" assert decisions[1]["backend"] == "code" def test_json_inside_markdown_fences(self): agent = self._agent() raw = '```json\n[{"section": "Method", "figure_type": "pipeline_overview", "backend": "image", "description": "Pipeline", "priority": 1}]\n```' decisions = agent._parse_decisions(raw) assert len(decisions) == 1 assert decisions[0]["figure_type"] == "pipeline_overview" def test_json_with_surrounding_text(self): agent = self._agent() raw = 'Here are the decisions:\n[{"section": "Results", "figure_type": "heatmap", "backend": "code", "description": "Heatmap", "priority": 2}]\nThat is all.' 
decisions = agent._parse_decisions(raw) assert len(decisions) == 1 def test_no_json_array_raises(self): agent = self._agent() with pytest.raises(ValueError, match="No JSON array"): agent._parse_decisions("This is not JSON at all.") def test_empty_array(self): agent = self._agent() decisions = agent._parse_decisions("[]") assert decisions == [] def test_non_dict_items_skipped(self): agent = self._agent() raw = json.dumps([ "not a dict", 42, {"section": "Method", "figure_type": "architecture_diagram", "backend": "image", "description": "Arch", "priority": 1}, ]) decisions = agent._parse_decisions(raw) assert len(decisions) == 1 def test_invalid_backend_auto_inferred(self): agent = self._agent() raw = json.dumps([ {"section": "Method", "figure_type": "architecture_diagram", "backend": "invalid_backend", "description": "Arch", "priority": 1}, ]) decisions = agent._parse_decisions(raw) assert decisions[0]["backend"] == "image" # architecture → image def test_missing_fields_get_defaults(self): agent = self._agent() raw = json.dumps([{}]) decisions = agent._parse_decisions(raw) assert len(decisions) == 1 assert decisions[0]["section"] == "Results" assert decisions[0]["figure_type"] == "bar_comparison" assert decisions[0]["backend"] == "code" assert decisions[0]["priority"] == 2 # ========================================================================= # FigureDecisionAgent._heuristic_decide() # ========================================================================= class TestHeuristicDecide: """Test the rule-based fallback decision logic.""" def _agent(self, min_figures=3, max_figures=10): from researchclaw.agents.figure_agent.decision import FigureDecisionAgent return FigureDecisionAgent( _FakeLLM(), min_figures=min_figures, max_figures=max_figures ) def test_with_experiments(self): agent = self._agent() decisions = agent._heuristic_decide( topic="Graph anomaly detection", has_experiments=True, condition_summaries={"proposed": {}, "baseline": {}, "ablation": {}}, ) # 
        # Should have: arch_diagram + bar_comparison + training_curve + pipeline
        assert len(decisions) >= 4
        backends = {d["backend"] for d in decisions}
        assert "code" in backends
        assert "image" in backends

    def test_without_experiments(self):
        agent = self._agent()
        decisions = agent._heuristic_decide(
            topic="Theoretical framework",
            has_experiments=False,
            condition_summaries={},
        )
        # Should have: arch_diagram + pipeline (image only, no code)
        assert len(decisions) >= 2
        assert all(d["backend"] == "image" for d in decisions)

    def test_ablation_trigger(self):
        """When >= 4 conditions, an ablation figure should be added."""
        agent = self._agent()
        decisions = agent._heuristic_decide(
            topic="Test",
            has_experiments=True,
            condition_summaries={"a": {}, "b": {}, "c": {}, "d": {}},
        )
        descriptions = [d["description"].lower() for d in decisions]
        assert any("ablation" in desc for desc in descriptions)

    def test_max_figures_respected(self):
        agent = self._agent(max_figures=2)
        decisions = agent._heuristic_decide(
            topic="Test",
            has_experiments=True,
            condition_summaries={"a": {}, "b": {}, "c": {}, "d": {}},
        )
        assert len(decisions) <= 2


# =========================================================================
# FigureDecisionAgent._infer_backend()
# =========================================================================


class TestInferBackend:
    def test_code_types(self):
        from researchclaw.agents.figure_agent.decision import FigureDecisionAgent
        # Data-driven chart types are rendered via generated plotting code.
        code_types = [
            "bar_comparison",
            "line_chart",
            "heatmap",
            "confusion_matrix",
            "training_curve",
            "ablation_chart",
            "scatter_plot",
        ]
        for t in code_types:
            assert FigureDecisionAgent._infer_backend(t) == "code", f"Failed for {t}"

    def test_image_types(self):
        from researchclaw.agents.figure_agent.decision import FigureDecisionAgent
        # Diagrammatic figure types are rendered via image generation.
        image_types = [
            "architecture_diagram",
            "method_flowchart",
            "pipeline_overview",
            "concept_illustration",
            "system_diagram",
        ]
        for t in image_types:
            assert FigureDecisionAgent._infer_backend(t) == "image", f"Failed for {t}"

    def test_unknown_defaults_to_image(self):
        from researchclaw.agents.figure_agent.decision import FigureDecisionAgent
        assert FigureDecisionAgent._infer_backend("unknown_chart_type") == "image"


# =========================================================================
# FigureDecisionAgent._enforce_bounds()
# =========================================================================


class TestEnforceBounds:
    def _agent(self, min_figures=3, max_figures=6):
        from researchclaw.agents.figure_agent.decision import FigureDecisionAgent
        return FigureDecisionAgent(
            _FakeLLM(), min_figures=min_figures, max_figures=max_figures
        )

    def test_min_padding(self):
        """When fewer than min figures, should pad."""
        agent = self._agent(min_figures=4)
        decisions = [
            {"section": "Results", "figure_type": "bar_comparison", "backend": "code", "description": "Test", "priority": 1},
        ]
        result = agent._enforce_bounds(decisions, has_experiments=True)
        assert len(result) >= 4

    def test_max_truncation(self):
        """When more than max figures, should truncate."""
        agent = self._agent(max_figures=3)
        decisions = [
            {"section": f"S{i}", "figure_type": "bar_comparison", "backend": "code", "description": f"Fig {i}", "priority": i}
            for i in range(8)
        ]
        result = agent._enforce_bounds(decisions, has_experiments=True)
        assert len(result) <= 3

    def test_ensures_image_figure(self):
        """Should add architecture diagram if none present."""
        agent = self._agent(min_figures=1)
        decisions = [
            {"section": "Results", "figure_type": "bar_comparison", "backend": "code", "description": "Bar", "priority": 1},
        ]
        result = agent._enforce_bounds(decisions, has_experiments=True)
        assert any(d["backend"] == "image" for d in result)

    def test_ensures_code_figure_with_experiments(self):
        """Should add bar_comparison if experiments exist but no code figure."""
        agent = self._agent(min_figures=1)
        decisions = [
            {"section": "Method", "figure_type": "architecture_diagram", "backend": "image", "description": "Arch", "priority": 1},
        ]
        result = agent._enforce_bounds(decisions, has_experiments=True)
        assert any(d["backend"] == "code" for d in result)


# =========================================================================
# NanoBananaAgent._build_prompt()
# =========================================================================


class TestBuildPrompt:
    def _agent(self):
        from researchclaw.agents.figure_agent.nano_banana import NanoBananaAgent
        return NanoBananaAgent(
            _FakeLLM(),
            gemini_api_key="fake-key",
            use_sdk=False,
        )

    def test_prompt_contains_description(self):
        agent = self._agent()
        prompt = agent._build_prompt(
            description="Encoder-decoder with attention",
            figure_type="architecture_diagram",
            section="Method",
            topic="Graph anomaly detection",
        )
        assert "Encoder-decoder with attention" in prompt
        assert "Method" in prompt
        assert "Graph anomaly detection" in prompt

    def test_prompt_contains_style(self):
        agent = self._agent()
        prompt = agent._build_prompt(
            description="Test",
            figure_type="architecture_diagram",
            section="Method",
            topic="Test",
        )
        assert "academic" in prompt.lower()
        assert "publication" in prompt.lower()

    def test_prompt_varies_by_type(self):
        agent = self._agent()
        arch_prompt = agent._build_prompt(
            description="Test",
            figure_type="architecture_diagram",
            section="Method",
            topic="Test",
        )
        flow_prompt = agent._build_prompt(
            description="Test",
            figure_type="method_flowchart",
            section="Method",
            topic="Test",
        )
        # Different guidelines for different types
        assert arch_prompt != flow_prompt


# =========================================================================
# NanoBananaAgent._get_type_guidelines()
# =========================================================================


class TestGetTypeGuidelines:
    def test_known_types(self):
        from researchclaw.agents.figure_agent.nano_banana import NanoBananaAgent
        known = [
            "architecture_diagram",
            "method_flowchart",
            "pipeline_overview",
            "concept_illustration",
            "system_diagram",
            "attention_visualization",
            "comparison_illustration",
        ]
        for t in known:
            g = NanoBananaAgent._get_type_guidelines(t)
            assert len(g) > 0, f"Empty guidelines for {t}"

    def test_unknown_type_falls_back(self):
        from researchclaw.agents.figure_agent.nano_banana import NanoBananaAgent
        # Unrecognized types fall back to the concept_illustration guidelines.
        g = NanoBananaAgent._get_type_guidelines("totally_unknown")
        fallback = NanoBananaAgent._get_type_guidelines("concept_illustration")
        assert g == fallback


# =========================================================================
# NanoBananaAgent — no API key
# =========================================================================


class TestNanoBananaNoKey:
    def test_execute_without_key_fails(self, tmp_path):
        from researchclaw.agents.figure_agent.nano_banana import NanoBananaAgent
        # Clear env
        with mock.patch.dict(os.environ, {}, clear=True):
            agent = NanoBananaAgent(
                _FakeLLM(),
                gemini_api_key="",
                use_sdk=False,
            )
            result = agent.execute({
                "image_figures": [
                    {"figure_id": "fig_1", "description": "Test", "figure_type": "architecture_diagram", "section": "Method"},
                ],
                "topic": "Test",
                "output_dir": str(tmp_path),
            })
            assert not result.success
            assert "API key" in result.error

    def test_execute_empty_figures_succeeds(self, tmp_path):
        from researchclaw.agents.figure_agent.nano_banana import NanoBananaAgent
        with mock.patch.dict(os.environ, {}, clear=True):
            agent = NanoBananaAgent(
                _FakeLLM(),
                gemini_api_key="",
                use_sdk=False,
            )
            result = agent.execute({
                "image_figures": [],
                "topic": "Test",
                "output_dir": str(tmp_path),
            })
            assert result.success
            assert result.data["count"] == 0


# =========================================================================
# RendererAgent._execute_in_docker() — Docker command construction
# =========================================================================


class TestDockerRenderer:
    def _agent(self):
        from researchclaw.agents.figure_agent.renderer import RendererAgent
        return RendererAgent(
            _FakeLLM(),
            timeout_sec=10,
            use_docker=True,
            docker_image="researchclaw/experiment:latest",
        )

    def test_docker_command_construction(self, tmp_path):
        """Verify
docker command includes security flags.""" agent = self._agent() script_path = tmp_path / "scripts" / "fig_test.py" script_path.parent.mkdir(parents=True, exist_ok=True) script_path.write_text("print('hello')") output_dir = tmp_path / "output" output_dir.mkdir() with mock.patch("subprocess.run") as mock_run: mock_run.return_value = subprocess.CompletedProcess( args=[], returncode=0, stdout="", stderr="" ) agent._execute_in_docker( script_path=script_path, output_dir=output_dir, figure_id="fig_test", ) args = mock_run.call_args cmd = args[0][0] # Verify security flags assert "--network" in cmd assert "none" in cmd assert "--read-only" in cmd assert "--rm" in cmd assert "--memory=512m" in cmd # Verify mount binds cmd_str = " ".join(cmd) assert "script.py:ro" in cmd_str # read-only script assert "output:rw" in cmd_str # writable output def test_docker_timeout_kills_container(self, tmp_path): """Verify container is killed on timeout.""" agent = self._agent() script_path = tmp_path / "scripts" / "fig_timeout.py" script_path.parent.mkdir(parents=True, exist_ok=True) script_path.write_text("import time; time.sleep(999)") output_dir = tmp_path / "output" output_dir.mkdir() with mock.patch("subprocess.run") as mock_run: mock_run.side_effect = subprocess.TimeoutExpired( cmd=["docker", "run"], timeout=10 ) result = agent._execute_in_docker( script_path=script_path, output_dir=output_dir, figure_id="fig_timeout", ) assert "timed out" in result["error"] def test_docker_not_found(self, tmp_path): """Verify graceful handling when Docker is not installed.""" agent = self._agent() script_path = tmp_path / "scripts" / "fig_no_docker.py" script_path.parent.mkdir(parents=True, exist_ok=True) script_path.write_text("print('hello')") output_dir = tmp_path / "output" output_dir.mkdir() with mock.patch("subprocess.run") as mock_run: mock_run.side_effect = FileNotFoundError("docker not found") result = agent._execute_in_docker( script_path=script_path, output_dir=output_dir, 
figure_id="fig_no_docker", ) assert "not found" in result["error"] def test_docker_script_failure(self, tmp_path): """Verify error message includes stderr on non-zero exit.""" agent = self._agent() script_path = tmp_path / "scripts" / "fig_fail.py" script_path.parent.mkdir(parents=True, exist_ok=True) script_path.write_text("raise Exception('boom')") output_dir = tmp_path / "output" output_dir.mkdir() with mock.patch("subprocess.run") as mock_run: mock_run.return_value = subprocess.CompletedProcess( args=[], returncode=1, stdout="", stderr="Traceback: Exception: boom", ) result = agent._execute_in_docker( script_path=script_path, output_dir=output_dir, figure_id="fig_fail", ) assert result["error"] assert "boom" in result["error"] # ========================================================================= # strip_thinking_tags() — safety tests # ========================================================================= class TestStripThinkingTags: def test_closed_tags_removed(self): from researchclaw.utils.thinking_tags import strip_thinking_tags text = "Hello internal reasoning World" assert strip_thinking_tags(text) == "Hello World" def test_no_tags(self): from researchclaw.utils.thinking_tags import strip_thinking_tags text = "Normal text without tags" assert strip_thinking_tags(text) == text def test_empty_string(self): from researchclaw.utils.thinking_tags import strip_thinking_tags assert strip_thinking_tags("") == "" def test_nested_code_preserved(self): """Literal in code blocks should NOT be corrupted when used by chat() without strip_thinking=True.""" text = '```python\n# The tag is used by...\nprint("hello")\n```' # Without stripping, text is untouched assert "" in text def test_unclosed_tag_behavior(self): """Document the behavior: unclosed removes everything after it.""" from researchclaw.utils.thinking_tags import strip_thinking_tags text = "Prefix reasoning that never closes" result = strip_thinking_tags(text) # The unclosed tag strips everything 
after assert "Prefix" in result assert "reasoning" not in result # ========================================================================= # FigureDecisionAgent.execute() — full integration with mock LLM # ========================================================================= class TestDecisionAgentExecute: def test_llm_decision(self): from researchclaw.agents.figure_agent.decision import FigureDecisionAgent llm_response = json.dumps([ {"section": "Method", "figure_type": "architecture_diagram", "backend": "image", "description": "Arch", "priority": 1}, {"section": "Results", "figure_type": "bar_comparison", "backend": "code", "description": "Results", "priority": 1}, {"section": "Results", "figure_type": "heatmap", "backend": "code", "description": "Heatmap", "priority": 2}, ]) agent = FigureDecisionAgent(_FakeLLM(llm_response), min_figures=3) result = agent.execute({ "topic": "Graph anomaly detection", "hypothesis": "GRACE improves detection", "paper_draft": "# Introduction\n...", "has_experiments": True, "condition_summaries": {"proposed": {}, "baseline": {}}, }) assert result.success assert result.data["total"] >= 3 assert len(result.data["code_figures"]) >= 1 assert len(result.data["image_figures"]) >= 1 def test_fallback_on_bad_llm(self): """When LLM returns garbage, heuristic fallback should kick in.""" from researchclaw.agents.figure_agent.decision import FigureDecisionAgent agent = FigureDecisionAgent( _FakeLLM("This is not JSON"), min_figures=3, ) result = agent.execute({ "topic": "Test topic", "has_experiments": True, "condition_summaries": {"a": {}, "b": {}}, }) assert result.success # fallback succeeds assert result.data["total"] >= 3 def test_fallback_on_no_llm(self): """When LLM is None, heuristic fallback should work.""" from researchclaw.agents.figure_agent.decision import FigureDecisionAgent agent = FigureDecisionAgent(None, min_figures=2) result = agent.execute({ "topic": "Test", "has_experiments": False, "condition_summaries": {}, }) assert 
result.success assert result.data["total"] >= 2 # ========================================================================= # CWD regression test (Issue #2) # ========================================================================= class TestRendererCwd: """Verify the CWD is set to output_dir, not its parent.""" def test_local_cwd_is_output_dir(self, tmp_path): """Scripts using relative savefig should write to output_dir.""" from researchclaw.agents.figure_agent.renderer import RendererAgent agent = RendererAgent(_FakeLLM(), timeout_sec=10, use_docker=False) output_dir = tmp_path / "charts" with mock.patch("subprocess.run") as mock_run: mock_run.return_value = subprocess.CompletedProcess( args=[], returncode=0, stdout="", stderr="" ) agent._execute_local( script_path=tmp_path / "test.py", output_dir=output_dir, ) call_kwargs = mock_run.call_args cwd = call_kwargs[1]["cwd"] if isinstance(call_kwargs[1], dict) else None # CWD should be output_dir, NOT output_dir.parent assert cwd == str(output_dir.resolve()) # ========================================================================= # chat(strip_thinking=True) — opt-in parameter (Issue #1 fix) # ========================================================================= class TestChatStripThinking: """Verify the opt-in strip_thinking parameter on LLMClient.chat().""" def test_strip_thinking_false_by_default(self): """Default chat() should NOT strip tags.""" from researchclaw.llm.client import LLMClient, LLMConfig, LLMResponse config = LLMConfig( base_url="http://fake", api_key="fake-key", primary_model="test-model", ) client = LLMClient(config) response_with_think = ( 'internal reasoningThe actual answer is 42.' 
) fake_api_response = { "choices": [{ "message": {"content": response_with_think}, "finish_reason": "stop", }], "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30}, "model": "test-model", } with mock.patch("urllib.request.urlopen") as mock_urlopen: mock_resp = mock.MagicMock() mock_resp.read.return_value = json.dumps(fake_api_response).encode() mock_resp.__enter__ = mock.MagicMock(return_value=mock_resp) mock_resp.__exit__ = mock.MagicMock(return_value=False) mock_urlopen.return_value = mock_resp result = client.chat( [{"role": "user", "content": "test"}], strip_thinking=False, ) # With strip_thinking=False, tags are preserved assert "" in result.content def test_strip_thinking_true_removes_tags(self): """chat(strip_thinking=True) should strip tags.""" from researchclaw.llm.client import LLMClient, LLMConfig config = LLMConfig( base_url="http://fake", api_key="fake-key", primary_model="test-model", ) client = LLMClient(config) response_with_think = ( 'internal reasoningThe actual answer is 42.' ) fake_api_response = { "choices": [{ "message": {"content": response_with_think}, "finish_reason": "stop", }], "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30}, "model": "test-model", } with mock.patch("urllib.request.urlopen") as mock_urlopen: mock_resp = mock.MagicMock() mock_resp.read.return_value = json.dumps(fake_api_response).encode() mock_resp.__enter__ = mock.MagicMock(return_value=mock_resp) mock_resp.__exit__ = mock.MagicMock(return_value=False) mock_urlopen.return_value = mock_resp result = client.chat( [{"role": "user", "content": "test"}], strip_thinking=True, ) # With strip_thinking=True, tags are removed assert "" not in result.content assert "The actual answer is 42." 
in result.content # ========================================================================= # LaTeX converter — display math $$...$$ fix # ========================================================================= class TestLatexDisplayMath: """Verify the $$...$$ → equation environment fix in converter.py.""" def test_dollar_dollar_to_equation(self): """$$...$$ display math should become \\begin{equation}.""" from researchclaw.templates.converter import _convert_block md = ( "Some text before.\n" "\n" "$$\\alpha_{ij} = \\frac{x}{y}$$\n" "\n" "Some text after." ) result = _convert_block(md) assert "\\begin{equation}" in result assert "\\end{equation}" in result assert "\\alpha_{ij}" in result # Should NOT contain escaped $$ assert "\\$\\$" not in result def test_multiline_dollar_dollar(self): """$$...$$ spanning multiple lines should also convert.""" from researchclaw.templates.converter import _convert_block md = ( "$$\n" "\\mathcal{L} = -\\log \\frac{a}{b}\n" "$$\n" ) result = _convert_block(md) assert "\\begin{equation}" in result assert "\\mathcal{L}" in result def test_inline_dollar_dollar_not_escaped(self): """$$ in inline context should not be escaped to \\$\\$.""" from researchclaw.templates.converter import _convert_inline text = "The formula $$x+y$$ is important" result = _convert_inline(text) # Should not contain \\textasciicircum or \\$ assert "\\textasciicircum" not in result # ========================================================================= # LaTeX converter — figure [t] placement # ========================================================================= class TestLatexFigurePlacement: """Verify figures use [t] placement specifier.""" def test_figure_uses_top_placement(self): from researchclaw.templates.converter import _render_figure result = _render_figure("Test Caption", "charts/test.png") assert "\\begin{figure}[t]" in result assert "[ht]" not in result def test_figure_has_centering(self): from researchclaw.templates.converter import 
_render_figure result = _render_figure("My Figure", "path/to/image.png") assert "\\centering" in result assert "\\includegraphics" in result assert "\\caption{My Figure}" in result assert "\\label{fig:" in result # ========================================================================= # Pipeline wrapper — _chat_with_prompt strip_thinking default # ========================================================================= class TestChatWithPromptStripThinking: """Verify _chat_with_prompt passes strip_thinking to llm.chat().""" def test_default_strips_thinking(self): """_chat_with_prompt should pass strip_thinking=True by default.""" from unittest.mock import MagicMock from researchclaw.pipeline.executor import _chat_with_prompt from researchclaw.llm.client import LLMResponse mock_llm = MagicMock() mock_llm.chat.return_value = LLMResponse( content="clean output", model="test", finish_reason="stop", ) result = _chat_with_prompt(mock_llm, system="sys", user="hello") call_kwargs = mock_llm.chat.call_args assert call_kwargs.kwargs.get("strip_thinking") is True def test_can_disable_stripping(self): """_chat_with_prompt(strip_thinking=False) should forward the flag.""" from unittest.mock import MagicMock from researchclaw.pipeline.executor import _chat_with_prompt from researchclaw.llm.client import LLMResponse mock_llm = MagicMock() mock_llm.chat.return_value = LLMResponse( content="reasoningoutput", model="test", finish_reason="stop", ) _chat_with_prompt( mock_llm, system="sys", user="hello", strip_thinking=False, ) call_kwargs = mock_llm.chat.call_args assert call_kwargs.kwargs.get("strip_thinking") is False ================================================ FILE: tests/test_domain_detector.py ================================================ """Tests for domain detection and profile loading.""" from __future__ import annotations import pytest from pathlib import Path from researchclaw.domains.detector import ( DomainProfile, ExperimentParadigm, MetricType, 
    detect_domain,
    detect_domain_id,
    get_generic_profile,
    get_profile,
    is_ml_domain,
    load_all_profiles,
    _keyword_detect,
    _profile_cache,
)


# ---------------------------------------------------------------------------
# Profile loading tests
# ---------------------------------------------------------------------------
class TestProfileLoading:
    def setup_method(self):
        # Reset the module-level cache so each test loads profiles fresh.
        _profile_cache.clear()

    def test_load_all_profiles_returns_dict(self):
        profiles = load_all_profiles()
        assert isinstance(profiles, dict)
        assert len(profiles) >= 10  # we created 14 profiles

    def test_profiles_have_required_fields(self):
        profiles = load_all_profiles()
        for domain_id, profile in profiles.items():
            assert profile.domain_id == domain_id
            assert profile.display_name
            assert profile.experiment_paradigm
            assert profile.entry_point

    def test_get_profile_existing(self):
        profile = get_profile("ml_vision")
        assert profile is not None
        assert profile.domain_id == "ml_vision"
        assert profile.display_name == "Computer Vision (ML)"
        assert profile.gpu_required is True

    def test_get_profile_nonexistent(self):
        profile = get_profile("nonexistent_domain_xyz")
        assert profile is None

    def test_get_generic_profile(self):
        profile = get_generic_profile()
        assert profile.domain_id == "generic"
        assert "numpy" in profile.core_libraries

    def test_ml_profiles_exist(self):
        for domain_id in ["ml_vision", "ml_nlp", "ml_rl", "ml_generic"]:
            profile = get_profile(domain_id)
            assert profile is not None, f"Missing profile: {domain_id}"

    def test_physics_profiles_exist(self):
        for domain_id in ["physics_simulation", "physics_pde"]:
            profile = get_profile(domain_id)
            assert profile is not None, f"Missing profile: {domain_id}"

    def test_other_domain_profiles_exist(self):
        for domain_id in [
            "mathematics_numerical",
            "chemistry_qm",
            "chemistry_molprop",
            "biology_singlecell",
            "economics_empirical",
            "security_detection",
            "robotics_control",
        ]:
            profile = get_profile(domain_id)
            assert profile is not None, f"Missing profile: {domain_id}"

    def test_physics_profile_paradigm(self):
        profile = get_profile("physics_pde")
        assert profile is not None
        assert profile.experiment_paradigm == "convergence"
        assert "convergence_order_fit" in profile.statistical_tests

    def test_economics_profile_paradigm(self):
        profile = get_profile("economics_empirical")
        assert profile is not None
        assert profile.experiment_paradigm == "progressive_spec"
        assert "hausman_test" in profile.statistical_tests


# ---------------------------------------------------------------------------
# Keyword detection tests
# ---------------------------------------------------------------------------
class TestKeywordDetection:
    def test_ml_vision_keywords(self):
        assert _keyword_detect("image classification with ResNet") == "ml_vision"
        assert _keyword_detect("convolutional neural network for object detection") == "ml_vision"

    def test_ml_nlp_keywords(self):
        assert _keyword_detect("text classification using BERT") == "ml_nlp"
        assert _keyword_detect("natural language processing transformer") == "ml_nlp"

    def test_ml_rl_keywords(self):
        assert _keyword_detect("reinforcement learning policy gradient") == "ml_rl"
        assert _keyword_detect("actor-critic algorithm for robot control") == "ml_rl"

    def test_physics_keywords(self):
        assert _keyword_detect("molecular dynamics simulation with Lennard-Jones") == "physics_simulation"
        assert _keyword_detect("finite element method for Navier-Stokes equation") == "physics_pde"

    def test_chemistry_keywords(self):
        assert _keyword_detect("DFT calculation with PySCF") == "chemistry_qm"
        assert _keyword_detect("molecular property prediction using RDKit fingerprints") == "chemistry_molprop"

    def test_biology_keywords(self):
        assert _keyword_detect("single-cell RNA-seq analysis with scanpy") == "biology_singlecell"

    def test_economics_keywords(self):
        assert _keyword_detect("panel data regression with fixed effects") == "economics_empirical"
        assert _keyword_detect("instrumental variable causal inference") == "economics_empirical"

    def test_math_keywords(self):
        assert _keyword_detect("Runge-Kutta ODE solver convergence") == "mathematics_numerical"
        assert _keyword_detect("numerical analysis of quadrature methods") == "mathematics_numerical"

    def test_security_keywords(self):
        assert _keyword_detect("intrusion detection system for network traffic") == "security_detection"

    def test_robotics_keywords(self):
        assert _keyword_detect("robot manipulation with MuJoCo") == "robotics_control"

    def test_generic_ml_fallback(self):
        assert _keyword_detect("neural network training with pytorch") == "ml_generic"
        assert _keyword_detect("deep learning for regression") == "ml_generic"

    def test_unknown_topic(self):
        assert _keyword_detect("cooking recipes for italian food") is None

    def test_case_insensitive(self):
        assert _keyword_detect("IMAGE CLASSIFICATION WITH RESNET") == "ml_vision"
        assert _keyword_detect("DFT Calculation") == "chemistry_qm"


# ---------------------------------------------------------------------------
# detect_domain tests
# ---------------------------------------------------------------------------
class TestDetectDomain:
    def test_detect_ml_vision(self):
        profile = detect_domain("image classification on CIFAR-10")
        assert is_ml_domain(profile)
        assert profile.domain_id == "ml_vision"

    def test_detect_physics(self):
        profile = detect_domain("molecular dynamics simulation of Lennard-Jones fluid")
        assert profile.domain_id == "physics_simulation"
        assert not is_ml_domain(profile)

    def test_detect_with_hypotheses(self):
        profile = detect_domain(
            topic="novel numerical scheme",
            hypotheses="We propose a 4th order finite difference scheme for the Poisson equation",
        )
        assert profile.domain_id == "physics_pde"

    def test_detect_generic_fallback(self):
        profile = detect_domain("studying the behavior of abstract systems")
        assert profile.domain_id == "generic"

    def test_detect_domain_id_shortcut(self):
        domain_id = detect_domain_id("image classification")
        assert domain_id == "ml_vision"
        domain_id = detect_domain_id("cooking recipes")
        assert domain_id == "generic"


# ---------------------------------------------------------------------------
# is_ml_domain tests
# ---------------------------------------------------------------------------
class TestIsMLDomain:
    def test_ml_domains(self):
        for domain_id in ["ml_vision", "ml_nlp", "ml_rl", "ml_generic"]:
            profile = get_profile(domain_id)
            assert profile is not None
            assert is_ml_domain(profile)

    def test_non_ml_domains(self):
        for domain_id in ["physics_simulation", "chemistry_qm", "economics_empirical"]:
            profile = get_profile(domain_id)
            assert profile is not None
            assert not is_ml_domain(profile)

    def test_generic_not_ml(self):
        profile = get_generic_profile()
        assert not is_ml_domain(profile)


# ---------------------------------------------------------------------------
# DomainProfile dataclass tests
# ---------------------------------------------------------------------------
class TestDomainProfile:
    def test_default_values(self):
        profile = DomainProfile(domain_id="test", display_name="Test")
        assert profile.experiment_paradigm == ExperimentParadigm.COMPARISON.value
        assert profile.entry_point == "main.py"
        assert profile.gpu_required is False
        assert "paired_t_test" in profile.statistical_tests

    def test_custom_values(self):
        profile = DomainProfile(
            domain_id="custom",
            display_name="Custom Domain",
            experiment_paradigm="convergence",
            gpu_required=True,
            core_libraries=["numpy", "custom_lib"],
        )
        assert profile.experiment_paradigm == "convergence"
        assert profile.gpu_required is True
        assert "custom_lib" in profile.core_libraries


# ---------------------------------------------------------------------------
# Enum tests
# ---------------------------------------------------------------------------
class TestEnums:
    def test_experiment_paradigm_values(self):
        assert ExperimentParadigm.COMPARISON.value == "comparison"
        assert ExperimentParadigm.CONVERGENCE.value == "convergence"
        assert ExperimentParadigm.PROGRESSIVE_SPEC.value == "progressive_spec"
        assert ExperimentParadigm.SIMULATION.value == "simulation"

    def test_metric_type_values(self):
        assert MetricType.SCALAR.value == "scalar"
        assert MetricType.TABLE.value == "table"
        assert MetricType.CONVERGENCE.value == "convergence"


# ---------------------------------------------------------------------------
# Domain detection accuracy test (50-topic benchmark)
# ---------------------------------------------------------------------------
class TestDetectionAccuracy:
    """Test domain detection accuracy on a diverse set of topics."""

    TOPIC_EXPECTATIONS = [
        # ML topics
        ("Image classification with ResNet on CIFAR-10", "ml_vision"),
        ("Object detection using YOLO", "ml_vision"),
        ("Text sentiment analysis with BERT", "ml_nlp"),
        ("Language model fine-tuning", "ml_nlp"),
        ("Reinforcement learning for Atari games", "ml_rl"),
        ("Policy gradient optimization in continuous control", "ml_rl"),
        ("Graph neural network for node classification", "ml_graph"),
        ("Knowledge distillation from large teacher models", "ml_compression"),
        ("GAN for image synthesis", "ml_generative"),
        ("Tabular data prediction with XGBoost", "ml_tabular"),
        ("Deep learning regression model", "ml_generic"),
        ("Neural network for time series forecasting", "ml_generic"),
        # Physics topics
        ("Molecular dynamics of Lennard-Jones particles", "physics_simulation"),
        ("N-body gravitational simulation", "physics_simulation"),
        ("Symplectic integrator for Hamiltonian systems", "physics_simulation"),
        ("Finite element solution of Poisson equation", "physics_pde"),
        ("Heat equation solver comparison", "physics_pde"),
        ("Navier-Stokes finite difference scheme", "physics_pde"),
        # Chemistry topics
        ("Hartree-Fock calculation for small molecules", "chemistry_qm"),
        ("DFT energy with PySCF", "chemistry_qm"),
        ("Molecular property prediction from SMILES", "chemistry_molprop"),
        ("Drug binding affinity with RDKit fingerprints", "chemistry_molprop"),
        # Biology topics
        ("Single-cell clustering with scanpy", "biology_singlecell"),
        ("scRNA-seq differential expression analysis", "biology_singlecell"),
        ("Genome variant calling pipeline", "biology_genomics"),
        ("Protein folding prediction", "biology_protein"),
        # Economics topics
        ("Panel data regression with fixed effects", "economics_empirical"),
        ("Instrumental variable estimation", "economics_empirical"),
        ("Causal inference with difference-in-differences", "economics_empirical"),
        # Math topics
        ("Runge-Kutta ODE solver convergence analysis", "mathematics_numerical"),
        ("Numerical quadrature comparison", "mathematics_numerical"),
        ("Convex optimization benchmark", "mathematics_optimization"),
        # Security topics
        ("Network intrusion detection system", "security_detection"),
        ("Malware classification using random forest", "security_detection"),
        # Robotics topics
        ("Robot manipulation policy learning", "robotics_control"),
        ("Locomotion control with MuJoCo", "robotics_control"),
    ]

    def test_keyword_detection_accuracy(self):
        """Test that keyword detection achieves > 90% accuracy."""
        correct = 0
        total = len(self.TOPIC_EXPECTATIONS)
        for topic, expected_domain in self.TOPIC_EXPECTATIONS:
            detected = _keyword_detect(topic)
            if detected == expected_domain:
                correct += 1
        accuracy = correct / total
        assert accuracy > 0.90, (
            f"Keyword detection accuracy: {accuracy:.1%} ({correct}/{total}). "
            f"Expected > 90%."
        )

    def test_full_detection_accuracy(self):
        """Test that full detect_domain achieves > 90% accuracy."""
        correct = 0
        total = len(self.TOPIC_EXPECTATIONS)
        for topic, expected_domain in self.TOPIC_EXPECTATIONS:
            profile = detect_domain(topic)
            if profile.domain_id == expected_domain:
                correct += 1
        accuracy = correct / total
        assert accuracy > 0.90, (
            f"Full detection accuracy: {accuracy:.1%} ({correct}/{total}). "
            f"Expected > 90%."
        )


================================================
FILE: tests/test_entry_point_validation.py
================================================
"""Tests for entry point path traversal validation."""
from __future__ import annotations

from pathlib import Path
from unittest.mock import patch

from researchclaw.experiment.sandbox import (
    ExperimentSandbox,
    validate_entry_point,
    validate_entry_point_resolved,
)


# ── Unit tests: validate_entry_point (syntax) ─────────────────────────
class TestValidateEntryPoint:
    """Syntax-only checks — no filesystem needed."""

    def test_valid_entry_point(self) -> None:
        assert validate_entry_point("main.py") is None

    def test_valid_nested_entry_point(self) -> None:
        assert validate_entry_point("src/train.py") is None

    def test_valid_dot_slash_prefix(self) -> None:
        assert validate_entry_point("./main.py") is None

    def test_valid_dot_in_middle(self) -> None:
        assert validate_entry_point("src/./train.py") is None

    def test_valid_deeply_nested(self) -> None:
        assert validate_entry_point("a/b/c/d/main.py") is None

    def test_rejects_absolute_path(self) -> None:
        err = validate_entry_point("/etc/passwd")
        assert err is not None
        assert "relative" in err.lower() or "absolute" in err.lower()

    def test_rejects_path_traversal(self) -> None:
        err = validate_entry_point("../../../etc/passwd")
        assert err is not None
        assert ".." in err

    def test_rejects_dotdot_in_middle(self) -> None:
        err = validate_entry_point("src/../../etc/passwd")
        assert err is not None
        assert ".." in err

    def test_rejects_empty_string(self) -> None:
        err = validate_entry_point("")
        assert err is not None
        assert "empty" in err.lower()

    def test_rejects_whitespace_only(self) -> None:
        err = validate_entry_point(" ")
        assert err is not None
        assert "empty" in err.lower()


# ── Unit tests: validate_entry_point_resolved (containment) ───────────
class TestValidateEntryPointResolved:
    """Resolve-based checks — needs a real staging directory."""

    def test_valid_path_passes(self, tmp_path: Path) -> None:
        (tmp_path / "main.py").write_text("pass")
        assert validate_entry_point_resolved(tmp_path, "main.py") is None

    def test_symlink_escape_rejected(self, tmp_path: Path) -> None:
        """A symlink pointing outside staging must be caught."""
        escape_target = tmp_path / "outside" / "secret.py"
        escape_target.parent.mkdir()
        escape_target.write_text("print('escaped!')")
        staging = tmp_path / "staging"
        staging.mkdir()
        (staging / "legit.py").symlink_to(escape_target)
        err = validate_entry_point_resolved(staging, "legit.py")
        assert err is not None
        assert "escapes" in err.lower()

    def test_nested_valid_path_passes(self, tmp_path: Path) -> None:
        sub = tmp_path / "src"
        sub.mkdir()
        (sub / "train.py").write_text("pass")
        assert validate_entry_point_resolved(tmp_path, "src/train.py") is None


# ── Integration tests: ExperimentSandbox.run_project() ────────────────
class TestExperimentSandboxEntryPointValidation:
    """Verify validation is wired into ExperimentSandbox.run_project()."""

    def _make_sandbox(self, tmp_path: Path) -> ExperimentSandbox:
        from researchclaw.config import SandboxConfig

        cfg = SandboxConfig()
        return ExperimentSandbox(cfg, tmp_path / "work")

    def test_rejects_path_traversal(self, tmp_path: Path) -> None:
        project = tmp_path / "proj"
        project.mkdir()
        (project / "main.py").write_text("print('hi')")
        sandbox = self._make_sandbox(tmp_path)
        # Create escape target so .exists() alone wouldn't catch it
        work = tmp_path / "work"
        work.mkdir(parents=True, exist_ok=True)
        (work / "escape.py").write_text("print('escaped!')")
        with patch("subprocess.run") as mock_run:
            result = sandbox.run_project(project, entry_point="../escape.py")
            assert result.returncode == -1
            assert ".." in result.stderr
            mock_run.assert_not_called()

    def test_rejects_absolute_path(self, tmp_path: Path) -> None:
        project = tmp_path / "proj"
        project.mkdir()
        (project / "main.py").write_text("print('hi')")
        sandbox = self._make_sandbox(tmp_path)
        with patch("subprocess.run") as mock_run:
            result = sandbox.run_project(project, entry_point="/etc/passwd")
            assert result.returncode == -1
            assert "relative" in result.stderr.lower() or "absolute" in result.stderr.lower()
            mock_run.assert_not_called()

    # NOTE: A symlink integration test is not included here because the
    # copy loop (write_bytes/read_bytes) follows symlinks and creates
    # regular files in staging. The resolve check is defense-in-depth
    # for future copy mechanism changes; see
    # TestValidateEntryPointResolved.test_symlink_escape_rejected for
    # the unit-level proof that the function catches symlink escapes.
================================================
FILE: tests/test_experiment_diagnosis.py
================================================
"""Tests for experiment_diagnosis — failure analysis agent."""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from researchclaw.pipeline.experiment_diagnosis import (
    DeficiencyType,
    ExperimentDiagnosis,
    ExperimentQualityAssessment,
    PaperMode,
    assess_experiment_quality,
    diagnose_experiment,
)

# Root of the on-disk run artifacts used by the integration tests below;
# tests skip gracefully when a given artifact directory is absent.
ARTIFACTS = Path(__file__).resolve().parent.parent / "artifacts"


# ---------------------------------------------------------------------------
# Unit tests — individual checks
# ---------------------------------------------------------------------------


class TestMissingDependency:
    """Deficiency detection from dependency-related output patterns."""

    def test_detects_module_not_found(self):
        diag = diagnose_experiment(
            experiment_summary={"condition_summaries": {}, "best_run": {"metrics": {}}},
            stdout="",
            stderr="ModuleNotFoundError: No module named 'utils'",
        )
        types = {d.type for d in diag.deficiencies}
        assert DeficiencyType.MISSING_DEPENDENCY in types

    def test_detects_box2d(self):
        # Box2D absence is reported on stdout as a warning, not stderr.
        diag = diagnose_experiment(
            experiment_summary={"condition_summaries": {}, "best_run": {"metrics": {}}},
            stdout="BOX2D_WARNING: Box2D/LunarLander-v3 not available; skipping",
            stderr="",
        )
        types = {d.type for d in diag.deficiencies}
        assert DeficiencyType.MISSING_DEPENDENCY in types


class TestPermissionError:
    """Deficiency detection for permission / access failures."""

    def test_detects_hf_permission(self):
        diag = diagnose_experiment(
            experiment_summary={"condition_summaries": {}, "best_run": {"metrics": {}}},
            stdout="",
            stderr="PermissionError: Cannot download huggingface model",
        )
        types = {d.type for d in diag.deficiencies}
        assert DeficiencyType.PERMISSION_ERROR in types


class TestTimeGuard:
    """TIME_GUARD_DOMINANT fires only when most conditions were skipped."""

    def test_detects_dominant_time_guard(self):
        # 1 of 4 planned conditions completed; 3 skipped by the time guard.
        summary = {
            "condition_summaries": {"CondA": {"metrics": {"metric": 80.0}}},
            "best_run": {"metrics": {}},
        }
        plan = {"conditions": [{"name": "CondA"}, {"name": "CondB"}, {"name": "CondC"}, {"name": "CondD"}]}
        diag = diagnose_experiment(
            experiment_summary=summary,
            experiment_plan=plan,
            stdout="TIME_GUARD: skipping CondB\nTIME_GUARD: skipping CondC\nTIME_GUARD: skipping CondD",
        )
        types = {d.type for d in diag.deficiencies}
        assert DeficiencyType.TIME_GUARD_DOMINANT in types

    def test_no_time_guard_if_most_complete(self):
        summary = {
            "condition_summaries": {
                "A": {"metrics": {"metric": 1.0}},
                "B": {"metrics": {"metric": 2.0}},
                "C": {"metrics": {"metric": 3.0}},
            },
            "best_run": {"metrics": {}},
        }
        plan = {"conditions": [{"name": "A"}, {"name": "B"}, {"name": "C"}, {"name": "D"}]}
        diag = diagnose_experiment(experiment_summary=summary, experiment_plan=plan)
        types = {d.type for d in diag.deficiencies}
        # 1/4 skipped = 25%, below 50% threshold
        assert DeficiencyType.TIME_GUARD_DOMINANT not in types


class TestSyntheticData:
    """Synthetic-data fallback messages on stdout are flagged."""

    def test_detects_synthetic_fallback(self):
        diag = diagnose_experiment(
            experiment_summary={"condition_summaries": {}, "best_run": {"metrics": {}}},
            stdout="[data] WARNING: Alpaca load failed ... using synthetic data.",
        )
        types = {d.type for d in diag.deficiencies}
        assert DeficiencyType.SYNTHETIC_DATA_FALLBACK in types


class TestGPUOOM:
    """CUDA out-of-memory errors on stderr are flagged as GPU_OOM."""

    def test_detects_oom(self):
        diag = diagnose_experiment(
            experiment_summary={"condition_summaries": {}, "best_run": {"metrics": {}}},
            stderr="RuntimeError: CUDA out of memory. Tried to allocate 2.00 GiB",
        )
        types = {d.type for d in diag.deficiencies}
        assert DeficiencyType.GPU_OOM in types


class TestIdenticalConditions:
    """Ablation warnings in the summary surface as IDENTICAL_CONDITIONS."""

    def test_detects_from_ablation_warnings(self):
        summary = {
            "condition_summaries": {"A": {"metrics": {"m": 1}}, "B": {"metrics": {"m": 1}}},
            "best_run": {"metrics": {}},
            "ablation_warnings": [
                "ABLATION FAILURE: Conditions 'A' and 'B' produce identical outputs across all 1 metrics."
            ],
        }
        diag = diagnose_experiment(experiment_summary=summary)
        types = {d.type for d in diag.deficiencies}
        assert DeficiencyType.IDENTICAL_CONDITIONS in types


class TestCodeCrash:
    """Python tracebacks on stderr are classified as CODE_CRASH."""

    def test_detects_traceback(self):
        diag = diagnose_experiment(
            experiment_summary={"condition_summaries": {}, "best_run": {"metrics": {}}},
            stderr=(
                "Traceback (most recent call last):\n"
                " File 'main.py', line 42, in main\n"
                " result = train(model)\n"
                "TypeError: train() missing argument 'data'\n"
            ),
        )
        types = {d.type for d in diag.deficiencies}
        assert DeficiencyType.CODE_CRASH in types


# ---------------------------------------------------------------------------
# Quality assessment
# ---------------------------------------------------------------------------


class TestQualityAssessment:
    """Mapping from summary richness to the recommended paper mode."""

    def test_full_paper_mode(self):
        # 3 conditions, each with 2 seeds, qualifies as FULL_PAPER.
        summary = {
            "condition_summaries": {
                "A": {"metrics": {"metric": 80.0}},
                "B": {"metrics": {"metric": 85.0}},
                "C": {"metrics": {"metric": 90.0}},
            },
            "best_run": {
                "metrics": {
                    "A/0/m": 80.0,
                    "A/1/m": 81.0,
                    "B/0/m": 85.0,
                    "B/1/m": 86.0,
                    "C/0/m": 90.0,
                    "C/1/m": 91.0,
                },
            },
        }
        qa = assess_experiment_quality(summary)
        assert qa.mode == PaperMode.FULL_PAPER
        assert qa.sufficient

    def test_preliminary_study_mode(self):
        # 2 conditions, single seed each — not sufficient for a full paper.
        summary = {
            "condition_summaries": {
                "A": {"metrics": {"metric": 80.0}},
                "B": {"metrics": {"metric": 85.0}},
            },
            "best_run": {"metrics": {"A/0/m": 80.0, "B/0/m": 85.0}},
        }
        qa = assess_experiment_quality(summary)
        assert qa.mode == PaperMode.PRELIMINARY_STUDY
        assert not qa.sufficient

    def test_technical_report_no_conditions(self):
        summary = {
            "condition_summaries": {},
            "best_run": {"metrics": {}},
        }
        qa = assess_experiment_quality(summary)
        assert qa.mode == PaperMode.TECHNICAL_REPORT
        assert not qa.sufficient

    def test_technical_report_synthetic_data(self):
        # Synthetic data demotes the result to TECHNICAL_REPORT even with
        # a completed condition.
        summary = {
            "condition_summaries": {"A": {"metrics": {"metric": 80.0}}},
            "best_run": {"metrics": {}, "stdout": "using synthetic data"},
        }
        qa = assess_experiment_quality(summary)
        assert qa.mode == PaperMode.TECHNICAL_REPORT


# ---------------------------------------------------------------------------
# Repair prompt generation
# ---------------------------------------------------------------------------


class TestRepairPrompt:
    """Repair-prompt text and diagnosis serialization."""

    def test_generates_prompt(self):
        diag = diagnose_experiment(
            experiment_summary={"condition_summaries": {}, "best_run": {"metrics": {}}},
            stderr="ModuleNotFoundError: No module named 'special_lib'",
        )
        prompt = diag.to_repair_prompt()
        assert "special_lib" in prompt
        assert "DIAGNOSIS" in prompt
        assert "CRITICAL" in prompt

    def test_serialization(self):
        diag = diagnose_experiment(
            experiment_summary={"condition_summaries": {"A": {"metrics": {"m": 1}}}, "best_run": {"metrics": {}}},
        )
        d = diag.to_dict()
        assert isinstance(d, dict)
        assert "deficiencies" in d
        assert "conditions_completed" in d


# ---------------------------------------------------------------------------
# Integration — real artifacts
# ---------------------------------------------------------------------------


class TestRealArtifacts:
    """Run diagnosis/assessment against recorded pipeline artifacts."""

    def _load(self, run_id: str) -> tuple[dict, dict | None]:
        # Locate the run directory by suffix; skip (not fail) when absent
        # so the suite still passes on machines without the artifacts.
        pattern = f"rc-*-{run_id}"
        matches = sorted(ARTIFACTS.glob(pattern))
        if not matches:
            pytest.skip(f"Artifact {run_id} not found")
        base = matches[0]
        summary_path = base / "stage-14" / "experiment_summary.json"
        ref_path = base / "stage-13" / "refinement_log.json"
        if not summary_path.exists():
            pytest.skip(f"No experiment_summary for {run_id}")
        summary = json.loads(summary_path.read_text())
        ref_log = json.loads(ref_path.read_text()) if ref_path.exists() else None
        return summary, ref_log

    def test_run_e57360_diagnosis(self):
        """Run 38 — 3/8 conditions completed, Box2D missing."""
        summary, ref_log = self._load("e57360")
        qa = assess_experiment_quality(summary, ref_log)
        # Should identify issues and NOT rate as full_paper
        assert qa.mode != PaperMode.FULL_PAPER or len(qa.deficiencies) > 0

    def test_run_8b4a1b_diagnosis(self):
        """Run 8b4a1b — all NaN, permission errors."""
        summary, ref_log = self._load("8b4a1b")
        qa = assess_experiment_quality(summary, ref_log)
        # Should be technical_report or preliminary_study at best
        assert qa.mode in (PaperMode.TECHNICAL_REPORT, PaperMode.PRELIMINARY_STUDY)


class TestDatasetNotFoundError:
    """BUG-203: HuggingFace DatasetNotFoundError should be caught."""

    def test_detects_hf_dataset_not_found(self):
        stderr = (
            "Traceback (most recent call last):\n"
            " File \"/workspace/setup.py\", line 11, in main\n"
            "datasets.exceptions.DatasetNotFoundError: "
            "Dataset 'cifar10_corrupted' doesn't exist on the Hub or cannot be accessed.\n"
        )
        diag = diagnose_experiment(
            experiment_summary={"condition_summaries": {}, "best_run": {"metrics": {}}},
            stderr=stderr,
        )
        ds_issues = [d for d in diag.deficiencies if d.type == DeficiencyType.DATASET_UNAVAILABLE]
        assert len(ds_issues) >= 1
        assert "HuggingFace" in ds_issues[0].description
        # Should NOT also appear as a generic CODE_CRASH
        crashes = [d for d in diag.deficiencies if d.type == DeficiencyType.CODE_CRASH]
        assert not any("DatasetNotFoundError" in c.description for c in crashes)

    def test_suggested_fix_mentions_precached(self):
        stderr = (
            "DatasetNotFoundError: Dataset 'imagenet_v2' "
            "doesn't exist on the Hub or cannot be accessed.\n"
        )
        diag = diagnose_experiment(
            experiment_summary={"condition_summaries": {}, "best_run": {"metrics": {}}},
            stderr=stderr,
        )
        ds_issues = [d for d in diag.deficiencies if d.type == DeficiencyType.DATASET_UNAVAILABLE]
        assert any("/opt/datasets" in d.suggested_fix for d in ds_issues)


class TestNearRandomAccuracy:
    """BUG-204: Detect near-random accuracy in experiment results."""

    def test_detects_near_random_cifar10(self):
        """8.91% accuracy on CIFAR-10 should be flagged."""
        diag = diagnose_experiment(
            experiment_summary={
                "condition_summaries": {"cond_a": {"metrics": {"top1_accuracy": 8.91}}},
                "metrics_summary": {"top1_accuracy": {"min": 8.42, "max": 8.91, "mean": 8.67}},
                "best_run": {"metrics": {}},
            },
        )
        hp_issues = [d for d in diag.deficiencies if d.type == DeficiencyType.HYPERPARAMETER_ISSUE]
        assert any("random chance" in d.description for d in hp_issues)

    def test_normal_accuracy_not_flagged(self):
        """73% accuracy should NOT be flagged."""
        diag = diagnose_experiment(
            experiment_summary={
                "condition_summaries": {"baseline": {"metrics": {"accuracy": 73.07}}},
                "metrics_summary": {"accuracy": {"min": 68.0, "max": 73.07, "mean": 70.5}},
                "best_run": {"metrics": {}},
            },
        )
        hp_issues = [d for d in diag.deficiencies if d.type == DeficiencyType.HYPERPARAMETER_ISSUE]
        assert not any("random chance" in d.description for d in hp_issues)

    def test_zero_accuracy_not_flagged(self):
        """0% accuracy (no data) should NOT be flagged by this check."""
        diag = diagnose_experiment(
            experiment_summary={
                "condition_summaries": {},
                "metrics_summary": {},
                "best_run": {"metrics": {}},
            },
        )
        hp_issues = [d for d in diag.deficiencies if d.type == DeficiencyType.HYPERPARAMETER_ISSUE]
        assert not any("random chance" in d.description for d in hp_issues)


class TestRealArtifactsContinued(TestRealArtifacts):
    """Continuation of real artifact tests (after TestDatasetNotFoundError)."""

    def test_run_acbdfa_diagnosis(self):
        """Run acbdfa — 2 architectures, S4D nearly random."""
        summary, ref_log = self._load("acbdfa")
        diag = diagnose_experiment(
            experiment_summary=summary,
            refinement_log=ref_log,
            stdout=summary.get("best_run", {}).get("stdout", ""),
            stderr=summary.get("best_run", {}).get("stderr", ""),
        )
        assert diag.completion_rate > 0


================================================
FILE: tests/test_experiment_repair.py
================================================
"""Tests for experiment_repair — repair loop and prompt generation."""

from __future__ import annotations

import json
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from researchclaw.pipeline.experiment_diagnosis import (
    DeficiencyType,
    Deficiency,
    ExperimentDiagnosis,
    PaperMode,
)
from researchclaw.pipeline.experiment_repair import (
    ExperimentRepairResult,
RepairCycleResult, build_repair_prompt, run_repair_loop, select_best_results, _extract_code_blocks, _build_experiment_summary_from_run, _load_experiment_code, _load_experiment_summary, _summary_quality_score, ) # --------------------------------------------------------------------------- # build_repair_prompt tests # --------------------------------------------------------------------------- class TestBuildRepairPrompt: def test_basic_prompt(self): diag = ExperimentDiagnosis( deficiencies=[ Deficiency( type=DeficiencyType.MISSING_DEPENDENCY, severity="critical", description="Missing Python package: utils", suggested_fix="Add 'utils' to requirements.txt", ) ], conditions_completed=["CondA"], conditions_failed=["CondB"], total_planned=2, completion_rate=0.5, summary="1 deficiency. 1/2 conditions completed.", ) prompt = build_repair_prompt( diagnosis=diag, original_code={"main.py": "import utils\nprint('hello')"}, time_budget_sec=2400, ) assert "EXPERIMENT REPAIR TASK" in prompt assert "utils" in prompt assert "main.py" in prompt assert "2400" in prompt def test_scope_reduction_included(self): diag = ExperimentDiagnosis( deficiencies=[ Deficiency( type=DeficiencyType.TIME_GUARD_DOMINANT, severity="major", description="Time guard killed 8/10 conditions", affected_conditions=["C3", "C4", "C5"], suggested_fix="Reduce conditions", ) ], conditions_completed=["C1", "C2"], conditions_failed=["C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"], total_planned=10, completion_rate=0.2, ) prompt = build_repair_prompt(diag, original_code={}) assert "SCOPE REDUCTION" in prompt assert "BASELINE" in prompt def test_dep_fix_section(self): diag = ExperimentDiagnosis( deficiencies=[ Deficiency( type=DeficiencyType.MISSING_DEPENDENCY, severity="critical", description="Missing Python package: box2d-py", suggested_fix="Add 'box2d-py' to requirements.txt", ), ], ) prompt = build_repair_prompt(diag, original_code={}) assert "DEPENDENCY FIXES" in prompt assert "box2d-py" in prompt def 
test_long_code_truncated(self): long_code = "x = 1\n" * 5000 diag = ExperimentDiagnosis() prompt = build_repair_prompt(diag, original_code={"big.py": long_code}) assert "truncated" in prompt def test_output_format_section(self): diag = ExperimentDiagnosis() prompt = build_repair_prompt(diag, original_code={"main.py": "pass"}) assert "OUTPUT FORMAT" in prompt assert "filename.py" in prompt # --------------------------------------------------------------------------- # ExperimentRepairResult tests # --------------------------------------------------------------------------- class TestRepairResult: def test_serialization(self): result = ExperimentRepairResult( success=False, total_cycles=2, final_mode=PaperMode.PRELIMINARY_STUDY, ) d = result.to_dict() assert d["success"] is False assert d["total_cycles"] == 2 assert d["final_mode"] == "preliminary_study" def test_serialization_with_cycles(self): diag = ExperimentDiagnosis(summary="test") result = ExperimentRepairResult( success=True, total_cycles=1, final_mode=PaperMode.FULL_PAPER, cycle_history=[ RepairCycleResult( cycle=1, diagnosis=diag, repair_applied=True, repair_description="Fixed 2 files", ), ], ) d = result.to_dict() assert d["success"] is True assert len(d["cycle_history"]) == 1 assert d["cycle_history"][0]["repair_applied"] is True assert d["cycle_history"][0]["diagnosis_summary"] == "test" # --------------------------------------------------------------------------- # Code extraction tests # --------------------------------------------------------------------------- class TestExtractCodeBlocks: def test_named_blocks(self): text = """Here are the fixed files: ```python main.py import torch print("hello") ``` ```python requirements.txt torch>=2.0 numpy ``` """ files = _extract_code_blocks(text) assert "main.py" in files assert "requirements.txt" in files assert "torch" in files["main.py"] assert "numpy" in files["requirements.txt"] def test_unnamed_block_fallback(self): text = """```python import torch model 
= torch.nn.Linear(10, 2) print("condition=Baseline metric=0.95") ```""" files = _extract_code_blocks(text) assert "main.py" in files assert "torch" in files["main.py"] def test_no_blocks(self): text = "No code here, just text." files = _extract_code_blocks(text) assert files == {} def test_path_normalization(self): text = """```python src/models/main.py import torch print("hello world, this is a test of the extraction") ```""" files = _extract_code_blocks(text) assert "main.py" in files # --------------------------------------------------------------------------- # Summary building tests # --------------------------------------------------------------------------- class TestBuildExperimentSummary: def test_basic_summary(self): run_result = { "stdout": "condition=Baseline metric=80.0\ncondition=Proposed metric=90.0", "stderr": "", "returncode": 0, "metrics": { "Baseline/0/accuracy": 80.0, "Proposed/0/accuracy": 90.0, "primary_metric": 90.0, }, "elapsed_sec": 120.0, "timed_out": False, } summary = _build_experiment_summary_from_run(run_result, {"main.py": "pass"}) assert "condition_summaries" in summary assert "Baseline" in summary["condition_summaries"] assert "Proposed" in summary["condition_summaries"] assert summary["total_conditions"] == 2 assert summary["best_run"]["status"] == "completed" def test_failed_run(self): run_result = { "stdout": "", "stderr": "Error: crash", "returncode": 1, "metrics": {}, "elapsed_sec": 5.0, "timed_out": False, } summary = _build_experiment_summary_from_run(run_result, {}) assert summary["best_run"]["status"] == "failed" assert summary["total_conditions"] == 0 def test_multi_seed_grouping(self): run_result = { "stdout": "", "stderr": "", "returncode": 0, "metrics": { "Baseline/0/accuracy": 80.0, "Baseline/1/accuracy": 82.0, "Proposed/0/accuracy": 90.0, "Proposed/1/accuracy": 92.0, }, "elapsed_sec": 300.0, "timed_out": False, } summary = _build_experiment_summary_from_run(run_result, {}) assert len(summary["condition_summaries"]) == 
2 # Mean of 80.0 and 82.0 bl = summary["condition_summaries"]["Baseline"] assert abs(bl["metrics"]["accuracy"] - 81.0) < 0.01 assert bl["n_seeds"] == 2 # --------------------------------------------------------------------------- # File loading tests # --------------------------------------------------------------------------- class TestLoadExperimentCode: def test_loads_from_stage_13(self, tmp_path): exp_dir = tmp_path / "stage-13" / "experiment_final" exp_dir.mkdir(parents=True) (exp_dir / "main.py").write_text("print('hello')") (exp_dir / "requirements.txt").write_text("torch") code = _load_experiment_code(tmp_path) assert "main.py" in code assert "requirements.txt" in code def test_loads_from_stage_10(self, tmp_path): exp_dir = tmp_path / "stage-10" / "experiment" exp_dir.mkdir(parents=True) (exp_dir / "main.py").write_text("print('hello')") code = _load_experiment_code(tmp_path) assert "main.py" in code def test_empty_when_no_code(self, tmp_path): code = _load_experiment_code(tmp_path) assert code == {} class TestLoadExperimentSummary: def test_loads_summary(self, tmp_path): stage_dir = tmp_path / "stage-14" stage_dir.mkdir() summary = {"condition_summaries": {"A": {}}} (stage_dir / "experiment_summary.json").write_text(json.dumps(summary)) result = _load_experiment_summary(tmp_path) assert result is not None assert "A" in result["condition_summaries"] # --------------------------------------------------------------------------- # select_best_results tests # --------------------------------------------------------------------------- class TestSelectBestResults: def test_picks_best_across_cycles(self, tmp_path): # Original (1 condition) s14 = tmp_path / "stage-14" s14.mkdir() (s14 / "experiment_summary.json").write_text(json.dumps({ "condition_summaries": {"A": {}}, "best_run": {"metrics": {}}, })) # Repair v1 (3 conditions — better) r1 = tmp_path / "stage-14_repair_v1" r1.mkdir() (r1 / "experiment_summary.json").write_text(json.dumps({ "condition_summaries": 
{"A": {}, "B": {}, "C": {}}, "best_run": {"metrics": {"primary_metric": 90.0}}, })) best = select_best_results(tmp_path, []) assert best is not None assert len(best["condition_summaries"]) == 3 def test_returns_none_when_empty(self, tmp_path): result = select_best_results(tmp_path, []) assert result is None # --------------------------------------------------------------------------- # Full repair loop tests (mocked) # --------------------------------------------------------------------------- class TestRunRepairLoop: def _make_run_dir(self, tmp_path, n_conditions=1, has_code=True): """Create a minimal run directory for testing.""" # Stage 14 — experiment summary s14 = tmp_path / "stage-14" s14.mkdir() (s14 / "runs").mkdir() conds = {f"Cond{i}": {"metrics": {"accuracy": 70.0 + i}} for i in range(n_conditions)} summary = { "condition_summaries": conds, "best_run": {"metrics": {f"Cond{i}/0/accuracy": 70.0 + i for i in range(n_conditions)}}, "metrics_summary": {"accuracy": {"mean": 70.5}}, } (s14 / "experiment_summary.json").write_text(json.dumps(summary)) run_data = { "stdout": "\n".join(f"condition=Cond{i} metric={70.0 + i}" for i in range(n_conditions)), "stderr": "", } (s14 / "runs" / "run_0.json").write_text(json.dumps(run_data)) # Stage 10 — experiment code if has_code: s10 = tmp_path / "stage-10" / "experiment" s10.mkdir(parents=True) (s10 / "main.py").write_text("import torch\nprint('hello')") return tmp_path def test_skips_when_already_sufficient(self, tmp_path): """If experiment is already sufficient, return immediately.""" # 3 conditions with 2+ seeds = full_paper s14 = tmp_path / "stage-14" s14.mkdir() (s14 / "runs").mkdir() summary = { "condition_summaries": { "A": {"metrics": {"m": 80.0}}, "B": {"metrics": {"m": 85.0}}, "C": {"metrics": {"m": 90.0}}, }, "best_run": { "metrics": { "A/0/m": 80.0, "A/1/m": 81.0, "B/0/m": 85.0, "B/1/m": 86.0, "C/0/m": 90.0, "C/1/m": 91.0, }, }, } (s14 / "experiment_summary.json").write_text(json.dumps(summary)) from 
researchclaw.config import ExperimentConfig, ExperimentRepairConfig class FakeConfig: class experiment: time_budget_sec = 2400 repair = ExperimentRepairConfig(enabled=True) class llm: pass result = run_repair_loop(tmp_path, FakeConfig(), "test") assert result.success is True assert result.total_cycles == 0 assert result.final_mode == PaperMode.FULL_PAPER def test_returns_failure_when_no_code(self, tmp_path): """If no experiment code found, return failure.""" s14 = tmp_path / "stage-14" s14.mkdir() (s14 / "experiment_summary.json").write_text(json.dumps({ "condition_summaries": {"A": {"metrics": {"m": 80.0}}}, "best_run": {"metrics": {}}, })) from researchclaw.config import ExperimentRepairConfig class FakeConfig: class experiment: time_budget_sec = 2400 repair = ExperimentRepairConfig(enabled=True) class llm: pass result = run_repair_loop(tmp_path, FakeConfig(), "test") assert result.success is False assert result.total_cycles == 0 def test_repair_loop_with_mocked_llm(self, tmp_path): """Test full repair loop with mocked LLM and sandbox.""" run_dir = self._make_run_dir(tmp_path, n_conditions=1) from researchclaw.config import ExperimentRepairConfig, ExperimentConfig, OpenCodeConfig class FakeConfig: class experiment: time_budget_sec = 2400 mode = "simulated" repair = ExperimentRepairConfig(enabled=True, max_cycles=1, use_opencode=False) opencode = OpenCodeConfig(enabled=False) metric_key = "primary_metric" class llm: pass # Mock the LLM to return fixed code mock_llm = MagicMock() mock_resp = MagicMock() mock_resp.content = """```python main.py import torch for cond in ["Baseline", "Proposed", "Ablation"]: for seed in range(2): acc = 80.0 + hash(cond) % 20 + seed print(f"condition={cond}/{seed}/accuracy metric={acc}") print("condition=Baseline metric=80.0") print("condition=Proposed metric=90.0") print("condition=Ablation metric=85.0") ```""" mock_llm.chat.return_value = mock_resp # Mock sandbox to return good results mock_sandbox_result = MagicMock() 
mock_sandbox_result.stdout = ( "condition=Baseline/0/accuracy metric=80.0\n" "condition=Baseline/1/accuracy metric=82.0\n" "condition=Proposed/0/accuracy metric=90.0\n" "condition=Proposed/1/accuracy metric=92.0\n" "condition=Ablation/0/accuracy metric=85.0\n" "condition=Ablation/1/accuracy metric=87.0\n" ) mock_sandbox_result.stderr = "" mock_sandbox_result.returncode = 0 mock_sandbox_result.metrics = { "Baseline/0/accuracy": 80.0, "Baseline/1/accuracy": 82.0, "Proposed/0/accuracy": 90.0, "Proposed/1/accuracy": 92.0, "Ablation/0/accuracy": 85.0, "Ablation/1/accuracy": 87.0, } mock_sandbox_result.elapsed_sec = 120.0 mock_sandbox_result.timed_out = False mock_sandbox = MagicMock() mock_sandbox.run_project.return_value = mock_sandbox_result with patch("researchclaw.llm.create_llm_client") as mock_create_llm, \ patch("researchclaw.experiment.factory.create_sandbox") as mock_create_sb: mock_create_llm.return_value = mock_llm mock_create_sb.return_value = mock_sandbox result = run_repair_loop(run_dir, FakeConfig(), "test-mock") assert result.total_cycles == 1 assert len(result.cycle_history) == 1 assert result.cycle_history[0].repair_applied is True # Check that repair files were saved repair_dir = run_dir / "stage-14_repair_v1" assert repair_dir.exists() assert (repair_dir / "experiment" / "main.py").exists() assert (repair_dir / "experiment_summary.json").exists() # --------------------------------------------------------------------------- # BUG-199: 2-part metric keys (condition/metric) in summary builder # --------------------------------------------------------------------------- class TestBuildExperimentSummaryTwoPartKeys: """BUG-199: Stage 13 refinement produces 2-part keys (condition/metric) instead of 3-part keys (condition/seed/metric). The parser must handle both formats. 
""" def test_two_part_keys_parsed(self): """2-part keys like 'Baseline/accuracy' should create conditions.""" run_result = { "stdout": "", "stderr": "", "returncode": 0, "metrics": { "Baseline/accuracy": 0.85, "Proposed/accuracy": 0.94, "Ablation/accuracy": 0.88, }, "elapsed_sec": 120.0, "timed_out": False, } summary = _build_experiment_summary_from_run(run_result, {}) assert summary["total_conditions"] == 3 assert "Baseline" in summary["condition_summaries"] assert "Proposed" in summary["condition_summaries"] assert "Ablation" in summary["condition_summaries"] assert summary["condition_summaries"]["Proposed"]["metrics"]["accuracy"] == 0.94 def test_two_part_keys_create_synthetic_seed(self): """2-part keys should create a synthetic seed '0' entry.""" run_result = { "stdout": "", "stderr": "", "returncode": 0, "metrics": { "Baseline/accuracy": 0.80, "Baseline/loss": 0.45, }, "elapsed_sec": 60.0, "timed_out": False, } summary = _build_experiment_summary_from_run(run_result, {}) bl = summary["condition_summaries"]["Baseline"] assert bl["metrics"]["accuracy"] == 0.80 assert bl["metrics"]["loss"] == 0.45 assert bl["n_seeds"] == 1 # synthetic seed "0" def test_mixed_two_and_three_part_keys(self): """Mix of 2-part and 3-part keys for different conditions.""" run_result = { "stdout": "", "stderr": "", "returncode": 0, "metrics": { # 3-part keys (with seed) "Baseline/0/accuracy": 0.80, "Baseline/1/accuracy": 0.82, # 2-part keys (Stage 13 refinement output) "Proposed/accuracy": 0.94, }, "elapsed_sec": 120.0, "timed_out": False, } summary = _build_experiment_summary_from_run(run_result, {}) assert summary["total_conditions"] == 2 # 3-part: mean of seeds bl = summary["condition_summaries"]["Baseline"] assert abs(bl["metrics"]["accuracy"] - 0.81) < 0.01 assert bl["n_seeds"] == 2 # 2-part: single value pr = summary["condition_summaries"]["Proposed"] assert pr["metrics"]["accuracy"] == 0.94 assert pr["n_seeds"] == 1 def test_empty_metrics_still_empty(self): """Empty metrics dict 
should still produce 0 conditions.""" run_result = { "stdout": "", "stderr": "", "returncode": 1, "metrics": {}, "elapsed_sec": 5.0, "timed_out": False, } summary = _build_experiment_summary_from_run(run_result, {}) assert summary["total_conditions"] == 0 # --------------------------------------------------------------------------- # BUG-198: Conditional promotion of repair summary in runner.py # --------------------------------------------------------------------------- class TestRepairSummaryPromotion: """BUG-198: runner.py should NOT overwrite a richer stage-14 summary with an empty/poorer repair result. """ def test_empty_repair_does_not_overwrite_rich_summary(self, tmp_path): """Repair result with 0 conditions must NOT replace a summary that has real conditions and metrics. """ # Create a rich existing stage-14 summary s14 = tmp_path / "stage-14" s14.mkdir() rich_summary = { "condition_summaries": { "Baseline": {"metrics": {"accuracy": 0.80}}, "Proposed": {"metrics": {"accuracy": 0.94}}, "Ablation": {"metrics": {"accuracy": 0.88}}, }, "best_run": { "metrics": { "Baseline/0/accuracy": 0.80, "Proposed/0/accuracy": 0.94, "primary_metric": 0.94, }, }, "total_conditions": 3, "total_metric_keys": 3, } (s14 / "experiment_summary.json").write_text(json.dumps(rich_summary)) # Compute scores to verify the logic rich_score = _summary_quality_score(rich_summary) empty_summary = { "condition_summaries": {}, "best_run": {"metrics": {}}, "total_conditions": 0, "total_metric_keys": 0, } empty_score = _summary_quality_score(empty_summary) # The rich summary must score higher assert rich_score > empty_score # Verify that the existing file is preserved (simulate what runner does) existing = json.loads( (s14 / "experiment_summary.json").read_text(encoding="utf-8") ) existing_score = _summary_quality_score(existing) repair_score = _summary_quality_score(empty_summary) # runner.py should NOT overwrite because repair_score <= existing_score assert repair_score <= existing_score # 
The file should still contain the rich data after = json.loads( (s14 / "experiment_summary.json").read_text(encoding="utf-8") ) assert len(after["condition_summaries"]) == 3 def test_richer_repair_does_overwrite(self, tmp_path): """Repair result with MORE conditions should replace a poorer summary.""" s14 = tmp_path / "stage-14" s14.mkdir() poor_summary = { "condition_summaries": {"A": {"metrics": {"m": 0.5}}}, "best_run": {"metrics": {}}, "total_conditions": 1, "total_metric_keys": 0, } (s14 / "experiment_summary.json").write_text(json.dumps(poor_summary)) rich_repair = { "condition_summaries": { "A": {"metrics": {"m": 0.80}}, "B": {"metrics": {"m": 0.85}}, "C": {"metrics": {"m": 0.90}}, }, "best_run": {"metrics": {"primary_metric": 0.90}}, "total_conditions": 3, "total_metric_keys": 4, } poor_score = _summary_quality_score(poor_summary) rich_score = _summary_quality_score(rich_repair) assert rich_score > poor_score ================================================ FILE: tests/test_experiment_schema.py ================================================ """Tests for the universal experiment schema.""" from __future__ import annotations import pytest import yaml from researchclaw.domains.experiment_schema import ( Condition, ConditionRole, EvaluationSpec, ExperimentType, MetricSpec, UniversalExperimentPlan, from_legacy_exp_plan, ) # --------------------------------------------------------------------------- # Condition tests # --------------------------------------------------------------------------- class TestCondition: def test_default_role(self): c = Condition(name="test") assert c.role == ConditionRole.PROPOSED.value def test_custom_role(self): c = Condition(name="baseline_method", role=ConditionRole.REFERENCE.value) assert c.role == "reference" def test_variant_with_parent(self): c = Condition( name="ablation_no_attn", role=ConditionRole.VARIANT.value, varies_from="proposed_method", variation="remove_attention", ) assert c.varies_from == "proposed_method" # 
# UniversalExperimentPlan tests
# ---------------------------------------------------------------------------


class TestUniversalExperimentPlan:
    """Behaviour of the top-level plan container and its conversions."""

    def test_empty_plan(self):
        plan = UniversalExperimentPlan()
        assert plan.conditions == []
        assert plan.experiment_type == "comparison"

    def test_plan_with_conditions(self):
        plan = UniversalExperimentPlan(
            experiment_type="comparison",
            conditions=[
                Condition(name="baseline", role="reference"),
                Condition(name="proposed", role="proposed"),
                Condition(name="ablation", role="variant", varies_from="proposed"),
            ],
        )
        # Each role bucket exposes exactly the conditions with that role.
        assert len(plan.references) == 1
        assert len(plan.proposed) == 1
        assert len(plan.variants) == 1

    def test_to_legacy_format(self):
        plan = UniversalExperimentPlan(
            conditions=[
                Condition(name="ResNet-18", role="reference", description="Standard baseline"),
                Condition(name="OurMethod", role="proposed", description="Our new method"),
                Condition(name="OurMethod-NoAttn", role="variant", varies_from="OurMethod"),
            ],
            evaluation=EvaluationSpec(
                primary_metric=MetricSpec(name="accuracy", direction="maximize"),
            ),
        )
        legacy = plan.to_legacy_format()
        assert len(legacy["baselines"]) == 1
        assert legacy["baselines"][0]["name"] == "ResNet-18"
        assert len(legacy["proposed_methods"]) == 1
        assert len(legacy["ablations"]) == 1
        assert "accuracy" in legacy["metrics"]

    def test_to_yaml(self):
        plan = UniversalExperimentPlan(
            experiment_type="convergence",
            domain_id="physics_pde",
            conditions=[
                Condition(name="FD2", role="reference"),
                Condition(name="FD4", role="proposed"),
            ],
        )
        yaml_str = plan.to_yaml()
        # Round-trip through PyYAML to validate the emitted document.
        data = yaml.safe_load(yaml_str)
        assert data["experiment"]["type"] == "convergence"
        assert data["experiment"]["domain"] == "physics_pde"
        assert len(data["experiment"]["conditions"]) == 2


# ---------------------------------------------------------------------------
# from_legacy_exp_plan tests
# ---------------------------------------------------------------------------


class TestFromLegacy:
    """Conversion from the legacy exp-plan dict/YAML format."""

    def test_basic_legacy_plan(self):
        legacy = {
            "baselines": [
                {"name": "ResNet-18", "description": "Standard CNN"},
            ],
            "proposed_methods": [
                {"name": "OurNet", "description": "Our new architecture"},
            ],
            "ablations": [
                {"name": "OurNet-NoSkip", "description": "Without skip connections"},
            ],
            "metrics": {
                "accuracy": {"direction": "maximize"},
            },
        }
        plan = from_legacy_exp_plan(legacy, domain_id="ml_vision")
        assert plan.domain_id == "ml_vision"
        assert len(plan.references) == 1
        assert plan.references[0].name == "ResNet-18"
        assert len(plan.proposed) == 1
        assert len(plan.variants) == 1
        assert plan.evaluation.primary_metric.name == "accuracy"
        assert plan.evaluation.primary_metric.direction == "maximize"

    def test_legacy_string_names(self):
        # Bare strings are accepted in place of {"name": ...} dicts.
        legacy = {
            "baselines": ["baseline_1", "baseline_2"],
            "proposed_methods": ["our_method"],
            "ablations": [],
        }
        plan = from_legacy_exp_plan(legacy)
        assert len(plan.references) == 2
        assert plan.references[0].name == "baseline_1"

    def test_legacy_yaml_string(self):
        # NOTE(review): the YAML indentation below was reconstructed from a
        # collapsed source line — confirm it matches the original fixture.
        yaml_str = """
baselines:
  - name: Euler
    description: Basic Euler method
proposed_methods:
  - name: RK4
    description: Runge-Kutta 4th order
metrics:
  convergence_order:
    direction: maximize
"""
        plan = from_legacy_exp_plan(yaml_str, domain_id="mathematics_numerical")
        assert plan.domain_id == "mathematics_numerical"
        assert len(plan.references) == 1
        assert plan.evaluation.primary_metric.name == "convergence_order"

    def test_roundtrip_legacy(self):
        """Test that converting to legacy and back preserves structure."""
        plan = UniversalExperimentPlan(
            conditions=[
                Condition(name="A", role="reference"),
                Condition(name="B", role="proposed"),
            ],
            evaluation=EvaluationSpec(
                primary_metric=MetricSpec(name="error", direction="minimize"),
            ),
        )
        legacy = plan.to_legacy_format()
        plan2 = from_legacy_exp_plan(legacy)
        assert len(plan2.references) == 1
        assert len(plan2.proposed) == 1
        assert plan2.evaluation.primary_metric.direction == "minimize"

    def test_empty_legacy(self):
        plan = from_legacy_exp_plan({})
        assert plan.conditions == []

    def test_metrics_as_list(self):
        # A plain list of metric names promotes the first entry to primary.
        legacy = {"metrics": ["accuracy", "f1"]}
        plan = from_legacy_exp_plan(legacy)
        assert plan.evaluation.primary_metric.name == "accuracy"


# ---------------------------------------------------------------------------
# Enum tests
# ---------------------------------------------------------------------------


class TestEnums:
    """String values of the schema enums are part of the public contract."""

    def test_condition_role_values(self):
        assert ConditionRole.REFERENCE.value == "reference"
        assert ConditionRole.PROPOSED.value == "proposed"
        assert ConditionRole.VARIANT.value == "variant"

    def test_experiment_type_values(self):
        assert ExperimentType.COMPARISON.value == "comparison"
        assert ExperimentType.CONVERGENCE.value == "convergence"
        assert ExperimentType.PROGRESSIVE_SPEC.value == "progressive_spec"


================================================
FILE: tests/test_figure_agent.py
================================================
"""Tests for the FigureAgent multi-agent chart generation system."""

from __future__ import annotations

import json
import os
import sys
import textwrap
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from unittest import mock

import pytest


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


@dataclass
class _FakeLLMResponse:
    # Mirrors the response object the figure agents read from their LLM client.
    content: str = ""
    model: str = "gpt-4.1"
    prompt_tokens: int = 100
    completion_tokens: int = 200
    total_tokens: int = 300


class _FakeLLM:
    """Minimal mock LLM client conforming to _LLMClientLike."""

    def __init__(self, response: str = "{}"):
        self._response = response
        self.calls: list[dict[str, Any]] = []

    def chat(self, messages, *, system=None, max_tokens=None, temperature=None, json_mode=False):
        # Record the call for later inspection, then return the canned reply.
        self.calls.append({
            "messages": messages,
            "system": system,
            "json_mode": json_mode,
        })
        return _FakeLLMResponse(content=self._response)


# Sample experiment data for tests
# Canned per-condition experiment results shared by the figure-agent tests.
_SAMPLE_CONDITIONS = {
    "proposed_method": {
        "metrics": {
            "primary_metric": 0.85,
            "primary_metric_mean": 0.85,
            "primary_metric_std": 0.02,
            "secondary_metric": 0.72,
        },
        "ci95_low": 0.83,
        "ci95_high": 0.87,
        "n_seeds": 3,
    },
    "baseline_resnet": {
        "metrics": {
            "primary_metric": 0.78,
            "primary_metric_mean": 0.78,
            "primary_metric_std": 0.03,
            "secondary_metric": 0.65,
        },
        "ci95_low": 0.75,
        "ci95_high": 0.81,
        "n_seeds": 3,
    },
    "ablation_no_attention": {
        "metrics": {
            "primary_metric": 0.80,
            "primary_metric_mean": 0.80,
            "primary_metric_std": 0.02,
            "secondary_metric": 0.68,
        },
        "ci95_low": 0.78,
        "ci95_high": 0.82,
        "n_seeds": 3,
    },
}

# Aggregate statistics matching the per-condition data above.
_SAMPLE_METRICS_SUMMARY = {
    "primary_metric": {"mean": 0.81, "min": 0.78, "max": 0.85, "count": 3},
    "secondary_metric": {"mean": 0.68, "min": 0.65, "max": 0.72, "count": 3},
}


# =========================================================================
# Style Config tests
# =========================================================================


class TestStyleConfig:
    """Publication-style constants and the generated matplotlib preamble."""

    def test_constants_exist(self):
        from researchclaw.agents.figure_agent.style_config import (
            COLORS_BRIGHT,
            DPI_PUBLICATION,
            FIGURE_WIDTH,
            MATPLOTLIB_STYLES,
            OUTPUT_FORMAT_PRIMARY,
        )

        # Palette and style inventories must be non-trivial.
        assert len(COLORS_BRIGHT) >= 7
        assert len(MATPLOTLIB_STYLES) >= 1
        # Publication DPI and both column widths must be defined.
        assert DPI_PUBLICATION >= 300
        assert "single_column" in FIGURE_WIDTH
        assert "double_column" in FIGURE_WIDTH
        assert OUTPUT_FORMAT_PRIMARY in ("pdf", "png")

    def test_get_style_preamble(self):
        from researchclaw.agents.figure_agent.style_config import get_style_preamble

        text = get_style_preamble()
        # The preamble must mention the library, the pyplot alias, the
        # colour palette name, and the default publication DPI.
        for needle in ("matplotlib", "plt", "COLORS", "300"):
            assert needle in text

    def test_custom_dpi(self):
        from researchclaw.agents.figure_agent.style_config import get_style_preamble

        # A caller-supplied DPI is reflected verbatim in the preamble.
        assert "150" in get_style_preamble(dpi=150)


# =========================================================================
# Planner Agent tests
# =========================================================================
class TestPlannerAgent:
    """PlannerAgent: domain detection, data analysis, and figure planning."""

    def test_domain_detection_classification(self):
        from researchclaw.agents.figure_agent.planner import PlannerAgent

        agent = PlannerAgent(_FakeLLM())
        assert agent._detect_domain("Image classification with CIFAR-10") == "classification"

    def test_domain_detection_rl(self):
        from researchclaw.agents.figure_agent.planner import PlannerAgent

        agent = PlannerAgent(_FakeLLM())
        assert agent._detect_domain("Reinforcement learning with reward shaping") == "reinforcement_learning"

    def test_domain_detection_default(self):
        from researchclaw.agents.figure_agent.planner import PlannerAgent

        agent = PlannerAgent(_FakeLLM())
        # Topics with no recognised keywords fall back to "default".
        assert agent._detect_domain("Quantum computing analysis") == "default"

    def test_analyze_data_basic(self):
        from researchclaw.agents.figure_agent.planner import PlannerAgent

        agent = PlannerAgent(_FakeLLM())
        analysis = agent._analyze_data(
            results={},
            conditions=["proposed", "baseline", "ablation_no_x"],
            metrics_summary=_SAMPLE_METRICS_SUMMARY,
            condition_summaries=_SAMPLE_CONDITIONS,
            metric_key="primary_metric",
        )
        assert analysis["num_conditions"] == 3
        assert analysis["has_ablation"] is True
        assert analysis["has_per_condition_data"] is True
        assert analysis["has_multiple_seeds"] is True

    def test_analyze_data_training_history(self):
        from researchclaw.agents.figure_agent.planner import PlannerAgent

        agent = PlannerAgent(_FakeLLM())
        analysis = agent._analyze_data(
            results={"training_history": [1.0, 0.5, 0.3]},
            conditions=["a"],
            metrics_summary={},
            condition_summaries={},
            metric_key="loss",
        )
        assert analysis["has_training_history"] is True

    def test_fallback_plan(self):
        from researchclaw.agents.figure_agent.planner import PlannerAgent

        agent = PlannerAgent(_FakeLLM())
        analysis = {
            "num_conditions": 3,
            "num_metrics": 2,
            "metric_names": ["primary_metric", "secondary_metric"],
            "has_training_history": False,
            "has_ablation": True,
            "has_multiple_seeds": True,
            "has_per_condition_data": True,
            "condition_values": {"proposed": 0.85, "baseline": 0.78},
        }
        figures = agent._fallback_plan("classification", analysis, "primary_metric", ["proposed", "baseline"])
        # The deterministic fallback must cover the two core chart types.
        assert len(figures) >= 2
        types = {f["chart_type"] for f in figures}
        assert "bar_comparison" in types
        assert "ablation_grouped" in types

    def test_execute_with_llm_response(self):
        from researchclaw.agents.figure_agent.planner import PlannerAgent

        llm = _FakeLLM(json.dumps({
            "figures": [
                {
                    "figure_id": "fig_main",
                    "chart_type": "bar_comparison",
                    "title": "Main Results",
                    "caption": "Comparison of methods.",
                    "data_source": {"type": "condition_comparison", "metric": "primary_metric"},
                    "x_label": "Method",
                    "y_label": "Accuracy",
                    "width": "single_column",
                    "priority": 1,
                    "section": "results",
                },
                {
                    "figure_id": "fig_ablation",
                    "chart_type": "ablation_grouped",
                    "title": "Ablation",
                    "caption": "Component analysis.",
                    "data_source": {"type": "ablation_comparison", "metric": "primary_metric"},
                    "x_label": "Variant",
                    "y_label": "Accuracy",
                    "width": "single_column",
                    "priority": 1,
                    "section": "results",
                },
                {
                    "figure_id": "fig_heatmap",
                    "chart_type": "heatmap",
                    "title": "Metric Heatmap",
                    "caption": "Cross-metric analysis.",
                    "data_source": {"type": "multi_metric"},
                    "x_label": "Metric",
                    "y_label": "Method",
                    "width": "double_column",
                    "priority": 2,
                    "section": "analysis",
                },
            ]
        }))
        agent = PlannerAgent(llm, min_figures=3)
        result = agent.execute({
            "experiment_results": {},
            "topic": "Image classification with knowledge distillation",
            "metric_key": "primary_metric",
            "conditions": list(_SAMPLE_CONDITIONS.keys()),
            "metrics_summary": _SAMPLE_METRICS_SUMMARY,
            "condition_summaries": _SAMPLE_CONDITIONS,
        })
        assert result.success
        assert len(result.data["figures"]) == 3

    def test_execute_fallback_on_empty_llm(self):
        from researchclaw.agents.figure_agent.planner import PlannerAgent

        llm = _FakeLLM("{}")  # Empty response
        agent = PlannerAgent(llm, min_figures=2)
        result = agent.execute({
            "experiment_results": {},
            "topic": "Image classification",
            "metric_key": "primary_metric",
            "conditions":
                list(_SAMPLE_CONDITIONS.keys()),
            "metrics_summary": _SAMPLE_METRICS_SUMMARY,
            "condition_summaries": _SAMPLE_CONDITIONS,
        })
        assert result.success
        # The planner must fall back to its deterministic plan.
        assert len(result.data["figures"]) >= 2


# =========================================================================
# CodeGen Agent tests
# =========================================================================


class TestCodeGenAgent:
    """CodeGenAgent: template-based and LLM-based chart-script generation."""

    def test_template_bar_comparison(self):
        from researchclaw.agents.figure_agent.codegen import CodeGenAgent

        agent = CodeGenAgent(_FakeLLM())
        result = agent.execute({
            "figures": [{
                "figure_id": "fig_main",
                "chart_type": "bar_comparison",
                "title": "Results",
                "caption": "Main results.",
                "data_source": {"type": "condition_comparison", "metric": "primary_metric"},
                "x_label": "Method",
                "y_label": "Accuracy",
                "width": "single_column",
                "section": "results",
            }],
            "condition_summaries": _SAMPLE_CONDITIONS,
            "metrics_summary": _SAMPLE_METRICS_SUMMARY,
            "metric_key": "primary_metric",
            "output_dir": "charts",
        })
        assert result.success
        scripts = result.data["scripts"]
        assert len(scripts) == 1
        script = scripts[0]["script"]
        # The generated script must embed the real condition values.
        assert "0.85" in script  # proposed_method value
        assert "0.78" in script  # baseline value
        assert "savefig" in script

    def test_template_grouped_bar(self):
        from researchclaw.agents.figure_agent.codegen import CodeGenAgent

        agent = CodeGenAgent(_FakeLLM())
        result = agent.execute({
            "figures": [{
                "figure_id": "fig_multi",
                "chart_type": "grouped_bar",
                "title": "Multi-metric",
                "caption": "Multi-metric comparison.",
                "data_source": {
                    "type": "multi_metric",
                    "metrics": ["primary_metric", "secondary_metric"],
                },
                "x_label": "Method",
                "y_label": "Score",
                "width": "double_column",
                "section": "analysis",
            }],
            "condition_summaries": _SAMPLE_CONDITIONS,
            "metrics_summary": _SAMPLE_METRICS_SUMMARY,
            "metric_key": "primary_metric",
            "output_dir": "charts",
        })
        assert result.success
        scripts = result.data["scripts"]
        assert len(scripts) == 1
        assert "secondary_metric" in scripts[0]["script"]

    def test_template_heatmap(self):
        from researchclaw.agents.figure_agent.codegen import CodeGenAgent

        agent = CodeGenAgent(_FakeLLM())
        result = agent.execute({
            "figures": [{
                "figure_id": "fig_heat",
                "chart_type": "heatmap",
                "title": "Heatmap",
                "caption": "Analysis.",
                "data_source": {"type": "heatmap"},
                "x_label": "Metric",
                "y_label": "Method",
                "width": "double_column",
                "section": "analysis",
            }],
            "condition_summaries": _SAMPLE_CONDITIONS,
            "metrics_summary": _SAMPLE_METRICS_SUMMARY,
            "metric_key": "primary_metric",
            "output_dir": "charts",
        })
        assert result.success
        scripts = result.data["scripts"]
        assert len(scripts) == 1
        assert "imshow" in scripts[0]["script"]

    def test_llm_fallback_for_unknown_type(self):
        from researchclaw.agents.figure_agent.codegen import CodeGenAgent

        # Chart types with no template are delegated to the LLM.
        llm = _FakeLLM("```python\nimport matplotlib\nmatplotlib.use('Agg')\nimport matplotlib.pyplot as plt\nfig, ax = plt.subplots()\nax.plot([1,2,3])\nfig.savefig('charts/fig_custom.png')\nplt.close(fig)\n```")
        agent = CodeGenAgent(llm)
        result = agent.execute({
            "figures": [{
                "figure_id": "fig_custom",
                "chart_type": "radar_chart",
                "title": "Radar",
                "caption": "Custom chart.",
                "data_source": {},
                "x_label": "X",
                "y_label": "Y",
                "width": "single_column",
                "section": "analysis",
            }],
            "condition_summaries": _SAMPLE_CONDITIONS,
            "metrics_summary": _SAMPLE_METRICS_SUMMARY,
            "metric_key": "primary_metric",
            "output_dir": "charts",
        })
        assert result.success
        assert "matplotlib" in result.data["scripts"][0]["script"]

    def test_strip_fences(self):
        from researchclaw.agents.figure_agent.codegen import CodeGenAgent

        code = "```python\nprint('hello')\n```"
        assert CodeGenAgent._strip_fences(code) == "print('hello')"

    def test_strip_fences_no_fences(self):
        from researchclaw.agents.figure_agent.codegen import CodeGenAgent

        code = "print('hello')"
        assert CodeGenAgent._strip_fences(code) == "print('hello')"

    def test_multiple_figures(self):
        from researchclaw.agents.figure_agent.codegen import CodeGenAgent

        agent = CodeGenAgent(_FakeLLM())
        figures = [
            {
                "figure_id": f"fig_{i}",
                "chart_type": "bar_comparison",
                "title": f"Figure {i}",
                "caption": f"Caption {i}.",
                "data_source": {"type": "condition_comparison", "metric": "primary_metric"},
                "x_label": "X",
                "y_label": "Y",
                "width": "single_column",
                "section": "results",
            }
            for i in range(3)
        ]
        result = agent.execute({
            "figures": figures,
            "condition_summaries": _SAMPLE_CONDITIONS,
            "metrics_summary": _SAMPLE_METRICS_SUMMARY,
            "metric_key": "primary_metric",
            "output_dir": "charts",
        })
        assert result.success
        assert len(result.data["scripts"]) == 3


# =========================================================================
# Renderer Agent tests
# =========================================================================


class TestRendererAgent:
    """RendererAgent: executing generated chart scripts in isolation."""

    def test_render_simple_script(self, tmp_path):
        from researchclaw.agents.figure_agent.renderer import RendererAgent

        agent = RendererAgent(_FakeLLM(), timeout_sec=10, use_docker=False)
        output_dir = tmp_path / "charts"
        # Use a script that creates a valid PNG without matplotlib
        # (creates a minimal 1x1 PNG file directly)
        # NOTE(review): the internal layout of this dedent block was
        # reconstructed from a collapsed source line — confirm the original
        # indentation before relying on an exact diff.
        script = textwrap.dedent("""\
            import struct, zlib
            output_path = "{output_dir}/fig_test.png"
            # Minimal valid PNG: 1x1 white pixel
            def write_png(path):
                sig = b'\\x89PNG\\r\\n\\x1a\\n'
                def chunk(ctype, data):
                    c = ctype + data
                    return struct.pack('>I', len(data)) + c + struct.pack('>I', zlib.crc32(c) & 0xffffffff)
                ihdr = struct.pack('>IIBBBBB', 1, 1, 8, 2, 0, 0, 0)
                raw = zlib.compress(b'\\x00\\xff\\xff\\xff')
                with open(path, 'wb') as f:
                    f.write(sig)
                    f.write(chunk(b'IHDR', ihdr))
                    f.write(chunk(b'IDAT', raw))
                    f.write(chunk(b'IEND', b''))
            write_png(output_path)
            # Pad file to meet minimum size requirement
            with open(output_path, 'ab') as f:
                f.write(b'\\x00' * 2048)
            print(f"Saved: {{output_path}}")
        """).format(output_dir=output_dir)
        result = agent.execute({
            "scripts": [{
                "figure_id": "fig_test",
                "script": script,
                "output_filename": "fig_test.png",
                "title": "Test",
                "caption": "Test chart",
                "section": "results",
            }],
            "output_dir":
str(output_dir), }) assert result.success rendered = result.data["rendered"] assert len(rendered) == 1 assert rendered[0]["success"] is True assert Path(rendered[0]["output_path"]).exists() def test_render_syntax_error(self, tmp_path): from researchclaw.agents.figure_agent.renderer import RendererAgent agent = RendererAgent(_FakeLLM(), timeout_sec=5) result = agent.execute({ "scripts": [{ "figure_id": "fig_bad", "script": "this is not valid python!!!", "output_filename": "fig_bad.png", }], "output_dir": str(tmp_path / "charts"), }) # The renderer itself succeeds (returns results), but individual # figures have success=False rendered = result.data["rendered"] assert len(rendered) == 1 assert rendered[0]["success"] is False assert rendered[0]["error"] def test_render_empty_script(self, tmp_path): from researchclaw.agents.figure_agent.renderer import RendererAgent agent = RendererAgent(_FakeLLM(), timeout_sec=5) result = agent.execute({ "scripts": [{ "figure_id": "fig_empty", "script": "", "output_filename": "fig_empty.png", }], "output_dir": str(tmp_path / "charts"), }) rendered = result.data["rendered"] assert rendered[0]["success"] is False assert "Empty" in rendered[0]["error"] def test_script_saved_for_reproducibility(self, tmp_path): from researchclaw.agents.figure_agent.renderer import RendererAgent agent = RendererAgent(_FakeLLM(), timeout_sec=5) output_dir = tmp_path / "charts" result = agent.execute({ "scripts": [{ "figure_id": "fig_save", "script": "print('hello')", "output_filename": "fig_save.png", }], "output_dir": str(output_dir), }) # Script should be saved even if rendering fails script_path = output_dir / "scripts" / "fig_save.py" assert script_path.exists() assert script_path.read_text() == "print('hello')" # ========================================================================= # Critic Agent tests # ========================================================================= class TestCriticAgent: def test_numerical_accuracy_pass(self): from 
researchclaw.agents.figure_agent.critic import CriticAgent llm = _FakeLLM(json.dumps({ "quality_score": 8, "issues": [], })) agent = CriticAgent(llm) script = "values = [0.85, 0.78, 0.80]\nax.bar(x, values)\nfig.savefig('out.png')\nplt.close(fig)" issues = agent._check_numerical_accuracy(script, _SAMPLE_CONDITIONS, "primary_metric") # Values 0.85 and 0.78 are in script → should pass assert not any(i["severity"] == "critical" for i in issues) def test_numerical_accuracy_fail(self): from researchclaw.agents.figure_agent.critic import CriticAgent agent = CriticAgent(_FakeLLM()) script = "values = [0.99, 0.98, 0.97]" # Wrong values issues = agent._check_numerical_accuracy(script, _SAMPLE_CONDITIONS, "primary_metric") assert any(i["severity"] == "critical" for i in issues) def test_text_correctness_missing_labels(self): from researchclaw.agents.figure_agent.critic import CriticAgent agent = CriticAgent(_FakeLLM()) script = "fig, ax = plt.subplots()\nax.bar([0], [1])" # Missing labels + savefig issues = agent._check_text_correctness(script, {}) types = {i["message"] for i in issues} assert any("x-axis" in t for t in types) assert any("savefig" in t for t in types) def test_text_correctness_all_present(self): from researchclaw.agents.figure_agent.critic import CriticAgent agent = CriticAgent(_FakeLLM()) script = ( "ax.set_xlabel('X')\n" "ax.set_ylabel('Y')\n" "ax.set_title('T')\n" "fig.savefig('out.png')\n" "plt.close(fig)" ) issues = agent._check_text_correctness(script, {}) assert len(issues) == 0 def test_visual_quality_llm_review(self): from researchclaw.agents.figure_agent.critic import CriticAgent llm = _FakeLLM(json.dumps({ "quality_score": 9, "issues": [], })) agent = CriticAgent(llm) issues = agent._check_visual_quality( "import matplotlib\nplt.figure()\nplt.savefig('x.png')", {"title": "Test"}, ) assert not any(i["severity"] == "critical" for i in issues) def test_visual_quality_low_score(self): from researchclaw.agents.figure_agent.critic import CriticAgent llm 
= _FakeLLM(json.dumps({ "quality_score": 3, "issues": [{"severity": "critical", "message": "Bad colors"}], })) agent = CriticAgent(llm) issues = agent._check_visual_quality("plt.plot([1,2])", {"title": "Bad"}) assert any(i["severity"] == "critical" for i in issues) def test_execute_full_review(self): from researchclaw.agents.figure_agent.critic import CriticAgent llm = _FakeLLM(json.dumps({ "quality_score": 8, "issues": [], })) agent = CriticAgent(llm) result = agent.execute({ "rendered": [ { "figure_id": "fig_1", "success": True, "output_path": "/tmp/fig.png", "title": "Test", "caption": "Test fig", }, ], "scripts": [ { "figure_id": "fig_1", "script": ( "values = [0.85, 0.78]\n" "ax.set_xlabel('X')\nax.set_ylabel('Y')\n" "ax.set_title('T')\nfig.savefig('x.png')\nplt.close(fig)" ), }, ], "condition_summaries": _SAMPLE_CONDITIONS, "metrics_summary": _SAMPLE_METRICS_SUMMARY, "metric_key": "primary_metric", }) assert result.success assert result.data["passed_count"] >= 0 def test_review_failed_render(self): from researchclaw.agents.figure_agent.critic import CriticAgent agent = CriticAgent(_FakeLLM()) result = agent.execute({ "rendered": [ {"figure_id": "fig_1", "success": False, "error": "Crash"}, ], "scripts": [], "condition_summaries": {}, "metrics_summary": {}, "metric_key": "primary_metric", }) assert result.success assert result.data["reviews"][0]["passed"] is False # ========================================================================= # Integrator Agent tests # ========================================================================= class TestIntegratorAgent: def test_build_manifest(self): from researchclaw.agents.figure_agent.integrator import IntegratorAgent agent = IntegratorAgent(_FakeLLM()) rendered = [ { "figure_id": "fig_main", "success": True, "output_path": "/tmp/charts/fig_main.png", "title": "Main Results", "caption": "Comparison.", "section": "results", "width": "single_column", }, { "figure_id": "fig_ablation", "success": True, "output_path": 
                    "/tmp/charts/fig_ablation.png",
                "title": "Ablation",
                "caption": "Analysis.",
                "section": "results",
                "width": "single_column",
            },
        ]
        manifest = agent._build_manifest(rendered, Path("/tmp/charts"))
        assert len(manifest) == 2
        # Figures are numbered sequentially and mapped to paper sections.
        assert manifest[0]["figure_number"] == 1
        assert manifest[0]["paper_section"] == "Results"
        assert "charts/" in manifest[0]["file_path"]

    def test_generate_markdown_refs(self):
        from researchclaw.agents.figure_agent.integrator import IntegratorAgent

        agent = IntegratorAgent(_FakeLLM())
        manifest = [
            {
                "figure_number": 1,
                "file_path": "charts/fig_1.png",
                "caption": "Main results comparison",
            },
        ]
        refs = agent._generate_markdown_refs(manifest)
        # References use Markdown image syntax with the figure number.
        assert "![Figure 1:" in refs
        assert "charts/fig_1.png" in refs

    def test_generate_descriptions(self):
        from researchclaw.agents.figure_agent.integrator import IntegratorAgent

        agent = IntegratorAgent(_FakeLLM())
        manifest = [
            {
                "figure_number": 1,
                "file_path": "charts/fig_1.png",
                "title": "Main Results",
                "caption": "Comparison",
                "paper_section": "Results",
            },
        ]
        desc = agent._generate_descriptions(manifest)
        assert "AVAILABLE FIGURES" in desc
        assert "Main Results" in desc
        assert "Results" in desc

    def test_execute_empty(self):
        from researchclaw.agents.figure_agent.integrator import IntegratorAgent

        agent = IntegratorAgent(_FakeLLM())
        result = agent.execute({
            "rendered": [],
            "topic": "Test",
            "output_dir": "/tmp/charts",
        })
        # No rendered figures is a valid (empty) outcome, not an error.
        assert result.success
        assert result.data["figure_count"] == 0

    def test_execute_with_figures(self, tmp_path):
        from researchclaw.agents.figure_agent.integrator import IntegratorAgent

        agent = IntegratorAgent(_FakeLLM())
        output_dir = tmp_path / "charts"
        output_dir.mkdir()
        result = agent.execute({
            "rendered": [
                {
                    "figure_id": "fig_main",
                    "success": True,
                    "output_path": str(output_dir / "fig_main.png"),
                    "title": "Main",
                    "caption": "Main comparison.",
                    "section": "results",
                },
            ],
            "topic": "Test",
            "output_dir": str(output_dir),
        })
        assert result.success
        assert result.data["figure_count"] == 1
        # The manifest is persisted next to the rendered figures.
        assert (output_dir / "figure_manifest.json").exists()

    def test_section_ordering(self):
        from researchclaw.agents.figure_agent.integrator import IntegratorAgent

        # Sections must sort method < results < analysis within the paper.
        assert IntegratorAgent._section_order("method") < IntegratorAgent._section_order("results")
        assert IntegratorAgent._section_order("results") < IntegratorAgent._section_order("analysis")


# =========================================================================
# Orchestrator tests
# =========================================================================


class TestOrchestrator:
    """End-to-end orchestration of the figure-generation pipeline."""

    def test_orchestrate_basic(self, tmp_path):
        from researchclaw.agents.figure_agent.orchestrator import (
            FigureAgentConfig,
            FigureOrchestrator,
        )

        # LLM returns plan, then quality review
        responses = iter([
            json.dumps({
                "figures": [{
                    "figure_id": "fig_main",
                    "chart_type": "bar_comparison",
                    "title": "Main",
                    "caption": "Main comparison.",
                    "data_source": {"type": "condition_comparison", "metric": "primary_metric"},
                    "x_label": "Method",
                    "y_label": "Accuracy",
                    "width": "single_column",
                    "priority": 1,
                    "section": "results",
                }, {
                    "figure_id": "fig_ablation",
                    "chart_type": "ablation_grouped",
                    "title": "Ablation",
                    "caption": "Ablation study.",
                    "data_source": {"type": "ablation_comparison", "metric": "primary_metric"},
                    "x_label": "Variant",
                    "y_label": "Accuracy",
                    "width": "single_column",
                    "priority": 1,
                    "section": "results",
                }, {
                    "figure_id": "fig_heatmap",
                    "chart_type": "heatmap",
                    "title": "Heatmap",
                    "caption": "Metric heatmap.",
                    "data_source": {"type": "heatmap"},
                    "x_label": "Metric",
                    "y_label": "Method",
                    "width": "double_column",
                    "priority": 2,
                    "section": "analysis",
                }],
            }),
            # Critic review (called multiple times)
            json.dumps({"quality_score": 8, "issues": []}),
            json.dumps({"quality_score": 8, "issues": []}),
            json.dumps({"quality_score": 8, "issues": []}),
        ])

        class _MultiLLM:
            # Replays the canned responses in order, then keeps returning a
            # passing critic review once the iterator is exhausted.
            def __init__(self):
                self.calls = []

            def chat(self, messages, **kwargs):
                self.calls.append(messages)
                try:
                    resp = next(responses)
                except StopIteration:
                    resp = json.dumps({"quality_score": 8,
                                       "issues": []})
                return _FakeLLMResponse(content=resp)

        cfg = FigureAgentConfig(
            min_figures=3,
            max_figures=5,
            max_iterations=1,
            render_timeout_sec=10,
        )
        orch = FigureOrchestrator(_MultiLLM(), cfg, stage_dir=tmp_path)
        plan = orch.orchestrate({
            "experiment_results": {},
            "condition_summaries": _SAMPLE_CONDITIONS,
            "metrics_summary": _SAMPLE_METRICS_SUMMARY,
            "metric_key": "primary_metric",
            "conditions": list(_SAMPLE_CONDITIONS.keys()),
            "topic": "Image classification",
            "output_dir": str(tmp_path / "charts"),
        })
        assert plan.total_llm_calls > 0
        assert plan.elapsed_sec > 0
        # Plan should have chart files (some may fail rendering, that's OK)
        assert isinstance(plan.manifest, list)

    def test_figure_plan_serialization(self):
        from researchclaw.agents.figure_agent.orchestrator import FigurePlan

        plan = FigurePlan(
            manifest=[{"figure_number": 1, "file_path": "charts/fig.png"}],
            figure_count=1,
            passed_count=1,
        )
        d = plan.to_dict()
        assert d["figure_count"] == 1
        assert len(d["manifest"]) == 1

    def test_get_chart_files(self):
        from researchclaw.agents.figure_agent.orchestrator import FigurePlan

        plan = FigurePlan(
            manifest=[
                {"figure_number": 1, "file_path": "charts/fig_main.png"},
                {"figure_number": 2, "file_path": "charts/fig_ablation.png"},
            ],
        )
        files = plan.get_chart_files()
        # Only the basenames are returned, not the charts/ prefix.
        assert files == ["fig_main.png", "fig_ablation.png"]


# =========================================================================
# Config tests
# =========================================================================


class TestFigureAgentConfig:
    """Parsing and defaults of the figure-agent configuration section."""

    def test_default_config(self):
        from researchclaw.config import FigureAgentConfig

        cfg = FigureAgentConfig()
        assert cfg.enabled is True
        assert cfg.min_figures == 3
        assert cfg.max_figures == 8
        assert cfg.max_iterations == 3
        assert cfg.dpi == 300
        assert cfg.strict_mode is False

    def test_parse_from_dict(self):
        from researchclaw.config import _parse_figure_agent_config

        cfg = _parse_figure_agent_config({
            "enabled": False,
            "min_figures": 2,
            "max_figures": 6,
            "dpi": 150,
        })
        assert cfg.enabled is False
        assert cfg.min_figures == 2
        assert cfg.max_figures == 6
        assert cfg.dpi == 150

    def test_parse_from_dict_extended_fields(self):
        from researchclaw.config import _parse_figure_agent_config

        cfg = _parse_figure_agent_config({
            "use_docker": False,
            "docker_image": "custom/figure:latest",
            "output_format": "latex",
            "gemini_api_key": "test-key",
            "gemini_model": "gemini-test",
            "nano_banana_enabled": False,
        })
        assert cfg.use_docker is False
        assert cfg.docker_image == "custom/figure:latest"
        assert cfg.output_format == "latex"
        assert cfg.gemini_api_key == "test-key"
        assert cfg.gemini_model == "gemini-test"
        assert cfg.nano_banana_enabled is False

    def test_parse_empty(self):
        from researchclaw.config import _parse_figure_agent_config

        # An empty dict yields the defaults.
        cfg = _parse_figure_agent_config({})
        assert cfg.enabled is True
        assert cfg.min_figures == 3

    def test_experiment_config_has_figure_agent(self):
        from researchclaw.config import ExperimentConfig

        ec = ExperimentConfig()
        assert hasattr(ec, "figure_agent")
        assert ec.figure_agent.enabled is True


# =========================================================================
# Backward compatibility test
# =========================================================================


class TestBackwardCompatibility:
    """Legacy visualize.py entry points must remain importable."""

    def test_visualize_still_importable(self):
        """Old visualize.py functions should still be importable."""
        from researchclaw.experiment.visualize import (
            generate_all_charts,
            plot_condition_comparison,
            plot_experiment_comparison,
            plot_metric_trajectory,
        )

        assert callable(generate_all_charts)
        assert callable(plot_condition_comparison)
        assert callable(plot_experiment_comparison)
        assert callable(plot_metric_trajectory)

    def test_figure_agent_importable(self):
        from researchclaw.agents.figure_agent import FigureOrchestrator, FigurePlan

        assert FigureOrchestrator is not None
        assert FigurePlan is not None


================================================
FILE: tests/test_knowledge_graph.py
================================================
"""Tests
for the research knowledge graph (20+ tests).

Covers:
- Entity/Relation CRUD
- Graph queries (gaps, trends, comparison)
- JSON serialization/deserialization
- Incremental updates
- Visualizer exports
"""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from researchclaw.knowledge.graph.entities import Entity, EntityType
from researchclaw.knowledge.graph.relations import Relation, RelationType
from researchclaw.knowledge.graph.builder import KnowledgeGraphBuilder
from researchclaw.knowledge.graph.query import KnowledgeGraphQuery
from researchclaw.knowledge.graph.visualizer import (
    export_to_dot,
    export_to_json_cytoscape,
    graph_summary,
)


# ── Fixtures ─────────────────────────────────────────────────────────


@pytest.fixture
def graph() -> KnowledgeGraphBuilder:
    # Fresh, empty graph with ample capacity for these tests.
    return KnowledgeGraphBuilder(max_entities=100)


@pytest.fixture
def populated_graph(graph: KnowledgeGraphBuilder) -> KnowledgeGraphBuilder:
    # Small vision-research graph: 3 papers, 3 methods, 3 datasets,
    # wired together with citation/extension/usage relations.
    # Papers
    graph.add_paper("p1", "ResNet: Deep Residual Learning", year=2016, authors=["He"])
    graph.add_paper("p2", "ViT: An Image is Worth 16x16 Words", year=2021, authors=["Dosovitskiy"])
    graph.add_paper("p3", "DeiT: Training Data-efficient Image Transformers", year=2021, authors=["Touvron"])
    # Methods
    graph.add_method("m1", "ResNet", description="Residual connections for deep networks")
    graph.add_method("m2", "Vision Transformer", description="Transformer for image classification")
    graph.add_method("m3", "Knowledge Distillation", description="Teacher-student learning")
    # Datasets
    graph.add_dataset("d1", "ImageNet", domain="computer vision")
    graph.add_dataset("d2", "CIFAR-10", domain="computer vision")
    graph.add_dataset("d3", "CIFAR-100", domain="computer vision")
    # Relations
    graph.add_relation(Relation("p2", "p1", RelationType.CITES))
    graph.add_relation(Relation("p3", "p2", RelationType.EXTENDS))
    graph.add_relation(Relation("p3", "p1", RelationType.CITES))
    graph.add_relation(Relation("m1", "d1", RelationType.USES_DATASET))
graph.add_relation(Relation("m1", "d2", RelationType.USES_DATASET)) graph.add_relation(Relation("m2", "d1", RelationType.USES_DATASET)) graph.add_relation(Relation("m2", "d2", RelationType.USES_DATASET)) graph.add_relation(Relation("p1", "m1", RelationType.APPLIES_METHOD)) graph.add_relation(Relation("p2", "m2", RelationType.APPLIES_METHOD)) graph.add_relation(Relation("m2", "m1", RelationType.OUTPERFORMS, {"dataset": "ImageNet"})) return graph # ── Entity Tests ───────────────────────────────────────────────────── class TestEntity: def test_create_entity(self) -> None: e = Entity("e1", EntityType.PAPER, "Test Paper") assert e.id == "e1" assert e.entity_type == EntityType.PAPER def test_to_dict(self) -> None: e = Entity("e1", EntityType.METHOD, "TestMethod", {"key": "val"}) d = e.to_dict() assert d["entity_type"] == "method" assert d["attributes"]["key"] == "val" def test_from_dict(self) -> None: data = {"id": "x", "entity_type": "dataset", "name": "Test", "attributes": {}} e = Entity.from_dict(data) assert e.entity_type == EntityType.DATASET class TestRelation: def test_create_relation(self) -> None: r = Relation("a", "b", RelationType.CITES) assert r.source_id == "a" assert r.target_id == "b" def test_to_dict(self) -> None: r = Relation("a", "b", RelationType.OUTPERFORMS, {"margin": 0.05}) d = r.to_dict() assert d["relation_type"] == "outperforms" assert d["attributes"]["margin"] == 0.05 def test_from_dict(self) -> None: data = {"source_id": "x", "target_id": "y", "relation_type": "extends"} r = Relation.from_dict(data) assert r.relation_type == RelationType.EXTENDS # ── Builder Tests ──────────────────────────────────────────────────── class TestKnowledgeGraphBuilder: def test_add_entity(self, graph: KnowledgeGraphBuilder) -> None: e = Entity("e1", EntityType.PAPER, "Test") assert graph.add_entity(e) assert graph.entity_count == 1 def test_add_duplicate_updates(self, graph: KnowledgeGraphBuilder) -> None: graph.add_entity(Entity("e1", EntityType.PAPER, "V1", 
{"a": 1}))
        graph.add_entity(Entity("e1", EntityType.PAPER, "V2", {"b": 2}))
        assert graph.entity_count == 1
        e = graph.get_entity("e1")
        assert e is not None
        assert e.name == "V2"
        assert e.attributes["a"] == 1  # merged
        assert e.attributes["b"] == 2

    def test_capacity_limit(self) -> None:
        # Once max_entities is reached, further adds are rejected (return False).
        g = KnowledgeGraphBuilder(max_entities=2)
        g.add_entity(Entity("e1", EntityType.PAPER, "P1"))
        g.add_entity(Entity("e2", EntityType.PAPER, "P2"))
        assert not g.add_entity(Entity("e3", EntityType.PAPER, "P3"))
        assert g.entity_count == 2

    def test_add_relation(self, graph: KnowledgeGraphBuilder) -> None:
        graph.add_entity(Entity("a", EntityType.PAPER, "A"))
        graph.add_entity(Entity("b", EntityType.PAPER, "B"))
        assert graph.add_relation(Relation("a", "b", RelationType.CITES))
        assert graph.relation_count == 1

    def test_add_relation_missing_entity(self, graph: KnowledgeGraphBuilder) -> None:
        # Relations to unknown entity ids are rejected, not silently stored.
        graph.add_entity(Entity("a", EntityType.PAPER, "A"))
        assert not graph.add_relation(Relation("a", "missing", RelationType.CITES))

    def test_duplicate_relation(self, graph: KnowledgeGraphBuilder) -> None:
        graph.add_entity(Entity("a", EntityType.PAPER, "A"))
        graph.add_entity(Entity("b", EntityType.PAPER, "B"))
        graph.add_relation(Relation("a", "b", RelationType.CITES))
        graph.add_relation(Relation("a", "b", RelationType.CITES))  # duplicate
        assert graph.relation_count == 1

    def test_get_entities_by_type(self, populated_graph: KnowledgeGraphBuilder) -> None:
        papers = populated_graph.get_entities_by_type(EntityType.PAPER)
        assert len(papers) == 3

    def test_get_relations_for(self, populated_graph: KnowledgeGraphBuilder) -> None:
        rels = populated_graph.get_relations_for("p2")
        assert len(rels) >= 2  # outgoing + incoming

    def test_remove_entity(self, populated_graph: KnowledgeGraphBuilder) -> None:
        # Removing an entity must also drop its incident relations.
        initial_rels = populated_graph.relation_count
        assert populated_graph.remove_entity("p1")
        assert populated_graph.get_entity("p1") is None
        assert populated_graph.relation_count < initial_rels

    def test_remove_nonexistent_entity(self, graph: KnowledgeGraphBuilder) -> None:
        assert not graph.remove_entity("nope")

    def test_convenience_methods(self, graph: KnowledgeGraphBuilder) -> None:
        # add_paper/add_method/add_dataset return the created Entity with the
        # matching entity_type.
        paper = graph.add_paper("p1", "Test Paper", year=2024)
        method = graph.add_method("m1", "TestNet", description="A test")
        dataset = graph.add_dataset("d1", "TestSet", domain="cv")
        assert paper.entity_type == EntityType.PAPER
        assert method.entity_type == EntityType.METHOD
        assert dataset.entity_type == EntityType.DATASET


# ── Persistence ──────────────────────────────────────────────────────


class TestGraphPersistence:
    def test_save_and_load(self, populated_graph: KnowledgeGraphBuilder, tmp_path: Path) -> None:
        # load() returns the number of entities restored.
        path = tmp_path / "graph.json"
        populated_graph.save(path)
        assert path.exists()
        new_graph = KnowledgeGraphBuilder()
        loaded = new_graph.load(path)
        assert loaded == populated_graph.entity_count
        assert new_graph.relation_count == populated_graph.relation_count

    def test_load_nonexistent(self, graph: KnowledgeGraphBuilder, tmp_path: Path) -> None:
        assert graph.load(tmp_path / "nope.json") == 0

    def test_load_malformed(self, graph: KnowledgeGraphBuilder, tmp_path: Path) -> None:
        # Malformed JSON is tolerated: load() reports zero entities.
        path = tmp_path / "bad.json"
        path.write_text("not json", encoding="utf-8")
        assert graph.load(path) == 0


# ── Query Engine ─────────────────────────────────────────────────────


class TestKnowledgeGraphQuery:
    def test_find_research_gaps(self, populated_graph: KnowledgeGraphBuilder) -> None:
        query = KnowledgeGraphQuery(populated_graph)
        gaps = query.find_research_gaps()
        # CIFAR-100 has no methods using it
        assert any("CIFAR-100" in g for g in gaps)

    def test_find_research_gaps_with_domain(self, populated_graph: KnowledgeGraphBuilder) -> None:
        query = KnowledgeGraphQuery(populated_graph)
        gaps = query.find_research_gaps(domain="computer vision")
        assert isinstance(gaps, list)

    def test_find_trending_methods(self, populated_graph: KnowledgeGraphBuilder) -> None:
        query = KnowledgeGraphQuery(populated_graph)
        trending = query.find_trending_methods(min_citations=1)
        assert len(trending) > 0

    def test_get_method_comparison(self, populated_graph: KnowledgeGraphBuilder) -> None:
        query = KnowledgeGraphQuery(populated_graph)
        comparison = query.get_method_comparison("ResNet", "Vision Transformer")
        assert "method_a" in comparison
        assert "method_b" in comparison
        assert "shared_datasets" in comparison

    def test_get_method_comparison_not_found(self, populated_graph: KnowledgeGraphBuilder) -> None:
        # Unknown method names produce an error payload rather than raising.
        query = KnowledgeGraphQuery(populated_graph)
        comparison = query.get_method_comparison("NonexistentA", "NonexistentB")
        assert "error" in comparison

    def test_suggest_topics(self, populated_graph: KnowledgeGraphBuilder) -> None:
        query = KnowledgeGraphQuery(populated_graph)
        topics = query.suggest_topics(["transformer", "vision"], top_k=3)
        assert isinstance(topics, list)

    def test_suggest_topics_empty_interests(self, populated_graph: KnowledgeGraphBuilder) -> None:
        query = KnowledgeGraphQuery(populated_graph)
        topics = query.suggest_topics([])
        assert isinstance(topics, list)


# ── Visualizer ───────────────────────────────────────────────────────


class TestVisualizer:
    def test_export_dot(self, populated_graph: KnowledgeGraphBuilder, tmp_path: Path) -> None:
        # DOT export writes a Graphviz digraph containing entity names.
        path = tmp_path / "graph.dot"
        export_to_dot(populated_graph, path)
        assert path.exists()
        content = path.read_text(encoding="utf-8")
        assert "digraph" in content
        assert "ResNet" in content

    def test_export_cytoscape(self, populated_graph: KnowledgeGraphBuilder, tmp_path: Path) -> None:
        # Cytoscape export produces a JSON document with an "elements" list.
        path = tmp_path / "graph.json"
        export_to_json_cytoscape(populated_graph, path)
        assert path.exists()
        data = json.loads(path.read_text(encoding="utf-8"))
        assert "elements" in data
        assert len(data["elements"]) > 0

    def test_graph_summary(self, populated_graph: KnowledgeGraphBuilder) -> None:
        summary = graph_summary(populated_graph)
        assert "entities" in summary
        assert "relations" in summary
        assert "paper" in summary


================================================
FILE:
tests/test_mcp.py
================================================
"""Tests for MCP integration (C3): Server, Client, Tools, Transport, Registry."""
from __future__ import annotations

import asyncio

import pytest

from researchclaw.mcp.tools import TOOL_DEFINITIONS, get_tool_schema, list_tool_names
from researchclaw.mcp.server import ResearchClawMCPServer
from researchclaw.mcp.client import MCPClient
from researchclaw.mcp.registry import MCPServerRegistry
from researchclaw.mcp.transport import SSETransport


# ══════════════════════════════════════════════════════════════════
# MCP Tools tests
# ══════════════════════════════════════════════════════════════════


class TestMCPTools:
    def test_tool_definitions_not_empty(self) -> None:
        assert len(TOOL_DEFINITIONS) >= 6

    def test_all_tools_have_required_fields(self) -> None:
        # Every tool definition follows the MCP tool schema shape.
        for tool in TOOL_DEFINITIONS:
            assert "name" in tool
            assert "description" in tool
            assert "inputSchema" in tool
            assert tool["inputSchema"]["type"] == "object"

    def test_get_tool_schema_exists(self) -> None:
        schema = get_tool_schema("run_pipeline")
        assert schema is not None
        assert schema["name"] == "run_pipeline"

    def test_get_tool_schema_missing(self) -> None:
        assert get_tool_schema("nonexistent") is None

    def test_list_tool_names(self) -> None:
        names = list_tool_names()
        assert "run_pipeline" in names
        assert "get_pipeline_status" in names
        assert "search_literature" in names

    def test_run_pipeline_requires_topic(self) -> None:
        schema = get_tool_schema("run_pipeline")
        assert schema is not None
        assert "topic" in schema["inputSchema"]["required"]

    def test_get_paper_has_format_enum(self) -> None:
        # The "format" property constrains output formats via a JSON-schema enum.
        schema = get_tool_schema("get_paper")
        assert schema is not None
        props = schema["inputSchema"]["properties"]
        assert "format" in props
        assert "enum" in props["format"]


# ══════════════════════════════════════════════════════════════════
# MCP Server tests
# ══════════════════════════════════════════════════════════════════


class TestMCPServer:
    def test_get_tools(self) -> None:
        server = ResearchClawMCPServer()
        tools = server.get_tools()
        assert len(tools) >= 6
        names = [t["name"] for t in tools]
        assert "run_pipeline" in names

    def test_handle_unknown_tool(self) -> None:
        # Unknown tool names yield a structured failure, not an exception.
        server = ResearchClawMCPServer()
        result = asyncio.run(server.handle_tool_call("nonexistent", {}))
        assert result["success"] is False
        assert "Unknown tool" in result["error"]

    def test_handle_run_pipeline(self) -> None:
        server = ResearchClawMCPServer()
        result = asyncio.run(server.handle_tool_call("run_pipeline", {"topic": "GNN"}))
        assert result["success"] is True
        assert "GNN" in result["message"]

    def test_handle_get_status_missing_run(self) -> None:
        server = ResearchClawMCPServer()
        result = asyncio.run(server.handle_tool_call("get_pipeline_status", {"run_id": "nonexistent"}))
        assert result["success"] is False

    def test_handle_search_literature(self) -> None:
        server = ResearchClawMCPServer()
        result = asyncio.run(server.handle_tool_call("search_literature", {"query": "transformers"}))
        assert result["success"] is True

    def test_handle_review_paper(self) -> None:
        server = ResearchClawMCPServer()
        result = asyncio.run(server.handle_tool_call("review_paper", {"paper_path": "/tmp/paper.md"}))
        assert result["success"] is True

    def test_start_stop(self) -> None:
        # is_running must toggle across start()/stop().
        server = ResearchClawMCPServer()
        assert not server.is_running

        async def _run() -> None:
            await server.start()
            assert server.is_running
            await server.stop()
            assert not server.is_running

        asyncio.run(_run())

    def test_handle_get_results_missing(self) -> None:
        server = ResearchClawMCPServer()
        result = asyncio.run(server.handle_tool_call("get_experiment_results", {"run_id": "missing"}))
        assert result["success"] is False

    def test_handle_get_paper_missing(self) -> None:
        server = ResearchClawMCPServer()
        result = asyncio.run(server.handle_tool_call("get_paper", {"run_id": "missing"}))
        assert result["success"] is False


# ══════════════════════════════════════════════════════════════════
# MCP Client tests
# ══════════════════════════════════════════════════════════════════


class TestMCPClient:
    def test_init(self) -> None:
        client = MCPClient("http://localhost:3000")
        assert client.uri == "http://localhost:3000"
        assert not client.is_connected

    def test_connect_disconnect(self) -> None:
        client = MCPClient("http://localhost:3000")

        async def _run() -> None:
            await client.connect()
            assert client.is_connected
            await client.disconnect()
            assert not client.is_connected

        asyncio.run(_run())

    def test_list_tools_not_connected(self) -> None:
        # All client operations raise ConnectionError before connect().
        client = MCPClient("http://localhost:3000")
        with pytest.raises(ConnectionError):
            asyncio.run(client.list_tools())

    def test_call_tool_not_connected(self) -> None:
        client = MCPClient("http://localhost:3000")
        with pytest.raises(ConnectionError):
            asyncio.run(client.call_tool("test", {}))

    def test_list_resources_not_connected(self) -> None:
        client = MCPClient("http://localhost:3000")
        with pytest.raises(ConnectionError):
            asyncio.run(client.list_resources())

    def test_read_resource_not_connected(self) -> None:
        client = MCPClient("http://localhost:3000")
        with pytest.raises(ConnectionError):
            asyncio.run(client.read_resource("test://resource"))

    def test_list_tools_connected(self) -> None:
        client = MCPClient("http://localhost:3000")

        async def _run() -> list:
            await client.connect()
            return await client.list_tools()

        tools = asyncio.run(_run())
        assert isinstance(tools, list)

    def test_tools_cached(self) -> None:
        # Repeated list_tools() calls return the same cached list object.
        client = MCPClient("http://localhost:3000")

        async def _run() -> tuple:
            await client.connect()
            t1 = await client.list_tools()
            t2 = await client.list_tools()
            return t1, t2

        t1, t2 = asyncio.run(_run())
        assert t1 is t2


# ══════════════════════════════════════════════════════════════════
# MCP Server Registry tests
# ══════════════════════════════════════════════════════════════════


class TestMCPServerRegistry:
    def test_register_and_list(self) -> None:
        async def _run() -> list:
            reg = MCPServerRegistry()
            await reg.register("test", "http://localhost:3000")
            return reg.list_all()

        servers = asyncio.run(_run())
        assert len(servers) == 1
        assert servers[0]["name"] == "test"
        assert servers[0]["connected"] is True

    def test_unregister(self) -> None:
        async def _run() -> int:
            reg = MCPServerRegistry()
            await reg.register("test", "http://localhost:3000")
            await reg.unregister("test")
            return reg.count

        count = asyncio.run(_run())
        assert count == 0

    def test_get(self) -> None:
        # get() returns the live (already-connected) client for a name.
        async def _run() -> MCPClient | None:
            reg = MCPServerRegistry()
            await reg.register("test", "http://localhost:3000")
            return reg.get("test")

        client = asyncio.run(_run())
        assert client is not None
        assert client.is_connected

    def test_get_missing(self) -> None:
        reg = MCPServerRegistry()
        assert reg.get("nonexistent") is None

    def test_close_all(self) -> None:
        async def _run() -> int:
            reg = MCPServerRegistry()
            await reg.register("a", "http://a:3000")
            await reg.register("b", "http://b:3000")
            await reg.close_all()
            return reg.count

        count = asyncio.run(_run())
        assert count == 0


# ══════════════════════════════════════════════════════════════════
# Transport tests
# ══════════════════════════════════════════════════════════════════


class TestSSETransport:
    def test_start_stop(self) -> None:
        transport = SSETransport(port=9999)

        async def _run() -> None:
            await transport.start()
            assert transport._running is True
            await transport.close()
            assert transport._running is False

        asyncio.run(_run())

    def test_receive_not_implemented(self) -> None:
        # receive() is a stub in the SSE transport and must say so loudly.
        transport = SSETransport()
        with pytest.raises(NotImplementedError):
            asyncio.run(transport.receive())


================================================
FILE: tests/test_memory_system.py
================================================
"""Tests for the persistent memory system (40+ tests).
Covers:
- MemoryStore CRUD operations
- Vector embedding generation (mocked)
- Similarity retrieval
- Time decay computation
- Confidence updates
- Persistence (JSONL read/write)
- IdeationMemory, ExperimentMemory, WritingMemory
"""
from __future__ import annotations

import json
import math
from datetime import datetime, timezone, timedelta
from pathlib import Path

import pytest

from researchclaw.memory.store import MemoryEntry, MemoryStore, VALID_CATEGORIES
from researchclaw.memory.decay import time_decay_weight, confidence_update
from researchclaw.memory.embeddings import EmbeddingProvider, _tokenize, _hash_token
from researchclaw.memory.retriever import MemoryRetriever, cosine_similarity
from researchclaw.memory.ideation_memory import IdeationMemory
from researchclaw.memory.experiment_memory import ExperimentMemory
from researchclaw.memory.writing_memory import WritingMemory


# ── Fixtures ─────────────────────────────────────────────────────────


@pytest.fixture
def tmp_store_dir(tmp_path: Path) -> Path:
    # Per-test directory for JSONL persistence files.
    d = tmp_path / "memory_store"
    d.mkdir()
    return d


@pytest.fixture
def store(tmp_store_dir: Path) -> MemoryStore:
    return MemoryStore(tmp_store_dir)


@pytest.fixture
def populated_store(store: MemoryStore) -> MemoryStore:
    # Two ideation, two experiment, one writing entry — counts are asserted
    # by the filter tests below.
    store.add("ideation", "Topic: RL for robotics\nOutcome: success", {"run_id": "r1"})
    store.add("ideation", "Topic: Meta-learning\nOutcome: failure", {"run_id": "r2"})
    store.add("experiment", "Task: classification\nHP: lr=0.001", {"run_id": "r1"})
    store.add("experiment", "Trick: mixed precision\nImprovement: 5%", {"run_id": "r2"})
    store.add("writing", "Feedback: clarity\nResolution: rewrite", {"run_id": "r1"})
    return store


@pytest.fixture
def embedding_fn() -> object:
    """Simple deterministic embedding for testing."""
    # Maps the first 16 characters to a unit-norm 16-dim vector.
    def _embed(text: str) -> list[float]:
        vec = [0.0] * 16
        for i, ch in enumerate(text[:16]):
            vec[i] = ord(ch) / 256.0
        norm = math.sqrt(sum(v * v for v in vec)) or 1.0
        return [v / norm for v in vec]
    return _embed


# ── MemoryStore CRUD ─────────────────────────────────────────────────


class TestMemoryStoreCRUD:
    def test_add_entry(self, store: MemoryStore) -> None:
        entry_id = store.add("ideation", "test content", {"key": "value"})
        assert entry_id
        assert store.count("ideation") == 1

    def test_add_invalid_category(self, store: MemoryStore) -> None:
        with pytest.raises(ValueError, match="Invalid category"):
            store.add("invalid_cat", "content")

    def test_add_all_categories(self, store: MemoryStore) -> None:
        # VALID_CATEGORIES has exactly three members (ideation/experiment/writing).
        for cat in VALID_CATEGORIES:
            store.add(cat, f"content for {cat}")
        assert store.count() == 3

    def test_get_entry(self, store: MemoryStore) -> None:
        entry_id = store.add("ideation", "findme")
        entry = store.get(entry_id)
        assert entry is not None
        assert entry.content == "findme"
        assert entry.category == "ideation"

    def test_get_nonexistent(self, store: MemoryStore) -> None:
        assert store.get("nonexistent_id") is None

    def test_get_all_no_filter(self, populated_store: MemoryStore) -> None:
        all_entries = populated_store.get_all()
        assert len(all_entries) == 5

    def test_get_all_with_filter(self, populated_store: MemoryStore) -> None:
        ideation = populated_store.get_all("ideation")
        assert len(ideation) == 2

    def test_update_confidence_success(self, store: MemoryStore) -> None:
        entry_id = store.add("ideation", "conf test", confidence=0.5)
        assert store.update_confidence(entry_id, 0.1)
        entry = store.get(entry_id)
        assert entry is not None
        assert abs(entry.confidence - 0.6) < 1e-6

    def test_update_confidence_clamp_high(self, store: MemoryStore) -> None:
        # Confidence is clamped to the [0, 1] interval.
        entry_id = store.add("ideation", "test", confidence=0.95)
        store.update_confidence(entry_id, 0.2)
        entry = store.get(entry_id)
        assert entry is not None
        assert entry.confidence == 1.0

    def test_update_confidence_clamp_low(self, store: MemoryStore) -> None:
        entry_id = store.add("ideation", "test", confidence=0.1)
        store.update_confidence(entry_id, -0.5)
        entry = store.get(entry_id)
        assert entry is not None
        assert entry.confidence == 0.0

    def test_update_confidence_nonexistent(self, store: MemoryStore) -> None:
        assert not store.update_confidence("nope", 0.1)

    def test_mark_accessed(self, store: MemoryStore) -> None:
        entry_id = store.add("ideation", "access test")
        entry = store.get(entry_id)
        assert entry is not None
        assert entry.access_count == 0
        store.mark_accessed(entry_id)
        entry = store.get(entry_id)
        assert entry is not None
        assert entry.access_count == 1

    def test_capacity_enforcement(self, tmp_store_dir: Path) -> None:
        store = MemoryStore(tmp_store_dir, max_entries_per_category=3)
        for i in range(5):
            store.add("ideation", f"entry {i}", confidence=i * 0.2)
        assert store.count("ideation") == 3
        # Should keep highest confidence entries
        entries = store.get_all("ideation")
        confidences = [e.confidence for e in entries]
        assert min(confidences) >= 0.4  # lowest 2 (0.0, 0.2) should be pruned

    def test_count_empty(self, store: MemoryStore) -> None:
        assert store.count() == 0
        assert store.count("ideation") == 0


# ── Persistence ──────────────────────────────────────────────────────


class TestMemoryPersistence:
    def test_save_and_load(self, tmp_store_dir: Path) -> None:
        # load() reports the number of entries restored from JSONL.
        store = MemoryStore(tmp_store_dir)
        store.add("ideation", "persistent content", {"key": "val"})
        store.add("experiment", "exp content")
        store.save()
        store2 = MemoryStore(tmp_store_dir)
        loaded = store2.load()
        assert loaded == 2
        assert store2.count() == 2

    def test_save_creates_directory(self, tmp_path: Path) -> None:
        new_dir = tmp_path / "new" / "nested" / "dir"
        store = MemoryStore(new_dir)
        store.add("ideation", "test")
        store.save()
        assert (new_dir / "ideation.jsonl").exists()

    def test_load_empty_dir(self, tmp_store_dir: Path) -> None:
        store = MemoryStore(tmp_store_dir)
        assert store.load() == 0

    def test_load_malformed_jsonl(self, tmp_store_dir: Path) -> None:
        # A bad JSONL line is skipped; valid lines still load.
        (tmp_store_dir / "ideation.jsonl").write_text(
            '{"id": "a", "category": "ideation"}\nnot json\n',
            encoding="utf-8",
        )
        store = MemoryStore(tmp_store_dir)
        loaded = store.load()
        assert loaded == 1  # only valid entry loaded

    def test_roundtrip_preserves_data(self, tmp_store_dir: Path) -> None:
        store = MemoryStore(tmp_store_dir)
        entry_id = store.add(
            "experiment",
            "test content",
            metadata={"key": "value"},
            embedding=[0.1, 0.2, 0.3],
            confidence=0.7,
        )
        store.save()
        store2 = MemoryStore(tmp_store_dir)
        store2.load()
        entry = store2.get(entry_id)
        assert entry is not None
        assert entry.content == "test content"
        assert entry.metadata == {"key": "value"}
        assert entry.embedding == [0.1, 0.2, 0.3]
        assert abs(entry.confidence - 0.7) < 1e-6


# ── Prune ────────────────────────────────────────────────────────────


class TestMemoryPrune:
    def test_prune_low_confidence(self, store: MemoryStore) -> None:
        store.add("ideation", "low conf", confidence=0.1)
        store.add("ideation", "high conf", confidence=0.8)
        removed = store.prune(confidence_threshold=0.5)
        assert removed == 1
        assert store.count("ideation") == 1

    def test_prune_nothing_to_remove(self, store: MemoryStore) -> None:
        store.add("ideation", "good", confidence=0.9)
        removed = store.prune()
        assert removed == 0


# ── MemoryEntry ──────────────────────────────────────────────────────


class TestMemoryEntry:
    def test_to_dict(self) -> None:
        entry = MemoryEntry(
            id="abc",
            category="ideation",
            content="test",
            metadata={},
            embedding=[],
            confidence=0.5,
            created_at="2024-01-01T00:00:00+00:00",
            last_accessed="2024-01-01T00:00:00+00:00",
            access_count=0,
        )
        d = entry.to_dict()
        assert d["id"] == "abc"
        assert d["category"] == "ideation"

    def test_from_dict(self) -> None:
        data = {
            "id": "xyz",
            "category": "experiment",
            "content": "hp test",
            "metadata": {"run": "1"},
            "embedding": [0.1],
            "confidence": 0.6,
            "created_at": "2024-06-01T00:00:00+00:00",
            "last_accessed": "2024-06-01T00:00:00+00:00",
            "access_count": 3,
        }
        entry = MemoryEntry.from_dict(data)
        assert entry.id == "xyz"
        assert entry.access_count == 3

    def test_from_dict_defaults(self) -> None:
        # An empty dict falls back to neutral defaults (confidence 0.5).
        entry = MemoryEntry.from_dict({})
        assert entry.id == ""
        assert entry.confidence == 0.5
        assert entry.access_count == 0


# ── Time Decay
───────────────────────────────────────────────────────


class TestTimeDecay:
    def test_fresh_entry(self) -> None:
        # A just-created entry has (near) full weight 1.0.
        now = datetime.now(timezone.utc)
        w = time_decay_weight(now, half_life_days=90.0, now=now)
        assert abs(w - 1.0) < 1e-6

    def test_half_life(self) -> None:
        # At exactly one half-life, the weight is ~0.5.
        now = datetime.now(timezone.utc)
        half = now - timedelta(days=90)
        w = time_decay_weight(half, half_life_days=90.0, now=now)
        assert abs(w - 0.5) < 0.01

    def test_expired(self) -> None:
        # Beyond max_age_days, the weight hard-cuts to 0.
        now = datetime.now(timezone.utc)
        old = now - timedelta(days=400)
        w = time_decay_weight(old, half_life_days=90.0, max_age_days=365.0, now=now)
        assert w == 0.0

    def test_future_timestamp(self) -> None:
        # Future timestamps are not penalized.
        now = datetime.now(timezone.utc)
        future = now + timedelta(days=10)
        w = time_decay_weight(future, now=now)
        assert w == 1.0

    def test_naive_datetime(self) -> None:
        # Naive (tz-less) datetimes are accepted and yield a positive weight.
        now = datetime.now(timezone.utc)
        naive = now.replace(tzinfo=None)
        w = time_decay_weight(naive, now=now)
        assert w > 0.0


class TestConfidenceUpdate:
    def test_increase(self) -> None:
        assert confidence_update(0.5, 0.1) == 0.6

    def test_decrease(self) -> None:
        assert confidence_update(0.5, -0.2) == pytest.approx(0.3)

    def test_clamp_ceiling(self) -> None:
        assert confidence_update(0.95, 0.2) == 1.0

    def test_clamp_floor(self) -> None:
        assert confidence_update(0.1, -0.5) == 0.0


# ── Embeddings ───────────────────────────────────────────────────────


class TestEmbeddings:
    def test_tfidf_fallback(self) -> None:
        provider = EmbeddingProvider()
        vec = provider.embed("hello world test")
        assert len(vec) > 0
        assert isinstance(vec[0], float)

    def test_tfidf_normalized(self) -> None:
        # Embeddings are L2-normalized.
        provider = EmbeddingProvider()
        vec = provider.embed("deep learning neural network")
        norm = math.sqrt(sum(v * v for v in vec))
        assert abs(norm - 1.0) < 0.01

    def test_tfidf_empty(self) -> None:
        provider = EmbeddingProvider()
        # Force TF-IDF backend to test zero-vector behavior
        provider._backend = "tfidf"
        provider._dim = 256
        vec = provider.embed("")
        assert all(v == 0.0 for v in vec)

    def test_tokenize(self) -> None:
        tokens = _tokenize("Hello, World! 123")
        assert "hello" in tokens
        assert "world" in tokens
        assert "123" in tokens

    def test_hash_token_deterministic(self) -> None:
        a = _hash_token("test", 256)
        b = _hash_token("test", 256)
        assert a == b

    def test_embed_batch(self) -> None:
        provider = EmbeddingProvider()
        vecs = provider.embed_batch(["hello", "world"])
        assert len(vecs) == 2

    def test_backend_detection(self) -> None:
        # Backend falls back along api -> sentence_transformers -> tfidf.
        provider = EmbeddingProvider()
        backend = provider.backend
        assert backend in ("api", "sentence_transformers", "tfidf")


# ── Retriever ────────────────────────────────────────────────────────


class TestRetriever:
    def test_cosine_similarity_identical(self) -> None:
        vec = [1.0, 0.0, 0.0]
        assert abs(cosine_similarity(vec, vec) - 1.0) < 1e-6

    def test_cosine_similarity_orthogonal(self) -> None:
        a = [1.0, 0.0]
        b = [0.0, 1.0]
        assert abs(cosine_similarity(a, b)) < 1e-6

    def test_cosine_similarity_opposite(self) -> None:
        a = [1.0, 0.0]
        b = [-1.0, 0.0]
        assert abs(cosine_similarity(a, b) + 1.0) < 1e-6

    def test_cosine_similarity_empty(self) -> None:
        # Degenerate inputs return 0.0 rather than raising.
        assert cosine_similarity([], []) == 0.0

    def test_cosine_similarity_mismatched_length(self) -> None:
        assert cosine_similarity([1.0], [1.0, 2.0]) == 0.0

    def test_recall_empty_store(self, store: MemoryStore) -> None:
        retriever = MemoryRetriever(store)
        results = retriever.recall([0.1, 0.2], category="ideation")
        assert results == []

    def test_recall_returns_results(self, store: MemoryStore) -> None:
        # Nearest-by-cosine entry is returned first.
        store.add("ideation", "RL research", embedding=[1.0, 0.0, 0.0])
        store.add("ideation", "NLP research", embedding=[0.0, 1.0, 0.0])
        retriever = MemoryRetriever(store)
        results = retriever.recall([0.9, 0.1, 0.0], category="ideation", top_k=1)
        assert len(results) == 1
        assert "RL" in results[0][0].content

    def test_recall_respects_top_k(self, store: MemoryStore) -> None:
        for i in range(10):
            store.add("ideation", f"entry {i}", embedding=[float(i)] * 3)
        retriever = MemoryRetriever(store)
        results = retriever.recall([5.0, 5.0, 5.0], top_k=3)
        assert len(results) == 3

    def test_format_for_prompt(self, store: MemoryStore) -> None:
        store.add("ideation", "Topic: RL", embedding=[1.0])
        retriever = MemoryRetriever(store)
        results = retriever.recall([1.0])
        text = retriever.format_for_prompt(results)
        assert "ideation" in text

    def test_format_for_prompt_empty(self, store: MemoryStore) -> None:
        retriever = MemoryRetriever(store)
        text = retriever.format_for_prompt([])
        assert text == ""


# ── Ideation Memory ──────────────────────────────────────────────────


class TestIdeationMemory:
    def test_record_topic_success(self, store: MemoryStore, embedding_fn: object) -> None:
        retriever = MemoryRetriever(store)
        im = IdeationMemory(store, retriever, embed_fn=embedding_fn)
        entry_id = im.record_topic_outcome("RL for robotics", "success", 8.0)
        assert entry_id
        assert store.count("ideation") == 1

    def test_record_topic_failure(self, store: MemoryStore) -> None:
        retriever = MemoryRetriever(store)
        im = IdeationMemory(store, retriever)
        im.record_topic_outcome("Bad topic", "failure", 2.0, run_id="r1")
        entries = store.get_all("ideation")
        assert entries[0].metadata["outcome"] == "failure"

    def test_record_hypothesis(self, store: MemoryStore) -> None:
        retriever = MemoryRetriever(store)
        im = IdeationMemory(store, retriever)
        im.record_hypothesis("H1: X is better than Y", True, "Validated")
        assert store.count("ideation") == 1

    def test_get_anti_patterns(self, store: MemoryStore) -> None:
        # Only failure outcomes surface as anti-patterns.
        retriever = MemoryRetriever(store)
        im = IdeationMemory(store, retriever)
        im.record_topic_outcome("Bad direction", "failure", 1.0)
        im.record_topic_outcome("Good direction", "success", 9.0)
        patterns = im.get_anti_patterns()
        assert len(patterns) == 1
        assert "Bad" in patterns[0]

    def test_recall_similar_topics_empty(self, store: MemoryStore) -> None:
        retriever = MemoryRetriever(store)
        im = IdeationMemory(store, retriever)
        result = im.recall_similar_topics("test query")
        assert result == ""


# ── Experiment Memory ────────────────────────────────────────────────


class TestExperimentMemory:
    def test_record_hyperparams(self, store: MemoryStore) -> None:
        retriever = MemoryRetriever(store)
        em = ExperimentMemory(store, retriever)
        em.record_hyperparams("image_cls", {"lr": 0.001, "bs": 32}, 0.95)
        assert store.count("experiment") == 1

    def test_record_architecture(self, store: MemoryStore) -> None:
        retriever = MemoryRetriever(store)
        em = ExperimentMemory(store, retriever)
        em.record_architecture("image_cls", "ResNet-18", 0.96)
        entry = store.get_all("experiment")[0]
        assert "ResNet" in entry.content

    def test_record_training_trick(self, store: MemoryStore) -> None:
        retriever = MemoryRetriever(store)
        em = ExperimentMemory(store, retriever)
        em.record_training_trick("CosineAnnealing", 0.03, "CIFAR-10 training")
        entry = store.get_all("experiment")[0]
        assert "CosineAnnealing" in entry.content

    def test_recall_best_configs_empty(self, store: MemoryStore) -> None:
        retriever = MemoryRetriever(store)
        em = ExperimentMemory(store, retriever)
        result = em.recall_best_configs("anything")
        assert result == ""


# ── Writing Memory ───────────────────────────────────────────────────


class TestWritingMemory:
    def test_record_review_feedback(self, store: MemoryStore) -> None:
        retriever = MemoryRetriever(store)
        wm = WritingMemory(store, retriever)
        wm.record_review_feedback("clarity", "Section 3 is unclear", "Rewrote S3")
        assert store.count("writing") == 1

    def test_record_successful_structure(self, store: MemoryStore) -> None:
        retriever = MemoryRetriever(store)
        wm = WritingMemory(store, retriever)
        wm.record_successful_structure("intro", "Problem-Gap-Contribution", 8.5)
        entry = store.get_all("writing")[0]
        assert entry.metadata["section"] == "intro"

    def test_recall_writing_tips_empty(self, store: MemoryStore) -> None:
        retriever = MemoryRetriever(store)
        wm = WritingMemory(store, retriever)
        result = wm.recall_writing_tips("method", "RL paper")
        assert result == ""


================================================
FILE: tests/test_metaclaw_bridge/__init__.py
================================================


================================================
FILE: tests/test_metaclaw_bridge/test_config.py
================================================
"""Tests for MetaClaw bridge configuration parsing."""
from researchclaw.config import RCConfig


def _minimal_config_data(**overrides):
    """Return minimal valid config data with metaclaw_bridge overrides."""
    # Smallest dict that RCConfig.from_dict accepts; callers layer their
    # metaclaw_bridge section on top via **overrides.
    base = {
        "project": {"name": "test", "mode": "full-auto"},
        "research": {"topic": "test topic", "domains": ["ml"]},
        "runtime": {"timezone": "UTC"},
        "notifications": {"channel": "console"},
        "knowledge_base": {"backend": "markdown", "root": "docs/kb"},
        "llm": {
            "provider": "openai-compatible",
            "base_url": "http://localhost:8080",
            "api_key_env": "TEST_KEY",
            "api_key": "sk-test",
            "primary_model": "gpt-4o",
        },
    }
    base.update(overrides)
    return base


def test_metaclaw_bridge_defaults():
    """MetaClaw bridge should have sensible defaults when not configured."""
    data = _minimal_config_data()
    cfg = RCConfig.from_dict(data, check_paths=False)
    assert cfg.metaclaw_bridge.enabled is False
    assert cfg.metaclaw_bridge.proxy_url == "http://localhost:30000"
    assert cfg.metaclaw_bridge.prm.enabled is False
    assert cfg.metaclaw_bridge.lesson_to_skill.enabled is True


def test_metaclaw_bridge_enabled():
    """MetaClaw bridge config should be parsed when provided."""
    data = _minimal_config_data(
        metaclaw_bridge={
            "enabled": True,
            "proxy_url": "http://localhost:31000",
            "skills_dir": "/tmp/skills",
            "prm": {
                "enabled": True,
                "api_base": "http://localhost:8080",
                "api_key": "test-key",
                "model": "gpt-5.4",
                "votes": 5,
                "gate_stages": [5, 20],
            },
            "lesson_to_skill": {
                "enabled": True,
                "min_severity": "warning",
                "max_skills_per_run": 5,
            },
        }
    )
    cfg = RCConfig.from_dict(data, check_paths=False)
    assert cfg.metaclaw_bridge.enabled is True
    assert cfg.metaclaw_bridge.proxy_url == "http://localhost:31000"
    assert cfg.metaclaw_bridge.prm.enabled is True
    assert cfg.metaclaw_bridge.prm.votes == 5
    # gate_stages list is normalized to a tuple on parse.
    assert cfg.metaclaw_bridge.prm.gate_stages == (5, 20)
    assert cfg.metaclaw_bridge.lesson_to_skill.min_severity == "warning"
    assert cfg.metaclaw_bridge.lesson_to_skill.max_skills_per_run == 5


def test_metaclaw_bridge_none_is_default():
    """When metaclaw_bridge is None/missing, defaults should apply."""
    data = _minimal_config_data(metaclaw_bridge=None)
    cfg = RCConfig.from_dict(data, check_paths=False)
    assert cfg.metaclaw_bridge.enabled is False


================================================
FILE: tests/test_metaclaw_bridge/test_lesson_to_skill.py
================================================
"""Tests for lesson-to-skill conversion module."""
import json
import tempfile
from pathlib import Path

from researchclaw.metaclaw_bridge.lesson_to_skill import (
    _format_lessons,
    _list_existing_skill_names,
    _parse_skills_response,
    _write_skill,
)
from researchclaw.evolution import LessonEntry


def _make_lesson(stage: str = "experiment_run", severity: str = "error") -> LessonEntry:
    # Canned LessonEntry used across the formatting tests.
    return LessonEntry(
        stage_name=stage,
        stage_num=12,
        category="experiment",
        severity=severity,
        description="Metric NaN detected in loss computation",
        timestamp="2026-03-15T00:00:00+00:00",
        run_id="test-001",
    )


def test_format_lessons():
    lessons = [_make_lesson(), _make_lesson("code_generation")]
    text = _format_lessons(lessons)
    assert "experiment_run" in text
    assert "code_generation" in text
    assert "NaN" in text


def test_list_existing_skills(tmp_path):
    # Only directories count as skills; loose files are ignored.
    (tmp_path / "skill-a").mkdir()
    (tmp_path / "skill-b").mkdir()
    (tmp_path / "not-a-skill.txt").write_text("x")
    names = _list_existing_skill_names(tmp_path)
    assert "skill-a" in names
    assert "skill-b" in names
    assert "not-a-skill.txt" not in names


def test_list_existing_skills_missing_dir():
    names = _list_existing_skill_names(Path("/nonexistent/dir"))
    assert names == []


def test_parse_skills_response_valid():
    response = json.dumps([
        {
            "name": "arc-fix-nan",
            "description": "Prevent NaN in loss",
            "category": "coding",
            "content": "# Fix NaN\n1. Check inputs\n2. Use grad clipping",
        }
    ])
    parsed = _parse_skills_response(response)
    assert len(parsed) == 1
    assert parsed[0]["name"] == "arc-fix-nan"


def test_parse_skills_response_with_code_fence():
    # Responses wrapped in ```json fences are unwrapped before parsing.
    response = "```json\n" + json.dumps([
        {
            "name": "arc-test",
            "description": "test",
            "category": "coding",
            "content": "test content",
        }
    ]) + "\n```"
    parsed = _parse_skills_response(response)
    assert len(parsed) == 1


def test_parse_skills_response_invalid():
    # Non-JSON and empty lists both yield an empty result.
    assert _parse_skills_response("not json") == []
    assert _parse_skills_response("[]") == []


def test_write_skill(tmp_path):
    skill = {
        "name": "arc-test-skill",
        "description": "A test skill",
        "category": "coding",
        "content": "# Test\n1. Do something",
    }
    path = _write_skill(tmp_path, skill)
    assert path is not None
    assert path.exists()
    content = path.read_text()
    assert "name: arc-test-skill" in content
    assert "category: coding" in content
    assert "# Test" in content


================================================
FILE: tests/test_metaclaw_bridge/test_prm_gate.py
================================================
"""Tests for PRM quality gate module."""
from unittest.mock import patch, MagicMock

from researchclaw.metaclaw_bridge.prm_gate import (
    ResearchPRMGate,
    _GATE_INSTRUCTIONS,
)


def test_gate_instructions_cover_expected_stages():
    """PRM gate instructions should cover key gate stages."""
    assert 5 in _GATE_INSTRUCTIONS
    assert 9 in _GATE_INSTRUCTIONS
    assert 15 in _GATE_INSTRUCTIONS
    assert 20 in _GATE_INSTRUCTIONS


def test_should_gate():
    # Only the instruction-covered stages are gated.
    gate = ResearchPRMGate(
        api_base="http://test",
        api_key="test",
    )
    assert gate.should_gate(5) is True
    assert gate.should_gate(9) is True
    assert gate.should_gate(15) is True
    assert gate.should_gate(20) is True
    assert gate.should_gate(1) is False
    assert gate.should_gate(10) is False


def test_from_bridge_config_disabled():
    """Should return None when PRM is not enabled."""
    config = MagicMock()
    config.enabled = False
    assert ResearchPRMGate.from_bridge_config(config) is None


def test_from_bridge_config_enabled():
"""Should create a gate when properly configured.""" config = MagicMock() config.enabled = True config.api_base = "http://test" config.api_key = "test-key" config.api_key_env = "" config.model = "gpt-5.4" config.votes = 3 config.temperature = 0.6 gate = ResearchPRMGate.from_bridge_config(config) assert gate is not None assert gate.api_base == "http://test" assert gate.votes == 3 @patch("researchclaw.metaclaw_bridge.prm_gate._single_judge_call") def test_evaluate_stage_majority_pass(mock_call): """Should return 1.0 when majority votes pass.""" mock_call.side_effect = [1.0, 1.0, -1.0] gate = ResearchPRMGate( api_base="http://test", api_key="test", votes=3, ) score = gate.evaluate_stage(20, "This is a good paper.") assert score == 1.0 @patch("researchclaw.metaclaw_bridge.prm_gate._single_judge_call") def test_evaluate_stage_majority_fail(mock_call): """Should return -1.0 when majority votes fail.""" mock_call.side_effect = [-1.0, -1.0, 1.0] gate = ResearchPRMGate( api_base="http://test", api_key="test", votes=3, ) score = gate.evaluate_stage(20, "This paper has critical issues.") assert score == -1.0 @patch("researchclaw.metaclaw_bridge.prm_gate._single_judge_call") def test_evaluate_stage_all_failed(mock_call): """Should return 0.0 when all judge calls fail.""" mock_call.side_effect = [None, None, None] gate = ResearchPRMGate( api_base="http://test", api_key="test", votes=3, ) score = gate.evaluate_stage(20, "test") assert score == 0.0 ================================================ FILE: tests/test_metaclaw_bridge/test_session.py ================================================ """Tests for MetaClaw session management module.""" from researchclaw.metaclaw_bridge.session import MetaClawSession def test_session_creation(): session = MetaClawSession("test-run-001") assert session.session_id == "arc-test-run-001" assert session.is_active is True def test_session_headers(): session = MetaClawSession("run-123") headers = session.get_headers("hypothesis_gen") assert 
headers["X-Session-Id"] == "arc-run-123" assert headers["X-Turn-Type"] == "main" assert headers["X-AutoRC-Stage"] == "hypothesis_gen" def test_session_headers_no_stage(): session = MetaClawSession("run-123") headers = session.get_headers() assert "X-AutoRC-Stage" not in headers def test_session_end(): session = MetaClawSession("run-456") end_headers = session.end() assert end_headers["X-Session-Done"] == "true" assert end_headers["X-Session-Id"] == "arc-run-456" assert session.is_active is False ================================================ FILE: tests/test_metaclaw_bridge/test_skill_feedback.py ================================================ """Tests for skill feedback tracking module.""" from pathlib import Path from researchclaw.metaclaw_bridge.skill_feedback import ( SkillEffectivenessRecord, SkillFeedbackStore, record_stage_skills, ) def test_append_and_load(tmp_path): store = SkillFeedbackStore(tmp_path / "feedback.jsonl") rec = SkillEffectivenessRecord( skill_name="hypothesis-formulation", stage_name="hypothesis_gen", run_id="test-001", stage_success=True, timestamp="2026-03-15T00:00:00+00:00", ) store.append(rec) loaded = store.load_all() assert len(loaded) == 1 assert loaded[0].skill_name == "hypothesis-formulation" assert loaded[0].stage_success is True def test_append_many(tmp_path): store = SkillFeedbackStore(tmp_path / "feedback.jsonl") records = [ SkillEffectivenessRecord("skill-a", "stage-1", "run-1", True, "2026-01-01"), SkillEffectivenessRecord("skill-b", "stage-2", "run-1", False, "2026-01-01"), ] store.append_many(records) assert len(store.load_all()) == 2 def test_compute_stats(tmp_path): store = SkillFeedbackStore(tmp_path / "feedback.jsonl") records = [ SkillEffectivenessRecord("skill-a", "s1", "r1", True, "t1"), SkillEffectivenessRecord("skill-a", "s2", "r1", False, "t1"), SkillEffectivenessRecord("skill-a", "s3", "r2", True, "t2"), SkillEffectivenessRecord("skill-b", "s1", "r1", False, "t1"), ] store.append_many(records) stats = 
store.compute_skill_stats() assert stats["skill-a"]["total"] == 3 assert stats["skill-a"]["successes"] == 2 assert abs(stats["skill-a"]["success_rate"] - 2 / 3) < 0.01 assert stats["skill-b"]["total"] == 1 assert stats["skill-b"]["success_rate"] == 0.0 def test_record_stage_skills(tmp_path): store = SkillFeedbackStore(tmp_path / "feedback.jsonl") record_stage_skills( store, stage_name="hypothesis_gen", run_id="test-002", stage_success=True, active_skills=["hypothesis-formulation", "research-gap-identification"], ) loaded = store.load_all() assert len(loaded) == 2 names = {r.skill_name for r in loaded} assert names == {"hypothesis-formulation", "research-gap-identification"} def test_empty_store(tmp_path): store = SkillFeedbackStore(tmp_path / "nonexistent.jsonl") assert store.load_all() == [] assert store.compute_skill_stats() == {} ================================================ FILE: tests/test_metaclaw_bridge/test_stage_skill_map.py ================================================ """Tests for stage-skill mapping module.""" from researchclaw.metaclaw_bridge.stage_skill_map import ( STAGE_SKILL_MAP, LESSON_CATEGORY_TO_SKILL_CATEGORY, get_stage_config, ) def test_all_23_stages_mapped(): """All 23 pipeline stages should have a mapping entry.""" expected_stages = [ "topic_init", "problem_decompose", "search_strategy", "literature_collect", "literature_screen", "knowledge_extract", "synthesis", "hypothesis_gen", "experiment_design", "code_generation", "resource_planning", "experiment_run", "iterative_refine", "result_analysis", "research_decision", "paper_outline", "paper_draft", "peer_review", "paper_revision", "quality_gate", "knowledge_archive", "export_publish", "citation_verify", ] for stage in expected_stages: assert stage in STAGE_SKILL_MAP, f"Missing mapping for {stage}" def test_stage_config_has_required_keys(): """Each stage config should have task_type, skills, and top_k.""" for stage_name, config in STAGE_SKILL_MAP.items(): assert "task_type" in config, 
f"{stage_name} missing task_type" assert "skills" in config, f"{stage_name} missing skills" assert "top_k" in config, f"{stage_name} missing top_k" assert isinstance(config["skills"], list) assert isinstance(config["top_k"], int) assert config["top_k"] > 0 def test_get_stage_config_known(): cfg = get_stage_config("hypothesis_gen") assert cfg["task_type"] == "research" assert "hypothesis-formulation" in cfg["skills"] def test_get_stage_config_unknown_returns_default(): cfg = get_stage_config("nonexistent_stage") assert cfg["task_type"] == "research" assert cfg["top_k"] == 4 def test_lesson_category_mapping_complete(): """All lesson categories should map to a skill category.""" expected = ["system", "experiment", "writing", "analysis", "literature", "pipeline"] for cat in expected: assert cat in LESSON_CATEGORY_TO_SKILL_CATEGORY ================================================ FILE: tests/test_metric_parser.py ================================================ """Tests for the universal metric parser.""" from __future__ import annotations import json import math import pytest from pathlib import Path from researchclaw.experiment.metrics import ( ExperimentResults, MetricType, UniversalMetricParser, ) @pytest.fixture def parser(): return UniversalMetricParser() @pytest.fixture def tmp_run_dir(tmp_path): return tmp_path # --------------------------------------------------------------------------- # JSON parsing tests # --------------------------------------------------------------------------- class TestJSONParsing: def test_parse_comparison_results(self, parser, tmp_run_dir): data = { "experiment_type": "comparison", "conditions": { "proposed_method": { "seed_42": {"accuracy": 0.95, "f1": 0.93}, "seed_123": {"accuracy": 0.94, "f1": 0.92}, }, "baseline": { "seed_42": {"accuracy": 0.88, "f1": 0.85}, }, }, "metadata": { "domain": "ml_vision", "total_runtime_sec": 120.5, }, } (tmp_run_dir / "results.json").write_text(json.dumps(data)) result = parser.parse(tmp_run_dir) 
assert result.source == "json" assert result.experiment_type == "comparison" assert result.domain == "ml_vision" assert "proposed_method" in result.conditions flat = result.to_flat_metrics() assert "proposed_method/accuracy" in flat def test_parse_convergence_results(self, parser, tmp_run_dir): data = { "experiment_type": "convergence", "convergence": { "euler": [ {"h": 0.1, "error": 0.05}, {"h": 0.05, "error": 0.012}, {"h": 0.025, "error": 0.003}, ], "rk4": [ {"h": 0.1, "error": 0.001}, {"h": 0.05, "error": 6.25e-5}, {"h": 0.025, "error": 3.9e-6}, ], }, } (tmp_run_dir / "results.json").write_text(json.dumps(data)) result = parser.parse(tmp_run_dir) assert result.source == "json" assert "euler" in result.convergence assert len(result.convergence["euler"]) == 3 flat = result.to_flat_metrics() assert "euler/error" in flat # last point def test_parse_regression_table(self, parser, tmp_run_dir): data = { "experiment_type": "progressive_spec", "regression_table": { "spec_1_ols": {"coeff": 0.15, "se": 0.03, "p": 0.001, "n": 5000, "r2": 0.12}, "spec_2_fe": {"coeff": 0.11, "se": 0.02, "p": 0.001, "n": 5000, "r2": 0.35}, }, } (tmp_run_dir / "results.json").write_text(json.dumps(data)) result = parser.parse(tmp_run_dir) assert result.source == "json" assert "spec_1_ols" in result.regression_table flat = result.to_flat_metrics() assert "spec_1_ols/coeff" in flat assert flat["spec_1_ols/coeff"] == 0.15 def test_parse_top_level_scalars(self, parser, tmp_run_dir): data = {"accuracy": 0.95, "loss": 0.32} (tmp_run_dir / "results.json").write_text(json.dumps(data)) result = parser.parse(tmp_run_dir) assert result.scalars["accuracy"] == 0.95 assert result.scalars["loss"] == 0.32 def test_skip_nan_inf(self, parser, tmp_run_dir): data = { "conditions": { "method": { "seed_1": {"accuracy": float("nan"), "f1": 0.9}, }, }, } (tmp_run_dir / "results.json").write_text(json.dumps(data)) result = parser.parse(tmp_run_dir) flat = result.to_flat_metrics() # NaN should be excluded for k, v in 
flat.items(): assert math.isfinite(v), f"Non-finite value: {k}={v}" def test_invalid_json_falls_through(self, parser, tmp_run_dir): (tmp_run_dir / "results.json").write_text("not valid json{{{") result = parser.parse(tmp_run_dir, stdout="metric_a: 0.5") # Should fallback to stdout assert result.source == "stdout" # --------------------------------------------------------------------------- # CSV parsing tests # --------------------------------------------------------------------------- class TestCSVParsing: def test_parse_condition_csv(self, parser, tmp_run_dir): csv_content = "condition,seed,metric,value\nmethod_a,42,accuracy,0.95\nmethod_b,42,accuracy,0.88\n" (tmp_run_dir / "results.csv").write_text(csv_content) result = parser.parse(tmp_run_dir) assert result.source == "csv" assert "method_a/accuracy" in result.scalars assert result.scalars["method_a/accuracy"] == 0.95 def test_parse_convergence_csv(self, parser, tmp_run_dir): csv_content = "method,h,error\neuler,0.1,0.05\neuler,0.05,0.012\nrk4,0.1,0.001\n" (tmp_run_dir / "results.csv").write_text(csv_content) result = parser.parse(tmp_run_dir) assert result.source == "csv" assert "euler" in result.convergence assert len(result.convergence["euler"]) == 2 def test_csv_skip_invalid(self, parser, tmp_run_dir): csv_content = "condition,metric,value\nmethod,accuracy,not_a_number\n" (tmp_run_dir / "results.csv").write_text(csv_content) result = parser.parse(tmp_run_dir) assert result.source == "csv" assert len(result.scalars) == 0 # --------------------------------------------------------------------------- # stdout fallback tests # --------------------------------------------------------------------------- class TestStdoutParsing: def test_parse_plain_metrics(self, parser, tmp_run_dir): result = parser.parse(tmp_run_dir, stdout="accuracy: 0.95\nloss: 0.32\n") assert result.source == "stdout" assert result.scalars["accuracy"] == 0.95 assert result.scalars["loss"] == 0.32 def test_parse_condition_metrics(self, parser, 
tmp_run_dir): stdout = "condition=method_a accuracy: 0.95\ncondition=method_b accuracy: 0.88\n" result = parser.parse(tmp_run_dir, stdout=stdout) assert result.source == "stdout" assert "method_a/accuracy" in result.scalars def test_fallback_to_stdout_log(self, parser, tmp_run_dir): (tmp_run_dir / "stdout.log").write_text("metric_x: 1.5\n") result = parser.parse(tmp_run_dir) assert result.source == "stdout" assert result.scalars.get("metric_x") == 1.5 # --------------------------------------------------------------------------- # ExperimentResults tests # --------------------------------------------------------------------------- class TestExperimentResults: def test_to_flat_metrics_empty(self): result = ExperimentResults() assert result.to_flat_metrics() == {} def test_to_flat_metrics_scalars(self): result = ExperimentResults(scalars={"a": 1.0, "b": 2.0}) flat = result.to_flat_metrics() assert flat["a"] == 1.0 assert flat["b"] == 2.0 def test_to_flat_metrics_conditions(self): result = ExperimentResults( conditions={ "method": {"seed_1": {"acc": 0.9}, "seed_2": {"acc": 0.91}}, } ) flat = result.to_flat_metrics() assert "method/acc" in flat def test_to_flat_metrics_convergence(self): result = ExperimentResults( convergence={ "euler": [ {"h": 0.1, "error": 0.05}, {"h": 0.05, "error": 0.01}, ], } ) flat = result.to_flat_metrics() assert "euler/error" in flat assert flat["euler/error"] == 0.01 # last point def test_to_flat_metrics_regression(self): result = ExperimentResults( regression_table={ "ols": {"coeff": 0.5, "se": 0.1}, } ) flat = result.to_flat_metrics() assert flat["ols/coeff"] == 0.5 # --------------------------------------------------------------------------- # Priority tests (JSON > CSV > stdout) # --------------------------------------------------------------------------- class TestParsePriority: def test_json_takes_priority_over_csv(self, parser, tmp_run_dir): (tmp_run_dir / "results.json").write_text('{"from_json": 1.0}') (tmp_run_dir / 
"results.csv").write_text("condition,metric,value\ncsv,m,2.0\n") result = parser.parse(tmp_run_dir) assert result.source == "json" def test_csv_takes_priority_over_stdout(self, parser, tmp_run_dir): (tmp_run_dir / "results.csv").write_text("condition,metric,value\ncsv,m,2.0\n") result = parser.parse(tmp_run_dir, stdout="stdout_metric: 3.0") assert result.source == "csv" def test_empty_json_falls_to_csv(self, parser, tmp_run_dir): (tmp_run_dir / "results.json").write_text("{}") (tmp_run_dir / "results.csv").write_text("condition,metric,value\ncsv,m,2.0\n") result = parser.parse(tmp_run_dir) assert result.source == "csv" # --------------------------------------------------------------------------- # MetricType enum tests # --------------------------------------------------------------------------- class TestMetricType: def test_values(self): assert MetricType.SCALAR.value == "scalar" assert MetricType.TABLE.value == "table" assert MetricType.CONVERGENCE.value == "convergence" assert MetricType.STRUCTURED.value == "structured" ================================================ FILE: tests/test_minimax_provider.py ================================================ """Tests for MiniMax provider integration. Covers: provider preset, CLI registration, factory wiring, temperature clamping, and live API integration. 
""" from __future__ import annotations import json import os import urllib.request from types import SimpleNamespace from typing import Any, Mapping import pytest from researchclaw.llm import PROVIDER_PRESETS, create_llm_client from researchclaw.llm.client import LLMClient, LLMConfig, LLMResponse # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- class _DummyHTTPResponse: """Minimal stub for ``urllib.request.urlopen`` results.""" def __init__(self, payload: Mapping[str, Any]): self._payload = payload def read(self) -> bytes: return json.dumps(self._payload).encode("utf-8") def __enter__(self) -> _DummyHTTPResponse: return self def __exit__(self, *a: object) -> None: return None def _make_minimax_client( *, api_key: str = "test-minimax-key", primary_model: str = "MiniMax-M2.5", fallback_models: list[str] | None = None, ) -> LLMClient: config = LLMConfig( base_url="https://api.minimax.io/v1", api_key=api_key, primary_model=primary_model, fallback_models=fallback_models or ["MiniMax-M2.5-highspeed"], ) return LLMClient(config) # --------------------------------------------------------------------------- # Unit tests — provider preset # --------------------------------------------------------------------------- class TestMiniMaxPreset: """Verify MiniMax is registered in PROVIDER_PRESETS.""" def test_minimax_in_provider_presets(self): assert "minimax" in PROVIDER_PRESETS def test_minimax_base_url(self): assert PROVIDER_PRESETS["minimax"]["base_url"] == "https://api.minimax.io/v1" # --------------------------------------------------------------------------- # Unit tests — from_rc_config wiring # --------------------------------------------------------------------------- class TestMiniMaxFromRCConfig: """Verify that LLMClient.from_rc_config resolves MiniMax preset.""" def test_from_rc_config_sets_minimax_base_url(self): rc_config = SimpleNamespace( 
llm=SimpleNamespace( provider="minimax", base_url="", api_key="mk-test", api_key_env="", primary_model="MiniMax-M2.5", fallback_models=("MiniMax-M2.5-highspeed",), ), ) client = LLMClient.from_rc_config(rc_config) assert client.config.base_url == "https://api.minimax.io/v1" assert client.config.api_key == "mk-test" assert client.config.primary_model == "MiniMax-M2.5" assert client.config.fallback_models == ["MiniMax-M2.5-highspeed"] def test_from_rc_config_reads_minimax_api_key_from_env(self, monkeypatch): monkeypatch.setenv("MINIMAX_API_KEY", "env-minimax-key") rc_config = SimpleNamespace( llm=SimpleNamespace( provider="minimax", base_url="", api_key="", api_key_env="MINIMAX_API_KEY", primary_model="MiniMax-M2.5", fallback_models=(), ), ) client = LLMClient.from_rc_config(rc_config) assert client.config.api_key == "env-minimax-key" def test_from_rc_config_custom_base_url_overrides_preset(self): rc_config = SimpleNamespace( llm=SimpleNamespace( provider="minimax", base_url="https://custom-proxy.example/v1", api_key="mk-test", api_key_env="", primary_model="MiniMax-M2.5", fallback_models=(), ), ) client = LLMClient.from_rc_config(rc_config) assert client.config.base_url == "https://custom-proxy.example/v1" # --------------------------------------------------------------------------- # Unit tests — temperature clamping # --------------------------------------------------------------------------- class TestMiniMaxTemperatureClamping: """MiniMax API requires temperature in [0, 1.0].""" def _capture_body( self, monkeypatch: pytest.MonkeyPatch, client: LLMClient, temperature: float, ) -> dict[str, Any]: captured: dict[str, Any] = {} def fake_urlopen(req: urllib.request.Request, timeout: int) -> _DummyHTTPResponse: captured["body"] = json.loads(req.data.decode("utf-8")) return _DummyHTTPResponse( {"choices": [{"message": {"content": "ok"}, "finish_reason": "stop"}]} ) monkeypatch.setattr(urllib.request, "urlopen", fake_urlopen) client._raw_call( "MiniMax-M2.5", [{"role": 
"user", "content": "hi"}], 1024, temperature, False, ) return captured["body"] def test_temperature_above_one_clamped(self, monkeypatch): client = _make_minimax_client() body = self._capture_body(monkeypatch, client, 1.5) assert body["temperature"] == 1.0 def test_temperature_within_range_unchanged(self, monkeypatch): client = _make_minimax_client() body = self._capture_body(monkeypatch, client, 0.7) assert body["temperature"] == 0.7 def test_temperature_zero_allowed(self, monkeypatch): client = _make_minimax_client() body = self._capture_body(monkeypatch, client, 0.0) assert body["temperature"] == 0.0 def test_temperature_negative_clamped_to_zero(self, monkeypatch): client = _make_minimax_client() body = self._capture_body(monkeypatch, client, -0.1) assert body["temperature"] == 0.0 def test_non_minimax_url_no_clamping(self, monkeypatch): """Non-MiniMax URLs should not clamp temperature.""" config = LLMConfig( base_url="https://api.openai.com/v1", api_key="test-key", primary_model="gpt-4o", ) client = LLMClient(config) captured: dict[str, Any] = {} def fake_urlopen(req: urllib.request.Request, timeout: int) -> _DummyHTTPResponse: captured["body"] = json.loads(req.data.decode("utf-8")) return _DummyHTTPResponse( {"choices": [{"message": {"content": "ok"}, "finish_reason": "stop"}]} ) monkeypatch.setattr(urllib.request, "urlopen", fake_urlopen) client._raw_call("gpt-4o", [{"role": "user", "content": "hi"}], 1024, 1.5, False) assert captured["body"]["temperature"] == 1.5 # no clamping # --------------------------------------------------------------------------- # Unit tests — model chain # --------------------------------------------------------------------------- class TestMiniMaxModelChain: """Model fallback chain for MiniMax.""" def test_model_chain_default(self): client = _make_minimax_client() assert client._model_chain == ["MiniMax-M2.5", "MiniMax-M2.5-highspeed"] def test_model_chain_custom_fallbacks(self): client = _make_minimax_client( 
primary_model="MiniMax-M2.7", fallback_models=["MiniMax-M2.5", "MiniMax-M2.5-highspeed"], ) assert client._model_chain == [ "MiniMax-M2.7", "MiniMax-M2.5", "MiniMax-M2.5-highspeed", ] # --------------------------------------------------------------------------- # Unit tests — raw call body structure # --------------------------------------------------------------------------- class TestMiniMaxRawCall: """Verify request body sent to MiniMax API.""" def test_request_body_structure(self, monkeypatch): client = _make_minimax_client() captured: dict[str, Any] = {} def fake_urlopen(req: urllib.request.Request, timeout: int) -> _DummyHTTPResponse: captured["url"] = req.full_url captured["body"] = json.loads(req.data.decode("utf-8")) captured["headers"] = {k.lower(): v for k, v in req.headers.items()} return _DummyHTTPResponse( { "model": "MiniMax-M2.5", "choices": [{"message": {"content": "pong"}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 5, "completion_tokens": 1, "total_tokens": 6}, } ) monkeypatch.setattr(urllib.request, "urlopen", fake_urlopen) resp = client._raw_call( "MiniMax-M2.5", [{"role": "user", "content": "ping"}], 1024, 0.5, False, ) assert captured["url"] == "https://api.minimax.io/v1/chat/completions" assert captured["body"]["model"] == "MiniMax-M2.5" assert captured["body"]["temperature"] == 0.5 assert captured["headers"]["authorization"] == "Bearer test-minimax-key" assert resp.content == "pong" assert resp.model == "MiniMax-M2.5" def test_json_mode_adds_response_format(self, monkeypatch): client = _make_minimax_client() captured: dict[str, Any] = {} def fake_urlopen(req: urllib.request.Request, timeout: int) -> _DummyHTTPResponse: captured["body"] = json.loads(req.data.decode("utf-8")) return _DummyHTTPResponse( {"choices": [{"message": {"content": "{}"}, "finish_reason": "stop"}]} ) monkeypatch.setattr(urllib.request, "urlopen", fake_urlopen) client._raw_call( "MiniMax-M2.5", [{"role": "user", "content": "json"}], 1024, 0.5, True, ) assert 
captured["body"]["response_format"] == {"type": "json_object"} # --------------------------------------------------------------------------- # Unit tests — CLI provider registration # --------------------------------------------------------------------------- class TestMiniMaxCLI: """Verify MiniMax is in the CLI interactive provider menu.""" def test_minimax_in_provider_choices(self): from researchclaw.cli import _PROVIDER_CHOICES found = any(v[0] == "minimax" for v in _PROVIDER_CHOICES.values()) assert found, "minimax not found in _PROVIDER_CHOICES" def test_minimax_in_provider_urls(self): from researchclaw.cli import _PROVIDER_URLS assert _PROVIDER_URLS["minimax"] == "https://api.minimax.io/v1" def test_minimax_in_provider_models(self): from researchclaw.cli import _PROVIDER_MODELS primary, fallbacks = _PROVIDER_MODELS["minimax"] assert primary == "MiniMax-M2.5" assert "MiniMax-M2.5-highspeed" in fallbacks # --------------------------------------------------------------------------- # Unit tests — factory function # --------------------------------------------------------------------------- class TestMiniMaxFactory: """Verify create_llm_client dispatches correctly for MiniMax.""" def test_create_llm_client_returns_llm_client(self): from researchclaw.config import LlmConfig, RCConfig rc_config = SimpleNamespace( llm=SimpleNamespace( provider="minimax", base_url="", api_key="mk-factory-test", api_key_env="", primary_model="MiniMax-M2.5", fallback_models=(), ), ) client = create_llm_client(rc_config) assert isinstance(client, LLMClient) assert client.config.base_url == "https://api.minimax.io/v1" assert client._anthropic is None # Not anthropic # --------------------------------------------------------------------------- # Unit tests — chat fallback with MiniMax models # --------------------------------------------------------------------------- class TestMiniMaxChatFallback: """Verify fallback works with MiniMax models.""" def 
test_fallback_to_highspeed_on_primary_failure(self, monkeypatch): client = _make_minimax_client() calls: list[str] = [] def fake_call_with_retry( self, model: str, messages: list[dict[str, str]], max_tokens: int, temperature: float, json_mode: bool, ) -> LLMResponse: calls.append(model) if model == "MiniMax-M2.5": raise RuntimeError("rate limited") return LLMResponse(content="ok", model=model) monkeypatch.setattr(LLMClient, "_call_with_retry", fake_call_with_retry) resp = client.chat([{"role": "user", "content": "test"}]) assert calls == ["MiniMax-M2.5", "MiniMax-M2.5-highspeed"] assert resp.model == "MiniMax-M2.5-highspeed" # --------------------------------------------------------------------------- # Integration tests — live MiniMax API (skipped without key) # --------------------------------------------------------------------------- @pytest.mark.skipif( not os.environ.get("MINIMAX_API_KEY"), reason="MINIMAX_API_KEY not set", ) class TestMiniMaxLiveAPI: """Integration tests against the real MiniMax API.""" def _live_client(self) -> LLMClient: return LLMClient( LLMConfig( base_url="https://api.minimax.io/v1", api_key=os.environ["MINIMAX_API_KEY"], primary_model="MiniMax-M2.5", fallback_models=["MiniMax-M2.5-highspeed"], max_tokens=64, timeout_sec=60, ) ) def test_simple_chat_completion(self): client = self._live_client() resp = client.chat( [{"role": "user", "content": "Say 'hello' and nothing else."}], max_tokens=16, temperature=0.1, ) assert resp.content.strip(), "empty response" assert "hello" in resp.content.lower() def test_json_mode(self): client = self._live_client() resp = client.chat( [ {"role": "system", "content": "You are a helpful assistant that responds in JSON."}, {"role": "user", "content": 'Return a JSON object with key "status" set to "ok".'}, ], max_tokens=128, temperature=0.1, json_mode=True, strip_thinking=True, ) # MiniMax M2.5 may wrap JSON in markdown code fences import re text = resp.content.strip() fence_match = 
re.search(r"```(?:json)?\s*\n(.*?)```", text, re.DOTALL) if fence_match: text = fence_match.group(1).strip() parsed = json.loads(text) assert "status" in parsed def test_preflight_check(self): client = self._live_client() ok, msg = client.preflight() assert ok, f"preflight failed: {msg}" ================================================ FILE: tests/test_neuroscience_domain.py ================================================ """Tests for computational neuroscience domain support. Covers profile loading, keyword detection, adapter dispatch, and prompt block generation for neuroscience_computational and neuroscience_imaging domains. """ from __future__ import annotations import pytest from researchclaw.domains.detector import ( DomainProfile, detect_domain, detect_domain_id, get_profile, _keyword_detect, _profile_cache, ) from researchclaw.domains.prompt_adapter import ( MLPromptAdapter, PromptBlocks, get_adapter, ) # --------------------------------------------------------------------------- # Profile loading # --------------------------------------------------------------------------- class TestNeuroscienceProfiles: def setup_method(self): _profile_cache.clear() def test_computational_profile_exists(self): profile = get_profile("neuroscience_computational") assert profile is not None assert profile.domain_id == "neuroscience_computational" assert profile.display_name == "Computational Neuroscience" def test_computational_profile_fields(self): profile = get_profile("neuroscience_computational") assert profile is not None assert profile.experiment_paradigm == "simulation" assert "brian2" in profile.core_libraries assert "numpy" in profile.core_libraries assert profile.gpu_required is False def test_computational_profile_baselines(self): profile = get_profile("neuroscience_computational") assert profile is not None assert len(profile.standard_baselines) >= 2 assert any("LIF" in b or "Integrate-and-Fire" in b for b in profile.standard_baselines) def 
test_imaging_profile_exists(self): profile = get_profile("neuroscience_imaging") assert profile is not None assert profile.domain_id == "neuroscience_imaging" assert profile.display_name == "Brain Imaging Analysis" def test_imaging_profile_fields(self): profile = get_profile("neuroscience_imaging") assert profile is not None assert profile.experiment_paradigm == "comparison" assert "nilearn" in profile.core_libraries assert "mne" in profile.core_libraries # --------------------------------------------------------------------------- # Keyword detection # --------------------------------------------------------------------------- class TestNeuroscienceKeywordDetection: def test_spiking_network(self): assert _keyword_detect("spiking neural model of cortical columns") == "neuroscience_computational" def test_brian2(self): assert _keyword_detect("network model implemented in brian2") == "neuroscience_computational" def test_hodgkin_huxley(self): assert _keyword_detect("Hodgkin-Huxley neuron model") == "neuroscience_computational" def test_integrate_and_fire(self): assert _keyword_detect("leaky integrate-and-fire model") == "neuroscience_computational" def test_izhikevich(self): assert _keyword_detect("Izhikevich neuron dynamics") == "neuroscience_computational" def test_neural_decoding(self): assert _keyword_detect("neural decoding of population coding in cortex") == "neuroscience_computational" def test_firing_rate(self): assert _keyword_detect("firing rate analysis of cortical neurons") == "neuroscience_computational" def test_fmri(self): assert _keyword_detect("fmri resting state analysis") == "neuroscience_imaging" def test_eeg(self): assert _keyword_detect("EEG classification for BCI") == "neuroscience_imaging" def test_nilearn(self): assert _keyword_detect("brain parcellation with nilearn") == "neuroscience_imaging" def test_mne_python(self): assert _keyword_detect("ERP analysis using mne-python") == "neuroscience_imaging" def test_generic_neuroscience(self): 
result = _keyword_detect("neuroscience of learning and memory") assert result == "neuroscience_computational" def test_detect_domain_integration(self): profile = detect_domain("brian2 spiking neural model of cortical microcircuits") assert profile.domain_id == "neuroscience_computational" def test_detect_domain_id_shortcut(self): domain_id = detect_domain_id("brian2 leaky integrate-and-fire cortical model") assert domain_id == "neuroscience_computational" # --------------------------------------------------------------------------- # Adapter dispatch # --------------------------------------------------------------------------- class TestNeuroscienceAdapter: def test_computational_gets_neuroscience_adapter(self): profile = get_profile("neuroscience_computational") if profile is None: pytest.skip("neuroscience_computational profile not found") adapter = get_adapter(profile) assert not isinstance(adapter, MLPromptAdapter) from researchclaw.domains.adapters.neuroscience import ( NeurosciencePromptAdapter, ) assert isinstance(adapter, NeurosciencePromptAdapter) def test_imaging_gets_neuroscience_adapter(self): profile = get_profile("neuroscience_imaging") if profile is None: pytest.skip("neuroscience_imaging profile not found") adapter = get_adapter(profile) assert not isinstance(adapter, MLPromptAdapter) def test_code_generation_blocks_nonempty(self): profile = get_profile("neuroscience_computational") if profile is None: pytest.skip("neuroscience_computational profile not found") adapter = get_adapter(profile) blocks = adapter.get_code_generation_blocks({}) assert blocks.code_generation_hints assert blocks.dataset_guidance assert blocks.output_format_guidance def test_experiment_design_blocks(self): profile = get_profile("neuroscience_computational") if profile is None: pytest.skip("neuroscience_computational profile not found") adapter = get_adapter(profile) blocks = adapter.get_experiment_design_blocks({}) assert "neuroscience" in 
blocks.experiment_design_context.lower() or \ "Computational Neuroscience" in blocks.experiment_design_context assert blocks.statistical_test_guidance def test_result_analysis_blocks(self): profile = get_profile("neuroscience_computational") if profile is None: pytest.skip("neuroscience_computational profile not found") adapter = get_adapter(profile) blocks = adapter.get_result_analysis_blocks({}) assert "firing rate" in blocks.result_analysis_hints.lower() def test_blueprint_context(self): profile = get_profile("neuroscience_computational") if profile is None: pytest.skip("neuroscience_computational profile not found") adapter = get_adapter(profile) ctx = adapter.get_blueprint_context() # Should include file structure and libraries from the profile if profile.typical_file_structure: assert "network.py" in ctx or "neuron.py" in ctx if profile.core_libraries: assert "brian2" in ctx or "numpy" in ctx ================================================ FILE: tests/test_opencode_bridge.py ================================================ """Tests for OpenCode Beast Mode bridge.""" from __future__ import annotations import json import subprocess from pathlib import Path from unittest.mock import MagicMock, patch import pytest from researchclaw.config import OpenCodeConfig, _parse_opencode_config from researchclaw.pipeline.opencode_bridge import ( ComplexityScore, OpenCodeBridge, OpenCodeResult, count_historical_failures, score_complexity, ) # ============================================================ # TestComplexityScorer # ============================================================ class TestComplexityScorer: """Tests for complexity scoring logic.""" def test_low_complexity_simple_classification(self): plan = ( "Train a ResNet-18 on CIFAR-10 with SGD optimizer.\n" "Report test accuracy as the primary metric.\n" "condition_0: baseline (lr=0.1)\n" "condition_1: ablation (lr=0.01)\n" ) result = score_complexity(plan, topic="Image classification on CIFAR-10") assert 
result.score < 0.4 assert result.recommendation == "code_agent" def test_high_complexity_multimodal_gan(self): plan = ( "Implement a vision-language GAN with the following components:\n" "- Encoder: ViT-based image encoder\n" "- Decoder: Transformer text decoder\n" "- Generator: produces synthetic image-text pairs\n" "- Discriminator: classifies real vs fake\n" "- Critic: provides auxiliary reward signal\n" "Multiple files needed: model.py, trainer.py, dataset.py\n" "condition_0: baseline\n" "condition_1: ablation without critic\n" "condition_2: ablation without encoder pretraining\n" "condition_3: ablation with reduced generator\n" "Custom loss function and custom layer for cross-modal attention.\n" ) result = score_complexity( plan, topic="Multi-modal GAN for vision-language synthesis" ) assert result.score > 0.6 assert result.recommendation == "beast_mode" def test_historical_failures_boost_score(self): plan = ( "Train a simple model with encoder and decoder.\n" "condition_0: baseline\n" ) score_without = score_complexity(plan, topic="test", historical_failures=0) score_with = score_complexity(plan, topic="test", historical_failures=3) assert score_with.score > score_without.score assert score_with.signals["historical_failure"] > 0 def test_empty_plan_returns_zero(self): result = score_complexity("", topic="") assert result.score == 0.0 assert result.recommendation == "legacy" assert result.reason == "Empty plan" def test_threshold_boundary(self): """A plan scoring exactly at threshold should recommend beast_mode.""" plan = ( "Multi-modal diffusion model with encoder, decoder, discriminator.\n" "Custom loss, custom layer, wrapper pattern.\n" "model.py, trainer.py needed.\n" ) # Use a low threshold to ensure it triggers result = score_complexity(plan, topic="Diffusion model", threshold=0.2) assert result.recommendation == "beast_mode" # Use a very high threshold to ensure it doesn't trigger result2 = score_complexity(plan, topic="Diffusion model", threshold=0.99) 
assert result2.recommendation == "code_agent" def test_signals_all_present(self): result = score_complexity("some plan", topic="some topic") expected_keys = { "component_count", "file_count_hint", "domain_complexity", "condition_count", "historical_failure", "dependency_depth", } assert set(result.signals.keys()) == expected_keys def test_score_clamped_to_unit_interval(self): """Score should never exceed 1.0 even with extreme inputs.""" plan = " ".join( ["encoder decoder discriminator generator critic actor teacher student"] * 10 + ["model.py trainer.py dataset.py multiple files modular"] * 10 + ["multi-modal distributed GAN diffusion NeRF MoE meta-learning"] * 10 + ["condition_1 condition_2 condition_3 ablation_4 variant_5 baseline"] * 10 + ["custom layer custom loss wrapper registry hook callback"] * 10 ) result = score_complexity(plan, topic="everything", historical_failures=100) assert 0.0 <= result.score <= 1.0 def test_domain_complexity_keywords(self): plan = "Implement a physics-informed neural network (PINN) with neural ODE solver." 
result = score_complexity(plan, topic="PINN for fluid dynamics") assert result.signals["domain_complexity"] > 0 # ============================================================ # TestOpenCodeBridge # ============================================================ class TestOpenCodeBridge: """Tests for the OpenCode bridge class.""" def test_check_available_returns_false_when_not_installed(self): with patch( "researchclaw.pipeline.opencode_bridge.shutil.which", return_value=None, ): assert OpenCodeBridge.check_available() is False def test_check_available_returns_false_on_timeout(self): with patch( "researchclaw.pipeline.opencode_bridge.shutil.which", return_value=r"C:\Users\tester\AppData\Roaming\npm\opencode.cmd", ), patch( "researchclaw.pipeline.opencode_bridge.subprocess.run", side_effect=subprocess.TimeoutExpired(cmd="opencode", timeout=15), ): assert OpenCodeBridge.check_available() is False def test_check_available_returns_true(self): mock_result = MagicMock() mock_result.returncode = 0 with patch( "researchclaw.pipeline.opencode_bridge.shutil.which", return_value=r"C:\Users\tester\AppData\Roaming\npm\opencode.cmd", ), patch( "researchclaw.pipeline.opencode_bridge.subprocess.run", return_value=mock_result, ) as run_mock: assert OpenCodeBridge.check_available() is True assert run_mock.call_args.args[0][0].endswith("opencode.cmd") def test_workspace_creates_correct_files(self, tmp_path): bridge = OpenCodeBridge( model="gpt-5.2", llm_base_url="https://example.com", api_key_env="TEST_KEY", ) ws = bridge._prepare_workspace( stage_dir=tmp_path, topic="Test topic", exp_plan="plan: test", metric="accuracy", pkg_hint="torch available", extra_guidance="Be careful", time_budget_sec=300, ) assert (ws / "EXPERIMENT_PLAN.yaml").exists() assert (ws / "GUIDANCE.md").exists() assert (ws / "opencode.json").exists() guidance = (ws / "GUIDANCE.md").read_text() assert "Test topic" in guidance assert "accuracy" in guidance def test_opencode_config_azure_format(self, tmp_path): bridge = 
OpenCodeBridge( model="gpt-5.2", llm_base_url="https://huaxi.openai.azure.com/openai/v1", api_key_env="AZURE_OPENAI_API_KEY", llm_provider="azure", ) ws = bridge._prepare_workspace( stage_dir=tmp_path, topic="t", exp_plan="p", metric="m", pkg_hint="", extra_guidance="", time_budget_sec=300, ) cfg = json.loads((ws / "opencode.json").read_text()) # Azure now uses the unified "openai" provider (Bearer token auth # works on Azure endpoints and Responses API is supported) assert cfg["model"] == "openai/gpt-5.2" assert "provider" in cfg assert "openai" in cfg["provider"] assert cfg["provider"]["openai"]["options"]["baseURL"] == "https://huaxi.openai.azure.com/openai/v1" assert "{env:AZURE_OPENAI_API_KEY}" in cfg["provider"]["openai"]["options"]["apiKey"] def test_opencode_config_openai_format(self, tmp_path): bridge = OpenCodeBridge( model="gpt-4o", llm_base_url="https://api.openai.com/v1", api_key_env="OPENAI_API_KEY", ) ws = bridge._prepare_workspace( stage_dir=tmp_path, topic="t", exp_plan="p", metric="m", pkg_hint="", extra_guidance="", time_budget_sec=300, ) cfg = json.loads((ws / "opencode.json").read_text()) assert cfg["model"] == "openai/gpt-4o" assert "openai" in cfg["provider"] def test_opencode_config_preserves_prefixed_model(self, tmp_path): """Model with '/' prefix (e.g. anthropic/...) 
should NOT get double-prefixed (BUG-C fix).""" bridge = OpenCodeBridge( model="anthropic/claude-sonnet-4-6", llm_base_url="https://huaxi.openai.azure.com/openai/v1", api_key_env="AZURE_API_KEY", llm_provider="azure", ) ws = bridge._prepare_workspace( stage_dir=tmp_path, topic="t", exp_plan="p", metric="m", pkg_hint="", extra_guidance="", time_budget_sec=300, ) cfg = json.loads((ws / "opencode.json").read_text()) # Should be "anthropic/claude-sonnet-4-6", NOT "azure/anthropic/claude-sonnet-4-6" assert cfg["model"] == "anthropic/claude-sonnet-4-6" def test_resolve_model_azure_uses_openai_prefix(self): """Azure endpoint → uses openai/ prefix (Azure supports Responses API now).""" bridge = OpenCodeBridge( model="gpt-5.2", llm_base_url="https://huaxi.openai.azure.com/openai/v1", llm_provider="azure", ) resolved = bridge._resolve_opencode_model() assert resolved == "openai/gpt-5.2" def test_resolve_model_preserves_explicit_prefix(self): """Model with '/' prefix should be used as-is regardless of provider.""" bridge = OpenCodeBridge( model="anthropic/claude-sonnet-4-6", llm_base_url="https://huaxi.openai.azure.com/openai/v1", llm_provider="azure", ) resolved = bridge._resolve_opencode_model() assert resolved == "anthropic/claude-sonnet-4-6" def test_resolve_model_no_model_default(self): """Empty model string → default Anthropic model.""" bridge = OpenCodeBridge() assert bridge._resolve_opencode_model() == "anthropic/claude-sonnet-4-6" def test_collect_files_ignores_pycache(self, tmp_path): (tmp_path / "main.py").write_text("print('hello')") pycache = tmp_path / "__pycache__" pycache.mkdir() (pycache / "main.cpython-311.pyc").write_text("bytecode") # Also write a .py in pycache to test filtering (pycache / "cached.py").write_text("cached") files = OpenCodeBridge._collect_files(tmp_path) assert "main.py" in files assert not any("__pycache__" in k for k in files) def test_collect_files_includes_requirements(self, tmp_path): (tmp_path / "main.py").write_text("import torch") 
(tmp_path / "requirements.txt").write_text("torch>=2.0") files = OpenCodeBridge._collect_files(tmp_path) assert "requirements.txt" in files assert "main.py" in files def test_collect_files_flattens_subdirectories(self, tmp_path): """Files in subdirs should be flattened to basenames (BUG-D fix).""" src = tmp_path / "src" src.mkdir() (src / "model.py").write_text("class Model: pass") (src / "utils.py").write_text("def helper(): pass") (tmp_path / "main.py").write_text("from model import Model") files = OpenCodeBridge._collect_files(tmp_path) # Keys should be flat basenames, not paths like "src/model.py" assert "model.py" in files assert "utils.py" in files assert "main.py" in files assert not any("/" in k for k in files) def test_collect_files_root_takes_priority_over_subdir(self, tmp_path): """Root-level file wins when basename collides with subdir file.""" (tmp_path / "main.py").write_text("root version") sub = tmp_path / "src" sub.mkdir() (sub / "main.py").write_text("subdir version") files = OpenCodeBridge._collect_files(tmp_path) assert files["main.py"] == "root version" def test_generate_returns_error_on_not_installed(self, tmp_path): bridge = OpenCodeBridge() with patch.object(OpenCodeBridge, "check_available", return_value=False): result = bridge.generate( stage_dir=tmp_path, topic="test", exp_plan="plan", metric="acc", ) assert not result.success assert "not installed" in result.error def test_generate_returns_error_on_cli_failure(self, tmp_path): bridge = OpenCodeBridge(max_retries=0, workspace_cleanup=True) with patch.object(OpenCodeBridge, "check_available", return_value=True), \ patch.object( bridge, "_invoke_opencode", return_value=(False, "CLI error", 1.5), ): result = bridge.generate( stage_dir=tmp_path, topic="test", exp_plan="plan", metric="acc", ) assert not result.success assert "failed" in result.error.lower() def test_generate_success(self, tmp_path): bridge = OpenCodeBridge(max_retries=0, workspace_cleanup=False) def fake_invoke(workspace, 
prompt): # Write main.py into the workspace to simulate OpenCode output (workspace / "main.py").write_text("print('acc: 0.95')") (workspace / "requirements.txt").write_text("torch") return True, "success", 5.0 with patch.object(OpenCodeBridge, "check_available", return_value=True), \ patch.object(bridge, "_invoke_opencode", side_effect=fake_invoke): result = bridge.generate( stage_dir=tmp_path, topic="test", exp_plan="plan", metric="acc", ) assert result.success assert "main.py" in result.files assert result.elapsed_sec == 5.0 def test_invoke_opencode_uses_resolved_path(self, tmp_path): bridge = OpenCodeBridge(model="gpt-5.2", timeout_sec=10) mock_result = MagicMock() mock_result.returncode = 0 mock_result.stdout = "{}" mock_result.stderr = "" with patch( "researchclaw.pipeline.opencode_bridge.shutil.which", return_value=r"C:\Users\tester\AppData\Roaming\npm\opencode.cmd", ), patch( "researchclaw.pipeline.opencode_bridge.subprocess.run", return_value=mock_result, ) as run_mock: success, _log, _elapsed = bridge._invoke_opencode(tmp_path, "test prompt") assert success is True assert run_mock.call_args.args[0][0].endswith("opencode.cmd") # ============================================================ # TestEnsureMainEntryPoint (BUG-R52-01) # ============================================================ class TestHasMainGuard: """Tests for _has_main_guard static method.""" def test_with_guard(self): code = 'def main():\n pass\n\nif __name__ == "__main__":\n main()\n' assert OpenCodeBridge._has_main_guard(code) is True def test_without_guard(self): code = "def main():\n pass\n" assert OpenCodeBridge._has_main_guard(code) is False def test_syntax_error(self): assert OpenCodeBridge._has_main_guard("def broken(") is False def test_empty(self): assert OpenCodeBridge._has_main_guard("") is False def test_single_quote_guard(self): code = "if __name__ == '__main__':\n print('hi')\n" assert OpenCodeBridge._has_main_guard(code) is True class TestEnsureMainEntryPoint: """Tests for 
_ensure_main_entry_point — BUG-R52-01 fix.""" def test_already_has_guard_unchanged(self): files = { "main.py": 'def run():\n pass\n\nif __name__ == "__main__":\n run()\n', "utils.py": "def helper(): pass\n", } result = OpenCodeBridge._ensure_main_entry_point(files) assert result is files # Same object, unchanged def test_no_main_py_unchanged(self): files = {"utils.py": "def helper(): pass\n"} result = OpenCodeBridge._ensure_main_entry_point(files) assert result is files def test_swap_entry_point_from_other_file(self): """When main.py is library-only and another file has __main__, swap.""" lib_code = "class Model:\n pass\n\ndef train(model):\n pass\n" entry_code = ( 'from main import Model, train\n\n' 'if __name__ == "__main__":\n' ' m = Model()\n' ' train(m)\n' ) files = { "main.py": lib_code, "run_experiment.py": entry_code, } result = OpenCodeBridge._ensure_main_entry_point(files) # main.py should now contain the entry point code assert '__main__' in result["main.py"] # The old main.py content should be in run_experiment.py assert result["run_experiment.py"] == lib_code def test_inject_entry_for_main_function(self): """When main.py defines main() but no guard, inject one.""" code = "import torch\n\ndef main():\n print('training')\n" files = {"main.py": code} result = OpenCodeBridge._ensure_main_entry_point(files) assert '__main__' in result["main.py"] assert "main()" in result["main.py"] def test_inject_entry_for_run_function(self): """Should also detect run(), train(), etc.""" code = "def run_experiment():\n print('running')\n" files = {"main.py": code} result = OpenCodeBridge._ensure_main_entry_point(files) assert '__main__' in result["main.py"] assert "run_experiment()" in result["main.py"] def test_no_known_entry_function_warns(self): """When no known entry function exists, return unchanged with warning.""" code = "class Config:\n x = 1\n\nclass Trainer:\n pass\n" files = {"main.py": code} result = OpenCodeBridge._ensure_main_entry_point(files) # Should 
return unchanged since no entry function found assert result["main.py"] == code def test_non_py_files_not_checked(self): """requirements.txt and setup.py should not be checked for __main__.""" lib_code = "class Model:\n pass\n" files = { "main.py": lib_code, "requirements.txt": "torch>=2.0\n", "setup.py": "# setup\n", } result = OpenCodeBridge._ensure_main_entry_point(files) # No swap should occur — only .py files are checked assert result["main.py"] == lib_code def test_swap_preserves_other_files(self): """Swapping should not lose any files from the dict.""" files = { "main.py": "class Lib: pass\n", "run.py": 'if __name__ == "__main__":\n print("go")\n', "utils.py": "def helper(): pass\n", "requirements.txt": "numpy\n", } result = OpenCodeBridge._ensure_main_entry_point(files) assert len(result) == len(files) assert "utils.py" in result assert "requirements.txt" in result # ============================================================ # TestOpenCodeConfig # ============================================================ class TestOpenCodeConfig: """Tests for OpenCodeConfig dataclass and parser.""" def test_default_values(self): cfg = OpenCodeConfig() assert cfg.enabled is True assert cfg.auto is True assert cfg.complexity_threshold == 0.2 assert cfg.model == "" assert cfg.timeout_sec == 600 assert cfg.max_retries == 1 assert cfg.workspace_cleanup is True def test_parse_from_dict(self): data = { "enabled": True, "auto": True, "complexity_threshold": 0.5, "model": "gpt-5.2", "timeout_sec": 900, "max_retries": 2, "workspace_cleanup": False, } cfg = _parse_opencode_config(data) assert cfg.enabled is True assert cfg.auto is True assert cfg.complexity_threshold == 0.5 assert cfg.model == "gpt-5.2" assert cfg.timeout_sec == 900 assert cfg.max_retries == 2 assert cfg.workspace_cleanup is False def test_empty_dict_returns_default(self): cfg = _parse_opencode_config({}) assert cfg == OpenCodeConfig() # ============================================================ # 
TestCountHistoricalFailures # ============================================================ class TestCountHistoricalFailures: def test_no_failures(self, tmp_path): assert count_historical_failures(tmp_path) == 0 def test_counts_beast_mode_failures(self, tmp_path): d = tmp_path / "stage-10_001" d.mkdir() (d / "beast_mode_log.json").write_text(json.dumps({"success": False})) assert count_historical_failures(tmp_path) >= 1 def test_counts_validation_failures(self, tmp_path): d = tmp_path / "stage-10_002" d.mkdir() (d / "validation_report.md").write_text("**Status**: FAILED after 5 repairs") assert count_historical_failures(tmp_path) >= 1 def test_deduplicates_multiple_failure_indicators(self, tmp_path): """Same dir with beast_mode_log + stage_health + validation_report = 1 failure (BUG-E fix).""" d = tmp_path / "stage-10_003" d.mkdir() (d / "beast_mode_log.json").write_text(json.dumps({"success": False})) (d / "stage_health.json").write_text(json.dumps({"status": "FAILED"})) (d / "validation_report.md").write_text("FAILED after 3 repairs") assert count_historical_failures(tmp_path) == 1 ================================================ FILE: tests/test_overleaf.py ================================================ """Tests for Overleaf sync (C4): Sync engine, Conflict resolver, Watcher, Formatter.""" from __future__ import annotations import textwrap from pathlib import Path from unittest.mock import MagicMock, patch import pytest from researchclaw.overleaf.sync import OverleafSync from researchclaw.overleaf.conflict import ConflictResolver, _extract_conflicts, _resolve_content from researchclaw.overleaf.watcher import FileWatcher from researchclaw.overleaf.formatter import LatexFormatter # ══════════════════════════════════════════════════════════════════ # ConflictResolver tests # ══════════════════════════════════════════════════════════════════ class TestConflictResolver: def test_no_conflicts(self, tmp_path: Path) -> None: (tmp_path / 
"paper.tex").write_text("\\section{Intro}\nHello world\n") resolver = ConflictResolver() assert not resolver.has_conflicts(tmp_path) def test_has_conflicts(self, tmp_path: Path) -> None: content = textwrap.dedent("""\ \\section{Intro} <<<<<<< HEAD Our method is great. ======= Our method is good. >>>>>>> remote """) (tmp_path / "paper.tex").write_text(content) resolver = ConflictResolver() assert resolver.has_conflicts(tmp_path) def test_detect_conflicts(self, tmp_path: Path) -> None: content = textwrap.dedent("""\ <<<<<<< HEAD line A ======= line B >>>>>>> remote """) (tmp_path / "main.tex").write_text(content) resolver = ConflictResolver() conflicts = resolver.detect(tmp_path) assert len(conflicts) == 1 assert conflicts[0]["ours"] == "line A" assert conflicts[0]["theirs"] == "line B" def test_resolve_ours(self, tmp_path: Path) -> None: content = textwrap.dedent("""\ \\section{Intro} <<<<<<< HEAD AI version ======= Human version >>>>>>> remote \\section{End} """) (tmp_path / "paper.tex").write_text(content) resolver = ConflictResolver() resolved = resolver.resolve(tmp_path, strategy="ours") assert len(resolved) == 1 text = (tmp_path / "paper.tex").read_text() assert "AI version" in text assert "Human version" not in text assert "<<<<<<" not in text def test_resolve_theirs(self, tmp_path: Path) -> None: content = textwrap.dedent("""\ <<<<<<< HEAD AI text ======= Human text >>>>>>> remote """) (tmp_path / "paper.tex").write_text(content) resolver = ConflictResolver() resolver.resolve(tmp_path, strategy="theirs") text = (tmp_path / "paper.tex").read_text() assert "Human text" in text assert "AI text" not in text def test_multiple_conflicts(self, tmp_path: Path) -> None: content = textwrap.dedent("""\ <<<<<<< HEAD A1 ======= B1 >>>>>>> remote middle <<<<<<< HEAD A2 ======= B2 >>>>>>> remote """) (tmp_path / "paper.tex").write_text(content) resolver = ConflictResolver() conflicts = resolver.detect(tmp_path) assert len(conflicts) == 2 class TestConflictHelpers: def 
test_extract_conflicts_empty(self) -> None: assert _extract_conflicts("no conflicts here") == [] def test_resolve_content_ours(self) -> None: content = "<<<<<<< HEAD\nours\n=======\ntheirs\n>>>>>>> remote\n" resolved = _resolve_content(content, "ours") assert "ours" in resolved assert "theirs" not in resolved def test_resolve_content_theirs(self) -> None: content = "<<<<<<< HEAD\nours\n=======\ntheirs\n>>>>>>> remote\n" resolved = _resolve_content(content, "theirs") assert "theirs" in resolved assert "ours" not in resolved # ══════════════════════════════════════════════════════════════════ # FileWatcher tests # ══════════════════════════════════════════════════════════════════ class TestFileWatcher: def test_no_changes_initially(self, tmp_path: Path) -> None: (tmp_path / "paper.tex").write_text("content") watcher = FileWatcher(tmp_path) assert watcher.check_changes() == [] def test_detect_new_file(self, tmp_path: Path) -> None: watcher = FileWatcher(tmp_path) (tmp_path / "new.tex").write_text("new content") changes = watcher.check_changes() assert "new.tex" in changes def test_detect_modified_file(self, tmp_path: Path) -> None: f = tmp_path / "paper.tex" f.write_text("v1") watcher = FileWatcher(tmp_path) # Modify import time time.sleep(0.05) f.write_text("v2") changes = watcher.check_changes() assert "paper.tex" in changes def test_detect_deleted_file(self, tmp_path: Path) -> None: f = tmp_path / "paper.tex" f.write_text("content") watcher = FileWatcher(tmp_path) f.unlink() changes = watcher.check_changes() assert "paper.tex" in changes def test_only_watches_extensions(self, tmp_path: Path) -> None: watcher = FileWatcher(tmp_path, extensions=(".tex",)) (tmp_path / "readme.md").write_text("markdown") changes = watcher.check_changes() assert changes == [] def test_nonexistent_dir(self, tmp_path: Path) -> None: watcher = FileWatcher(tmp_path / "nonexistent") assert watcher.check_changes() == [] # ══════════════════════════════════════════════════════════════════ # 
LatexFormatter tests # ══════════════════════════════════════════════════════════════════ class TestLatexFormatter: def test_normalize_paths(self) -> None: content = r"\includegraphics[width=0.5\textwidth]{/home/user/artifacts/rc-123/figures/plot.png}" result = LatexFormatter.normalize_paths(content) assert "figures/plot.png" in result assert "/home/user" not in result def test_ensure_document_class_adds(self) -> None: content = "\\begin{document}\nHello\n\\end{document}" result = LatexFormatter.ensure_document_class(content) assert "\\documentclass" in result def test_ensure_document_class_noop(self) -> None: content = "\\documentclass{article}\n\\begin{document}\nHello\n\\end{document}" result = LatexFormatter.ensure_document_class(content) assert result.count("\\documentclass") == 1 def test_strip_local_comments(self) -> None: content = "Normal line\n% RESEARCHCLAW: internal note\nAnother line\n" result = LatexFormatter.strip_local_comments(content) assert "RESEARCHCLAW" not in result assert "Normal line" in result assert "Another line" in result def test_fix_encoding(self) -> None: content = "\\documentclass{article}\n\\begin{document}\n" result = LatexFormatter.fix_encoding(content) assert "\\usepackage[utf8]{inputenc}" in result def test_fix_encoding_noop(self) -> None: content = "\\documentclass{article}\n\\usepackage[utf8]{inputenc}\n\\begin{document}\n" result = LatexFormatter.fix_encoding(content) assert result.count("inputenc") == 1 def test_format_for_overleaf(self, tmp_path: Path) -> None: tex = tmp_path / "paper.tex" tex.write_text("\\documentclass{article}\n% RESEARCHCLAW: test\n\\begin{document}\nHello\n\\end{document}\n") formatter = LatexFormatter() result = formatter.format_for_overleaf(tex) assert "RESEARCHCLAW" not in result assert "inputenc" in result # ══════════════════════════════════════════════════════════════════ # OverleafSync tests (mock git) # ══════════════════════════════════════════════════════════════════ class TestOverleafSync: 
class TestOverleafSync:
    """OverleafSync lifecycle: pre-setup guards and a mocked `git clone`."""

    def test_init(self) -> None:
        sync = OverleafSync(git_url="https://git.overleaf.com/abc123")
        assert sync.git_url == "https://git.overleaf.com/abc123"
        # Defaults observed here: branch "main", no local checkout yet.
        assert sync.branch == "main"
        assert sync.local_dir is None

    def test_get_status_before_setup(self) -> None:
        sync = OverleafSync(git_url="https://git.overleaf.com/abc123")
        status = sync.get_status()
        assert status["local_dir"] is None
        assert status["last_sync"] is None

    def test_push_before_setup_raises(self, tmp_path: Path) -> None:
        # All sync operations require setup() first and raise RuntimeError
        # whose message mentions "setup".
        sync = OverleafSync(git_url="https://git.overleaf.com/abc123")
        with pytest.raises(RuntimeError, match="setup"):
            sync.push_paper(tmp_path / "paper.tex")

    def test_pull_before_setup_raises(self) -> None:
        sync = OverleafSync(git_url="https://git.overleaf.com/abc123")
        with pytest.raises(RuntimeError, match="setup"):
            sync.pull_changes()

    def test_resolve_before_setup_raises(self) -> None:
        sync = OverleafSync(git_url="https://git.overleaf.com/abc123")
        with pytest.raises(RuntimeError, match="setup"):
            sync.resolve_conflicts()

    @patch("researchclaw.overleaf.sync.subprocess.run")
    def test_setup_clones(self, mock_run: MagicMock, tmp_path: Path) -> None:
        # setup() clones into <base>/overleaf_repo via a single subprocess call.
        mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
        sync = OverleafSync(git_url="https://git.overleaf.com/abc123")
        local = sync.setup(tmp_path)
        assert local == tmp_path / "overleaf_repo"
        # git clone was called
        mock_run.assert_called_once()
        args = mock_run.call_args[0][0]
        assert "clone" in args


================================================
FILE: tests/test_paper_verifier.py
================================================
"""Tests for paper_verifier — post-generation fabrication detection."""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from researchclaw.pipeline.paper_verifier import (
    VerificationResult,
    verify_paper,
)
from researchclaw.pipeline.verified_registry import VerifiedRegistry

# Repo-level artifacts directory used by the real-paper integration tests.
ARTIFACTS = Path(__file__).resolve().parent.parent / "artifacts"
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_registry(**kwargs) -> VerifiedRegistry:
    """Build a VerifiedRegistry from a synthetic experiment summary.

    Keyword args used: ``conditions`` — mapping of condition name to
    {seed_index: metric value}; ``primary_metric`` — optional scalar recorded
    under the best run's ``primary_metric`` key.
    """
    summary = {"best_run": {"metrics": {}}, "condition_summaries": {}, "metrics_summary": {}}
    conditions = kwargs.get("conditions", {})
    for cond_name, seeds in conditions.items():
        for seed_idx, value in seeds.items():
            summary["best_run"]["metrics"][f"{cond_name}/{seed_idx}/metric"] = value
        # NOTE(review): per-condition mean over seeds; loop nesting restored
        # from collapsed source — confirm against the original file.
        mean_val = sum(seeds.values()) / len(seeds)
        summary["condition_summaries"][cond_name] = {"metrics": {"metric": mean_val}}
    pm = kwargs.get("primary_metric")
    if pm is not None:
        summary["best_run"]["metrics"]["primary_metric"] = pm
    return VerifiedRegistry.from_experiment(summary)


# ---------------------------------------------------------------------------
# Unit tests — clean paper
# ---------------------------------------------------------------------------
class TestCleanPaper:
    """Papers whose numbers all match the registry must PASS."""

    def test_all_numbers_verified_passes(self):
        reg = _make_registry(
            conditions={"Baseline": {0: 80.0, 1: 82.0}, "Proposed": {0: 90.0, 1: 92.0}},
            primary_metric=91.0,
        )
        # NOTE(review): line breaks inside this LaTeX literal were destroyed
        # by chunking and are reconstructed plausibly.
        tex = r"""
\section{Results}
Our proposed method achieves 91.0000 on the primary metric,
compared to 81.0000 for the baseline.
\begin{table}[htbp]
\centering
\begin{tabular}{lcc}
\toprule
Method & Metric & $n$ \\
\midrule
Baseline & 81.0000 $\pm$ 1.4142 & 2 \\
Proposed & 91.0000 $\pm$ 1.4142 & 2 \\
\bottomrule
\end{tabular}
\end{table}
"""
        result = verify_paper(tex, reg)
        assert result.severity == "PASS"
        assert result.strict_violations == 0

    def test_common_constants_allowed(self):
        # Setup-section constants (batch size, epochs, lr) are whitelisted.
        reg = _make_registry(conditions={"A": {0: 80.0}})
        tex = r"""
\section{Experimental Setup}
We use a batch size of 64 and train for 100 epochs
with a learning rate of 0.001.
"""
        result = verify_paper(tex, reg)
        assert result.severity == "PASS"
""" result = verify_paper(tex, reg) assert result.severity == "PASS" def test_year_numbers_allowed(self): reg = _make_registry(conditions={"A": {0: 80.0}}) tex = r""" \section{Introduction} Following the work of Smith et al. (2025), we propose... """ result = verify_paper(tex, reg) assert result.severity == "PASS" # --------------------------------------------------------------------------- # Unit tests — fabricated numbers # --------------------------------------------------------------------------- class TestFabricatedNumbers: def test_fabricated_in_results_rejects(self): reg = _make_registry( conditions={"Baseline": {0: 80.0}, "Proposed": {0: 90.0}}, ) tex = r""" \section{Results} Our method achieves 95.5 accuracy. """ result = verify_paper(tex, reg) assert result.severity == "REJECT" assert result.strict_violations >= 1 assert any(abs(u.value - 95.5) < 0.01 for u in result.unverified_numbers) def test_fabricated_in_table_rejects(self): reg = _make_registry(conditions={"A": {0: 80.0}}) tex = r""" \section{Results} \begin{table}[h] \begin{tabular}{lc} A & 85.3 \\ \end{tabular} \end{table} """ result = verify_paper(tex, reg) assert result.severity == "REJECT" def test_fabricated_in_discussion_warns(self): reg = _make_registry(conditions={"A": {0: 80.0}}) tex = r""" \section{Discussion} Compared to prior work reporting 95.5 accuracy, our result is lower. """ result = verify_paper(tex, reg) # In Discussion → warning, not reject assert result.severity == "WARN" assert result.lenient_violations >= 1 def test_numbers_in_cite_skipped(self): reg = _make_registry(conditions={"A": {0: 80.0}}) tex = r""" \section{Results} As shown by \cite{smith2025deep}, our method works. """ result = verify_paper(tex, reg) assert result.severity == "PASS" def test_numbers_in_comments_skipped(self): reg = _make_registry(conditions={"A": {0: 80.0}}) tex = r""" \section{Results} % This is a comment with fake number 99.99 Our method achieves 80.0. 
""" result = verify_paper(tex, reg) assert result.severity == "PASS" # --------------------------------------------------------------------------- # Unit tests — fabricated conditions # --------------------------------------------------------------------------- class TestFabricatedConditions: def test_unknown_condition_in_table(self): reg = _make_registry(conditions={"DQN": {0: 80.0}, "DQN+Abstraction": {0: 90.0}}) tex = r""" \section{Results} \begin{table}[h] \begin{tabular}{lc} DQN & 80.0 \\ DQN+Abstraction & 90.0 \\ PPO & 75.0 \\ \end{tabular} \end{table} """ result = verify_paper(tex, reg) assert len(result.fabricated_conditions) >= 1 assert any(fc.name == "PPO" for fc in result.fabricated_conditions) assert result.severity == "REJECT" # --------------------------------------------------------------------------- # Unit tests — fabrication rate # --------------------------------------------------------------------------- class TestFabricationRate: def test_rate_zero_for_clean_paper(self): reg = _make_registry(conditions={"A": {0: 80.0}}) tex = r""" \section{Results} Accuracy is 80.0. """ result = verify_paper(tex, reg) assert result.fabrication_rate == 0.0 def test_rate_nonzero_for_fabricated(self): reg = _make_registry(conditions={"A": {0: 80.0}}) tex = r""" \section{Results} Accuracy is 99.99 and loss is 45.67. 
""" result = verify_paper(tex, reg) assert result.fabrication_rate > 0.0 # --------------------------------------------------------------------------- # Integration — real fabricated papers # --------------------------------------------------------------------------- class TestRealPapers: def _load(self, run_id: str) -> tuple[str, VerifiedRegistry]: pattern = f"rc-*-{run_id}" matches = sorted(ARTIFACTS.glob(pattern)) if not matches: pytest.skip(f"Artifact {run_id} not found") base = matches[0] tex_path = base / "stage-22" / "paper.tex" summary_path = base / "stage-14" / "experiment_summary.json" ref_path = base / "stage-13" / "refinement_log.json" if not tex_path.exists() or not summary_path.exists(): pytest.skip(f"Missing files for {run_id}") tex = tex_path.read_text(encoding="utf-8") summary = json.loads(summary_path.read_text()) ref_log = None if ref_path.exists(): ref_log = json.loads(ref_path.read_text()) reg = VerifiedRegistry.from_experiment(summary, ref_log) return tex, reg def test_run_e57360_severe_fabrication_detected(self): """Run 38 (LACE) — audit found SEVERE fabrication. The verifier should REJECT this paper.""" tex, reg = self._load("e57360") result = verify_paper(tex, reg) assert result.severity == "REJECT", ( f"Expected REJECT for severely fabricated paper, got {result.severity}. 
" f"Unverified: {len(result.unverified_numbers)}, " f"Fabricated conditions: {[fc.name for fc in result.fabricated_conditions]}" ) def test_run_6a1ec9_severe_fabrication_detected(self): """Run 6a1ec9 (FAME) — audit found SEVERE fabrication.""" tex, reg = self._load("6a1ec9") result = verify_paper(tex, reg) assert result.severity == "REJECT" def test_run_85fefc_fabrication_detected(self): """Run 85fefc (CRAFT) — audit found SEVERE fabrication.""" tex, reg = self._load("85fefc") result = verify_paper(tex, reg) # Should detect at least some issues assert len(result.unverified_numbers) > 0 or len(result.fabricated_conditions) > 0 def test_run_acbdfa_moderate_fabrication(self): """Run acbdfa (CTS) — audit found MODERATE fabrication.""" tex, reg = self._load("acbdfa") result = verify_paper(tex, reg) # May or may not reject (moderate case), but should find issues assert len(result.unverified_numbers) > 0 or result.lenient_violations > 0 ================================================ FILE: tests/test_project_manager.py ================================================ """Tests for multi-project management (C1): ProjectManager, ProjectScheduler, IdeaPool.""" from __future__ import annotations import json from datetime import datetime, timezone from pathlib import Path import pytest from researchclaw.project.models import Idea, Project from researchclaw.project.manager import ProjectManager from researchclaw.project.scheduler import ProjectScheduler from researchclaw.project.idea_pool import IdeaPool # ── fixtures ────────────────────────────────────────────────────── @pytest.fixture def tmp_projects(tmp_path: Path) -> Path: return tmp_path / "projects" @pytest.fixture def manager(tmp_projects: Path) -> ProjectManager: return ProjectManager(tmp_projects) @pytest.fixture def config_yaml(tmp_path: Path) -> Path: cfg = tmp_path / "config.yaml" cfg.write_text("project:\n name: test\nresearch:\n topic: test\n") return cfg @pytest.fixture def pool_path(tmp_path: Path) -> Path: 
return tmp_path / "ideas.json" # ══════════════════════════════════════════════════════════════════ # Project model tests # ══════════════════════════════════════════════════════════════════ class TestProjectModel: def test_to_dict_roundtrip(self) -> None: p = Project(name="test", config_path="/a/b", run_dir="/c/d", topic="ml") d = p.to_dict() p2 = Project.from_dict(d) assert p2.name == p.name assert p2.topic == p.topic assert p2.status == "idle" def test_from_dict_defaults(self) -> None: d = {"name": "x", "config_path": "/a", "run_dir": "/b"} p = Project.from_dict(d) assert p.status == "idle" assert p.last_run_id is None def test_from_dict_with_iso_date(self) -> None: d = { "name": "x", "config_path": "/a", "run_dir": "/b", "created_at": "2024-01-01T00:00:00+00:00", } p = Project.from_dict(d) assert p.created_at.year == 2024 # ══════════════════════════════════════════════════════════════════ # Idea model tests # ══════════════════════════════════════════════════════════════════ class TestIdeaModel: def test_score_calculation(self) -> None: idea = Idea(id="1", title="t", description="d", feasibility=1.0, novelty=1.0) assert idea.score == pytest.approx(1.0) def test_score_weighted(self) -> None: idea = Idea(id="1", title="t", description="d", feasibility=0.5, novelty=0.5) assert idea.score == pytest.approx(0.5) def test_to_dict_roundtrip(self) -> None: idea = Idea(id="abc", title="GNN", description="graph stuff", domains=["ml"]) d = idea.to_dict() i2 = Idea.from_dict(d) assert i2.id == "abc" assert i2.domains == ["ml"] # ══════════════════════════════════════════════════════════════════ # ProjectManager tests # ══════════════════════════════════════════════════════════════════ class TestProjectManager: def test_create_project(self, manager: ProjectManager, config_yaml: Path) -> None: proj = manager.create("my_project", str(config_yaml), topic="RL") assert proj.name == "my_project" assert proj.topic == "RL" assert proj.status == "idle" def 
# (continuation of TestProjectManager)
    def test_create_sets_active(self, manager: ProjectManager, config_yaml: Path) -> None:
        # The first created project becomes the active one.
        manager.create("first", str(config_yaml))
        assert manager.active is not None
        assert manager.active.name == "first"

    def test_create_duplicate_raises(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("dup", str(config_yaml))
        with pytest.raises(ValueError, match="already exists"):
            manager.create("dup", str(config_yaml))

    def test_delete_project(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("del_me", str(config_yaml))
        manager.delete("del_me")
        assert "del_me" not in manager.projects

    def test_delete_unknown_raises(self, manager: ProjectManager) -> None:
        with pytest.raises(KeyError):
            manager.delete("nonexistent")

    def test_get_project(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("proj1", str(config_yaml))
        p = manager.get("proj1")
        assert p.name == "proj1"

    def test_get_unknown_raises(self, manager: ProjectManager) -> None:
        with pytest.raises(KeyError):
            manager.get("nope")

    def test_list_all_sorted(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("b_proj", str(config_yaml))
        manager.create("a_proj", str(config_yaml))
        projects = manager.list_all()
        assert len(projects) == 2
        # Sorted by creation time (b_proj first)
        assert projects[0].name == "b_proj"

    def test_get_status(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("s1", str(config_yaml))
        manager.create("s2", str(config_yaml))
        status = manager.get_status()
        assert status["total"] == 2
        assert status["active"] == "s1"

    def test_switch_project(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("a", str(config_yaml))
        manager.create("b", str(config_yaml))
        manager.switch("b")
        assert manager.active is not None
        assert manager.active.name == "b"

    def test_switch_unknown_raises(self, manager: ProjectManager) -> None:
        with pytest.raises(KeyError):
            manager.switch("ghost")

    def test_compare_projects(self, manager: ProjectManager, config_yaml: Path) -> None:
        # compare() reports per-metric deltas between two projects.
        manager.create("pa", str(config_yaml))
        manager.create("pb", str(config_yaml))
        manager.projects["pa"].metrics = {"acc": 0.9}
        manager.projects["pb"].metrics = {"acc": 0.95}
        result = manager.compare("pa", "pb")
        assert "metric_diff" in result
        assert result["metric_diff"]["acc"]["delta"] == pytest.approx(0.05)
# (continuation of TestProjectManager)
    def test_start_run(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("run_proj", str(config_yaml))
        rid = manager.start_run("run_proj", run_id="rc-123")
        assert rid == "rc-123"
        assert manager.get("run_proj").status == "running"

    def test_finish_run(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("fin_proj", str(config_yaml))
        manager.start_run("fin_proj", run_id="rc-456")
        manager.finish_run("fin_proj", "completed", {"acc": 0.88})
        p = manager.get("fin_proj")
        assert p.status == "completed"
        assert p.metrics["acc"] == 0.88

    def test_registry_persistence(self, tmp_projects: Path, config_yaml: Path) -> None:
        # A second manager over the same directory sees the saved registry.
        m1 = ProjectManager(tmp_projects)
        m1.create("persist", str(config_yaml), topic="persistence")
        # Load from disk
        m2 = ProjectManager(tmp_projects)
        assert "persist" in m2.projects
        assert m2.projects["persist"].topic == "persistence"

    def test_delete_switches_active(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("first", str(config_yaml))
        manager.create("second", str(config_yaml))
        manager.switch("first")
        manager.delete("first")
        # Should switch active to remaining project
        assert manager.active is not None
        assert manager.active.name == "second"

    def test_config_copied_to_project_dir(self, manager: ProjectManager, config_yaml: Path) -> None:
        # create() copies the source config into the project directory.
        proj = manager.create("copy_test", str(config_yaml))
        copied = Path(proj.config_path)
        assert copied.exists()
        assert "test" in copied.read_text()


# ══════════════════════════════════════════════════════════════════
# ProjectScheduler tests
# ══════════════════════════════════════════════════════════════════
class TestProjectScheduler:
    """Queueing, concurrency limits and priority ordering of ProjectScheduler."""

    def test_enqueue_and_next(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("proj", str(config_yaml))
        sched = ProjectScheduler(manager, max_concurrent=1)
        sched.enqueue("proj")
        name = sched.next()
        assert name == "proj"

    def test_concurrency_limit(self, manager: ProjectManager, config_yaml: Path) -> None:
        # With max_concurrent=1 the second project cannot start while the
        # first is still running.
        manager.create("a", str(config_yaml))
        manager.create("b", str(config_yaml))
        sched = ProjectScheduler(manager, max_concurrent=1)
        sched.enqueue("a")
        sched.enqueue("b")
        sched.next()  # starts "a"
        assert sched.next() is None  # can't start "b"

    def test_mark_done_frees_slot(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("a", str(config_yaml))
        manager.create("b", str(config_yaml))
        sched = ProjectScheduler(manager, max_concurrent=1)
        sched.enqueue("a")
        sched.enqueue("b")
        sched.next()  # starts "a"
        sched.mark_done("a")
        name = sched.next()
        assert name == "b"

    def test_priority_order(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("low", str(config_yaml))
        manager.create("high", str(config_yaml))
        sched = ProjectScheduler(manager, max_concurrent=2)
        sched.enqueue("low", priority=10)
        sched.enqueue("high", priority=1)
        # Higher priority (lower number) first
        assert sched.next() == "high"
        assert sched.next() == "low"

    def test_enqueue_unknown_raises(self, manager: ProjectManager) -> None:
        sched = ProjectScheduler(manager)
        with pytest.raises(KeyError):
            sched.enqueue("ghost")

    def test_duplicate_enqueue_ignored(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("dup", str(config_yaml))
        sched = ProjectScheduler(manager)
        sched.enqueue("dup")
        sched.enqueue("dup")
        assert sched.queue_size == 1

    def test_get_status(self, manager: ProjectManager, config_yaml: Path) -> None:
        manager.create("s", str(config_yaml))
        sched = ProjectScheduler(manager, max_concurrent=3)
        sched.enqueue("s")
        status = sched.get_status()
        assert status["max_concurrent"] == 3
        assert status["queue_size"] == 1
# (continuation of TestProjectScheduler)
    def test_can_start_empty_queue(self, manager: ProjectManager) -> None:
        sched = ProjectScheduler(manager)
        assert not sched.can_start()


# ══════════════════════════════════════════════════════════════════
# IdeaPool tests
# ══════════════════════════════════════════════════════════════════
class TestIdeaPool:
    """Add/remove/evaluate/rank behaviour of the persisted IdeaPool."""

    def test_add_idea(self, pool_path: Path) -> None:
        pool = IdeaPool(pool_path)
        idea = pool.add("GNN for proteins", "Apply GNN to protein folding", ["bio", "ml"])
        assert idea.title == "GNN for proteins"
        # Generated ids are 8 characters long.
        assert len(idea.id) == 8

    def test_remove_idea(self, pool_path: Path) -> None:
        pool = IdeaPool(pool_path)
        idea = pool.add("remove me", "desc")
        pool.remove(idea.id)
        assert idea.id not in pool.ideas

    def test_remove_unknown_raises(self, pool_path: Path) -> None:
        pool = IdeaPool(pool_path)
        with pytest.raises(KeyError):
            pool.remove("nonexistent")

    def test_get_idea(self, pool_path: Path) -> None:
        pool = IdeaPool(pool_path)
        idea = pool.add("get me", "desc")
        retrieved = pool.get(idea.id)
        assert retrieved.title == "get me"

    def test_evaluate(self, pool_path: Path) -> None:
        pool = IdeaPool(pool_path)
        idea = pool.add("eval", "desc")
        result = pool.evaluate(idea.id, feasibility=0.8, novelty=0.9)
        assert result["feasibility"] == 0.8
        assert result["novelty"] == 0.9
        assert pool.get(idea.id).status == "evaluated"

    def test_evaluate_clamps_values(self, pool_path: Path) -> None:
        # Scores outside [0, 1] are clamped to the boundary values.
        pool = IdeaPool(pool_path)
        idea = pool.add("clamp", "desc")
        pool.evaluate(idea.id, feasibility=1.5, novelty=-0.5)
        assert pool.get(idea.id).feasibility == 1.0
        assert pool.get(idea.id).novelty == 0.0

    def test_rank(self, pool_path: Path) -> None:
        pool = IdeaPool(pool_path)
        pool.add("low", "desc")
        pool.add("high", "desc")
        pool.evaluate(pool.list_all()[0].id, 0.1, 0.1)
        pool.evaluate(pool.list_all()[1].id, 0.9, 0.9)
        ranked = pool.rank()
        # Best-scoring idea comes first.
        assert ranked[0].score > ranked[1].score

    def test_list_all(self, pool_path: Path) -> None:
        pool = IdeaPool(pool_path)
        pool.add("a", "desc")
        pool.add("b", "desc")
        assert len(pool.list_all()) == 2
# (continuation of TestIdeaPool)
    def test_persistence(self, pool_path: Path) -> None:
        # A fresh pool over the same file reloads previously added ideas.
        pool1 = IdeaPool(pool_path)
        pool1.add("persist", "desc", ["ml"])
        pool2 = IdeaPool(pool_path)
        assert len(pool2.ideas) == 1
        assert list(pool2.ideas.values())[0].title == "persist"

    def test_to_project(self, pool_path: Path, tmp_path: Path, config_yaml: Path) -> None:
        # Promoting an idea creates a Project whose topic is the description
        # and marks the idea as "planned".
        pool = IdeaPool(pool_path)
        idea = pool.add("my idea", "a nice description")
        projects_dir = tmp_path / "projects"
        proj = pool.to_project(idea.id, str(config_yaml), projects_dir)
        assert proj.topic == "a nice description"
        assert pool.get(idea.id).status == "planned"


================================================
FILE: tests/test_prompt_adapter.py
================================================
"""Tests for domain-aware prompt adapters."""

from __future__ import annotations

import pytest

from researchclaw.domains.detector import DomainProfile, get_profile, get_generic_profile
from researchclaw.domains.prompt_adapter import (
    GenericPromptAdapter,
    MLPromptAdapter,
    PromptAdapter,
    PromptBlocks,
    get_adapter,
    register_adapter,
)


# ---------------------------------------------------------------------------
# PromptBlocks tests
# ---------------------------------------------------------------------------
class TestPromptBlocks:
    """PromptBlocks dataclass: all fields default to empty strings."""

    def test_default_empty(self):
        blocks = PromptBlocks()
        assert blocks.compute_budget == ""
        assert blocks.dataset_guidance == ""
        assert blocks.code_generation_hints == ""

    def test_all_fields(self):
        blocks = PromptBlocks(
            compute_budget="budget info",
            dataset_guidance="data info",
            hp_reporting="hp info",
            code_generation_hints="code hints",
            result_analysis_hints="analysis hints",
            experiment_design_context="design context",
            statistical_test_guidance="stat guidance",
            output_format_guidance="output format",
        )
        assert blocks.compute_budget == "budget info"
        assert blocks.output_format_guidance == "output format"


# ---------------------------------------------------------------------------
# ML Adapter tests
# ---------------------------------------------------------------------------
class TestMLPromptAdapter:
    """ML adapter intentionally yields empty blocks (prompts.py handles ML)."""

    def test_returns_empty_blocks(self):
        """ML adapter must return empty blocks (delegates to prompts.py)."""
        profile = get_profile("ml_vision") or DomainProfile(
            domain_id="ml_vision", display_name="CV"
        )
        adapter = MLPromptAdapter(profile)
        blocks = adapter.get_code_generation_blocks({})
        assert blocks.compute_budget == ""
        assert blocks.dataset_guidance == ""
        assert blocks.code_generation_hints == ""

    def test_all_methods_return_empty(self):
        # All three block-producing methods must stay empty for ML domains.
        profile = DomainProfile(domain_id="ml_generic", display_name="ML")
        adapter = MLPromptAdapter(profile)
        for method in [
            adapter.get_code_generation_blocks,
            adapter.get_experiment_design_blocks,
            adapter.get_result_analysis_blocks,
        ]:
            blocks = method({})
            assert all(
                getattr(blocks, f) == ""
                for f in [
                    "compute_budget",
                    "dataset_guidance",
                    "hp_reporting",
                    "code_generation_hints",
                    "result_analysis_hints",
                ]
            )


# ---------------------------------------------------------------------------
# Generic Adapter tests
# ---------------------------------------------------------------------------
class TestGenericPromptAdapter:
    """Generic adapter derives non-empty hints from the DomainProfile."""

    def test_provides_code_hints(self):
        profile = DomainProfile(
            domain_id="generic",
            display_name="Generic",
            core_libraries=["numpy", "scipy"],
        )
        adapter = GenericPromptAdapter(profile)
        blocks = adapter.get_code_generation_blocks({})
        assert blocks.code_generation_hints  # should not be empty

    def test_convergence_hints(self):
        # A "convergence" paradigm must surface in the generated hints.
        profile = DomainProfile(
            domain_id="test_conv",
            display_name="Conv Test",
            experiment_paradigm="convergence",
        )
        adapter = GenericPromptAdapter(profile)
        blocks = adapter.get_code_generation_blocks({})
        assert "convergence" in blocks.code_generation_hints.lower()

    def test_progressive_spec_hints(self):
        profile = DomainProfile(
            domain_id="test_econ",
            display_name="Econ Test",
            experiment_paradigm="progressive_spec",
        )
        adapter = GenericPromptAdapter(profile)
        blocks = adapter.get_code_generation_blocks({})
        assert "progressive" in blocks.code_generation_hints.lower()
"progressive" in blocks.code_generation_hints.lower() def test_experiment_design_has_terminology(self): profile = DomainProfile( domain_id="test", display_name="Test Domain", condition_terminology={"baseline": "reference", "proposed": "our method"}, standard_baselines=["Method A", "Method B"], ) adapter = GenericPromptAdapter(profile) blocks = adapter.get_experiment_design_blocks({}) assert "reference" in blocks.experiment_design_context assert "Method A" in blocks.experiment_design_context # --------------------------------------------------------------------------- # Physics Adapter tests # --------------------------------------------------------------------------- class TestPhysicsAdapter: def test_physics_adapter_loaded(self): profile = get_profile("physics_simulation") if profile is None: pytest.skip("physics_simulation profile not found") adapter = get_adapter(profile) assert not isinstance(adapter, MLPromptAdapter) def test_physics_code_blocks_nonempty(self): profile = get_profile("physics_pde") if profile is None: pytest.skip("physics_pde profile not found") adapter = get_adapter(profile) blocks = adapter.get_code_generation_blocks({}) assert blocks.code_generation_hints # should have physics-specific hints # --------------------------------------------------------------------------- # Economics Adapter tests # --------------------------------------------------------------------------- class TestEconomicsAdapter: def test_economics_adapter_loaded(self): profile = get_profile("economics_empirical") if profile is None: pytest.skip("economics_empirical profile not found") adapter = get_adapter(profile) assert not isinstance(adapter, MLPromptAdapter) def test_economics_design_blocks(self): profile = get_profile("economics_empirical") if profile is None: pytest.skip("economics_empirical profile not found") adapter = get_adapter(profile) blocks = adapter.get_experiment_design_blocks({}) assert "progressive" in blocks.experiment_design_context.lower() # 
# ---------------------------------------------------------------------------
# get_adapter dispatch tests
# ---------------------------------------------------------------------------
class TestGetAdapter:
    """Profile → adapter dispatch rules of get_adapter()."""

    def test_ml_domains_get_ml_adapter(self):
        # Every ml_* profile (when registered) resolves to MLPromptAdapter.
        for domain_id in ["ml_vision", "ml_nlp", "ml_rl", "ml_generic"]:
            profile = get_profile(domain_id)
            if profile is None:
                continue
            adapter = get_adapter(profile)
            assert isinstance(adapter, MLPromptAdapter), (
                f"{domain_id} should use MLPromptAdapter"
            )

    def test_generic_domain_gets_generic_adapter(self):
        profile = get_generic_profile()
        adapter = get_adapter(profile)
        assert isinstance(adapter, GenericPromptAdapter)

    def test_physics_uses_physics_adapter(self):
        profile = get_profile("physics_simulation")
        if profile is None:
            pytest.skip("physics_simulation profile not found")
        adapter = get_adapter(profile)
        from researchclaw.domains.adapters.physics import PhysicsPromptAdapter
        assert isinstance(adapter, PhysicsPromptAdapter)

    def test_unknown_domain_gets_generic(self):
        # Unregistered domains fall back to the generic adapter.
        profile = DomainProfile(domain_id="unknown_domain", display_name="Unknown")
        adapter = get_adapter(profile)
        assert isinstance(adapter, GenericPromptAdapter)


# ---------------------------------------------------------------------------
# Blueprint context tests
# ---------------------------------------------------------------------------
class TestBlueprintContext:
    """get_blueprint_context() surfaces file structure, libraries and hints."""

    def test_blueprint_includes_file_structure(self):
        profile = DomainProfile(
            domain_id="test",
            display_name="Test",
            typical_file_structure={"config.py": "Config", "main.py": "Entry"},
            core_libraries=["numpy"],
        )
        adapter = GenericPromptAdapter(profile)
        ctx = adapter.get_blueprint_context()
        assert "config.py" in ctx
        assert "numpy" in ctx

    def test_blueprint_includes_hints(self):
        profile = DomainProfile(
            domain_id="test",
            display_name="Test",
            code_generation_hints="Use scipy.integrate for ODE solving",
        )
        adapter = GenericPromptAdapter(profile)
        ctx = adapter.get_blueprint_context()
        assert "scipy.integrate" in ctx
# (continuation of TestBlueprintContext)
    def test_ml_adapter_blueprint_context(self):
        """ML adapter should also provide basic blueprint context."""
        profile = get_profile("ml_vision") or DomainProfile(
            domain_id="ml_vision",
            display_name="CV",
            typical_file_structure={"model.py": "Model", "train.py": "Training"},
        )
        adapter = MLPromptAdapter(profile)
        ctx = adapter.get_blueprint_context()
        # ML adapter inherits from base, should have file structure if profile has it
        if profile.typical_file_structure:
            assert "model.py" in ctx or ctx == ""  # acceptable either way


# ---------------------------------------------------------------------------
# Adapter registration tests
# ---------------------------------------------------------------------------
class TestAdapterRegistration:
    """register_adapter() lets callers bind custom adapters to domain ids."""

    def test_register_custom_adapter(self):
        class CustomAdapter(PromptAdapter):
            # Minimal concrete adapter used only to check dispatch.
            def get_code_generation_blocks(self, ctx):
                return PromptBlocks(code_generation_hints="custom")

            def get_experiment_design_blocks(self, ctx):
                return PromptBlocks()

            def get_result_analysis_blocks(self, ctx):
                return PromptBlocks()

        register_adapter("custom_domain", CustomAdapter)
        profile = DomainProfile(domain_id="custom_domain", display_name="Custom")
        adapter = get_adapter(profile)
        assert isinstance(adapter, CustomAdapter)
        blocks = adapter.get_code_generation_blocks({})
        assert blocks.code_generation_hints == "custom"


================================================
FILE: tests/test_rc_adapters.py
================================================
from __future__ import annotations

from researchclaw.adapters import (
    AdapterBundle,
    BrowserPage,
    FetchResponse,
    RecordingBrowserAdapter,
    RecordingCronAdapter,
    RecordingMemoryAdapter,
    RecordingMessageAdapter,
    RecordingSessionsAdapter,
    RecordingWebFetchAdapter,
)


def test_adapter_bundle_defaults_are_recording_types():
    # A default AdapterBundle wires in the Recording* stub for every slot.
    bundle = AdapterBundle()
    assert isinstance(bundle.cron, RecordingCronAdapter)
    assert isinstance(bundle.message, RecordingMessageAdapter)
    assert isinstance(bundle.memory, RecordingMemoryAdapter)
    assert isinstance(bundle.sessions, RecordingSessionsAdapter)
    assert isinstance(bundle.web_fetch, RecordingWebFetchAdapter)
    assert isinstance(bundle.browser, RecordingBrowserAdapter)
def test_recording_cron_adapter_records_call_and_returns_id():
    # Stub adapters record their arguments and return a deterministic id.
    adapter = RecordingCronAdapter()
    result = adapter.schedule_resume("run-1", 7, "gate opened")
    assert result == "cron-1"
    assert adapter.calls == [("run-1", 7, "gate opened")]


def test_recording_message_adapter_notify_records_call():
    adapter = RecordingMessageAdapter()
    result = adapter.notify("ops", "stage update", "stage 3 done")
    assert result == "message-1"
    assert adapter.calls == [("ops", "stage update", "stage 3 done")]


def test_recording_memory_adapter_append_records_entries():
    adapter = RecordingMemoryAdapter()
    result = adapter.append("runs", "run-1 started")
    assert result == "memory-1"
    assert adapter.entries == [("runs", "run-1 started")]


def test_recording_sessions_adapter_spawn_records_calls():
    adapter = RecordingSessionsAdapter()
    result = adapter.spawn("worker", ("python", "train.py"))
    assert result == "session-1"
    assert adapter.calls == [("worker", ("python", "train.py"))]


def test_recording_webfetch_fetch_returns_success_response():
    # fetch() returns a canned 200 FetchResponse echoing the requested URL.
    adapter = RecordingWebFetchAdapter()
    response = adapter.fetch("https://example.com")
    assert isinstance(response, FetchResponse)
    assert response.url == "https://example.com"
    assert response.status_code == 200
    assert "stub fetch" in response.text


def test_recording_browser_open_returns_browser_page():
    adapter = RecordingBrowserAdapter()
    page = adapter.open("https://example.com")
    assert isinstance(page, BrowserPage)
    assert page.url == "https://example.com"
    assert "Stub browser page" in page.title


def test_fetch_response_dataclass_fields():
    response = FetchResponse(url="u", status_code=201, text="ok")
    assert response.url == "u"
    assert response.status_code == 201
    assert response.text == "ok"


def test_browser_page_dataclass_fields():
    page = BrowserPage(url="https://a", title="A")
    assert page.url == "https://a"
    assert page.title == "A"
"https://a" assert page.title == "A" def test_all_adapters_start_with_empty_call_lists(): cron = RecordingCronAdapter() message = RecordingMessageAdapter() memory = RecordingMemoryAdapter() sessions = RecordingSessionsAdapter() web_fetch = RecordingWebFetchAdapter() browser = RecordingBrowserAdapter() assert cron.calls == [] assert message.calls == [] assert memory.entries == [] assert sessions.calls == [] assert web_fetch.calls == [] assert browser.calls == [] ================================================ FILE: tests/test_rc_cache.py ================================================ """Tests for literature query cache and degradation fallback.""" from __future__ import annotations import importlib from unittest.mock import patch from researchclaw.literature.models import Author, Paper from researchclaw.literature.search import search_papers cache_mod = importlib.import_module("researchclaw.literature.cache") cache_key = cache_mod.cache_key cache_stats = cache_mod.cache_stats clear_cache = cache_mod.clear_cache get_cached = cache_mod.get_cached put_cache = cache_mod.put_cache class TestCacheKey: def test_deterministic(self, tmp_path): _ = tmp_path k1 = cache_key("transformer", "s2", 20) k2 = cache_key("transformer", "s2", 20) assert k1 == k2 def test_different_query(self): k1 = cache_key("transformer", "s2", 20) k2 = cache_key("attention", "s2", 20) assert k1 != k2 def test_case_insensitive(self): k1 = cache_key("Transformer", "S2", 20) k2 = cache_key("transformer", "s2", 20) assert k1 == k2 def test_length_16(self): k = cache_key("test", "s2", 10) assert len(k) == 16 class TestGetPut: def test_put_and_get(self, tmp_path): papers = [{"paper_id": "1", "title": "Test Paper"}] put_cache("q1", "s2", 20, papers, cache_base=tmp_path) result = get_cached("q1", "s2", 20, cache_base=tmp_path) assert result is not None assert len(result) == 1 assert result[0]["title"] == "Test Paper" def test_cache_miss(self, tmp_path): result = get_cached("nonexistent", "s2", 20, 
cache_base=tmp_path) assert result is None def test_cache_expired(self, tmp_path): papers = [{"paper_id": "1", "title": "Old"}] put_cache("q1", "s2", 20, papers, cache_base=tmp_path) result = get_cached("q1", "s2", 20, cache_base=tmp_path, ttl=0) assert result is None def test_cache_not_expired(self, tmp_path): papers = [{"paper_id": "1", "title": "Fresh"}] put_cache("q1", "s2", 20, papers, cache_base=tmp_path) result = get_cached("q1", "s2", 20, cache_base=tmp_path, ttl=9999) assert result is not None def test_corrupted_cache_returns_none(self, tmp_path): key = cache_key("q1", "s2", 20) (tmp_path / f"{key}.json").write_text("not json", encoding="utf-8") result = get_cached("q1", "s2", 20, cache_base=tmp_path) assert result is None class TestClear: def test_clear_removes_all(self, tmp_path): put_cache("q1", "s2", 20, [{"id": "1"}], cache_base=tmp_path) put_cache("q2", "arxiv", 10, [{"id": "2"}], cache_base=tmp_path) count = clear_cache(cache_base=tmp_path) assert count == 2 assert get_cached("q1", "s2", 20, cache_base=tmp_path) is None def test_clear_empty(self, tmp_path): count = clear_cache(cache_base=tmp_path) assert count == 0 class TestStats: def test_stats_empty(self, tmp_path): stats = cache_stats(cache_base=tmp_path) assert stats["entries"] == 0 assert stats["total_bytes"] == 0 def test_stats_with_entries(self, tmp_path): put_cache("q1", "s2", 20, [{"id": "1"}], cache_base=tmp_path) stats = cache_stats(cache_base=tmp_path) assert stats["entries"] == 1 assert stats["total_bytes"] > 0 class TestSearchDegradation: def test_search_uses_cache_on_failure(self, tmp_path): cached_papers = [ { "paper_id": "s2-123", "title": "Cached Paper", "authors": [], "year": 2024, "abstract": "", "venue": "", "citation_count": 10, "doi": "", "arxiv_id": "", "url": "", "source": "semantic_scholar", } ] put_cache( "test query", "semantic_scholar", 20, cached_papers, cache_base=tmp_path, ) with patch( "researchclaw.literature.search.search_openalex", side_effect=RuntimeError("API 
down"), ): with patch( "researchclaw.literature.search.search_semantic_scholar", side_effect=RuntimeError("API down"), ): with patch( "researchclaw.literature.search.search_arxiv", side_effect=RuntimeError("API down"), ): with patch( "researchclaw.literature.cache._DEFAULT_CACHE_DIR", tmp_path ): with patch( "researchclaw.literature.search.time.sleep", lambda _: None ): results = search_papers("test query", limit=20) assert len(results) >= 1 assert results[0].title == "Cached Paper" def test_search_caches_successful_results(self, tmp_path): mock_paper = Paper( paper_id="s2-test", title="Test", authors=(Author(name="Smith"),), year=2024, abstract="abs", source="semantic_scholar", ) with patch( "researchclaw.literature.search.search_semantic_scholar", return_value=[mock_paper], ): with patch("researchclaw.literature.search.search_arxiv", return_value=[]): with patch( "researchclaw.literature.cache._DEFAULT_CACHE_DIR", tmp_path ): with patch( "researchclaw.literature.search.time.sleep", lambda _: None ): _ = search_papers("test", limit=20) cached = get_cached("test", "semantic_scholar", 20, cache_base=tmp_path) assert cached is not None assert cached[0]["paper_id"] == "s2-test" ================================================ FILE: tests/test_rc_checkpoint.py ================================================ # pyright: reportPrivateUsage=false, reportUnknownParameterType=false, reportMissingParameterType=false, reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnusedCallResult=false """Tests for checkpoint/resume and content metrics.""" from __future__ import annotations import json from pathlib import Path from typing import cast from researchclaw.pipeline.executor import StageResult from researchclaw.pipeline.runner import ( _build_pipeline_summary, _collect_content_metrics, _write_checkpoint, read_checkpoint, resume_from_checkpoint, ) from researchclaw.pipeline.stages import ( NONCRITICAL_STAGES, STAGE_SEQUENCE, 
Stage, StageStatus, ) class TestCheckpoint: def test_write_checkpoint(self, tmp_path: Path): _write_checkpoint(tmp_path, Stage.LITERATURE_COLLECT, "test-run") cp = json.loads((tmp_path / "checkpoint.json").read_text()) assert cp["last_completed_stage"] == 4 assert cp["last_completed_name"] == "LITERATURE_COLLECT" assert cp["run_id"] == "test-run" assert "timestamp" in cp def test_read_checkpoint_returns_next_stage(self, tmp_path: Path): _write_checkpoint(tmp_path, Stage.LITERATURE_COLLECT, "test-run") next_stage = read_checkpoint(tmp_path) assert next_stage == Stage.LITERATURE_SCREEN def test_read_checkpoint_no_file(self, tmp_path: Path): assert read_checkpoint(tmp_path) is None def test_read_checkpoint_last_stage(self, tmp_path: Path): _write_checkpoint(tmp_path, Stage.CITATION_VERIFY, "test-run") assert read_checkpoint(tmp_path) is None def test_read_checkpoint_corrupted(self, tmp_path: Path): (tmp_path / "checkpoint.json").write_text("not json", encoding="utf-8") assert read_checkpoint(tmp_path) is None def test_read_checkpoint_invalid_stage(self, tmp_path: Path): (tmp_path / "checkpoint.json").write_text( json.dumps({"last_completed_stage": 999}), encoding="utf-8" ) assert read_checkpoint(tmp_path) is None def test_resume_from_checkpoint_uses_default(self, tmp_path: Path): assert resume_from_checkpoint(tmp_path) == Stage.TOPIC_INIT def test_resume_from_checkpoint_uses_next_stage(self, tmp_path: Path): _write_checkpoint(tmp_path, Stage.SEARCH_STRATEGY, "run-x") assert resume_from_checkpoint(tmp_path) == Stage.LITERATURE_COLLECT class TestNoncriticalStages: def test_knowledge_archive_is_noncritical(self): assert Stage.KNOWLEDGE_ARCHIVE in NONCRITICAL_STAGES def test_citation_verify_is_critical(self): # T3.4: CITATION_VERIFY is now critical — hallucinated refs must block export assert Stage.CITATION_VERIFY not in NONCRITICAL_STAGES def test_topic_init_is_critical(self): assert Stage.TOPIC_INIT not in NONCRITICAL_STAGES def test_paper_draft_is_critical(self): 
assert Stage.PAPER_DRAFT not in NONCRITICAL_STAGES def test_stage_sequence_still_ends_with_citation_verify(self): assert STAGE_SEQUENCE[-1] == Stage.CITATION_VERIFY class TestContentMetrics: def test_metrics_empty_run_dir(self, tmp_path: Path): metrics = _collect_content_metrics(tmp_path) assert metrics["template_ratio"] is None assert metrics["citation_verify_score"] is None assert metrics["total_citations"] is None assert metrics["degraded_sources"] == [] def test_metrics_with_draft(self, tmp_path: Path): draft_dir = tmp_path / "stage-17" draft_dir.mkdir() (draft_dir / "paper_draft.md").write_text( "This is a real academic paper about transformers and attention mechanisms. We propose a novel method for improving efficiency.", encoding="utf-8", ) metrics = _collect_content_metrics(tmp_path) assert metrics["template_ratio"] is not None assert cast(float, metrics["template_ratio"]) < 0.5 def test_metrics_with_verification(self, tmp_path: Path): verify_dir = tmp_path / "stage-23" verify_dir.mkdir() (verify_dir / "verification_report.json").write_text( json.dumps( { "summary": { "total": 10, "verified": 8, "suspicious": 1, "hallucinated": 1, "skipped": 0, "integrity_score": 0.8 }, "results": [] } ), encoding="utf-8", ) metrics = _collect_content_metrics(tmp_path) assert metrics["total_citations"] == 10 assert metrics["verified_citations"] == 8 assert metrics["citation_verify_score"] == 0.8 def test_metrics_no_stage23(self, tmp_path: Path): metrics = _collect_content_metrics(tmp_path) assert metrics["citation_verify_score"] is None def test_metrics_with_non_dict_summary(self, tmp_path: Path): """Must not raise NameError when 'summary' is not a dict.""" verify_dir = tmp_path / "stage-23" verify_dir.mkdir() (verify_dir / "verification_report.json").write_text( json.dumps({"summary": "unexpected string"}), encoding="utf-8", ) metrics = _collect_content_metrics(tmp_path) assert metrics["total_citations"] is None assert metrics["verified_citations"] is None assert 
metrics["citation_verify_score"] is None def test_metrics_with_summary_missing_fields(self, tmp_path: Path): """summary dict without total/verified should not crash.""" verify_dir = tmp_path / "stage-23" verify_dir.mkdir() (verify_dir / "verification_report.json").write_text( json.dumps({"summary": {"notes": "incomplete"}}), encoding="utf-8", ) metrics = _collect_content_metrics(tmp_path) assert metrics["total_citations"] == 0 assert metrics["verified_citations"] == 0 assert metrics["citation_verify_score"] is None def test_summary_includes_content_metrics(self, tmp_path: Path): results = [ StageResult( stage=Stage.TOPIC_INIT, status=StageStatus.DONE, artifacts=("topic.json",), ), ] summary = _build_pipeline_summary( run_id="test", results=results, from_stage=Stage.TOPIC_INIT, run_dir=tmp_path, ) assert "content_metrics" in summary assert isinstance(summary["content_metrics"], dict) ================================================ FILE: tests/test_rc_citation_resolve.py ================================================ # pyright: reportPrivateUsage=false, reportUnknownParameterType=false """Tests for BUG-194: Citation resolver must not replace correct bib entries with garbage papers from search results. 
Tests cover: - _resolve_missing_citations: seminal lookup, API validation, rejection of unrelated results, year mismatch rejection - _load_seminal_papers_by_key: index construction - _seminal_to_bibtex: BibTeX generation from YAML entries """ from __future__ import annotations from unittest.mock import patch import pytest from researchclaw.literature.models import Author, Paper # --------------------------------------------------------------------------- # Helpers to build mock Paper objects # --------------------------------------------------------------------------- def _make_paper( title: str, year: int = 2020, authors: list[str] | None = None, bibtex_override: str = "", ) -> Paper: """Create a Paper with minimal metadata.""" return Paper( paper_id=f"test_{title[:10].replace(' ', '_').lower()}", title=title, authors=tuple(Author(name=n) for n in (authors or ["Unknown"])), year=year, source="test", _bibtex_override=bibtex_override, ) # Patch target for search_papers — the import inside _resolve_missing_citations # does `from researchclaw.literature.search import search_papers`, so we patch # the source module. 
_SEARCH_PAPERS_PATH = "researchclaw.literature.search.search_papers" # --------------------------------------------------------------------------- # Tests for _load_seminal_papers_by_key # --------------------------------------------------------------------------- class TestLoadSeminalPapersByKey: """Test the seminal papers index builder.""" def test_loads_well_known_keys(self): from researchclaw.pipeline.stage_impls._review_publish import ( _load_seminal_papers_by_key, ) index = _load_seminal_papers_by_key() # The seminal_papers.yaml must contain these foundational papers assert "he2016deep" in index assert "vaswani2017attention" in index assert "srivastava2014dropout" in index def test_entries_have_required_fields(self): from researchclaw.pipeline.stage_impls._review_publish import ( _load_seminal_papers_by_key, ) index = _load_seminal_papers_by_key() for key, entry in index.items(): assert "title" in entry, f"Missing title for {key}" assert "year" in entry, f"Missing year for {key}" assert "authors" in entry, f"Missing authors for {key}" def test_graceful_on_load_failure(self): """If _load_all raises, _load_seminal_papers_by_key returns {}.""" from researchclaw.pipeline.stage_impls._review_publish import ( _load_seminal_papers_by_key, ) with patch( "researchclaw.data._load_all", side_effect=RuntimeError("disk error"), ): result = _load_seminal_papers_by_key() assert result == {} # --------------------------------------------------------------------------- # Tests for _seminal_to_bibtex # --------------------------------------------------------------------------- class TestSeminalToBibtex: """Test BibTeX generation from seminal_papers.yaml entries.""" def test_conference_paper(self): from researchclaw.pipeline.stage_impls._review_publish import _seminal_to_bibtex entry = { "title": "Deep Residual Learning for Image Recognition", "authors": "He et al.", "year": 2016, "venue": "CVPR", } bib = _seminal_to_bibtex(entry, "he2016deep") assert 
"@inproceedings{he2016deep," in bib assert "Deep Residual Learning" in bib assert "He et al." in bib assert "2016" in bib assert "booktitle = {CVPR}" in bib def test_journal_paper(self): from researchclaw.pipeline.stage_impls._review_publish import _seminal_to_bibtex entry = { "title": "Dropout: A Simple Way to Prevent Neural Networks from Overfitting", "authors": "Srivastava et al.", "year": 2014, "venue": "JMLR", } bib = _seminal_to_bibtex(entry, "srivastava2014dropout") assert "@article{srivastava2014dropout," in bib assert "Dropout" in bib assert "journal = {JMLR}" in bib def test_neurips_is_conference(self): from researchclaw.pipeline.stage_impls._review_publish import _seminal_to_bibtex entry = { "title": "Attention Is All You Need", "authors": "Vaswani et al.", "year": 2017, "venue": "NeurIPS", } bib = _seminal_to_bibtex(entry, "vaswani2017attention") assert "@inproceedings{vaswani2017attention," in bib # --------------------------------------------------------------------------- # Tests for _resolve_missing_citations # --------------------------------------------------------------------------- class TestResolveMissingCitations: """Test the full resolution pipeline with BUG-194 fixes.""" def test_seminal_papers_resolved_without_api(self): """Foundational papers should be resolved from seminal_papers.yaml without any API calls.""" from researchclaw.pipeline.stage_impls._review_publish import ( _resolve_missing_citations, ) missing = {"he2016deep", "vaswani2017attention", "srivastava2014dropout"} existing_bib = "" # Patch search_papers so it FAILS if called — seminal papers shouldn't # need it. 
with patch( _SEARCH_PAPERS_PATH, side_effect=AssertionError("Should not be called for seminal papers"), ): resolved, entries = _resolve_missing_citations(missing, existing_bib) assert "he2016deep" in resolved assert "vaswani2017attention" in resolved assert "srivastava2014dropout" in resolved assert len(entries) == 3 # Verify the BibTeX entries contain correct titles combined = "\n".join(entries) assert "Deep Residual Learning" in combined assert "Attention Is All You Need" in combined assert "Dropout" in combined def test_seminal_papers_not_duplicated_in_existing_bib(self): """If the key is already in existing_bib, don't add it again.""" from researchclaw.pipeline.stage_impls._review_publish import ( _resolve_missing_citations, ) existing_bib = "@article{he2016deep, title={Deep Residual Learning}}" missing = {"he2016deep"} # Mock search_papers to ensure no real API calls (key should be skipped # entirely since it's already in existing_bib). with patch( _SEARCH_PAPERS_PATH, side_effect=AssertionError("Should not call API for key in existing_bib"), ): resolved, entries = _resolve_missing_citations(missing, existing_bib) assert "he2016deep" not in resolved assert len(entries) == 0 def test_garbage_results_rejected_by_similarity(self): """BUG-194 regression: unrelated search results must be rejected.""" from researchclaw.pipeline.stage_impls._review_publish import ( _resolve_missing_citations, ) # Mock a garbage result that has the right year but wrong title garbage_paper = _make_paper( title="Jokowi and the New Developmentalism", year=2016, authors=["He, Some Politician"], bibtex_override=( "@article{jokowi2016,\n" " title = {Jokowi and the New Developmentalism},\n" " author = {He, Some Politician},\n" " year = {2016},\n" "}" ), ) # This key is NOT in seminal_papers.yaml missing = {"smith2016novel"} with patch(_SEARCH_PAPERS_PATH, return_value=[garbage_paper]): resolved, entries = _resolve_missing_citations(missing, "") # The garbage result should be rejected (no 
overlap with "smith novel") assert "smith2016novel" not in resolved assert len(entries) == 0 def test_year_mismatch_rejected(self): """Results with year > 1 year off from cite key are rejected.""" from researchclaw.pipeline.stage_impls._review_publish import ( _resolve_missing_citations, ) wrong_year_paper = _make_paper( title="Novel Deep Learning Approach by Smith", year=2020, # cite key says 2016 authors=["Smith, John"], bibtex_override=( "@article{smith2020,\n" " title = {Novel Deep Learning Approach by Smith},\n" " author = {Smith, John},\n" " year = {2020},\n" "}" ), ) missing = {"smith2016novel"} with patch(_SEARCH_PAPERS_PATH, return_value=[wrong_year_paper]): resolved, entries = _resolve_missing_citations(missing, "") assert "smith2016novel" not in resolved def test_good_api_result_accepted(self): """A search result with matching author + title words should be accepted.""" from researchclaw.pipeline.stage_impls._review_publish import ( _resolve_missing_citations, ) good_paper = _make_paper( title="Novel Approach to Feature Extraction in Deep Networks", year=2018, authors=["Chen, Wei"], bibtex_override=( "@article{chen2018something,\n" " title = {Novel Approach to Feature Extraction in Deep Networks},\n" " author = {Chen, Wei},\n" " year = {2018},\n" "}" ), ) # cite key: chen2018novel — "chen" matches author, "novel" matches title missing = {"chen2018novel"} with patch(_SEARCH_PAPERS_PATH, return_value=[good_paper]): resolved, entries = _resolve_missing_citations(missing, "") assert "chen2018novel" in resolved assert len(entries) == 1 # The bib entry should use the original cite_key assert "chen2018novel" in entries[0] def test_empty_missing_keys_returns_empty(self): """No keys to resolve -> empty results.""" from researchclaw.pipeline.stage_impls._review_publish import ( _resolve_missing_citations, ) resolved, entries = _resolve_missing_citations(set(), "") assert len(resolved) == 0 assert len(entries) == 0 def test_unparseable_keys_skipped(self): """Keys 
that don't match author-year pattern are skipped.""" from researchclaw.pipeline.stage_impls._review_publish import ( _resolve_missing_citations, ) missing = {"notyearkey", "abc"} resolved, entries = _resolve_missing_citations(missing, "") assert len(resolved) == 0 assert len(entries) == 0 def test_import_failure_returns_seminal_only(self): """If search_papers can't be imported, seminal results still returned.""" from researchclaw.pipeline.stage_impls._review_publish import ( _resolve_missing_citations, ) # Mix of seminal and non-seminal keys missing = {"he2016deep", "unknownauthor2020something"} with patch( _SEARCH_PAPERS_PATH, side_effect=ImportError("mocked"), ): resolved, entries = _resolve_missing_citations(missing, "") # he2016deep should be resolved from seminal assert "he2016deep" in resolved # unknownauthor2020something would need API which fails assert "unknownauthor2020something" not in resolved def test_search_exception_handled_gracefully(self): """If search_papers raises, the key is skipped (no crash).""" from researchclaw.pipeline.stage_impls._review_publish import ( _resolve_missing_citations, ) missing = {"unknownauthor2020something"} with patch( _SEARCH_PAPERS_PATH, side_effect=RuntimeError("API down"), ): resolved, entries = _resolve_missing_citations(missing, "") assert len(resolved) == 0 def test_bug194_he2016deep_not_replaced_with_jokowi(self): """BUG-194 exact regression: he2016deep must NEVER resolve to 'Jokowi and the New Developmentalism'.""" from researchclaw.pipeline.stage_impls._review_publish import ( _resolve_missing_citations, ) # he2016deep IS in seminal_papers.yaml, so it should resolve from there missing = {"he2016deep"} resolved, entries = _resolve_missing_citations(missing, "") assert "he2016deep" in resolved assert len(entries) == 1 assert "Jokowi" not in entries[0] assert "Deep Residual Learning" in entries[0] def test_bug194_vaswani2017attention_not_replaced_with_health_supplement(self): """BUG-194 exact regression: 
vaswani2017attention must resolve to 'Attention Is All You Need', not health supplement garbage.""" from researchclaw.pipeline.stage_impls._review_publish import ( _resolve_missing_citations, ) missing = {"vaswani2017attention"} resolved, entries = _resolve_missing_citations(missing, "") assert "vaswani2017attention" in resolved assert len(entries) == 1 assert "Health Supplement" not in entries[0] assert "Attention Is All You Need" in entries[0] def test_bug194_srivastava2014dropout_not_replaced_with_cnn_sentence(self): """BUG-194 exact regression: srivastava2014dropout must resolve to Dropout paper, not CNN for Sentence Classification.""" from researchclaw.pipeline.stage_impls._review_publish import ( _resolve_missing_citations, ) missing = {"srivastava2014dropout"} resolved, entries = _resolve_missing_citations(missing, "") assert "srivastava2014dropout" in resolved assert len(entries) == 1 assert "Sentence Classification" not in entries[0] assert "Dropout" in entries[0] def test_multiple_seminal_and_api_mixed(self): """Mix of seminal keys (resolved locally) and API keys.""" from researchclaw.pipeline.stage_impls._review_publish import ( _resolve_missing_citations, ) api_paper = _make_paper( title="Adaptive Learning Rate Methods for Deep Networks", year=2019, authors=["Zhang, Adaptive"], bibtex_override=( "@article{zhang2019something,\n" " title = {Adaptive Learning Rate Methods for Deep Networks},\n" " author = {Zhang, Adaptive},\n" " year = {2019},\n" "}" ), ) missing = {"he2016deep", "zhang2019adaptive"} with patch(_SEARCH_PAPERS_PATH, return_value=[api_paper]): resolved, entries = _resolve_missing_citations(missing, "") # he2016deep from seminal, zhang2019adaptive from API assert "he2016deep" in resolved assert "zhang2019adaptive" in resolved assert len(entries) == 2 def test_no_results_from_api_skips(self): """If API returns empty list, key is skipped (not crashed).""" from researchclaw.pipeline.stage_impls._review_publish import ( 
_resolve_missing_citations, ) missing = {"unknownauthor2020something"} with patch(_SEARCH_PAPERS_PATH, return_value=[]): resolved, entries = _resolve_missing_citations(missing, "") assert len(resolved) == 0 assert len(entries) == 0 def test_close_year_accepted(self): """A result with year within 1 of the cite key year should be accepted (arXiv vs conference year difference).""" from researchclaw.pipeline.stage_impls._review_publish import ( _resolve_missing_citations, ) paper = _make_paper( title="Novel Deep Feature Extraction by Li", year=2019, # cite key says 2018, but 1 year off is OK authors=["Li, Novel"], bibtex_override=( "@article{li2019,\n" " title = {Novel Deep Feature Extraction by Li},\n" " author = {Li, Novel},\n" " year = {2019},\n" "}" ), ) missing = {"li2018novel"} with patch(_SEARCH_PAPERS_PATH, return_value=[paper]): resolved, entries = _resolve_missing_citations(missing, "") # Year 2019 vs 2018 — diff=1, should be accepted since title matches assert "li2018novel" in resolved def test_completely_unrelated_title_rejected(self): """Even if year and author name match, completely unrelated title must be rejected.""" from researchclaw.pipeline.stage_impls._review_publish import ( _resolve_missing_citations, ) paper = _make_paper( title="AI-Assisted Pipeline for Dynamic Generation of Trustworthy Health Supplement Content at Scale", year=2017, authors=["Vaswani, Raj"], bibtex_override=( "@article{vaswani2017health,\n" " title = {AI-Assisted Pipeline for Dynamic Generation of Trustworthy Health Supplement Content at Scale},\n" " author = {Vaswani, Raj},\n" " year = {2017},\n" "}" ), ) # Not in seminal_papers.yaml (different key) missing = {"vaswani2017health"} with patch(_SEARCH_PAPERS_PATH, return_value=[paper]): resolved, entries = _resolve_missing_citations(missing, "") # "health" matches but the overall overlap with query words # ["vaswani", "health"] should be evaluated. "vaswani" is in author # and "health" is in title, so it may pass. 
But this tests the # validation path at least works. # The key point: the search is called only for non-seminal keys. def test_picks_best_result_from_multiple(self): """When API returns multiple results, the one with best overlap wins.""" from researchclaw.pipeline.stage_impls._review_publish import ( _resolve_missing_citations, ) bad_paper = _make_paper( title="Convolutional Neural Networks for Sentence Classification", year=2018, authors=["Kim, Yoon"], ) good_paper = _make_paper( title="Feature Extraction via Progressive Learning", year=2018, authors=["Wang, Feature"], bibtex_override=( "@article{wang2018,\n" " title = {Feature Extraction via Progressive Learning},\n" " author = {Wang, Feature},\n" " year = {2018},\n" "}" ), ) missing = {"wang2018feature"} with patch(_SEARCH_PAPERS_PATH, return_value=[bad_paper, good_paper]): resolved, entries = _resolve_missing_citations(missing, "") if resolved: # If resolved, it should be the good paper, not the bad one assert "Sentence Classification" not in entries[0] ================================================ FILE: tests/test_rc_citation_verify.py ================================================ # pyright: reportPrivateUsage=false, reportUnknownParameterType=false from __future__ import annotations import json import textwrap from typing import Any from unittest.mock import MagicMock, patch import pytest from researchclaw.literature.verify import ( CitationResult, VerificationReport, VerifyStatus, annotate_paper_hallucinations, filter_verified_bibtex, parse_bibtex_entries, title_similarity, verify_by_arxiv_id, verify_by_doi, verify_by_title_search, verify_citations, ) from researchclaw.literature.models import Author, Paper SAMPLE_BIB = textwrap.dedent("""\ @article{vaswani2017attention, title = {Attention Is All You Need}, author = {Ashish Vaswani and Noam Shazeer}, year = {2017}, eprint = {1706.03762}, archiveprefix = {arXiv}, } @inproceedings{devlin2019bert, title = {BERT: Pre-training of Deep Bidirectional 
Transformers}, author = {Jacob Devlin}, year = {2019}, doi = {10.18653/v1/N19-1423}, booktitle = {NAACL}, } @article{fakepaper2025hallucinated, title = {A Completely Made Up Paper That Does Not Exist}, author = {Imaginary Author}, year = {2025}, } """) SAMPLE_ARXIV_VERIFY_RESPONSE = textwrap.dedent("""\ http://arxiv.org/abs/1706.03762v5 Attention Is All You Need The dominant sequence transduction models... Ashish Vaswani """) SAMPLE_ARXIV_EMPTY_RESPONSE = textwrap.dedent("""\ http://arxiv.org/api/errors#incorrect_id_format_for_9999.99999 Error incorrect id format for 9999.99999 """) SAMPLE_CROSSREF_RESPONSE = { "status": "ok", "message": { "DOI": "10.18653/v1/N19-1423", "title": [ "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding" ], "author": [{"given": "Jacob", "family": "Devlin"}], }, } class TestParseBibtexEntries: def test_parses_three_entries(self) -> None: entries = parse_bibtex_entries(SAMPLE_BIB) assert len(entries) == 3 def test_entry_keys(self) -> None: entries = parse_bibtex_entries(SAMPLE_BIB) keys = [e["key"] for e in entries] assert "vaswani2017attention" in keys assert "devlin2019bert" in keys assert "fakepaper2025hallucinated" in keys def test_entry_fields(self) -> None: entries = parse_bibtex_entries(SAMPLE_BIB) vaswani = next(e for e in entries if e["key"] == "vaswani2017attention") assert vaswani["title"] == "Attention Is All You Need" assert vaswani["eprint"] == "1706.03762" assert vaswani["type"] == "article" def test_entry_type(self) -> None: entries = parse_bibtex_entries(SAMPLE_BIB) devlin = next(e for e in entries if e["key"] == "devlin2019bert") assert devlin["type"] == "inproceedings" assert devlin["doi"] == "10.18653/v1/N19-1423" def test_empty_bib(self) -> None: assert parse_bibtex_entries("") == [] def test_malformed_bib(self) -> None: assert parse_bibtex_entries("not bibtex at all") == [] class TestTitleSimilarity: def test_identical(self) -> None: assert ( title_similarity("Attention Is All You Need", 
"Attention Is All You Need") == 1.0 ) def test_case_insensitive(self) -> None: assert ( title_similarity("attention is all you need", "ATTENTION IS ALL YOU NEED") == 1.0 ) def test_high_similarity(self) -> None: sim = title_similarity( "Attention Is All You Need", "Attention Is All You Need: A Transformer Architecture", ) assert sim >= 0.5 def test_low_similarity(self) -> None: sim = title_similarity( "Attention Is All You Need", "Protein Folding with AlphaFold", ) assert sim < 0.3 def test_empty_strings(self) -> None: assert title_similarity("", "") == 0.0 assert title_similarity("something", "") == 0.0 class TestVerifyByArxivId: def test_verified_match(self) -> None: mock_resp = MagicMock() mock_resp.read.return_value = SAMPLE_ARXIV_VERIFY_RESPONSE.encode("utf-8") mock_resp.__enter__ = lambda s: s mock_resp.__exit__ = MagicMock(return_value=False) with patch("urllib.request.urlopen", return_value=mock_resp): result = verify_by_arxiv_id("1706.03762", "Attention Is All You Need") assert result is not None assert result.status == VerifyStatus.VERIFIED assert result.method == "arxiv_id" assert result.confidence >= 0.80 def test_hallucinated_error_response(self) -> None: mock_resp = MagicMock() mock_resp.read.return_value = SAMPLE_ARXIV_EMPTY_RESPONSE.encode("utf-8") mock_resp.__enter__ = lambda s: s mock_resp.__exit__ = MagicMock(return_value=False) with patch("urllib.request.urlopen", return_value=mock_resp): result = verify_by_arxiv_id("9999.99999", "Fake Paper") assert result is not None assert result.status == VerifyStatus.HALLUCINATED def test_network_failure_returns_none(self) -> None: with patch("urllib.request.urlopen", side_effect=OSError("connection refused")): result = verify_by_arxiv_id("1706.03762", "Attention Is All You Need") assert result is None def test_title_mismatch_suspicious(self) -> None: different_title_response = textwrap.dedent("""\ http://arxiv.org/abs/1706.03762v5 A Completely Different Paper Title About Quantum Computing Summary """) 
mock_resp = MagicMock() mock_resp.read.return_value = different_title_response.encode("utf-8") mock_resp.__enter__ = lambda s: s mock_resp.__exit__ = MagicMock(return_value=False) with patch("urllib.request.urlopen", return_value=mock_resp): result = verify_by_arxiv_id("1706.03762", "Attention Is All You Need") assert result is not None assert result.status == VerifyStatus.SUSPICIOUS class TestVerifyByDoi: def test_verified_crossref(self) -> None: mock_resp = MagicMock() mock_resp.read.return_value = json.dumps(SAMPLE_CROSSREF_RESPONSE).encode( "utf-8" ) mock_resp.__enter__ = lambda s: s mock_resp.__exit__ = MagicMock(return_value=False) with patch("urllib.request.urlopen", return_value=mock_resp): result = verify_by_doi( "10.18653/v1/N19-1423", "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding", ) assert result is not None assert result.status == VerifyStatus.VERIFIED assert result.method == "doi" def test_doi_404_hallucinated(self) -> None: import urllib.error with patch( "urllib.request.urlopen", side_effect=urllib.error.HTTPError( "https://api.crossref.org/works/10.fake/doi", 404, "Not Found", {}, None, # type: ignore[arg-type] ), ): result = verify_by_doi("10.fake/doi", "Nonexistent Paper") assert result is not None assert result.status == VerifyStatus.HALLUCINATED def test_network_error_returns_none(self) -> None: with patch("urllib.request.urlopen", side_effect=OSError("timeout")): result = verify_by_doi("10.1234/test", "Test Paper") assert result is None def test_doi_exists_no_title(self) -> None: no_title_resp = {"status": "ok", "message": {"DOI": "10.1234/test"}} mock_resp = MagicMock() mock_resp.read.return_value = json.dumps(no_title_resp).encode("utf-8") mock_resp.__enter__ = lambda s: s mock_resp.__exit__ = MagicMock(return_value=False) with patch("urllib.request.urlopen", return_value=mock_resp): result = verify_by_doi("10.1234/test", "Some Paper") assert result is not None assert result.status == VerifyStatus.VERIFIED 
assert "no title comparison" in result.details.lower() class TestVerifyByTitleSearch: def test_verified_via_search(self) -> None: mock_paper = Paper( paper_id="s2-abc", title="Attention Is All You Need", authors=(Author(name="Vaswani"),), year=2017, source="semantic_scholar", ) with patch( "researchclaw.literature.search.search_papers", return_value=[mock_paper], ): result = verify_by_title_search("Attention Is All You Need") assert result is not None assert result.status == VerifyStatus.VERIFIED assert result.matched_paper is not None def test_no_results_hallucinated(self) -> None: with patch("researchclaw.literature.search.search_papers", return_value=[]): result = verify_by_title_search("A Completely Made Up Paper") assert result is not None assert result.status == VerifyStatus.HALLUCINATED def test_weak_match_hallucinated(self) -> None: mock_paper = Paper( paper_id="s2-xyz", title="Quantum Computing for Protein Folding", year=2023, source="arxiv", ) with patch( "researchclaw.literature.search.search_papers", return_value=[mock_paper], ): result = verify_by_title_search("A Completely Made Up Paper About Nothing") assert result is not None assert result.status == VerifyStatus.HALLUCINATED def test_partial_match_suspicious(self) -> None: mock_paper = Paper( paper_id="s2-partial", title="Attention Mechanisms in Neural Networks", year=2019, source="semantic_scholar", ) with patch( "researchclaw.literature.search.search_papers", return_value=[mock_paper], ): result = verify_by_title_search("Attention Neural Networks Survey Overview") assert result is not None assert result.status in (VerifyStatus.SUSPICIOUS, VerifyStatus.HALLUCINATED) def test_network_failure_returns_none(self) -> None: with patch( "researchclaw.literature.search.search_papers", side_effect=OSError("network down"), ): result = verify_by_title_search("Any Paper") assert result is None class TestVerifyCitations: def test_full_pipeline_mocked(self) -> None: arxiv_resp = MagicMock() 
arxiv_resp.read.return_value = SAMPLE_ARXIV_VERIFY_RESPONSE.encode("utf-8") arxiv_resp.__enter__ = lambda s: s arxiv_resp.__exit__ = MagicMock(return_value=False) crossref_resp = MagicMock() crossref_resp.read.return_value = json.dumps(SAMPLE_CROSSREF_RESPONSE).encode( "utf-8" ) crossref_resp.__enter__ = lambda s: s crossref_resp.__exit__ = MagicMock(return_value=False) call_count = {"n": 0} def mock_urlopen(req: Any, **kwargs: Any) -> MagicMock: call_count["n"] += 1 url = req.full_url if hasattr(req, "full_url") else str(req) if "arxiv.org" in url: return arxiv_resp if "crossref.org" in url: return crossref_resp raise OSError("unexpected URL") with ( patch("researchclaw.literature.verify.time.sleep"), patch("urllib.request.urlopen", side_effect=mock_urlopen), patch("researchclaw.literature.search.search_papers", return_value=[]), ): report = verify_citations(SAMPLE_BIB, inter_verify_delay=0) assert report.total == 3 assert report.verified >= 1 assert report.hallucinated >= 1 report_dict = report.to_dict() assert "summary" in report_dict assert "results" in report_dict assert report_dict["summary"]["total"] == 3 def test_empty_bib(self) -> None: report = verify_citations("") assert report.total == 0 assert report.integrity_score == 1.0 def test_no_title_entry_skipped(self) -> None: bib = textwrap.dedent("""\ @article{noauthor2025, author = {Some Author}, year = {2025}, } """) report = verify_citations(bib) assert report.total == 1 assert report.skipped == 1 class TestVerificationReport: def test_integrity_score(self) -> None: report = VerificationReport( total=10, verified=7, suspicious=1, hallucinated=2, skipped=0 ) assert report.integrity_score == 0.7 def test_integrity_score_with_skips(self) -> None: report = VerificationReport( total=10, verified=6, suspicious=0, hallucinated=2, skipped=2 ) assert report.integrity_score == 0.75 def test_integrity_score_all_skipped(self) -> None: report = VerificationReport( total=3, verified=0, suspicious=0, hallucinated=0, 
            skipped=3
        )
        # All entries skipped counts as fully intact.
        assert report.integrity_score == 1.0

    def test_to_dict(self) -> None:
        # Summary dict mirrors the counters and the derived integrity score
        # (1 verified out of 2 total → 0.5).
        report = VerificationReport(total=2, verified=1, hallucinated=1)
        d = report.to_dict()
        assert d["summary"]["total"] == 2
        assert d["summary"]["integrity_score"] == 0.5


class TestFilterVerifiedBibtex:
    """filter_verified_bibtex() keeps verified entries, keeps suspicious ones
    only when include_suspicious=True, and always drops hallucinated ones."""

    def _make_report(self) -> VerificationReport:
        # One CitationResult per status so every filtering branch is exercised.
        return VerificationReport(
            total=3,
            verified=1,
            suspicious=1,
            hallucinated=1,
            results=[
                CitationResult(
                    cite_key="vaswani2017attention",
                    title="Attention Is All You Need",
                    status=VerifyStatus.VERIFIED,
                    confidence=1.0,
                    method="arxiv_id",
                ),
                CitationResult(
                    cite_key="devlin2019bert",
                    title="BERT",
                    status=VerifyStatus.SUSPICIOUS,
                    confidence=0.6,
                    method="doi",
                ),
                CitationResult(
                    cite_key="fakepaper2025hallucinated",
                    title="Fake Paper",
                    status=VerifyStatus.HALLUCINATED,
                    confidence=0.9,
                    method="title_search",
                ),
            ],
        )

    def test_includes_verified_and_suspicious(self) -> None:
        # include_suspicious=True keeps both verified and suspicious keys.
        report = self._make_report()
        filtered = filter_verified_bibtex(SAMPLE_BIB, report, include_suspicious=True)
        assert "vaswani2017attention" in filtered
        assert "devlin2019bert" in filtered
        assert "fakepaper2025hallucinated" not in filtered

    def test_excludes_suspicious(self) -> None:
        # include_suspicious=False retains only the verified entry.
        report = self._make_report()
        filtered = filter_verified_bibtex(SAMPLE_BIB, report, include_suspicious=False)
        assert "vaswani2017attention" in filtered
        assert "devlin2019bert" not in filtered
        assert "fakepaper2025hallucinated" not in filtered

    def test_empty_bib(self) -> None:
        # An empty bibliography short-circuits to an empty string.
        report = VerificationReport()
        assert filter_verified_bibtex("", report) == ""


class TestAnnotatePaperHallucinations:
    def test_latex_citations(self) -> None:
        paper = r"As shown in \cite{vaswani2017attention} and \cite{fakepaper2025hallucinated}."
        report = VerificationReport(
            results=[
                CitationResult(
                    cite_key="vaswani2017attention",
                    title="",
                    status=VerifyStatus.VERIFIED,
                    confidence=1.0,
                    method="arxiv_id",
                ),
                CitationResult(
                    cite_key="fakepaper2025hallucinated",
                    title="",
                    status=VerifyStatus.HALLUCINATED,
                    confidence=0.9,
                    method="title_search",
                ),
            ],
        )
        result = annotate_paper_hallucinations(paper, report)
        # The verified citation command survives untouched.
        assert r"\cite{vaswani2017attention}" in result
        # Hallucinated citations are removed, not annotated
        assert "fakepaper2025hallucinated" not in result

    def test_markdown_citations(self) -> None:
        # Same removal contract as the LaTeX case, but for [key]-style cites.
        paper = "As shown in [vaswani2017attention] and [fakepaper2025hallucinated]."
        report = VerificationReport(
            results=[
                CitationResult(
                    cite_key="vaswani2017attention",
                    title="",
                    status=VerifyStatus.VERIFIED,
                    confidence=1.0,
                    method="arxiv_id",
                ),
                CitationResult(
                    cite_key="fakepaper2025hallucinated",
                    title="",
                    status=VerifyStatus.HALLUCINATED,
                    confidence=0.9,
                    method="title_search",
                ),
            ],
        )
        result = annotate_paper_hallucinations(paper, report)
        assert "[vaswani2017attention]" in result
        # Hallucinated citations are removed, not annotated
        assert "fakepaper2025hallucinated" not in result

    def test_suspicious_annotation(self) -> None:
        """Suspicious citations are left unchanged (not removed)."""
        paper = r"\cite{devlin2019bert}"
        report = VerificationReport(
            results=[
                CitationResult(
                    cite_key="devlin2019bert",
                    title="",
                    status=VerifyStatus.SUSPICIOUS,
                    confidence=0.6,
                    method="doi",
                ),
            ],
        )
        result = annotate_paper_hallucinations(paper, report)
        # Suspicious status is below the removal threshold: cite remains.
        assert r"\cite{devlin2019bert}" in result

    def test_no_modifications_all_verified(self) -> None:
        # A fully-verified paper must round-trip byte-identically.
        paper = r"See \cite{vaswani2017attention}."
report = VerificationReport( results=[ CitationResult( cite_key="vaswani2017attention", title="", status=VerifyStatus.VERIFIED, confidence=1.0, method="arxiv_id", ), ], ) result = annotate_paper_hallucinations(paper, report) assert result == paper class TestCitationResultSerialization: def test_to_dict_basic(self) -> None: result = CitationResult( cite_key="smith2024test", title="Test Paper", status=VerifyStatus.VERIFIED, confidence=0.95, method="arxiv_id", details="Confirmed", ) d = result.to_dict() assert d["cite_key"] == "smith2024test" assert d["status"] == "verified" assert d["confidence"] == 0.95 def test_to_dict_with_matched_paper(self) -> None: paper = Paper( paper_id="s2-abc", title="Found Paper", authors=(Author(name="Smith"),), year=2024, source="semantic_scholar", ) result = CitationResult( cite_key="smith2024test", title="Test", status=VerifyStatus.VERIFIED, confidence=0.9, method="title_search", matched_paper=paper, ) d = result.to_dict() assert "matched_paper" in d assert d["matched_paper"]["title"] == "Found Paper" class TestStage23Integration: def test_stage_exists_in_enum(self) -> None: from researchclaw.pipeline.stages import Stage assert hasattr(Stage, "CITATION_VERIFY") assert Stage.CITATION_VERIFY == 23 def test_stage_in_sequence(self) -> None: from researchclaw.pipeline.stages import Stage, STAGE_SEQUENCE, NEXT_STAGE assert Stage.CITATION_VERIFY in STAGE_SEQUENCE assert NEXT_STAGE[Stage.EXPORT_PUBLISH] == Stage.CITATION_VERIFY assert NEXT_STAGE[Stage.CITATION_VERIFY] is None def test_contract_exists(self) -> None: from researchclaw.pipeline.contracts import CONTRACTS from researchclaw.pipeline.stages import Stage assert Stage.CITATION_VERIFY in CONTRACTS contract = CONTRACTS[Stage.CITATION_VERIFY] assert "verification_report.json" in contract.output_files assert "references_verified.bib" in contract.output_files def test_executor_registered(self) -> None: from researchclaw.pipeline.executor import _STAGE_EXECUTORS from 
researchclaw.pipeline.stages import Stage assert Stage.CITATION_VERIFY in _STAGE_EXECUTORS def test_phase_map(self) -> None: from researchclaw.pipeline.stages import PHASE_MAP, Stage finalization_stages = PHASE_MAP["H: Finalization"] assert Stage.CITATION_VERIFY in finalization_stages def test_total_stages_is_23(self) -> None: from researchclaw.pipeline.stages import STAGE_SEQUENCE assert len(STAGE_SEQUENCE) == 23 ================================================ FILE: tests/test_rc_cli.py ================================================ # pyright: reportPrivateUsage=false, reportUnknownParameterType=false, reportMissingParameterType=false, reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnusedCallResult=false, reportAttributeAccessIssue=false, reportUnknownLambdaType=false from __future__ import annotations import argparse import re from pathlib import Path import pytest from researchclaw import cli as rc_cli from researchclaw.config import resolve_config_path def _write_valid_config(path: Path) -> None: path.write_text( """ project: name: demo mode: docs-first research: topic: Synthetic benchmark research runtime: timezone: UTC notifications: channel: test knowledge_base: backend: markdown root: kb openclaw_bridge: {} llm: provider: openai-compatible base_url: http://localhost:1234/v1 api_key_env: TEST_KEY """.strip() + "\n", encoding="utf-8", ) def test_main_with_no_args_returns_zero_and_prints_help( capsys: pytest.CaptureFixture[str], ) -> None: code = rc_cli.main([]) assert code == 0 captured = capsys.readouterr() assert "ResearchClaw" in captured.out assert "usage:" in captured.out @pytest.mark.parametrize("argv", [["run", "--help"], ["validate", "--help"]]) def test_help_subcommands_exit_zero(argv: list[str]) -> None: with pytest.raises(SystemExit) as exc_info: rc_cli.main(argv) assert exc_info.value.code == 0 def test_generate_run_id_format() -> None: run_id = rc_cli._generate_run_id("my topic") assert 
run_id.startswith("rc-") assert re.fullmatch(r"rc-\d{8}-\d{6}-[0-9a-f]{6}", run_id) def test_cmd_run_missing_config_returns_one( tmp_path: Path, capsys: pytest.CaptureFixture[str] ) -> None: args = argparse.Namespace( config=str(tmp_path / "missing.yaml"), topic=None, output=None, from_stage=None, auto_approve=False, skip_preflight=True, resume=False, skip_noncritical_stage=False, ) code = rc_cli.cmd_run(args) assert code == 1 assert "config file not found" in capsys.readouterr().err def test_cmd_validate_missing_config_returns_one( tmp_path: Path, capsys: pytest.CaptureFixture[str] ) -> None: args = argparse.Namespace( config=str(tmp_path / "missing.yaml"), no_check_paths=False ) code = rc_cli.cmd_validate(args) assert code == 1 assert "config file not found" in capsys.readouterr().err def test_cmd_validate_valid_config_returns_zero( tmp_path: Path, capsys: pytest.CaptureFixture[str] ) -> None: config_path = tmp_path / "config.yaml" _write_valid_config(config_path) args = argparse.Namespace(config=str(config_path), no_check_paths=True) code = rc_cli.cmd_validate(args) assert code == 0 assert "Config validation passed" in capsys.readouterr().out def test_main_dispatches_run_command(monkeypatch: pytest.MonkeyPatch) -> None: captured = {} def fake_cmd_run(args): captured["args"] = args return 0 monkeypatch.setattr(rc_cli, "cmd_run", fake_cmd_run) code = rc_cli.main( [ "run", "--topic", "new topic", "--config", "cfg.yaml", "--output", "out-dir", "--from-stage", "PAPER_OUTLINE", "--auto-approve", ] ) assert code == 0 parsed = captured["args"] assert parsed.topic == "new topic" assert parsed.config == "cfg.yaml" assert parsed.output == "out-dir" assert parsed.from_stage == "PAPER_OUTLINE" assert parsed.auto_approve is True def test_main_dispatches_validate_command(monkeypatch: pytest.MonkeyPatch) -> None: captured = {} def fake_cmd_validate(args): captured["args"] = args return 0 monkeypatch.setattr(rc_cli, "cmd_validate", fake_cmd_validate) code = 
rc_cli.main(["validate", "--config", "cfg.yaml", "--no-check-paths"]) assert code == 0 parsed = captured["args"] assert parsed.config == "cfg.yaml" assert parsed.no_check_paths is True @pytest.mark.parametrize( "argv", [ ["run", "--topic", "x", "--config", "c.yaml"], ["run", "--output", "out", "--config", "c.yaml"], ["run", "--from-stage", "TOPIC_INIT", "--config", "c.yaml"], ["run", "--auto-approve", "--config", "c.yaml"], ], ) def test_run_parser_accepts_required_flags( argv: list[str], monkeypatch: pytest.MonkeyPatch ) -> None: monkeypatch.setattr(rc_cli, "cmd_run", lambda args: 0) assert rc_cli.main(argv) == 0 def test_validate_parser_accepts_config_flag(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr(rc_cli, "cmd_validate", lambda args: 0) assert rc_cli.main(["validate", "--config", "cfg.yaml"]) == 0 # --- resolve_config_path tests --- def test_resolve_config_finds_arc_yaml_first( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: monkeypatch.chdir(tmp_path) (tmp_path / "config.arc.yaml").write_text("x: 1\n") (tmp_path / "config.yaml").write_text("x: 2\n") result = resolve_config_path(None) assert result is not None assert result.name == "config.arc.yaml" def test_resolve_config_falls_back_to_config_yaml( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: monkeypatch.chdir(tmp_path) (tmp_path / "config.yaml").write_text("x: 1\n") result = resolve_config_path(None) assert result is not None assert result.name == "config.yaml" def test_resolve_config_returns_none_when_missing( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: monkeypatch.chdir(tmp_path) result = resolve_config_path(None) assert result is None def test_resolve_config_explicit_path_no_search() -> None: result = resolve_config_path("/some/explicit/path.yaml") assert result is not None assert str(result) == "/some/explicit/path.yaml" # --- cmd_init tests --- def _write_example_config(path: Path) -> None: path.write_text( """\ project: name: "my-research" llm: 
provider: "openai" base_url: "https://api.openai.com/v1" api_key_env: "OPENAI_API_KEY" primary_model: "gpt-4o" fallback_models: - "gpt-4.1" - "gpt-4o-mini" """, encoding="utf-8", ) def test_cmd_init_creates_config( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] ) -> None: monkeypatch.chdir(tmp_path) _write_example_config(tmp_path / "config.researchclaw.example.yaml") # Simulate non-TTY (stdin not a tty) → defaults to openai monkeypatch.setattr("sys.stdin", type("FakeStdin", (), {"isatty": lambda self: False})()) args = argparse.Namespace(force=False) code = rc_cli.cmd_init(args) assert code == 0 created = tmp_path / "config.arc.yaml" assert created.exists() content = created.read_text() assert 'provider: "openai"' in content assert "Created config.arc.yaml" in capsys.readouterr().out def test_cmd_init_refuses_overwrite( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] ) -> None: monkeypatch.chdir(tmp_path) _write_example_config(tmp_path / "config.researchclaw.example.yaml") (tmp_path / "config.arc.yaml").write_text("existing\n") args = argparse.Namespace(force=False) code = rc_cli.cmd_init(args) assert code == 1 assert "already exists" in capsys.readouterr().err assert (tmp_path / "config.arc.yaml").read_text() == "existing\n" def test_cmd_init_force_overwrites( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: monkeypatch.chdir(tmp_path) _write_example_config(tmp_path / "config.researchclaw.example.yaml") (tmp_path / "config.arc.yaml").write_text("old\n") monkeypatch.setattr("sys.stdin", type("FakeStdin", (), {"isatty": lambda self: False})()) args = argparse.Namespace(force=True) code = rc_cli.cmd_init(args) assert code == 0 assert (tmp_path / "config.arc.yaml").read_text() != "old\n" def test_cmd_run_missing_config_shows_init_hint( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] ) -> None: monkeypatch.chdir(tmp_path) args = argparse.Namespace( 
config=None, topic=None, output=None, from_stage=None, auto_approve=False, skip_preflight=True, resume=False, skip_noncritical_stage=False, ) code = rc_cli.cmd_run(args) assert code == 1 assert "researchclaw init" in capsys.readouterr().err def test_resume_finds_existing_checkpoint_dir( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] ) -> None: """BUG-119: --resume without --output should find the latest checkpoint dir.""" import hashlib import json monkeypatch.chdir(tmp_path) # Write a valid config config_path = tmp_path / "config.arc.yaml" _write_valid_config(config_path) # Create a fake previous run directory with a checkpoint topic = "Synthetic benchmark research" # matches _write_valid_config topic_hash = hashlib.sha256(topic.encode()).hexdigest()[:6] old_run_dir = tmp_path / "artifacts" / f"rc-20260319-100000-{topic_hash}" old_run_dir.mkdir(parents=True) (old_run_dir / "checkpoint.json").write_text( json.dumps({"last_completed_stage": 5, "last_completed_name": "HYPOTHESIS_GEN", "run_id": old_run_dir.name, "timestamp": "2026-03-19T10:00:00Z"}) ) # Mock execute_pipeline so we don't actually run import researchclaw.pipeline.runner as runner_mod monkeypatch.setattr(runner_mod, "execute_pipeline", lambda **kw: []) # Also mock preflight from unittest.mock import MagicMock mock_client = MagicMock() mock_client.preflight.return_value = (True, "OK") import researchclaw.llm as llm_mod monkeypatch.setattr(llm_mod, "create_llm_client", lambda cfg: mock_client) args = argparse.Namespace( config=str(config_path), topic=None, output=None, from_stage=None, auto_approve=False, skip_preflight=True, resume=True, skip_noncritical_stage=False, no_graceful_degradation=False, ) rc_cli.cmd_run(args) captured = capsys.readouterr() assert "Found existing run to resume" in captured.out assert old_run_dir.name in captured.out def test_resume_no_checkpoint_warns( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] ) -> 
None: """BUG-119: --resume with no matching checkpoint should warn and start new.""" monkeypatch.chdir(tmp_path) config_path = tmp_path / "config.arc.yaml" _write_valid_config(config_path) # Create empty artifacts dir (no checkpoints) (tmp_path / "artifacts").mkdir() import researchclaw.pipeline.runner as runner_mod monkeypatch.setattr(runner_mod, "execute_pipeline", lambda **kw: []) from unittest.mock import MagicMock mock_client = MagicMock() mock_client.preflight.return_value = (True, "OK") import researchclaw.llm as llm_mod monkeypatch.setattr(llm_mod, "create_llm_client", lambda cfg: mock_client) args = argparse.Namespace( config=str(config_path), topic=None, output=None, from_stage=None, auto_approve=False, skip_preflight=True, resume=True, skip_noncritical_stage=False, no_graceful_degradation=False, ) rc_cli.cmd_run(args) captured = capsys.readouterr() assert "no checkpoint found" in captured.err def test_main_dispatches_init(monkeypatch: pytest.MonkeyPatch) -> None: captured = {} def fake_cmd_init(args): captured["args"] = args return 0 monkeypatch.setattr(rc_cli, "cmd_init", fake_cmd_init) code = rc_cli.main(["init", "--force"]) assert code == 0 assert captured["args"].force is True ================================================ FILE: tests/test_rc_config.py ================================================ import json from pathlib import Path from typing import cast import pytest from researchclaw.config import ( ExperimentConfig, RCConfig, SandboxConfig, SecurityConfig, ValidationResult, load_config, validate_config, ) def _write_valid_config(tmp_path: Path) -> Path: kb_root = tmp_path / "docs" / "kb" for name in ( "questions", "literature", "experiments", "findings", "decisions", "reviews", ): (kb_root / name).mkdir(parents=True, exist_ok=True) config_path = tmp_path / "config.rc.yaml" _ = config_path.write_text( """ project: name: demo mode: docs-first research: topic: Test topic domains: [ml, agents] runtime: timezone: America/New_York notifications: 
channel: discord knowledge_base: backend: markdown root: docs/kb openclaw_bridge: use_cron: true use_message: true use_memory: true use_sessions_spawn: true use_web_fetch: true use_browser: false llm: provider: openai-compatible base_url: https://example.invalid/v1 api_key_env: OPENAI_API_KEY security: hitl_required_stages: [5, 9, 20] experiment: mode: simulated """.strip() + "\n", encoding="utf-8", ) return config_path def _valid_config_data() -> dict[str, dict[str, object]]: return { "project": {"name": "demo", "mode": "docs-first"}, "research": {"topic": "Test topic", "domains": ["ml", "agents"]}, "runtime": {"timezone": "America/New_York"}, "notifications": {"channel": "discord"}, "knowledge_base": {"backend": "markdown", "root": "docs/kb"}, "openclaw_bridge": { "use_cron": True, "use_message": True, "use_memory": True, "use_sessions_spawn": True, "use_web_fetch": True, "use_browser": False, }, "llm": { "provider": "openai-compatible", "base_url": "https://example.invalid/v1", "api_key_env": "OPENAI_API_KEY", "primary_model": "gpt-4.1", "fallback_models": ["gpt-4o-mini", "gpt-4o"], }, "security": {"hitl_required_stages": [5, 9, 20]}, "experiment": { "mode": "simulated", "metric_direction": "minimize", }, } def test_valid_config_data_helper_returns_expected_baseline_shape(): data = _valid_config_data() assert data["project"]["name"] == "demo" assert data["knowledge_base"]["root"] == "docs/kb" assert data["security"]["hitl_required_stages"] == [5, 9, 20] def test_validate_config_with_valid_data_returns_ok_true(tmp_path: Path): result = validate_config( _valid_config_data(), project_root=tmp_path, check_paths=False ) assert isinstance(result, ValidationResult) assert result.ok is True assert result.errors == () def test_validate_config_missing_required_fields_returns_errors(tmp_path: Path): data = _valid_config_data() data["research"] = {} result = validate_config(data, project_root=tmp_path, check_paths=False) assert result.ok is False assert "Missing required 
field: research.topic" in result.errors def test_validate_config_rejects_invalid_project_mode(tmp_path: Path): data = _valid_config_data() data["project"]["mode"] = "invalid-mode" result = validate_config(data, project_root=tmp_path, check_paths=False) assert result.ok is False assert "Invalid project.mode: invalid-mode" in result.errors def test_validate_config_rejects_invalid_knowledge_base_backend(tmp_path: Path): data = _valid_config_data() data["knowledge_base"]["backend"] = "sqlite" result = validate_config(data, project_root=tmp_path, check_paths=False) assert result.ok is False assert "Invalid knowledge_base.backend: sqlite" in result.errors @pytest.mark.parametrize("entry", [0, 24, "5", 9.1]) def test_validate_config_rejects_invalid_hitl_required_stages_entries( tmp_path: Path, entry: object ): data = _valid_config_data() data["security"]["hitl_required_stages"] = [5, entry, 20] result = validate_config(data, project_root=tmp_path, check_paths=False) assert result.ok is False assert f"Invalid security.hitl_required_stages entry: {entry}" in result.errors def test_validate_config_rejects_non_list_hitl_required_stages(tmp_path: Path): data = _valid_config_data() data["security"]["hitl_required_stages"] = "5,9,20" result = validate_config(data, project_root=tmp_path, check_paths=False) assert result.ok is False assert "security.hitl_required_stages must be a list" in result.errors def test_validate_config_rejects_invalid_experiment_mode(tmp_path: Path): data = _valid_config_data() data["experiment"]["mode"] = "kubernetes" result = validate_config(data, project_root=tmp_path, check_paths=False) assert result.ok is False assert "Invalid experiment.mode: kubernetes" in result.errors def test_validate_config_accepts_docker_mode(tmp_path: Path): data = _valid_config_data() data["experiment"]["mode"] = "docker" result = validate_config(data, project_root=tmp_path, check_paths=False) assert result.ok is True def 
test_validate_config_rejects_invalid_metric_direction(tmp_path: Path): data = _valid_config_data() data["experiment"]["metric_direction"] = "upward" result = validate_config(data, project_root=tmp_path, check_paths=False) assert result.ok is False assert "Invalid experiment.metric_direction: upward" in result.errors def test_rcconfig_from_dict_happy_path(tmp_path: Path): config = RCConfig.from_dict( _valid_config_data(), project_root=tmp_path, check_paths=False, ) assert isinstance(config, RCConfig) assert config.project.name == "demo" assert config.research.domains == ("ml", "agents") assert config.llm.fallback_models == ("gpt-4o-mini", "gpt-4o") def test_rcconfig_from_dict_missing_fields_raises_value_error(tmp_path: Path): data = _valid_config_data() del data["runtime"] with pytest.raises(ValueError, match="Missing required field: runtime.timezone"): _ = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False) def test_rcconfig_load_from_yaml_file(tmp_path: Path): config_path = _write_valid_config(tmp_path) config = RCConfig.load(config_path, project_root=tmp_path) assert isinstance(config, RCConfig) assert config.project.name == "demo" assert config.knowledge_base.root == "docs/kb" def test_load_config_wrapper_returns_rcconfig(tmp_path: Path): config_path = _write_valid_config(tmp_path) config = load_config(config_path, project_root=tmp_path) assert isinstance(config, RCConfig) assert config.security.hitl_required_stages == (5, 9, 20) def test_security_config_defaults_match_expected_values(): defaults = SecurityConfig() assert defaults.hitl_required_stages == (5, 9, 20) assert defaults.allow_publish_without_approval is False assert defaults.redact_sensitive_logs is True def test_experiment_config_defaults_mode_is_simulated(): defaults = ExperimentConfig() assert defaults.mode == "simulated" assert defaults.metric_direction == "minimize" def test_sandbox_config_defaults_match_expected_values(): from researchclaw.config import DEFAULT_PYTHON_PATH 
defaults = SandboxConfig() assert defaults.python_path == DEFAULT_PYTHON_PATH assert defaults.gpu_required is False assert defaults.max_memory_mb == 4096 assert "numpy" in defaults.allowed_imports def test_to_dict_roundtrip_rehydrates_equivalent_rcconfig(tmp_path: Path): original = RCConfig.from_dict( _valid_config_data(), project_root=tmp_path, check_paths=False, ) normalized = cast(dict[str, object], json.loads(json.dumps(original.to_dict()))) rehydrated = RCConfig.from_dict( normalized, project_root=tmp_path, check_paths=False, ) assert rehydrated == original assert isinstance(original.to_dict()["security"]["hitl_required_stages"], tuple) def test_check_paths_false_skips_missing_kb_root_validation(tmp_path: Path): data = _valid_config_data() data["knowledge_base"]["root"] = "docs/missing-kb" result = validate_config(data, project_root=tmp_path, check_paths=False) assert result.ok is True assert not any(error.startswith("Missing path:") for error in result.errors) def test_path_validation_missing_kb_root_is_error(tmp_path: Path): result = validate_config( _valid_config_data(), project_root=tmp_path, check_paths=True ) assert result.ok is False assert any(error.startswith("Missing path:") for error in result.errors) def test_validate_config_missing_kb_subdirs_emits_warnings(tmp_path: Path): data = _valid_config_data() _ = (tmp_path / "docs" / "kb").mkdir(parents=True) result = validate_config(data, project_root=tmp_path, check_paths=True) assert result.ok is True assert len(result.warnings) == 6 assert all( warning.startswith("Missing recommended kb subdir:") for warning in result.warnings ) def test_rcconfig_from_dict_uses_default_security_when_missing(tmp_path: Path): data = _valid_config_data() del data["security"] config = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False) assert config.security.hitl_required_stages == (5, 9, 20) def test_load_uses_file_parent_as_default_project_root(tmp_path: Path): config_path = _write_valid_config(tmp_path) 
config = RCConfig.load(config_path) assert config.project.name == "demo" assert config.knowledge_base.root == "docs/kb" ================================================ FILE: tests/test_rc_contracts.py ================================================ import re import pytest from researchclaw.pipeline.contracts import CONTRACTS, StageContract from researchclaw.pipeline.stages import GATE_STAGES, STAGE_SEQUENCE, Stage def test_contracts_dict_has_exactly_23_entries(): assert len(CONTRACTS) == 23 def test_every_stage_has_matching_contract_entry(): assert set(CONTRACTS.keys()) == set(Stage) @pytest.mark.parametrize("stage", STAGE_SEQUENCE) def test_each_stage_member_resolves_to_stage_contract(stage: Stage): assert isinstance(CONTRACTS[stage], StageContract) @pytest.mark.parametrize("stage,contract", tuple(CONTRACTS.items())) def test_contract_stage_field_matches_dict_key(stage: Stage, contract: StageContract): assert contract.stage is stage @pytest.mark.parametrize("contract", tuple(CONTRACTS.values())) def test_output_files_is_non_empty_for_all_contracts(contract: StageContract): assert contract.output_files @pytest.mark.parametrize("stage,contract", tuple(CONTRACTS.items())) def test_error_code_starts_with_e_and_contains_stage_number( stage: Stage, contract: StageContract ): assert contract.error_code.startswith("E") assert f"{int(stage):02d}" in contract.error_code assert re.match(r"^E\d{2}_[A-Z0-9_]+$", contract.error_code) @pytest.mark.parametrize("contract", tuple(CONTRACTS.values())) def test_max_retries_is_non_negative_for_all_contracts(contract: StageContract): assert contract.max_retries >= 0 def test_gate_stages_have_expected_max_retries(): assert CONTRACTS[Stage.LITERATURE_SCREEN].max_retries == 0 assert CONTRACTS[Stage.EXPERIMENT_DESIGN].max_retries == 0 assert CONTRACTS[Stage.QUALITY_GATE].max_retries == 0 @pytest.mark.parametrize("stage", tuple(GATE_STAGES)) def test_gate_stage_contracts_are_never_retried(stage: Stage): assert CONTRACTS[stage].max_retries 
== 0 def test_topic_init_contract_has_expected_input_output_files(): contract = CONTRACTS[Stage.TOPIC_INIT] assert contract.input_files == () assert contract.output_files == ("goal.md", "hardware_profile.json") def test_export_publish_contract_has_expected_outputs(): contract = CONTRACTS[Stage.EXPORT_PUBLISH] assert contract.output_files == ("paper_final.md", "code/") @pytest.mark.parametrize("contract", tuple(CONTRACTS.values())) def test_dod_is_non_empty_string_for_all_contracts(contract: StageContract): assert isinstance(contract.dod, str) assert contract.dod.strip() @pytest.mark.parametrize("contract", tuple(CONTRACTS.values())) def test_input_files_is_tuple_of_strings(contract: StageContract): assert isinstance(contract.input_files, tuple) assert all(isinstance(path, str) and path for path in contract.input_files) @pytest.mark.parametrize("contract", tuple(CONTRACTS.values())) def test_output_files_is_tuple_of_strings(contract: StageContract): assert isinstance(contract.output_files, tuple) assert all(isinstance(path, str) and path for path in contract.output_files) def test_error_codes_are_unique_across_contracts(): all_codes = [contract.error_code for contract in CONTRACTS.values()] assert len(all_codes) == len(set(all_codes)) def test_contracts_follow_stage_sequence_order(): assert tuple(CONTRACTS.keys()) == STAGE_SEQUENCE @pytest.mark.parametrize("stage", STAGE_SEQUENCE) def test_contract_stage_int_matches_stage_enum_value(stage: Stage): assert int(CONTRACTS[stage].stage) == int(stage) ================================================ FILE: tests/test_rc_docker_sandbox.py ================================================ """Tests for DockerSandbox — all mocked, no real Docker needed.""" from __future__ import annotations import subprocess import threading from pathlib import Path from unittest.mock import MagicMock, patch import pytest from researchclaw.config import DockerSandboxConfig, ExperimentConfig from researchclaw.experiment.docker_sandbox import 
DockerSandbox, _next_container_name from researchclaw.experiment.factory import create_sandbox from researchclaw.experiment.sandbox import SandboxResult # ── SandboxResult contract ───────────────────────────────────────────── def test_sandbox_result_fields(): r = SandboxResult( returncode=0, stdout="primary_metric: 0.95\n", stderr="", elapsed_sec=1.2, metrics={"primary_metric": 0.95}, timed_out=False, ) assert r.returncode == 0 assert r.metrics["primary_metric"] == 0.95 assert r.timed_out is False # ── DockerSandbox command building ───────────────────────────────────── def test_build_run_command_network_none(tmp_path: Path): """network_policy='none' → --network none, --user UID:GID.""" cfg = DockerSandboxConfig(network_policy="none") sandbox = DockerSandbox(cfg, tmp_path / "work") cmd = sandbox._build_run_command( tmp_path / "staging", entry_point="main.py", container_name="rc-test-1", ) assert "docker" in cmd assert "--gpus" in cmd assert "--network" in cmd assert "none" in cmd assert "--memory=8192m" in cmd assert "--shm-size=2048m" in cmd assert cmd[-1] == "main.py" # Should contain --user (non-root) assert "--user" in cmd def test_build_run_command_setup_only(tmp_path: Path): """Default network_policy='setup_only' → RC_SETUP_ONLY_NETWORK=1, --cap-add.""" cfg = DockerSandboxConfig() # default is setup_only sandbox = DockerSandbox(cfg, tmp_path / "work") cmd = sandbox._build_run_command( tmp_path / "staging", entry_point="main.py", container_name="rc-test-setup", ) # Should set env var for setup-only network assert "-e" in cmd env_idx = [i for i, x in enumerate(cmd) if x == "-e"] env_values = [cmd[i + 1] for i in env_idx] assert "RC_SETUP_ONLY_NETWORK=1" in env_values # Should add NET_ADMIN capability assert "--cap-add=NET_ADMIN" in cmd # Should NOT have --network none (needs network for setup) network_indices = [i for i, x in enumerate(cmd) if x == "--network"] assert len(network_indices) == 0 # Should have --user (runs as host user so experiment can write 
results.json) assert "--user" in cmd def test_build_run_command_full_network(tmp_path: Path): """network_policy='full' → no --network none, has --user.""" cfg = DockerSandboxConfig(network_policy="full") sandbox = DockerSandbox(cfg, tmp_path / "work") cmd = sandbox._build_run_command( tmp_path / "staging", entry_point="main.py", container_name="rc-test-full", ) # No --network none network_indices = [i for i, x in enumerate(cmd) if x == "--network"] assert len(network_indices) == 0 # Should have --user (non-root) assert "--user" in cmd def test_build_run_command_no_gpu(tmp_path: Path): cfg = DockerSandboxConfig(gpu_enabled=False, network_policy="none") sandbox = DockerSandbox(cfg, tmp_path / "work") cmd = sandbox._build_run_command( tmp_path / "staging", entry_point="main.py", container_name="rc-test-2", ) assert "--gpus" not in cmd def test_build_run_command_specific_gpus(tmp_path: Path): cfg = DockerSandboxConfig(gpu_device_ids=(0, 2), network_policy="none") sandbox = DockerSandbox(cfg, tmp_path / "work") cmd = sandbox._build_run_command( tmp_path / "staging", entry_point="main.py", container_name="rc-test-3", ) assert "--gpus" in cmd gpu_idx = cmd.index("--gpus") assert "0,2" in cmd[gpu_idx + 1] # ── Harness injection ───────────────────────────────────────────────── def test_harness_injection(tmp_path: Path): harness_src = Path(__file__).parent.parent / "researchclaw" / "experiment" / "harness_template.py" if not harness_src.exists(): pytest.skip("harness_template.py not found") target = tmp_path / "project" target.mkdir() DockerSandbox._inject_harness(target) assert (target / "experiment_harness.py").exists() # ── Factory ──────────────────────────────────────────────────────────── def test_factory_returns_experiment_sandbox(tmp_path: Path): from researchclaw.experiment.sandbox import ExperimentSandbox config = ExperimentConfig(mode="sandbox") sandbox = create_sandbox(config, tmp_path / "work") assert isinstance(sandbox, ExperimentSandbox) 
@patch("researchclaw.experiment.docker_sandbox.DockerSandbox.ensure_image", return_value=True)
@patch("researchclaw.experiment.docker_sandbox.DockerSandbox.check_docker_available", return_value=True)
def test_factory_returns_docker_sandbox(mock_avail, mock_image, tmp_path: Path):
    """mode='docker' with docker + image available yields DockerSandbox."""
    config = ExperimentConfig(mode="docker")
    sandbox = create_sandbox(config, tmp_path / "work")
    assert isinstance(sandbox, DockerSandbox)


@patch("researchclaw.experiment.docker_sandbox.DockerSandbox.check_docker_available", return_value=False)
def test_factory_falls_back_when_docker_unavailable(mock_avail, tmp_path: Path):
    """mode='docker' without a docker daemon falls back gracefully."""
    config = ExperimentConfig(mode="docker")
    sandbox = create_sandbox(config, tmp_path / "work")
    # BUG-002: Should fall back to subprocess sandbox instead of raising
    from researchclaw.experiment.sandbox import ExperimentSandbox

    assert isinstance(sandbox, ExperimentSandbox)


@patch("researchclaw.experiment.docker_sandbox.DockerSandbox.ensure_image", return_value=False)
@patch("researchclaw.experiment.docker_sandbox.DockerSandbox.check_docker_available", return_value=True)
def test_factory_raises_when_image_missing(mock_avail, mock_image, tmp_path: Path):
    """A missing local image is a hard error, not a silent fallback."""
    config = ExperimentConfig(mode="docker")
    with pytest.raises(RuntimeError, match="not found locally"):
        create_sandbox(config, tmp_path / "work")


# ── run() with mocked subprocess ──────────────────────────────────────
@patch("subprocess.run")
def test_docker_run_success(mock_run, tmp_path: Path):
    """Successful container run parses metrics out of stdout."""
    mock_run.return_value = subprocess.CompletedProcess(
        args=["docker", "run"],
        returncode=0,
        stdout="primary_metric: 0.85\n",
        stderr="",
    )
    cfg = DockerSandboxConfig(network_policy="none")
    sandbox = DockerSandbox(cfg, tmp_path / "work")
    result = sandbox.run("print('hello')", timeout_sec=60)
    assert result.returncode == 0
    assert result.metrics.get("primary_metric") == 0.85
    assert result.timed_out is False


@patch("subprocess.run")
def test_docker_run_timeout(mock_run, tmp_path: Path):
    """TimeoutExpired from subprocess maps to timed_out=True, returncode=-1."""
    mock_run.side_effect = subprocess.TimeoutExpired(cmd="docker run", timeout=10)
    cfg = DockerSandboxConfig(network_policy="none")
    sandbox = DockerSandbox(cfg, tmp_path / "work")
    result = sandbox.run("import time; time.sleep(999)", timeout_sec=10)
    assert result.timed_out is True
    assert result.returncode == -1


# ── Dep detection ─────────────────────────────────────────────────────
def test_detect_pip_packages(tmp_path: Path):
    """Packages baked into the image are not re-detected."""
    (tmp_path / "main.py").write_text(
        "import torchdiffeq\nimport numpy\nfrom PIL import Image\n"
    )
    detected = DockerSandbox._detect_pip_packages(tmp_path)
    # torchdiffeq and PIL/Pillow are now in builtin → skipped
    # numpy should be skipped (builtin)
    assert "numpy" not in detected
    assert "torchdiffeq" not in detected


def test_detect_pip_packages_finds_unknown(tmp_path: Path):
    """Unknown packages should be detected."""
    (tmp_path / "main.py").write_text(
        "import some_new_package\nimport numpy\n"
    )
    detected = DockerSandbox._detect_pip_packages(tmp_path)
    assert "some_new_package" in detected
    assert "numpy" not in detected


def test_detect_pip_packages_skips_setup_py(tmp_path: Path):
    """setup.py should not be scanned for experiment deps."""
    (tmp_path / "setup.py").write_text("import some_setup_dep\n")
    (tmp_path / "main.py").write_text("import numpy\n")
    detected = DockerSandbox._detect_pip_packages(tmp_path)
    assert "some_setup_dep" not in detected


def test_detect_pip_packages_maps_imports(tmp_path: Path):
    """Known import-to-pip mappings should be applied."""
    (tmp_path / "main.py").write_text(
        "import cv2\nimport wandb\n"
    )
    detected = DockerSandbox._detect_pip_packages(tmp_path)
    assert "opencv-python" in detected
    assert "wandb" in detected


def test_next_container_name_is_thread_safe():
    """Concurrent callers must never receive a duplicate container name."""
    names: list[str] = []
    lock = threading.Lock()

    def worker() -> None:
        for _ in range(20):
            name = _next_container_name()
            with lock:
                names.append(name)

    threads = [threading.Thread(target=worker) for _ in range(5)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    # 5 threads × 20 names each, all unique.
    assert len(names) == 100
    assert len(names) == len(set(names))


# ── requirements.txt generation ───────────────────────────────────────
def test_write_requirements_txt_from_auto_detect(tmp_path: Path):
    """Auto-detected packages should be written to requirements.txt."""
    staging = tmp_path / "staging"
    staging.mkdir()
    (staging / "main.py").write_text("import wandb\nimport optuna\n")
    cfg = DockerSandboxConfig(auto_install_deps=True)
    sandbox = DockerSandbox(cfg, tmp_path / "work")
    sandbox._write_requirements_txt(staging)
    req_path = staging / "requirements.txt"
    assert req_path.exists()
    content = req_path.read_text()
    assert "wandb" in content
    assert "optuna" in content


def test_write_requirements_txt_with_pip_pre_install(tmp_path: Path):
    """pip_pre_install packages should be added to requirements.txt."""
    staging = tmp_path / "staging"
    staging.mkdir()
    (staging / "main.py").write_text("import numpy\n")
    cfg = DockerSandboxConfig(pip_pre_install=("einops==0.8.0", "kornia"))
    sandbox = DockerSandbox(cfg, tmp_path / "work")
    sandbox._write_requirements_txt(staging)
    req_path = staging / "requirements.txt"
    assert req_path.exists()
    content = req_path.read_text()
    assert "einops==0.8.0" in content
    assert "kornia" in content


def test_write_requirements_txt_respects_existing(tmp_path: Path):
    """If LLM already generated requirements.txt, append only new packages."""
    staging = tmp_path / "staging"
    staging.mkdir()
    (staging / "main.py").write_text("import numpy\n")
    (staging / "requirements.txt").write_text("wandb\n")
    cfg = DockerSandboxConfig(pip_pre_install=("wandb", "einops"))
    sandbox = DockerSandbox(cfg, tmp_path / "work")
    sandbox._write_requirements_txt(staging)
    content = (staging / "requirements.txt").read_text()
    # wandb already in existing file, should not be duplicated
    assert content.count("wandb") == 1
    # einops should be appended
    assert "einops" in content


def test_write_requirements_txt_no_packages(tmp_path: Path):
    """No requirements.txt if no packages needed."""
    staging = tmp_path / "staging"
    staging.mkdir()
    (staging / "main.py").write_text("import numpy\n")
    cfg = DockerSandboxConfig()
    sandbox = DockerSandbox(cfg, tmp_path / "work")
    sandbox._write_requirements_txt(staging)
    assert not (staging / "requirements.txt").exists()


# ── Static checks (mocked) ────────────────────────────────────────────
@patch("subprocess.run")
def test_check_docker_available_true(mock_run):
    mock_run.return_value = subprocess.CompletedProcess(args=[], returncode=0)
    assert DockerSandbox.check_docker_available() is True


@patch("subprocess.run")
def test_check_docker_available_false(mock_run):
    mock_run.return_value = subprocess.CompletedProcess(args=[], returncode=1)
    assert DockerSandbox.check_docker_available() is False


@patch("subprocess.run", side_effect=FileNotFoundError)
def test_check_docker_available_no_binary(mock_run):
    # No docker binary on PATH → FileNotFoundError → treated as unavailable.
    assert DockerSandbox.check_docker_available() is False


@patch("subprocess.run")
def test_ensure_image_true(mock_run):
    mock_run.return_value = subprocess.CompletedProcess(args=[], returncode=0)
    assert DockerSandbox.ensure_image("researchclaw/experiment:latest") is True


@patch("subprocess.run")
def test_ensure_image_false(mock_run):
    mock_run.return_value = subprocess.CompletedProcess(args=[], returncode=1)
    assert DockerSandbox.ensure_image("nonexistent:latest") is False


# ── Default config values ─────────────────────────────────────────────
def test_default_network_policy_is_setup_only():
    """Default network_policy should be 'setup_only', not 'none'."""
    cfg = DockerSandboxConfig()
    assert cfg.network_policy == "setup_only"


def test_default_auto_install_deps_enabled():
    cfg = DockerSandboxConfig()
    assert cfg.auto_install_deps is True


# ── Entry point path traversal validation ─────────────────────────────
@patch("researchclaw.experiment.docker_sandbox.subprocess.run")
def test_run_project_rejects_path_traversal(mock_run: MagicMock, tmp_path: Path):
    """run_project() must reject entry_point with '..' components."""
    project = tmp_path / "proj"
    project.mkdir()
    (project / "main.py").write_text("print('hi')")
    cfg = DockerSandboxConfig()
    work = tmp_path / "work"
    sandbox = DockerSandbox(cfg, work)
    # Create escape target so .exists() alone wouldn't catch it
    work.mkdir(parents=True, exist_ok=True)
    (work / "escape.py").write_text("print('escaped!')")
    result = sandbox.run_project(project, entry_point="../escape.py")
    assert result.returncode == -1
    assert ".." in result.stderr
    mock_run.assert_not_called()


@patch("researchclaw.experiment.docker_sandbox.subprocess.run")
def test_run_project_rejects_absolute_path(mock_run: MagicMock, tmp_path: Path):
    """run_project() must reject absolute entry_point paths."""
    project = tmp_path / "proj"
    project.mkdir()
    (project / "main.py").write_text("print('hi')")
    cfg = DockerSandboxConfig()
    sandbox = DockerSandbox(cfg, tmp_path / "work")
    result = sandbox.run_project(project, entry_point="/etc/passwd")
    assert result.returncode == -1
    assert "relative" in result.stderr.lower() or "absolute" in result.stderr.lower()
    mock_run.assert_not_called()


# ── Container cleanup behavior ─────────────────────────────────────────
@patch.object(DockerSandbox, "_remove_container")
@patch("subprocess.run")
def test_cleanup_on_normal_exit(mock_run: MagicMock, mock_remove: MagicMock, tmp_path: Path):
    """_remove_container is called on normal successful exit."""
    mock_run.return_value = subprocess.CompletedProcess(
        args=["docker", "run"],
        returncode=0,
        stdout="metric: 1.0\n",
        stderr="",
    )
    cfg = DockerSandboxConfig(network_policy="none")
    sandbox = DockerSandbox(cfg, tmp_path / "work")
    result = sandbox.run("print('ok')", timeout_sec=60)
    assert result.returncode == 0
    mock_remove.assert_called_once()


@patch.object(DockerSandbox, "_remove_container")
@patch.object(DockerSandbox, "_kill_container")
@patch("subprocess.run")
def test_cleanup_on_timeout(
    mock_run: MagicMock,
    mock_kill: MagicMock,
    mock_remove: MagicMock,
    tmp_path: Path,
):
    """Both _kill_container and _remove_container are called on timeout."""
    mock_run.side_effect = subprocess.TimeoutExpired(cmd="docker run", timeout=10)
    cfg = DockerSandboxConfig(network_policy="none")
    sandbox = DockerSandbox(cfg, tmp_path / "work")
    result = sandbox.run("import time; time.sleep(999)", timeout_sec=10)
    assert result.timed_out is True
    mock_kill.assert_called_once()
    mock_remove.assert_called_once()


@patch.object(DockerSandbox, "_remove_container")
@patch("subprocess.run")
def test_cleanup_on_exception(mock_run: MagicMock, mock_remove: MagicMock, tmp_path: Path):
    """_remove_container is called even when subprocess.run raises an unexpected exception."""
    mock_run.side_effect = OSError("Docker daemon not responding")
    cfg = DockerSandboxConfig(network_policy="none")
    sandbox = DockerSandbox(cfg, tmp_path / "work")
    result = sandbox.run("print('hi')", timeout_sec=60)
    assert result.returncode == -1
    assert "Docker execution error" in result.stderr
    mock_remove.assert_called_once()


@patch.object(DockerSandbox, "_remove_container")
@patch.object(DockerSandbox, "_kill_container")
@patch("subprocess.run")
def test_keep_containers_skips_removal(
    mock_run: MagicMock,
    mock_kill: MagicMock,
    mock_remove: MagicMock,
    tmp_path: Path,
):
    """When keep_containers=True, _remove_container is never called."""
    mock_run.return_value = subprocess.CompletedProcess(
        args=["docker", "run"],
        returncode=0,
        stdout="",
        stderr="",
    )
    cfg = DockerSandboxConfig(network_policy="none", keep_containers=True)
    sandbox = DockerSandbox(cfg, tmp_path / "work")
    sandbox.run("print('ok')", timeout_sec=60)
    mock_remove.assert_not_called()


================================================
FILE: tests/test_rc_e2e_regression.py
================================================
# Regression tests for resilience behavior: rate-limit retry, degradation to
# cache, LLM model fallback, and skip-noncritical pipeline semantics.
# pyright: reportMissingImports=false, reportUnknownParameterType=false, reportMissingParameterType=false, reportUnknownVariableType=false, reportUnknownMemberType=false, reportUnknownArgumentType=false, reportPrivateUsage=false, reportUnknownLambdaType=false
from __future__ import annotations

import json
import urllib.error
from email.message import Message
from pathlib import Path
from unittest.mock import patch

import pytest


class _DummyResponse:
    """Minimal stand-in for the urlopen context-manager/response object."""

    def __init__(self, payload: bytes) -> None:
        self._payload: bytes = payload  # raw bytes returned by read()

    def read(self) -> bytes:
        return self._payload

    def __enter__(self) -> _DummyResponse:
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        _ = exc_type, exc, tb
        return None


class TestRateLimitRetry:
    def test_s2_429_retries_and_succeeds(self) -> None:
        """First 429 is retried; second call returns a parsed paper."""
        from researchclaw.literature.semantic_scholar import (
            _reset_circuit_breaker,
            search_semantic_scholar,
        )

        _reset_circuit_breaker()  # ensure clean CB state from prior tests
        call_count = 0

        def mock_urlopen(req, **kwargs):
            _ = kwargs
            nonlocal call_count
            call_count += 1
            if call_count == 1:
                raise urllib.error.HTTPError(
                    req.full_url if hasattr(req, "full_url") else str(req),
                    429,
                    "Too Many Requests",
                    Message(),
                    None,
                )
            payload = json.dumps(
                {
                    "data": [
                        {
                            "paperId": "abc123",
                            "title": "Test Paper",
                            "authors": [{"name": "Smith"}],
                            "year": 2024,
                            "abstract": "test abstract",
                            "venue": "NeurIPS",
                            "citationCount": 10,
                            "externalIds": {"DOI": "10.1234/test"},
                            "url": "https://example.com",
                        }
                    ]
                }
            ).encode("utf-8")
            return _DummyResponse(payload)

        with patch("urllib.request.urlopen", side_effect=mock_urlopen):
            with patch("time.sleep"):
                papers = search_semantic_scholar("test query", limit=5)
        assert call_count >= 2
        assert len(papers) == 1

    def test_s2_persistent_429_exhausts_retries_and_returns_empty(self) -> None:
        """Endless 429s exhaust _MAX_RETRIES and yield an empty result."""
        from researchclaw.literature.semantic_scholar import (
            _MAX_RETRIES,
            _reset_circuit_breaker,
            search_semantic_scholar,
        )

        _reset_circuit_breaker()  # ensure clean CB state from prior tests
        call_count = 0

        def mock_urlopen(req, **kwargs):
            _ = kwargs
            nonlocal call_count
            call_count += 1
            raise urllib.error.HTTPError(
                req.full_url if hasattr(req, "full_url") else str(req),
                429,
                "Too Many Requests",
                Message(),
                None,
            )

        with patch("urllib.request.urlopen", side_effect=mock_urlopen):
            with patch("time.sleep"):
                papers = search_semantic_scholar("test query", limit=5)
        assert papers == []
        assert call_count == _MAX_RETRIES


class TestDegradationChain:
    def test_search_degrades_to_cache_on_api_failure(self, tmp_path: Path) -> None:
        """When all live APIs fail, cached results are served instead."""
        from researchclaw.literature.cache import put_cache
        from researchclaw.literature.search import search_papers

        cached = [
            {
                "paper_id": "cached-1",
                "title": "Cached Paper",
                "authors": [],
                "year": 2024,
                "abstract": "cached",
                "venue": "",
                "citation_count": 5,
                "doi": "",
                "arxiv_id": "",
                "url": "",
                "source": "semantic_scholar",
            }
        ]
        put_cache(
            "test degradation", "semantic_scholar", 20, cached, cache_base=tmp_path
        )
        with patch(
            "researchclaw.literature.search.search_semantic_scholar",
            side_effect=RuntimeError("API down"),
        ):
            with patch(
                "researchclaw.literature.search.search_arxiv",
                side_effect=RuntimeError("API down"),
            ):
                with patch(
                    "researchclaw.literature.cache._DEFAULT_CACHE_DIR", tmp_path
                ):
                    papers = search_papers("test degradation", limit=20)
        assert len(papers) >= 1
        assert any(p.title == "Cached Paper" for p in papers)

    def test_search_empty_on_total_failure(self, tmp_path: Path) -> None:
        """All APIs down and no cache → empty list, no exception."""
        from researchclaw.literature.search import search_papers

        with patch(
            "researchclaw.literature.search.search_openalex",
            side_effect=RuntimeError("API down"),
        ):
            with patch(
                "researchclaw.literature.search.search_semantic_scholar",
                side_effect=RuntimeError("API down"),
            ):
                with patch(
                    "researchclaw.literature.search.search_arxiv",
                    side_effect=RuntimeError("API down"),
                ):
                    with patch(
                        "researchclaw.literature.cache._DEFAULT_CACHE_DIR",
                        tmp_path / "empty-cache",
                    ):
                        papers = search_papers("no results query", limit=20)
        assert papers == []


class TestLLMFallback:
    def test_primary_403_forbidden_fallback_succeeds(self) -> None:
        """A 403 on the primary model routes the request to the fallback model."""
        from researchclaw.llm.client import LLMClient, LLMConfig, LLMResponse

        client = LLMClient(
            LLMConfig(
                base_url="https://api.example.com/v1",
                api_key="test-key",
                primary_model="gpt-blocked",
                fallback_models=["gpt-fallback"],
                max_retries=1,
            )
        )
        call_models: list[str] = []

        def mock_raw_call(model, messages, max_tokens, temperature, json_mode):
            _ = messages, max_tokens, temperature, json_mode
            call_models.append(model)
            if model == "gpt-blocked":
                raise urllib.error.HTTPError(
                    "url", 403, "not allowed to use model", Message(), None
                )
            return LLMResponse(content="ok", model=model)

        with patch.object(client, "_raw_call", side_effect=mock_raw_call):
            resp = client.chat([{"role": "user", "content": "test"}])
        assert resp.content == "ok"
        assert "gpt-blocked" in call_models
        assert "gpt-fallback" in call_models

    def test_preflight_detects_401(self) -> None:
        """preflight() reports an invalid API key on a 401 response."""
        from researchclaw.llm.client import LLMClient, LLMConfig

        client = LLMClient(
            LLMConfig(
                base_url="https://api.example.com/v1",
                api_key="bad-key",
                primary_model="gpt-test",
                fallback_models=[],
                max_retries=1,
            )
        )
        if not hasattr(client, "preflight"):
            pytest.skip("preflight() not yet implemented")
        err = urllib.error.HTTPError("url", 401, "Unauthorized", Message(), None)
        with patch.object(client, "chat", side_effect=err):
            ok, msg = client.preflight()
        assert ok is False
        assert "Invalid API key" in msg


class TestNoncriticalStageSkip:
    @staticmethod
    def _make_rc_config(tmp_path: Path):
        # Minimal valid config; check_paths=False so no dirs need to exist.
        from researchclaw.config import RCConfig

        data = {
            "project": {"name": "rc-e2e-regression", "mode": "docs-first"},
            "research": {"topic": "pipeline regression"},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local"},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {},
            "llm": {
                "provider": "openai-compatible",
                "base_url": "http://localhost:1234/v1",
                "api_key_env": "RC_TEST_KEY",
                "api_key": "inline",
            },
        }
        return RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)

    def test_noncritical_stage_failure_is_skipped(self, tmp_path: Path) -> None:
        """A failing noncritical stage is recorded but does not abort the run."""
        from researchclaw.adapters import AdapterBundle
        from researchclaw.pipeline import runner as rc_runner
        from researchclaw.pipeline.executor import StageResult
        from researchclaw.pipeline.stages import STAGE_SEQUENCE, Stage, StageStatus

        run_dir = tmp_path / "run"
        run_dir.mkdir()
        config = self._make_rc_config(tmp_path)
        adapters = AdapterBundle()

        def mock_execute_stage(stage: Stage, **kwargs) -> StageResult:
            _ = kwargs
            if stage is Stage.KNOWLEDGE_ARCHIVE:
                return StageResult(
                    stage=stage,
                    status=StageStatus.FAILED,
                    artifacts=(),
                    error="archive error",
                )
            return StageResult(
                stage=stage, status=StageStatus.DONE, artifacts=("ok.md",)
            )

        with patch.object(rc_runner, "execute_stage", side_effect=mock_execute_stage):
            results = rc_runner.execute_pipeline(
                run_dir=run_dir,
                run_id="run-skip-noncritical",
                config=config,
                adapters=adapters,
                skip_noncritical=True,
            )
        assert len(results) == len(STAGE_SEQUENCE)
        assert results[-1].stage is Stage.CITATION_VERIFY
        assert any(
            r.stage is Stage.KNOWLEDGE_ARCHIVE and r.status is StageStatus.FAILED
            for r in results
        )

    def test_critical_stage_failure_still_aborts(self, tmp_path: Path) -> None:
        """A failing critical stage aborts even with skip_noncritical=True."""
        from researchclaw.adapters import AdapterBundle
        from researchclaw.pipeline import runner as rc_runner
        from researchclaw.pipeline.executor import StageResult
        from researchclaw.pipeline.stages import Stage, StageStatus

        run_dir = tmp_path / "run-critical"
        run_dir.mkdir()
        config = self._make_rc_config(tmp_path)
        adapters = AdapterBundle()

        def mock_execute_stage(stage: Stage, **kwargs) -> StageResult:
            _ = kwargs
            if stage is Stage.PAPER_DRAFT:
                return StageResult(
                    stage=stage,
                    status=StageStatus.FAILED,
                    artifacts=(),
                    error="draft error",
                )
            return StageResult(
                stage=stage, status=StageStatus.DONE, artifacts=("ok.md",)
            )

        with patch.object(rc_runner, "execute_stage", side_effect=mock_execute_stage):
            results = rc_runner.execute_pipeline(
                run_dir=run_dir,
                run_id="run-fail-critical",
                config=config,
                adapters=adapters,
                skip_noncritical=True,
            )
        assert results[-1].stage is Stage.PAPER_DRAFT
        assert results[-1].status is StageStatus.FAILED


================================================
FILE:
tests/test_rc_evolution.py ================================================ # pyright: reportPrivateUsage=false """Tests for the evolution (self-learning) system.""" from __future__ import annotations import json from datetime import datetime, timezone, timedelta from pathlib import Path import pytest from researchclaw.evolution import ( EvolutionStore, LessonCategory, LessonEntry, extract_lessons, _classify_error, _time_weight, ) # ── LessonEntry tests ── class TestLessonEntry: def test_to_dict_and_from_dict_roundtrip(self) -> None: entry = LessonEntry( stage_name="hypothesis_gen", stage_num=8, category="experiment", severity="error", description="Code validation failed", timestamp="2026-03-10T12:00:00+00:00", run_id="run-1", ) data = entry.to_dict() restored = LessonEntry.from_dict(data) assert restored.stage_name == "hypothesis_gen" assert restored.stage_num == 8 assert restored.category == "experiment" assert restored.severity == "error" def test_from_dict_handles_missing_fields(self) -> None: entry = LessonEntry.from_dict({}) assert entry.stage_name == "" assert entry.stage_num == 0 assert entry.category == "pipeline" # ── Classification tests ── class TestClassifyError: def test_timeout_classified_as_system(self) -> None: assert _classify_error("experiment_run", "Connection timeout after 30s") == "system" def test_validation_classified_as_experiment(self) -> None: assert _classify_error("code_generation", "Syntax error in code") == "experiment" def test_citation_classified_as_literature(self) -> None: assert _classify_error("citation_verify", "Hallucinated reference") == "literature" def test_paper_classified_as_writing(self) -> None: assert _classify_error("paper_draft", "Draft quality too low") == "writing" def test_unknown_defaults_to_pipeline(self) -> None: assert _classify_error("unknown_stage", "something random") == "pipeline" # ── Time weight tests ── class TestTimeWeight: def test_recent_lesson_has_high_weight(self) -> None: now = 
datetime.now(timezone.utc).isoformat(timespec="seconds") assert _time_weight(now) > 0.9 def test_30_day_old_has_half_weight(self) -> None: ts = (datetime.now(timezone.utc) - timedelta(days=30)).isoformat(timespec="seconds") weight = _time_weight(ts) assert 0.4 < weight < 0.6 # Should be ~0.5 def test_90_day_old_returns_zero(self) -> None: ts = (datetime.now(timezone.utc) - timedelta(days=91)).isoformat(timespec="seconds") assert _time_weight(ts) == 0.0 def test_invalid_timestamp_returns_zero(self) -> None: assert _time_weight("not-a-date") == 0.0 def test_empty_timestamp_returns_zero(self) -> None: assert _time_weight("") == 0.0 # ── Extract lessons tests ── class TestExtractLessons: def _make_result(self, stage_num, status, error=None, decision="proceed"): from types import SimpleNamespace from researchclaw.pipeline.stages import Stage, StageStatus stage = Stage(stage_num) return SimpleNamespace( stage=stage, status=StageStatus(status), error=error, decision=decision, ) def test_extracts_lesson_from_failed_stage(self) -> None: results = [self._make_result(4, "failed", error="API rate limited")] lessons = extract_lessons(results, run_id="test-run") assert len(lessons) == 1 assert lessons[0].severity == "error" assert "rate limited" in lessons[0].description def test_extracts_lesson_from_blocked_stage(self) -> None: results = [self._make_result(5, "blocked_approval")] lessons = extract_lessons(results, run_id="test-run") assert len(lessons) == 1 assert lessons[0].severity == "warning" assert "blocked" in lessons[0].description def test_extracts_lesson_from_pivot_decision(self) -> None: results = [self._make_result(15, "done", decision="pivot")] lessons = extract_lessons(results, run_id="test-run") assert len(lessons) == 1 assert "PIVOT" in lessons[0].description def test_no_lessons_from_successful_proceed(self) -> None: results = [self._make_result(1, "done", decision="proceed")] lessons = extract_lessons(results) assert len(lessons) == 0 def 
test_multiple_results_multiple_lessons(self) -> None: results = [ self._make_result(4, "failed", error="timeout"), self._make_result(5, "blocked_approval"), self._make_result(15, "done", decision="refine"), ] lessons = extract_lessons(results) assert len(lessons) == 3 def test_extracts_decision_rationale(self, tmp_path: Path) -> None: run_dir = tmp_path / "run" stage_dir = run_dir / "stage-15" stage_dir.mkdir(parents=True) (stage_dir / "decision_structured.json").write_text( json.dumps({"decision": "pivot", "rationale": "NaN in metrics"}), encoding="utf-8", ) results = [self._make_result(15, "done", decision="pivot")] lessons = extract_lessons(results, run_id="test", run_dir=run_dir) assert any("NaN in metrics" in l.description for l in lessons) def test_extracts_rationale_from_raw_text_excerpt(self, tmp_path: Path) -> None: run_dir = tmp_path / "run" stage_dir = run_dir / "stage-15" stage_dir.mkdir(parents=True) (stage_dir / "decision_structured.json").write_text( json.dumps({ "decision": "refine", "raw_text_excerpt": ( "## Decision\n**REFINE**\n\n" "## Justification\n" "The analysis provides promising evidence but lacks statistical rigor." 
), "generated": "2026-03-11T05:15:43+00:00", }), encoding="utf-8", ) results = [self._make_result(15, "done", decision="refine")] lessons = extract_lessons(results, run_id="test", run_dir=run_dir) assert any("statistical rigor" in l.description for l in lessons) def test_extracts_stderr_runtime_lesson(self, tmp_path: Path) -> None: run_dir = tmp_path / "run" runs_dir = run_dir / "stage-12" / "runs" runs_dir.mkdir(parents=True) (runs_dir / "run-1.json").write_text( json.dumps({ "metrics": {"loss": 0.5}, "stderr": "RuntimeWarning: invalid value encountered in divide", }), encoding="utf-8", ) results = [self._make_result(12, "done")] lessons = extract_lessons(results, run_dir=run_dir) assert any("RuntimeWarning" in l.description for l in lessons) def test_extracts_nan_metric_lesson(self, tmp_path: Path) -> None: run_dir = tmp_path / "run" runs_dir = run_dir / "stage-12" / "runs" runs_dir.mkdir(parents=True) (runs_dir / "run-1.json").write_text( json.dumps({"metrics": {"accuracy": "nan"}}), encoding="utf-8", ) results = [self._make_result(12, "done")] lessons = extract_lessons(results, run_dir=run_dir) assert any("accuracy" in l.description and "nan" in l.description.lower() for l in lessons) def test_no_runtime_lessons_without_run_dir(self) -> None: results = [self._make_result(12, "done")] lessons = extract_lessons(results) assert len(lessons) == 0 # ── EvolutionStore tests ── class TestEvolutionStore: def test_append_and_load(self, tmp_path: Path) -> None: store = EvolutionStore(tmp_path / "evo") lesson = LessonEntry( stage_name="hypothesis_gen", stage_num=8, category="pipeline", severity="warning", description="PIVOT triggered", timestamp=datetime.now(timezone.utc).isoformat(timespec="seconds"), ) store.append(lesson) loaded = store.load_all() assert len(loaded) == 1 assert loaded[0].stage_name == "hypothesis_gen" def test_append_many(self, tmp_path: Path) -> None: store = EvolutionStore(tmp_path / "evo") lessons = [ LessonEntry("s1", 1, "system", "error", "err1", 
datetime.now(timezone.utc).isoformat()), LessonEntry("s2", 2, "pipeline", "info", "info1", datetime.now(timezone.utc).isoformat()), ] store.append_many(lessons) assert store.count() == 2 def test_append_many_empty_is_noop(self, tmp_path: Path) -> None: store = EvolutionStore(tmp_path / "evo") store.append_many([]) assert store.count() == 0 def test_load_all_empty_store(self, tmp_path: Path) -> None: store = EvolutionStore(tmp_path / "evo") assert store.load_all() == [] def test_query_for_stage_returns_relevant_lessons(self, tmp_path: Path) -> None: store = EvolutionStore(tmp_path / "evo") now = datetime.now(timezone.utc).isoformat(timespec="seconds") store.append(LessonEntry("hypothesis_gen", 8, "pipeline", "error", "Failed hypothesis", now)) store.append(LessonEntry("paper_draft", 17, "writing", "warning", "Draft too short", now)) result = store.query_for_stage("hypothesis_gen", max_lessons=5) # hypothesis_gen lesson should be boosted assert len(result) >= 1 assert result[0].stage_name == "hypothesis_gen" def test_query_respects_max_lessons(self, tmp_path: Path) -> None: store = EvolutionStore(tmp_path / "evo") now = datetime.now(timezone.utc).isoformat(timespec="seconds") for i in range(10): store.append(LessonEntry("stage_1", 1, "system", "error", f"Error {i}", now)) result = store.query_for_stage("stage_1", max_lessons=3) assert len(result) == 3 def test_build_overlay_returns_empty_for_no_lessons(self, tmp_path: Path) -> None: store = EvolutionStore(tmp_path / "evo") assert store.build_overlay("hypothesis_gen") == "" def test_build_overlay_returns_formatted_text(self, tmp_path: Path) -> None: store = EvolutionStore(tmp_path / "evo") now = datetime.now(timezone.utc).isoformat(timespec="seconds") store.append(LessonEntry("hypothesis_gen", 8, "experiment", "error", "Code syntax error in experiment", now)) overlay = store.build_overlay("hypothesis_gen") assert "Lessons from Prior Runs" in overlay assert "Code syntax error" in overlay assert "❌" in overlay def 
test_old_lessons_filtered_by_time_weight(self, tmp_path: Path) -> None: store = EvolutionStore(tmp_path / "evo") old_ts = (datetime.now(timezone.utc) - timedelta(days=100)).isoformat() store.append(LessonEntry("stage_1", 1, "system", "error", "Old error", old_ts)) result = store.query_for_stage("stage_1") assert len(result) == 0 # Filtered out due to age > 90 days def test_creates_directory_if_not_exists(self, tmp_path: Path) -> None: store_dir = tmp_path / "nested" / "evo" store = EvolutionStore(store_dir) assert store_dir.exists() # ── PromptManager evolution overlay integration ── class TestPromptManagerEvolutionOverlay: def test_overlay_appended_to_user_prompt(self) -> None: from researchclaw.prompts import PromptManager pm = PromptManager() overlay = "## Lessons\n1. Avoid timeout errors." sp = pm.for_stage( "topic_init", evolution_overlay=overlay, topic="test", domains="ml", project_name="p1", quality_threshold="8.0", ) assert "Avoid timeout errors" in sp.user def test_no_overlay_when_empty(self) -> None: from researchclaw.prompts import PromptManager pm = PromptManager() sp1 = pm.for_stage( "topic_init", topic="test", domains="ml", project_name="p1", quality_threshold="8.0", ) sp2 = pm.for_stage( "topic_init", evolution_overlay="", topic="test", domains="ml", project_name="p1", quality_threshold="8.0", ) assert sp1.user == sp2.user ================================================ FILE: tests/test_rc_executor.py ================================================ # pyright: reportPrivateUsage=false, reportUnknownParameterType=false, reportMissingParameterType=false, reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnusedCallResult=false, reportAttributeAccessIssue=false, reportUnknownLambdaType=false from __future__ import annotations import json import re import sys from pathlib import Path from types import SimpleNamespace from typing import Any, cast import pytest from researchclaw.adapters import 
from researchclaw.config import RCConfig
from researchclaw.pipeline import executor as rc_executor
from researchclaw.pipeline.stages import Stage, StageStatus


class FakeLLMClient:
    """Minimal stand-in for an LLM client: records calls, returns canned text."""

    def __init__(self, response_text: str = "mock response"):
        self.response_text: str = response_text
        self.calls: list[list[dict[str, str]]] = []

    def chat(self, messages: list[dict[str, str]], **kwargs: object):
        _ = kwargs
        self.calls.append(messages)
        from researchclaw.llm.client import LLMResponse

        return LLMResponse(content=self.response_text, model="fake-model")


class FakeLLMClientWithConfig(FakeLLMClient):
    """Fake client that also carries a `.config` namespace, like the real one."""

    def __init__(self, response_text: str = "mock response"):
        super().__init__(response_text=response_text)
        self.config: SimpleNamespace = SimpleNamespace(
            base_url="http://fake", api_key="fake-key"
        )


@pytest.fixture()
def rc_config(tmp_path: Path) -> RCConfig:
    # Fully-populated config pointing every path under tmp_path.
    data = {
        "project": {"name": "rc-test", "mode": "docs-first"},
        "research": {
            "topic": "test-driven science",
            "domains": ["ml", "systems"],
            "daily_paper_count": 2,
            "quality_threshold": 8.2,
        },
        "runtime": {"timezone": "UTC"},
        "notifications": {
            "channel": "local",
            "on_stage_start": True,
            "on_stage_fail": False,
            "on_gate_required": True,
        },
        "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
        "openclaw_bridge": {"use_memory": True, "use_message": True},
        "llm": {
            "provider": "openai-compatible",
            "base_url": "http://localhost:1234/v1",
            "api_key_env": "RC_TEST_KEY",
            "api_key": "inline-test-key",
            "primary_model": "fake-model",
            "fallback_models": [],
        },
        "security": {"hitl_required_stages": [5, 9, 20]},
        "experiment": {"mode": "sandbox"},
    }
    return RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)


@pytest.fixture()
def adapters() -> AdapterBundle:
    return AdapterBundle()


@pytest.fixture()
def run_dir(tmp_path: Path) -> Path:
    path = tmp_path / "run"
    path.mkdir()
    return path


def _write_prior_artifact(
    run_dir: Path, stage_num: int, filename: str, content: str
) -> None:
    # Drop a file under run_dir/stage-NN/ so later stages can find it as input.
    stage_dir = run_dir / f"stage-{stage_num:02d}"
    stage_dir.mkdir(parents=True, exist_ok=True)
    (stage_dir / filename).write_text(content, encoding="utf-8")


def test_executor_map_has_23_entries() -> None:
    executor_map = getattr(rc_executor, "EXECUTOR_MAP", rc_executor._STAGE_EXECUTORS)
    assert len(executor_map) == 23


def test_every_stage_member_has_matching_executor() -> None:
    executor_map = getattr(rc_executor, "EXECUTOR_MAP", rc_executor._STAGE_EXECUTORS)
    assert set(executor_map.keys()) == set(Stage)


def test_stage_result_dataclass_fields() -> None:
    result = rc_executor.StageResult(
        stage=Stage.TOPIC_INIT, status=StageStatus.DONE, artifacts=("goal.md",)
    )
    assert result.stage == Stage.TOPIC_INIT
    assert result.status == StageStatus.DONE
    assert result.artifacts == ("goal.md",)
    assert result.error is None
    assert result.decision == "proceed"
    assert result.evidence_refs == ()


def test_utcnow_iso_returns_valid_iso_timestamp() -> None:
    ts = rc_executor._utcnow_iso()
    assert ts.endswith("+00:00")
    assert "T" in ts


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("before\n```yaml\na: 1\n```\nafter", "a: 1"),
        ("```yml\nkey: value\n```", "key: value"),
        ("```\nplain: true\n```", "plain: true"),
        (" x: y ", "x: y"),
    ],
)
def test_extract_yaml_block_variants(text: str, expected: str) -> None:
    assert rc_executor._extract_yaml_block(text) == expected


@pytest.mark.parametrize(
    ("payload", "default", "expected"),
    [
        ('{"ok": true}', {"fallback": True}, {"ok": True}),
        ("[1, 2, 3]", {"fallback": True}, [1, 2, 3]),
        ("not-json", {"fallback": True}, {"fallback": True}),
    ],
)
def test_safe_json_loads_valid_and_invalid(payload: str, default, expected) -> None:
    assert rc_executor._safe_json_loads(payload, default) == expected


@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        ("a/b", "a_b"),
        ("a\\b", "a_b"),
        ("../secret", "__secret"),
        ("name with spaces!.md", "name_with_spaces_.md"),
        ("", "unnamed"),
    ],
)
def test_safe_filename_sanitization(raw: str, expected: str) -> None:
    assert rc_executor._safe_filename(raw) == expected


def test_safe_filename_truncates_to_100_chars() -> None:
    raw = "x" * 120
    cleaned = rc_executor._safe_filename(raw)
    assert len(cleaned) == 100
    assert cleaned == "x" * 100


def test_build_context_preamble_basic_fields(
    rc_config: RCConfig, run_dir: Path
) -> None:
    text = rc_executor._build_context_preamble(rc_config, run_dir)
    assert "## Research Context" in text
    assert "test-driven science" in text
    assert "ml, systems" in text


def test_build_context_preamble_includes_selected_prior_artifacts(
    rc_config: RCConfig, run_dir: Path
) -> None:
    _write_prior_artifact(run_dir, 1, "goal.md", "goal content")
    _write_prior_artifact(run_dir, 8, "hypotheses.md", "hyp content")
    _write_prior_artifact(run_dir, 7, "synthesis.md", "synth content")
    text = rc_executor._build_context_preamble(
        rc_config,
        run_dir,
        include_goal=True,
        include_hypotheses=True,
        include_synthesis=True,
    )
    assert "### Goal" in text
    assert "goal content" in text
    assert "### Hypotheses" in text
    assert "hyp content" in text
    assert "### Synthesis" in text
    assert "synth content" in text


def test_read_prior_artifact_finds_newest_file(run_dir: Path) -> None:
    _write_prior_artifact(run_dir, 1, "goal.md", "old")
    _write_prior_artifact(run_dir, 3, "goal.md", "new")
    found = rc_executor._read_prior_artifact(run_dir, "goal.md")
    assert found == "new"


def test_read_prior_artifact_finds_directory_path(run_dir: Path) -> None:
    cards_dir = run_dir / "stage-06" / "cards"
    cards_dir.mkdir(parents=True)
    (cards_dir / "card-1.json").write_text("{}", encoding="utf-8")
    found = rc_executor._read_prior_artifact(run_dir, "cards/")
    assert found == str(cards_dir)


def test_read_prior_artifact_returns_none_when_not_found(run_dir: Path) -> None:
    assert rc_executor._read_prior_artifact(run_dir, "missing.md") is None


def test_read_best_analysis_prefers_best_file(run_dir: Path) -> None:
    """BUG-225: _read_best_analysis prefers analysis_best.md at run root."""
    from researchclaw.pipeline._helpers import _read_best_analysis

    # Create degenerate analysis in stage-14 and best at run root
    s14 = run_dir / "stage-14"
    s14.mkdir(parents=True)
    (s14 / "analysis.md").write_text("Degenerate analysis", encoding="utf-8")
    (run_dir / "analysis_best.md").write_text("Best analysis", encoding="utf-8")
    result = _read_best_analysis(run_dir)
    assert result == "Best analysis"


def test_read_best_analysis_falls_back_to_prior_artifact(run_dir: Path) -> None:
    """BUG-225: Falls back to _read_prior_artifact when no analysis_best.md."""
    from researchclaw.pipeline._helpers import _read_best_analysis

    s14 = run_dir / "stage-14"
    s14.mkdir(parents=True)
    (s14 / "analysis.md").write_text("Only analysis", encoding="utf-8")
    result = _read_best_analysis(run_dir)
    assert result == "Only analysis"


def test_read_best_analysis_returns_empty_when_none(run_dir: Path) -> None:
    """BUG-225: Returns empty string when no analysis exists at all."""
    from researchclaw.pipeline._helpers import _read_best_analysis

    result = _read_best_analysis(run_dir)
    assert result == ""


def test_write_stage_meta_writes_expected_json(run_dir: Path) -> None:
    stage_dir = run_dir / "stage-01"
    stage_dir.mkdir()
    result = rc_executor.StageResult(
        stage=Stage.TOPIC_INIT,
        status=StageStatus.DONE,
        artifacts=("goal.md",),
        decision="proceed",
        evidence_refs=("stage-01/goal.md",),
    )
    rc_executor._write_stage_meta(stage_dir, Stage.TOPIC_INIT, "run-abc", result)
    payload = cast(
        dict[str, Any],
        json.loads((stage_dir / "decision.json").read_text(encoding="utf-8")),
    )
    assert payload["stage_id"] == "01-topic_init"
    assert payload["run_id"] == "run-abc"
    assert payload["status"] == "done"
    assert payload["decision"] == "proceed"
    assert payload["output_artifacts"] == ["goal.md"]
    assert payload["evidence_refs"] == ["stage-01/goal.md"]
    assert payload["next_stage"] == 2
    assert re.match(r"\d{4}-\d{2}-\d{2}T", payload["ts"])


def test_execute_stage_creates_stage_dir_writes_artifacts_and_meta(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    fake_llm = FakeLLMClientWithConfig("# Goal\n\nMocked goal body")
    monkeypatch.setattr(
        "researchclaw.pipeline.executor.LLMClient.from_rc_config",
        lambda _config: fake_llm,
    )
    result = rc_executor.execute_stage(
        Stage.TOPIC_INIT,
        run_dir=run_dir,
        run_id="run-1",
        config=rc_config,
        adapters=adapters,
        auto_approve_gates=True,
    )
    assert result.status == StageStatus.DONE
    assert "goal.md" in result.artifacts
    assert "hardware_profile.json" in result.artifacts
    assert (run_dir / "stage-01").is_dir()
    assert (
        (run_dir / "stage-01" / "goal.md")
        .read_text(encoding="utf-8")
        .startswith("# Goal")
    )
    assert (run_dir / "stage-01" / "hardware_profile.json").exists()
    assert len(fake_llm.calls) == 1
    decision = cast(
        dict[str, Any],
        json.loads(
            (run_dir / "stage-01" / "decision.json").read_text(encoding="utf-8")
        ),
    )
    assert decision["run_id"] == "run-1"
    assert decision["status"] == "done"
    assert decision["output_artifacts"] == ["goal.md", "hardware_profile.json"]
def test_execute_stage_contract_validation_missing_output_file_marks_failed(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    # An executor that claims an artifact but never writes it must fail validation.
    def bad_executor(
        _stage_dir: Path,
        _run_dir: Path,
        _config: RCConfig,
        _adapters: AdapterBundle,
        *,
        llm: object = None,
    ):
        _ = llm
        return rc_executor.StageResult(
            stage=Stage.TOPIC_INIT, status=StageStatus.DONE, artifacts=("goal.md",)
        )

    monkeypatch.setitem(rc_executor._STAGE_EXECUTORS, Stage.TOPIC_INIT, bad_executor)
    result = rc_executor.execute_stage(
        Stage.TOPIC_INIT,
        run_dir=run_dir,
        run_id="run-2",
        config=rc_config,
        adapters=adapters,
        auto_approve_gates=True,
    )
    assert result.status == StageStatus.FAILED
    assert "Missing or empty output: goal.md" in (result.error or "")


def test_execute_stage_contract_validation_missing_output_directory_marks_failed(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    _write_prior_artifact(run_dir, 5, "shortlist.jsonl", '{"title": "x"}')

    # Claims a directory artifact ("cards/") without creating it.
    def bad_executor(
        _stage_dir: Path,
        _run_dir: Path,
        _config: RCConfig,
        _adapters: AdapterBundle,
        *,
        llm: object = None,
    ):
        _ = llm
        return rc_executor.StageResult(
            stage=Stage.KNOWLEDGE_EXTRACT,
            status=StageStatus.DONE,
            artifacts=("cards/",),
        )

    monkeypatch.setitem(
        rc_executor._STAGE_EXECUTORS, Stage.KNOWLEDGE_EXTRACT, bad_executor
    )
    result = rc_executor.execute_stage(
        Stage.KNOWLEDGE_EXTRACT,
        run_dir=run_dir,
        run_id="run-3",
        config=rc_config,
        adapters=adapters,
        auto_approve_gates=True,
    )
    assert result.status == StageStatus.FAILED
    assert "Missing output directory: cards/" in (result.error or "")


def test_execute_stage_missing_required_input_returns_failed(
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    result = rc_executor.execute_stage(
        Stage.PROBLEM_DECOMPOSE,
        run_dir=run_dir,
        run_id="run-4",
        config=rc_config,
        adapters=adapters,
        auto_approve_gates=True,
    )
    assert result.status == StageStatus.FAILED
    assert "Missing input: goal.md" in (result.error or "")


def test_execute_stage_gate_behavior_auto_approve_true_keeps_done(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    _write_prior_artifact(run_dir, 4, "candidates.jsonl", '{"title": "paper"}')

    def good_executor(
        stage_dir: Path,
        _run_dir: Path,
        _config: RCConfig,
        _adapters: AdapterBundle,
        *,
        llm: object = None,
        **_kwargs: object,
    ):
        _ = llm
        (stage_dir / "shortlist.jsonl").write_text(
            '{"title": "paper"}\n', encoding="utf-8"
        )
        return rc_executor.StageResult(
            stage=Stage.LITERATURE_SCREEN,
            status=StageStatus.DONE,
            artifacts=("shortlist.jsonl",),
        )

    monkeypatch.setitem(
        rc_executor._STAGE_EXECUTORS, Stage.LITERATURE_SCREEN, good_executor
    )
    result = rc_executor.execute_stage(
        Stage.LITERATURE_SCREEN,
        run_dir=run_dir,
        run_id="run-5",
        config=rc_config,
        adapters=adapters,
        auto_approve_gates=True,
    )
    assert result.status == StageStatus.DONE
    memory_entries = getattr(adapters.memory, "entries", [])
    assert any(
        ns == "gates" and "auto-approved" in content
        for ns, content in memory_entries
    )


def test_execute_stage_gate_behavior_auto_approve_false_blocks(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    _write_prior_artifact(run_dir, 4, "candidates.jsonl", '{"title": "paper"}')

    def good_executor(
        stage_dir: Path,
        _run_dir: Path,
        _config: RCConfig,
        _adapters: AdapterBundle,
        *,
        llm: object = None,
        **_kwargs: object,
    ):
        _ = llm
        (stage_dir / "shortlist.jsonl").write_text(
            '{"title": "paper"}\n', encoding="utf-8"
        )
        return rc_executor.StageResult(
            stage=Stage.LITERATURE_SCREEN,
            status=StageStatus.DONE,
            artifacts=("shortlist.jsonl",),
        )

    monkeypatch.setitem(
        rc_executor._STAGE_EXECUTORS, Stage.LITERATURE_SCREEN, good_executor
    )
    result = rc_executor.execute_stage(
        Stage.LITERATURE_SCREEN,
        run_dir=run_dir,
        run_id="run-6",
        config=rc_config,
        adapters=adapters,
        auto_approve_gates=False,
    )
    assert result.status == StageStatus.BLOCKED_APPROVAL
    assert result.decision == "block"
    message_calls = getattr(adapters.message, "calls", [])
    assert message_calls
    assert "Approval required" in message_calls[-1][2]


def test_execute_stage_llm_client_creation_error_falls_back_without_crash(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    # LLM construction failure must degrade gracefully, not crash the stage.
    def boom(_config: RCConfig):
        raise RuntimeError("llm init failed")

    monkeypatch.setattr("researchclaw.pipeline.executor.LLMClient.from_rc_config", boom)
    result = rc_executor.execute_stage(
        Stage.TOPIC_INIT,
        run_dir=run_dir,
        run_id="run-7",
        config=rc_config,
        adapters=adapters,
        auto_approve_gates=True,
    )
    assert result.status == StageStatus.DONE
    assert (run_dir / "stage-01" / "goal.md").exists()


def test_execute_stage_executor_exception_returns_failed(
    monkeypatch: pytest.MonkeyPatch,
    run_dir: Path,
    rc_config: RCConfig,
    adapters: AdapterBundle,
) -> None:
    def raising_executor(
        _stage_dir: Path,
        _run_dir: Path,
        _config: RCConfig,
        _adapters: AdapterBundle,
        *,
        llm: object = None,
        **_kwargs: object,
    ):
        _ = llm
        raise RuntimeError("stage exploded")

    monkeypatch.setitem(
        rc_executor._STAGE_EXECUTORS, Stage.TOPIC_INIT, raising_executor
    )
    result = rc_executor.execute_stage(
        Stage.TOPIC_INIT,
        run_dir=run_dir,
        run_id="run-8",
        config=rc_config,
        adapters=adapters,
        auto_approve_gates=True,
    )
    assert result.status == StageStatus.FAILED
    assert result.decision == "retry"
    assert "stage exploded" in (result.error or "")
@pytest.mark.parametrize(
    "stage",
    [
        Stage.TOPIC_INIT,
        Stage.PROBLEM_DECOMPOSE,
        Stage.SEARCH_STRATEGY,
        Stage.LITERATURE_COLLECT,
        Stage.LITERATURE_SCREEN,
        Stage.KNOWLEDGE_EXTRACT,
        Stage.SYNTHESIS,
        Stage.HYPOTHESIS_GEN,
        Stage.EXPERIMENT_DESIGN,
        Stage.CODE_GENERATION,
    ],
)
def test_stage_executor_mapping_values_are_callable(stage: Stage) -> None:
    assert callable(rc_executor._STAGE_EXECUTORS[stage])


class TestStageHealth:
    """execute_stage should emit a stage_health.json telemetry file per stage."""

    def test_stage_health_json_written(self, tmp_path: Path) -> None:
        from researchclaw.pipeline.executor import execute_stage
        from researchclaw.pipeline.stages import Stage

        config = RCConfig.load(
            Path(__file__).parent.parent / "config.researchclaw.example.yaml",
            check_paths=False,
        )
        result = execute_stage(
            Stage.TOPIC_INIT,
            run_dir=tmp_path,
            run_id="test-health",
            config=config,
            adapters=AdapterBundle(),
            auto_approve_gates=True,
        )
        health_path = tmp_path / "stage-01" / "stage_health.json"
        assert result is not None
        assert health_path.exists()

    def test_stage_health_has_required_fields(self, tmp_path: Path) -> None:
        from unittest.mock import MagicMock, patch

        from researchclaw.pipeline.executor import execute_stage
        from researchclaw.pipeline.stages import Stage

        config = RCConfig.load(
            Path(__file__).parent.parent / "config.researchclaw.example.yaml",
            check_paths=False,
        )
        with patch("researchclaw.pipeline.executor.LLMClient") as mock_llm_cls:
            mock_client = MagicMock()
            mock_client.chat.return_value = MagicMock(
                content='{"topic": "test", "research_questions": ["q1"]}'
            )
            mock_llm_cls.from_rc_config.return_value = mock_client
            execute_stage(
                Stage.TOPIC_INIT,
                run_dir=tmp_path,
                run_id="test-health-fields",
                config=config,
                adapters=AdapterBundle(),
                auto_approve_gates=True,
            )
        health_path = tmp_path / "stage-01" / "stage_health.json"
        if health_path.exists():
            data = json.loads(health_path.read_text(encoding="utf-8"))
            assert "stage_id" in data
            assert "run_id" in data
            assert "duration_sec" in data
            assert "status" in data
            assert "timestamp" in data
            assert data["duration_sec"] >= 0

    def test_stage_health_duration_positive(self, tmp_path: Path) -> None:
        from unittest.mock import MagicMock, patch

        from researchclaw.pipeline.executor import execute_stage
        from researchclaw.pipeline.stages import Stage

        config = RCConfig.load(
            Path(__file__).parent.parent / "config.researchclaw.example.yaml",
            check_paths=False,
        )
        with patch("researchclaw.pipeline.executor.LLMClient") as mock_llm_cls:
            mock_client = MagicMock()
            mock_client.chat.return_value = MagicMock(
                content='{"topic": "test", "sub_problems": []}'
            )
            mock_llm_cls.from_rc_config.return_value = mock_client
            execute_stage(
                Stage.TOPIC_INIT,
                run_dir=tmp_path,
                run_id="test-duration",
                config=config,
                adapters=AdapterBundle(),
                auto_approve_gates=True,
            )
        health_path = tmp_path / "stage-01" / "stage_health.json"
        if health_path.exists():
            data = json.loads(health_path.read_text(encoding="utf-8"))
            assert data["duration_sec"] >= 0


# Contracts import for Stage 13/22 preservation features.
from researchclaw.pipeline.contracts import CONTRACTS


class TestIterativeRefine:
    """Stage-13 iterative refinement: skip, fallback, convergence, artifacts."""

    def _prepare_refine_inputs(self, run_dir: Path) -> None:
        # Seed stage-10 code and a stage-12 run record that refinement consumes.
        _write_prior_artifact(
            run_dir,
            10,
            "experiment.py",
            (
                "import random\n"
                "random.seed(42)\n"
                "for i in range(5):\n"
                "    print(f'val_loss: {0.5 - i*0.05:.4f}')\n"
            ),
        )
        (run_dir / "stage-12" / "runs").mkdir(parents=True, exist_ok=True)
        _write_prior_artifact(
            run_dir,
            12,
            "runs/run-1.json",
            json.dumps(
                {
                    "run_id": "run-1",
                    "status": "completed",
                    "metrics": {"val_loss": 0.35},
                }
            ),
        )

    def test_refine_simulated_mode_skips(
        self,
        run_dir: Path,
        rc_config: RCConfig,
        adapters: AdapterBundle,
    ) -> None:
        """R10-Fix3: Simulated mode should skip iterative refinement entirely."""
        self._prepare_refine_inputs(run_dir)
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        # Force simulated mode to test the skip behavior
        import copy

        sim_cfg = copy.deepcopy(rc_config)
        object.__setattr__(sim_cfg.experiment, "mode", "simulated")
        result = rc_executor._execute_iterative_refine(
            stage_dir,
            run_dir,
            sim_cfg,
            adapters,
            llm=None,
        )
        payload = json.loads(
            (stage_dir / "refinement_log.json").read_text(encoding="utf-8")
        )
        assert payload["skipped"] is True
        assert payload["mode"] == "simulated"
        assert result.status == StageStatus.DONE
        # Original code should be copied as final
        assert (stage_dir / "experiment_final.py").exists()

    def test_refine_no_llm_saves_original_as_final(
        self,
        run_dir: Path,
        rc_config: RCConfig,
        adapters: AdapterBundle,
    ) -> None:
        self._prepare_refine_inputs(run_dir)
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        result = rc_executor._execute_iterative_refine(
            stage_dir,
            run_dir,
            rc_config,
            adapters,
            llm=None,
        )
        original_code = (run_dir / "stage-10" / "experiment.py").read_text(
            encoding="utf-8"
        )
        final_code = (stage_dir / "experiment_final.py").read_text(encoding="utf-8")
        assert original_code == final_code
        payload = json.loads(
            (stage_dir / "refinement_log.json").read_text(encoding="utf-8")
        )
        assert payload["stop_reason"] == "llm_unavailable"
        assert result.status == StageStatus.DONE
"refinement_log.json").read_text(encoding="utf-8") ) assert payload["stop_reason"] == "llm_unavailable" assert result.status == StageStatus.DONE def test_refine_with_llm_generates_improved_code( self, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: self._prepare_refine_inputs(run_dir) stage_dir = run_dir / "stage-13" stage_dir.mkdir(parents=True, exist_ok=True) llm = FakeLLMClient( "```python\n" "import random\n" "random.seed(42)\n" "for i in range(10):\n" " print(f'val_loss: {0.4 - i*0.03:.4f}')\n" "```" ) rc_executor._execute_iterative_refine( stage_dir, run_dir, rc_config, adapters, llm=llm ) assert (stage_dir / "experiment_v1").is_dir() assert (stage_dir / "experiment_final.py").exists() payload = json.loads( (stage_dir / "refinement_log.json").read_text(encoding="utf-8") ) assert isinstance(payload.get("iterations"), list) assert payload["iterations"] def test_refine_converges_after_no_improvement( self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle, ) -> None: import sys self._prepare_refine_inputs(run_dir) stage_dir = run_dir / "stage-13" stage_dir.mkdir(parents=True, exist_ok=True) sandbox_data = { "project": {"name": "rc-test", "mode": "docs-first"}, "research": { "topic": "test-driven science", "domains": ["ml", "systems"], "daily_paper_count": 2, "quality_threshold": 8.2, }, "runtime": {"timezone": "UTC"}, "notifications": { "channel": "local", "on_stage_start": True, "on_stage_fail": False, "on_gate_required": True, }, "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")}, "openclaw_bridge": {"use_memory": True, "use_message": True}, "llm": { "provider": "openai-compatible", "base_url": "http://localhost:1234/v1", "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key", "primary_model": "fake-model", "fallback_models": [], }, "security": {"hitl_required_stages": [5, 9, 20]}, "experiment": { "mode": "sandbox", "time_budget_sec": 30, "max_iterations": 3, "metric_key": "val_loss", "metric_direction": 
"minimize", "sandbox": { "python_path": sys.executable, "gpu_required": False, "max_memory_mb": 1024, }, }, } sandbox_config = RCConfig.from_dict( sandbox_data, project_root=tmp_path, check_paths=False, ) llm = FakeLLMClient( "```python\nfor _ in range(3):\n print('val_loss: 0.5000')\n```" ) rc_executor._execute_iterative_refine( stage_dir, run_dir, sandbox_config, adapters, llm=llm, ) payload = json.loads( (stage_dir / "refinement_log.json").read_text(encoding="utf-8") ) assert payload["converged"] is True assert payload["stop_reason"] == "no_improvement_for_2_iterations" def test_refine_artifacts_include_version_files( self, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: self._prepare_refine_inputs(run_dir) stage_dir = run_dir / "stage-13" stage_dir.mkdir(parents=True, exist_ok=True) llm = FakeLLMClient( "```python\n" "import random\n" "random.seed(42)\n" "for i in range(10):\n" " print(f'val_loss: {0.4 - i*0.03:.4f}')\n" "```" ) result = rc_executor._execute_iterative_refine( stage_dir, run_dir, rc_config, adapters, llm=llm, ) assert "refinement_log.json" in result.artifacts assert "experiment_final/" in result.artifacts assert any( artifact.startswith("experiment_v") and artifact.endswith("/") for artifact in result.artifacts ) def test_refine_sandbox_mode_runs_code( self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle, ) -> None: import sys self._prepare_refine_inputs(run_dir) stage_dir = run_dir / "stage-13" stage_dir.mkdir(parents=True, exist_ok=True) sandbox_data = { "project": {"name": "rc-test", "mode": "docs-first"}, "research": { "topic": "test-driven science", "domains": ["ml", "systems"], "daily_paper_count": 2, "quality_threshold": 8.2, }, "runtime": {"timezone": "UTC"}, "notifications": { "channel": "local", "on_stage_start": True, "on_stage_fail": False, "on_gate_required": True, }, "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")}, "openclaw_bridge": {"use_memory": True, "use_message": True}, 
"llm": { "provider": "openai-compatible", "base_url": "http://localhost:1234/v1", "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key", "primary_model": "fake-model", "fallback_models": [], }, "security": {"hitl_required_stages": [5, 9, 20]}, "experiment": { "mode": "sandbox", "time_budget_sec": 30, "max_iterations": 3, "metric_key": "val_loss", "metric_direction": "minimize", "sandbox": { "python_path": sys.executable, "gpu_required": False, "max_memory_mb": 1024, }, }, } sandbox_config = RCConfig.from_dict( sandbox_data, project_root=tmp_path, check_paths=False, ) llm = FakeLLMClient( "```python\n" "import random\n" "random.seed(42)\n" "for i in range(10):\n" " print(f'val_loss: {0.4 - i*0.03:.4f}')\n" "```" ) rc_executor._execute_iterative_refine( stage_dir, run_dir, sandbox_config, adapters, llm=llm, ) payload = json.loads( (stage_dir / "refinement_log.json").read_text(encoding="utf-8") ) assert any( "sandbox" in iteration for iteration in payload.get("iterations", []) ) class TestExportPublishCodePackage: def test_export_packages_experiment_final( self, tmp_path: Path, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: _write_prior_artifact( run_dir, 19, "paper_revised.md", "# Test Paper\n\nSome content..." ) _write_prior_artifact( run_dir, 13, "experiment_final.py", 'import numpy\nprint("val_loss: 0.1")\n', ) stage_dir = tmp_path / "run" / "stage-22" stage_dir.mkdir(parents=True, exist_ok=True) rc_executor._execute_export_publish( stage_dir, run_dir, rc_config, adapters, llm=None ) assert (stage_dir / "code" / "experiment.py").exists() assert (stage_dir / "code" / "README.md").exists() req_text = (stage_dir / "code" / "requirements.txt").read_text(encoding="utf-8") assert "numpy" in req_text def test_export_falls_back_to_experiment_py( self, tmp_path: Path, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: _write_prior_artifact( run_dir, 19, "paper_revised.md", "# Test Paper\n\nSome content..." 
) _write_prior_artifact( run_dir, 10, "experiment.py", 'import numpy\nprint("val_loss: 0.1")\n', ) stage_dir = tmp_path / "run" / "stage-22" stage_dir.mkdir(parents=True, exist_ok=True) rc_executor._execute_export_publish( stage_dir, run_dir, rc_config, adapters, llm=None ) code_text = (stage_dir / "code" / "experiment.py").read_text(encoding="utf-8") assert "val_loss: 0.1" in code_text def test_export_no_experiment_skips_code_dir( self, tmp_path: Path, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: _write_prior_artifact( run_dir, 19, "paper_revised.md", "# Test Paper\n\nSome content..." ) stage_dir = tmp_path / "run" / "stage-22" stage_dir.mkdir(parents=True, exist_ok=True) result = rc_executor._execute_export_publish( stage_dir, run_dir, rc_config, adapters, llm=None, ) assert not (stage_dir / "code").exists() assert "code/" not in result.artifacts def test_export_detects_multiple_dependencies( self, tmp_path: Path, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: _write_prior_artifact( run_dir, 19, "paper_revised.md", "# Test Paper\n\nSome content..." ) _write_prior_artifact( run_dir, 13, "experiment_final.py", ( "import numpy\n" "import torch\n" "from sklearn.metrics import accuracy_score\n" "print(accuracy_score([1], [1]))\n" ), ) stage_dir = tmp_path / "run" / "stage-22" stage_dir.mkdir(parents=True, exist_ok=True) rc_executor._execute_export_publish( stage_dir, run_dir, rc_config, adapters, llm=None ) requirements = (stage_dir / "code" / "requirements.txt").read_text( encoding="utf-8" ) assert "numpy" in requirements assert "torch" in requirements assert "scikit-learn" in requirements def test_export_code_readme_contains_title( self, tmp_path: Path, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: _write_prior_artifact( run_dir, 19, "paper_revised.md", "# My Great Paper\n\nSome content..." 
) _write_prior_artifact( run_dir, 13, "experiment_final.py", 'print("val_loss: 0.1")\n', ) stage_dir = tmp_path / "run" / "stage-22" stage_dir.mkdir(parents=True, exist_ok=True) rc_executor._execute_export_publish( stage_dir, run_dir, rc_config, adapters, llm=None ) readme = (stage_dir / "code" / "README.md").read_text(encoding="utf-8") assert "My Great Paper" in readme def test_contracts_stage13_includes_experiment_final() -> None: assert "experiment_final/" in CONTRACTS[Stage.ITERATIVE_REFINE].output_files def test_contracts_stage22_includes_code_dir() -> None: assert "code/" in CONTRACTS[Stage.EXPORT_PUBLISH].output_files # ── P1-1: Topic keyword extraction tests ── class TestExtractTopicKeywords: def test_basic_extraction(self) -> None: keywords = rc_executor._extract_topic_keywords( "Agent-based Reinforcement Learning for Automated Scientific Discovery" ) assert "agent-based" in keywords assert "reinforcement" in keywords assert "learning" in keywords assert "automated" in keywords assert "scientific" in keywords assert "discovery" in keywords # Stop words excluded # Stop words excluded assert "for" not in keywords def test_includes_domain_keywords(self) -> None: keywords = rc_executor._extract_topic_keywords( "Neural network pruning", domains=("ml", "optimization") ) assert "neural" in keywords assert "network" in keywords assert "pruning" in keywords assert "ml" in keywords assert "optimization" in keywords def test_deduplication(self) -> None: keywords = rc_executor._extract_topic_keywords( "Learning to learn meta-learning", domains=("learning",) ) assert keywords.count("learning") == 1 def test_empty_topic(self) -> None: keywords = rc_executor._extract_topic_keywords("") assert keywords == [] # ── P1-2: Topic constraint block test ── class TestTopicConstraintBlock: def test_contains_topic(self) -> None: block = rc_executor._topic_constraint_block("Transformer attention for time series") assert "Transformer attention for time series" in block def 
test_contains_prohibition(self) -> None: block = rc_executor._topic_constraint_block("anything") assert "PROHIBITED" in block assert "environment" in block.lower() assert "infrastructure" in block.lower() def test_hard_constraint_markers(self) -> None: block = rc_executor._topic_constraint_block("test") assert "HARD TOPIC CONSTRAINT" in block assert "END CONSTRAINT" in block # ── Multi-perspective debate tests ── class TestParseDecision: def test_proceed_default(self) -> None: assert rc_executor._parse_decision("Some random text") == "proceed" def test_proceed_explicit(self) -> None: text = "## Decision\nPROCEED\n## Justification\nGood results." assert rc_executor._parse_decision(text) == "proceed" def test_pivot_detected(self) -> None: text = "## Decision\nPIVOT\n## Justification\nHypotheses flawed." assert rc_executor._parse_decision(text) == "pivot" def test_refine_detected(self) -> None: text = "## Decision\nREFINE\n## Justification\nNeed more tuning." assert rc_executor._parse_decision(text) == "refine" def test_pivot_case_insensitive(self) -> None: text = "## Decision\npivot\n## Justification\nBad approach." assert rc_executor._parse_decision(text) == "pivot" def test_pivot_takes_priority_over_proceed(self) -> None: text = "## Decision\nPIVOT\nWe should not PROCEED." assert rc_executor._parse_decision(text) == "pivot" def test_decision_in_body_not_heading(self) -> None: text = "The results suggest we should PIVOT to a new approach." 
assert rc_executor._parse_decision(text) == "pivot" class TestResearchDecisionStructured: def test_decision_produces_structured_json( self, tmp_path: Path, rc_config: RCConfig, adapters: AdapterBundle ) -> None: run_dir = tmp_path / "run" run_dir.mkdir() stage_dir = run_dir / "stage-15" stage_dir.mkdir(parents=True) _write_prior_artifact(run_dir, 14, "analysis.md", "# Analysis\nResults ok.") fake_llm = FakeLLMClient("## Decision\nPROCEED\n## Justification\nGood.") result = rc_executor._execute_research_decision( stage_dir, run_dir, rc_config, adapters, llm=fake_llm ) assert result.decision == "proceed" assert "decision_structured.json" in result.artifacts import json data = json.loads((stage_dir / "decision_structured.json").read_text()) assert data["decision"] == "proceed" def test_pivot_decision_from_llm( self, tmp_path: Path, rc_config: RCConfig, adapters: AdapterBundle ) -> None: run_dir = tmp_path / "run" run_dir.mkdir() stage_dir = run_dir / "stage-15" stage_dir.mkdir(parents=True) _write_prior_artifact(run_dir, 14, "analysis.md", "# Analysis\nBad results.") fake_llm = FakeLLMClient("## Decision\nPIVOT\n## Justification\nFlawed.") result = rc_executor._execute_research_decision( stage_dir, run_dir, rc_config, adapters, llm=fake_llm ) assert result.decision == "pivot" def test_no_llm_defaults_to_proceed( self, tmp_path: Path, rc_config: RCConfig, adapters: AdapterBundle ) -> None: run_dir = tmp_path / "run" run_dir.mkdir() stage_dir = run_dir / "stage-15" stage_dir.mkdir(parents=True) result = rc_executor._execute_research_decision( stage_dir, run_dir, rc_config, adapters, llm=None ) assert result.decision == "proceed" class TestMultiPerspectiveGenerate: def test_generates_all_perspectives(self, tmp_path: Path) -> None: roles = { "role_a": {"system": "You are A.", "user": "Do A for {topic}."}, "role_b": {"system": "You are B.", "user": "Do B for {topic}."}, } fake_llm = FakeLLMClient("perspective output") perspectives_dir = tmp_path / "perspectives" result = 
rc_executor._multi_perspective_generate( fake_llm, roles, {"topic": "test"}, perspectives_dir ) assert set(result.keys()) == {"role_a", "role_b"} assert (perspectives_dir / "role_a.md").exists() assert (perspectives_dir / "role_b.md").exists() assert len(fake_llm.calls) == 2 def test_saves_perspective_content(self, tmp_path: Path) -> None: roles = {"critic": {"system": "Be critical.", "user": "Criticize {topic}."}} fake_llm = FakeLLMClient("critical analysis here") perspectives_dir = tmp_path / "perspectives" rc_executor._multi_perspective_generate( fake_llm, roles, {"topic": "ml"}, perspectives_dir ) content = (perspectives_dir / "critic.md").read_text() assert content == "critical analysis here" def test_renders_variables_in_prompts(self, tmp_path: Path) -> None: roles = {"r1": {"system": "Sys for {topic}.", "user": "User for {topic}."}} fake_llm = FakeLLMClient("ok") rc_executor._multi_perspective_generate( fake_llm, roles, {"topic": "RL"}, tmp_path / "p" ) call = fake_llm.calls[0] assert "RL" in call[0]["content"] class TestSynthesizePerspectives: def test_combines_perspectives(self) -> None: fake_llm = FakeLLMClient("synthesized result") pm = rc_executor.PromptManager() perspectives = {"innovator": "idea A", "contrarian": "idea B"} result = rc_executor._synthesize_perspectives( fake_llm, perspectives, "hypothesis_synthesize", pm ) assert result == "synthesized result" # Check the user prompt contained both perspectives call_content = fake_llm.calls[0][0]["content"] assert "innovator" in call_content assert "contrarian" in call_content class TestHypothesisGenDebate: def test_hypothesis_gen_with_llm_creates_perspectives( self, tmp_path: Path, rc_config: RCConfig, adapters: AdapterBundle ) -> None: run_dir = tmp_path / "run" run_dir.mkdir() stage_dir = run_dir / "stage-08" stage_dir.mkdir(parents=True) _write_prior_artifact(run_dir, 7, "synthesis.md", "# Synthesis\nGap found.") fake_llm = FakeLLMClient("## H1\nTest hypothesis") result = 
rc_executor._execute_hypothesis_gen( stage_dir, run_dir, rc_config, adapters, llm=fake_llm ) assert result.status == StageStatus.DONE assert "hypotheses.md" in result.artifacts perspectives_dir = stage_dir / "perspectives" assert perspectives_dir.exists() # Should have 3 perspective files (innovator, pragmatist, contrarian) perspective_files = list(perspectives_dir.glob("*.md")) assert len(perspective_files) == 3 def test_hypothesis_gen_without_llm_no_perspectives( self, tmp_path: Path, rc_config: RCConfig, adapters: AdapterBundle ) -> None: run_dir = tmp_path / "run" run_dir.mkdir() stage_dir = run_dir / "stage-08" stage_dir.mkdir(parents=True) _write_prior_artifact(run_dir, 7, "synthesis.md", "# Synthesis\nGap found.") result = rc_executor._execute_hypothesis_gen( stage_dir, run_dir, rc_config, adapters, llm=None ) assert result.status == StageStatus.DONE assert "hypotheses.md" in result.artifacts # No perspectives directory when no LLM assert not (stage_dir / "perspectives").exists() class TestResultAnalysisDebate: def test_result_analysis_with_llm_creates_perspectives( self, tmp_path: Path, rc_config: RCConfig, adapters: AdapterBundle ) -> None: run_dir = tmp_path / "run" run_dir.mkdir() stage_dir = run_dir / "stage-14" stage_dir.mkdir(parents=True) _write_prior_artifact(run_dir, 1, "goal.md", "# Goal\nTest") _write_prior_artifact(run_dir, 8, "hypotheses.md", "# H1\nTest") fake_llm = FakeLLMClient("## Analysis\nResults look good.") result = rc_executor._execute_result_analysis( stage_dir, run_dir, rc_config, adapters, llm=fake_llm ) assert result.status == StageStatus.DONE assert "analysis.md" in result.artifacts perspectives_dir = stage_dir / "perspectives" assert perspectives_dir.exists() # Should have 3 perspective files (optimist, skeptic, methodologist) perspective_files = list(perspectives_dir.glob("*.md")) assert len(perspective_files) == 3 def test_result_analysis_without_llm_no_perspectives( self, tmp_path: Path, rc_config: RCConfig, adapters: 
AdapterBundle ) -> None: run_dir = tmp_path / "run" run_dir.mkdir() stage_dir = run_dir / "stage-14" stage_dir.mkdir(parents=True) result = rc_executor._execute_result_analysis( stage_dir, run_dir, rc_config, adapters, llm=None ) assert result.status == StageStatus.DONE assert "analysis.md" in result.artifacts assert not (stage_dir / "perspectives").exists() class TestParseMetricsFromStdout: """Tests for _parse_metrics_from_stdout() helper.""" def test_parses_simple_name_value(self) -> None: from researchclaw.pipeline.executor import _parse_metrics_from_stdout stdout = "loss: 0.0042\naccuracy: 0.95" metrics = _parse_metrics_from_stdout(stdout) assert metrics["loss"] == pytest.approx(0.0042) assert metrics["accuracy"] == pytest.approx(0.95) def test_parses_compound_names(self) -> None: from researchclaw.pipeline.executor import _parse_metrics_from_stdout stdout = "UCB (Stochastic) cumulative_regret: 361.9233\nEXP3 (Adversarial) total_rewards: 13368.4811" metrics = _parse_metrics_from_stdout(stdout) assert "UCB (Stochastic) cumulative_regret" in metrics assert metrics["UCB (Stochastic) cumulative_regret"] == pytest.approx(361.9233) def test_ignores_non_numeric_lines(self) -> None: from researchclaw.pipeline.executor import _parse_metrics_from_stdout stdout = "Running experiment...\nloss: 0.5\nDone." 
metrics = _parse_metrics_from_stdout(stdout) assert len(metrics) == 1 assert metrics["loss"] == pytest.approx(0.5) def test_empty_stdout_returns_empty_dict(self) -> None: from researchclaw.pipeline.executor import _parse_metrics_from_stdout assert _parse_metrics_from_stdout("") == {} def test_handles_negative_values(self) -> None: from researchclaw.pipeline.executor import _parse_metrics_from_stdout stdout = "UCB (Adversarial) cumulative_regret: -3877.5323" metrics = _parse_metrics_from_stdout(stdout) assert metrics["UCB (Adversarial) cumulative_regret"] == pytest.approx(-3877.5323) def test_filters_log_lines(self) -> None: from researchclaw.pipeline.executor import _parse_metrics_from_stdout stdout = ( "Running experiments for support set size: 1\n" "Loading model weights: 42\n" "Training epoch: 5\n" "loss: 0.123\n" "accuracy: 0.95\n" ) metrics = _parse_metrics_from_stdout(stdout) assert "loss" in metrics assert "accuracy" in metrics assert len(metrics) == 2 # log lines should be excluded def test_filters_long_name_lines(self) -> None: from researchclaw.pipeline.executor import _parse_metrics_from_stdout stdout = "this is a very long status message that should not be a metric: 42\n" metrics = _parse_metrics_from_stdout(stdout) assert len(metrics) == 0 class TestDetectRuntimeIssues: """Tests for _detect_runtime_issues() helper.""" def _make_sandbox_result( self, metrics: dict | None = None, stdout: str = "", stderr: str = "", ): from types import SimpleNamespace return SimpleNamespace( metrics=metrics or {}, stdout=stdout, stderr=stderr, returncode=0, elapsed_sec=1.0, timed_out=False, ) def test_no_issues_returns_empty_string(self) -> None: r = self._make_sandbox_result(metrics={"loss": 0.5}, stdout="loss: 0.5") assert rc_executor._detect_runtime_issues(r) == "" def test_detects_nan_in_metrics(self) -> None: r = self._make_sandbox_result(metrics={"loss": float("nan")}) result = rc_executor._detect_runtime_issues(r) assert "NaN" in result assert "loss" in result def 
test_detects_inf_in_metrics(self) -> None: r = self._make_sandbox_result(metrics={"loss": float("inf")}) result = rc_executor._detect_runtime_issues(r) assert "Inf" in result def test_detects_nan_in_stdout(self) -> None: r = self._make_sandbox_result(stdout="accuracy: nan\nloss: 0.5") result = rc_executor._detect_runtime_issues(r) assert "NaN" in result or "nan" in result def test_detects_runtime_warning_in_stderr(self) -> None: stderr = ( "optimizers.py:76: RuntimeWarning: invalid value encountered in divide\n" " directions = np.vstack((directions[1:], new_direction / norm))\n" ) r = self._make_sandbox_result(stderr=stderr) result = rc_executor._detect_runtime_issues(r) assert "RuntimeWarning" in result assert "invalid value" in result def test_detects_division_error_in_stderr(self) -> None: stderr = "ZeroDivisionError: division by zero\n" r = self._make_sandbox_result(stderr=stderr) result = rc_executor._detect_runtime_issues(r) assert "Error" in result def test_ignores_benign_stderr(self) -> None: # Non-warning stderr should be ignored r = self._make_sandbox_result(stderr="Loading module...\nDone.\n") assert rc_executor._detect_runtime_issues(r) == "" def test_combined_nan_and_stderr(self) -> None: r = self._make_sandbox_result( metrics={"accuracy": float("nan")}, stderr="RuntimeWarning: invalid value\n", ) result = rc_executor._detect_runtime_issues(r) assert "NaN" in result assert "RuntimeWarning" in result def test_detects_dummy_metric_identical_values(self) -> None: stdout = ( "UCB (Stochastic) convergence_rate: 1.0000\n" "UCB (Adversarial) convergence_rate: 1.0000\n" "Thompson (Stochastic) convergence_rate: 1.0000\n" "Thompson (Adversarial) convergence_rate: 1.0000\n" ) r = self._make_sandbox_result(stdout=stdout) result = rc_executor._detect_runtime_issues(r) assert "DUMMY" in result assert "convergence_rate" in result def test_no_dummy_metric_when_values_differ(self) -> None: stdout = ( "UCB (Stochastic) regret: 78.5\n" "Thompson (Stochastic) regret: 
121.0\n" "EpsilonGreedy (Stochastic) regret: 42.1\n" ) r = self._make_sandbox_result(stdout=stdout) result = rc_executor._detect_runtime_issues(r) assert "DUMMY" not in result class TestRemoveBibtexEntries: """Tests for _remove_bibtex_entries() helper.""" def test_removes_specified_keys(self) -> None: bib = ( '@article{smith2024,\n title={Good Paper},\n author={Smith},\n}\n\n' '@article{venus2024,\n title={Venus Exploration},\n author={NASA},\n}\n' ) result = rc_executor._remove_bibtex_entries(bib, {"venus2024"}) assert "smith2024" in result assert "venus2024" not in result def test_keeps_all_when_no_match(self) -> None: bib = '@article{smith2024,\n title={Paper},\n}\n' result = rc_executor._remove_bibtex_entries(bib, {"other_key"}) assert "smith2024" in result def test_empty_bib(self) -> None: assert rc_executor._remove_bibtex_entries("", {"key"}) == "" class TestRemoveCitationsFromText: """Tests for _remove_citations_from_text() helper.""" def test_removes_latex_cite(self) -> None: text = r"As shown in \cite{venus2024}, the results are..." result = rc_executor._remove_citations_from_text(text, {"venus2024"}) assert "venus2024" not in result assert "results are" in result def test_removes_markdown_cite(self) -> None: text = "Prior work [venus2024] explored this topic." 
result = rc_executor._remove_citations_from_text(text, {"venus2024"}) assert "venus2024" not in result def test_cleans_multi_cite_comma(self) -> None: text = r"\cite{good2024,venus2024}" result = rc_executor._remove_citations_from_text(text, {"venus2024"}) assert r"\cite{good2024}" in result class TestCollectRawExperimentMetrics: """Tests for _collect_raw_experiment_metrics() helper.""" def test_returns_empty_when_no_runs(self, tmp_path: Path) -> None: run_dir = tmp_path / "run" run_dir.mkdir() block, has_parsed = rc_executor._collect_raw_experiment_metrics(run_dir) assert block == "" assert not has_parsed def test_extracts_metrics_from_stdout(self, tmp_path: Path) -> None: run_dir = tmp_path / "run" runs_dir = run_dir / "stage-12" / "runs" runs_dir.mkdir(parents=True) payload = { "metrics": {}, "stdout": "UCB regret: 361.92\nThompson regret: 576.24\n", } (runs_dir / "run-1.json").write_text(json.dumps(payload)) result, has_parsed = rc_executor._collect_raw_experiment_metrics(run_dir) assert "361.92" in result assert "576.24" in result assert "1 run(s)" in result assert not has_parsed def test_extracts_from_metrics_dict(self, tmp_path: Path) -> None: run_dir = tmp_path / "run" runs_dir = run_dir / "stage-12" / "runs" runs_dir.mkdir(parents=True) payload = {"metrics": {"loss": 0.042, "accuracy": 0.95}, "stdout": ""} (runs_dir / "run-1.json").write_text(json.dumps(payload)) result, has_parsed = rc_executor._collect_raw_experiment_metrics(run_dir) assert "loss" in result assert "0.042" in result assert has_parsed def test_deduplicates_metrics(self, tmp_path: Path) -> None: run_dir = tmp_path / "run" runs_dir = run_dir / "stage-12" / "runs" runs_dir.mkdir(parents=True) payload = { "metrics": {"loss": 0.5}, "stdout": "loss: 0.5\nloss: 0.5\n", } (runs_dir / "run-1.json").write_text(json.dumps(payload)) result, _ = rc_executor._collect_raw_experiment_metrics(run_dir) # "loss: 0.5" should appear only once (deduplicated) assert result.count("loss: 0.5") == 1 class 
TestCollectExperimentEvidence: """Tests for _collect_experiment_evidence() helper.""" def test_returns_empty_when_no_artifacts(self, tmp_path: Path) -> None: run_dir = tmp_path / "run" run_dir.mkdir() assert rc_executor._collect_experiment_evidence(run_dir) == "" def test_includes_main_py_code(self, run_dir: Path) -> None: exp_dir = run_dir / "stage-10" / "experiment" exp_dir.mkdir(parents=True, exist_ok=True) (exp_dir / "main.py").write_text("print('hello')", encoding="utf-8") result = rc_executor._collect_experiment_evidence(run_dir) assert "main.py" in result assert "hello" in result def test_includes_run_metrics(self, run_dir: Path) -> None: runs_dir = run_dir / "stage-12" / "runs" runs_dir.mkdir(parents=True, exist_ok=True) (runs_dir / "run-1.json").write_text( json.dumps({"metrics": {"loss": 0.5}, "elapsed_sec": 3.2}), encoding="utf-8", ) result = rc_executor._collect_experiment_evidence(run_dir) assert "loss" in result assert "0.5" in result def test_includes_stderr_excerpt(self, run_dir: Path) -> None: runs_dir = run_dir / "stage-12" / "runs" runs_dir.mkdir(parents=True, exist_ok=True) (runs_dir / "run-1.json").write_text( json.dumps({ "metrics": {"loss": 0.5}, "stderr": "RuntimeWarning: divide by zero", }), encoding="utf-8", ) result = rc_executor._collect_experiment_evidence(run_dir) assert "divide by zero" in result def test_includes_refinement_summary(self, run_dir: Path) -> None: refine_dir = run_dir / "stage-13" refine_dir.mkdir(parents=True, exist_ok=True) (refine_dir / "refinement_log.json").write_text( json.dumps({ "iterations": [{"iteration": 1}, {"iteration": 2}], "converged": True, "stop_reason": "no_improvement_for_2_iterations", "best_metric": 0.3, }), encoding="utf-8", ) result = rc_executor._collect_experiment_evidence(run_dir) assert "iterations_executed" in result assert "2" in result def test_includes_actual_trial_count(self, run_dir: Path) -> None: runs_dir = run_dir / "stage-12" / "runs" runs_dir.mkdir(parents=True, exist_ok=True) 
(runs_dir / "run-1.json").write_text( json.dumps({"metrics": {"loss": 0.5}}), encoding="utf-8" ) result = rc_executor._collect_experiment_evidence(run_dir) assert "1 time(s)" in result assert "CRITICAL" in result class TestWritePaperSections: """Tests for _write_paper_sections() multi-call writing.""" def test_produces_three_part_draft(self) -> None: call_count = {"n": 0} parts = [ "# Test Title\n\n## Abstract\nTest abstract.\n\n## Introduction\nTest intro.\n\n## Related Work\nTest related.", "## Method\nTest method.\n\n## Experiments\nTest experiments.", "## Results\nTest results.\n\n## Discussion\nTest discussion.\n\n## Limitations\nTest limits.\n\n## Conclusion\nTest conclusion.", ] class MultiCallLLM: def __init__(self): self.calls: list = [] def chat(self, messages, **kwargs): self.calls.append(messages) from researchclaw.llm.client import LLMResponse idx = len(self.calls) - 1 return LLMResponse(content=parts[min(idx, 2)], model="fake") llm = MultiCallLLM() from researchclaw.prompts import PromptManager pm = PromptManager() draft = rc_executor._write_paper_sections( llm=llm, pm=pm, preamble="Test preamble", topic_constraint="", exp_metrics_instruction="", citation_instruction="", outline="Test outline", ) assert llm.calls is not None assert len(llm.calls) == 3 assert "## Abstract" in draft assert "## Method" in draft assert "## Results" in draft assert "## Conclusion" in draft def test_each_call_receives_prior_context(self) -> None: class ContextTrackingLLM: def __init__(self): self.user_prompts: list[str] = [] def chat(self, messages, **kwargs): for m in messages: if m.get("role") == "user": self.user_prompts.append(m["content"]) from researchclaw.llm.client import LLMResponse return LLMResponse(content="## Section\nContent here.", model="fake") llm = ContextTrackingLLM() from researchclaw.prompts import PromptManager pm = PromptManager() rc_executor._write_paper_sections( llm=llm, pm=pm, preamble="Preamble", topic_constraint="", exp_metrics_instruction="", 
citation_instruction="", outline="Outline", ) assert len(llm.user_prompts) == 3 # Call 2 and 3 should contain "sections written so far" assert "sections written so far" in llm.user_prompts[1] assert "completing a paper" in llm.user_prompts[2] class TestLoadHardwareProfile: """Tests for _load_hardware_profile().""" @pytest.fixture() def run_dir(self, tmp_path: Path) -> Path: d = tmp_path / "run" d.mkdir() return d def test_loads_valid_profile(self, run_dir: Path) -> None: stage = run_dir / "stage-01" stage.mkdir() profile = {"has_gpu": True, "gpu_type": "mps", "tier": "limited"} (stage / "hardware_profile.json").write_text( json.dumps(profile), encoding="utf-8" ) result = rc_executor._load_hardware_profile(run_dir) assert result is not None assert result["gpu_type"] == "mps" def test_returns_none_when_missing(self, run_dir: Path) -> None: assert rc_executor._load_hardware_profile(run_dir) is None def test_returns_none_on_invalid_json(self, run_dir: Path) -> None: stage = run_dir / "stage-01" stage.mkdir() (stage / "hardware_profile.json").write_text("not json", encoding="utf-8") assert rc_executor._load_hardware_profile(run_dir) is None class TestExpandSearchQueries: """Tests for _expand_search_queries().""" def test_adds_broader_queries(self) -> None: queries = ["gradient descent optimization algorithms"] topic = "Comparing gradient descent optimization algorithms on benchmark functions" result = rc_executor._expand_search_queries(queries, topic) assert len(result) > len(queries) def test_deduplicates(self) -> None: queries = ["gradient descent survey"] topic = "gradient descent optimization" result = rc_executor._expand_search_queries(queries, topic) lowered = [q.lower().strip() for q in result] assert len(lowered) == len(set(lowered)) def test_preserves_original_queries(self) -> None: queries = ["query A", "query B"] topic = "some research topic about machine learning methods" result = rc_executor._expand_search_queries(queries, topic) assert result[0] == "query 
A" assert result[1] == "query B" def test_adds_survey_benchmark_variants(self) -> None: queries = ["deep learning"] topic = "deep learning for image classification with limited data" result = rc_executor._expand_search_queries(queries, topic) has_survey = any("survey" in q.lower() for q in result) has_benchmark = any("benchmark" in q.lower() for q in result) assert has_survey assert has_benchmark # ── R4-1: Experiment Budget Guard Tests ────────────────────────────── class TestComputeBudgetBlock: """Test compute_budget prompt block injection (R4-1a).""" def test_compute_budget_block_exists_in_prompt_manager(self) -> None: from researchclaw.prompts import PromptManager pm = PromptManager() block = pm.block("compute_budget") assert "time_budget_sec" in block or "Compute Budget" in block def test_compute_budget_injected_into_code_generation( self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle ) -> None: import sys data = { "project": {"name": "rc-test", "mode": "docs-first"}, "research": { "topic": "optimizer comparison", "domains": ["ml"], "daily_paper_count": 2, "quality_threshold": 8.2, }, "runtime": {"timezone": "UTC"}, "notifications": { "channel": "local", "on_stage_start": True, "on_stage_fail": False, "on_gate_required": True, }, "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")}, "openclaw_bridge": {"use_memory": True, "use_message": True}, "llm": { "provider": "openai-compatible", "base_url": "http://localhost:1234/v1", "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key", "primary_model": "fake-model", "fallback_models": [], }, "security": {"hitl_required_stages": [5, 9, 20]}, "experiment": { "mode": "sandbox", "time_budget_sec": 60, "metric_key": "best_loss", "metric_direction": "minimize", "sandbox": { "python_path": sys.executable, "gpu_required": False, "max_memory_mb": 1024, }, }, } cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False) # Write exp_plan prior artifact _write_prior_artifact(run_dir, 
10, "exp_plan.yaml", "objectives: test") # Capture what the LLM receives llm = FakeLLMClient( "```filename:main.py\nimport numpy as np\nprint('best_loss: 0.1')\n```" ) stage_dir = run_dir / "stage-11" stage_dir.mkdir(parents=True, exist_ok=True) rc_executor._execute_code_generation( stage_dir, run_dir, cfg, adapters, llm=llm ) # The LLM should have received compute budget info in some call # (may be first call in legacy mode, or second call with CodeAgent) assert len(llm.calls) >= 1 all_user_msgs = " ".join( call[-1]["content"] for call in llm.calls if call ) assert "60" in all_user_msgs or "Compute Budget" in all_user_msgs class TestPartialTimeoutStatus: """Test partial status for timed-out experiments with data (R4-1c).""" def test_timed_out_with_metrics_sets_partial_status( self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle ) -> None: import sys data = { "project": {"name": "rc-test", "mode": "docs-first"}, "research": { "topic": "test topic", "domains": ["ml"], "daily_paper_count": 2, "quality_threshold": 8.2, }, "runtime": {"timezone": "UTC"}, "notifications": { "channel": "local", "on_stage_start": True, "on_stage_fail": False, "on_gate_required": True, }, "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")}, "openclaw_bridge": {"use_memory": True, "use_message": True}, "llm": { "provider": "openai-compatible", "base_url": "http://localhost:1234/v1", "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key", "primary_model": "fake-model", "fallback_models": [], }, "security": {"hitl_required_stages": [5, 9, 20]}, "experiment": { "mode": "sandbox", "time_budget_sec": 2, "metric_key": "best_loss", "metric_direction": "minimize", "sandbox": { "python_path": sys.executable, "gpu_required": False, "max_memory_mb": 1024, }, }, } cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False) # Write experiment code that prints some metrics then sleeps exp_dir = run_dir / "stage-11" / "experiment" exp_dir.mkdir(parents=True, 
exist_ok=True) (exp_dir / "main.py").write_text( "import time, sys\n" "print('best_loss: 0.5', flush=True)\n" "sys.stdout.flush()\n" "time.sleep(10)\n", encoding="utf-8", ) stage_dir = run_dir / "stage-12" stage_dir.mkdir(parents=True, exist_ok=True) rc_executor._execute_experiment_run( stage_dir, run_dir, cfg, adapters ) run_file = stage_dir / "runs" / "run-1.json" assert run_file.exists() payload = json.loads(run_file.read_text(encoding="utf-8")) # Should be "partial" since metrics were captured before timeout assert payload["timed_out"] is True # Status should be "partial" if metrics captured, "failed" if not if payload["metrics"]: assert payload["status"] == "partial" else: # Subprocess stdout may not flush before kill on some platforms assert payload["status"] == "failed" class TestTimeoutAwareRefine: """Test timeout-aware prompt injection in iterative refine (R4-1b).""" def _prepare_timed_out_run(self, run_dir: Path) -> None: """Create a prior run that timed out with no metrics.""" runs_dir = run_dir / "stage-12" / "runs" runs_dir.mkdir(parents=True, exist_ok=True) (runs_dir / "run-1.json").write_text( json.dumps({ "run_id": "run-1", "task_id": "sandbox-main", "status": "failed", "metrics": {}, "timed_out": True, "elapsed_sec": 120.0, }), encoding="utf-8", ) # Write experiment code exp_dir = run_dir / "stage-11" / "experiment" exp_dir.mkdir(parents=True, exist_ok=True) (exp_dir / "main.py").write_text( "print('best_loss: 0.1')\n", encoding="utf-8", ) def test_timeout_refine_injects_scale_reduction_prompt( self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle ) -> None: self._prepare_timed_out_run(run_dir) stage_dir = run_dir / "stage-13" stage_dir.mkdir(parents=True, exist_ok=True) data = { "project": {"name": "rc-test", "mode": "docs-first"}, "research": { "topic": "test topic", "domains": ["ml"], "daily_paper_count": 2, "quality_threshold": 8.2, }, "runtime": {"timezone": "UTC"}, "notifications": { "channel": "local", "on_stage_start": True, 
"on_stage_fail": False, "on_gate_required": True, }, "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")}, "openclaw_bridge": {"use_memory": True, "use_message": True}, "llm": { "provider": "openai-compatible", "base_url": "http://localhost:1234/v1", "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key", "primary_model": "fake-model", "fallback_models": [], }, "security": {"hitl_required_stages": [5, 9, 20]}, "experiment": { "mode": "sandbox", "time_budget_sec": 120, "max_iterations": 1, "metric_key": "best_loss", "metric_direction": "minimize", }, } cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False) llm = FakeLLMClient( "```python\nimport numpy as np\nprint('best_loss: 0.1')\n```" ) rc_executor._execute_iterative_refine( stage_dir, run_dir, cfg, adapters, llm=llm ) # The LLM should have received the timeout-aware prompt assert len(llm.calls) >= 1 user_msg = llm.calls[0][-1]["content"] assert "TIMED OUT" in user_msg assert "120" in user_msg # ── R4-2: Data Integrity Enforcement Tests ─────────────────────────── class TestDataIntegrityBlock: """Test paper draft blocked when no metrics exist (R4-2a).""" def test_paper_draft_blocked_with_no_metrics( self, tmp_path: Path, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle ) -> None: # Write prior artifacts with NO metrics _write_prior_artifact(run_dir, 16, "outline.md", "# Outline\n## Abstract\n") # No experiment_summary.json, no run files with metrics runs_dir = run_dir / "stage-12" / "runs" runs_dir.mkdir(parents=True, exist_ok=True) (runs_dir / "run-1.json").write_text( json.dumps({"run_id": "run-1", "status": "failed", "metrics": {}, "timed_out": True}), encoding="utf-8", ) stage_dir = run_dir / "stage-17" stage_dir.mkdir(parents=True, exist_ok=True) llm = FakeLLMClient("should not be called") result = rc_executor._execute_paper_draft( stage_dir, run_dir, rc_config, adapters, llm=llm ) assert result.status == StageStatus.FAILED draft = (stage_dir / 
"paper_draft.md").read_text(encoding="utf-8") assert "Blocked" in draft or "BLOCKED" in draft or "no metrics" in draft.lower() # LLM should NOT have been called assert len(llm.calls) == 0 def test_paper_draft_proceeds_with_metrics( self, tmp_path: Path, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle ) -> None: _write_prior_artifact(run_dir, 16, "outline.md", "# Outline\n## Abstract\n") # Write experiment data with real metrics runs_dir = run_dir / "stage-12" / "runs" runs_dir.mkdir(parents=True, exist_ok=True) (runs_dir / "run-1.json").write_text( json.dumps({ "run_id": "run-1", "status": "completed", "metrics": {"best_loss": 0.123}, "stdout": "best_loss: 0.123\n", }), encoding="utf-8", ) stage_dir = run_dir / "stage-17" stage_dir.mkdir(parents=True, exist_ok=True) llm = FakeLLMClient("# Paper Title\n## Abstract\nSome abstract text.") result = rc_executor._execute_paper_draft( stage_dir, run_dir, rc_config, adapters, llm=llm ) # Should proceed (LLM was called) assert len(llm.calls) >= 1 # The prompt should contain anti-fabrication instructions all_prompts = " ".join( msg["content"] for call in llm.calls for msg in call ) assert "Data Integrity" in all_prompts or "ONLY report numbers" in all_prompts # ── R4-3: Conference-Grade Title Guidelines Tests ──────────────────── class TestTitleGuidelines: """Test title_guidelines and abstract_structure blocks (R4-3).""" def test_title_guidelines_block_exists(self) -> None: from researchclaw.prompts import PromptManager pm = PromptManager() block = pm.block("title_guidelines") assert "novelty" in block.lower() or "TITLE RULES" in block assert "14 words" in block or "15 words" in block or "concrete" in block.lower() def test_abstract_structure_block_exists(self) -> None: from researchclaw.prompts import PromptManager pm = PromptManager() block = pm.block("abstract_structure") assert "5-sentence" in block or "problem" in block.lower() def test_title_guidelines_injected_into_paper_draft( self, tmp_path: Path, 
run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle ) -> None: _write_prior_artifact(run_dir, 16, "outline.md", "# Outline\n") runs_dir = run_dir / "stage-12" / "runs" runs_dir.mkdir(parents=True, exist_ok=True) (runs_dir / "run-1.json").write_text( json.dumps({"run_id": "run-1", "status": "completed", "metrics": {"best_loss": 0.1}, "stdout": "best_loss: 0.1\n"}), encoding="utf-8", ) stage_dir = run_dir / "stage-17" stage_dir.mkdir(parents=True, exist_ok=True) llm = FakeLLMClient("# Paper Title\n## Abstract\nText.") rc_executor._execute_paper_draft( stage_dir, run_dir, rc_config, adapters, llm=llm ) all_prompts = " ".join( msg["content"] for call in llm.calls for msg in call ) assert "Title" in all_prompts or "TITLE" in all_prompts # ── R4-4: Conference-Grade Writing Quality Tests ───────────────────── class TestConferenceWritingQuality: """Test enhanced writing prompts and writing_guide.py (R4-4).""" def test_writing_guide_format_all(self) -> None: from researchclaw.writing_guide import format_writing_tips result = format_writing_tips() assert "Conference Writing Best Practices" in result assert "Title" in result assert "Common Rejections" in result def test_writing_guide_format_subset(self) -> None: from researchclaw.writing_guide import format_writing_tips result = format_writing_tips(["title", "abstract"]) assert "Title" in result assert "Abstract" in result assert "Common Rejections" not in result def test_paper_draft_system_includes_principles(self) -> None: from researchclaw.prompts import PromptManager pm = PromptManager() sp = pm.for_stage( "paper_draft", preamble="test", topic_constraint="test", exp_metrics_instruction="test", citation_instruction="test", outline="test", ) # System prompt should mention key principles assert "NOVELTY" in sp.system or "novelty" in sp.system.lower() assert "fabricate" in sp.system.lower() or "real experimental" in sp.system.lower() # ── R5-1 & R5-2: Bug Fixes Tests ──────────────────────────────────── class 
# ── R5-1 & R5-2: Bug Fixes Tests ────────────────────────────────────
class TestRefineTimeoutAndIterationCap:
    """Test R5-1 (no 120s cap) and R5-2 (iteration cap raised to 10)."""

    def test_refine_timeout_uses_full_budget(self) -> None:
        """R5-1: Refine sandbox should NOT cap at 120s.

        Fixed: the previous version imported ``ast``, parsed the source into
        an unused ``tree``, and called ``inspect.getsource`` twice; one source
        fetch and the substring assertion are all that is needed.
        """
        import inspect

        source_text = inspect.getsource(rc_executor._execute_iterative_refine)
        # Should NOT contain min(..., 120)
        assert "min(config.experiment.time_budget_sec, 120)" not in source_text

    def test_iteration_cap_is_10(self) -> None:
        """R5-2: Max iterations should be capped at 10, not 3."""
        import inspect

        source = inspect.getsource(rc_executor._execute_iterative_refine)
        assert "min(requested_iterations, 10)" in source
        assert "min(requested_iterations, 3)" not in source

    def test_refine_respects_high_iteration_count(
        self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle
    ) -> None:
        """R5-2: Setting max_iterations=7 should actually allow 7 iterations."""
        # Write prior run artifacts the refine stage reads.
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        (runs_dir / "run-1.json").write_text(
            json.dumps({"run_id": "run-1", "status": "completed", "metrics": {"best_loss": 0.5}}),
            encoding="utf-8",
        )
        exp_dir = run_dir / "stage-11" / "experiment"
        exp_dir.mkdir(parents=True, exist_ok=True)
        (exp_dir / "main.py").write_text("print('best_loss: 0.5')\n", encoding="utf-8")
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {"topic": "test", "domains": ["ml"], "daily_paper_count": 2, "quality_threshold": 8.2},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local", "on_stage_start": True, "on_stage_fail": False, "on_gate_required": True},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {"provider": "openai-compatible", "base_url": "http://localhost:1234/v1", "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key", "primary_model": "fake-model", "fallback_models": []},
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 300,
                "max_iterations": 7,
                "metric_key": "best_loss",
                "metric_direction": "minimize",
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        # LLM always returns same code — will trigger no_improvement early stop
        llm = FakeLLMClient("```python\nprint('best_loss: 0.5')\n```")
        # Return value intentionally ignored; assertions read the log file.
        rc_executor._execute_iterative_refine(stage_dir, run_dir, cfg, adapters, llm=llm)
        log = json.loads((stage_dir / "refinement_log.json").read_text(encoding="utf-8"))
        # Should have been allowed more than 3 iterations (capped at 7)
        assert log["max_iterations_executed"] == 7
        # But may have stopped early due to no_improvement_for_2_iterations
        assert len(log["iterations"]) >= 2
# ── R5-3: NaN/Divergence Fast-Fail Tests ─────────────────────────────
class TestNaNDivergenceDetection:
    """Test NaN/Inf filtering and divergence detection (R5-3)."""

    def test_parse_metrics_filters_nan(self) -> None:
        from researchclaw.experiment.sandbox import parse_metrics

        parsed = parse_metrics("best_loss: 0.5\nbad_metric: nan\ngood_metric: 1.23\n")
        assert "best_loss" in parsed
        assert "good_metric" in parsed
        # NaN should be filtered out entirely.
        assert "bad_metric" not in parsed

    def test_parse_metrics_filters_inf(self) -> None:
        from researchclaw.experiment.sandbox import parse_metrics

        parsed = parse_metrics("metric_a: inf\nmetric_b: -inf\nmetric_c: 0.42\n")
        assert "metric_c" in parsed
        # Both +inf and -inf are rejected.
        assert "metric_a" not in parsed
        assert "metric_b" not in parsed

    def test_detect_nan_divergence_finds_nan(self) -> None:
        from researchclaw.experiment.sandbox import detect_nan_divergence

        verdict = detect_nan_divergence("loss: nan\nstep 5 done", "")
        assert verdict is not None
        assert "NaN" in verdict or "nan" in verdict.lower()

    def test_detect_nan_divergence_finds_diverging_loss(self) -> None:
        from researchclaw.experiment.sandbox import detect_nan_divergence

        verdict = detect_nan_divergence("best_loss: 999.5\n", "")
        assert verdict is not None
        assert "loss" in verdict.lower() or "999" in verdict

    def test_detect_nan_divergence_returns_none_for_clean(self) -> None:
        from researchclaw.experiment.sandbox import detect_nan_divergence

        # Healthy metrics produce no divergence verdict at all.
        assert detect_nan_divergence("best_loss: 0.123\naccuracy: 0.95\n", "") is None

    def test_runtime_issues_detects_diverging_loss(self) -> None:
        from types import SimpleNamespace

        fake_result = SimpleNamespace(
            metrics={"best_loss": 500.0},
            stdout="best_loss: 500.0\n",
            stderr="",
        )
        issues = rc_executor._detect_runtime_issues(fake_result)
        assert "DIVERGING" in issues or "diverging" in issues.lower()

    def test_compute_budget_includes_nan_guard(self) -> None:
        from researchclaw.prompts import PromptManager

        text = PromptManager().block("compute_budget")
        assert "NaN" in text or "nan" in text.lower() or "divergence" in text.lower()
# ── R5-4: Experiment Harness Template Tests ──────────────────────────
class TestExperimentHarness:
    """Test the immutable experiment harness (R5-4)."""

    def test_harness_should_stop(self) -> None:
        import time

        from researchclaw.experiment.harness_template import ExperimentHarness

        harness = ExperimentHarness(time_budget=1)
        # Freshly created: not yet at 80% of the budget.
        assert not harness.should_stop()
        time.sleep(0.9)
        # Now past 80% of the 1-second budget.
        assert harness.should_stop()

    def test_harness_report_metric(self, capsys: pytest.CaptureFixture[str]) -> None:
        from researchclaw.experiment.harness_template import ExperimentHarness

        harness = ExperimentHarness(time_budget=60)
        harness.report_metric("best_loss", 0.123)
        captured = capsys.readouterr()
        assert "best_loss: 0.123" in captured.out
        assert harness._metrics["best_loss"] == 0.123

    def test_harness_rejects_nan(self, capsys: pytest.CaptureFixture[str]) -> None:
        from researchclaw.experiment.harness_template import ExperimentHarness

        harness = ExperimentHarness(time_budget=60)
        harness.report_metric("bad", float("nan"))
        captured = capsys.readouterr()
        assert "bad" not in harness._metrics
        assert "non-finite" in captured.err.lower() or "WARNING" in captured.err

    def test_harness_rejects_inf(self, capsys: pytest.CaptureFixture[str]) -> None:
        from researchclaw.experiment.harness_template import ExperimentHarness

        harness = ExperimentHarness(time_budget=60)
        harness.report_metric("bad", float("inf"))
        assert "bad" not in harness._metrics

    def test_harness_finalize(self, tmp_path: Path) -> None:
        import os

        from researchclaw.experiment.harness_template import ExperimentHarness

        prev_cwd = os.getcwd()
        os.chdir(tmp_path)
        try:
            harness = ExperimentHarness(time_budget=60)
            harness.report_metric("accuracy", 0.95)
            harness.report_metric("loss", 0.05)
            harness.log_result({"condition": "A", "value": 1.0})
            harness.finalize()
            results = json.loads((tmp_path / "results.json").read_text(encoding="utf-8"))
            assert results["metrics"]["accuracy"] == 0.95
            assert results["metrics"]["loss"] == 0.05
            assert len(results["results"]) == 1
        finally:
            # Always restore the working directory for the rest of the suite.
            os.chdir(prev_cwd)

    def test_harness_progress(self) -> None:
        from researchclaw.experiment.harness_template import ExperimentHarness

        harness = ExperimentHarness(time_budget=1000)
        # Just started: essentially no progress, and always within [0, 1].
        assert harness.progress < 0.01
        assert 0.0 <= harness.progress <= 1.0

    def test_harness_injected_into_sandbox(self, tmp_path: Path) -> None:
        import sys

        from researchclaw.config import SandboxConfig
        from researchclaw.experiment.sandbox import ExperimentSandbox

        sandbox = ExperimentSandbox(
            SandboxConfig(python_path=sys.executable), tmp_path / "sandbox"
        )
        # Create a project dir
        project = tmp_path / "project"
        project.mkdir()
        (project / "main.py").write_text("print('test: 1.0')\n", encoding="utf-8")
        sandbox.run_project(project, timeout_sec=5)
        # Check that harness was injected (BUG-DA8-06: dir is now _project_{N})
        project_dirs = list((tmp_path / "sandbox").glob("_project_*"))
        assert project_dirs, "No _project_N directory found"
        harness_path = project_dirs[0] / "experiment_harness.py"
        assert harness_path.exists()
        assert "ExperimentHarness" in harness_path.read_text(encoding="utf-8")

    def test_harness_not_overwritten_by_project(self, tmp_path: Path) -> None:
        import sys

        from researchclaw.config import SandboxConfig
        from researchclaw.experiment.sandbox import ExperimentSandbox

        sandbox = ExperimentSandbox(
            SandboxConfig(python_path=sys.executable), tmp_path / "sandbox"
        )
        # Create a project with a fake experiment_harness.py
        project = tmp_path / "project"
        project.mkdir()
        (project / "main.py").write_text("print('test: 1.0')\n", encoding="utf-8")
        (project / "experiment_harness.py").write_text("# FAKE HARNESS", encoding="utf-8")
        sandbox.run_project(project, timeout_sec=5)
        # The real harness should be there, not the fake one (BUG-DA8-06)
        project_dirs = list((tmp_path / "sandbox").glob("_project_*"))
        assert project_dirs
        content = (project_dirs[0] / "experiment_harness.py").read_text(encoding="utf-8")
        assert "ExperimentHarness" in content
        assert "FAKE HARNESS" not in content

    def test_prompt_mentions_harness(self) -> None:
        from researchclaw.prompts import PromptManager

        text = PromptManager().block("compute_budget")
        assert "experiment_harness" in text or "ExperimentHarness" in text
# ── R5-5: Stdout Truncation Tests ────────────────────────────────────
class TestStdoutTruncation:
    """Test stdout/stderr truncation in refine run summaries (R5-5)."""

    def test_long_stdout_truncated_in_refine(
        self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle
    ) -> None:
        # Create a run whose stdout is 200 lines long.
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        long_stdout = "\n".join(
            f"step {i}: loss={0.5 - i * 0.001:.6f}" for i in range(200)
        )
        run_payload = {
            "run_id": "run-1",
            "status": "completed",
            "metrics": {"best_loss": 0.3},
            "stdout": long_stdout,
        }
        (runs_dir / "run-1.json").write_text(json.dumps(run_payload), encoding="utf-8")
        exp_dir = run_dir / "stage-11" / "experiment"
        exp_dir.mkdir(parents=True, exist_ok=True)
        (exp_dir / "main.py").write_text("print('best_loss: 0.3')\n", encoding="utf-8")
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {"topic": "test", "domains": ["ml"], "daily_paper_count": 2, "quality_threshold": 8.2},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local", "on_stage_start": True, "on_stage_fail": False, "on_gate_required": True},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {"provider": "openai-compatible", "base_url": "http://localhost:1234/v1", "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key", "primary_model": "fake-model", "fallback_models": []},
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 30,
                "max_iterations": 1,
                "metric_key": "best_loss",
                "metric_direction": "minimize",
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        llm = FakeLLMClient("```python\nprint('best_loss: 0.3')\n```")
        rc_executor._execute_iterative_refine(stage_dir, run_dir, cfg, adapters, llm=llm)
        # The LLM should have received truncated stdout, not all 200 lines.
        assert len(llm.calls) >= 1
        user_msg = llm.calls[0][-1]["content"]
        # Either an explicit truncation marker, or simply fewer characters.
        assert "truncated" in user_msg or len(user_msg) < len(long_stdout)
# ===================================================================
# R6 Tests — Post-E2E Failure Analysis Fixes
# ===================================================================
class TestNoImproveStreakFix:
    """R6-1: no_improve_streak should only count iterations with real metrics."""

    def test_empty_metrics_dont_increment_streak(
        self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle
    ) -> None:
        """When metrics are empty (None), the streak should NOT increment."""
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        run_payload = {
            "run_id": "run-1",
            "status": "failed",
            "metrics": {},
            "stdout": "FAIL: NaN/divergence detected",
        }
        (runs_dir / "run-1.json").write_text(json.dumps(run_payload), encoding="utf-8")
        exp_dir = run_dir / "stage-11" / "experiment"
        exp_dir.mkdir(parents=True, exist_ok=True)
        (exp_dir / "main.py").write_text("print('hello')\n", encoding="utf-8")
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {"topic": "test", "domains": ["ml"], "daily_paper_count": 2, "quality_threshold": 8.2},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local", "on_stage_start": True, "on_stage_fail": False, "on_gate_required": True},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {"provider": "openai-compatible", "base_url": "http://localhost:1234/v1", "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key", "primary_model": "fake-model", "fallback_models": []},
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 30,
                "max_iterations": 4,
                "metric_key": "primary_metric",
                "metric_direction": "minimize",
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        # LLM returns code that won't produce metrics in simulated mode
        llm = FakeLLMClient("```python\nprint('no metrics here')\n```")
        result = rc_executor._execute_iterative_refine(
            stage_dir, run_dir, cfg, adapters, llm=llm
        )
        # Should abort after 3 consecutive no-metrics iterations
        log_path = stage_dir / "refinement_log.json"
        log_data = json.loads(log_path.read_text())
        # consecutive_no_metrics triggers early abort after 3 iterations
        assert len(log_data["iterations"]) == 3
        assert log_data.get("stop_reason") == "consecutive_no_metrics"
class TestStdoutFailureDetection:
    """R6-2: Detect stdout failure signals even when exit code is 0."""

    def test_fail_signal_in_stdout_marks_failed(self, tmp_path: Path) -> None:
        """Exit code 0 + 'FAIL:' in stdout + no metrics → status='failed'."""
        from researchclaw.pipeline.executor import _execute_experiment_run

        # Build the minimal run-directory layout the stage expects.
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        (run_dir / "stage-10").mkdir()
        exp_dir = run_dir / "stage-10" / "experiment"
        exp_dir.mkdir()
        # Simple code that prints FAIL but exits 0
        (exp_dir / "main.py").write_text(
            "print('FAIL: NaN/divergence detected')\n", encoding="utf-8"
        )
        (run_dir / "stage-11").mkdir()
        (run_dir / "stage-11" / "schedule.json").write_text("{}", encoding="utf-8")
        stage_dir = run_dir / "stage-12"
        stage_dir.mkdir()
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {"topic": "test", "domains": ["ml"], "daily_paper_count": 2, "quality_threshold": 8.2},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local", "on_stage_start": True, "on_stage_fail": False, "on_gate_required": True},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {"provider": "openai-compatible", "base_url": "http://localhost:1234/v1", "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key", "primary_model": "fake-model", "fallback_models": []},
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 30,
                "max_iterations": 1,
                "metric_key": "primary_metric",
                "metric_direction": "minimize",
                "sandbox": {
                    "python_path": sys.executable,
                    "gpu_required": False,
                    "max_memory_mb": 512,
                    "allowed_imports": ["json"],
                },
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        result = _execute_experiment_run(stage_dir, run_dir, cfg, AdapterBundle())
        # Check the run payload
        run_file = stage_dir / "runs" / "run-1.json"
        assert run_file.exists()
        payload = json.loads(run_file.read_text())
        assert payload["status"] == "failed"

    def test_clean_exit_no_fail_signal_marks_completed(self, tmp_path: Path) -> None:
        """Exit code 0 + valid metrics + no FAIL signal → status='completed'."""
        from researchclaw.pipeline.executor import _execute_experiment_run

        run_dir = tmp_path / "run"
        run_dir.mkdir()
        (run_dir / "stage-10").mkdir()
        exp_dir = run_dir / "stage-10" / "experiment"
        exp_dir.mkdir()
        (exp_dir / "main.py").write_text(
            "print('primary_metric: 0.95')\n", encoding="utf-8"
        )
        (run_dir / "stage-11").mkdir()
        (run_dir / "stage-11" / "schedule.json").write_text("{}", encoding="utf-8")
        stage_dir = run_dir / "stage-12"
        stage_dir.mkdir()
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {"topic": "test", "domains": ["ml"], "daily_paper_count": 2, "quality_threshold": 8.2},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local", "on_stage_start": True, "on_stage_fail": False, "on_gate_required": True},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {"provider": "openai-compatible", "base_url": "http://localhost:1234/v1", "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key", "primary_model": "fake-model", "fallback_models": []},
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 30,
                "max_iterations": 1,
                "metric_key": "primary_metric",
                "metric_direction": "minimize",
                "sandbox": {
                    "python_path": sys.executable,
                    "gpu_required": False,
                    "max_memory_mb": 512,
                    "allowed_imports": ["json"],
                },
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        result = _execute_experiment_run(stage_dir, run_dir, cfg, AdapterBundle())
        payload = json.loads((stage_dir / "runs" / "run-1.json").read_text())
        assert payload["status"] == "completed"
class TestMetricValUndefined:
    """R6-3: metric_val should be initialized to None before conditional block."""

    def test_metric_val_initialized_before_use(self) -> None:
        """Verify the code pattern: metric_val = None before if block."""
        import inspect

        source = inspect.getsource(rc_executor._execute_iterative_refine)
        # The initialization must appear textually before the sandbox branch.
        init_pos = source.find("metric_val = None")
        sandbox_pos = source.find("if validation.ok and config.experiment.mode")
        assert init_pos != -1, "metric_val = None not found"
        assert sandbox_pos != -1, "sandbox block not found"
        assert init_pos < sandbox_pos, "metric_val = None should come before sandbox block"


class TestConsecutiveEmptyMetrics:
    """R6-4: Pipeline should detect consecutive empty-metrics REFINE cycles."""

    def test_detects_consecutive_empty(self, tmp_path: Path) -> None:
        """Two cycles with empty metrics should return True."""
        from researchclaw.pipeline.runner import _consecutive_empty_metrics

        run_dir = tmp_path / "run"
        empty_summary = json.dumps({
            "metrics_summary": {},
            "best_run": {"metrics": {}},
        })
        # Current cycle (stage-14)
        s14 = run_dir / "stage-14"
        s14.mkdir(parents=True)
        (s14 / "experiment_summary.json").write_text(empty_summary)
        # Previous cycle (stage-14_v1)
        s14v1 = run_dir / "stage-14_v1"
        s14v1.mkdir(parents=True)
        (s14v1 / "experiment_summary.json").write_text(empty_summary)
        assert _consecutive_empty_metrics(run_dir, pivot_count=1) is True

    def test_not_empty_when_metrics_exist(self, tmp_path: Path) -> None:
        """If any cycle has real metrics, return False."""
        from researchclaw.pipeline.runner import _consecutive_empty_metrics

        run_dir = tmp_path / "run"
        s14 = run_dir / "stage-14"
        s14.mkdir(parents=True)
        (s14 / "experiment_summary.json").write_text(json.dumps({
            "metrics_summary": {},
            "best_run": {"metrics": {"loss": 0.5}},
        }))
        s14v1 = run_dir / "stage-14_v1"
        s14v1.mkdir(parents=True)
        (s14v1 / "experiment_summary.json").write_text(json.dumps({
            "metrics_summary": {},
            "best_run": {"metrics": {}},
        }))
        assert _consecutive_empty_metrics(run_dir, pivot_count=1) is False

    def test_false_when_no_previous_cycle(self, tmp_path: Path) -> None:
        """First cycle (no v1) should return False."""
        from researchclaw.pipeline.runner import _consecutive_empty_metrics

        run_dir = tmp_path / "run"
        s14 = run_dir / "stage-14"
        s14.mkdir(parents=True)
        (s14 / "experiment_summary.json").write_text(json.dumps({
            "metrics_summary": {},
            "best_run": {"metrics": {}},
        }))
        # No stage-14_v1 exists
        assert _consecutive_empty_metrics(run_dir, pivot_count=1) is False
# ===================================================================
# R7 Tests — Experiment-Paper Quality Alignment
# ===================================================================
class TestMultiConditionEnforcement:
    """R7-1: Code generation prompt must enforce multi-condition experiments."""

    def test_code_generation_prompt_has_multi_condition_block(self) -> None:
        """The code_generation prompt should contain multi-condition instructions."""
        from researchclaw.prompts import PromptManager

        stage_prompt = PromptManager().for_stage(
            "code_generation",
            topic="test topic",
            metric="primary_metric",
            pkg_hint="",
            exp_plan="conditions:\n - echo_chamber\n - bridge_building\n - random",
        )
        for marker in ("MULTI-CONDITION REQUIREMENT", "condition=", "SUMMARY"):
            assert marker in stage_prompt.user

    def test_multi_condition_labels_required(self) -> None:
        """Prompt must mention per-condition labeled output format."""
        from researchclaw.prompts import PromptManager

        stage_prompt = PromptManager().for_stage(
            "code_generation",
            topic="test",
            metric="loss",
            pkg_hint="",
            exp_plan="treatments: [A, B, C]",
        )
        assert "condition=" in stage_prompt.user


class TestEvidenceBoundedWriting:
    """R7-2: Paper draft prompt must enforce evidence-bounded claims."""

    def test_paper_draft_has_evidence_bounding_rules(self) -> None:
        """System prompt should contain evidence-bounding rules."""
        from researchclaw.prompts import PromptManager

        stage_prompt = PromptManager().for_stage(
            "paper_draft",
            preamble="test preamble",
            topic_constraint="",
            exp_metrics_instruction="",
            citation_instruction="",
            outline="# Outline",
        )
        system_lower = stage_prompt.system.lower()
        assert "EVIDENCE-BOUNDING RULES" in stage_prompt.system
        assert "title" in system_lower
        assert "causal claim" in system_lower or "causal claims" in system_lower

    def test_hedging_language_guidance(self) -> None:
        """Should suggest hedged alternatives like 'Toward...' for partial data."""
        from researchclaw.prompts import PromptManager

        stage_prompt = PromptManager().for_stage(
            "paper_draft",
            preamble="",
            topic_constraint="",
            exp_metrics_instruction="",
            citation_instruction="",
            outline="",
        )
        assert "Toward" in stage_prompt.system or "Investigating" in stage_prompt.system
class TestConditionCoverageDetection:
    """R7-3: REFINE should detect condition coverage gaps."""

    def test_coverage_hint_injected_when_no_labels(
        self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle
    ) -> None:
        """If stdout has no 'condition=' labels, a coverage hint should be injected."""
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        run_payload = {
            "run_id": "run-1",
            "status": "completed",
            "metrics": {"primary_metric": 0.5},
            "stdout": "primary_metric: 0.5\nprimary_metric: 0.3\n",
        }
        (runs_dir / "run-1.json").write_text(json.dumps(run_payload), encoding="utf-8")
        exp_plan_dir = run_dir / "stage-09"
        exp_plan_dir.mkdir(parents=True, exist_ok=True)
        (exp_plan_dir / "exp_plan.yaml").write_text(
            "conditions:\n - echo_chamber\n - bridge_building\n - random\n",
            encoding="utf-8",
        )
        exp_dir = run_dir / "stage-11" / "experiment"
        exp_dir.mkdir(parents=True, exist_ok=True)
        (exp_dir / "main.py").write_text("print('primary_metric: 0.5')\n", encoding="utf-8")
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {"topic": "test", "domains": ["ml"], "daily_paper_count": 2, "quality_threshold": 8.2},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local", "on_stage_start": True, "on_stage_fail": False, "on_gate_required": True},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {"provider": "openai-compatible", "base_url": "http://localhost:1234/v1", "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key", "primary_model": "fake-model", "fallback_models": []},
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 30,
                "max_iterations": 1,
                "metric_key": "primary_metric",
                "metric_direction": "minimize",
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        llm = FakeLLMClient("```python\nprint('primary_metric: 0.3')\n```")
        rc_executor._execute_iterative_refine(stage_dir, run_dir, cfg, adapters, llm=llm)
        assert len(llm.calls) >= 1
        user_msg = llm.calls[0][-1]["content"]
        assert "CONDITION COVERAGE GAP" in user_msg

    def test_no_hint_when_labels_present(
        self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle
    ) -> None:
        """If stdout already has 'condition=' labels, no hint should be injected."""
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        run_payload = {
            "run_id": "run-1",
            "status": "completed",
            "metrics": {"primary_metric": 0.5},
            "stdout": "condition=echo primary_metric: 0.5\ncondition=bridge primary_metric: 0.3\n",
        }
        (runs_dir / "run-1.json").write_text(json.dumps(run_payload), encoding="utf-8")
        exp_plan_dir = run_dir / "stage-09"
        exp_plan_dir.mkdir(parents=True, exist_ok=True)
        (exp_plan_dir / "exp_plan.yaml").write_text(
            "conditions:\n - echo\n - bridge\n",
            encoding="utf-8",
        )
        exp_dir = run_dir / "stage-11" / "experiment"
        exp_dir.mkdir(parents=True, exist_ok=True)
        (exp_dir / "main.py").write_text("print('primary_metric: 0.5')\n", encoding="utf-8")
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {"topic": "test", "domains": ["ml"], "daily_paper_count": 2, "quality_threshold": 8.2},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local", "on_stage_start": True, "on_stage_fail": False, "on_gate_required": True},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {"provider": "openai-compatible", "base_url": "http://localhost:1234/v1", "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key", "primary_model": "fake-model", "fallback_models": []},
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 30,
                "max_iterations": 1,
                "metric_key": "primary_metric",
                "metric_direction": "minimize",
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        llm = FakeLLMClient("```python\nprint('primary_metric: 0.3')\n```")
        rc_executor._execute_iterative_refine(stage_dir, run_dir, cfg, adapters, llm=llm)
        assert len(llm.calls) >= 1
        user_msg = llm.calls[0][-1]["content"]
        assert "CONDITION COVERAGE GAP" not in user_msg
# ===================================================================
# R8 Tests — AutoBench Round 1 Fixes
# ===================================================================
class TestBreadthFirstPrompt:
    """R8-1: Code generation prompt should require breadth-first condition ordering."""

    def test_breadth_first_in_code_generation(self) -> None:
        from researchclaw.prompts import PromptManager

        stage_prompt = PromptManager().for_stage(
            "code_generation",
            topic="test",
            metric="primary_metric",
            pkg_hint="",
            exp_plan="conditions: [A, B, C]",
        )
        assert "BREADTH-FIRST" in stage_prompt.user
        assert "ONE representative" in stage_prompt.user


class TestRefineFilePreservation:
    """R8-2: Refine should preserve supporting files when LLM only returns main.py."""

    def test_supporting_files_preserved_in_refine(
        self, tmp_path: Path, run_dir: Path, adapters: AdapterBundle
    ) -> None:
        """When LLM returns only main.py, other project files should be preserved."""
        runs_dir = run_dir / "stage-12" / "runs"
        runs_dir.mkdir(parents=True, exist_ok=True)
        run_payload = {
            "run_id": "run-1",
            "status": "completed",
            "metrics": {"primary_metric": 0.5},
            "stdout": "primary_metric: 0.5",
        }
        (runs_dir / "run-1.json").write_text(json.dumps(run_payload), encoding="utf-8")
        # Multi-file experiment project
        exp_dir = run_dir / "stage-11" / "experiment"
        exp_dir.mkdir(parents=True, exist_ok=True)
        (exp_dir / "main.py").write_text("from helpers import foo\nprint('primary_metric: 0.5')\n")
        (exp_dir / "helpers.py").write_text("def foo(): return 42\n")
        (exp_dir / "utils.py").write_text("def bar(): return 99\n")
        stage_dir = run_dir / "stage-13"
        stage_dir.mkdir(parents=True, exist_ok=True)
        data = {
            "project": {"name": "rc-test", "mode": "docs-first"},
            "research": {"topic": "test", "domains": ["ml"], "daily_paper_count": 2, "quality_threshold": 8.2},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local", "on_stage_start": True, "on_stage_fail": False, "on_gate_required": True},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {"use_memory": True, "use_message": True},
            "llm": {"provider": "openai-compatible", "base_url": "http://localhost:1234/v1", "api_key_env": "RC_TEST_KEY", "api_key": "inline-test-key", "primary_model": "fake-model", "fallback_models": []},
            "security": {"hitl_required_stages": [5, 9, 20]},
            "experiment": {
                "mode": "sandbox",
                "time_budget_sec": 30,
                "max_iterations": 1,
                "metric_key": "primary_metric",
                "metric_direction": "minimize",
            },
        }
        cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        # LLM returns only main.py in multi-file format
        llm = FakeLLMClient("```filename:main.py\nfrom helpers import foo\nprint('primary_metric: 0.3')\n```")
        rc_executor._execute_iterative_refine(stage_dir, run_dir, cfg, adapters, llm=llm)
        # Check that experiment_v1 has ALL files, not just main.py
        v1_dir = stage_dir / "experiment_v1"
        assert v1_dir.exists()
        v1_files = {f.name for f in v1_dir.glob("*.py")}
        assert "main.py" in v1_files
        assert "helpers.py" in v1_files, "Supporting file helpers.py should be preserved"
        assert "utils.py" in v1_files, "Supporting file utils.py should be preserved"
# ===================================================================
# R9 Tests — AutoBench Round 2 Fixes
# ===================================================================
class TestCodeGenTopicNeutral:
    """R9-1: Code generation prompt should be topic-neutral, not optimization-biased."""

    def test_no_gradient_descent_bias(self) -> None:
        from researchclaw.prompts import PromptManager

        stage_prompt = PromptManager().for_stage(
            "code_generation",
            topic="multi-agent simulation",
            metric="primary_metric",
            pkg_hint="",
            exp_plan="conditions: [L1, L2, L3, L4]",
        )
        # Should NOT contain optimization-specific examples as recommended approaches
        for banned in ("Adam", "SGD", "Rosenbrock"):
            assert banned not in stage_prompt.user
        # "gradient descent" may appear as anti-pattern warning but not as example
        assert "e.g., gradient descent" not in stage_prompt.user

    def test_topic_relevant_guidance(self) -> None:
        from researchclaw.prompts import PromptManager

        stage_prompt = PromptManager().for_stage(
            "code_generation",
            topic="multi-agent simulation",
            metric="primary_metric",
            pkg_hint="",
            exp_plan="conditions: [L1, L2, L3, L4]",
        )
        # Should contain generic guidance that works for any topic
        user_lower = stage_prompt.user.lower()
        assert "simulation" in user_lower or "appropriate" in user_lower
        assert "ACTUAL experiment" in stage_prompt.user or "relevant to the TOPIC" in stage_prompt.user


class TestRefineTopicAlignment:
    """R9-2: Refine prompt should include topic-code alignment check."""

    def test_topic_alignment_in_refine_prompt(self) -> None:
        from researchclaw.prompts import PromptManager

        sub = PromptManager().sub_prompt(
            "iterative_improve",
            metric_key="primary_metric",
            metric_direction="maximize",
            files_context="# main.py\nprint('hello')",
            run_summaries="{}",
            condition_coverage_hint="",
            topic="multi-agent diversity scaling",
            exp_plan_anchor="",
        )
        assert "EXPERIMENT PLAN ANCHOR" in sub.user
        assert "multi-agent diversity scaling" in sub.user
        assert "NEVER rename" in sub.user
sp.user assert "multi-agent diversity scaling" in sp.user assert "NEVER rename" in sp.user # ===================================================================== # _validate_draft_quality tests # ===================================================================== def _make_prose(word_count: int) -> str: # noqa: E302 """Generate flowing prose text of approximately *word_count* words.""" sentence = ( "This is a flowing academic prose sentence " "that demonstrates our research findings. " ) words_per = len(sentence.split()) return sentence * (word_count // words_per + 1) def _make_bullets(word_count: int) -> str: """Generate bullet-point text of approximately *word_count* words.""" line = "- This is a bullet point about a research finding\n" words_per = len(line.split()) return line * (word_count // words_per + 1) def _make_comparative_prose(word_count: int) -> str: """Generate related-work style prose with comparative language.""" sentence = ( "Unlike prior work that focuses on simple baselines, " "our approach differs by incorporating novel techniques. " "In contrast to existing methods, we address key limitations. " "However, while previous approaches rely on heuristics, " "our method provides theoretical guarantees. " ) words_per = len(sentence.split()) return sentence * (word_count // words_per + 1) def _make_results_prose(word_count: int) -> str: """Generate results prose with statistical measures.""" sentence = ( "Our method achieves 85.3 ± 1.2 accuracy averaged over 5 seeds. " "The baseline comparison yields a p-value of 0.003, confirming " "statistical significance with 95% confidence interval. 
" ) words_per = len(sentence.split()) return sentence * (word_count // words_per + 1) def _build_draft(**section_overrides: str) -> str: """Build a paper draft with default prose sections.""" defaults = { "Abstract": _make_prose(200), "Introduction": _make_prose(900), "Related Work": _make_comparative_prose(700), "Method": _make_prose(1200), "Experiments": _make_prose(1000), "Results": _make_results_prose(700), "Discussion": _make_prose(500), "Limitations": _make_prose(250), "Conclusion": _make_prose(250), } defaults.update(section_overrides) parts = ["# My Research Title\n"] for heading, body in defaults.items(): parts.append(f"# {heading}\n{body}\n") return "\n".join(parts) class TestValidateDraftQuality: """Tests for _validate_draft_quality().""" def test_short_section_triggers_warning(self) -> None: """Short Method section triggers expand warning.""" draft = _build_draft(Method=_make_prose(200)) result = rc_executor._validate_draft_quality(draft) assert any("Method" in w for w in result["overall_warnings"]) assert any("EXPAND" in d or "Expand" in d for d in result["revision_directives"]) def test_bullet_density_triggers_warning(self) -> None: """Bullet-heavy Method section triggers rewrite warning.""" draft = _build_draft(Method=_make_bullets(1200)) result = rc_executor._validate_draft_quality(draft) assert any( "bullet" in w.lower() or "density" in w.lower() for w in result["overall_warnings"] ) assert any("REWRITE" in d for d in result["revision_directives"]) def test_clean_draft_no_warnings(self) -> None: """Balanced prose draft produces zero warnings.""" draft = _build_draft() result = rc_executor._validate_draft_quality(draft) assert len(result["overall_warnings"]) == 0 assert len(result["revision_directives"]) == 0 def test_balance_warning(self) -> None: """Large imbalance between sections triggers balance warning.""" draft = _build_draft( Introduction=_make_prose(1500), Results=_make_prose(100), ) result = rc_executor._validate_draft_quality(draft) bal = 
[w for w in result["overall_warnings"] if "imbalance" in w.lower()] assert len(bal) >= 1, ( f"Expected balance warning, got: {result['overall_warnings']}" ) def test_writes_json_to_stage_dir(self, tmp_path: Path) -> None: """Quality report is written as draft_quality.json.""" draft = _build_draft(Method=_make_prose(200)) rc_executor._validate_draft_quality(draft, stage_dir=tmp_path) assert (tmp_path / "draft_quality.json").exists() data = json.loads( (tmp_path / "draft_quality.json").read_text() ) assert "section_analysis" in data assert "overall_warnings" in data assert "revision_directives" in data ================================================ FILE: tests/test_rc_hardware.py ================================================ """Tests for researchclaw.hardware — GPU detection & metric filtering.""" from __future__ import annotations import subprocess from unittest.mock import MagicMock, patch import pytest from researchclaw.hardware import ( HardwareProfile, _detect_mps, _detect_nvidia, detect_hardware, ensure_torch_available, is_metric_name, ) # --------------------------------------------------------------------------- # HardwareProfile # --------------------------------------------------------------------------- class TestHardwareProfile: def test_to_dict(self): hp = HardwareProfile( has_gpu=True, gpu_type="cuda", gpu_name="RTX 4090", vram_mb=24564, tier="high", warning="", ) d = hp.to_dict() assert d["has_gpu"] is True assert d["gpu_type"] == "cuda" assert d["vram_mb"] == 24564 def test_cpu_only_profile(self): hp = HardwareProfile( has_gpu=False, gpu_type="cpu", gpu_name="CPU only", vram_mb=None, tier="cpu_only", warning="No GPU", ) assert hp.tier == "cpu_only" assert hp.warning == "No GPU" # --------------------------------------------------------------------------- # NVIDIA detection # --------------------------------------------------------------------------- class TestDetectNvidia: def test_high_vram_nvidia(self): mock_result = MagicMock() 
mock_result.returncode = 0 mock_result.stdout = "NVIDIA GeForce RTX 4090, 24564\n" with patch("researchclaw.hardware.subprocess.run", return_value=mock_result): profile = _detect_nvidia() assert profile is not None assert profile.has_gpu is True assert profile.gpu_type == "cuda" assert profile.gpu_name == "NVIDIA GeForce RTX 4090" assert profile.vram_mb == 24564 assert profile.tier == "high" assert profile.warning == "" def test_low_vram_nvidia(self): mock_result = MagicMock() mock_result.returncode = 0 mock_result.stdout = "NVIDIA GeForce GTX 1650, 4096\n" with patch("researchclaw.hardware.subprocess.run", return_value=mock_result): profile = _detect_nvidia() assert profile is not None assert profile.tier == "limited" assert "limited memory" in profile.warning def test_nvidia_smi_not_found(self): with patch( "researchclaw.hardware.subprocess.run", side_effect=FileNotFoundError, ): assert _detect_nvidia() is None def test_nvidia_smi_failure(self): mock_result = MagicMock() mock_result.returncode = 1 with patch("researchclaw.hardware.subprocess.run", return_value=mock_result): assert _detect_nvidia() is None def test_nvidia_smi_timeout(self): with patch( "researchclaw.hardware.subprocess.run", side_effect=subprocess.TimeoutExpired("nvidia-smi", 10), ): assert _detect_nvidia() is None # --------------------------------------------------------------------------- # MPS detection # --------------------------------------------------------------------------- class TestDetectMPS: def test_apple_silicon(self): mock_result = MagicMock() mock_result.returncode = 0 mock_result.stdout = "Apple M3 Pro\n" with ( patch("researchclaw.hardware.platform.system", return_value="Darwin"), patch("researchclaw.hardware.platform.machine", return_value="arm64"), patch("researchclaw.hardware.subprocess.run", return_value=mock_result), ): profile = _detect_mps() assert profile is not None assert profile.has_gpu is True assert profile.gpu_type == "mps" assert profile.gpu_name == "Apple M3 Pro" 
assert profile.tier == "limited" assert "MPS" in profile.warning def test_non_darwin(self): with patch("researchclaw.hardware.platform.system", return_value="Linux"): assert _detect_mps() is None def test_intel_mac(self): with ( patch("researchclaw.hardware.platform.system", return_value="Darwin"), patch("researchclaw.hardware.platform.machine", return_value="x86_64"), ): assert _detect_mps() is None # --------------------------------------------------------------------------- # detect_hardware (integration) # --------------------------------------------------------------------------- class TestDetectHardware: def test_falls_back_to_cpu(self): with ( patch("researchclaw.hardware._detect_nvidia", return_value=None), patch("researchclaw.hardware._detect_mps", return_value=None), ): profile = detect_hardware() assert profile.has_gpu is False assert profile.gpu_type == "cpu" assert profile.tier == "cpu_only" assert "No GPU" in profile.warning def test_nvidia_takes_priority(self): nvidia_profile = HardwareProfile( has_gpu=True, gpu_type="cuda", gpu_name="RTX 4090", vram_mb=24564, tier="high", warning="", ) mps_profile = HardwareProfile( has_gpu=True, gpu_type="mps", gpu_name="M3", vram_mb=None, tier="limited", warning="MPS", ) with ( patch("researchclaw.hardware._detect_nvidia", return_value=nvidia_profile), patch("researchclaw.hardware._detect_mps", return_value=mps_profile), ): profile = detect_hardware() assert profile.gpu_type == "cuda" # --------------------------------------------------------------------------- # ensure_torch_available # --------------------------------------------------------------------------- class TestEnsureTorchAvailable: def test_already_installed(self): mock_result = MagicMock() mock_result.returncode = 0 mock_result.stdout = "2.3.0\n" with patch("researchclaw.hardware.subprocess.run", return_value=mock_result): assert ensure_torch_available("/usr/bin/python3", "cuda") is True def test_cpu_only_skips_install(self): mock_check = MagicMock() 
mock_check.returncode = 1 # not installed mock_check.stdout = "" with patch("researchclaw.hardware.subprocess.run", return_value=mock_check): assert ensure_torch_available("/usr/bin/python3", "cpu") is False def test_install_succeeds(self): call_count = {"n": 0} def side_effect(*args, **kwargs): call_count["n"] += 1 mock = MagicMock() if call_count["n"] == 1: mock.returncode = 1 # import check fails mock.stdout = "" else: mock.returncode = 0 # pip install succeeds mock.stdout = "" return mock with patch("researchclaw.hardware.subprocess.run", side_effect=side_effect): assert ensure_torch_available("/usr/bin/python3", "cuda") is True def test_install_fails(self): mock = MagicMock() mock.returncode = 1 mock.stdout = "" mock.stderr = "ERROR: Could not install" with patch("researchclaw.hardware.subprocess.run", return_value=mock): assert ensure_torch_available("/usr/bin/python3", "mps") is False def test_python_not_found(self): with patch( "researchclaw.hardware.subprocess.run", side_effect=FileNotFoundError, ): assert ensure_torch_available("/nonexistent/python3", "cuda") is False # --------------------------------------------------------------------------- # is_metric_name # --------------------------------------------------------------------------- class TestIsMetricName: def test_valid_metrics(self): assert is_metric_name("loss") is True assert is_metric_name("primary_metric") is True assert is_metric_name("UCB (Stochastic) cumulative_regret") is True assert is_metric_name("accuracy") is True assert is_metric_name("f1_score") is True def test_log_lines_filtered(self): assert is_metric_name("Running experiments for support set size") is False assert is_metric_name("Loading model weights") is False assert is_metric_name("Training epoch 5") is False assert is_metric_name("Evaluating on test set") is False assert is_metric_name("Processing batch") is False assert is_metric_name("Initializing optimizer") is False def test_too_many_words_filtered(self): assert 
is_metric_name("this is a very long name that has many words") is False def test_short_names_pass(self): assert is_metric_name("val_loss") is True assert is_metric_name("test accuracy score") is True ================================================ FILE: tests/test_rc_health.py ================================================ # pyright: reportPrivateUsage=false, reportUnknownParameterType=false, reportMissingParameterType=false, reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnusedCallResult=false, reportAttributeAccessIssue=false, reportUnknownLambdaType=false, reportMissingImports=false, reportUntypedNamedTuple=false, reportMissingTypeArgument=false, reportArgumentType=false from __future__ import annotations import json import socket import urllib.error from pathlib import Path from typing import NamedTuple, cast from unittest.mock import patch import pytest from researchclaw import health class _VersionInfo(NamedTuple): major: int minor: int micro: int releaselevel: str serial: int class _DummyHTTPResponse: status: int _payload: dict[str, object] def __init__( self, *, status: int = 200, payload: dict[str, object] | None = None ) -> None: self.status = status self._payload = payload if payload is not None else {} def read(self) -> bytes: return json.dumps(self._payload).encode("utf-8") def __enter__(self) -> _DummyHTTPResponse: return self def __exit__(self, exc_type: object, exc: object, tb: object) -> None: return None def _write_valid_config(path: Path) -> None: _ = path.write_text( """ project: name: demo research: topic: Doctor checks runtime: timezone: UTC notifications: channel: test knowledge_base: root: kb llm: base_url: https://api.example.com/v1 api_key_env: OPENAI_API_KEY """.strip() + "\n", encoding="utf-8", ) def test_check_python_version_pass() -> None: with patch("sys.version_info", _VersionInfo(3, 11, 0, "final", 0)): result = health.check_python_version() assert result.status == "pass" 
def test_check_python_version_fail() -> None: with patch("sys.version_info", _VersionInfo(3, 10, 9, "final", 0)): result = health.check_python_version() assert result.status == "fail" assert "Install Python 3.11 or newer" == result.fix def test_check_yaml_import_pass() -> None: with patch("importlib.import_module", return_value=object()): result = health.check_yaml_import() assert result.status == "pass" def test_check_yaml_import_fail() -> None: with patch("importlib.import_module", side_effect=ImportError): result = health.check_yaml_import() assert result.status == "fail" assert result.fix == "pip install pyyaml" def test_check_config_valid_pass(tmp_path: Path) -> None: config_path = tmp_path / "config.yaml" _write_valid_config(config_path) result = health.check_config_valid(config_path) assert result.status == "pass" def test_check_config_invalid(tmp_path: Path) -> None: config_path = tmp_path / "config.yaml" _ = config_path.write_text("project: {}\n", encoding="utf-8") result = health.check_config_valid(config_path) assert result.status == "fail" assert "Missing required field:" in result.detail def test_check_config_missing_file(tmp_path: Path) -> None: result = health.check_config_valid(tmp_path / "missing.yaml") assert result.status == "fail" assert "Config file not found" in result.detail def test_check_llm_connectivity_pass() -> None: with patch("urllib.request.urlopen", return_value=_DummyHTTPResponse(status=200)): result = health.check_llm_connectivity("https://api.example.com/v1") assert result.status == "pass" def test_check_llm_connectivity_timeout() -> None: with patch( "urllib.request.urlopen", side_effect=urllib.error.URLError(socket.timeout("timed out")), ): result = health.check_llm_connectivity("https://api.example.com/v1") assert result.status == "fail" assert result.detail == "LLM endpoint unreachable" def test_check_llm_connectivity_http_error() -> None: with patch( "urllib.request.urlopen", side_effect=urllib.error.HTTPError( 
"https://api.example.com/v1/models", 503, "unavailable", {}, None ), ): result = health.check_llm_connectivity("https://api.example.com/v1") assert result.status == "fail" assert "503" in result.detail def test_check_api_key_valid() -> None: with patch( "urllib.request.urlopen", return_value=_DummyHTTPResponse(status=200, payload={"data": []}), ): result = health.check_api_key_valid("https://api.example.com/v1", "sk-test") assert result.status == "pass" def test_check_api_key_invalid_401() -> None: with patch( "urllib.request.urlopen", side_effect=urllib.error.HTTPError( "https://api.example.com/v1/models", 401, "unauthorized", {}, None ), ): result = health.check_api_key_valid("https://api.example.com/v1", "bad") assert result.status == "fail" assert result.detail == "Invalid API key" def test_check_model_available_pass() -> None: payload = {"data": [{"id": "gpt-5.2"}, {"id": "gpt-4o"}]} with patch( "urllib.request.urlopen", return_value=_DummyHTTPResponse(status=200, payload=payload), ): result = health.check_model_available( "https://api.example.com/v1", "sk-test", "gpt-5.2" ) assert result.status == "pass" def test_check_model_not_available() -> None: payload = {"data": [{"id": "gpt-4o"}]} with patch( "urllib.request.urlopen", return_value=_DummyHTTPResponse(status=200, payload=payload), ): result = health.check_model_available( "https://api.example.com/v1", "sk-test", "gpt-5.2" ) assert result.status == "fail" assert result.detail == "Model gpt-5.2 not available" def test_check_model_chain_all_available() -> None: payload = {"data": [{"id": "gpt-4o"}, {"id": "gpt-4.1"}]} with patch( "urllib.request.urlopen", return_value=_DummyHTTPResponse(status=200, payload=payload), ): result = health.check_model_chain( "https://api.example.com/v1", "sk-test", "gpt-4o", ("gpt-4.1",) ) assert result.status == "pass" assert "All models available" in result.detail def test_check_model_chain_primary_missing_fallback_ok() -> None: payload = {"data": [{"id": "gpt-4.1"}, {"id": 
"gpt-4o-mini"}]} with patch( "urllib.request.urlopen", return_value=_DummyHTTPResponse(status=200, payload=payload), ): result = health.check_model_chain( "https://api.example.com/v1", "sk-test", "gpt-5.2", ("gpt-4.1", "gpt-4o-mini") ) assert result.status == "pass" assert "unavailable" in result.detail assert "gpt-5.2" in result.detail def test_check_model_chain_all_missing() -> None: payload = {"data": [{"id": "gpt-4o"}]} with patch( "urllib.request.urlopen", return_value=_DummyHTTPResponse(status=200, payload=payload), ): result = health.check_model_chain( "https://api.example.com/v1", "sk-test", "gpt-5.2", ("gpt-5.1",) ) assert result.status == "fail" assert "No models available" in result.detail def test_check_model_chain_no_models() -> None: result = health.check_model_chain( "https://api.example.com/v1", "sk-test", "", () ) assert result.status == "warn" assert "No models configured" in result.detail def test_check_sandbox_python_exists() -> None: with ( patch.object(Path, "exists", return_value=True), patch("os.access", return_value=True), ): result = health.check_sandbox_python(".venv_arc/bin/python3") assert result.status == "pass" def test_check_sandbox_python_missing() -> None: with ( patch.object(Path, "exists", return_value=False), patch("os.access", return_value=False), ): result = health.check_sandbox_python(".venv_arc/bin/python3") assert result.status == "warn" def test_check_matplotlib_available() -> None: with patch("importlib.import_module", return_value=object()): result = health.check_matplotlib() assert result.status == "pass" def test_check_matplotlib_missing() -> None: with patch("importlib.import_module", side_effect=ImportError): result = health.check_matplotlib() assert result.status == "warn" assert result.detail == "Not installed; charts will be skipped" def test_check_experiment_mode_simulated() -> None: result = health.check_experiment_mode("simulated") assert result.status == "warn" def test_check_experiment_mode_sandbox() -> None: 
result = health.check_experiment_mode("sandbox") assert result.status == "pass" def test_run_doctor_all_pass_openai(tmp_path: Path) -> None: config_path = tmp_path / "config.yaml" _ = config_path.write_text("project: {}\n", encoding="utf-8") with ( patch.object( health, "check_python_version", return_value=health.CheckResult("python_version", "pass", "ok"), ), patch.object( health, "check_yaml_import", return_value=health.CheckResult("yaml_import", "pass", "ok"), ), patch.object( health, "check_config_valid", return_value=health.CheckResult("config_valid", "pass", "ok"), ), patch.object( health, "check_llm_connectivity", return_value=health.CheckResult("llm_connectivity", "pass", "ok"), ), patch.object( health, "check_api_key_valid", return_value=health.CheckResult("api_key_valid", "pass", "ok"), ), patch.object( health, "check_model_chain", return_value=health.CheckResult("model_chain", "pass", "ok"), ), patch.object( health, "check_sandbox_python", return_value=health.CheckResult("sandbox_python", "pass", "ok"), ), patch.object( health, "check_matplotlib", return_value=health.CheckResult("matplotlib", "pass", "ok"), ), patch.object( health, "check_experiment_mode", return_value=health.CheckResult("experiment_mode", "pass", "ok"), ), ): report = health.run_doctor(config_path) assert report.overall == "pass" assert len(report.checks) == 9 def test_run_doctor_with_failures(tmp_path: Path) -> None: config_path = tmp_path / "config.yaml" _ = config_path.write_text("project: {}\n", encoding="utf-8") with ( patch.object( health, "check_python_version", return_value=health.CheckResult("python_version", "pass", "ok"), ), patch.object( health, "check_yaml_import", return_value=health.CheckResult("yaml_import", "pass", "ok"), ), patch.object( health, "check_config_valid", return_value=health.CheckResult("config_valid", "fail", "bad", "fix it"), ), patch.object( health, "check_llm_connectivity", return_value=health.CheckResult("llm_connectivity", "pass", "ok"), ), 
patch.object( health, "check_api_key_valid", return_value=health.CheckResult("api_key_valid", "warn", "warn", "later"), ), patch.object( health, "check_model_chain", return_value=health.CheckResult("model_chain", "pass", "ok"), ), patch.object( health, "check_sandbox_python", return_value=health.CheckResult("sandbox_python", "pass", "ok"), ), patch.object( health, "check_matplotlib", return_value=health.CheckResult("matplotlib", "pass", "ok"), ), patch.object( health, "check_experiment_mode", return_value=health.CheckResult("experiment_mode", "pass", "ok"), ), ): report = health.run_doctor(config_path) assert report.overall == "fail" assert "fix it" in report.actionable_fixes def test_doctor_report_json_structure(tmp_path: Path) -> None: report = health.DoctorReport( timestamp="2026-01-01T00:00:00+00:00", checks=[ health.CheckResult("python_version", "pass", "ok"), health.CheckResult( "matplotlib", "warn", "missing", "pip install matplotlib" ), ], overall="pass", ) output_path = tmp_path / "reports" / "doctor.json" health.write_doctor_report(report, output_path) raw = cast(dict[str, object], json.loads(output_path.read_text(encoding="utf-8"))) assert raw["timestamp"] == "2026-01-01T00:00:00+00:00" assert raw["overall"] == "pass" assert isinstance(raw["checks"], list) assert raw["actionable_fixes"] == ["pip install matplotlib"] def test_doctor_report_overall_logic() -> None: passing = health.DoctorReport( timestamp="2026-01-01T00:00:00+00:00", checks=[health.CheckResult("x", "pass", "ok")], overall="pass", ) failing = health.DoctorReport( timestamp="2026-01-01T00:00:00+00:00", checks=[health.CheckResult("x", "fail", "bad", "fix")], overall="fail", ) assert passing.overall == "pass" assert failing.overall == "fail" assert failing.actionable_fixes == ["fix"] def test_print_doctor_report_pass(capsys: pytest.CaptureFixture[str]) -> None: report = health.DoctorReport( timestamp="2026-01-01T00:00:00+00:00", checks=[health.CheckResult("python_version", "pass", "ok")], 
overall="pass", ) health.print_doctor_report(report) out = capsys.readouterr().out assert "✅" in out assert "Result: PASS" in out def test_print_doctor_report_fail(capsys: pytest.CaptureFixture[str]) -> None: report = health.DoctorReport( timestamp="2026-01-01T00:00:00+00:00", checks=[ health.CheckResult("config_valid", "fail", "bad config", "fix config"), health.CheckResult( "matplotlib", "warn", "missing", "pip install matplotlib" ), ], overall="fail", ) health.print_doctor_report(report) out = capsys.readouterr().out assert "❌" in out assert "⚠️" in out assert "Result: FAIL (1 errors, 1 warnings)" in out # --- ACP agent checks --- def test_check_acp_agent_found() -> None: with patch("shutil.which", return_value="/usr/local/bin/claude"): result = health.check_acp_agent("claude") assert result.status == "pass" assert "/usr/local/bin/claude" in result.detail def test_check_acp_agent_missing() -> None: with patch("shutil.which", return_value=None): result = health.check_acp_agent("claude") assert result.status == "fail" assert "'claude' not found" in result.detail assert "Install claude" in result.fix def _write_acp_config(path: Path) -> None: _ = path.write_text( """\ project: name: demo research: topic: ACP test runtime: timezone: UTC notifications: channel: test knowledge_base: root: kb llm: provider: acp acp: agent: claude """, encoding="utf-8", ) def test_run_doctor_acp_skips_http_checks(tmp_path: Path) -> None: config_path = tmp_path / "config.yaml" _write_acp_config(config_path) with ( patch.object( health, "check_python_version", return_value=health.CheckResult("python_version", "pass", "ok"), ), patch.object( health, "check_yaml_import", return_value=health.CheckResult("yaml_import", "pass", "ok"), ), patch.object( health, "check_config_valid", return_value=health.CheckResult("config_valid", "pass", "ok"), ), patch.object( health, "check_acp_agent", return_value=health.CheckResult("acp_agent", "pass", "ok"), ), patch.object( health, "check_sandbox_python", 
return_value=health.CheckResult("sandbox_python", "pass", "ok"), ), patch.object( health, "check_matplotlib", return_value=health.CheckResult("matplotlib", "pass", "ok"), ), patch.object( health, "check_experiment_mode", return_value=health.CheckResult("experiment_mode", "pass", "ok"), ), ): report = health.run_doctor(config_path) check_names = [c.name for c in report.checks] assert "llm_connectivity" not in check_names assert "api_key_valid" not in check_names assert "model_chain" not in check_names def test_run_doctor_acp_includes_agent_check(tmp_path: Path) -> None: config_path = tmp_path / "config.yaml" _write_acp_config(config_path) with ( patch.object( health, "check_python_version", return_value=health.CheckResult("python_version", "pass", "ok"), ), patch.object( health, "check_yaml_import", return_value=health.CheckResult("yaml_import", "pass", "ok"), ), patch.object( health, "check_config_valid", return_value=health.CheckResult("config_valid", "pass", "ok"), ), patch.object( health, "check_acp_agent", return_value=health.CheckResult("acp_agent", "pass", "ok"), ), patch.object( health, "check_sandbox_python", return_value=health.CheckResult("sandbox_python", "pass", "ok"), ), patch.object( health, "check_matplotlib", return_value=health.CheckResult("matplotlib", "pass", "ok"), ), patch.object( health, "check_experiment_mode", return_value=health.CheckResult("experiment_mode", "pass", "ok"), ), ): report = health.run_doctor(config_path) check_names = [c.name for c in report.checks] assert "acp_agent" in check_names assert report.overall == "pass" assert len(report.checks) == 7 def test_print_doctor_report_ascii_fallback(monkeypatch: pytest.MonkeyPatch) -> None: report = health.DoctorReport( timestamp="2026-01-01T00:00:00+00:00", checks=[health.CheckResult("python_version", "pass", "ok")], overall="pass", ) class _AsciiStdout: encoding = "ascii" def __init__(self) -> None: self.parts: list[str] = [] def write(self, text: str) -> int: 
text.encode(self.encoding) self.parts.append(text) return len(text) def flush(self) -> None: return None fake_stdout = _AsciiStdout() monkeypatch.setattr(health.sys, "stdout", fake_stdout) health.print_doctor_report(report) out = "".join(fake_stdout.parts) assert "[OK] python_version: ok" in out assert "Result: PASS" in out ================================================ FILE: tests/test_rc_kb.py ================================================ from __future__ import annotations import json from pathlib import Path import yaml from researchclaw.knowledge.base import ( KB_CATEGORY_MAP, KBEntry, _markdown_frontmatter, _obsidian_enhancements, generate_weekly_report, write_kb_entry, write_stage_to_kb, ) def _kb_root(tmp_path: Path) -> Path: return tmp_path / "kb" def test_kb_entry_dataclass_creation(): entry = KBEntry( category="findings", entry_id="e1", title="T", content="C", source_stage="01-goal_define", run_id="run1", ) assert entry.category == "findings" assert entry.entry_id == "e1" assert entry.run_id == "run1" def test_write_kb_entry_creates_expected_file_path(tmp_path: Path): kb_root = _kb_root(tmp_path) entry = KBEntry("questions", "q-1", "Q", "Body", "01-goal_define", "run-a") path = write_kb_entry(kb_root, entry) assert path == kb_root / "questions" / "q-1.md" assert path.exists() def test_write_kb_entry_includes_frontmatter_markers(tmp_path: Path): kb_root = _kb_root(tmp_path) entry = KBEntry("findings", "f-1", "Finding", "Body", "14-result_analysis", "run-a") text = write_kb_entry(kb_root, entry).read_text(encoding="utf-8") assert text.startswith("---\n") assert "\n---\n" in text def test_write_kb_entry_markdown_backend_has_no_obsidian_extras(tmp_path: Path): kb_root = _kb_root(tmp_path) entry = KBEntry( "questions", "q-2", "Question", "Body", "01-goal_define", "run-a", tags=["hypothesis"], links=["run-run-a"], ) text = write_kb_entry(kb_root, entry, backend="markdown").read_text( encoding="utf-8" ) assert "[[run-run-a]]" not in text assert 
"#hypothesis" not in text def test_write_kb_entry_obsidian_backend_includes_tags_and_wikilinks(tmp_path: Path): kb_root = _kb_root(tmp_path) entry = KBEntry( "questions", "q-3", "Question", "Body", "01-goal_define", "run-a", tags=["hypothesis", "q1"], links=["run-run-a", "topic-a"], ) text = write_kb_entry(kb_root, entry, backend="obsidian").read_text( encoding="utf-8" ) assert "#hypothesis #q1" in text assert "Related: [[run-run-a]], [[topic-a]]" in text def test_markdown_frontmatter_output_format_and_fields(): entry = KBEntry( "reviews", "r-1", "Report", "Body", "report", "run-x", tags=["weekly"], evidence_refs=["stage-01/goal.md"], ) fm = _markdown_frontmatter(entry) assert fm.startswith("---\n") assert fm.endswith("\n---\n") parsed = yaml.safe_load(fm.split("---\n", 1)[1].rsplit("\n---\n", 1)[0]) assert parsed["id"] == "r-1" assert parsed["title"] == "Report" assert parsed["stage"] == "report" assert parsed["run_id"] == "run-x" assert parsed["tags"] == ["weekly"] assert parsed["evidence"] == ["stage-01/goal.md"] def test_obsidian_enhancements_with_tags_and_links(): entry = KBEntry( "findings", "f-2", "Finding", "Body", "14-result_analysis", "run-z", tags=["a", "b"], links=["run-z", "result-node"], ) enh = _obsidian_enhancements(entry) assert "#a #b" in enh assert "Related: [[run-z]], [[result-node]]" in enh def test_obsidian_enhancements_with_no_tags_or_links_returns_empty(): entry = KBEntry("findings", "f-3", "Finding", "Body", "14-result_analysis", "run-z") assert _obsidian_enhancements(entry) == "" def test_kb_category_map_has_exactly_22_stage_entries(): assert len(KB_CATEGORY_MAP) == 22 assert set(KB_CATEGORY_MAP) == set(range(1, 23)) def test_kb_category_map_values_are_valid_categories(): valid = { "questions", "literature", "experiments", "findings", "decisions", "reviews", } assert set(KB_CATEGORY_MAP.values()).issubset(valid) def test_write_stage_to_kb_places_entry_in_mapped_category(tmp_path: Path): kb_root = _kb_root(tmp_path) stage_dir = tmp_path / 
"stage-10" stage_dir.mkdir() (stage_dir / "run.md").write_text("exp content", encoding="utf-8") paths = write_stage_to_kb( kb_root, 10, "experiment_cycle", "run-1", ["run.md"], stage_dir ) assert len(paths) == 1 assert paths[0].parent.name == "experiments" def test_write_stage_to_kb_reads_artifact_file_contents(tmp_path: Path): kb_root = _kb_root(tmp_path) stage_dir = tmp_path / "stage-04" stage_dir.mkdir() (stage_dir / "lit.md").write_text("paper A\npaper B", encoding="utf-8") path = write_stage_to_kb( kb_root, 4, "literature_search", "run-1", ["lit.md"], stage_dir )[0] text = path.read_text(encoding="utf-8") assert "paper A" in text assert "stage-04/lit.md" in text def test_write_stage_to_kb_handles_missing_artifacts_gracefully(tmp_path: Path): kb_root = _kb_root(tmp_path) stage_dir = tmp_path / "stage-05" stage_dir.mkdir() path = write_stage_to_kb( kb_root, 5, "literature_extract", "run-2", ["missing.md"], stage_dir )[0] text = path.read_text(encoding="utf-8") assert "Stage 05 (literature_extract) completed" in text def test_write_stage_to_kb_truncates_large_artifact_content(tmp_path: Path): kb_root = _kb_root(tmp_path) stage_dir = tmp_path / "stage-12" stage_dir.mkdir() large_text = "x" * 6000 (stage_dir / "big.txt").write_text(large_text, encoding="utf-8") path = write_stage_to_kb( kb_root, 12, "experiment_implement", "run-3", ["big.txt"], stage_dir )[0] text = path.read_text(encoding="utf-8") assert "... 
(truncated, see full artifact)" in text assert text.count("x") >= 5000 def test_write_stage_to_kb_directory_artifact_records_listing(tmp_path: Path): kb_root = _kb_root(tmp_path) stage_dir = tmp_path / "stage-13" artifact_dir = stage_dir / "outputs" artifact_dir.mkdir(parents=True) (artifact_dir / "a.txt").write_text("a", encoding="utf-8") (artifact_dir / "b.txt").write_text("b", encoding="utf-8") path = write_stage_to_kb( kb_root, 13, "experiment_execute", "run-4", ["outputs/"], stage_dir )[0] text = path.read_text(encoding="utf-8") assert "Directory with 2 files: a.txt, b.txt" in text assert "stage-13/outputs/" in text def test_generate_weekly_report_creates_file_in_reviews_category(tmp_path: Path): kb_root = _kb_root(tmp_path) run_dir = tmp_path / "run-a" run_dir.mkdir() (run_dir / "pipeline_summary.json").write_text( json.dumps({"run_id": "run-a", "stages_executed": 10, "stages_done": 10}), encoding="utf-8", ) path = generate_weekly_report(kb_root, [run_dir], week_label="2026-W10") assert path.parent.name == "reviews" assert path.name == "weekly-report-2026-W10.md" def test_generate_weekly_report_with_empty_run_dirs(tmp_path: Path): kb_root = _kb_root(tmp_path) path = generate_weekly_report(kb_root, [], week_label="2026-W11") text = path.read_text(encoding="utf-8") assert "Pipeline runs: 0" in text assert "Success rate: N/A" in text def test_generate_weekly_report_aggregates_statistics_correctly(tmp_path: Path): kb_root = _kb_root(tmp_path) run1 = tmp_path / "run-1" run2 = tmp_path / "run-2" run1.mkdir() run2.mkdir() (run1 / "pipeline_summary.json").write_text( json.dumps( { "run_id": "run-1", "stages_executed": 20, "stages_done": 18, "stages_failed": 1, "stages_blocked": 1, "final_status": "failed", } ), encoding="utf-8", ) (run2 / "pipeline_summary.json").write_text( json.dumps( { "run_id": "run-2", "stages_executed": 10, "stages_done": 10, "stages_failed": 0, "stages_blocked": 0, "final_status": "done", } ), encoding="utf-8", ) report = 
generate_weekly_report(kb_root, [run1, run2], week_label="2026-W12") text = report.read_text(encoding="utf-8") assert "Pipeline runs: 2" in text assert "Stages executed: 30" in text assert "Stages completed: 28" in text assert "Stages failed: 1" in text assert "Stages blocked (gate): 1" in text assert "Success rate: 93.3%" in text def test_generate_weekly_report_ignores_missing_summary_files(tmp_path: Path): kb_root = _kb_root(tmp_path) run_ok = tmp_path / "run-ok" run_empty = tmp_path / "run-empty" run_ok.mkdir() run_empty.mkdir() (run_ok / "pipeline_summary.json").write_text( json.dumps({"run_id": "run-ok", "stages_executed": 5, "stages_done": 5}), encoding="utf-8", ) report = generate_weekly_report(kb_root, [run_ok, run_empty], week_label="2026-W13") text = report.read_text(encoding="utf-8") assert "Pipeline runs: 1" in text ================================================ FILE: tests/test_rc_literature.py ================================================ # pyright: reportPrivateUsage=false, reportUnknownParameterType=false """Unit tests for researchclaw.literature module. All network-dependent tests mock HTTP responses via monkeypatch. 
""" from __future__ import annotations import json import textwrap from typing import Any from unittest.mock import MagicMock, patch import pytest from researchclaw.literature.models import Author, Paper from researchclaw.literature.semantic_scholar import ( _parse_s2_paper, search_semantic_scholar, ) from researchclaw.literature.arxiv_client import ( _convert_result, search_arxiv, ) from researchclaw.literature.search import ( _deduplicate, _normalise_title, papers_to_bibtex, search_papers, search_papers_multi_query, ) # ────────────────────────────────────────────────────────────────────── # Fixtures & helpers # ────────────────────────────────────────────────────────────────────── def _make_paper(**kwargs: Any) -> Paper: defaults = { "paper_id": "s2-abc", "title": "Attention Is All You Need", "authors": (Author(name="Ashish Vaswani"),), "year": 2017, "venue": "NeurIPS", "citation_count": 80000, "doi": "10.5555/3295222.3295349", "arxiv_id": "1706.03762", "url": "https://arxiv.org/abs/1706.03762", "source": "semantic_scholar", } defaults.update(kwargs) return Paper(**defaults) SAMPLE_S2_RESPONSE = { "total": 1, "data": [ { "paperId": "abc123", "title": "Test Paper on Transformers", "abstract": "We study transformers for NLP tasks.", "year": 2024, "venue": "NeurIPS", "citationCount": 42, "authors": [ {"authorId": "1", "name": "Jane Smith"}, {"authorId": "2", "name": "John Doe"}, ], "externalIds": {"DOI": "10.1234/test", "ArXiv": "2401.00001"}, "url": "https://www.semanticscholar.org/paper/abc123", } ], } SAMPLE_ARXIV_ATOM = textwrap.dedent("""\ http://arxiv.org/abs/2401.00001v1 A Novel Approach to Protein Folding We propose a new method for protein structure prediction. 2024-01-15T00:00:00Z Alice Researcher Bob Scientist 10.5678/protein http://arxiv.org/abs/2402.00002v1 Deep Reinforcement Learning Survey A comprehensive survey of deep RL methods. 
2024-02-20T00:00:00Z Charlie Expert """) # ────────────────────────────────────────────────────────────────────── # Author tests # ────────────────────────────────────────────────────────────────────── class TestAuthor: def test_last_name_simple(self) -> None: a = Author(name="Jane Smith") assert a.last_name() == "smith" def test_last_name_accented(self) -> None: a = Author(name="José García") assert a.last_name() == "garcia" # accent stripped, but 'i' preserved def test_last_name_single(self) -> None: a = Author(name="Madonna") assert a.last_name() == "madonna" def test_last_name_empty(self) -> None: a = Author(name="") assert a.last_name() == "unknown" # ────────────────────────────────────────────────────────────────────── # Paper tests # ────────────────────────────────────────────────────────────────────── class TestPaper: def test_cite_key_format(self) -> None: p = _make_paper() key = p.cite_key assert key == "vaswani2017attention" def test_cite_key_no_authors(self) -> None: p = _make_paper(authors=()) assert p.cite_key.startswith("anon") def test_cite_key_no_year(self) -> None: p = _make_paper(year=0) assert "0000" in p.cite_key def test_to_bibtex_contains_fields(self) -> None: p = _make_paper() bib = p.to_bibtex() assert "@inproceedings{vaswani2017attention," in bib assert "title = {Attention Is All You Need}" in bib assert "author = {Ashish Vaswani}" in bib assert "year = {2017}" in bib assert "doi = {10.5555/3295222.3295349}" in bib assert "eprint = {1706.03762}" in bib def test_to_bibtex_override(self) -> None: p = _make_paper(_bibtex_override="@article{custom, title={Custom}}") assert p.to_bibtex() == "@article{custom, title={Custom}}" def test_to_bibtex_article_no_venue(self) -> None: p = _make_paper(venue="", arxiv_id="2301.00001") bib = p.to_bibtex() assert "@article{" in bib assert "journal = {arXiv preprint arXiv:2301.00001}" in bib def test_to_bibtex_arxiv_category_venue(self) -> None: """T1.4: arXiv category codes (cs.CL) must not be used as 
journal names.""" p = _make_paper(venue="cs.CL", arxiv_id="2301.00001") bib = p.to_bibtex() assert "journal = {cs.CL}" not in bib assert "arXiv preprint" in bib def test_to_dict(self) -> None: p = _make_paper() d = p.to_dict() assert d["paper_id"] == "s2-abc" assert d["cite_key"] == "vaswani2017attention" assert isinstance(d["authors"], list) assert d["authors"][0]["name"] == "Ashish Vaswani" def test_paper_frozen(self) -> None: p = _make_paper() with pytest.raises(AttributeError): p.title = "new title" # type: ignore[misc] # ────────────────────────────────────────────────────────────────────── # Semantic Scholar client tests # ────────────────────────────────────────────────────────────────────── class TestSemanticScholar: def test_parse_s2_paper(self) -> None: item = SAMPLE_S2_RESPONSE["data"][0] p = _parse_s2_paper(item) assert p.paper_id == "s2-abc123" assert p.title == "Test Paper on Transformers" assert len(p.authors) == 2 assert p.authors[0].name == "Jane Smith" assert p.year == 2024 assert p.doi == "10.1234/test" assert p.arxiv_id == "2401.00001" assert p.source == "semantic_scholar" assert p.citation_count == 42 def test_search_semantic_scholar_mock( self, monkeypatch: pytest.MonkeyPatch ) -> None: """Mock urllib to return sample S2 response.""" # Reset S2 circuit breaker (may be tripped from prior test API calls) from researchclaw.literature.semantic_scholar import _reset_circuit_breaker _reset_circuit_breaker() response_bytes = json.dumps(SAMPLE_S2_RESPONSE).encode("utf-8") mock_resp = MagicMock() mock_resp.read.return_value = response_bytes mock_resp.__enter__ = lambda s: s mock_resp.__exit__ = MagicMock(return_value=False) monkeypatch.setattr( "researchclaw.literature.semantic_scholar.urllib.request.urlopen", lambda *a, **kw: mock_resp, ) papers = search_semantic_scholar("transformers", limit=5) assert len(papers) == 1 assert papers[0].title == "Test Paper on Transformers" def test_search_semantic_scholar_network_error( self, monkeypatch: 
pytest.MonkeyPatch ) -> None: """Should return empty list on network error.""" from researchclaw.literature.semantic_scholar import _reset_circuit_breaker _reset_circuit_breaker() import urllib.error monkeypatch.setattr( "researchclaw.literature.semantic_scholar.urllib.request.urlopen", lambda *a, **kw: (_ for _ in ()).throw(urllib.error.URLError("timeout")), ) # Patch sleep to speed up test monkeypatch.setattr( "researchclaw.literature.semantic_scholar.time.sleep", lambda _: None ) papers = search_semantic_scholar("test", limit=5) assert papers == [] # ────────────────────────────────────────────────────────────────────── # arXiv client tests # ────────────────────────────────────────────────────────────────────── class TestArxiv: def test_convert_result(self) -> None: """Test converting arxiv.Result to Paper via the new library.""" from unittest.mock import MagicMock from datetime import datetime mock_result = MagicMock() mock_result.entry_id = "http://arxiv.org/abs/2401.00001v1" mock_result.title = "A Novel Approach to Protein Folding" mock_result.summary = "We study protein folding." 
mock_result.published = datetime(2024, 1, 15) mock_result.doi = "10.5678/protein" mock_result.primary_category = "q-bio.BM" mock_author1 = MagicMock() mock_author1.name = "Alice Researcher" mock_author2 = MagicMock() mock_author2.name = "Bob Scientist" mock_result.authors = [mock_author1, mock_author2] paper = _convert_result(mock_result) assert paper.title == "A Novel Approach to Protein Folding" assert paper.arxiv_id == "2401.00001" assert paper.year == 2024 assert len(paper.authors) == 2 assert paper.authors[0].name == "Alice Researcher" assert paper.source == "arxiv" assert paper.doi == "10.5678/protein" assert paper.venue == "q-bio.BM" def test_search_arxiv_mock(self, monkeypatch: pytest.MonkeyPatch) -> None: """Test search_arxiv with mocked arxiv library.""" from unittest.mock import MagicMock from datetime import datetime import types mock_result = MagicMock() mock_result.entry_id = "http://arxiv.org/abs/2401.00001v1" mock_result.title = "Test Paper" mock_result.summary = "Abstract." mock_result.published = datetime(2024, 1, 1) mock_result.doi = "" mock_result.primary_category = "cs.LG" mock_author = MagicMock() mock_author.name = "Test Author" mock_result.authors = [mock_author] mock_client = MagicMock() mock_client.results.return_value = iter([mock_result]) # Mock the module-level `arxiv` so the `if arxiv is None` guard # doesn't short-circuit before the mocked _get_client is reached. # Use MagicMock so all attributes (Search, SortOrder, etc.) auto-resolve. 
_fake_arxiv = MagicMock() monkeypatch.setattr( "researchclaw.literature.arxiv_client.arxiv", _fake_arxiv, ) monkeypatch.setattr( "researchclaw.literature.arxiv_client._get_client", lambda: mock_client, ) from researchclaw.literature.arxiv_client import _reset_circuit_breaker _reset_circuit_breaker() papers = search_arxiv("test", limit=10) assert len(papers) == 1 assert papers[0].title == "Test Paper" assert papers[0].arxiv_id == "2401.00001" def test_search_arxiv_error_graceful(self, monkeypatch: pytest.MonkeyPatch) -> None: """search_arxiv returns empty list on error, not raise.""" from unittest.mock import MagicMock import types # Build a fake arxiv module with real exception classes so # `except arxiv.HTTPError` doesn't TypeError. _fake_arxiv = types.ModuleType("arxiv") class _FakeHTTPError(Exception): pass class _FakeUnexpectedEmptyPageError(Exception): pass _fake_arxiv.HTTPError = _FakeHTTPError _fake_arxiv.UnexpectedEmptyPageError = _FakeUnexpectedEmptyPageError _fake_arxiv.SortCriterion = MagicMock() _fake_arxiv.SortOrder = MagicMock() _fake_arxiv.Search = MagicMock() monkeypatch.setattr( "researchclaw.literature.arxiv_client.arxiv", _fake_arxiv, ) mock_client = MagicMock() mock_client.results.side_effect = _FakeHTTPError("Simulated arXiv HTTP error") monkeypatch.setattr( "researchclaw.literature.arxiv_client._get_client", lambda: mock_client, ) from researchclaw.literature.arxiv_client import _reset_circuit_breaker _reset_circuit_breaker() papers = search_arxiv("test", limit=10) assert papers == [] # ────────────────────────────────────────────────────────────────────── # Unified search & deduplication tests # ────────────────────────────────────────────────────────────────────── class TestDeduplication: def test_dedup_by_doi(self) -> None: p1 = _make_paper(paper_id="s2-1", doi="10.1234/a", citation_count=100) p2 = _make_paper( paper_id="arxiv-1", doi="10.1234/a", citation_count=50, source="arxiv" ) result = _deduplicate([p1, p2]) assert len(result) == 1 
assert result[0].citation_count == 100 # keeps higher def test_dedup_by_arxiv_id(self) -> None: p1 = _make_paper( paper_id="s2-1", doi="", arxiv_id="2401.00001", citation_count=10 ) p2 = _make_paper( paper_id="arxiv-1", doi="", arxiv_id="2401.00001", citation_count=20, source="arxiv", ) result = _deduplicate([p1, p2]) assert len(result) == 1 assert result[0].citation_count == 20 # arxiv version had more def test_dedup_by_title(self) -> None: p1 = _make_paper( paper_id="s2-1", doi="", arxiv_id="", title="My Cool Paper", citation_count=5, ) p2 = _make_paper( paper_id="s2-2", doi="", arxiv_id="", title="My Cool Paper", citation_count=10, ) result = _deduplicate([p1, p2]) assert len(result) == 1 assert result[0].citation_count == 10 def test_dedup_no_duplicates(self) -> None: p1 = _make_paper(paper_id="s2-1", title="Paper A", doi="10.1/a", arxiv_id="1111.11111") p2 = _make_paper(paper_id="s2-2", title="Paper B", doi="10.1/b", arxiv_id="2222.22222") result = _deduplicate([p1, p2]) assert len(result) == 2 def test_normalise_title(self) -> None: assert _normalise_title(" The Great Paper!!! 
") == "the great paper" assert _normalise_title("A/B Testing: Methods") == "ab testing methods" class TestPapersToBibtex: def test_generates_combined(self) -> None: p1 = _make_paper(paper_id="s2-1", title="Paper A") p2 = _make_paper(paper_id="s2-2", title="Paper B", venue="ICML 2024") bib = papers_to_bibtex([p1, p2]) assert bib.count("@") == 2 assert "Paper A" in bib assert "Paper B" in bib class TestSearchPapers: def test_search_papers_combines_sources( self, monkeypatch: pytest.MonkeyPatch ) -> None: """Mock both S2 and arXiv to verify combined search.""" s2_paper = _make_paper( paper_id="s2-1", source="semantic_scholar", citation_count=100 ) arxiv_paper = _make_paper( paper_id="arxiv-1", title="Different Paper", doi="10.2/b", arxiv_id="2402.99999", source="arxiv", citation_count=50, ) monkeypatch.setattr( "researchclaw.literature.search.search_semantic_scholar", lambda *a, **kw: [s2_paper], ) monkeypatch.setattr( "researchclaw.literature.search.search_arxiv", lambda *a, **kw: [arxiv_paper], ) monkeypatch.setattr("researchclaw.literature.search.time.sleep", lambda _: None) papers = search_papers("test", sources=["semantic_scholar", "arxiv"]) assert len(papers) == 2 # Should be sorted by citation_count desc assert papers[0].citation_count >= papers[1].citation_count def test_default_sources_openalex_first(self) -> None: """OpenAlex should be the primary (first) source — least restrictive limits.""" from researchclaw.literature.search import _DEFAULT_SOURCES assert _DEFAULT_SOURCES[0] == "openalex" assert "semantic_scholar" in _DEFAULT_SOURCES assert "arxiv" in _DEFAULT_SOURCES def test_s2_failure_does_not_block_others( self, monkeypatch: pytest.MonkeyPatch ) -> None: """When S2 fails, other sources should still return results.""" arxiv_paper = _make_paper( paper_id="arxiv-ok", title="ArXiv Paper", source="arxiv", doi="10.1/ax", arxiv_id="2401.99991", ) monkeypatch.setattr( "researchclaw.literature.search.search_openalex", lambda *a, **kw: [], ) 
monkeypatch.setattr( "researchclaw.literature.search.search_semantic_scholar", lambda *a, **kw: (_ for _ in ()).throw(RuntimeError("S2 down")), ) monkeypatch.setattr( "researchclaw.literature.search.search_arxiv", lambda *a, **kw: [arxiv_paper], ) monkeypatch.setattr("researchclaw.literature.search.time.sleep", lambda _: None) papers = search_papers("test") assert len(papers) >= 1 assert papers[0].source == "arxiv" def test_search_papers_unknown_source( self, monkeypatch: pytest.MonkeyPatch ) -> None: monkeypatch.setattr("researchclaw.literature.search.time.sleep", lambda _: None) papers = search_papers("test", sources=["unknown_source"]) assert papers == [] def test_search_papers_multi_query(self, monkeypatch: pytest.MonkeyPatch) -> None: call_count = 0 def mock_search(*a: Any, **kw: Any) -> list[Paper]: nonlocal call_count call_count += 1 return [ _make_paper( paper_id=f"s2-{call_count}", title=f"Unique Paper {call_count}", doi=f"10.{call_count}/unique", arxiv_id=f"240{call_count}.{call_count:05d}", ) ] monkeypatch.setattr( "researchclaw.literature.search.search_papers", mock_search, ) monkeypatch.setattr("researchclaw.literature.search.time.sleep", lambda _: None) papers = search_papers_multi_query(["q1", "q2", "q3"]) assert call_count == 3 # All unique titles so no dedup assert len(papers) == 3 # ────────────────────────────────────────────────────────────────────── # Edge cases # ────────────────────────────────────────────────────────────────────── class TestEdgeCases: def test_paper_with_no_meaningful_title_word(self) -> None: """cite_key should still work with stopword-only titles.""" p = _make_paper(title="The And For With", year=2024) # All words are stopwords or <4 chars, keyword should be empty key = p.cite_key assert key.startswith("vaswani2024") def test_paper_multiple_authors_bibtex(self) -> None: p = _make_paper( authors=( Author(name="Alice One"), Author(name="Bob Two"), Author(name="Charlie Three"), ) ) bib = p.to_bibtex() assert "Alice One and 
Bob Two and Charlie Three" in bib def test_empty_s2_response(self) -> None: """_parse_s2_paper shouldn't crash on minimal data.""" p = _parse_s2_paper({"paperId": "x"}) assert p.paper_id == "s2-x" assert p.title == "" assert p.authors == () # ────────────────────────────────────────────────────────────────────── # arXiv circuit breaker tests # ────────────────────────────────────────────────────────────────────── class TestArxivCircuitBreaker: def setup_method(self) -> None: from researchclaw.literature.arxiv_client import _reset_circuit_breaker _reset_circuit_breaker() def test_failure_triggers_circuit_breaker(self) -> None: """Three consecutive failures should trip the circuit breaker.""" from researchclaw.literature import arxiv_client # Simulate 3 consecutive failures for _ in range(3): arxiv_client._cb_on_failure() assert arxiv_client._cb_state == arxiv_client._CB_OPEN assert arxiv_client._cb_trip_count == 1 def test_breaker_open_skips_requests(self) -> None: """When breaker is OPEN, requests should be skipped.""" import time as time_mod from researchclaw.literature import arxiv_client arxiv_client._cb_state = arxiv_client._CB_OPEN arxiv_client._cb_open_since = time_mod.monotonic() arxiv_client._cb_cooldown_sec = 999 assert not arxiv_client._cb_should_allow() def test_success_resets_breaker(self) -> None: """A successful request should reset the circuit breaker.""" from researchclaw.literature import arxiv_client arxiv_client._cb_state = arxiv_client._CB_HALF_OPEN arxiv_client._cb_consecutive_429s = 2 arxiv_client._cb_on_success() assert arxiv_client._cb_state == arxiv_client._CB_CLOSED assert arxiv_client._cb_consecutive_429s == 0 def test_half_open_probe_failure_doubles_cooldown(self) -> None: """Probe failure in HALF_OPEN should double the cooldown.""" from researchclaw.literature import arxiv_client arxiv_client._cb_state = arxiv_client._CB_HALF_OPEN initial_cooldown = arxiv_client._cb_cooldown_sec arxiv_client._cb_on_failure() assert 
arxiv_client._cb_state == arxiv_client._CB_OPEN assert arxiv_client._cb_cooldown_sec == min(initial_cooldown * 2, 600) def test_search_with_http_error(self, monkeypatch: pytest.MonkeyPatch) -> None: """search_arxiv should return empty list on HTTPError.""" import types _fake_arxiv = types.ModuleType("arxiv") class _FakeHTTPError(Exception): pass class _FakeUnexpectedEmptyPageError(Exception): pass _fake_arxiv.HTTPError = _FakeHTTPError _fake_arxiv.UnexpectedEmptyPageError = _FakeUnexpectedEmptyPageError _fake_arxiv.SortCriterion = MagicMock() _fake_arxiv.SortOrder = MagicMock() _fake_arxiv.Search = MagicMock() monkeypatch.setattr( "researchclaw.literature.arxiv_client.arxiv", _fake_arxiv, ) mock_client = MagicMock() mock_client.results.side_effect = _FakeHTTPError("Simulated 429") monkeypatch.setattr( "researchclaw.literature.arxiv_client._get_client", lambda: mock_client, ) from researchclaw.literature.arxiv_client import _reset_circuit_breaker _reset_circuit_breaker() papers = search_arxiv("test", limit=5) assert papers == [] # ────────────────────────────────────────────────────────────────────── # OpenAlex client tests # ────────────────────────────────────────────────────────────────────── SAMPLE_OPENALEX_RESPONSE = { "results": [ { "id": "https://openalex.org/W123456", "title": "Attention Is All You Need", "authorships": [ { "author": {"display_name": "Ashish Vaswani"}, "institutions": [{"display_name": "Google Brain"}], } ], "publication_year": 2017, "primary_location": { "source": {"display_name": "NeurIPS"} }, "cited_by_count": 85000, "doi": "https://doi.org/10.5555/3295222.3295349", "ids": { "openalex": "https://openalex.org/W123456", "arxiv": "https://arxiv.org/abs/1706.03762", }, "abstract_inverted_index": { "The": [0], "dominant": [1], "models": [2, 6], "are": [3], "based": [4], "on": [5], }, } ] } class TestOpenAlex: def test_parse_openalex_response(self, monkeypatch: pytest.MonkeyPatch) -> None: """Mock urllib to return sample OpenAlex response.""" 
from researchclaw.literature.openalex_client import search_openalex response_bytes = json.dumps(SAMPLE_OPENALEX_RESPONSE).encode("utf-8") mock_resp = MagicMock() mock_resp.read.return_value = response_bytes mock_resp.__enter__ = lambda s: s mock_resp.__exit__ = MagicMock(return_value=False) monkeypatch.setattr( "researchclaw.literature.openalex_client.urllib.request.urlopen", lambda *a, **kw: mock_resp, ) papers = search_openalex("attention", limit=5) assert len(papers) == 1 p = papers[0] assert p.title == "Attention Is All You Need" assert p.year == 2017 assert p.citation_count == 85000 assert p.doi == "10.5555/3295222.3295349" assert p.arxiv_id == "1706.03762" assert p.source == "openalex" assert p.authors[0].name == "Ashish Vaswani" def test_abstract_reconstruction(self) -> None: from researchclaw.literature.openalex_client import _reconstruct_abstract inv_idx = {"Hello": [0], "world": [1], "foo": [3], "bar": [2]} result = _reconstruct_abstract(inv_idx) assert result == "Hello world bar foo" def test_abstract_empty(self) -> None: from researchclaw.literature.openalex_client import _reconstruct_abstract assert _reconstruct_abstract(None) == "" assert _reconstruct_abstract({}) == "" def test_openalex_network_error(self, monkeypatch: pytest.MonkeyPatch) -> None: """Should return empty list on network error.""" from researchclaw.literature.openalex_client import search_openalex monkeypatch.setattr( "researchclaw.literature.openalex_client.urllib.request.urlopen", lambda *a, **kw: (_ for _ in ()).throw(urllib.error.URLError("timeout")), ) monkeypatch.setattr( "researchclaw.literature.openalex_client.time.sleep", lambda _: None, ) papers = search_openalex("test", limit=5) assert papers == [] # ────────────────────────────────────────────────────────────────────── # Multi-source fallback tests # ────────────────────────────────────────────────────────────────────── class TestMultiSourceFallback: def test_openalex_failure_falls_back_to_s2_and_arxiv( self, monkeypatch: 
pytest.MonkeyPatch ) -> None: """When OpenAlex fails, S2 and arXiv should still return results.""" arxiv_paper = _make_paper( paper_id="arxiv-ok", title="ArXiv Paper", source="arxiv", doi="10.1/ax", arxiv_id="2401.99999", ) s2_paper = _make_paper( paper_id="s2-ok", title="S2 Paper", source="semantic_scholar", doi="10.1/s2", arxiv_id="2402.99999", ) monkeypatch.setattr( "researchclaw.literature.search.search_openalex", lambda *a, **kw: (_ for _ in ()).throw(RuntimeError("OpenAlex down")), ) monkeypatch.setattr( "researchclaw.literature.search.search_semantic_scholar", lambda *a, **kw: [s2_paper], ) monkeypatch.setattr( "researchclaw.literature.search.search_arxiv", lambda *a, **kw: [arxiv_paper], ) monkeypatch.setattr("researchclaw.literature.search.time.sleep", lambda _: None) papers = search_papers("test") assert len(papers) >= 1 sources = {p.source for p in papers} assert "semantic_scholar" in sources or "arxiv" in sources # ────────────────────────────────────────────────────────────────────── # Cache TTL tests # ────────────────────────────────────────────────────────────────────── class TestCacheTTL: def test_source_specific_ttl(self, tmp_path: Any) -> None: """arXiv cache should expire after 24h, not 7 days.""" from researchclaw.literature.cache import get_cached, put_cache, _SOURCE_TTL assert _SOURCE_TTL["arxiv"] == 86400 # 24h assert _SOURCE_TTL["semantic_scholar"] == 86400 * 3 # Put and get immediately — should hit put_cache("test", "arxiv", 10, [{"paper_id": "x", "title": "Y"}], cache_base=tmp_path) result = get_cached("test", "arxiv", 10, cache_base=tmp_path) assert result is not None assert len(result) == 1 def test_citation_verify_ttl_is_permanent(self) -> None: from researchclaw.literature.cache import _SOURCE_TTL assert _SOURCE_TTL["citation_verify"] >= 86400 * 365 import urllib.error ================================================ FILE: tests/test_rc_llm.py ================================================ from __future__ import annotations import 
json import urllib.request from types import SimpleNamespace from typing import Any, Mapping import pytest from researchclaw.llm.client import LLMClient, LLMConfig, LLMResponse, _NEW_PARAM_MODELS class _DummyHTTPResponse: def __init__(self, payload: Mapping[str, Any]): self._payload = payload def read(self) -> bytes: return json.dumps(self._payload).encode("utf-8") def __enter__(self) -> _DummyHTTPResponse: return self def __exit__(self, exc_type: object, exc: object, tb: object) -> None: return None def _make_client( *, api_key: str = "test-key", primary_model: str = "gpt-5.2", fallback_models: list[str] | None = None, timeout_sec: int = 120, ) -> LLMClient: config = LLMConfig( base_url="https://api.example.com/v1", api_key=api_key, primary_model=primary_model, fallback_models=fallback_models or ["gpt-5.1", "gpt-4.1", "gpt-4o"], timeout_sec=timeout_sec, ) return LLMClient(config) def _capture_raw_call( monkeypatch: pytest.MonkeyPatch, *, model: str, response_data: Mapping[str, Any] ) -> tuple[dict[str, object], LLMResponse, dict[str, object]]: captured: dict[str, object] = {} def fake_urlopen(req: urllib.request.Request, timeout: int) -> _DummyHTTPResponse: captured["request"] = req captured["timeout"] = timeout return _DummyHTTPResponse(response_data) monkeypatch.setattr(urllib.request, "urlopen", fake_urlopen) client = _make_client() resp = client._raw_call( model, [{"role": "user", "content": "hello"}], 123, 0.2, False ) request = captured["request"] assert isinstance(request, urllib.request.Request) data = request.data assert isinstance(data, bytes) body = json.loads(data.decode("utf-8")) assert isinstance(body, dict) return body, resp, captured def test_llm_config_defaults(): config = LLMConfig(base_url="https://api.example.com/v1", api_key="k") assert config.primary_model == "gpt-4o" assert config.max_tokens == 4096 assert config.temperature == 0.7 def test_llm_config_custom_values(): config = LLMConfig( base_url="https://custom.example/v1", 
api_key="custom", primary_model="o3", fallback_models=["o3-mini"], max_tokens=2048, temperature=0.1, timeout_sec=30, ) assert config.primary_model == "o3" assert config.fallback_models == ["o3-mini"] assert config.max_tokens == 2048 assert config.temperature == 0.1 assert config.timeout_sec == 30 def test_llm_response_dataclass_fields(): response = LLMResponse(content="ok", model="gpt-5.2", completion_tokens=10) assert response.content == "ok" assert response.model == "gpt-5.2" assert response.completion_tokens == 10 def test_llm_response_defaults(): response = LLMResponse(content="ok", model="gpt-5.2") assert response.prompt_tokens == 0 assert response.completion_tokens == 0 assert response.total_tokens == 0 assert response.finish_reason == "" assert response.truncated is False assert response.raw == {} def test_llm_client_initialization_stores_config(): config = LLMConfig(base_url="https://api.example.com/v1", api_key="k") client = LLMClient(config) assert client.config is config def test_llm_client_model_chain_is_primary_plus_fallbacks(): client = _make_client( primary_model="gpt-5.4", fallback_models=["gpt-4.1", "gpt-4o"] ) assert client._model_chain == ["gpt-5.4", "gpt-4.1", "gpt-4o"] def test_needs_max_completion_tokens_for_new_models(): model = "gpt-5.2" assert any(model.startswith(prefix) for prefix in _NEW_PARAM_MODELS) def test_needs_max_completion_tokens_false_for_old_models(): model = "gpt-4o" assert not any(model.startswith(prefix) for prefix in _NEW_PARAM_MODELS) def test_build_request_body_structure_via_raw_call(monkeypatch: pytest.MonkeyPatch): response = {"choices": [{"message": {"content": "x"}, "finish_reason": "stop"}]} body, _, _ = _capture_raw_call(monkeypatch, model="gpt-4o", response_data=response) assert body["model"] == "gpt-4o" assert body["messages"] == [{"role": "user", "content": "hello"}] assert body["temperature"] == 0.2 def test_build_request_uses_max_completion_tokens_for_new_models( monkeypatch: pytest.MonkeyPatch, ): response = 
{"choices": [{"message": {"content": "x"}, "finish_reason": "stop"}]} body, _, _ = _capture_raw_call(monkeypatch, model="gpt-5.2", response_data=response) # Reasoning models enforce a minimum of 32768 tokens assert body["max_completion_tokens"] == 32768 assert "max_tokens" not in body def test_build_request_uses_max_tokens_for_old_models(monkeypatch: pytest.MonkeyPatch): response = {"choices": [{"message": {"content": "x"}, "finish_reason": "stop"}]} body, _, _ = _capture_raw_call(monkeypatch, model="gpt-4.1", response_data=response) assert body["max_tokens"] == 123 assert "max_completion_tokens" not in body def test_parse_response_with_valid_payload_via_raw_call( monkeypatch: pytest.MonkeyPatch, ): response = { "model": "gpt-5.2", "choices": [{"message": {"content": "hello"}, "finish_reason": "stop"}], "usage": {"prompt_tokens": 1, "completion_tokens": 2, "total_tokens": 3}, } _, parsed, _ = _capture_raw_call( monkeypatch, model="gpt-5.2", response_data=response ) assert parsed.content == "hello" assert parsed.model == "gpt-5.2" assert parsed.prompt_tokens == 1 assert parsed.total_tokens == 3 def test_parse_response_truncated_when_finish_reason_length( monkeypatch: pytest.MonkeyPatch, ): response = { "choices": [{"message": {"content": "partial"}, "finish_reason": "length"}], "usage": {}, } _, parsed, _ = _capture_raw_call( monkeypatch, model="gpt-5.2", response_data=response ) assert parsed.finish_reason == "length" assert parsed.truncated is True def test_parse_response_missing_optional_fields_graceful( monkeypatch: pytest.MonkeyPatch, ): response = {"choices": [{"message": {"content": None}}]} _, parsed, _ = _capture_raw_call( monkeypatch, model="gpt-5.2", response_data=response ) assert parsed.content == "" assert parsed.prompt_tokens == 0 assert parsed.completion_tokens == 0 assert parsed.total_tokens == 0 assert parsed.finish_reason == "" def test_from_rc_config_builds_expected_llm_config(): rc_config = SimpleNamespace( llm=SimpleNamespace( 
base_url="https://proxy.example/v1", api_key="inline-key", api_key_env="OPENAI_API_KEY", primary_model="o3", fallback_models=("o3-mini", "gpt-4o"), ) ) client = LLMClient.from_rc_config(rc_config) assert client.config.base_url == "https://proxy.example/v1" assert client.config.api_key == "inline-key" assert client.config.primary_model == "o3" assert client.config.fallback_models == ["o3-mini", "gpt-4o"] def test_from_rc_config_reads_api_key_from_env_when_missing( monkeypatch: pytest.MonkeyPatch, ): monkeypatch.setenv("RC_TEST_API_KEY", "env-key") rc_config = SimpleNamespace( llm=SimpleNamespace( base_url="https://proxy.example/v1", api_key="", api_key_env="RC_TEST_API_KEY", primary_model="gpt-5.2", fallback_models=(), ) ) client = LLMClient.from_rc_config(rc_config) assert client.config.api_key == "env-key" def test_new_param_models_contains_expected_models(): expected = {"gpt-5", "gpt-5.1", "gpt-5.2", "gpt-5.4", "o3", "o3-mini", "o4-mini"} assert expected.issubset(_NEW_PARAM_MODELS) def test_raw_call_adds_json_mode_response_format(monkeypatch: pytest.MonkeyPatch): captured: dict[str, object] = {} def fake_urlopen(req: urllib.request.Request, timeout: int) -> _DummyHTTPResponse: captured["request"] = req return _DummyHTTPResponse({"choices": [{"message": {"content": "{}"}}]}) monkeypatch.setattr(urllib.request, "urlopen", fake_urlopen) client = _make_client() _ = client._raw_call( "gpt-5.2", [{"role": "user", "content": "json"}], 50, 0.1, True ) request = captured["request"] assert isinstance(request, urllib.request.Request) data = request.data assert isinstance(data, bytes) body = json.loads(data.decode("utf-8")) assert isinstance(body, dict) assert body["response_format"] == {"type": "json_object"} def test_raw_call_sets_auth_and_user_agent_headers(monkeypatch: pytest.MonkeyPatch): captured: dict[str, object] = {} def fake_urlopen(req: urllib.request.Request, timeout: int) -> _DummyHTTPResponse: captured["request"] = req captured["timeout"] = timeout return 
_DummyHTTPResponse({"choices": [{"message": {"content": "ok"}}]}) monkeypatch.setattr(urllib.request, "urlopen", fake_urlopen) client = _make_client(api_key="secret", timeout_sec=77) _ = client._raw_call("gpt-5.2", [{"role": "user", "content": "hi"}], 20, 0.6, False) request = captured["request"] assert isinstance(request, urllib.request.Request) headers = {k.lower(): v for k, v in request.headers.items()} assert headers["authorization"] == "Bearer secret" assert "user-agent" in headers timeout = captured["timeout"] assert timeout == 77 def test_chat_prepends_system_message(monkeypatch: pytest.MonkeyPatch): captured: dict[str, list[dict[str, str]]] = {} def fake_raw_call( self: LLMClient, model: str, messages: list[dict[str, str]], max_tokens: int, temperature: float, json_mode: bool, ) -> LLMResponse: captured["messages"] = messages return LLMResponse(content="ok", model=model) monkeypatch.setattr(LLMClient, "_raw_call", fake_raw_call) client = _make_client(primary_model="gpt-5.2", fallback_models=["gpt-4o"]) client.chat([{"role": "user", "content": "q"}], system="sys") assert captured["messages"][0] == {"role": "system", "content": "sys"} def test_chat_uses_fallback_after_first_model_error(monkeypatch: pytest.MonkeyPatch): calls: list[str] = [] def fake_call_with_retry( self: LLMClient, model: str, messages: list[dict[str, str]], max_tokens: int, temperature: float, json_mode: bool, ) -> LLMResponse: _ = (self, messages, max_tokens, temperature, json_mode) calls.append(model) if model == "gpt-5.2": raise RuntimeError("first failed") return LLMResponse(content="ok", model=model) monkeypatch.setattr(LLMClient, "_call_with_retry", fake_call_with_retry) client = _make_client(primary_model="gpt-5.2", fallback_models=["gpt-5.1"]) response = client.chat([{"role": "user", "content": "x"}]) assert calls == ["gpt-5.2", "gpt-5.1"] assert response.model == "gpt-5.1" ================================================ FILE: tests/test_rc_novelty.py 
================================================

"""Tests for researchclaw.literature.novelty — novelty detection module."""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any
from unittest.mock import MagicMock, patch

import pytest

from researchclaw.literature.novelty import (
    _assess_novelty,
    _build_novelty_queries,
    _compute_similarity,
    _extract_keywords,
    _jaccard_keywords,
    check_novelty,
)


# ---------------------------------------------------------------------------
# _extract_keywords
# ---------------------------------------------------------------------------


class TestExtractKeywords:
    """Unit tests for the keyword extraction helper (lowercasing, stop-word
    and short-token filtering, de-duplication, order preservation)."""

    def test_basic_extraction(self) -> None:
        # Content words survive and are lowercased.
        kws = _extract_keywords("Transformer attention mechanisms for NLP")
        assert "transformer" in kws
        assert "attention" in kws
        assert "mechanisms" in kws
        assert "nlp" in kws

    def test_stop_words_removed(self) -> None:
        kws = _extract_keywords("the model is a new approach for data")
        # "the", "is", "a", "new", "approach", "for", "data", "model" are stop words
        assert "the" not in kws
        assert "is" not in kws

    def test_short_tokens_removed(self) -> None:
        kws = _extract_keywords("AI ML RL deep reinforcement learning")
        # "AI", "ML", "RL" are only 2 chars → removed
        assert "ai" not in kws
        assert "deep" in kws
        assert "reinforcement" in kws

    def test_deduplication(self) -> None:
        # Repeated words collapse to a single keyword entry.
        kws = _extract_keywords("attention attention attention mechanism")
        assert kws.count("attention") == 1

    def test_empty_input(self) -> None:
        assert _extract_keywords("") == []

    def test_preserves_order(self) -> None:
        # First-seen order is kept (not sorted, not set-randomized).
        kws = _extract_keywords("alpha beta gamma delta")
        assert kws == ["alpha", "beta", "gamma", "delta"]


# ---------------------------------------------------------------------------
# _jaccard_keywords
# ---------------------------------------------------------------------------


class TestJaccardKeywords:
    """Jaccard similarity (|A∩B| / |A∪B|) over keyword lists, including the
    degenerate empty-input cases which must return 0.0 rather than divide by zero."""

    def test_identical_sets(self) -> None:
        assert _jaccard_keywords(["a", "b", "c"], ["a", "b", "c"]) == 1.0

    def test_disjoint_sets(self) -> None:
        assert _jaccard_keywords(["a", "b"], ["c", "d"]) == 0.0

    def test_partial_overlap(self) -> None:
        # {a, b, c} & {b, c, d} = {b, c} / {a, b, c, d} = 2/4 = 0.5
        assert _jaccard_keywords(["a", "b", "c"], ["b", "c", "d"]) == 0.5

    def test_empty_first(self) -> None:
        assert _jaccard_keywords([], ["a", "b"]) == 0.0

    def test_empty_second(self) -> None:
        assert _jaccard_keywords(["a", "b"], []) == 0.0

    def test_both_empty(self) -> None:
        assert _jaccard_keywords([], []) == 0.0


# ---------------------------------------------------------------------------
# _compute_similarity
# ---------------------------------------------------------------------------


class TestComputeSimilarity:
    """Similarity of a keyword list against a paper's title + abstract.
    Exact scoring is implementation-defined; tests only pin ordering/bounds."""

    def test_returns_float_0_to_1(self) -> None:
        sim = _compute_similarity(
            ["transformer", "attention"],
            "Transformer Attention in NLP",
            "We study attention mechanisms in transformer models.",
        )
        assert 0.0 <= sim <= 1.0

    def test_high_similarity_for_matching_content(self) -> None:
        kws = ["transformer", "attention", "mechanisms", "self-attention"]
        sim = _compute_similarity(
            kws,
            "Self-Attention Mechanisms in Transformers",
            "This paper studies transformer self-attention mechanisms in detail.",
        )
        assert sim > 0.1  # should have meaningful overlap

    def test_low_similarity_for_unrelated_content(self) -> None:
        kws = ["quantum", "computing", "entanglement", "qubit"]
        sim = _compute_similarity(
            kws,
            "Deep Learning for Image Classification",
            "We propose a convolutional neural network for classifying images.",
        )
        assert sim < 0.1

    def test_empty_keywords(self) -> None:
        # No keywords → no basis for overlap → exactly 0.0.
        sim = _compute_similarity([], "Some title", "Some abstract")
        assert sim == 0.0


# ---------------------------------------------------------------------------
# _build_novelty_queries
# ---------------------------------------------------------------------------


class TestBuildNoveltyQueries:
    """Query construction: topic first, then hypothesis titles parsed from
    '## Hn:' headings, capped at 5 total, short titles skipped."""

    def test_includes_topic(self) -> None:
        queries = _build_novelty_queries("Reinforcement Learning", "No hypotheses")
        assert queries[0] == "Reinforcement Learning"

    def test_extracts_hypothesis_titles(self) -> None:
        hyp_text = (
            "## H1: Adaptive learning rates improve convergence\n"
            "Details about H1...\n\n"
            "## H2: Curriculum learning reduces sample complexity\n"
            "Details about H2...\n"
        )
        queries = _build_novelty_queries("RL topic", hyp_text)
        assert len(queries) >= 3  # topic + H1 + H2

    def test_caps_at_5(self) -> None:
        hyp_text = "\n".join(
            f"## H{i}: Hypothesis number {i} with enough text to pass length filter"
            for i in range(1, 10)
        )
        queries = _build_novelty_queries("Topic", hyp_text)
        assert len(queries) <= 5

    def test_skips_short_titles(self) -> None:
        hyp_text = "## H1: Short\n## H2: This is a longer hypothesis title\n"
        queries = _build_novelty_queries("Topic", hyp_text)
        # "Short" is < 10 chars → excluded
        assert not any("Short" in q for q in queries)

    def test_empty_hypotheses(self) -> None:
        # Even with no hypotheses, the topic itself is always queried.
        queries = _build_novelty_queries("Topic", "")
        assert len(queries) >= 1
        assert queries[0] == "Topic"


# ---------------------------------------------------------------------------
# _assess_novelty
# ---------------------------------------------------------------------------


class TestAssessNovelty:
    """Scoring of similar-paper evidence into a (score, assessment) pair.
    Score is bounded to [0, 1]; assessment buckets: high/moderate/low/critical."""

    def test_no_similar_papers_is_high(self) -> None:
        score, assessment = _assess_novelty([], 0.25)
        assert score == 1.0
        assert assessment == "high"

    def test_moderate_similarity(self) -> None:
        papers = [{"similarity": 0.35, "citation_count": 10}]
        score, assessment = _assess_novelty(papers, 0.25)
        assert 0.45 <= score <= 0.85
        assert assessment in ("high", "moderate")

    def test_high_similarity_low_novelty(self) -> None:
        papers = [{"similarity": 0.8, "citation_count": 200}]
        score, assessment = _assess_novelty(papers, 0.25)
        assert score <= 0.3
        assert assessment in ("low", "critical")

    def test_multiple_high_impact_overlaps_penalize(self) -> None:
        papers = [
            {"similarity": 0.5, "citation_count": 100},
            {"similarity": 0.45, "citation_count": 80},
            {"similarity": 0.42, "citation_count": 60},
        ]
        score, _ = _assess_novelty(papers, 0.25)
        # Should be penalized for multiple high-citation overlaps
        assert score < 0.6

    def test_score_bounded_0_to_1(self) -> None:
        # Extreme inputs must not push the score outside [0, 1].
        papers = [{"similarity": 0.99, "citation_count": 5000}]
        score, _ = _assess_novelty(papers, 0.25)
        assert 0.0 <= score <= 1.0

    def test_critical_assessment(self) -> None:
        papers = [
            {"similarity": 0.9, "citation_count": 200},
            {"similarity": 0.85, "citation_count": 150},
        ]
        score, assessment = _assess_novelty(papers, 0.25)
        assert assessment == "critical"
        assert score < 0.25


# ---------------------------------------------------------------------------
# check_novelty (integration)
# ---------------------------------------------------------------------------


class TestCheckNovelty:
    """Integration tests for check_novelty — mocks the real API calls."""

    @patch("researchclaw.literature.search.search_papers_multi_query")
    def test_basic_flow(self, mock_search: MagicMock) -> None:
        """Smoke test: no similar papers found → high novelty."""
        mock_search.return_value = []
        result = check_novelty(
            topic="Novel quantum-inspired optimization",
            hypotheses_text="## H1: Quantum tunneling improves escape from local minima\n",
        )
        assert isinstance(result, dict)
        assert result["novelty_score"] == 1.0
        assert result["assessment"] in ("high", "insufficient_data")
        assert result["recommendation"] in ("proceed", "proceed_with_caution")
        assert result["topic"] == "Novel quantum-inspired optimization"
        assert "generated" in result

    @patch("researchclaw.literature.search.search_papers_multi_query")
    def test_with_similar_papers(self, mock_search: MagicMock) -> None:
        """Papers with keyword overlap → lower novelty."""
        # Create a mock paper with overlapping keywords
        mock_paper = MagicMock()
        mock_paper.title = "Quantum-Inspired Optimization for Combinatorial Problems"
        mock_paper.abstract = (
            "We propose quantum-inspired optimization methods "
            "using tunneling and superposition analogies to escape local minima."
        )
        mock_paper.paper_id = "abc123"
        mock_paper.year = 2024
        mock_paper.venue = "NeurIPS"
        mock_paper.citation_count = 45
        mock_paper.url = "https://example.com/paper"
        mock_paper.cite_key = "abc2024quantum"
        mock_search.return_value = [mock_paper]
        result = check_novelty(
            topic="Quantum-inspired optimization",
            hypotheses_text="## H1: Quantum tunneling improves escape from local minima\n",
        )
        assert result["similar_papers_found"] >= 0
        assert 0.0 <= result["novelty_score"] <= 1.0

    @patch("researchclaw.literature.search.search_papers_multi_query")
    def test_with_pipeline_papers(self, mock_search: MagicMock) -> None:
        """Papers from candidates.jsonl also checked for overlap."""
        mock_search.return_value = []
        pipeline_papers = [
            {
                "title": "Adaptive Learning Rate Schedules via Meta-Learning",
                "abstract": "We study adaptive learning rate schedules.",
                "paper_id": "p1",
                "year": 2023,
                "venue": "ICML",
                "citation_count": 30,
                "url": "https://example.com",
                "cite_key": "p12023",
            },
        ]
        result = check_novelty(
            topic="Adaptive learning rate schedules",
            hypotheses_text="## H1: Meta-learning adaptive learning rate schedules\n",
            papers_already_seen=pipeline_papers,
        )
        assert isinstance(result, dict)
        assert "similar_papers" in result

    @patch("researchclaw.literature.search.search_papers_multi_query")
    def test_search_failure_graceful(self, mock_search: MagicMock) -> None:
        """API failure should not crash — falls back to pipeline papers."""
        mock_search.side_effect = RuntimeError("API down")
        result = check_novelty(
            topic="Some topic",
            hypotheses_text="## H1: Some hypothesis with enough text\n",
        )
        assert isinstance(result, dict)
        assert "novelty_score" in result

    @patch("researchclaw.literature.search.search_papers_multi_query")
    def test_output_keys_complete(self, mock_search: MagicMock) -> None:
        """All expected keys present in output."""
        mock_search.return_value = []
        result = check_novelty(
            topic="Test topic",
            hypotheses_text="Some hypotheses text",
        )
        expected_keys = {
            "topic",
            "hypotheses_checked",
            "search_queries",
            "similar_papers_found",
            "novelty_score",
            "assessment",
            "similar_papers",
            "recommendation",
            "similarity_threshold",
            "search_coverage",
            "total_papers_retrieved",
            "generated",
        }
        assert expected_keys == set(result.keys())

    @patch("researchclaw.literature.search.search_papers_multi_query")
    def test_recommendation_values(self, mock_search: MagicMock) -> None:
        """Recommendation must be one of proceed/differentiate/abort/proceed_with_caution."""
        mock_search.return_value = []
        result = check_novelty(
            topic="Test",
            hypotheses_text="## H1: Hypothesis one\n",
        )
        assert result["recommendation"] in ("proceed", "differentiate", "abort", "proceed_with_caution")

    @patch("researchclaw.literature.search.search_papers_multi_query")
    def test_json_serializable(self, mock_search: MagicMock) -> None:
        """Output must be JSON-serializable for writing to novelty_report.json."""
        mock_search.return_value = []
        result = check_novelty(
            topic="JSON test",
            hypotheses_text="## H1: Test hypothesis title is long enough\n",
        )
        serialized = json.dumps(result)
        assert isinstance(serialized, str)

    @patch("researchclaw.literature.search.search_papers_multi_query")
    def test_similar_papers_capped_at_20(self, mock_search: MagicMock) -> None:
        """Output similar_papers list capped at 20."""
        # Create many mock papers
        papers = []
        for i in range(40):
            p = MagicMock()
            p.title = f"Paper about optimization variant {i}"
            p.abstract = "Optimization variant study"
            p.paper_id = f"id_{i}"
            p.year = 2024
            p.venue = "Conf"
            p.citation_count = 10
            p.url = f"https://example.com/{i}"
            p.cite_key = f"key{i}"
            papers.append(p)
        mock_search.return_value = papers
        result = check_novelty(
            topic="optimization",
            hypotheses_text="## H1: Optimization variants improve performance\n",
            similarity_threshold=0.0,  # low threshold → many matches
        )
        assert len(result["similar_papers"]) <= 20


# ---------------------------------------------------------------------------
# Executor integration — _execute_hypothesis_gen with novelty check
#
# ---------------------------------------------------------------------------


class TestHypothesisGenNoveltyIntegration:
    """Test that _execute_hypothesis_gen integrates novelty check correctly."""

    def test_novelty_report_written_when_available(self, tmp_path: Path) -> None:
        """Hypothesis gen should write novelty_report.json when check succeeds."""
        from researchclaw.pipeline.executor import _execute_hypothesis_gen
        from researchclaw.adapters import AdapterBundle
        from researchclaw.config import RCConfig

        # Set up minimal run directory
        run_dir = tmp_path / "run"
        run_dir.mkdir()
        stage_dir = run_dir / "stage-08"
        stage_dir.mkdir()
        # Create synthesis artifact from prior stage
        stage_07 = run_dir / "stage-07"
        stage_07.mkdir()
        (stage_07 / "synthesis.md").write_text("## Synthesis\nSome synthesis content.")
        data = {
            "project": {"name": "novelty-test", "mode": "docs-first"},
            "research": {"topic": "novelty testing"},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local"},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {},
            "llm": {
                "provider": "openai-compatible",
                "base_url": "http://localhost:1234/v1",
                "api_key_env": "RC_TEST_KEY",
                "api_key": "inline",
            },
        }
        config = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        adapters = AdapterBundle()
        with patch(
            "researchclaw.literature.search.search_papers_multi_query"
        ) as mock_search:
            mock_search.return_value = []
            result = _execute_hypothesis_gen(stage_dir, run_dir, config, adapters)
        assert result.stage.name == "HYPOTHESIS_GEN"
        assert result.status.name == "DONE"
        # hypotheses.md always written
        assert (stage_dir / "hypotheses.md").exists()
        # novelty_report.json should be written (API mocked as returning empty)
        assert (stage_dir / "novelty_report.json").exists()
        report = json.loads((stage_dir / "novelty_report.json").read_text())
        assert report["novelty_score"] == 1.0  # no similar papers → max novelty
        assert "novelty_report.json" in result.artifacts

    def test_novelty_failure_does_not_block(self, tmp_path: Path) -> None:
        """If novelty check crashes, hypothesis gen still succeeds."""
        from researchclaw.pipeline.executor import _execute_hypothesis_gen
        from researchclaw.adapters import AdapterBundle
        from researchclaw.config import RCConfig

        run_dir = tmp_path / "run"
        run_dir.mkdir()
        stage_dir = run_dir / "stage-08"
        stage_dir.mkdir()
        stage_07 = run_dir / "stage-07"
        stage_07.mkdir()
        (stage_07 / "synthesis.md").write_text("## Synthesis\nContent.")
        data = {
            "project": {"name": "novelty-test", "mode": "docs-first"},
            "research": {"topic": "novelty testing"},
            "runtime": {"timezone": "UTC"},
            "notifications": {"channel": "local"},
            "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")},
            "openclaw_bridge": {},
            "llm": {
                "provider": "openai-compatible",
                "base_url": "http://localhost:1234/v1",
                "api_key_env": "RC_TEST_KEY",
                "api_key": "inline",
            },
        }
        config = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False)
        adapters = AdapterBundle()
        with patch(
            "researchclaw.literature.novelty.check_novelty",
            side_effect=RuntimeError("Novelty check exploded"),
        ):
            result = _execute_hypothesis_gen(stage_dir, run_dir, config, adapters)
        assert result.status.name == "DONE"
        assert (stage_dir / "hypotheses.md").exists()
        # novelty_report.json NOT written since check failed
        assert not (stage_dir / "novelty_report.json").exists()
        assert "novelty_report.json" not in result.artifacts



================================================
FILE: tests/test_rc_preflight.py
================================================
from __future__ import annotations

import urllib.error
from email.message import Message
from unittest.mock import patch

from researchclaw.llm.client import LLMClient, LLMConfig, LLMResponse


def _make_client(
    *,
    base_url: str = "https://api.example.com/v1",
    api_key: str = "test-key",
    primary_model: str = "gpt-test",
    fallback_models: list[str] | None = None,
    max_retries: int = 1,
) -> LLMClient:
    # Test helper: build an LLMClient with sensible defaults, keyword-only
    # so individual tests override just what they exercise.
    return LLMClient(
        LLMConfig(
            base_url=base_url,
            api_key=api_key,
            primary_model=primary_model,
            fallback_models=fallback_models or [],
            max_retries=max_retries,
        )
    )


class TestPreflight:
    """LLMClient.preflight() returns (ok, message); each HTTP failure mode
    must map to a distinct, human-readable diagnostic. chat() is patched so
    no network traffic occurs."""

    def test_preflight_success(self):
        client = _make_client()
        mock_resp = LLMResponse(content="pong", model="gpt-test")
        with patch.object(client, "chat", return_value=mock_resp):
            ok, msg = client.preflight()
        assert ok is True
        assert "OK" in msg
        assert "gpt-test" in msg

    def test_preflight_401_invalid_key(self):
        client = _make_client()
        err = urllib.error.HTTPError("url", 401, "Unauthorized", Message(), None)
        with patch.object(client, "chat", side_effect=err):
            ok, msg = client.preflight()
        assert ok is False
        assert "Invalid API key" in msg

    def test_preflight_403_model_forbidden(self):
        client = _make_client()
        err = urllib.error.HTTPError("url", 403, "Forbidden", Message(), None)
        with patch.object(client, "chat", side_effect=err):
            ok, msg = client.preflight()
        assert ok is False
        assert "not allowed" in msg

    def test_preflight_404_bad_endpoint(self):
        client = _make_client()
        err = urllib.error.HTTPError("url", 404, "Not Found", Message(), None)
        with patch.object(client, "chat", side_effect=err):
            ok, msg = client.preflight()
        assert ok is False
        assert "Endpoint not found" in msg

    def test_preflight_429_rate_limited(self):
        client = _make_client()
        err = urllib.error.HTTPError("url", 429, "Too Many Requests", Message(), None)
        with patch.object(client, "chat", side_effect=err):
            ok, msg = client.preflight()
        assert ok is False
        assert "Rate limited" in msg

    def test_preflight_timeout(self):
        # URLError (no HTTP status) models connection-level failures.
        client = _make_client()
        err = urllib.error.URLError("timeout")
        with patch.object(client, "chat", side_effect=err):
            ok, msg = client.preflight()
        assert ok is False
        assert "Connection failed" in msg

    def test_preflight_all_models_failed(self):
        # A non-urllib RuntimeError from chat() is surfaced verbatim.
        client = _make_client()
        err = RuntimeError("All models failed. Last error: ...")
        with patch.object(client, "chat", side_effect=err):
            ok, msg = client.preflight()
        assert ok is False
        assert "All models failed" in msg

    def test_preflight_unknown_http_error(self):
        # Unmapped HTTP codes fall back to a generic "HTTP <code>" message.
        client = _make_client()
        err = urllib.error.HTTPError("url", 500, "Server Error", Message(), None)
        with patch.object(client, "chat", side_effect=err):
            ok, msg = client.preflight()
        assert ok is False
        assert "HTTP 500" in msg



================================================
FILE: tests/test_rc_prompts.py
================================================
"""Tests for researchclaw.prompts — PromptManager and template rendering."""

from __future__ import annotations

import textwrap
from pathlib import Path

import pytest
import yaml

from researchclaw.prompts import (
    PromptManager,
    RenderedPrompt,
    _render,
)


# ---------------------------------------------------------------------------
# _render() — template variable substitution
# ---------------------------------------------------------------------------


class TestRender:
    """_render substitutes {identifier} placeholders from a mapping, while
    leaving unknown variables, JSON-schema-style braces, and code braces intact."""

    def test_simple_substitution(self) -> None:
        assert _render("Hello {name}!", {"name": "World"}) == "Hello World!"

    def test_multiple_variables(self) -> None:
        result = _render(
            "Topic: {topic}, Domain: {domain}", {"topic": "RL", "domain": "ML"}
        )
        assert result == "Topic: RL, Domain: ML"

    def test_missing_variable_left_untouched(self) -> None:
        # Unknown placeholders pass through unchanged (no KeyError).
        assert _render("Value: {unknown}", {}) == "Value: {unknown}"

    def test_json_schema_not_substituted(self) -> None:
        # A brace group containing ':' is treated as literal JSON schema text,
        # even if a variable of the same name exists in the mapping.
        template = "Return JSON: {candidates:[...]} with >=8 rows."
        assert _render(template, {"candidates": "SHOULD_NOT_APPEAR"}) == template

    def test_json_schema_complex_not_substituted(self) -> None:
        template = "Schema: {score_1_to_10:number, verdict:string}"
        assert _render(template, {}) == template

    def test_curly_braces_in_code_not_substituted(self) -> None:
        template = "def foo(): { return 1; }"
        assert _render(template, {}) == template

    def test_underscore_variable(self) -> None:
        assert _render("{my_var}", {"my_var": "ok"}) == "ok"

    def test_numeric_suffix(self) -> None:
        assert _render("{score_1}", {"score_1": "9"}) == "9"

    def test_empty_template(self) -> None:
        assert _render("", {"x": "y"}) == ""

    def test_no_placeholders(self) -> None:
        assert _render("No variables here", {"x": "y"}) == "No variables here"


# ---------------------------------------------------------------------------
# PromptManager — defaults
# ---------------------------------------------------------------------------


class TestPromptManagerDefaults:
    """Built-in (no-override) prompt catalog: stage coverage, json_mode and
    max_tokens flags, reusable blocks, and sub-prompts."""

    def test_all_stages_present(self) -> None:
        """20 stages have for_stage() prompts; iterative_refine uses sub_prompts only."""
        pm = PromptManager()
        names = pm.stage_names()
        assert len(names) >= 20
        for required in [
            "topic_init",
            "problem_decompose",
            "search_strategy",
            "literature_collect",
            "literature_screen",
            "knowledge_extract",
            "synthesis",
            "hypothesis_gen",
            "experiment_design",
            "code_generation",
            "resource_planning",
            "result_analysis",
            "research_decision",
            "paper_outline",
            "paper_draft",
            "peer_review",
            "paper_revision",
            "quality_gate",
            "knowledge_archive",
            "export_publish",
        ]:
            assert pm.has_stage(required), f"Missing stage: {required}"

    def test_system_prompt_nonempty(self) -> None:
        pm = PromptManager()
        for name in pm.stage_names():
            assert pm.system(name), f"Empty system prompt for {name}"

    def test_for_stage_returns_rendered_prompt(self) -> None:
        pm = PromptManager()
        sp = pm.for_stage(
            "topic_init",
            topic="RL",
            domains="ml",
            project_name="test",
            quality_threshold="4.0",
        )
        assert isinstance(sp, RenderedPrompt)
        assert "RL" in sp.user
        assert "ml" in sp.user
        assert sp.system

    def test_json_mode_stages(self) -> None:
        pm = PromptManager()
        json_stages = [
            "search_strategy",
            "literature_collect",
            "literature_screen",
            "knowledge_extract",
            "resource_planning",
            "quality_gate",
        ]
        for stage in json_stages:
            assert pm.json_mode(stage), f"{stage} should have json_mode=True"

    def test_non_json_stages(self) -> None:
        pm = PromptManager()
        assert not pm.json_mode("topic_init")
        assert not pm.json_mode("synthesis")

    def test_max_tokens(self) -> None:
        # Long-output stages carry explicit token budgets; others default to None.
        pm = PromptManager()
        assert pm.max_tokens("code_generation") == 8192
        assert pm.max_tokens("paper_draft") == 16384
        assert pm.max_tokens("topic_init") is None

    def test_block_topic_constraint(self) -> None:
        pm = PromptManager()
        block = pm.block("topic_constraint", topic="Neural Architecture Search")
        assert "Neural Architecture Search" in block
        assert "HARD TOPIC CONSTRAINT" in block

    def test_block_pkg_hint(self) -> None:
        pm = PromptManager()
        block = pm.block("pkg_hint_sandbox")
        assert "numpy" in block
        assert "torch" in block  # mentioned as prohibited

    def test_sub_prompt_code_repair(self) -> None:
        pm = PromptManager()
        rp = pm.sub_prompt(
            "code_repair",
            fname="model.py",
            issues_text="SyntaxError",
            all_files_ctx="...",
        )
        assert "model.py" in rp.user
        assert "SyntaxError" in rp.user
        assert rp.system

    def test_sub_prompt_iterative_improve(self) -> None:
        pm = PromptManager()
        ip = pm.sub_prompt(
            "iterative_improve",
            metric_key="val_loss",
            metric_direction="minimize",
            files_context="...",
            run_summaries="...",
        )
        assert "val_loss" in ip.user
        assert "minimize" in ip.user

    def test_sub_prompt_iterative_repair(self) -> None:
        pm = PromptManager()
        irp = pm.sub_prompt(
            "iterative_repair", issue_text="import error", all_files_ctx="..."
        )
        assert "import error" in irp.user


# ---------------------------------------------------------------------------
# PromptManager — YAML override
# ---------------------------------------------------------------------------


class TestPromptManagerOverrides:
    """A user-supplied YAML file can override individual stage prompts and
    blocks; everything not mentioned keeps its built-in default.

    NOTE(review): the YAML nesting inside the dedent literals below was
    reconstructed with 2-space indentation (the source dump flattened it) —
    confirm against the original file if byte-exact strings matter."""

    def test_override_system_prompt(self, tmp_path: Path) -> None:
        yaml_content = textwrap.dedent("""\
        stages:
          topic_init:
            system: "You are a custom planner."
        """)
        override_file = tmp_path / "custom.yaml"
        override_file.write_text(yaml_content, encoding="utf-8")
        pm = PromptManager(override_file)
        assert pm.system("topic_init") == "You are a custom planner."
        # Other stages should keep defaults
        assert pm.system("problem_decompose") == "You are a senior research strategist."

    def test_override_user_template(self, tmp_path: Path) -> None:
        yaml_content = textwrap.dedent("""\
        stages:
          topic_init:
            user: "Custom prompt for {topic}."
        """)
        override_file = tmp_path / "custom.yaml"
        override_file.write_text(yaml_content, encoding="utf-8")
        pm = PromptManager(override_file)
        result = pm.user("topic_init", topic="GAN")
        assert result == "Custom prompt for GAN."

    def test_override_block(self, tmp_path: Path) -> None:
        yaml_content = textwrap.dedent("""\
        blocks:
          topic_constraint: "Stay focused on {topic}."
        """)
        override_file = tmp_path / "custom.yaml"
        override_file.write_text(yaml_content, encoding="utf-8")
        pm = PromptManager(override_file)
        assert pm.block("topic_constraint", topic="NAS") == "Stay focused on NAS."
def test_override_json_mode(self, tmp_path: Path) -> None: yaml_content = textwrap.dedent("""\ stages: topic_init: json_mode: true """) override_file = tmp_path / "custom.yaml" override_file.write_text(yaml_content, encoding="utf-8") pm = PromptManager(override_file) assert pm.json_mode("topic_init") is True def test_missing_file_uses_defaults(self, tmp_path: Path) -> None: pm = PromptManager(tmp_path / "nonexistent.yaml") assert pm.has_stage("topic_init") assert pm.system("topic_init") def test_invalid_yaml_uses_defaults(self, tmp_path: Path) -> None: bad_file = tmp_path / "bad.yaml" bad_file.write_text(": invalid: yaml: [", encoding="utf-8") pm = PromptManager(bad_file) assert pm.has_stage("topic_init") def test_unknown_stage_in_override_ignored(self, tmp_path: Path) -> None: yaml_content = textwrap.dedent("""\ stages: nonexistent_stage: system: "Should be ignored." """) override_file = tmp_path / "custom.yaml" override_file.write_text(yaml_content, encoding="utf-8") # Should not raise pm = PromptManager(override_file) assert not pm.has_stage("nonexistent_stage") # --------------------------------------------------------------------------- # PromptManager — export_yaml # --------------------------------------------------------------------------- class TestExportYaml: def test_export_roundtrip(self, tmp_path: Path) -> None: pm1 = PromptManager() export_path = tmp_path / "exported.yaml" pm1.export_yaml(export_path) assert export_path.exists() # Load it back — should parse cleanly data = yaml.safe_load(export_path.read_text(encoding="utf-8")) assert "stages" in data assert "blocks" in data assert "version" in data def test_export_contains_all_stages(self, tmp_path: Path) -> None: pm = PromptManager() export_path = tmp_path / "exported.yaml" pm.export_yaml(export_path) data = yaml.safe_load(export_path.read_text(encoding="utf-8")) for stage in pm.stage_names(): assert stage in data["stages"], f"Missing {stage} in export" def test_export_with_overrides(self, tmp_path: 
Path) -> None: override_file = tmp_path / "custom.yaml" override_file.write_text( "stages:\n topic_init:\n system: CUSTOM\n", encoding="utf-8", ) pm = PromptManager(override_file) export_path = tmp_path / "exported.yaml" pm.export_yaml(export_path) data = yaml.safe_load(export_path.read_text(encoding="utf-8")) assert data["stages"]["topic_init"]["system"] == "CUSTOM" # --------------------------------------------------------------------------- # RenderedPrompt dataclass # --------------------------------------------------------------------------- class TestRenderedPrompt: def test_defaults(self) -> None: rp = RenderedPrompt(system="sys", user="usr") assert rp.json_mode is False assert rp.max_tokens is None def test_with_options(self) -> None: rp = RenderedPrompt(system="s", user="u", json_mode=True, max_tokens=4096) assert rp.json_mode is True assert rp.max_tokens == 4096 def test_frozen(self) -> None: rp = RenderedPrompt(system="s", user="u") with pytest.raises(AttributeError): rp.system = "modified" # type: ignore[misc] ================================================ FILE: tests/test_rc_quality.py ================================================ """Tests for content quality assessment.""" from __future__ import annotations # pyright: reportMissingImports=false, reportUnknownVariableType=false, reportUnknownMemberType=false, reportUnknownArgumentType=false from researchclaw.quality import ( assess_quality, check_strict_quality, compute_template_ratio, detect_template_content, ) REAL_ABSTRACT = ( "We propose a novel method for protein structure prediction using " "graph neural networks. Our approach achieves state-of-the-art results " "on the CASP14 benchmark with 3.2 GDT-TS improvement over AlphaFold2. " "We demonstrate that incorporating side-chain interactions as graph " "edges significantly improves local structure accuracy." ) TEMPLATE_ABSTRACT = ( "Template abstract: This section will describe the main contributions " "of our work. 
[INSERT your abstract here]. We will discuss the results " "in the following sections. Replace this text with your actual content." ) MIXED_CONTENT = ( "We propose a novel method for protein structure prediction.\n" "[TODO: Add more details about the method]\n" "Our experiments show significant improvements over baselines.\n" "Template introduction: This section will describe the background." ) REAL_PAPER_SECTION = ( "## Introduction\n\n" "Recent advances in large language models have demonstrated remarkable " "capabilities in natural language understanding and generation. However, " "these models often struggle with factual consistency and hallucinate " "information. In this work, we address this limitation by introducing " "a retrieval-augmented generation framework that grounds model outputs " "in verified knowledge sources.\n\n" "Our key contributions are:\n" "1. A novel attention mechanism for integrating retrieved passages\n" "2. A training procedure that incentivizes factual consistency\n" "3. Comprehensive evaluation on three benchmark datasets" ) class TestDetectTemplateContent: def test_real_text_no_matches(self): matches = detect_template_content(REAL_ABSTRACT) assert len(matches) == 0 def test_template_text_has_matches(self): matches = detect_template_content(TEMPLATE_ABSTRACT) assert len(matches) >= 3 def test_detects_insert_placeholder(self): text = "The results show [INSERT your results here] improvement." matches = detect_template_content(text) assert any("Insert placeholder" in m.pattern_desc for m in matches) def test_detects_todo_placeholder(self): text = "Method description [TODO: complete this section]." matches = detect_template_content(text) assert any("TODO" in m.pattern_desc for m in matches) def test_detects_template_section(self): text = "Template introduction: This paper presents our work." 
matches = detect_template_content(text) assert any("Template section" in m.pattern_desc for m in matches) def test_detects_future_tense_placeholder(self): text = "This section will describe the methodology in detail." matches = detect_template_content(text) assert any("Future-tense" in m.pattern_desc for m in matches) def test_detects_lorem_ipsum(self): text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit." matches = detect_template_content(text) assert any("Lorem ipsum" in m.pattern_desc for m in matches) def test_match_has_line_number(self): text = "Good line\n[TODO: fix this]\nAnother good line" matches = detect_template_content(text) assert len(matches) == 1 assert matches[0].line_number == 2 def test_real_paper_section_clean(self): matches = detect_template_content(REAL_PAPER_SECTION) assert len(matches) == 0 def test_empty_text(self): matches = detect_template_content("") assert len(matches) == 0 class TestComputeTemplateRatio: def test_real_text_low_ratio(self): ratio = compute_template_ratio(REAL_ABSTRACT) assert ratio < 0.05 def test_template_text_high_ratio(self): ratio = compute_template_ratio(TEMPLATE_ABSTRACT) assert ratio > 0.5 def test_mixed_content_moderate_ratio(self): ratio = compute_template_ratio(MIXED_CONTENT) assert 0.1 < ratio < 0.9 def test_empty_text_zero_ratio(self): ratio = compute_template_ratio("") assert ratio == 0.0 def test_ratio_bounded_0_1(self): ratio = compute_template_ratio(TEMPLATE_ABSTRACT) assert 0.0 <= ratio <= 1.0 def test_real_paper_section_low_ratio(self): ratio = compute_template_ratio(REAL_PAPER_SECTION) assert ratio < 0.05 class TestAssessQuality: def test_report_has_all_fields(self): report = assess_quality(REAL_ABSTRACT) assert report.total_lines > 0 assert report.total_chars > 0 assert isinstance(report.template_ratio, float) assert isinstance(report.template_matches, tuple) def test_report_to_dict(self): report = assess_quality(MIXED_CONTENT) d = report.to_dict() assert "template_ratio" in d assert 
"template_matches" in d assert "has_template_content" in d assert "match_count" in d def test_report_has_template_flag(self): report = assess_quality(TEMPLATE_ABSTRACT) assert report.has_template_content is True report2 = assess_quality(REAL_ABSTRACT) assert report2.has_template_content is False class TestCheckStrictQuality: def test_real_text_passes(self): passed, _msg = check_strict_quality(REAL_ABSTRACT) assert passed is True def test_template_text_fails(self): passed, msg = check_strict_quality(TEMPLATE_ABSTRACT) assert passed is False assert "Template content detected" in msg def test_custom_threshold(self): passed, _msg = check_strict_quality(TEMPLATE_ABSTRACT, threshold=1.0) assert passed is True def test_failure_message_includes_examples(self): _passed, msg = check_strict_quality(TEMPLATE_ABSTRACT) assert "L" in msg ================================================ FILE: tests/test_rc_report.py ================================================ # pyright: basic, reportMissingImports=false, reportUnusedCallResult=false from __future__ import annotations import json from pathlib import Path import pytest from researchclaw.report import generate_report class TestReport: def test_report_missing_run_dir(self, tmp_path: Path): with pytest.raises(FileNotFoundError): generate_report(tmp_path / "nonexistent") def test_report_no_summary(self, tmp_path: Path): with pytest.raises(ValueError, match="pipeline_summary"): generate_report(tmp_path) def test_report_minimal_run(self, tmp_path: Path): (tmp_path / "pipeline_summary.json").write_text( json.dumps( { "run_id": "rc-test-123", "stages_executed": 23, "stages_done": 23, "stages_blocked": 0, "stages_failed": 0, "final_status": "done", "generated": "2026-03-10T12:00:00Z", } ) ) report = generate_report(tmp_path) assert "# ResearchClaw Run Report" in report assert "rc-test-123" in report assert "✅" in report def test_report_with_paper(self, tmp_path: Path): (tmp_path / "pipeline_summary.json").write_text( json.dumps( { 
"run_id": "test", "stages_executed": 1, "stages_done": 1, "stages_failed": 0, "final_status": "done", "generated": "now", } ) ) draft_dir = tmp_path / "stage-17" draft_dir.mkdir() (draft_dir / "paper_draft.md").write_text( "This is a paper with some words in it." ) report = generate_report(tmp_path) assert "Paper" in report assert "words" in report def test_report_with_citations(self, tmp_path: Path): (tmp_path / "pipeline_summary.json").write_text( json.dumps( { "run_id": "test", "stages_executed": 1, "stages_done": 1, "stages_failed": 0, "final_status": "done", "generated": "now", } ) ) verify_dir = tmp_path / "stage-23" verify_dir.mkdir() (verify_dir / "verification_report.json").write_text( json.dumps( { "total_references": 10, "verified_count": 8, "suspicious_count": 1, "hallucinated_count": 1, } ) ) report = generate_report(tmp_path) assert "Citations" in report assert "8/10" in report assert "Suspicious: 1" in report def test_report_with_failures(self, tmp_path: Path): (tmp_path / "pipeline_summary.json").write_text( json.dumps( { "run_id": "test", "stages_executed": 5, "stages_done": 3, "stages_failed": 2, "final_status": "failed", "generated": "now", } ) ) report = generate_report(tmp_path) assert "❌" in report assert "Warnings" in report assert "2 stage(s) failed" in report def test_report_with_experiment_results(self, tmp_path: Path): (tmp_path / "pipeline_summary.json").write_text( json.dumps( { "run_id": "test", "stages_executed": 1, "stages_done": 1, "stages_failed": 0, "final_status": "done", "generated": "now", } ) ) exp_dir = tmp_path / "stage-12" exp_dir.mkdir() (exp_dir / "experiment_results.json").write_text( json.dumps( { "iterations": [{"loss": 0.5}, {"loss": 0.3}], "best_metric": 0.3, } ) ) report = generate_report(tmp_path) assert "Experiments" in report assert "2 iterations" in report ================================================ FILE: tests/test_rc_runner.py ================================================ # pyright: 
reportPrivateUsage=false, reportUnknownParameterType=false, reportMissingParameterType=false, reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnusedCallResult=false, reportAttributeAccessIssue=false, reportUnknownLambdaType=false from __future__ import annotations import json from pathlib import Path from typing import Any, cast import pytest from researchclaw.adapters import AdapterBundle from researchclaw.config import RCConfig from researchclaw.pipeline import runner as rc_runner from researchclaw.pipeline.executor import StageResult from researchclaw.pipeline.stages import STAGE_SEQUENCE, Stage, StageStatus @pytest.fixture() def rc_config(tmp_path: Path) -> RCConfig: data = { "project": {"name": "rc-runner-test", "mode": "docs-first"}, "research": {"topic": "pipeline testing"}, "runtime": {"timezone": "UTC"}, "notifications": {"channel": "local"}, "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")}, "openclaw_bridge": {}, "llm": { "provider": "openai-compatible", "base_url": "http://localhost:1234/v1", "api_key_env": "RC_TEST_KEY", "api_key": "inline", }, } return RCConfig.from_dict(data, project_root=tmp_path, check_paths=False) @pytest.fixture() def adapters() -> AdapterBundle: return AdapterBundle() @pytest.fixture() def run_dir(tmp_path: Path) -> Path: path = tmp_path / "run" path.mkdir() return path def _done(stage: Stage, artifacts: tuple[str, ...] 
= ("out.md",)) -> StageResult: return StageResult(stage=stage, status=StageStatus.DONE, artifacts=artifacts) def _failed(stage: Stage, msg: str = "boom") -> StageResult: return StageResult(stage=stage, status=StageStatus.FAILED, artifacts=(), error=msg) def _blocked(stage: Stage) -> StageResult: return StageResult( stage=stage, status=StageStatus.BLOCKED_APPROVAL, artifacts=("gate.md",), decision="block", ) def test_execute_pipeline_runs_stages_in_sequence( monkeypatch: pytest.MonkeyPatch, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: seen: list[Stage] = [] def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: _ = kwargs seen.append(stage) return _done(stage) monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) results = rc_runner.execute_pipeline( run_dir=run_dir, run_id="run-seq", config=rc_config, adapters=adapters, ) assert seen == list(STAGE_SEQUENCE) assert len(results) == 23 assert all(r.status == StageStatus.DONE for r in results) def test_execute_pipeline_stops_on_failed_stage( monkeypatch: pytest.MonkeyPatch, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: fail_stage = Stage.SEARCH_STRATEGY def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: _ = kwargs if stage == fail_stage: return _failed(stage, "forced failure") return _done(stage) monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) results = rc_runner.execute_pipeline( run_dir=run_dir, run_id="run-fail", config=rc_config, adapters=adapters, ) assert results[-1].stage == fail_stage assert results[-1].status == StageStatus.FAILED assert len(results) == int(fail_stage) def test_execute_pipeline_stops_on_gate_when_stop_on_gate_enabled( monkeypatch: pytest.MonkeyPatch, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: gate_stage = Stage.LITERATURE_SCREEN def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: _ = kwargs if stage == gate_stage: return _blocked(stage) return 
_done(stage) monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) results = rc_runner.execute_pipeline( run_dir=run_dir, run_id="run-gate-stop", config=rc_config, adapters=adapters, stop_on_gate=True, ) assert results[-1].stage == gate_stage assert results[-1].status == StageStatus.BLOCKED_APPROVAL assert len(results) == int(gate_stage) def test_execute_pipeline_continues_after_gate_when_stop_on_gate_disabled( monkeypatch: pytest.MonkeyPatch, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: gate_stage = Stage.LITERATURE_SCREEN def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: _ = kwargs if stage == gate_stage: return _blocked(stage) return _done(stage) monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) results = rc_runner.execute_pipeline( run_dir=run_dir, run_id="run-gate-continue", config=rc_config, adapters=adapters, stop_on_gate=False, ) assert len(results) == 23 assert any(item.status == StageStatus.BLOCKED_APPROVAL for item in results) def test_execute_pipeline_writes_pipeline_summary_json( monkeypatch: pytest.MonkeyPatch, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: _ = kwargs return _done(stage) monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) rc_runner.execute_pipeline( run_dir=run_dir, run_id="run-summary", config=rc_config, adapters=adapters, ) summary_path = run_dir / "pipeline_summary.json" assert summary_path.exists() def test_pipeline_summary_has_expected_fields_and_values( monkeypatch: pytest.MonkeyPatch, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: _ = kwargs if stage == Stage.LITERATURE_SCREEN: return _blocked(stage) if stage == Stage.HYPOTHESIS_GEN: return _failed(stage) return _done(stage) monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) results = 
rc_runner.execute_pipeline( run_dir=run_dir, run_id="run-summary-fields", config=rc_config, adapters=adapters, ) summary = cast( dict[str, Any], json.loads((run_dir / "pipeline_summary.json").read_text(encoding="utf-8")), ) assert summary["run_id"] == "run-summary-fields" assert summary["stages_executed"] == len(results) assert summary["stages_done"] == sum( 1 for r in results if r.status == StageStatus.DONE ) assert summary["stages_blocked"] == 1 assert summary["stages_failed"] == 1 assert summary["from_stage"] == 1 assert summary["final_stage"] == int(Stage.HYPOTHESIS_GEN) assert summary["final_status"] == "failed" assert "generated" in summary def test_execute_pipeline_from_stage_skips_earlier_stages( monkeypatch: pytest.MonkeyPatch, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: seen: list[Stage] = [] def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: _ = kwargs seen.append(stage) return _done(stage) monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) results = rc_runner.execute_pipeline( run_dir=run_dir, run_id="run-from-stage", config=rc_config, adapters=adapters, from_stage=Stage.PAPER_OUTLINE, ) assert seen[0] == Stage.PAPER_OUTLINE assert len(seen) == len(STAGE_SEQUENCE) - (int(Stage.PAPER_OUTLINE) - 1) assert len(results) == len(seen) def test_execute_pipeline_writes_kb_entries_when_kb_root_provided( monkeypatch: pytest.MonkeyPatch, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, tmp_path: Path, ) -> None: calls: list[tuple[int, str, str]] = [] def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: _ = kwargs stage_dir = run_dir / f"stage-{int(stage):02d}" stage_dir.mkdir(parents=True, exist_ok=True) (stage_dir / "out.md").write_text(f"stage {int(stage)}", encoding="utf-8") return _done(stage) def mock_write_stage_to_kb( kb_root: Path, stage_id: int, stage_name: str, run_id: str, artifacts: list[str], stage_dir: Path, **kwargs, ): _ = kb_root, artifacts, stage_dir, kwargs 
calls.append((stage_id, stage_name, run_id)) return [] monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) monkeypatch.setattr(rc_runner, "write_stage_to_kb", mock_write_stage_to_kb) kb_root = tmp_path / "kb-out" results = rc_runner.execute_pipeline( run_dir=run_dir, run_id="run-kb", config=rc_config, adapters=adapters, kb_root=kb_root, ) assert len(results) == 23 assert len(calls) == 23 assert calls[0] == (1, "topic_init", "run-kb") def test_execute_pipeline_passes_auto_approve_flag_to_execute_stage( monkeypatch: pytest.MonkeyPatch, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: received: list[bool] = [] def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: received.append(kwargs["auto_approve_gates"]) return _done(stage) monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) rc_runner.execute_pipeline( run_dir=run_dir, run_id="run-auto-approve", config=rc_config, adapters=adapters, auto_approve_gates=True, ) assert received assert all(received) @pytest.mark.parametrize( ("stage", "started", "expected"), [ (Stage.TOPIC_INIT, False, True), (Stage.PROBLEM_DECOMPOSE, False, False), (Stage.PAPER_DRAFT, True, True), ], ) def test_should_start_logic(stage: Stage, started: bool, expected: bool) -> None: assert rc_runner._should_start(stage, Stage.TOPIC_INIT, started) is expected @pytest.mark.parametrize( ("results", "expected_status", "expected_final_stage"), [ ([], "no_stages", int(Stage.TOPIC_INIT)), ([_done(Stage.TOPIC_INIT)], "done", int(Stage.TOPIC_INIT)), ( [_done(Stage.TOPIC_INIT), _failed(Stage.PROBLEM_DECOMPOSE)], "failed", int(Stage.PROBLEM_DECOMPOSE), ), ], ) def test_build_pipeline_summary_core_fields( results, expected_status: str, expected_final_stage: int ) -> None: summary = rc_runner._build_pipeline_summary( run_id="run-core", results=results, from_stage=Stage.TOPIC_INIT, ) assert summary["run_id"] == "run-core" assert summary["final_status"] == expected_status assert summary["final_stage"] 
== expected_final_stage def test_pipeline_prints_stage_progress( monkeypatch: pytest.MonkeyPatch, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, capsys: pytest.CaptureFixture[str], ) -> None: mock_results = [ StageResult( stage=Stage.TOPIC_INIT, status=StageStatus.DONE, artifacts=("topic.json",) ), StageResult( stage=Stage.PROBLEM_DECOMPOSE, status=StageStatus.DONE, artifacts=("tree.json",), ), StageResult( stage=Stage.SEARCH_STRATEGY, status=StageStatus.FAILED, artifacts=(), error="LLM timeout", ), ] call_idx = 0 def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: _ = stage, kwargs nonlocal call_idx idx = call_idx call_idx += 1 return mock_results[min(idx, len(mock_results) - 1)] monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) monkeypatch.setattr(rc_runner, "write_stage_to_kb", lambda *args, **kwargs: []) _ = rc_runner.execute_pipeline( run_dir=run_dir, run_id="rc-test-001", config=rc_config, adapters=adapters, ) captured = capsys.readouterr() assert "TOPIC_INIT — running..." 
in captured.out assert "TOPIC_INIT — done" in captured.out assert "SEARCH_STRATEGY — FAILED" in captured.out assert "LLM timeout" in captured.out def test_pipeline_prints_elapsed_time( monkeypatch: pytest.MonkeyPatch, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, capsys: pytest.CaptureFixture[str], ) -> None: mock_result = StageResult( stage=Stage.TOPIC_INIT, status=StageStatus.DONE, artifacts=("topic.json",), ) mock_fail = StageResult( stage=Stage.PROBLEM_DECOMPOSE, status=StageStatus.FAILED, artifacts=(), error="test", ) results_iter = iter([mock_result, mock_fail]) monkeypatch.setattr( rc_runner, "execute_stage", lambda *args, **kwargs: next(results_iter) ) monkeypatch.setattr(rc_runner, "write_stage_to_kb", lambda *args, **kwargs: []) _ = rc_runner.execute_pipeline( run_dir=run_dir, run_id="rc-test-002", config=rc_config, adapters=adapters, ) captured = capsys.readouterr() import re assert re.search(r"\d+\.\d+s\)", captured.out), ( f"No elapsed time found in: {captured.out}" ) # ── PIVOT/PROCEED/REFINE decision loop tests ── def _pivot_result(stage: Stage) -> StageResult: return StageResult( stage=stage, status=StageStatus.DONE, artifacts=("decision.md",), decision="pivot" ) def _refine_result(stage: Stage) -> StageResult: return StageResult( stage=stage, status=StageStatus.DONE, artifacts=("decision.md",), decision="refine" ) def test_pivot_decision_triggers_rollback_to_hypothesis_gen( monkeypatch: pytest.MonkeyPatch, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: seen: list[Stage] = [] pivot_count = 0 def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: _ = kwargs seen.append(stage) nonlocal pivot_count if stage == Stage.RESEARCH_DECISION and pivot_count == 0: pivot_count += 1 return _pivot_result(stage) return _done(stage) monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) results = rc_runner.execute_pipeline( run_dir=run_dir, run_id="run-pivot", config=rc_config, adapters=adapters, ) # 
Should have seen HYPOTHESIS_GEN at least twice (original + rollback) hyp_gen_count = sum(1 for s in seen if s == Stage.HYPOTHESIS_GEN) assert hyp_gen_count >= 2 # Decision history should be recorded history_path = run_dir / "decision_history.json" assert history_path.exists() history = json.loads(history_path.read_text()) assert len(history) == 1 assert history[0]["decision"] == "pivot" def test_refine_decision_triggers_rollback_to_iterative_refine( monkeypatch: pytest.MonkeyPatch, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: seen: list[Stage] = [] refine_count = 0 def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: _ = kwargs seen.append(stage) nonlocal refine_count if stage == Stage.RESEARCH_DECISION and refine_count == 0: refine_count += 1 return _refine_result(stage) return _done(stage) monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) results = rc_runner.execute_pipeline( run_dir=run_dir, run_id="run-refine", config=rc_config, adapters=adapters, ) # Should have seen ITERATIVE_REFINE at least twice refine_stage_count = sum(1 for s in seen if s == Stage.ITERATIVE_REFINE) assert refine_stage_count >= 2 def test_max_pivot_count_prevents_infinite_loop( monkeypatch: pytest.MonkeyPatch, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: seen: list[Stage] = [] def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: _ = kwargs seen.append(stage) # Always PIVOT — should be limited by MAX_DECISION_PIVOTS if stage == Stage.RESEARCH_DECISION: return _pivot_result(stage) return _done(stage) monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) results = rc_runner.execute_pipeline( run_dir=run_dir, run_id="run-max-pivot", config=rc_config, adapters=adapters, ) # RESEARCH_DECISION should appear at most MAX_DECISION_PIVOTS + 1 times from researchclaw.pipeline.stages import MAX_DECISION_PIVOTS decision_count = sum(1 for s in seen if s == Stage.RESEARCH_DECISION) assert 
decision_count <= MAX_DECISION_PIVOTS + 1 def test_proceed_decision_does_not_trigger_rollback( monkeypatch: pytest.MonkeyPatch, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, ) -> None: seen: list[Stage] = [] def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: _ = kwargs seen.append(stage) return _done(stage) monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) results = rc_runner.execute_pipeline( run_dir=run_dir, run_id="run-proceed", config=rc_config, adapters=adapters, ) # Should be exactly 23 stages, no rollback assert len(seen) == 23 assert not (run_dir / "decision_history.json").exists() def test_read_pivot_count_returns_zero_for_no_history(run_dir: Path) -> None: assert rc_runner._read_pivot_count(run_dir) == 0 def test_record_decision_history_appends(run_dir: Path) -> None: rc_runner._record_decision_history(run_dir, "pivot", Stage.HYPOTHESIS_GEN, 1) rc_runner._record_decision_history(run_dir, "refine", Stage.ITERATIVE_REFINE, 2) history = json.loads((run_dir / "decision_history.json").read_text()) assert len(history) == 2 assert history[0]["decision"] == "pivot" assert history[1]["decision"] == "refine" # ── Deliverables packaging tests ── def _setup_stage_artifacts(run_dir: Path) -> None: """Create typical stage-22 and stage-23 output files for testing.""" s22 = run_dir / "stage-22" s22.mkdir(parents=True, exist_ok=True) (s22 / "paper_final.md").write_text("# My Paper\nContent here.", encoding="utf-8") (s22 / "paper.tex").write_text("\\documentclass{article}\n\\begin{document}\nHello\n\\end{document}", encoding="utf-8") (s22 / "references.bib").write_text("@article{smith2024,\n title={Test}\n}", encoding="utf-8") code_dir = s22 / "code" code_dir.mkdir() (code_dir / "main.py").write_text("print('hello')", encoding="utf-8") (code_dir / "requirements.txt").write_text("numpy\n", encoding="utf-8") (code_dir / "README.md").write_text("# Code\n", encoding="utf-8") s23 = run_dir / "stage-23" s23.mkdir(parents=True, 
exist_ok=True) (s23 / "paper_final_verified.md").write_text("# My Paper (verified)\nContent.", encoding="utf-8") (s23 / "references_verified.bib").write_text("@article{smith2024,\n title={Test}\n}", encoding="utf-8") (s23 / "verification_report.json").write_text( json.dumps({"summary": {"total": 5, "verified": 4}}), encoding="utf-8" ) def test_package_deliverables_collects_all_artifacts( run_dir: Path, rc_config: RCConfig ) -> None: _setup_stage_artifacts(run_dir) dest = rc_runner._package_deliverables(run_dir, "run-pkg-test", rc_config) assert dest is not None assert dest == run_dir / "deliverables" assert (dest / "paper_final.md").exists() assert (dest / "paper.tex").exists() assert (dest / "references.bib").exists() assert (dest / "code" / "main.py").exists() assert (dest / "verification_report.json").exists() assert (dest / "manifest.json").exists() manifest = json.loads((dest / "manifest.json").read_text()) assert manifest["run_id"] == "run-pkg-test" assert "paper_final.md" in manifest["files"] def test_package_deliverables_prefers_verified_versions( run_dir: Path, rc_config: RCConfig ) -> None: _setup_stage_artifacts(run_dir) rc_runner._package_deliverables(run_dir, "run-verified", rc_config) dest = run_dir / "deliverables" # Should contain verified content (from stage 23), not base (from stage 22) paper = (dest / "paper_final.md").read_text(encoding="utf-8") assert "verified" in paper bib = (dest / "references.bib").read_text(encoding="utf-8") assert "smith2024" in bib def test_package_deliverables_falls_back_to_stage22( run_dir: Path, rc_config: RCConfig ) -> None: """When stage 23 outputs are missing, falls back to stage 22 versions.""" s22 = run_dir / "stage-22" s22.mkdir(parents=True, exist_ok=True) (s22 / "paper_final.md").write_text("# Base Paper", encoding="utf-8") (s22 / "references.bib").write_text("@article{a,title={A}}", encoding="utf-8") dest = rc_runner._package_deliverables(run_dir, "run-fallback", rc_config) assert dest is not None paper = 
(dest / "paper_final.md").read_text(encoding="utf-8") assert "Base Paper" in paper def test_package_deliverables_returns_none_when_no_stage_artifacts( run_dir: Path, tmp_path: Path, ) -> None: """Returns None when no stage artifacts exist and no style files found.""" # Use a config with an unknown conference so style files aren't bundled data = { "project": {"name": "empty-test", "mode": "docs-first"}, "research": {"topic": "empty"}, "runtime": {"timezone": "UTC"}, "notifications": {"channel": "local"}, "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")}, "openclaw_bridge": {}, "llm": { "provider": "openai-compatible", "base_url": "http://localhost:1234/v1", "api_key_env": "RC_TEST_KEY", "api_key": "inline", }, "export": {"target_conference": "unknown_conf_9999"}, } cfg = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False) result = rc_runner._package_deliverables(run_dir, "run-empty", cfg) assert result is None assert not (run_dir / "deliverables").exists() def test_package_deliverables_includes_style_files( run_dir: Path, rc_config: RCConfig ) -> None: """Style files (.sty, .bst) for the target conference are bundled.""" _setup_stage_artifacts(run_dir) dest = rc_runner._package_deliverables(run_dir, "run-styles", rc_config) assert dest is not None # Default config uses neurips_2025 → should have neurips_2025.sty assert (dest / "neurips_2025.sty").exists() manifest = json.loads((dest / "manifest.json").read_text()) assert "neurips_2025.sty" in manifest["files"] # ── Atomic checkpoint write tests ── def test_write_checkpoint_uses_atomic_rename(run_dir: Path) -> None: """Checkpoint must be written via temp file + rename, not direct write""" rc_runner._write_checkpoint(run_dir, Stage.TOPIC_INIT, "run-atomic") cp = run_dir / "checkpoint.json" assert cp.exists() data = json.loads(cp.read_text(encoding="utf-8")) assert data["last_completed_stage"] == int(Stage.TOPIC_INIT) assert data["run_id"] == "run-atomic" def 
test_write_checkpoint_leaves_no_temp_files(run_dir: Path) -> None: """Atomic write must clean up temp files on success""" rc_runner._write_checkpoint(run_dir, Stage.TOPIC_INIT, "run-clean") temps = list(run_dir.glob("*.tmp")) assert temps == [], f"Leftover temp files: {temps}" def test_write_checkpoint_preserves_old_on_write_failure( run_dir: Path, monkeypatch: pytest.MonkeyPatch ) -> None: """If the temp-file write fails, the existing checkpoint must survive""" import builtins rc_runner._write_checkpoint(run_dir, Stage.TOPIC_INIT, "run-ok") original_open = builtins.open def _exploding_open(path, *args, **kwargs): # After os.close(fd), _write_checkpoint opens via path string — # intercept temp-file opens (checkpoint_*.tmp) if isinstance(path, (str, Path)) and "checkpoint_" in str(path): raise OSError("disk full") if isinstance(path, int): raise OSError("disk full") return original_open(path, *args, **kwargs) monkeypatch.setattr(builtins, "open", _exploding_open) with pytest.raises(OSError): rc_runner._write_checkpoint(run_dir, Stage.PROBLEM_DECOMPOSE, "run-ok") # Original checkpoint must be intact data = json.loads((run_dir / "checkpoint.json").read_text(encoding="utf-8")) assert data["last_completed_stage"] == int(Stage.TOPIC_INIT) # Temp file must be cleaned up assert list(run_dir.glob("checkpoint_*.tmp")) == [] def test_write_checkpoint_overwrites_previous(run_dir: Path) -> None: """A second checkpoint call must fully replace the first""" rc_runner._write_checkpoint(run_dir, Stage.TOPIC_INIT, "run-1") rc_runner._write_checkpoint(run_dir, Stage.PROBLEM_DECOMPOSE, "run-1") data = json.loads((run_dir / "checkpoint.json").read_text(encoding="utf-8")) assert data["last_completed_stage"] == int(Stage.PROBLEM_DECOMPOSE) assert data["last_completed_name"] == Stage.PROBLEM_DECOMPOSE.name def _degraded(stage: Stage) -> StageResult: return StageResult( stage=stage, status=StageStatus.DONE, artifacts=("quality_report.json",), decision="degraded", ) def 
test_degraded_quality_gate_continues_pipeline( monkeypatch: pytest.MonkeyPatch, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, capsys: pytest.CaptureFixture[str], ) -> None: """When quality gate returns decision='degraded', pipeline continues to completion.""" seen: list[Stage] = [] def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: _ = kwargs seen.append(stage) if stage == Stage.QUALITY_GATE: return _degraded(stage) return _done(stage) monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) results = rc_runner.execute_pipeline( run_dir=run_dir, run_id="run-degraded", config=rc_config, adapters=adapters, ) # All 23 stages should execute (not stopped at quality gate) assert len(results) == 23 assert seen == list(STAGE_SEQUENCE) # Quality gate result should have decision="degraded" qg_result = [r for r in results if r.stage == Stage.QUALITY_GATE][0] assert qg_result.decision == "degraded" assert qg_result.status == StageStatus.DONE # Pipeline summary should have degraded=True summary = json.loads((run_dir / "pipeline_summary.json").read_text()) assert summary["degraded"] is True # Output should show DEGRADED message captured = capsys.readouterr() assert "DEGRADED" in captured.out def test_package_deliverables_called_after_pipeline( monkeypatch: pytest.MonkeyPatch, run_dir: Path, rc_config: RCConfig, adapters: AdapterBundle, capsys: pytest.CaptureFixture[str], ) -> None: """Deliverables packaging is called at end of execute_pipeline.""" _setup_stage_artifacts(run_dir) def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: return _done(stage) monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) rc_runner.execute_pipeline( run_dir=run_dir, run_id="run-with-deliverables", config=rc_config, adapters=adapters, ) captured = capsys.readouterr() assert "Deliverables packaged" in captured.out assert (run_dir / "deliverables" / "manifest.json").exists() # 
--------------------------------------------------------------------------- # BUG-223: _promote_best_stage14 must always write experiment_summary_best.json # --------------------------------------------------------------------------- def _make_stage14_summary(run_dir: Path, suffix: str, pm_value: float) -> None: """Helper: create a stage-14{suffix}/experiment_summary.json.""" d = run_dir / f"stage-14{suffix}" d.mkdir(parents=True, exist_ok=True) data = { "metrics_summary": { "primary_metric": {"min": pm_value, "max": pm_value, "mean": pm_value, "count": 1} }, "condition_summaries": {"cond_a": {"metrics": {"primary_metric": pm_value}}}, } (d / "experiment_summary.json").write_text(json.dumps(data), encoding="utf-8") class TestPromoteBestStage14BestJson: """BUG-223: experiment_summary_best.json must be written even when stage-14/ already has the best result (early-return path).""" @pytest.fixture() def max_config(self, rc_config: RCConfig) -> RCConfig: """Config with metric_direction=maximize (accuracy-like metrics).""" object.__setattr__(rc_config.experiment, "metric_direction", "maximize") return rc_config def test_best_json_written_when_current_is_best( self, run_dir: Path, max_config: RCConfig ) -> None: """stage-14/ already best → should still write best.json.""" _make_stage14_summary(run_dir, "", 90.0) _make_stage14_summary(run_dir, "_v1", 80.0) _make_stage14_summary(run_dir, "_v2", 70.0) rc_runner._promote_best_stage14(run_dir, max_config) # type: ignore[attr-defined] best_path = run_dir / "experiment_summary_best.json" assert best_path.exists(), "experiment_summary_best.json must always be written" data = json.loads(best_path.read_text(encoding="utf-8")) pm = data["metrics_summary"]["primary_metric"] assert pm["mean"] == 90.0 def test_best_json_written_when_promotion_needed( self, run_dir: Path, max_config: RCConfig ) -> None: """stage-14/ is NOT best → promote + write best.json.""" _make_stage14_summary(run_dir, "", 70.0) _make_stage14_summary(run_dir, 
"_v1", 95.0) rc_runner._promote_best_stage14(run_dir, max_config) # type: ignore[attr-defined] best_path = run_dir / "experiment_summary_best.json" assert best_path.exists() data = json.loads(best_path.read_text(encoding="utf-8")) pm = data["metrics_summary"]["primary_metric"] assert pm["mean"] == 95.0 def test_best_json_written_with_equal_values( self, run_dir: Path, max_config: RCConfig ) -> None: """BUG-223 exact scenario: stage-14 and stage-14_v1 have equal metrics, stage-14_v2 is regressed.""" _make_stage14_summary(run_dir, "", 64.46) _make_stage14_summary(run_dir, "_v1", 64.46) _make_stage14_summary(run_dir, "_v2", 26.80) rc_runner._promote_best_stage14(run_dir, max_config) # type: ignore[attr-defined] best_path = run_dir / "experiment_summary_best.json" assert best_path.exists(), "BUG-223: best.json missing when current is tied-best" data = json.loads(best_path.read_text(encoding="utf-8")) pm = data["metrics_summary"]["primary_metric"] assert pm["mean"] == 64.46 class TestPromoteBestStage14AnalysisBest: """BUG-225: analysis_best.md must be written from best stage-14 iteration.""" @pytest.fixture() def max_config(self, rc_config: RCConfig) -> RCConfig: object.__setattr__(rc_config.experiment, "metric_direction", "maximize") return rc_config def test_analysis_best_written_from_best_iteration( self, run_dir: Path, max_config: RCConfig ) -> None: """analysis_best.md should come from the best stage-14 iteration.""" _make_stage14_summary(run_dir, "", 70.0) _make_stage14_summary(run_dir, "_v1", 95.0) # Write analysis.md in each (run_dir / "stage-14" / "analysis.md").write_text("Degenerate analysis", encoding="utf-8") (run_dir / "stage-14_v1" / "analysis.md").write_text("Best analysis v1", encoding="utf-8") rc_runner._promote_best_stage14(run_dir, max_config) # type: ignore[attr-defined] best_analysis = run_dir / "analysis_best.md" assert best_analysis.exists(), "BUG-225: analysis_best.md must be written" assert best_analysis.read_text(encoding="utf-8") == "Best 
analysis v1" def test_analysis_best_written_when_current_is_best( self, run_dir: Path, max_config: RCConfig ) -> None: """Even when stage-14 is already best, analysis_best.md should be written.""" _make_stage14_summary(run_dir, "", 90.0) _make_stage14_summary(run_dir, "_v1", 80.0) (run_dir / "stage-14" / "analysis.md").write_text("Best analysis current", encoding="utf-8") (run_dir / "stage-14_v1" / "analysis.md").write_text("Worse analysis", encoding="utf-8") rc_runner._promote_best_stage14(run_dir, max_config) # type: ignore[attr-defined] best_analysis = run_dir / "analysis_best.md" assert best_analysis.exists() assert best_analysis.read_text(encoding="utf-8") == "Best analysis current" def test_no_analysis_best_when_no_analysis_md( self, run_dir: Path, max_config: RCConfig ) -> None: """If best stage-14 has no analysis.md, no analysis_best.md is written.""" _make_stage14_summary(run_dir, "", 90.0) rc_runner._promote_best_stage14(run_dir, max_config) # type: ignore[attr-defined] assert not (run_dir / "analysis_best.md").exists() class TestPromoteBestStage14DegenerateDetection: """BUG-226: Degenerate near-zero metrics must not be promoted as best.""" def test_degenerate_minimize_skipped(self, run_dir: Path, rc_config: RCConfig) -> None: """When minimize, a value 1000x smaller than second-best is degenerate.""" # metric_direction defaults to "minimize" _make_stage14_summary(run_dir, "", 7.26e-8) # degenerate (broken normalization) _make_stage14_summary(run_dir, "_v2", 0.37) # valid rc_runner._promote_best_stage14(run_dir, rc_config) # type: ignore[attr-defined] best_path = run_dir / "experiment_summary_best.json" assert best_path.exists() data = json.loads(best_path.read_text(encoding="utf-8")) pm = data["metrics_summary"]["primary_metric"] assert pm["mean"] == 0.37, "Degenerate value should be skipped, valid v2 promoted" def test_legitimate_minimize_not_skipped(self, run_dir: Path, rc_config: RCConfig) -> None: """When values are within normal range, smaller is 
legitimately best.""" _make_stage14_summary(run_dir, "", 0.15) _make_stage14_summary(run_dir, "_v1", 0.37) rc_runner._promote_best_stage14(run_dir, rc_config) # type: ignore[attr-defined] best_path = run_dir / "experiment_summary_best.json" data = json.loads(best_path.read_text(encoding="utf-8")) pm = data["metrics_summary"]["primary_metric"] assert pm["mean"] == 0.15, "Legitimate lower value should be promoted" def test_single_candidate_not_affected(self, run_dir: Path, rc_config: RCConfig) -> None: """Single candidate is never skipped regardless of value.""" _make_stage14_summary(run_dir, "", 1e-10) rc_runner._promote_best_stage14(run_dir, rc_config) # type: ignore[attr-defined] best_path = run_dir / "experiment_summary_best.json" data = json.loads(best_path.read_text(encoding="utf-8")) pm = data["metrics_summary"]["primary_metric"] assert pm["mean"] == 1e-10 ================================================ FILE: tests/test_rc_sanitization.py ================================================ # pyright: reportPrivateUsage=false, reportUnknownParameterType=false from __future__ import annotations import json from pathlib import Path import pytest from researchclaw.pipeline.executor import _sanitize_fabricated_data from researchclaw.pipeline.stage_impls._code_generation import _check_rl_compatibility @pytest.fixture() def run_dir(tmp_path: Path) -> Path: path = tmp_path / "run" path.mkdir() return path def _write_experiment_summary(run_dir: Path, data: dict) -> None: stage14 = run_dir / "stage-14" stage14.mkdir(parents=True, exist_ok=True) (stage14 / "experiment_summary.json").write_text( json.dumps(data, indent=2), encoding="utf-8" ) def test_sanitize_replaces_unverified_numbers(run_dir: Path) -> None: _write_experiment_summary(run_dir, { "metrics_summary": {"accuracy": 0.85, "f1": 0.82}, "best_run": {"metrics": {"accuracy": 0.85}}, }) paper = ( "## Results\n\n" "| Method | Accuracy | F1 | Precision |\n" "| --- | --- | --- | --- |\n" "| Ours | 0.85 | 0.82 | 0.91 
|\n" "| Baseline | 0.73 | 0.65 | 0.78 |\n" ) sanitized, report = _sanitize_fabricated_data(paper, run_dir) # 0.85 and 0.82 should be kept (verified), 0.91, 0.73, 0.65, 0.78 replaced assert "0.85" in sanitized assert "0.82" in sanitized assert "0.91" not in sanitized assert "0.73" not in sanitized assert "---" in sanitized assert report["sanitized"] is True assert report["numbers_replaced"] == 4 assert report["numbers_kept"] == 2 def test_sanitize_preserves_table_structure(run_dir: Path) -> None: _write_experiment_summary(run_dir, { "metrics_summary": {"loss": 0.12}, }) paper = ( "| Model | Loss |\n" "| --- | --- |\n" "| A | 0.12 |\n" "| B | 0.8765 |\n" ) sanitized, _ = _sanitize_fabricated_data(paper, run_dir) # Table pipes should still be intact assert sanitized.count("|") == paper.count("|") assert "0.12" in sanitized assert "0.8765" not in sanitized def test_sanitize_no_experiment_summary(run_dir: Path) -> None: paper = "| A | 0.5 |\n| --- | --- |\n| B | 0.6 |\n" sanitized, report = _sanitize_fabricated_data(paper, run_dir) assert report["sanitized"] is False assert sanitized == paper # unchanged def test_sanitize_tolerance_within_1_percent(run_dir: Path) -> None: _write_experiment_summary(run_dir, { "metrics_summary": {"accuracy": 100.0}, }) paper = ( "| Method | Acc |\n" "| --- | --- |\n" "| Ours | 100.5 |\n" # within 1% of 100.0 "| Other | 110.0 |\n" # outside 1% ) sanitized, report = _sanitize_fabricated_data(paper, run_dir) assert "100.5" in sanitized # kept (within tolerance) assert "110.0" not in sanitized # replaced def test_sanitize_header_row_preserved(run_dir: Path) -> None: _write_experiment_summary(run_dir, { "metrics_summary": {"val": 5.0}, }) paper = ( "| Col1 | Col2 |\n" "| --- | --- |\n" "| data | 99.9 |\n" ) sanitized, _ = _sanitize_fabricated_data(paper, run_dir) # Header row should be untouched assert "| Col1 | Col2 |" in sanitized def test_sanitize_hp_columns_preserved_in_mixed_table(run_dir: Path) -> None: """BUG-184: HP columns in mixed 
tables should not be sanitized.""" _write_experiment_summary(run_dir, { "metrics_summary": {"accuracy": 0.85}, "best_run": {"metrics": {"accuracy": 0.85}}, }) paper = ( "## Results\n\n" "| Method | LR | Batch Size | Accuracy | F1 |\n" "| --- | --- | --- | --- | --- |\n" "| Ours | 0.0007 | 48 | 0.85 | 0.91 |\n" "| Baseline | 0.0001 | 24 | 0.73 | 0.78 |\n" ) sanitized, report = _sanitize_fabricated_data(paper, run_dir) # HP columns (LR, Batch Size) should be preserved regardless of verification assert "0.0007" in sanitized, "HP column 'LR' value should not be sanitized" assert "0.0001" in sanitized, "HP column 'LR' value should not be sanitized" # Result columns: 0.85 verified → kept; 0.91, 0.73, 0.78 → replaced assert "0.85" in sanitized assert "0.91" not in sanitized assert "0.73" not in sanitized def test_sanitize_pure_hp_table_skipped(run_dir: Path) -> None: """BUG-192: Pure HP tables (header keywords) should be fully skipped.""" _write_experiment_summary(run_dir, { "metrics_summary": {"accuracy": 0.85}, }) paper = ( "| Hyperparameter | Value |\n" "| --- | --- |\n" "| Learning Rate | 0.0007 |\n" "| Batch Size | 48 |\n" "| Weight Decay | 0.0005 |\n" ) sanitized, report = _sanitize_fabricated_data(paper, run_dir) # Entire table should be skipped — no sanitization at all assert "0.0007" in sanitized assert "0.0005" in sanitized assert report["tables_processed"] == 0 def test_prose_sanitization_replaces_unverified(run_dir: Path) -> None: """Prose numbers in Results section should be sanitized.""" _write_experiment_summary(run_dir, { "metrics_summary": {"accuracy": 0.85}, "best_run": {"metrics": {"accuracy": 0.85}}, }) paper = ( "# Introduction\n" "Prior work achieved 92.3% accuracy on this task.\n\n" "# Results\n" "Our method achieved 85.0% accuracy, which is significantly better.\n" "The baseline obtained 72.4% accuracy on the same benchmark.\n" ) sanitized, report = _sanitize_fabricated_data(paper, run_dir) # 85.0 is verified (matches 0.85 × 100), should be kept 
assert "85.0" in sanitized # 72.4 is unverified in Results → replaced assert "72.4" not in sanitized assert "[value removed]" in sanitized # 92.3 is in Introduction (not Results) → should be preserved assert "92.3" in sanitized assert report["prose_numbers_replaced"] >= 1 def test_sanitize_model_name_numbers_preserved(run_dir: Path) -> None: """BUG-206: Numbers in model names (ResNet-34) must not be replaced.""" _write_experiment_summary(run_dir, { "metrics_summary": {"accuracy": 0.85}, "best_run": {"metrics": {"accuracy": 0.85}}, }) # Table with model variant numbers in the first column (ci=1, skipped) paper = ( "## Results\n\n" "| Method | Accuracy |\n" "| --- | --- |\n" "| ResNet-34 (baseline) | 0.85 |\n" "| ResNet-50 (teacher) | 0.91 |\n" ) sanitized, report = _sanitize_fabricated_data(paper, run_dir) # First column is method names — must be preserved (includes "34", "50") assert "ResNet-34" in sanitized, "Model name 'ResNet-34' should not be sanitized" assert "ResNet-50" in sanitized, "Model name 'ResNet-50' should not be sanitized" def test_sanitize_unicode_hyphen_model_names_preserved(run_dir: Path) -> None: """BUG-206: Unicode non-breaking hyphen in model names must not be replaced.""" _write_experiment_summary(run_dir, { "metrics_summary": {"accuracy": 0.85}, "best_run": {"metrics": {"accuracy": 0.85}}, }) # U+2011 non-breaking hyphen (common LLM output) paper = ( "## Results\n\n" "| Method | Accuracy |\n" "| --- | --- |\n" "| ResNet\u201134 (baseline) | 0.85 |\n" "| ResNet\u201150 (teacher) | 0.91 |\n" ) sanitized, report = _sanitize_fabricated_data(paper, run_dir) assert "ResNet\u201134" in sanitized, "Model name with U+2011 hyphen should not be sanitized" assert "ResNet\u201150" in sanitized, "Model name with U+2011 hyphen should not be sanitized" def test_prose_sanitization_preserves_introduction(run_dir: Path) -> None: """Numbers outside Results/Experiments should NOT be touched.""" _write_experiment_summary(run_dir, { "metrics_summary": {"val": 
0.50}, }) paper = ( "# Introduction\n" "Previous methods achieved 94.2% accuracy.\n\n" "# Related Work\n" "Smith et al. reported 88.7% on the benchmark.\n\n" "# Conclusion\n" "We demonstrated 50.0% accuracy.\n" ) sanitized, report = _sanitize_fabricated_data(paper, run_dir) # None of these sections are Results/Experiments → all preserved assert "94.2" in sanitized assert "88.7" in sanitized assert report["prose_numbers_replaced"] == 0 # --------------------------------------------------------------------------- # RL compatibility check (Improvement G) # --------------------------------------------------------------------------- def test_rl_compatibility_dqn_continuous_detected() -> None: """DQN + continuous env should produce errors.""" code = """ import gymnasium as gym from stable_baselines3 import DQN env = gym.make('Pendulum-v1') model = DQN('MlpPolicy', env) model.learn(total_timesteps=10000) """ errors = _check_rl_compatibility(code) assert len(errors) >= 1 assert "DQN" in errors[0] assert "pendulum" in errors[0].lower() def test_rl_compatibility_ppo_continuous_ok() -> None: """PPO + continuous env should be fine.""" code = """ import gymnasium as gym from stable_baselines3 import PPO env = gym.make('HalfCheetah-v5') model = PPO('MlpPolicy', env) model.learn(total_timesteps=100000) """ errors = _check_rl_compatibility(code) assert len(errors) == 0 def test_sanitize_reads_promoted_best_data(run_dir: Path) -> None: """BUG-222: Sanitizer uses experiment_summary_best.json (promoted best). After REFINE, the pipeline promotes the best iteration's data to experiment_summary_best.json. The sanitizer should validate against that file, not scan all refinement logs. 
""" # Stale stage-14 data (from a regressed iteration) _write_experiment_summary(run_dir, { "metrics_summary": {"primary_metric": {"min": 8.42, "max": 8.91, "mean": 8.6467, "count": 3}}, "best_run": {"metrics": {"primary_metric": 8.65}}, }) # Promoted best data (from the winning iteration) (run_dir / "experiment_summary_best.json").write_text( json.dumps({ "metrics_summary": {"primary_metric": {"min": 73.07, "max": 78.93, "mean": 75.56, "count": 3}}, "best_run": {"metrics": {"primary_metric": 78.93}}, "condition_summaries": { "Ours": {"metrics": {"primary_metric": 78.93}}, "SGD": {"metrics": {"primary_metric": 73.07}}, "AdamW": {"metrics": {"primary_metric": 68.67}}, }, }, indent=2), encoding="utf-8" ) # Paper uses values from promoted best paper = ( "## Results\n\n" "| Method | Accuracy |\n" "| --- | --- |\n" "| Ours | 78.93 |\n" "| SGD | 73.07 |\n" "| AdamW | 68.67 |\n" ) sanitized, report = _sanitize_fabricated_data(paper, run_dir) assert "78.93" in sanitized assert "73.07" in sanitized assert "68.67" in sanitized assert report["numbers_kept"] == 3 assert report["numbers_replaced"] == 0 def test_sanitize_rejects_regressed_refine_data(run_dir: Path) -> None: """BUG-222: Regressed REFINE iteration data must NOT pass sanitizer. Reproduces the Run 75 fabrication bypass: v1 had 74.52%, v3 regressed to 69.30%. Paper cited v3 numbers. The sanitizer should reject them. 
""" # v1 (best) promoted to experiment_summary_best.json (run_dir / "experiment_summary_best.json").write_text( json.dumps({ "best_run": {"metrics": {"FeatureKD/0/metric": 0.7452}}, "condition_summaries": { "FeatureKD": {"metrics": {"metric": 0.7452}}, "Teacher": {"metrics": {"metric": 0.7431}}, }, "metrics_summary": {"metric": {"mean": 0.7442, "min": 0.7431, "max": 0.7452}}, }, indent=2), encoding="utf-8" ) # v3 (regressed) in stage-14 (stale) _write_experiment_summary(run_dir, { "best_run": {"metrics": {"FeatureKD/0/metric": 0.6930}}, "condition_summaries": { "FeatureKD": {"metrics": {"metric": 0.6930}}, "Teacher": {"metrics": {"metric": 0.7292}}, }, "metrics_summary": {"metric": {"mean": 0.7111, "min": 0.6930, "max": 0.7292}}, }) # v3 sandbox data in refinement_log stage13 = run_dir / "stage-13_v2" stage13.mkdir(parents=True, exist_ok=True) (stage13 / "refinement_log.json").write_text(json.dumps({ "iterations": [{"sandbox": {"metrics": {"primary_metric": 0.6930}}}] }), encoding="utf-8") # Paper fabricates v3 numbers paper = ( "## Results\n\n" "| Method | Accuracy |\n" "| --- | --- |\n" "| FeatureKD | 69.30 |\n" "| Teacher | 72.92 |\n" ) sanitized, report = _sanitize_fabricated_data(paper, run_dir) # 69.30 should be REPLACED — it's from regressed v3, not promoted v1 assert "69.30" not in sanitized assert report["numbers_replaced"] >= 1 # But 74.52 or 74.31 (v1 best) would pass if cited paper_v1 = ( "## Results\n\n" "| Method | Accuracy |\n" "| --- | --- |\n" "| FeatureKD | 74.52 |\n" "| Teacher | 74.31 |\n" ) sanitized_v1, report_v1 = _sanitize_fabricated_data(paper_v1, run_dir) assert "74.52" in sanitized_v1 assert "74.31" in sanitized_v1 assert report_v1["numbers_replaced"] == 0 def test_sanitize_condition_names_with_decimals_preserved(run_dir: Path) -> None: """BUG-210: Condition names with decimal params (ema_decay_0.9) must not be damaged.""" _write_experiment_summary(run_dir, { "metrics_summary": {"accuracy": 73.07}, "best_run": {"metrics": {"accuracy": 
73.07}}, }) paper = ( "## Results\n\n" "| Condition | Accuracy |\n" "| --- | --- |\n" "| ema_decay_0.9 | 73.07 |\n" "| ema_decay_0.99 | 69.33 |\n" "| swa_start_0.75 | 68.67 |\n" ) sanitized, report = _sanitize_fabricated_data(paper, run_dir) # First column (condition names) must be completely preserved assert "ema_decay_0.9 " in sanitized, "Condition name 'ema_decay_0.9' damaged" assert "ema_decay_0.99" in sanitized, "Condition name 'ema_decay_0.99' damaged" assert "swa_start_0.75" in sanitized, "Condition name 'swa_start_0.75' damaged" # 73.07 is verified → kept assert "73.07" in sanitized def test_rl_compatibility_dqn_discrete_ok() -> None: """DQN + discrete env (CartPole) should be fine.""" code = """ import gymnasium as gym from stable_baselines3 import DQN env = gym.make('CartPole-v1') model = DQN('MlpPolicy', env) """ errors = _check_rl_compatibility(code) assert len(errors) == 0 # --------------------------------------------------------------------------- # BUG-211: LaTeX tabular sanitization # --------------------------------------------------------------------------- def test_sanitize_latex_tabular_replaces_unverified(run_dir: Path) -> None: """BUG-211: Numbers inside \\begin{tabular} must be sanitized.""" _write_experiment_summary(run_dir, { "metrics_summary": {"accuracy": 0.4816}, "best_run": {"metrics": {"accuracy": 0.4816}}, }) paper = ( "## Results\n\n" "```latex\n" "\\begin{table}[htbp]\n" "\\centering\n" "\\caption{Test accuracy for all configurations.}\n" "\\begin{tabular}{l c}\n" "\\toprule\n" "Method & Accuracy \\\\\n" "\\midrule\n" "baseline\\_resnet18 & \\textbf{0.4816} \\\\\n" "baseline\\_resnet50 & 0.4451 \\\\\n" "dropout\\_standard & 0.3243 \\\\\n" "\\bottomrule\n" "\\end{tabular}\n" "\\end{table}\n" "```\n" ) sanitized, report = _sanitize_fabricated_data(paper, run_dir) # 0.4816 is verified → kept assert "0.4816" in sanitized # 0.4451 and 0.3243 are unverified → replaced with --- assert "0.4451" not in sanitized assert "0.3243" not in 
sanitized assert "---" in sanitized assert report["tables_processed"] >= 1 assert report["numbers_replaced"] >= 2 def test_sanitize_latex_tabular_hp_table_skipped(run_dir: Path) -> None: """BUG-211: LaTeX HP tables should be skipped just like markdown ones.""" _write_experiment_summary(run_dir, { "metrics_summary": {"accuracy": 0.85}, }) paper = ( "\\begin{table}[htbp]\n" "\\centering\n" "\\caption{Training hyperparameters.}\n" "\\begin{tabular}{l c}\n" "\\toprule\n" "Hyperparameter & Value \\\\\n" "\\midrule\n" "Learning Rate & 0.001 \\\\\n" "Batch Size & 128 \\\\\n" "Weight Decay & 0.0005 \\\\\n" "\\bottomrule\n" "\\end{tabular}\n" "\\end{table}\n" ) sanitized, report = _sanitize_fabricated_data(paper, run_dir) # HP table — all values preserved, table NOT processed assert "0.001" in sanitized assert "0.0005" in sanitized def test_sanitize_latex_tabular_with_pm(run_dir: Path) -> None: """BUG-211: Numbers with ± in LaTeX cells must be individually checked.""" _write_experiment_summary(run_dir, { "metrics_summary": {"accuracy": 48.16, "accuracy_std": 0.35}, "best_run": {"metrics": {"accuracy": 48.16}}, "condition_summaries": { "method_a": {"primary_metric_mean": 48.16, "primary_metric_std": 0.35}, }, }) paper = ( "\\begin{tabular}{l c}\n" "\\toprule\n" "Method & Accuracy (mean $\\pm$ std) \\\\\n" "\\midrule\n" "method\\_a & 48.16 $\\pm$ 0.35 \\\\\n" "method\\_b & 32.43 $\\pm$ 0.45 \\\\\n" "\\bottomrule\n" "\\end{tabular}\n" ) sanitized, report = _sanitize_fabricated_data(paper, run_dir) # 48.16 and 0.35 are verified → kept assert "48.16" in sanitized assert "0.35" in sanitized # 32.43 and 0.45 are unverified → replaced assert "32.43" not in sanitized assert "0.45" not in sanitized def test_sanitize_latex_tabular_preserves_first_column(run_dir: Path) -> None: """BUG-211: First column (method names) must be preserved.""" _write_experiment_summary(run_dir, { "metrics_summary": {"accuracy": 0.85}, "best_run": {"metrics": {"accuracy": 0.85}}, }) paper = ( 
"\\begin{tabular}{l r r r r}\n" "\\toprule\n" "Method & Seed 0 & Seed 1 & Seed 2 & Mean \\\\\n" "\\midrule\n" "resnet\\_18 & 0.4861 & 0.4809 & 0.4777 & 0.4816 \\\\\n" "resnet\\_50 & 0.4455 & 0.4459 & 0.4438 & 0.4451 \\\\\n" "\\bottomrule\n" "\\end{tabular}\n" ) sanitized, report = _sanitize_fabricated_data(paper, run_dir) # Method names in first column must be preserved assert "resnet\\_18" in sanitized assert "resnet\\_50" in sanitized # --------------------------------------------------------------------------- # BUG-224: Statistical analysis tables should NOT be sanitized # --------------------------------------------------------------------------- def test_sanitize_skips_statistical_analysis_table(run_dir: Path) -> None: """BUG-224: Tables with t-statistics, p-values, and effect sizes are derived from experiment data and should not be sanitized.""" _write_experiment_summary(run_dir, { "metrics_summary": {"accuracy": {"mean": 64.26}}, "condition_summaries": {"ce": {"metrics": {"accuracy": 64.26}}}, }) paper = ( "## Results\n\n" "| Method | Accuracy |\n" "|--------|----------|\n" "| CE | 64.26 |\n" "| SCE | 56.93 |\n\n" "## Statistical Analysis\n\n" "| Comparison | t-statistic | p-value |\n" "|-----------|------------|--------|\n" "| CE vs SCE | 7.3267 | 0.0123 |\n" "| CE vs GCE | 1.7100 | 0.0569 |\n" ) sanitized, report = _sanitize_fabricated_data(paper, run_dir) # Results table: 64.26 is verified, 56.93 is NOT → gets replaced assert "56.93" not in sanitized or "---" in sanitized # Statistical table: 7.3267 and 0.0123 are derived → MUST be preserved assert "7.3267" in sanitized, "BUG-224: t-statistic was sanitized" assert "0.0123" in sanitized, "BUG-224: p-value was sanitized" assert "1.7100" in sanitized, "BUG-224: t-statistic was sanitized" assert "0.0569" in sanitized, "BUG-224: p-value was sanitized" def test_sanitize_preserves_common_hp_values(run_dir: Path) -> None: """BUG-224: Common HP values like 0.7 should be in the always-allowed set.""" 
_write_experiment_summary(run_dir, { "metrics_summary": {"accuracy": {"mean": 64.26}}, "condition_summaries": {"ce": {"metrics": {"accuracy": 64.26}}}, }) paper = ( "| Method | q | Accuracy |\n" "|--------|---|----------|\n" "| GCE | 0.7 | 64.26 |\n" "| GCE-05 | 0.5 | 66.77 |\n" ) sanitized, report = _sanitize_fabricated_data(paper, run_dir) # 0.7 should be preserved (always-allowed HP value) assert "0.7" in sanitized, "BUG-224: q=0.7 was incorrectly sanitized" # 0.5 should also be preserved assert "0.5" in sanitized ================================================ FILE: tests/test_rc_sentinel.py ================================================ # pyright: reportPrivateUsage=false """Tests for the sentinel watchdog and heartbeat system.""" from __future__ import annotations import json import os import subprocess from pathlib import Path import pytest from researchclaw.pipeline import runner as rc_runner from researchclaw.pipeline.stages import Stage # ── Heartbeat writing tests ── class TestHeartbeatWriting: def test_write_heartbeat_creates_file(self, tmp_path: Path) -> None: rc_runner._write_heartbeat(tmp_path, Stage.TOPIC_INIT, "run-hb-1") hb_path = tmp_path / "heartbeat.json" assert hb_path.exists() def test_heartbeat_contains_required_fields(self, tmp_path: Path) -> None: rc_runner._write_heartbeat(tmp_path, Stage.HYPOTHESIS_GEN, "run-hb-2") data = json.loads((tmp_path / "heartbeat.json").read_text()) assert data["pid"] == os.getpid() assert data["last_stage"] == 8 assert data["last_stage_name"] == "HYPOTHESIS_GEN" assert data["run_id"] == "run-hb-2" assert "timestamp" in data def test_heartbeat_updates_on_each_stage(self, tmp_path: Path) -> None: rc_runner._write_heartbeat(tmp_path, Stage.TOPIC_INIT, "run-1") data1 = json.loads((tmp_path / "heartbeat.json").read_text()) rc_runner._write_heartbeat(tmp_path, Stage.PAPER_DRAFT, "run-1") data2 = json.loads((tmp_path / "heartbeat.json").read_text()) assert data2["last_stage"] == 17 assert data1["last_stage"] == 1 
class TestHeartbeatInPipeline: def test_pipeline_writes_heartbeat_after_each_stage( self, monkeypatch: pytest.MonkeyPatch, tmp_path: Path, ) -> None: from researchclaw.adapters import AdapterBundle from researchclaw.config import RCConfig from researchclaw.pipeline.executor import StageResult from researchclaw.pipeline.stages import StageStatus data = { "project": {"name": "hb-test", "mode": "docs-first"}, "research": {"topic": "heartbeat testing"}, "runtime": {"timezone": "UTC"}, "notifications": {"channel": "local"}, "knowledge_base": {"backend": "markdown", "root": str(tmp_path / "kb")}, "openclaw_bridge": {}, "llm": { "provider": "openai-compatible", "base_url": "http://localhost/v1", "api_key_env": "K", "api_key": "k", }, } config = RCConfig.from_dict(data, project_root=tmp_path, check_paths=False) run_dir = tmp_path / "run" run_dir.mkdir() call_count = 0 def mock_execute_stage(stage: Stage, **kwargs) -> StageResult: nonlocal call_count call_count += 1 if call_count >= 3: return StageResult( stage=stage, status=StageStatus.FAILED, artifacts=(), error="stop" ) return StageResult(stage=stage, status=StageStatus.DONE, artifacts=("x.md",)) monkeypatch.setattr(rc_runner, "execute_stage", mock_execute_stage) rc_runner.execute_pipeline( run_dir=run_dir, run_id="hb-test", config=config, adapters=AdapterBundle(), ) hb_path = run_dir / "heartbeat.json" assert hb_path.exists() data_out = json.loads(hb_path.read_text()) assert data_out["run_id"] == "hb-test" # ── Sentinel script syntax check ── class TestSentinelScript: def test_sentinel_script_exists(self) -> None: script = Path(__file__).parent.parent / "sentinel.sh" assert script.exists() def test_sentinel_script_is_valid_bash(self) -> None: script = Path(__file__).parent.parent / "sentinel.sh" result = subprocess.run( ["bash", "-n", str(script)], capture_output=True, text=True, ) assert result.returncode == 0, f"Bash syntax error: {result.stderr}" def test_sentinel_script_is_executable(self) -> None: script = 
Path(__file__).parent.parent / "sentinel.sh" assert os.access(script, os.X_OK) def test_sentinel_script_has_shebang(self) -> None: script = Path(__file__).parent.parent / "sentinel.sh" first_line = script.read_text().splitlines()[0] assert first_line.startswith("#!/") def test_sentinel_prints_usage_on_no_args(self) -> None: script = Path(__file__).parent.parent / "sentinel.sh" result = subprocess.run( ["bash", str(script)], capture_output=True, text=True, ) # Should fail because no run_dir argument provided assert result.returncode != 0 ================================================ FILE: tests/test_rc_stages.py ================================================ import pytest from researchclaw.pipeline.stages import ( DECISION_ROLLBACK, GATE_ROLLBACK, GATE_STAGES, MAX_DECISION_PIVOTS, NEXT_STAGE, PHASE_MAP, PREVIOUS_STAGE, STAGE_SEQUENCE, TRANSITION_MAP, Stage, StageStatus, TransitionEvent, TransitionOutcome, advance, default_rollback_stage, gate_required, ) def test_stage_enum_has_exactly_23_members(): assert len(Stage) == 23 @pytest.mark.parametrize( "index,stage", [(idx, stage) for idx, stage in enumerate(STAGE_SEQUENCE, start=1)] ) def test_stage_values_follow_sequence_order(index: int, stage: Stage): assert int(stage) == index def test_stage_sequence_contains_all_23_stages_in_order(): assert len(STAGE_SEQUENCE) == 23 assert STAGE_SEQUENCE[0] is Stage.TOPIC_INIT assert STAGE_SEQUENCE[-1] is Stage.CITATION_VERIFY assert tuple(Stage) == STAGE_SEQUENCE def test_next_stage_boundary_values(): assert NEXT_STAGE[Stage.TOPIC_INIT] is Stage.PROBLEM_DECOMPOSE assert NEXT_STAGE[Stage.EXPORT_PUBLISH] is Stage.CITATION_VERIFY def test_previous_stage_boundary_values(): assert PREVIOUS_STAGE[Stage.TOPIC_INIT] is None assert PREVIOUS_STAGE[Stage.PROBLEM_DECOMPOSE] is Stage.TOPIC_INIT def test_gate_stages_matches_expected_set(): assert GATE_STAGES == frozenset( {Stage.LITERATURE_SCREEN, Stage.EXPERIMENT_DESIGN, Stage.QUALITY_GATE} ) def 
test_gate_rollback_map_matches_expected_targets(): assert GATE_ROLLBACK == { Stage.LITERATURE_SCREEN: Stage.LITERATURE_COLLECT, Stage.EXPERIMENT_DESIGN: Stage.HYPOTHESIS_GEN, Stage.QUALITY_GATE: Stage.PAPER_OUTLINE, } def test_phase_map_has_8_phases_with_expected_membership(): assert len(PHASE_MAP) == 8 assert PHASE_MAP["A: Research Scoping"] == ( Stage.TOPIC_INIT, Stage.PROBLEM_DECOMPOSE, ) assert PHASE_MAP["B: Literature Discovery"] == ( Stage.SEARCH_STRATEGY, Stage.LITERATURE_COLLECT, Stage.LITERATURE_SCREEN, Stage.KNOWLEDGE_EXTRACT, ) assert PHASE_MAP["C: Knowledge Synthesis"] == ( Stage.SYNTHESIS, Stage.HYPOTHESIS_GEN, ) assert PHASE_MAP["D: Experiment Design"] == ( Stage.EXPERIMENT_DESIGN, Stage.CODE_GENERATION, Stage.RESOURCE_PLANNING, ) assert PHASE_MAP["E: Experiment Execution"] == ( Stage.EXPERIMENT_RUN, Stage.ITERATIVE_REFINE, ) assert PHASE_MAP["F: Analysis & Decision"] == ( Stage.RESULT_ANALYSIS, Stage.RESEARCH_DECISION, ) assert PHASE_MAP["G: Paper Writing"] == ( Stage.PAPER_OUTLINE, Stage.PAPER_DRAFT, Stage.PEER_REVIEW, Stage.PAPER_REVISION, ) assert PHASE_MAP["H: Finalization"] == ( Stage.QUALITY_GATE, Stage.KNOWLEDGE_ARCHIVE, Stage.EXPORT_PUBLISH, Stage.CITATION_VERIFY, ) def test_phase_map_covers_all_stages_exactly_once(): flattened = tuple(stage for stages in PHASE_MAP.values() for stage in stages) assert len(flattened) == 23 assert set(flattened) == set(Stage) @pytest.mark.parametrize( "status", [StageStatus.PENDING, StageStatus.RETRYING, StageStatus.PAUSED], ) def test_start_event_transitions_to_running_from_allowed_states(status: StageStatus): outcome = advance(Stage.EXPERIMENT_RUN, status, TransitionEvent.START) assert outcome.status is StageStatus.RUNNING assert outcome.next_stage is Stage.EXPERIMENT_RUN def test_succeed_event_on_non_gate_stage_transitions_to_done(): outcome = advance( Stage.SEARCH_STRATEGY, StageStatus.RUNNING, TransitionEvent.SUCCEED, hitl_required_stages=(5, 9, 20), ) assert outcome.status is StageStatus.DONE assert 
outcome.next_stage is Stage.LITERATURE_COLLECT assert outcome.checkpoint_required is True assert outcome.decision == "proceed" def test_succeed_event_on_gate_stage_transitions_to_blocked_approval(): outcome = advance( Stage.LITERATURE_SCREEN, StageStatus.RUNNING, TransitionEvent.SUCCEED, hitl_required_stages=(5, 20), ) assert outcome.status is StageStatus.BLOCKED_APPROVAL assert outcome.next_stage is Stage.LITERATURE_SCREEN assert outcome.checkpoint_required is False assert outcome.decision == "block" def test_approve_event_transitions_blocked_stage_to_done(): outcome = advance( Stage.EXPERIMENT_DESIGN, StageStatus.BLOCKED_APPROVAL, TransitionEvent.APPROVE, hitl_required_stages=(5, 9, 20), ) assert outcome.status is StageStatus.DONE assert outcome.next_stage is Stage.CODE_GENERATION assert outcome.checkpoint_required is True def test_reject_event_rolls_back_to_default_gate_mapping(): outcome = advance( Stage.QUALITY_GATE, StageStatus.BLOCKED_APPROVAL, TransitionEvent.REJECT, hitl_required_stages=(5, 9, 20), ) assert outcome.status is StageStatus.PENDING assert outcome.stage is Stage.PAPER_OUTLINE assert outcome.next_stage is Stage.PAPER_OUTLINE assert outcome.rollback_stage is Stage.PAPER_OUTLINE assert outcome.checkpoint_required is True assert outcome.decision == "pivot" def test_reject_event_uses_explicit_rollback_stage_when_provided(): outcome = advance( Stage.PAPER_REVISION, StageStatus.BLOCKED_APPROVAL, TransitionEvent.REJECT, rollback_stage=Stage.PAPER_OUTLINE, ) assert outcome.status is StageStatus.PENDING assert outcome.stage is Stage.PAPER_OUTLINE assert outcome.next_stage is Stage.PAPER_OUTLINE assert outcome.rollback_stage is Stage.PAPER_OUTLINE def test_timeout_event_transitions_to_paused_with_block_decision(): outcome = advance( Stage.LITERATURE_SCREEN, StageStatus.BLOCKED_APPROVAL, TransitionEvent.TIMEOUT, ) assert outcome.status is StageStatus.PAUSED assert outcome.next_stage is Stage.LITERATURE_SCREEN assert outcome.checkpoint_required is True 
assert outcome.decision == "block" def test_fail_event_transitions_running_to_failed_with_retry_decision(): outcome = advance(Stage.EXPERIMENT_RUN, StageStatus.RUNNING, TransitionEvent.FAIL) assert outcome.status is StageStatus.FAILED assert outcome.next_stage is Stage.EXPERIMENT_RUN assert outcome.checkpoint_required is True assert outcome.decision == "retry" def test_retry_event_transitions_failed_to_retrying(): outcome = advance(Stage.EXPERIMENT_RUN, StageStatus.FAILED, TransitionEvent.RETRY) assert outcome.status is StageStatus.RETRYING assert outcome.next_stage is Stage.EXPERIMENT_RUN assert outcome.decision == "retry" def test_resume_event_transitions_paused_to_running(): outcome = advance(Stage.EXPERIMENT_RUN, StageStatus.PAUSED, TransitionEvent.RESUME) assert outcome.status is StageStatus.RUNNING assert outcome.next_stage is Stage.EXPERIMENT_RUN def test_pause_event_transitions_failed_to_paused(): outcome = advance(Stage.EXPERIMENT_RUN, StageStatus.FAILED, TransitionEvent.PAUSE) assert outcome.status is StageStatus.PAUSED assert outcome.next_stage is Stage.EXPERIMENT_RUN assert outcome.checkpoint_required is True assert outcome.decision == "block" def test_invalid_transition_raises_value_error(): with pytest.raises(ValueError, match="Unsupported transition"): _ = advance(Stage.TOPIC_INIT, StageStatus.DONE, TransitionEvent.START) def test_advance_rejects_unknown_transition_event_string(): with pytest.raises(ValueError, match="not a valid TransitionEvent"): _ = advance(Stage.TOPIC_INIT, StageStatus.PENDING, "unknown") @pytest.mark.parametrize("stage", tuple(GATE_STAGES)) def test_gate_required_for_gate_stages_with_default_config(stage: Stage): assert gate_required(stage, None) is True @pytest.mark.parametrize("stage", tuple(GATE_STAGES)) def test_gate_required_respects_hitl_stage_subset(stage: Stage): required = (5, 20) assert gate_required(stage, required) is (int(stage) in required) @pytest.mark.parametrize("stage", tuple(s for s in Stage if s not in 
GATE_STAGES)) def test_gate_required_is_false_for_non_gate_stages(stage: Stage): assert gate_required(stage, (5, 9, 20)) is False @pytest.mark.parametrize( "stage,expected", [ (Stage.LITERATURE_SCREEN, Stage.LITERATURE_COLLECT), (Stage.EXPERIMENT_DESIGN, Stage.HYPOTHESIS_GEN), (Stage.QUALITY_GATE, Stage.PAPER_OUTLINE), ], ) def test_default_rollback_stage_for_known_gate_mappings(stage: Stage, expected: Stage): assert default_rollback_stage(stage) is expected def test_default_rollback_stage_for_unknown_stage_uses_previous_stage(): assert default_rollback_stage(Stage.PAPER_DRAFT) is Stage.PAPER_OUTLINE def test_default_rollback_stage_for_first_stage_returns_self(): assert default_rollback_stage(Stage.TOPIC_INIT) is Stage.TOPIC_INIT def test_transition_outcome_field_values_are_exposed(): outcome = TransitionOutcome( stage=Stage.TOPIC_INIT, status=StageStatus.RUNNING, next_stage=Stage.TOPIC_INIT, rollback_stage=Stage.TOPIC_INIT, checkpoint_required=True, decision="block", ) assert outcome.checkpoint_required is True assert outcome.decision == "block" def test_sequence_and_neighbor_maps_are_consistent_for_all_stages(): for idx, stage in enumerate(STAGE_SEQUENCE): expected_prev = STAGE_SEQUENCE[idx - 1] if idx > 0 else None expected_next = ( STAGE_SEQUENCE[idx + 1] if idx + 1 < len(STAGE_SEQUENCE) else None ) assert PREVIOUS_STAGE[stage] is expected_prev assert NEXT_STAGE[stage] is expected_next def test_transition_map_covers_all_stage_status_values(): assert set(TRANSITION_MAP.keys()) == set(StageStatus) for source_status, targets in TRANSITION_MAP.items(): assert isinstance(targets, frozenset) assert all(target in StageStatus for target in targets) if source_status is StageStatus.DONE: assert targets == frozenset() # ── DECISION_ROLLBACK tests ── def test_decision_rollback_has_pivot_and_refine(): assert "pivot" in DECISION_ROLLBACK assert "refine" in DECISION_ROLLBACK def test_decision_rollback_pivot_targets_hypothesis_gen(): assert DECISION_ROLLBACK["pivot"] is 
Stage.HYPOTHESIS_GEN def test_decision_rollback_refine_targets_iterative_refine(): assert DECISION_ROLLBACK["refine"] is Stage.ITERATIVE_REFINE def test_max_decision_pivots_is_positive(): assert MAX_DECISION_PIVOTS >= 1 ================================================ FILE: tests/test_rc_templates.py ================================================ """Unit tests for researchclaw.templates — conference templates + MD→LaTeX converter.""" from __future__ import annotations import threading import pytest from researchclaw.templates.conference import ( CONFERENCE_REGISTRY, ConferenceTemplate, get_template, list_conferences, NEURIPS_2024, NEURIPS_2025, ICLR_2025, ICLR_2026, ICML_2025, ICML_2026, ) from researchclaw.templates.converter import ( markdown_to_latex, _parse_sections, _extract_title, _extract_abstract, _convert_inline, _escape_latex, _escape_algo_line, _render_code_block, _build_body, _render_table, _parse_table_row, _parse_alignments, _render_itemize, _render_enumerate, _reset_render_counters, _next_table_num, _next_figure_num, check_paper_completeness, # noqa: F401 ) # ===================================================================== # conference.py tests # ===================================================================== class TestConferenceTemplate: """Tests for ConferenceTemplate dataclass.""" def test_neurips_basic_fields(self) -> None: t = NEURIPS_2024 assert t.name == "neurips_2024" assert t.display_name == "NeurIPS 2024" assert t.year == 2024 assert t.document_class == "article" assert t.style_package == "neurips_2024" assert t.columns == 1 assert t.author_format == "neurips" assert t.bib_style == "plainnat" def test_iclr_basic_fields(self) -> None: t = ICLR_2025 assert t.name == "iclr_2025" assert t.year == 2025 assert t.style_package == "iclr2025_conference" assert t.bib_style == "iclr2025_conference" assert t.columns == 1 assert t.author_format == "iclr" def test_icml_basic_fields(self) -> None: t = ICML_2025 assert t.name == "icml_2025" 
assert t.year == 2025 assert t.style_package == "icml2025" assert t.columns == 2 assert t.author_format == "icml" assert t.bib_style == "icml2025" def test_frozen(self) -> None: with pytest.raises(AttributeError): NEURIPS_2024.name = "hacked" # type: ignore[misc] class TestRenderPreamble: """Tests for ConferenceTemplate.render_preamble().""" def test_neurips_preamble_structure(self) -> None: tex = NEURIPS_2024.render_preamble("My Title", "J. Doe", "An abstract.") assert r"\documentclass{article}" in tex assert r"\usepackage[preprint]{neurips_2024}" in tex assert r"\title{My Title}" in tex assert r"\author{J. Doe}" in tex assert r"\begin{abstract}" in tex assert "An abstract." in tex assert r"\end{abstract}" in tex assert r"\begin{document}" in tex assert r"\maketitle" in tex def test_iclr_preamble_no_options(self) -> None: tex = ICLR_2025.render_preamble("Title", "Author", "Abstract") assert r"\documentclass{article}" in tex # no options assert r"\usepackage{iclr2025_conference}" in tex def test_icml_author_block(self) -> None: tex = ICML_2025.render_preamble("Title", "Alice", "Abstract") assert r"\begin{icmlauthorlist}" in tex assert r"\icmlauthor{Alice}{aff1}" in tex assert r"\end{icmlauthorlist}" in tex assert r"\icmlaffiliation{aff1}{Affiliation}" in tex def test_icml_preamble_extra(self) -> None: tex = ICML_2025.render_preamble("Title", "Author", "Abstract") assert r"\icmltitlerunning{Title}" in tex class TestRenderFooter: """Tests for ConferenceTemplate.render_footer().""" def test_neurips_footer(self) -> None: tex = NEURIPS_2024.render_footer("refs") assert r"\bibliographystyle{plainnat}" in tex assert r"\bibliography{refs}" in tex assert r"\end{document}" in tex def test_icml_footer(self) -> None: tex = ICML_2025.render_footer() assert r"\bibliographystyle{icml2025}" in tex assert r"\bibliography{references}" in tex def test_default_bib_file(self) -> None: tex = NEURIPS_2024.render_footer() assert r"\bibliography{references}" in tex class TestGetTemplate: 
"""Tests for get_template() lookup.""" def test_full_name(self) -> None: assert get_template("neurips_2024") is NEURIPS_2024 def test_short_alias(self) -> None: assert get_template("neurips") is NEURIPS_2025 assert get_template("iclr") is ICLR_2026 assert get_template("icml") is ICML_2026 def test_case_insensitive(self) -> None: assert get_template("NeurIPS") is NEURIPS_2025 assert get_template("ICML_2026") is ICML_2026 def test_dash_and_space_normalization(self) -> None: assert get_template("neurips-2025") is NEURIPS_2025 assert get_template("icml 2026") is ICML_2026 def test_unknown_raises(self) -> None: with pytest.raises(KeyError, match="Unknown conference"): get_template("aaai_2025") class TestListConferences: """Tests for list_conferences().""" def test_returns_canonical_names(self) -> None: names = list_conferences() assert "neurips_2025" in names assert "iclr_2026" in names assert "icml_2026" in names # Should be deduplicated — no aliases (6 conference + 1 generic) assert len(names) == 7 def test_sorted(self) -> None: names = list_conferences() assert names == sorted(names) class TestConferenceRegistry: """Tests for CONFERENCE_REGISTRY dict.""" def test_all_aliases_resolve(self) -> None: for key, tpl in CONFERENCE_REGISTRY.items(): assert isinstance(tpl, ConferenceTemplate) assert tpl.name # not empty # ===================================================================== # converter.py tests # ===================================================================== class TestParseSections: """Tests for _parse_sections().""" def test_empty(self) -> None: sections = _parse_sections("") assert len(sections) == 1 assert sections[0].level == 1 assert sections[0].body == "" def test_single_heading(self) -> None: md = "# Introduction\nHello world" sections = _parse_sections(md) assert len(sections) == 1 assert sections[0].level == 1 assert sections[0].heading == "Introduction" assert "Hello world" in sections[0].body def test_multiple_headings(self) -> None: md = "# 
Title\nfoo\n## Method\nbar\n### Details\nbaz" sections = _parse_sections(md) assert len(sections) == 3 assert sections[0].heading == "Title" assert sections[1].heading == "Method" assert sections[2].heading == "Details" def test_preamble_before_heading(self) -> None: md = "Some text before\n\n# First\nBody" sections = _parse_sections(md) assert len(sections) == 2 assert sections[0].level == 0 assert "Some text before" in sections[0].body def test_heading_lower(self) -> None: md = "# Abstract\nContent" sections = _parse_sections(md) assert sections[0].heading_lower == "abstract" class TestExtractTitle: """Tests for _extract_title().""" def test_bold_title_after_heading(self) -> None: md = "# Title\n**My Paper**\n\n# Abstract\nblah" sections = _parse_sections(md) assert _extract_title(sections, md) == "My Paper" def test_first_non_meta_h1(self) -> None: md = "# Introduction\nSome text" sections = _parse_sections(md) assert _extract_title(sections, md) == "Introduction" def test_fallback(self) -> None: sections = _parse_sections("") assert _extract_title(sections, "") == "Untitled Paper" class TestExtractAbstract: """Tests for _extract_abstract().""" def test_from_h1(self) -> None: md = "# Abstract\nThis is the abstract.\n\n# Intro\nBody" sections = _parse_sections(md) assert "This is the abstract." in _extract_abstract(sections) def test_from_h2(self) -> None: md = "# Title\nfoo\n## Abstract\nAbstract text.\n## Intro" sections = _parse_sections(md) assert "Abstract text." 
in _extract_abstract(sections) def test_missing_abstract(self) -> None: md = "# Introduction\nNo abstract here" sections = _parse_sections(md) assert _extract_abstract(sections) == "" class TestConvertInline: """Tests for _convert_inline().""" def test_bold(self) -> None: assert r"\textbf{bold}" in _convert_inline("**bold**") def test_italic(self) -> None: assert r"\textit{italic}" in _convert_inline("*italic*") def test_inline_code(self) -> None: assert r"\texttt{code}" in _convert_inline("`code`") def test_link(self) -> None: result = _convert_inline("[text](http://example.com)") assert r"\href{http://example.com}{text}" in result def test_special_chars_escaped(self) -> None: result = _convert_inline("100% done & 5# items") assert r"100\% done \& 5\# items" in result def test_math_preserved(self) -> None: result = _convert_inline(r"where \(x + y\) is given") assert r"\(x + y\)" in result def test_cite_preserved(self) -> None: result = _convert_inline(r"as shown by \cite{doe2024}") assert r"\cite{doe2024}" in result def test_dollar_math_preserved(self) -> None: result = _convert_inline("the value $x^2$ is") assert "$x^2$" in result def test_pre_escaped_underscore_not_doubled(self) -> None: """BUG-182: LLM pre-escapes underscores → must NOT double-escape to \\\\_.""" result = _convert_inline(r"RawObservation\_PPO\_WithNorm") assert r"\\_" not in result, f"Double-escaped: {result}" assert r"\_" in result def test_pre_escaped_underscore_near_math(self) -> None: """BUG-182: Pre-escaped underscore adjacent to math must not break.""" result = _convert_inline( r"RawObs\_PPO. 
Statistics \(\mu_t\) are given" ) assert r"\\_" not in result assert r"\_" in result assert r"\(\mu_t\)" in result def test_pre_escaped_hash_not_doubled(self) -> None: """BUG-182: Pre-escaped hash should not be double-escaped.""" result = _convert_inline(r"Section \#3 details") assert r"\\#" not in result assert r"\#" in result class TestEscapeLatex: """Tests for _escape_latex().""" def test_special_chars(self) -> None: assert r"\#" in _escape_latex("#") assert r"\%" in _escape_latex("%") assert r"\&" in _escape_latex("&") assert r"\_" in _escape_latex("_") def test_math_not_escaped(self) -> None: result = _escape_latex(r"value \(x_1\) here") assert r"\(x_1\)" in result # underscore inside math preserved class TestBuildBody: """Tests for _build_body().""" def test_skips_title_and_abstract(self) -> None: md = "# Title\nfoo\n# Abstract\nbar\n# Introduction\nbaz" sections = _parse_sections(md) body = _build_body(sections) assert r"\section{Introduction}" in body assert "baz" in body # Title and abstract should not appear as sections assert r"\section{Title}" not in body assert r"\section{Abstract}" not in body def test_subsection_promoted_when_all_h2(self) -> None: """T1.3: When all body sections are H2, they should be promoted to \\section.""" md = "## Method\ntext" sections = _parse_sections(md) body = _build_body(sections) # All-H2 document → auto-promoted to \section assert r"\section{Method}" in body def test_h2_promoted_under_h1_title(self) -> None: """When title occupies H1, H2 body sections promote to \\section.""" md = "# My Paper\ntitle body\n## Method\ntext" sections = _parse_sections(md) body = _build_body(sections, title="My Paper") assert r"\section{Method}" in body def test_subsubsection(self) -> None: md = "## Intro\nintro\n### Details\ntext" sections = _parse_sections(md) body = _build_body(sections) # H2 promoted to \section, H3 promoted to \subsection assert r"\subsection{Details}" in body class TestListRendering: """Tests for bullet and numbered 
list rendering.""" def test_bullet_list(self) -> None: items = ["First item", "Second item"] result = _render_itemize(items) assert r"\begin{itemize}" in result assert r"\item First item" in result assert r"\item Second item" in result assert r"\end{itemize}" in result def test_numbered_list(self) -> None: items = ["Step one", "Step two"] result = _render_enumerate(items) assert r"\begin{enumerate}" in result assert r"\item Step one" in result assert r"\end{enumerate}" in result class TestTableRendering: """Tests for Markdown table → LaTeX tabular conversion.""" def test_parse_table_row(self) -> None: assert _parse_table_row("| a | b | c |") == ["a", "b", "c"] def test_parse_alignments(self) -> None: assert _parse_alignments("| --- | :---: | ---: |", 3) == ["l", "c", "r"] def test_render_simple_table(self) -> None: lines = [ "| Name | Value |", "| --- | --- |", "| A | 1 |", "| B | 2 |", ] result = _render_table(lines) assert r"\begin{table}" in result assert r"\begin{tabular}{ll}" in result assert r"\toprule" in result assert r"\textbf{Name}" in result assert r"\midrule" in result assert r"\bottomrule" in result assert r"\end{tabular}" in result assert r"\end{table}" in result def test_render_counters_are_thread_local(self) -> None: results: list[tuple[int, int, int]] = [] lock = threading.Lock() def worker() -> None: _reset_render_counters() value = (_next_table_num(), _next_table_num(), _next_figure_num()) with lock: results.append(value) threads = [threading.Thread(target=worker) for _ in range(4)] for thread in threads: thread.start() for thread in threads: thread.join() assert results == [(1, 2, 1)] * 4 # ===================================================================== # markdown_to_latex integration tests # ===================================================================== class TestMarkdownToLatex: """Integration tests for the full conversion pipeline.""" SAMPLE_MD = ( "# Title\n" "**My Great Paper**\n\n" "# Abstract\n" "This is the abstract.\n\n" "# 
Introduction\n" "We study the problem of RL.\n\n" "## Related Work\n" "Prior work includes **many** approaches.\n\n" "# Method\n" "Our method uses \\(f(x) = x^2\\) as the objective.\n\n" "# Results\n" "- Result 1\n" "- Result 2\n\n" "# Conclusion\n" "We conclude.\n\n" "# References\n" "1. Doe et al. (2024)\n" ) def test_neurips_full(self) -> None: tex = markdown_to_latex(self.SAMPLE_MD, NEURIPS_2024) assert r"\documentclass{article}" in tex assert r"\usepackage[preprint]{neurips_2024}" in tex assert r"\title{My Great Paper}" in tex assert r"\begin{abstract}" in tex assert "This is the abstract." in tex assert r"\section{Introduction}" in tex assert r"\subsection{Related Work}" in tex assert r"\section{Method}" in tex assert r"\begin{itemize}" in tex assert r"\bibliographystyle{plainnat}" in tex assert r"\end{document}" in tex def test_iclr_full(self) -> None: tex = markdown_to_latex(self.SAMPLE_MD, ICLR_2025) assert r"\usepackage{iclr2025_conference}" in tex assert r"\bibliographystyle{iclr2025_conference}" in tex def test_icml_full(self) -> None: tex = markdown_to_latex(self.SAMPLE_MD, ICML_2025, authors="Alice") assert r"\begin{icmlauthorlist}" in tex assert r"\icmlauthor{Alice}{aff1}" in tex assert r"\bibliographystyle{icml2025}" in tex def test_custom_title_override(self) -> None: tex = markdown_to_latex( "# Abstract\nblah\n# Intro\nbody", NEURIPS_2024, title="Override Title", ) assert r"\title{Override Title}" in tex def test_custom_authors(self) -> None: tex = markdown_to_latex(self.SAMPLE_MD, NEURIPS_2024, authors="Jane Doe") assert r"\author{Jane Doe}" in tex def test_custom_bib_file(self) -> None: tex = markdown_to_latex(self.SAMPLE_MD, NEURIPS_2024, bib_file="my_refs") assert r"\bibliography{my_refs}" in tex def test_math_preserved_in_output(self) -> None: md = "# Abstract\nabs\n# Method\n\\(f(x)\\) and \\[E = mc^2\\]" tex = markdown_to_latex(md, NEURIPS_2024, title="T") assert r"\(f(x)\)" in tex assert r"\[E = mc^2\]" in tex def test_empty_paper(self) -> 
None: tex = markdown_to_latex("", NEURIPS_2024, title="Empty") assert r"\begin{document}" in tex assert r"\end{document}" in tex def test_display_math_block(self) -> None: md = "# Abstract\nabs\n# Method\n\\[\nx = y + z\n\\]" tex = markdown_to_latex(md, NEURIPS_2024, title="T") assert "x = y + z" in tex def test_code_block(self) -> None: md = "# Abstract\nabs\n# Method\n```python\nprint('hello')\n```" tex = markdown_to_latex(md, NEURIPS_2024, title="T") assert r"\begin{verbatim}" in tex assert "print('hello')" in tex assert r"\end{verbatim}" in tex def test_table_in_paper(self) -> None: md = ( "# Abstract\nabs\n" "# Results\n" "| Model | Score |\n" "| --- | --- |\n" "| Ours | 95.0 |\n" ) tex = markdown_to_latex(md, NEURIPS_2024, title="T") assert r"\begin{tabular}" in tex assert r"\textbf{Model}" in tex # ===================================================================== # ExportConfig tests # ===================================================================== class TestExportConfig: """Tests for ExportConfig in config.py.""" def test_default_values(self) -> None: from researchclaw.config import ExportConfig ec = ExportConfig() assert ec.target_conference == "neurips_2025" assert ec.authors == "Anonymous" assert ec.bib_file == "references" def test_frozen(self) -> None: from researchclaw.config import ExportConfig ec = ExportConfig() with pytest.raises(AttributeError): ec.target_conference = "icml" # type: ignore[misc] def test_rcconfig_has_export(self) -> None: from researchclaw.config import RCConfig cfg = RCConfig.load("config.researchclaw.example.yaml", check_paths=False) assert hasattr(cfg, "export") assert cfg.export.target_conference == "neurips_2025" def test_rcconfig_export_from_dict(self) -> None: from researchclaw.config import RCConfig import yaml from pathlib import Path data = yaml.safe_load(Path("config.researchclaw.example.yaml").read_text()) data["export"] = { "target_conference": "icml_2025", "authors": "Test Author", "bib_file": "mybib", } 
cfg = RCConfig.from_dict(data, check_paths=False) assert cfg.export.target_conference == "icml_2025" assert cfg.export.authors == "Test Author" assert cfg.export.bib_file == "mybib" # ===================================================================== # hitl_required_stages validation update test # ===================================================================== class TestHitlStageValidation: """Test that hitl_required_stages now accepts up to stage 23.""" def test_stage_23_valid(self) -> None: from researchclaw.config import validate_config import yaml from pathlib import Path data = yaml.safe_load(Path("config.researchclaw.example.yaml").read_text()) data.setdefault("security", {})["hitl_required_stages"] = [1, 22, 23] result = validate_config(data, check_paths=False) assert result.ok, f"Errors: {result.errors}" def test_get_style_files_returns_bundled_sty(self) -> None: """Each conference template bundles at least one .sty file.""" for name in ["neurips_2025", "neurips_2024", "iclr_2026", "iclr_2025", "icml_2026", "icml_2025"]: tpl = get_template(name) files = tpl.get_style_files() assert len(files) >= 1, f"No style files for {name}" sty_names = [f.name for f in files] assert any(f.endswith(".sty") for f in sty_names), f"No .sty file for {name}" def test_iclr_icml_have_bst_files(self) -> None: """ICLR and ICML templates bundle custom .bst files.""" for name in ["iclr_2026", "iclr_2025", "icml_2026", "icml_2025"]: tpl = get_template(name) files = tpl.get_style_files() bst_names = [f.name for f in files if f.suffix == ".bst"] assert len(bst_names) >= 1, f"No .bst file for {name}" def test_stage_24_invalid(self) -> None: from researchclaw.config import validate_config import yaml from pathlib import Path data = yaml.safe_load(Path("config.researchclaw.example.yaml").read_text()) data.setdefault("security", {})["hitl_required_stages"] = [24] result = validate_config(data, check_paths=False) assert not result.ok assert any("24" in e for e in result.errors) # 
===================================================================== # check_paper_completeness — section word count + bullet density checks # ===================================================================== class TestCompletenessWordCountAndBullets: """Tests for new per-section word count and bullet density checks.""" @staticmethod def _make_sections(section_specs: list[tuple[str, int, bool]]) -> list: """Build _Section objects from (heading, word_count, use_bullets) specs.""" results = [] for heading, wc, bullets in section_specs: if bullets: lines = [f"- Point number {i}" for i in range(wc // 3)] body = "\n".join(lines) else: body = " ".join(["word"] * wc) results.append( type("_Section", (), { "level": 1, "heading": heading, "heading_lower": heading.lower(), "body": body, })() ) return results def test_completeness_section_word_count_short(self) -> None: """A Method section with only 100 words triggers a warning.""" secs = self._make_sections([ ("Title", 5, False), ("Abstract", 200, False), ("Introduction", 900, False), ("Related Work", 700, False), ("Method", 100, False), ("Experiments", 1000, False), ("Results", 700, False), ("Conclusion", 250, False), ]) warns = check_paper_completeness(secs) method_warns = [w for w in warns if "Method" in w and "words" in w] assert len(method_warns) >= 1, f"Expected word count warning, got: {warns}" def test_completeness_bullet_density(self) -> None: """A Method section full of bullet points triggers a warning.""" secs = self._make_sections([ ("Title", 5, False), ("Abstract", 200, False), ("Introduction", 900, False), ("Related Work", 700, False), ("Method", 300, True), ("Experiments", 1000, False), ("Results", 700, False), ("Conclusion", 250, False), ]) warns = check_paper_completeness(secs) bullet_warns = [w for w in warns if "bullet" in w.lower() and "Method" in w] assert len(bullet_warns) >= 1, f"Expected bullet warning, got: {warns}" # ===================================================================== # 
BUG-177: Algorithm pseudocode escaping tests # ===================================================================== class TestAlgorithmEscaping: """Tests for _escape_algo_line and algorithm rendering in _render_code_block.""" def test_escape_underscore(self) -> None: assert r"psi\_1" in _escape_algo_line("psi_1") def test_escape_hash_comment(self) -> None: result = _escape_algo_line("x = y # update rule") assert r"\COMMENT{update rule}" in result assert "x = y" in result def test_fullline_hash_comment(self) -> None: result = _escape_algo_line("# Initialize buffer") assert result == r"\COMMENT{Initialize buffer}" def test_escape_percent(self) -> None: assert r"\%" in _escape_algo_line("accuracy 95%") def test_escape_ampersand(self) -> None: assert r"\&" in _escape_algo_line("x & y") def test_preserve_latex_commands(self) -> None: result = _escape_algo_line(r"Set $x = \alpha$ and update") assert r"$x = \alpha$" in result def test_render_code_block_algo_escapes(self) -> None: code = ( "Initialize theta_1, theta_2\n" "for t = 1 to T do\n" " Sample batch B # prioritized\n" ) result = _render_code_block("algorithm", code) assert r"\begin{algorithm}" in result assert r"\begin{algorithmic}" in result assert r"theta\_1" in result assert r"\COMMENT{prioritized}" in result def test_render_code_block_verbatim_no_escape(self) -> None: """Non-algorithm code blocks should use verbatim (no escaping).""" code = "x_1 = y_2 # comment" result = _render_code_block("python", code) assert r"\begin{verbatim}" in result assert "x_1" in result # NOT escaped in verbatim ================================================ FILE: tests/test_rc_validator.py ================================================ # pyright: reportPrivateUsage=false, reportUnknownParameterType=false, reportMissingParameterType=false, reportUnknownMemberType=false, reportUnknownArgumentType=false, reportUnknownVariableType=false, reportUnusedCallResult=false, reportAttributeAccessIssue=false, reportUnknownLambdaType=false 
from __future__ import annotations

import pytest

from researchclaw.experiment.validator import (
    BANNED_MODULES,
    DANGEROUS_BUILTINS,
    DANGEROUS_CALLS,
    CodeValidation,
    ValidationIssue,
    check_filename_collisions,
    extract_imports,
    format_issues_for_llm,
    validate_code,
    validate_imports,
    validate_security,
    validate_syntax,
)


def _call_source(name: str) -> str:
    # Build a snippet that invokes `name()`, prefixing an import of the
    # top-level module only for the modules the dangerous-call list targets.
    top = name.split(".")[0]
    lines: list[str] = []
    if top in {"os", "subprocess", "shutil"}:
        lines.append(f"import {top}")
    lines.append(f"{name}()")
    return "\n".join(lines)


# ── validate_syntax ──────────────────────────────────────────────────

def test_validate_syntax_accepts_valid_code():
    result = validate_syntax("x = 1\nif x > 0:\n x += 1")
    assert result.ok is True
    assert result.issues == []


def test_validate_syntax_reports_syntax_error_with_location():
    result = validate_syntax("def bad(:\n pass")
    assert result.ok is False
    assert len(result.issues) == 1
    issue = result.issues[0]
    assert issue.severity == "error"
    assert issue.category == "syntax"
    assert issue.line == 1
    assert issue.col is not None
    assert issue.message


@pytest.mark.parametrize("code", ["", " \n\t ", "# comment only\n# still comment"])
def test_validate_syntax_accepts_empty_whitespace_and_comment_only(code: str):
    result = validate_syntax(code)
    assert result.ok is True
    assert result.issues == []


# ── validate_security ────────────────────────────────────────────────

def test_validate_security_accepts_safe_code():
    code = 'import os\nvalue = os.path.join("a", "b")\nprint(value)'
    result = validate_security(code)
    assert result.ok is True
    assert result.issues == []


def test_validate_security_skips_when_code_has_syntax_error():
    # Syntax problems belong to validate_syntax; security scan passes through.
    result = validate_security("def broken(:\n pass")
    assert result.ok is True
    assert result.issues == []


@pytest.mark.parametrize("builtin_name", sorted(DANGEROUS_BUILTINS))
def test_validate_security_flags_every_dangerous_builtin_call(builtin_name: str):
    # __import__ and compile need call signatures that parse; the rest take a string arg.
    if builtin_name == "__import__":
        code = '__import__("os")'
    elif builtin_name == "compile":
        code = 'compile("x = 1", "", "exec")'
    else:
        code = f'{builtin_name}("print(1)")'
    result = validate_security(code)
    assert len(result.issues) == 1
    issue = result.issues[0]
    assert issue.severity == "error"
    assert issue.category == "security"
    assert issue.message == f"Dangerous built-in call: {builtin_name}()"


@pytest.mark.parametrize("call_name", sorted(DANGEROUS_CALLS))
def test_validate_security_flags_every_dangerous_call(call_name: str):
    result = validate_security(_call_source(call_name))
    messages = [issue.message for issue in result.issues]
    assert f"Dangerous call: {call_name}()" in messages
    assert all(issue.severity == "error" for issue in result.issues)
    assert all(issue.category == "security" for issue in result.issues)


@pytest.mark.parametrize("module_name", sorted(BANNED_MODULES))
def test_validate_security_flags_every_banned_import(module_name: str):
    result = validate_security(f"import {module_name}")
    assert len(result.issues) == 1
    issue = result.issues[0]
    assert issue.severity == "error"
    assert issue.category == "security"
    assert issue.message == f"Banned module import: {module_name}"


@pytest.mark.parametrize("module_name", sorted(BANNED_MODULES))
def test_validate_security_flags_every_banned_from_import(module_name: str):
    result = validate_security(f"from {module_name} import x")
    assert len(result.issues) == 1
    issue = result.issues[0]
    assert issue.severity == "error"
    assert issue.category == "security"
    assert issue.message == f"Banned module import: from {module_name}"


# ── validate_imports ─────────────────────────────────────────────────

def test_validate_imports_recognizes_stdlib_modules_by_default():
    result = validate_imports("import json\nfrom math import sqrt")
    assert result.ok is True
    assert result.warnings == []


def test_validate_imports_warns_for_unavailable_package():
    result = validate_imports("import totally_missing_pkg")
    assert result.ok is True  # import availability is warning-level, never an error
    assert len(result.warnings) == 1
    warning = result.warnings[0]
    assert warning.severity == "warning"
    assert warning.category == "import"
    assert (
        warning.message
        == "Module 'totally_missing_pkg' may not be available in sandbox"
    )


def test_validate_imports_respects_custom_available_set():
    result = validate_imports(
        "import alpha\nimport beta\nimport gamma",
        available={"alpha", "gamma"},
    )
    assert [w.message for w in result.warnings] == [
        "Module 'beta' may not be available in sandbox",
    ]


def test_validate_imports_returns_no_warnings_for_syntax_error_input():
    result = validate_imports("def bad(:\n pass", available=set())
    assert result.ok is True
    assert result.warnings == []


@pytest.mark.parametrize("code", ["", " \n\t ", "# comment only"])
def test_validate_imports_handles_empty_like_inputs(code: str):
    result = validate_imports(code, available=set())
    assert result.ok is True
    assert result.warnings == []


# ── validate_code (combined pipeline) ────────────────────────────────

def test_validate_code_combines_security_and_import_issues_in_order():
    code = 'import os\nos.system("echo hi")\nimport unknown_mod'
    result = validate_code(code, available_packages={"os"})
    assert result.ok is False
    assert [i.category for i in result.issues] == ["security", "import"]
    assert result.issues[0].message == "Dangerous call: os.system()"
    assert (
        result.issues[1].message
        == "Module 'unknown_mod' may not be available in sandbox"
    )


def test_validate_code_short_circuits_after_syntax_error():
    result = validate_code("def bad(:\n pass")
    assert len(result.issues) == 1
    assert result.issues[0].category == "syntax"


def test_validate_code_skip_security_excludes_security_issues():
    code = 'import os\nos.system("echo hi")\nimport unknown_mod'
    result = validate_code(code, available_packages={"os"}, skip_security=True)
    assert [i.category for i in result.issues] == ["import"]


def test_validate_code_skip_imports_excludes_import_warnings():
    code = 'import os\nos.system("echo hi")\nimport unknown_mod'
    result = validate_code(code, available_packages={"os"}, skip_imports=True)
    assert all(issue.category == "security" for issue in result.issues)
    assert len(result.issues) == 1


def test_validate_code_skip_both_returns_clean_for_safe_code():
    result = validate_code("x = 1", skip_security=True, skip_imports=True)
    assert result.ok is True
    assert result.issues == []


def test_validate_code_uses_available_packages_for_import_validation():
    code = "import alpha\nimport beta"
    result = validate_code(code, available_packages={"alpha"})
    assert [i.message for i in result.issues] == [
        "Module 'beta' may not be available in sandbox",
    ]


# ── extract_imports ──────────────────────────────────────────────────

def test_extract_imports_supports_import_and_from_import_styles():
    code = (
        "import os\nimport numpy as np\nfrom pandas import DataFrame\nfrom x.y import z"
    )
    assert extract_imports(code) == {"os", "numpy", "pandas", "x"}


def test_extract_imports_supports_multiple_aliases_and_dedupes():
    code = "import os.path, os, json as js\nfrom json import loads"
    assert extract_imports(code) == {"os", "json"}


def test_extract_imports_ignores_relative_import_without_module_name():
    assert extract_imports("from . import local_mod") == set()


def test_extract_imports_includes_relative_import_with_module_name():
    assert extract_imports("from ..pkg.sub import thing") == {"pkg"}


def test_extract_imports_returns_empty_set_for_syntax_error():
    assert extract_imports("def bad(:\n pass") == set()


@pytest.mark.parametrize("code", ["", " \n\t", "# comment only"])
def test_extract_imports_handles_empty_like_inputs(code: str):
    assert extract_imports(code) == set()


# ── format_issues_for_llm ────────────────────────────────────────────

def test_format_issues_for_llm_returns_no_issues_message_when_clean():
    assert format_issues_for_llm(CodeValidation()) == "No issues found."
def test_format_issues_for_llm_formats_issues_with_and_without_line():
    """Issues with a line number render '@ line N'; line-less ones render '@ unknown location'."""
    located = ValidationIssue(
        severity="error", category="syntax", message="invalid syntax", line=3
    )
    unlocated = ValidationIssue(
        severity="warning",
        category="import",
        message="Module 'x' may be missing",
        line=None,
    )
    rendered = format_issues_for_llm(CodeValidation(issues=[located, unlocated]))
    assert "- [ERROR] (syntax) invalid syntax @ line 3" in rendered
    assert (
        "- [WARNING] (import) Module 'x' may be missing @ unknown location"
        in rendered
    )


def test_format_issues_for_llm_preserves_issue_order():
    """Output lines appear in the exact order the issues were supplied."""
    first_issue = ValidationIssue(severity="warning", category="import", message="first")
    second_issue = ValidationIssue(
        severity="error", category="security", message="second", line=9
    )
    rendered_lines = format_issues_for_llm(
        CodeValidation(issues=[first_issue, second_issue])
    ).splitlines()
    assert rendered_lines[0] == "- [WARNING] (import) first @ unknown location"
    assert rendered_lines[1] == "- [ERROR] (security) second @ line 9"


def test_code_validation_ok_true_when_no_errors_present():
    """Warnings alone do not make a validation fail."""
    only_warning = ValidationIssue(severity="warning", category="import", message="warn")
    assert CodeValidation(issues=[only_warning]).ok is True


def test_code_validation_ok_false_when_error_present():
    """Any error-severity issue flips ok to False."""
    one_error = ValidationIssue(severity="error", category="syntax", message="bad")
    assert CodeValidation(issues=[one_error]).ok is False


def test_code_validation_errors_and_warnings_filter_correctly():
    """The errors/warnings properties partition the issue list by severity."""
    err = ValidationIssue(severity="error", category="security", message="danger")
    warn = ValidationIssue(
        severity="warning", category="import", message="maybe missing"
    )
    mixed = CodeValidation(issues=[err, warn])
    assert mixed.errors == [err]
    assert mixed.warnings == [warn]


def test_code_validation_summary_for_no_issues():
    """An empty validation summarises as a plain pass message."""
    assert CodeValidation().summary() == "Code validation passed."
def test_code_validation_summary_for_errors_only():
    validation = CodeValidation(
        issues=[ValidationIssue(severity="error", category="syntax", message="bad")]
    )
    assert validation.summary() == "Code validation: 1 error(s)"


def test_code_validation_summary_for_warnings_only():
    validation = CodeValidation(
        issues=[ValidationIssue(severity="warning", category="import", message="warn")]
    )
    assert validation.summary() == "Code validation: 1 warning(s)"


def test_code_validation_summary_for_errors_and_warnings():
    validation = CodeValidation(
        issues=[
            ValidationIssue(severity="error", category="syntax", message="bad"),
            ValidationIssue(severity="warning", category="import", message="warn"),
        ]
    )
    assert validation.summary() == "Code validation: 1 error(s), 1 warning(s)"


# ---------------------------------------------------------------------------
# check_filename_collisions (BUG-202)
# ---------------------------------------------------------------------------

def test_filename_collision_detects_config_py():
    """BUG-202: config.py shadows pip 'config' package."""
    warnings = check_filename_collisions({"config.py": "x = 1", "main.py": "print(1)"})
    assert len(warnings) == 1
    assert "shadows stdlib/pip" in warnings[0]
    assert "config" in warnings[0]


def test_filename_collision_detects_stdlib_shadows():
    """Filenames shadowing stdlib modules should be flagged."""
    warnings = check_filename_collisions({"json.py": "x = 1"})
    assert len(warnings) == 1
    assert "json" in warnings[0]


def test_filename_collision_allows_safe_names():
    """Normal experiment filenames should not trigger warnings."""
    files = {
        "main.py": "print(1)",
        "models.py": "class M: pass",
        "training.py": "def train(): pass",
        "data_loader.py": "def load(): pass",
        "experiment_config.py": "LR = 0.01",
        "requirements.txt": "torch",
    }
    warnings = check_filename_collisions(files)
    assert warnings == []


def test_filename_collision_multiple_shadows():
    """Multiple shadowing files should each produce a warning."""
    files = {"config.py": "", "logging.py": "", "main.py": ""}
    warnings = check_filename_collisions(files)
    assert len(warnings) == 2

================================================
FILE: tests/test_results_table_builder.py
================================================
"""Tests for results_table_builder — pre-built LaTeX tables."""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from researchclaw.pipeline.verified_registry import VerifiedRegistry
from researchclaw.templates.results_table_builder import (
    LatexTable,
    build_condition_whitelist,
    build_results_tables,
)

ARTIFACTS = Path(__file__).resolve().parent.parent / "artifacts"


def _make_registry(
    conditions: dict[str, dict[int, float]],
    primary_metric: float | None = None,
) -> VerifiedRegistry:
    """Create a registry from simple condition → {seed: value} mapping."""
    summary = {"best_run": {"metrics": {}}, "condition_summaries": {}, "metrics_summary": {}}
    for cond_name, seeds in conditions.items():
        for seed_idx, value in seeds.items():
            key = f"{cond_name}/{seed_idx}/metric"
            summary["best_run"]["metrics"][key] = value
        # Per-condition mean over seeds (0 when a condition has no seeds).
        cond_metric = sum(seeds.values()) / len(seeds) if seeds else 0
        summary["condition_summaries"][cond_name] = {"metrics": {"metric": cond_metric}}
    if primary_metric is not None:
        summary["best_run"]["metrics"]["primary_metric"] = primary_metric
    return VerifiedRegistry.from_experiment(summary)


class TestBuildResultsTables:
    def test_basic_table(self):
        reg = _make_registry(
            {
                "Baseline": {0: 80.0, 1: 82.0, 2: 81.0},
                "Proposed": {0: 85.0, 1: 87.0, 2: 86.0},
            },
            primary_metric=86.0,
        )
        tables = build_results_tables(reg, metric_name="Accuracy (\\%)")
        assert len(tables) == 2  # main + per-seed
        main = tables[0]
        assert main.label == "tab:main_results"
        assert "AUTO-GENERATED" in main.latex_code
        assert "\\begin{table}" in main.latex_code
        assert "Baseline" in main.latex_code
        assert "Proposed" in main.latex_code
        assert main.n_conditions == 2

    def test_best_is_bolded(self):
        reg = _make_registry(
            {
                "Baseline": {0: 70.0, 1: 72.0},
                "Proposed": {0: 85.0, 1: 87.0},
            }
        )
        tables = build_results_tables(reg, metric_direction="maximize")
        main = tables[0]
        # Proposed should be bold (higher metric)
        assert "\\textbf" in main.latex_code

    def test_single_seed_marker(self):
        reg = _make_registry(
            {
                "Baseline": {0: 80.0, 1: 82.0},
                "Proposed": {0: 90.0},  # Single seed
            }
        )
        tables = build_results_tables(reg)
        main = tables[0]
        assert "\\ddagger" in main.latex_code  # Single-seed footnote

    def test_no_conditions(self):
        reg = VerifiedRegistry()
        tables = build_results_tables(reg)
        assert len(tables) == 0

    def test_all_single_seed_no_per_seed_table(self):
        reg = _make_registry(
            {
                "A": {0: 80.0},
                "B": {0: 70.0},
            }
        )
        tables = build_results_tables(reg)
        # Only 1 table (main), no per-seed table (all single seed)
        assert len(tables) == 1

    def test_per_seed_table_structure(self):
        reg = _make_registry(
            {
                "DQN": {0: 156.1, 1: 105.5, 2: 356.7},
                "DQN+Abstraction": {0: 98.1, 1: 456.7, 2: 282.0},
            }
        )
        tables = build_results_tables(reg)
        assert len(tables) == 2
        seed_table = tables[1]
        assert seed_table.label == "tab:per_seed"
        assert "156.10" in seed_table.latex_code or "156.1" in seed_table.latex_code
        assert "Seed 0" in seed_table.latex_code

    def test_two_column_uses_table_star(self):
        reg = _make_registry({"A": {0: 80.0, 1: 82.0}})
        tables = build_results_tables(reg, two_column=True)
        assert "\\begin{table*}" in tables[0].latex_code

    def test_verified_values_populated(self):
        reg = _make_registry(
            {"A": {0: 80.0, 1: 82.0}, "B": {0: 70.0, 1: 72.0}}
        )
        tables = build_results_tables(reg)
        main = tables[0]
        # 81.0 is the mean of condition A; allow float tolerance.
        assert 81.0 in main.verified_values or any(
            abs(v - 81.0) < 0.01 for v in main.verified_values
        )

    def test_special_chars_escaped(self):
        reg = _make_registry({"DQN+Raw_Count": {0: 80.0, 1: 82.0}})
        tables = build_results_tables(reg)
        assert "DQN+Raw\\_Count" in tables[0].latex_code

    def test_minimize_direction(self):
        reg = _make_registry(
            {
                "Baseline": {0: 20.0, 1: 22.0},
                "Proposed": {0: 10.0, 1: 12.0},
            }
        )
        tables = build_results_tables(reg, metric_direction="minimize")
        # Proposed (lower) should be bold
        lines = tables[0].latex_code.split("\n")
        proposed_line = [l for l in lines if "Proposed" in l][0]
        assert "\\textbf" in proposed_line


class TestConditionWhitelist:
    def test_basic(self):
        reg = _make_registry(
            {
                "DQN": {0: 206.1, 1: 105.5, 2: 356.7},
                "DQN+Abstraction": {0: 278.93},
            }
        )
        wl = build_condition_whitelist(reg)
        assert "DQN" in wl
        assert "DQN+Abstraction" in wl
        assert "3 seed(s)" in wl
        assert "1 seed(s)" in wl

    def test_empty_registry(self):
        reg = VerifiedRegistry()
        wl = build_condition_whitelist(reg)
        assert "no conditions completed" in wl


class TestRealArtifacts:
    def _load(self, run_id: str) -> VerifiedRegistry:
        # Resolve a recorded pipeline run from the artifacts directory,
        # skipping the test when the artifact or its summary is absent.
        pattern = f"rc-*-{run_id}"
        matches = sorted(ARTIFACTS.glob(pattern))
        if not matches:
            pytest.skip(f"Artifact {run_id} not found")
        summary_path = matches[0] / "stage-14" / "experiment_summary.json"
        ref_path = matches[0] / "stage-13" / "refinement_log.json"
        if not summary_path.exists():
            pytest.skip(f"No experiment_summary for {run_id}")
        summary = json.loads(summary_path.read_text())
        ref_log = None
        if ref_path.exists():
            ref_log = json.loads(ref_path.read_text())
        return VerifiedRegistry.from_experiment(summary, ref_log)

    def test_run_e57360_rl_tables(self):
        reg = self._load("e57360")
        tables = build_results_tables(reg, metric_name="Return")
        assert len(tables) >= 1
        main = tables[0]
        # Should NOT contain PPO (never ran)
        assert "PPO" not in main.latex_code
        # Should contain DQN
        assert "DQN" in main.latex_code

    def test_run_acbdfa_tables(self):
        reg = self._load("acbdfa")
        tables = build_results_tables(reg, metric_name="Top-1 Accuracy (\\%)")
        assert len(tables) >= 1

================================================
FILE: tests/test_robotics_adapter.py
================================================
"""Tests for robotics & control domain adapter.

Covers adapter dispatch, prompt block generation, and integration with
the existing domain detection and profile system.
"""

from __future__ import annotations

import pytest

from researchclaw.domains.detector import (
    get_profile,
    _keyword_detect,
    _profile_cache,
)
from researchclaw.domains.prompt_adapter import (
    MLPromptAdapter,
    GenericPromptAdapter,
    get_adapter,
)


# ---------------------------------------------------------------------------
# Profile sanity
# ---------------------------------------------------------------------------

class TestRoboticsProfile:
    def setup_method(self):
        # Profiles are cached; clear between tests for isolation.
        _profile_cache.clear()

    def test_profile_exists(self):
        profile = get_profile("robotics_control")
        assert profile is not None
        assert profile.domain_id == "robotics_control"

    def test_profile_fields(self):
        profile = get_profile("robotics_control")
        assert profile is not None
        assert profile.experiment_paradigm == "comparison"
        assert "gymnasium" in profile.core_libraries
        assert "stable-baselines3" in profile.core_libraries
        assert profile.gpu_required is True

    def test_profile_baselines(self):
        profile = get_profile("robotics_control")
        assert profile is not None
        baselines = profile.standard_baselines
        assert any("PPO" in b for b in baselines)
        assert any("SAC" in b for b in baselines)


# ---------------------------------------------------------------------------
# Keyword detection
# ---------------------------------------------------------------------------

class TestRoboticsKeywordDetection:
    def test_robot_keyword(self):
        assert _keyword_detect("robot manipulation task") == "robotics_control"

    def test_mujoco(self):
        assert _keyword_detect("locomotion in MuJoCo") == "robotics_control"

    def test_pybullet(self):
        assert _keyword_detect("grasping policy with PyBullet") == "robotics_control"


# ---------------------------------------------------------------------------
# Adapter dispatch
# ---------------------------------------------------------------------------

class TestRoboticsAdapter:
    def test_gets_robotics_adapter(self):
        profile = get_profile("robotics_control")
        if profile is None:
            pytest.skip("robotics_control profile not found")
        adapter = get_adapter(profile)
        assert not isinstance(adapter, MLPromptAdapter)
        # Before this contribution it would fall back to GenericPromptAdapter
        from researchclaw.domains.adapters.robotics import (
            RoboticsPromptAdapter,
        )
        assert isinstance(adapter, RoboticsPromptAdapter)

    def test_code_generation_blocks_nonempty(self):
        profile = get_profile("robotics_control")
        if profile is None:
            pytest.skip("robotics_control profile not found")
        adapter = get_adapter(profile)
        blocks = adapter.get_code_generation_blocks({})
        assert blocks.code_generation_hints
        assert blocks.dataset_guidance
        assert blocks.output_format_guidance

    def test_experiment_design_mentions_baselines(self):
        profile = get_profile("robotics_control")
        if profile is None:
            pytest.skip("robotics_control profile not found")
        adapter = get_adapter(profile)
        blocks = adapter.get_experiment_design_blocks({})
        assert "PPO" in blocks.experiment_design_context
        assert "SAC" in blocks.experiment_design_context

    def test_result_analysis_mentions_return(self):
        profile = get_profile("robotics_control")
        if profile is None:
            pytest.skip("robotics_control profile not found")
        adapter = get_adapter(profile)
        blocks = adapter.get_result_analysis_blocks({})
        assert "return" in blocks.result_analysis_hints.lower()

    def test_blueprint_context(self):
        profile = get_profile("robotics_control")
        if profile is None:
            pytest.skip("robotics_control profile not found")
        adapter = get_adapter(profile)
        ctx = adapter.get_blueprint_context()
        if profile.typical_file_structure:
            assert "agent.py" in ctx or "train.py" in ctx

================================================
FILE: tests/test_servers.py
================================================
"""Tests for multi-server resource scheduling (C2): Registry, Monitor, Dispatcher, Executors."""

from __future__ import annotations

import asyncio
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from researchclaw.servers.registry import ServerEntry, ServerRegistry
from
researchclaw.servers.monitor import ServerMonitor, _parse_status_output
from researchclaw.servers.dispatcher import TaskDispatcher
from researchclaw.servers.ssh_executor import SSHExecutor
from researchclaw.servers.slurm_executor import SlurmExecutor
from researchclaw.servers.cloud_executor import CloudExecutor


# ── fixtures ──────────────────────────────────────────────────────

def _make_server(
    name: str = "s1",
    host: str = "gpu1.local",
    server_type: str = "ssh",
    vram_gb: int = 24,
    priority: int = 1,
    cost: float = 0.0,
    scheduler: str = "",
    cloud_provider: str = "",
) -> ServerEntry:
    # Convenience factory: a 24GB "RTX 4090" SSH host unless overridden.
    return ServerEntry(
        name=name,
        host=host,
        server_type=server_type,
        gpu="RTX 4090",
        vram_gb=vram_gb,
        priority=priority,
        cost_per_hour=cost,
        scheduler=scheduler,
        cloud_provider=cloud_provider,
    )


@pytest.fixture
def registry() -> ServerRegistry:
    # One free local box, one paid cloud host, one SLURM cluster.
    return ServerRegistry([
        _make_server("local", "localhost", vram_gb=48, priority=1),
        _make_server("cloud1", "cloud.host", server_type="cloud", vram_gb=80, priority=3, cost=2.0, cloud_provider="aws"),
        _make_server("hpc", "hpc.host", server_type="slurm", vram_gb=40, priority=2, scheduler="slurm"),
    ])


# ══════════════════════════════════════════════════════════════════
# ServerEntry tests
# ══════════════════════════════════════════════════════════════════

class TestServerEntry:
    def test_to_dict_roundtrip(self) -> None:
        s = _make_server()
        d = s.to_dict()
        s2 = ServerEntry.from_dict(d)
        assert s2.name == s.name
        assert s2.vram_gb == s.vram_gb

    def test_defaults(self) -> None:
        s = ServerEntry.from_dict({"name": "x"})
        assert s.server_type == "ssh"
        assert s.priority == 1


# ══════════════════════════════════════════════════════════════════
# ServerRegistry tests
# ══════════════════════════════════════════════════════════════════

class TestServerRegistry:
    def test_list_all_sorted_by_priority(self, registry: ServerRegistry) -> None:
        servers = registry.list_all()
        priorities = [s.priority for s in servers]
        assert priorities == sorted(priorities)

    def test_count(self, registry: ServerRegistry) -> None:
        assert registry.count == 3

    def test_add_server(self) -> None:
        reg = ServerRegistry()
        reg.add(_make_server("new"))
        assert reg.count == 1
        assert reg.get("new").name == "new"

    def test_remove_server(self, registry: ServerRegistry) -> None:
        registry.remove("local")
        assert registry.count == 2

    def test_remove_unknown_raises(self, registry: ServerRegistry) -> None:
        with pytest.raises(KeyError):
            registry.remove("ghost")

    def test_get_unknown_raises(self, registry: ServerRegistry) -> None:
        with pytest.raises(KeyError):
            registry.get("ghost")

    def test_get_available_excludes(self, registry: ServerRegistry) -> None:
        avail = registry.get_available(exclude={"local"})
        names = [s.name for s in avail]
        assert "local" not in names
        assert len(names) == 2

    def test_get_best_match_by_vram(self, registry: ServerRegistry) -> None:
        best = registry.get_best_match({"min_vram_gb": 40})
        assert best is not None
        assert best.vram_gb >= 40

    def test_get_best_match_by_type(self, registry: ServerRegistry) -> None:
        best = registry.get_best_match({"server_type": "slurm"})
        assert best is not None
        assert best.server_type == "slurm"

    def test_get_best_match_prefers_free(self, registry: ServerRegistry) -> None:
        best = registry.get_best_match(prefer_free=True)
        assert best is not None
        assert best.cost_per_hour == 0.0

    def test_get_best_match_none_when_impossible(self, registry: ServerRegistry) -> None:
        best = registry.get_best_match({"min_vram_gb": 999})
        assert best is None

    def test_get_best_match_by_gpu(self, registry: ServerRegistry) -> None:
        best = registry.get_best_match({"gpu": "RTX"})
        assert best is not None

    def test_get_best_match_no_requirements(self, registry: ServerRegistry) -> None:
        best = registry.get_best_match()
        assert best is not None
        assert best.name == "local"


# ══════════════════════════════════════════════════════════════════
# ServerMonitor tests
# ══════════════════════════════════════════════════════════════════

class TestServerMonitor:
    def test_parse_status_output(self) -> None:
        # Raw status blob: GPU CSV, memory table, and uptime separated by '---'.
        raw = "75, 8000, 24576\n---\n total used free\nMem: 64000 32000 32000\n---\n 10:00:00 up 5 days"
        server = _make_server()
        status = _parse_status_output(raw, server)
        assert status["gpu"]["count"] == 1
        assert status["gpu"]["devices"][0]["utilization_pct"] == 75
        assert status["memory"]["total_mb"] == 64000
        assert "uptime" in status

    def test_parse_status_no_gpu(self) -> None:
        raw = "\n---\n total used free\nMem: 64000 32000 32000\n---\nup 1 day"
        server = _make_server()
        status = _parse_status_output(raw, server)
        assert status["gpu"]["count"] == 0

    def test_get_cached_none(self, registry: ServerRegistry) -> None:
        monitor = ServerMonitor(registry)
        assert monitor.get_cached("local") is None

    def test_get_gpu_usage_empty(self, registry: ServerRegistry) -> None:
        monitor = ServerMonitor(registry)
        assert monitor.get_gpu_usage(_make_server()) == {}

    def test_check_status_unreachable(self, registry: ServerRegistry) -> None:
        monitor = ServerMonitor(registry)
        with patch("researchclaw.servers.monitor._ssh_command", side_effect=RuntimeError("unreachable")):
            status = asyncio.run(monitor.check_status(_make_server()))
        assert status["reachable"] is False

    def test_check_all(self, registry: ServerRegistry) -> None:
        monitor = ServerMonitor(registry)
        with patch("researchclaw.servers.monitor._ssh_command", side_effect=RuntimeError("unreachable")):
            results = asyncio.run(monitor.check_all())
        assert len(results) == 3
        for name, status in results.items():
            assert status["reachable"] is False


# ══════════════════════════════════════════════════════════════════
# SSHExecutor tests
# ══════════════════════════════════════════════════════════════════

class TestSSHExecutor:
    def test_init(self) -> None:
        server = _make_server()
        exe = SSHExecutor(server)
        assert exe.host == "gpu1.local"

    def test_run_experiment_timeout(self) -> None:
        # A subprocess that never returns (TimeoutError) must surface as a failure.
        server = _make_server()
        exe = SSHExecutor(server)

        async def _run() -> dict:
            with patch("asyncio.create_subprocess_exec") as mock_exec:
                proc = AsyncMock()
                proc.communicate = AsyncMock(side_effect=asyncio.TimeoutError)
                proc.kill = AsyncMock()
                proc.wait = AsyncMock()
                mock_exec.return_value = proc
                return await exe.run_experiment("/tmp/test", "echo hello", timeout=1)

        result = asyncio.run(_run())
        assert result["success"] is False
        assert "Timeout" in result["error"]


# ══════════════════════════════════════════════════════════════════
# SlurmExecutor tests
# ══════════════════════════════════════════════════════════════════

class TestSlurmExecutor:
    def test_init_wrong_type_raises(self) -> None:
        server = _make_server(server_type="ssh")
        with pytest.raises(ValueError, match="not a slurm"):
            SlurmExecutor(server)

    def test_generate_sbatch_script(self) -> None:
        server = _make_server(server_type="slurm", scheduler="slurm")
        exe = SlurmExecutor(server)
        script = exe._generate_sbatch_script("python main.py", resources={"gpus": 2, "mem_gb": 32})
        assert "#SBATCH --gres=gpu:2" in script
        assert "#SBATCH --mem=32G" in script
        assert "python main.py" in script

    def test_sbatch_script_default_resources(self) -> None:
        server = _make_server(server_type="slurm", scheduler="slurm")
        exe = SlurmExecutor(server)
        script = exe._generate_sbatch_script("echo hi")
        assert "#SBATCH --gres=gpu:1" in script
        assert "#SBATCH --time=01:00:00" in script

    def test_submit_job_parses_output(self) -> None:
        # sbatch prints "Submitted batch job <id>"; the executor extracts the id.
        server = _make_server(server_type="slurm", scheduler="slurm")
        exe = SlurmExecutor(server)

        async def _run() -> str:
            with patch("asyncio.create_subprocess_exec") as mock_exec:
                proc = AsyncMock()
                proc.communicate = AsyncMock(return_value=(b"Submitted batch job 12345\n", b""))
                proc.returncode = 0
                mock_exec.return_value = proc
                return await exe.submit_job("echo hi", "/tmp/test")

        job_id = asyncio.run(_run())
        assert job_id == "12345"


# ══════════════════════════════════════════════════════════════════
# CloudExecutor tests
# ══════════════════════════════════════════════════════════════════

class TestCloudExecutor:
    def test_init_wrong_type_raises(self) -> None:
        server = _make_server(server_type="ssh")
        with pytest.raises(ValueError, match="not a cloud"):
            CloudExecutor(server)

    def test_launch_instance_stub(self) -> None:
        server = _make_server(server_type="cloud", cloud_provider="aws")
        exe = CloudExecutor(server)
        result = asyncio.run(exe.launch_instance())
        assert result["status"] == "stub_launched"
        assert result["provider"] == "aws"


# ══════════════════════════════════════════════════════════════════
# TaskDispatcher tests
# ══════════════════════════════════════════════════════════════════

class TestTaskDispatcher:
    def test_dispatch_returns_task_id(self, registry: ServerRegistry) -> None:
        monitor = ServerMonitor(registry)
        disp = TaskDispatcher(registry, monitor)
        task_id = asyncio.run(disp.dispatch({"command": "echo hi", "local_dir": "/tmp"}))
        assert len(task_id) == 12

    def test_dispatch_no_server_queues(self) -> None:
        # With an empty registry the task cannot run and is queued instead.
        reg = ServerRegistry()
        monitor = ServerMonitor(reg)
        disp = TaskDispatcher(reg, monitor)
        task_id = asyncio.run(disp.dispatch({"command": "echo hi"}))
        status = disp.get_task_status(task_id)
        assert status["status"] == "queued"

    def test_get_task_status_unknown(self, registry: ServerRegistry) -> None:
        monitor = ServerMonitor(registry)
        disp = TaskDispatcher(registry, monitor)
        status = disp.get_task_status("nonexistent")
        assert status["status"] == "unknown"

================================================
FILE: tests/test_skills_library.py
================================================
"""Tests for the dynamic skills library.
Covers:
- Skill schema (agentskills.io data model)
- YAML skill loading (legacy)
- SKILL.md loading (agentskills.io)
- Skill registry (register, query, external dirs)
- Keyword matching + description fallback
- Stage filtering (int + string)
- Prompt formatting
"""

from __future__ import annotations

import json
from pathlib import Path

import pytest

from researchclaw.skills.schema import Skill, STAGE_NAME_TO_NUMBER
from researchclaw.skills.loader import (
    load_skill_file,
    load_skill_from_skillmd,
    load_skillmd_from_directory,
    load_skills_from_directory,
)
from researchclaw.skills.registry import SkillRegistry
from researchclaw.skills.matcher import (
    match_skills,
    format_skills_for_prompt,
    _tokenize,
    _resolve_stage,
)


# ── Fixtures ─────────────────────────────────────────────────────────

@pytest.fixture
def sample_skill() -> Skill:
    # agentskills.io-style skill: structured fields live in the metadata map.
    return Skill(
        name="test-skill-1",
        description="A test skill for unit testing",
        body="## Test Skill\nDo the thing.",
        metadata={
            "category": "tooling",
            "trigger-keywords": "training,pytorch,gpu",
            "applicable-stages": "10,12",
            "priority": "5",
            "version": "1.0",
            "code-template": "print('hello')",
            "references": "Test Paper 2024",
        },
    )


@pytest.fixture
def skill_yaml_dir(tmp_path: Path) -> Path:
    # Legacy layout: one flat YAML file per skill.
    d = tmp_path / "skills"
    d.mkdir()
    skill_data = {
        "id": "yaml-skill-1",
        "name": "YAML Test Skill",
        "category": "experiment",
        "description": "Loaded from YAML",
        "trigger_keywords": ["review", "literature"],
        "applicable_stages": [3, 4, 5],
        "prompt_template": "Do literature review",
        "version": "1.0",
        "priority": 3,
    }
    import yaml
    (d / "test_skill.yaml").write_text(yaml.dump(skill_data), encoding="utf-8")
    return d


@pytest.fixture
def skill_json_dir(tmp_path: Path) -> Path:
    d = tmp_path / "json_skills"
    d.mkdir()
    skill_data = {
        "id": "json-skill-1",
        "name": "JSON Test Skill",
        "category": "writing",
        "description": "Loaded from JSON",
        "trigger_keywords": ["paper", "writing"],
        "applicable_stages": [17],
        "prompt_template": "Write well",
        "version": "1.0",
        "priority": 4,
    }
    (d / "test_skill.json").write_text(
        json.dumps(skill_data), encoding="utf-8"
    )
    return d


@pytest.fixture
def skillmd_dir(tmp_path: Path) -> Path:
    """Create a directory with SKILL.md files for testing."""
    d = tmp_path / "skillmd_skills"
    d.mkdir()
    # Skill with full metadata
    s1 = d / "test-skill-md"
    s1.mkdir()
    (s1 / "SKILL.md").write_text(
        "---\n"
        "name: test-skill-md\n"
        "description: A test skill from SKILL.md\n"
        "metadata:\n"
        " category: domain\n"
        " trigger-keywords: \"nlp,transformer,bert\"\n"
        " applicable-stages: \"9,10\"\n"
        " priority: \"2\"\n"
        "---\n\n"
        "## NLP Skill\nDo NLP things.\n",
        encoding="utf-8",
    )
    # Skill with minimal metadata (no trigger-keywords)
    s2 = d / "minimal-skill"
    s2.mkdir()
    (s2 / "SKILL.md").write_text(
        "---\n"
        "name: minimal-skill\n"
        "description: A minimal skill for testing description-based matching\n"
        "---\n\n"
        "## Minimal\nJust a body.\n",
        encoding="utf-8",
    )
    return d


@pytest.fixture
def external_skillmd_dir(tmp_path: Path) -> Path:
    """Simulates an external skill directory (like Collider-Agent)."""
    d = tmp_path / "external"
    d.mkdir()
    s = d / "hep-feynrules"
    s.mkdir()
    (s / "SKILL.md").write_text(
        "---\n"
        "name: hep-feynrules\n"
        "description: Generate FeynRules model files for BSM physics\n"
        "metadata:\n"
        " category: domain\n"
        " applicable-stages: \"10\"\n"
        "---\n\n"
        "## FeynRules Model Generation\n"
        "Build BSM model files for MadGraph.\n",
        encoding="utf-8",
    )
    return d


# ── Skill Schema ─────────────────────────────────────────────────────

class TestSkillSchema:
    def test_create_skill(self, sample_skill: Skill) -> None:
        assert sample_skill.name == "test-skill-1"
        assert sample_skill.id == "test-skill-1"  # backward compat
        assert sample_skill.category == "tooling"
        assert len(sample_skill.trigger_keywords) == 3

    def test_to_dict(self, sample_skill: Skill) -> None:
        d = sample_skill.to_dict()
        assert d["id"] == "test-skill-1"
        assert d["applicable_stages"] == [10, 12]
        assert d["code_template"] == "print('hello')"

    def
test_from_dict(self) -> None: data = { "id": "from-dict", "name": "From Dict", "category": "domain", "description": "Created from dict", "trigger_keywords": ["test"], "applicable_stages": [1], "prompt_template": "test prompt", } skill = Skill.from_dict(data) assert skill.name == "from-dict" assert skill.priority == 5 # default def test_from_dict_defaults(self) -> None: skill = Skill.from_dict({}) assert skill.name == "" assert skill.version == "1.0" assert skill.code_template is None def test_roundtrip(self, sample_skill: Skill) -> None: d = sample_skill.to_dict() restored = Skill.from_dict(d) assert restored.name == sample_skill.name assert restored.applicable_stages == sample_skill.applicable_stages def test_stage_name_to_number(self) -> None: assert STAGE_NAME_TO_NUMBER["code_generation"] == 10 assert STAGE_NAME_TO_NUMBER["paper_draft"] == 17 assert len(STAGE_NAME_TO_NUMBER) == 23 def test_prompt_template_alias(self, sample_skill: Skill) -> None: assert sample_skill.prompt_template == sample_skill.body # ── Skill Loader ───────────────────────────────────────────────────── class TestSkillLoader: def test_load_yaml(self, skill_yaml_dir: Path) -> None: skill = load_skill_file(skill_yaml_dir / "test_skill.yaml") assert skill is not None assert skill.name == "yaml-skill-1" assert skill.category == "experiment" def test_load_json(self, skill_json_dir: Path) -> None: skill = load_skill_file(skill_json_dir / "test_skill.json") assert skill is not None assert skill.name == "json-skill-1" def test_load_nonexistent(self, tmp_path: Path) -> None: skill = load_skill_file(tmp_path / "nope.yaml") assert skill is None def test_load_invalid_yaml(self, tmp_path: Path) -> None: bad = tmp_path / "bad.yaml" bad.write_text("not: [valid: yaml: {", encoding="utf-8") skill = load_skill_file(bad) assert skill is None def test_load_unsupported_format(self, tmp_path: Path) -> None: txt = tmp_path / "skill.txt" txt.write_text("id: test", encoding="utf-8") skill = load_skill_file(txt) 
assert skill is None def test_load_directory(self, skill_yaml_dir: Path) -> None: skills = load_skills_from_directory(skill_yaml_dir) assert len(skills) == 1 def test_load_empty_directory(self, tmp_path: Path) -> None: empty = tmp_path / "empty" empty.mkdir() skills = load_skills_from_directory(empty) assert skills == [] def test_load_missing_directory(self, tmp_path: Path) -> None: skills = load_skills_from_directory(tmp_path / "nonexistent") assert skills == [] class TestSkillMdLoader: def test_load_skillmd(self, skillmd_dir: Path) -> None: skill = load_skill_from_skillmd(skillmd_dir / "test-skill-md" / "SKILL.md") assert skill is not None assert skill.name == "test-skill-md" assert skill.category == "domain" assert "nlp" in skill.trigger_keywords assert skill.applicable_stages == [9, 10] assert skill.priority == 2 assert "NLP Skill" in skill.body assert skill.source_format == "skillmd" def test_load_skillmd_minimal(self, skillmd_dir: Path) -> None: skill = load_skill_from_skillmd(skillmd_dir / "minimal-skill" / "SKILL.md") assert skill is not None assert skill.name == "minimal-skill" assert skill.trigger_keywords == [] assert skill.applicable_stages == [] assert skill.priority == 5 # default def test_load_skillmd_missing(self, tmp_path: Path) -> None: skill = load_skill_from_skillmd(tmp_path / "nope" / "SKILL.md") assert skill is None def test_load_skillmd_no_frontmatter(self, tmp_path: Path) -> None: d = tmp_path / "bad-skill" d.mkdir() (d / "SKILL.md").write_text("No frontmatter here", encoding="utf-8") skill = load_skill_from_skillmd(d / "SKILL.md") assert skill is None def test_load_skillmd_directory(self, skillmd_dir: Path) -> None: skills = load_skillmd_from_directory(skillmd_dir) assert len(skills) == 2 names = {s.name for s in skills} assert "test-skill-md" in names assert "minimal-skill" in names def test_skillmd_wins_over_yaml(self, tmp_path: Path) -> None: """When both SKILL.md and YAML exist for the same name, SKILL.md wins.""" d = tmp_path / "mixed" 
d.mkdir() # YAML file import yaml (d / "test-skill-md.yaml").write_text( yaml.dump({ "id": "test-skill-md", "name": "test-skill-md", "category": "tooling", "description": "From YAML", "trigger_keywords": ["x"], "applicable_stages": [1], "prompt_template": "yaml body", }), encoding="utf-8", ) # SKILL.md file sd = d / "test-skill-md" sd.mkdir() (sd / "SKILL.md").write_text( "---\nname: test-skill-md\ndescription: From SKILL.md\n---\n\nskillmd body\n", encoding="utf-8", ) skills = load_skills_from_directory(d) matched = [s for s in skills if s.name == "test-skill-md"] assert len(matched) == 1 assert matched[0].source_format == "skillmd" assert "From SKILL.md" in matched[0].description # ── Matcher ────────────────────────────────────────────────────────── class TestMatcher: def test_tokenize(self) -> None: tokens = _tokenize("PyTorch Training GPU") assert "pytorch" in tokens assert "training" in tokens assert "gpu" in tokens def test_match_by_keyword(self, sample_skill: Skill) -> None: matched = match_skills( [sample_skill], context="training a pytorch model on gpu", stage=10, ) assert len(matched) == 1 assert matched[0].name == "test-skill-1" def test_match_filters_by_stage(self, sample_skill: Skill) -> None: matched = match_skills( [sample_skill], context="training pytorch gpu", stage=1, # not in applicable_stages ) assert len(matched) == 0 def test_match_empty_context(self, sample_skill: Skill) -> None: matched = match_skills([sample_skill], context="", stage=10) assert len(matched) == 0 def test_match_no_keyword_overlap(self, sample_skill: Skill) -> None: matched = match_skills( [sample_skill], context="linguistics morphology", stage=10, ) assert len(matched) == 0 def test_match_respects_top_k(self) -> None: skills = [ Skill( name=f"skill-{i}", description="test", body="test", metadata={ "category": "tooling", "trigger-keywords": "training", "applicable-stages": "10", "priority": str(i), }, ) for i in range(10) ] matched = match_skills(skills, context="training", 
stage=10, top_k=3) assert len(matched) == 3 def test_match_priority_ordering(self) -> None: high = Skill( name="high", description="t", body="t", metadata={ "trigger-keywords": "training", "applicable-stages": "10", "priority": "1", }, ) low = Skill( name="low", description="t", body="t", metadata={ "trigger-keywords": "training", "applicable-stages": "10", "priority": "9", }, ) matched = match_skills([low, high], context="training", stage=10) assert matched[0].name == "high" def test_match_string_stage(self, sample_skill: Skill) -> None: """String stage names should be resolved via STAGE_NAME_TO_NUMBER.""" matched = match_skills( [sample_skill], context="training pytorch gpu", stage="code_generation", # resolves to 10 ) assert len(matched) == 1 assert matched[0].name == "test-skill-1" def test_match_string_stage_mismatch(self, sample_skill: Skill) -> None: matched = match_skills( [sample_skill], context="training pytorch gpu", stage="paper_draft", # resolves to 17, not in [10, 12] ) assert len(matched) == 0 def test_resolve_stage(self) -> None: assert _resolve_stage(10) == 10 assert _resolve_stage("code_generation") == 10 assert _resolve_stage("unknown_stage") == -1 def test_match_description_fallback(self) -> None: """Skills without trigger_keywords should match via description.""" external_skill = Skill( name="ext-skill", description="Generate FeynRules model files for BSM physics", body="Do feynrules things.", metadata={"applicable-stages": "10"}, ) matched = match_skills( [external_skill], context="feynrules model generation", stage=10, fallback_matching=True, ) assert len(matched) == 1 assert matched[0].name == "ext-skill" def test_match_description_fallback_disabled(self) -> None: external_skill = Skill( name="ext-skill", description="Generate FeynRules model files for BSM physics", body="Do feynrules things.", metadata={"applicable-stages": "10"}, ) matched = match_skills( [external_skill], context="feynrules model generation", stage=10, 
fallback_matching=False, ) assert len(matched) == 0 class TestFormatSkills: def test_format_single_skill(self, sample_skill: Skill) -> None: text = format_skills_for_prompt([sample_skill]) assert "test-skill-1" in text assert "tooling" in text def test_format_empty(self) -> None: assert format_skills_for_prompt([]) == "" def test_format_includes_code_template(self, sample_skill: Skill) -> None: text = format_skills_for_prompt([sample_skill]) assert "print('hello')" in text def test_format_includes_references(self, sample_skill: Skill) -> None: text = format_skills_for_prompt([sample_skill]) assert "Test Paper 2024" in text def test_format_respects_max_chars(self) -> None: skills = [ Skill( name=f"s{i}", description="t", body="x" * 500, metadata={ "category": "tooling", "trigger-keywords": "t", }, ) for i in range(10) ] text = format_skills_for_prompt(skills, max_chars=1000) assert len(text) <= 1500 # some slack for headers # ── Registry ───────────────────────────────────────────────────────── class TestSkillRegistry: def test_registry_loads_builtins(self) -> None: registry = SkillRegistry() assert registry.count() >= 12 # builtin skills (SKILL.md format) def test_builtin_skillmd_count(self) -> None: """All builtin skills should load from SKILL.md.""" registry = SkillRegistry() assert registry.count() == 16 def test_register_custom(self, sample_skill: Skill) -> None: registry = SkillRegistry() initial = registry.count() registry.register(sample_skill) assert registry.count() == initial + 1 def test_get_skill(self, sample_skill: Skill) -> None: registry = SkillRegistry() registry.register(sample_skill) got = registry.get("test-skill-1") assert got is not None assert got.name == "test-skill-1" def test_get_nonexistent(self) -> None: registry = SkillRegistry() assert registry.get("nonexistent") is None def test_unregister(self, sample_skill: Skill) -> None: registry = SkillRegistry() registry.register(sample_skill) assert registry.unregister("test-skill-1") assert 
registry.get("test-skill-1") is None def test_unregister_nonexistent(self) -> None: registry = SkillRegistry() assert not registry.unregister("nope") def test_list_by_category(self) -> None: registry = SkillRegistry() tooling = registry.list_by_category("tooling") assert len(tooling) > 0 assert all(s.category == "tooling" for s in tooling) def test_list_by_stage(self) -> None: registry = SkillRegistry() stage_10 = registry.list_by_stage(10) assert len(stage_10) > 0 def test_match(self) -> None: registry = SkillRegistry() matched = registry.match("pytorch training classification cifar", stage=10) assert len(matched) > 0 def test_match_string_stage(self) -> None: registry = SkillRegistry() matched = registry.match( "pytorch training classification", stage="code_generation", ) assert len(matched) > 0 def test_export_for_prompt(self) -> None: registry = SkillRegistry() matched = registry.match("pytorch training", stage=10, top_k=2) text = registry.export_for_prompt(matched) assert len(text) > 0 def test_custom_dir_loading(self, skill_yaml_dir: Path) -> None: registry = SkillRegistry(custom_dirs=[str(skill_yaml_dir)]) skill = registry.get("yaml-skill-1") assert skill is not None def test_registry_external_dirs(self, external_skillmd_dir: Path) -> None: registry = SkillRegistry(external_dirs=[str(external_skillmd_dir)]) assert registry.count() == 17 # 16 builtin + 1 external skill = registry.get("hep-feynrules") assert skill is not None assert skill.category == "domain" def test_registry_external_match_fallback( self, external_skillmd_dir: Path ) -> None: """External skills without trigger_keywords should match via description.""" registry = SkillRegistry( external_dirs=[str(external_skillmd_dir)], fallback_matching=True, ) matched = registry.match("feynrules model generation", stage=10, top_k=10) names = [s.name for s in matched] assert "hep-feynrules" in names ================================================ FILE: tests/test_ssh_and_colab_sandbox.py 
================================================
# pyright: reportPrivateUsage=false, reportUnknownParameterType=false
"""Tests for ssh_remote and colab_drive experiment backends."""

from __future__ import annotations

import json
import textwrap
import time
from pathlib import Path
from unittest import mock

import pytest

from researchclaw.config import (
    ColabDriveConfig,
    ExperimentConfig,
    SandboxConfig,
    SshRemoteConfig,
    DockerSandboxConfig,
    CodeAgentConfig,
    BenchmarkAgentConfig,
    FigureAgentConfig,
)
from researchclaw.experiment.ssh_sandbox import (
    SshRemoteSandbox,
    _build_ssh_base,
    _ssh_target,
)
from researchclaw.experiment.colab_sandbox import (
    ColabDriveSandbox,
    COLAB_WORKER_TEMPLATE,
)
from researchclaw.experiment.factory import create_sandbox


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _make_experiment_config(**overrides) -> ExperimentConfig:
    """Build an ExperimentConfig with default sub-configs; overrides win."""
    defaults = dict(
        sandbox=SandboxConfig(),
        docker=DockerSandboxConfig(),
        ssh_remote=SshRemoteConfig(),
        colab_drive=ColabDriveConfig(),
        code_agent=CodeAgentConfig(),
        benchmark_agent=BenchmarkAgentConfig(),
        figure_agent=FigureAgentConfig(),
    )
    defaults.update(overrides)
    return ExperimentConfig(**defaults)


# ===========================================================================
# SSH Remote: unit tests
# ===========================================================================


class TestSshTarget:
    """_ssh_target: user@host string construction."""

    def test_with_user(self):
        cfg = SshRemoteConfig(host="gpu.lab.edu", user="alice")
        assert _ssh_target(cfg) == "alice@gpu.lab.edu"

    def test_without_user(self):
        cfg = SshRemoteConfig(host="gpu.lab.edu")
        assert _ssh_target(cfg) == "gpu.lab.edu"


class TestBuildSshBase:
    """_build_ssh_base: argv construction for port and identity file."""

    def test_default_port(self):
        cfg = SshRemoteConfig(host="server", user="bob")
        cmd = _build_ssh_base(cfg)
        assert "ssh" in cmd
        assert "bob@server" in cmd
        assert "-p" not in cmd

    def test_custom_port(self):
        cfg = SshRemoteConfig(host="server", user="bob", port=2222)
        cmd = _build_ssh_base(cfg)
        idx = cmd.index("-p")
        assert cmd[idx + 1] == "2222"

    def test_key_path(self):
        cfg = SshRemoteConfig(host="server", key_path="~/.ssh/my_key")
        cmd = _build_ssh_base(cfg)
        assert "-i" in cmd


class TestSshRemoteSandboxCommands:
    """Remote exec command strings: bare (unshare) and docker variants."""

    def test_bare_exec_cmd(self, tmp_path: Path):
        cfg = SshRemoteConfig(
            host="server",
            user="test",
            gpu_ids=(0, 1),
            remote_python="python3",
        )
        sb = SshRemoteSandbox(cfg, tmp_path)
        cmd = sb._build_bare_exec_cmd("/tmp/rc-test", entry_point="main.py")
        assert "CUDA_VISIBLE_DEVICES=0,1" in cmd
        assert "HOME=/tmp/rc-test" in cmd
        assert "python3 -u main.py" in cmd
        # Bare mode isolates networking via `unshare --net`.
        assert "unshare --net" in cmd

    def test_bare_exec_no_gpu(self, tmp_path: Path):
        cfg = SshRemoteConfig(host="server", user="test")
        sb = SshRemoteSandbox(cfg, tmp_path)
        cmd = sb._build_bare_exec_cmd("/tmp/rc-test", entry_point="main.py")
        assert "CUDA_VISIBLE_DEVICES" not in cmd

    def test_docker_exec_cmd(self, tmp_path: Path):
        cfg = SshRemoteConfig(
            host="server",
            user="test",
            use_docker=True,
            docker_image="myimage:latest",
            docker_network_policy="none",
            docker_memory_limit_mb=4096,
            docker_shm_size_mb=1024,
            gpu_ids=(0,),
        )
        sb = SshRemoteSandbox(cfg, tmp_path)
        cmd = sb._build_docker_exec_cmd("/tmp/rc-test", entry_point="main.py")
        assert "docker run --rm" in cmd
        assert "-v /tmp/rc-test:/workspace" in cmd
        assert "--network none" in cmd
        assert "--memory=4096m" in cmd
        assert "--shm-size=1024m" in cmd
        assert "device=0" in cmd
        assert "myimage:latest" in cmd
        assert cmd.endswith("main.py")

    def test_docker_exec_full_network(self, tmp_path: Path):
        cfg = SshRemoteConfig(
            host="server",
            use_docker=True,
            docker_network_policy="full",
        )
        sb = SshRemoteSandbox(cfg, tmp_path)
        cmd = sb._build_docker_exec_cmd("/tmp/rc-test", entry_point="main.py")
        # "full" policy omits any --network flag entirely.
        assert "--network" not in cmd


# ── Entry point path traversal validation ─────────────────────────────


class TestSshEntryPointValidation:
    def test_run_project_rejects_path_traversal(self, tmp_path: Path):
        """run_project() must reject entry_point with '..' components."""
        project = tmp_path / "proj"
        project.mkdir()
        (project / "main.py").write_text("print('hi')")
        cfg = SshRemoteConfig(host="server", user="test")
        work = tmp_path / "work"
        sandbox = SshRemoteSandbox(cfg, work)
        # Create escape target so .exists() alone wouldn't catch it
        work.mkdir(parents=True, exist_ok=True)
        (work / "escape.py").write_text("print('escaped!')")
        # Mock _execute to ensure it's never reached
        sandbox._execute = mock.MagicMock()  # type: ignore[assignment]
        result = sandbox.run_project(project, entry_point="../escape.py")
        assert result.returncode == -1
        assert ".." in result.stderr
        sandbox._execute.assert_not_called()

    def test_run_project_rejects_absolute_path(self, tmp_path: Path):
        """run_project() must reject absolute entry_point paths."""
        project = tmp_path / "proj"
        project.mkdir()
        (project / "main.py").write_text("print('hi')")
        cfg = SshRemoteConfig(host="server", user="test")
        sandbox = SshRemoteSandbox(cfg, tmp_path / "work")
        sandbox._execute = mock.MagicMock()  # type: ignore[assignment]
        result = sandbox.run_project(project, entry_point="/etc/passwd")
        assert result.returncode == -1
        assert "relative" in result.stderr.lower() or "absolute" in result.stderr.lower()
        sandbox._execute.assert_not_called()


class TestSshConnectivityCheck:
    def test_empty_host(self):
        cfg = SshRemoteConfig(host="")
        ok, msg = SshRemoteSandbox.check_ssh_available(cfg)
        assert not ok
        assert "empty" in msg

    def test_unreachable_host(self):
        # .invalid TLD is guaranteed not to resolve (RFC 2606).
        cfg = SshRemoteConfig(host="nonexistent-host-12345.invalid")
        ok, msg = SshRemoteSandbox.check_ssh_available(cfg)
        assert not ok


class TestSshSandboxRun:
    """Test run() with mocked SSH commands."""

    def test_run_success(self, tmp_path: Path):
        cfg = SshRemoteConfig(host="fake", user="test")
        sb = SshRemoteSandbox(cfg, tmp_path)
        # One fake result per expected SSH invocation, in call order.
        fake_results = [
            mock.Mock(returncode=0, stdout="", stderr=""),  # mkdir
            mock.Mock(returncode=0, stdout="accuracy: 0.95\nloss: 0.05", stderr=""),  # exec
            mock.Mock(returncode=0, stdout="", stderr=""),  # cleanup
        ]
        call_count = [0]

        def fake_ssh_run(command, *, timeout_sec=60):
            from researchclaw.experiment.ssh_sandbox import _SshResult
            idx = min(call_count[0], len(fake_results) - 1)
            r = fake_results[idx]
            call_count[0] += 1
            return _SshResult(
                returncode=r.returncode,
                stdout=r.stdout,
                stderr=r.stderr,
            )

        def fake_scp(local_dir, remote_dir):
            return True

        with mock.patch.object(sb, '_ssh_run', side_effect=fake_ssh_run):
            with mock.patch.object(sb, '_scp_upload', side_effect=fake_scp):
                result = sb.run("print('hello')", timeout_sec=60)
        assert result.returncode == 0
        # "key: value" lines in stdout are parsed into result.metrics.
        assert result.metrics.get("accuracy") == 0.95
        assert result.metrics.get("loss") == 0.05

    def test_run_upload_failure(self, tmp_path: Path):
        cfg = SshRemoteConfig(host="fake", user="test")
        sb = SshRemoteSandbox(cfg, tmp_path)
        from researchclaw.experiment.ssh_sandbox import _SshResult
        with mock.patch.object(sb, '_ssh_run', return_value=_SshResult(0, "", "")):
            with mock.patch.object(sb, '_scp_upload', return_value=False):
                result = sb.run("print('hello')")
        assert result.returncode == -1
        assert "Failed to upload" in result.stderr


# ===========================================================================
# Colab Drive: unit tests
# ===========================================================================


class TestColabDriveCheck:
    def test_empty_root(self):
        cfg = ColabDriveConfig(drive_root="")
        ok, msg = ColabDriveSandbox.check_drive_available(cfg)
        assert not ok
        assert "empty" in msg

    def test_nonexistent_root(self):
        cfg = ColabDriveConfig(drive_root="/nonexistent/path/12345")
        ok, msg = ColabDriveSandbox.check_drive_available(cfg)
        assert not ok
        assert "not found" in msg

    def test_existing_root(self, tmp_path: Path):
        cfg = ColabDriveConfig(drive_root=str(tmp_path))
        ok, msg = ColabDriveSandbox.check_drive_available(cfg)
        assert ok


class TestColabDriveSandbox:
    def test_submit_and_collect(self, tmp_path: Path):
        """Simulate the full flow: submit task → worker picks up → collect result."""
        drive_root = tmp_path / "drive"
        drive_root.mkdir()
        cfg = ColabDriveConfig(
            drive_root=str(drive_root),
            poll_interval_sec=1,
            timeout_sec=10,
        )
        sb = ColabDriveSandbox(cfg, tmp_path / "workdir")
        # Simulate worker in a thread: move pending → done with result
        import threading

        def fake_worker():
            pending = drive_root / "pending"
            done = drive_root / "done"
            for _ in range(20):  # poll for up to 20 seconds
                if pending.exists():
                    for task_dir in pending.iterdir():
                        if task_dir.is_dir():
                            done.mkdir(parents=True, exist_ok=True)
                            done_dir = done / task_dir.name
                            task_dir.rename(done_dir)
                            (done_dir / "result.json").write_text(json.dumps({
                                "returncode": 0,
                                "stdout": "primary_metric: 42.0\naccuracy: 0.99",
                                "stderr": "",
                            }))
                            return
                time.sleep(0.5)

        worker = threading.Thread(target=fake_worker, daemon=True)
        worker.start()
        result = sb.run("print('experiment')", timeout_sec=15)
        worker.join(timeout=5)
        assert result.returncode == 0
        assert result.metrics.get("primary_metric") == 42.0
        assert result.metrics.get("accuracy") == 0.99

    def test_timeout(self, tmp_path: Path):
        """If worker never picks up, should timeout."""
        drive_root = tmp_path / "drive"
        drive_root.mkdir()
        cfg = ColabDriveConfig(
            drive_root=str(drive_root),
            poll_interval_sec=1,
            timeout_sec=3,
        )
        sb = ColabDriveSandbox(cfg, tmp_path / "workdir")
        result = sb.run("print('hello')", timeout_sec=3)
        assert result.timed_out
        assert result.returncode == -1
        assert "did not complete" in result.stderr

    def test_setup_script_written(self, tmp_path: Path):
        drive_root = tmp_path / "drive"
        drive_root.mkdir()
        cfg = ColabDriveConfig(
            drive_root=str(drive_root),
            poll_interval_sec=1,
            timeout_sec=3,
            setup_script="pip install torch -q",
        )
        sb = ColabDriveSandbox(cfg, tmp_path / "workdir")
        # Just submit, don't wait for result
        staging = tmp_path / "workdir" / "_colab_1"
        staging.mkdir(parents=True, exist_ok=True)
        (staging / "main.py").write_text("print('hi')")
        sb._write_setup_script(staging)
        setup_sh = staging / "setup.sh"
        assert setup_sh.exists()
        content = setup_sh.read_text()
        assert "pip install torch -q" in content


class TestColabWorkerTemplate:
    def test_template_not_empty(self):
        assert len(COLAB_WORKER_TEMPLATE) > 100

    def test_template_has_key_elements(self):
        assert "pending" in COLAB_WORKER_TEMPLATE
        assert "done" in COLAB_WORKER_TEMPLATE
        assert "result.json" in COLAB_WORKER_TEMPLATE
        assert "drive.mount" in COLAB_WORKER_TEMPLATE


# ===========================================================================
# Factory integration tests
# ===========================================================================


class TestFactoryIntegration:
    def test_ssh_remote_requires_host(self, tmp_path: Path):
        cfg = _make_experiment_config(
            mode="ssh_remote",
            ssh_remote=SshRemoteConfig(host=""),
        )
        with pytest.raises(RuntimeError, match="host"):
            create_sandbox(cfg, tmp_path)

    def test_ssh_remote_checks_connectivity(self, tmp_path: Path):
        cfg = _make_experiment_config(
            mode="ssh_remote",
            ssh_remote=SshRemoteConfig(host="nonexistent.invalid"),
        )
        with pytest.raises(RuntimeError, match="SSH connectivity"):
            create_sandbox(cfg, tmp_path)

    def test_colab_drive_requires_root(self, tmp_path: Path):
        cfg = _make_experiment_config(
            mode="colab_drive",
            colab_drive=ColabDriveConfig(drive_root=""),
        )
        with pytest.raises(RuntimeError, match="empty"):
            create_sandbox(cfg, tmp_path)

    def test_colab_drive_checks_path(self, tmp_path: Path):
        cfg = _make_experiment_config(
            mode="colab_drive",
            colab_drive=ColabDriveConfig(drive_root="/nonexistent/12345"),
        )
        with pytest.raises(RuntimeError, match="not found"):
            create_sandbox(cfg, tmp_path)

    def test_colab_drive_creates_sandbox(self, tmp_path: Path):
        drive_root = tmp_path / "drive"
        drive_root.mkdir()
        cfg = _make_experiment_config(
            mode="colab_drive",
            colab_drive=ColabDriveConfig(drive_root=str(drive_root)),
        )
        sb = create_sandbox(cfg, tmp_path / "workdir")
        assert isinstance(sb, ColabDriveSandbox)


# ===========================================================================
# ACP timeout fix test
# ===========================================================================


class TestAcpTimeoutFix:
    def test_timeout_passed_from_config(self):
        # RCConfig is imported here but the test only mocks its shape.
        from researchclaw.config import RCConfig, AcpConfig, LlmConfig
        from researchclaw.llm.acp_client import ACPClient, ACPConfig
        acp_cfg = AcpConfig(agent="codex", timeout_sec=1500)
        llm_cfg = LlmConfig(provider="acp", acp=acp_cfg)
        # Simulate RCConfig with just the fields ACPClient.from_rc_config uses
        fake_rc = mock.Mock()
        fake_rc.llm = llm_cfg
        client = ACPClient.from_rc_config(fake_rc)
        assert client.config.timeout_sec == 1500

    def test_timeout_default(self):
        from researchclaw.llm.acp_client import ACPClient
        fake_rc = mock.Mock()
        fake_rc.llm.acp.agent = "claude"
        fake_rc.llm.acp.cwd = "."
        fake_rc.llm.acp.acpx_command = ""
        fake_rc.llm.acp.session_name = "test"
        fake_rc.llm.acp.timeout_sec = 600
        client = ACPClient.from_rc_config(fake_rc)
        assert client.config.timeout_sec == 600


# ===========================================================================
# ACP session reconnect tests (Issue #52)
# ===========================================================================


class TestAcpSessionReconnect:
    def test_reconnect_on_session_died(self):
        """_send_prompt retries when session dies with 'agent needs reconnect'."""
        from researchclaw.llm.acp_client import ACPClient, ACPConfig
        client = ACPClient(ACPConfig(agent="claude"))
        client._acpx = "/usr/bin/true"
        client._session_ready = True
        call_count = 0

        def fake_cli(acpx: str, prompt: str) -> str:
            nonlocal call_count
            call_count += 1
            if call_count == 1:
                raise RuntimeError("ACP prompt failed (exit 1): agent needs reconnect")
            return "success response"

        client._send_prompt_cli = fake_cli  # type: ignore[assignment]
        client._ensure_session = lambda: None  # type: ignore[assignment]
        client._force_reconnect = lambda: None  # type: ignore[assignment]
        result = client._send_prompt("test prompt")
        assert result == "success response"
        assert call_count == 2

    def test_reconnect_exhausted_raises(self):
        """_send_prompt raises after exhausting reconnect attempts."""
        from researchclaw.llm.acp_client import ACPClient, ACPConfig
        client = ACPClient(ACPConfig(agent="claude"))
        client._acpx = "/usr/bin/true"
        client._session_ready = True

        def always_fail(acpx: str, prompt: str) -> str:
            raise RuntimeError("ACP prompt failed (exit 1): session not found")

        client._send_prompt_cli = always_fail  # type: ignore[assignment]
        client._ensure_session = lambda: None  # type: ignore[assignment]
        client._force_reconnect = lambda: None  # type: ignore[assignment]
        import pytest
        with pytest.raises(RuntimeError, match="session not found"):
            client._send_prompt("test prompt")

    def test_non_reconnectable_error_raises_immediately(self):
        """_send_prompt does not retry on non-session errors."""
        from researchclaw.llm.acp_client import ACPClient, ACPConfig
        client = ACPClient(ACPConfig(agent="claude"))
        client._acpx = "/usr/bin/true"
        client._session_ready = True
        call_count = 0

        def fail_with_other_error(acpx: str, prompt: str) -> str:
            nonlocal call_count
            call_count += 1
            raise RuntimeError("ACP prompt failed (exit 1): permission denied")

        client._send_prompt_cli = fail_with_other_error  # type: ignore[assignment]
        client._ensure_session = lambda: None  # type: ignore[assignment]
        import pytest
        with pytest.raises(RuntimeError, match="permission denied"):
            client._send_prompt("test prompt")
        assert call_count == 1  # no retry



================================================
FILE: tests/test_trends.py
================================================
"""Tests for researchclaw.trends — Research Trend Tracker (Agent D1).

25+ tests covering feeds, trend_analyzer, opportunity_finder,
daily_digest, auto_topic, and literature/trends.
""" from __future__ import annotations import asyncio from datetime import date from pathlib import Path from typing import Any from unittest.mock import MagicMock, patch import pytest from researchclaw.trends.feeds import FeedManager from researchclaw.trends.trend_analyzer import TrendAnalyzer, _STOPWORDS from researchclaw.trends.opportunity_finder import OpportunityFinder from researchclaw.trends.daily_digest import DailyDigest from researchclaw.trends.auto_topic import AutoTopicGenerator from researchclaw.literature.trends import LiteratureTrendAnalyzer # --------------------------------------------------------------------------- # Fixtures # --------------------------------------------------------------------------- def _make_papers(n: int = 10) -> list[dict[str, Any]]: """Generate synthetic papers for testing.""" papers = [] for i in range(n): papers.append({ "title": f"Transformer attention mechanism for graph neural networks part {i}", "authors": [ {"name": "Alice Smith"}, {"name": "Bob Jones"}, ] if i % 2 == 0 else ["Alice Smith", "Charlie Brown"], "abstract": ( "We propose a transformer-based attention approach for " "graph neural networks using contrastive learning on ImageNet " "and CIFAR datasets. Our diffusion model achieves SOTA results." 
), "url": f"https://arxiv.org/abs/2026.{i:05d}", "source": "arxiv" if i % 2 == 0 else "semantic_scholar", "published_date": "2026-03-01", }) return papers class MockLLM: async def chat_async(self, prompt: str) -> str: return ( "TOPIC: Graph transformers for drug discovery | " "WHY: Rising trend | FEASIBILITY: high\n" "TOPIC: Diffusion models for 3D generation | " "WHY: New paradigm | FEASIBILITY: medium\n" ) class FailingLLM: async def chat_async(self, prompt: str) -> str: raise RuntimeError("API error") # =================================================================== # FeedManager tests # =================================================================== class TestFeedManager: def test_init_filters_supported_sources(self): fm = FeedManager(sources=("arxiv", "invalid_source", "semantic_scholar")) assert fm.sources == ("arxiv", "semantic_scholar") def test_supported_sources(self): assert "arxiv" in FeedManager.SUPPORTED_SOURCES assert "semantic_scholar" in FeedManager.SUPPORTED_SOURCES assert "openalex" in FeedManager.SUPPORTED_SOURCES def test_fetch_deduplicates_by_title(self): fm = FeedManager(sources=("arxiv",)) # Mock _fetch_arxiv to return duplicates papers = [ {"title": "Same Title", "source": "arxiv"}, {"title": "Same Title", "source": "arxiv"}, {"title": "Different Title", "source": "arxiv"}, ] with patch.object(fm, "_fetch_arxiv", return_value=papers): result = fm.fetch_recent_papers(["ml"], max_papers=10) assert len(result) == 2 def test_fetch_respects_max_papers(self): fm = FeedManager(sources=("arxiv",)) papers = [{"title": f"Paper {i}", "source": "arxiv"} for i in range(20)] with patch.object(fm, "_fetch_arxiv", return_value=papers): result = fm.fetch_recent_papers(["ml"], max_papers=5) assert len(result) == 5 def test_fetch_handles_source_failure(self): fm = FeedManager(sources=("arxiv",)) with patch.object(fm, "_fetch_arxiv", side_effect=RuntimeError("fail")): result = fm.fetch_recent_papers(["ml"]) assert result == [] def 
test_fetch_empty_title_excluded(self): fm = FeedManager(sources=("arxiv",)) papers = [ {"title": "", "source": "arxiv"}, {"title": " ", "source": "arxiv"}, {"title": "Valid Paper", "source": "arxiv"}, ] with patch.object(fm, "_fetch_arxiv", return_value=papers): result = fm.fetch_recent_papers(["ml"]) assert len(result) == 1 # =================================================================== # TrendAnalyzer tests # =================================================================== class TestTrendAnalyzer: def test_analyze_empty(self): analyzer = TrendAnalyzer() result = analyzer.analyze([]) assert result["paper_count"] == 0 assert result["rising_keywords"] == [] def test_analyze_extracts_keywords(self): analyzer = TrendAnalyzer() papers = _make_papers(10) result = analyzer.analyze(papers) assert result["paper_count"] == 10 assert len(result["rising_keywords"]) > 0 def test_keywords_exclude_stopwords(self): analyzer = TrendAnalyzer() papers = _make_papers(10) result = analyzer.analyze(papers) for kw in result["rising_keywords"]: for word in kw["keyword"].split(): assert word not in _STOPWORDS def test_extract_authors_dict_format(self): analyzer = TrendAnalyzer() papers = [ {"authors": [{"name": "Alice"}, {"name": "Bob"}]} for _ in range(5) ] authors = analyzer._extract_authors(papers) assert any(a["author"] == "Alice" for a in authors) def test_extract_authors_string_format(self): analyzer = TrendAnalyzer() papers = [{"authors": ["Alice", "Bob"]} for _ in range(5)] authors = analyzer._extract_authors(papers) assert any(a["author"] == "Alice" for a in authors) def test_extract_datasets(self): analyzer = TrendAnalyzer() papers = [ {"title": "Training on ImageNet and CIFAR", "abstract": ""}, {"title": "MNIST results", "abstract": "evaluated on GLUE benchmark"}, ] datasets = analyzer._extract_datasets(papers) ds_names = {d["dataset"] for d in datasets} assert "ImageNet" in ds_names assert "CIFAR" in ds_names def test_extract_methods(self): analyzer = TrendAnalyzer() 
papers = [ {"title": "Transformer attention", "abstract": "using diffusion models"}, {"title": "GAN for images", "abstract": "contrastive learning approach"}, ] methods = analyzer._extract_methods(papers) method_names = {m["method"] for m in methods} assert "transformer" in method_names or "attention" in method_names def test_tokenize(self): tokens = TrendAnalyzer._tokenize("Hello World! It's a test-case.") assert "hello" in tokens assert "world" in tokens assert "it's" in tokens assert "test-case" in tokens def test_source_distribution(self): papers = [ {"source": "arxiv"}, {"source": "arxiv"}, {"source": "semantic_scholar"}, ] dist = TrendAnalyzer._source_distribution(papers) assert dist["arxiv"] == 2 assert dist["semantic_scholar"] == 1 def test_generate_trend_report(self): analyzer = TrendAnalyzer() analysis = analyzer.analyze(_make_papers(10)) report = analyzer.generate_trend_report(analysis) assert "Research Trend Analysis" in report assert "10 papers" in report def test_min_keyword_length(self): analyzer = TrendAnalyzer(min_keyword_length=5) papers = [{"title": "AI is a big deal", "abstract": ""}] * 5 keywords = analyzer._extract_keywords(papers) # Short words like "deal" (4 chars) should be excluded by min_keyword_length=5 # but "big" is only 3 chars so excluded too for kw in keywords: for word in kw["keyword"].split(): assert len(word) >= 5 or word in _STOPWORDS # =================================================================== # OpportunityFinder tests # =================================================================== class TestOpportunityFinder: def test_heuristic_no_llm(self): finder = OpportunityFinder() trend_analysis = { "rising_keywords": [ {"keyword": "graph neural", "count": 10}, {"keyword": "attention", "count": 8}, {"keyword": "diffusion", "count": 6}, ], "method_trends": [ {"method": "transformer", "mention_count": 12}, {"method": "contrastive learning", "mention_count": 7}, ], } result = 
asyncio.run(finder.find_opportunities(trend_analysis, ["ml"])) assert len(result) > 0 assert all("topic" in opp for opp in result) assert all(opp["source"] == "heuristic" for opp in result) def test_heuristic_empty_trends(self): finder = OpportunityFinder() result = asyncio.run(finder.find_opportunities( {"rising_keywords": [], "method_trends": []}, ["ml"] )) assert result == [] def test_llm_path(self): finder = OpportunityFinder(llm_client=MockLLM()) trend_analysis = { "rising_keywords": [{"keyword": "graph", "count": 10}], "method_trends": [{"method": "transformer", "mention_count": 5}], } result = asyncio.run(finder.find_opportunities(trend_analysis, ["ml"])) assert len(result) >= 1 assert result[0]["source"] == "llm" def test_llm_fallback_on_failure(self): finder = OpportunityFinder(llm_client=FailingLLM()) trend_analysis = { "rising_keywords": [{"keyword": "test", "count": 5}], "method_trends": [{"method": "GAN", "mention_count": 3}], } result = asyncio.run(finder.find_opportunities(trend_analysis, ["ml"])) assert all(opp["source"] == "heuristic" for opp in result) def test_parse_opportunities(self): response = ( "TOPIC: Adaptive transformers | WHY: Trending | FEASIBILITY: high\n" "TOPIC: Diffusion for audio | WHY: New area | FEASIBILITY: medium\n" "Some noise line\n" ) result = OpportunityFinder._parse_opportunities(response) assert len(result) == 2 assert result[0]["topic"] == "Adaptive transformers" assert result[0]["feasibility"] == "high" def test_heuristic_max_five(self): finder = OpportunityFinder() trend_analysis = { "rising_keywords": [ {"keyword": f"kw{i}", "count": 10} for i in range(10) ], "method_trends": [ {"method": f"method{i}", "mention_count": 5} for i in range(10) ], } result = asyncio.run(finder.find_opportunities(trend_analysis, ["ml"])) assert len(result) <= 5 # =================================================================== # DailyDigest tests # =================================================================== class 
TestDailyDigest: def test_generate_basic_no_papers(self): fm = FeedManager(sources=()) digest = DailyDigest(fm) result = asyncio.run(digest.generate(["ml"])) assert "No new papers found" in result def test_generate_basic_with_papers(self): fm = FeedManager(sources=("arxiv",)) papers = _make_papers(3) with patch.object(fm, "fetch_recent_papers", return_value=papers): digest = DailyDigest(fm) result = asyncio.run(digest.generate(["ml"])) assert "Daily Paper Digest" in result assert "Papers found: 3" in result def test_generate_basic_truncates_abstract(self): fm = FeedManager(sources=("arxiv",)) papers = [{"title": "Test", "abstract": "x" * 500, "authors": [], "url": ""}] with patch.object(fm, "fetch_recent_papers", return_value=papers): digest = DailyDigest(fm) result = asyncio.run(digest.generate(["ml"])) assert "..." in result def test_parse_summary_valid(self): response = "SUMMARY: Great paper on attention | RELEVANCE: 4" summary, relevance = DailyDigest._parse_summary(response) assert summary == "Great paper on attention" assert relevance == 4 def test_parse_summary_no_format(self): response = "Just a plain text response." 
summary, relevance = DailyDigest._parse_summary(response) assert summary == response assert relevance == 3 # default def test_parse_summary_clamped(self): response = "SUMMARY: x | RELEVANCE: 99" _, relevance = DailyDigest._parse_summary(response) assert relevance == 5 def test_generate_and_save(self, tmp_path: Path): fm = FeedManager(sources=("arxiv",)) papers = _make_papers(2) with patch.object(fm, "fetch_recent_papers", return_value=papers): digest = DailyDigest(fm) result_path = asyncio.run(digest.generate_and_save(tmp_path, ["ml"])) assert result_path.exists() assert result_path.read_text(encoding="utf-8").startswith("## Daily Paper Digest") def test_author_formatting_dict(self): fm = FeedManager(sources=("arxiv",)) papers = [{ "title": "T", "abstract": "", "url": "", "authors": [{"name": "A"}, {"name": "B"}, {"name": "C"}, {"name": "D"}], }] with patch.object(fm, "fetch_recent_papers", return_value=papers): digest = DailyDigest(fm) result = asyncio.run(digest.generate(["ml"])) assert "et al." 
in result # =================================================================== # AutoTopicGenerator tests # =================================================================== class TestAutoTopicGenerator: def test_generate_candidates(self): analyzer = TrendAnalyzer() finder = OpportunityFinder() gen = AutoTopicGenerator(analyzer, finder) papers = _make_papers(10) candidates = asyncio.run(gen.generate_candidates(["ml"], papers, count=3)) assert len(candidates) <= 3 if candidates: assert "topic" in candidates[0] assert "overall_score" in candidates[0] def test_generate_candidates_empty(self): analyzer = TrendAnalyzer() finder = OpportunityFinder() gen = AutoTopicGenerator(analyzer, finder) candidates = asyncio.run(gen.generate_candidates(["ml"], [], count=3)) # With empty papers, heuristic has no keywords/methods → no opportunities assert isinstance(candidates, list) def test_auto_select_default_fallback(self): analyzer = TrendAnalyzer() finder = OpportunityFinder() gen = AutoTopicGenerator(analyzer, finder) result = asyncio.run(gen.auto_select(["ml"], [])) assert "topic" in result assert result["source"] == "default" def test_score_candidate_feasibility(self): opp_high = {"topic": "unique topic xyz", "feasibility": "high"} opp_low = {"topic": "unique topic xyz", "feasibility": "low"} trend = {"rising_keywords": [], "paper_count": 50} score_h = AutoTopicGenerator._score_candidate(opp_high, trend) score_l = AutoTopicGenerator._score_candidate(opp_low, trend) assert score_h["feasibility"] == 0.9 assert score_l["feasibility"] == 0.3 assert score_h["overall"] > score_l["overall"] def test_score_candidate_novelty_decay(self): opp = {"topic": "graph neural", "feasibility": "medium"} trend = { "rising_keywords": [ {"keyword": "graph neural", "count": 10}, {"keyword": "neural network", "count": 8}, ], "paper_count": 50, } score = AutoTopicGenerator._score_candidate(opp, trend) assert score["novelty"] < 1.0 # overlap penalizes novelty def test_score_candidate_weights(self): 
"""Overall = 0.4*novelty + 0.3*feasibility + 0.3*impact.""" opp = {"topic": "totally unique xyz", "feasibility": "high"} trend = {"rising_keywords": [], "paper_count": 50} score = AutoTopicGenerator._score_candidate(opp, trend) expected = round(0.4 * score["novelty"] + 0.3 * score["feasibility"] + 0.3 * score["impact"], 3) assert score["overall"] == expected def test_format_candidates_empty(self): analyzer = TrendAnalyzer() finder = OpportunityFinder() gen = AutoTopicGenerator(analyzer, finder) assert "No candidate" in gen.format_candidates([]) def test_format_candidates_with_data(self): analyzer = TrendAnalyzer() finder = OpportunityFinder() gen = AutoTopicGenerator(analyzer, finder) candidates = [ { "topic": "Test topic", "overall_score": 0.75, "novelty_score": 0.8, "feasibility_score": 0.7, "impact_score": 0.6, "rationale": "Good idea", } ] output = gen.format_candidates(candidates) assert "Test topic" in output assert "0.75" in output # =================================================================== # LiteratureTrendAnalyzer tests # =================================================================== class TestLiteratureTrendAnalyzer: def test_no_client_returns_empty(self): lta = LiteratureTrendAnalyzer() assert lta.get_daily_papers(["ml"]) == [] def test_analyze_keyword_trends_no_client(self): lta = LiteratureTrendAnalyzer() result = lta.analyze_keyword_trends(["ml"]) assert result["total_papers"] == 0 def test_find_emerging_topics_no_client(self): lta = LiteratureTrendAnalyzer() assert lta.find_emerging_topics(["ml"]) == [] def test_find_emerging_topics_filters_bigrams(self): """Only bigrams with count >= 3 are considered emerging.""" lta = LiteratureTrendAnalyzer(search_client="fake") papers = _make_papers(20) with patch.object(lta, "get_daily_papers", return_value=papers): topics = lta.find_emerging_topics(["ml"]) for t in topics: assert t["type"] == "bigram" assert t["frequency"] >= 3 ================================================ FILE: 
tests/test_universal_codegen_integration.py
================================================
"""Integration tests for universal cross-domain code generation.

Tests the full pipeline from domain detection → adapter selection → prompt
block generation → blueprint context building, across multiple research
domains.

These tests do NOT require an LLM or network — they verify the
infrastructure wiring.
"""

from __future__ import annotations

import json
import pytest
from pathlib import Path
from unittest.mock import MagicMock, patch

from researchclaw.domains.detector import (
    DomainProfile,
    detect_domain,
    get_profile,
    is_ml_domain,
    load_all_profiles,
)
from researchclaw.domains.prompt_adapter import get_adapter, PromptBlocks
from researchclaw.domains.experiment_schema import (
    Condition,
    ConditionRole,
    EvaluationSpec,
    MetricSpec,
    UniversalExperimentPlan,
    from_legacy_exp_plan,
)
from researchclaw.experiment.metrics import UniversalMetricParser
from researchclaw.experiment.evaluators.convergence import analyze_convergence
from researchclaw.agents.code_searcher.agent import CodeSearchAgent, CodeSearchResult
from researchclaw.agents.code_searcher.pattern_extractor import CodePatterns


# ---------------------------------------------------------------------------
# Cross-domain domain detection integration
# ---------------------------------------------------------------------------


class TestCrossDomainDetection:
    """Test domain detection across all supported domains."""

    def test_all_profiles_loadable(self):
        profiles = load_all_profiles()
        assert len(profiles) >= 18  # at least 18 domain profiles

    def test_ml_vision_full_pipeline(self):
        """ML Vision: detect → adapter → blocks → legacy compatibility."""
        profile = detect_domain("image classification on CIFAR-10 with ResNet")
        assert profile.domain_id == "ml_vision"
        assert is_ml_domain(profile)
        adapter = get_adapter(profile)
        blocks = adapter.get_code_generation_blocks({})
        # ML adapter returns empty blocks (existing behavior)
        assert blocks.compute_budget == ""

    def test_physics_pde_full_pipeline(self):
        """Physics PDE: detect → adapter → blocks with convergence guidance."""
        profile = detect_domain("finite element method for Poisson equation")
        assert profile.domain_id == "physics_pde"
        assert not is_ml_domain(profile)
        adapter = get_adapter(profile)
        blocks = adapter.get_code_generation_blocks({})
        # Physics adapter should provide non-empty guidance
        assert blocks.code_generation_hints  # not empty
        # Blueprint context should mention convergence
        ctx = adapter.get_blueprint_context()
        assert ctx  # not empty

    def test_economics_full_pipeline(self):
        """Economics: detect → adapter → progressive spec guidance."""
        profile = detect_domain("panel data regression with instrumental variables")
        assert profile.domain_id == "economics_empirical"
        adapter = get_adapter(profile)
        blocks = adapter.get_experiment_design_blocks({})
        assert "progressive" in blocks.experiment_design_context.lower()

    def test_chemistry_full_pipeline(self):
        """Chemistry: detect → adapter → PySCF guidance."""
        profile = detect_domain("DFT calculation with PySCF for molecular energies")
        assert profile.domain_id == "chemistry_qm"
        adapter = get_adapter(profile)
        blocks = adapter.get_code_generation_blocks({})
        assert blocks.code_generation_hints

    def test_biology_full_pipeline(self):
        """Biology: detect → adapter → scanpy guidance."""
        profile = detect_domain("single-cell RNA-seq clustering with scanpy")
        assert profile.domain_id == "biology_singlecell"
        adapter = get_adapter(profile)
        blocks = adapter.get_code_generation_blocks({})
        assert blocks.code_generation_hints

    def test_math_full_pipeline(self):
        """Math: detect → adapter → convergence guidance."""
        profile = detect_domain("Runge-Kutta ODE solver convergence analysis")
        assert profile.domain_id == "mathematics_numerical"
        adapter = get_adapter(profile)
        blocks = adapter.get_code_generation_blocks({})
        assert blocks.code_generation_hints


# ---------------------------------------------------------------------------
# Universal Experiment Schema integration
# ---------------------------------------------------------------------------


class TestExperimentSchemaIntegration:
    """Build UniversalExperimentPlan instances and verify legacy conversion."""

    def test_physics_convergence_plan(self):
        """Create a physics convergence study plan."""
        plan = UniversalExperimentPlan(
            experiment_type="convergence",
            domain_id="physics_pde",
            problem_description="Solve Poisson equation with FEM and FDM",
            conditions=[
                Condition(name="FDM_2nd", role="reference",
                          description="2nd order finite difference"),
                Condition(name="FEM_P1", role="proposed",
                          description="P1 finite element method"),
                Condition(name="FEM_P2", role="variant", varies_from="FEM_P1",
                          description="P2 finite element method"),
            ],
            evaluation=EvaluationSpec(
                primary_metric=MetricSpec(
                    name="l2_error",
                    direction="minimize",
                    unit="relative",
                ),
                protocol="Run at 5 grid sizes, measure L2 error",
                statistical_test="convergence_order_fit",
                num_seeds=1,
            ),
            main_figure_type="convergence_plot",
        )
        assert len(plan.references) == 1
        assert len(plan.proposed) == 1
        assert len(plan.variants) == 1

        # Test legacy format conversion
        legacy = plan.to_legacy_format()
        assert len(legacy["baselines"]) == 1
        assert legacy["baselines"][0]["name"] == "FDM_2nd"
        assert "l2_error" in legacy["metrics"]

        # Test YAML serialization
        yaml_str = plan.to_yaml()
        assert "convergence" in yaml_str
        assert "FDM_2nd" in yaml_str

    def test_economics_progressive_plan(self):
        """Create an economics progressive specification plan."""
        plan = UniversalExperimentPlan(
            experiment_type="progressive_spec",
            domain_id="economics_empirical",
            conditions=[
                Condition(name="OLS", role="reference", description="Simple OLS"),
                Condition(name="OLS_controls", role="proposed",
                          description="OLS with control variables"),
                Condition(name="FE", role="variant", varies_from="OLS_controls",
                          description="Fixed effects"),
                Condition(name="IV_2SLS", role="variant", varies_from="OLS_controls",
                          description="Instrumental variables"),
            ],
            evaluation=EvaluationSpec(
                primary_metric=MetricSpec(name="coefficient", direction="maximize"),
                statistical_test="hausman_test",
            ),
            main_table_type="regression_table",
        )
        assert len(plan.conditions) == 4
        legacy = plan.to_legacy_format()
        assert len(legacy["ablations"]) == 2  # FE and IV are variants


# ---------------------------------------------------------------------------
# Metric Parser + Convergence Evaluator integration
# ---------------------------------------------------------------------------


class TestMetricConvergenceIntegration:
    """End-to-end: parse result files, then analyze convergence orders."""

    def test_json_convergence_end_to_end(self, tmp_path):
        """Parse JSON convergence results → analyze convergence → report."""
        data = {
            "experiment_type": "convergence",
            "convergence": {
                "euler": [
                    {"h": 0.1, "error": 0.1},
                    {"h": 0.05, "error": 0.05},
                    {"h": 0.025, "error": 0.025},
                    {"h": 0.0125, "error": 0.0125},
                ],
                "rk4": [
                    {"h": 0.1, "error": 1e-4},
                    {"h": 0.05, "error": 6.25e-6},
                    {"h": 0.025, "error": 3.9e-7},
                    {"h": 0.0125, "error": 2.44e-8},
                ],
            },
            "metadata": {"domain": "mathematics_numerical"},
        }
        (tmp_path / "results.json").write_text(json.dumps(data))

        # Parse
        parser = UniversalMetricParser()
        results = parser.parse(tmp_path)
        assert results.source == "json"
        assert "euler" in results.convergence

        # Analyze convergence
        report = analyze_convergence(
            results.convergence,
            expected_orders={"euler": 1.0, "rk4": 4.0},
        )
        assert len(report.methods) == 2
        euler = next(r for r in report.methods if r.method == "euler")
        rk4 = next(r for r in report.methods if r.method == "rk4")
        assert abs(euler.convergence_order - 1.0) < 0.2
        assert abs(rk4.convergence_order - 4.0) < 0.5
        assert rk4.convergence_order > euler.convergence_order
        assert report.best_method == "rk4"

    def test_flat_metrics_backward_compatible(self, tmp_path):
        """Ensure new metric parser produces backward-compatible output."""
        # Write old-style stdout
        result = UniversalMetricParser().parse(
            tmp_path,
            stdout="accuracy: 0.95\nloss: 0.32\ncondition=proposed accuracy: 0.95\n",
        )
        flat = result.to_flat_metrics()
        assert "accuracy" in flat
        assert "loss" in flat
        assert flat["accuracy"] == 0.95


# ---------------------------------------------------------------------------
# Code Search + Domain Profile integration
# ---------------------------------------------------------------------------


class TestCodeSearchIntegration:
    """Code-search results and domain adapters as prompt/blueprint context."""

    def test_code_search_result_in_blueprint(self):
        """Code search results should be formattable as prompt context."""
        result = CodeSearchResult(
            patterns=CodePatterns(
                api_patterns=[
                    "from pyscf import gto, scf\nmol = gto.M(atom='H 0 0 0; H 0 0 0.74', basis='sto-3g')",
                ],
                file_structure={"main.py": "Entry point", "molecule.py": "Molecule definitions"},
                evaluation_patterns=["mae = np.mean(np.abs(predicted - reference))"],
            ),
            repos_found=[
                MagicMock(full_name="user/pyscf-example", stars=200),
            ],
        )
        ctx = result.to_prompt_context()
        assert "pyscf" in ctx
        assert "molecule.py" in ctx

    def test_domain_adapter_blueprint_context(self):
        """Domain adapter should produce useful blueprint context."""
        profile = get_profile("physics_simulation")
        if profile is None:
            pytest.skip("physics_simulation profile not found")
        adapter = get_adapter(profile)
        ctx = adapter.get_blueprint_context()
        # Should mention file structure
        assert "main.py" in ctx or "integrator" in ctx.lower()
        # Should mention libraries
        assert "numpy" in ctx.lower() or "scipy" in ctx.lower() or ctx != ""


# ---------------------------------------------------------------------------
# CodeAgent domain injection test
# ---------------------------------------------------------------------------


class TestCodeAgentDomainInjection:
    """CodeAgent wiring of domain profiles and code-search results."""

    def test_code_agent_accepts_domain_profile(self):
        """CodeAgent should accept domain_profile and code_search_result."""
        from researchclaw.pipeline.code_agent import CodeAgent, CodeAgentConfig

        config = CodeAgentConfig(enabled=True)
        profile = DomainProfile(
            domain_id="physics_pde",
            display_name="PDE Solvers",
            core_libraries=["numpy", "scipy"],
        )
        search_result = CodeSearchResult(
            patterns=CodePatterns(
                api_patterns=["import scipy.sparse"],
            ),
        )
        agent = CodeAgent(
            llm=MagicMock(),
            prompts=MagicMock(),
            config=config,
            stage_dir=Path("/tmp/test"),
            domain_profile=profile,
            code_search_result=search_result,
        )
        # Verify the domain context builder works
        ctx = agent._build_domain_context()
        assert "scipy" in ctx.lower() or ctx != ""

    def test_code_agent_ml_domain_no_extra_context(self):
        """ML domain should add minimal extra context (preserve existing behavior)."""
        from researchclaw.pipeline.code_agent import CodeAgent, CodeAgentConfig

        config = CodeAgentConfig(enabled=True)
        profile = get_profile("ml_vision") or DomainProfile(
            domain_id="ml_vision",
            display_name="Computer Vision",
        )
        agent = CodeAgent(
            llm=MagicMock(),
            prompts=MagicMock(),
            config=config,
            stage_dir=Path("/tmp/test"),
            domain_profile=profile,
            code_search_result=None,  # No code search for ML
        )
        # ML adapter returns empty blocks → minimal context
        ctx = agent._build_domain_context()
        # It's acceptable for ML to have some context from file structure,
        # but it should NOT have code search results
        # (we didn't provide code_search_result)
        assert "Reference Code from GitHub" not in ctx


# ---------------------------------------------------------------------------
# Docker profile mapping test
# ---------------------------------------------------------------------------


class TestDockerProfileMapping:
    """Validate the domain→profile mapping shipped in docker_profiles.yaml."""

    def test_domain_to_docker_mapping(self):
        """All domains should map to a valid docker profile."""
        import yaml

        profiles_path = Path(__file__).parent.parent / "researchclaw" / "data" / "docker_profiles.yaml"
        if not profiles_path.exists():
            pytest.skip("docker_profiles.yaml not found")
        with profiles_path.open() as f:
            docker_config = yaml.safe_load(f)
        domain_map = docker_config.get("domain_map", {})
        profiles = docker_config.get("profiles", {})
        # Every mapped domain should point to a valid profile
        for domain_id, profile_name in domain_map.items():
            assert profile_name in profiles, (
                f"Domain {domain_id} maps to unknown profile: {profile_name}"
            )

    def test_all_loaded_domains_have_docker_mapping(self):
        """All domain profiles should have a docker mapping."""
        import yaml

        profiles_path = Path(__file__).parent.parent / "researchclaw" / "data" / "docker_profiles.yaml"
        if not profiles_path.exists():
            pytest.skip("docker_profiles.yaml not found")
        with profiles_path.open() as f:
            docker_config = yaml.safe_load(f)
        domain_map = docker_config.get("domain_map", {})
        domain_profiles = load_all_profiles()
        unmapped = []
        for domain_id in domain_profiles:
            if domain_id not in domain_map and domain_id != "generic":
                unmapped.append(domain_id)
        # Allow some unmapped (new domains without docker images yet)
        # but the core ones should be mapped
        core_domains = [
            "ml_vision", "ml_nlp", "ml_rl",
            "physics_simulation", "physics_pde",
            "chemistry_qm", "economics_empirical",
            "mathematics_numerical",
        ]
        for d in core_domains:
            assert d in domain_map, f"Core domain {d} missing from docker mapping"


================================================
FILE: tests/test_v6_improvements.py
================================================
"""Tests for V6 improvements (IMP-13 through IMP-16).

Run with:
    .venv/bin/python3 -m pytest tests/test_v6_improvements.py -v
or:
    .venv/bin/python3 tests/test_v6_improvements.py
"""

from __future__ import annotations

import re
import sys
import statistics
import random
import textwrap
from pathlib import Path


# ============================================================
# IMP-13: Test _extract_paper_title import & behaviour
# ============================================================


class TestIMP13_ExtractPaperTitle:
    """IMP-13: runner.py imports _extract_paper_title from executor.
Verify the import works and the function produces correct results.""" def test_import_works(self): """The import `from researchclaw.pipeline.executor import _extract_paper_title` must succeed — runner.py line 394 depends on it.""" from researchclaw.pipeline.executor import _extract_paper_title assert callable(_extract_paper_title), "_extract_paper_title should be callable" print("[IMP-13] PASS: import _extract_paper_title works") def test_extracts_h1_title(self): from researchclaw.pipeline.executor import _extract_paper_title md = textwrap.dedent("""\ # A Novel Approach to Deep Reinforcement Learning ## Abstract This paper presents... """) title = _extract_paper_title(md) assert title == "A Novel Approach to Deep Reinforcement Learning", \ f"Expected H1 title, got: {title!r}" print(f"[IMP-13] PASS: extracted title = {title!r}") def test_skips_abstract_heading(self): """Title before Abstract should be found; Abstract heading itself skipped.""" from researchclaw.pipeline.executor import _extract_paper_title md = textwrap.dedent("""\ # A Real Title of at Least Four Words ## Abstract Some text... """) title = _extract_paper_title(md) # "Abstract" should be skipped; the real title (before Abstract) is found assert title == "A Real Title of at Least Four Words", \ f"Expected real title, got: {title!r}" print(f"[IMP-13] PASS: skipped Abstract, got title = {title!r}") def test_title_after_abstract_not_found(self): """If the only real title is AFTER Abstract, it should not be found (function searches only before Abstract heading).""" from researchclaw.pipeline.executor import _extract_paper_title md = textwrap.dedent("""\ # Abstract # A Title That Appears After Abstract Some text... 
""") title = _extract_paper_title(md) # Title after Abstract is not in the search region, so fallback assert title == "Untitled Paper", \ f"Expected 'Untitled Paper' since title is after Abstract, got: {title!r}" print(f"[IMP-13] PASS: title after Abstract not found, fallback = {title!r}") def test_fallback_untitled(self): from researchclaw.pipeline.executor import _extract_paper_title md = "Just some text without any headings." title = _extract_paper_title(md) assert title == "Untitled Paper", f"Expected 'Untitled Paper', got: {title!r}" print(f"[IMP-13] PASS: fallback = {title!r}") def test_bold_title(self): from researchclaw.pipeline.executor import _extract_paper_title md = textwrap.dedent("""\ **A Bold Title for This Paper** ## Abstract Text here... """) title = _extract_paper_title(md) assert "Bold Title" in title, f"Expected bold title, got: {title!r}" print(f"[IMP-13] PASS: bold title = {title!r}") # ============================================================ # IMP-14: Test orphaned cite-key stripping logic # ============================================================ class TestIMP14_StripOrphanedCites: """IMP-14: After packaging, any \\cite{key} where key is not in references.bib should be stripped from paper.tex.""" @staticmethod def _run_cite_stripping(tex_text: str, bib_text: str) -> str: """Reproduce the IMP-14 logic from runner.py lines 505-532.""" all_cite_keys: set[str] = set() for cm in re.finditer(r"\\cite\{([^}]+)\}", tex_text): all_cite_keys.update(k.strip() for k in cm.group(1).split(",")) bib_keys = set(re.findall(r"@\w+\{([^,]+),", bib_text)) missing = all_cite_keys - bib_keys if missing: def _filter_cite(m: re.Match[str]) -> str: keys = [k.strip() for k in m.group(1).split(",")] kept = [k for k in keys if k not in missing] if not kept: return "" return "\\cite{" + ", ".join(kept) + "}" tex_text = re.sub(r"\\cite\{([^}]+)\}", _filter_cite, tex_text) tex_text = re.sub(r" +", " ", tex_text) tex_text = re.sub(r" ([.,;:)])", r"\1", tex_text) 
return tex_text def test_mixed_real_and_missing_keys(self): """\\cite{real_key, missing_key} should become \\cite{real_key}.""" tex = r"Some text \cite{real_key, missing_key} and more." bib = textwrap.dedent("""\ @article{real_key, author = {Doe}, title = {Real Paper}, year = {2024}, } """) result = self._run_cite_stripping(tex, bib) assert r"\cite{real_key}" in result, f"Expected \\cite{{real_key}}, got: {result!r}" assert "missing_key" not in result, f"missing_key should be gone: {result!r}" print(f"[IMP-14] PASS: mixed keys → {result!r}") def test_all_keys_missing(self): """\\cite{missing1, missing2} should be entirely removed.""" tex = r"Some text \cite{missing1, missing2} more." bib = "" # empty bib result = self._run_cite_stripping(tex, bib) assert r"\cite" not in result, f"Expected no \\cite, got: {result!r}" print(f"[IMP-14] PASS: all missing → {result!r}") def test_all_keys_valid(self): """When all keys are valid, tex should remain unchanged (except whitespace).""" tex = r"Text \cite{key1, key2} end." bib = textwrap.dedent("""\ @article{key1, author = {A}, title = {T}, year = {2024}, } @article{key2, author = {B}, title = {T2}, year = {2024}, } """) result = self._run_cite_stripping(tex, bib) assert r"\cite{key1, key2}" in result, f"Expected unchanged, got: {result!r}" print(f"[IMP-14] PASS: all valid → {result!r}") def test_multiple_cite_commands(self): """Multiple \\cite commands, each with different missing keys.""" tex = ( r"First \cite{a, b} second \cite{b, c} third \cite{d}." 
) bib = textwrap.dedent("""\ @article{a, author = {X}, title = {Y}, year = {2024}, } @article{c, author = {X}, title = {Y}, year = {2024}, } """) result = self._run_cite_stripping(tex, bib) # a is valid, b is missing, c is valid, d is missing assert r"\cite{a}" in result, f"Expected \\cite{{a}}, got: {result!r}" assert r"\cite{c}" in result, f"Expected \\cite{{c}}, got: {result!r}" # b should not appear as a cite key assert r"\cite{b}" not in result, f"\\cite{{b}} should be gone: {result!r}" assert r", b}" not in result and r"{b," not in result, \ f"b key should be stripped: {result!r}" # \cite{d} should be entirely removed (d was the only key) assert r"\cite{d}" not in result, f"\\cite{{d}} should be gone: {result!r}" print(f"[IMP-14] PASS: multiple cites → {result!r}") def test_whitespace_cleanup(self): """After removing a full \\cite{}, leftover double-spaces and ' .' are cleaned.""" tex = r"Text \cite{missing} end." bib = "" result = self._run_cite_stripping(tex, bib) # Should not have double spaces or " ." assert " " not in result, f"Double space in result: {result!r}" assert " ." 
not in result, f"Space-dot in result: {result!r}" print(f"[IMP-14] PASS: whitespace cleanup → {result!r}") # ============================================================ # IMP-15: Test BibTeX deduplication # ============================================================ class TestIMP15_BibDedup: """IMP-15: Deduplicate .bib entries sharing the same cite key.""" @staticmethod def _run_dedup(bib_text: str) -> str: """Reproduce IMP-15 logic from runner.py lines 486-503.""" _seen_bib_keys: set[str] = set() _deduped_entries: list[str] = [] for _bm in re.finditer( r"(@\w+\{([^,]+),.*?\n\})", bib_text, re.DOTALL ): _bkey = _bm.group(2).strip() if _bkey not in _seen_bib_keys: _seen_bib_keys.add(_bkey) _deduped_entries.append(_bm.group(1)) if len(_deduped_entries) < len( list(re.finditer(r"@\w+\{", bib_text)) ): bib_text = "\n\n".join(_deduped_entries) + "\n" return bib_text def test_duplicate_entries_removed(self): bib = textwrap.dedent("""\ @article{smith2024, author = {Smith}, title = {Paper 1}, year = {2024}, } @article{smith2024, author = {Smith}, title = {Paper 1 duplicate}, year = {2024}, } @article{jones2023, author = {Jones}, title = {Paper 2}, year = {2023}, } """) result = self._run_dedup(bib) # Count how many @article{smith2024, appear count_smith = len(re.findall(r"@article\{smith2024,", result)) count_jones = len(re.findall(r"@article\{jones2023,", result)) assert count_smith == 1, f"Expected 1 smith2024 entry, got {count_smith}" assert count_jones == 1, f"Expected 1 jones2023 entry, got {count_jones}" # First version should be kept assert "Paper 1" in result print(f"[IMP-15] PASS: 2 smith2024 → 1, jones2023 kept. 
Total entries correct.") def test_no_duplicates_unchanged(self): bib = textwrap.dedent("""\ @article{alpha2024, author = {Alpha}, title = {A}, year = {2024}, } @inproceedings{beta2023, author = {Beta}, title = {B}, year = {2023}, } """) result = self._run_dedup(bib) # Should remain unchanged (both entries present) assert "alpha2024" in result assert "beta2023" in result count = len(re.findall(r"@\w+\{", result)) assert count == 2, f"Expected 2 entries, got {count}" print(f"[IMP-15] PASS: no duplicates → unchanged") def test_triple_duplicate(self): bib = textwrap.dedent("""\ @article{x2024, author = {X}, title = {First}, year = {2024}, } @article{x2024, author = {X}, title = {Second}, year = {2024}, } @article{x2024, author = {X}, title = {Third}, year = {2024}, } """) result = self._run_dedup(bib) count = len(re.findall(r"@article\{x2024,", result)) assert count == 1, f"Expected 1 x2024 entry, got {count}" # First version kept assert "First" in result assert "Second" not in result assert "Third" not in result print(f"[IMP-15] PASS: triple duplicate → 1 entry") def test_empty_bib(self): """Edge case: empty bib text should not crash.""" bib = "" result = self._run_dedup(bib) assert result == "", f"Expected empty, got: {result!r}" print(f"[IMP-15] PASS: empty bib → no crash") # ============================================================ # IMP-16: Test bootstrap CI fallback # ============================================================ class TestIMP16_BootstrapCIFallback: """IMP-16: If bootstrap CI does not contain the mean, fall back to normal approximation (mean +/- 1.96*SE).""" @staticmethod def _compute_ci_with_fallback(vals: list[float]) -> tuple[float, float, bool]: """Reproduce IMP-16 logic from executor.py lines 3367-3397. 
Returns (ci_low, ci_high, used_fallback).""" _mean = statistics.mean(vals) _std = statistics.stdev(vals) # Bootstrap 95% CI _rng = random.Random(42) _boot_means = [] for _ in range(1000): _sample = [_rng.choice(vals) for _ in range(len(vals))] _boot_means.append(statistics.mean(_sample)) _boot_means.sort() _ci_low = round(_boot_means[int(0.025 * len(_boot_means))], 6) _ci_high = round(_boot_means[int(0.975 * len(_boot_means))], 6) # IMP-16: Sanity check used_fallback = False if _ci_low > _mean or _ci_high < _mean: _se = _std / (len(vals) ** 0.5) _ci_low = round(_mean - 1.96 * _se, 6) _ci_high = round(_mean + 1.96 * _se, 6) used_fallback = True return _ci_low, _ci_high, used_fallback def test_normal_case_no_fallback(self): """Normal data: bootstrap CI should contain the mean, no fallback needed.""" vals = [0.8, 0.82, 0.79, 0.81, 0.83] ci_low, ci_high, used_fallback = self._compute_ci_with_fallback(vals) mean = statistics.mean(vals) assert ci_low <= mean <= ci_high, \ f"CI [{ci_low}, {ci_high}] should contain mean {mean}" assert not used_fallback, "Should NOT have used fallback for normal data" print(f"[IMP-16] PASS: normal data → CI=[{ci_low}, {ci_high}], mean={mean:.4f}, no fallback") def test_fallback_triggers_for_pathological_data(self): """Construct data where bootstrap CI might not contain the mean. This tests the fallback logic path itself. We directly test the condition and fallback formula rather than relying on pathological data generation (which is inherently fragile). 
""" # Directly test the fallback formula vals = [1.0, 2.0, 3.0, 4.0, 5.0] mean = statistics.mean(vals) std = statistics.stdev(vals) se = std / (len(vals) ** 0.5) # Simulate a bad CI that doesn't contain the mean bad_ci_low = mean + 0.1 # Above mean - CI doesn't contain mean bad_ci_high = mean + 1.0 # Apply fallback logic assert bad_ci_low > mean, "Bad CI should not contain mean" fallback_low = round(mean - 1.96 * se, 6) fallback_high = round(mean + 1.96 * se, 6) assert fallback_low <= mean <= fallback_high, \ f"Fallback CI [{fallback_low}, {fallback_high}] must contain mean {mean}" print(f"[IMP-16] PASS: fallback CI=[{fallback_low}, {fallback_high}], mean={mean:.4f}") def test_fallback_ci_always_contains_mean(self): """The normal-approximation fallback MUST always contain the mean.""" test_cases = [ [10, 20, 30], [0.001, 0.002, 0.003, 0.004], [100, 200, 300, 400, 500], [-5, -3, -1, 1, 3, 5], ] for vals in test_cases: mean = statistics.mean(vals) std = statistics.stdev(vals) se = std / (len(vals) ** 0.5) ci_low = round(mean - 1.96 * se, 6) ci_high = round(mean + 1.96 * se, 6) assert ci_low <= mean <= ci_high, \ f"Fallback CI [{ci_low}, {ci_high}] must contain mean {mean} for vals={vals}" print(f"[IMP-16] PASS: fallback always contains mean for {len(test_cases)} test cases") def test_condition_check_logic(self): """Verify the condition `_ci_low > _mean or _ci_high < _mean` is correct. 
The condition should detect when the mean is OUTSIDE the CI.""" mean = 5.0 # Case 1: Mean below CI assert (6.0 > mean or 8.0 < mean) == True, "Mean below CI not detected" # Case 2: Mean above CI assert (1.0 > mean or 4.0 < mean) == True, "Mean above CI not detected" # Case 3: Mean inside CI assert (3.0 > mean or 7.0 < mean) == False, "Mean inside CI incorrectly flagged" # Case 4: Mean equals boundary assert (5.0 > mean or 7.0 < mean) == False, "Mean at lower boundary incorrectly flagged" assert (3.0 > mean or 5.0 < mean) == False, "Mean at upper boundary incorrectly flagged" print("[IMP-16] PASS: condition check logic correct for all cases") def test_min_sample_size(self): """The code requires len(vals) >= 3 for bootstrap. Verify with exactly 3.""" vals = [1.0, 2.0, 3.0] ci_low, ci_high, _ = self._compute_ci_with_fallback(vals) mean = statistics.mean(vals) assert ci_low <= mean <= ci_high, \ f"CI [{ci_low}, {ci_high}] should contain mean {mean} for n=3" print(f"[IMP-16] PASS: n=3 works → CI=[{ci_low}, {ci_high}], mean={mean:.4f}") # ============================================================ # Integration-style: Test the runner.py _package_deliverables # cite-stripping + dedup pipeline end-to-end # ============================================================ class TestIMP14_15_Integration: """End-to-end test: dedup + cite stripping on a realistic scenario.""" def test_dedup_then_strip(self): """Run dedup (IMP-15) then cite-strip (IMP-14) in sequence, as runner.py does.""" bib_text = textwrap.dedent("""\ @article{smith2024, author = {Smith}, title = {Paper A}, year = {2024}, } @article{smith2024, author = {Smith}, title = {Paper A dup}, year = {2024}, } @article{jones2023, author = {Jones}, title = {Paper B}, year = {2023}, } """) tex_text = r"Results from \cite{smith2024, jones2023, ghost2024} show..." 
# Step 1: IMP-15 dedup _seen: set[str] = set() _deduped: list[str] = [] for m in re.finditer(r"(@\w+\{([^,]+),.*?\n\})", bib_text, re.DOTALL): k = m.group(2).strip() if k not in _seen: _seen.add(k) _deduped.append(m.group(1)) if len(_deduped) < len(list(re.finditer(r"@\w+\{", bib_text))): bib_text = "\n\n".join(_deduped) + "\n" # Verify dedup assert bib_text.count("smith2024") == 1, "Dedup failed for smith2024" # Step 2: IMP-14 cite stripping all_cite_keys: set[str] = set() for cm in re.finditer(r"\\cite\{([^}]+)\}", tex_text): all_cite_keys.update(k.strip() for k in cm.group(1).split(",")) bib_keys = set(re.findall(r"@\w+\{([^,]+),", bib_text)) missing = all_cite_keys - bib_keys assert missing == {"ghost2024"}, f"Expected only ghost2024 missing, got {missing}" def _filter_cite(m: re.Match[str]) -> str: keys = [k.strip() for k in m.group(1).split(",")] kept = [k for k in keys if k not in missing] if not kept: return "" return "\\cite{" + ", ".join(kept) + "}" tex_text = re.sub(r"\\cite\{([^}]+)\}", _filter_cite, tex_text) tex_text = re.sub(r" +", " ", tex_text) tex_text = re.sub(r" ([.,;:)])", r"\1", tex_text) assert r"\cite{smith2024, jones2023}" in tex_text, \ f"Expected valid keys kept, got: {tex_text!r}" assert "ghost2024" not in tex_text, \ f"ghost2024 should be stripped: {tex_text!r}" print(f"[Integration] PASS: dedup + cite strip → {tex_text!r}") # ============================================================ # Runner # ============================================================ def run_all_tests(): """Run all tests manually (fallback if pytest not available).""" test_classes = [ TestIMP13_ExtractPaperTitle, TestIMP14_StripOrphanedCites, TestIMP15_BibDedup, TestIMP16_BootstrapCIFallback, TestIMP14_15_Integration, ] total = 0 passed = 0 failed = 0 errors: list[str] = [] for cls in test_classes: instance = cls() test_methods = [m for m in dir(instance) if m.startswith("test_")] for method_name in sorted(test_methods): total += 1 method = getattr(instance, 
method_name) try: method() passed += 1 except Exception as e: failed += 1 err_msg = f"FAIL: {cls.__name__}.{method_name}: {e}" errors.append(err_msg) print(f" FAIL: {err_msg}") print(f"\n{'='*60}") print(f"Results: {passed}/{total} passed, {failed} failed") if errors: print("Failures:") for e in errors: print(f" - {e}") print(f"{'='*60}") return failed == 0 if __name__ == "__main__": # Add project root to path project_root = Path(__file__).resolve().parent.parent if str(project_root) not in sys.path: sys.path.insert(0, str(project_root)) success = run_all_tests() sys.exit(0 if success else 1) ================================================ FILE: tests/test_verified_registry.py ================================================ """Tests for VerifiedRegistry — ground truth number whitelist.""" from __future__ import annotations import json import math from pathlib import Path import pytest from researchclaw.pipeline.verified_registry import ( ConditionResult, VerifiedRegistry, _is_finite, ) # --------------------------------------------------------------------------- # Fixtures # --------------------------------------------------------------------------- ARTIFACTS = Path(__file__).resolve().parent.parent / "artifacts" def _load_experiment_summary(run_id: str) -> dict: """Load experiment_summary.json for a given run.""" pattern = f"rc-*-{run_id}" matches = sorted(ARTIFACTS.glob(pattern)) if not matches: pytest.skip(f"Artifact {run_id} not found") summary_path = matches[0] / "stage-14" / "experiment_summary.json" if not summary_path.exists(): pytest.skip(f"No experiment_summary for {run_id}") return json.loads(summary_path.read_text()) def _load_refinement_log(run_id: str) -> dict | None: pattern = f"rc-*-{run_id}" matches = sorted(ARTIFACTS.glob(pattern)) if not matches: return None log_path = matches[0] / "stage-13" / "refinement_log.json" if not log_path.exists(): return None return json.loads(log_path.read_text()) # 
--------------------------------------------------------------------------- # Unit tests — ConditionResult # --------------------------------------------------------------------------- class TestConditionResult: def test_compute_stats_multiple_seeds(self): cr = ConditionResult(name="test", per_seed_values={0: 10.0, 1: 20.0, 2: 30.0}) cr.compute_stats() assert cr.n_seeds == 3 assert cr.mean == pytest.approx(20.0) assert cr.std == pytest.approx(10.0) def test_compute_stats_single_seed(self): cr = ConditionResult(name="test", per_seed_values={0: 42.0}) cr.compute_stats() assert cr.n_seeds == 1 assert cr.mean == pytest.approx(42.0) assert cr.std == 0.0 def test_compute_stats_with_nan(self): cr = ConditionResult( name="test", per_seed_values={0: 10.0, 1: float("nan"), 2: 30.0} ) cr.compute_stats() assert cr.n_seeds == 2 # NaN excluded assert cr.mean == pytest.approx(20.0) def test_compute_stats_empty(self): cr = ConditionResult(name="test") cr.compute_stats() assert cr.n_seeds == 0 assert cr.mean is None # --------------------------------------------------------------------------- # Unit tests — VerifiedRegistry core operations # --------------------------------------------------------------------------- class TestVerifiedRegistryCore: def test_add_value(self): reg = VerifiedRegistry() reg.add_value(74.28, "test_source") assert reg.is_verified(74.28) # Rounding variant assert reg.is_verified(74.3, tolerance=0.01) def test_percentage_conversion(self): """Value in [0,1] should also register value*100.""" reg = VerifiedRegistry() reg.add_value(0.7428, "accuracy_fraction") assert reg.is_verified(0.7428) assert reg.is_verified(74.28) # ×100 variant def test_reverse_percentage(self): """Value > 1 should also register value/100.""" reg = VerifiedRegistry() reg.add_value(74.28, "accuracy_percent") assert reg.is_verified(74.28) assert reg.is_verified(0.7428) # ÷100 variant def test_tolerance_matching(self): reg = VerifiedRegistry() reg.add_value(92.14, "test") # Within 1% 
tolerance assert reg.is_verified(92.14) assert reg.is_verified(92.0, tolerance=0.01) # 0.15% off # Outside tolerance assert not reg.is_verified(95.0, tolerance=0.01) def test_zero_handling(self): reg = VerifiedRegistry() reg.add_value(0.0, "zero_metric") assert reg.is_verified(0.0) assert reg.is_verified(1e-8) # Very close to zero assert not reg.is_verified(0.01) # Not close enough def test_negative_values(self): reg = VerifiedRegistry() reg.add_value(-459.6, "bad_return") assert reg.is_verified(-459.6) assert reg.is_verified(-460.0, tolerance=0.01) def test_nan_inf_rejected(self): reg = VerifiedRegistry() reg.add_value(float("nan"), "nan_metric") reg.add_value(float("inf"), "inf_metric") assert not reg.is_verified(float("nan")) assert not reg.is_verified(float("inf")) assert len(reg.values) == 0 def test_lookup(self): reg = VerifiedRegistry() reg.add_value(42.0, "the_answer") assert reg.lookup(42.0) == "the_answer" assert reg.lookup(999.0) is None def test_verify_condition(self): reg = VerifiedRegistry() reg.condition_names = {"DQN", "DQN+Abstraction"} assert reg.verify_condition("DQN") assert not reg.verify_condition("PPO") # --------------------------------------------------------------------------- # Unit tests — from_experiment (synthetic data) # --------------------------------------------------------------------------- class TestFromExperiment: def _make_summary(self) -> dict: return { "metrics_summary": { "CondA/0/metric": {"min": 80.0, "max": 80.0, "mean": 80.0, "count": 1}, "CondA/1/metric": {"min": 85.0, "max": 85.0, "mean": 85.0, "count": 1}, "CondB/0/metric": {"min": 70.0, "max": 70.0, "mean": 70.0, "count": 1}, "primary_metric": {"min": 82.5, "max": 82.5, "mean": 82.5, "count": 1}, }, "best_run": { "metrics": { "CondA/0/metric": 80.0, "CondA/1/metric": 85.0, "CondB/0/metric": 70.0, "primary_metric": 82.5, "primary_metric_std": 3.5355, "total_elapsed_seconds": 1500.0, }, }, "condition_summaries": { "CondA": {"metrics": {"metric": 82.5}}, "CondB": 
{"metrics": {"metric": 70.0}}, }, "condition_metrics": { "CondA": {"metrics": {"metric": 82.5}}, "CondB": {"metrics": {"metric": 70.0}}, }, "total_conditions": 2, } def test_conditions_extracted(self): reg = VerifiedRegistry.from_experiment(self._make_summary()) assert "CondA" in reg.condition_names assert "CondB" in reg.condition_names assert len(reg.condition_names) == 2 def test_per_seed_values(self): reg = VerifiedRegistry.from_experiment(self._make_summary()) assert reg.conditions["CondA"].per_seed_values == {0: 80.0, 1: 85.0} assert reg.conditions["CondB"].per_seed_values == {0: 70.0} def test_condition_stats(self): reg = VerifiedRegistry.from_experiment(self._make_summary()) cond_a = reg.conditions["CondA"] assert cond_a.n_seeds == 2 assert cond_a.mean == pytest.approx(82.5) assert cond_a.std == pytest.approx(3.5355, rel=0.01) def test_primary_metric(self): reg = VerifiedRegistry.from_experiment(self._make_summary()) assert reg.primary_metric == pytest.approx(82.5) assert reg.primary_metric_std == pytest.approx(3.5355) def test_all_values_registered(self): reg = VerifiedRegistry.from_experiment(self._make_summary()) # Core values must be verified assert reg.is_verified(80.0) assert reg.is_verified(85.0) assert reg.is_verified(70.0) assert reg.is_verified(82.5) assert reg.is_verified(3.5355, tolerance=0.01) def test_pairwise_differences(self): reg = VerifiedRegistry.from_experiment(self._make_summary()) diff = 82.5 - 70.0 # CondA.mean - CondB.mean assert reg.is_verified(diff) assert reg.is_verified(abs(diff)) def test_fabricated_number_rejected(self): reg = VerifiedRegistry.from_experiment(self._make_summary()) assert not reg.is_verified(99.99) assert not reg.is_verified(60.51) def test_infra_keys_excluded(self): reg = VerifiedRegistry.from_experiment(self._make_summary()) # total_elapsed_seconds goes to training_config, not values assert 1500.0 not in reg.values assert reg.training_config.get("total_elapsed_seconds") == 1500.0 def 
test_with_refinement_log(self): summary = self._make_summary() ref_log = { "best_metric": 82.5, "best_version": "experiment_v1/", "iterations": [ { "version_dir": "experiment_v1/", "metric": 82.5, "sandbox": {"metrics": {"CondA/0/metric": 80.0}}, } ], } reg = VerifiedRegistry.from_experiment(summary, ref_log) assert reg.is_verified(82.5) # --------------------------------------------------------------------------- # Integration tests — real artifact data # --------------------------------------------------------------------------- class TestRealArtifacts: """Test against actual pipeline output. Skipped if artifacts not present.""" def test_run_e57360_rl_exploration(self): """Run 38 (RL LACE) — 3 conditions, CartPole + Acrobot.""" summary = _load_experiment_summary("e57360") ref_log = _load_refinement_log("e57360") reg = VerifiedRegistry.from_experiment(summary, ref_log) # Conditions that actually ran assert reg.verify_condition("DQN") assert reg.verify_condition("DQN+Abstraction") assert reg.verify_condition("DQN+RawCount") # Conditions that did NOT run (paper fabricated these) assert not reg.verify_condition("PPO") assert not reg.verify_condition("PPO+Abstraction") assert not reg.verify_condition("DQN+Autoencoder") # Real primary metric assert reg.is_verified(278.9333) assert reg.is_verified(146.4139, tolerance=0.01) # Fabricated number from paper (0.0 primary metric) — should NOT verify # unless 0.0 happens to be in the data for another reason # The paper claimed primary_metric=0.0 which is fabricated assert reg.primary_metric == pytest.approx(278.9333) def test_run_acbdfa_cnn_vs_ssm(self): """Run acbdfa (CTS) — ResNet vs S4D on CIFAR-100.""" summary = _load_experiment_summary("acbdfa") reg = VerifiedRegistry.from_experiment(summary) # Real values from experiment assert reg.is_verified(69.99) assert reg.is_verified(69.93) assert reg.is_verified(58.66) assert reg.is_verified(2.75) # Primary metric assert reg.is_verified(66.1933, tolerance=0.01) def 
test_run_85fefc_contrastive_kd(self): """Run 85fefc (CRAFT) — contrastive KD.""" summary = _load_experiment_summary("85fefc") ref_log = _load_refinement_log("85fefc") reg = VerifiedRegistry.from_experiment(summary, ref_log) # Should have conditions assert len(reg.condition_names) > 0 # Primary metric should be registered assert reg.primary_metric is not None def test_run_8b4a1b_gard_lora(self): """Run 8b4a1b (GARD) — experiment failed, very few values.""" summary = _load_experiment_summary("8b4a1b") reg = VerifiedRegistry.from_experiment(summary) # With empty metrics, registry should be sparse best_metrics = summary.get("best_run", {}).get("metrics", {}) if not best_metrics: assert len(reg.values) == 0 # --------------------------------------------------------------------------- # Unit tests — from_run_dir (merges multiple sources) # --------------------------------------------------------------------------- class TestFromRunDir: def _write_summary(self, path: Path, data: dict) -> None: path.parent.mkdir(parents=True, exist_ok=True) path.write_text(json.dumps(data, indent=2), encoding="utf-8") def test_from_run_dir_merges_multiple_stage14(self, tmp_path: Path) -> None: """Two stage-14 dirs with different values → both present.""" run_dir = tmp_path / "run" run_dir.mkdir() # Stage-14 with CondA self._write_summary( run_dir / "stage-14" / "experiment_summary.json", { "best_run": {"metrics": {"CondA/0/metric": 80.0}}, "condition_summaries": {"CondA": {"metrics": {"metric": 80.0}}}, "metrics_summary": {}, }, ) # Stage-14-v2 with CondB self._write_summary( run_dir / "stage-14-v2" / "experiment_summary.json", { "best_run": {"metrics": {"CondB/0/metric": 90.0}}, "condition_summaries": {"CondB": {"metrics": {"metric": 90.0}}}, "metrics_summary": {}, }, ) reg = VerifiedRegistry.from_run_dir(run_dir) assert "CondA" in reg.condition_names assert "CondB" in reg.condition_names assert reg.is_verified(80.0) assert reg.is_verified(90.0) def test_from_run_dir_includes_best(self, 
tmp_path: Path) -> None: """experiment_summary_best.json values merged.""" run_dir = tmp_path / "run" run_dir.mkdir() # Only best summary at root level self._write_summary( run_dir / "experiment_summary_best.json", { "best_run": {"metrics": {"primary_metric": 0.95}}, "condition_summaries": {"Proposed": {"metrics": {"acc": 0.95}}}, "metrics_summary": {"acc": {"mean": 0.95, "min": 0.95, "max": 0.95}}, }, ) reg = VerifiedRegistry.from_run_dir(run_dir) assert reg.is_verified(0.95) assert reg.is_verified(95.0) # percentage variant assert "Proposed" in reg.condition_names def test_from_run_dir_empty_dir(self, tmp_path: Path) -> None: """Empty run dir → empty registry, no crash.""" run_dir = tmp_path / "empty_run" run_dir.mkdir() reg = VerifiedRegistry.from_run_dir(run_dir) assert len(reg.values) == 0 assert len(reg.condition_names) == 0 # ----------------------------------------------------------------------- # BUG-222: best_only mode — REFINE bypass prevention # ----------------------------------------------------------------------- def test_best_only_uses_experiment_summary_best(self, tmp_path: Path) -> None: """best_only=True should use ONLY experiment_summary_best.json.""" run_dir = tmp_path / "run" run_dir.mkdir() # v1 (best): FeatureKD 74.52% self._write_summary( run_dir / "experiment_summary_best.json", { "best_run": {"metrics": {"FeatureKD/0/metric": 0.7452}}, "condition_summaries": {"FeatureKD": {"metrics": {"metric": 0.7452}}}, "metrics_summary": {"metric": {"mean": 0.7452}}, }, ) # v3 (regressed): FeatureKD 69.30% self._write_summary( run_dir / "stage-14" / "experiment_summary.json", { "best_run": {"metrics": {"FeatureKD/0/metric": 0.6930}}, "condition_summaries": {"FeatureKD": {"metrics": {"metric": 0.6930}}}, "metrics_summary": {"metric": {"mean": 0.6930}}, }, ) reg = VerifiedRegistry.from_run_dir(run_dir, best_only=True) # Should ONLY have v1 (best) data assert reg.is_verified(0.7452) assert reg.is_verified(74.52) # percentage variant # Should NOT have v3 
(regressed) data assert not reg.is_verified(0.6930) assert not reg.is_verified(69.30) def test_best_only_excludes_refinement_log(self, tmp_path: Path) -> None: """best_only=True should NOT merge refinement_log.json sandbox data.""" run_dir = tmp_path / "run" run_dir.mkdir() # Best summary self._write_summary( run_dir / "experiment_summary_best.json", { "best_run": {"metrics": {"primary_metric": 0.7452}}, "condition_summaries": {"FeatureKD": {"metrics": {"metric": 0.7452}}}, "metrics_summary": {"metric": {"mean": 0.7452}}, }, ) # Refinement log with sandbox metrics from regressed iteration rl_dir = run_dir / "stage-13" rl_dir.mkdir(parents=True) (rl_dir / "refinement_log.json").write_text(json.dumps({ "iterations": [ {"sandbox": {"metrics": {"primary_metric": 0.6930, "best_metric": 0.6930}}} ] }), encoding="utf-8") reg = VerifiedRegistry.from_run_dir(run_dir, best_only=True) assert reg.is_verified(0.7452) assert not reg.is_verified(0.6930), "Refinement log sandbox values should NOT be in best_only registry" def test_best_only_falls_back_to_stage14(self, tmp_path: Path) -> None: """best_only=True without best.json falls back to stage-14/ (non-versioned).""" run_dir = tmp_path / "run" run_dir.mkdir() self._write_summary( run_dir / "stage-14" / "experiment_summary.json", { "best_run": {"metrics": {"metric": 0.85}}, "condition_summaries": {"Baseline": {"metrics": {"metric": 0.85}}}, "metrics_summary": {"metric": {"mean": 0.85}}, }, ) reg = VerifiedRegistry.from_run_dir(run_dir, best_only=True) assert reg.is_verified(0.85) assert "Baseline" in reg.condition_names def test_default_mode_still_merges_all(self, tmp_path: Path) -> None: """Default (best_only=False) preserves backward-compat merging.""" run_dir = tmp_path / "run" run_dir.mkdir() self._write_summary( run_dir / "experiment_summary_best.json", { "best_run": {"metrics": {"FeatureKD/0/metric": 0.7452}}, "condition_summaries": {"FeatureKD": {"metrics": {"metric": 0.7452}}}, "metrics_summary": {}, }, ) 
self._write_summary( run_dir / "stage-14" / "experiment_summary.json", { "best_run": {"metrics": {"FeatureKD/0/metric": 0.6930}}, "condition_summaries": {"FeatureKD": {"metrics": {"metric": 0.6930}}}, "metrics_summary": {}, }, ) reg = VerifiedRegistry.from_run_dir(run_dir, best_only=False) # Both should be present in non-best_only mode assert reg.is_verified(0.7452) assert reg.is_verified(0.6930) ================================================ FILE: tests/test_web_crawler.py ================================================ """Tests for researchclaw.web.crawler — WebCrawler.""" from __future__ import annotations import asyncio from unittest.mock import MagicMock, patch import pytest from researchclaw.web.crawler import CrawlResult, WebCrawler from researchclaw.web import check_url_ssrf # --------------------------------------------------------------------------- # CrawlResult dataclass # --------------------------------------------------------------------------- class TestCrawlResult: def test_has_content_true(self): r = CrawlResult(url="https://example.com", markdown="x" * 100, success=True) assert r.has_content def test_has_content_false_empty(self): r = CrawlResult(url="https://example.com", markdown="", success=True) assert not r.has_content def test_has_content_false_short(self): r = CrawlResult(url="https://example.com", markdown="too short", success=True) assert not r.has_content # --------------------------------------------------------------------------- # HTML → Markdown conversion (urllib fallback) # --------------------------------------------------------------------------- class TestHtmlToMarkdown: def test_strips_script_tags(self): html = "

Hello

World

" md = WebCrawler._html_to_markdown(html) assert "alert" not in md assert "Hello" in md assert "World" in md def test_converts_headings(self): html = "

Title

Subtitle

Section

" md = WebCrawler._html_to_markdown(html) assert "# Title" in md assert "## Subtitle" in md assert "### Section" in md def test_converts_paragraphs(self): html = "

First paragraph.

Second paragraph.

" md = WebCrawler._html_to_markdown(html) assert "First paragraph." in md assert "Second paragraph." in md def test_converts_links(self): html = 'Click' md = WebCrawler._html_to_markdown(html) assert "[Click](https://example.com)" in md def test_converts_list_items(self): html = "
  • Item 1
  • Item 2
" md = WebCrawler._html_to_markdown(html) assert "- Item 1" in md assert "- Item 2" in md def test_decodes_entities(self): html = "

A & B < C > D

" md = WebCrawler._html_to_markdown(html) assert "A & B < C > D" in md def test_collapses_whitespace(self): html = "

Hello

\n\n\n\n

World

" md = WebCrawler._html_to_markdown(html) assert "\n\n\n" not in md # --------------------------------------------------------------------------- # urllib fallback crawl # --------------------------------------------------------------------------- class TestCrawlUrllibFallback: @patch("researchclaw.web.crawler.urlopen") def test_crawl_urllib_success(self, mock_urlopen): mock_resp = MagicMock() mock_resp.read.return_value = b"Test

Content here

" mock_resp.headers = {"Content-Type": "text/html; charset=utf-8"} mock_urlopen.return_value = mock_resp crawler = WebCrawler() import time t0 = time.monotonic() result = crawler._crawl_with_urllib("https://example.com", t0) assert result.success assert result.title == "Test" assert "Content here" in result.markdown @patch("researchclaw.web.crawler.urlopen") def test_crawl_urllib_truncation(self, mock_urlopen): mock_resp = MagicMock() long_content = "

" + "x" * 60000 + "

" mock_resp.read.return_value = long_content.encode() mock_resp.headers = {"Content-Type": "text/html"} mock_urlopen.return_value = mock_resp crawler = WebCrawler(max_content_length=1000) import time t0 = time.monotonic() result = crawler._crawl_with_urllib("https://example.com", t0) assert len(result.markdown) <= 1100 # 1000 + truncation notice # --------------------------------------------------------------------------- # Sync crawl (goes through crawl4ai → urllib fallback chain) # --------------------------------------------------------------------------- class TestCrawlSync: @patch("researchclaw.web.crawler.urlopen") def test_crawl_sync_falls_back_to_urllib(self, mock_urlopen): """crawl_sync tries crawl4ai, then falls back to urllib.""" mock_resp = MagicMock() mock_resp.read.return_value = b"Sync

Works via urllib

" mock_resp.headers = {"Content-Type": "text/html"} mock_urlopen.return_value = mock_resp crawler = WebCrawler() # Crawl4AI may or may not work in test env (no browser), # but urllib fallback should always work result = crawler.crawl_sync("https://example.com") assert result.success or result.error # either crawl4ai or urllib # --------------------------------------------------------------------------- # Async crawl # --------------------------------------------------------------------------- class TestCrawlAsync: @patch("researchclaw.web.crawler.urlopen") def test_crawl_async_urllib_fallback(self, mock_urlopen): """When crawl4ai's browser isn't set up, async crawl falls back to urllib.""" mock_resp = MagicMock() mock_resp.read.return_value = b"Async

Works

" mock_resp.headers = {"Content-Type": "text/html"} mock_urlopen.return_value = mock_resp crawler = WebCrawler() result = asyncio.run(crawler.crawl("https://example.com")) # Should succeed via either crawl4ai or urllib fallback assert isinstance(result, CrawlResult) # --------------------------------------------------------------------------- # SSRF validation: check_url_ssrf # --------------------------------------------------------------------------- class TestCheckUrlSsrf: def test_http_allowed(self): assert check_url_ssrf("http://example.com") is None def test_https_allowed(self): assert check_url_ssrf("https://arxiv.org/abs/2301.00001") is None def test_rejects_file_scheme(self): err = check_url_ssrf("file:///etc/passwd") assert err is not None assert "scheme" in err.lower() def test_rejects_ftp_scheme(self): err = check_url_ssrf("ftp://server/file") assert err is not None def test_rejects_localhost(self): err = check_url_ssrf("http://localhost:8080") assert err is not None assert "internal" in err.lower() or "private" in err.lower() or "blocked" in err.lower() def test_rejects_127(self): err = check_url_ssrf("http://127.0.0.1:6379") assert err is not None def test_rejects_10_range(self): err = check_url_ssrf("http://10.0.0.1") assert err is not None def test_rejects_172_range(self): err = check_url_ssrf("http://172.16.0.1") assert err is not None def test_rejects_192_range(self): err = check_url_ssrf("http://192.168.1.1") assert err is not None def test_rejects_aws_metadata(self): err = check_url_ssrf("http://169.254.169.254/latest/meta-data") assert err is not None def test_rejects_empty_hostname(self): err = check_url_ssrf("http://") assert err is not None # --------------------------------------------------------------------------- # Crawler SSRF integration # --------------------------------------------------------------------------- class TestCrawlerSsrfIntegration: @patch("researchclaw.web.crawler.urlopen") def test_crawl_sync_rejects_private_url(self, 
mock_urlopen): crawler = WebCrawler() result = crawler.crawl_sync("http://127.0.0.1:8080") assert not result.success assert result.error mock_urlopen.assert_not_called() @patch("researchclaw.web.crawler.urlopen") def test_crawl_sync_rejects_file_scheme(self, mock_urlopen): crawler = WebCrawler() result = crawler.crawl_sync("file:///etc/passwd") assert not result.success assert "scheme" in result.error.lower() mock_urlopen.assert_not_called() @patch("researchclaw.web.crawler.urlopen") def test_crawl_async_rejects_private_url(self, mock_urlopen): crawler = WebCrawler() result = asyncio.run(crawler.crawl("http://10.0.0.1:9200")) assert not result.success assert result.error mock_urlopen.assert_not_called() ================================================ FILE: tests/test_web_integration.py ================================================ """Integration tests for researchclaw.web — WebSearchAgent end-to-end.""" from __future__ import annotations from unittest.mock import MagicMock, patch import pytest from researchclaw.web.agent import WebSearchAgent, WebSearchAgentResult from researchclaw.web.crawler import CrawlResult from researchclaw.web.search import SearchResult, WebSearchResponse from researchclaw.web.scholar import ScholarPaper # --------------------------------------------------------------------------- # WebSearchAgentResult # --------------------------------------------------------------------------- class TestWebSearchAgentResult: def test_total_results(self): r = WebSearchAgentResult( topic="test", web_results=[SearchResult(title="A", url="u1")], scholar_papers=[ScholarPaper(title="B")], ) assert r.total_results == 2 def test_to_context_string_empty(self): r = WebSearchAgentResult(topic="test") ctx = r.to_context_string() assert isinstance(ctx, str) def test_to_context_string_with_results(self): r = WebSearchAgentResult( topic="knowledge distillation", web_results=[ SearchResult( title="KD Survey", url="https://example.com/kd", snippet="A comprehensive 
survey on KD", source="tavily", ), ], scholar_papers=[ ScholarPaper( title="Distilling Knowledge", authors=["Hinton", "Vinyals", "Dean"], year=2015, citation_count=5000, abstract="We propose a technique for model compression.", ), ], search_answer="KD is a model compression technique.", ) ctx = r.to_context_string() assert "AI Search Summary" in ctx assert "KD Survey" in ctx assert "Distilling Knowledge" in ctx assert "Hinton" in ctx def test_to_context_string_truncation(self): r = WebSearchAgentResult( topic="test", web_results=[ SearchResult(title=f"R{i}", url=f"u{i}", snippet="x" * 1000) for i in range(50) ], ) ctx = r.to_context_string(max_length=5000) assert len(ctx) <= 5100 def test_to_dict(self): r = WebSearchAgentResult( topic="test", web_results=[SearchResult(title="A", url="u1")], ) d = r.to_dict() assert d["topic"] == "test" assert d["web_results_count"] == 1 def test_to_context_with_crawled_pages(self): r = WebSearchAgentResult( topic="test", crawled_pages=[ CrawlResult( url="https://blog.example.com", markdown="# Great Blog Post\n\nContent " * 50, title="Great Blog Post", success=True, ), ], ) ctx = r.to_context_string() assert "Crawled Page Content" in ctx assert "Great Blog Post" in ctx # --------------------------------------------------------------------------- # WebSearchAgent — orchestration # --------------------------------------------------------------------------- class TestWebSearchAgent: def test_generate_queries(self): queries = WebSearchAgent._generate_queries("knowledge distillation") assert len(queries) == 3 assert "knowledge distillation" in queries assert any("survey" in q for q in queries) assert any("benchmark" in q for q in queries) def test_select_urls_to_crawl(self): agent = WebSearchAgent(max_crawl_urls=3) result = WebSearchAgentResult( topic="test", web_results=[ SearchResult(title=f"R{i}", url=f"https://ex.com/{i}") for i in range(10) ], ) urls = agent._select_urls_to_crawl(result) assert len(urls) <= 3 assert 
all(url.startswith("https://") for url in urls) def test_select_urls_skips_pdf(self): agent = WebSearchAgent(max_crawl_urls=5) result = WebSearchAgentResult( topic="test", web_results=[ SearchResult(title="Paper", url="https://ex.com/paper.pdf"), SearchResult(title="Blog", url="https://ex.com/blog"), ], ) urls = agent._select_urls_to_crawl(result) assert "https://ex.com/paper.pdf" not in urls assert "https://ex.com/blog" in urls def test_find_pdf_urls(self): result = WebSearchAgentResult( topic="test", web_results=[ SearchResult(title="P1", url="https://ex.com/a.pdf"), SearchResult(title="P2", url="https://ex.com/b.html"), SearchResult(title="P3", url="https://ex.com/c.pdf"), ], ) pdfs = WebSearchAgent._find_pdf_urls(result) assert len(pdfs) == 2 assert all(u.endswith(".pdf") for u in pdfs) @patch("researchclaw.web.search.urlopen") @patch("researchclaw.web.scholar.scholarly") def test_search_and_extract_minimal(self, mock_scholarly, mock_urlopen): """End-to-end test with mocked HTTP — DuckDuckGo + mocked Scholar.""" mock_resp = MagicMock() mock_resp.read.return_value = b""" Paper About KD A study on knowledge distillation """ mock_urlopen.return_value = mock_resp # Mock scholarly to return empty (avoid network calls) mock_scholarly.search_pubs.return_value = iter([]) agent = WebSearchAgent( enable_scholar=True, enable_crawling=False, enable_pdf=False, ) result = agent.search_and_extract("knowledge distillation") assert result.topic == "knowledge distillation" assert result.elapsed_seconds > 0 @patch("researchclaw.web.search.urlopen") @patch("researchclaw.web.scholar.scholarly") @patch("researchclaw.web.crawler.urlopen") def test_search_and_extract_with_crawling(self, mock_crawl_urlopen, mock_scholarly, mock_search_urlopen): """Test with crawling enabled.""" mock_search_resp = MagicMock() mock_search_resp.read.return_value = b""" KD Tutorial A tutorial """ mock_search_urlopen.return_value = mock_search_resp mock_crawl_resp = MagicMock() 
mock_crawl_resp.read.return_value = ( b"KD Tutorial

" + b"Tutorial content about knowledge distillation. " * 20 + b"

" ) mock_crawl_resp.headers = {"Content-Type": "text/html"} mock_crawl_urlopen.return_value = mock_crawl_resp mock_scholarly.search_pubs.return_value = iter([]) agent = WebSearchAgent( enable_scholar=False, enable_crawling=True, enable_pdf=False, max_crawl_urls=2, ) result = agent.search_and_extract("knowledge distillation") assert result.elapsed_seconds > 0 # --------------------------------------------------------------------------- # Config integration # --------------------------------------------------------------------------- class TestWebSearchConfig: def test_default_config(self): from researchclaw.config import WebSearchConfig cfg = WebSearchConfig() assert cfg.enabled is True assert cfg.max_web_results == 10 assert cfg.enable_scholar is True def test_config_in_rcconfig(self): from researchclaw.config import RCConfig import dataclasses field_names = [f.name for f in dataclasses.fields(RCConfig)] assert "web_search" in field_names ================================================ FILE: tests/test_web_pdf_extractor.py ================================================ """Tests for researchclaw.web.pdf_extractor — PDFExtractor.""" from __future__ import annotations import tempfile from pathlib import Path from unittest.mock import MagicMock, patch import pytest from researchclaw.web.pdf_extractor import PDFContent, PDFExtractor # --------------------------------------------------------------------------- # PDFContent dataclass # --------------------------------------------------------------------------- class TestPDFContent: def test_has_content_true(self): c = PDFContent(path="test.pdf", text="x" * 200, success=True) assert c.has_content def test_has_content_false_empty(self): c = PDFContent(path="test.pdf", text="", success=True) assert not c.has_content def test_has_content_false_short(self): c = PDFContent(path="test.pdf", text="short", success=True) assert not c.has_content # --------------------------------------------------------------------------- # 
PDFExtractor # --------------------------------------------------------------------------- class TestPDFExtractor: def test_backend_detection(self): extractor = PDFExtractor() assert extractor.backend == "pymupdf" # PyMuPDF is now installed def test_extract_nonexistent_file(self, tmp_path): extractor = PDFExtractor() result = extractor.extract(tmp_path / "does_not_exist.pdf") assert not result.success or "not found" in result.error.lower() or result.error def test_extract_abstract_pattern(self): text = """ Some header text Abstract This paper presents a novel approach to knowledge distillation that achieves state-of-the-art results on ImageNet. 1 Introduction We begin by motivating our approach... """ abstract = PDFExtractor._extract_abstract(text) assert "knowledge distillation" in abstract def test_extract_abstract_no_match(self): text = "No abstract section here, just random text." abstract = PDFExtractor._extract_abstract(text) assert abstract == "" def test_detect_sections(self): text = """ 1. Introduction This is the introduction section with some content. 2. Related Work This covers prior work in the field. 3. Method Our proposed approach works as follows. 4. Experiments We evaluate on several benchmarks. """ sections = PDFExtractor._detect_sections(text) assert len(sections) >= 3 headings = [s["heading"] for s in sections] assert any("Introduction" in h for h in headings) assert any("Related" in h or "Method" in h for h in headings) def test_detect_sections_empty(self): text = "No numbered sections here at all." 
sections = PDFExtractor._detect_sections(text) assert sections == [] @patch("researchclaw.web.pdf_extractor.urlopen") def test_extract_from_url_failure(self, mock_urlopen): mock_urlopen.side_effect = Exception("404 Not Found") extractor = PDFExtractor() result = extractor.extract_from_url("https://example.com/paper.pdf") assert not result.success or result.error ================================================ FILE: tests/test_web_platform.py ================================================ """Tests for Agent A — Web platform and user interface. Covers: FastAPI routes, WebSocket, intents, dashboard collector, wizard, voice commands. All tests run without external services (mocked LLM, mocked Whisper). """ from __future__ import annotations import asyncio import json import os import sys import tempfile import time from pathlib import Path from unittest.mock import AsyncMock, MagicMock, patch import pytest # --------------------------------------------------------------------------- # Config tests # --------------------------------------------------------------------------- class TestServerConfig: """Test ServerConfig and DashboardConfig in config.py.""" def test_server_config_defaults(self) -> None: from researchclaw.config import ServerConfig cfg = ServerConfig() assert cfg.enabled is False assert cfg.host == "0.0.0.0" assert cfg.port == 8080 assert cfg.cors_origins == ("*",) assert cfg.auth_token == "" assert cfg.voice_enabled is False def test_dashboard_config_defaults(self) -> None: from researchclaw.config import DashboardConfig cfg = DashboardConfig() assert cfg.enabled is True assert cfg.refresh_interval_sec == 5 assert cfg.max_log_lines == 1000 def test_parse_server_config(self) -> None: from researchclaw.config import _parse_server_config cfg = _parse_server_config({ "enabled": True, "host": "127.0.0.1", "port": 9090, "auth_token": "secret123", }) assert cfg.enabled is True assert cfg.host == "127.0.0.1" assert cfg.port == 9090 assert cfg.auth_token == 
"secret123" def test_parse_server_config_empty(self) -> None: from researchclaw.config import _parse_server_config cfg = _parse_server_config({}) assert cfg.enabled is False assert cfg.port == 8080 def test_parse_dashboard_config(self) -> None: from researchclaw.config import _parse_dashboard_config cfg = _parse_dashboard_config({ "refresh_interval_sec": 10, "max_log_lines": 500, }) assert cfg.refresh_interval_sec == 10 assert cfg.max_log_lines == 500 def test_rcconfig_has_server_and_dashboard(self) -> None: from researchclaw.config import RCConfig, ServerConfig, DashboardConfig # Build minimal valid config dict data = { "project": {"name": "test"}, "research": {"topic": "test topic"}, "runtime": {"timezone": "UTC"}, "notifications": {"channel": "console"}, "knowledge_base": {"root": "knowledge"}, "llm": { "provider": "openai-compatible", "base_url": "http://localhost", "api_key_env": "TEST_KEY", }, "server": {"enabled": True, "port": 9999}, "dashboard": {"refresh_interval_sec": 3}, } cfg = RCConfig.from_dict(data, check_paths=False) assert isinstance(cfg.server, ServerConfig) assert cfg.server.enabled is True assert cfg.server.port == 9999 assert isinstance(cfg.dashboard, DashboardConfig) assert cfg.dashboard.refresh_interval_sec == 3 # --------------------------------------------------------------------------- # CLI tests # --------------------------------------------------------------------------- class TestCLI: """Test new CLI subcommands are registered.""" def test_serve_subcommand_exists(self) -> None: from researchclaw.cli import main with pytest.raises(SystemExit) as exc: main(["serve", "--help"]) assert exc.value.code == 0 def test_dashboard_subcommand_exists(self) -> None: from researchclaw.cli import main with pytest.raises(SystemExit) as exc: main(["dashboard", "--help"]) assert exc.value.code == 0 def test_wizard_subcommand_exists(self) -> None: from researchclaw.cli import main with pytest.raises(SystemExit) as exc: main(["wizard", "--help"]) assert 
exc.value.code == 0 # --------------------------------------------------------------------------- # Intent classification tests # --------------------------------------------------------------------------- class TestIntents: """Test intent classification.""" def test_help_intent(self) -> None: from researchclaw.server.dialog.intents import Intent, classify_intent intent, conf = classify_intent("help") assert intent == Intent.HELP def test_status_intent(self) -> None: from researchclaw.server.dialog.intents import Intent, classify_intent intent, _ = classify_intent("What stage are we at?") assert intent == Intent.CHECK_STATUS def test_start_intent(self) -> None: from researchclaw.server.dialog.intents import Intent, classify_intent intent, _ = classify_intent("Start the pipeline") assert intent == Intent.START_PIPELINE def test_topic_intent(self) -> None: from researchclaw.server.dialog.intents import Intent, classify_intent intent, _ = classify_intent("Help me find a research direction") assert intent == Intent.TOPIC_SELECTION def test_results_intent(self) -> None: from researchclaw.server.dialog.intents import Intent, classify_intent intent, _ = classify_intent("What are the results?") assert intent == Intent.DISCUSS_RESULTS def test_config_intent(self) -> None: from researchclaw.server.dialog.intents import Intent, classify_intent intent, _ = classify_intent("Change the learning rate to 0.001") assert intent == Intent.MODIFY_CONFIG def test_paper_intent(self) -> None: from researchclaw.server.dialog.intents import Intent, classify_intent intent, _ = classify_intent("Edit the abstract") assert intent == Intent.EDIT_PAPER def test_general_intent(self) -> None: from researchclaw.server.dialog.intents import Intent, classify_intent intent, _ = classify_intent("Hello there") assert intent == Intent.GENERAL_CHAT def test_chinese_status(self) -> None: from researchclaw.server.dialog.intents import Intent, classify_intent intent, _ = classify_intent("现在到哪一步了") assert 
intent == Intent.CHECK_STATUS def test_chinese_start(self) -> None: from researchclaw.server.dialog.intents import Intent, classify_intent intent, _ = classify_intent("开始跑实验") assert intent == Intent.START_PIPELINE def test_empty_message(self) -> None: from researchclaw.server.dialog.intents import Intent, classify_intent intent, conf = classify_intent("") assert intent == Intent.GENERAL_CHAT assert conf == 0.0 # --------------------------------------------------------------------------- # Session management tests # --------------------------------------------------------------------------- class TestSession: """Test chat session management.""" def test_session_create(self) -> None: from researchclaw.server.dialog.session import SessionManager mgr = SessionManager() session = mgr.get_or_create("client1") assert session.client_id == "client1" assert len(session.history) == 0 def test_session_add_message(self) -> None: from researchclaw.server.dialog.session import SessionManager mgr = SessionManager() session = mgr.get_or_create("client1") session.add_message("user", "Hello") session.add_message("assistant", "Hi!") assert len(session.history) == 2 assert session.history[0].role == "user" def test_session_context(self) -> None: from researchclaw.server.dialog.session import SessionManager mgr = SessionManager() session = mgr.get_or_create("client1") for i in range(20): session.add_message("user", f"msg {i}") ctx = session.get_context(last_n=5) assert len(ctx) == 5 def test_session_max_history(self) -> None: from researchclaw.server.dialog.session import ChatSession session = ChatSession(client_id="test") for i in range(100): session.add_message("user", f"msg {i}") assert len(session.history) <= session.MAX_HISTORY def test_session_persistence(self) -> None: from researchclaw.server.dialog.session import SessionManager with tempfile.TemporaryDirectory() as tmpdir: mgr = SessionManager(persist_dir=tmpdir) session = mgr.get_or_create("persist-test") 
session.add_message("user", "saved message") mgr.save("persist-test") # Load in new manager mgr2 = SessionManager(persist_dir=tmpdir) loaded = mgr2.load("persist-test") assert loaded is not None assert len(loaded.history) == 1 assert loaded.history[0].content == "saved message" # --------------------------------------------------------------------------- # Dashboard collector tests # --------------------------------------------------------------------------- class TestDashboardCollector: """Test dashboard data collection from artifacts/.""" def test_collect_empty_dir(self) -> None: from researchclaw.dashboard.collector import DashboardCollector with tempfile.TemporaryDirectory() as tmpdir: collector = DashboardCollector(artifacts_dir=tmpdir) runs = collector.collect_all() assert runs == [] def test_collect_run_with_checkpoint(self) -> None: from researchclaw.dashboard.collector import DashboardCollector with tempfile.TemporaryDirectory() as tmpdir: run_dir = Path(tmpdir) / "rc-20260315-abc123" run_dir.mkdir() ckpt = {"stage": 5, "stage_name": "LITERATURE_SCREEN", "status": "running"} (run_dir / "checkpoint.json").write_text(json.dumps(ckpt)) collector = DashboardCollector(artifacts_dir=tmpdir) runs = collector.collect_all() assert len(runs) == 1 assert runs[0].current_stage == 5 assert runs[0].current_stage_name == "LITERATURE_SCREEN" def test_collect_run_active_heartbeat(self) -> None: from researchclaw.dashboard.collector import DashboardCollector with tempfile.TemporaryDirectory() as tmpdir: run_dir = Path(tmpdir) / "rc-20260315-test01" run_dir.mkdir() hb = {"timestamp": time.time()} # fresh heartbeat (run_dir / "heartbeat.json").write_text(json.dumps(hb)) collector = DashboardCollector(artifacts_dir=tmpdir) runs = collector.collect_all() assert len(runs) == 1 assert runs[0].is_active is True def test_collect_run_stale_heartbeat(self) -> None: from researchclaw.dashboard.collector import DashboardCollector with tempfile.TemporaryDirectory() as tmpdir: run_dir = 
Path(tmpdir) / "rc-20260315-stale1" run_dir.mkdir() hb = {"timestamp": time.time() - 120} # old heartbeat (run_dir / "heartbeat.json").write_text(json.dumps(hb)) collector = DashboardCollector(artifacts_dir=tmpdir) runs = collector.collect_all() assert runs[0].is_active is False def test_collect_stage_directories(self) -> None: from researchclaw.dashboard.collector import DashboardCollector with tempfile.TemporaryDirectory() as tmpdir: run_dir = Path(tmpdir) / "rc-20260315-stages" run_dir.mkdir() (run_dir / "stage-01").mkdir() (run_dir / "stage-02").mkdir() (run_dir / "stage-03").mkdir() collector = DashboardCollector(artifacts_dir=tmpdir) runs = collector.collect_all() assert len(runs[0].stages_completed) == 3 def test_collect_metrics(self) -> None: from researchclaw.dashboard.collector import DashboardCollector with tempfile.TemporaryDirectory() as tmpdir: run_dir = Path(tmpdir) / "rc-20260315-metric" run_dir.mkdir() metrics = {"accuracy": 0.85, "loss": 0.12} (run_dir / "results.json").write_text(json.dumps(metrics)) collector = DashboardCollector(artifacts_dir=tmpdir) runs = collector.collect_all() assert runs[0].metrics["accuracy"] == 0.85 def test_snapshot_to_dict(self) -> None: from researchclaw.dashboard.collector import RunSnapshot snap = RunSnapshot(run_id="test-1", path="/tmp/test") d = snap.to_dict() assert d["run_id"] == "test-1" assert "current_stage" in d # --------------------------------------------------------------------------- # Metrics tests # --------------------------------------------------------------------------- class TestMetrics: """Test metric aggregation.""" def test_aggregate_empty(self) -> None: from researchclaw.dashboard.metrics import aggregate_metrics result = aggregate_metrics([]) assert result["total_runs"] == 0 def test_aggregate_mixed(self) -> None: from researchclaw.dashboard.metrics import aggregate_metrics runs = [ {"is_active": True, "status": "running", "current_stage": 10}, {"is_active": False, "status": "completed", 
"current_stage": 23}, {"is_active": False, "status": "failed", "current_stage": 5}, ] result = aggregate_metrics(runs) assert result["total_runs"] == 3 assert result["active_runs"] == 1 assert result["completed_runs"] == 1 assert result["failed_runs"] == 1 def test_extract_training_curve(self) -> None: from researchclaw.dashboard.metrics import extract_training_curve metrics = { "training_log": [ {"epoch": 1, "loss": 0.5, "accuracy": 0.7}, {"epoch": 2, "loss": 0.3, "accuracy": 0.85}, ] } curve = extract_training_curve(metrics) assert len(curve) == 2 assert curve[1]["loss"] == 0.3 # --------------------------------------------------------------------------- # Voice command tests # --------------------------------------------------------------------------- class TestVoiceCommands: """Test voice command parsing.""" def test_start_command(self) -> None: from researchclaw.voice.commands import VoiceCommand, parse_voice_input result = parse_voice_input("start experiment") assert result.command == VoiceCommand.START def test_stop_command(self) -> None: from researchclaw.voice.commands import VoiceCommand, parse_voice_input result = parse_voice_input("stop") assert result.command == VoiceCommand.STOP def test_chinese_start(self) -> None: from researchclaw.voice.commands import VoiceCommand, parse_voice_input result = parse_voice_input("开始实验") assert result.command == VoiceCommand.START def test_chinese_pause(self) -> None: from researchclaw.voice.commands import VoiceCommand, parse_voice_input result = parse_voice_input("暂停") assert result.command == VoiceCommand.PAUSE def test_not_a_command(self) -> None: from researchclaw.voice.commands import VoiceCommand, parse_voice_input result = parse_voice_input("What about the neural network?") assert result.command == VoiceCommand.NONE def test_status_command(self) -> None: from researchclaw.voice.commands import VoiceCommand, parse_voice_input result = parse_voice_input("查看进度") assert result.command == VoiceCommand.STATUS # 
--------------------------------------------------------------------------- # Wizard tests # --------------------------------------------------------------------------- class TestWizard: """Test wizard templates and validation.""" def test_list_templates(self) -> None: from researchclaw.wizard.templates import list_templates templates = list_templates() assert len(templates) >= 3 names = [t["name"] for t in templates] assert "quick-demo" in names assert "standard-cv" in names def test_get_template(self) -> None: from researchclaw.wizard.templates import get_template tpl = get_template("quick-demo") assert tpl is not None assert tpl["experiment.mode"] == "simulated" def test_get_template_missing(self) -> None: from researchclaw.wizard.templates import get_template assert get_template("nonexistent") is None def test_wizard_web_mode(self) -> None: from researchclaw.wizard.quickstart import QuickStartWizard wizard = QuickStartWizard() config = wizard.run_web([ {"key": "project_name", "value": "test-proj"}, {"key": "topic", "value": "neural scaling laws"}, {"key": "mode", "value": "docker"}, ]) assert config.get("project", {}).get("name") == "test-proj" assert config.get("research", {}).get("topic") == "neural scaling laws" def test_environment_detection(self) -> None: from researchclaw.wizard.validator import detect_environment report = detect_environment() assert report.has_python is True assert report.python_version != "" d = report.to_dict() assert "has_gpu" in d assert "recommendations" in d # --------------------------------------------------------------------------- # WebSocket events tests # --------------------------------------------------------------------------- class TestEvents: """Test WebSocket event types.""" def test_event_serialization(self) -> None: from researchclaw.server.websocket.events import Event, EventType evt = Event(type=EventType.STAGE_COMPLETE, data={"stage": 5}) json_str = evt.to_json() parsed = json.loads(json_str) assert parsed["type"] 
== "stage_complete" assert parsed["data"]["stage"] == 5 def test_event_deserialization(self) -> None: from researchclaw.server.websocket.events import Event, EventType raw = json.dumps({ "type": "heartbeat", "data": {"active_clients": 3}, "timestamp": 1234567890.0, }) evt = Event.from_json(raw) assert evt.type == EventType.HEARTBEAT assert evt.data["active_clients"] == 3 def test_event_types_enum(self) -> None: from researchclaw.server.websocket.events import EventType assert EventType.CONNECTED.value == "connected" assert EventType.STAGE_START.value == "stage_start" assert EventType.CHAT_RESPONSE.value == "chat_response" # --------------------------------------------------------------------------- # Dialog router tests # --------------------------------------------------------------------------- class TestDialogRouter: """Test dialog message routing.""" @pytest.mark.asyncio async def test_route_help_message(self) -> None: from researchclaw.server.dialog.router import route_message response = await route_message("help", "test-client") assert "help" in response.lower() or "I can" in response @pytest.mark.asyncio async def test_route_json_message(self) -> None: from researchclaw.server.dialog.router import route_message msg = json.dumps({"message": "help me"}) response = await route_message(msg, "test-client-2") assert isinstance(response, str) assert len(response) > 0 @pytest.mark.asyncio async def test_route_status_message(self) -> None: from researchclaw.server.dialog.router import route_message response = await route_message("What's the current progress?", "test-client-3") assert isinstance(response, str) # --------------------------------------------------------------------------- # FastAPI app tests (requires fastapi + httpx) # --------------------------------------------------------------------------- class TestFastAPIApp: """Test FastAPI application if dependencies are available.""" @pytest.fixture def _skip_if_no_fastapi(self) -> None: try: import fastapi 
import httpx except ImportError: pytest.skip("fastapi/httpx not installed") @pytest.fixture def app(self, _skip_if_no_fastapi: None) -> object: from researchclaw.config import RCConfig data = { "project": {"name": "test"}, "research": {"topic": "test"}, "runtime": {"timezone": "UTC"}, "notifications": {"channel": "console"}, "knowledge_base": {"root": "knowledge"}, "llm": { "provider": "openai-compatible", "base_url": "http://localhost", "api_key_env": "TEST", }, } config = RCConfig.from_dict(data, check_paths=False) from researchclaw.server.app import create_app return create_app(config) @pytest.mark.asyncio async def test_health_endpoint(self, app: object) -> None: from httpx import AsyncClient, ASGITransport transport = ASGITransport(app=app) # type: ignore[arg-type] async with AsyncClient(transport=transport, base_url="http://test") as ac: resp = await ac.get("/api/health") assert resp.status_code == 200 data = resp.json() assert data["status"] == "ok" @pytest.mark.asyncio async def test_config_endpoint(self, app: object) -> None: from httpx import AsyncClient, ASGITransport transport = ASGITransport(app=app) # type: ignore[arg-type] async with AsyncClient(transport=transport, base_url="http://test") as ac: resp = await ac.get("/api/config") assert resp.status_code == 200 data = resp.json() assert data["project"] == "test" @pytest.mark.asyncio async def test_pipeline_status_idle(self, app: object) -> None: from httpx import AsyncClient, ASGITransport transport = ASGITransport(app=app) # type: ignore[arg-type] async with AsyncClient(transport=transport, base_url="http://test") as ac: resp = await ac.get("/api/pipeline/status") assert resp.status_code == 200 assert resp.json()["status"] == "idle" @pytest.mark.asyncio async def test_pipeline_stages(self, app: object) -> None: from httpx import AsyncClient, ASGITransport transport = ASGITransport(app=app) # type: ignore[arg-type] async with AsyncClient(transport=transport, base_url="http://test") as ac: resp = 
await ac.get("/api/pipeline/stages") assert resp.status_code == 200 stages = resp.json()["stages"] assert len(stages) == 23 @pytest.mark.asyncio async def test_runs_list(self, app: object) -> None: from httpx import AsyncClient, ASGITransport transport = ASGITransport(app=app) # type: ignore[arg-type] async with AsyncClient(transport=transport, base_url="http://test") as ac: resp = await ac.get("/api/runs") assert resp.status_code == 200 assert "runs" in resp.json() @pytest.mark.asyncio async def test_projects_list(self, app: object) -> None: from httpx import AsyncClient, ASGITransport transport = ASGITransport(app=app) # type: ignore[arg-type] async with AsyncClient(transport=transport, base_url="http://test") as ac: resp = await ac.get("/api/projects") assert resp.status_code == 200 assert "projects" in resp.json() @pytest.mark.asyncio async def test_stop_pipeline_404_when_idle(self, app: object) -> None: from httpx import AsyncClient, ASGITransport transport = ASGITransport(app=app) # type: ignore[arg-type] async with AsyncClient(transport=transport, base_url="http://test") as ac: resp = await ac.post("/api/pipeline/stop") assert resp.status_code == 404 ================================================ FILE: tests/test_web_scholar.py ================================================ """Tests for researchclaw.web.scholar — GoogleScholarClient.""" from __future__ import annotations import time from unittest.mock import MagicMock, patch import pytest from researchclaw.web.scholar import GoogleScholarClient, ScholarPaper # --------------------------------------------------------------------------- # ScholarPaper dataclass # --------------------------------------------------------------------------- class TestScholarPaper: def test_to_dict(self): p = ScholarPaper( title="Attention Is All You Need", authors=["Vaswani", "Shazeer"], year=2017, citation_count=50000, ) d = p.to_dict() assert d["title"] == "Attention Is All You Need" assert d["year"] == 2017 assert 
d["source"] == "google_scholar" def test_to_literature_paper(self): p = ScholarPaper( title="Test Paper", authors=["Author One", "Author Two"], year=2024, abstract="An abstract.", citation_count=100, url="https://example.com", ) lit = p.to_literature_paper() assert lit.title == "Test Paper" assert lit.source == "google_scholar" assert len(lit.authors) == 2 assert lit.authors[0].name == "Author One" # --------------------------------------------------------------------------- # GoogleScholarClient # --------------------------------------------------------------------------- class TestGoogleScholarClient: @patch("researchclaw.web.scholar.HAS_SCHOLARLY", True) def test_available_always_true(self): """scholarly is now an installed dependency, always available.""" client = GoogleScholarClient() assert client.available def test_parse_pub_full(self): """Test _parse_pub with a complete publication dict.""" pub = { "bib": { "title": "Deep Learning", "author": ["LeCun", "Bengio", "Hinton"], "pub_year": "2015", "abstract": "Deep learning review.", "venue": "Nature", }, "num_citations": 30000, "pub_url": "https://nature.com/dl", "cites_id": ["abc123"], } paper = GoogleScholarClient._parse_pub(pub) assert paper.title == "Deep Learning" assert paper.year == 2015 assert paper.citation_count == 30000 assert "LeCun" in paper.authors assert paper.venue == "Nature" def test_parse_pub_string_authors(self): pub = { "bib": { "title": "Paper", "author": "Smith and Jones", "pub_year": "2023", }, "num_citations": 10, "pub_url": "https://example.com", } paper = GoogleScholarClient._parse_pub(pub) assert paper.title == "Paper" assert "Smith" in paper.authors assert "Jones" in paper.authors def test_parse_pub_missing_fields(self): pub = {"bib": {}, "num_citations": 0} paper = GoogleScholarClient._parse_pub(pub) assert paper.title == "" assert paper.year == 0 assert paper.authors == [] @patch("researchclaw.web.scholar.HAS_SCHOLARLY", True) def test_rate_limiting(self): client = 
GoogleScholarClient(inter_request_delay=0.01) t0 = time.monotonic() client._rate_limit() client._rate_limit() elapsed = time.monotonic() - t0 assert elapsed >= 0.01 @patch("researchclaw.web.scholar.HAS_SCHOLARLY", True) @patch("researchclaw.web.scholar.scholarly") def test_search_with_mocked_scholarly(self, mock_scholarly): """Test search using mocked scholarly library.""" mock_pub = { "bib": { "title": "Test Paper", "author": ["Author A"], "pub_year": "2024", }, "num_citations": 5, "pub_url": "https://example.com", } mock_scholarly.search_pubs.return_value = iter([mock_pub]) client = GoogleScholarClient(inter_request_delay=0.0) results = client.search("test query", limit=5) assert len(results) == 1 assert results[0].title == "Test Paper" @patch("researchclaw.web.scholar.HAS_SCHOLARLY", True) @patch("researchclaw.web.scholar.scholarly") def test_search_error_graceful(self, mock_scholarly): """Search should return empty list on error, not raise.""" mock_scholarly.search_pubs.side_effect = Exception("Rate limited") client = GoogleScholarClient(inter_request_delay=0.0) results = client.search("test query") assert results == [] ================================================ FILE: tests/test_web_search.py ================================================ """Tests for researchclaw.web.search — WebSearchClient.""" from __future__ import annotations from unittest.mock import MagicMock, patch import pytest from researchclaw.web.search import SearchResult, WebSearchClient, WebSearchResponse # --------------------------------------------------------------------------- # SearchResult dataclass # --------------------------------------------------------------------------- class TestSearchResult: def test_to_dict(self): r = SearchResult( title="Test", url="https://example.com", snippet="A snippet", source="tavily" ) d = r.to_dict() assert d["title"] == "Test" assert d["url"] == "https://example.com" assert d["source"] == "tavily" # 
--------------------------------------------------------------------------- # WebSearchResponse dataclass # --------------------------------------------------------------------------- class TestWebSearchResponse: def test_has_results_true(self): r = WebSearchResponse( query="test", results=[SearchResult(title="A", url="u")], ) assert r.has_results def test_has_results_false(self): r = WebSearchResponse(query="test") assert not r.has_results # --------------------------------------------------------------------------- # DuckDuckGo HTML parsing # --------------------------------------------------------------------------- class TestDDGParsing: def test_parse_ddg_html_basic(self): html = """ """ results = WebSearchClient._parse_ddg_html(html, limit=10) assert len(results) == 2 assert results[0].title == "Title One" assert results[0].url == "https://example.com/1" assert results[0].snippet == "Snippet one here" def test_parse_ddg_html_skips_ddg_links(self): html = """ DDG Link Real """ results = WebSearchClient._parse_ddg_html(html, limit=10) assert len(results) == 1 assert results[0].url == "https://example.com/real" def test_parse_ddg_html_respects_limit(self): html = "" for i in range(20): html += f'T{i}\n' results = WebSearchClient._parse_ddg_html(html, limit=5) assert len(results) == 5 # --------------------------------------------------------------------------- # WebSearchClient.search # --------------------------------------------------------------------------- class TestWebSearchClient: @patch("researchclaw.web.search.urlopen") def test_search_ddg_fallback_no_api_key(self, mock_urlopen): """When no API key is set, uses DuckDuckGo fallback.""" mock_resp = MagicMock() mock_resp.read.return_value = b""" Paper Title About the paper """ mock_urlopen.return_value = mock_resp client = WebSearchClient(api_key="") # No API key response = client.search("test query") assert response.source == "duckduckgo" @patch("researchclaw.web.search.urlopen") def 
test_search_ddg_error_graceful(self, mock_urlopen): mock_urlopen.side_effect = Exception("Network error") client = WebSearchClient(api_key="") response = client.search("test query") assert response.source == "duckduckgo" assert len(response.results) == 0 def test_search_tavily_with_mock(self): """Test Tavily search with mocked SDK.""" mock_client_instance = MagicMock() mock_client_instance.search.return_value = { "results": [ { "title": "Tavily Result", "url": "https://tavily.com/r1", "content": "Content from Tavily", "score": 0.95, } ], "answer": "AI summary answer", } mock_tavily_module = MagicMock() mock_tavily_module.TavilyClient.return_value = mock_client_instance with patch.dict("sys.modules", {"tavily": mock_tavily_module}): client = WebSearchClient(api_key="test-key") import time response = client._search_tavily("test query", 10, None, None, time.monotonic()) assert response.source == "tavily" assert len(response.results) == 1 assert response.results[0].title == "Tavily Result" assert response.answer == "AI summary answer" @patch("researchclaw.web.search.urlopen") def test_search_multi_deduplication(self, mock_urlopen): mock_resp = MagicMock() mock_resp.read.return_value = b""" Same Result """ mock_urlopen.return_value = mock_resp client = WebSearchClient(api_key="") responses = client.search_multi(["query1", "query2"], inter_query_delay=0.0) assert len(responses) == 2 # Second query should have same URL deduped if responses[0].results: assert all( r.url != responses[0].results[0].url for r in responses[1].results ) ================================================ FILE: website/features.html ================================================ Features — AutoResearchClaw

Features

Everything you need for autonomous research paper generation, built for reliability and quality.

🔍

Multi-Source Literature Search

Searches OpenAlex (primary, 10K/day), Semantic Scholar, and arXiv in parallel. Intelligent source fallback ensures results even when individual APIs are rate-limited.

Rate Limit Defense

Five-layer defense: adaptive rate limiter, three-state circuit breaker, multi-source fallback, intelligent caching with per-source TTL, and request optimization via S2 batch API.

🐋

Docker Sandbox with GPU

Experiments run in isolated Docker containers based on nvidia/cuda:12.4.1 with PyTorch, GPU passthrough, network sandboxing, and pre-cached datasets (CIFAR-10, FashionMNIST).

💻

Hardware-Aware Design

Automatically detects available GPU memory and adjusts experiment parameters (batch size, model size, training epochs) to fit within hardware constraints.

🤖

Multi-Agent Peer Review

Simulated conference-style peer review with multiple LLM reviewer personas providing structured feedback on technical soundness, methodology, and clarity.

🔄

Pivot / Refine / Proceed

After analyzing experiment results, the pipeline autonomously decides whether to proceed with paper writing, refine the experiment, or pivot to a new hypothesis (max 2 pivots).

📈

Experiment Charts

Automatically generates publication-quality comparison charts, filtering out timing/meta metrics. Supports bar charts, learning curves, and ablation visualizations.

📜

Conference-Ready LaTeX

Outputs publication-quality LaTeX with proper bibliography, figure placement, and conference formatting (NeurIPS, ICLR, ICML templates).

Citation Verification

Every cited paper is verified against real academic databases (CrossRef, OpenAlex, arXiv, Semantic Scholar) in optimized order to minimize API pressure.

📦

Result Caching

Per-source cache TTL: arXiv results cached 24h (daily metadata updates), S2/OpenAlex 3 days, citation verification results cached permanently.

📚

Seminal Paper Library

Built-in seed library of foundational ML papers (normalization, ResNets, transformers, etc.) injected during literature search to ensure key references are cited.

🛠

Code Security Validation

Generated experiment code is validated for security (no network access, no subprocess calls, no file system writes outside workspace) before Docker execution.

💡

Contradiction Detection

Automatically detects contradictions in experiment results: null findings, negative results, and cases where control outperforms proposed method.

📋

Quality Assessment

Built-in quality scoring across novelty, soundness, significance, clarity, and reproducibility. Papers below threshold trigger rewriting.

🗃

Knowledge Archive

Research findings are archived in a persistent knowledge base (Markdown-backed) for cross-project knowledge transfer and future reference.

🎓

LLM Fine-Tuning

Optional QLoRA/LoRA fine-tuning support for adapting language models to specific research domains and writing styles.

How We Compare

AutoResearchClaw vs. other autonomous research tools

Feature AutoResearchClaw PaperClaw Sibyl Idea2Paper
Literature search 3 APIs + cache 2 APIs arXiv only Offline KG
Rate limit handling Circuit breaker + fallback Exponential backoff None N/A
Code execution Docker + GPU No No No
Peer review Multi-agent No Single agent No
Citation verification 4 API sources No No No
Pipeline stages 23 ~8 ~5 ~6
================================================ FILE: website/getting-started.html ================================================ Get Started — AutoResearchClaw

Get Started

From zero to your first autonomous research paper in minutes.

0 Prerequisites

  • ☑ Python 3.10+
  • ☑ Docker with NVIDIA Container Toolkit (for GPU experiments)
  • ☑ An OpenAI-compatible API key (Azure OpenAI, OpenAI, or local LLM)
  • ☑ NVIDIA GPU with 8GB+ VRAM (optional, for Docker sandbox)

1 Clone the Repository

git clone https://github.com/aiming-lab/AutoResearchClaw.git
cd AutoResearchClaw

2 Install Dependencies

pip install -e .

This installs the researchclaw package and all required dependencies.

3 Configure Your LLM

Create a YAML config file (e.g., config.yaml) with your LLM settings:

# config.yaml
project:
  name: "my-first-paper"
  mode: "docs-first"

research:
  topic: "Your research topic here"

llm:
  provider: "openai-compatible"
  base_url: "https://api.openai.com/v1"
  api_key_env: "OPENAI_API_KEY"

experiment:
  backend: "docker" # or "subprocess" for local
  timeout_sec: 1800

4 Set Your API Key

export OPENAI_API_KEY="sk-your-key-here"

5 Build the Docker Image (Optional)

If using the Docker sandbox backend for GPU-accelerated experiments:

docker build -t researchclaw-sandbox -f researchclaw/docker/Dockerfile .

6 Run Your First Paper

python -m researchclaw run --config config.yaml

The pipeline will execute all 23 stages autonomously. Output will be saved to the output/ directory, including the paper PDF, LaTeX source, experiment code, and charts.

7 Review Your Paper

After the pipeline completes, find your generated paper at:

output/<run-id>/
  paper.pdf        # Final PDF
  paper.tex        # LaTeX source
  references.bib   # Bibliography
  code/main.py     # Experiment code
  charts/          # Generated figures
  results.json     # Experiment metrics

Tips

  • Use GPT-4.1 or newer for best paper quality
  • Set timeout_sec: 3600 for complex experiments
  • For Azure OpenAI, set provider: "azure_openai" and configure your endpoint
  • The pipeline caches literature results, so re-runs with the same topic are faster
  • Run python -m pytest tests/ -v to verify your installation
================================================ FILE: website/index.html ================================================ AutoResearchClaw — Autonomous Research Paper Generation

Chat an Idea.
Get a Paper.

AutoResearchClaw is a fully autonomous 23-stage pipeline that transforms a research topic into a conference-ready paper — with real experiments, GPU-accelerated code, and verified citations.

# one command, one paper
python -m researchclaw run --topic "your research idea"
23
Autonomous Stages
1117
Tests Passing
3
Literature APIs
GPU
Docker Sandbox

From Idea to Paper in 23 Steps

Eight autonomous phases transform a research topic into a publication-ready manuscript.

🎯

A: Research Scoping

Topic initialization, problem decomposition, and scope definition.

📚

B: Literature Discovery

Multi-source paper search via OpenAlex, Semantic Scholar, and arXiv with quality screening.

🧠

C: Knowledge Synthesis

Gap analysis, trend synthesis, and novel hypothesis generation.

⚙️

D: Experiment Design

Methodology design, code generation, and resource planning with hardware awareness.

🚀

E: Experiment Execution

GPU-accelerated Docker sandbox execution with iterative refinement.

📊

F: Analysis & Decision

Result analysis with pivot/refine/proceed decisions.

✍️

G: Paper Writing

Structured drafting, multi-agent peer review, and iterative revision.

✔️

H: Finalization

Quality gate, knowledge archival, LaTeX export, and citation verification.

Key Features

Built for serious research, engineered for reliability.

🔍

Real Literature Search

Multi-source search across OpenAlex, Semantic Scholar, and arXiv with circuit breakers, rate limiting, and intelligent caching.

🐋

Docker Sandbox + GPU

Experiments run in isolated Docker containers with NVIDIA GPU passthrough, network sandboxing, and automatic dependency management.

🤖

Multi-Agent Peer Review

Simulated conference-style peer review with multiple reviewer personas providing structured feedback for revision.

🔄

Iterative Refinement

Automatic pivot/refine/proceed decisions with rollback to any previous stage based on experiment outcomes.

📜

Conference-Ready LaTeX

Publication-quality LaTeX output with proper citations, experiment charts, and structured abstracts.

Citation Verification

All citations verified against CrossRef, OpenAlex, and arXiv APIs to ensure bibliography accuracy.

Showcase Papers

Papers generated entirely by the pipeline, from topic to camera-ready PDF.

📄

Curriculum Learning with Adaptive Difficulty Scheduling for Image Classification

Computer Vision Coming Soon

Investigates adaptive curriculum strategies on CIFAR-10/100 benchmarks, demonstrating improved convergence speed and final accuracy compared to standard training.

📄

Test-Time Adaptation via Batch Normalization Statistics for Distribution Shift

Domain Adaptation Coming Soon

Explores test-time adaptation methods using batch normalization statistics to handle distribution shift on CIFAR-10-C corruption benchmarks.

📄

Entropy-Guided Exploration Bonuses for Sparse-Reward Continuous Control

Reinforcement Learning Coming Soon

Proposes entropy-guided intrinsic reward bonuses to improve exploration in sparse-reward MuJoCo locomotion environments.

System Architecture

End-to-end pipeline architecture from topic input to published paper.

AutoResearchClaw Framework

Ready to Generate Your First Paper?

Clone the repo, configure your LLM API key, and run your first autonomous research paper.

================================================ FILE: website/papers.html ================================================ Showcase Papers — AutoResearchClaw

Showcase Papers

Every paper below was generated entirely by AutoResearchClaw — from a single topic prompt to a complete research manuscript with real experiments.

📊

Curriculum Learning with Adaptive Difficulty Scheduling for Image Classification

Computer Vision Coming Soon

Investigates adaptive curriculum learning strategies on CIFAR-10/100 benchmarks. Proposes a difficulty-aware scheduling mechanism that dynamically adjusts training sample ordering to improve convergence speed and final accuracy.

💬

Prompt-Length-Aware Routing for Mixture-of-LoRA Experts in Instruction-Following

NLP / PEFT Coming Soon

Proposes a routing mechanism for Mixture-of-LoRA experts that considers prompt length characteristics. Fine-tunes Qwen-2.5-3B with QLoRA to demonstrate improved instruction-following across varying input lengths.

🧬

Graph Attention Networks with Learnable Edge Features for Molecular Property Prediction

GNN / Chemistry Coming Soon

Extends graph attention networks with learnable edge feature transformations for molecular property prediction on the OGB-MolHIV benchmark, achieving competitive performance with existing specialized architectures.

🎮

Entropy-Guided Exploration Bonuses for Sparse-Reward Continuous Control

Reinforcement Learning Coming Soon

Proposes entropy-guided intrinsic reward bonuses to improve exploration efficiency in sparse-reward MuJoCo locomotion environments. Demonstrates improved sample efficiency over baseline algorithms.

🎨

Spectral Normalization Effects on Mode Collapse in Conditional GANs for CIFAR-10

Generative Models Coming Soon

Systematically studies the effect of spectral normalization on mode collapse in conditional GANs trained on CIFAR-10, providing both visual and quantitative analysis (FID, IS) of generation diversity.

🔄

Test-Time Adaptation via Batch Normalization Statistics for Distribution Shift

Domain Adaptation Coming Soon

Explores lightweight test-time adaptation methods that update batch normalization statistics to handle distribution shift on CIFAR-10-C corruption benchmarks, demonstrating practical robustness improvements.

🚀

Papers Coming Soon

We're generating showcase papers across diverse ML subfields. Each paper will include a downloadable PDF, LaTeX source, experiment code, and quality assessment. Check back soon!

================================================ FILE: website/pipeline.html ================================================ Pipeline — AutoResearchClaw

The 23-Stage Pipeline

Click any stage to expand its description. Yellow badges mark gate stages that require quality checks before proceeding.

A

Research Scoping

1

Topic Initialization

Define research topic, scope, and target conference
Takes a user-provided topic prompt and generates a structured research plan including target conference, research questions, and expected contributions. Emphasizes novelty and alignment with recent conference trends.
LLM
2

Problem Decomposition

Break research into sub-problems and objectives
Decomposes the research topic into concrete sub-problems, defines evaluation criteria, and identifies the key technical challenges to address.
LLM
B

Literature Discovery

3

Search Strategy

Generate search queries and select paper sources
Generates targeted search queries from the research plan, selects which APIs to query (OpenAlex, Semantic Scholar, arXiv), and defines inclusion/exclusion criteria.
LLM
4

Literature Collect

Search OpenAlex, Semantic Scholar, and arXiv
Executes multi-source literature search with intelligent caching, circuit breakers, and rate limiting. Deduplicates results across sources and injects seminal papers from the seed library.
API
5

Literature Screen

Quality and relevance screening (Gate)
Gate Stage. LLM reviews each collected paper for relevance, quality, and domain match. Cross-domain false positives are explicitly rejected. Papers below threshold are filtered out.
Gate LLM
6

Knowledge Extract

Extract key insights and methodologies from papers
Extracts structured knowledge from screened papers: key contributions, methods, results, limitations, and open questions. Builds a knowledge graph for synthesis.
LLM
C

Knowledge Synthesis

7

Synthesis

Gap analysis and research trend synthesis
Clusters extracted knowledge by topic, identifies research gaps, and synthesizes trends. Produces a structured literature review summary that informs hypothesis generation.
LLM
8

Hypothesis Generation

Generate testable research hypotheses
Generates novel, testable hypotheses that address gaps not covered by existing literature. Each hypothesis includes expected outcomes, evaluation metrics, and ablation dimensions.
LLM
D

Experiment Design

9

Experiment Design

Methodology design and validation (Gate)
Gate Stage. Designs the complete experimental methodology: baselines, ablations, metrics, datasets, and statistical tests. Requires modern benchmarks and real datasets (CIFAR-10, etc.).
Gate LLM
10

Code Generation

Generate executable experiment code
Generates complete Python experiment code (main.py) with dataset loading, model definition, training loop, evaluation, and results output. Includes security validation, import checking, and code review.
LLM New
11

Resource Planning

Estimate compute budget and time allocation
Estimates GPU memory requirements, training time, and compute budget. Configures Docker sandbox resource limits and timeout values based on available hardware.
LLM
E

Experiment Execution

12

Experiment Run

Execute experiments in Docker sandbox
Runs generated code inside an isolated Docker container with NVIDIA GPU passthrough. Captures stdout metrics, timing data, and exit codes. Pre-cached datasets available at /workspace/data.
Docker
13

Iterative Refinement

Fix errors and improve experiment code
If experiment fails or produces poor results, automatically diagnoses issues and generates refined code. Checks ablation effectiveness (>5% difference from baseline). Up to 3 refinement iterations.
LLM Docker New
F

Analysis & Decision

14

Result Analysis

Statistical analysis of experiment outcomes
Parses experiment outputs, computes statistical significance, generates comparison charts, and produces structured results summaries. Detects result contradictions and null findings.
LLM
15

Research Decision

Pivot, refine, or proceed based on results
Evaluates experiment results and decides: Proceed (results support hypothesis), Refine (re-run with improvements), or Pivot (discard hypothesis, generate new one). Max 2 pivots to prevent infinite loops.
LLM
G

Paper Writing

16

Paper Outline

Structure paper sections and arguments
Creates a detailed paper outline with section-by-section arguments, key claims, and figure placements. Follows conference template structure (abstract, intro, related work, method, experiments, conclusion).
LLM
17

Paper Draft

Write the full paper draft
Generates the complete paper in Markdown/LaTeX with structured writing rules: 150-200 word abstract, no number repetition across sections, proper citation of original papers for all discussed techniques.
LLM
18

Peer Review

Multi-agent simulated conference review
Multiple LLM reviewer personas evaluate the paper: one technical reviewer, one methodology expert, and one clarity/presentation reviewer. Each provides structured feedback with scores.
LLM
19

Paper Revision

Revise based on peer review feedback
Addresses reviewer comments systematically: fixes technical issues, improves writing clarity, adds missing comparisons, and strengthens the narrative. Produces a revised draft.
LLM New
H

Finalization

20

Quality Gate

Final quality assessment (Gate)
Gate Stage. Comprehensive quality assessment scoring the paper on novelty, soundness, significance, clarity, and reproducibility. Papers below threshold are sent back for rewriting.
Gate LLM
21

Knowledge Archive

Archive findings to knowledge base
Stores research findings, methodology, and results in the persistent knowledge base for future reference and cross-project knowledge transfer. Non-critical: failure doesn't abort pipeline.
LLM
22

Export & Publish

Generate LaTeX PDF and final output
Converts the paper to conference-ready LaTeX, compiles to PDF, generates BibTeX bibliography, and produces the final output package (paper.pdf, paper.tex, references.bib, charts/).
LaTeX
23

Citation Verify

Verify all citations against real databases
Verifies every cited paper exists in real academic databases. Checks DOI via CrossRef, title via OpenAlex, arXiv ID via arXiv API, and falls back to Semantic Scholar. Non-critical: failure doesn't abort pipeline.
API
================================================
FILE: website/style.css
================================================
/* ============================================================
   AutoResearchClaw — Showcase Website Styles
   Pure CSS, no build step. Tailwind-inspired utility patterns.
   ============================================================ */

/* ---------- Reset & Variables ---------- */
:root {
  --color-bg: #0f172a;
  --color-bg-alt: #1e293b;
  --color-surface: #334155;
  --color-border: #475569;
  --color-text: #e2e8f0;
  --color-text-muted: #94a3b8;
  --color-primary: #38bdf8;
  --color-primary-d: #0284c7;
  --color-accent: #a78bfa;
  --color-accent-d: #7c3aed;
  --color-success: #4ade80;
  --color-warning: #fbbf24;
  --color-danger: #f87171;
  --color-white: #f8fafc;
  --font-sans: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
  --font-mono: 'JetBrains Mono', 'Fira Code', 'Consolas', monospace;
  --radius: 0.75rem;
  --radius-lg: 1rem;
  --shadow: 0 4px 6px -1px rgba(0,0,0,.3), 0 2px 4px -2px rgba(0,0,0,.2);
  --shadow-lg: 0 10px 15px -3px rgba(0,0,0,.4), 0 4px 6px -4px rgba(0,0,0,.3);
  --transition: 0.2s ease;
}

*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }

html { scroll-behavior: smooth; font-size: 16px; }

body {
  font-family: var(--font-sans);
  background: var(--color-bg);
  color: var(--color-text);
  line-height: 1.7;
  min-height: 100vh;
}

a { color: var(--color-primary); text-decoration: none; transition: color var(--transition); }
a:hover { color: var(--color-accent); }
img { max-width: 100%; height: auto; display: block; }

/* ---------- Layout ---------- */
.container { max-width: 1200px; margin: 0 auto; padding: 0 1.5rem; }
.section { padding: 5rem 0; }

/* ---------- Navigation ---------- */
.navbar {
  position: fixed;
  top: 0;
  left: 0;
  right: 0;
  z-index: 100;
  background: rgba(15, 23, 42, 0.85);
  backdrop-filter: blur(12px);
  border-bottom: 1px solid var(--color-border);
  padding: 0.75rem 0;
}
.navbar .container { display: flex; align-items: center; justify-content: space-between; }
.nav-brand { display: flex; align-items: center; gap: 0.75rem; font-weight: 700; font-size: 1.15rem; color: var(--color-white); }
.nav-brand img { height: 32px; width: 32px; border-radius: 6px; }
.nav-links { display: flex; gap: 1.5rem; list-style: none; }
.nav-links a { color: var(--color-text-muted); font-size: 0.9rem; font-weight: 500; padding: 0.4rem 0; transition: color var(--transition); }
.nav-links a:hover,
.nav-links a.active { color: var(--color-primary); }
.nav-github {
  display: inline-flex;
  align-items: center;
  gap: 0.4rem;
  background: var(--color-surface);
  color: var(--color-white);
  padding: 0.45rem 1rem;
  border-radius: 9999px;
  font-size: 0.85rem;
  font-weight: 600;
  transition: background var(--transition);
}
.nav-github:hover { background: var(--color-primary-d); color: var(--color-white); }

/* Mobile nav toggle */
.nav-toggle { display: none; background: none; border: none; color: var(--color-text); font-size: 1.5rem; cursor: pointer; }
@media (max-width: 768px) {
  .nav-toggle { display: block; }
  .nav-links {
    display: none;
    flex-direction: column;
    position: absolute;
    top: 100%;
    left: 0;
    right: 0;
    background: var(--color-bg-alt);
    padding: 1rem 1.5rem;
    gap: 0.5rem;
    border-bottom: 1px solid var(--color-border);
  }
  .nav-links.open { display: flex; }
}

/* ---------- Hero ---------- */
.hero { padding: 10rem 0 5rem; text-align: center; background: linear-gradient(180deg, rgba(56,189,248,0.08) 0%, transparent 60%); }
.hero h1 { font-size: clamp(2rem, 5vw, 3.5rem); font-weight: 800; line-height: 1.15; margin-bottom: 1rem; }
.hero h1 .gradient { background: linear-gradient(135deg, var(--color-primary), var(--color-accent)); -webkit-background-clip: text; background-clip: text; -webkit-text-fill-color: transparent; }
.hero .tagline { font-size: clamp(1.05rem, 2vw, 1.35rem); color: var(--color-text-muted); max-width: 640px; margin: 0 auto 2rem; }
.hero-actions { display: flex; gap: 1rem; justify-content: center; flex-wrap: wrap; }
.btn { display: inline-flex; align-items: center; gap: 0.5rem; padding: 0.75rem 1.75rem; border-radius: 9999px; font-weight: 600; font-size: 0.95rem; cursor: pointer; border: none; transition: all var(--transition); }
.btn-primary { background: linear-gradient(135deg, var(--color-primary), var(--color-accent)); color: #0f172a; }
.btn-primary:hover { transform: translateY(-2px); box-shadow: var(--shadow-lg); color: #0f172a; }
.btn-outline { background: transparent; color: var(--color-primary); border: 2px solid var(--color-primary); }
.btn-outline:hover { background: rgba(56,189,248,0.1); color: var(--color-primary); }
.hero-code {
  margin-top: 2.5rem;
  display: inline-block;
  background: var(--color-bg-alt);
  border: 1px solid var(--color-border);
  border-radius: var(--radius);
  padding: 0.8rem 1.5rem;
  font-family: var(--font-mono);
  font-size: 0.9rem;
  color: var(--color-success);
}

/* ---------- Stats ---------- */
.stats { display: grid; grid-template-columns: repeat(auto-fit, minmax(160px, 1fr)); gap: 1.5rem; padding: 3rem 0; }
.stat { text-align: center; padding: 1.5rem; background: var(--color-bg-alt); border-radius: var(--radius); border: 1px solid var(--color-border); }
.stat-value { font-size: 2rem; font-weight: 800; background: linear-gradient(135deg, var(--color-primary), var(--color-accent)); -webkit-background-clip: text; background-clip: text; -webkit-text-fill-color: transparent; }
.stat-label { font-size: 0.85rem; color: var(--color-text-muted); margin-top: 0.25rem; }

/* ---------- Section headings ---------- */
.section-header { text-align: center; margin-bottom: 3rem; }
.section-header h2 { font-size: clamp(1.5rem, 3vw, 2.25rem); font-weight: 700; margin-bottom: 0.5rem; }
.section-header p { color: var(--color-text-muted); max-width: 600px; margin: 0 auto; }

/* ---------- Pipeline Overview (Landing) ---------- */
.pipeline-preview { display: grid; grid-template-columns: repeat(auto-fit, minmax(240px, 1fr)); gap: 1rem; }
.phase-card { background: var(--color-bg-alt); border-radius: var(--radius); border: 1px solid var(--color-border); padding: 1.5rem; transition: all var(--transition); }
.phase-card:hover { border-color: var(--color-primary); transform: translateY(-3px); box-shadow: var(--shadow-lg); }
.phase-card .phase-icon { font-size: 1.75rem; margin-bottom: 0.75rem; }
.phase-card h3 { font-size: 1rem; font-weight: 600; margin-bottom: 0.4rem; }
.phase-card p { font-size: 0.85rem; color: var(--color-text-muted); line-height: 1.5; }
.phase-card .phase-stages { margin-top: 0.75rem; display: flex; gap: 0.35rem; flex-wrap: wrap; }
.stage-dot { width: 8px; height: 8px; border-radius: 50%; background: var(--color-primary); opacity: 0.5; }
.stage-dot.gate { background: var(--color-warning); opacity: 1; }

/* ---------- Paper Cards ---------- */
.paper-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(320px, 1fr)); gap: 1.5rem; }
.paper-card { background: var(--color-bg-alt); border-radius: var(--radius-lg); border: 1px solid var(--color-border); overflow: hidden; transition: all var(--transition); }
.paper-card:hover { border-color: var(--color-primary); transform: translateY(-3px); box-shadow: var(--shadow-lg); }
.paper-thumb { height: 180px; background: linear-gradient(135deg, var(--color-surface), var(--color-bg)); display: flex; align-items: center; justify-content: center; font-size: 3rem; color: var(--color-text-muted); }
.paper-body { padding: 1.25rem; }
.paper-body h3 { font-size: 1rem; font-weight: 600; line-height: 1.4; margin-bottom: 0.5rem; }
.paper-body .paper-meta { display: flex; gap: 0.5rem; flex-wrap: wrap; margin-bottom: 0.75rem; }
.badge { display: inline-flex; align-items: center; gap: 0.25rem; padding: 0.2rem 0.6rem; border-radius: 9999px; font-size: 0.72rem; font-weight: 600; }
.badge-domain { background: rgba(56,189,248,0.15); color: var(--color-primary); }
.badge-score { background: rgba(74,222,128,0.15); color: var(--color-success); }
.badge-pending { background: rgba(251,191,36,0.15); color: var(--color-warning); }

/* Abstract preview clamped to 3 lines. The -webkit-box stack is the
   widely-supported mechanism; the standard `line-clamp` property is
   included alongside it for engines that implement the unprefixed form. */
.paper-body .paper-abstract {
  font-size: 0.82rem;
  color: var(--color-text-muted);
  line-height: 1.55;
  display: -webkit-box;
  -webkit-line-clamp: 3;
  line-clamp: 3;
  -webkit-box-orient: vertical;
  overflow: hidden;
}

/* ---------- Feature Cards ---------- */
.feature-grid {
  display: grid;
  grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
  gap: 1.5rem;
}
.feature-card {
  background: var(--color-bg-alt);
  border-radius: var(--radius);
  border: 1px solid var(--color-border);
  padding: 1.75rem;
  transition: all var(--transition);
}
.feature-card:hover { border-color: var(--color-accent); transform: translateY(-2px); }
/* Tinted square icon chip; color variants selected via bg-* modifier classes. */
.feature-card .feature-icon {
  width: 48px;
  height: 48px;
  border-radius: 12px;
  display: flex;
  align-items: center;
  justify-content: center;
  font-size: 1.4rem;
  margin-bottom: 1rem;
}
.feature-card .feature-icon.bg-blue { background: rgba(56,189,248,0.15); }
.feature-card .feature-icon.bg-purple { background: rgba(167,139,250,0.15); }
.feature-card .feature-icon.bg-green { background: rgba(74,222,128,0.15); }
.feature-card .feature-icon.bg-amber { background: rgba(251,191,36,0.15); }
.feature-card .feature-icon.bg-red { background: rgba(248,113,113,0.15); }
.feature-card h3 { font-size: 1.05rem; font-weight: 600; margin-bottom: 0.4rem; }
.feature-card p { font-size: 0.85rem; color: var(--color-text-muted); line-height: 1.5; }

/* ---------- Pipeline Page (full) ---------- */
.pipeline-full { max-width: 860px; margin: 0 auto; }
.phase-group { margin-bottom: 3rem; }
.phase-group-header {
  display: flex;
  align-items: center;
  gap: 0.75rem;
  margin-bottom: 1rem;
  padding-bottom: 0.5rem;
  border-bottom: 2px solid var(--color-border);
}
/* Gradient letter chip identifying the phase group (A, B, C, ...). */
.phase-group-header .phase-letter {
  width: 36px;
  height: 36px;
  border-radius: 10px;
  display: flex;
  align-items: center;
  justify-content: center;
  font-weight: 800;
  font-size: 0.85rem;
  background: linear-gradient(135deg, var(--color-primary), var(--color-accent));
  color: #0f172a;
}
.phase-group-header h3 { font-size: 1.15rem; font-weight: 600; }

.stage-list { display: flex; flex-direction: column; gap: 0.5rem; }
/* Clickable stage rows; the .expanded class (toggled by script) reveals
   the .stage-detail panel below. */
.stage-item {
  display: flex;
  align-items: flex-start;
  gap: 1rem;
  padding: 1rem 1.25rem;
  background: var(--color-bg-alt);
  border-radius: var(--radius);
  border: 1px solid var(--color-border);
  cursor: pointer;
  transition: all var(--transition);
}
.stage-item:hover { border-color: var(--color-primary); }
.stage-item.expanded { border-color: var(--color-primary); background: rgba(56,189,248,0.04); }
.stage-number {
  flex-shrink: 0;
  width: 32px;
  height: 32px;
  border-radius: 8px;
  display: flex;
  align-items: center;
  justify-content: center;
  font-weight: 700;
  font-size: 0.8rem;
  background: var(--color-surface);
  color: var(--color-text);
}
.stage-number.gate { background: rgba(251,191,36,0.2); color: var(--color-warning); }
.stage-info { flex: 1; }
.stage-info h4 { font-size: 0.95rem; font-weight: 600; margin-bottom: 0.15rem; }
.stage-info .stage-subtitle { font-size: 0.8rem; color: var(--color-text-muted); }
.stage-detail {
  display: none;
  margin-top: 0.75rem;
  padding-top: 0.75rem;
  border-top: 1px solid var(--color-border);
  font-size: 0.85rem;
  color: var(--color-text-muted);
  line-height: 1.6;
}
.stage-item.expanded .stage-detail { display: block; }
.stage-badges { display: flex; gap: 0.35rem; flex-wrap: wrap; margin-top: 0.5rem; }
.stage-badge { padding: 0.15rem 0.5rem; border-radius: 9999px; font-size: 0.7rem; font-weight: 600; }
.stage-badge-gate { background: rgba(251,191,36,0.15); color: var(--color-warning); }
.stage-badge-new { background: rgba(167,139,250,0.15); color: var(--color-accent); }
.stage-badge-llm { background: rgba(56,189,248,0.15); color: var(--color-primary); }
.stage-badge-docker { background: rgba(74,222,128,0.15); color: var(--color-success); }

/* ---------- Footer ---------- */
.footer { padding: 3rem 0; text-align: center; border-top: 1px solid var(--color-border); }
.footer p { color: var(--color-text-muted); font-size: 0.85rem; }
.footer-links {
  display: flex;
  justify-content: center;
  gap: 1.5rem;
  list-style: none;
  margin-bottom: 1rem;
}
.footer-links a { color: var(--color-text-muted); font-size: 0.85rem; }
.footer-links a:hover { color: var(--color-primary); }

/* ---------- Getting Started ---------- */
.getting-started-content { max-width: 780px; margin: 0 auto; }
.step-block {
  background: var(--color-bg-alt);
  border-radius: var(--radius);
  border: 1px solid var(--color-border);
  padding: 1.5rem;
  margin-bottom: 1rem;
}
.step-block h3 {
  display: flex;
  align-items: center;
  gap: 0.75rem;
  font-size: 1.05rem;
  font-weight: 600;
  margin-bottom: 0.75rem;
}
/* Numbered circle preceding each step heading. */
.step-num {
  width: 28px;
  height: 28px;
  border-radius: 50%;
  display: inline-flex;
  align-items: center;
  justify-content: center;
  font-size: 0.8rem;
  font-weight: 700;
  background: var(--color-primary);
  color: #0f172a;
}
/* Terminal-style code snippet (GitHub-dark background). */
.code-block {
  background: #0d1117;
  border-radius: 8px;
  padding: 1rem 1.25rem;
  margin-top: 0.5rem;
  font-family: var(--font-mono);
  font-size: 0.85rem;
  color: var(--color-success);
  overflow-x: auto;
  border: 1px solid #21262d;
}
.code-block .comment { color: var(--color-text-muted); }

/* ---------- Connector Line (pipeline page) ---------- */
.connector { display: flex; justify-content: center; padding: 0.5rem 0; }
.connector-line {
  width: 2px;
  height: 24px;
  background: linear-gradient(180deg, var(--color-primary), var(--color-accent));
  opacity: 0.4;
}

/* ---------- Coming Soon Overlay ---------- */
.coming-soon {
  display: flex;
  flex-direction: column;
  align-items: center;
  justify-content: center;
  padding: 4rem 2rem;
  text-align: center;
}
.coming-soon .cs-icon { font-size: 3rem; margin-bottom: 1rem; }
.coming-soon h3 { font-size: 1.25rem; margin-bottom: 0.5rem; }
.coming-soon p { color: var(--color-text-muted); max-width: 400px; }

/* ---------- Utilities ---------- */
.text-center { text-align: center; }
.mt-2 { margin-top: 0.5rem; }
.mt-4 { margin-top: 1rem; }
.mt-8 { margin-top: 2rem; }
.mb-4 { margin-top: 1rem; } .gap-2 { gap: 0.5rem; } .flex { display: flex; } .items-center { align-items: center; } .justify-center { justify-content: center; }