[
  {
    "path": ".github/workflows/run-ruff.yml",
    "content": "name: lint\n\non:\n  pull_request:\n    branches: [ \"main\" ]\n\njobs:\n  lint:\n    if: github.repository_owner == 'MiroMindAI'\n    name: lint pull request\n    runs-on: ubuntu-latest\n    steps:\n    - name: checkout code\n      uses: actions/checkout@v4\n\n    - name: Install uv\n      uses: astral-sh/setup-uv@v5\n\n    - name: Check static error\n      run: |\n        uv tool run ruff@0.8.0 check --show-fixes --output-format=github\n\n    - name: Reformat code style\n      run: |\n        echo '## Reformat summary' >> $GITHUB_STEP_SUMMARY\n        if diff_output=\"$(uv tool run ruff@0.8.0 format --diff 2>&1)\"; then\n          echo \"$diff_output\"\n          echo '✅ Format check passed.' >> \"$GITHUB_STEP_SUMMARY\"\n        else\n          echo \"$diff_output\"\n          echo '❌ Format issues detected.' >> \"$GITHUB_STEP_SUMMARY\"\n          {\n            echo '```diff'\n            echo \"$diff_output\"\n            echo '```'\n          } >> \"$GITHUB_STEP_SUMMARY\"\n          exit 1\n        fi"
  },
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[codz]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py.cover\n.hypothesis/\n.pytest_cache/\ncover/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\n.pybuilder/\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n#   For a library or package, you might want to ignore these files since the code is\n#   intended to run in multiple environments; otherwise, check them in:\n# .python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# UV\n#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.\n#   This is especially recommended for binary packages to ensure reproducibility, and is more\n#   commonly ignored for libraries.\n#uv.lock\n\n# poetry\n#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in 
version control.\n#   This is especially recommended for binary packages to ensure reproducibility, and is more\n#   commonly ignored for libraries.\n#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control\n#poetry.lock\n#poetry.toml\n\n# pdm\n#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.\n#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.\n#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control\n#pdm.lock\n#pdm.toml\n.pdm-python\n.pdm-build/\n\n# pixi\n#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.\n#pixi.lock\n#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one\n#   in the .venv directory. It is recommended not to include this directory in version control.\n.pixi\n\n# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.envrc\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# pytype static type analyzer\n.pytype/\n\n# Cython debug symbols\ncython_debug/\n\n# PyCharm\n#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can\n#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore\n#  and can be added to the global gitignore or merged into this file.  
For a more nuclear\n#  option (not recommended) you can uncomment the following to ignore the entire idea folder.\n#.idea/\n\n# Abstra\n# Abstra is an AI-powered process automation framework.\n# Ignore directories containing user credentials, local state, and settings.\n# Learn more at https://abstra.io/docs\n.abstra/\n\n# Visual Studio Code\n#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore \n#  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore\n#  and can be added to the global gitignore or merged into this file. However, if you prefer, \n#  you could uncomment the following to ignore the entire vscode folder\n# .vscode/\n\n# Ruff stuff:\n.ruff_cache/\n\n# PyPI configuration file\n.pypirc\n\n# Cursor\n#  Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to\n#  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data\n#  refer to https://docs.cursor.com/context/ignore-files\n.cursorignore\n.cursorindexingignore\n\n# Marimo\nmarimo/_static/\nmarimo/_lsp/\n__marimo__/\n\n\n# -- ADDED --\n# Log files\nlogs/\n\n# Data directory - exclude everything except README\ndata/\n\n\n.idea/\n\n.DS_Store\n\napps/collect-trace/scripts/*/*.sh"
  },
  {
    "path": "LICENSE",
    "content": "\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS"
  },
  {
    "path": "README.md",
    "content": "<div align=\"center\">\n  <img src=\"assets/mirothinker_logo.png\" width=\"55%\" alt=\"MiroThinker\" />\n</div>\n\n<br>\n\n<div align=\"center\">\n\n[![MODEL](https://img.shields.io/badge/Model-FFD21E?style=for-the-badge&logo=huggingface&logoColor=white)](https://huggingface.co/collections/miromind-ai/mirothinker-17)\n[![Blog](https://img.shields.io/badge/Blog-4285F4?style=for-the-badge&logo=google-chrome&logoColor=white)](https://miromind.ai/#blog)\n[![DATA](https://img.shields.io/badge/Data-0040A1?style=for-the-badge&logo=huggingface&logoColor=ffffff&labelColor)](https://huggingface.co/datasets/miromind-ai/MiroVerse-v0.1)\n\n[![GITHUB](https://img.shields.io/badge/Github-24292F?style=for-the-badge&logo=github&logoColor=white)](https://github.com/MiroMindAI)\n[![WEBSITE](https://img.shields.io/badge/Website-4285F4?style=for-the-badge&logo=google-chrome&logoColor=white)](https://miromind.ai/)\n[![DISCORD](https://img.shields.io/badge/Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white)](https://discord.com/invite/GPqEnkzQZd)\n\n</div>\n\n<div align=\"center\">\n\n### 🚀 [Try MiroThinker!](https://dr.miromind.ai/)\n\n</div>\n\n**MiroThinker**: A deep research agent optimized for research and prediction. It achieves 88.2 on the challenging BrowseComp benchmark. 
See [Quick Start](#-quick-start).\n\n\n## 📋 Table of Contents\n\n- 📰 [News & Updates](#-news--updates)\n- 📝 [Introduction](#-introduction)\n- ✨ [Key Features](#-key-features)\n- 📈 [Performance on Benchmarks](#-performance-on-benchmarks)\n- 🚀 [Quick Start](#-quick-start)\n- 📊 [Benchmark Evaluation](#-benchmark-evaluation)\n- 🔬 [Trace Collection](#-trace-collection)\n- ❓ [FAQ & Troubleshooting](#-faq--troubleshooting)\n- 📄 [License](#-license)\n- 🙏 [Acknowledgments](#-acknowledgments)\n\n## 📰 News & Updates\n- **[2026-03-11]** 🎉🎉🎉 Introducing [MiroThinker-1.7](https://huggingface.co/collections/miromind-ai/mirothinker-17), including [MiroThinker-1.7-mini](https://huggingface.co/miromind-ai/MiroThinker-1.7-mini) and [MiroThinker-1.7](https://huggingface.co/miromind-ai/MiroThinker-1.7). MiroThinker-1.7-mini achieves 72.3 on BrowseComp-ZH, setting a new SOTA among open-source models while using only 30B parameters. Our proprietary agent MiroThinker-H1 achieves leading performance on BrowseComp and BrowseComp-ZH among open-source and commercial models.\n- **\\[2026-01-23\\]** 🎉 We have brought two important updates to [MiroThinker online](http://dr.miromind.ai): (a) Core Research Report Generation: Deep Research online reports now support generation, preview, and sharing. (b) Extended Document Upload Types: Now supports the upload of various file formats, such as `.pdf`, `.doc`, `.ppt`,  `.xls`,  `.jpg`. Welcome to try it out! MiroThinker will continue to be maintained and iteratively upgraded, with the goal of becoming the best Research Agent you'll ever use! \n- **\\[2026-01-05\\]** 🎉🎉 We release [MiroThinker-v1.5](https://huggingface.co/collections/miromind-ai/mirothinker-v15), a series of open-source deep research agents optimized for financial prediction. [MiroThinker-v1.5-30B](https://huggingface.co/miromind-ai/MiroThinker-v1.5-30B) surpasses Kimi-K2-Thinking on BrowseComp-ZH at much lower cost, using only 1/30 of the parameters. 
[MiroThinker-v1.5-235B](https://huggingface.co/miromind-ai/MiroThinker-v1.5-235B) scores 39.2% on HLE-Text, 69.8% on BrowseComp, 71.5% on BrowseComp-ZH, and 80.8% on GAIA-Val-165, setting a new state-of-the-art among search agents.\n\n\n<details>\n  <summary>📜 Click to expand older updates</summary>\n\n- **\\[2025-11-13\\]** 🎉 [MiroThinker-v1.0](https://huggingface.co/collections/miromind-ai/mirothinker-v10) is now released! Introducing **interactive scaling** as a third dimension of performance improvement, MiroThinker v1.0 supports 256K context window and up to 600 tool calls per task. Available in 8B, 30B, and 72B parameter scales, achieving 37.7%, 47.1%, 55.6%, and 81.9% on HLE-Text, BrowseComp, BrowseComp-ZH, and GAIA-Text-103, respectively. See [Technical Report](https://arxiv.org/abs/2511.11793) for more details.\n- **\\[2025-09-11\\]** MiroThinker-72B-Preview ranked 4th in this week's FutureX benchmark. See [FutureX](https://futurex-ai.github.io/).\n- **\\[2025-09-08\\]** [MiroThinker-v0.2](https://huggingface.co/collections/miromind-ai/mirothinker-v02) is now released, achieving open-source SOTA performance across multiple benchmarks, including HLE (17.8%), HLE-Text-Only (19.1%), BrowseComp-EN (17.2%), BrowseComp-ZH (29.4%), XBench-DeepSearch (56.0%), and Frames (74.8%).\n- **\\[2025-09-07\\]** We supported more benchmarks, including [BrowseComp-ZH](https://arxiv.org/abs/2504.19314), [XBench-DeepSearch](https://xbench.org/agi/aisearch), and [FutureX](https://futurex-ai.github.io/). We plan to add more benchmarks in the future.\n- **\\[2025-08-22\\]** Introducing streamlined deployment options for MiroThinker with optimized resource usage and faster startup times. 
Experience the interactive demo: [🚀 Try Gradio Demo](apps/gradio-demo)\n- **\\[2025-08-08\\]** [MiroThinker-v0.1](https://huggingface.co/collections/miromind-ai/mirothinker-v01-689301b6d0563321862d44a1) released.\n\n</details>\n\n## 📝 Introduction\n\n### MiroThinker-1.7\nOur new MiroThinker family represents a significant leap in building reliable agents for long-chain tasks. Engineered with an enhanced post-training pipeline, our MiroThinker-1.7 family achieves SOTA performance in deep research tasks among open-source models.\n\n\n**Key Features**\n\n- 🚀 MiroThinker-1.7 supports a 256K context window, long-horizon reasoning, and deep multi-step analysis.\n- 🔧 Handles up to 300 tool interactions per task, now with more accurate stepwise reasoning and decision-making.\n- 📦 Released in 30B and 235B parameter scales, accompanied by a comprehensive suite of tools and workflows to flexibly support diverse research settings and compute budgets.\n- Our proprietary agent, MiroThinker-H1, provides promising evidence for long-chain verifiable reasoning — reasoning processes that are step-verifiable and globally verifiable, improving the performance of complex agentic workflows.\n\n<div align=\"center\">\n\n|      Model Name       |         Parameters            | Max Context | Max Tool Calls |                              HF Link                               |\n|:---------------------:|:-----------------------------:|:-----------:|:--------------:|:------------------------------------------------------------------:|\n| MiroThinker-1.7-mini  | 30B   |    256K     |      300       | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-1.7-mini) |\n| MiroThinker-1.7 | 235B |    256K     |      300       | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-1.7) |\n\n</div>\n\nMiroThinker-1.7 demonstrates strong general-research performance across a broad range of benchmarks, achieving 74.0%, 75.3%, 82.7% and 42.9% on BrowseComp, BrowseComp-ZH, GAIA-Val-165 and HLE-Text, 
respectively. MiroThinker-1.7 achieves SOTA performance on BrowseComp-ZH.\n\n![image](/assets/1.7_main_results.png)\n\n\n\n\n### MiroThinker-v1.5\n\n<details>\n  <summary>📦 Click to expand MiroThinker-v1.5 details</summary>\n\nMiroThinker v1.5 is the world-leading open-source search agent that advances tool-augmented reasoning through **interactive scaling** — training the agent to handle deeper and more frequent agent-environment interactions as a third dimension of performance improvement, beyond model size and context length.\n\n![image](https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/mirothinker_v1.5_framework.png)\n\n**Key Features**\n\n- 🚀 MiroThinker v1.5 supports a 256K context window, long-horizon reasoning, and deep multi-step analysis.\n- 🔧 Handles up to 400 tool calls per task — a substantial improvement over previous open-source research agents.\n- 📦 Released in 30B and 235B parameter scales, accompanied by a comprehensive suite of tools and workflows to flexibly support diverse research settings and compute budgets.\n\n<div align=\"center\">\n\n|      Agent Name       |         Base Agent            | Max Context | Max Tool Calls |                              HF Link                               |\n|:---------------------:|:-----------------------------:|:-----------:|:--------------:|:------------------------------------------------------------------:|\n| MiroThinker-v1.5-30B  | Qwen3-30B-A3B-Thinking-2507   |    256K     |      400       | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-v1.5-30B) |\n| MiroThinker-v1.5-235B | Qwen3-235B-A22B-Thinking-2507 |    256K     |      400       | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-v1.5-235B) |\n\n</div>\n\nMiroThinker v1.5 demonstrates strong general-research performance across a broad range of benchmarks, achieving 39.2%, 69.8%, 71.5%, and 80.8% on HLE-Text, BrowseComp, BrowseComp-ZH, and GAIA-Val-165, respectively. 
These results surpass previous open-source agents and set the new world-leading BrowseComp performance.\n\n![image](https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/mirothinker_v1.5_browsecomp.png)\n\n</details>\n\n### MiroThinker-v1.0\n\n<details>\n  <summary>📦 Click to expand MiroThinker-v1.0 details</summary>\n\nUnlike previous agents that scale only model size or context length, MiroThinker v1.0 introduces **interactive scaling** at the agent level, systematically training the agent to handle deeper and more frequent agent–environment interactions as a third dimension of performance improvement. Interactive scaling leverages environment feedback and external information acquisition to correct errors and refine trajectories.\n\n![image](https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/MiroThinker_v1.0_Overall.png)\n\n### ✨ Key Features\n\n- 🚀 **256K Context Window**: Supports long-horizon reasoning and deep multi-step analysis\n- 🔧 **600 Tool Calls**: Handles up to 600 tool calls per task — a substantial improvement over previous open-source research agents\n- 📦 **Multiple Scales**: Released in 8B, 30B, and 72B parameter scales, accompanied by a comprehensive suite of tools and workflows to flexibly support diverse research settings and compute budgets\n\n<div align=\"center\">\n\n|      Agent Name      |         Base Agent          | Max Context | Max Tool Calls |                              HF Link                               |\n|:--------------------:|:---------------------------:|:-----------:|:--------------:|:------------------------------------------------------------------:|\n| MiroThinker-v1.0-8B  |        Qwen3-8B             |    256K     |      600       | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-v1.0-8B)  |\n| MiroThinker-v1.0-30B | Qwen3-30B-A3B-Thinking-2507 |    256K    |      600       | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-v1.0-30B) |\n| 
MiroThinker-v1.0-72B |    Qwen2.5-72B-Instruct     |    256K    |      600       | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-v1.0-72B) |\n\n</div>\n\nMiroThinker v1.0 demonstrates strong general-research performance across a broad range of benchmarks, achieving **37.7%**, **47.1%**, **55.6%**, and **81.9%** on HLE-Text, BrowseComp, BrowseComp-ZH, and GAIA-Text-103, respectively. These results surpass previous open-source agents and narrow the gap with commercial counterparts such as **GPT-5-high**.\n\n<div align=\"center\">\n  <img src=\"https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/MiroThinker_v1.0_Performance_1.png\" width=\"100%\" alt=\"MiroThinker\" />\n</div>\n\n</details>\n\n### MiroThinker-v0.2\n\n<details>\n  <summary>📦 Click to expand MiroThinker-v0.2 details</summary>\n\nIn this new version, we introduced three key improvements:\n\n- 📚 **Richer training data** from both English and Chinese sources, yielding significant gains in benchmark performance and generalization\n- 🎯 **Unified DPO training** with a single preference dataset across all agents\n- 📏 **Extended context length** from 40k to 64k for more challenging multi-turn tool-use tasks\n\nCompared to v0.1, MiroThinker v0.2 delivers consistent gains across benchmarks. 
For example, scores improved from **57.3 → 64.1** on **GAIA-Text-103** and from **17.0 → 29.4** on **BrowseComp-ZH**, reflecting substantial advancements in the model’s general research agent capabilities.\n\n<div align=\"center\">\n\n|        Agent Name        |      Base Agent       | Max Context |                                HF Link                                 |\n|:------------------------:|:---------------------:|:-----------:|:----------------------------------------------------------------------:|\n| MiroThinker-4B-SFT-v0.2  |       Qwen3-4B        |    64K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-4B-SFT-v0.2)  |\n| MiroThinker-4B-DPO-v0.2  |       Qwen3-4B        |    64K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-4B-DPO-v0.2)  |\n| MiroThinker-8B-SFT-v0.2  |       Qwen3-8B        |    64K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-8B-SFT-v0.2)  |\n| MiroThinker-8B-DPO-v0.2  |       Qwen3-8B        |    64K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-8B-DPO-v0.2)  |\n| MiroThinker-14B-SFT-v0.2 |       Qwen3-14B       |    64K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-14B-SFT-v0.2) |\n| MiroThinker-14B-DPO-v0.2 |       Qwen3-14B       |    64K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-14B-DPO-v0.2) |\n| MiroThinker-32B-SFT-v0.2 |       Qwen3-32B       |    64K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-32B-SFT-v0.2) |\n| MiroThinker-32B-DPO-v0.2 |       Qwen3-32B       |    64K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-32B-DPO-v0.2) |\n\n</div>\n\n</details>\n\n### MiroThinker-v0.1\n\n<details>\n  <summary>📦 Click to expand MiroThinker-v0.1 details</summary>\n\n<div align=\"center\">\n  <img src=\"assets/gaia_text_103.png\" width=\"98%\" alt=\"MiroFlow Performance on GAIA-Validation\" />\n  <p><strong>Performance of Open-Source Agents on GAIA-Validation Benchmark.</strong></p>\n</div>\n\nWe have 
released the **MiroThinker v0.1** series, including both SFT and DPO variants at parameter scales of **8B**, **14B**, and **32B**. Notably, MiroThinker v0.1 achieves **state-of-the-art performance** among open-source models on the [GAIA benchmark](https://huggingface.co/datasets/gaia-benchmark/GAIA), a rigorous evaluation suite for advanced agentic capabilities, demonstrating its strength in long-context, decision-intensive, and real-world task scenarios.\n\n<div align=\"center\">\n\n| Agent Name                | Base Agent | Max Context | HF Link                                                               |\n| :-----------------------: |:----------:|:-----------:| :--------------------------------------------------------------------:|\n| MiroThinker-8B-SFT-v0.1   |  Qwen3-8B  |    40K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-8B-SFT-v0.1)  |\n| MiroThinker-8B-DPO-v0.1   |  Qwen3-8B  |    40K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-8B-DPO-v0.1)  |\n| MiroThinker-14B-SFT-v0.1  | Qwen3-14B  |    40K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-14B-SFT-v0.1) |\n| MiroThinker-14B-DPO-v0.1  | Qwen3-14B  |    40K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-14B-DPO-v0.1) |\n| MiroThinker-32B-SFT-v0.1  | Qwen3-32B  |    40K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-32B-SFT-v0.1) |\n| MiroThinker-32B-DPO-v0.1  | Qwen3-32B  |    40K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-32B-DPO-v0.1) |\n\n</div>\n\n</details>\n\n## ✨ Key Features\n\n### 🤖 **MiroThinker-Optimized Framework**\n\n- 🔓 **Fully Open-Source Agent Framework**: Complete transparency with open framework and open agents\n- 🔗 **Tool Integration**: Seamless integration with external tools and APIs\n- 📝 **Trace Collection**: Comprehensive logging and analysis of agent interactions with elapsed time and estimated completion time displayed in minutes. 
Ready for SFT and DPO\n- 📊 **Benchmark Evaluation**: Extensive testing across multiple benchmark datasets\n\n### 📊 **Comprehensive Benchmark Suite**\n\n<details open>\n  <summary>📋 Click to expand benchmark list</summary>\n\n- **GAIA Validation**: A benchmark for General AI Assistants. ([paper](https://arxiv.org/abs/2311.12983))\n- **GAIA-Text-103**: A subset of GAIA Validation for text-only tasks. ([paper](https://arxiv.org/abs/2505.22648))\n- **HLE**: Humanity's Last Exam. ([paper](https://arxiv.org/abs/2501.14249))\n- **HLE-Text-2158**: A subset of HLE for text-only tasks. ([paper](https://arxiv.org/abs/2501.14249))\n- **HLE-Text-500**: A subset of HLE for text-only tasks, created by [WebThinker](https://arxiv.org/pdf/2504.21776). ([paper](https://arxiv.org/pdf/2504.21776))\n- **BrowseComp-EN**: Web browsing and comprehension tasks. ([paper](https://arxiv.org/abs/2504.12516))\n- **BrowseComp-ZH**: A Chinese version of BrowseComp. ([paper](https://arxiv.org/abs/2504.19314))\n- **WebWalkerQA**: Web navigation and question answering. ([paper](https://arxiv.org/abs/2501.07572))\n- **Frames**: Factuality, Retrieval, And reasoning MEasurement Set. ([paper](https://arxiv.org/abs/2409.12941))\n- **XBench-DeepSearch**: A benchmark for deep research agents. ([website](https://xbench.org/agi/aisearch))\n- **FutureX**: A live benchmark designed for predicting unknown future. ([website](https://futurex-ai.github.io/))\n- **SEAL-0**: A benchmark for evaluating LLMs on conflicting-evidence web questions. ([paper](https://arxiv.org/abs/2506.01062))\n- **AIME2025**: American Invitational Mathematics Examination 2025. ([website](https://artificialanalysis.ai/evaluations/aime-2025))\n- **DeepSearchQA**: Google's Deep Search Question Answering benchmark. 
([paper](https://arxiv.org/abs/2505.20827))\n\n</details>\n\n## 📈 Performance on Benchmarks\n\n### MiroThinker-1.7\n\n> To prevent potential information leakage (e.g., retrieving benchmark answers from HuggingFace), we blocked access to certain websites during evaluation.\n\n<div>\n  <img src=\"assets/17_table.png\" width=\"100%\" alt=\"MiroThinker\" />\n</div>\n\n### MiroThinker-v1.5\n\n<details>\n  <summary>📦 Click to expand MiroThinker-v1.5 details</summary>\n\n> To prevent potential information leakage (e.g., searching benchmark answers from HuggingFace), access to HuggingFace has been explicitly disabled in these tools.\n\n> We further perform canary string testing on the tool outputs of all trajectories and disregard any trajectory found to be contaminated, treating it as an incorrect answer.\n\n<div>\n  <img src=\"https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/mirothinker_v1.5_performance.png\" width=\"100%\" alt=\"MiroThinker\" />\n</div>\n\n</details>\n\n### MiroThinker-v1.0\n\n<details>\n  <summary>📦 Click to expand MiroThinker-v1.0 details</summary>\n\n<div align=\"center\">\n  <img src=\"https://github.com/user-attachments/assets/108a2105-4e1d-499e-a001-4713a03fd8ac\" width=\"100%\" alt=\"MiroThinker\" />\n</div>\n\n</details>\n\n### MiroThinker-v0.2\n\n<details>\n  <summary>📦 Click to expand MiroThinker-v0.2 details</summary>\n\n#### Comparison with SOTA Research Agents\n\n<div align=\"center\">\n  <img src=\"https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/MiroThinker_v0.2_Performance_2.png\" width=\"90%\" alt=\"MiroThinker\" />\n</div>\n\n#### GAIA Benchmark\n\n<div align=\"center\">\n  <img src=\"https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/MiroThinker_v0.2_Performance_1.png\" width=\"80%\" alt=\"MiroThinker\" />\n</div>\n\n</details>\n\n### MiroThinker-v0.1\n\n<details>\n  <summary>📦 Click to expand MiroThinker-v0.1 
details</summary>\n\n#### GAIA Benchmark\n\n<div align=\"center\">\n\n| **Method**                   | Text-103<br>Best Pass@1 | Text-103<br>Pass@1 (Avg@8) | Val-165<br>Best Pass@1 | Val-165<br>Pass@1 (Avg@8) |\n|------------------------------|:-----------------------:|:--------------------------:|:----------------------:|:-------------------------:|\n| **🔹—— 7B/8B Agents ——**     |                         |                            |                        |                           |\n| Search-o1-7B                 |          17.5           |             -              |           -            |             -             |\n| R1-Searcher-7B               |          20.4           |             -              |           -            |             -             |\n| WebDancer-7B                 |          31.0           |             -              |           -            |             -             |\n| WebSailor-7B                 |          37.9           |             -              |           -            |             -             |\n| CK-Pro-8B                    |          40.3           |             -              |          32.7          |             -             |\n| **MiroThinker-8B-SFT-v0.1**  |          44.7           |            40.1            |          34.6          |           31.8            |\n|     + Commercial Tools       |          46.6           |            42.1            |          37.6          |           33.9            |\n| **MiroThinker-8B-DPO-v0.1**  |          46.6           |            44.8            |          37.0          |           35.4            |\n|     + Commercial Tools       |        **50.5**         |          **46.7**          |        **38.2**        |         **35.9**          |\n| **🔹—— 14B Agents ——**       |                         |                            |                        |                           |\n| **MiroThinker-14B-SFT-v0.1** |          47.6           |            44.4            
|          37.0          |           34.4            |\n|     + Commercial Tools       |          49.5           |            47.5            |          41.8          |           39.8            |\n| **MiroThinker-14B-DPO-v0.1** |          48.5           |            46.6            |          42.4          |           39.2            |\n|     + Commercial Tools       |        **52.4**         |          **48.5**          |        **45.5**        |         **42.0**          |\n| **🔹—— 32B Agents ——**       |                         |                            |                        |                           |\n| Qwen3-32B                    |          31.1           |            26.7            |          29.7          |           26.4            |\n| Search-o1-32B                |          28.2           |             -              |           -            |             -             |\n| WebThinker-32B-RL            |          48.5           |             -              |           -            |             -             |\n| WebDancer-QwQ-32B            |          51.5           |             -              |           -            |             -             |\n| WebSailor-32B                |          53.2           |             -              |           -            |             -             |\n| WebShaper-QwQ-32B            |          53.3           |             -              |           -            |             -             |\n| **MiroThinker-32B-SFT-v0.1** |          55.3           |            51.3            |          44.9          |           42.7            |\n|     + Commercial Tools       |          58.3           |            54.2            |          48.5          |           45.8            |\n| **MiroThinker-32B-DPO-v0.1** |          57.3           |            54.1            |          48.5          |           45.9            |\n|     + Commercial Tools       |        **60.2**         |          **57.9**          |        
**50.9**        |         **48.9**          |\n\n</div>\n\n1. Following the practices of WebThinker, WebAgents, and CognitiveKernel, we report the Best Pass@1, the highest score across three runs, which often reflects stronger performance, though it may exhibit some variability. To provide a more stable measure, we additionally report Pass@1 (Avg@8), which offers greater consistency at the cost of slightly lower scores.\n\n1. For consistency with prior open-source works, we evaluate GAIA-Text-103 using the WebAgents LLM-as-a-Judge template, and report results on GAIA-Val-165 using the official GAIA scorer script.\n\n1. By default, we use open-source tools wherever possible, except for the code tool [E2B](https://github.com/e2b-dev/E2B) and the Google search tool [Serper](https://serper.dev/). We use [Whisper](https://huggingface.co/openai/whisper-large-v3-turbo), [Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct), and [Qwen3-235B-A22B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507) in our implementation. The framework can be easily extended to other open-source tools of your choice.\n\n1. Replacing these open-source tools with commercial alternatives can yield performance gains. Commercial tools were mainly used for multimodal capabilities and certain complex reasoning subtasks. 
The majority of tasks, including planning, browsing, refinement, navigation, and more, were handled by our agents.\n\n#### More Benchmarks\n\n<div align=\"center\">\n\n| Method                       | HLE<br>Pass@1 | Frames<br>Pass@1 | BrowseComp<br>Pass@1 | BrowseComp-ZH<br>Pass@1 | WebWalkerQA<br>Pass@1 |\n|------------------------------|:-------------:|:----------------:|:--------------------:|:-----------------------:|:---------------------:|\n| OpenAI Deep Research         |     26.6      |        -         |         51.5         |          42.9           |           -           |\n| Gemini Deep Research         |     26.9      |        -         |          -           |            -            |           -           |\n| Kimi-Researcher              |     26.9      |       78.8       |          -           |            -            |           -           |\n|                              |               |                  |                      |                         |                       |\n| WebDancer-7B                 |       -       |        -         |          -           |            -            |         36.0          |\n| WebSailor-7B                 |       -       |        -         |         6.7          |          14.2           |           -           |\n| **MiroThinker-8B-SFT-v0.1**  |       -       |       58.0       |         5.5          |           9.3           |         41.3          |\n| **MiroThinker-8B-DPO-v0.1**  |       -       |       64.4       |         8.7          |          13.6           |         45.7          |\n|                              |               |                  |                      |                         |                       |\n| WebThinker-32B-RL            |       -       |        -         |          -           |            -            |         46.5          |\n| WebDancer-QwQ-32B            |       -       |        -         |         3.8          |          18.0           |         
47.9          |\n| WebSailor-32B                |       -       |        -         |         10.5         |          25.5           |           -           |\n| WebShaper-32B                |       -       |        -         |          -           |            -            |         51.4          |\n| **MiroThinker-32B-SFT-v0.1** |     10.2      |       70.4       |         10.6         |          13.8           |         45.7          |\n| **MiroThinker-32B-DPO-v0.1** |     11.8      |       71.7       |         13.0         |          17.0           |         49.3          |\n\n</div>\n\n1. MiroThinker’s performance was tested with this repository and open-source tools; other agents’ results are from their papers and official sites.\n\n1. As [MiroVerse-v0.1](https://huggingface.co/datasets/miromind-ai/MiroVerse-v0.1) mainly contains English data, the agent’s Chinese capability is limited. We plan to add more Chinese data to improve performance in the next version.\n\n</details>\n\n## 🚀 Quick Start\n\nFor optimal usage, we recommend using MiroThinker with this tool-enabled agent framework and thinking mode enabled.\n\n### Prerequisites\n\n- 🐍 **Python 3.10+**\n- 📦 **uv package manager** ([Installation guide](https://github.com/astral-sh/uv))\n- 🔑 **Required API keys** (see configuration section below)\n\n### Installation\n\n```bash\n# Clone the repository\ngit clone https://github.com/MiroMindAI/MiroThinker\ncd MiroThinker\n\n# Setup environment\ncd apps/miroflow-agent\nuv sync\n\n# Configure API keys\ncp .env.example .env\n# Edit .env with your API keys (SERPER_API_KEY, JINA_API_KEY, E2B_API_KEY, etc.)\n```\n\n> **📝 Environment Variables**: See [Tool Configuration](#tool-configuration) section for required API keys.\n\n### Tool Configuration\n\n#### Minimal Configuration for MiroThinker-1.7.\n\n| Server | Description | Tools Provided | Required Environment Variables |\n|:-------|:------------|:---------------|:-------------------------------|\n| **`tool-python`** 
| Execution environment and file management (E2B sandbox) | `create_sandbox`, `run_command`, `run_python_code`, `upload_file_from_local_to_sandbox`, `download_file_from_sandbox_to_local`, `download_file_from_internet_to_sandbox` | `E2B_API_KEY` |\n| **`search_and_scrape_webpage`** | Google search via Serper API | `google_search` | `SERPER_API_KEY`, `SERPER_BASE_URL` |\n| **`jina_scrape_llm_summary`** | Web scraping with LLM-based information extraction | `scrape_and_extract_info` | `JINA_API_KEY`, `JINA_BASE_URL`, `SUMMARY_LLM_BASE_URL`, `SUMMARY_LLM_MODEL_NAME`, `SUMMARY_LLM_API_KEY` |\n\n**Minimal `.env` configuration example:**\n\n```bash\n# Required for MiroThinker 1.7, v1.5, and v1.0 (minimal setup)\nSERPER_API_KEY=your_serper_key\nSERPER_BASE_URL=\"https://google.serper.dev\"\nJINA_API_KEY=your_jina_key\nJINA_BASE_URL=\"https://r.jina.ai\"\nE2B_API_KEY=your_e2b_key\n\n# Required for jina_scrape_llm_summary\n# Note: Summary LLM can be a small model (e.g., Qwen3-14B or GPT-5-Nano)\n# The choice has minimal impact on performance; use what's most convenient\nSUMMARY_LLM_BASE_URL=\"https://your_summary_llm_base_url/v1/chat/completions\"\nSUMMARY_LLM_MODEL_NAME=your_llm_model_name  # e.g., \"Qwen/Qwen3-14B\" or \"gpt-5-nano\"\nSUMMARY_LLM_API_KEY=your_llm_api_key  # Optional, depends on LLM provider\n\n# Required for benchmark evaluation (LLM-as-a-Judge)\nOPENAI_API_KEY=your_openai_key  # Required for running benchmark evaluations\nOPENAI_BASE_URL=\"https://api.openai.com/v1\"  # Optional, defaults to OpenAI's API\n```\n\n> **💡 Why this is minimal**: These 3 MCP servers cover the core capabilities needed for research tasks: web search, content extraction, and code execution. All other servers are optional enhancements.\n>\n> **🤖 Summary LLM**: The `SUMMARY_LLM` can be a small model like Qwen3-14B or GPT-5-Nano. 
The choice has minimal impact on overall performance, use whichever is most convenient for your setup.\n>\n> **📊 For Benchmark Evaluation**: If you plan to run benchmark evaluations, you also need `OPENAI_API_KEY` (and optionally `OPENAI_BASE_URL`) for LLM-as-a-Judge functionality used in evaluation scripts.\n>\n> **🖼️ For GAIA Multimodal Tasks**: GAIA-Val-165 includes tasks with image/audio/video files. Since MiroThinker is a text-only LLM, GPT-4o is used to pre-process these files into text descriptions. The same `OPENAI_API_KEY` is used for both this preprocessing and LLM-as-a-Judge.\n>\n> **📖 For more details**: See [MiroFlow Tools README](libs/miroflow-tools/README.md) for complete documentation of all available tools.\n\n<details>\n  <summary>🔧 Click to expand additional available tools</summary>\n\nThe following optional tools are available but were not used in MiroThinker v1.0-1.7 evaluation:\n\n| Server Name          | Type         | Description                                 |\n|:---------------------|:-------------|:--------------------------------------------|\n| `tool-vqa`           | Commercial   | Vision processing using Claude              |\n| `tool-vqa-os`        | Open-Source  | Vision processing (open-source alternative) |\n| `tool-transcribe`    | Commercial   | Audio transcription using OpenAI            |\n| `tool-transcribe-os` | Open-Source  | Audio transcription using Whisper           |\n| `tool-reasoning`     | Commercial   | Reasoning engine using Claude               |\n| `tool-reasoning-os`  | Open-Source  | Reasoning engine (open-source alternative)  |\n| `tool-reading`       | Open-Source  | Document reading using MarkItDown           |\n| `tool-google-search` | Commercial   | Web search using Google + scraping          |\n| `tool-sogou-search` | Commercial   | Web search using Sogou (Chinese)           |\n\n> **📖 Local Deployment**: For instructions on deploying open-source tools (`tool-vqa-os`, `tool-transcribe-os`, 
`tool-reasoning-os`) locally, see [Local Tool Deployment Guide](assets/LOCAL-TOOL-DEPLOYMENT.md).\n\nSee the [MiroFlow Tools README](libs/miroflow-tools/README.md) for complete documentation of all available tools.\n\n</details>\n\n#### Pre-configured Agent Settings\n\nThe `apps/miroflow-agent/conf/agent/` directory contains several pre-configured agent settings. Each configuration uses different tools and requires corresponding environment variables in your `.env` file.\n\n> **💡 Recommended**: For MiroThinker-1.7, use `mirothinker_1.7_keep5_max200` (with context management, recommended for most tasks) or `mirothinker_1.7_keep5_max300` (only used for BrowseComp and BrowseComp-ZH).\n\n| Configuration                          | Description | Max Turns | Context Retention | Required Environment Variables                                                                                                                               | Recommended For |\n|:---------------------------------------|:------------|:----------|:------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------|\n| **`mirothinker_1.7_keep5_max200`** ⭐  | Single-agent with context management | 200 | Keep 5 most recent | `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL`, `E2B_API_KEY`, `SUMMARY_LLM_BASE_URL`, `SUMMARY_LLM_MODEL_NAME`, `SUMMARY_LLM_API_KEY` | **1.7 (recommended for most tasks)** |\n| **`mirothinker_1.7_keep5_max300`** ⭐  | Single-agent with context management | 300 | Keep 5 most recent | Same as above                                                                                                                              | **1.7 (for BrowseComp & BrowseComp-ZH)** |\n\n\n<details>\n  <summary>📦 Click to expand legacy configurations (v1.5 and earlier)</summary>\n\n| Configuration            | Description | Max Turns | Context Retention | 
Required Environment Variables | Recommended For |\n|:-------------------------|:------------|:----------|:------------------|:-------------------------------|:----------------|\n| **`mirothinker_v1.5_keep5_max200`**  | Single-agent with context management | 200 | Keep 5 most recent | `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL`, `E2B_API_KEY`, `SUMMARY_LLM_BASE_URL`, `SUMMARY_LLM_MODEL_NAME`, `SUMMARY_LLM_API_KEY` | **v1.5 (recommended for most tasks)** |\n| **`mirothinker_v1.5_keep5_max400`**  | Single-agent with context management | 400 | Keep 5 most recent | Same as above                                                                                                                              | **v1.5 (for BrowseComp & BrowseComp-ZH)** |\n| **`mirothinker_v1.5`**                 | Single-agent for MiroThinker v1.5 | 600 | Keep all results | Same as above | **v1.5** |\n| **`mirothinker_v1.0_keep5`**           | Single-agent with context management | 600 | Keep 5 most recent | Same as above                                                                                                                                   | **v1.0** |\n| **`mirothinker_v1.0`**                 | Single-agent for MiroThinker v1.0 | 600 | Keep all results | Same as above | **v1.0** |\n| **`multi_agent`**        | Multi-agent with commercial tools (v0.1/v0.2) | 50 | Keep all results | `E2B_API_KEY`, `ANTHROPIC_API_KEY`, `ANTHROPIC_BASE_URL`, `OPENAI_API_KEY`, `OPENAI_BASE_URL`, `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL` | v0.1/v0.2 |\n| **`multi_agent_os`**     | Multi-agent with open-source tools (v0.1/v0.2) | 50 | Keep all results | `E2B_API_KEY`, `VISION_API_KEY`, `VISION_BASE_URL`, `VISION_MODEL_NAME`, `WHISPER_API_KEY`, `WHISPER_BASE_URL`, `WHISPER_MODEL_NAME`, `REASONING_API_KEY`, `REASONING_BASE_URL`, `REASONING_MODEL_NAME`, `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL` | v0.1/v0.2 |\n\n</details>\n\n> **💡 
Note**: All environment variables are listed in `apps/miroflow-agent/.env.example`. Copy it to `.env` and fill in the values for the tools you plan to use.\n\n#### Creating Custom Tool Configurations\n\n<details>\n  <summary>🔧 Click to expand custom tool configuration guide</summary>\n\nYou can create your own YAML configuration file to freely combine MCP servers. Here's how:\n\n1. **Create a new YAML file** in `apps/miroflow-agent/conf/agent/`:\n\n```yaml\n# conf/agent/my_custom_config.yaml\ndefaults:\n  - default\n  - _self_\n\nmain_agent:\n  tools:\n    - tool-python                    # Execution environment\n    - search_and_scrape_webpage      # Google search\n    - jina_scrape_llm_summary        # Web scraping with LLM\n    - tool-vqa                       # Vision processing (optional)\n    - tool-transcribe                # Audio processing (optional)\n    - tool-reasoning                 # Reasoning engine (optional)\n    - tool-reading                   # Document reading (optional)\n  max_turns: 300  # Maximum number of turns\n\nsub_agents:\n  agent-browsing:  # Optional sub-agent\n    tools:\n      - tool-google-search\n      - tool-vqa\n      - tool-reading\n      - tool-python\n    max_turns: 50\n\nkeep_tool_result: -1  # Context retention budget: -1 keeps all tool results, or specify K to keep only the K most recent tool responses\n```\n\n> **💡 Context Retention Strategy**: The `keep_tool_result` parameter implements a **recency-based context retention** strategy. In the standard ReAct paradigm, all tool outputs are retained in the message history, which can lead to inefficient context utilization. Empirically, we observe that the agent's subsequent actions depend primarily on recent observations rather than distant ones. 
This strategy retains only the most recent K tool responses (where K is the `keep_tool_result` value) while preserving the complete sequence of thoughts and actions.\n>\n> **Benefits:**\n>\n> - ✅ Preserves the reasoning and action trace\n> - ✅ Focuses the agent's attention on the most contextually relevant observations\n> - ✅ Frees additional context space for extended reasoning and deeper tool-use trajectories\n> - ✅ Does not lead to performance degradation while allowing more context space for interactive scaling\n>\n> **Usage:** Set `keep_tool_result: -1` to keep all tool results, or specify a positive integer K (e.g., `keep_tool_result: 5`) to keep only the K most recent tool responses.\n\n2. **Use your custom configuration** when running evaluations:\n\n```bash\ncd apps/miroflow-agent\nuv run main.py llm=qwen-3 agent=my_custom_config llm.base_url=https://your_base_url/v1\n```\n\n3. **Configure environment variables** in `.env` based on the tools you use.\n\n   All available environment variables are listed in `apps/miroflow-agent/.env.example`. 
Copy it to `.env` and configure the variables according to your chosen configuration:\n\n   ```bash\n   cd apps/miroflow-agent\n   cp .env.example .env\n   # Edit .env with your actual API keys\n   ```\n\n   **For MiroThinker v1.5** (`mirothinker_v1.5_keep5_max200.yaml`, `mirothinker_v1.5_keep5_max400.yaml`, or `mirothinker_v1.5.yaml`) and **v1.0** (`mirothinker_v1.0_keep5.yaml` or `mirothinker_v1.0.yaml`), see the [Minimal Configuration](#minimal-configuration-for-mirothinker-17) section above for the complete configuration example.\n\n   **For other configurations**, refer to the [Pre-configured Agent Settings](#pre-configured-agent-settings) table above to see which environment variables are required.\n\n</details>\n\n<details>\n  <summary>🔑 Click to expand optional API keys</summary>\n\n```bash\n# API for LLM-as-a-Judge (for benchmark testing, required for benchmark evaluation)\nOPENAI_API_KEY=your_openai_key\nOPENAI_BASE_URL=\"https://api.openai.com/v1\"  # Optional, defaults to OpenAI's API\n\n# API for Open-Source Audio Transcription Tool (for benchmark testing, optional)\nWHISPER_MODEL_NAME=\"openai/whisper-large-v3-turbo\"\nWHISPER_API_KEY=your_whisper_key\nWHISPER_BASE_URL=\"https://your_whisper_base_url/v1\"\n\n# API for Open-Source VQA Tool (for benchmark testing, optional)\nVISION_MODEL_NAME=\"Qwen/Qwen2.5-VL-72B-Instruct\"\nVISION_API_KEY=your_vision_key\nVISION_BASE_URL=\"https://your_vision_base_url/v1/chat/completions\"\n\n# API for Open-Source Reasoning Tool (for benchmark testing, optional)\nREASONING_MODEL_NAME=\"Qwen/Qwen3-235B-A22B-Thinking-2507\"\nREASONING_API_KEY=your_reasoning_key\nREASONING_BASE_URL=\"https://your_reasoning_base_url/v1/chat/completions\"\n\n# API for Claude Sonnet 3.7 as Commercial Tools (optional)\nANTHROPIC_API_KEY=your_anthropic_key\n\n# API for Sogou Search (optional)\nTENCENTCLOUD_SECRET_ID=your_tencent_cloud_secret_id\nTENCENTCLOUD_SECRET_KEY=your_tencent_cloud_secret_key\n\n# API for Summary LLM (can use 
small models like Qwen3-14B or GPT-5-Nano)\nSUMMARY_LLM_BASE_URL=\"https://your_summary_llm_base_url/v1/chat/completions\"\nSUMMARY_LLM_MODEL_NAME=your_summary_llm_model_name  # e.g., \"Qwen/Qwen3-14B\" or \"gpt-5-nano\"\nSUMMARY_LLM_API_KEY=your_summary_llm_api_key\n```\n\n</details>\n\n### Serve the MiroThinker Agent\n\n#### Option 1 (Recommended): Serve with SGLang or vLLM\n\nUse SGLang to serve MiroThinker models at port 61002:\n\n```bash\nNUM_GPUS=4\nPORT=61002\n\n# Downloading agent from HF \nAGENT_PATH=miromind-ai/MiroThinker-1.7-mini\n\n\npython3 -m sglang.launch_server \\\n    --model-path $AGENT_PATH \\\n    --tp $NUM_GPUS \\\n    --dp 1 \\\n    --host 0.0.0.0 \\\n    --port $PORT \\\n    --trust-remote-code\n```\n\n> **📍 Server URL**: This will start a server at `http://0.0.0.0:$PORT`. Use this as your server base URL (e.g., `http://0.0.0.0:61002/v1`).\n\n#### Option 2: Quantized Light-Weight Options\n\nWe also provide comprehensive guidance for serving MiroThinker agents using CPU-optimized and GPU-accelerated quantization techniques, along with detailed analysis and guidelines for deployment with llama.cpp, Ollama, SGLang, and other inference frameworks.\n\n> **📖 Complete Guide**: See [Deployment Documentation](apps/gradio-demo/) for detailed deployment instructions.\n\n### Run Your First Task\n\nAfter setting up the environment and starting your server, run `main.py` to test with a default question: *\"What is the title of today's arxiv paper in computer science?\"*\n\n```bash\ncd apps/miroflow-agent\n\n# Using MiroThinker agents (requires your own server)\nuv run python main.py llm=qwen-3 agent=mirothinker_1.7_keep5_max200 llm.base_url=http://localhost:61002/v1\n\n# Or using Claude (requires ANTHROPIC_API_KEY in .env)\nuv run python main.py llm=claude-3-7 agent=single_agent_keep5\n\n# Or using GPT-5 (requires OPENAI_API_KEY in .env)\nuv run python main.py llm=gpt-5 agent=single_agent_keep5\n```\n\n**To customize your question**, edit `main.py` line 
32:\n\n```python\ntask_description = \"Your custom question here\"\n```\n\nThe agent will search the web, execute code if needed, and provide an answer with sources.\n\n> **📖 More details**: See [apps/miroflow-agent/README.md](apps/miroflow-agent/README.md) for available configurations and troubleshooting.\n\n## 📊 Benchmark Evaluation\n\n> For researchers who want to reproduce our benchmark results or evaluate on standard benchmarks.\n\n### Download Benchmark Data\n\n```bash\ncd MiroThinker  # Back to project root\nwget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/data_20251115_password_protected.zip\nunzip data_20251115_password_protected.zip\n# Password: pf4*\nrm data_20251115_password_protected.zip\n```\n\n### Run Benchmark Evaluation\n\n> **Note:** For MiroThinker-1.7, use `mirothinker_1.7_keep5_max200` (recommended for most tasks) or `mirothinker_1.7_keep5_max300` (for BrowseComp and BrowseComp-ZH); both use context management.\n\n**Available Parameters:**\n\nYou can customize the evaluation by setting the following environment variables before running the script:\n\n| Parameter | Default | Description |\n|:----------|:--------|:------------|\n| `LLM_MODEL` | `\"MiroThinker-Agents\"` | Agent name identifier |\n| `BASE_URL` | `\"https://your-api.com/v1\"` | Base URL of your server |\n| `NUM_RUNS` | Varies by benchmark | Number of evaluation runs (3 for most benchmarks, 8 for GAIA/XBench/FutureX/SEAL-0, 32 for AIME2025) |\n| `LLM_PROVIDER` | `\"qwen\"` | LLM provider (e.g., `qwen`, `openai`, `anthropic`) |\n| `AGENT_SET` | `\"mirothinker_1.7_keep5_max200\"` | Agent configuration (e.g., `mirothinker_1.7_keep5_max200`, `mirothinker_1.7_keep5_max300`) 
|\n| `MAX_CONTEXT_LENGTH` | `262144` | Maximum context length (256K) |\n| `MAX_CONCURRENT` | `10` | Maximum concurrent tasks |\n| `PASS_AT_K` | `1` | Pass@K evaluation metric |\n| `TEMPERATURE` | `1.0` | Sampling temperature |\n| `API_KEY` | `\"xxx\"` | API key for the server |\n\n**Example Usage:**\n\n```bash\n# Navigate to the miroflow-agent directory first\ncd apps/miroflow-agent\n\n# Basic usage with MiroThinker-1.7 (recommended)\nNUM_RUNS=8 LLM_MODEL=\"MiroThinker-1.7-mini\" BASE_URL=\"https://your-api.com/v1\" bash scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh\n\n# Or with v1.0\n# NUM_RUNS=8 LLM_MODEL=\"MiroThinker-v1.0-30B\" BASE_URL=\"https://your-api.com/v1\" bash scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh\n\n# Customize number of runs and agent configuration (MiroThinker-1.7 with context management)\nLLM_MODEL=\"MiroThinker-1.7-mini\" \\\nBASE_URL=\"https://your-api.com/v1\" \\\nNUM_RUNS=8 \\\nAGENT_SET=\"mirothinker_1.7_keep5_max200\" \\\nbash scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh\n\n```\n\n<details open>\n  <summary>📋 Click to expand all benchmark commands</summary>\n\n> **⚠️ Important for MiroThinker-1.7**: To reproduce our reported results, you must set the correct `AGENT_SET`:\n>\n> - **BrowseComp & BrowseComp-ZH**: Use `AGENT_SET=\"mirothinker_1.7_keep5_max300\"`\n> - **All other benchmarks**: Use `AGENT_SET=\"mirothinker_1.7_keep5_max200\"`\n\n```bash\n# Navigate to the miroflow-agent directory first\ncd apps/miroflow-agent\n\n# HLE\nNUM_RUNS=3 LLM_MODEL=\"xxx\" BASE_URL=\"xxx\" AGENT_SET=\"mirothinker_1.7_keep5_max200\" bash scripts/run_evaluate_multiple_runs_hle.sh\n\n# HLE-Text-2158\nNUM_RUNS=3 LLM_MODEL=\"xxx\" BASE_URL=\"xxx\" AGENT_SET=\"mirothinker_1.7_keep5_max200\" bash scripts/run_evaluate_multiple_runs_hle-text-2158.sh\n\n# HLE-Text-500\nNUM_RUNS=3 LLM_MODEL=\"xxx\" BASE_URL=\"xxx\" AGENT_SET=\"mirothinker_1.7_keep5_max200\" bash scripts/run_evaluate_multiple_runs_hle-text-500.sh\n\n# 
GAIA-Text-103\nNUM_RUNS=8 LLM_MODEL=\"xxx\" BASE_URL=\"xxx\" AGENT_SET=\"mirothinker_1.7_keep5_max200\" bash scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh\n\n# GAIA-Validation (GAIA-Val-165)\nNUM_RUNS=8 LLM_MODEL=\"xxx\" BASE_URL=\"xxx\" AGENT_SET=\"mirothinker_1.7_keep5_max200\" bash scripts/run_evaluate_multiple_runs_gaia-validation.sh\n\n# BrowseComp-EN (⚠️ use max300)\nNUM_RUNS=3 LLM_MODEL=\"xxx\" BASE_URL=\"xxx\" AGENT_SET=\"mirothinker_1.7_keep5_max300\" bash scripts/run_evaluate_multiple_runs_browsecomp.sh\n\n# BrowseComp-ZH (⚠️ use max300)\nNUM_RUNS=3 LLM_MODEL=\"xxx\" BASE_URL=\"xxx\" AGENT_SET=\"mirothinker_1.7_keep5_max300\" bash scripts/run_evaluate_multiple_runs_browsecomp_zh.sh\n\n# WebWalkerQA\nNUM_RUNS=3 LLM_MODEL=\"xxx\" BASE_URL=\"xxx\" AGENT_SET=\"mirothinker_1.7_keep5_max200\" bash scripts/run_evaluate_multiple_runs_webwalkerqa.sh\n\n# XBench-DeepSearch\nNUM_RUNS=8 LLM_MODEL=\"xxx\" BASE_URL=\"xxx\" AGENT_SET=\"mirothinker_1.7_keep5_max200\" bash scripts/run_evaluate_multiple_runs_xbench_deepsearch.sh\n\n# FRAMES\nNUM_RUNS=3 LLM_MODEL=\"xxx\" BASE_URL=\"xxx\" AGENT_SET=\"mirothinker_1.7_keep5_max200\" bash scripts/run_evaluate_multiple_runs_frames.sh\n\n# SEAL-0\nNUM_RUNS=8 LLM_MODEL=\"xxx\" BASE_URL=\"xxx\" AGENT_SET=\"mirothinker_1.7_keep5_max200\" bash scripts/run_evaluate_multiple_runs_seal-0.sh\n\n# FutureX\nNUM_RUNS=8 LLM_MODEL=\"xxx\" BASE_URL=\"xxx\" AGENT_SET=\"mirothinker_1.7_keep5_max200\" bash scripts/run_evaluate_multiple_runs_futurex.sh\n\n# AIME2025\nNUM_RUNS=32 LLM_MODEL=\"xxx\" BASE_URL=\"xxx\" AGENT_SET=\"mirothinker_1.7_keep5_max200\" bash scripts/run_evaluate_multiple_runs_aime2025.sh\n\n# DeepSearchQA\nNUM_RUNS=3 LLM_MODEL=\"xxx\" BASE_URL=\"xxx\" AGENT_SET=\"mirothinker_1.7_keep5_max200\" bash scripts/run_evaluate_multiple_runs_deepsearchqa.sh\n```\n\n</details>\n\n#### 3. 
**Monitor evaluation progress**\n\n<details>\n  <summary>📊 Click to expand progress monitoring commands</summary>\n\n```bash\n# Navigate to the miroflow-agent directory first\ncd apps/miroflow-agent\n\n# For HLE\npython benchmarks/check_progress/check_progress_hle.py /path/to/evaluation/logs\n\n# For HLE-Text-2158\npython benchmarks/check_progress/check_progress_hle-text-2158.py /path/to/evaluation/logs\n\n# For HLE-Text-500\npython benchmarks/check_progress/check_progress_hle-text-500.py /path/to/evaluation/logs\n\n# For BrowseComp-EN\npython benchmarks/check_progress/check_progress_browsecomp.py /path/to/evaluation/logs\n\n# For BrowseComp-ZH\npython benchmarks/check_progress/check_progress_browsecomp_zh.py /path/to/evaluation/logs\n\n# For GAIA-Validation\npython benchmarks/check_progress/check_progress_gaia-validation.py /path/to/evaluation/logs\n\n# For GAIA-Text-103\npython benchmarks/check_progress/check_progress_gaia-validation-text-103.py /path/to/evaluation/logs\n\n# For WebWalkerQA\npython benchmarks/check_progress/check_progress_webwalkerqa.py /path/to/evaluation/logs\n\n# For Frames\npython benchmarks/check_progress/check_progress_frames.py /path/to/evaluation/logs\n\n# For XBench-DeepSearch\npython benchmarks/check_progress/check_progress_xbench_deepsearch.py /path/to/evaluation/logs\n\n# For SEAL-0\npython benchmarks/check_progress/check_progress_seal-0.py /path/to/evaluation/logs\n\n# For AIME2025\npython benchmarks/check_progress/check_progress_aime2025.py /path/to/evaluation/logs\n\n# For DeepSearchQA\npython benchmarks/check_progress/check_progress_deepsearchqa.py /path/to/evaluation/logs\n```\n\n</details>\n\n## 🔬 Trace Collection\n\n<details>\n<summary>📋 Click to expand trace collection commands</summary>\n\n```bash\ncd apps/collect-trace\n\n# Collect Traces for SFT\nbash scripts/collect_trace_claude37.sh\nbash scripts/collect_trace_gpt5.sh\n\n# Collect Traces for DPO\nbash scripts/collect_trace_qwen3.sh\n```\n\n</details>\n\n## ❓ FAQ & 
Troubleshooting\n\n### Common Issues\n\n<details>\n  <summary>🔧 Click to expand troubleshooting guide</summary>\n\n#### **Q: Which version should I use?**\n\n**A:** We recommend **MiroThinker-1.7** ⭐ with the minimal configuration:\n\n- **v1.7** ⭐: Latest version with 256K context, world-leading performance. Use config (with context management):\n  - `mirothinker_1.7_keep5_max200` (up to 200 turns, recommended for most tasks)\n  - `mirothinker_1.7_keep5_max300` (up to 300 turns, only used for BrowseComp and BrowseComp-ZH)\n\n#### **Q: How do I get API keys?**\n\n**A:** You need these keys for minimal setup:\n\n- **SERPER_API_KEY**: Get from [Serper.dev](https://serper.dev/) (Google search API)\n- **JINA_API_KEY**: Get from [Jina.ai](https://jina.ai/) (Web scraping)\n- **E2B_API_KEY**: Get from [E2B.dev](https://e2b.dev/) (Code execution sandbox)\n- **SUMMARY_LLM_API_KEY**: Your LLM API credentials (for content summarization). Can be a small model like Qwen3-14B or GPT-5-Nano—the choice has minimal impact on performance.\n- **OPENAI_API_KEY**: Get from [OpenAI](https://platform.openai.com/) (Required for benchmark evaluation, used for LLM-as-a-Judge)\n- **OPENAI_BASE_URL**: Optional, defaults to `https://api.openai.com/v1`. Can be changed to use OpenAI-compatible APIs.\n\n#### **Q: Agent server connection errors**\n\n**A:** Common issues:\n\n- **Check base URL format**: Should end with `/v1` (e.g., `https://your-api.com/v1`)\n- **Verify API key**: Ensure `API_KEY` is set correctly in environment or script\n- **Check server status**: Make sure your server is running and accessible\n- **Network issues**: Verify firewall/network settings allow connections\n\n#### **Q: Evaluation script fails to run**\n\n**A:** Troubleshooting steps:\n\n1. **Check working directory**: Make sure you're in `apps/miroflow-agent` directory\n1. **Verify environment**: Run `uv sync` to ensure dependencies are installed\n1. 
**Check .env file**: Ensure all required environment variables are set\n1. **Review logs**: Check `logs/` directory for detailed error messages\n1. **Verify data path**: Ensure benchmark data is downloaded and in correct location\n\n#### **Q: Out of memory errors**\n\n**A:** Solutions:\n\n- **Reduce context length**: Set `MAX_CONTEXT_LENGTH` to a smaller value (e.g., 131072 for 128K)\n- **Use context management with fewer turns**:\n  - For v1.7: Use `mirothinker_1.7_keep5_max200` or `mirothinker_1.7_keep5_max300` (with context management)\n- **Reduce concurrent tasks**: Set `MAX_CONCURRENT` to a smaller number (e.g., 5)\n- **Use smaller agents**:\n  - For v1.5: Try 30B instead of 235B\n  - For v1.0: Try 8B or 30B instead of 72B\n\n#### **Q: Tool execution errors**\n\n**A:** Common fixes:\n\n- **E2B errors**: Verify `E2B_API_KEY` is valid and account has credits\n- **Serper errors**: Check `SERPER_API_KEY` and rate limits\n- **Jina errors**: Verify `JINA_API_KEY` and `JINA_BASE_URL` are correct\n- **LLM summarization errors**: Check `SUMMARY_LLM_*` variables and agent availability\n\n#### **Q: How to monitor long-running evaluations?**\n\n**A:** Use the progress monitoring scripts:\n\n```bash\ncd apps/miroflow-agent\npython benchmarks/check_progress/check_progress_<benchmark_name>.py /path/to/logs\n```\n\nThe scripts show completion status, elapsed time, and estimated remaining time.\n\n</details>\n\n### Getting Help\n\n- 📖 **Documentation**: Check [MiroFlow Tools README](libs/miroflow-tools/README.md) for tool details\n- 💬 **Discord**: Join our [Discord community](https://discord.com/invite/GPqEnkzQZd)\n- 🐛 **Issues**: Report bugs on [GitHub Issues](https://github.com/MiroMindAI/MiroThinker/issues)\n- 📧 **Contact**: Visit [our website](https://miromind.ai/) for more information\n\n## 📄 License\n\nThis project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.\n\n## 🙏 Acknowledgments\n\nWe extend our sincere gratitude to:\n\n- 🏆 
**Benchmark Contributors** for the comprehensive evaluation datasets\n- 🌍 **Open Source Community** for the tools and libraries that make this possible\n- 👥 **All Contributors** who have helped make MiroThinker better\n\n<div align=\"center\">\n  <a href=\"https://github.com/MiroMindAI/MiroThinker/graphs/contributors\">\n    <img src=\"https://contrib.rocks/image?repo=MiroMindAI/MiroThinker\" />\n  </a>\n</div>\n\nJoin our community and help us build the future of AI agents!\n\n### References\n\nIf you find this project useful in your research, please consider citing:\n\n**MiroThinker** (Model & Method)\n```\n@article{miromind2026mirothinker,\n  title={MiroThinker-1.7 & H1: Towards Heavy-Duty Research Agents via Verification},\n  author={MiroMind Team and Bai, S. and Bing, L. and Lei, L. and Li, R. and Li, X. and Lin, X. and Min, E. and Su, L. and Wang, B. and Wang, L. and Wang, L. and Wang, S. and Wang, X. and Zhang, Y. and Zhang, Z. and others},\n  journal={arXiv preprint arXiv:2603.15726},\n  year={2026}\n}\n```\n\n**MiroFlow** (Framework)\n```bibtex\n@article{miromind2026miroflow,\n  title={MiroFlow: Towards High-Performance and Robust Open-Source Agent Framework for General Deep Research Tasks},\n  author={Su, Shiqian and Xing, Sen and Dong, Xuan and Zhong, Muyan and Wang, Bin and Zhu, Xizhou and Chen, Yuntao and Wang, Wenhai and Deng, Yue and Zhu, Pengxiang and others},\n  journal={arXiv preprint arXiv:2602.22808},\n  year={2026}\n}\n```\n\n[![Star History Chart](https://api.star-history.com/svg?repos=MiroMindAI/MiroThinker&type=Date)](https://star-history.com/#MiroMindAI/MiroThinker&Date)\n"
  },
  {
    "path": "apps/collect-trace/README.md",
    "content": "# Collect Trace\n\n> TL;DR: Treat an RLVR-format dataset (Question + verifiable answer) as a benchmark. Run the evaluation pipeline; use LLM-as-a-Judge to verify correctness; then harvest the correct interaction traces as training data (for SFT / DPO).\n\n## 📝 Overview\n\nCollect Trace is a key component in the MiroThinker training pipeline. Instead of hand-curating training samples, it reuses RLVR datasets as test sets, and collects multi-turn interaction traces only from items judged correct.\n\nWorkflow:\n\n1. Load each RLVR item’s question and verifiable answer.\n\n1. Run the agent in the evaluation pipeline (with tool use / browsing as needed).\n\n1. Verify the model's answer with an LLM-as-a-Judge against the RLVR reference answer.\n\n1. Only for items judged correct, collect the full multi-turn trace and convert it into SFT / DPO-ready samples.\n\n## 🚀 Quick Start\n\n### Prerequisites\n\n- Python 3.12+\n- [uv](https://github.com/astral-sh/uv) package manager\n- OpenAI API key (for LLM-based validation)\n- RLVR dataset (JSONL; contains question and a verifiable answer)\n\n### Installation\n\n1. **Navigate to the collect-trace directory**:\n\n   ```bash\n   cd apps/collect-trace\n   ```\n\n1. **Install dependencies**:\n\n   ```bash\n   uv sync\n   ```\n\n1. 
**Set up environment variables**:\n\n   ```bash\n   # Create .env if missing (safe; won't overwrite existing file)\n   [ -f ../miroflow-agent/.env ] || cp ../miroflow-agent/.env.example ../miroflow-agent/.env\n   # (Alternative on macOS/Linux) cp -n ../miroflow-agent/.env.example ../miroflow-agent/.env || true\n\n   # Edit .env and fill in your keys\n   # Required: OPENAI_API_KEY (for LLM-as-a-Judge)\n   # Optional: other keys for specific tools\n   ```\n\n### Basic Usage\n\nRun a benchmark evaluation to collect traces:\n\n```bash\n# Using Claude-3.7 for trace collection\nbash scripts/collect_trace_claude37.sh\n\n# Using GPT-5 for trace collection  \nbash scripts/collect_trace_gpt5.sh\n\n# Using Qwen-3 for trace collection  \nbash scripts/collect_trace_qwen3.sh\n```\n"
  },
  {
    "path": "apps/collect-trace/pyproject.toml",
    "content": "[project]\nname = \"collect-trace\"\nversion = \"0.1.0\"\ndescription = \"Executes a user-defined agent loop for capturing multi-turn interaction traces\"\nreadme = \"README.md\"\nrequires-python = \">=3.12\"\nauthors = [{ name = \"MiroMind Team\", email = \"service@miromind.ai\" }]\ndependencies = [\n    \"miroflow-tools>=0.1.0\",\n    \"dotenv>=0.9.9\",\n    \"openai>=1.90.0\",\n]\n\n[tool.uv.sources]\nmiroflow-tools = { path = \"../../libs/miroflow-tools\", editable = true }\n"
  },
  {
    "path": "apps/collect-trace/scripts/collect_trace_claude37.sh",
    "content": "# Check if ANTHROPIC_API_KEY is set\nif [ -z \"$ANTHROPIC_API_KEY\" ]; then\n    echo \"Error: ANTHROPIC_API_KEY is not set.\"\n    exit 1\nelse\n    echo \"ANTHROPIC_API_KEY detected.\"\nfi\n\n# Get the directory where the current script is located\nSCRIPT_DIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\necho \"Current script directory: $SCRIPT_DIR\"\n\n\n# Enter the apps/miroflow-agent directory\nTARGET_DIR=\"$SCRIPT_DIR/../../miroflow-agent\"\necho \"Target directory: $TARGET_DIR\"\ncd $TARGET_DIR\n\nmkdir -p ../../logs\nLOG_DIR=\"../../logs/collect_trace_claude37\"\necho \"Log directory: $LOG_DIR\"\nmkdir -p $LOG_DIR\n\n# Collect traces\nuv run python benchmarks/common_benchmark.py \\\n    benchmark=collect_trace \\\n    benchmark.data.data_dir=\"../../data/debug\" \\\n    benchmark.data.metadata_file=\"standardized_data.jsonl\" \\\n    llm=claude-3-7 \\\n    llm.provider=anthropic \\\n    llm.model_name=claude-3-7-sonnet-20250219 \\\n    llm.api_key=\"$ANTHROPIC_API_KEY\" \\\n    llm.base_url=https://api.anthropic.com \\\n    llm.async_client=true \\\n    benchmark.execution.max_tasks=null \\\n    benchmark.execution.max_concurrent=10 \\\n    benchmark.execution.pass_at_k=1 \\\n    agent=single_agent \\\n    hydra.run.dir=$LOG_DIR \\\n    2>&1 | tee \"$LOG_DIR/output.log\"\n\n# Enter the apps/collect-trace directory\nTARGET_DIR=\"$SCRIPT_DIR/../\"\necho \"Target directory: $TARGET_DIR\"\ncd $TARGET_DIR\n\n# Process traces\nuv run python $TARGET_DIR/utils/process_logs.py $LOG_DIR/benchmark_results.jsonl\n\n\n"
  },
  {
    "path": "apps/collect-trace/scripts/collect_trace_gpt41.sh",
    "content": "# Check if OPENAI_API_KEY is set\nif [ -z \"$OPENAI_API_KEY\" ]; then\n    echo \"Error: OPENAI_API_KEY is not set.\"\n    exit 1\nelse\n    echo \"OPENAI_API_KEY detected.\"\nfi\n\n# Get the directory where the current script is located\nSCRIPT_DIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\necho \"Current script directory: $SCRIPT_DIR\"\n\n\n# Enter the apps/miroflow-agent directory\nTARGET_DIR=\"$SCRIPT_DIR/../../miroflow-agent\"\necho \"Target directory: $TARGET_DIR\"\ncd $TARGET_DIR\n\nmkdir -p ../../logs\nLOG_DIR=\"../../logs/collect_trace_gpt41\"\necho \"Log directory: $LOG_DIR\"\nmkdir -p $LOG_DIR\n\n# Collect traces\nuv run python benchmarks/common_benchmark.py \\\n    benchmark=collect_trace \\\n    benchmark.data.data_dir=\"../../data/debug\" \\\n    benchmark.data.metadata_file=\"standardized_data.jsonl\" \\\n    llm=gpt-5 \\\n    llm.provider=openai \\\n    llm.model_name=gpt-4.1-mini \\\n    llm.api_key=\"$OPENAI_API_KEY\" \\\n    llm.base_url=https://api.openai.com/v1 \\\n    llm.async_client=true \\\n    benchmark.execution.max_tasks=null \\\n    benchmark.execution.max_concurrent=10 \\\n    benchmark.execution.pass_at_k=1 \\\n    agent=single_agent \\\n    hydra.run.dir=$LOG_DIR \\\n    2>&1 | tee \"$LOG_DIR/output.log\"\n\n# Enter the apps/collect-trace directory\nTARGET_DIR=\"$SCRIPT_DIR/../\"\necho \"Target directory: $TARGET_DIR\"\ncd $TARGET_DIR\n\n# Process traces\nuv run python $TARGET_DIR/utils/process_logs.py $LOG_DIR/benchmark_results.jsonl\n\n\n"
  },
  {
    "path": "apps/collect-trace/scripts/collect_trace_gpt5.sh",
    "content": "# Check if OPENAI_API_KEY is set\nif [ -z \"$OPENAI_API_KEY\" ]; then\n    echo \"Error: OPENAI_API_KEY is not set.\"\n    exit 1\nelse\n    echo \"OPENAI_API_KEY detected.\"\nfi\n\n# Get the directory where the current script is located\nSCRIPT_DIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\necho \"Current script directory: $SCRIPT_DIR\"\n\n\n# Enter the apps/miroflow-agent directory\nTARGET_DIR=\"$SCRIPT_DIR/../../miroflow-agent\"\necho \"Target directory: $TARGET_DIR\"\ncd $TARGET_DIR\n\nmkdir -p ../../logs\nLOG_DIR=\"../../logs/collect_trace_gpt5\"\necho \"Log directory: $LOG_DIR\"\nmkdir -p $LOG_DIR\n\n# Collect traces\nuv run python benchmarks/common_benchmark.py \\\n    benchmark=collect_trace \\\n    benchmark.data.data_dir=\"../../data/debug\" \\\n    benchmark.data.metadata_file=\"standardized_data.jsonl\" \\\n    llm=gpt-5 \\\n    llm.provider=openai \\\n    llm.model_name=gpt-5-2025-08-07 \\\n    llm.api_key=\"$OPENAI_API_KEY\" \\\n    llm.base_url=https://api.openai.com/v1 \\\n    llm.async_client=true \\\n    benchmark.execution.max_tasks=null \\\n    benchmark.execution.max_concurrent=10 \\\n    benchmark.execution.pass_at_k=1 \\\n    agent=single_agent \\\n    hydra.run.dir=$LOG_DIR \\\n    2>&1 | tee \"$LOG_DIR/output.log\"\n\n# Enter the apps/collect-trace directory\nTARGET_DIR=\"$SCRIPT_DIR/../\"\necho \"Target directory: $TARGET_DIR\"\ncd $TARGET_DIR\n\n# Process traces\nuv run python $TARGET_DIR/utils/process_logs.py $LOG_DIR/benchmark_results.jsonl\n\n\n"
  },
  {
    "path": "apps/collect-trace/scripts/collect_trace_qwen3.sh",
    "content": "# Get the directory where the current script is located\nSCRIPT_DIR=\"$(cd \"$(dirname \"${BASH_SOURCE[0]}\")\" && pwd)\"\necho \"Current script directory: $SCRIPT_DIR\"\n\n\n# Enter the apps/miroflow-agent directory\nTARGET_DIR=\"$SCRIPT_DIR/../../miroflow-agent\"\necho \"Target directory: $TARGET_DIR\"\ncd $TARGET_DIR\n\nmkdir -p ../../logs\nLOG_DIR=\"../../logs/collect_trace_qwen3\"\necho \"Log directory: $LOG_DIR\"\nmkdir -p $LOG_DIR\n\n# Collect traces\nuv run python benchmarks/common_benchmark.py \\\n    benchmark=collect_trace \\\n    benchmark.data.data_dir=\"../../data/debug\" \\\n    benchmark.data.metadata_file=\"standardized_data.jsonl\" \\\n    llm=qwen-3 \\\n    llm.provider=qwen \\\n    llm.model_name=qwen-3-32b \\\n    llm.api_key=\"\" \\\n    llm.base_url=https://your-api.com/v1 \\\n    llm.async_client=true \\\n    llm.temperature=1.0 \\\n    llm.max_context_length=131072 \\\n    benchmark.execution.max_tasks=null \\\n    benchmark.execution.max_concurrent=10 \\\n    benchmark.execution.pass_at_k=1 \\\n    agent=single_agent \\\n    hydra.run.dir=$LOG_DIR \\\n    2>&1 | tee \"$LOG_DIR/output.log\"\n\n# Enter the apps/collect-trace directory\nTARGET_DIR=\"$SCRIPT_DIR/../\"\necho \"Target directory: $TARGET_DIR\"\ncd $TARGET_DIR\n\n# Process traces\nuv run python $TARGET_DIR/utils/process_logs.py $LOG_DIR/benchmark_results.jsonl\n\n\n"
  },
  {
    "path": "apps/collect-trace/utils/converters/__init__.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nfrom .convert_non_oai_to_chatml import (\n    convert_to_json_chatml,\n    extract_and_save_chat_history,\n)\nfrom .convert_oai_to_chatml import (\n    extract_message_history_from_log,\n    oai_tool_message_to_chat_message,\n    process_log_file,\n    save_chatml_to_files,\n)\nfrom .convert_to_chatml_auto_batch import (\n    batch_process_files,\n    determine_conversion_method,\n    get_llm_provider,\n    process_single_file,\n)\n\n__all__ = [\n    # OAI conversion functions\n    \"oai_tool_message_to_chat_message\",\n    \"extract_message_history_from_log\",\n    \"save_chatml_to_files\",\n    \"process_log_file\",\n    # Non-OAI conversion functions\n    \"convert_to_json_chatml\",\n    \"extract_and_save_chat_history\",\n    # Auto batch conversion functions\n    \"get_llm_provider\",\n    \"determine_conversion_method\",\n    \"process_single_file\",\n    \"batch_process_files\",\n]\n"
  },
  {
    "path": "apps/collect-trace/utils/converters/convert_non_oai_to_chatml.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport json\nimport sys\nfrom pathlib import Path\nfrom typing import Any, Dict, List\n\n\ndef convert_to_json_chatml(messages: List[Dict[str, Any]]) -> List[Dict[str, str]]:\n    \"\"\"\n    Convert message list to OpenAI JSON format ChatML\n    Filter out messages with role 'tool', convert content None to empty string\n    \"\"\"\n    chatml_list = []\n    for message in messages:\n        role = message.get(\"role\", \"\")\n        if role == \"tool\":\n            continue  # Skip tool messages\n        if role == \"system\":\n            continue  # Skip system messages\n        content = message.get(\"content\", \"\")\n        if content is None:\n            content = \"\"\n        # Handle different content formats\n        if isinstance(content, list):\n            text_parts = []\n            for item in content:\n                if isinstance(item, dict) and item.get(\"type\") == \"text\":\n                    text_parts.append(item.get(\"text\", \"\"))\n            content = \" \".join(text_parts)\n        elif isinstance(content, str):\n            pass\n        else:\n            content = str(content)\n        chatml_list.append({\"role\": role, \"content\": content})\n    return chatml_list\n\n\ndef extract_and_save_chat_history(\n    log_data: Dict[str, Any], output_dir: Path, input_filename: str\n):\n    \"\"\"\n    Extract message history from log data and save as ChatML format\n\n    Args:\n        log_data: Log data dictionary\n        output_dir: Output directory\n        input_filename: Input filename (without extension)\n    \"\"\"\n    # Ensure output directory exists\n    output_dir.mkdir(parents=True, exist_ok=True)\n\n    # 1. 
Extract main_agent_message_history\n    main_agent_history = log_data.get(\"main_agent_message_history\", {})\n    if main_agent_history and \"message_history\" in main_agent_history:\n        main_messages = main_agent_history[\"message_history\"]\n        if main_messages:\n            chatml_list = convert_to_json_chatml(main_messages)\n            chatml_list.insert(\n                0,\n                {\n                    \"role\": \"system\",\n                    \"content\": main_agent_history.get(\"system_prompt\", \"\"),\n                },\n            )\n            # Save main agent chat records\n            main_output_file = output_dir / f\"{input_filename}_main_agent_chatml.json\"\n            with open(main_output_file, \"w\", encoding=\"utf-8\") as f:\n                json.dump(chatml_list, f, ensure_ascii=False, indent=2)\n\n            print(f\"✓ Saved main agent chat records: {main_output_file}\")\n\n    # 2. Extract sub_agent_message_history_sessions\n    sub_agent_sessions = log_data.get(\"sub_agent_message_history_sessions\", {})\n    if sub_agent_sessions:\n        for session_name, session_data in sub_agent_sessions.items():\n            if \"message_history\" in session_data:\n                sub_agent_messages = session_data[\"message_history\"]\n                if sub_agent_messages:\n                    chatml_list = convert_to_json_chatml(sub_agent_messages)\n                    chatml_list.insert(\n                        0,\n                        {\n                            \"role\": \"system\",\n                            \"content\": session_data.get(\"system_prompt\", \"\"),\n                        },\n                    )\n\n                    # Save browser agent chat records\n                    sub_agent_output_file = (\n                        output_dir / f\"{input_filename}_{session_name}_chatml.json\"\n                    )\n                    with open(sub_agent_output_file, \"w\", encoding=\"utf-8\") as f:\n 
                       json.dump(chatml_list, f, ensure_ascii=False, indent=2)\n\n                    print(f\"✓ Saved sub agent chat records: {sub_agent_output_file}\")\n\n\ndef main():\n    \"\"\"Main function\"\"\"\n    if len(sys.argv) < 2:\n        print(\"Usage: python convert_non_oai_to_chatml.py <log_file_path> [output_dir]\")\n        print(\n            \"Example: python convert_non_oai_to_chatml.py logs/debug_logs/task_1.json\"\n        )\n        print(\n            \"Example: python convert_non_oai_to_chatml.py logs/debug_logs/task_1.json ./extracted_chats\"\n        )\n        sys.exit(1)\n\n    log_file_path = Path(sys.argv[1])\n    output_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else Path(\"extracted_chats\")\n\n    # Check if input file exists\n    if not log_file_path.exists():\n        print(f\"Error: Log file does not exist: {log_file_path}\")\n        sys.exit(1)\n\n    try:\n        # Read log file\n        print(f\"Reading log file: {log_file_path}\")\n        with open(log_file_path, \"r\", encoding=\"utf-8\") as f:\n            log_data = json.load(f)\n\n        # Extract input filename (without extension)\n        input_filename = log_file_path.stem\n\n        # Extract and save chat history\n        print(f\"Extracting chat history to: {output_dir}\")\n        extract_and_save_chat_history(log_data, output_dir, input_filename)\n\n        print(\"\\n✓ Chat history extraction completed!\")\n        print(f\"Output directory: {output_dir.absolute()}\")\n\n    except json.JSONDecodeError as e:\n        print(f\"Error: Cannot parse JSON file: {e}\")\n        sys.exit(1)\n    except Exception as e:\n        print(f\"Error: {e}\")\n        sys.exit(1)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "apps/collect-trace/utils/converters/convert_oai_to_chatml.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport ast\nimport json\nimport os\nimport sys\nfrom copy import deepcopy\nfrom datetime import datetime\nfrom pathlib import Path\nfrom typing import Any, Dict\n\nfrom system_prompts import (\n    main_system_prompt_foreword,\n    sub_agent_system_prompt_foreword,\n    system_prompt_tool_instrcutions,\n)\n\n# Initialize creation_time_str with current time\ncreation_time_str = datetime.now().strftime(\"%Y-%m-%d\")\n\n\ndef oai_tool_message_to_chat_message(oai_messages, agent_type, tool_definition):\n    def convert_oai_tool_call_to_mcp_tool_call_str(oai_tool_call):\n        if isinstance(oai_tool_call, list):\n            assert len(oai_tool_call) >= 1\n        if isinstance(oai_tool_call, str):\n            oai_tool_call = [json.loads(oai_tool_call)]\n\n        mcp_tool_call_templates = []\n        for each_oai_tool_call in oai_tool_call:\n            assert isinstance(\n                each_oai_tool_call, dict\n            ), f\"oai_tool_call should be a dict, but got {type(each_oai_tool_call)}\"\n\n            server_name, tool_name = each_oai_tool_call[\"function\"][\"name\"].rsplit(\n                \"-\", maxsplit=1\n            )\n            arguments = json.loads(each_oai_tool_call[\"function\"][\"arguments\"])\n            mcp_tool_call_template = f\"<use_mcp_tool>\\n<server_name>{server_name}</server_name>\\n<tool_name>{tool_name}</tool_name>\\n<arguments>\\n{json.dumps(arguments)}\\n</arguments>\\n</use_mcp_tool>\"\n            mcp_tool_call_templates.append(mcp_tool_call_template)\n\n        return \"\\n\\n\".join(mcp_tool_call_templates)\n\n    def safe_get_text(content):\n        \"\"\"Safely extract text content, handling different content formats\"\"\"\n        if isinstance(content, list) and content:\n            if isinstance(content[0], dict) and \"text\" in content[0]:\n                return content[0][\"text\"]\n            elif 
isinstance(content[0], str):\n                return content[0]\n            else:\n                return str(content[0])\n        elif isinstance(content, str):\n            return content\n        elif content is None:\n            return \"\"\n        else:\n            return str(content)\n\n    def generate_mcp_servers_str(tool_definition):\n        mcp_servers_str = \"\"\n        if tool_definition and len(tool_definition) > 0:\n            for server in tool_definition:\n                mcp_servers_str += f\"## Server name: {server['name']}\\n\"\n                if \"tools\" in server and len(server[\"tools\"]) > 0:\n                    for tool in server[\"tools\"]:\n                        # Skip tools that failed to load (they only have 'error' key)\n                        if \"error\" in tool and \"name\" not in tool:\n                            continue\n                        mcp_servers_str += f\"### Tool name: {tool['name']}\\n\"\n                        mcp_servers_str += f\"Description: {tool['description']}\\n\"\n                        mcp_servers_str += f\"Input JSON schema: {tool['schema']}\\n\"\n        return mcp_servers_str\n\n    oai_messages = deepcopy(oai_messages)\n    chat_messages = []\n    idx = 0\n    pending_user_tool_contents = []\n\n    # Merge pending_user_tool_contents into a single user message and add to chat_messages\n    def flush_pending(pending_user_tool_contents, chat_messages):\n        if pending_user_tool_contents:\n            combined_content = \"\\n\\n\".join(pending_user_tool_contents)\n            chat_messages.append(\n                {\n                    \"role\": \"user\",\n                    \"content\": combined_content,\n                }\n            )\n        return []  # Always return a new empty list\n\n    try:\n        for idx, msg in enumerate(oai_messages):\n            if msg[\"role\"] in [\"developer\", \"system\"]:\n                assert idx == 0, \"System messages should be the first 
message\"\n\n                time_str = f\" Today is: {creation_time_str}\\n\"\n                tool_definition_str = generate_mcp_servers_str(tool_definition)\n                ori_system_prompt = msg[\"content\"][0][\"text\"]\n\n                system_prompt_after_general_objective = ori_system_prompt[\n                    ori_system_prompt.find(\"# General Objective\") :\n                ]\n\n                if agent_type == \"main\":\n                    system_prompt = (\n                        main_system_prompt_foreword\n                        + time_str\n                        + system_prompt_tool_instrcutions\n                        + tool_definition_str\n                        + system_prompt_after_general_objective\n                    )\n                elif agent_type == \"sub_agent\":\n                    system_prompt = (\n                        sub_agent_system_prompt_foreword\n                        + time_str\n                        + system_prompt_tool_instrcutions\n                        + tool_definition_str\n                        + system_prompt_after_general_objective\n                    )\n                else:\n                    raise ValueError(f\"Unknown agent type: {agent_type}\")\n\n                chat_messages.append(\n                    {\n                        \"role\": \"system\",\n                        \"content\": system_prompt,\n                    }\n                )\n\n            elif msg[\"role\"] in [\"user\", \"tool\"]:\n                content = safe_get_text(msg[\"content\"])\n                pending_user_tool_contents.append(content)\n            elif msg[\"role\"] == \"assistant\" and \"tool_calls\" in msg:\n                # Flush pending user/tool messages\n                pending_user_tool_contents = flush_pending(\n                    pending_user_tool_contents, chat_messages\n                )\n\n                content = safe_get_text(msg.get(\"content\", \"\"))\n\n                if content != 
\"\":\n                    content += \"\\n\\n\"  # Concatenate thinking text with tool call\n\n                chat_messages.append(\n                    {\n                        \"role\": \"assistant\",\n                        \"content\": content\n                        + convert_oai_tool_call_to_mcp_tool_call_str(msg[\"tool_calls\"]),\n                    }\n                )\n            elif msg[\"role\"] == \"assistant\" and \"tool_calls\" not in msg:\n                # Flush pending user/tool messages\n                pending_user_tool_contents = flush_pending(\n                    pending_user_tool_contents, chat_messages\n                )\n\n                content = safe_get_text(msg[\"content\"])\n\n                chat_messages.append(\n                    {\n                        \"role\": \"assistant\",\n                        \"content\": content,\n                    }\n                )\n            else:\n                raise ValueError(f\"Unknown role: {msg['role']}\")\n\n        assert (\n            len(pending_user_tool_contents) == 0\n        ), \"Error: Trace ends with user/tool round. 
Pending user/tool contents should be empty.\"\n\n    except Exception as e:\n        raise ValueError(f\"Error processing messages: {e}\")\n\n    return chat_messages\n\n\ndef extract_message_history_from_log(\n    log_data: Dict[str, Any],\n):\n    \"\"\"\n    Extract message history from log data and convert to OpenAI ChatML format\n\n    Args:\n        log_data: Log data dictionary\n\n    Returns:\n        Dictionary containing main_agent and sub_agents message history\n    \"\"\"\n    result = {\"main_agent\": [], \"sub_agents\": {}}\n\n    # Extract main_agent_message_history\n    main_agent_history = log_data.get(\"main_agent_message_history\", {})\n    if main_agent_history and \"message_history\" in main_agent_history:\n        main_messages = main_agent_history[\"message_history\"]\n        if main_messages:\n            tool_main_agent_definition = extract_step_message(\n                log_data, \"get_main_tool_definitions\"\n            )\n\n            result[\"main_agent\"] = oai_tool_message_to_chat_message(\n                main_messages,\n                \"main\",\n                tool_main_agent_definition,\n            )\n\n    # Extract sub_agent_message_history_sessions\n    sub_agent_sessions = log_data.get(\"sub_agent_message_history_sessions\", {})\n    if sub_agent_sessions:\n        for session_name, session_data in sub_agent_sessions.items():\n            if \"message_history\" in session_data:\n                sub_agent_messages = session_data[\"message_history\"]\n                if sub_agent_messages:\n                    sub_agent_type = session_name.split(\"_\")[0]\n\n                    tool_sub_agent_definition = extract_step_message(\n                        log_data, f\"get_sub_{sub_agent_type}_tool_definitions\"\n                    )\n                    result[\"sub_agents\"][session_name] = (\n                        oai_tool_message_to_chat_message(\n                            sub_agent_messages, \"sub_agent\", 
tool_sub_agent_definition\n                        )\n                    )\n\n    return result\n\n\ndef save_chatml_to_files(\n    chatml_data: Dict[str, Any],\n    output_dir: Path,\n    input_filename: str,\n):\n    \"\"\"\n    Save ChatML format messages to files\n\n    Args:\n        chatml_data: Dictionary containing message history\n        output_dir: Output directory\n        input_filename: Input filename (without extension)\n    \"\"\"\n    # Ensure output directory exists\n    output_dir.mkdir(parents=True, exist_ok=True)\n\n    # Save main agent messages\n    if chatml_data[\"main_agent\"]:\n        main_output_file = output_dir / f\"{input_filename}_main_agent_chatml.json\"\n        with open(main_output_file, \"w\", encoding=\"utf-8\") as f:\n            json.dump(chatml_data[\"main_agent\"], f, ensure_ascii=False, indent=2)\n        print(f\"✓ Saved main agent ChatML: {main_output_file}\")\n\n    # Save sub agent messages\n    for session_name, messages in chatml_data[\"sub_agents\"].items():\n        # Extract numeric suffix\n\n        sub_agent_output_file = (\n            output_dir / f\"{input_filename}_{session_name}_chatml.json\"\n        )\n\n        with open(sub_agent_output_file, \"w\", encoding=\"utf-8\") as f:\n            json.dump(messages, f, ensure_ascii=False, indent=2)\n        print(f\"✓ Saved sub agent {session_name} ChatML: {sub_agent_output_file}\")\n\n\ndef extract_step_message(data, target_step_name):\n    try:\n        # Check if step_logs field exists\n        if \"step_logs\" not in data:\n            print(\"step_logs field not found in log file\")\n            return None\n\n        # Iterate through step_logs to find target step_name\n        for i, step in enumerate(data[\"step_logs\"]):\n            step_name = step.get(\"step_name\")\n            if step_name == target_step_name:\n                message = step.get(\"message\")\n                return ast.literal_eval(message)\n\n        print(f\"No record found 
with step_name '{target_step_name}'\")\n        return None\n\n    except Exception as e:\n        print(f\"Error processing file: {e}\")\n        return None\n\n\ndef process_log_file(log_file_path: str, output_dir: str = \"extracted_chatml\"):\n    \"\"\"\n    Process a single log file, extract message history and convert to ChatML format\n\n    Args:\n        log_file_path: Log file path\n        output_dir: Output directory\n    \"\"\"\n    log_path = Path(log_file_path)\n    output_path = Path(output_dir)\n\n    if not log_path.exists():\n        print(f\"Error: Log file does not exist: {log_file_path}\")\n        return\n\n    # Get file creation time\n    global creation_time_str\n    try:\n        stat_info = os.stat(log_path)\n        creation_time = datetime.fromtimestamp(stat_info.st_ctime)\n        creation_time_str = creation_time.strftime(\"%Y-%m-%d\")\n        print(f\"File creation time: {creation_time_str}\")\n    except Exception as e:\n        print(f\"Warning: Could not get file creation time: {e}\")\n\n    try:\n        # Read log file\n        print(f\"Reading log file: {log_path}\")\n        with open(log_path, \"r\", encoding=\"utf-8\") as f:\n            log_data = json.load(f)\n\n        # Extract input filename (without extension)\n        input_filename = log_path.stem\n\n        # Extract message history and convert to ChatML format\n        print(\"Extracting message history...\")\n        chatml_data = extract_message_history_from_log(log_data)\n\n        # Save to files\n        print(f\"Saving ChatML files to: {output_path}\")\n        save_chatml_to_files(chatml_data, output_path, input_filename)\n\n        print(\"\\n✓ Processing completed!\")\n        print(f\"Output directory: {output_path.absolute()}\")\n\n    except json.JSONDecodeError as e:\n        print(f\"Error: Cannot parse JSON file: {e}\")\n    except Exception as e:\n        print(f\"Error: {e}\")\n\n\ndef main():\n    \"\"\"Main function\"\"\"\n    if len(sys.argv) < 
2:\n        print(\"Usage: python convert_oai_to_chatml.py <log_file_path> [output_dir]\")\n        print(\"Example: python convert_oai_to_chatml.py logs/debug_logs/task_1.json\")\n        print(\n            \"Example: python convert_oai_to_chatml.py logs/debug_logs/task_1.json ./extracted_chatml\"\n        )\n        sys.exit(1)\n\n    log_file_path = sys.argv[1]\n    output_dir = sys.argv[2] if len(sys.argv) > 2 else \"extracted_chatml\"\n\n    process_log_file(log_file_path, output_dir)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "apps/collect-trace/utils/converters/convert_to_chatml_auto_batch.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport json\nimport subprocess\nimport sys\nfrom pathlib import Path\nfrom typing import Dict, List\n\n\ndef get_llm_provider(json_file_path: str) -> str:\n    \"\"\"\n    Extract llm_provider from JSON file\n\n    Args:\n        json_file_path: Path to JSON file\n\n    Returns:\n        llm_provider value or 'unknown' if not found\n    \"\"\"\n    try:\n        with open(json_file_path, \"r\", encoding=\"utf-8\") as f:\n            data = json.load(f)\n\n        # Extract llm_provider from env_info\n        provider = data.get(\"env_info\", {}).get(\"llm_provider\")\n        if provider:\n            return provider\n        else:\n            return \"unknown\"\n    except Exception as e:\n        print(f\"Error reading JSON file {json_file_path}: {e}\")\n        return \"error\"\n\n\ndef determine_conversion_method(provider: str) -> str:\n    \"\"\"\n    Determine conversion method based on provider\n\n    Args:\n        provider: LLM provider name\n\n    Returns:\n        'oai' for OpenAI, 'non-oai' for others\n    \"\"\"\n    if provider.lower() in [\"openai\", \"claude_newapi\", \"deepseek_newapi\"]:\n        return \"oai\"\n    else:\n        return \"non-oai\"\n\n\ndef get_script_paths() -> tuple:\n    \"\"\"\n    Get paths to conversion scripts\n\n    Returns:\n        Tuple of (oai_script_path, non_oai_script_path)\n    \"\"\"\n    # Get directory of current script\n    current_dir = Path(__file__).parent\n\n    oai_script = current_dir / \"convert_oai_to_chatml.py\"\n    non_oai_script = current_dir / \"convert_non_oai_to_chatml.py\"\n\n    # Check if scripts exist\n    if not oai_script.exists():\n        raise FileNotFoundError(f\"OAI conversion script not found: {oai_script}\")\n\n    if not non_oai_script.exists():\n        raise FileNotFoundError(\n            f\"Non-OAI conversion script not found: {non_oai_script}\"\n        )\n\n    
return str(oai_script), str(non_oai_script)\n\n\ndef process_single_file(json_file_path: str, output_dir: str) -> bool:\n    \"\"\"\n    Process a single JSON file\n\n    Args:\n        json_file_path: Path to JSON file\n        output_dir: Output directory\n\n    Returns:\n        True if successful, False otherwise\n    \"\"\"\n    try:\n        # Get llm_provider\n        provider = get_llm_provider(json_file_path)\n\n        if provider == \"error\":\n            print(f\"❌ Failed to read provider from: {json_file_path}\")\n            return False\n\n        # Determine conversion method\n        conversion_method = determine_conversion_method(provider)\n\n        # Get script paths\n        oai_script, non_oai_script = get_script_paths()\n\n        # Choose script based on conversion method\n        if conversion_method == \"oai\":\n            script_path = oai_script\n            print(f\"🔧 Using OAI conversion for provider: {provider}\")\n        else:\n            script_path = non_oai_script\n            print(f\"🔧 Using Non-OAI conversion for provider: {provider}\")\n\n        # Run conversion script\n        result = subprocess.run(\n            [sys.executable, script_path, json_file_path, output_dir],\n            capture_output=True,\n            text=True,\n        )\n\n        if result.returncode == 0:\n            print(f\"✅ Successfully processed: {json_file_path}\")\n            return True\n        else:\n            print(f\"❌ Failed to process {json_file_path}: {result.stderr}\")\n            return False\n\n    except Exception as e:\n        print(f\"❌ Error processing {json_file_path}: {e}\")\n        return False\n\n\ndef find_json_files(input_paths: List[str]) -> List[str]:\n    \"\"\"\n    Find JSON files from input paths\n\n    Args:\n        input_paths: List of file paths, directories, or patterns\n\n    Returns:\n        List of JSON file paths\n    \"\"\"\n    json_files = []\n\n    for path in input_paths:\n        path_obj = 
Path(path)\n\n        if path_obj.is_file():\n            # Single file\n            if path_obj.suffix.lower() == \".json\":\n                json_files.append(str(path_obj))\n        elif path_obj.is_dir():\n            # Directory - find all JSON files\n            for json_file in path_obj.glob(\"*.json\"):\n                json_files.append(str(json_file))\n        else:\n            # Pattern matching\n            try:\n                for json_file in Path(\".\").glob(path):\n                    if json_file.suffix.lower() == \".json\":\n                        json_files.append(str(json_file))\n            except Exception:\n                print(f\"Warning: Could not process pattern: {path}\")\n\n    return json_files\n\n\ndef batch_process_files(input_paths: List[str], output_dir: str) -> Dict[str, int]:\n    \"\"\"\n    Batch process multiple files\n\n    Args:\n        input_paths: List of input paths\n        output_dir: Output directory\n\n    Returns:\n        Dictionary with processing statistics\n    \"\"\"\n    # Find JSON files\n    json_files = find_json_files(input_paths)\n\n    if not json_files:\n        print(\"❌ No JSON files found in the specified paths\")\n        return {\"total\": 0, \"success\": 0, \"failed\": 0}\n\n    print(f\"📁 Found {len(json_files)} JSON files to process\")\n\n    # Create output directory\n    Path(output_dir).mkdir(parents=True, exist_ok=True)\n\n    # Process files\n    success_count = 0\n    failed_count = 0\n\n    for json_file in json_files:\n        if process_single_file(json_file, output_dir):\n            success_count += 1\n        else:\n            failed_count += 1\n\n    return {\"total\": len(json_files), \"success\": success_count, \"failed\": failed_count}\n\n\ndef show_help():\n    \"\"\"Show help information\"\"\"\n    help_text = \"\"\"\nAuto ChatML Conversion Script\n============================\n\nAutomatically determines conversion method based on llm_provider field in JSON 
files\n\nUsage:\n  python convert_to_chatml_auto_batch.py <input_paths...> [output_dir]\n  python convert_to_chatml_auto_batch.py <log_dir> [output_dir]\n  python convert_to_chatml_auto_batch.py <log_file_pattern> [output_dir]\n\nParameters:\n  input_paths: JSON files, directories, or patterns\n  output_dir: Output directory (optional, default: extracted_chatml)\n\nExamples:\n  python convert_to_chatml_auto_batch.py logs/debug_logs/\n  python convert_to_chatml_auto_batch.py logs/debug_logs/*.json\n  python convert_to_chatml_auto_batch.py logs/debug_logs/ ./my_output\n  python convert_to_chatml_auto_batch.py task_1.json task_2.json\n\nConversion Logic:\n  - If llm_provider is 'openai', 'claude_newapi', or 'deepseek_newapi' (case-insensitive): Use convert_oai_to_chatml.py\n  - If llm_provider = anything else: Use convert_non_oai_to_chatml.py\n\nFeatures:\n  1. Auto-detect conversion method per file\n  2. Batch process log files\n  3. Extract main_agent_message_history\n  4. Extract browser_agent_message_history_sessions\n  5. Convert to OpenAI ChatML format\n  6. Save as separate files\n  7. 
Generate processing summary\n\"\"\"\n    print(help_text)\n\n\ndef main():\n    \"\"\"Main function\"\"\"\n    # Check for help\n    if len(sys.argv) < 2 or sys.argv[1] in [\"-h\", \"--help\"]:\n        show_help()\n        return\n\n    # Parse arguments\n    args = sys.argv[1:]\n\n    # Check if last argument is output directory\n    if len(args) > 1 and not args[-1].startswith(\"-\"):\n        # Check if last argument looks like a directory\n        last_arg = args[-1]\n        if (\n            last_arg.endswith(\"/\")\n            or not Path(last_arg).suffix\n            or last_arg == \"extracted_chatml\"\n            or last_arg.startswith(\"./\")\n        ):\n            output_dir = last_arg\n            input_paths = args[:-1]\n        else:\n            output_dir = \"extracted_chatml\"\n            input_paths = args\n    else:\n        output_dir = \"extracted_chatml\"\n        input_paths = args\n\n    print(\"🚀 Starting auto ChatML conversion\")\n    print(f\"📂 Input paths: {input_paths}\")\n    print(f\"📁 Output directory: {output_dir}\")\n\n    try:\n        # Check if conversion scripts exist\n        get_script_paths()\n\n        # Process files\n        stats = batch_process_files(input_paths, output_dir)\n\n        # Show results\n        print(\"\\n\" + \"=\" * 50)\n        print(\"📊 Processing Summary\")\n        print(\"=\" * 50)\n        print(f\"Total files: {stats['total']}\")\n        print(f\"Successfully processed: {stats['success']}\")\n        print(f\"Failed: {stats['failed']}\")\n        print(f\"Output directory: {Path(output_dir).absolute()}\")\n\n        if stats[\"failed\"] > 0:\n            print(f\"\\n⚠️  {stats['failed']} files failed to process\")\n            sys.exit(1)\n        else:\n            print(\"\\n✅ All files processed successfully!\")\n\n    except FileNotFoundError as e:\n        print(f\"❌ {e}\")\n        sys.exit(1)\n    except Exception as e:\n        print(f\"❌ Unexpected error: {e}\")\n        
sys.exit(1)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "apps/collect-trace/utils/converters/example_usage.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport json\nimport os\nimport sys\nimport tempfile\nfrom pathlib import Path\n\n# Add parent directory to Python path\nsys.path.insert(0, os.path.join(os.path.dirname(__file__), \"..\", \"..\"))\n\nfrom utils.converters import (\n    extract_and_save_chat_history,\n    extract_message_history_from_log,\n)\n\n\ndef example_1_basic_conversion():\n    \"\"\"Example 1: Basic conversion using Python API\"\"\"\n    print(\"=== Example 1: Basic Conversion ===\")\n\n    # Sample log data\n    log_data = {\n        \"main_agent_message_history\": {\n            \"system_prompt\": \"You are a helpful assistant.\",\n            \"message_history\": [\n                {\n                    \"role\": \"developer\",\n                    \"content\": [\n                        {\"type\": \"text\", \"text\": \"You are a helpful assistant.\"}\n                    ],\n                },\n                {\n                    \"role\": \"user\",\n                    \"content\": [{\"type\": \"text\", \"text\": \"Hello, how are you?\"}],\n                },\n                {\n                    \"role\": \"assistant\",\n                    \"content\": [{\"type\": \"text\", \"text\": \"I'm doing well, thank you!\"}],\n                },\n            ],\n        },\n        \"browser_agent_message_history_sessions\": {\n            \"browser_agent_1\": {\n                \"system_prompt\": \"You are a browsing agent.\",\n                \"message_history\": [\n                    {\n                        \"role\": \"developer\",\n                        \"content\": [\n                            {\"type\": \"text\", \"text\": \"You are a browsing agent.\"}\n                        ],\n                    },\n                    {\n                        \"role\": \"user\",\n                        \"content\": [{\"type\": \"text\", \"text\": \"Search for 
something\"}],\n                    },\n                    {\n                        \"role\": \"assistant\",\n                        \"content\": [{\"type\": \"text\", \"text\": \"I found it.\"}],\n                    },\n                ],\n            }\n        },\n        \"env_info\": {\"llm_provider\": \"openai\"},\n    }\n\n    # Convert using OAI method\n    chatml_data = extract_message_history_from_log(log_data)\n    print(\n        f\"OAI conversion result: {len(chatml_data['main_agent'])} messages in main agent\"\n    )\n    print(\n        f\"OAI conversion result: {len(chatml_data['browser_agents']['browser_agent_1'])} messages in browser agent\"\n    )\n\n    # Convert using Non-OAI method\n    with tempfile.TemporaryDirectory() as temp_dir:\n        temp_path = Path(temp_dir)\n        extract_and_save_chat_history(log_data, temp_path, \"example\")\n\n        # Check generated files\n        main_file = temp_path / \"example_main_agent_chatml.json\"\n        browser_file = temp_path / \"example_browser_agent_1_chatml.json\"\n\n        if main_file.exists():\n            with open(main_file, \"r\") as f:\n                main_content = json.load(f)\n                print(\n                    f\"Non-OAI conversion result: {len(main_content)} messages in main agent\"\n                )\n\n        if browser_file.exists():\n            with open(browser_file, \"r\") as f:\n                browser_content = json.load(f)\n                print(\n                    f\"Non-OAI conversion result: {len(browser_content)} messages in browser agent\"\n                )\n\n\nif __name__ == \"__main__\":\n    print(\"ChatML Conversion Utilities - Usage Examples\")\n    print(\"=\" * 50)\n\n    example_1_basic_conversion()\n\n    print(\"\\n\" + \"=\" * 50)\n    print(\"Examples completed successfully!\")\n    print(\"\\nFor more information, see the README.md file.\")\n"
  },
  {
    "path": "apps/collect-trace/utils/converters/system_prompts.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nmain_system_prompt_foreword = \"\"\"In this environment you have access to a set of tools you can use to answer the user's question. \\n    \\nYou only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use.\"\"\"\n\nsub_agent_system_prompt_foreword = \"\"\"In this environment you have access to a set of tools you can use to answer the user's question. \\n    \\nYou only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use.\"\"\"\n\nsystem_prompt_tool_instrcutions = \"\"\"# Tool-Use Formatting Instructions \\n\\nTool-use is formatted using XML-style tags. The tool-use is enclosed in <use_mcp_tool></use_mcp_tool> and each parameter is similarly enclosed within its own set of tags.\\n\\nThe Model Context Protocol (MCP) connects to servers that provide additional tools and resources to extend your capabilities. You can use the server's tools via the `use_mcp_tool`.\\n\\nDescription: \\nRequest to use a tool provided by a MCP server. Each MCP server can provide multiple tools with different capabilities. 
Tools have defined input schemas that specify required and optional parameters.\\n\\nParameters:\\n- server_name: (required) The name of the MCP server providing the tool\\n- tool_name: (required) The name of the tool to execute\\n- arguments: (required) A JSON object containing the tool's input parameters, following the tool's input schema, quotes within string must be properly escaped, ensure it's valid JSON\\n\\nUsage:\\n<use_mcp_tool>\\n<server_name>server name here</server_name>\\n<tool_name>tool name here</tool_name>\\n<arguments>\\n{\\n\\\"param1\\\": \\\"value1\\\",\\n\\\"param2\\\": \\\"value2 \\\\\\\"escaped string\\\\\\\"\\\"\\n}\\n</arguments>\\n</use_mcp_tool>\\n\\nImportant Notes:\\n- Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags.\\n- Always adhere to this format for the tool use to ensure proper parsing and execution.\\n\\nString and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. The output is not expected to be valid XML and is parsed with regular expressions.\\nHere are the functions available in JSONSchema format:\\n\\n\"\"\"\n"
  },
  {
    "path": "apps/collect-trace/utils/merge_chatml_msgs_to_one_json.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport argparse\nimport glob\nimport json\nimport os\n\n\ndef merge_json_files(input_dir, type=\"main\"):\n    # List to store all messages\n    all_conversations = []\n\n    # Get all JSON files matching the pattern\n    json_files = glob.glob(os.path.join(input_dir, f\"*{type}*.json\"))\n\n    # Read each JSON file and merge its content\n    for json_file in json_files:\n        try:\n            with open(json_file, \"r\", encoding=\"utf-8\") as f:\n                data = json.load(f)\n                conversation = {\n                    \"messages\": data,\n                }\n                all_conversations.append(conversation)\n            print(f\"Successfully processed: {json_file}\")\n        except Exception as e:\n            print(f\"Error processing {json_file}: {str(e)}\")\n\n    output_file = os.path.join(input_dir, f\"{type}_merged.json\")\n    # Write the merged data to a new JSON file\n    with open(output_file, \"w\", encoding=\"utf-8\") as f:\n        json.dump(all_conversations, f, ensure_ascii=False, indent=2)\n\n    print(\n        f\"\\nMerging complete! All {type} JSON files have been merged into {output_file}\"\n    )\n    print(f\"Total number of files processed: {len(json_files)}\")\n    print(f\"Total number of messages: {len(all_conversations)}\")\n\n\ndef main():\n    parser = argparse.ArgumentParser(\n        description=\"Merge multiple JSON files which contain chat messages into a single file\"\n    )\n    parser.add_argument(\n        \"--input_dir\",\n        type=str,\n        required=True,\n        help=\"File pattern with wildcards to match JSON files (e.g., '*.json' or 'data/*main*.json')\",\n    )\n\n    args = parser.parse_args()\n\n    merge_json_files(args.input_dir, type=\"main_agent\")\n    merge_json_files(args.input_dir, type=\"agent-browsing\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "apps/collect-trace/utils/process_logs.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport argparse\nimport json\nimport os\nimport shutil\n\n\ndef get_successful_log_paths(jsonl_file_path: str) -> list:\n    \"\"\"\n    Collects the paths of successful log files from a dataset.\n\n    This function extracts log file paths of successful records based on\n    the value of `final_judge_result`. If the dataset has been fully\n    processed, it reads from a `benchmark_results.jsonl` file. Otherwise,\n    if processing was interrupted, it falls back to scanning individual\n    `.json` files in the given directory.\n\n    Success is determined by:\n    - `PASS_AT_K_SUCCESS` for records in JSONL files.\n    - `CORRECT` for records in individual JSON files.\n\n    Args:\n        jsonl_file_path (str): Path to a JSONL file or a directory of JSON files.\n\n    Returns:\n        list: A list of log file paths for successful records.\n    \"\"\"\n    log_paths = []\n\n    if jsonl_file_path.endswith(\".jsonl\"):\n        with open(jsonl_file_path, \"r\", encoding=\"utf-8\") as f:\n            for line in f:\n                line = line.strip()\n                if line:\n                    try:\n                        data = json.loads(line)\n                        if data.get(\"final_judge_result\") == \"PASS_AT_K_SUCCESS\":\n                            log_path = data.get(\"log_file_path\")\n                            if log_path:\n                                log_paths.append(log_path)\n                    except json.JSONDecodeError:\n                        continue\n    else:\n        filenames = os.listdir(jsonl_file_path)\n        filenames = [filename for filename in filenames if filename.endswith(\".json\")]\n        for filename in filenames:\n            filepath = os.path.join(jsonl_file_path, filename)\n            try:\n                data = json.load(open(filepath, \"r\"))\n            except Exception:\n                
continue\n            try:\n                final_judge_result = data[\"final_judge_result\"]\n            except KeyError:\n                print(data.keys())\n                continue\n            if final_judge_result == \"CORRECT\":\n                log_paths.append(filepath)\n\n    return log_paths\n\n\n# Usage example\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser(\n        description=\"Extract successful log paths from JSONL file\"\n    )\n    parser.add_argument(\n        \"file_path\", help=\"Path to a results .jsonl file or a directory of .json logs\"\n    )\n    args = parser.parse_args()\n\n    result = get_successful_log_paths(args.file_path)\n\n    # Get the parent directory of args.file_path\n    parent_dir = os.path.abspath(os.path.dirname(args.file_path))\n\n    # Create successful logs directory\n    success_log_dir = parent_dir + \"/successful_logs\"\n    success_chatml_log_dir = parent_dir + \"/successful_chatml_logs\"\n    os.makedirs(success_log_dir, exist_ok=True)\n    print(f\"Successful logs directory: {success_log_dir}\")\n\n    for i, path in enumerate(result, 1):\n        basename = os.path.basename(path)\n        print(f\"Copying file: {path} to {success_log_dir}/{basename}\")\n        shutil.copy(path, f\"{success_log_dir}/{basename}\")\n\n    os.system(\n        f\"uv run utils/converters/convert_to_chatml_auto_batch.py {success_log_dir}/*.json -o {success_chatml_log_dir}\"\n    )\n    os.system(\n        f\"uv run utils/merge_chatml_msgs_to_one_json.py --input_dir {success_chatml_log_dir}\"\n    )\n"
  },
  {
    "path": "apps/gradio-demo/README.md",
    "content": "# Local Deep Research Demo with Gradio Web UI\n\nHost your own Deep Research demo using our [MiroThinker v1.5](https://huggingface.co/miromind-ai/MiroThinker-v1.5-30B) models and lightweight Gradio-based web interface.\n\n## 🖥️ Hardware Requirements\n\n- **GPU**: NVIDIA RTX 40xx/50xx series or equivalent\n- **VRAM**:\n  - **16GB minimum** (with Q4 quantization via llama.cpp)\n  - **48GB+ recommended** (for FP8 quantization or longer context)\n  - MiroThinker-v1.5-30B is a 30B MoE model with 3B active parameters\n\n## ⚙️ LLM Server Deployment\n\n### Download Model Checkpoints\n\nDownload the full checkpoint from Hugging Face:\n\n```python\nfrom huggingface_hub import snapshot_download\nsnapshot_download(repo_id=\"miromind-ai/MiroThinker-v1.5-30B\", local_dir=\"model/MiroThinker-v1.5-30B\")\n```\n\n### Option 1: SGLang Server (Recommended)\n\nFP8 is a highly efficient 8-bit floating point format that significantly reduces memory usage while maintaining model quality. This approach provides excellent performance for inference workloads on modern GPUs.\n\nPlease install [SGLang](https://github.com/sgl-project/sglang) first. 
Then initialize fast inference with FP8 precision:\n\n```bash\nMODEL_PATH=model/MiroThinker-v1.5-30B\n\npython3 -m sglang.launch_server \\\n    --model-path $MODEL_PATH \\\n    --mem-fraction-static 0.9 \\\n    --quantization fp8 \\\n    --tp 1 \\\n    --dp 1 \\\n    --host 0.0.0.0 \\\n    --port 61005 \\\n    --trust-remote-code\n```\n\nThis will start an OpenAI-compatible server with BASE_URL=`http://0.0.0.0:61005/v1`.\n\n### Option 2: llama.cpp (Quantized)\n\nFor memory-efficient inference, download the pre-quantized GGUF version from the community:\n\n**Note**: Thanks to the community for providing quantized versions: [mradermacher](https://huggingface.co/mradermacher)\n\n```bash\n# Download Q4_K_M quantized model (recommended balance)\nwget https://huggingface.co/mradermacher/MiroThinker-v1.5-30B-GGUF/resolve/main/MiroThinker-v1.5-30B.Q4_K_M.gguf\n```\n\nFollow the [official llama.cpp installation guide](https://github.com/ggml-org/llama.cpp) to set up the environment. After that:\n\n```bash\n# Set up model path\nMODEL_PATH=model/MiroThinker-v1.5-30B.Q4_K_M.gguf\n\n# Start the server\nllama-server -m $MODEL_PATH \\\n    --port 61005 \\\n    -ngl 99 \\\n    -v\n```\n\nThis will start an OpenAI-compatible server at `http://0.0.0.0:61005/v1`.\n\n### Other Options\n\nYou can also leverage other frameworks for model serving like Ollama, vLLM, and Text Generation Inference (TGI) for different deployment scenarios.\n\n## 🚀 Quick Start Guide\n\n### 1. 
**Environment Setup**\n\nGet your API keys:\n\n- [Serper](https://serper.dev/): 2,500 free search credits for new accounts (required for web search)\n- [E2B](https://e2b.dev/): Free tier available (required for Python code execution)\n- [Jina](https://jina.ai/): Free tier available (required for web scraping)\n\nEdit the `apps/miroflow-agent/.env` file with your API keys:\n\n```bash\n# Required - Web Search\nSERPER_API_KEY=your_serper_key\n\n# Required - Python Code Execution (E2B Cloud Sandbox)\nE2B_API_KEY=your_e2b_key\n\n# Required - Web Scraping\nJINA_API_KEY=your_jina_key\n\n# Required - Summary LLM (for webpage summarization)\n# Option 1: Use OpenAI GPT-5-Nano (recommended, cost-effective)\nSUMMARY_LLM_BASE_URL=https://api.openai.com/v1\nSUMMARY_LLM_MODEL_NAME=gpt-5-nano\nSUMMARY_LLM_API_KEY=your_openai_key\n\n# Option 2: Use MiroThinker itself (if you have enough VRAM)\n# SUMMARY_LLM_BASE_URL=http://0.0.0.0:61005/v1\n# SUMMARY_LLM_MODEL_NAME=MiroThinker\n# SUMMARY_LLM_API_KEY=none\n```\n\n### 2. **Install Dependencies**\n\nWe use [uv](https://github.com/astral-sh/uv) to manage all dependencies.\n\n```bash\ncd apps/gradio-demo\nuv sync\n```\n\n### 3. **Configure API Endpoint**\n\nSet your LLM API endpoint and API key:\n\n```bash\nexport BASE_URL=http://your-sglang-address:your-sglang-port/v1\nexport API_KEY=your_api_key  # Optional, required if your endpoint needs authentication\n```\n\n### 4. **Launch the Application**\n\n```bash\nuv run main.py\n```\n\n### 5. **Access the Web Interface**\n\nOpen your browser and navigate to: `http://localhost:8080`\n\n### 📝 Notes\n\n- Ensure your LLM server is up and running before launching the demo\n- The demo will use your local CPU/GPU for inference while leveraging external APIs for search and code execution\n- Monitor your API usage through the respective provider dashboards\n"
  },
  {
    "path": "apps/gradio-demo/main.py",
    "content": "import asyncio\nimport json\nimport logging\nimport os\nimport threading\nimport time\nimport uuid\nfrom concurrent.futures import ThreadPoolExecutor\nfrom pathlib import Path\nfrom typing import AsyncGenerator, List, Optional\n\nimport gradio as gr\nfrom dotenv import load_dotenv\nfrom hydra import compose, initialize_config_dir\nfrom omegaconf import DictConfig\nfrom prompt_patch import apply_prompt_patch\nfrom src.config.settings import expose_sub_agents_as_tools\nfrom src.core.pipeline import create_pipeline_components, execute_task_pipeline\nfrom utils import replace_chinese_punctuation\n\n# Apply custom system prompt patch (adds MiroThinker identity)\napply_prompt_patch()\n\n# Create global cleanup thread pool for operations that won't be affected by asyncio.cancel\ncleanup_executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix=\"cleanup\")\n\nlogger = logging.getLogger(__name__)\n\n# Set DEMO_MODE for simplified tool configuration\nos.environ[\"DEMO_MODE\"] = \"1\"\n\n# Load environment variables from .env file\nload_dotenv()\n\n# Global Hydra initialization flag\n_hydra_initialized = False\n\n\ndef load_miroflow_config(config_overrides: Optional[dict] = None) -> DictConfig:\n    \"\"\"\n    Load the full MiroFlow configuration using Hydra, similar to how benchmarks work.\n    \"\"\"\n    global _hydra_initialized\n\n    # Get the path to the miroflow agent config directory\n    miroflow_config_dir = Path(__file__).parent.parent / \"miroflow-agent\" / \"conf\"\n    miroflow_config_dir = miroflow_config_dir.resolve()\n    logger.debug(f\"Config dir: {miroflow_config_dir}\")\n\n    if not miroflow_config_dir.exists():\n        raise FileNotFoundError(\n            f\"MiroFlow config directory not found: {miroflow_config_dir}\"\n        )\n\n    # Initialize Hydra if not already done\n    if not _hydra_initialized:\n        try:\n            initialize_config_dir(\n                config_dir=str(miroflow_config_dir), version_base=None\n 
           )\n            _hydra_initialized = True\n        except Exception as e:\n            logger.warning(f\"Hydra already initialized or error: {e}\")\n\n    # Compose configuration with environment variable overrides\n    overrides = []\n\n    # Add environment variable based overrides (refer to scripts/debug.sh)\n    llm_provider = os.getenv(\n        \"DEFAULT_LLM_PROVIDER\", \"qwen\"\n    )  # debug.sh defaults to qwen\n    model_name = os.getenv(\n        \"DEFAULT_MODEL_NAME\", \"MiroThinker\"\n    )  # debug.sh default model\n    agent_set = os.getenv(\"DEFAULT_AGENT_SET\", \"demo\")  # Use demo config\n    base_url = os.getenv(\"BASE_URL\", \"http://localhost:11434\")\n    api_key = os.getenv(\"API_KEY\", \"\")  # API key for LLM endpoint\n    logger.debug(f\"LLM base_url: {base_url}\")\n\n    # Map provider names to config files\n    # Available configs: default.yaml, claude-3-7.yaml, gpt-5.yaml, qwen-3.yaml\n    provider_config_map = {\n        \"anthropic\": \"claude-3-7\",\n        \"openai\": \"gpt-5\",\n        \"qwen\": \"qwen-3\",\n    }\n\n    llm_config = provider_config_map.get(\n        llm_provider, \"qwen-3\"\n    )  # fallback to qwen-3 config\n    overrides.extend(\n        [\n            f\"llm={llm_config}\",\n            f\"llm.provider={llm_provider}\",\n            f\"llm.model_name={model_name}\",\n            f\"llm.base_url={base_url}\",\n            f\"llm.api_key={api_key}\",\n            f\"agent={agent_set}\",\n            \"agent.main_agent.max_turns=50\",  # Limit max turns for gradio demo\n            \"benchmark=gaia-validation\",  # refer to debug.sh\n        ]\n    )\n\n    # Add config overrides from request\n    if config_overrides:\n        for key, value in config_overrides.items():\n            if isinstance(value, dict):\n                for subkey, subvalue in value.items():\n                    overrides.append(f\"{key}.{subkey}={subvalue}\")\n            else:\n                
overrides.append(f\"{key}={value}\")\n\n    try:\n        cfg = compose(config_name=\"config\", overrides=overrides)\n        return cfg\n    except Exception as e:\n        logger.error(f\"Failed to compose Hydra config: {e}\")\n        exit()\n\n\n# Lazy loading for tool definitions to speed up page load\n# Tools will be loaded on first request instead of blocking startup\n_preload_cache = {\n    \"cfg\": None,\n    \"main_agent_tool_manager\": None,\n    \"sub_agent_tool_managers\": None,\n    \"output_formatter\": None,\n    \"tool_definitions\": None,\n    \"sub_agent_tool_definitions\": None,\n    \"loaded\": False,\n}\n_preload_lock = threading.Lock()\n\n\ndef _ensure_preloaded():\n    \"\"\"Lazy load pipeline components on first request.\"\"\"\n    global _preload_cache\n    if _preload_cache[\"loaded\"]:\n        return\n\n    with _preload_lock:\n        if _preload_cache[\"loaded\"]:\n            return\n\n        logger.info(\"Loading pipeline components (first request)...\")\n        cfg = load_miroflow_config(None)\n        main_agent_tool_manager, sub_agent_tool_managers, output_formatter = (\n            create_pipeline_components(cfg)\n        )\n        tool_definitions = asyncio.run(\n            main_agent_tool_manager.get_all_tool_definitions()\n        )\n        if cfg.agent.sub_agents:\n            tool_definitions += expose_sub_agents_as_tools(cfg.agent.sub_agents)\n\n        sub_agent_tool_definitions = {\n            name: asyncio.run(sub_agent_tool_manager.get_all_tool_definitions())\n            for name, sub_agent_tool_manager in sub_agent_tool_managers.items()\n        }\n\n        _preload_cache[\"cfg\"] = cfg\n        _preload_cache[\"main_agent_tool_manager\"] = main_agent_tool_manager\n        _preload_cache[\"sub_agent_tool_managers\"] = sub_agent_tool_managers\n        _preload_cache[\"output_formatter\"] = output_formatter\n        _preload_cache[\"tool_definitions\"] = tool_definitions\n        
_preload_cache[\"sub_agent_tool_definitions\"] = sub_agent_tool_definitions\n        _preload_cache[\"loaded\"] = True\n        logger.info(\"Pipeline components loaded successfully.\")\n\n\nclass ThreadSafeAsyncQueue:\n    \"\"\"Thread-safe async queue wrapper\"\"\"\n\n    def __init__(self):\n        self._queue = asyncio.Queue()\n        self._loop = None\n        self._closed = False\n\n    def set_loop(self, loop):\n        self._loop = loop\n\n    async def put(self, item):\n        \"\"\"Put data safely from any thread\"\"\"\n        if self._closed:\n            return\n        await self._queue.put(item)\n\n    def put_nowait_threadsafe(self, item):\n        \"\"\"Put data from other threads - use direct queue put for lower latency\"\"\"\n        if self._closed or not self._loop:\n            return\n        # Use put_nowait directly instead of creating a task for lower latency\n        self._loop.call_soon_threadsafe(lambda: self._queue.put_nowait(item))\n\n    async def get(self):\n        return await self._queue.get()\n\n    def close(self):\n        self._closed = True\n\n\ndef filter_google_search_organic(organic: List[dict]) -> List[dict]:\n    \"\"\"\n    Filter google search organic results to remove unnecessary information\n    \"\"\"\n    result = []\n    for item in organic:\n        result.append(\n            {\n                \"title\": item.get(\"title\", \"\"),\n                \"link\": item.get(\"link\", \"\"),\n            }\n        )\n    return result\n\n\ndef is_scrape_error(result: str) -> bool:\n    \"\"\"\n    Check if the scrape result is an error\n    \"\"\"\n    try:\n        json.loads(result)\n        return False\n    except json.JSONDecodeError:\n        return True\n\n\ndef filter_message(message: dict) -> dict:\n    \"\"\"\n    Filter message to remove unnecessary information\n    \"\"\"\n    if message[\"event\"] == \"tool_call\":\n        tool_name = message[\"data\"].get(\"tool_name\")\n        tool_input = 
message[\"data\"].get(\"tool_input\")\n        if (\n            tool_name == \"google_search\"\n            and isinstance(tool_input, dict)\n            and \"result\" in tool_input\n        ):\n            result_dict = json.loads(tool_input[\"result\"])\n            if \"organic\" in result_dict:\n                new_result = {\n                    \"organic\": filter_google_search_organic(result_dict[\"organic\"])\n                }\n                message[\"data\"][\"tool_input\"][\"result\"] = json.dumps(\n                    new_result, ensure_ascii=False\n                )\n        if (\n            tool_name in [\"scrape\", \"scrape_website\"]\n            and isinstance(tool_input, dict)\n            and \"result\" in tool_input\n        ):\n            # if error, it can not be json\n            if is_scrape_error(tool_input[\"result\"]):\n                message[\"data\"][\"tool_input\"] = {\"error\": tool_input[\"result\"]}\n            else:\n                message[\"data\"][\"tool_input\"] = {}\n    return message\n\n\nasync def stream_events_optimized(\n    task_id: str, query: str, _: Optional[dict] = None, disconnect_check=None\n) -> AsyncGenerator[dict, None]:\n    \"\"\"Optimized event stream generator that directly outputs structured events, no longer wrapped as SSE strings.\"\"\"\n    workflow_id = task_id\n    last_send_time = time.time()\n    last_heartbeat_time = time.time()\n\n    # Create thread-safe queue\n    stream_queue = ThreadSafeAsyncQueue()\n    stream_queue.set_loop(asyncio.get_event_loop())\n\n    cancel_event = threading.Event()\n\n    def run_pipeline_in_thread():\n        try:\n            loop = asyncio.new_event_loop()\n            asyncio.set_event_loop(loop)\n\n            class ThreadQueueWrapper:\n                def __init__(self, thread_queue, cancel_event):\n                    self.thread_queue = thread_queue\n                    self.cancel_event = cancel_event\n\n                async def put(self, item):\n     
               if self.cancel_event.is_set():\n                        logger.info(\"Pipeline cancelled, stopping execution\")\n                        return\n                    self.thread_queue.put_nowait_threadsafe(filter_message(item))\n\n            wrapper_queue = ThreadQueueWrapper(stream_queue, cancel_event)\n\n            # Ensure pipeline components are loaded (lazy loading)\n            _ensure_preloaded()\n\n            async def pipeline_with_cancellation():\n                pipeline_task = asyncio.create_task(\n                    execute_task_pipeline(\n                        cfg=_preload_cache[\"cfg\"],\n                        task_id=workflow_id,\n                        task_description=query,\n                        task_file_name=None,\n                        main_agent_tool_manager=_preload_cache[\n                            \"main_agent_tool_manager\"\n                        ],\n                        sub_agent_tool_managers=_preload_cache[\n                            \"sub_agent_tool_managers\"\n                        ],\n                        output_formatter=_preload_cache[\"output_formatter\"],\n                        stream_queue=wrapper_queue,\n                        log_dir=os.getenv(\"LOG_DIR\", \"logs/api-server\"),\n                        tool_definitions=_preload_cache[\"tool_definitions\"],\n                        sub_agent_tool_definitions=_preload_cache[\n                            \"sub_agent_tool_definitions\"\n                        ],\n                    )\n                )\n\n                async def check_cancellation():\n                    while not cancel_event.is_set():\n                        await asyncio.sleep(0.5)\n                    logger.info(\"Cancel event detected, cancelling pipeline\")\n                    pipeline_task.cancel()\n\n                cancel_task = asyncio.create_task(check_cancellation())\n\n                try:\n                    done, pending = await asyncio.wait(\n   
                     [pipeline_task, cancel_task],\n                        return_when=asyncio.FIRST_COMPLETED,\n                    )\n                    for task in pending:\n                        task.cancel()\n                    for task in done:\n                        if task == pipeline_task:\n                            try:\n                                await task\n                            except asyncio.CancelledError:\n                                logger.info(\"Pipeline task was cancelled\")\n                except Exception as e:\n                    logger.error(f\"Pipeline execution error: {e}\")\n                    pipeline_task.cancel()\n                    cancel_task.cancel()\n\n            loop.run_until_complete(pipeline_with_cancellation())\n        except Exception as e:\n            if not cancel_event.is_set():\n                logger.error(f\"Pipeline error: {e}\", exc_info=True)\n                stream_queue.put_nowait_threadsafe(\n                    {\n                        \"event\": \"error\",\n                        \"data\": {\"error\": str(e), \"workflow_id\": workflow_id},\n                    }\n                )\n        finally:\n            stream_queue.put_nowait_threadsafe(None)\n            if \"loop\" in locals():\n                loop.close()\n\n    executor = ThreadPoolExecutor(max_workers=1)\n    future = executor.submit(run_pipeline_in_thread)\n\n    try:\n        while True:\n            try:\n                if disconnect_check and await disconnect_check():\n                    logger.info(\"Client disconnected, stopping pipeline\")\n                    cancel_event.set()\n                    break\n                message = await asyncio.wait_for(stream_queue.get(), timeout=0.1)\n                if message is None:\n                    logger.info(\"Pipeline completed\")\n                    break\n                yield message\n                last_send_time = time.time()\n            except 
asyncio.TimeoutError:\n                current_time = time.time()\n                if current_time - last_send_time > 300:\n                    logger.info(\"Stream timeout\")\n                    break\n                if future.done():\n                    try:\n                        message = stream_queue._queue.get_nowait()\n                        if message is not None:\n                            yield message\n                            continue\n                    except Exception:\n                        break\n                if current_time - last_heartbeat_time >= 15:\n                    yield {\n                        \"event\": \"heartbeat\",\n                        \"data\": {\"timestamp\": current_time, \"workflow_id\": workflow_id},\n                    }\n                    last_heartbeat_time = current_time\n    except Exception as e:\n        logger.error(f\"Stream error: {e}\", exc_info=True)\n        yield {\n            \"event\": \"error\",\n            \"data\": {\"workflow_id\": workflow_id, \"error\": f\"Stream error: {str(e)}\"},\n        }\n    finally:\n        cancel_event.set()\n        stream_queue.close()\n        try:\n            future.result(timeout=1.0)\n        except Exception:\n            pass\n        executor.shutdown(wait=False)\n\n\n# ========================= Gradio Integration =========================\n\n\ndef _init_render_state():\n    return {\n        \"agent_order\": [],\n        \"agents\": {},  # agent_id -> {\"agent_name\": str, \"tool_call_order\": [], \"tools\": {tool_call_id: {...}}}\n        \"current_agent_id\": None,\n        \"errors\": [],\n    }\n\n\ndef _format_think_content(text: str) -> str:\n    \"\"\"Convert <think> tags to readable markdown format.\"\"\"\n    import re\n\n    # Replace <think> tags with blockquote format (no label)\n    text = re.sub(r\"<think>\\s*\", \"\\n> \", text)\n    text = re.sub(r\"\\s*</think>\", \"\\n\", text)\n    # Convert newlines within thinking to 
blockquote continuation\n    lines = text.split(\"\\n\")\n    result = []\n    in_thinking = False\n    for line in lines:\n        if line.strip().startswith(\">\") and not in_thinking:\n            in_thinking = True\n            result.append(line)\n        elif in_thinking and line.strip() and not line.startswith(\">\"):\n            result.append(f\"> {line}\")\n        else:\n            if line.strip() == \"\" and in_thinking:\n                in_thinking = False\n            result.append(line)\n    return \"\\n\".join(result)\n\n\ndef _append_show_text(tool_entry: dict, delta: str):\n    existing = tool_entry.get(\"content\", \"\")\n    # Skip \"Final boxed answer\" content (already shown in main response)\n    if \"Final boxed answer\" in delta:\n        return\n    # Format think tags for display\n    formatted_delta = _format_think_content(delta)\n    tool_entry[\"content\"] = existing + formatted_delta\n\n\ndef _is_empty_payload(value) -> bool:\n    if value is None:\n        return True\n    if isinstance(value, str):\n        stripped = value.strip()\n        return stripped == \"\" or stripped in (\"{}\", \"[]\")\n    if isinstance(value, (dict, list, tuple, set)):\n        return len(value) == 0\n    return False\n\n\ndef _format_search_results(tool_input: dict, tool_output: dict) -> str:\n    \"\"\"Format google_search results in a beautiful card layout.\"\"\"\n    lines = []\n\n    # Get search query from input\n    query = \"\"\n    if isinstance(tool_input, dict):\n        query = tool_input.get(\"q\", \"\") or tool_input.get(\"query\", \"\")\n\n    # Parse results from output - handle multiple formats\n    results = []\n    if isinstance(tool_output, dict):\n        # Case 1: output has \"result\" field containing JSON string\n        result_str = tool_output.get(\"result\", \"\")\n        if isinstance(result_str, str) and result_str.strip():\n            try:\n                result_data = json.loads(result_str)\n                if 
isinstance(result_data, dict):\n                    results = result_data.get(\"organic\", [])\n            except json.JSONDecodeError:\n                pass\n        elif isinstance(result_str, dict):\n            results = result_str.get(\"organic\", [])\n\n        # Case 2: output directly contains \"organic\" field\n        if not results and \"organic\" in tool_output:\n            results = tool_output.get(\"organic\", [])\n\n    if not results and not query:\n        return \"\"\n\n    # Build the card\n    lines.append('<div class=\"search-card\">')\n\n    # Header with query\n    if query:\n        lines.append('<div class=\"search-header\">')\n        lines.append('<span class=\"search-icon\">🔍</span>')\n        lines.append(f'<span class=\"search-query\">Search: \"{query}\"</span>')\n        lines.append(\"</div>\")\n\n    # Results count\n    if results:\n        lines.append(f'<div class=\"search-count\">≡ Found {len(results)} results</div>')\n\n        # Results list\n        lines.append('<div class=\"search-results\">')\n        for item in results[:10]:  # Limit to 10 results\n            title = item.get(\"title\", \"Untitled\")\n            link = item.get(\"link\", \"#\")\n\n            lines.append(f\"\"\"<a href=\"{link}\" target=\"_blank\" class=\"search-result-item\">\n                <span class=\"result-icon\">🌐</span>\n                <span class=\"result-title\">{title}</span>\n            </a>\"\"\")\n        lines.append(\"</div>\")\n\n    lines.append(\"</div>\")\n\n    return \"\\n\".join(lines)\n\n\ndef _format_sogou_search_results(tool_input: dict, tool_output: dict) -> str:\n    \"\"\"Format sogou_search results in a beautiful card layout.\"\"\"\n    lines = []\n\n    # Get search query from input\n    query = \"\"\n    if isinstance(tool_input, dict):\n        query = tool_input.get(\"q\", \"\") or tool_input.get(\"query\", \"\")\n\n    # Parse results from output - sogou uses \"Pages\" instead of \"organic\"\n    results = []\n 
   if isinstance(tool_output, dict):\n        result_str = tool_output.get(\"result\", \"\")\n        if isinstance(result_str, str) and result_str.strip():\n            try:\n                result_data = json.loads(result_str)\n                if isinstance(result_data, dict):\n                    results = result_data.get(\"Pages\", [])\n            except json.JSONDecodeError:\n                pass\n        elif isinstance(result_str, dict):\n            results = result_str.get(\"Pages\", [])\n\n        if not results and \"Pages\" in tool_output:\n            results = tool_output.get(\"Pages\", [])\n\n    if not results and not query:\n        return \"\"\n\n    # Build the card\n    lines.append('<div class=\"search-card\">')\n\n    # Header with query\n    if query:\n        lines.append('<div class=\"search-header\">')\n        lines.append('<span class=\"search-icon\">🔍</span>')\n        lines.append(f'<span class=\"search-query\">Search: \"{query}\"</span>')\n        lines.append(\"</div>\")\n\n    # Results count\n    if results:\n        lines.append(f'<div class=\"search-count\">≡ Found {len(results)} results</div>')\n\n        # Results list\n        lines.append('<div class=\"search-results\">')\n        for item in results[:10]:  # Limit to 10 results\n            title = item.get(\"title\", \"Untitled\")\n            link = item.get(\"url\", item.get(\"link\", \"#\"))\n\n            lines.append(f\"\"\"<a href=\"{link}\" target=\"_blank\" class=\"search-result-item\">\n                <span class=\"result-icon\">🌐</span>\n                <span class=\"result-title\">{title}</span>\n            </a>\"\"\")\n        lines.append(\"</div>\")\n\n    lines.append(\"</div>\")\n\n    return \"\\n\".join(lines)\n\n\ndef _format_scrape_results(tool_input: dict, tool_output: dict) -> str:\n    \"\"\"Format scrape/webpage results in a card layout.\"\"\"\n    lines = []\n\n    # Get URL\n    url = \"\"\n    if isinstance(tool_input, dict):\n        url = 
tool_input.get(\"url\", tool_input.get(\"link\", \"\"))\n\n    # Check for error\n    if isinstance(tool_output, dict) and \"error\" in tool_output:\n        lines.append('<div class=\"scrape-card scrape-error\">')\n        lines.append('<div class=\"scrape-header\">')\n        lines.append('<span class=\"scrape-icon\">🌐</span>')\n        lines.append(\n            f'<span class=\"scrape-url\">{url[:60]}{\"...\" if len(url) > 60 else \"\"}</span>'\n        )\n        lines.append(\"</div>\")\n        lines.append('<div class=\"scrape-status error\">❌ Failed</div>')\n        lines.append(\"</div>\")\n        return \"\\n\".join(lines)\n\n    # Success case\n    lines.append('<div class=\"scrape-card\">')\n    if url:\n        lines.append('<div class=\"scrape-header\">')\n        lines.append('<span class=\"scrape-icon\">🌐</span>')\n        lines.append(\n            f'<span class=\"scrape-url\">{url[:60]}{\"...\" if len(url) > 60 else \"\"}</span>'\n        )\n        lines.append(\"</div>\")\n        lines.append('<div class=\"scrape-status success\">✓ Done</div>')\n    lines.append(\"</div>\")\n\n    return \"\\n\".join(lines)\n\n\ndef _render_markdown(state: dict) -> str:\n    lines = []\n    final_summary_lines = []  # Collect final summary content separately\n\n    # Render errors first if any\n    if state.get(\"errors\"):\n        for err in state[\"errors\"]:\n            lines.append(f'<div class=\"error-block\">❌ {err}</div>')\n\n    # Render all agents' content\n    for agent_id in state.get(\"agent_order\", []):\n        agent = state[\"agents\"].get(agent_id, {})\n        agent_name = agent.get(\"agent_name\", \"\")\n        is_final_summary = agent_name == \"Final Summary\"\n\n        for call_id in agent.get(\"tool_call_order\", []):\n            call = agent[\"tools\"].get(call_id, {})\n            tool_name = call.get(\"tool_name\", \"unknown_tool\")\n\n            # Show text / message - display directly\n            if tool_name in 
(\"show_text\", \"message\"):\n                content = call.get(\"content\", \"\")\n                if content:\n                    if is_final_summary:\n                        final_summary_lines.append(content)\n                    else:\n                        lines.append(content)\n                continue\n\n            tool_input = call.get(\"input\", {})\n            tool_output = call.get(\"output\", {})\n            has_input = not _is_empty_payload(tool_input)\n            has_output = not _is_empty_payload(tool_output)\n\n            # Special formatting for google_search\n            if tool_name == \"google_search\" and (has_input or has_output):\n                formatted = _format_search_results(tool_input, tool_output)\n                if formatted:\n                    lines.append(formatted)\n                continue\n\n            # Special formatting for sogou_search\n            if tool_name == \"sogou_search\" and (has_input or has_output):\n                formatted = _format_sogou_search_results(tool_input, tool_output)\n                if formatted:\n                    lines.append(formatted)\n                continue\n\n            # Special formatting for scrape/webpage tools\n            if tool_name in (\n                \"scrape\",\n                \"scrape_website\",\n                \"scrape_webpage\",\n                \"scrape_and_extract_info\",\n            ) and (has_input or has_output):\n                formatted = _format_scrape_results(tool_input, tool_output)\n                if formatted:\n                    lines.append(formatted)\n                continue\n\n            # Special formatting for code execution tools\n            if tool_name in (\"python\", \"run_python_code\") and (has_input or has_output):\n                # Use pure Markdown to avoid HTML wrapper blocking Markdown rendering\n                lines.append(\"\\n---\\n\")\n                lines.append(\"#### 💻 Code Execution\\n\")\n                # 
Show code input - try multiple possible keys\n                code = \"\"\n                if isinstance(tool_input, dict):\n                    code = tool_input.get(\"code\") or tool_input.get(\"code_block\") or \"\"\n                elif isinstance(tool_input, str):\n                    code = tool_input\n                if code:\n                    lines.append(f\"\\n```python\\n{code}\\n```\\n\")\n                # Show output if available\n                if has_output:\n                    output = \"\"\n                    if isinstance(tool_output, dict):\n                        output = (\n                            tool_output.get(\"result\")\n                            or tool_output.get(\"output\")\n                            or tool_output.get(\"stdout\")\n                            or \"\"\n                        )\n                    elif isinstance(tool_output, str):\n                        output = tool_output\n                    if isinstance(output, str) and output.strip():\n                        lines.append(\"\\n**Output:**\\n\")\n                        lines.append(\n                            f'\\n```text\\n{output[:1000]}{\"...\" if len(output) > 1000 else \"\"}\\n```\\n'\n                        )\n                lines.append(\"\\n✅ Executed\\n\")\n                continue\n\n            # Other tools - show as compact card\n            if has_input or has_output:\n                target_lines = final_summary_lines if is_final_summary else lines\n                target_lines.append('<div class=\"tool-card\">')\n                target_lines.append(f'<div class=\"tool-header\">🔧 {tool_name}</div>')\n                if has_input:\n                    # Show brief input summary\n                    if isinstance(tool_input, dict):\n                        brief = \", \".join(\n                            f\"{k}: {str(v)[:30]}...\"\n                            if len(str(v)) > 30\n                            else f\"{k}: {v}\"\n  
                          for k, v in list(tool_input.items())[:2]\n                        )\n                        target_lines.append(f'<div class=\"tool-brief\">{brief}</div>')\n                if has_output:\n                    target_lines.append('<div class=\"tool-status\">✓ Done</div>')\n                target_lines.append(\"</div>\")\n\n    # Add final summary with Markdown-based styling (no HTML wrapper to preserve Markdown rendering)\n    if final_summary_lines:\n        lines.append(\"\\n\\n---\\n\\n\")  # Markdown horizontal rule as divider\n        lines.append(\"## 📋 Research Summary\\n\\n\")\n        lines.extend(final_summary_lines)\n\n    return \"\\n\".join(lines) if lines else \"*Waiting to start research...*\"\n\n\ndef _update_state_with_event(state: dict, message: dict):\n    event = message.get(\"event\")\n    data = message.get(\"data\", {})\n    if event == \"start_of_agent\":\n        agent_id = data.get(\"agent_id\")\n        agent_name = data.get(\"agent_name\", \"unknown\")\n        if agent_id and agent_id not in state[\"agents\"]:\n            state[\"agents\"][agent_id] = {\n                \"agent_name\": agent_name,\n                \"tool_call_order\": [],\n                \"tools\": {},\n            }\n            state[\"agent_order\"].append(agent_id)\n        state[\"current_agent_id\"] = agent_id\n    elif event == \"end_of_agent\":\n        # End marker, no special handling needed, keep structure\n        state[\"current_agent_id\"] = None\n    elif event == \"tool_call\":\n        tool_call_id = data.get(\"tool_call_id\")\n        tool_name = data.get(\"tool_name\", \"unknown_tool\")\n        agent_id = state.get(\"current_agent_id\") or (\n            state[\"agent_order\"][-1] if state[\"agent_order\"] else None\n        )\n        if not agent_id:\n            return state\n        agent = state[\"agents\"].setdefault(\n            agent_id, {\"agent_name\": \"unknown\", \"tool_call_order\": [], \"tools\": {}}\n       
 )\n        tools = agent[\"tools\"]\n        if tool_call_id not in tools:\n            tools[tool_call_id] = {\"tool_name\": tool_name}\n            agent[\"tool_call_order\"].append(tool_call_id)\n        entry = tools[tool_call_id]\n        if tool_name == \"show_text\" and \"delta_input\" in data:\n            delta = data.get(\"delta_input\", {}).get(\"text\", \"\")\n            _append_show_text(entry, delta)\n        elif tool_name == \"show_text\" and \"tool_input\" in data:\n            ti = data.get(\"tool_input\")\n            text = \"\"\n            if isinstance(ti, dict):\n                text = ti.get(\"text\", \"\") or (\n                    (ti.get(\"result\") or {}).get(\"text\")\n                    if isinstance(ti.get(\"result\"), dict)\n                    else \"\"\n                )\n            elif isinstance(ti, str):\n                text = ti\n            if text:\n                _append_show_text(entry, text)\n        else:\n            # Distinguish between input and output:\n            if \"tool_input\" in data:\n                # Could be input (first time) or output with result (second time)\n                ti = data[\"tool_input\"]\n                # If contains result, assign to output; otherwise assign to input\n                if isinstance(ti, dict) and \"result\" in ti:\n                    entry[\"output\"] = ti\n                else:\n                    # Only update input if we don't already have valid input data, or if the new data is not empty\n                    if \"input\" not in entry or not _is_empty_payload(ti):\n                        entry[\"input\"] = ti\n    elif event == \"message\":\n        # Same incremental text display as show_text, aggregated by message_id\n        message_id = data.get(\"message_id\")\n        agent_id = state.get(\"current_agent_id\") or (\n            state[\"agent_order\"][-1] if state[\"agent_order\"] else None\n        )\n        if not agent_id:\n            return state\n 
       agent = state[\"agents\"].setdefault(\n            agent_id, {\"agent_name\": \"unknown\", \"tool_call_order\": [], \"tools\": {}}\n        )\n        tools = agent[\"tools\"]\n        if message_id not in tools:\n            tools[message_id] = {\"tool_name\": \"message\"}\n            agent[\"tool_call_order\"].append(message_id)\n        entry = tools[message_id]\n        delta_content = (data.get(\"delta\") or {}).get(\"content\", \"\")\n        if isinstance(delta_content, str) and delta_content:\n            _append_show_text(entry, delta_content)\n    elif event == \"error\":\n        # Collect errors, display uniformly during rendering\n        err_text = data.get(\"error\") if isinstance(data, dict) else None\n        if not err_text:\n            try:\n                err_text = json.dumps(data, ensure_ascii=False)\n            except Exception:\n                err_text = str(data)\n        state.setdefault(\"errors\", []).append(err_text)\n    else:\n        # Ignore heartbeat or other events\n        pass\n\n    return state\n\n\n_CANCEL_FLAGS = {}\n_CANCEL_LOCK = threading.Lock()\n\n\ndef _set_cancel_flag(task_id: str):\n    with _CANCEL_LOCK:\n        _CANCEL_FLAGS[task_id] = True\n\n\ndef _reset_cancel_flag(task_id: str):\n    with _CANCEL_LOCK:\n        _CANCEL_FLAGS[task_id] = False\n\n\nasync def _disconnect_check_for_task(task_id: str):\n    with _CANCEL_LOCK:\n        return _CANCEL_FLAGS.get(task_id, False)\n\n\ndef _spinner_markup(running: bool) -> str:\n    if not running:\n        return \"\"\n    return (\n        '\\n\\n<div style=\"display:flex;align-items:center;gap:8px;color:#555;margin-top:8px;\">'\n        '<div style=\"width:16px;height:16px;border:2px solid #ddd;border-top-color:#3b82f6;border-radius:50%;animation:spin 0.8s linear infinite;\"></div>'\n        \"<span>Generating...</span>\"\n        \"</div>\\n<style>@keyframes spin{to{transform:rotate(360deg)}}</style>\\n\"\n    )\n\n\nasync def gradio_run(query: str, 
ui_state: Optional[dict]):\n    query = replace_chinese_punctuation(query or \"\")\n    task_id = str(uuid.uuid4())\n    _reset_cancel_flag(task_id)\n    if not ui_state:\n        ui_state = {\"task_id\": task_id}\n    else:\n        ui_state = {**ui_state, \"task_id\": task_id}\n    state = _init_render_state()\n    # Initial: disable Run, enable Stop, and show spinner at bottom of text\n    yield (\n        _render_markdown(state) + _spinner_markup(True),\n        gr.update(interactive=False),\n        gr.update(interactive=True),\n        ui_state,\n    )\n    async for message in stream_events_optimized(\n        task_id, query, None, lambda: _disconnect_check_for_task(task_id)\n    ):\n        # Skip heartbeat events - they don't need UI update\n        event_type = message.get(\"event\", \"unknown\")\n        if event_type == \"heartbeat\":\n            continue\n\n        state = _update_state_with_event(state, message)\n        md = _render_markdown(state)\n        yield (\n            md + _spinner_markup(True),\n            gr.update(interactive=False),\n            gr.update(interactive=True),\n            ui_state,\n        )\n        # Small delay to allow Gradio to process the update\n        await asyncio.sleep(0.01)\n    # End: enable Run, disable Stop, remove spinner\n    yield (\n        _render_markdown(state),\n        gr.update(interactive=True),\n        gr.update(interactive=False),\n        ui_state,\n    )\n\n\ndef stop_current(ui_state: Optional[dict]):\n    tid = (ui_state or {}).get(\"task_id\")\n    if tid:\n        _set_cancel_flag(tid)\n    # Immediately switch button availability: enable Run, disable Stop\n    return (\n        gr.update(interactive=True),\n        gr.update(interactive=False),\n    )\n\n\ndef build_demo():\n    # Use remote logo from dr.miromind.ai for faster page load\n\n    custom_css = \"\"\"\n    /* ========== MiroThinker - Modern Clean Design ========== */\n    @import 
url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');\n    \n    /* Base */\n    .gradio-container {\n        max-width: 100% !important;\n        margin: 0 !important;\n        padding: 0 !important;\n        font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;\n        background: #ffffff !important;\n        min-height: 100vh;\n    }\n    \n    footer { display: none !important; }\n    \n    /* ===== Top Navigation ===== */\n    .top-nav {\n        display: flex;\n        align-items: center;\n        justify-content: space-between;\n        padding: 16px 32px;\n        border-bottom: 1px solid #f0f0f0;\n        background: #ffffff;\n    }\n    \n    .nav-left {\n        display: flex;\n        align-items: center;\n        gap: 20px;\n    }\n    \n    .nav-brand {\n        display: flex;\n        align-items: center;\n        gap: 10px;\n        font-weight: 600;\n        font-size: 1.1em;\n        color: #18181b;\n    }\n    \n    .brand-logo {\n        width: 32px;\n        height: 32px;\n        border-radius: 6px;\n    }\n    \n    .nav-links {\n        display: flex;\n        align-items: center;\n        gap: 12px;\n    }\n    \n    .nav-links a {\n        color: #71717a;\n        font-size: 1.1em;\n        text-decoration: none;\n        transition: color 0.2s;\n    }\n    \n    .nav-links a:hover {\n        color: #18181b;\n    }\n    \n    .nav-right {\n        display: flex;\n        align-items: center;\n        gap: 16px;\n    }\n    \n    .nav-right a {\n        color: #52525b;\n        text-decoration: none;\n        font-size: 0.9em;\n    }\n    \n    /* ===== Hero Section ===== */\n    .hero-section {\n        text-align: center;\n        padding: 60px 24px 40px;\n        max-width: 900px;\n        margin: 0 auto;\n    }\n    \n    .hero-title {\n        font-size: 3em;\n        font-weight: 700;\n        background: linear-gradient(135deg, #10b981 0%, #14b8a6 50%, #06b6d4 100%);\n    
    -webkit-background-clip: text;\n        -webkit-text-fill-color: transparent;\n        background-clip: text;\n        margin: 0 0 16px 0;\n        letter-spacing: -0.02em;\n    }\n    \n    .hero-subtitle {\n        display: flex;\n        align-items: center;\n        justify-content: center;\n        gap: 16px;\n        color: #71717a;\n        font-size: 1em;\n    }\n    \n    .hero-line {\n        width: 40px;\n        height: 1px;\n        background: #d4d4d8;\n    }\n    \n    /* ===== Input Section ===== */\n    #input-section {\n        max-width: 720px !important;\n        margin: 0 auto 40px !important;\n        background: #ffffff;\n        border: 1px solid #e0e0e0;\n        border-radius: 16px;\n        box-shadow: 0 2px 8px rgba(0,0,0,0.04);\n    }\n    \n    #question-input {\n        padding: 20px 24px !important;\n        background: #ffffff !important;\n        border: none !important;\n    }\n    \n    #question-input textarea {\n        background: #ffffff !important;\n        border: none !important;\n        font-size: 1.05em !important;\n        line-height: 1.7 !important;\n        color: #18181b !important;\n        box-shadow: none !important;\n    }\n    \n    #question-input textarea:focus {\n        outline: none !important;\n        box-shadow: none !important;\n    }\n    \n    #question-input textarea::placeholder {\n        color: #9ca3af !important;\n    }\n    \n    #btn-row {\n        padding: 16px 24px !important;\n        border-top: 1px solid #f0f0f0;\n        gap: 12px !important;\n    }\n    \n    #run-btn {\n        background: linear-gradient(135deg, #10b981 0%, #14b8a6 100%) !important;\n        color: #ffffff !important;\n        border: none !important;\n        border-radius: 10px !important;\n        padding: 12px 24px !important;\n        font-size: 0.95em !important;\n        font-weight: 500 !important;\n        cursor: pointer !important;\n        transition: opacity 0.2s, transform 0.2s !important;\n    }\n  
  \n    #run-btn:hover {\n        opacity: 0.9 !important;\n        transform: translateY(-1px) !important;\n    }\n    \n    #stop-btn {\n        background: #ffffff !important;\n        color: #71717a !important;\n        border: 1px solid #e5e5e5 !important;\n        border-radius: 10px !important;\n        padding: 12px 20px !important;\n        font-size: 0.95em !important;\n        font-weight: 500 !important;\n        cursor: pointer !important;\n        transition: all 0.2s !important;\n    }\n    \n    #stop-btn:hover {\n        color: #ef4444 !important;\n        border-color: #fecaca !important;\n        background: #fef2f2 !important;\n    }\n    \n    /* ===== Output Section ===== */\n    #output-section {\n        max-width: 900px !important;\n        margin: 0 auto !important;\n        padding: 0 24px 60px !important;\n    }\n    \n    .output-label {\n        font-size: 0.85em;\n        font-weight: 500;\n        color: #71717a;\n        text-transform: uppercase;\n        letter-spacing: 0.05em;\n        margin-bottom: 12px;\n        padding: 0 4px;\n    }\n    \n    #log-view {\n        padding: 24px !important;\n        min-height: 400px;\n        max-height: 70vh;\n        overflow-y: auto;\n        background: #ffffff !important;\n        border: 1px solid #e5e5e5 !important;\n        border-radius: 16px !important;\n    }\n    \n    #log-view h3 {\n        font-size: 0.95em;\n        font-weight: 600;\n        color: #18181b;\n        margin: 24px 0 16px 0;\n        padding-bottom: 8px;\n        border-bottom: 1px solid #f4f4f5;\n    }\n    \n    #log-view h3:first-child {\n        margin-top: 0;\n    }\n    \n    /* Error block */\n    .error-block {\n        background: #fef2f2;\n        border: 1px solid #fecaca;\n        border-radius: 10px;\n        padding: 12px 16px;\n        margin: 12px 0;\n        color: #dc2626;\n        font-size: 0.9em;\n    }\n    \n    /* Tool card */\n    .tool-card {\n        background: #fafafa;\n        
border: 1px solid #e5e5e5;\n        border-radius: 10px;\n        padding: 12px 16px;\n        margin: 12px 0;\n    }\n    \n    .tool-header {\n        font-size: 0.9em;\n        font-weight: 500;\n        color: #3f3f46;\n        margin-bottom: 4px;\n    }\n    \n    .tool-brief {\n        font-size: 0.8em;\n        color: #71717a;\n        margin-top: 4px;\n    }\n    \n    .tool-status {\n        font-size: 0.8em;\n        color: #10b981;\n        margin-top: 6px;\n    }\n    \n    #log-view blockquote {\n        background: linear-gradient(135deg, #f0fdf4 0%, #ecfeff 100%);\n        border: none;\n        border-left: 3px solid #10b981;\n        padding: 16px 20px;\n        margin: 16px 0;\n        border-radius: 0 12px 12px 0;\n        font-style: normal;\n        color: #065f46;\n        font-size: 0.9em;\n        line-height: 1.7;\n    }\n    \n    #log-view pre {\n        background: #f8f9fa !important;\n        color: #1e293b !important;\n        border-radius: 8px !important;\n        padding: 16px !important;\n        font-size: 0.85em !important;\n        line-height: 1.6 !important;\n        overflow-x: auto;\n        margin: 12px 0;\n        border: 1px solid #e2e8f0;\n    }\n    \n    #log-view pre code {\n        background: transparent !important;\n        color: #1e293b !important;\n        font-family: 'SF Mono', 'Fira Code', 'JetBrains Mono', Consolas, monospace !important;\n        font-size: inherit !important;\n        padding: 0 !important;\n        white-space: pre-wrap;\n        word-break: break-word;\n    }\n    \n    #log-view code {\n        font-family: 'SF Mono', 'Fira Code', 'JetBrains Mono', Consolas, monospace !important;\n        background: #f1f5f9 !important;\n        color: #1e293b !important;\n        padding: 2px 6px !important;\n        border-radius: 4px !important;\n        font-size: 0.9em !important;\n    }\n    \n    #log-view p {\n        line-height: 1.7;\n        color: #3f3f46;\n    }\n    \n    
#log-view::-webkit-scrollbar {\n        width: 6px;\n    }\n    \n    #log-view::-webkit-scrollbar-track {\n        background: transparent;\n    }\n    \n    #log-view::-webkit-scrollbar-thumb {\n        background: #e5e5e5;\n        border-radius: 3px;\n    }\n    \n    #log-view::-webkit-scrollbar-thumb:hover {\n        background: #d4d4d8;\n    }\n    \n    /* ===== Footer ===== */\n    .app-footer {\n        text-align: center;\n        padding: 24px;\n        color: #a1a1aa;\n        font-size: 0.85em;\n        border-top: 1px solid #f0f0f0;\n    }\n    \n    /* ===== Loading Spinner ===== */\n    @keyframes spin {\n        to { transform: rotate(360deg); }\n    }\n    \n    .loading-indicator {\n        display: inline-flex;\n        align-items: center;\n        gap: 10px;\n        color: #10b981;\n        font-size: 0.9em;\n        padding: 12px 0;\n    }\n    \n    .loading-indicator::before {\n        content: '';\n        width: 16px;\n        height: 16px;\n        border: 2px solid #d1fae5;\n        border-top-color: #10b981;\n        border-radius: 50%;\n        animation: spin 0.8s linear infinite;\n    }\n    \n    /* ===== Search Results Card ===== */\n    .search-card {\n        background: #ffffff;\n        border: 1px solid #e5e5e5;\n        border-radius: 12px;\n        margin: 16px 0;\n        overflow: hidden;\n    }\n    \n    .search-header {\n        display: flex;\n        align-items: center;\n        gap: 10px;\n        padding: 14px 18px;\n        background: #fafafa;\n        border-bottom: 1px solid #f0f0f0;\n    }\n    \n    .search-icon {\n        font-size: 1em;\n        color: #10b981;\n    }\n    \n    .search-query {\n        font-size: 0.9em;\n        color: #3f3f46;\n        font-weight: 500;\n    }\n    \n    .search-count {\n        padding: 10px 18px;\n        font-size: 0.8em;\n        color: #71717a;\n        background: #fafafa;\n        border-bottom: 1px solid #f0f0f0;\n    }\n    \n    .search-results {\n        
padding: 8px 0;\n    }\n    \n    .search-result-item {\n        display: flex;\n        align-items: center;\n        gap: 12px;\n        padding: 12px 18px;\n        text-decoration: none;\n        color: #3f3f46;\n        font-size: 0.9em;\n        transition: background 0.15s;\n        border-left: 3px solid transparent;\n    }\n    \n    .search-result-item:hover {\n        background: #f9fafb;\n        border-left-color: #10b981;\n    }\n    \n    .result-icon {\n        font-size: 1em;\n        flex-shrink: 0;\n        opacity: 0.6;\n    }\n    \n    .result-title {\n        flex: 1;\n        overflow: hidden;\n        text-overflow: ellipsis;\n        white-space: nowrap;\n    }\n    \n    /* ===== Scrape Card ===== */\n    .scrape-card {\n        background: #ffffff;\n        border: 1px solid #e5e5e5;\n        border-radius: 10px;\n        margin: 12px 0;\n        padding: 12px 16px;\n        display: flex;\n        align-items: center;\n        justify-content: space-between;\n        gap: 12px;\n    }\n    \n    .scrape-card.scrape-error {\n        border-color: #fecaca;\n        background: #fef2f2;\n    }\n    \n    .scrape-header {\n        display: flex;\n        align-items: center;\n        gap: 10px;\n        flex: 1;\n        min-width: 0;\n    }\n    \n    .scrape-icon {\n        font-size: 1em;\n        opacity: 0.6;\n    }\n    \n    .scrape-url {\n        font-size: 0.85em;\n        color: #52525b;\n        overflow: hidden;\n        text-overflow: ellipsis;\n        white-space: nowrap;\n    }\n    \n    .scrape-status {\n        font-size: 0.8em;\n        padding: 4px 10px;\n        border-radius: 6px;\n        flex-shrink: 0;\n    }\n    \n    .scrape-status.success {\n        background: #ecfdf5;\n        color: #059669;\n    }\n    \n    .scrape-status.error {\n        background: #fef2f2;\n        color: #dc2626;\n    }\n    \n    /* ===== Final Summary Section ===== */\n    .final-summary-divider {\n        height: 1px;\n        
background: linear-gradient(to right, transparent, #e5e5e5, transparent);\n        margin: 32px 0;\n    }\n    \n    .final-summary-section {\n        background: linear-gradient(135deg, #f8fafc 0%, #f1f5f9 100%);\n        border: 1px solid #e2e8f0;\n        border-radius: 16px;\n        padding: 24px;\n        margin-top: 16px;\n    }\n    \n    .final-summary-header {\n        font-size: 1.1em;\n        font-weight: 600;\n        color: #1e293b;\n        margin-bottom: 16px;\n        padding-bottom: 12px;\n        border-bottom: 2px solid #3b82f6;\n        display: inline-block;\n    }\n    \n    .final-summary-content {\n        color: #334155;\n        line-height: 1.8;\n    }\n    \n    .final-summary-content h1,\n    .final-summary-content h2,\n    .final-summary-content h3 {\n        color: #1e293b;\n        margin-top: 1.5em;\n        margin-bottom: 0.5em;\n    }\n    \n    .final-summary-content h1 { font-size: 1.4em; }\n    .final-summary-content h2 { font-size: 1.2em; }\n    .final-summary-content h3 { font-size: 1.1em; }\n    \n    .final-summary-content p {\n        margin: 0.8em 0;\n    }\n    \n    .final-summary-content ul,\n    .final-summary-content ol {\n        margin: 0.8em 0;\n        padding-left: 1.5em;\n    }\n    \n    .final-summary-content li {\n        margin: 0.4em 0;\n    }\n    \n    .final-summary-content a {\n        color: #3b82f6;\n        text-decoration: none;\n    }\n    \n    .final-summary-content a:hover {\n        text-decoration: underline;\n    }\n    \n    .final-summary-content code {\n        background: #e2e8f0;\n        padding: 2px 6px;\n        border-radius: 4px;\n        font-family: 'SF Mono', 'Fira Code', monospace;\n        font-size: 0.9em;\n    }\n    \n    .final-summary-content pre {\n        background: #1e293b;\n        color: #e2e8f0;\n        padding: 16px;\n        border-radius: 8px;\n        overflow-x: auto;\n    }\n    \n    .final-summary-content pre code {\n        background: transparent;\n    
    padding: 0;\n        color: inherit;\n    }\n    \n    .final-summary-content table {\n        width: 100%;\n        border-collapse: collapse;\n        margin: 1em 0;\n    }\n    \n    .final-summary-content th,\n    .final-summary-content td {\n        padding: 10px 12px;\n        border: 1px solid #e2e8f0;\n        text-align: left;\n    }\n    \n    .final-summary-content th {\n        background: #f1f5f9;\n        font-weight: 600;\n    }\n    \n    .final-summary-content blockquote {\n        border-left: 4px solid #3b82f6;\n        margin: 1em 0;\n        padding: 0.5em 1em;\n        background: #f8fafc;\n        color: #475569;\n    }\n    \n    /* ===== Code Execution Card ===== */\n    .code-card {\n        background: #1e1e2e;\n        border: 1px solid #313244;\n        border-radius: 12px;\n        margin: 12px 0;\n        padding: 16px;\n        overflow: hidden;\n    }\n    \n    .code-header {\n        font-size: 0.9em;\n        font-weight: 600;\n        color: #cdd6f4;\n        margin-bottom: 12px;\n        display: flex;\n        align-items: center;\n        gap: 8px;\n    }\n    \n    .code-card pre {\n        background: #11111b !important;\n        border-radius: 8px;\n        padding: 12px 16px;\n        margin: 8px 0;\n        overflow-x: auto;\n        font-family: 'SF Mono', 'Fira Code', 'JetBrains Mono', Consolas, monospace !important;\n        font-size: 0.85em;\n        line-height: 1.5;\n    }\n    \n    .code-card code {\n        background: transparent !important;\n        color: #cdd6f4 !important;\n        font-family: 'SF Mono', 'Fira Code', 'JetBrains Mono', Consolas, monospace !important;\n    }\n    \n    .code-output-label {\n        font-size: 0.8em;\n        color: #a6adc8;\n        margin-top: 12px;\n        margin-bottom: 4px;\n    }\n    \n    .code-status {\n        font-size: 0.8em;\n        color: #a6e3a1;\n        margin-top: 8px;\n        text-align: right;\n    }\n    \n    /* ===== Responsive ===== */\n    
@media (max-width: 768px) {\n        .hero-title {\n            font-size: 2em;\n        }\n        \n        .hero-section {\n            padding: 40px 16px 24px;\n        }\n        \n        .input-wrapper, .output-wrapper {\n            padding: 0 16px;\n        }\n        \n        #log-view {\n            max-height: 50vh;\n        }\n    }\n    \"\"\"\n\n    # Favicon head content\n    favicon_head = '<link rel=\"icon\" href=\"https://dr.miromind.ai/favicon.ico?v=2\">'\n\n    with gr.Blocks(\n        css=custom_css,\n        title=\"MiroThinker - Deep Research\",\n        theme=gr.themes.Base(),\n        head=favicon_head,\n    ) as demo:\n        # Top Navigation\n        gr.HTML(\"\"\"\n            <nav class=\"top-nav\">\n                <div class=\"nav-left\">\n                    <div class=\"nav-brand\">\n                        <img src=\"https://dr.miromind.ai/favicon.png\" class=\"brand-logo\" alt=\"MiroThinker\" />\n                        MiroThinker\n                    </div>\n                    <div class=\"nav-links\">\n                        <a href=\"https://huggingface.co/MiroMind\" target=\"_blank\">🤗</a>\n                        <a href=\"https://github.com/MiroMind/MiroThinker\" target=\"_blank\">\n                            <svg width=\"20\" height=\"20\" viewBox=\"0 0 24 24\" fill=\"currentColor\">\n                                <path d=\"M12 0c-6.626 0-12 5.373-12 12 0 5.302 3.438 9.8 8.207 11.387.599.111.793-.261.793-.577v-2.234c-3.338.726-4.033-1.416-4.033-1.416-.546-1.387-1.333-1.756-1.333-1.756-1.089-.745.083-.729.083-.729 1.205.084 1.839 1.237 1.839 1.237 1.07 1.834 2.807 1.304 3.492.997.107-.775.418-1.305.762-1.604-2.665-.305-5.467-1.334-5.467-5.931 0-1.311.469-2.381 1.236-3.221-.124-.303-.535-1.524.117-3.176 0 0 1.008-.322 3.301 1.23.957-.266 1.983-.399 3.003-.404 1.02.005 2.047.138 3.006.404 2.291-1.552 3.297-1.23 3.297-1.23.653 1.653.242 2.874.118 3.176.77.84 1.235 1.911 1.235 3.221 0 4.609-2.807 5.624-5.479 
5.921.43.372.823 1.102.823 2.222v3.293c0 .319.192.694.801.576 4.765-1.589 8.199-6.086 8.199-11.386 0-6.627-5.373-12-12-12z\"/>\n                            </svg>\n                        </a>\n                    </div>\n                </div>\n                <div class=\"nav-right\">\n                    <a href=\"https://miromind.ai\" target=\"_blank\">Visit Website</a>\n                </div>\n            </nav>\n        \"\"\")\n\n        # Hero Section\n        gr.HTML(\"\"\"\n            <div class=\"hero-section\">\n                <h1 class=\"hero-title\">Research Deep. Uncover the Future</h1>\n                <div class=\"hero-subtitle\">\n                    <span class=\"hero-line\"></span>\n                    Don't just chat. Predict, verify, and discover with science-based AI.\n                    <span class=\"hero-line\"></span>\n                </div>\n            </div>\n        \"\"\")\n\n        # Input Section\n        with gr.Column(elem_id=\"input-section\"):\n            inp = gr.Textbox(\n                lines=4,\n                placeholder=\"Enter your research question...\",\n                show_label=False,\n                elem_id=\"question-input\",\n            )\n            with gr.Row(elem_id=\"btn-row\"):\n                stop_btn = gr.Button(\n                    \"⏹ Stop\",\n                    elem_id=\"stop-btn\",\n                    variant=\"stop\",\n                    interactive=False,\n                    scale=1,\n                )\n                run_btn = gr.Button(\n                    \"Start Research ➤\", elem_id=\"run-btn\", variant=\"primary\", scale=2\n                )\n\n        # Output Section\n        with gr.Column(elem_id=\"output-section\"):\n            gr.HTML('<div class=\"output-label\">Research Progress</div>')\n            out_md = gr.Markdown(\"*Waiting to start research...*\", elem_id=\"log-view\")\n\n        # State\n        ui_state = gr.State({\"task_id\": None})\n\n        # Event 
handlers\n        run_btn.click(\n            fn=gradio_run,\n            inputs=[inp, ui_state],\n            outputs=[out_md, run_btn, stop_btn, ui_state],\n        )\n        stop_btn.click(fn=stop_current, inputs=[ui_state], outputs=[run_btn, stop_btn])\n\n        # Footer\n        gr.HTML(\"\"\"\n            <div class=\"app-footer\">\n                Content generated by MiroMind AI. Please verify important information.\n            </div>\n        \"\"\")\n\n    return demo\n\n\nif __name__ == \"__main__\":\n    demo = build_demo()\n    host = os.getenv(\"HOST\", \"0.0.0.0\")\n    port = int(os.getenv(\"PORT\", \"8080\"))\n    demo.queue().launch(server_name=host, server_port=port)\n"
  },
  {
    "path": "apps/gradio-demo/prompt_patch.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nCustom Prompt Override (Monkey Patching)\n\nThis module allows customizing prompts without modifying miroflow-agent code.\n\nPatches applied:\n1. `generate_mcp_system_prompt` - Prepends custom identity prompt\n2. `process_input` - Removes the boxed format requirement suffix\n3. `generate_agent_summarize_prompt` - Uses user-friendly summary prompt for demo\n4. `format_final_summary_and_log` - Disables boxed format check to prevent retry\n\nUsage:\n    from prompt_patch import apply_prompt_patch\n    apply_prompt_patch()\n\"\"\"\n\nimport re\n\n# ============================================================================\n# Custom Identity Prompt\n# ============================================================================\n\nCUSTOM_IDENTITY_PROMPT = \"\"\"You are MiroThinker, a specialized deep research AI assistant developed by MiroMind.\n\nIMPORTANT IDENTITY REMINDER:\n- You are NOT ChatGPT, Claude, or any other AI assistant\n\n\"\"\"\n\n# ============================================================================\n# Strings to Remove from Input Processing\n# ============================================================================\n\n# This string is appended to task descriptions in input_handler.py\n# We remove it for demo mode since we don't need strict boxed format\nBOXED_FORMAT_SUFFIX = \"\\nYou should follow the format instruction in the request strictly and wrap the final answer in \\\\boxed{}.\"\n\n# ============================================================================\n# Custom Summarize Prompt for Demo Mode\n# ============================================================================\n\n\ndef get_demo_summarize_prompt(target_language: str, task_description: str) -> str:\n    \"\"\"\n    Generate a user-friendly summarize prompt for demo mode.\n\n    This prompt is designed for better user experience, producing 
well-formatted\n    Markdown responses instead of strict boxed answers.\n\n    Args:\n        target_language: The language to write the response in\n        task_description: The original user question\n\n    Returns:\n        The summarize prompt string\n    \"\"\"\n    return f\"\"\"Please provide the final research summary based only on the information already gathered.\nNo further tool calls are allowed.\n\n## Requirements\n- **Language**: Write the entire response in **{target_language}**.\n- **Focus**: Directly answer the original question above. Do not just summarize gathered information — provide a clear, actionable answer.\n- **Response Length**: Match the complexity of your response to the question. For simple or short questions, provide a concise and direct answer without unnecessary elaboration. For complex questions, provide a detailed and structured report.\n- Use clear and structured Markdown formatting when appropriate.\n- Use appropriate Markdown headings (e.g., #, ##, ###) only when the content warrants structure.\n- Present key findings in an organized, concise, and readable way.\n- Use tables only when they genuinely improve clarity.\n- **Currency Format**: Use `\\\\$` instead of `$` for currency amounts (e.g., `\\\\$100`, `\\\\$1,000`) to avoid conflicts with inline math syntax.\n- **Citation Format**:\n  - **In-Text**: Use the format `[ID]`, where `ID` is a **numeric identifier only** (digits 0–9), e.g. `[1]`, `[2]`.\n  - **References Section(if has any sources)**: At the very end, add \"References\" (or equivalent in {target_language}). Format: [ID] TITLE/SECTION_TITLE. 
<URL>/<FILENAME>.\n- Do NOT mention tools, tool calls, or internal reasoning steps.\n- Focus solely on delivering a professional, easy-to-read response that answers the user's original question.\n\n## Original Question (for reference)\n{task_description}\"\"\"\n\n\ndef _detect_language(text: str) -> str:\n    \"\"\"\n    Simple language detection based on character analysis.\n\n    Returns a language description suitable for the summarize prompt.\n    \"\"\"\n    # Count characters by script\n    chinese_chars = sum(1 for c in text if \"\\u4e00\" <= c <= \"\\u9fff\")\n    japanese_chars = sum(\n        1 for c in text if \"\\u3040\" <= c <= \"\\u30ff\" or \"\\u31f0\" <= c <= \"\\u31ff\"\n    )\n    korean_chars = sum(1 for c in text if \"\\uac00\" <= c <= \"\\ud7af\")\n\n    total_chars = len(text.replace(\" \", \"\"))\n    if total_chars == 0:\n        return \"English\"\n\n    # Determine primary language\n    if chinese_chars / total_chars > 0.1:\n        return \"Chinese (Simplified)\"\n    elif japanese_chars / total_chars > 0.1:\n        return \"Japanese\"\n    elif korean_chars / total_chars > 0.1:\n        return \"Korean\"\n    else:\n        return \"the same language as the user's question\"\n\n\n# ============================================================================\n# Monkey Patching\n# ============================================================================\n\n_patched = False\n\n\ndef apply_prompt_patch():\n    \"\"\"\n    Apply monkey patches to customize prompts for demo mode.\n\n    Patches applied:\n    1. `generate_mcp_system_prompt` - Prepends custom identity prompt to system prompt\n    2. `process_input` - Removes the boxed format requirement from task descriptions\n    3. `generate_agent_summarize_prompt` - Uses user-friendly summary prompt\n    4. 
`format_final_summary_and_log` - Disables boxed format check to prevent retry\n\n    This function is idempotent - calling it multiple times has no additional effect.\n    \"\"\"\n    global _patched\n\n    if _patched:\n        return\n\n    _patch_system_prompt()\n    _patch_input_handler()\n    _patch_summarize_prompt()\n    _patch_output_formatter()\n\n    _patched = True\n\n\ndef _patch_system_prompt():\n    \"\"\"Patch system prompt generation to include custom identity.\"\"\"\n    from src.llm.providers import anthropic_client, openai_client\n    from src.utils import prompt_utils\n\n    # Store original function\n    original_generate_mcp_system_prompt = prompt_utils.generate_mcp_system_prompt\n\n    def patched_generate_mcp_system_prompt(date, mcp_servers):\n        \"\"\"Patched version that prepends custom identity prompt.\"\"\"\n        original_prompt = original_generate_mcp_system_prompt(date, mcp_servers)\n        return CUSTOM_IDENTITY_PROMPT + original_prompt\n\n    # Apply patches to all modules that import and use this function\n    prompt_utils.generate_mcp_system_prompt = patched_generate_mcp_system_prompt\n    openai_client.generate_mcp_system_prompt = patched_generate_mcp_system_prompt\n    anthropic_client.generate_mcp_system_prompt = patched_generate_mcp_system_prompt\n\n\ndef _patch_input_handler():\n    \"\"\"Patch input handler to remove boxed format requirement.\"\"\"\n    from src.core import orchestrator\n    from src.io import input_handler\n\n    # Store original function\n    original_process_input = input_handler.process_input\n\n    def patched_process_input(task_description: str, task_file_name: str):\n        \"\"\"Patched version that removes boxed format requirement.\"\"\"\n        result1, result2 = original_process_input(task_description, task_file_name)\n        # Remove the boxed format suffix from both results\n        result1 = result1.replace(BOXED_FORMAT_SUFFIX, \"\")\n        result2 = 
result2.replace(BOXED_FORMAT_SUFFIX, \"\")\n        return result1, result2\n\n    # Apply patch to input_handler module\n    input_handler.process_input = patched_process_input\n    # Also patch in orchestrator where it's imported\n    orchestrator.process_input = patched_process_input\n\n\ndef _patch_summarize_prompt():\n    \"\"\"Patch summarize prompt generation for better user experience.\"\"\"\n    from src.core import answer_generator, orchestrator\n    from src.utils import prompt_utils\n\n    def patched_generate_agent_summarize_prompt(\n        task_description: str, agent_type: str = \"\"\n    ) -> str:\n        \"\"\"\n        Patched version that uses user-friendly prompt for main agent.\n\n        For main agent in demo mode, uses a Markdown-friendly prompt instead of\n        the strict boxed format prompt used for benchmarks.\n        \"\"\"\n        if agent_type == \"main\":\n            # Detect language from task description\n            target_language = _detect_language(task_description)\n            return get_demo_summarize_prompt(target_language, task_description)\n        elif agent_type == \"agent-browsing\" or agent_type == \"browsing-agent\":\n            # Keep original behavior for sub-agents\n            summarize_prompt = (\n                \"This is a direct instruction to you (the assistant), not the result of a tool call.\\n\\n\"\n                \"We are now ending this session, and your conversation history will be deleted. \"\n                \"You must NOT initiate any further tool use. This is your final opportunity to report \"\n                \"*all* of the information gathered during the session.\\n\\n\"\n                \"The original task is repeated here for reference:\\n\\n\"\n                f'\"{task_description}\"\\n\\n'\n                \"Summarize the above search and browsing history. 
Output the FINAL RESPONSE and detailed supporting information of the task given to you.\\n\\n\"\n                \"If you found any useful facts, data, quotes, or answers directly relevant to the original task, include them clearly and completely.\\n\"\n                \"If you reached a conclusion or answer, include it as part of the response.\\n\"\n                \"If the task could not be fully answered, do NOT make up any content. Instead, return all partially relevant findings, \"\n                \"Search results, quotes, and observations that might help a downstream agent solve the problem.\\n\"\n                \"If partial, conflicting, or inconclusive information was found, clearly indicate this in your response.\\n\\n\"\n                \"Your final response should be a clear, complete, and structured report.\\n\"\n                \"Organize the content into logical sections with appropriate headings.\\n\"\n                \"Do NOT include any tool call instructions, speculative filler, or vague summaries.\\n\"\n                \"Focus on factual, specific, and well-organized information.\"\n            )\n            return summarize_prompt.strip()\n        else:\n            raise ValueError(f\"Unknown agent type: {agent_type}\")\n\n    # Apply patches to all modules that import and use this function\n    prompt_utils.generate_agent_summarize_prompt = (\n        patched_generate_agent_summarize_prompt\n    )\n    orchestrator.generate_agent_summarize_prompt = (\n        patched_generate_agent_summarize_prompt\n    )\n    answer_generator.generate_agent_summarize_prompt = (\n        patched_generate_agent_summarize_prompt\n    )\n\n\ndef _patch_output_formatter():\n    \"\"\"\n    Patch output formatter to disable boxed format check.\n\n    In demo mode, we don't require \\\\boxed{} format, so we patch the\n    format_final_summary_and_log method to always return a valid result\n    instead of FORMAT_ERROR_MESSAGE, which would trigger retry logic.\n    
\"\"\"\n    from src.io import output_formatter\n\n    # Get the OutputFormatter class\n    OutputFormatter = output_formatter.OutputFormatter\n\n    def patched_format_final_summary_and_log(self, final_answer_text: str, client=None):\n        \"\"\"\n        Patched version that doesn't return FORMAT_ERROR_MESSAGE.\n\n        Instead of checking for \\\\boxed{} content, we use the entire answer\n        (with thinking tags removed) as the result.\n        \"\"\"\n        summary_lines = []\n        summary_lines.append(\"\\n\" + \"=\" * 30 + \" Final Answer \" + \"=\" * 30)\n        summary_lines.append(final_answer_text)\n\n        # In demo mode, use the full answer text (minus thinking) as the result\n        # Remove <think>...</think> tags for the extracted result\n        boxed_result = re.sub(\n            r\"<think>.*?</think>\", \"\", final_answer_text, flags=re.DOTALL\n        ).strip()\n\n        # If there's actual boxed content, extract it (for compatibility)\n        actual_boxed = self._extract_boxed_content(final_answer_text)\n        if actual_boxed:\n            boxed_result = actual_boxed\n\n        # Add extracted result section\n        summary_lines.append(\"\\n\" + \"-\" * 20 + \" Extracted Result \" + \"-\" * 20)\n        summary_lines.append(boxed_result if boxed_result else final_answer_text)\n\n        # Token usage statistics and cost estimation\n        if client and hasattr(client, \"format_token_usage_summary\"):\n            token_summary_lines, log_string = client.format_token_usage_summary()\n            summary_lines.extend(token_summary_lines)\n        else:\n            summary_lines.append(\"\\n\" + \"-\" * 20 + \" Token Usage & Cost \" + \"-\" * 20)\n            summary_lines.append(\"Token usage information not available.\")\n            summary_lines.append(\"-\" * (40 + len(\" Token Usage & Cost \")))\n            log_string = \"Token usage information not available.\"\n\n        # Return boxed_result (never 
FORMAT_ERROR_MESSAGE in demo mode)\n        # This ensures no retry is triggered\n        return (\n            \"\\n\".join(summary_lines),\n            boxed_result or \"Demo mode - no boxed format required\",\n            log_string,\n        )\n\n    # Apply patch\n    OutputFormatter.format_final_summary_and_log = patched_format_final_summary_and_log\n\n\ndef get_custom_identity_prompt() -> str:\n    \"\"\"Return the custom identity prompt string.\"\"\"\n    return CUSTOM_IDENTITY_PROMPT\n"
  },
  {
    "path": "apps/gradio-demo/pyproject.toml",
    "content": "[project]\nname = \"gradio-demo\"\nversion = \"0.1.0\"\ndescription = \"Gradio Demo\"\nreadme = \"README.md\"\nrequires-python = \">=3.12\"\ndependencies = [\n    \"pydantic>=2.10.0\",\n    \"python-dotenv>=1.0.0\",\n    \"hydra-core>=1.3.0\",\n    \"miroflow-agent\",\n    \"aiohttp>=3.12.15\",\n    \"gradio>=5.42.0\",\n]\n\n[build-system]\nrequires = [\"hatchling\"]\nbuild-backend = \"hatchling.build\"\n\n[tool.hatch.build.targets.wheel]\npackages = [\"./\"]\n\n[tool.uv.sources]\nmiroflow-agent = { path = \"../miroflow-agent\", editable = true }\n\n[dependency-groups]\ndev = [\n    \"pytest>=8.4.1\",\n    \"pytest-asyncio>=1.0.0\",\n    \"httpx>=0.28.1\",\n]\n"
  },
  {
    "path": "apps/gradio-demo/utils.py",
    "content": "import re\n\n\ndef contains_chinese(text):\n    \"\"\"\n    Detect if a string contains Chinese characters or Chinese punctuation\n\n    Args:\n        text (str): The string to detect\n\n    Returns:\n        bool: True if contains Chinese characters or punctuation, False otherwise\n    \"\"\"\n    # Chinese character Unicode ranges:\n    # \\u4e00-\\u9fff: CJK Unified Ideographs\n    # \\u3400-\\u4dbf: CJK Extension A\n    # \\uf900-\\ufaff: CJK Compatibility Ideographs\n    # \\u3000-\\u303f: CJK Symbols and Punctuation\n    # \\uff00-\\uffef: Fullwidth ASCII, Fullwidth punctuation\n    chinese_pattern = re.compile(\n        r\"[\\u4e00-\\u9fff\\u3400-\\u4dbf\\uf900-\\ufaff\\u3000-\\u303f\\uff00-\\uffef]\"\n    )\n    return bool(chinese_pattern.search(text))\n\n\ndef replace_chinese_punctuation(text):\n    # Handle single-character replacements with translate\n    punctuation_map = str.maketrans(\n        {\n            \"，\": \",\",\n            \"。\": \".\",\n            \"！\": \"!\",\n            \"？\": \"?\",\n            \"；\": \";\",\n            \"：\": \":\",\n            \"“\": '\"',\n            \"”\": '\"',\n            \"‘\": \"'\",\n            \"’\": \"'\",\n            \"（\": \"(\",\n            \"）\": \")\",\n            \"【\": \"[\",\n            \"】\": \"]\",\n            \"《\": \"<\",\n            \"》\": \">\",\n            \"、\": \",\",\n            \"—\": \"-\",\n        }\n    )\n    # First, replace multi-character punctuation\n    text = text.replace(\"……\", \"...\")\n    # Then apply single-character replacements\n    return text.translate(punctuation_map)\n"
  },
  {
    "path": "apps/lobehub-compatibility/MiroThinkerToolParser.py",
    "content": "\"\"\"\nTool parser plugin for vLLM for MiroThinker MCP format to compatible with the tool calling interface of openai.\nMCP format:\n    <use_mcp_tool>\n        <server_name>server name</server_name>\n        <tool_name>tool name</tool_name>\n        <arguments>\n        {...}\n        </arguments>\n    </use_mcp_tool>\n\"\"\"\n\nimport json\nfrom collections.abc import Sequence\n\nimport json_repair\nimport regex as re\nfrom vllm.entrypoints.chat_utils import make_tool_call_id\nfrom vllm.entrypoints.openai.protocol import (\n    ChatCompletionRequest,\n    DeltaFunctionCall,\n    DeltaMessage,\n    DeltaToolCall,\n    ExtractedToolCallInformation,\n    FunctionCall,\n    ToolCall,\n)\nfrom vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (\n    ToolParser,\n    ToolParserManager,\n)\nfrom vllm.logger import init_logger\n\nlogger = init_logger(__name__)\n\n\nclass MirothinkerToolParser(ToolParser):\n    def __init__(self, tokenizer):\n        super().__init__(tokenizer)\n\n        # State tracking for streaming\n        self.current_tool_name_sent: bool = False\n        self.prev_tool_call_arr: list[dict] = []\n        self.current_tool_id: int = -1\n        self.streamed_args_for_tool: list[str] = []\n        self.buffer: str = \"\"  # Buffer for potential tool call tags\n        self._resolved_tool_name_cache: dict[tuple[str, str], str] = {}\n\n        # Correctness-first streaming state (incremental state machine)\n        self._stream_mode: str = \"text\"  # \"text\" | \"tool\"\n        self._text_token_prefix: str = \"\"  # possible prefix of <use_mcp_tool>\n        self._tool_end_token_prefix: str = \"\"  # possible prefix of </use_mcp_tool>\n        self._tool_block_buffer: str = (\n            \"\"  # accumulates between <use_mcp_tool> and </use_mcp_tool>\n        )\n        self._stream_tool_call_ids: list[str] = []\n\n        # Token definitions\n        self.tool_call_start_token: str = \"<use_mcp_tool>\"\n        
self.tool_call_end_token: str = \"</use_mcp_tool>\"\n\n        # Regex patterns\n        self.tool_call_regex = re.compile(\n            r\"<use_mcp_tool>\\s*\"\n            r\"<server_name>(.*?)</server_name>\\s*\"\n            r\"<tool_name>(.*?)</tool_name>\\s*\"\n            r\"<arguments>\\s*(.*?)\\s*</arguments>\\s*\"\n            r\"</use_mcp_tool>\",\n            re.DOTALL,\n        )\n\n        # For streaming partial tool calls\n        # IMPORTANT: Use GREEDY matching (.*) for arguments to capture all content\n        # in streaming mode. We'll clean up </arguments> tag in the code if present.\n        # The outer ()? makes the whole <arguments> section optional\n        # The inner (.*) will match empty string if <arguments> exists but has no content yet\n        self.partial_tool_regex = re.compile(\n            r\"<use_mcp_tool>\\s*\"\n            r\"(?:<server_name>(.*?)</server_name>\\s*)?\"\n            r\"(?:<tool_name>(.*?)</tool_name>\\s*)?\"\n            r\"(?:<arguments>(\\s*.*))?\",  # Move \\s* inside capture group so empty match returns \"\"\n            re.DOTALL,\n        )\n\n        # For correctness-first parsing on COMPLETE tool blocks only\n        self._complete_tool_block_regex = re.compile(\n            r\"<use_mcp_tool>\\s*\"\n            r\"(?:<server_name>(.*?)</server_name>\\s*)?\"\n            r\"(?:<tool_name>(.*?)</tool_name>\\s*)?\"\n            r\"(?:<arguments>\\s*(.*?)\\s*(?:</arguments>\\s*)?)?\"\n            r\"</use_mcp_tool>\",\n            re.DOTALL,\n        )\n\n    def _resolve_tool_name(\n        self, server_name: str, tool_name: str, request: ChatCompletionRequest\n    ) -> str:\n        \"\"\"\n        Resolve the actual tool name by combining server_name and tool_name\n        if server_name is not 'default'.\n        \"\"\"\n        if not server_name or server_name == \"default\":\n            return tool_name\n\n        if not request or not request.tools:\n            return tool_name\n\n        
cache_key = (server_name, tool_name)\n        cached = self._resolved_tool_name_cache.get(cache_key)\n        if cached:\n            return cached\n\n        # Filter tools that contain server_name\n        candidates = []\n        for tool in request.tools:\n            if hasattr(tool, \"function\") and hasattr(tool.function, \"name\"):\n                name = tool.function.name\n                if tool_name in name:\n                    candidates.append(name)\n        if len(candidates) == 1:\n            resolved = candidates[0]\n            self._resolved_tool_name_cache[cache_key] = resolved\n            return resolved\n        # Find match containing tool_name\n        for candidate in candidates:\n            if server_name in candidate:\n                logger.debug(\n                    \"Resolved tool %s -> %s (server: %s)\",\n                    tool_name,\n                    candidate,\n                    server_name,\n                )\n                self._resolved_tool_name_cache[cache_key] = candidate\n                return candidate\n\n        return tool_name\n\n    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:\n        request = super().adjust_request(request)\n        if request.tools and request.tool_choice != \"none\":\n            # Do not skip special tokens for proper tool parsing\n            request.skip_special_tokens = False\n        return request\n\n    def _ensure_tool_id_valid(self, tool_id: int) -> bool:\n        \"\"\"Ensure the tool_id is valid and arrays have enough elements\"\"\"\n        if tool_id < 0:\n            return False\n\n        # Ensure arrays are large enough\n        while len(self.streamed_args_for_tool) <= tool_id:\n            self.streamed_args_for_tool.append(\"\")\n        while len(self.prev_tool_call_arr) <= tool_id:\n            self.prev_tool_call_arr.append({})\n\n        return True\n\n    def extract_tool_calls(\n        self,\n        model_output: str,\n 
       request: ChatCompletionRequest,\n    ) -> ExtractedToolCallInformation:\n        # Sanity check; avoid unnecessary processing\n        if logger.isEnabledFor(10):  # DEBUG\n            logger.debug(\"model_output len=%s\", len(model_output))\n        if (\n            self.tool_call_start_token not in model_output\n            or request.tool_choice == \"none\"\n            or not request.tools\n        ):\n            return ExtractedToolCallInformation(\n                tools_called=False, tool_calls=[], content=model_output\n            )\n\n        try:\n            tool_calls = []\n            had_any_match = False\n            had_parse_error = False\n            # Find all complete tool calls\n            for match in self.tool_call_regex.finditer(model_output):\n                had_any_match = True\n                server_name = match.group(1).strip()\n                tool_name = match.group(2).strip()\n                arguments_str = match.group(3).strip()\n\n                # Resolve tool name\n                tool_name = self._resolve_tool_name(server_name, tool_name, request)\n\n                try:\n                    # Parse arguments as JSON\n                    arguments = json.loads(arguments_str)\n\n                    tool_call = ToolCall(\n                        type=\"function\",\n                        function=FunctionCall(\n                            name=tool_name,\n                            arguments=json.dumps(arguments, ensure_ascii=False),\n                        ),\n                    )\n                    tool_calls.append(tool_call)\n\n                except json.JSONDecodeError:\n                    try:\n                        repaired = json_repair.repair_json(arguments_str)\n                        if not repaired:\n                            had_parse_error = True\n                            logger.warning(\n                                \"Failed to repair tool arguments JSON: %s\",\n                         
       arguments_str,\n                            )\n                            continue\n\n                        arguments = json.loads(repaired)\n                        tool_call = ToolCall(\n                            type=\"function\",\n                            function=FunctionCall(\n                                name=tool_name,\n                                arguments=json.dumps(arguments, ensure_ascii=False),\n                            ),\n                        )\n                        tool_calls.append(tool_call)\n                    except Exception:\n                        had_parse_error = True\n                        logger.warning(\n                            \"Failed to parse tool arguments after repair: %s\",\n                            arguments_str,\n                        )\n                        continue\n\n            # If we couldn't successfully parse tool calls (or format didn't match), do not truncate.\n            # Return the full model output as content to avoid losing text.\n            if had_parse_error or not tool_calls or not had_any_match:\n                return ExtractedToolCallInformation(\n                    tools_called=False, tool_calls=[], content=model_output\n                )\n\n            # Extract content before first tool call\n            content = model_output[: model_output.find(self.tool_call_start_token)]\n\n            return ExtractedToolCallInformation(\n                tools_called=len(tool_calls) > 0,\n                tool_calls=tool_calls,\n                content=content if content else None,\n            )\n\n        except Exception:\n            logger.exception(\"Error in extracting tool call from response.\")\n            return ExtractedToolCallInformation(\n                tools_called=False, tool_calls=[], content=model_output\n            )\n\n    def extract_tool_calls_streaming(\n        self,\n        previous_text: str,\n        current_text: str,\n        delta_text: 
str,\n        previous_token_ids: Sequence[int],\n        current_token_ids: Sequence[int],\n        delta_token_ids: Sequence[int],\n        request: ChatCompletionRequest,\n    ) -> DeltaMessage | None:\n        # Reset state if this is the start of a new request\n        if not previous_text:\n            self.current_tool_name_sent = False\n            self.prev_tool_call_arr = []\n            self.current_tool_id = -1\n            self.streamed_args_for_tool = []\n            self.buffer = \"\"\n            self._resolved_tool_name_cache = {}\n\n            self._stream_mode = \"text\"\n            self._text_token_prefix = \"\"\n            self._tool_end_token_prefix = \"\"\n            self._tool_block_buffer = \"\"\n            self._stream_tool_call_ids = []\n\n        # If tools are disabled for this request, do not suppress tags or parse tool calls.\n        # Flush any internal buffers as plain text so we never drop output.\n        if request.tool_choice == \"none\" or not request.tools:\n            out = \"\"\n            if self.buffer:\n                out += self.buffer\n                self.buffer = \"\"\n            if self._text_token_prefix:\n                out += self._text_token_prefix\n                self._text_token_prefix = \"\"\n            if self._tool_block_buffer:\n                out += self.tool_call_start_token + self._tool_block_buffer\n                self._tool_block_buffer = \"\"\n            if self._tool_end_token_prefix:\n                out += self._tool_end_token_prefix\n                self._tool_end_token_prefix = \"\"\n            out += delta_text\n            return DeltaMessage(content=out) if out else None\n\n        def _longest_token_prefix_at_end(s: str, token: str) -> str:\n            max_len = min(len(token) - 1, len(s))\n            for i in range(max_len, 0, -1):\n                if token.startswith(s[-i:]):\n                    return s[-i:]\n            return \"\"\n\n        emitted_text_parts: 
list[str] = []\n        emitted_tool_calls: list[DeltaToolCall] = []\n\n        chunk = delta_text\n\n        while chunk:\n            if self._stream_mode == \"text\":\n                if self._text_token_prefix:\n                    chunk = self._text_token_prefix + chunk\n                    self._text_token_prefix = \"\"\n\n                start_idx = chunk.find(self.tool_call_start_token)\n                if start_idx < 0:\n                    prefix = _longest_token_prefix_at_end(\n                        chunk, self.tool_call_start_token\n                    )\n                    if prefix:\n                        safe = chunk[: -len(prefix)]\n                        if safe:\n                            emitted_text_parts.append(safe)\n                        self._text_token_prefix = prefix\n                    else:\n                        emitted_text_parts.append(chunk)\n                    break\n\n                before = chunk[:start_idx]\n                if before:\n                    emitted_text_parts.append(before)\n                chunk = chunk[start_idx + len(self.tool_call_start_token) :]\n                self._stream_mode = \"tool\"\n                self._tool_block_buffer = \"\"\n                self._tool_end_token_prefix = \"\"\n                continue\n\n            # tool mode\n            if self._tool_end_token_prefix:\n                chunk = self._tool_end_token_prefix + chunk\n                self._tool_end_token_prefix = \"\"\n\n            end_idx = chunk.find(self.tool_call_end_token)\n            if end_idx < 0:\n                prefix = _longest_token_prefix_at_end(chunk, self.tool_call_end_token)\n                if prefix:\n                    self._tool_block_buffer += chunk[: -len(prefix)]\n                    self._tool_end_token_prefix = prefix\n                else:\n                    self._tool_block_buffer += chunk\n                break\n\n            # Complete tool block\n            self._tool_block_buffer 
+= chunk[:end_idx]\n            tool_block = (\n                self.tool_call_start_token\n                + self._tool_block_buffer\n                + self.tool_call_end_token\n            )\n            remainder = chunk[end_idx + len(self.tool_call_end_token) :]\n\n            # Reset tool buffers before parsing\n            self._stream_mode = \"text\"\n            self._tool_block_buffer = \"\"\n            self._tool_end_token_prefix = \"\"\n\n            try:\n                m = self._complete_tool_block_regex.search(tool_block)\n                if not m:\n                    emitted_text_parts.append(tool_block)\n                    chunk = remainder\n                    continue\n\n                server_name = (m.group(1) or \"\").strip()\n                tool_name = (m.group(2) or \"\").strip()\n                arguments_str = (m.group(3) or \"\").strip()\n\n                if not tool_name:\n                    emitted_text_parts.append(tool_block)\n                    chunk = remainder\n                    continue\n\n                resolved_name = (\n                    self._resolve_tool_name(server_name, tool_name, request)\n                    if server_name\n                    else tool_name\n                )\n\n                # Finalize arguments strictly at end of the block\n                if not arguments_str:\n                    arguments_json_str = \"{}\"\n                else:\n                    try:\n                        arguments_obj = json.loads(arguments_str)\n                    except Exception:\n                        repaired = json_repair.repair_json(arguments_str)\n                        if not repaired:\n                            emitted_text_parts.append(tool_block)\n                            chunk = remainder\n                            continue\n                        arguments_obj = json.loads(repaired)\n                    arguments_json_str = json.dumps(arguments_obj, ensure_ascii=False)\n\n              
  tool_index = len(self._stream_tool_call_ids)\n                tool_call_id = make_tool_call_id()\n                self._stream_tool_call_ids.append(tool_call_id)\n\n                emitted_tool_calls.append(\n                    DeltaToolCall(\n                        index=tool_index,\n                        type=\"function\",\n                        id=tool_call_id,\n                        function=DeltaFunctionCall(\n                            name=resolved_name,\n                            arguments=arguments_json_str,\n                        ).model_dump(exclude_none=True),\n                    )\n                )\n\n            except Exception:\n                logger.exception(\n                    \"Error parsing complete tool block in streaming; falling back to plain text.\"\n                )\n                emitted_text_parts.append(tool_block)\n\n            chunk = remainder\n\n        emitted_text = \"\".join(emitted_text_parts) if emitted_text_parts else None\n        if emitted_text is not None and emitted_text == \"\":\n            emitted_text = None\n        if emitted_text is None and not emitted_tool_calls:\n            return None\n\n        # vLLM's DeltaMessage.tool_calls is validated as a list; do not pass None explicitly.\n        if emitted_tool_calls:\n            return DeltaMessage(content=emitted_text, tool_calls=emitted_tool_calls)\n        return DeltaMessage(content=emitted_text)\n\n\n# Register the tool parser to ToolParserManager\nToolParserManager.register_module(\"mirothinker\", True, MirothinkerToolParser)\n"
  },
  {
    "path": "apps/lobehub-compatibility/README.md",
    "content": "# LobeChat Integration Guide\n\nThis guide describes how to integrate the MiroThinker model with [LobeChat](https://github.com/lobehub/lobe-chat), an open-source, modern LLM UI framework supporting tool usage (function calling).\n\n## Before You Start\n\nMiroThinker is a reasoning model. When generating responses, it first outputs its reasoning process inside `<think>...</think>` tags, then provides the final answer. For agentic tasks (multi-step tool use), the model performs better when it can see its previous reasoning in the conversation history.\n\nHowever, LobeChat does not preserve reasoning content in conversation history. When sending messages back to the API, LobeChat strips the `<think>...</think>` content from previous assistant messages. This means the model cannot see its prior reasoning steps.\n\n- For general chat: This works fine.\n- For agentic workflows: Performance may be degraded since the model cannot reference its previous reasoning.\n\nIf you need full reasoning preservation for agentic use cases, consider modifying LobeChat's source code to return `reasoning_content` in conversation history.\n\n## 1. Start the Inference Service\n\nFirst, launch the MiroThinker model using vLLM with the OpenAI-compatible API adapter. We use vLLM because it supports loading custom tool parsers from external Python files, while SGLang does not. Ensure you include the tool parser plugin.\n\n```bash\n# Configuration\nPORT=61002\nMODEL_PATH=miromind-ai/MiroThinker-v1.5-30B\n\n# Start vLLM server\nvllm serve $MODEL_PATH \\\n    --served-model-name mirothinker \\\n    --port $PORT \\\n    --trust-remote-code \\\n    --chat-template chat_template.jinja \\\n    --tool-parser-plugin MiroThinkerToolParser.py \\\n    --tool-call-parser mirothinker \\\n    --enable-auto-tool-choice\n```\n\n## 2. 
Configure LobeChat\n\nYou can use either the self-hosted version or the [web application](https://lobechat.com/chat).\n\n### Step 1: Access Settings\n\nNavigate to **Settings** -> **AI Service Provider** to add a custom AI service provider.\n\n![Settings Navigation](img/settings.png)\n\n### Step 2: Add Custom AI Provider\n\nClick the `+` button to add a new provider and configure it as follows:\n\n![Add AI Provider](img/AI-provider.png)\n\n| Field | Value | Description |\n| :--- | :--- | :--- |\n| **Provider ID** | `miromind` | Or any identifier you prefer. |\n| **Request Format** | `OPENAI` |  |\n| **API Key** | `your-api-key` | Use any string if auth is disabled. |\n| **API Proxy Address** | `http://localhost:61002/v1` | Replace with your actual service address. |\n\n### Step 3: Configure the Model\n\nAfter adding the provider, add the models you deploy to the service provider's model list:\n\n1. Add a new model with the ID `mirothinker` (must match `--served-model-name`).\n1. **Crucial**: Enable the **Function Calling** capability toggle.\n1. Click \"Check\" to verify connectivity.\n\n![Model Configuration](img/model.png)\n\n## 3. Usage Demo\n\nOnce configured, you can use MiroThinker in LobeChat with full tool-calling capabilities.\n\n![Presentation Demo](img/presentation.gif)\n"
  },
  {
    "path": "apps/lobehub-compatibility/chat_template.jinja",
    "content": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0].role == 'system' %}\n        {{- messages[0].content + '\\n\\n' }}\n    {%- endif %}\n    {{- \"In this environment you have access to a set of tools you can use to answer the user's question.\\n\\nYou only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use.\\n\\nToday is: \" + strftime_now('%Y-%m-%d') + \". For time-dependent questions, answer based on the world as it would reasonably be today.\\n\\n# Tool-Use Formatting Instructions\\n\\nTool-use is formatted using XML-style tags. The tool-use is enclosed in <use_mcp_tool></use_mcp_tool> and each parameter is similarly enclosed within its own set of tags.\\n\\nThe Model Context Protocol (MCP) connects to servers that provide additional tools and resources to extend your capabilities. You can use the server's tools via the `use_mcp_tool`.\\n\\nDescription:\\nRequest to use a tool provided by a MCP server. Each MCP server can provide multiple tools with different capabilities. 
Tools have defined input schemas that specify required and optional parameters.\\n\\nParameters:\\n- server_name: (required) The name of the MCP server providing the tool\\n- tool_name: (required) The name of the tool to execute\\n- arguments: (required) A JSON object containing the tool's input parameters, following the tool's input schema, quotes within string must be properly escaped, ensure it's valid JSON\\n\\nUsage:\\n<use_mcp_tool>\\n<server_name>server name here</server_name>\\n<tool_name>tool name here</tool_name>\\n<arguments>\\n{\\n  \\\"param1\\\": \\\"value1\\\",\\n  \\\"param2\\\": \\\"value2 \\\\\\\"escaped string\\\\\\\"\\\"\\n}\\n</arguments>\\n</use_mcp_tool>\\n\\nImportant Notes:\\n- Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags.\\n- Always adhere to this format for the tool use to ensure proper parsing and execution.\\n\\nString and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. 
The output is not expected to be valid XML and is parsed with regular expressions.\\nHere are the functions available in JSONSchema format:\\n\\n## Server name: default\" }}\n    {%- for tool in tools %}\n        {%- set func = tool.function if tool.function is defined else tool %}\n        {{- \"\\n### Tool name: \" + func.name + \"\\n\" }}\n        {{- \"Description:\\n\" }}\n        {%- set desc = func.description if func.description else '' %}\n        {%- if desc[:4] == '    ' %}\n            {{- desc }}\n        {%- else %}\n            {{- \"    \" + desc }}\n        {%- endif %}\n        {%- if \"Args:\" not in desc and func.parameters is defined and func.parameters.properties is defined %}\n            {{- \"\\n\\n    Args:\" }}\n            {%- for prop_name, prop_value in func.parameters.properties.items() %}\n                {%- if prop_value.description is defined %}\n                    {{- \"\\n        \" + prop_name + \": \" + prop_value.description }}\n                {%- else %}\n                    {{- \"\\n        \" + prop_name + \": \" + (prop_value.type if prop_value.type is defined else \"any\") }}\n                {%- endif %}\n            {%- endfor %}\n        {%- endif %}\n        {{- \"\\n\\nInput JSON schema: \" + (func.parameters | tojson) + \"\\n\" }}\n    {%- endfor %}\n    {{- \"\\n# General Objective\\n\\nYou accomplish a given task iteratively, breaking it down into clear steps and working through them methodically.<|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0].role == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {%- set content = message.content if message.content is not 
none else '' %}\n        {%- set reasoning_content = '' %}\n        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n            {%- set reasoning_content = message.reasoning_content %}\n        {%- else %}\n            {%- if '</think>' in content %}\n                {%- set reasoning_content = (content.split('</think>')[0]).rstrip('\\n') %}\n                {%- set reasoning_content = (reasoning_content.split('<think>')[-1]).lstrip('\\n') %}\n                {%- set content = (content.split('</think>')[-1]).lstrip('\\n') %}\n            {%- endif %}\n        {%- endif %}\n        {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n        {%- if message.tool_calls %}\n            {%- for tool_call in message.tool_calls %}\n                {%- if (loop.first and content) or (not loop.first) %}\n                    {{- '\\n\\n' }}\n                {%- endif %}\n                {%- if tool_call.function %}\n                    {%- set tool_call = tool_call.function %}\n                {%- endif %}\n                {{- '<use_mcp_tool>\\n<server_name>default</server_name>\\n<tool_name>' }}\n                {{- tool_call.name }}\n                {{- '</tool_name>\\n<arguments>\\n' }}\n                {%- if tool_call.arguments is string %}\n                    {{- tool_call.arguments }}\n                {%- else %}\n                    {{- tool_call.arguments | tojson }}\n                {%- endif %}\n                {{- '\\n</arguments>\\n</use_mcp_tool>' }}\n            {%- endfor %}\n        {%- endif %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user\\n' }}\n        {%- else %}\n            {{- '\\n\\n' }}\n        {%- endif %}\n        {{- message.content }}\n        {%- if loop.last or 
(messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\\n\\n</think>\\n\\n' }}\n    {%- endif %}\n{%- endif %}"
  },
  {
    "path": "apps/lobehub-compatibility/requirements.txt",
    "content": "vllm>=0.11.0\njson-repair\nregex"
  },
  {
    "path": "apps/lobehub-compatibility/test_tool_parser.py",
    "content": "#!/usr/bin/env python3\n\"\"\"\nTest MiroThinkerToolParser for correctness.\n\"\"\"\n\nimport json\nimport sys\nfrom types import SimpleNamespace\nfrom unittest.mock import MagicMock\n\nimport regex as re\n\n# Mock vLLM imports for testing without vLLM installed\n# Create mock modules\nmock_vllm = MagicMock()\nmock_vllm.entrypoints = MagicMock()\nmock_vllm.entrypoints.chat_utils = MagicMock()\nmock_vllm.entrypoints.chat_utils.make_tool_call_id = lambda: \"call_test_123\"\n\nmock_protocol = SimpleNamespace(\n    ChatCompletionRequest=MagicMock,\n    DeltaFunctionCall=MagicMock,\n    DeltaMessage=MagicMock,\n    DeltaToolCall=MagicMock,\n    ExtractedToolCallInformation=MagicMock,\n    FunctionCall=MagicMock,\n    ToolCall=MagicMock,\n)\n\nmock_tool_parser = SimpleNamespace(\n    ToolParser=object,\n    ToolParserManager=MagicMock(),\n)\n\nmock_logger = SimpleNamespace(\n    init_logger=lambda x: MagicMock(isEnabledFor=lambda _: False),\n)\n\nsys.modules[\"vllm\"] = mock_vllm\nsys.modules[\"vllm.entrypoints\"] = mock_vllm.entrypoints\nsys.modules[\"vllm.entrypoints.chat_utils\"] = mock_vllm.entrypoints.chat_utils\nsys.modules[\"vllm.entrypoints.openai\"] = MagicMock()\nsys.modules[\"vllm.entrypoints.openai.protocol\"] = mock_protocol\nsys.modules[\"vllm.entrypoints.openai.tool_parsers\"] = MagicMock()\nsys.modules[\"vllm.entrypoints.openai.tool_parsers.abstract_tool_parser\"] = (\n    mock_tool_parser\n)\nsys.modules[\"vllm.logger\"] = mock_logger\n\n\ndef test_tool_call_regex():\n    \"\"\"Test the main tool call regex pattern.\"\"\"\n    tool_call_regex = re.compile(\n        r\"<use_mcp_tool>\\s*\"\n        r\"<server_name>(.*?)</server_name>\\s*\"\n        r\"<tool_name>(.*?)</tool_name>\\s*\"\n        r\"<arguments>\\s*(.*?)\\s*</arguments>\\s*\"\n        r\"</use_mcp_tool>\",\n        re.DOTALL,\n    )\n\n    # Test 1: Basic tool call\n    text1 = 
\"\"\"<use_mcp_tool>\n<server_name>my_mcp_server</server_name>\n<tool_name>web_search</tool_name>\n<arguments>\n{\"query\": \"AI news\"}\n</arguments>\n</use_mcp_tool>\"\"\"\n\n    match = tool_call_regex.search(text1)\n    assert match is not None, \"Should match basic tool call\"\n    assert match.group(1).strip() == \"my_mcp_server\"\n    assert match.group(2).strip() == \"web_search\"\n    assert json.loads(match.group(3).strip()) == {\"query\": \"AI news\"}\n    print(\"✅ Test 1: Basic tool call - PASSED\")\n\n    # Test 2: Tool call with content before\n    text2 = \"\"\"Let me search for that.\n\n<use_mcp_tool>\n<server_name>my_mcp_server</server_name>\n<tool_name>search</tool_name>\n<arguments>\n{\"q\": \"test\"}\n</arguments>\n</use_mcp_tool>\"\"\"\n\n    match = tool_call_regex.search(text2)\n    assert match is not None, \"Should match tool call with content before\"\n    print(\"✅ Test 2: Tool call with content before - PASSED\")\n\n    # Test 3: Multiple tool calls\n    text3 = \"\"\"<use_mcp_tool>\n<server_name>server1</server_name>\n<tool_name>tool1</tool_name>\n<arguments>{\"a\": 1}</arguments>\n</use_mcp_tool>\n\n<use_mcp_tool>\n<server_name>server2</server_name>\n<tool_name>tool2</tool_name>\n<arguments>{\"b\": 2}</arguments>\n</use_mcp_tool>\"\"\"\n\n    matches = list(tool_call_regex.finditer(text3))\n    assert len(matches) == 2, f\"Should find 2 tool calls, found {len(matches)}\"\n    assert matches[0].group(2).strip() == \"tool1\"\n    assert matches[1].group(2).strip() == \"tool2\"\n    print(\"✅ Test 3: Multiple tool calls - PASSED\")\n\n    # Test 4: Complex JSON arguments\n    text4 = \"\"\"<use_mcp_tool>\n<server_name>my_mcp_server</server_name>\n<tool_name>complex_tool</tool_name>\n<arguments>\n{\n  \"query\": \"test with quotes and apostrophes\",\n  \"options\": {\"nested\": true},\n  \"list\": [1, 2, 3]\n}\n</arguments>\n</use_mcp_tool>\"\"\"\n\n    match = tool_call_regex.search(text4)\n    assert match is not None, \"Should match 
complex JSON\"\n    args = json.loads(match.group(3).strip())\n    assert args[\"query\"] == \"test with quotes and apostrophes\"\n    assert args[\"options\"][\"nested\"] is True\n    print(\"✅ Test 4: Complex JSON arguments - PASSED\")\n\n    # Test 5: Empty arguments\n    text5 = \"\"\"<use_mcp_tool>\n<server_name>my_mcp_server</server_name>\n<tool_name>no_args_tool</tool_name>\n<arguments>\n{}\n</arguments>\n</use_mcp_tool>\"\"\"\n\n    match = tool_call_regex.search(text5)\n    assert match is not None, \"Should match empty arguments\"\n    assert json.loads(match.group(3).strip()) == {}\n    print(\"✅ Test 5: Empty arguments - PASSED\")\n\n    # Test 6: Minimal whitespace\n    text6 = \"<use_mcp_tool><server_name>s</server_name><tool_name>t</tool_name><arguments>{}</arguments></use_mcp_tool>\"\n    match = tool_call_regex.search(text6)\n    assert match is not None, \"Should match minimal whitespace\"\n    print(\"✅ Test 6: Minimal whitespace - PASSED\")\n\n\ndef test_partial_tool_regex():\n    \"\"\"Test the partial tool regex for streaming.\"\"\"\n    partial_tool_regex = re.compile(\n        r\"<use_mcp_tool>\\s*\"\n        r\"(?:<server_name>(.*?)</server_name>\\s*)?\"\n        r\"(?:<tool_name>(.*?)</tool_name>\\s*)?\"\n        r\"(?:<arguments>(\\s*.*))?\",\n        re.DOTALL,\n    )\n\n    # Test partial: only opening tag\n    text1 = \"<use_mcp_tool>\\n\"\n    match = partial_tool_regex.search(text1)\n    assert match is not None\n    print(\"✅ Partial test 1: Only opening tag - PASSED\")\n\n    # Test partial: server_name only\n    text2 = \"<use_mcp_tool>\\n<server_name>my_server</server_name>\\n\"\n    match = partial_tool_regex.search(text2)\n    assert match is not None\n    assert match.group(1).strip() == \"my_server\"\n    assert match.group(2) is None\n    print(\"✅ Partial test 2: Server name only - PASSED\")\n\n    # Test partial: incomplete arguments\n    text3 = 
\"\"\"<use_mcp_tool>\n<server_name>my_server</server_name>\n<tool_name>my_tool</tool_name>\n<arguments>\n{\"query\": \"incomp\"\"\"\n\n    match = partial_tool_regex.search(text3)\n    assert match is not None\n    assert match.group(1).strip() == \"my_server\"\n    assert match.group(2).strip() == \"my_tool\"\n    assert '{\"query\": \"incomp' in match.group(3)\n    print(\"✅ Partial test 3: Incomplete arguments - PASSED\")\n\n\ndef test_complete_tool_block_regex():\n    \"\"\"Test the complete tool block regex used in streaming.\"\"\"\n    complete_regex = re.compile(\n        r\"<use_mcp_tool>\\s*\"\n        r\"(?:<server_name>(.*?)</server_name>\\s*)?\"\n        r\"(?:<tool_name>(.*?)</tool_name>\\s*)?\"\n        r\"(?:<arguments>\\s*(.*?)\\s*(?:</arguments>\\s*)?)?\"\n        r\"</use_mcp_tool>\",\n        re.DOTALL,\n    )\n\n    # Test: Complete block\n    text1 = \"\"\"<use_mcp_tool>\n<server_name>my_mcp_server</server_name>\n<tool_name>search</tool_name>\n<arguments>\n{\"q\": \"test\"}\n</arguments>\n</use_mcp_tool>\"\"\"\n\n    match = complete_regex.search(text1)\n    assert match is not None\n    assert match.group(1).strip() == \"my_mcp_server\"\n    assert match.group(2).strip() == \"search\"\n    assert json.loads(match.group(3).strip()) == {\"q\": \"test\"}\n    print(\"✅ Complete block test 1: Full block - PASSED\")\n\n    # Test: Without arguments tag\n    text2 = \"\"\"<use_mcp_tool>\n<server_name>my_mcp_server</server_name>\n<tool_name>simple_tool</tool_name>\n</use_mcp_tool>\"\"\"\n\n    match = complete_regex.search(text2)\n    assert match is not None\n    assert match.group(2).strip() == \"simple_tool\"\n    assert match.group(3) is None\n    print(\"✅ Complete block test 2: Without arguments - PASSED\")\n\n\ndef test_edge_cases():\n    \"\"\"Test edge cases and potential bugs.\"\"\"\n    tool_call_regex = re.compile(\n        r\"<use_mcp_tool>\\s*\"\n        r\"<server_name>(.*?)</server_name>\\s*\"\n        
r\"<tool_name>(.*?)</tool_name>\\s*\"\n        r\"<arguments>\\s*(.*?)\\s*</arguments>\\s*\"\n        r\"</use_mcp_tool>\",\n        re.DOTALL,\n    )\n\n    # Edge case 1: Unicode in arguments\n    text1 = \"\"\"<use_mcp_tool>\n<server_name>my_mcp_server</server_name>\n<tool_name>search</tool_name>\n<arguments>\n{\"query\": \"你好世界 🎉\"}\n</arguments>\n</use_mcp_tool>\"\"\"\n\n    match = tool_call_regex.search(text1)\n    assert match is not None\n    args = json.loads(match.group(3).strip())\n    assert args[\"query\"] == \"你好世界 🎉\"\n    print(\"✅ Edge case 1: Unicode in arguments - PASSED\")\n\n    # Edge case 2: Newlines in JSON\n    text2 = \"\"\"<use_mcp_tool>\n<server_name>my_mcp_server</server_name>\n<tool_name>search</tool_name>\n<arguments>\n{\n  \"query\": \"line1\\\\nline2\\\\nline3\"\n}\n</arguments>\n</use_mcp_tool>\"\"\"\n\n    match = tool_call_regex.search(text2)\n    assert match is not None\n    args = json.loads(match.group(3).strip())\n    assert \"line1\\nline2\" in args[\"query\"]\n    print(\"✅ Edge case 2: Newlines in JSON - PASSED\")\n\n    # Edge case 3: Tags in content (should not match nested)\n    text3 = \"\"\"<use_mcp_tool>\n<server_name>my_mcp_server</server_name>\n<tool_name>search</tool_name>\n<arguments>\n{\"query\": \"<html><body>test</body></html>\"}\n</arguments>\n</use_mcp_tool>\"\"\"\n\n    match = tool_call_regex.search(text3)\n    assert match is not None\n    args = json.loads(match.group(3).strip())\n    assert \"<html>\" in args[\"query\"]\n    print(\"✅ Edge case 3: HTML tags in arguments - PASSED\")\n\n\ndef check_unused_code():\n    \"\"\"Check for unused code in the parser.\"\"\"\n    print(\"\\n\" + \"=\" * 60)\n    print(\"CODE ANALYSIS - Potential Issues\")\n    print(\"=\" * 60)\n\n    issues = []\n\n    # Issue 1: Unused variables\n    unused_vars = [\n        \"self.current_tool_name_sent\",\n        \"self.prev_tool_call_arr\",\n        \"self.current_tool_id\",\n        \"self.streamed_args_for_tool\",\n      
  \"self.buffer\",\n    ]\n    issues.append(\n        f\"⚠️  Unused instance variables (defined but never used in main logic):\\n   {', '.join(unused_vars)}\"\n    )\n\n    # Issue 2: Unused method\n    issues.append(\"⚠️  `_ensure_tool_id_valid` method is defined but never called\")\n\n    # Issue 3: Unused regex\n    issues.append(\"⚠️  `partial_tool_regex` is defined but never used\")\n\n    # Issue 4: server_name handling\n    issues.append(\n        \"⚠️  `_resolve_tool_name` checks for 'default' server_name,\\n   but chat_template.jinja uses 'my_mcp_server'\"\n    )\n\n    for issue in issues:\n        print(f\"\\n{issue}\")\n\n    print(\"\\n\" + \"=\" * 60)\n    print(\"RECOMMENDATIONS\")\n    print(\"=\" * 60)\n    print(\"\"\"\n1. Remove unused variables and methods to clean up the code\n2. Either use `partial_tool_regex` or remove it\n3. Update `_resolve_tool_name` to handle 'my_mcp_server' correctly\n4. The streaming implementation looks correct with the state machine approach\n5. The main `extract_tool_calls` and `extract_tool_calls_streaming` logic appears sound\n\"\"\")\n\n\ndef main():\n    print(\"=\" * 60)\n    print(\"MiroThinkerToolParser Test Suite\")\n    print(\"=\" * 60)\n\n    print(\"\\n--- Testing Main Tool Call Regex ---\")\n    test_tool_call_regex()\n\n    print(\"\\n--- Testing Partial Tool Regex ---\")\n    test_partial_tool_regex()\n\n    print(\"\\n--- Testing Complete Tool Block Regex ---\")\n    test_complete_tool_block_regex()\n\n    print(\"\\n--- Testing Edge Cases ---\")\n    test_edge_cases()\n\n    check_unused_code()\n\n    print(\"\\n\" + \"=\" * 60)\n    print(\"ALL REGEX TESTS PASSED ✅\")\n    print(\"=\" * 60)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "apps/lobehub-compatibility/unit_test.py",
    "content": "#!/usr/bin/env python3\n\"\"\"\nUnit tests for MiroThinker chat template.\n\nRun with: pytest unit_test.py -v\n\"\"\"\n\nfrom datetime import datetime\nfrom pathlib import Path\n\nimport pytest\nfrom jinja2 import BaseLoader, Environment\n\n# ============================================================================\n# Fixtures\n# ============================================================================\n\n\ndef strftime_now(format_str: str) -> str:\n    \"\"\"Simulate vLLM's strftime_now function.\"\"\"\n    return datetime.now().strftime(format_str)\n\n\n@pytest.fixture\ndef template():\n    \"\"\"Load the chat template.\"\"\"\n    template_path = Path(__file__).parent / \"chat_template.jinja\"\n    with open(template_path, \"r\") as f:\n        template_str = f.read()\n\n    env = Environment(loader=BaseLoader())\n    env.globals[\"strftime_now\"] = strftime_now\n    return env.from_string(template_str)\n\n\n@pytest.fixture\ndef today_date():\n    \"\"\"Get today's date in YYYY-MM-DD format.\"\"\"\n    return datetime.now().strftime(\"%Y-%m-%d\")\n\n\n# ============================================================================\n# Test: Basic Message Formatting\n# ============================================================================\n\n\nclass TestBasicMessageFormatting:\n    \"\"\"Tests for basic message formatting without tools.\"\"\"\n\n    def test_user_message_format(self, template):\n        \"\"\"User message should be wrapped in <|im_start|>user ... 
<|im_end|>.\"\"\"\n        messages = [{\"role\": \"user\", \"content\": \"Hello!\"}]\n        result = template.render(messages=messages, add_generation_prompt=False)\n\n        assert \"<|im_start|>user\\nHello!<|im_end|>\" in result\n\n    def test_system_message_format(self, template):\n        \"\"\"System message should be wrapped correctly.\"\"\"\n        messages = [\n            {\"role\": \"system\", \"content\": \"You are helpful.\"},\n            {\"role\": \"user\", \"content\": \"Hi\"},\n        ]\n        result = template.render(messages=messages, add_generation_prompt=False)\n\n        assert \"<|im_start|>system\\nYou are helpful.<|im_end|>\" in result\n\n    def test_assistant_message_format(self, template):\n        \"\"\"Assistant message should be wrapped correctly with <think> tags.\"\"\"\n        messages = [\n            {\"role\": \"user\", \"content\": \"Hello\"},\n            {\"role\": \"assistant\", \"content\": \"Hi there!\"},\n        ]\n        result = template.render(messages=messages, add_generation_prompt=False)\n\n        # Assistant always outputs <think> tags (even if empty)\n        assert (\n            \"<|im_start|>assistant\\n<think>\\n\\n</think>\\n\\nHi there!<|im_end|>\"\n            in result\n        )\n\n    def test_add_generation_prompt(self, template):\n        \"\"\"add_generation_prompt should add <|im_start|>assistant at the end.\"\"\"\n        messages = [{\"role\": \"user\", \"content\": \"Hello\"}]\n        result = template.render(messages=messages, add_generation_prompt=True)\n\n        assert result.endswith(\"<|im_start|>assistant\\n\")\n\n    def test_multi_turn_conversation(self, template):\n        \"\"\"Multi-turn conversation should maintain correct order.\"\"\"\n        messages = [\n            {\"role\": \"system\", \"content\": \"System prompt\"},\n            {\"role\": \"user\", \"content\": \"User 1\"},\n            {\"role\": \"assistant\", \"content\": \"Assistant 1\"},\n            
{\"role\": \"user\", \"content\": \"User 2\"},\n        ]\n        result = template.render(messages=messages, add_generation_prompt=True)\n\n        # Check order\n        sys_pos = result.find(\"System prompt\")\n        user1_pos = result.find(\"User 1\")\n        asst1_pos = result.find(\"Assistant 1\")\n        user2_pos = result.find(\"User 2\")\n\n        assert sys_pos < user1_pos < asst1_pos < user2_pos\n\n\n# ============================================================================\n# Test: Thinking/Reasoning Content\n# ============================================================================\n\n\nclass TestThinkingContent:\n    \"\"\"Tests for <think> tag handling.\"\"\"\n\n    def test_reasoning_content_field(self, template):\n        \"\"\"reasoning_content field should be wrapped in <think> tags.\"\"\"\n        messages = [\n            {\"role\": \"user\", \"content\": \"What is 2+2?\"},\n            {\n                \"role\": \"assistant\",\n                \"content\": \"The answer is 4.\",\n                \"reasoning_content\": \"2+2=4 by basic arithmetic.\",\n            },\n        ]\n        result = template.render(messages=messages, add_generation_prompt=False)\n\n        assert \"<think>\\n2+2=4 by basic arithmetic.\\n</think>\" in result\n        assert \"The answer is 4.\" in result\n\n    def test_think_tags_in_content(self, template):\n        \"\"\"<think> tags in content should be extracted and reformatted.\"\"\"\n        messages = [\n            {\"role\": \"user\", \"content\": \"Question\"},\n            {\n                \"role\": \"assistant\",\n                \"content\": \"<think>\\nMy reasoning here.\\n</think>\\n\\nMy answer here.\",\n            },\n        ]\n        result = template.render(messages=messages, add_generation_prompt=False)\n\n        assert \"<think>\\nMy reasoning here.\\n</think>\" in result\n        assert \"My answer here.\" in result\n\n    def test_think_preserved_in_history(self, 
template):\n        \"\"\"Think tags should be preserved in historical messages, not removed.\"\"\"\n        messages = [\n            {\"role\": \"user\", \"content\": \"First question\"},\n            {\n                \"role\": \"assistant\",\n                \"content\": \"First answer\",\n                \"reasoning_content\": \"First reasoning\",\n            },\n            {\"role\": \"user\", \"content\": \"Second question\"},\n        ]\n        result = template.render(messages=messages, add_generation_prompt=True)\n\n        # Historical thinking should be present\n        assert \"<think>\\nFirst reasoning\\n</think>\" in result\n\n    def test_enable_thinking_false(self, template):\n        \"\"\"enable_thinking=false should output empty think tags.\"\"\"\n        messages = [{\"role\": \"user\", \"content\": \"Hello\"}]\n        result = template.render(\n            messages=messages, add_generation_prompt=True, enable_thinking=False\n        )\n\n        assert result.endswith(\"<|im_start|>assistant\\n<think>\\n\\n</think>\\n\\n\")\n\n    def test_enable_thinking_true(self, template):\n        \"\"\"enable_thinking=true should not output empty think tags.\"\"\"\n        messages = [{\"role\": \"user\", \"content\": \"Hello\"}]\n        result = template.render(\n            messages=messages, add_generation_prompt=True, enable_thinking=True\n        )\n\n        assert result.endswith(\"<|im_start|>assistant\\n\")\n        assert \"<think>\\n\\n</think>\" not in result\n\n\n# ============================================================================\n# Test: Tool Definitions in System Prompt\n# ============================================================================\n\n\nclass TestToolDefinitions:\n    \"\"\"Tests for tool definition formatting in system prompt.\"\"\"\n\n    def test_tools_trigger_system_prompt(self, template, today_date):\n        \"\"\"When tools are provided, a special system prompt should be generated.\"\"\"\n        
messages = [{\"role\": \"user\", \"content\": \"Search something\"}]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"web_search\",\n                    \"description\": \"Search the web\",\n                    \"parameters\": {\"type\": \"object\", \"properties\": {}},\n                },\n            }\n        ]\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=True\n        )\n\n        assert \"In this environment you have access to a set of tools\" in result\n        assert f\"Today is: {today_date}\" in result\n        assert \"# Tool-Use Formatting Instructions\" in result\n\n    def test_tool_name_format(self, template):\n        \"\"\"Tool should be formatted with ### Tool name: header.\"\"\"\n        messages = [{\"role\": \"user\", \"content\": \"Test\"}]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"my_tool\",\n                    \"description\": \"My description\",\n                    \"parameters\": {\"type\": \"object\", \"properties\": {}},\n                },\n            }\n        ]\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=True\n        )\n\n        assert \"### Tool name: my_tool\" in result\n\n    def test_tool_server_name(self, template):\n        \"\"\"Tool server name should be rendered as default.\"\"\"\n        messages = [{\"role\": \"user\", \"content\": \"Test\"}]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"test_tool\",\n                    \"description\": \"Test\",\n                    \"parameters\": {\"type\": \"object\", \"properties\": {}},\n                },\n            }\n        ]\n        result = template.render(\n            
messages=messages, tools=tools, add_generation_prompt=True\n        )\n\n        assert \"## Server name: default\" in result\n\n    def test_tool_description_indentation(self, template):\n        \"\"\"Tool description should be indented with 4 spaces.\"\"\"\n        messages = [{\"role\": \"user\", \"content\": \"Test\"}]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"test_tool\",\n                    \"description\": \"My tool description\",\n                    \"parameters\": {\"type\": \"object\", \"properties\": {}},\n                },\n            }\n        ]\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=True\n        )\n\n        assert \"Description:\\n    My tool description\" in result\n\n    def test_tool_args_auto_generated(self, template):\n        \"\"\"Args section should be auto-generated from parameters.properties.\"\"\"\n        messages = [{\"role\": \"user\", \"content\": \"Test\"}]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"search\",\n                    \"description\": \"Search function\",\n                    \"parameters\": {\n                        \"type\": \"object\",\n                        \"properties\": {\n                            \"query\": {\"type\": \"string\", \"description\": \"Search query\"},\n                            \"limit\": {\"type\": \"integer\", \"description\": \"Max results\"},\n                        },\n                    },\n                },\n            }\n        ]\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=True\n        )\n\n        assert \"Args:\" in result\n        assert \"query: Search query\" in result\n        assert \"limit: Max results\" in result\n\n    def 
test_tool_args_not_duplicated(self, template):\n        \"\"\"If description already has Args:, don't add another.\"\"\"\n        messages = [{\"role\": \"user\", \"content\": \"Test\"}]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"search\",\n                    \"description\": \"Search function\\n\\nArgs:\\n    query: The query\",\n                    \"parameters\": {\n                        \"type\": \"object\",\n                        \"properties\": {\n                            \"query\": {\"type\": \"string\", \"description\": \"Search query\"}\n                        },\n                    },\n                },\n            }\n        ]\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=True\n        )\n\n        # Should only have one Args: section\n        assert result.count(\"Args:\") == 1\n\n    def test_tool_json_schema_included(self, template):\n        \"\"\"Input JSON schema should be included.\"\"\"\n        messages = [{\"role\": \"user\", \"content\": \"Test\"}]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"test\",\n                    \"description\": \"Test\",\n                    \"parameters\": {\n                        \"type\": \"object\",\n                        \"properties\": {\"x\": {\"type\": \"string\"}},\n                    },\n                },\n            }\n        ]\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=True\n        )\n\n        assert \"Input JSON schema:\" in result\n        assert '\"type\": \"object\"' in result or '\"type\":\"object\"' in result\n\n    def test_tool_without_function_wrapper(self, template):\n        \"\"\"Tools can be passed without the function wrapper.\"\"\"\n        messages = [{\"role\": 
\"user\", \"content\": \"Test\"}]\n        tools = [\n            {\n                \"name\": \"direct_tool\",\n                \"description\": \"Direct tool format\",\n                \"parameters\": {\"type\": \"object\", \"properties\": {}},\n            }\n        ]\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=True\n        )\n\n        assert \"### Tool name: direct_tool\" in result\n\n    def test_tool_none_description(self, template):\n        \"\"\"Tool with None description should not crash.\"\"\"\n        messages = [{\"role\": \"user\", \"content\": \"Test\"}]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"test\",\n                    \"description\": None,\n                    \"parameters\": {\"type\": \"object\", \"properties\": {}},\n                },\n            }\n        ]\n        # Should not raise an exception\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=True\n        )\n        assert \"### Tool name: test\" in result\n\n    def test_tool_empty_description(self, template):\n        \"\"\"Tool with empty description should not crash.\"\"\"\n        messages = [{\"role\": \"user\", \"content\": \"Test\"}]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"test\",\n                    \"description\": \"\",\n                    \"parameters\": {\"type\": \"object\", \"properties\": {}},\n                },\n            }\n        ]\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=True\n        )\n        assert \"### Tool name: test\" in result\n\n    def test_system_message_prepended_with_tools(self, template):\n        \"\"\"Custom system message should be prepended when tools are 
present.\"\"\"\n        messages = [\n            {\"role\": \"system\", \"content\": \"You are MiroThinker.\"},\n            {\"role\": \"user\", \"content\": \"Hi\"},\n        ]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"test\",\n                    \"description\": \"Test\",\n                    \"parameters\": {\"type\": \"object\", \"properties\": {}},\n                },\n            }\n        ]\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=True\n        )\n\n        # System message should come first, then tool instructions\n        sys_idx = result.find(\"You are MiroThinker.\")\n        tools_idx = result.find(\"In this environment you have access\")\n        assert sys_idx < tools_idx\n\n\n# ============================================================================\n# Test: Tool Calls in Assistant Messages\n# ============================================================================\n\n\nclass TestToolCalls:\n    \"\"\"Tests for tool call formatting in assistant messages.\"\"\"\n\n    def test_tool_call_format(self, template):\n        \"\"\"Tool calls should be formatted with <use_mcp_tool> tags.\"\"\"\n        messages = [\n            {\"role\": \"user\", \"content\": \"Search for AI\"},\n            {\n                \"role\": \"assistant\",\n                \"content\": \"Let me search.\",\n                \"tool_calls\": [\n                    {\n                        \"id\": \"call_1\",\n                        \"type\": \"function\",\n                        \"function\": {\n                            \"name\": \"web_search\",\n                            \"arguments\": '{\"query\": \"AI news\"}',\n                        },\n                    }\n                ],\n            },\n        ]\n        tools = [\n            {\n                \"type\": \"function\",\n                
\"function\": {\n                    \"name\": \"web_search\",\n                    \"description\": \"Search\",\n                    \"parameters\": {\"type\": \"object\", \"properties\": {}},\n                },\n            }\n        ]\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=False\n        )\n\n        assert \"<use_mcp_tool>\" in result\n        assert \"<server_name>default</server_name>\" in result\n        assert \"<tool_name>web_search</tool_name>\" in result\n        assert \"<arguments>\" in result\n        assert '{\"query\": \"AI news\"}' in result\n        assert \"</arguments>\" in result\n        assert \"</use_mcp_tool>\" in result\n\n    def test_tool_call_no_content(self, template):\n        \"\"\"Tool call with None content should work.\"\"\"\n        messages = [\n            {\"role\": \"user\", \"content\": \"Search\"},\n            {\n                \"role\": \"assistant\",\n                \"content\": None,\n                \"tool_calls\": [\n                    {\n                        \"id\": \"call_1\",\n                        \"function\": {\n                            \"name\": \"search\",\n                            \"arguments\": '{\"q\": \"test\"}',\n                        },\n                    }\n                ],\n            },\n        ]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"search\",\n                    \"description\": \"Search\",\n                    \"parameters\": {\"type\": \"object\", \"properties\": {}},\n                },\n            }\n        ]\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=False\n        )\n\n        # Should have tool call with empty think tags (no content before tool call)\n        assert \"<|im_start|>assistant\\n<think>\\n\\n</think>\\n\\n<use_mcp_tool>\" in 
result\n\n    def test_multiple_tool_calls(self, template):\n        \"\"\"Multiple tool calls should be separated by newlines.\"\"\"\n        messages = [\n            {\"role\": \"user\", \"content\": \"Compare Tokyo and Osaka\"},\n            {\n                \"role\": \"assistant\",\n                \"content\": \"I'll search both.\",\n                \"tool_calls\": [\n                    {\n                        \"id\": \"call_1\",\n                        \"function\": {\n                            \"name\": \"search\",\n                            \"arguments\": '{\"q\": \"Tokyo\"}',\n                        },\n                    },\n                    {\n                        \"id\": \"call_2\",\n                        \"function\": {\n                            \"name\": \"search\",\n                            \"arguments\": '{\"q\": \"Osaka\"}',\n                        },\n                    },\n                ],\n            },\n        ]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"search\",\n                    \"description\": \"Search\",\n                    \"parameters\": {\"type\": \"object\", \"properties\": {}},\n                },\n            }\n        ]\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=False\n        )\n\n        # Extract assistant message part (after the last <|im_start|>assistant)\n        assistant_start = result.rfind(\"<|im_start|>assistant\")\n        assistant_part = result[assistant_start:]\n\n        # Should have two tool calls in assistant message\n        assert assistant_part.count(\"<use_mcp_tool>\") == 2\n        assert assistant_part.count(\"</use_mcp_tool>\") == 2\n\n    def test_tool_call_arguments_dict(self, template):\n        \"\"\"Tool call with dict arguments should be JSON serialized.\"\"\"\n        messages = [\n            {\"role\": 
\"user\", \"content\": \"Search\"},\n            {\n                \"role\": \"assistant\",\n                \"content\": \"\",\n                \"tool_calls\": [\n                    {\n                        \"id\": \"call_1\",\n                        \"function\": {\n                            \"name\": \"search\",\n                            \"arguments\": {\"q\": \"test\", \"limit\": 5},  # dict, not string\n                        },\n                    }\n                ],\n            },\n        ]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"search\",\n                    \"description\": \"Search\",\n                    \"parameters\": {\"type\": \"object\", \"properties\": {}},\n                },\n            }\n        ]\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=False\n        )\n\n        # Arguments should be JSON serialized\n        assert \"<arguments>\" in result\n        assert '\"q\"' in result or \"'q'\" in result\n\n\n# ============================================================================\n# Test: Tool Responses\n# ============================================================================\n\n\nclass TestToolResponses:\n    \"\"\"Tests for tool response handling.\"\"\"\n\n    def test_tool_response_in_user_message(self, template):\n        \"\"\"Tool response should be embedded in a user message.\"\"\"\n        messages = [\n            {\"role\": \"user\", \"content\": \"Search\"},\n            {\n                \"role\": \"assistant\",\n                \"content\": \"Searching...\",\n                \"tool_calls\": [\n                    {\n                        \"id\": \"call_1\",\n                        \"function\": {\"name\": \"search\", \"arguments\": '{\"q\": \"test\"}'},\n                    }\n                ],\n            },\n            {\n                
\"role\": \"tool\",\n                \"tool_call_id\": \"call_1\",\n                \"content\": \"Search results here\",\n            },\n        ]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"search\",\n                    \"description\": \"Search\",\n                    \"parameters\": {\"type\": \"object\", \"properties\": {}},\n                },\n            }\n        ]\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=True\n        )\n\n        # Tool response should be in a user message\n        assert \"<|im_start|>user\\nSearch results here<|im_end|>\" in result\n\n    def test_multiple_tool_responses_merged(self, template):\n        \"\"\"Multiple consecutive tool responses should be merged into one user message.\"\"\"\n        messages = [\n            {\"role\": \"user\", \"content\": \"Compare\"},\n            {\n                \"role\": \"assistant\",\n                \"content\": \"Searching...\",\n                \"tool_calls\": [\n                    {\n                        \"id\": \"call_1\",\n                        \"function\": {\"name\": \"search\", \"arguments\": '{\"q\": \"A\"}'},\n                    },\n                    {\n                        \"id\": \"call_2\",\n                        \"function\": {\"name\": \"search\", \"arguments\": '{\"q\": \"B\"}'},\n                    },\n                ],\n            },\n            {\"role\": \"tool\", \"tool_call_id\": \"call_1\", \"content\": \"Result A\"},\n            {\"role\": \"tool\", \"tool_call_id\": \"call_2\", \"content\": \"Result B\"},\n        ]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"search\",\n                    \"description\": \"Search\",\n                    \"parameters\": {\"type\": \"object\", \"properties\": 
{}},\n                },\n            }\n        ]\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=True\n        )\n\n        # Should have only one user message containing both results\n        # Results should be separated by \\n\\n\n        assert \"Result A\\n\\nResult B\" in result\n\n        # Count im_start|>user - should have 2 (original user + tool results)\n        user_count = result.count(\"<|im_start|>user\")\n        assert user_count == 2\n\n    def test_tool_response_no_wrapper_tags(self, template):\n        \"\"\"Tool responses should NOT be wrapped in <tool_response> tags.\"\"\"\n        messages = [\n            {\"role\": \"user\", \"content\": \"Search\"},\n            {\n                \"role\": \"assistant\",\n                \"content\": \"\",\n                \"tool_calls\": [\n                    {\n                        \"id\": \"call_1\",\n                        \"function\": {\"name\": \"search\", \"arguments\": '{\"q\": \"test\"}'},\n                    }\n                ],\n            },\n            {\"role\": \"tool\", \"tool_call_id\": \"call_1\", \"content\": \"Results\"},\n        ]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"search\",\n                    \"description\": \"Search\",\n                    \"parameters\": {\"type\": \"object\", \"properties\": {}},\n                },\n            }\n        ]\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=True\n        )\n\n        assert \"<tool_response>\" not in result\n        assert \"</tool_response>\" not in result\n\n\n# ============================================================================\n# Test: Edge Cases\n# ============================================================================\n\n\nclass TestEdgeCases:\n    \"\"\"Tests for edge cases and error 
handling.\"\"\"\n\n    def test_only_system_message(self, template):\n        \"\"\"Only system message should work.\"\"\"\n        messages = [{\"role\": \"system\", \"content\": \"You are helpful.\"}]\n        result = template.render(messages=messages, add_generation_prompt=False)\n        assert \"<|im_start|>system\\nYou are helpful.<|im_end|>\" in result\n\n    def test_assistant_empty_content(self, template):\n        \"\"\"Assistant with empty string content should work.\"\"\"\n        messages = [\n            {\"role\": \"user\", \"content\": \"Hi\"},\n            {\"role\": \"assistant\", \"content\": \"\"},\n        ]\n        result = template.render(messages=messages, add_generation_prompt=False)\n        # Assistant always outputs <think> tags (even with empty content)\n        assert \"<|im_start|>assistant\\n<think>\\n\\n</think>\\n\\n<|im_end|>\" in result\n\n    def test_unicode_content(self, template):\n        \"\"\"Unicode content should be preserved.\"\"\"\n        messages = [\n            {\"role\": \"user\", \"content\": \"你好！🎉\"},\n            {\"role\": \"assistant\", \"content\": \"こんにちは！\"},\n        ]\n        result = template.render(messages=messages, add_generation_prompt=False)\n        assert \"你好！🎉\" in result\n        assert \"こんにちは！\" in result\n\n    def test_special_characters_in_content(self, template):\n        \"\"\"Special characters should be preserved.\"\"\"\n        messages = [\n            {\"role\": \"user\", \"content\": \"Test <tag> & \\\"quotes\\\" 'apostrophe'\"},\n        ]\n        result = template.render(messages=messages, add_generation_prompt=False)\n        assert '<tag> & \"quotes\"' in result\n\n    def test_newlines_preserved(self, template):\n        \"\"\"Newlines in content should be preserved.\"\"\"\n        messages = [\n            {\"role\": \"user\", \"content\": \"Line 1\\nLine 2\\n\\nLine 4\"},\n        ]\n        result = template.render(messages=messages, add_generation_prompt=False)\n     
   assert \"Line 1\\nLine 2\\n\\nLine 4\" in result\n\n\n# ============================================================================\n# Test: Complete Flow\n# ============================================================================\n\n\nclass TestCompleteFlow:\n    \"\"\"Integration tests for complete conversation flows.\"\"\"\n\n    def test_full_tool_use_flow(self, template, today_date):\n        \"\"\"Test a complete tool use flow.\"\"\"\n        messages = [\n            {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n            {\"role\": \"user\", \"content\": \"What's the weather?\"},\n            {\n                \"role\": \"assistant\",\n                \"content\": \"Let me check.\",\n                \"tool_calls\": [\n                    {\n                        \"id\": \"call_1\",\n                        \"function\": {\n                            \"name\": \"weather\",\n                            \"arguments\": '{\"city\": \"Tokyo\"}',\n                        },\n                    }\n                ],\n            },\n            {\"role\": \"tool\", \"tool_call_id\": \"call_1\", \"content\": \"Sunny, 25°C\"},\n            {\n                \"role\": \"assistant\",\n                \"content\": \"It's sunny and 25°C in Tokyo!\",\n            },\n            {\"role\": \"user\", \"content\": \"Thanks!\"},\n        ]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"weather\",\n                    \"description\": \"Get weather info\",\n                    \"parameters\": {\n                        \"type\": \"object\",\n                        \"properties\": {\n                            \"city\": {\"type\": \"string\", \"description\": \"City name\"}\n                        },\n                    },\n                },\n            }\n        ]\n        result = template.render(\n            messages=messages, 
tools=tools, add_generation_prompt=True\n        )\n\n        # Check structure\n        assert \"<|im_start|>system\" in result\n        assert \"You are a helpful assistant.\" in result\n        assert f\"Today is: {today_date}\" in result\n        assert \"### Tool name: weather\" in result\n        assert \"<use_mcp_tool>\" in result\n        assert \"<server_name>default</server_name>\" in result\n        assert \"Sunny, 25°C\" in result\n        assert \"It's sunny and 25°C in Tokyo!\" in result\n        assert result.endswith(\"<|im_start|>assistant\\n\")\n\n    def test_reasoning_with_tool_use(self, template):\n        \"\"\"Test reasoning content combined with tool use.\"\"\"\n        messages = [\n            {\"role\": \"user\", \"content\": \"Search for Python tutorials\"},\n            {\n                \"role\": \"assistant\",\n                \"content\": \"I'll search for Python tutorials.\",\n                \"reasoning_content\": \"User wants Python tutorials. I should use web search.\",\n                \"tool_calls\": [\n                    {\n                        \"id\": \"call_1\",\n                        \"function\": {\n                            \"name\": \"search\",\n                            \"arguments\": '{\"q\": \"Python tutorials\"}',\n                        },\n                    }\n                ],\n            },\n        ]\n        tools = [\n            {\n                \"type\": \"function\",\n                \"function\": {\n                    \"name\": \"search\",\n                    \"description\": \"Search\",\n                    \"parameters\": {\"type\": \"object\", \"properties\": {}},\n                },\n            }\n        ]\n        result = template.render(\n            messages=messages, tools=tools, add_generation_prompt=False\n        )\n\n        # Should have both thinking and tool call\n        assert \"<think>\" in result\n        assert \"User wants Python tutorials\" in result\n        
assert \"</think>\" in result\n        assert \"<use_mcp_tool>\" in result\n\n\n# ============================================================================\n# Run tests\n# ============================================================================\n\nif __name__ == \"__main__\":\n    pytest.main([__file__, \"-v\"])\n"
  },
  {
    "path": "apps/miroflow-agent/README.md",
    "content": "# MiroFlow Agent\n\n> For comprehensive documentation, installation guide, and tool configuration, see the [main README](../../README.md).\n\n## Prerequisites\n\nBefore running the agent, ensure you have:\n\n1. **Installed dependencies**: Run `uv sync` in this directory\n1. **Configured environment variables**: Copy `.env.example` to `.env` and fill in your API keys\n   ```bash\n   cp .env.example .env\n   # Edit .env with your actual API keys (SERPER_API_KEY, JINA_API_KEY, E2B_API_KEY, etc.)\n   ```\n1. **Started your model server** (for MiroThinker models): See the [Serve the MiroThinker Model](../../README.md#serve-the-mirothinker-model) section\n\n## Quick Start\n\n### Run a Single Task\n\nThe simplest way to test the agent is running `main.py` directly. It will execute a default task: *\"What is the title of today's arxiv paper in computer science?\"*\n\n```bash\n# Using MiroThinker models (requires your own model server)\nuv run python main.py llm=qwen-3 agent=mirothinker_v1.5_keep5_max200 llm.base_url=http://localhost:61002/v1\n\n# Using Claude (requires ANTHROPIC_API_KEY in .env)\nuv run python main.py llm=claude-3-7 agent=single_agent_keep5\n\n# Using GPT-5 (requires OPENAI_API_KEY in .env)\nuv run python main.py llm=gpt-5 agent=single_agent_keep5\n```\n\n### Customize Your Task\n\nTo ask a different question, edit `main.py` line 32:\n\n```python\ntask_description = \"Your custom question here\"\n```\n\nThen run the agent again. 
It will search the web, execute code, and provide an answer.\n\n### Run Benchmark Evaluation\n\nFor systematic evaluation on standard benchmarks, add the `benchmark=` parameter:\n\n```bash\n# Run on debug benchmark (quick test)\nuv run python main.py llm=qwen-3 agent=mirothinker_v1.5_keep5_max200 benchmark=debug llm.base_url=http://localhost:61002/v1\n\n# Run on specific benchmarks\nuv run python main.py llm=qwen-3 agent=mirothinker_v1.5_keep5_max200 benchmark=gaia-validation-text-103 llm.base_url=http://localhost:61002/v1\n```\n\n## Available Configurations\n\n### LLM Models\n\n| Model | Config Name | Requirements |\n|-------|-------------|--------------|\n| MiroThinker (self-hosted) | `qwen-3` | Model server + `llm.base_url` |\n| Claude 3.7 Sonnet | `claude-3-7` | `ANTHROPIC_API_KEY` in .env |\n| GPT-5 | `gpt-5` | `OPENAI_API_KEY` in .env |\n\n### Agent Configurations\n\n**MiroThinker v1.5:**\n\n- `mirothinker_v1.5_keep5_max200` ⭐ (recommended) - context management, up to 200 turns\n- `mirothinker_v1.5_keep5_max400` - context management, up to 400 turns (for BrowseComp)\n- `mirothinker_v1.5` - no context management, up to 600 turns\n\n**MiroThinker v1.0:**\n\n- `mirothinker_v1.0_keep5` (recommended) - context management, up to 600 turns\n- `mirothinker_v1.0` - no context management, up to 600 turns\n\n**General (for closed-source models like Claude, GPT-5):**\n\n- `single_agent_keep5` (recommended) - single agent with context management\n- `single_agent` - single agent without context management\n\n**Multi-Agent (Legacy for v0.1/v0.2):**\n\n- `multi_agent` - multi-agent with commercial tools\n- `multi_agent_os` - multi-agent with open-source tools\n\n### Benchmark Configs\n\n`debug`, `browsecomp`, `browsecomp_zh`, `hle`, `hle-text-2158`, `hle-text-500`, `gaia-validation-text-103`, `gaia-validation`, `frames`, `xbench_deepsearch`, `futurex`, `seal-0`, `aime2025`, `deepsearchqa`, `webwalkerqa`\n\n## Output\n\nThe agent will:\n\n1. 
Execute the task using available tools (search, code execution, etc.)\n1. Generate a final summary and boxed answer\n1. Save detailed logs to the `../../logs/` directory\n1. Display the results in the terminal\n\n## Troubleshooting\n\n| Problem | Solution |\n|---------|----------|\n| API key errors | Check that the `.env` file has the correct keys |\n| Model connection failed | Verify `llm.base_url` is accessible |\n| Tool execution errors | Check E2B/Serper/Jina API keys and quotas |\n| Out of memory | Use the `mirothinker_v1.5_keep5_max200` config |\n\nFor detailed logs, check the `../../logs/` directory.\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/__init__.py",
    "content": ""
  },
  {
    "path": "apps/miroflow-agent/benchmarks/check_progress/check_progress_aime2025.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport argparse\nimport os\n\nfrom common import ProgressChecker\n\n# Benchmark configuration\nFILENAME = os.path.basename(__file__)\nBENCHMARK_NAME = \"aime2025\"\nBENCHMARK_NAME_STD = \"AIME2025\"\nTASKS_PER_RUN = 30\nDATA_PATH = f\"../../data/{BENCHMARK_NAME}/standardized_data.jsonl\"\nTASK_ID_PATTERN = r\"task_([a-f0-9]+)\"\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser(\n        description=f\"Check progress of {BENCHMARK_NAME_STD} benchmark runs.\"\n    )\n    parser.add_argument(\n        \"path\", help=f\"Path to {BENCHMARK_NAME_STD} benchmark directory\"\n    )\n    return parser.parse_args()\n\n\nif __name__ == \"__main__\":\n    args = parse_args()\n\n    try:\n        # Create progress checker and run analysis\n        checker = ProgressChecker(\n            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH\n        )\n        summary = checker.run_analysis(\n            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN\n        )\n        # Exit with appropriate code\n        if summary.total_tasks == 0:\n            print(\"No task files found in any run directories\")\n        elif summary.total_completed == 0:\n            print(\"No tasks completed yet\")\n\n    except FileNotFoundError as e:\n        print(f\"Error: {e}\")\n    except PermissionError as e:\n        print(f\"Error: {e}\")\n    except ValueError as e:\n        print(f\"Error: {e}\")\n    except Exception as e:\n        print(f\"Unexpected error: {e}\")\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/check_progress/check_progress_browsecomp.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport argparse\nimport os\n\nfrom common import ProgressChecker\n\n# Benchmark configuration\nFILENAME = os.path.basename(__file__)\nBENCHMARK_NAME = \"browsecomp\"\nBENCHMARK_NAME_STD = \"BrowseComp-EN\"\nTASKS_PER_RUN = 1266\nDATA_PATH = f\"../../data/{BENCHMARK_NAME}/standardized_data.jsonl\"\nTASK_ID_PATTERN = r\"task_([a-f0-9]+)\"\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser(\n        description=f\"Check progress of {BENCHMARK_NAME_STD} benchmark runs.\"\n    )\n    parser.add_argument(\n        \"path\", help=f\"Path to {BENCHMARK_NAME_STD} benchmark directory\"\n    )\n    return parser.parse_args()\n\n\nif __name__ == \"__main__\":\n    args = parse_args()\n\n    try:\n        # Create progress checker and run analysis\n        checker = ProgressChecker(\n            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH\n        )\n        summary = checker.run_analysis(\n            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN\n        )\n        # Exit with appropriate code\n        if summary.total_tasks == 0:\n            print(\"No task files found in any run directories\")\n        elif summary.total_completed == 0:\n            print(\"No tasks completed yet\")\n\n    except FileNotFoundError as e:\n        print(f\"Error: {e}\")\n    except PermissionError as e:\n        print(f\"Error: {e}\")\n    except ValueError as e:\n        print(f\"Error: {e}\")\n    except Exception as e:\n        print(f\"Unexpected error: {e}\")\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/check_progress/check_progress_browsecomp_zh.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport argparse\nimport os\n\nfrom common import ProgressChecker\n\n# Benchmark configuration\nFILENAME = os.path.basename(__file__)\nBENCHMARK_NAME = \"browsecomp_zh\"\nBENCHMARK_NAME_STD = \"BrowseComp-ZH\"\nTASKS_PER_RUN = 289\nDATA_PATH = f\"../../data/{BENCHMARK_NAME}/standardized_data.jsonl\"\nTASK_ID_PATTERN = r\"task_([a-f0-9]+)\"\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser(\n        description=f\"Check progress of {BENCHMARK_NAME_STD} benchmark runs.\"\n    )\n    parser.add_argument(\n        \"path\", help=f\"Path to {BENCHMARK_NAME_STD} benchmark directory\"\n    )\n    return parser.parse_args()\n\n\nif __name__ == \"__main__\":\n    args = parse_args()\n\n    try:\n        # Create progress checker and run analysis\n        checker = ProgressChecker(\n            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH\n        )\n        summary = checker.run_analysis(\n            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN\n        )\n        # Exit with appropriate code\n        if summary.total_tasks == 0:\n            print(\"No task files found in any run directories\")\n        elif summary.total_completed == 0:\n            print(\"No tasks completed yet\")\n\n    except FileNotFoundError as e:\n        print(f\"Error: {e}\")\n    except PermissionError as e:\n        print(f\"Error: {e}\")\n    except ValueError as e:\n        print(f\"Error: {e}\")\n    except Exception as e:\n        print(f\"Unexpected error: {e}\")\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/check_progress/check_progress_deepsearchqa.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport argparse\nimport glob\nimport json\nimport os\nfrom pathlib import Path\n\nfrom common import ProgressChecker\n\n# Benchmark configuration\nFILENAME = os.path.basename(__file__)\nBENCHMARK_NAME = \"deepsearchqa\"\nBENCHMARK_NAME_STD = \"DeepSearchQA\"\nTASKS_PER_RUN = 900\nDATA_PATH = f\"../../data/{BENCHMARK_NAME}/standardized_data.jsonl\"\nTASK_ID_PATTERN = r\"task_([a-f0-9]+)\"\n\n\ndef extract_eval_details_from_log(log_file: str) -> dict:\n    \"\"\"\n    Extract evaluation details from a completed task log file.\n\n    Returns:\n        Dict with num_correct, num_expected, num_excessive, or empty dict if not found\n    \"\"\"\n    try:\n        with open(log_file, \"r\") as f:\n            content = f.read()\n\n        # Try to parse as JSON first (task log files are JSON)\n        try:\n            log_data = json.loads(content)\n\n            # Method 1: Check for eval_details field (new format - saved directly)\n            if \"eval_details\" in log_data and log_data[\"eval_details\"]:\n                eval_details = log_data[\"eval_details\"]\n                if all(\n                    k in eval_details\n                    for k in [\"num_correct\", \"num_expected\", \"num_excessive\"]\n                ):\n                    return {\n                        \"num_correct\": eval_details[\"num_correct\"],\n                        \"num_expected\": eval_details[\"num_expected\"],\n                        \"num_excessive\": eval_details[\"num_excessive\"],\n                    }\n\n            # Method 2: Check if llm_response contains the evaluation output (legacy format)\n            if \"llm_response\" in log_data and log_data[\"llm_response\"]:\n                llm_response = log_data[\"llm_response\"]\n\n                # Look for DeepSearchQA Judge output\n                if \"DeepSearchQA Judge - Correct:\" in llm_response:\n      
              for line in llm_response.split(\"\\n\"):\n                        if \"DeepSearchQA Judge - Correct:\" in line:\n                            # Parse \"Correct: X/Y, Excessive: Z\"\n                            parts = line.split(\"Correct:\")[1].strip()\n                            correct_part, excessive_part = parts.split(\", Excessive:\")\n                            num_correct, num_expected = map(\n                                int, correct_part.split(\"/\")\n                            )\n                            num_excessive = int(excessive_part.strip())\n\n                            return {\n                                \"num_correct\": num_correct,\n                                \"num_expected\": num_expected,\n                                \"num_excessive\": num_excessive,\n                            }\n        except json.JSONDecodeError:\n            # Not JSON, try as plain text (legacy format)\n            if \"DeepSearchQA Judge - Correct:\" in content:\n                for line in content.split(\"\\n\"):\n                    if \"DeepSearchQA Judge - Correct:\" in line:\n                        # Parse \"Correct: X/Y, Excessive: Z\"\n                        parts = line.split(\"Correct:\")[1].strip()\n                        correct_part, excessive_part = parts.split(\", Excessive:\")\n                        num_correct, num_expected = map(int, correct_part.split(\"/\"))\n                        num_excessive = int(excessive_part.strip())\n\n                        return {\n                            \"num_correct\": num_correct,\n                            \"num_expected\": num_expected,\n                            \"num_excessive\": num_excessive,\n                        }\n    except Exception:\n        pass\n\n    return {}\n\n\ndef calculate_deepsearchqa_metrics_from_logs(base_path: str) -> dict:\n    \"\"\"\n    Calculate metrics from individual task log files (for in-progress runs).\n\n    Returns:\n        
Dict with metrics or None if no completed tasks found\n    \"\"\"\n    try:\n        # Find all completed task log files\n        pattern = os.path.join(base_path, \"run_*/task_*.json\")\n        log_files = glob.glob(pattern)\n\n        if not log_files:\n            return None\n\n        num_valid = 0\n        num_fully_correct = 0\n        num_fully_incorrect = 0\n        num_correct_with_extraneous = 0\n        f1_list = []\n\n        for log_file in log_files:\n            details = extract_eval_details_from_log(log_file)\n            if not details:\n                continue\n\n            num_correct = details[\"num_correct\"]\n            num_expected = details[\"num_expected\"]\n            num_excessive = details[\"num_excessive\"]\n\n            # Calculate per-item metrics\n            true_positives = num_correct\n            false_negatives = num_expected - num_correct\n            false_positives = num_excessive\n\n            # Calculate precision and recall for F1\n            precision = 0.0\n            if (true_positives + false_positives) > 0:\n                precision = true_positives / (true_positives + false_positives)\n\n            recall = 0.0\n            if (true_positives + false_negatives) > 0:\n                recall = true_positives / (true_positives + false_negatives)\n\n            f1 = 0.0\n            if (precision + recall) > 0:\n                f1 = 2 * (precision * recall) / (precision + recall)\n\n            f1_list.append(f1)\n\n            # Classify into categories\n            all_expected_correct = num_correct == num_expected\n            has_extraneous = num_excessive > 0\n\n            if all_expected_correct and not has_extraneous:\n                num_fully_correct += 1\n            elif num_correct == 0:\n                num_fully_incorrect += 1\n            elif all_expected_correct and has_extraneous:\n                num_correct_with_extraneous += 1\n\n            num_valid += 1\n\n        if num_valid > 0:\n 
           return {\n                \"num_valid\": num_valid,\n                \"fully_correct\": num_fully_correct,\n                \"fully_incorrect\": num_fully_incorrect,\n                \"correct_with_extraneous\": num_correct_with_extraneous,\n                \"pct_fully_correct\": num_fully_correct / num_valid,\n                \"pct_fully_incorrect\": num_fully_incorrect / num_valid,\n                \"pct_correct_with_extraneous\": num_correct_with_extraneous / num_valid,\n                \"avg_f1\": sum(f1_list) / len(f1_list),\n            }\n\n        return None\n\n    except Exception:\n        return None\n\n\ndef calculate_deepsearchqa_metrics(results_file: str) -> dict:\n    \"\"\"\n    Calculate DeepSearchQA-specific metrics from results file.\n    Following the official Google DeepSearchQA evaluation metrics:\n    1. Fully Correct: All expected answers correct + no extraneous answers\n    2. Fully Incorrect: No correct answers\n    3. Correct with Extraneous Answers: All expected answers correct + has extraneous\n    4. 
F1 Score: Harmonic mean of precision and recall\n\n    Returns:\n        Dict with the 4 core metrics\n    \"\"\"\n    try:\n        results = []\n        with open(results_file, \"r\") as f:\n            for line in f:\n                if line.strip():\n                    results.append(json.loads(line))\n\n        num_valid = 0\n        num_fully_correct = 0\n        num_fully_incorrect = 0\n        num_correct_with_extraneous = 0\n        f1_list = []\n\n        for result in results:\n            if result.get(\"status\") != \"success\":\n                continue\n\n            # Extract eval_details from attempts\n            if \"attempts\" in result and result[\"attempts\"]:\n                for attempt in result[\"attempts\"]:\n                    if \"eval_details\" in attempt and attempt[\"eval_details\"]:\n                        details = attempt[\"eval_details\"]\n                        num_correct = details.get(\"num_correct\", 0)\n                        num_expected = details.get(\"num_expected\", 0)\n                        num_excessive = details.get(\"num_excessive\", 0)\n\n                        # Calculate per-item metrics\n                        true_positives = num_correct\n                        false_negatives = num_expected - num_correct\n                        false_positives = num_excessive\n\n                        # Calculate precision and recall for F1\n                        precision = 0.0\n                        if (true_positives + false_positives) > 0:\n                            precision = true_positives / (\n                                true_positives + false_positives\n                            )\n\n                        recall = 0.0\n                        if (true_positives + false_negatives) > 0:\n                            recall = true_positives / (true_positives + false_negatives)\n\n                        f1 = 0.0\n                        if (precision + recall) > 0:\n                            f1 
= 2 * (precision * recall) / (precision + recall)\n\n                        f1_list.append(f1)\n\n                        # Classify into categories\n                        all_expected_correct = num_correct == num_expected\n                        has_extraneous = num_excessive > 0\n\n                        if all_expected_correct and not has_extraneous:\n                            num_fully_correct += 1\n                        elif num_correct == 0:\n                            num_fully_incorrect += 1\n                        elif all_expected_correct and has_extraneous:\n                            num_correct_with_extraneous += 1\n\n                        num_valid += 1\n                        break  # Only use first attempt with details\n\n        if num_valid > 0:\n            return {\n                \"num_valid\": num_valid,\n                \"fully_correct\": num_fully_correct,\n                \"fully_incorrect\": num_fully_incorrect,\n                \"correct_with_extraneous\": num_correct_with_extraneous,\n                \"pct_fully_correct\": num_fully_correct / num_valid,\n                \"pct_fully_incorrect\": num_fully_incorrect / num_valid,\n                \"pct_correct_with_extraneous\": num_correct_with_extraneous / num_valid,\n                \"avg_f1\": sum(f1_list) / len(f1_list),\n            }\n        else:\n            return {\"num_valid\": 0}\n\n    except Exception as e:\n        print(f\"Warning: Could not calculate DeepSearchQA metrics: {e}\")\n        return {\"num_valid\": 0}\n\n\ndef show_deepsearchqa_metrics(base_path: str):\n    \"\"\"\n    Show DeepSearchQA-specific metrics for all runs.\n    Following Google DeepSearchQA official metrics:\n    1. Fully Correct\n    2. Fully Incorrect\n    3. Correct with Extraneous Answers\n    4. 
F1 Score\n    \"\"\"\n    print(\"\\n\" + \"=\" * 80)\n    print(\"DeepSearchQA Metrics (Official Google Metrics)\")\n    print(\"=\" * 80)\n\n    # Find all benchmark_results.jsonl files\n    results_files = glob.glob(os.path.join(base_path, \"run_*/benchmark_results.jsonl\"))\n\n    if not results_files:\n        print(\"(Metrics will be available after tasks complete)\")\n        return\n\n    all_fully_correct = []\n    all_fully_incorrect = []\n    all_correct_with_extraneous = []\n    all_f1 = []\n\n    for results_file in sorted(results_files):\n        run_dir = Path(results_file).parent.name\n        metrics = calculate_deepsearchqa_metrics(results_file)\n\n        if metrics[\"num_valid\"] > 0:\n            fully_correct_pct = metrics[\"pct_fully_correct\"]\n            fully_incorrect_pct = metrics[\"pct_fully_incorrect\"]\n            correct_with_extraneous_pct = metrics[\"pct_correct_with_extraneous\"]\n            f1 = metrics[\"avg_f1\"]\n\n            all_fully_correct.append(fully_correct_pct)\n            all_fully_incorrect.append(fully_incorrect_pct)\n            all_correct_with_extraneous.append(correct_with_extraneous_pct)\n            all_f1.append(f1)\n\n            print(f\"\\n{run_dir} ({metrics['num_valid']} items):\")\n            print(\n                f\"  Fully Correct:              {fully_correct_pct:6.2%}  ({metrics['fully_correct']} items)\"\n            )\n            print(\n                f\"  Fully Incorrect:            {fully_incorrect_pct:6.2%}  ({metrics['fully_incorrect']} items)\"\n            )\n            print(\n                f\"  Correct w/ Extraneous:      {correct_with_extraneous_pct:6.2%}  ({metrics['correct_with_extraneous']} items)\"\n            )\n            print(f\"  F1 Score:                   {f1:6.2%}\")\n\n    if all_fully_correct:\n        print(\"\\n\" + \"=\" * 80)\n        print(f\"Average across {len(all_fully_correct)} runs:\")\n        print(\"=\" * 80)\n        avg_fully_correct = 
sum(all_fully_correct) / len(all_fully_correct)\n        avg_fully_incorrect = sum(all_fully_incorrect) / len(all_fully_incorrect)\n        avg_correct_with_extraneous = sum(all_correct_with_extraneous) / len(\n            all_correct_with_extraneous\n        )\n        avg_f1 = sum(all_f1) / len(all_f1)\n\n        print(f\"  Fully Correct:              {avg_fully_correct:6.2%}\")\n        print(f\"  Fully Incorrect:            {avg_fully_incorrect:6.2%}\")\n        print(f\"  Correct w/ Extraneous:      {avg_correct_with_extraneous:6.2%}\")\n        print(f\"  F1 Score:                   {avg_f1:6.2%}\")\n\n    print(\"=\" * 80)\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser(\n        description=f\"Check progress of {BENCHMARK_NAME_STD} benchmark runs.\"\n    )\n    parser.add_argument(\n        \"path\", help=f\"Path to {BENCHMARK_NAME_STD} benchmark directory\"\n    )\n    return parser.parse_args()\n\n\nif __name__ == \"__main__\":\n    args = parse_args()\n\n    try:\n        # Create progress checker and run analysis\n        checker = ProgressChecker(\n            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH\n        )\n        summary = checker.run_analysis(\n            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN\n        )\n\n        # Show DeepSearchQA-specific metrics (only if runs are complete)\n        # Check if any run has completed all its tasks\n        has_complete_run = False\n        run_dirs = glob.glob(os.path.join(args.path, \"run_*\"))\n        for run_dir in run_dirs:\n            results_file = os.path.join(run_dir, \"benchmark_results.jsonl\")\n            if os.path.exists(results_file):\n                has_complete_run = True\n                break\n\n        if has_complete_run:\n            show_deepsearchqa_metrics(args.path)\n        elif summary.total_completed > 0:\n            # Try to show intermediate metrics from completed tasks\n            interim_metrics = 
calculate_deepsearchqa_metrics_from_logs(args.path)\n\n            print(\"\\n\" + \"=\" * 80)\n            print(\"DeepSearchQA Metrics (Official Google Metrics)\")\n            print(\"=\" * 80)\n\n            if interim_metrics and interim_metrics.get(\"num_valid\", 0) > 0:\n                num_with_details = interim_metrics[\"num_valid\"]\n                print(\n                    f\"⚠️  INTERIM RESULTS (based on {num_with_details}/{summary.total_completed} tasks with eval_details)\"\n                )\n                if num_with_details < summary.total_completed:\n                    print(\n                        f\"    Note: {summary.total_completed - num_with_details} completed tasks don't have eval_details (likely ran before the update)\"\n                    )\n                print(\"-\" * 80)\n\n                fully_correct_pct = interim_metrics[\"pct_fully_correct\"]\n                fully_incorrect_pct = interim_metrics[\"pct_fully_incorrect\"]\n                correct_with_extraneous_pct = interim_metrics[\n                    \"pct_correct_with_extraneous\"\n                ]\n                f1 = interim_metrics[\"avg_f1\"]\n\n                print(\n                    f\"  Fully Correct:              {fully_correct_pct:6.2%}  ({interim_metrics['fully_correct']} items)\"\n                )\n                print(\n                    f\"  Fully Incorrect:            {fully_incorrect_pct:6.2%}  ({interim_metrics['fully_incorrect']} items)\"\n                )\n                print(\n                    f\"  Correct w/ Extraneous:      {correct_with_extraneous_pct:6.2%}  ({interim_metrics['correct_with_extraneous']} items)\"\n                )\n                print(f\"  F1 Score:                   {f1:6.2%}\")\n                print()\n                print(\n                    f\"Note: Based on {interim_metrics['num_valid']} completed tasks. 
Final metrics may differ.\"\n                )\n            else:\n                print(f\"Tasks in progress... ({summary.total_completed} completed)\")\n                print(\"Detailed metrics will be available when runs complete.\")\n\n            print(\"=\" * 80)\n\n        # Report when no tasks were found or none have completed\n        if summary.total_tasks == 0:\n            print(\"No task files found in any run directories\")\n        elif summary.total_completed == 0:\n            print(\"No tasks completed yet\")\n\n    except FileNotFoundError as e:\n        print(f\"Error: {e}\")\n    except PermissionError as e:\n        print(f\"Error: {e}\")\n    except ValueError as e:\n        print(f\"Error: {e}\")\n    except Exception as e:\n        print(f\"Unexpected error: {e}\")\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/check_progress/check_progress_frames.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport argparse\nimport os\n\nfrom common import ProgressChecker\n\n# Benchmark configuration\nFILENAME = os.path.basename(__file__)\nBENCHMARK_NAME = \"frames\"\nBENCHMARK_NAME_STD = \"Frames\"\nTASKS_PER_RUN = 824\nDATA_PATH = f\"../../data/{BENCHMARK_NAME}/standardized_data.jsonl\"\nTASK_ID_PATTERN = r\"task_([a-f0-9]+)\"\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser(\n        description=f\"Check progress of {BENCHMARK_NAME_STD} benchmark runs.\"\n    )\n    parser.add_argument(\n        \"path\", help=f\"Path to {BENCHMARK_NAME_STD} benchmark directory\"\n    )\n    return parser.parse_args()\n\n\nif __name__ == \"__main__\":\n    args = parse_args()\n\n    try:\n        # Create progress checker and run analysis\n        checker = ProgressChecker(\n            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH\n        )\n        summary = checker.run_analysis(\n            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN\n        )\n        # Report when no tasks were found or none have completed\n        if summary.total_tasks == 0:\n            print(\"No task files found in any run directories\")\n        elif summary.total_completed == 0:\n            print(\"No tasks completed yet\")\n\n    except FileNotFoundError as e:\n        print(f\"Error: {e}\")\n    except PermissionError as e:\n        print(f\"Error: {e}\")\n    except ValueError as e:\n        print(f\"Error: {e}\")\n    except Exception as e:\n        print(f\"Unexpected error: {e}\")\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/check_progress/check_progress_gaia-validation-text-103.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport argparse\nimport os\n\nfrom common import GAIAProgressChecker as ProgressChecker\n\n# Benchmark configuration\nFILENAME = os.path.basename(__file__)\nBENCHMARK_NAME = \"gaia-2023-validation-text-103\"\nBENCHMARK_NAME_STD = \"GAIA-Text-103\"\nTASKS_PER_RUN = 103\nDATA_PATH = f\"../../data/{BENCHMARK_NAME}/standardized_data.jsonl\"\nTASK_ID_PATTERN = r\"task_([^_]+(?:-[^_]+)*)\"\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser(\n        description=f\"Check progress of {BENCHMARK_NAME_STD} benchmark runs.\"\n    )\n    parser.add_argument(\n        \"path\", help=f\"Path to {BENCHMARK_NAME_STD} benchmark directory\"\n    )\n    return parser.parse_args()\n\n\nif __name__ == \"__main__\":\n    args = parse_args()\n\n    try:\n        # Create progress checker and run analysis\n        checker = ProgressChecker(\n            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH\n        )\n        summary = checker.run_analysis(\n            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN\n        )\n        # Report when no tasks were found or none have completed\n        if summary.total_tasks == 0:\n            print(\"No task files found in any run directories\")\n        elif summary.total_completed == 0:\n            print(\"No tasks completed yet\")\n\n    except FileNotFoundError as e:\n        print(f\"Error: {e}\")\n    except PermissionError as e:\n        print(f\"Error: {e}\")\n    except ValueError as e:\n        print(f\"Error: {e}\")\n    except Exception as e:\n        print(f\"Unexpected error: {e}\")\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/check_progress/check_progress_gaia-validation.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport argparse\nimport os\n\nfrom common import GAIAProgressChecker as ProgressChecker\n\n# Benchmark configuration\nFILENAME = os.path.basename(__file__)\nBENCHMARK_NAME = \"gaia-2023-validation\"\nBENCHMARK_NAME_STD = \"GAIA-Val-165\"\nTASKS_PER_RUN = 165\nDATA_PATH = f\"../../data/{BENCHMARK_NAME}/standardized_data.jsonl\"\nTASK_ID_PATTERN = r\"task_([^_]+(?:-[^_]+)*)\"\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser(\n        description=f\"Check progress of {BENCHMARK_NAME_STD} benchmark runs.\"\n    )\n    parser.add_argument(\n        \"path\", help=f\"Path to {BENCHMARK_NAME_STD} benchmark directory\"\n    )\n    return parser.parse_args()\n\n\nif __name__ == \"__main__\":\n    args = parse_args()\n\n    try:\n        # Create progress checker and run analysis\n        checker = ProgressChecker(\n            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH\n        )\n        summary = checker.run_analysis(\n            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN\n        )\n        # Report when no tasks were found or none have completed\n        if summary.total_tasks == 0:\n            print(\"No task files found in any run directories\")\n        elif summary.total_completed == 0:\n            print(\"No tasks completed yet\")\n\n    except FileNotFoundError as e:\n        print(f\"Error: {e}\")\n    except PermissionError as e:\n        print(f\"Error: {e}\")\n    except ValueError as e:\n        print(f\"Error: {e}\")\n    except Exception as e:\n        print(f\"Unexpected error: {e}\")\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/check_progress/check_progress_hle-text-2158.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport argparse\nimport os\n\nfrom common import ProgressChecker\n\n# Benchmark configuration\nFILENAME = os.path.basename(__file__)\nBENCHMARK_NAME = \"hle-text-2158\"\nBENCHMARK_NAME_STD = \"HLE-Text-2158\"\nTASKS_PER_RUN = 2158\nDATA_PATH = f\"../../data/{BENCHMARK_NAME}/standardized_data.jsonl\"\nTASK_ID_PATTERN = r\"task_([a-f0-9]+)\"\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser(\n        description=f\"Check progress of {BENCHMARK_NAME_STD} benchmark runs.\"\n    )\n    parser.add_argument(\n        \"path\", help=f\"Path to {BENCHMARK_NAME_STD} benchmark directory\"\n    )\n    return parser.parse_args()\n\n\nif __name__ == \"__main__\":\n    args = parse_args()\n\n    try:\n        # Create progress checker and run analysis\n        checker = ProgressChecker(\n            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH\n        )\n        summary = checker.run_analysis(\n            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN\n        )\n        # Report when no tasks were found or none have completed\n        if summary.total_tasks == 0:\n            print(\"No task files found in any run directories\")\n        elif summary.total_completed == 0:\n            print(\"No tasks completed yet\")\n\n    except FileNotFoundError as e:\n        print(f\"Error: {e}\")\n    except PermissionError as e:\n        print(f\"Error: {e}\")\n    except ValueError as e:\n        print(f\"Error: {e}\")\n    except Exception as e:\n        print(f\"Unexpected error: {e}\")\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/check_progress/check_progress_hle-text-500.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport argparse\nimport os\n\nfrom common import ProgressChecker\n\n# Benchmark configuration\nFILENAME = os.path.basename(__file__)\nBENCHMARK_NAME = \"hle-text-500\"\nBENCHMARK_NAME_STD = \"HLE-Text-500\"\nTASKS_PER_RUN = 500\nDATA_PATH = f\"../../data/{BENCHMARK_NAME}/standardized_data.jsonl\"\nTASK_ID_PATTERN = r\"task_([a-f0-9]+)\"\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser(\n        description=f\"Check progress of {BENCHMARK_NAME_STD} benchmark runs.\"\n    )\n    parser.add_argument(\n        \"path\", help=f\"Path to {BENCHMARK_NAME_STD} benchmark directory\"\n    )\n    return parser.parse_args()\n\n\nif __name__ == \"__main__\":\n    args = parse_args()\n\n    try:\n        # Create progress checker and run analysis\n        checker = ProgressChecker(\n            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH\n        )\n        summary = checker.run_analysis(\n            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN\n        )\n        # Report when no tasks were found or none have completed\n        if summary.total_tasks == 0:\n            print(\"No task files found in any run directories\")\n        elif summary.total_completed == 0:\n            print(\"No tasks completed yet\")\n\n    except FileNotFoundError as e:\n        print(f\"Error: {e}\")\n    except PermissionError as e:\n        print(f\"Error: {e}\")\n    except ValueError as e:\n        print(f\"Error: {e}\")\n    except Exception as e:\n        print(f\"Unexpected error: {e}\")\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/check_progress/check_progress_hle.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport argparse\nimport os\n\nfrom common import ProgressChecker\n\n# Benchmark configuration\nFILENAME = os.path.basename(__file__)\nBENCHMARK_NAME = \"hle\"\nBENCHMARK_NAME_STD = \"HLE-2500\"\nTASKS_PER_RUN = 2500\nDATA_PATH = f\"../../data/{BENCHMARK_NAME}/standardized_data.jsonl\"\nTASK_ID_PATTERN = r\"task_([a-f0-9]+)\"\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser(\n        description=f\"Check progress of {BENCHMARK_NAME_STD} benchmark runs.\"\n    )\n    parser.add_argument(\n        \"path\", help=f\"Path to {BENCHMARK_NAME_STD} benchmark directory\"\n    )\n    return parser.parse_args()\n\n\nif __name__ == \"__main__\":\n    args = parse_args()\n\n    try:\n        # Create progress checker and run analysis\n        checker = ProgressChecker(\n            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH\n        )\n        summary = checker.run_analysis(\n            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN\n        )\n        # Report when no tasks were found or none have completed\n        if summary.total_tasks == 0:\n            print(\"No task files found in any run directories\")\n        elif summary.total_completed == 0:\n            print(\"No tasks completed yet\")\n\n    except FileNotFoundError as e:\n        print(f\"Error: {e}\")\n    except PermissionError as e:\n        print(f\"Error: {e}\")\n    except ValueError as e:\n        print(f\"Error: {e}\")\n    except Exception as e:\n        print(f\"Unexpected error: {e}\")\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/check_progress/check_progress_seal-0.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport argparse\nimport os\n\nfrom common import ProgressChecker\n\n# Benchmark configuration\nFILENAME = os.path.basename(__file__)\nBENCHMARK_NAME = \"seal-0\"\nBENCHMARK_NAME_STD = \"SEAL-0\"\nTASKS_PER_RUN = 111\nDATA_PATH = f\"../../data/{BENCHMARK_NAME}/standardized_data.jsonl\"\nTASK_ID_PATTERN = r\"task_([a-f0-9]+)\"\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser(\n        description=f\"Check progress of {BENCHMARK_NAME_STD} benchmark runs.\"\n    )\n    parser.add_argument(\n        \"path\", help=f\"Path to {BENCHMARK_NAME_STD} benchmark directory\"\n    )\n    return parser.parse_args()\n\n\nif __name__ == \"__main__\":\n    args = parse_args()\n\n    try:\n        # Create progress checker and run analysis\n        checker = ProgressChecker(\n            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH\n        )\n        summary = checker.run_analysis(\n            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN\n        )\n        # Report when no tasks were found or none have completed\n        if summary.total_tasks == 0:\n            print(\"No task files found in any run directories\")\n        elif summary.total_completed == 0:\n            print(\"No tasks completed yet\")\n\n    except FileNotFoundError as e:\n        print(f\"Error: {e}\")\n    except PermissionError as e:\n        print(f\"Error: {e}\")\n    except ValueError as e:\n        print(f\"Error: {e}\")\n    except Exception as e:\n        print(f\"Unexpected error: {e}\")\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/check_progress/check_progress_webwalkerqa.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport argparse\nimport os\n\nfrom common import ProgressChecker\n\n# Benchmark configuration\nFILENAME = os.path.basename(__file__)\nBENCHMARK_NAME = \"webwalkerqa\"\nBENCHMARK_NAME_STD = \"WebWalkerQA\"\nTASKS_PER_RUN = 680\nDATA_PATH = f\"../../data/{BENCHMARK_NAME}/standardized_data.jsonl\"\nTASK_ID_PATTERN = r\"task_task_id_(\\d+)\"\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser(\n        description=f\"Check progress of {BENCHMARK_NAME_STD} benchmark runs.\"\n    )\n    parser.add_argument(\n        \"path\", help=f\"Path to {BENCHMARK_NAME_STD} benchmark directory\"\n    )\n    return parser.parse_args()\n\n\nif __name__ == \"__main__\":\n    args = parse_args()\n\n    try:\n        # Create progress checker and run analysis\n        checker = ProgressChecker(\n            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH\n        )\n        summary = checker.run_analysis(\n            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN\n        )\n        # Report when no tasks were found or none have completed\n        if summary.total_tasks == 0:\n            print(\"No task files found in any run directories\")\n        elif summary.total_completed == 0:\n            print(\"No tasks completed yet\")\n\n    except FileNotFoundError as e:\n        print(f\"Error: {e}\")\n    except PermissionError as e:\n        print(f\"Error: {e}\")\n    except ValueError as e:\n        print(f\"Error: {e}\")\n    except Exception as e:\n        print(f\"Unexpected error: {e}\")\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/check_progress/check_progress_xbench_deepsearch.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport argparse\nimport os\n\nfrom common import ProgressChecker\n\n# Benchmark configuration\nFILENAME = os.path.basename(__file__)\nBENCHMARK_NAME = \"xbench_deepsearch\"\nBENCHMARK_NAME_STD = \"XBench-DeepSearch\"\nTASKS_PER_RUN = 100\nDATA_PATH = f\"../../data/{BENCHMARK_NAME}/standardized_data.jsonl\"\nTASK_ID_PATTERN = r\"task_([a-f0-9]+)\"\n\n\ndef parse_args():\n    parser = argparse.ArgumentParser(\n        description=f\"Check progress of {BENCHMARK_NAME_STD} benchmark runs.\"\n    )\n    parser.add_argument(\n        \"path\", help=f\"Path to {BENCHMARK_NAME_STD} benchmark directory\"\n    )\n    return parser.parse_args()\n\n\nif __name__ == \"__main__\":\n    args = parse_args()\n\n    try:\n        # Create progress checker and run analysis\n        checker = ProgressChecker(\n            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH\n        )\n        summary = checker.run_analysis(\n            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN\n        )\n        # Report when no tasks were found or none have completed\n        if summary.total_tasks == 0:\n            print(\"No task files found in any run directories\")\n        elif summary.total_completed == 0:\n            print(\"No tasks completed yet\")\n\n    except FileNotFoundError as e:\n        print(f\"Error: {e}\")\n    except PermissionError as e:\n        print(f\"Error: {e}\")\n    except ValueError as e:\n        print(f\"Error: {e}\")\n    except Exception as e:\n        print(f\"Unexpected error: {e}\")\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/check_progress/common.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport glob\nimport json\nimport math\nimport os\nimport re\nfrom dataclasses import dataclass\nfrom datetime import datetime\nfrom io import StringIO\nfrom typing import Dict, List, Optional, Tuple\n\n# Time estimation constants\nDEFAULT_TASK_TIME_MINUTES = 3.5\nMINUTES_PER_HOUR = 60\nHOURS_PER_DAY = 24\nMINUTES_PER_DAY = MINUTES_PER_HOUR * HOURS_PER_DAY\n\n# Progress bar configuration\nPROGRESS_BAR_WIDTH = 20\nGREEN_THRESHOLD = 80\nYELLOW_THRESHOLD = 60\nORANGE_THRESHOLD = 40\n\n# Judge result patterns for correctness\nCORRECT_RESULTS = [\"CORRECT\", \"SUCCESS\"]\nSUCCESS_PATTERNS = [\"PASS_AT_K_SUCCESS\"]\n\n# Log file configuration\nLOG_FILE_PREFIX = \"progress_analysis_\"\nLOG_FILE_TIMESTAMP_FORMAT = \"%Y%m%d_%H%M%S\"\n\n\ndef create_progress_bar(percentage: float, width: int = PROGRESS_BAR_WIDTH) -> str:\n    \"\"\"Create a visual progress bar for percentage display\"\"\"\n    filled = int(width * percentage / 100)\n    bar = \"█\" * filled + \"░\" * (width - filled)\n\n    # Add color based on percentage\n    if percentage >= GREEN_THRESHOLD:\n        color = \"\\033[92m\"  # Green\n    elif percentage >= YELLOW_THRESHOLD:\n        color = \"\\033[93m\"  # Yellow\n    elif percentage >= ORANGE_THRESHOLD:\n        color = \"\\033[33m\"  # Orange\n    else:\n        color = \"\\033[91m\"  # Red\n\n    reset = \"\\033[0m\"\n    return f\"{color}[{bar}] {percentage:.1f}%{reset}\"\n\n\ndef find_earliest_start_time(completed_files: List[str]) -> Optional[datetime]:\n    \"\"\"Find the earliest start time from all completed files\"\"\"\n    earliest_time = None\n\n    for file_path in completed_files:\n        try:\n            with open(file_path, \"r\", encoding=\"utf-8\") as f:\n                data = json.load(f)\n\n            if \"start_time\" in data:\n                # Parse UTC time and convert to naive datetime\n                start_time_str = 
data[\"start_time\"]\n                if start_time_str.endswith(\"Z\"):\n                    start_time_str = start_time_str[:-1] + \"+00:00\"\n                start_time = datetime.fromisoformat(start_time_str)\n                # Convert to naive datetime for comparison\n                start_time = start_time.replace(tzinfo=None)\n\n                if earliest_time is None or start_time < earliest_time:\n                    earliest_time = start_time\n\n        except (json.JSONDecodeError, KeyError, ValueError, OSError):\n            continue  # Skip files with invalid timing data\n\n    return earliest_time\n\n\ndef find_latest_end_time(completed_files: List[str]) -> Optional[datetime]:\n    \"\"\"Find the latest end time from all completed files\"\"\"\n    latest_time = None\n\n    for file_path in completed_files:\n        try:\n            with open(file_path, \"r\", encoding=\"utf-8\") as f:\n                data = json.load(f)\n\n            if \"end_time\" in data:\n                # Parse UTC time and convert to naive datetime\n                end_time_str = data[\"end_time\"]\n                if end_time_str.endswith(\"Z\"):\n                    end_time_str = end_time_str[:-1] + \"+00:00\"\n                end_time = datetime.fromisoformat(end_time_str)\n                # Convert to naive datetime for comparison (UTC-naive)\n                end_time = end_time.replace(tzinfo=None)\n\n                if latest_time is None or end_time > latest_time:\n                    latest_time = end_time\n\n        except (json.JSONDecodeError, KeyError, ValueError, OSError):\n            continue  # Skip files with invalid timing data\n\n    # If no valid end_time is found, fall back to the current time.\n    # NOTE: datetime.now() is naive local time; parsed timestamps are presumably UTC.\n    return latest_time or datetime.now().replace(tzinfo=None)\n\n\ndef calculate_mean_and_std(values: List[float]) -> Tuple[float, float]:\n    \"\"\"Calculate mean and standard deviation of a list of values\"\"\"\n    if not values:\n        return 0.0, 0.0\n\n    n = 
len(values)\n    mean = sum(values) / n\n\n    if n == 1:\n        return mean, 0.0\n\n    variance = sum((x - mean) ** 2 for x in values) / (n - 1)\n    std = math.sqrt(variance)\n\n    return mean, std\n\n\ndef estimate_completion_time(\n    total_tasks: int, completed_tasks: int, completed_files: List[str]\n) -> str:\n    \"\"\"Estimate completion time based on overall progress rate from all completed tasks\"\"\"\n    if completed_tasks == 0:\n        return \"Cannot estimate (no completed tasks)\"\n\n    # Check if all tasks are completed\n    if completed_tasks >= total_tasks:\n        return \"All tasks completed\"\n\n    remaining_tasks = total_tasks - completed_tasks\n\n    # Use overall completion rate from all successfully completed tasks\n    earliest_start = find_earliest_start_time(completed_files)\n    latest_end = find_latest_end_time(completed_files)\n\n    if earliest_start is None:\n        # Fallback to default estimation if no valid timing data\n        estimated_minutes = remaining_tasks * DEFAULT_TASK_TIME_MINUTES\n    else:\n        # Calculate overall elapsed time\n        elapsed_time = latest_end - earliest_start\n        elapsed_minutes = elapsed_time.total_seconds() / 60\n\n        if elapsed_minutes <= 0:\n            return \"Cannot estimate (time interval too short)\"\n\n        # Calculate average time per task based on all completed tasks\n        avg_minutes_per_task = elapsed_minutes / completed_tasks\n        if avg_minutes_per_task <= 0:\n            return \"Cannot estimate (invalid time per task)\"\n\n        estimated_minutes = remaining_tasks * avg_minutes_per_task\n\n    # Format the estimate in minutes\n    return f\"~{int(estimated_minutes)} minutes\"\n\n\n@dataclass\nclass TaskStats:\n    \"\"\"Statistics for a single task\"\"\"\n\n    completed: int = 0\n    running: int = 0\n    failed: int = 0\n    judge_correct: int = 0\n    total: int = 0\n\n    # Completed files for timing analysis\n    completed_files: List[str] = 
None\n\n    # Turn statistics\n    total_turns: int = 0\n    completed_tasks_with_turns: int = 0\n\n    # No boxed content found statistics\n    no_boxed_found: int = 0\n\n    def __post_init__(self):\n        if self.completed_files is None:\n            self.completed_files = []\n\n    @property\n    def judge_accuracy(self) -> float:\n        \"\"\"Calculate judge accuracy percentage\"\"\"\n        return (\n            (self.judge_correct / self.completed * 100) if self.completed > 0 else 0.0\n        )\n\n    @property\n    def completion_rate(self) -> float:\n        \"\"\"Calculate completion rate percentage\"\"\"\n        return (self.completed / self.total * 100) if self.total > 0 else 0.0\n\n    @property\n    def average_turns(self) -> float:\n        \"\"\"Calculate average turns per completed task\"\"\"\n        return (\n            (self.total_turns / self.completed_tasks_with_turns)\n            if self.completed_tasks_with_turns > 0\n            else 0.0\n        )\n\n\n@dataclass\nclass GAIATaskStats(TaskStats):\n    \"\"\"Statistics for a single task\"\"\"\n\n    # Difficulty level tracking\n    level1_completed: int = 0\n    level1_correct: int = 0\n    level2_completed: int = 0\n    level2_correct: int = 0\n    level3_completed: int = 0\n    level3_correct: int = 0\n\n    @property\n    def level1_accuracy(self) -> float:\n        \"\"\"Calculate Level 1 accuracy percentage\"\"\"\n        return (\n            (self.level1_correct / self.level1_completed * 100)\n            if self.level1_completed > 0\n            else 0.0\n        )\n\n    @property\n    def level2_accuracy(self) -> float:\n        \"\"\"Calculate Level 2 accuracy percentage\"\"\"\n        return (\n            (self.level2_correct / self.level2_completed * 100)\n            if self.level2_completed > 0\n            else 0.0\n        )\n\n    @property\n    def level3_accuracy(self) -> float:\n        \"\"\"Calculate Level 3 accuracy percentage\"\"\"\n        return (\n       
     (self.level3_correct / self.level3_completed * 100)\n            if self.level3_completed > 0\n            else 0.0\n        )\n\n\n@dataclass\nclass SummaryStats:\n    \"\"\"Summary statistics across all runs\"\"\"\n\n    total_tasks: int = 0\n    total_completed: int = 0\n    total_running: int = 0\n    total_failed: int = 0\n    total_judge_correct: int = 0\n    total_no_boxed_found: int = 0\n\n    @property\n    def total_judge_accuracy(self) -> float:\n        \"\"\"Calculate overall judge accuracy percentage\"\"\"\n        return (\n            (self.total_judge_correct / self.total_completed * 100)\n            if self.total_completed > 0\n            else 0.0\n        )\n\n    def average_run_accuracy(\n        self, run_stats_list: List[Tuple[str, TaskStats]]\n    ) -> Tuple[float, float]:\n        \"\"\"Calculate overall accuracy (mean) and standard deviation across individual runs\"\"\"\n        if not run_stats_list:\n            return 0.0, 0.0\n\n        # Mean accuracy is the overall accuracy (weighted average)\n        # This matches the OVERALL JUDGE ACCURACY calculation\n        mean = self.total_judge_accuracy\n\n        # Standard deviation is calculated from individual run accuracies\n        accuracies = [\n            stats.judge_accuracy for _, stats in run_stats_list if stats.completed > 0\n        ]\n\n        if not accuracies:\n            return mean, 0.0\n\n        _, std = calculate_mean_and_std(accuracies)\n        return mean, std\n\n    @property\n    def total_completion_rate(self) -> float:\n        \"\"\"Calculate overall completion rate percentage\"\"\"\n        return (\n            (self.total_completed / self.total_tasks * 100)\n            if self.total_tasks > 0\n            else 0.0\n        )\n\n\n@dataclass\nclass GAIASummaryStats(SummaryStats):\n    \"\"\"Summary statistics across all runs\"\"\"\n\n    # Difficulty level summary stats\n    level1_completed: int = 0\n    level1_correct: int = 0\n    
level2_completed: int = 0\n    level2_correct: int = 0\n    level3_completed: int = 0\n    level3_correct: int = 0\n\n    @property\n    def level1_accuracy(self) -> float:\n        \"\"\"Calculate overall Level 1 accuracy percentage\"\"\"\n        return (\n            (self.level1_correct / self.level1_completed * 100)\n            if self.level1_completed > 0\n            else 0.0\n        )\n\n    @property\n    def level2_accuracy(self) -> float:\n        \"\"\"Calculate overall Level 2 accuracy percentage\"\"\"\n        return (\n            (self.level2_correct / self.level2_completed * 100)\n            if self.level2_completed > 0\n            else 0.0\n        )\n\n    @property\n    def level3_accuracy(self) -> float:\n        \"\"\"Calculate overall Level 3 accuracy percentage\"\"\"\n        return (\n            (self.level3_correct / self.level3_completed * 100)\n            if self.level3_completed > 0\n            else 0.0\n        )\n\n\nclass ProgressChecker:\n    \"\"\"Main class for checking benchmark progress\"\"\"\n\n    def __init__(self, target_path: str, task_per_run: int, data_path: str):\n        self.target_path = target_path\n        self.run_dirs: List[str] = []\n        self.total_tasks_per_run = task_per_run\n\n        # Load benchmark data\n        self._load_benchmark_data(data_path)\n\n    def _load_benchmark_data(self, data_path) -> None:\n        \"\"\"Load benchmark data and configuration\"\"\"\n        try:\n            # Load benchmark data if available\n            if os.path.exists(data_path):\n                with open(data_path) as f:\n                    benchmark_data = [json.loads(line) for line in f.readlines()]\n                print(f\"Loaded {len(benchmark_data)} tasks from {data_path}\")\n        except Exception as e:\n            print(f\"Warning: Could not load data: {e}\")\n\n    def find_run_directories(self) -> List[str]:\n        \"\"\"Find all run directories in the target path\"\"\"\n        run_dirs = 
[]\n\n        if not os.path.exists(self.target_path):\n            raise FileNotFoundError(f\"Path '{self.target_path}' does not exist\")\n\n        # Check if target_path itself is a run directory\n        if os.path.basename(self.target_path).startswith(\"run_\"):\n            run_dirs.append(self.target_path)\n        else:\n            # Find run_* directories under target_path\n            try:\n                for item in os.listdir(self.target_path):\n                    item_path = os.path.join(self.target_path, item)\n                    if os.path.isdir(item_path) and item.startswith(\"run_\"):\n                        run_dirs.append(item_path)\n            except PermissionError:\n                raise PermissionError(\n                    f\"No permission to access directory '{self.target_path}'\"\n                )\n\n        # Sort by run number\n        run_dirs.sort(key=lambda x: self._extract_run_number(x))\n\n        if not run_dirs:\n            raise ValueError(f\"No run directories found in '{self.target_path}'\")\n\n        return run_dirs\n\n    def _extract_run_number(self, path: str) -> int:\n        \"\"\"Extract run number from directory path for sorting\"\"\"\n        basename = os.path.basename(path)\n        parts = basename.split(\"_\")\n        if len(parts) > 1 and parts[1].isdigit():\n            return int(parts[1])\n        return 0\n\n    def _extract_task_id(self, filename: str, task_id_pattern: str) -> Optional[str]:\n        \"\"\"Extract task ID from filename\"\"\"\n        match = re.match(task_id_pattern, filename)\n        return match.group(1) if match else None\n\n    def _get_latest_task_files(self, run_dir: str, task_id_pattern: str) -> List[str]:\n        \"\"\"Get the latest task file for each task ID in a run directory\"\"\"\n        json_files = glob.glob(os.path.join(run_dir, \"task_*.json\"))\n\n        if not json_files:\n            return []\n\n        # Group by task ID, keep only the latest file for each 
task\n        task_groups: Dict[str, Dict] = {}\n\n        for json_file in json_files:\n            filename = os.path.basename(json_file)\n            task_id = self._extract_task_id(filename, task_id_pattern)\n\n            if task_id:\n                try:\n                    # Read the JSON file to get the start_time\n                    with open(json_file, \"r\", encoding=\"utf-8\") as f:\n                        data = json.load(f)\n\n                    start_time_str = data.get(\"start_time\", \"\")\n                    if start_time_str:\n                        # Parse the ISO format timestamp\n                        from datetime import datetime\n\n                        start_time = datetime.fromisoformat(\n                            start_time_str.replace(\"Z\", \"+00:00\")\n                        )\n                        start_timestamp = start_time.timestamp()\n                    else:\n                        # Fallback to file modification time if start_time is not available\n                        start_timestamp = os.path.getmtime(json_file)\n\n                    if (\n                        task_id not in task_groups\n                        or start_timestamp > task_groups[task_id][\"timestamp\"]\n                    ):\n                        task_groups[task_id] = {\n                            \"file\": json_file,\n                            \"timestamp\": start_timestamp,\n                        }\n                except (json.JSONDecodeError, ValueError, OSError) as e:\n                    # Fallback to file modification time if JSON parsing fails\n                    print(f\"Warning: Could not parse {json_file}: {e}\")\n                    file_mtime = os.path.getmtime(json_file)\n                    if (\n                        task_id not in task_groups\n                        or file_mtime > task_groups[task_id][\"timestamp\"]\n                    ):\n                        task_groups[task_id] = {\n                 
           \"file\": json_file,\n                            \"timestamp\": file_mtime,\n                        }\n\n        return [info[\"file\"] for info in task_groups.values()]\n\n    def _is_task_completed(self, data: Dict) -> bool:\n        \"\"\"Check if a task is completed based on its data\"\"\"\n        end_time = data.get(\"end_time\", \"\")\n        error = data.get(\"error\", \"\")\n        status = data.get(\"status\", \"\")\n        final_answer = data.get(\"final_boxed_answer\", \"\")\n\n        return (\n            (end_time != \"\" and error == \"\")\n            or (status == \"completed\")\n            or (final_answer != \"\" and error == \"\")\n        )\n\n    def _is_judge_correct(self, judge_result) -> bool:\n        \"\"\"Determine if LLM judge result indicates correct answer\"\"\"\n        if isinstance(judge_result, bool):\n            return judge_result\n        elif isinstance(judge_result, str):\n            result_str = judge_result.upper()\n            return (\n                result_str in CORRECT_RESULTS\n                or any(pattern in result_str for pattern in SUCCESS_PATTERNS)\n                or result_str.lower() in [\"true\", \"1\", \"yes\", \"pass\"]\n            )\n        elif isinstance(judge_result, (int, float)):\n            return judge_result > 0\n        elif isinstance(judge_result, dict):\n            return judge_result.get(\"correct\", False) or judge_result.get(\n                \"is_correct\", False\n            )\n        return False\n\n    def _calculate_turns(self, data: Dict) -> int:\n        \"\"\"Calculate number of turns from task data (excluding system prompt)\"\"\"\n        try:\n            main_agent_history = data.get(\"main_agent_message_history\", {})\n            message_history = main_agent_history.get(\"message_history\", [])\n\n            if not message_history:\n                return 0\n\n            # Filter out system messages and count total messages, then divide by 2\n         
   # Turn count = (total messages excluding system) / 2\n            non_system_messages = [\n                msg for msg in message_history if msg.get(\"role\") != \"system\"\n            ]\n\n            # Each turn consists of user + assistant, so divide by 2\n            turn_count = len(non_system_messages) // 2\n\n            return turn_count\n        except (KeyError, TypeError, IndexError):\n            return 0\n\n    def analyze_run_directory(\n        self, run_dir: str, task_id_pattern: str\n    ) -> Tuple[TaskStats, Dict[str, bool]]:\n        \"\"\"Analyze a single run directory and return statistics and task results\n\n        Returns:\n            Tuple[TaskStats, Dict[str, bool]]: Statistics and a mapping of task_id -> is_correct\n        \"\"\"\n        latest_files = self._get_latest_task_files(run_dir, task_id_pattern)\n\n        # Use the correct total tasks\n        stats = TaskStats(total=self.total_tasks_per_run)\n        completed_files = []  # Track completed files for timing analysis\n        task_results = {}  # Track task_id -> is_correct mapping\n\n        for json_file in latest_files:\n            try:\n                with open(json_file, \"r\", encoding=\"utf-8\") as f:\n                    data = json.load(f)\n\n                status = data.get(\"status\", \"\")\n\n                if status == \"running\":\n                    stats.running += 1\n                elif self._is_task_completed(data):\n                    stats.completed += 1\n                    completed_files.append(json_file)  # Track for timing analysis\n\n                    # Check judge result for completed tasks\n                    judge_result = data.get(\"final_judge_result\", None)\n                    is_correct = judge_result is not None and self._is_judge_correct(\n                        judge_result\n                    )\n                    if is_correct:\n                        stats.judge_correct += 1\n\n                    # Extract task ID 
and store result\n                    filename = os.path.basename(json_file)\n                    task_id = self._extract_task_id(filename, task_id_pattern)\n                    if task_id:\n                        task_results[task_id] = is_correct\n\n                    # Check if final_boxed_answer contains \"No \\\\boxed{} content found\"\n                    final_boxed_answer = data.get(\"final_boxed_answer\", \"\")\n                    if (\n                        isinstance(final_boxed_answer, str)\n                        and \"No \\\\boxed{} content found\" in final_boxed_answer\n                    ):\n                        stats.no_boxed_found += 1\n\n                    # Calculate turns for completed tasks\n                    turns = self._calculate_turns(data)\n                    if turns > 0:\n                        stats.total_turns += turns\n                        stats.completed_tasks_with_turns += 1\n                else:\n                    stats.failed += 1\n\n            except (json.JSONDecodeError, IOError) as e:\n                # Skip files that are being written or corrupted\n                if \"Expecting value\" in str(e) or \"line 1 column 1\" in str(e):\n                    continue  # Skip corrupted/empty files\n                print(f\"Warning: Could not parse {json_file}: {e}\")\n                stats.failed += 1\n            except Exception as e:\n                print(f\"Warning: Unexpected error processing {json_file}: {e}\")\n                stats.failed += 1\n\n        # Store completed files in stats for timing analysis\n        stats.completed_files = completed_files\n        return stats, task_results\n\n    def run_analysis(\n        self, benchmark_name_std: str, task_id_pattern: str\n    ) -> SummaryStats:\n        \"\"\"Run the complete analysis and return summary statistics\"\"\"\n        self.run_dirs = self.find_run_directories()\n        summary = SummaryStats()\n        run_stats_list = []  # Store 
statistics for each run\n        all_completed_files = []  # Collect all completed files for timing analysis\n        all_task_results = {}  # Collect task_id -> list of is_correct across all runs\n\n        print()\n        print(\"=\" * 80)\n        print(f\"Analyzing benchmark progress for: {self.target_path}\")\n        print(f\"Generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\")\n        print(\"=\" * 80)\n\n        # Analyze each run directory\n        for run_dir in self.run_dirs:\n            run_name = os.path.basename(run_dir)\n            stats, task_results = self.analyze_run_directory(run_dir, task_id_pattern)\n\n            if stats.total == 0:\n                print(f\"{run_name}: No task files found\")\n                print()\n                continue\n\n            # Display run statistics in a single line\n            run_info = f\"[{run_name}] Completed: {stats.completed} | Running: {stats.running} | Failed: {stats.failed}\"\n\n            # Add accuracy information\n            if stats.completed > 0:\n                run_info += f\" | Accuracy: {stats.judge_correct}/{stats.completed} ({stats.judge_accuracy:.1f}%)\"\n\n                # Add average turns information (show even if some tasks are still running)\n                if stats.completed_tasks_with_turns > 0:\n                    run_info += f\" | Avg Turns: {stats.average_turns:.1f}\"\n\n            print(run_info)\n            print()\n\n            # Store run statistics for later display\n            run_stats_list.append((run_name, stats))\n\n            # Collect completed files for timing analysis\n            all_completed_files.extend(stats.completed_files)\n\n            # Collect task results for Pass@n calculation\n            for task_id, is_correct in task_results.items():\n                if task_id not in all_task_results:\n                    all_task_results[task_id] = []\n                all_task_results[task_id].append(is_correct)\n\n            # Update 
summary statistics\n            summary.total_tasks += stats.total\n            summary.total_completed += stats.completed\n            summary.total_running += stats.running\n            summary.total_failed += stats.failed\n            summary.total_judge_correct += stats.judge_correct\n            summary.total_no_boxed_found += stats.no_boxed_found\n\n        # Display summary after all runs are processed\n        self._display_summary(\n            summary,\n            run_stats_list,\n            all_completed_files,\n            benchmark_name_std,\n            all_task_results,\n        )\n\n        return summary\n\n    def _calculate_pass_at_n(\n        self, all_task_results: Dict[str, List[bool]], total_tasks: int\n    ) -> Tuple[int, float]:\n        \"\"\"Calculate Pass@n: number of tasks with at least one correct answer across all runs\n\n        Returns:\n            Tuple[int, float]: (pass_at_n_count, pass_at_n_percentage)\n        \"\"\"\n        if not all_task_results or total_tasks == 0:\n            return 0, 0.0\n\n        pass_at_n_count = 0\n        for task_id, results in all_task_results.items():\n            # If at least one run got it correct, this task passes\n            if any(results):\n                pass_at_n_count += 1\n\n        pass_at_n_percentage = (\n            (pass_at_n_count / total_tasks * 100) if total_tasks > 0 else 0.0\n        )\n        return pass_at_n_count, pass_at_n_percentage\n\n    def _display_summary(\n        self,\n        summary: SummaryStats,\n        run_stats_list: List[Tuple[str, TaskStats]],\n        completed_files: List[str],\n        benchmark_name_std: str,\n        all_task_results: Dict[str, List[bool]] = None,\n    ):\n        \"\"\"Display summary statistics\"\"\"\n        print(\"=\" * 80)\n        print(\"SUMMARY STATISTICS\")\n        print(\"=\" * 80)\n        print(\n            f\"Total Tasks: {summary.total_tasks} ({summary.total_completed} completed, {summary.total_running} 
running)\"\n        )\n\n        # Estimate completion time using overall progress rate\n        if summary.total_tasks > 0 and summary.total_completed > 0:\n            remaining_tasks = summary.total_tasks - summary.total_completed\n            earliest_start = find_earliest_start_time(completed_files)\n            latest_end = find_latest_end_time(completed_files)\n            completion_estimate = estimate_completion_time(\n                summary.total_tasks, summary.total_completed, completed_files\n            )\n\n            print(f\"Remaining Tasks: {remaining_tasks}\")\n            if earliest_start:\n                elapsed_time = latest_end - earliest_start\n                elapsed_minutes = elapsed_time.total_seconds() / 60\n                tasks_per_minute = (\n                    summary.total_completed / elapsed_minutes\n                    if elapsed_minutes > 0\n                    else 0\n                )\n                print(f\"Elapsed Time: {elapsed_minutes:.1f} minutes\")\n                print(f\"Completion Rate: {tasks_per_minute:.1f} tasks/minute\")\n            print(f\"Estimated Time to Complete: {completion_estimate}\")\n\n        if summary.total_completed > 0:\n            accuracy_bar = create_progress_bar(summary.total_judge_accuracy)\n            print(\n                f\"Judge Accuracy: {summary.total_judge_correct}/{summary.total_completed} {accuracy_bar}\"\n            )\n\n            # Calculate and display overall average turns\n            total_turns = sum(stats.total_turns for _, stats in run_stats_list)\n            total_tasks_with_turns = sum(\n                stats.completed_tasks_with_turns for _, stats in run_stats_list\n            )\n            if total_tasks_with_turns > 0:\n                overall_avg_turns = total_turns / total_tasks_with_turns\n                print(f\"Overall Average Turns: {overall_avg_turns:.1f}\")\n\n        # Display each run's correct percentage\n        if run_stats_list:\n          
  print()\n            print(\"INDIVIDUAL RUN ACCURACIES:\")\n            for run_name, stats in run_stats_list:\n                if stats.completed > 0:\n                    accuracy_bar = create_progress_bar(stats.judge_accuracy)\n                    print(\n                        f\"  {run_name}: {stats.judge_correct}/{stats.completed} {accuracy_bar}\"\n                    )\n                else:\n                    print(\n                        f\"  {run_name}: {stats.judge_correct}/{stats.completed} (N/A)\"\n                    )\n\n            # Display mean accuracy and standard deviation (Pass@1 Acc (Avg@n))\n            num_runs = len(run_stats_list)\n            mean_acc, std_acc = summary.average_run_accuracy(run_stats_list)\n            if mean_acc > 0:\n                print()\n                if num_runs > 1:\n                    print(\n                        f\"Pass@1 Acc (Avg@{num_runs}): {mean_acc:.1f}% ± {std_acc:.1f}%\"\n                    )\n                else:\n                    print(f\"MEAN ACCURACY: {mean_acc:.1f}% ± {std_acc:.1f}%\")\n\n            # Display Pass@n if multiple runs\n            if num_runs > 1 and all_task_results:\n                # Calculate total unique tasks (use the first run's total as reference)\n                first_run_total = (\n                    run_stats_list[0][1].total\n                    if run_stats_list\n                    else summary.total_tasks\n                )\n                pass_at_n_count, pass_at_n_percentage = self._calculate_pass_at_n(\n                    all_task_results, first_run_total\n                )\n                pass_at_n_bar = create_progress_bar(pass_at_n_percentage)\n                print(\n                    f\"Pass@{num_runs}: {pass_at_n_count}/{first_run_total} {pass_at_n_bar}\"\n                )\n\n            # Display no boxed content found statistics\n            if summary.total_completed > 0:\n                print(\n                    f\"No 
\\\\boxed{{}} content found: {summary.total_no_boxed_found}/{summary.total_completed} ({summary.total_no_boxed_found / summary.total_completed * 100:.1f}%)\"\n                )\n\n        print(\"=\" * 80)\n        print()\n\n        # Save analysis results to log file\n        self._save_analysis_log(\n            summary,\n            run_stats_list,\n            completed_files,\n            benchmark_name_std,\n            all_task_results,\n        )\n\n    def _save_analysis_log(\n        self,\n        summary: SummaryStats,\n        run_stats_list: List[Tuple[str, TaskStats]],\n        completed_files: List[str],\n        benchmark_name_std: str,\n        all_task_results: Dict[str, List[bool]] = None,\n    ) -> None:\n        \"\"\"Save analysis results to a log file in the target directory\"\"\"\n        try:\n            # Create log filename with timestamp\n            timestamp = datetime.now().strftime(LOG_FILE_TIMESTAMP_FORMAT)\n            log_filename = f\"{LOG_FILE_PREFIX}{timestamp}.log\"\n            log_path = os.path.join(self.target_path, log_filename)\n\n            # Capture the analysis output\n            output_buffer = StringIO()\n\n            # Write header\n            output_buffer.write(\"=\" * 80 + \"\\n\")\n            output_buffer.write(f\"{benchmark_name_std} Progress Analysis\\n\")\n            output_buffer.write(\n                f\"Generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\\n\"\n            )\n            output_buffer.write(f\"Target Path: {self.target_path}\\n\")\n            output_buffer.write(\"=\" * 80 + \"\\n\\n\")\n\n            # Write run statistics\n            for run_name, stats in run_stats_list:\n                output_buffer.write(\n                    f\"{run_name}: Status: {stats.completed} completed, {stats.running} running, {stats.failed} failed\\n\"\n                )\n                if stats.completed > 0:\n                    accuracy = stats.judge_correct / stats.completed * 
100\n                    output_buffer.write(\n                        f\"  Overall Accuracy: {stats.judge_correct}/{stats.completed} ({accuracy:.1f}%)\\n\"\n                    )\n                else:\n                    output_buffer.write(\n                        f\"  Overall Accuracy: {stats.judge_correct}/{stats.completed} (N/A)\\n\"\n                    )\n                output_buffer.write(\"\\n\")\n\n            # Write summary statistics\n            output_buffer.write(\"=\" * 80 + \"\\n\")\n            output_buffer.write(\"SUMMARY STATISTICS\\n\")\n            output_buffer.write(\"=\" * 80 + \"\\n\")\n            output_buffer.write(\n                f\"Total Tasks: {summary.total_tasks} ({summary.total_completed} completed, {summary.total_running} running)\\n\"\n            )\n\n            # Write timing information\n            if summary.total_tasks > 0 and summary.total_completed > 0:\n                remaining_tasks = summary.total_tasks - summary.total_completed\n                earliest_start = find_earliest_start_time(completed_files)\n                latest_end = find_latest_end_time(completed_files)\n                completion_estimate = estimate_completion_time(\n                    summary.total_tasks, summary.total_completed, completed_files\n                )\n\n                output_buffer.write(f\"Remaining Tasks: {remaining_tasks}\\n\")\n                if earliest_start:\n                    elapsed_time = latest_end - earliest_start\n                    elapsed_minutes = elapsed_time.total_seconds() / 60\n                    tasks_per_minute = (\n                        summary.total_completed / elapsed_minutes\n                        if elapsed_minutes > 0\n                        else 0\n                    )\n                    output_buffer.write(\n                        f\"Elapsed Time: {elapsed_minutes:.1f} minutes\\n\"\n                    )\n                    output_buffer.write(\n                        
f\"Completion Rate: {tasks_per_minute:.1f} tasks/minute\\n\"\n                    )\n                output_buffer.write(\n                    f\"Estimated Time to Complete: {completion_estimate}\\n\"\n                )\n\n            if summary.total_completed > 0:\n                accuracy = summary.total_judge_correct / summary.total_completed * 100\n                output_buffer.write(\n                    f\"Judge Accuracy: {summary.total_judge_correct}/{summary.total_completed} ({accuracy:.1f}%)\\n\"\n                )\n                no_boxed_percentage = (\n                    summary.total_no_boxed_found / summary.total_completed * 100\n                )\n                output_buffer.write(\n                    f\"No \\\\boxed{{}} content found: {summary.total_no_boxed_found}/{summary.total_completed} ({no_boxed_percentage:.1f}%)\\n\"\n                )\n\n            # Write individual run accuracies\n            if run_stats_list:\n                output_buffer.write(\"\\nINDIVIDUAL RUN ACCURACIES:\\n\")\n                for run_name, stats in run_stats_list:\n                    if stats.completed > 0:\n                        accuracy = stats.judge_correct / stats.completed * 100\n                        output_buffer.write(\n                            f\"  {run_name}: {stats.judge_correct}/{stats.completed} ({accuracy:.1f}%)\\n\"\n                        )\n                    else:\n                        output_buffer.write(\n                            f\"  {run_name}: {stats.judge_correct}/{stats.completed} (N/A)\\n\"\n                        )\n\n                # Write mean accuracy and standard deviation (Pass@1 Acc (Avg@n))\n                num_runs = len(run_stats_list)\n                mean_acc, std_acc = summary.average_run_accuracy(run_stats_list)\n                if mean_acc > 0:\n                    if num_runs > 1:\n                        output_buffer.write(\n                            f\"\\nPass@1 Acc (Avg@{num_runs}): 
{mean_acc:.1f}% ± {std_acc:.1f}%\\n\"\n                        )\n                    else:\n                        output_buffer.write(\n                            f\"\\nMEAN ACCURACY: {mean_acc:.1f}% ± {std_acc:.1f}%\\n\"\n                        )\n\n                # Write Pass@n if multiple runs\n                if num_runs > 1 and all_task_results:\n                    first_run_total = (\n                        run_stats_list[0][1].total\n                        if run_stats_list\n                        else summary.total_tasks\n                    )\n                    pass_at_n_count, pass_at_n_percentage = self._calculate_pass_at_n(\n                        all_task_results, first_run_total\n                    )\n                    output_buffer.write(\n                        f\"Pass@{num_runs}: {pass_at_n_count}/{first_run_total} ({pass_at_n_percentage:.1f}%)\\n\"\n                    )\n\n                    if summary.total_completed > 0:\n                        no_boxed_percentage = (\n                            summary.total_no_boxed_found / summary.total_completed * 100\n                        )\n                        output_buffer.write(\n                            f\"No \\\\boxed{{}} content found: {summary.total_no_boxed_found}/{summary.total_completed} ({no_boxed_percentage:.1f}%)\\n\"\n                        )\n\n            output_buffer.write(\"=\" * 80 + \"\\n\")\n\n            # Write to file\n            with open(log_path, \"w\", encoding=\"utf-8\") as f:\n                f.write(output_buffer.getvalue())\n\n            output_buffer.close()\n            print(f\"Analysis results saved to: {log_path}\")\n\n        except Exception as e:\n            print(f\"Warning: Could not save analysis log: {e}\")\n\n\nclass GAIAProgressChecker(ProgressChecker):\n    \"\"\"Main class for checking GAIA benchmark progress\"\"\"\n\n    DIFFICULTY_LEVELS = [1, 2, 3]\n\n    def __init__(self, target_path: str, task_per_run: int, data_path: 
str):\n        super().__init__(target_path, task_per_run=0, data_path=\"\")  # 调用父类构造函数\n\n        # Difficulty level mapping\n        self.task_difficulty_map: Dict[str, int] = {}\n        self.total_tasks_per_run = task_per_run\n\n        # Load GAIA data if this is a GAIA validation directory\n        self._load_benchmark_data(data_path)\n\n    def _load_benchmark_data(self, data_path) -> None:\n        \"\"\"Load GAIA-specific data and configuration\"\"\"\n        try:\n            if os.path.exists(data_path):\n                with open(data_path) as f:\n                    benchmark_data = [json.loads(line) for line in f.readlines()]\n\n                print(f\"Loaded {len(benchmark_data)} tasks from {data_path}\")\n\n                for line in benchmark_data:\n                    task_id = line[\"task_id\"]\n                    metadata = line.get(\"metadata\", {})\n                    difficulty_level = (\n                        metadata.get(\"Level\") or metadata.get(\"level\") or 0\n                    )\n                    if difficulty_level in self.DIFFICULTY_LEVELS:\n                        self.task_difficulty_map[task_id] = difficulty_level\n\n                level_counts = {\n                    level: sum(\n                        1 for v in self.task_difficulty_map.values() if v == level\n                    )\n                    for level in self.DIFFICULTY_LEVELS\n                }\n                print(f\"Difficulty level distribution: {level_counts}\")\n\n        except Exception as e:\n            print(f\"Warning: Could not load GAIA data: {e}\")\n\n    def _update_difficulty_stats(\n        self, stats: GAIATaskStats, task_id: str, is_correct: bool\n    ) -> None:\n        \"\"\"Update difficulty level statistics for a task\"\"\"\n        if task_id not in self.task_difficulty_map:\n            return\n        difficulty_level = self.task_difficulty_map[task_id]\n        if difficulty_level == 1:\n            stats.level1_completed 
+= 1\n            if is_correct:\n                stats.level1_correct += 1\n        elif difficulty_level == 2:\n            stats.level2_completed += 1\n            if is_correct:\n                stats.level2_correct += 1\n        elif difficulty_level == 3:\n            stats.level3_completed += 1\n            if is_correct:\n                stats.level3_correct += 1\n\n    def analyze_run_directory(\n        self, run_dir: str, task_id_pattern: str\n    ) -> Tuple[GAIATaskStats, Dict[str, bool]]:\n        \"\"\"Analyze a single run directory and return statistics (GAIA-specific)\n\n        Returns:\n            Tuple[GAIATaskStats, Dict[str, bool]]: Statistics and a mapping of task_id -> is_correct\n        \"\"\"\n        latest_files = self._get_latest_task_files(\n            run_dir, task_id_pattern\n        )  # 直接用父类的实现\n        stats = GAIATaskStats(total=len(latest_files))\n        completed_files = []\n        task_results = {}  # Track task_id -> is_correct mapping\n\n        for json_file in latest_files:\n            try:\n                with open(json_file, \"r\", encoding=\"utf-8\") as f:\n                    data = json.load(f)\n\n                status = data.get(\"status\", \"\")\n                if status == \"running\":\n                    stats.running += 1\n                elif self._is_task_completed(data):\n                    stats.completed += 1\n                    completed_files.append(json_file)\n\n                    judge_result = data.get(\"final_judge_result\", None)\n                    is_correct = judge_result is not None and self._is_judge_correct(\n                        judge_result\n                    )\n                    if is_correct:\n                        stats.judge_correct += 1\n\n                    # Check if final_boxed_answer contains \"No \\\\boxed{} content found\"\n                    final_boxed_answer = data.get(\"final_boxed_answer\", \"\")\n                    if (\n                        
isinstance(final_boxed_answer, str)\n                        and \"No \\\\boxed{} content found\" in final_boxed_answer\n                    ):\n                        stats.no_boxed_found += 1\n\n                    task_id = self._extract_task_id(\n                        os.path.basename(json_file), task_id_pattern\n                    )\n                    if task_id:\n                        self._update_difficulty_stats(stats, task_id, is_correct)\n                        task_results[task_id] = is_correct\n\n                    # Calculate turns for completed tasks\n                    turns = self._calculate_turns(data)\n                    if turns > 0:\n                        stats.total_turns += turns\n                        stats.completed_tasks_with_turns += 1\n                else:\n                    stats.failed += 1\n            except Exception as e:\n                print(f\"Warning: Could not process {json_file}: {e}\")\n                stats.failed += 1\n\n        stats.completed_files = completed_files\n        return stats, task_results\n\n    def run_analysis(\n        self, benchmark_name_std: str, task_id_pattern: str\n    ) -> GAIASummaryStats:\n        \"\"\"Run the complete analysis and return summary statistics\"\"\"\n        self.run_dirs = self.find_run_directories()\n        summary = GAIASummaryStats()\n        run_stats_list = []  # Store statistics for each run\n        all_completed_files = []  # Collect all completed files for timing analysis\n        all_task_results = {}  # Collect task_id -> list of is_correct across all runs\n\n        print()\n        print(\"=\" * 80)\n        print(f\"Analyzing benchmark progress for: {self.target_path}\")\n        print(f\"Generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\")\n        print(\"=\" * 80)\n\n        # Analyze each run directory\n        for run_dir in self.run_dirs:\n            run_name = os.path.basename(run_dir)\n            stats, task_results = 
self.analyze_run_directory(run_dir, task_id_pattern)\n\n            if stats.total == 0:\n                print(f\"{run_name}: No task files found\")\n                print()\n                continue\n\n            # Display run statistics in a single line\n            run_info = f\"[{run_name}] Completed: {stats.completed} | Running: {stats.running} | Failed: {stats.failed}\"\n\n            # Add accuracy information\n            if stats.completed > 0:\n                run_info += f\" | Accuracy: {stats.judge_correct}/{stats.completed} ({stats.judge_accuracy:.1f}%)\"\n\n                # Add average turns information (show even if some tasks are still running)\n                if stats.completed_tasks_with_turns > 0:\n                    run_info += f\" | Avg Turns: {stats.average_turns:.1f}\"\n\n            print(run_info)\n            print()\n\n            # Store run statistics for later display\n            run_stats_list.append((run_name, stats))\n\n            # Collect completed files for timing analysis\n            all_completed_files.extend(stats.completed_files)\n\n            # Collect task results for Pass@n calculation\n            for task_id, is_correct in task_results.items():\n                if task_id not in all_task_results:\n                    all_task_results[task_id] = []\n                all_task_results[task_id].append(is_correct)\n\n            # Update summary statistics\n            self._update_summary_stats(summary, stats)\n\n        # Display summary after all runs are processed\n        self._display_summary(\n            summary,\n            run_stats_list,\n            all_completed_files,\n            benchmark_name_std,\n            all_task_results,\n        )\n\n        return summary\n\n    def _update_summary_stats(\n        self, summary: GAIASummaryStats, stats: GAIATaskStats\n    ) -> None:\n        \"\"\"Update summary statistics with data from a single run\"\"\"\n        summary.total_tasks += stats.total\n        
summary.total_completed += stats.completed\n        summary.total_running += stats.running\n        summary.total_failed += stats.failed\n        summary.total_judge_correct += stats.judge_correct\n        summary.total_no_boxed_found += stats.no_boxed_found\n\n        # Update difficulty level summary stats\n        summary.level1_completed += stats.level1_completed\n        summary.level1_correct += stats.level1_correct\n        summary.level2_completed += stats.level2_completed\n        summary.level2_correct += stats.level2_correct\n        summary.level3_completed += stats.level3_completed\n        summary.level3_correct += stats.level3_correct\n\n    def _display_summary(\n        self,\n        summary: GAIASummaryStats,\n        run_stats_list: List[Tuple[str, GAIATaskStats]],\n        completed_files: List[str],\n        benchmark_name_std: str,\n        all_task_results: Dict[str, List[bool]] = None,\n    ):\n        \"\"\"Display summary statistics\"\"\"\n        print(\"=\" * 80)\n        print(\"SUMMARY STATISTICS\")\n        print(\"=\" * 80)\n\n        # Estimate completion time using overall progress rate\n        if summary.total_completed > 0:\n            num_runs = len(run_stats_list) if run_stats_list else 1\n            expected_total_tasks = self.total_tasks_per_run * num_runs\n            remaining_tasks = expected_total_tasks - summary.total_completed\n            earliest_start = find_earliest_start_time(completed_files)\n            last_end = find_latest_end_time(completed_files)\n            completion_estimate = estimate_completion_time(\n                expected_total_tasks, summary.total_completed, completed_files\n            )\n\n            print(\n                f\"Current Tasks: {summary.total_tasks} ({summary.total_completed} completed, {summary.total_running} running)\"\n            )\n            print(f\"Remaining Tasks to Complete: {remaining_tasks}\")\n            if earliest_start:\n                elapsed_time = 
last_end - earliest_start\n                elapsed_minutes = elapsed_time.total_seconds() / 60\n                overall_rate = (\n                    summary.total_completed / elapsed_minutes\n                    if elapsed_minutes > 0\n                    else 0\n                )\n                print(f\"Elapsed Time: {elapsed_minutes:.1f} minutes\")\n                print(f\"Completion Rate: {overall_rate:.2f} tasks/minute\")\n\n            print(f\"Estimated Time to Complete: {completion_estimate}\")\n\n        # Display each run's correct percentage\n        if run_stats_list:\n            print()\n            print(\"INDIVIDUAL RUN ACCURACIES:\")\n            for run_name, stats in run_stats_list:\n                if stats.completed > 0:\n                    accuracy_bar = create_progress_bar(stats.judge_accuracy)\n                    print(\n                        f\"  {run_name}: {stats.judge_correct}/{stats.completed} {accuracy_bar}\"\n                    )\n\n                    # Add difficulty level information for each run\n                    if (\n                        stats.level1_completed > 0\n                        or stats.level2_completed > 0\n                        or stats.level3_completed > 0\n                    ):\n                        # Calculate total expected tasks for each difficulty level\n                        total_level1 = sum(\n                            1\n                            for level in self.task_difficulty_map.values()\n                            if level == 1\n                        )\n                        total_level2 = sum(\n                            1\n                            for level in self.task_difficulty_map.values()\n                            if level == 2\n                        )\n                        total_level3 = sum(\n                            1\n                            for level in self.task_difficulty_map.values()\n                            if level == 3\n          
              )\n\n                        difficulty_info = (\n                            f\"    L1: {stats.level1_correct}/{stats.level1_completed}/{total_level1} ({stats.level1_accuracy:.1f}%) | \"\n                            f\"L2: {stats.level2_correct}/{stats.level2_completed}/{total_level2} ({stats.level2_accuracy:.1f}%) | \"\n                            f\"L3: {stats.level3_correct}/{stats.level3_completed}/{total_level3} ({stats.level3_accuracy:.1f}%)\"\n                        )\n                        print(f\"    {difficulty_info}\")\n                        print()\n                else:\n                    print(\n                        f\"  {run_name}: {stats.judge_correct}/{stats.completed} (N/A)\"\n                    )\n\n            # Display mean accuracy and standard deviation (Pass@1 Acc (Avg@n))\n            num_runs = len(run_stats_list)\n            mean_acc, std_acc = summary.average_run_accuracy(run_stats_list)\n            if mean_acc > 0:\n                print()\n                if num_runs > 1:\n                    print(\n                        f\"Pass@1 Acc (Avg@{num_runs}): {mean_acc:.1f}% ± {std_acc:.1f}%\"\n                    )\n                else:\n                    print(f\"MEAN ACCURACY: {mean_acc:.1f}% ± {std_acc:.1f}%\")\n\n            # Display Pass@n if multiple runs\n            if num_runs > 1 and all_task_results:\n                # Use the first run's total as reference\n                first_run_total = (\n                    run_stats_list[0][1].total\n                    if run_stats_list\n                    else summary.total_tasks\n                )\n                pass_at_n_count, pass_at_n_percentage = self._calculate_pass_at_n(\n                    all_task_results, first_run_total\n                )\n                pass_at_n_bar = create_progress_bar(pass_at_n_percentage)\n                print(\n                    f\"Pass@{num_runs}: {pass_at_n_count}/{first_run_total} {pass_at_n_bar}\"\n       
         )\n\n            # Display no boxed content found statistics\n            if summary.total_completed > 0:\n                print(\n                    f\"No \\\\boxed{{}} content found: {summary.total_no_boxed_found}/{summary.total_completed} ({summary.total_no_boxed_found / summary.total_completed * 100:.1f}%)\"\n                )\n\n        # Display overall judge accuracy after individual runs\n        if summary.total_completed > 0:\n            print()\n            accuracy_bar = create_progress_bar(summary.total_judge_accuracy)\n            print(\n                f\"OVERALL JUDGE ACCURACY: {summary.total_judge_correct}/{summary.total_completed} {accuracy_bar}\"\n            )\n\n            # Calculate and display overall average turns\n            total_turns = sum(stats.total_turns for _, stats in run_stats_list)\n            total_tasks_with_turns = sum(\n                stats.completed_tasks_with_turns for _, stats in run_stats_list\n            )\n            if total_tasks_with_turns > 0:\n                overall_avg_turns = total_turns / total_tasks_with_turns\n                print(f\"OVERALL AVERAGE TURNS: {overall_avg_turns:.1f}\")\n\n        # Display difficulty level summary if available\n        if (\n            summary.level1_completed > 0\n            or summary.level2_completed > 0\n            or summary.level3_completed > 0\n        ):\n            print()\n            print(\"DIFFICULTY LEVEL SUMMARY:\")\n            # Calculate total expected tasks for each difficulty level\n            total_level1 = sum(\n                1 for level in self.task_difficulty_map.values() if level == 1\n            )\n            total_level2 = sum(\n                1 for level in self.task_difficulty_map.values() if level == 2\n            )\n            total_level3 = sum(\n                1 for level in self.task_difficulty_map.values() if level == 3\n            )\n\n            print(\n                f\"  L1: 
{summary.level1_correct}/{summary.level1_completed}/{total_level1} ({summary.level1_accuracy:.1f}%) | L2: {summary.level2_correct}/{summary.level2_completed}/{total_level2} ({summary.level2_accuracy:.1f}%) | L3: {summary.level3_correct}/{summary.level3_completed}/{total_level3} ({summary.level3_accuracy:.1f}%)\"\n            )\n\n        print(\"=\" * 80)\n        print()\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/common_benchmark.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport asyncio\nimport gc\nimport json\nimport os\nimport random\nimport re\nfrom abc import ABC\nfrom concurrent.futures import ProcessPoolExecutor\nfrom dataclasses import asdict, dataclass, field\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Optional, Tuple\n\nimport hydra\n\n# Import from the new modular structure\nfrom evaluators.eval_utils import verify_answer_for_datasets\nfrom omegaconf import DictConfig, OmegaConf\nfrom src.core.pipeline import (\n    create_pipeline_components,\n    execute_task_pipeline,\n)\nfrom src.logging.summary_time_cost import generate_summary\nfrom src.utils.prompt_utils import (\n    FAILURE_EXPERIENCE_FOOTER,\n    FAILURE_EXPERIENCE_HEADER,\n    FAILURE_EXPERIENCE_ITEM,\n    FORMAT_ERROR_MESSAGE,\n)\n\n\ndef _task_worker(task_dict, cfg_dict, evaluator_kwargs):\n    \"\"\"\n    Worker function to run a single task in a separate process.\n    This function is called by ProcessPoolExecutor and must be at module level.\n    \"\"\"\n    import asyncio\n\n    from omegaconf import OmegaConf\n\n    # Reconstruct config in this process\n    cfg = OmegaConf.create(cfg_dict)\n\n    # Reconstruct task\n    task = BenchmarkTask(\n        task_id=task_dict[\"task_id\"],\n        task_question=task_dict[\"task_question\"],\n        ground_truth=task_dict[\"ground_truth\"],\n        file_path=task_dict.get(\"file_path\"),\n        metadata=task_dict.get(\"metadata\", {}),\n    )\n\n    # Create evaluator in this process\n    evaluator = GenericEvaluator(\n        data_dir=evaluator_kwargs[\"data_dir\"],\n        benchmark_name=evaluator_kwargs[\"benchmark_name\"],\n        cfg=cfg,\n        metadata_file=evaluator_kwargs.get(\"metadata_file\", \"metadata.jsonl\"),\n        task_id_field=evaluator_kwargs.get(\"task_id_field\", \"task_id\"),\n        question_field=evaluator_kwargs.get(\"question_field\", 
\"task_question\"),\n        ground_truth_field=evaluator_kwargs.get(\"ground_truth_field\", \"ground_truth\"),\n        file_name_field=evaluator_kwargs.get(\"file_name_field\"),\n    )\n\n    # Run task in new event loop\n    loop = asyncio.new_event_loop()\n    asyncio.set_event_loop(loop)\n\n    # Set exception handler to suppress \"Task exception was never retrieved\" warnings\n    def exception_handler(loop, context):\n        # Suppress all asyncio internal warnings for cleaner output\n        pass\n\n    loop.set_exception_handler(exception_handler)\n\n    try:\n        result = loop.run_until_complete(evaluator.run_single_task(task))\n        # Convert result to dict for serialization\n        return asdict(result)\n    finally:\n        loop.close()\n\n\n@dataclass\nclass BenchmarkTask:\n    \"\"\"Generic benchmark task data structure\"\"\"\n\n    task_id: str\n    task_question: str\n    ground_truth: str\n    file_path: Optional[str] = None\n    metadata: Dict[str, Any] = field(default_factory=dict)\n    model_boxed_answer: str = \"\"\n    status: str = \"pending\"  # pending, success, failed\n\n\n@dataclass\nclass BenchmarkResult:\n    \"\"\"Generic benchmark evaluation result structure\"\"\"\n\n    task_id: str\n    task_question: str\n    ground_truth: str\n    file_path: Optional[str]\n    status: str\n    model_boxed_answer: str = \"\"\n    metadata: Dict[str, Any] = field(default_factory=dict)\n    error_message: str = \"\"\n    final_judge_result: Optional[str] = None\n    judge_type: Optional[str] = None\n    log_file_path: Optional[str] = None\n    # Pass@K support fields\n    attempts: List[Dict[str, Any]] = field(default_factory=list)  # Store all attempts\n    pass_at_k_success: bool = False  # Whether task passed using pass@k evaluation\n    k_value: int = 1  # The k value used for this evaluation\n\n\nclass BenchmarkEvaluator(ABC):\n    \"\"\"Abstract base class for benchmark evaluators\"\"\"\n\n    def __init__(self, data_dir: str, 
benchmark_name: str, cfg: DictConfig):\n        \"\"\"\n        Initialize benchmark evaluator\n\n        Args:\n            data_dir: Path to benchmark data directory\n            benchmark_name: Name of the benchmark\n            cfg: The Hydra configuration object\n        \"\"\"\n        self.data_dir = Path(data_dir)\n        self.benchmark_name = benchmark_name\n        self.cfg = cfg\n        self.pass_at_k = cfg.benchmark.execution.get(\"pass_at_k\", 1)\n        self.tasks: List[BenchmarkTask] = []\n        self.results: List[BenchmarkResult] = []\n\n        # Format error tracking and retry configuration\n        # Read from agent config as it's part of context management\n        self.context_compress_limit = cfg.agent.get(\"context_compress_limit\", 0)\n\n        # Get LLM provider and model from the config object\n        self.llm_provider = cfg.llm.provider\n        self.llm_model = cfg.llm.model_name\n\n        # Initialize pipeline components\n        print(\"Initializing pipeline components...\")\n        (\n            self.main_agent_tool_manager,\n            self.sub_agent_tool_managers,\n            self.output_formatter,\n        ) = create_pipeline_components(cfg)\n        print(\n            f\"Pipeline components initialized successfully! 
Using pass@{self.pass_at_k}\"\n        )\n\n    def get_log_dir(self) -> Path:\n        \"\"\"Get the log directory for the current benchmark and model.\"\"\"\n        return Path(hydra.core.hydra_config.HydraConfig.get().run.dir)\n\n    async def run_single_task(self, task: BenchmarkTask) -> BenchmarkResult:\n        \"\"\"\n        Run inference for a single benchmark task with pass@k support\n\n        Args:\n            task: BenchmarkTask object\n\n        Returns:\n            BenchmarkResult object\n        \"\"\"\n        print(f\"Processing task {task.task_id} with pass@{self.pass_at_k}\")\n\n        result = BenchmarkResult(\n            task_id=task.task_id,\n            task_question=task.task_question,\n            ground_truth=task.ground_truth,\n            file_path=task.file_path,\n            model_boxed_answer=\"\",\n            status=\"pending\",\n            metadata=task.metadata.copy(),\n            k_value=self.pass_at_k,\n        )\n\n        logs_dir = self.get_log_dir()\n        found_correct_answer = False\n\n        # Print debug info about log directory\n        print(f\"  Current log directory: {logs_dir}\")\n\n        try:\n            # Prepare task\n            task_description, task_file_path = self.prepare_task_description(task)\n\n            # Run up to k attempts (with early stopping when correct answer found)\n            for attempt in range(1, self.pass_at_k + 1):\n                print(f\"  Attempt {attempt}/{self.pass_at_k} for task {task.task_id}\")\n                format_retry_count = 0\n\n                # Check if log file exists for this specific attempt in current directory\n                log_pattern = f\"task_{task.task_id}_attempt-{attempt}_*.json\"\n                matching_logs = []\n\n                # Search only in current log directory\n                if logs_dir.exists():\n                    dir_logs = sorted(list(logs_dir.glob(log_pattern)))\n                    if dir_logs:\n                        
matching_logs.extend(dir_logs)\n\n                if matching_logs:\n                    # Sort by timestamp in filename to get the most recent\n                    def extract_timestamp(file_path):\n                        filename = file_path.name\n                        # Extract timestamp from filename like: task_xxx_attempt-1_format-retry-0_2025-08-13-10-13-20.json\n                        # The timestamp is the last part before .json\n                        if \"_\" in filename and filename.endswith(\".json\"):\n                            timestamp_part = filename.split(\"_\")[-1].replace(\n                                \".json\", \"\"\n                            )\n                            # Convert timestamp to datetime for proper sorting\n                            from datetime import datetime\n\n                            return datetime.strptime(\n                                timestamp_part, \"%Y-%m-%d-%H-%M-%S\"\n                            )\n                        return filename\n\n                    matching_logs = sorted(matching_logs, key=extract_timestamp)\n\n                attempt_result = {\n                    \"attempt_number\": attempt,\n                    \"model_boxed_answer\": \"\",\n                    \"status\": \"pending\",\n                    \"log_file_path\": None,\n                    \"final_judge_result\": None,\n                    \"judge_type\": None,\n                    \"is_correct\": False,\n                }\n\n                # Try to load existing result for this attempt\n                if matching_logs:\n                    log_file = matching_logs[-1]\n                    attempt_result[\"log_file_path\"] = str(log_file)\n                    print(\n                        f\"    Found existing log for attempt {attempt}: {log_file.name}\"\n                    )\n\n                    match = re.search(r\"retry-(\\d+)\", os.path.basename(str(log_file)))\n                    if match:\n             
           format_retry_count = int(match.group(1))\n                    else:\n                        raise ValueError(\n                            f\"Failed to extract retry number from log file: {log_file}\"\n                        )\n\n                    try:\n                        with open(log_file) as f:\n                            log_data = json.loads(f.read())\n                            if log_data.get(\"status\") == \"success\":\n                                format_retry_count += 1\n                            if log_data.get(\"final_boxed_answer\"):\n                                attempt_result[\"model_boxed_answer\"] = log_data[\n                                    \"final_boxed_answer\"\n                                ]\n                                attempt_result[\"status\"] = log_data.get(\"status\")\n                                # Check if we already have judge result in log\n                                if log_data.get(\"final_judge_result\"):\n                                    attempt_result[\"final_judge_result\"] = log_data[\n                                        \"final_judge_result\"\n                                    ]\n                                    attempt_result[\"judge_type\"] = log_data.get(\n                                        \"judge_type\", \"\"\n                                    )\n                                    attempt_result[\"is_correct\"] = (\n                                        log_data[\"final_judge_result\"] == \"CORRECT\"\n                                    )\n                                    # Load evaluation details if available\n                                    if log_data.get(\"eval_details\"):\n                                        attempt_result[\"eval_details\"] = log_data[\n                                            \"eval_details\"\n                                        ]\n                                print(\n                                    f\"    
Loaded existing result: {attempt_result['model_boxed_answer']}\"\n                                )\n                    except Exception as e:\n                        print(f\"    Error loading log file {log_file}: {e}\")\n\n                # Run inference if no existing result or if we have a format error\n                if (\n                    not attempt_result[\"model_boxed_answer\"]\n                    or attempt_result[\"model_boxed_answer\"] == FORMAT_ERROR_MESSAGE\n                ):\n                    # Try to get a valid response with format retry\n                    print(f\"TASK ID: {task.task_id}, ATTEMPT: {attempt}\")\n\n                    max_format_retries = self.context_compress_limit\n\n                    # Track accumulated failure experiences for this attempt\n                    # Start with the original task description\n                    current_task_description = task_description\n                    failure_experiences = []\n\n                    # Resume: Recover failure experiences from previous retry logs\n                    if format_retry_count > 0 and logs_dir.exists():\n                        print(\n                            f\"    Resuming from retry {format_retry_count}, recovering previous failure experiences...\"\n                        )\n                        for prev_retry in range(format_retry_count):\n                            prev_log_pattern = f\"task_{task.task_id}_attempt-{attempt}_format-retry-{prev_retry}_*.json\"\n                            prev_logs = sorted(list(logs_dir.glob(prev_log_pattern)))\n                            if prev_logs:\n                                prev_log_file = prev_logs[-1]  # Get the latest one\n                                try:\n                                    with open(\n                                        prev_log_file, \"r\", encoding=\"utf-8\"\n                                    ) as f:\n                                        prev_log_data = 
json.load(f)\n                                        # Extract failure experience from trace_data\n                                        trace_data = prev_log_data.get(\"trace_data\", {})\n                                        prev_failure_exp = trace_data.get(\n                                            \"failure_experience_summary\"\n                                        )\n                                        if prev_failure_exp:\n                                            failure_experiences.append(prev_failure_exp)\n                                            print(\n                                                f\"      Recovered failure experience from retry {prev_retry}\"\n                                            )\n                                except Exception as e:\n                                    print(\n                                        f\"      Warning: Failed to load previous log {prev_log_file}: {e}\"\n                                    )\n\n                        # Rebuild enhanced task description with recovered failure experiences\n                        if failure_experiences:\n                            current_task_description += FAILURE_EXPERIENCE_HEADER\n                            for idx, exp in enumerate(failure_experiences, 1):\n                                current_task_description += (\n                                    FAILURE_EXPERIENCE_ITEM.format(\n                                        attempt_number=idx,\n                                        failure_summary=exp,\n                                    )\n                                )\n                            current_task_description += FAILURE_EXPERIENCE_FOOTER\n                            print(\n                                f\"    Recovered {len(failure_experiences)} failure experience(s) from previous retries\"\n                            )\n\n                    while format_retry_count <= max_format_retries:\n                 
       try:\n                            # Check if this is the final retry (no more chances after this)\n                            is_final_retry = format_retry_count == max_format_retries\n\n                            (\n                                response,\n                                final_boxed_answer,\n                                log_file_path,\n                                failure_experience_summary,\n                            ) = await execute_task_pipeline(\n                                cfg=self.cfg,\n                                task_id=f\"{task.task_id}_attempt-{attempt}_format-retry-{format_retry_count}\",\n                                task_file_name=task_file_path,\n                                task_description=current_task_description,\n                                main_agent_tool_manager=self.main_agent_tool_manager,\n                                sub_agent_tool_managers=self.sub_agent_tool_managers,\n                                output_formatter=self.output_formatter,\n                                ground_truth=task.ground_truth,\n                                log_dir=str(self.get_log_dir()),\n                                is_final_retry=is_final_retry,\n                            )\n\n                            attempt_result[\"model_boxed_answer\"] = (\n                                final_boxed_answer if final_boxed_answer else \"\"\n                            )\n                            attempt_result[\"log_file_path\"] = log_file_path\n\n                            # Check for format error\n                            if (\n                                attempt_result[\"model_boxed_answer\"]\n                                == FORMAT_ERROR_MESSAGE\n                            ):\n                                format_retry_count += 1\n                                if format_retry_count <= max_format_retries:\n                                    # Use the model-generated failure 
experience summary\n                                    print(\n                                        f\"    Format error detected, using model-generated failure summary for retry {format_retry_count}...\"\n                                    )\n\n                                    if failure_experience_summary:\n                                        failure_experiences.append(\n                                            failure_experience_summary\n                                        )\n\n                                        # Build enhanced task description with accumulated failure experiences\n                                        # Start fresh from original task_description each time\n                                        current_task_description = task_description\n                                        current_task_description += (\n                                            FAILURE_EXPERIENCE_HEADER\n                                        )\n                                        for idx, exp in enumerate(\n                                            failure_experiences, 1\n                                        ):\n                                            current_task_description += (\n                                                FAILURE_EXPERIENCE_ITEM.format(\n                                                    attempt_number=idx,\n                                                    failure_summary=exp,\n                                                )\n                                            )\n                                        current_task_description += (\n                                            FAILURE_EXPERIENCE_FOOTER\n                                        )\n\n                                        print(\n                                            f\"    Enhanced task description with {len(failure_experiences)} failure experience(s)\"\n                                        )\n                         
           else:\n                                        print(\n                                            \"    No failure experience summary generated, retrying without enhancement...\"\n                                        )\n                                    continue\n                                else:\n                                    # Exceeded format retry limit\n                                    attempt_result[\"status\"] = \"success\"\n                                    attempt_result[\"model_boxed_answer\"] = (\n                                        f\"{FORMAT_ERROR_MESSAGE} (after {max_format_retries} retries)\"\n                                    )\n                                    attempt_result[\"error_message\"] = (\n                                        f\"Exceeded format error retry limit ({max_format_retries})\"\n                                    )\n                                    break\n                            else:\n                                # Got valid response, success\n                                attempt_result[\"status\"] = \"success\"\n                                break\n\n                        except Exception as e:\n                            attempt_result[\"status\"] = \"failed\"\n                            attempt_result[\"error_message\"] = str(e)\n                            print(\n                                f\"    Error in attempt {attempt}, format retry {format_retry_count}: {e}\"\n                            )\n                            break\n\n                # Perform LLM verification if we have an answer and haven't verified yet\n                if (\n                    attempt_result[\"model_boxed_answer\"]\n                    and attempt_result[\"final_judge_result\"] is None\n                    and task.ground_truth is not None\n                ):\n                    print(f\"    Verifying answer for attempt {attempt}...\")\n                    try:\n           
             (\n                            evaluation_result,\n                            judge_type,\n                            eval_details,\n                        ) = await verify_answer_for_datasets(\n                            benchmark_name=self.benchmark_name,\n                            question=task.task_question,\n                            target=task.ground_truth,\n                            predicted_answer=attempt_result[\"model_boxed_answer\"],\n                            metadata=task.metadata,\n                        )\n                        attempt_result[\"final_judge_result\"] = evaluation_result\n                        attempt_result[\"judge_type\"] = judge_type\n                        attempt_result[\"is_correct\"] = evaluation_result == \"CORRECT\"\n\n                        # Store evaluation details (e.g., for DeepSearchQA metrics)\n                        if eval_details:\n                            attempt_result[\"eval_details\"] = eval_details\n\n                        # Update the log file with verification result\n                        if attempt_result[\"log_file_path\"]:\n                            self._update_log_file_with_evaluation(\n                                attempt_result[\"model_boxed_answer\"],\n                                attempt_result[\"log_file_path\"],\n                                evaluation_result,\n                                judge_type,\n                                eval_details,  # Pass eval_details to save in log file\n                            )\n\n                        if attempt_result[\"is_correct\"]:\n                            print(f\"    ✅ Attempt {attempt}: CORRECT!\")\n                            found_correct_answer = True\n                        else:\n                            print(\n                                f\"    ❌ Attempt {attempt}: INCORRECT ({evaluation_result})\"\n                            )\n\n                    except Exception as 
e:\n                        print(f\"    Error verifying attempt {attempt}: {e}\")\n                        attempt_result[\"final_judge_result\"] = \"ERROR\"\n                        attempt_result[\"judge_type\"] = \"error\"\n                        attempt_result[\"is_correct\"] = False\n\n                elif attempt_result[\"is_correct\"]:\n                    print(f\"    ✅ Attempt {attempt}: CORRECT (cached)\")\n                    found_correct_answer = True\n\n                elif attempt_result[\"final_judge_result\"]:\n                    print(\n                        f\"    ❌ Attempt {attempt}: INCORRECT (cached: {attempt_result['final_judge_result']})\"\n                    )\n                else:\n                    print(f\"    ⚠️  Attempt {attempt}: No valid answer to verify\")\n\n                result.attempts.append(attempt_result)\n\n                # Update main result with the first successful attempt or best attempt so far\n                if attempt == 1 or (\n                    attempt_result[\"status\"] == \"success\"\n                    and not result.model_boxed_answer\n                ):\n                    result.model_boxed_answer = attempt_result[\"model_boxed_answer\"]\n                    result.log_file_path = attempt_result[\"log_file_path\"]\n                    result.status = attempt_result[\"status\"]\n                    if \"error_message\" in attempt_result:\n                        result.error_message = attempt_result[\"error_message\"]\n\n                # Early stopping: if we found a correct answer, we can stop\n                if found_correct_answer:\n                    print(\n                        f\"    🎯 Found correct answer! 
Stopping early after {attempt} attempts.\"\n                    )\n                    break\n\n        except Exception as e:\n            result.error_message = str(e)\n            result.status = \"failed\"\n            print(f\"Error processing task {task.task_id}: {e}\")\n\n        finally:\n            result.pass_at_k_success = found_correct_answer\n\n            # Set main result judge result based on pass@k outcome\n            if found_correct_answer:\n                result.final_judge_result = \"PASS_AT_K_SUCCESS\"\n                result.judge_type = \"pass_at_k\"\n            else:\n                if result.ground_truth is None:\n                    result.final_judge_result = \"TEST_SET_MODE\"\n                else:\n                    result.final_judge_result = \"PASS_AT_K_FAILED\"\n                result.judge_type = \"pass_at_k\"\n\n            print(f\"Task {task.task_id} completed with {len(result.attempts)} attempts\")\n            if result.ground_truth is not None:\n                print(\n                    f\"    Pass@{self.pass_at_k} result: {'✅ SUCCESS' if found_correct_answer else '❌ FAILED'}\"\n                )\n\n        gc.collect()\n        return result\n\n    def _run_single_task_sync(self, task: BenchmarkTask) -> BenchmarkResult:\n        \"\"\"Sync wrapper for run_single_task to be used in threads\"\"\"\n        loop = asyncio.new_event_loop()\n        asyncio.set_event_loop(loop)\n\n        # Set exception handler to suppress \"Task exception was never retrieved\" warnings\n        def exception_handler(loop, context):\n            # Suppress all asyncio internal warnings for cleaner output\n            pass\n\n        loop.set_exception_handler(exception_handler)\n\n        try:\n            # Direct await is simpler and cleaner than gather for single task\n            return loop.run_until_complete(self.run_single_task(task))\n        finally:\n            loop.close()\n\n    def run_parallel_inference(\n        self, 
tasks: List[BenchmarkTask], max_concurrent: int = 3\n    ) -> List[BenchmarkResult]:\n        \"\"\"Run inference on multiple tasks in parallel using multiprocessing\"\"\"\n        print(\n            f\"Running inference on {len(tasks)} tasks with max_concurrent={max_concurrent} (multiprocessing)\"\n        )\n\n        # Serialize config\n        cfg_dict = OmegaConf.to_container(self.cfg, resolve=True)\n\n        # Shuffle tasks to avoid order bias and improve balancing\n        shuffled_tasks = tasks.copy()\n        random.shuffle(shuffled_tasks)\n\n        # Prepare evaluator kwargs for worker processes\n        evaluator_kwargs = {\n            \"data_dir\": str(self.data_dir),\n            \"benchmark_name\": self.benchmark_name,\n        }\n        # Add GenericEvaluator specific kwargs if available\n        if hasattr(self, \"metadata_file\"):\n            evaluator_kwargs[\"metadata_file\"] = str(self.metadata_file.name)\n        if hasattr(self, \"task_id_field\"):\n            evaluator_kwargs[\"task_id_field\"] = self.task_id_field\n        if hasattr(self, \"question_field\"):\n            evaluator_kwargs[\"question_field\"] = self.question_field\n        if hasattr(self, \"ground_truth_field\"):\n            evaluator_kwargs[\"ground_truth_field\"] = self.ground_truth_field\n        if hasattr(self, \"file_name_field\"):\n            evaluator_kwargs[\"file_name_field\"] = self.file_name_field\n\n        # Prepare serializable arguments for worker processes\n        worker_args = []\n        for task in shuffled_tasks:\n            task_dict = {\n                \"task_id\": task.task_id,\n                \"task_question\": task.task_question,\n                \"ground_truth\": task.ground_truth,\n                \"file_path\": task.file_path,\n                \"metadata\": task.metadata,\n            }\n            worker_args.append((task_dict, cfg_dict, evaluator_kwargs))\n\n        # Use ProcessPoolExecutor for true parallelism (bypasses GIL)\n  
      processed_results = []\n        task_index_map = {\n            task.task_id: (i, task) for i, task in enumerate(shuffled_tasks)\n        }\n        results_dict = {}  # Store results by task_id to maintain order\n\n        executor = None\n        try:\n            executor = ProcessPoolExecutor(max_workers=max_concurrent)\n            # Submit all tasks\n            future_to_task_id = {}\n            for args in worker_args:\n                task_dict = args[0]  # First element is task_dict\n                future = executor.submit(_task_worker, *args)\n                future_to_task_id[future] = task_dict[\"task_id\"]\n\n            # Collect results as they complete\n            from concurrent.futures import as_completed\n\n            for future in as_completed(future_to_task_id):\n                task_id = future_to_task_id[future]\n                try:\n                    result_dict = future.result()\n                    # Reconstruct BenchmarkResult from dict\n                    result = BenchmarkResult(**result_dict)\n                    results_dict[task_id] = result\n                    completed = len(results_dict)\n                    print(\n                        f\"Progress: {completed}/{len(shuffled_tasks)} tasks completed\"\n                    )\n                except Exception as e:\n                    print(f\"Exception in task {task_id}: {e}\")\n                    # Get original task for error result\n                    _, original_task = task_index_map[task_id]\n                    error_result = BenchmarkResult(\n                        task_id=original_task.task_id,\n                        task_question=original_task.task_question,\n                        ground_truth=original_task.ground_truth,\n                        file_path=original_task.file_path,\n                        model_boxed_answer=\"\",\n                        status=\"failed\",\n                        metadata=original_task.metadata.copy(),\n            
            error_message=str(e),\n                    )\n                    results_dict[task_id] = error_result\n        except KeyboardInterrupt:\n            print(\"\\n⚠️  Received interrupt signal, shutting down gracefully...\")\n            if executor:\n                print(\"  Cancelling pending tasks and terminating worker processes...\")\n                # Cancel all pending futures\n                for future in future_to_task_id:\n                    future.cancel()\n\n                # Forcefully terminate worker processes\n                # Access internal processes and terminate them\n                if hasattr(executor, \"_processes\") and executor._processes:\n                    for pid, process in executor._processes.items():\n                        try:\n                            if process.is_alive():\n                                print(f\"    Terminating worker process {pid}...\")\n                                process.terminate()\n                        except Exception as e:\n                            print(\n                                f\"    Warning: Failed to terminate process {pid}: {e}\"\n                            )\n\n                    # Give processes a short time to terminate gracefully\n                    import time\n\n                    time.sleep(0.5)\n\n                    # Force kill any remaining processes\n                    for pid, process in executor._processes.items():\n                        try:\n                            if process.is_alive():\n                                print(f\"    Force killing worker process {pid}...\")\n                                process.kill()\n                        except Exception as e:\n                            print(f\"    Warning: Failed to kill process {pid}: {e}\")\n\n                # Shutdown executor without waiting for pending tasks\n                executor.shutdown(wait=False, cancel_futures=True)\n            print(\"  Shutdown 
complete.\")\n            raise\n        finally:\n            # Ensure executor is properly cleaned up\n            if executor:\n                try:\n                    executor.shutdown(wait=True)\n                except Exception:\n                    pass  # Ignore errors during cleanup\n\n        # Reconstruct results in original task order\n        processed_results = [results_dict[task.task_id] for task in shuffled_tasks]\n\n        # Sort results to maintain original task order\n        task_id_to_index = {task.task_id: i for i, task in enumerate(tasks)}\n        processed_results.sort(\n            key=lambda r: task_id_to_index.get(r.task_id, len(tasks))\n        )\n\n        self.results = processed_results\n        return processed_results\n\n    def save_results(self, output_file: str) -> str:\n        \"\"\"Save evaluation results to JSONL file\"\"\"\n        output_path = Path(output_file)\n        output_path.parent.mkdir(parents=True, exist_ok=True)\n\n        with open(output_path, \"w\", encoding=\"utf-8\") as f:\n            for result in self.results:\n                f.write(json.dumps(asdict(result), ensure_ascii=False) + \"\\n\")\n\n        print(f\"Results saved to {output_path}\")\n        return str(output_path)\n\n    def evaluate_accuracy(self) -> float:\n        \"\"\"Evaluate pass@k accuracy (verification already done in run_single_task)\"\"\"\n        if not self.results:\n            print(\"No results to evaluate\")\n            return 0.0\n\n        print(\n            f\"Calculating pass@{self.pass_at_k} accuracy for {len(self.results)} results...\"\n        )\n\n        correct_count = 0\n        total_count = 0\n\n        for result in self.results:\n            total_count += 1\n\n            # Display task results\n            print(f\"\\nTask {result.task_id}:\")\n            print(f\"  Attempts: {len(result.attempts)}\")\n            if result.ground_truth is not None:\n                print(\n                    f\"  
Pass@{self.pass_at_k}: {'✅ SUCCESS' if result.pass_at_k_success else '❌ FAILED'}\"\n                )\n\n            print(\"  \" + \"=\" * 50)\n            print(f\"  Reference: {result.ground_truth}\")\n            print(\"  \" + \"=\" * 50)\n\n            if result.pass_at_k_success:\n                correct_count += 1\n\n        pass_at_k_accuracy = correct_count / total_count if total_count > 0 else 0.0\n\n        print(f\"\\nPass@{self.pass_at_k} Final Results:\")\n        print(f\"Tasks passed: {correct_count}/{total_count}\")\n        print(f\"Pass@{self.pass_at_k} Accuracy: {pass_at_k_accuracy:.2%}\")\n\n        return pass_at_k_accuracy\n\n    def _update_log_file_with_evaluation(\n        self,\n        model_boxed_answer: str,\n        log_file_path: str,\n        evaluation_result: str,\n        judge_type: str,\n        eval_details: Optional[Dict[str, Any]] = None,\n    ):\n        \"\"\"Helper method to update log file with evaluation result\"\"\"\n        try:\n            log_file = Path(log_file_path)\n            # Read existing data\n            with open(log_file, \"r\", encoding=\"utf-8\") as f:\n                log_data = json.load(f)\n\n            # Update with evaluation result\n            log_data[\"final_boxed_answer\"] = model_boxed_answer\n            log_data[\"final_judge_result\"] = evaluation_result\n            log_data[\"judge_type\"] = judge_type\n\n            # Store evaluation details (e.g., for DeepSearchQA metrics)\n            if eval_details:\n                log_data[\"eval_details\"] = eval_details\n\n            # Write to a temporary file and then atomically replace\n            temp_log_file = log_file.with_suffix(f\"{log_file.suffix}.tmp\")\n            with open(temp_log_file, \"w\", encoding=\"utf-8\") as f:\n                json.dump(log_data, f, indent=2, ensure_ascii=False)\n\n            os.replace(temp_log_file, log_file)\n            print(f\"    Updated log file {log_file.name} with evaluation 
result.\")\n        except Exception as e:\n            print(f\"    Error updating log file {log_file_path}: {e}\")\n\n\nclass GenericEvaluator(BenchmarkEvaluator):\n    \"\"\"Generic benchmark evaluator for JSONL format\"\"\"\n\n    def __init__(\n        self,\n        data_dir: str,\n        benchmark_name: str,\n        cfg: DictConfig,\n        metadata_file: str = \"metadata.jsonl\",\n        task_id_field: str = \"task_id\",\n        question_field: str = \"task_question\",\n        ground_truth_field: str = \"ground_truth\",\n        file_name_field: Optional[str] = \"file_name_field\",\n    ):\n        \"\"\"\n        Initialize generic evaluator\n\n        Args:\n            data_dir: Path to benchmark data directory\n            benchmark_name: Name of the benchmark\n            cfg: The Hydra configuration object\n            metadata_file: Name of the metadata file\n            task_id_field: Field name for task ID in the data\n            question_field: Field name for task question in the data\n            ground_truth_field: Field name for ground truth answer in the data\n            file_name_field: Field name for file name in the data (optional)\n            pass_at_k: Pass@K value for evaluation (default: 1)\n        \"\"\"\n        super().__init__(data_dir=data_dir, benchmark_name=benchmark_name, cfg=cfg)\n        self.metadata_file = self.data_dir / metadata_file\n        self.task_id_field = task_id_field\n        self.question_field = question_field\n        self.ground_truth_field = ground_truth_field\n        self.file_name_field = file_name_field\n        self.tasks: List[BenchmarkTask] = []\n        self.results: List[BenchmarkResult] = []\n\n    def load_tasks(self, limit: Optional[int] = None) -> List[BenchmarkTask]:\n        \"\"\"\n        Load benchmark tasks from metadata.jsonl\n\n        Args:\n            limit: Maximum number of tasks to load (None for all)\n\n        Returns:\n            List of BenchmarkTask objects\n        
\"\"\"\n        print(f\"Loading tasks from {self.metadata_file}\")\n\n        if not self.metadata_file.exists():\n            raise FileNotFoundError(f\"Metadata file not found: {self.metadata_file}\")\n\n        tasks = []\n        with open(self.metadata_file, \"r\", encoding=\"utf-8\") as f:\n            for i, line in enumerate(f):\n                if limit and i >= limit:\n                    break\n\n                try:\n                    data = json.loads(line.strip())\n\n                    # Extract file path if specified\n                    file_path = None\n                    if self.file_name_field and self.file_name_field in data:\n                        file_path = data[self.file_name_field]\n\n                    # Create metadata dict with all remaining fields\n                    metadata = {\n                        k: v\n                        for k, v in data.items()\n                        if k\n                        not in [\n                            self.task_id_field,\n                            self.question_field,\n                            self.ground_truth_field,\n                            self.file_name_field,\n                        ]\n                    }\n\n                    task = BenchmarkTask(\n                        task_id=data[self.task_id_field],\n                        task_question=data[self.question_field],\n                        ground_truth=data[self.ground_truth_field],\n                        file_path=file_path,\n                        metadata=metadata,\n                    )\n                    tasks.append(task)\n\n                except Exception as e:\n                    print(f\"Warning: Failed to parse line {i + 1}: {e}\")\n                    continue\n\n        gc.collect()\n        self.tasks = tasks\n        print(f\"Loaded {len(tasks)} tasks\")\n        return tasks\n\n    def prepare_task_description(\n        self, task: BenchmarkTask\n    ) -> Tuple[str, Optional[str]]:\n  
      \"\"\"\n        Prepare task description and file path for the agent\n\n        Args:\n            task: BenchmarkTask object\n\n        Returns:\n            Tuple of (task_description, task_file_path)\n        \"\"\"\n\n        task_file_path = None\n        if task.file_path:\n            # Build complete file path: data directory + relative path\n            full_file_path = self.data_dir / task.file_path\n            # Convert to absolute path and resolve any symbolic links\n            task_file_path = str(full_file_path.resolve())\n        else:\n            task_file_path = None\n\n        # Return task question and file path\n        return task.task_question, task_file_path\n\n\nclass CommonBenchmark:\n    \"\"\"Main class to run a benchmark\"\"\"\n\n    def __init__(self, cfg: DictConfig):\n        \"\"\"\n        Initialize the benchmark run\n\n        Args:\n            cfg: Hydra configuration object\n        \"\"\"\n        self.cfg = cfg\n        self.benchmark_name = cfg.benchmark.name\n        evaluator_kwargs = cfg.benchmark.get(\"evaluator_kwargs\", OmegaConf.create({}))\n        # Support for legacy config structure\n        if \"metadata_file\" in cfg.benchmark.data:\n            evaluator_kwargs[\"metadata_file\"] = cfg.benchmark.data.metadata_file\n        if \"field_mapping\" in cfg.benchmark.data:\n            mapping = cfg.benchmark.data.field_mapping\n            if \"task_id_field\" in mapping:\n                evaluator_kwargs[\"task_id_field\"] = mapping.task_id_field\n            if \"task_question_field\" in mapping:\n                evaluator_kwargs[\"question_field\"] = mapping.task_question_field\n            if \"ground_truth_field\" in mapping:\n                evaluator_kwargs[\"ground_truth_field\"] = mapping.ground_truth_field\n            if \"file_name_field\" in mapping:\n                evaluator_kwargs[\"file_name_field\"] = mapping.file_name_field\n\n        self.evaluator = GenericEvaluator(\n            
data_dir=cfg.benchmark.data.data_dir,\n            benchmark_name=self.benchmark_name,\n            cfg=cfg,\n            **evaluator_kwargs,\n        )\n\n    def run_evaluation(self) -> float:\n        \"\"\"\n        Run the full benchmark evaluation process\n        \"\"\"\n        print(f\"Starting evaluation for benchmark: {self.benchmark_name}\")\n        print(f\"LLM Provider: {self.evaluator.llm_provider}\")\n        print(f\"LLM Model: {self.evaluator.llm_model}\")\n\n        # Load tasks\n        self.evaluator.load_tasks(limit=self.cfg.benchmark.execution.max_tasks)\n        if not self.evaluator.tasks:\n            print(\"No tasks loaded. Exiting.\")\n            return 0.0\n\n        # Run inference\n        print(\n            f\"\\nStarting parallel inference with {self.cfg.benchmark.execution.max_concurrent} concurrent tasks...\"\n        )\n        print(f\"Using pass@{self.evaluator.pass_at_k} evaluation...\")\n\n        self.evaluator.run_parallel_inference(\n            self.evaluator.tasks,\n            max_concurrent=self.cfg.benchmark.execution.max_concurrent,\n        )\n\n        # Evaluate accuracy\n        print(\"Evaluating accuracy...\")\n        accuracy = self.evaluator.evaluate_accuracy()\n        print(f\"\\nOverall pass@{self.evaluator.pass_at_k} accuracy: {accuracy:.2%}\")\n        # Save results\n\n        # Construct the full path in the correct log directory\n        log_dir = self.evaluator.get_log_dir()\n        results_path = log_dir / \"benchmark_results.jsonl\"\n\n        self.evaluator.save_results(str(results_path))\n        print(f\"\\nEvaluation completed! 
Results saved to {results_path}\")\n\n        # save accuracy to a file\n        accuracy_file = str(results_path).replace(\n            \".jsonl\", f\"_pass_at_{self.evaluator.pass_at_k}_accuracy.txt\"\n        )\n        with open(accuracy_file, \"w\") as f:\n            f.write(f\"{accuracy:.2%}\")\n        # Generate and save summary\n        generate_summary(log_dir)\n        return accuracy\n\n\n@hydra.main(config_path=\"../conf\", config_name=\"config\", version_base=None)\ndef run_benchmark(cfg: DictConfig) -> None:\n    \"\"\"\n    Main entry point for running benchmarks with Hydra.\n    \"\"\"\n    print(\"Benchmark configuration:\\n\", OmegaConf.to_yaml(cfg.benchmark))\n\n    benchmark = CommonBenchmark(cfg)\n    benchmark.run_evaluation()\n\n\nif __name__ == \"__main__\":\n    run_benchmark()\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/evaluators/__init__.py",
    "content": ""
  },
  {
    "path": "apps/miroflow-agent/benchmarks/evaluators/calculate_average_score.py",
    "content": "#!/usr/bin/env python3\n# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport glob\nimport os\nimport re\nimport statistics\nimport sys\n\n\ndef detect_pass_at_k(results_dir: str) -> tuple:\n    \"\"\"Detect the pass_at_k value used in the results directory\"\"\"\n\n    # Find all possible pass_at_k files\n    pattern = os.path.join(\n        results_dir, \"run_*\", \"benchmark_results_pass_at_*_accuracy.txt\"\n    )\n    all_files = glob.glob(pattern)\n\n    if not all_files:\n        print(f\"No accuracy files found in {results_dir}\")\n        print(f\"Expected pattern: {pattern}\")\n        return None, []\n\n    # Extract pass_at_k value from the first file\n    filename = os.path.basename(all_files[0])\n    match = re.search(r\"pass_at_(\\d+)_accuracy\\.txt\", filename)\n\n    if not match:\n        print(f\"Cannot extract pass_at_k from filename: {filename}\")\n        return None, []\n\n    k = int(match.group(1))\n\n    # Get all files with this k value\n    accuracy_files = glob.glob(\n        os.path.join(\n            results_dir, \"run_*\", f\"benchmark_results_pass_at_{k}_accuracy.txt\"\n        )\n    )\n\n    return k, accuracy_files\n\n\ndef calculate_average_scores(results_dir: str) -> dict:\n    \"\"\"Calculate average scores from multiple runs - automatically detect pass_at_k value\"\"\"\n\n    # Detect pass_at_k value and corresponding files\n    pass_at_k, accuracy_files = detect_pass_at_k(results_dir)\n\n    if pass_at_k is None:\n        return None\n\n    print(f\"Detected pass_at_{pass_at_k} files\")\n    print(f\"Found {len(accuracy_files)} accuracy files\")\n\n    scores = []\n\n    # Read each accuracy file\n    for i, file_path in enumerate(sorted(accuracy_files), 1):\n        try:\n            with open(file_path, \"r\") as f:\n                content = f.read().strip()\n                # Remove percentage sign and convert to float\n                score = 
float(content.replace(\"%\", \"\"))\n                scores.append(score)\n                print(f\"Run {i}: {score:.2f}%\")\n        except Exception as e:\n            print(f\"Error reading {file_path}: {e}\")\n            continue\n\n    if not scores:\n        print(\"No valid scores found\")\n        return None\n\n    # Calculate statistics\n    stats = {\n        \"pass_at_k\": pass_at_k,\n        \"num_runs\": len(scores),\n        \"individual_scores\": scores,\n        \"average_score\": statistics.mean(scores),\n        \"std_dev\": statistics.stdev(scores) if len(scores) > 1 else 0,\n        \"min_score\": min(scores),\n        \"max_score\": max(scores),\n    }\n\n    return stats\n\n\ndef print_results(stats: dict):\n    \"\"\"Print results\"\"\"\n    print(\"\\n\" + \"=\" * 50)\n    print(\"EVALUATION RESULTS\")\n    print(\"=\" * 50)\n\n    print(f\"Pass@{stats['pass_at_k']} Results:\")\n    print(f\"Number of runs: {stats['num_runs']}\")\n    print(f\"Individual scores: {[f'{s:.2f}%' for s in stats['individual_scores']]}\")\n    print()\n    print(f\"Standard deviation: {stats['std_dev']:.2f}%\")\n    print(f\"Min score: {stats['min_score']:.2f}%\")\n    print(f\"Max score: {stats['max_score']:.2f}%\")\n    print(f\"Average score: {stats['average_score']:.2f}%\")\n    print(\"=\" * 50)\n\n\ndef main():\n    if len(sys.argv) < 2:\n        print(\"Usage: python calculate_average_score.py <results_directory>\")\n        print(\"Example: python calculate_average_score.py logs/gaia-validation/mytest\")\n        sys.exit(1)\n\n    results_dir = sys.argv[1]\n\n    if not os.path.exists(results_dir):\n        print(f\"Results directory does not exist: {results_dir}\")\n        sys.exit(1)\n\n    print(f\"Analyzing results from: {results_dir}\")\n\n    stats = calculate_average_scores(results_dir)\n\n    if stats:\n        print_results(stats)\n\n        # Save simple statistics results\n        output_file = os.path.join(\n            results_dir, 
f\"average_scores_pass_at_{stats['pass_at_k']}.txt\"\n        )\n        with open(output_file, \"w\") as f:\n            f.write(\"EVALUATION RESULTS\\n\")\n            f.write(\"=\" * 50 + \"\\n\")\n            f.write(f\"Pass@{stats['pass_at_k']} Results:\\n\")\n            f.write(f\"Number of runs: {stats['num_runs']}\\n\")\n            f.write(\n                f\"Individual scores: {[f'{s:.2f}%' for s in stats['individual_scores']]}\\n\"\n            )\n            f.write(f\"Standard deviation: {stats['std_dev']:.2f}%\\n\")\n            f.write(f\"Min score: {stats['min_score']:.2f}%\\n\")\n            f.write(f\"Max score: {stats['max_score']:.2f}%\\n\")\n            f.write(f\"Average score: {stats['average_score']:.2f}%\\n\")\n            f.write(\"=\" * 50 + \"\\n\")\n\n        print(f\"\\nResults saved to: {output_file}\")\n    else:\n        print(\"Failed to calculate statistics\")\n        sys.exit(1)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/evaluators/eval_utils.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport asyncio\nimport json\nimport os\nimport re\nimport string\nimport warnings\nfrom typing import Any, Dict, Literal, Optional\n\nfrom dotenv import load_dotenv\nfrom openai import AsyncOpenAI, OpenAI\nfrom pydantic import BaseModel\n\nload_dotenv()\n\nOPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\")\nOPENAI_BASE_URL = os.environ.get(\"OPENAI_BASE_URL\")\n\nevaluation_llm_client = AsyncOpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)\nmodel_as_a_judge_client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)\n\n\n# ================================================\n# verify_answer_simpleqa\n# ================================================\n\nEVALUATION_PROMPT_SIMPLEQA = \"\"\"\nYour job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either [\"CORRECT\", \"INCORRECT\", \"NOT_ATTEMPTED\"].\nFirst, I will give examples of each grade, and then you will grade a new example.\n\n\nThe following are examples of CORRECT predicted answers.\n```\nQuestion: What are the names of Barack Obama's children?\nGold target: Malia Obama and Sasha Obama\nPredicted answer 1: sasha and malia obama\nPredicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check\nPredicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. 
Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.\n```\nThese predicted answers are all CORRECT because:\n    - They fully contain the important information in the gold target.\n    - They do not contain any information that contradicts the gold target.\n    - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.\n    - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.\n\n\nThe following are examples of INCORRECT predicted answers.\n```\nQuestion: What are the names of Barack Obama's children?\nGold target: Malia and Sasha\nPredicted answer 1: Malia.\nPredicted answer 2: Malia, Sasha, and Susan.\nPredicted answer 3: Barack Obama does not have any children.\nPredicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.\nPredicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.\nPredicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?\nPredicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.\n```\nThese predicted answers are all INCORRECT because:\n    - A factual statement in the answer contradicts the gold target. 
Incorrect statements that have some hedging (e.g., \"it is possible that\", \"although i'm not sure, i think\") are also considered incorrect.\n\n\nThe following are examples of NOT_ATTEMPTED predicted answers.\n```\nQuestion: What are the names of Barack Obama's children?\nGold target: Malia and Sasha\nPredicted answer 1: I don't know.\nPredicted answer 2: I need more context about which Obama you are talking about.\nPredicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.\nPredicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.\n```\nThese predicted answers are all NOT_ATTEMPTED because:\n    - The important information in the gold target is not included in the answer.\n    - No statements in the answer contradict the gold target.\n\n\nAlso note the following things:\n- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question \"How many citations does the Transformer Paper have?\" with gold target \"120k\". \n    - Predicted answers \"120k\", \"124k\", and 115k\" are all CORRECT. \n    - Predicted answers \"100k\" and \"113k\" are INCORRECT. \n    - Predicted answers \"around 100k\" and \"more than 50k\" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.\n- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.\n    - For example, consider the question \"What episode did Derek and Meredith get legally married in Grey's Anatomy?\" with gold target \"Season 7, Episode 20: White Wedding\". 
Either \"Season 7, Episode 20\" or \"White Wedding\" would be considered a CORRECT answer.\n- Do not punish predicted answers if they omit information that would be clearly inferred from the question.\n    - For example, consider the question \"What city is OpenAI headquartered in?\" and the gold target \"San Francisco, California\". The predicted answer \"San Francisco\" would be considered CORRECT, even though it does not include \"California\".\n    - Consider the question \"What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?\", the gold target is \"Outstanding Paper Award\". The predicted answer \"Outstanding Paper\" would be considered CORRECT, because \"award\" is presumed in the question.\n    - For the question \"What is the height of Jason Wei in meters?\", the gold target is \"1.73 m\". The predicted answer \"1.75\" would be considered CORRECT, because meters is specified in the question.\n    - For the question \"What is the name of Barack Obama's wife?\", the gold target is \"Michelle Obama\". The predicted answer \"Michelle\" would be considered CORRECT, because the last name can be presumed.\n- Do not punish for typos in people's name if it's clearly the same name. \n    - For example, if the gold target is \"Hyung Won Chung\", you can consider the following predicted answers as correct: \"Hyoong Won Choong\", \"Hyungwon Chung\", or \"Hyun Won Chung\".\n\n\nHere is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. 
Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.\n```\nQuestion: {}\nGold target: {}\nPredicted answer: {}\n```\n\nGrade the predicted answer of this new question as one of:\nA: CORRECT\nB: INCORRECT\nC: NOT_ATTEMPTED\n\nJust return the letters \"A\", \"B\", or \"C\", with no text around it.\n\"\"\".strip()\n\n\nasync def verify_answer_simpleqa(\n    question: str, target: str, predicted_answer: str\n) -> str:\n    \"\"\"\n    Use LLM to verify if the predicted answer is correct.\n    Expects the LLM to choose between A (correct), B or C (incorrect).\n    \"\"\"\n    messages = [\n        {\n            \"role\": \"user\",\n            \"content\": EVALUATION_PROMPT_SIMPLEQA.format(\n                question, target, predicted_answer\n            ),\n        }\n    ]\n    CHOICE_MAP = {\"A\": \"CORRECT\", \"B\": \"INCORRECT\", \"C\": \"NOT_ATTEMPTED\"}\n\n    try:\n        llm_response = await evaluation_llm_client.chat.completions.create(\n            model=\"gpt-4.1-2025-04-14\", messages=messages, max_completion_tokens=2\n        )\n        content = llm_response.choices[0].message.content\n        match = re.search(r\"(A|B|C)\", content)\n        if match:\n            return CHOICE_MAP[match.group(0)]\n    except Exception as e:\n        print(f\"LLM evaluation failed: {e}\")\n\n    return \"NOT_ATTEMPTED\"\n\n\n# ================================================\n# verify_answer_hle\n# ================================================\n\nHLE_JUDGE_PROMPT = \"\"\"Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.\n\n[question]: {question}\n\n[response]: {response}\n\nYour judgement must be in the format and criteria specified below:\n\nextracted_final_answer: The final exact answer extracted from the [response]. 
Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.\n\n[correct_answer]: {correct_answer}\n\nreasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.\n\ncorrect: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.\n\nconfidence: The extracted confidence score between 0|\\%| and 100|\\%| from [response]. Put 100 if there is no confidence score available.\"\"\"\n\n\nclass HLEExtractedAnswer(BaseModel):\n    extracted_final_answer: str\n    reasoning: str\n    correct: Literal[\"yes\", \"no\"]\n    confidence: int\n    strict: Literal[True] = True  # 100% reliability\n\n\nasync def verify_answer_hle(question: str, target: str, predicted_answer: str) -> str:\n    \"\"\"\n    Use HLE-style LLM judge to verify if the predicted answer is correct.\n    Returns the evaluation result as a string: \"CORRECT\", \"INCORRECT\", or \"NOT_ATTEMPTED\".\n\n    Args:\n        question: The question being answered\n        target: The correct/target answer\n        predicted_answer: The model's predicted answer\n\n    Returns:\n        String indicating the evaluation result\n    \"\"\"\n    prompt = HLE_JUDGE_PROMPT.format(\n        question=question, correct_answer=target, response=predicted_answer\n    )\n\n    try:\n        response = await evaluation_llm_client.beta.chat.completions.parse(\n            model=\"o3-mini-2025-01-31\",\n            max_completion_tokens=4096,\n  
          messages=[{\"role\": \"user\", \"content\": prompt}],\n            response_format=HLEExtractedAnswer,\n        )\n\n        content = response.choices[0].message.parsed\n\n        # Print HLE reasoning\n        print(f\"LLM as Judge Reasoning: {content.reasoning}\")\n        print(f\"LLM as Judge Result: {content.correct}\")\n        print(f\"LLM as Judge Confidence: {content.confidence}%\")\n\n        # Convert HLE format to eval_utils format\n        if content.correct == \"yes\":\n            return \"CORRECT\"\n        else:\n            return \"INCORRECT\"\n\n    except Exception as e:\n        if \"Incorrect API key provided\" in str(e):\n            print(f\"LLM evaluation failed: {e}\")\n            exit()\n        print(f\"LLM evaluation failed: {e}\")\n        return \"NOT_ATTEMPTED\"\n\n\n# ================================================\n# verify_answer_gaia\n# ================================================\n\n\nasync def verify_answer_gaia(question: str, target: str, predicted_answer: str) -> str:\n    \"\"\"\n    Use GAIA-style judge to verify if the predicted answer is correct.\n    \"\"\"\n\n    def normalize_number_str(number_str: str) -> float | None:\n        # we replace these common units and commas to allow\n        # conversion to float\n        for char in [\"$\", \"%\", \",\"]:\n            number_str = number_str.replace(char, \"\")\n        try:\n            return float(number_str)\n        except ValueError:\n            print(f\"String {number_str} cannot be normalized to number str.\")\n            return None  # Return None instead of inf to handle gracefully\n\n    def split_string(\n        s: str,\n        char_list: list[str] = [\",\", \";\"],\n    ) -> list[str]:\n        pattern = f\"[{''.join(char_list)}]\"\n        return re.split(pattern, s)\n\n    def normalize_str(input_str, remove_punct=True) -> str:\n        \"\"\"\n        Normalize a string by:\n        - Removing all white spaces\n        - Optionally 
removing punctuation (if remove_punct is True)\n        - Converting to lowercase\n        Parameters:\n        - input_str: str, the string to normalize\n        - remove_punct: bool, whether to remove punctuation (default: True)\n        Returns:\n        - str, the normalized string\n        \"\"\"\n        # Remove all white spaces. Required e.g for seagull vs. sea gull\n        no_spaces = re.sub(r\"\\s\", \"\", input_str)\n\n        # Remove punctuation, if specified.\n        if remove_punct:\n            translator = str.maketrans(\"\", \"\", string.punctuation)\n            return no_spaces.lower().translate(translator)\n        else:\n            return no_spaces.lower()\n\n    def question_scorer(\n        model_answer: str,\n        ground_truth: str,\n    ) -> bool:\n        def is_float(element: any) -> bool:\n            try:\n                float(element)\n                return True\n            except ValueError:\n                return False\n\n        if model_answer is None:\n            model_answer = \"None\"\n\n        # if gt is a number\n        if is_float(ground_truth):\n            print(f\"Evaluating {model_answer} as a number.\")\n            normalized_answer = normalize_number_str(model_answer)\n            # If normalization failed, the answer is incorrect\n            if normalized_answer is None:\n                return False\n            return normalized_answer == float(ground_truth)\n\n        # if gt is a list\n        elif any(char in ground_truth for char in [\",\", \";\"]):\n            print(f\"Evaluating {model_answer} as a comma separated list.\")\n            # question with the fish: normalization removes punct\n\n            gt_elems = split_string(ground_truth)\n            ma_elems = split_string(model_answer)\n\n            # check length is the same\n            if len(gt_elems) != len(ma_elems):\n                warnings.warn(\n                    \"Answer lists have different lengths, returning False.\", 
UserWarning\n                )\n                return False\n\n            # compare each element as float or str\n            comparisons = []\n            for ma_elem, gt_elem in zip(ma_elems, gt_elems):\n                if is_float(gt_elem):\n                    normalized_ma_elem = normalize_number_str(ma_elem)\n                    # If normalization failed, this element is incorrect\n                    if normalized_ma_elem is None:\n                        comparisons.append(False)\n                    else:\n                        comparisons.append(normalized_ma_elem == float(gt_elem))\n                else:\n                    # we do not remove punct since comparisons can include punct\n                    comparisons.append(\n                        normalize_str(ma_elem, remove_punct=False)\n                        == normalize_str(gt_elem, remove_punct=False)\n                    )\n            return all(comparisons)\n\n        # if gt is a str\n        else:\n            print(f\"Evaluating {model_answer} as a string.\")\n            return normalize_str(model_answer) == normalize_str(ground_truth)\n\n    # Use the question_scorer to evaluate the answer\n    try:\n        is_correct = question_scorer(predicted_answer, target)\n        return \"CORRECT\" if is_correct else \"INCORRECT\"\n    except Exception as e:\n        print(f\"GAIA evaluation failed: {e}\")\n        raise e\n\n        # use raise error instead, later we could judge it as NOT_ATTEMPTED.\n        # return \"NOT_ATTEMPTED\"\n\n\n# ================================================\n# verify_answer_gaia_validation_text_103\n\n# Prompt from WebAgent\n# https://github.com/Alibaba-NLP/WebAgent/blob/f25dae54daf0ce2874ffd5ed5ffb20feca7c4c4e/WebSailor/src/prompt.py#L98\n# ================================================\n\nGAIA_VALIDATION_TEXT_103_SCORER_PROMPT = \"\"\"You are an evaluation assistant. 
Please determine if the predicted answer is equivalent to the labeled answer.\n\nQuestion: {question}\n\nLabeled Answer: {correct_answer}\n\nPredicted Answer: {response}\n\nDid the model give an answer **equivalent** to the labeled answer? Please respond with \"Correct\" if they are equivalent, or \"Incorrect\" if they are not equivalent. Do not include any other text.\n\"\"\"\n\n\nasync def verify_answer_gaia_validation_text_103(\n    question: str, target: str, predicted_answer: str\n) -> str:\n    prompt = GAIA_VALIDATION_TEXT_103_SCORER_PROMPT.format(\n        question=question, correct_answer=target, response=predicted_answer\n    )\n\n    max_tries = 10\n    for attempt in range(max_tries):\n        try:\n            response = await evaluation_llm_client.chat.completions.create(\n                model=\"gpt-4.1-2025-04-14\",\n                messages=[{\"role\": \"user\", \"content\": prompt}],\n            )\n\n            content = response.choices[0].message.content\n            print(\"LLM Judge Response: \", content)\n\n            if response:\n                break\n        except Exception as e:\n            if attempt == (max_tries - 1):\n                raise e\n\n    # Use case-insensitive matching and strip whitespace/punctuation\n    content_normalized = content.strip().rstrip(\".\").lower()\n    if content_normalized == \"correct\":\n        return \"CORRECT\"\n    elif content_normalized == \"incorrect\":\n        return \"INCORRECT\"\n    else:\n        # If we can't parse the response, default to NOT_ATTEMPTED to trigger retry\n        print(f\"Warning: Could not parse judge response: {content}\")\n        return \"NOT_ATTEMPTED\"\n\n\n# ================================================\n# verify_answer_browsecomp\n\n# Prompt from Tongyi DeepResearch\n# https://github.com/Alibaba-NLP/DeepResearch/blob/main/WebAgent/WebWatcher/infer/evaluation/prompt.py#L110\n# ================================================\n\nJUDGE_PROMPT_BC_zh = 
\"\"\"\n请根据给定问题、标准答案和模型预测的答案来评估模型的回答是否正确。您的任务是将结果评定为：【正确】、【错误】。\n\n首先，我们将列出每个评定类别的示例，然后请您对新问题的预测答案进行评定。\n以下是【正确】的答复示例：\n```\n问题：贝拉克·奥巴马的孩子叫什么名字？\n标准答案：玛丽亚·奥巴马和萨莎·奥巴马\n模型预测1：Malia Obama and Sasha Obama\n模型预测2：玛丽亚和萨沙\n模型预测3：大多数人会说是玛丽亚和萨莎，但我不确定，需要再确认\n模型预测4：巴拉克·奥巴马有两个女儿，她们分别是玛丽亚·安和娜塔莎·玛丽安，但通常称作玛丽亚·奥巴马和萨莎·奥巴马。\n```\n这些答复均为【正确】，因为：\n    - 完整地包含了标准答案中的重要信息。\n    - 不包含任何与标准答案矛盾的信息。\n    - 只关注语义内容，中英文，大小写、标点、语法和顺序不重要。\n    - 答复中出现模糊语句或猜测是可以接受的，前提是包含了标准答案且不含有不正确信息或矛盾。\n\n以下是【错误】的答复示例：\n```\n问题：巴拉克·奥巴马的孩子叫什么名字？\n标准答案：玛丽亚·奥巴马和萨莎·奥巴马\n模型预测1：玛丽亚\n模型预测2：玛丽亚、萨莎和苏珊和萨莎·奥巴马或玛丽亚·奥巴马，或娜塔莎·玛丽安，或爱因斯坦\n模型预测3：虽然我不知道他们的确切名字，但能说出巴拉克·奥巴马有两个孩子。\n模型预测4：你可能是想说贝茜和奥利维亚。不过您应通过最新的参考资料确认详细信息。那是正确的答案吗？\n模型预测5：巴拉克·奥巴马的孩子\n```\n这些答复均为【错误】，因为：\n    - 答复中包含与标准答案矛盾的事实陈述。\n    - 答案为空、重复表述问题。\n    - 答案枚举了多个答案，重复表述答案。\n\n需要格外注意的是：\n- 标准答案中包含对于问题中多个方面的回答，并且在同一个方面的答案中可能会有多种不同的描述，这些描述均是正确的，并且在同一个括号中给出，通过逗号连接。例如，考虑问题\"抖音自己的人工智能大模型叫什么名字？\"，标准答案为\"【【豆包，云雀】】\"：\n    - 预测答案\"豆包\"、\"豆包、云雀\"、\"云雀\"等均为【正确】。\n- 对于标准答案中包含的不同方面的回答，模型需要同时给出所有方面的回答才可以算是正确，否则直接判断为【错误】，不存在【部分正确】这种输出方式，这些答案会在不同的括号中给出。例如，考虑问题\"TFBOYS组合中的成员有哪些？\"，标准答案为\"【【王俊凯】【王源】【易洋千玺】】\"：\n    - 预测答案\"王俊凯、王源、易洋千玺\"等同时包含所有答案，才可以算为【正确】。\n    - 预测答案为\"王俊凯、易洋千玺\"等没有同时包含所有答案，会被算为【错误】。\n\n另外注意以下几点：\n- 对于标准答案为数字的问题，预测答案应和标准答案一致。例如，考虑问题\"金山铁路黄浦江特大桥的全长是多少米？\"，标准答案为\"3518.17\"：\n    - 预测答案\"3518\"、\"3518.1\"、\"3518.17\"均为【正确】。\n    - 预测答案\"3520\"和\"3600\"均为【错误】。\n- 如果模型预测并没有直接回答问题，模型试图绕过或未能直接给出标准答案视为【错误】答案。\n    - 例如：问题\"林宥嘉的老婆是谁\"，标准答案为\"丁文琪\"。模型预测\"林宥嘉的老婆\"、\"林宥嘉的老婆应该很优秀\"、\"林宥嘉的老婆可能是某个公众人物\"均为【错误】。\n- 如果标准答案包含比问题更多的信息，预测答案只需包含问题中提到的信息。\n    - 例如，考虑问题\"菱镁矿的主要化学成分是什么？\"标准答案为\"碳酸镁（MgCO3）\"。\"碳酸镁\"或\"MgCO3\"均视为【正确】答案。\n- 如果从问题中明显可以推断出预测答案省略的信息，那么算作正确。\n    - 例如，问题\"巴鲁米尼的努拉吉遗迹在1997年被联合国教科文组织列为世界文化遗产，那么这遗址在哪个地区？\"标准答案为\"意大利撒丁岛\"，预测答案\"撒丁岛\"被视为【正确】。\n- 如果能明显看出名字翻译版本不同但是是同一个人也认为正确。\n    - 例如，如果标准答案是\"Robinson\"，那么回答鲁滨逊或者鲁滨孙均正确。\n- 你应该更关注标准答案和模型预测的匹配度，而不是关心标准答案是否是正确的。\n\n下面是一个新的问题示例。请只回复【正确】、【错误】之一，不要道歉或纠正自己的错误，只需要评估该回答。\n```\n问题: {question}\n标准答案: {correct_answer}\n预测答案: 
{response}\n```\n\n将此新问题的预测答案评定为以下之一：\nA.【正确】\nB.【错误】\n\n只返回【正确】、【错误】所代表的选项即可，即仅返回A或B即可，无须添加任何其他的文本。\n\"\"\".strip()\n\n\nJUDGE_PROMPT_BC_en = \"\"\"\nBased on the given question, standard answer, and model-predicted answer, evaluate whether the model's response is correct. Your task is to classify the result as: [CORRECT] or [INCORRECT].\n\nFirst, we'll list examples for each category, then you'll evaluate a new question's predicted answer.\nHere are examples of [CORRECT] responses:\n```\nQuestion: What are the names of Barack Obama's children?\nStandard Answer: Malia Obama and Sasha Obama\nModel Prediction 1: Malia Obama and Sasha Obama\nModel Prediction 2: Malia and Sasha\nModel Prediction 3: Most would say Malia and Sasha, but I'm not sure, I should verify\nModel Prediction 4: Barack Obama has two daughters, Malia Ann and Natasha Marian, commonly known as Malia Obama and Sasha Obama.\n```\nThese responses are all [CORRECT] because they:\n    - Fully include the important information from the standard answer.\n    - Don't contain any information that contradicts the standard answer.\n    - Focus only on semantic content; language, capitalization, punctuation, grammar, and order aren't important.\n    - Vague statements or guesses are acceptable as long as they include the standard answer and don't contain incorrect information or contradictions.\n\nHere are examples of [INCORRECT] responses:\n```\nQuestion: What are the names of Barack Obama's children?\nStandard Answer: Malia Obama and Sasha Obama\nModel Prediction 1: Malia\nModel Prediction 2: Malia, Sasha and Susan or Sasha Obama or Malia Obama, or Natasha Marian, or Einstein\nModel Prediction 3: While I don't know their exact names, I can tell you Barack Obama has two children.\nModel Prediction 4: You might be thinking of Betsy and Olivia. But you should verify the details with the latest references. 
Is that the correct answer?\nModel Prediction 5: Barack Obama's children\n```\nThese responses are all [INCORRECT] because they:\n    - Contain factual statements that contradict the standard answer.\n    - Are empty or merely repeat the question.\n    - Enumerate multiple answers or repeat the answer.\n\nPay special attention to the following:\n- The standard answer may contain responses to multiple aspects of the question, and within the same aspect, there might be different descriptions, all of which are correct and are given in the same bracket, connected by commas. For example, for the question \"What is the name of ByteDance's AI model?\", the standard answer is \"[[Doubao, Skylark]]\":\n    - Predicted answers \"Doubao\", \"Doubao, Skylark\", \"Skylark\", etc. are all [CORRECT].\n- For standard answers containing responses to different aspects, the model needs to provide answers to all aspects to be considered correct; otherwise, it's directly judged as [INCORRECT]. There is no [PARTIALLY CORRECT] output option. These answers will be given in different brackets. For example, for the question \"Who are the members of TFBOYS?\", the standard answer is \"[[Wang Junkai][Wang Yuan][Yi Yangqianxi]]\":\n    - Predicted answers like \"Wang Junkai, Wang Yuan, Yi Yangqianxi\" that include all answers are [CORRECT].\n    - Predicted answers like \"Wang Junkai, Yi Yangqianxi\" that don't include all answers are [INCORRECT].\n\nAlso note the following points:\n- For questions with numerical standard answers, the predicted answer should match the standard answer. 
For example, for the question \"What is the total length in meters of the Huangpu River Bridge on the Jinshan Railway?\", the standard answer is \"3518.17\":\n    - Predicted answers \"3518\", \"3518.1\", \"3518.17\" are all [CORRECT].\n    - Predicted answers \"3520\" and \"3600\" are [INCORRECT].\n- If the model prediction doesn't directly answer the question, attempts to circumvent or fails to directly provide the standard answer, it's considered an [INCORRECT] answer.\n    - For example, for the question \"Who is JJ Lin's wife?\", with the standard answer \"Ding Wenqi\", model predictions like \"JJ Lin's wife\", \"JJ Lin's wife should be excellent\", \"JJ Lin's wife might be a public figure\" are all [INCORRECT].\n- If the standard answer contains more information than the question asks for, the predicted answer only needs to include the information mentioned in the question.\n    - For example, for the question \"What is the main chemical component of magnesite?\", with the standard answer \"Magnesium carbonate (MgCO3)\", \"Magnesium carbonate\" or \"MgCO3\" are both considered [CORRECT] answers.\n- If information omitted in the predicted answer can be clearly inferred from the question, it's considered correct.\n    - For example, for the question \"The Nuragic ruins of Barumini were listed as a World Cultural Heritage by UNESCO in 1997, so where is this site located?\", with the standard answer \"Sardinia, Italy\", the predicted answer \"Sardinia\" is considered [CORRECT].\n- If it's clear that different translations of a name refer to the same person, it's considered correct.\n    - For example, if the standard answer is \"Robinson\", answers like \"Lubinson\" or \"Lubinsun\" are both correct.\n- You should focus more on the match between the standard answer and the model prediction, rather than whether the standard answer itself is correct.\n\nBelow is a new question example. 
Please reply with only [CORRECT] or [INCORRECT], without apologies or corrections to your own errors, just evaluate the answer.\n```\nQuestion: {question}\nStandard Answer: {correct_answer}\nPredicted Answer: {response}\n```\n\nEvaluate this new question's predicted answer as one of the following:\nA. [CORRECT]\nB. [INCORRECT]\n\nReturn only the option representing [CORRECT] or [INCORRECT], i.e., just return A or B, without adding any other text.\n\"\"\".strip()\n\n\nasync def verify_answer_browsecomp(\n    question: str, target: str, predicted_answer: str\n) -> str:\n    \"\"\"\n    Use BrowseComp judge (English version) to verify if the predicted answer is correct.\n    Expects the LLM to return A (correct) or B (incorrect).\n    \"\"\"\n\n    prompt = JUDGE_PROMPT_BC_en.format(\n        question=question, correct_answer=target, response=predicted_answer\n    )\n\n    try:\n        response = await evaluation_llm_client.chat.completions.create(\n            model=\"gpt-4.1-2025-04-14\",\n            messages=[{\"role\": \"user\", \"content\": prompt}],\n            max_completion_tokens=2,\n        )\n\n        content = response.choices[0].message.content\n        print(f\"BrowseComp Judge Response: {content}\")\n\n        # Extract A or B from the response\n        match = re.search(r\"[AB]\", content)\n        if match:\n            choice = match.group(0)\n            if choice == \"A\":\n                return \"CORRECT\"\n            elif choice == \"B\":\n                return \"INCORRECT\"\n\n        # If no clear A or B is found, return NOT_ATTEMPTED to trigger retry\n        print(f\"Warning: Could not parse BrowseComp judge response: {content}\")\n        return \"NOT_ATTEMPTED\"\n\n    except Exception as e:\n        print(f\"BrowseComp evaluation failed: {e}\")\n        raise e\n\n\nasync def verify_answer_browsecomp_zh(\n    question: str, target: str, predicted_answer: str\n) -> str:\n    \"\"\"\n    Use BrowseComp judge (Chinese version) to 
verify if the predicted answer is correct.\n    Expects the LLM to return A (correct) or B (incorrect).\n    \"\"\"\n\n    prompt = JUDGE_PROMPT_BC_zh.format(\n        question=question, correct_answer=target, response=predicted_answer\n    )\n\n    try:\n        response = await evaluation_llm_client.chat.completions.create(\n            model=\"gpt-4.1-2025-04-14\",\n            messages=[{\"role\": \"user\", \"content\": prompt}],\n            max_completion_tokens=2,\n        )\n\n        content = response.choices[0].message.content\n        print(f\"BrowseComp-ZH Judge Response: {content}\")\n\n        # Extract A or B from the response\n        match = re.search(r\"[AB]\", content)\n        if match:\n            choice = match.group(0)\n            if choice == \"A\":\n                return \"CORRECT\"\n            elif choice == \"B\":\n                return \"INCORRECT\"\n\n        # If no clear A or B is found, return NOT_ATTEMPTED to trigger retry\n        print(f\"Warning: Could not parse BrowseComp-ZH judge response: {content}\")\n        return \"NOT_ATTEMPTED\"\n\n    except Exception as e:\n        print(f\"BrowseComp-ZH evaluation failed: {e}\")\n        raise e\n\n\n# ================================================\n# verify_answer_xbench_deepsearch\n\n# Prompt from XBench-Evals\n# https://github.com/xbench-ai/xbench-evals/blob/main/eval_grader.py#L25\n# ================================================\n\nJUDGE_PROMPT_XBENCH = \"\"\"\n你是一个通用人工智能助手。根据下面给出的[正确答案], 判断以下对[原问题]的[回答]的回答是否正确。\n\n[原问题]: {question}\n\n[正确答案]: {correct_answer}\n\n[回答]:{response}\n\n你的判断必须按照以下格式和标准进行:\n\n最终答案: 从[回答]中提取出的最终准确答案。如果[回答]中没有明确的最终答案, 则填写'无'。\n\n解释: 根据[正确答案]解释为什么[最终答案]是正确的或错误的。只关注[最终答案]与[正确答案]之间是否存在实质性差异, 不要评论题目的背景, 不要尝试重新解题, 不要为任何不同于[正确答案]的答案辩护, 只专注于判断答案是否一致。\n\n结论: 如果[最终答案]与上方给出的[正确答案]一致, 或者在数值题目中处于可接受的微小误差范围内, 则填写'正确'; 否则（即存在任何不一致、歧义、不等价或提取出的答案错误的情况）填写'错误'。\n\"\"\".strip()\n\n\nasync def verify_answer_xbench_deepsearch(\n    question: str, target: str, 
predicted_answer: str\n) -> str:\n    \"\"\"\n    Use XBench-DeepSearch judge to verify if the predicted answer is correct.\n    \"\"\"\n\n    def parse_match_result(match):\n        if match is None:\n            return match\n        match = match.group(0)\n        try:\n            target = match.split(\":\")[1].strip()\n            return target\n        except Exception:\n            return match  # return naive result in case of failed\n\n    if predicted_answer is None:\n        return \"INCORRECT\"\n\n    judge_prompt = JUDGE_PROMPT_XBENCH.format(\n        question=question,\n        correct_answer=target,\n        response=predicted_answer,\n    )\n    try:\n        response = await evaluation_llm_client.chat.completions.create(\n            model=\"gpt-4.1-2025-04-14\",\n            messages=[{\"role\": \"user\", \"content\": judge_prompt}],\n        )\n        judge_response = response.choices[0].message.content\n    except Exception:\n        judge_response = None\n    if judge_response is None:\n        return \"NOT_ATTEMPTED\"\n\n    # Extract grader conclusions\n    extract_match = re.search(r\"最终答案:*(.*)\", judge_response)\n    extract_match = parse_match_result(extract_match)\n\n    # Fixed regex: make the dot optional with \\s* (zero or more whitespace)\n    correct_match = re.search(r\"结论:*\\s*(正确|错误)\", judge_response)\n    correct_match = parse_match_result(correct_match)\n\n    explain_match = re.search(r\"解释:*(.*)\", judge_response)\n    explain_match = parse_match_result(explain_match)\n\n    # Print debug info\n    print(f\"XBench Judge - Extract: {extract_match}, Correct: {correct_match}\")\n\n    if correct_match == \"正确\":\n        return \"CORRECT\"\n    elif correct_match == \"错误\":\n        return \"INCORRECT\"\n    else:\n        # If we can't parse the result, return NOT_ATTEMPTED to trigger retry\n        print(\n            f\"Warning: Could not parse XBench judge response, correct_match={correct_match}\"\n        )\n        
return \"NOT_ATTEMPTED\"\n\n\n# ================================================\n# verify_answer_deepsearchqa\n#\n# Official prompt from DeepSearchQA benchmark\n# https://www.kaggle.com/code/andrewmingwang/deepsearchqa-starter-code\n# ================================================\n\nJUDGE_PROMPT_DEEPSEARCHQA = \"\"\"Your task is to evaluate whether a given \"AI Response\" for a specific \"User Prompt\" arrived at the correct answer.\n\n**Answer Correctness Task**\n\n*   **Purpose:** Assess whether the AI response provides the correct answer(s) based on the provided \"Correct Answer\" and \"Prompt Type\".\n*   **Process:**\n    *   Identify the \"Prompt Type\": \"<prompt_type>\".\n    *   Refer to the \"Correct Answer\": \"<answer>\".\n    *   Based on the \"Prompt Type\", determine if the \"AI Response\" contains the expected answer(s).\n        *   **'Single Answer'**: Check if the response provides the answer that addresses the user's question. It does not have to match the exact wording of the provided answer.\n        *   **'Set Answer'**: Check if the response includes *each* item from the provided ground truth answers. The order might not matter unless specified otherwise. The response might include more answers than the list. Determine the correctness *only* based on the list first and then check if the response includes answers not in the list.\n    *   **Explanation:** Provide a brief explanation justifying your assessment of answer correctness, referencing specific parts of the AI response and the correct answer.\n    *   **Correctness Details:** Provide a dictionary, one key for each expected answer part, and value is a boolean indicating whether each expected answer part was found.\n        *   For 'Set Answer', this will be a list of attributes, one for each item/part in the \"Correct Answer\". 
Each key will be a string indicating the expected answer part, and the value will be a boolean indicating whether that part was found in the response.\n    *   **Excessive Answers:** Provide a list of strings, each indicating an excessive answer part. If the response provides answers that are **not** in the \"Correct Answer\" list, add these answers as excessive answers. Return an empty list when there's no excessive answers in the response.\n\n\n**Output Format:**\n\nYour evaluation *must* be structured as a nested JSON dictionary with the following top-level keys: `\"Answer Correctness\"`. Please return NULL if any of \"Prompt\", \"AI Response\" or \"Correct Answer\" is empty.\nThe value for `\"Answer Correctness\"` should be a dictionary containing `\"Explanation\"` (a string), `\"Correctness Details\"` (a dictionary where each key is the expected correct answer, and the value is a boolean indicating whether the response contains the correct answer), and `\"Excessive Answers\"` (a list of strings indicating the excessive answers).\n\nMake sure you return a valid JSON string. Pay special attention to quotes, commas and special characters in the JSON string. 
Make sure to escape all special characters and quotes in the JSON string.\n\n\n**Example (Partial):**\n\n\"```json\n{{\n  \"Answer Correctness\": {{\n    \"Explanation\": \"The response correctly identified Belgium and France but also includes an excessive answer, Italy.\",\n    \"Correctness Details\": {{\n      \"Belgium\": true,\n      \"France\": true,\n    }},\n    \"Excessive Answers\": [ \"Italy\" ]\n  }}\n}}\n```\"\n\n**Now, proceed with the evaluation using the provided User Prompt, AI Response, and Correct Answer.**\n\nUser Prompt (Wrapped in <prompt> and </prompt>):\n<prompt>\n{prompt}\n</prompt>\n--------------------\n**  Correct Answer (Wrapped in <answer> and </answer>):\nPrompt Type: {prompt_type}\n<answer>\n{answer}\n</answer>\n--------------------\nAI assistant response (Wrapped in <response> and </response>):\n<response>\n{response}\n</response>\n\n--------------------\nRating:\"\"\"\n\n\nasync def verify_answer_deepsearchqa(\n    question: str,\n    target: str,\n    predicted_answer: str,\n    metadata: Optional[Dict[str, Any]] = None,\n) -> tuple[str, str, Optional[Dict[str, Any]]]:\n    \"\"\"\n    Use DeepSearchQA-specific judge to verify if the predicted answer is correct.\n    Uses the official DeepSearchQA evaluation prompt with JSON output format.\n\n    Args:\n        question: The question being answered\n        target: The correct/target answer\n        predicted_answer: The model's predicted answer\n        metadata: Optional metadata dict with additional context (e.g., problem_category, answer_type)\n\n    Returns:\n        Tuple of (result, judge_type, details_dict):\n        - result: \"CORRECT\", \"INCORRECT\", or \"NOT_ATTEMPTED\"\n        - judge_type: \"deepsearchqa_judge\"\n        - details_dict: Dict with keys:\n            - correctness_details: Dict[str, bool] mapping answer parts to correctness\n            - excessive_answers: List[str] of extra answers not in ground truth\n            - explanation: str explaining the 
judgment\n            - num_correct: int number of correct answer parts\n            - num_expected: int total number of expected answer parts\n            - num_excessive: int number of excessive answers\n    \"\"\"\n\n    if predicted_answer is None:\n        return \"INCORRECT\", \"deepsearchqa_judge\", None\n\n    # Determine prompt_type from metadata\n    prompt_type = \"Single Answer\"  # Default\n    if metadata and \"answer_type\" in metadata:\n        answer_type = metadata[\"answer_type\"]\n        # Map answer_type to prompt_type\n        if answer_type == \"Set Answer\":\n            prompt_type = \"Set Answer\"\n        # Add more mappings if needed\n\n    judge_prompt = JUDGE_PROMPT_DEEPSEARCHQA.format(\n        prompt_type=prompt_type,\n        prompt=question,\n        answer=target,\n        response=predicted_answer,\n    )\n\n    try:\n        response = await evaluation_llm_client.chat.completions.create(\n            model=\"gpt-4.1-2025-04-14\",\n            messages=[{\"role\": \"user\", \"content\": judge_prompt}],\n        )\n        judge_response = response.choices[0].message.content\n    except Exception as e:\n        print(f\"DeepSearchQA judge failed: {e}\")\n        return \"NOT_ATTEMPTED\", \"deepsearchqa_judge\", None\n\n    if judge_response is None:\n        return \"NOT_ATTEMPTED\", \"deepsearchqa_judge\", None\n\n    # Parse JSON response\n    try:\n        # Extract JSON from the response (might be wrapped in markdown code blocks)\n        json_match = re.search(r\"```json\\s*(\\{.*?\\})\\s*```\", judge_response, re.DOTALL)\n        if json_match:\n            json_str = json_match.group(1)\n        else:\n            # Try to find JSON without code blocks\n            json_match = re.search(r\"\\{.*\\}\", judge_response, re.DOTALL)\n            if json_match:\n                json_str = json_match.group(0)\n            else:\n                print(\"Warning: Could not find JSON in DeepSearchQA judge response\")\n              
  return \"NOT_ATTEMPTED\", \"deepsearchqa_judge\", None\n\n        result = json.loads(json_str)\n        answer_correctness = result.get(\"Answer Correctness\", {})\n\n        explanation = answer_correctness.get(\"Explanation\", \"\")\n        correctness_details = answer_correctness.get(\"Correctness Details\", {})\n        excessive_answers = answer_correctness.get(\"Excessive Answers\", [])\n\n        # Calculate statistics\n        num_expected = len(correctness_details)\n        num_correct = sum(1 for v in correctness_details.values() if v)\n        num_excessive = len(excessive_answers)\n\n        # Build details dict\n        details = {\n            \"correctness_details\": correctness_details,\n            \"excessive_answers\": excessive_answers,\n            \"explanation\": explanation,\n            \"num_correct\": num_correct,\n            \"num_expected\": num_expected,\n            \"num_excessive\": num_excessive,\n        }\n\n        # Print debug info\n        print(\n            f\"DeepSearchQA Judge - Correct: {num_correct}/{num_expected}, Excessive: {num_excessive}\"\n        )\n        print(f\"DeepSearchQA Judge - Explanation: {explanation}\")\n\n        # Determine if answer is correct\n        # Following official logic: all expected parts must be found, and no excessive answers\n        if correctness_details:\n            all_correct = all(correctness_details.values())\n            if all_correct and not excessive_answers:\n                return \"CORRECT\", \"deepsearchqa_judge\", details\n            else:\n                # Either missing some expected answers or has excessive answers\n                return \"INCORRECT\", \"deepsearchqa_judge\", details\n        else:\n            # No correctness details, can't determine\n            return \"NOT_ATTEMPTED\", \"deepsearchqa_judge\", None\n\n    except json.JSONDecodeError as e:\n        print(f\"Warning: Failed to parse JSON from DeepSearchQA judge: {e}\")\n        
print(f\"Response: {judge_response[:200]}...\")\n        return \"NOT_ATTEMPTED\", \"deepsearchqa_judge\", None\n    except Exception as e:\n        print(f\"Warning: Error processing DeepSearchQA judge response: {e}\")\n        return \"NOT_ATTEMPTED\", \"deepsearchqa_judge\", None\n\n\n# ================================================\n# verify_answer_for_datasets\n# ================================================\n\n\nasync def _verify_answer_for_datasets_core(\n    benchmark_name: str,\n    question: str,\n    target: str,\n    predicted_answer: str,\n    metadata: Optional[Dict[str, Any]] = None,\n) -> tuple[str, str, Optional[Dict[str, Any]]]:\n    \"\"\"\n    Verify the answer for a given dataset.\n\n    Args:\n        benchmark_name: Name of the benchmark dataset\n        question: The question being answered\n        target: The correct/target answer\n        predicted_answer: The model's predicted answer\n        metadata: Optional metadata dict with additional context\n\n    Returns:\n        A tuple of (result, judge_type, details_dict).\n        details_dict is None for most benchmarks, but contains evaluation details for DeepSearchQA.\n    \"\"\"\n\n    # For benchmarks that need detailed evaluation, don't use exact_match\n    if benchmark_name not in [\"deepsearchqa\"]:\n        if predicted_answer == target:\n            return \"CORRECT\", \"exact_match\", None\n\n    # For gaia-validation, use gaia-validation-text-103-scorer\n    # We found that gaia_scorer tends to label many correct answers as incorrect, so we believe\n    # that using an LLM-as-judge approach can more accurately reflect the model’s performance.\n    if benchmark_name == \"gaia-validation\":\n        # result = await verify_answer_gaia(question, target, predicted_answer)\n        # return result, \"gaia_scorer\", None\n        result = await verify_answer_gaia_validation_text_103(\n            question, target, predicted_answer\n        )\n        return result, 
\"gaia_validation_text_103_judge\", None\n\n    # For gaia-validation-text-103, use gaia-validation-text-103-scorer\n    elif benchmark_name == \"gaia-validation-text-103\":\n        result = await verify_answer_gaia_validation_text_103(\n            question, target, predicted_answer\n        )\n        return result, \"gaia_validation_text_103_judge\", None\n\n    # For browsecomp (English) and browsecomp-zh (Chinese), use different judges\n    elif benchmark_name == \"browsecomp\":\n        result = await verify_answer_browsecomp(question, target, predicted_answer)\n        return result, \"browsecomp_judge\", None\n\n    elif benchmark_name == \"browsecomp_zh\":\n        result = await verify_answer_browsecomp_zh(question, target, predicted_answer)\n        return result, \"browsecomp_zh_judge\", None\n\n    # For hle, hle-text-500, and hle-text-2158, use hle_judge\n    elif \"hle\" in benchmark_name:\n        result = await verify_answer_hle(question, target, predicted_answer)\n        return result, \"hle_judge\", None\n\n    # For webwalkerqa, frames, and seal-0, use gaia_validation_text_103_judge\n    elif benchmark_name in [\"webwalkerqa\", \"frames\", \"seal-0\"]:\n        result = await verify_answer_gaia_validation_text_103(\n            question, target, predicted_answer\n        )\n        return result, \"gaia_validation_text_103_judge\", None\n\n    # For simpleqa, use simpleqa_judge\n    elif benchmark_name == \"simpleqa\" or benchmark_name == \"collect_trace\":\n        result = await verify_answer_simpleqa(question, target, predicted_answer)\n        return result, \"simpleqa_judge\", None\n\n    # For xbench_deepsearch, use xbench_deepsearch_judge\n    elif benchmark_name == \"xbench_deepsearch\":\n        result = await verify_answer_xbench_deepsearch(\n            question, target, predicted_answer\n        )\n        return result, \"xbench_deepsearch_judge\", None\n\n    # For deepsearchqa, use deepsearchqa_judge (with metadata support and 
detailed evaluation)\n    elif benchmark_name == \"deepsearchqa\":\n        result, judge_type, details = await verify_answer_deepsearchqa(\n            question, target, predicted_answer, metadata\n        )\n        # Return details for DeepSearchQA-specific metrics calculation\n        return result, judge_type, details\n\n    # For other benchmarks, use gaia_validation_text_103_judge\n    else:\n        result = await verify_answer_gaia_validation_text_103(\n            question, target, predicted_answer\n        )\n        return result, \"gaia_validation_text_103_judge\", None\n\n\nasync def verify_answer_for_datasets(\n    benchmark_name: str,\n    question: str,\n    target: str,\n    predicted_answer: str,\n    metadata: Optional[Dict[str, Any]] = None,\n    max_retries: int = 10,\n    retry_interval: int = 5,\n) -> tuple[str, str, Optional[Dict[str, Any]]]:\n    \"\"\"\n    Wrapper with retry logic for NOT_ATTEMPTED results.\n\n    Args:\n        benchmark_name: Name of the benchmark dataset\n        question: The question being answered\n        target: The correct/target answer\n        predicted_answer: The model's predicted answer\n        metadata: Optional metadata dict with additional context\n        max_retries: Maximum number of retry attempts\n        retry_interval: Seconds to wait between retries\n\n    Returns:\n        A tuple of (result, judge_type, details_dict).\n        details_dict contains evaluation details (for DeepSearchQA) or None (for other benchmarks).\n    \"\"\"\n    for attempt in range(1, max_retries + 1):\n        result, judge_type, details = await _verify_answer_for_datasets_core(\n            benchmark_name, question, target, predicted_answer, metadata\n        )\n        if result != \"NOT_ATTEMPTED\":\n            return result, judge_type, details\n        if attempt < max_retries:\n            print(\n                f\"[Retry {attempt}/{max_retries}] Got NOT_ATTEMPTED, retrying in {retry_interval}s...\"\n            
)\n            await asyncio.sleep(retry_interval)\n\n    # still NOT_ATTEMPTED after retries\n    print(f\"All {max_retries} attempts resulted in NOT_ATTEMPTED.\")\n    return \"NOT_ATTEMPTED\", \"retry_wrapper\", None\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/evaluators/extract_futurex_results.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport argparse\nimport json\nimport os\nfrom collections import Counter, defaultdict\nfrom typing import Dict, List, Tuple\n\n\ndef majority_vote(\n    preds: List[str], first_seen_order: Dict[str, int]\n) -> Tuple[str, Dict[str, int]]:\n    \"\"\"\n    Compute the majority-vote prediction for a list of candidate predictions.\n\n    Tie-breaking rules (deterministic):\n      1) Highest frequency wins.\n      2) If there is a tie on frequency, choose the candidate that appeared earliest\n         across all runs (based on the provided first_seen_order index).\n      3) As a final guard (shouldn't be needed if first_seen_order is complete),\n         fall back to lexicographic order.\n\n    Returns:\n      (chosen_prediction, counts_dict)\n    \"\"\"\n    counter = Counter(preds)\n    # Get the max vote count\n    max_count = max(counter.values())\n    # All candidates that share the max vote count\n    tied = [c for c, cnt in counter.items() if cnt == max_count]\n\n    if len(tied) == 1:\n        chosen = tied[0]\n    else:\n        # Prefer the one seen earliest globally\n        tied.sort(key=lambda x: (first_seen_order.get(x, float(\"inf\")), x))\n        chosen = tied[0]\n\n    # Expose counts for optional debugging/inspection\n    return chosen, dict(counter)\n\n\ndef discover_runs(results_dir: str) -> List[str]:\n    \"\"\"\n    Discover subdirectories inside results_dir that potentially contain a\n    'benchmark_results.jsonl'. 
We don't strictly require the subdir name to\n    start with 'run_', but we sort the list to keep processing deterministic.\n    \"\"\"\n    runs = []\n    for name in sorted(os.listdir(results_dir)):\n        path = os.path.join(results_dir, name)\n        if os.path.isdir(path):\n            fpath = os.path.join(path, \"benchmark_results.jsonl\")\n            if os.path.isfile(fpath):\n                runs.append(path)\n    return runs\n\n\ndef parse_args() -> argparse.Namespace:\n    parser = argparse.ArgumentParser(\n        description=\"Aggregate multiple run_*/benchmark_results.jsonl files and produce a FutureX submission with majority voting.\"\n    )\n    parser.add_argument(\n        \"results_dir\",\n        help=\"Path to results dir containing run_*/benchmark_results.jsonl\",\n    )\n    parser.add_argument(\n        \"-o\",\n        \"--output\",\n        default=None,\n        help=\"Output JSONL file path (default: <results_dir>/futurex_submission.jsonl)\",\n    )\n    return parser.parse_args()\n\n\ndef main() -> None:\n    args = parse_args()\n\n    results_dir = os.path.abspath(args.results_dir)\n    if not os.path.isdir(results_dir):\n        raise FileNotFoundError(f\"Results dir not found: {results_dir}\")\n\n    output_file = (\n        os.path.abspath(args.output)\n        if args.output\n        else os.path.join(results_dir, \"futurex_submission.jsonl\")\n    )\n\n    # Maps task_id -> list of predictions collected across runs\n    preds_by_task: Dict[str, List[str]] = defaultdict(list)\n\n    # Track first-seen order index for each distinct prediction string across all runs.\n    # This enables deterministic tie-breaking.\n    first_seen_order: Dict[str, int] = {}\n    next_order_idx = 0\n\n    runs = discover_runs(results_dir)\n    if not runs:\n        raise FileNotFoundError(\n            f\"No run directories with 'benchmark_results.jsonl' found under: {results_dir}\"\n        )\n\n    total_lines = 0\n    used_lines = 0\n\n    # Read 
and aggregate predictions\n    for run_dir in runs:\n        fpath = os.path.join(run_dir, \"benchmark_results.jsonl\")\n        print(f\"Reading: {fpath}\")\n        with open(fpath, \"r\", encoding=\"utf-8\") as fin:\n            for line in fin:\n                total_lines += 1\n                line = line.strip()\n                if not line:\n                    continue\n                try:\n                    rec = json.loads(line)\n                except json.JSONDecodeError:\n                    # Skip malformed JSON lines, but keep going\n                    continue\n\n                task_id = rec.get(\"task_id\")\n                pred = rec.get(\"model_boxed_answer\")\n\n                # Only accept non-empty strings; coerce to str for safety\n                if task_id and pred is not None and str(pred).strip():\n                    pred_str = str(pred).strip()\n                    preds_by_task[task_id].append(pred_str)\n                    if pred_str not in first_seen_order:\n                        first_seen_order[pred_str] = next_order_idx\n                        next_order_idx += 1\n                    used_lines += 1\n\n    # Write submission JSONL\n    # We sort task_ids to keep output reproducible.\n    num_tasks = 0\n    with open(output_file, \"w\", encoding=\"utf-8\") as out:\n        for task_id in sorted(preds_by_task.keys()):\n            voted_pred, _counts = majority_vote(\n                preds_by_task[task_id], first_seen_order\n            )\n            out.write(\n                json.dumps(\n                    {\"id\": task_id, \"prediction\": voted_pred}, ensure_ascii=False\n                )\n                + \"\\n\"\n            )\n            num_tasks += 1\n\n    # Optional: small summary to stdout\n    print(f\"Collected from {len(runs)} run(s).\")\n    print(f\"Read {total_lines} line(s), accepted {used_lines} record(s).\")\n    print(f\"Aggregated {num_tasks} unique task_id(s).\")\n    print(f\"✅ Submission saved 
to {output_file}\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/subset_extraction/gaia-text-103-grader.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nGAIA-Text-103 Task Grader\n\nThis script:\n1. Loads extracted GAIA-Text-103 tasks from the extraction directory\n2. Grades each task using the GAIA-Text-103 evaluator (LLM judgement)\n3. Updates the original task files with grading results\n\nUsage:\n    uv run benchmarks/subset_extraction/gaia-text-103-grader.py /path/to/extraction/directory\n\"\"\"\n\nimport argparse\nimport asyncio\nimport json\nimport os\nimport sys\nimport time\nfrom dataclasses import dataclass\nfrom pathlib import Path\nfrom typing import Dict, List, Optional\n\n# Add the benchmarks directory to the path to import evaluators\nsys.path.append(str(Path(__file__).parent.parent))\nfrom evaluators.eval_utils import verify_answer_gaia_validation_text_103\n\n\n@dataclass\nclass GradingResult:\n    \"\"\"Result of grading a single task\"\"\"\n\n    task_id: str\n    run_name: str\n    file_path: str\n    question: str\n    ground_truth: str\n    predicted_answer: str\n    judge_result: str\n    judge_type: str = \"gaia_validation_text_103_scorer\"\n    grading_time: float = 0.0\n    error_message: str = \"\"\n\n\nclass GAIAText103Grader:\n    \"\"\"Grader for GAIA-Text-103 tasks using LLM judgement\"\"\"\n\n    def __init__(self, extraction_dir: str):\n        \"\"\"\n        Initialize the grader\n\n        Args:\n            extraction_dir: Directory containing extracted GAIA-Text-103 tasks\n        \"\"\"\n        self.extraction_dir = Path(extraction_dir)\n        self.results: List[GradingResult] = []\n        self.stats = {\n            \"total_tasks\": 0,\n            \"graded_tasks\": 0,\n            \"errors\": 0,\n            \"total_grading_time\": 0.0,\n        }\n\n    def find_task_files(self) -> List[Path]:\n        \"\"\"Find all task JSON files in the extraction directory\"\"\"\n        task_files = []\n\n        # Recursively search for task files\n        for 
root, dirs, files in os.walk(self.extraction_dir):\n            for file in files:\n                if file.startswith(\"task_\") and file.endswith(\".json\"):\n                    task_files.append(Path(root) / file)\n\n        return sorted(task_files)\n\n    def extract_task_info(self, task_file: Path) -> Optional[Dict]:\n        \"\"\"Extract task information from a task file\"\"\"\n        try:\n            with open(task_file, \"r\", encoding=\"utf-8\") as f:\n                task_data = json.load(f)\n\n            # Check if task has already been graded with our specific scorer\n            if task_data.get(\"judge_type\") == \"gaia_validation_text_103_scorer\":\n                print(f\"Skipping already graded task: {task_file.name}\")\n                return None\n\n            # Extract basic information\n            task_info = {\n                \"task_id\": task_data.get(\"task_id\", \"\"),\n                \"run_name\": task_data.get(\"run_name\", \"\"),\n                \"file_path\": str(task_file),\n                \"question\": task_data.get(\"input\", {}).get(\"task_description\", \"\"),\n                \"ground_truth\": task_data.get(\"ground_truth\", \"\"),\n                \"predicted_answer\": task_data.get(\"final_boxed_answer\", \"\"),\n            }\n\n            # Validate required fields\n            if not all(\n                [\n                    task_info[\"question\"],\n                    task_info[\"ground_truth\"],\n                    task_info[\"predicted_answer\"],\n                ]\n            ):\n                print(f\"Warning: Missing required fields in {task_file}\")\n                print(f\"  question: {task_info['question']}\")\n                print(f\"  ground_truth: {task_info['ground_truth']}\")\n                print(f\"  predicted_answer: {task_info['predicted_answer']}\")\n                return None\n\n            return task_info\n\n        except Exception as e:\n            print(f\"Error reading task 
file {task_file}: {e}\")\n            return None\n\n    async def grade_single_task(self, task_info: Dict) -> GradingResult:\n        \"\"\"Grade a single task using GAIA-Text-103 evaluator\"\"\"\n        start_time = time.time()\n\n        result = GradingResult(\n            task_id=task_info[\"task_id\"],\n            run_name=task_info[\"run_name\"],\n            file_path=task_info[\"file_path\"],\n            question=task_info[\"question\"],\n            ground_truth=task_info[\"ground_truth\"],\n            predicted_answer=task_info[\"predicted_answer\"],\n            judge_result=\"\",\n            judge_type=\"gaia_validation_text_103_scorer\",\n        )\n\n        try:\n            # Use the GAIA-Text-103 evaluator\n            judge_result = await verify_answer_gaia_validation_text_103(\n                question=task_info[\"question\"],\n                target=task_info[\"ground_truth\"],\n                predicted_answer=task_info[\"predicted_answer\"],\n            )\n\n            result.judge_result = judge_result\n            result.grading_time = time.time() - start_time\n\n            print(\n                f\"Task {task_info['task_id']} ({task_info['run_name']}): {judge_result}\"\n            )\n\n        except Exception as e:\n            result.error_message = str(e)\n            result.judge_result = \"ERROR\"\n            result.grading_time = time.time() - start_time\n            self.stats[\"errors\"] += 1\n            print(f\"Error grading task {task_info['task_id']}: {e}\")\n\n        return result\n\n    async def grade_all_tasks(self, max_concurrent: int = 5) -> List[GradingResult]:\n        \"\"\"Grade all tasks with concurrent processing\"\"\"\n        task_files = self.find_task_files()\n        print(f\"Found {len(task_files)} task files to grade\")\n\n        # Extract task information\n        task_infos = []\n        for task_file in task_files:\n            task_info = self.extract_task_info(task_file)\n            if 
task_info:\n                task_infos.append(task_info)\n\n        self.stats[\"total_tasks\"] = len(task_infos)\n        print(f\"Extracted {len(task_infos)} valid tasks for grading\")\n\n        if not task_infos:\n            print(\"No valid tasks found for grading\")\n            return []\n\n        # Grade tasks with concurrency control\n        semaphore = asyncio.Semaphore(max_concurrent)\n\n        async def grade_with_semaphore(task_info):\n            async with semaphore:\n                return await self.grade_single_task(task_info)\n\n        # Create tasks for concurrent execution\n        tasks = [grade_with_semaphore(task_info) for task_info in task_infos]\n\n        # Execute all grading tasks\n        results = await asyncio.gather(*tasks, return_exceptions=True)\n\n        # Filter out exceptions and collect valid results\n        valid_results = []\n        for i, result in enumerate(results):\n            if isinstance(result, Exception):\n                print(f\"Exception in task {i}: {result}\")\n                self.stats[\"errors\"] += 1\n            else:\n                valid_results.append(result)\n                self.stats[\"graded_tasks\"] += 1\n                self.stats[\"total_grading_time\"] += result.grading_time\n\n        self.results = valid_results\n        return valid_results\n\n    def update_original_files(self):\n        \"\"\"Update original task files with grading results\"\"\"\n        updated_count = 0\n\n        for result in self.results:\n            try:\n                # Read original file\n                with open(result.file_path, \"r\", encoding=\"utf-8\") as f:\n                    task_data = json.load(f)\n\n                # Add grading information\n                task_data[\"final_judge_result\"] = result.judge_result\n                task_data[\"judge_type\"] = result.judge_type\n                task_data[\"grading_time\"] = result.grading_time\n\n                if result.error_message:\n       
             task_data[\"grading_error\"] = result.error_message\n\n                # Write back to file\n                with open(result.file_path, \"w\", encoding=\"utf-8\") as f:\n                    json.dump(task_data, f, indent=2, ensure_ascii=False)\n\n                updated_count += 1\n\n            except Exception as e:\n                print(f\"Error updating file {result.file_path}: {e}\")\n\n        print(f\"Updated {updated_count} original task files with grading results\")\n\n    def print_summary(self):\n        \"\"\"Print grading summary\"\"\"\n        print(\"\\n\" + \"=\" * 60)\n        print(\"GAIA-Text-103 Grading Summary\")\n        print(\"=\" * 60)\n\n        print(f\"Total tasks found: {self.stats['total_tasks']}\")\n        print(f\"Successfully graded: {self.stats['graded_tasks']}\")\n        print(f\"Errors: {self.stats['errors']}\")\n        print(\"=\" * 60)\n\n\nasync def main():\n    \"\"\"Main function\"\"\"\n    parser = argparse.ArgumentParser(\n        description=\"Grade GAIA-Text-103 tasks using LLM judgement\"\n    )\n    parser.add_argument(\n        \"extraction_dir\", help=\"Directory containing extracted GAIA-Text-103 tasks\"\n    )\n    parser.add_argument(\n        \"--max-concurrent\",\n        type=int,\n        default=5,\n        help=\"Maximum number of concurrent grading tasks (default: 5)\",\n    )\n    args = parser.parse_args()\n\n    # Validate input directory\n    if not os.path.exists(args.extraction_dir):\n        print(f\"Error: Extraction directory not found: {args.extraction_dir}\")\n        return 1\n\n    print(f\"Extraction directory: {args.extraction_dir}\")\n    print(f\"Max concurrent tasks: {args.max_concurrent}\")\n    print()\n\n    # Create grader and run grading\n    grader = GAIAText103Grader(args.extraction_dir)\n\n    try:\n        print(\"Starting grading process...\")\n        results = await grader.grade_all_tasks(max_concurrent=args.max_concurrent)\n\n        if results:\n            
# Update original files only\n            grader.update_original_files()\n\n            # Print summary\n            grader.print_summary()\n\n            print(\"\\n✅ Grading completed successfully!\")\n            print(\"📝 Original task files updated with grading results\")\n        else:\n            print(\"❌ No tasks were graded successfully\")\n            return 1\n\n    except KeyboardInterrupt:\n        print(\"\\nGrading interrupted by user\")\n        return 1\n    except Exception as e:\n        print(f\"Error during grading: {e}\")\n        return 1\n\n    return 0\n\n\nif __name__ == \"__main__\":\n    exit_code = asyncio.run(main())\n    sys.exit(exit_code)\n"
  },
  {
    "path": "apps/miroflow-agent/benchmarks/subset_extraction/gaia-to-text-103-mover.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nGAIA to Text-103 Task Copier\n\nThis script:\n1. Loads GAIA validation logs from a specified directory\n2. Identifies tasks that belong to GAIA-Text-103 dataset\n3. Copies those tasks to a new directory structure maintaining the original layout\n\"\"\"\n\nimport argparse\nimport json\nimport os\nimport shutil\nfrom pathlib import Path\nfrom typing import Set\n\n\nclass GAIAtoText103Copier:\n    \"\"\"Copy GAIA-Text-103 tasks from GAIA validation logs\"\"\"\n\n    def __init__(self, gaia_text_103_data_path: str, output_dir: str):\n        \"\"\"\n        Initialize the copier\n\n        Args:\n            gaia_text_103_data_path: Path to GAIA-Text-103 standardized data file\n            output_dir: Directory to save copied tasks\n        \"\"\"\n        self.gaia_text_103_data_path = gaia_text_103_data_path\n        self.output_dir = Path(output_dir)\n        self.gaia_text_103_task_ids: Set[str] = set()\n        self.copied_count = 0\n\n        # Load GAIA-Text-103 task IDs\n        self._load_gaia_text_103_tasks()\n\n    def _load_gaia_text_103_tasks(self):\n        \"\"\"Load task IDs from GAIA-Text-103 dataset\"\"\"\n        print(f\"Loading GAIA-Text-103 task IDs from {self.gaia_text_103_data_path}\")\n\n        if not os.path.exists(self.gaia_text_103_data_path):\n            raise FileNotFoundError(\n                f\"GAIA-Text-103 data file not found: {self.gaia_text_103_data_path}\"\n            )\n\n        with open(self.gaia_text_103_data_path, \"r\", encoding=\"utf-8\") as f:\n            for line in f:\n                if line.strip():\n                    task_data = json.loads(line)\n                    task_id = task_data.get(\"task_id\")\n                    if task_id:\n                        self.gaia_text_103_task_ids.add(task_id)\n\n        print(f\"Loaded {len(self.gaia_text_103_task_ids)} GAIA-Text-103 task IDs\")\n\n    
def copy_gaia_text_103_tasks(self, gaia_logs_dir: str) -> int:\n        \"\"\"\n        Copy GAIA-Text-103 tasks from GAIA validation logs\n\n        Args:\n            gaia_logs_dir: Directory containing GAIA validation logs\n\n        Returns:\n            Number of copied tasks\n        \"\"\"\n        print(f\"Copying GAIA-Text-103 tasks from {gaia_logs_dir}\")\n\n        # Find all task JSON files in the logs directory (including in run subdirectories)\n        task_files = []\n        for root, dirs, files in os.walk(gaia_logs_dir):\n            for file in files:\n                if file.startswith(\"task_\") and file.endswith(\".json\"):\n                    task_files.append(os.path.join(root, file))\n\n        print(f\"Found {len(task_files)} task files to process\")\n\n        copied_count = 0\n\n        for task_file in task_files:\n            try:\n                filename = os.path.basename(task_file)\n                # Extract task ID from filename like: task_5188369a-3bbe-43d8-8b94-11558f909a08_attempt_1_format_retry_0_2025-08-06T21-14-23-770872Z.json\n                task_id = (\n                    filename.split(\"_\")[1]\n                    if filename.startswith(\"task_\") and \"_\" in filename\n                    else \"\"\n                )\n\n                if task_id and task_id in self.gaia_text_103_task_ids:\n                    # This is a GAIA-Text-103 task, copy it\n                    copied_count += 1\n\n                    # Preserve the original directory structure\n                    # Get the relative path from the original directory\n                    original_dir = os.path.dirname(gaia_logs_dir)\n                    relative_path = os.path.relpath(task_file, original_dir)\n\n                    # Create the same directory structure in the output\n                    output_file = self.output_dir / relative_path\n                    output_file.parent.mkdir(parents=True, exist_ok=True)\n\n                    # Copy the 
file\n                    shutil.copy2(task_file, output_file)\n\n                    if copied_count % 50 == 0:\n                        print(f\"Copied {copied_count} tasks...\")\n\n            except Exception as e:\n                print(f\"Error processing {task_file}: {e}\")\n                continue\n\n        print(f\"Successfully copied {copied_count} GAIA-Text-103 tasks\")\n        self.copied_count = copied_count\n        return copied_count\n\n    def print_summary(self):\n        \"\"\"Print copying summary to console\"\"\"\n        print(\"\\n\" + \"=\" * 60)\n        print(\"GAIA-Text-103 Task Copying Summary\")\n        print(\"=\" * 60)\n        print(f\"Total Tasks Copied: {self.copied_count}\")\n        print(f\"Output Directory: {self.output_dir}\")\n        print(\"=\" * 60)\n\n\ndef main():\n    \"\"\"Main function\"\"\"\n    parser = argparse.ArgumentParser(\n        description=\"Copy GAIA-Text-103 tasks from GAIA validation logs\"\n    )\n    parser.add_argument(\n        \"gaia_logs_dir\", help=\"Directory containing GAIA validation logs\"\n    )\n    parser.add_argument(\n        \"--gaia_text_103_data\",\n        default=\"../../data/gaia-2023-validation-text-103/standardized_data.jsonl\",\n        help=\"Path to GAIA-Text-103 standardized data file\",\n    )\n    parser.add_argument(\n        \"--output-dir\",\n        help=\"Output directory for copied tasks (default: side by side with gaia-validation)\",\n    )\n\n    args = parser.parse_args()\n\n    # Set default output directory side by side with gaia-validation\n    if not args.output_dir:\n        gaia_logs_path = Path(args.gaia_logs_dir)\n        # If the input is a gaia-validation directory, create gaia-text-103-extraction next to it\n        if gaia_logs_path.name == \"gaia-validation\":\n            args.output_dir = str(gaia_logs_path.parent / \"gaia-text-103-extraction\")\n        else:\n            # Otherwise, create in the same directory as the input\n            
args.output_dir = str(gaia_logs_path.parent / \"gaia-text-103-extraction\")\n\n    # Validate inputs\n    if not os.path.exists(args.gaia_logs_dir):\n        print(f\"Error: GAIA logs directory not found: {args.gaia_logs_dir}\")\n        return 1\n\n    if not os.path.exists(args.gaia_text_103_data):\n        print(f\"Error: GAIA-Text-103 data file not found: {args.gaia_text_103_data}\")\n        return 1\n\n    print(f\"Input GAIA logs directory: {args.gaia_logs_dir}\")\n    print(f\"Output directory: {args.output_dir}\")\n    print(f\"GAIA-Text-103 data file: {args.gaia_text_103_data}\")\n    print()\n\n    try:\n        # Initialize copier\n        copier = GAIAtoText103Copier(args.gaia_text_103_data, args.output_dir)\n\n        # Copy tasks\n        copied_count = copier.copy_gaia_text_103_tasks(args.gaia_logs_dir)\n\n        if copied_count == 0:\n            print(\"No GAIA-Text-103 tasks found in the logs directory\")\n            return 0\n\n        # Print summary\n        copier.print_summary()\n\n        return 0\n\n    except Exception as e:\n        print(f\"Error: {e}\")\n        return 1\n\n\nif __name__ == \"__main__\":\n    exit_code = main()\n    exit(exit_code)\n"
  },
  {
    "path": "apps/miroflow-agent/conf/__init__.py",
    "content": "# This file makes the conf directory a Python package\n"
  },
  {
    "path": "apps/miroflow-agent/conf/agent/default.yaml",
    "content": "# conf/agent/default.yaml\n# The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py\n# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py\nmain_agent:\n  tools:\n    - tool-python\n    - tool-vqa\n    - tool-transcribe\n    - tool-reasoning\n    - tool-reader\n  max_turns: 20  # Maximum number of turns for main agent execution\n\nsub_agents:\n  agent-browsing:\n    tools:\n      - tool-google-search\n      - tool-vqa\n      - tool-reader\n      - tool-python\n    max_turns: 20\n\n# Settings for context management\nkeep_tool_result: -1\ncontext_compress_limit: 0  # Enable context compression (>0 = enabled, 0 = disabled)."
  },
  {
    "path": "apps/miroflow-agent/conf/agent/demo.yaml",
    "content": "# conf/agent/demo.yaml\n# The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py\n# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py\ndefaults:\n  - default\n  - _self_\n\nmain_agent:\n  tools:\n    - search_and_scrape_webpage\n    - jina_scrape_llm_summary\n    - tool-python\n  tool_blacklist:\n    - [ \"search_and_scrape_webpage\", \"sogou_search\" ]\n    - [ \"tool-python\", \"download_file_from_sandbox_to_local\" ]\n  max_turns: 20  # Maximum number of turns for main agent execution\n\nsub_agents:\n\n# Settings for context management\nkeep_tool_result: -1\ncontext_compress_limit: 0  # Enable context compression (>0 = enabled, 0 = disabled)."
  },
  {
    "path": "apps/miroflow-agent/conf/agent/mirothinker_1.7_keep5_max200.yaml",
    "content": "# conf/agent/mirothinker_1.7_keep5_max200.yaml\n# The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py\n# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py\ndefaults:\n  - default\n  - _self_\n\nmain_agent:\n  tools:\n    - search_and_scrape_webpage\n    - jina_scrape_llm_summary\n    - tool-python\n  tool_blacklist:\n    - [ \"search_and_scrape_webpage\", \"sogou_search\" ]\n    - [ \"tool-python\", \"download_file_from_sandbox_to_local\" ]\n  max_turns: 200  # Maximum number of turns for main agent execution\n\nsub_agents:\n\n# Settings for context management\nkeep_tool_result: 5\ncontext_compress_limit: 5  # Enable context compression (>0 = enabled, 0 = disabled).\nretry_with_summary: False # default is true"
  },
  {
    "path": "apps/miroflow-agent/conf/agent/mirothinker_1.7_keep5_max300.yaml",
    "content": "# conf/agent/mirothinker_1.7_keep5_max300.yaml\n# The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py\n# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py\ndefaults:\n  - default\n  - _self_\n\nmain_agent:\n  tools:\n    - search_and_scrape_webpage\n    - jina_scrape_llm_summary\n    - tool-python\n  tool_blacklist:\n    - [ \"search_and_scrape_webpage\", \"sogou_search\" ]\n    - [ \"tool-python\", \"download_file_from_sandbox_to_local\" ]\n  max_turns: 300  # Maximum number of turns for main agent execution\n\nsub_agents:\n\n# Settings for context management\nkeep_tool_result: 5\ncontext_compress_limit: 5  # Enable context compression (>0 = enabled, 0 = disabled).\nretry_with_summary: False # default is true"
  },
  {
    "path": "apps/miroflow-agent/conf/agent/mirothinker_v1.0.yaml",
    "content": "# conf/agent/mirothinker_v1.0.yaml\n# The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py\n# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py\ndefaults:\n  - default\n  - _self_\n\nmain_agent:\n  tools:\n    - search_and_scrape_webpage\n    - jina_scrape_llm_summary\n    - tool-python\n  tool_blacklist:\n    - [ \"search_and_scrape_webpage\", \"sogou_search\" ]\n    - [ \"tool-python\", \"download_file_from_sandbox_to_local\" ]\n  max_turns: 600  # Maximum number of turns for main agent execution\n\nsub_agents:\n\n# Settings for context management\nkeep_tool_result: -1\ncontext_compress_limit: 0  # Enable context compression (>0 = enabled, 0 = disabled)."
  },
  {
    "path": "apps/miroflow-agent/conf/agent/mirothinker_v1.0_keep5.yaml",
    "content": "# conf/agent/mirothinker_v1.0_keep5.yaml\n# The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py\n# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py\ndefaults:\n  - default\n  - _self_\n\nmain_agent:\n  tools:\n    - search_and_scrape_webpage\n    - jina_scrape_llm_summary\n    - tool-python\n  tool_blacklist:\n    - [ \"search_and_scrape_webpage\", \"sogou_search\" ]\n    - [ \"tool-python\", \"download_file_from_sandbox_to_local\" ]\n  max_turns: 600  # Maximum number of turns for main agent execution\n\nsub_agents:\n\n# Settings for context management\nkeep_tool_result: 5\ncontext_compress_limit: 5  # Enable context compression (>0 = enabled, 0 = disabled)."
  },
  {
    "path": "apps/miroflow-agent/conf/agent/mirothinker_v1.5.yaml",
    "content": "# conf/agent/mirothinker_v1.5.yaml\n# The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py\n# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py\ndefaults:\n  - default\n  - _self_\n\nmain_agent:\n  tools:\n    - search_and_scrape_webpage\n    - jina_scrape_llm_summary\n    - tool-python\n  tool_blacklist:\n    - [ \"search_and_scrape_webpage\", \"sogou_search\" ]\n    - [ \"tool-python\", \"download_file_from_sandbox_to_local\" ]\n  max_turns: 600  # Maximum number of turns for main agent execution\n\nsub_agents:\n\n# Settings for context management\nkeep_tool_result: -1\ncontext_compress_limit: 0  # Enable context compression (>0 = enabled, 0 = disabled)."
  },
  {
    "path": "apps/miroflow-agent/conf/agent/mirothinker_v1.5_keep5_max200.yaml",
    "content": "# conf/agent/mirothinker_v1.5_keep5_max200.yaml\n# The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py\n# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py\ndefaults:\n  - default\n  - _self_\n\nmain_agent:\n  tools:\n    - search_and_scrape_webpage\n    - jina_scrape_llm_summary\n    - tool-python\n  tool_blacklist:\n    - [ \"search_and_scrape_webpage\", \"sogou_search\" ]\n    - [ \"tool-python\", \"download_file_from_sandbox_to_local\" ]\n  max_turns: 200  # Maximum number of turns for main agent execution\n\nsub_agents:\n\n# Settings for context management\nkeep_tool_result: 5\ncontext_compress_limit: 5  # Enable context compression (>0 = enabled, 0 = disabled)."
  },
  {
    "path": "apps/miroflow-agent/conf/agent/mirothinker_v1.5_keep5_max400.yaml",
    "content": "# conf/agent/mirothinker_v1.5_keep5_max400.yaml\n# The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py\n# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py\ndefaults:\n  - default\n  - _self_\n\nmain_agent:\n  tools:\n    - search_and_scrape_webpage\n    - jina_scrape_llm_summary\n    - tool-python\n  tool_blacklist:\n    - [ \"search_and_scrape_webpage\", \"sogou_search\" ]\n    - [ \"tool-python\", \"download_file_from_sandbox_to_local\" ]\n  max_turns: 400  # Maximum number of turns for main agent execution\n\nsub_agents:\n\n# Settings for context management\nkeep_tool_result: 5\ncontext_compress_limit: 5  # Enable context compression (>0 = enabled, 0 = disabled)."
  },
  {
    "path": "apps/miroflow-agent/conf/agent/multi_agent.yaml",
    "content": "# conf/agent/multi_agent.yaml\n# The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py\n# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py\ndefaults:\n  - default\n  - _self_\n\nmain_agent:\n  tools:\n    - tool-python\n    - tool-vqa\n    - tool-transcribe\n    - tool-reasoning\n    - tool-reader\n  max_turns: 50  # Maximum number of turns for main agent execution\n\nsub_agents:\n  agent-browsing:\n    tools:\n      - tool-google-search\n      - tool-vqa\n      - tool-reader\n      - tool-python\n    max_turns: 50\n\n# Settings for context management\nkeep_tool_result: -1\ncontext_compress_limit: 0  # Enable context compression (>0 = enabled, 0 = disabled).\n"
  },
  {
    "path": "apps/miroflow-agent/conf/agent/multi_agent_os.yaml",
    "content": "# conf/agent/multi_agent_os.yaml\n# The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py\n# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py\ndefaults:\n  - default\n  - _self_\n\nmain_agent:\n  tools:\n    - tool-python\n    - tool-vqa-os\n    - tool-transcribe-os\n    - tool-reasoning-os\n    - tool-reader\n  max_turns: 50  # Maximum number of turns for main agent execution\n\nsub_agents:\n  agent-browsing:\n    tools:\n      - tool-google-search\n      - tool-vqa-os\n      - tool-reader\n      - tool-python\n    max_turns: 50\n\n# Settings for context management\nkeep_tool_result: -1\ncontext_compress_limit: 0  # Enable context compression (>0 = enabled, 0 = disabled).\n"
  },
  {
    "path": "apps/miroflow-agent/conf/agent/single_agent.yaml",
    "content": "# conf/agent/single_agent.yaml\n# The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py\n# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py\ndefaults:\n  - default\n  - _self_\n\nmain_agent:\n  tools:\n    - search_and_scrape_webpage\n    - jina_scrape_llm_summary\n    - tool-python\n  tool_blacklist:\n    - [ \"search_and_scrape_webpage\", \"sogou_search\" ]\n    - [ \"tool-python\", \"download_file_from_sandbox_to_local\" ]\n  max_turns: 600  # Maximum number of turns for main agent execution\n\nsub_agents:\n\n# Settings for context management\nkeep_tool_result: -1\ncontext_compress_limit: 0  # Enable context compression (>0 = enabled, 0 = disabled)."
  },
  {
    "path": "apps/miroflow-agent/conf/agent/single_agent_keep5.yaml",
    "content": "# conf/agent/single_agent_keep5.yaml\n# The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py\n# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py\ndefaults:\n  - default\n  - _self_\n\nmain_agent:\n  tools:\n    - search_and_scrape_webpage\n    - jina_scrape_llm_summary\n    - tool-python\n  tool_blacklist:\n    - [ \"search_and_scrape_webpage\", \"sogou_search\" ]\n    - [ \"tool-python\", \"download_file_from_sandbox_to_local\" ]\n  max_turns: 600  # Maximum number of turns for main agent execution\n\nsub_agents:\n\n# Settings for context management\nkeep_tool_result: 5\ncontext_compress_limit: 5  # Enable context compression (>0 = enabled, 0 = disabled)."
  },
  {
    "path": "apps/miroflow-agent/conf/benchmark/aime2025.yaml",
    "content": "# conf/benchmark/aime2025.yaml\ndefaults:\n  - default\n  - _self_\n\nname: \"aime2025\"\n\ndata:\n  data_dir: \"../../data/aime2025\"\n\nexecution:\n  max_tasks: null  # null means no limit\n  max_concurrent: 5\n  pass_at_k: 1"
  },
  {
    "path": "apps/miroflow-agent/conf/benchmark/browsecomp.yaml",
    "content": "# conf/benchmark/browsecomp.yaml\ndefaults:\n  - default\n  - _self_\n\nname: \"browsecomp\"\n\ndata:\n  data_dir: \"../../data/browsecomp\"\n\nexecution:\n  max_tasks: null  # null means no limit\n  max_concurrent: 5\n  pass_at_k: 1\n"
  },
  {
    "path": "apps/miroflow-agent/conf/benchmark/browsecomp_zh.yaml",
    "content": "# conf/benchmark/browsecomp_zh.yaml\ndefaults:\n  - default\n  - _self_\n\nname: \"browsecomp_zh\"\n\ndata:\n  data_dir: \"../../data/browsecomp_zh\"\n\nexecution:\n  max_tasks: null  # null means no limit\n  max_concurrent: 5\n  pass_at_k: 1\n"
  },
  {
    "path": "apps/miroflow-agent/conf/benchmark/collect_trace.yaml",
    "content": "# conf/benchmark/collect_trace.yaml\ndefaults:\n  - default\n  - _self_\n\nname: \"collect_trace\"\n\ndata:\n  data_dir: \"../../data/debug\"\n\nexecution:\n  max_tasks: null  # null means no limit\n  max_concurrent: 5\n  pass_at_k: 1\n"
  },
  {
    "path": "apps/miroflow-agent/conf/benchmark/debug.yaml",
    "content": "# conf/benchmark/debug.yaml\ndefaults:\n  - default\n  - _self_\n\nname: \"debug\"\n\ndata:\n  data_dir: \"../../data/debug\"\n\nexecution:\n  max_tasks: null  # null means no limit\n  max_concurrent: 5\n  pass_at_k: 1\n"
  },
  {
    "path": "apps/miroflow-agent/conf/benchmark/deepsearchqa.yaml",
    "content": "# conf/benchmark/deepsearchqa.yaml\ndefaults:\n  - default\n  - _self_\n\nname: \"deepsearchqa\"\n\ndata:\n  data_dir: \"../../data/deepsearchqa\"\n\nexecution:\n  max_tasks: null  # null means no limit\n  max_concurrent: 5\n  pass_at_k: 1\n"
  },
  {
    "path": "apps/miroflow-agent/conf/benchmark/default.yaml",
    "content": "# conf/benchmark/default.yaml - Default benchmark configuration\n# This is a base configuration for benchmarks. Specific benchmarks can override these defaults.\nname: \"default\"\n\ndata:\n  metadata_file: \"standardized_data.jsonl\"\n  field_mapping:\n    task_id_field: \"task_id\"\n    task_question_field: \"task_question\"\n    ground_truth_field: \"ground_truth\"\n    file_name_field: \"file_name\"\n\nexecution:\n  max_tasks: null  # null means no limit\n  max_concurrent: 5 \n  pass_at_k: 1"
  },
  {
    "path": "apps/miroflow-agent/conf/benchmark/frames.yaml",
    "content": "# conf/benchmark/frames.yaml\ndefaults:\n  - default\n  - _self_\n\nname: \"frames\"\n\ndata:\n  data_dir: \"../../data/frames\"\n\nexecution:\n  max_tasks: null  # null means no limit\n  max_concurrent: 5\n  pass_at_k: 1\n"
  },
  {
    "path": "apps/miroflow-agent/conf/benchmark/futurex.yaml",
    "content": "# conf/benchmark/futurex.yaml\ndefaults:\n  - default\n  - _self_\n\nname: \"futurex\"\n\ndata:\n  data_dir: \"../../data/futurex\"\n\nexecution:\n  max_tasks: null  # null means no limit\n  max_concurrent: 5\n  pass_at_k: 1\n"
  },
  {
    "path": "apps/miroflow-agent/conf/benchmark/gaia-validation-text-103.yaml",
    "content": "# conf/benchmark/gaia-validation-text-103.yaml\ndefaults:\n  - default\n  - _self_\n\nname: \"gaia-validation-text-103\"\n\ndata:\n  data_dir: \"../../data/gaia-2023-validation-text-103\"\n\nexecution:\n  max_tasks: null  # null means no limit\n  max_concurrent: 5\n  pass_at_k: 1\n"
  },
  {
    "path": "apps/miroflow-agent/conf/benchmark/gaia-validation.yaml",
    "content": "# conf/benchmark/gaia-validation.yaml\ndefaults:\n  - default\n  - _self_\n\nname: \"gaia-validation\"\n\ndata:\n  data_dir: \"../../data/gaia-2023-validation\"\n\nexecution:\n  max_tasks: null  # null means no limit\n  max_concurrent: 5\n  pass_at_k: 1\n"
  },
  {
    "path": "apps/miroflow-agent/conf/benchmark/hle-text-2158.yaml",
    "content": "# conf/benchmark/hle-text-2158.yaml\ndefaults:\n  - default\n  - _self_\n\nname: \"hle-text-2158\"\n\ndata:\n  data_dir: \"../../data/hle-text-2158\"\n\nexecution:\n  max_tasks: null  # null means no limit\n  max_concurrent: 5\n  pass_at_k: 1"
  },
  {
    "path": "apps/miroflow-agent/conf/benchmark/hle-text-500.yaml",
    "content": "# conf/benchmark/hle-text-500.yaml\ndefaults:\n  - default\n  - _self_\n\nname: \"hle-text-500\"\n\ndata:\n  data_dir: \"../../data/hle-text-500\"\n\nexecution:\n  max_tasks: null  # null means no limit\n  max_concurrent: 5\n  pass_at_k: 1"
  },
  {
    "path": "apps/miroflow-agent/conf/benchmark/hle.yaml",
    "content": "# conf/benchmark/hle.yaml\ndefaults:\n  - default\n  - _self_\n\nname: \"hle\"\n\ndata:\n  data_dir: \"../../data/hle\"\n\nexecution:\n  max_tasks: null  # null means no limit\n  max_concurrent: 5\n  pass_at_k: 1\n"
  },
  {
    "path": "apps/miroflow-agent/conf/benchmark/seal-0.yaml",
    "content": "# conf/benchmark/seal-0.yaml\ndefaults:\n  - default\n  - _self_\n\nname: \"seal-0\"\n\ndata:\n  data_dir: \"../../data/seal-0\"\n\nexecution:\n  max_tasks: null  # null means no limit\n  max_concurrent: 5\n  pass_at_k: 1"
  },
  {
    "path": "apps/miroflow-agent/conf/benchmark/webwalkerqa.yaml",
    "content": "# conf/benchmark/webwalkerqa.yaml\ndefaults:\n  - default\n  - _self_\n\nname: \"webwalkerqa\"\n\ndata:\n  data_dir: \"../../data/webwalkerqa\"\n\nexecution:\n  max_tasks: null  # null means no limit\n  max_concurrent: 5\n  pass_at_k: 1\n"
  },
  {
    "path": "apps/miroflow-agent/conf/benchmark/xbench_deepsearch.yaml",
    "content": "# conf/benchmark/xbench_deepsearch.yaml\ndefaults:\n  - default\n  - _self_\n\nname: \"xbench_deepsearch\"\n\ndata:\n  data_dir: \"../../data/xbench_deepsearch\"\n\nexecution:\n  max_tasks: null  # null means no limit\n  max_concurrent: 5\n  pass_at_k: 1\n"
  },
  {
    "path": "apps/miroflow-agent/conf/config.yaml",
    "content": "# conf/config.yaml\ndefaults:\n  - llm: default\n  - agent: default\n  - benchmark: default\n  - _self_  # Allows variables to be defined at the top of this file\n\nhydra:\n  run:\n    dir: ../../logs/debug\n\n# You can define some top-level or default parameters here\nproject_name: \"miroflow-agent\"\ndebug_dir: \"../../logs/debug\"\n"
  },
  {
    "path": "apps/miroflow-agent/conf/llm/claude-3-7.yaml",
    "content": "# conf/llm/claude-3-7.yaml\ndefaults:\n  - default\n  - _self_\n\nprovider: \"anthropic\"\nmodel_name: \"claude-3-7-sonnet-20250219\"\nbase_url: https://api.anthropic.com\nmax_context_length: 65536\n"
  },
  {
    "path": "apps/miroflow-agent/conf/llm/default.yaml",
    "content": "# conf/llm/default.yaml - Default LLM configuration\nprovider: \"anthropic\" # openai, anthropic, qwen\nmodel_name: \"claude-3-7-sonnet-20250219\"\nasync_client: false\ntemperature: 0.3\ntop_p: 1.0\nmin_p: 0.0\ntop_k: -1\nmax_tokens: 4096\napi_key: \"\"\nbase_url: https://api.anthropic.com\nrepetition_penalty: 1.0\n"
  },
  {
    "path": "apps/miroflow-agent/conf/llm/gpt-5.yaml",
    "content": "# conf/llm/gpt-5.yaml\ndefaults:\n  - default\n  - _self_\n\nprovider: \"openai\"\nmodel_name: \"gpt-5-2025-08-07\"\nbase_url: https://api.openai.com/v1\nmax_context_length: 65536\n"
  },
  {
    "path": "apps/miroflow-agent/conf/llm/qwen-3.yaml",
    "content": "# conf/llm/qwen-3.yaml\ndefaults:\n  - default\n  - _self_\n\nprovider: \"qwen\"\nmodel_name: \"qwen-3\"\nbase_url: \"https://your-api.com/v1\"\nmax_context_length: 262144\nmax_tokens: 16384\ntop_p: 0.95\nrepetition_penalty: 1.05\ntemperature: 1.0"
  },
  {
    "path": "apps/miroflow-agent/main.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport asyncio\n\nimport hydra\nfrom omegaconf import DictConfig, OmegaConf\n\n# Import from the new modular structure\nfrom src.core.pipeline import (\n    create_pipeline_components,\n    execute_task_pipeline,\n)\nfrom src.logging.task_logger import bootstrap_logger\n\n# Configure logger and get the configured instance\nlogger = bootstrap_logger()\n\n\nasync def amain(cfg: DictConfig) -> None:\n    \"\"\"Asynchronous main function.\"\"\"\n\n    logger.info(OmegaConf.to_yaml(cfg))\n\n    # Create pipeline components using the factory function\n    main_agent_tool_manager, sub_agent_tool_managers, output_formatter = (\n        create_pipeline_components(cfg)\n    )\n\n    # Define task parameters\n    task_id = \"task_example\"\n    task_description = \"What is the title of today's arxiv paper in computer science?\"\n    task_file_name = \"\"\n\n    # Execute task using the pipeline\n    final_summary, final_boxed_answer, log_file_path, _ = await execute_task_pipeline(\n        cfg=cfg,\n        task_id=task_id,\n        task_file_name=task_file_name,\n        task_description=task_description,\n        main_agent_tool_manager=main_agent_tool_manager,\n        sub_agent_tool_managers=sub_agent_tool_managers,\n        output_formatter=output_formatter,\n        log_dir=cfg.debug_dir,\n    )\n\n\n@hydra.main(config_path=\"conf\", config_name=\"config\", version_base=None)\ndef main(cfg: DictConfig) -> None:\n    asyncio.run(amain(cfg))\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "apps/miroflow-agent/pyproject.toml",
    "content": "[project]\nname = \"miroflow-agent\"\nversion = \"0.1.0\"\ndescription = \"An agent framework for complex task solving with LLM and MCP tools\"\nreadme = \"README.md\"\nrequires-python = \">=3.12\"\ndependencies = [\n    \"miroflow-tools>=0.1.0\",\n    \"huggingface-hub>=0.28.0\",\n    \"requests>=2.32.3\",\n    \"rich>=13.9.4\",\n    \"jinja2>=3.1.4\",\n    \"pillow>=11.0.0\",\n    \"markdownify>=0.14.1\",\n    \"duckduckgo-search>=6.3.7\",\n    \"python-dotenv\",\n    \"pdfminer-six\",\n    \"python-pptx\",\n    \"puremagic\",\n    \"pydub\",\n    \"SpeechRecognition\",\n    \"youtube_transcript_api\",\n    \"mcp\",\n    \"fastmcp\",\n    \"anthropic\",\n    \"e2b-code-interpreter==1.2.1\",\n    \"jsonlines>=4.0.0\",\n    \"mammoth>=1.9.0\",\n    \"numpy>=2.2.5\",\n    \"ipdb>=0.13.13\",\n    \"datasets>=3.5.0\",\n    \"openpyxl>=3.1.5\",\n    \"markitdown-mcp>=0.0.1a3\",\n    \"markitdown>=0.1.1\",\n    \"regex>=2024.11.6\",\n    \"openai>=1.78.1\",\n    \"tenacity>=9.1.2\",\n    \"transformers>=4.51.3\",\n    \"omegaconf>=2.3.0\",\n    \"wikipedia\",\n    \"mutagen\",\n    \"hydra-core\",\n    \"google-genai\",\n    \"tiktoken>=0.9.0\",\n    \"aiohttp\",\n    \"colorama>=0.4.6\",\n    \"json-repair>=0.49.0\",\n    \"tencentcloud-sdk-python>=3.0.1451\"\n]\n\n[build-system]\nrequires = [\"hatchling\"]\nbuild-backend = \"hatchling.build\"\n\n[tool.hatch.build.targets.wheel]\npackages = [\"src\"]\n\n\n[tool.uv.sources]\nmiroflow-tools = { path = \"../../libs/miroflow-tools\", editable = true }\n\n[dependency-groups]\ndev = [\n    \"inline-snapshot>=0.23.2\",\n    \"pyright>=1.1.403\",\n    \"pytest>=8.4.1\",\n    \"pytest-asyncio>=1.0.0\",\n    \"pytest-cov>=6.2.1\",\n    \"pytest-html>=4.1.1\",\n    \"pytest-xdist>=3.7.0\",\n    \"ty>=0.0.1a14\",\n]\n\n[tool.pytest.ini_options]\n# see https://docs.pytest.org/en/stable/reference/customize.html#pyproject-toml\nminversion = \"8.3.5\"\ntestpaths = [\"tests\"]\n# make warning go away\n# 
https://github.com/pytest-dev/pytest-asyncio/issues/924#issuecomment-2321921915\nasyncio_default_fixture_loop_scope = \"function\"\naddopts = [\n    # summary for failed AND passed tests\n    \"-rA\",\n    # only show stderr for test. stdlog can contain sensitive information\n    \"--show-capture=stderr\",\n    # use `pytest-xdist` to run tests in parallel\n    \"-n=auto\",\n    # use `pytest-html` to generate test report in html format\n    \"--html=report.html\",\n    \"--self-contained-html\",\n    # use `pytest-testmon` to run tests on changed files only\n    # \"--testmon\",\n    # use `pytest-cov` to generate test coverage report\n    \"--cov=miroflow_agent\",\n    \"--cov-report=html\",\n]\n"
  },
  {
    "path": "apps/miroflow-agent/scripts/run_evaluate_multiple_runs_aime2025.sh",
    "content": "#!/bin/bash\n\n# Parse environment variables, use defaults if not set\nLLM_MODEL=${LLM_MODEL:-\"MiroThinker-Models\"}\nBASE_URL=${BASE_URL:-\"https://your-api.com/v1\"}\n\n# Configuration parameters\nNUM_RUNS=${NUM_RUNS:-32}\nBENCHMARK_NAME=\"aime2025\"\nLLM_PROVIDER=${LLM_PROVIDER:-\"qwen\"}\nAGENT_SET=${AGENT_SET:-\"single_agent_keep5\"}\nMAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}\nMAX_CONCURRENT=${MAX_CONCURRENT:-10}\nPASS_AT_K=${PASS_AT_K:-1}\nTEMPERATURE=${TEMPERATURE:-1.0}\nAPI_KEY=${API_KEY:-\"xxx\"}\n\n# Set results directory\nRESULTS_DIR=\"../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}\"\n\necho \"Starting $NUM_RUNS runs of the evaluation...\"\necho \"Results will be saved in: $RESULTS_DIR\"\n\n# Create results directory\nmkdir -p \"$RESULTS_DIR\"\n\n# Launch all parallel tasks\nfor i in $(seq 1 $NUM_RUNS); do\n    echo \"==========================================\"\n    echo \"Launching experiment $i/$NUM_RUNS\"\n    echo \"Output log: please view $RESULTS_DIR/run_${i}_output.log\"\n    echo \"==========================================\"\n    \n    # Set specific identifier for this run\n    RUN_ID=\"run_$i\"\n    \n    # Run experiment (background execution)\n    (\n        uv run python benchmarks/common_benchmark.py \\\n            benchmark=$BENCHMARK_NAME \\\n            benchmark.data.metadata_file=\"standardized_data.jsonl\" \\\n            llm=qwen-3 \\\n            llm.provider=$LLM_PROVIDER \\\n            llm.model_name=$LLM_MODEL \\\n            llm.base_url=$BASE_URL \\\n            llm.async_client=true \\\n            llm.temperature=$TEMPERATURE \\\n            llm.max_context_length=$MAX_CONTEXT_LENGTH \\\n            llm.api_key=$API_KEY \\\n            benchmark.execution.max_tasks=null \\\n            benchmark.execution.max_concurrent=$MAX_CONCURRENT \\\n            benchmark.execution.pass_at_k=$PASS_AT_K \\\n            benchmark.data.data_dir=../../data/aime2025 \\\n            agent=$AGENT_SET \\\n            hydra.run.dir=${RESULTS_DIR}/$RUN_ID \\\n            2>&1 | tee \"$RESULTS_DIR/${RUN_ID}_output.log\" \n        # Capture the benchmark's own exit status immediately: a bare $? after the\n        # pipeline would report the status of tee, which almost always succeeds.\n        RUN_STATUS=${PIPESTATUS[0]}\n        \n        # Check if run was successful\n        if [ \"$RUN_STATUS\" -eq 0 ]; then\n            echo \"Run $i completed successfully\"\n            RESULT_FILE=$(find \"${RESULTS_DIR}/$RUN_ID\" -name \"*accuracy.txt\" 2>/dev/null | head -1)\n            if [ -f \"$RESULT_FILE\" ]; then\n                echo \"Results saved to $RESULT_FILE\"\n            else\n                echo \"Warning: Result file not found for run $i\"\n            fi\n        else\n            echo \"Run $i failed!\"\n        fi\n    ) &\n    \n    # Small delay between launches to avoid simultaneous requests\n    sleep 2\ndone\n\necho \"All $NUM_RUNS runs have been launched in parallel\"\necho \"Waiting for all runs to complete...\"\n\n# Wait for all background tasks to complete\nwait\n\necho \"==========================================\"\necho \"All $NUM_RUNS runs completed!\"\necho \"==========================================\"\n\n# Calculate average scores\necho \"Calculating average scores...\"\nuv run python benchmarks/evaluators/calculate_average_score.py \"$RESULTS_DIR\"\n\necho \"==========================================\"\necho \"Multiple runs evaluation completed!\"\necho \"Check results in: $RESULTS_DIR\"\necho \"Check individual run logs: $RESULTS_DIR/run_*_output.log\"\necho \"==========================================\" \n"
  },
  {
    "path": "apps/miroflow-agent/scripts/run_evaluate_multiple_runs_browsecomp.sh",
    "content": "#!/bin/bash\n\n# Parse environment variables, use defaults if not set\nLLM_MODEL=${LLM_MODEL:-\"MiroThinker-Models\"}\nBASE_URL=${BASE_URL:-\"https://your-api.com/v1\"}\n\n# Configuration parameters\nNUM_RUNS=${NUM_RUNS:-3}\nBENCHMARK_NAME=\"browsecomp\"\nLLM_PROVIDER=${LLM_PROVIDER:-\"qwen\"}\nAGENT_SET=${AGENT_SET:-\"single_agent_keep5\"}\nMAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}\nMAX_CONCURRENT=${MAX_CONCURRENT:-10}\nPASS_AT_K=${PASS_AT_K:-1}\nTEMPERATURE=${TEMPERATURE:-1.0}\nAPI_KEY=${API_KEY:-\"xxx\"}\n\n# Set results directory\nRESULTS_DIR=\"../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}\"\n\necho \"Starting $NUM_RUNS runs of the evaluation...\"\necho \"Results will be saved in: $RESULTS_DIR\"\n\n# Create results directory\nmkdir -p \"$RESULTS_DIR\"\n\n# Launch all parallel tasks\nfor i in $(seq 1 $NUM_RUNS); do\n    echo \"==========================================\"\n    echo \"Launching experiment $i/$NUM_RUNS\"\n    echo \"Output log: please view $RESULTS_DIR/run_${i}_output.log\"\n    echo \"==========================================\"\n    \n    # Set specific identifier for this run\n    RUN_ID=\"run_$i\"\n    \n    # Run experiment (background execution)\n    (\n        uv run python benchmarks/common_benchmark.py \\\n            benchmark=$BENCHMARK_NAME \\\n            benchmark.data.metadata_file=\"standardized_data.jsonl\" \\\n            llm=qwen-3 \\\n            llm.provider=$LLM_PROVIDER \\\n            llm.model_name=$LLM_MODEL \\\n            llm.base_url=$BASE_URL \\\n            llm.async_client=true \\\n            llm.temperature=$TEMPERATURE \\\n            llm.max_context_length=$MAX_CONTEXT_LENGTH \\\n            llm.api_key=$API_KEY \\\n            benchmark.execution.max_tasks=null \\\n            benchmark.execution.max_concurrent=$MAX_CONCURRENT \\\n            benchmark.execution.pass_at_k=$PASS_AT_K \\\n            benchmark.data.data_dir=../../data/browsecomp \\\n            agent=$AGENT_SET \\\n            hydra.run.dir=${RESULTS_DIR}/$RUN_ID \\\n            2>&1 | tee \"$RESULTS_DIR/${RUN_ID}_output.log\" \n        # Capture the benchmark's own exit status immediately: a bare $? after the\n        # pipeline would report the status of tee, which almost always succeeds.\n        RUN_STATUS=${PIPESTATUS[0]}\n        \n        # Check if run was successful\n        if [ \"$RUN_STATUS\" -eq 0 ]; then\n            echo \"Run $i completed successfully\"\n            RESULT_FILE=$(find \"${RESULTS_DIR}/$RUN_ID\" -name \"*accuracy.txt\" 2>/dev/null | head -1)\n            if [ -f \"$RESULT_FILE\" ]; then\n                echo \"Results saved to $RESULT_FILE\"\n            else\n                echo \"Warning: Result file not found for run $i\"\n            fi\n        else\n            echo \"Run $i failed!\"\n        fi\n    ) &\n    \n    # Small delay between launches to avoid simultaneous requests\n    sleep 2\ndone\n\necho \"All $NUM_RUNS runs have been launched in parallel\"\necho \"Waiting for all runs to complete...\"\n\n# Wait for all background tasks to complete\nwait\n\necho \"==========================================\"\necho \"All $NUM_RUNS runs completed!\"\necho \"==========================================\"\n\n# Calculate average scores\necho \"Calculating average scores...\"\nuv run python benchmarks/evaluators/calculate_average_score.py \"$RESULTS_DIR\"\n\necho \"==========================================\"\necho \"Multiple runs evaluation completed!\"\necho \"Check results in: $RESULTS_DIR\"\necho \"Check individual run logs: $RESULTS_DIR/run_*_output.log\"\necho \"==========================================\" \n"
  },
  {
    "path": "apps/miroflow-agent/scripts/run_evaluate_multiple_runs_browsecomp_zh.sh",
    "content": "#!/bin/bash\n#\n# Launches NUM_RUNS parallel runs of the browsecomp_zh benchmark (benchmarks/common_benchmark.py),\n# tees each run's output to a log file under RESULTS_DIR, then runs calculate_average_score.py on RESULTS_DIR.\n\n# Parse environment variables, use defaults if not set\nLLM_MODEL=${LLM_MODEL:-\"MiroThinker-Models\"}\nBASE_URL=${BASE_URL:-\"https://your-api.com/v1\"}\n\n# Configuration parameters\nNUM_RUNS=${NUM_RUNS:-3}\nBENCHMARK_NAME=\"browsecomp_zh\"\nLLM_PROVIDER=${LLM_PROVIDER:-\"qwen\"}\nAGENT_SET=${AGENT_SET:-\"single_agent_keep5\"}\nMAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}\nMAX_CONCURRENT=${MAX_CONCURRENT:-10}\nPASS_AT_K=${PASS_AT_K:-1}\nTEMPERATURE=${TEMPERATURE:-1.0}\nAPI_KEY=${API_KEY:-\"xxx\"}\n\n# Set results directory\nRESULTS_DIR=\"../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}\"\n\necho \"Starting $NUM_RUNS runs of the evaluation...\"\necho \"Results will be saved in: $RESULTS_DIR\"\n\n# Create results directory\nmkdir -p \"$RESULTS_DIR\"\n\n# Launch all parallel tasks\nfor i in $(seq 1 $NUM_RUNS); do\n    echo \"==========================================\"\n    echo \"Launching experiment $i/$NUM_RUNS\"\n    echo \"Output log: please view $RESULTS_DIR/run_${i}_output.log\"\n    echo \"==========================================\"\n    \n    # Set specific identifier for this run\n    RUN_ID=\"run_$i\"\n    \n    # Run experiment (background execution)\n    (\n        uv run python benchmarks/common_benchmark.py \\\n            benchmark=$BENCHMARK_NAME \\\n            benchmark.data.metadata_file=\"standardized_data.jsonl\" \\\n            llm=qwen-3 \\\n            llm.provider=$LLM_PROVIDER \\\n            llm.model_name=$LLM_MODEL \\\n            llm.base_url=$BASE_URL \\\n            llm.async_client=true \\\n            llm.temperature=$TEMPERATURE \\\n            llm.max_context_length=$MAX_CONTEXT_LENGTH \\\n            llm.api_key=$API_KEY \\\n            benchmark.execution.max_tasks=null \\\n            benchmark.execution.max_concurrent=$MAX_CONCURRENT \\\n            benchmark.execution.pass_at_k=$PASS_AT_K \\\n            benchmark.data.data_dir=../../data/browsecomp_zh \\\n            agent=$AGENT_SET \\\n            hydra.run.dir=${RESULTS_DIR}/$RUN_ID \\\n            2>&1 | tee \"$RESULTS_DIR/${RUN_ID}_output.log\" \n        \n        # Check if run was successful. The command above is a pipeline, so $? would only\n        # report tee's exit status; PIPESTATUS[0] holds the benchmark's own exit status.\n        if [ ${PIPESTATUS[0]} -eq 0 ]; then\n            echo \"Run $i completed successfully\"\n            RESULT_FILE=$(find \"${RESULTS_DIR}/$RUN_ID\" -name \"*accuracy.txt\" 2>/dev/null | head -1)\n            if [ -f \"$RESULT_FILE\" ]; then\n                echo \"Results saved to $RESULT_FILE\"\n            else\n                echo \"Warning: Result file not found for run $i\"\n            fi\n        else\n            echo \"Run $i failed!\"\n        fi\n    ) &\n    \n    # Small delay between launches to avoid simultaneous requests\n    sleep 2\ndone\n\necho \"All $NUM_RUNS runs have been launched in parallel\"\necho \"Waiting for all runs to complete...\"\n\n# Wait for all background tasks to complete\nwait\n\necho \"==========================================\"\necho \"All $NUM_RUNS runs completed!\"\necho \"==========================================\"\n\n# Calculate average scores\necho \"Calculating average scores...\"\nuv run python benchmarks/evaluators/calculate_average_score.py \"$RESULTS_DIR\"\n\necho \"==========================================\"\necho \"Multiple runs evaluation completed!\"\necho \"Check results in: $RESULTS_DIR\"\necho \"Check individual run logs: $RESULTS_DIR/run_*_output.log\"\necho \"==========================================\" \n"
  },
  {
    "path": "apps/miroflow-agent/scripts/run_evaluate_multiple_runs_debug.sh",
    "content": "#!/bin/bash\n#\n# Launches NUM_RUNS parallel runs of the debug benchmark (benchmarks/common_benchmark.py),\n# tees each run's output to a log file under RESULTS_DIR, then runs calculate_average_score.py on RESULTS_DIR.\n\n# Parse environment variables, use defaults if not set\nLLM_MODEL=${LLM_MODEL:-\"MiroThinker-Models\"}\nBASE_URL=${BASE_URL:-\"https://your-api.com/v1\"}\n\n# Configuration parameters\nNUM_RUNS=${NUM_RUNS:-1}\nBENCHMARK_NAME=\"debug\"\nLLM_PROVIDER=${LLM_PROVIDER:-\"qwen\"}\nAGENT_SET=${AGENT_SET:-\"single_agent_keep5\"}\nMAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}\nMAX_CONCURRENT=${MAX_CONCURRENT:-1}\nPASS_AT_K=${PASS_AT_K:-1}\nTEMPERATURE=${TEMPERATURE:-1.0}\nAPI_KEY=${API_KEY:-\"xxx\"}\n\n# Set results directory\nRESULTS_DIR=\"../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}\"\n\necho \"Starting $NUM_RUNS runs of the evaluation...\"\necho \"Results will be saved in: $RESULTS_DIR\"\n\n# Create results directory\nmkdir -p \"$RESULTS_DIR\"\n\n# Launch all parallel tasks\nfor i in $(seq 1 $NUM_RUNS); do\n    echo \"==========================================\"\n    echo \"Launching experiment $i/$NUM_RUNS\"\n    echo \"Output log: please view $RESULTS_DIR/run_${i}_output.log\"\n    echo \"==========================================\"\n    \n    # Set specific identifier for this run\n    RUN_ID=\"run_$i\"\n    \n    # Run experiment (background execution)\n    (\n        uv run python benchmarks/common_benchmark.py \\\n            benchmark=$BENCHMARK_NAME \\\n            benchmark.data.metadata_file=\"standardized_data.jsonl\" \\\n            llm=qwen-3 \\\n            llm.provider=$LLM_PROVIDER \\\n            llm.model_name=$LLM_MODEL \\\n            llm.base_url=$BASE_URL \\\n            llm.async_client=true \\\n            llm.temperature=$TEMPERATURE \\\n            llm.max_context_length=$MAX_CONTEXT_LENGTH \\\n            llm.api_key=$API_KEY \\\n            benchmark.execution.max_tasks=null \\\n            benchmark.execution.max_concurrent=$MAX_CONCURRENT \\\n            benchmark.execution.pass_at_k=$PASS_AT_K \\\n            benchmark.data.data_dir=../../data/debug \\\n            agent=$AGENT_SET \\\n            hydra.run.dir=${RESULTS_DIR}/$RUN_ID \\\n            2>&1 | tee \"$RESULTS_DIR/${RUN_ID}_output.log\" \n        \n        # Check if run was successful. The command above is a pipeline, so $? would only\n        # report tee's exit status; PIPESTATUS[0] holds the benchmark's own exit status.\n        if [ ${PIPESTATUS[0]} -eq 0 ]; then\n            echo \"Run $i completed successfully\"\n            RESULT_FILE=$(find \"${RESULTS_DIR}/$RUN_ID\" -name \"*accuracy.txt\" 2>/dev/null | head -1)\n            if [ -f \"$RESULT_FILE\" ]; then\n                echo \"Results saved to $RESULT_FILE\"\n            else\n                echo \"Warning: Result file not found for run $i\"\n            fi\n        else\n            echo \"Run $i failed!\"\n        fi\n    ) &\n    \n    # Small delay between launches to avoid simultaneous requests\n    sleep 2\ndone\n\necho \"All $NUM_RUNS runs have been launched in parallel\"\necho \"Waiting for all runs to complete...\"\n\n# Wait for all background tasks to complete\nwait\n\necho \"==========================================\"\necho \"All $NUM_RUNS runs completed!\"\necho \"==========================================\"\n\n# Calculate average scores\necho \"Calculating average scores...\"\nuv run python benchmarks/evaluators/calculate_average_score.py \"$RESULTS_DIR\"\n\necho \"==========================================\"\necho \"Multiple runs evaluation completed!\"\necho \"Check results in: $RESULTS_DIR\"\necho \"Check individual run logs: $RESULTS_DIR/run_*_output.log\"\necho \"==========================================\" \n"
  },
  {
    "path": "apps/miroflow-agent/scripts/run_evaluate_multiple_runs_deepsearchqa.sh",
    "content": "#!/bin/bash\n#\n# Launches NUM_RUNS parallel runs of the deepsearchqa benchmark (benchmarks/common_benchmark.py),\n# tees each run's output to a log file under RESULTS_DIR, then runs calculate_average_score.py on RESULTS_DIR.\n\n# Parse environment variables, use defaults if not set\nLLM_MODEL=${LLM_MODEL:-\"MiroThinker-Models\"}\nBASE_URL=${BASE_URL:-\"https://your-api.com/v1\"}\n\n# Configuration parameters\nNUM_RUNS=${NUM_RUNS:-3}\nBENCHMARK_NAME=\"deepsearchqa\"\nLLM_PROVIDER=${LLM_PROVIDER:-\"qwen\"}\nAGENT_SET=${AGENT_SET:-\"single_agent_keep5\"}\nMAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}\nMAX_CONCURRENT=${MAX_CONCURRENT:-10}\nPASS_AT_K=${PASS_AT_K:-1}\nTEMPERATURE=${TEMPERATURE:-1.0}\nAPI_KEY=${API_KEY:-\"xxx\"}\n\n# Set results directory\nRESULTS_DIR=\"../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}\"\n\necho \"Starting $NUM_RUNS runs of the evaluation...\"\necho \"Results will be saved in: $RESULTS_DIR\"\n\n# Create results directory\nmkdir -p \"$RESULTS_DIR\"\n\n# Launch all parallel tasks\nfor i in $(seq 1 $NUM_RUNS); do\n    echo \"==========================================\"\n    echo \"Launching experiment $i/$NUM_RUNS\"\n    echo \"Output log: please view $RESULTS_DIR/run_${i}_output.log\"\n    echo \"==========================================\"\n    \n    # Set specific identifier for this run\n    RUN_ID=\"run_$i\"\n    \n    # Run experiment (background execution)\n    (\n        uv run python benchmarks/common_benchmark.py \\\n            benchmark=$BENCHMARK_NAME \\\n            benchmark.data.metadata_file=\"standardized_data.jsonl\" \\\n            llm=qwen-3 \\\n            llm.provider=$LLM_PROVIDER \\\n            llm.model_name=$LLM_MODEL \\\n            llm.base_url=$BASE_URL \\\n            llm.async_client=true \\\n            llm.temperature=$TEMPERATURE \\\n            llm.max_context_length=$MAX_CONTEXT_LENGTH \\\n            llm.api_key=$API_KEY \\\n            benchmark.execution.max_tasks=null \\\n            benchmark.execution.max_concurrent=$MAX_CONCURRENT \\\n            benchmark.execution.pass_at_k=$PASS_AT_K \\\n            benchmark.data.data_dir=../../data/deepsearchqa \\\n            agent=$AGENT_SET \\\n            hydra.run.dir=${RESULTS_DIR}/$RUN_ID \\\n            2>&1 | tee \"$RESULTS_DIR/${RUN_ID}_output.log\" \n        \n        # Check if run was successful. The command above is a pipeline, so $? would only\n        # report tee's exit status; PIPESTATUS[0] holds the benchmark's own exit status.\n        if [ ${PIPESTATUS[0]} -eq 0 ]; then\n            echo \"Run $i completed successfully\"\n            RESULT_FILE=$(find \"${RESULTS_DIR}/$RUN_ID\" -name \"*accuracy.txt\" 2>/dev/null | head -1)\n            if [ -f \"$RESULT_FILE\" ]; then\n                echo \"Results saved to $RESULT_FILE\"\n            else\n                echo \"Warning: Result file not found for run $i\"\n            fi\n        else\n            echo \"Run $i failed!\"\n        fi\n    ) &\n    \n    # Small delay between launches to avoid simultaneous requests\n    sleep 2\ndone\n\necho \"All $NUM_RUNS runs have been launched in parallel\"\necho \"Waiting for all runs to complete...\"\n\n# Wait for all background tasks to complete\nwait\n\necho \"==========================================\"\necho \"All $NUM_RUNS runs completed!\"\necho \"==========================================\"\n\n# Calculate average scores\necho \"Calculating average scores...\"\nuv run python benchmarks/evaluators/calculate_average_score.py \"$RESULTS_DIR\"\n\necho \"==========================================\"\necho \"Multiple runs evaluation completed!\"\necho \"Check results in: $RESULTS_DIR\"\necho \"Check individual run logs: $RESULTS_DIR/run_*_output.log\"\necho \"==========================================\" \n"
  },
  {
    "path": "apps/miroflow-agent/scripts/run_evaluate_multiple_runs_frames.sh",
    "content": "#!/bin/bash\n#\n# Launches NUM_RUNS parallel runs of the frames benchmark (benchmarks/common_benchmark.py),\n# tees each run's output to a log file under RESULTS_DIR, then runs calculate_average_score.py on RESULTS_DIR.\n\n# Parse environment variables, use defaults if not set\nLLM_MODEL=${LLM_MODEL:-\"MiroThinker-Models\"}\nBASE_URL=${BASE_URL:-\"https://your-api.com/v1\"}\n\n# Configuration parameters\nNUM_RUNS=${NUM_RUNS:-3}\nBENCHMARK_NAME=\"frames\"\nLLM_PROVIDER=${LLM_PROVIDER:-\"qwen\"}\nAGENT_SET=${AGENT_SET:-\"single_agent_keep5\"}\nMAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}\nMAX_CONCURRENT=${MAX_CONCURRENT:-10}\nPASS_AT_K=${PASS_AT_K:-1}\nTEMPERATURE=${TEMPERATURE:-1.0}\nAPI_KEY=${API_KEY:-\"xxx\"}\n\n# Set results directory\nRESULTS_DIR=\"../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}\"\n\necho \"Starting $NUM_RUNS runs of the evaluation...\"\necho \"Results will be saved in: $RESULTS_DIR\"\n\n# Create results directory\nmkdir -p \"$RESULTS_DIR\"\n\n# Launch all parallel tasks\nfor i in $(seq 1 $NUM_RUNS); do\n    echo \"==========================================\"\n    echo \"Launching experiment $i/$NUM_RUNS\"\n    echo \"Output log: please view $RESULTS_DIR/run_${i}_output.log\"\n    echo \"==========================================\"\n    \n    # Set specific identifier for this run\n    RUN_ID=\"run_$i\"\n    \n    # Run experiment (background execution)\n    (\n        uv run python benchmarks/common_benchmark.py \\\n            benchmark=$BENCHMARK_NAME \\\n            benchmark.data.metadata_file=\"standardized_data.jsonl\" \\\n            llm=qwen-3 \\\n            llm.provider=$LLM_PROVIDER \\\n            llm.model_name=$LLM_MODEL \\\n            llm.base_url=$BASE_URL \\\n            llm.async_client=true \\\n            llm.temperature=$TEMPERATURE \\\n            llm.max_context_length=$MAX_CONTEXT_LENGTH \\\n            llm.api_key=$API_KEY \\\n            benchmark.execution.max_tasks=null \\\n            benchmark.execution.max_concurrent=$MAX_CONCURRENT \\\n            benchmark.execution.pass_at_k=$PASS_AT_K \\\n            benchmark.data.data_dir=../../data/frames \\\n            agent=$AGENT_SET \\\n            hydra.run.dir=${RESULTS_DIR}/$RUN_ID \\\n            2>&1 | tee \"$RESULTS_DIR/${RUN_ID}_output.log\" \n        \n        # Check if run was successful. The command above is a pipeline, so $? would only\n        # report tee's exit status; PIPESTATUS[0] holds the benchmark's own exit status.\n        if [ ${PIPESTATUS[0]} -eq 0 ]; then\n            echo \"Run $i completed successfully\"\n            RESULT_FILE=$(find \"${RESULTS_DIR}/$RUN_ID\" -name \"*accuracy.txt\" 2>/dev/null | head -1)\n            if [ -f \"$RESULT_FILE\" ]; then\n                echo \"Results saved to $RESULT_FILE\"\n            else\n                echo \"Warning: Result file not found for run $i\"\n            fi\n        else\n            echo \"Run $i failed!\"\n        fi\n    ) &\n    \n    # Small delay between launches to avoid simultaneous requests\n    sleep 2\ndone\n\necho \"All $NUM_RUNS runs have been launched in parallel\"\necho \"Waiting for all runs to complete...\"\n\n# Wait for all background tasks to complete\nwait\n\necho \"==========================================\"\necho \"All $NUM_RUNS runs completed!\"\necho \"==========================================\"\n\n# Calculate average scores\necho \"Calculating average scores...\"\nuv run python benchmarks/evaluators/calculate_average_score.py \"$RESULTS_DIR\"\n\necho \"==========================================\"\necho \"Multiple runs evaluation completed!\"\necho \"Check results in: $RESULTS_DIR\"\necho \"Check individual run logs: $RESULTS_DIR/run_*_output.log\"\necho \"==========================================\" \n"
  },
  {
    "path": "apps/miroflow-agent/scripts/run_evaluate_multiple_runs_futurex.sh",
    "content": "#!/bin/bash\n#\n# Launches NUM_RUNS parallel runs of the futurex benchmark (benchmarks/common_benchmark.py),\n# tees each run's output to a log file under RESULTS_DIR, then runs extract_futurex_results.py on RESULTS_DIR.\n\n# Parse environment variables, use defaults if not set\nLLM_MODEL=${LLM_MODEL:-\"MiroThinker-Models\"}\nBASE_URL=${BASE_URL:-\"https://your-api.com/v1\"}\n\n# Configuration parameters\nNUM_RUNS=${NUM_RUNS:-8}\nBENCHMARK_NAME=\"futurex\"\nLLM_PROVIDER=${LLM_PROVIDER:-\"qwen\"}\nAGENT_SET=${AGENT_SET:-\"single_agent_keep5\"}\nMAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}\nMAX_CONCURRENT=${MAX_CONCURRENT:-10}\nPASS_AT_K=${PASS_AT_K:-1}\nTEMPERATURE=${TEMPERATURE:-1.0}\nAPI_KEY=${API_KEY:-\"xxx\"}\n\n# Set results directory\nRESULTS_DIR=\"../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}\"\n\necho \"Starting $NUM_RUNS runs of the evaluation...\"\necho \"Results will be saved in: $RESULTS_DIR\"\n\n# Create results directory\nmkdir -p \"$RESULTS_DIR\"\n\n# Launch all parallel tasks\nfor i in $(seq 1 $NUM_RUNS); do\n    echo \"==========================================\"\n    echo \"Launching experiment $i/$NUM_RUNS\"\n    echo \"Output log: please view $RESULTS_DIR/run_${i}_output.log\"\n    echo \"==========================================\"\n    \n    # Set specific identifier for this run\n    RUN_ID=\"run_$i\"\n    \n    # Run experiment (background execution)\n    (\n        uv run python benchmarks/common_benchmark.py \\\n            benchmark=$BENCHMARK_NAME \\\n            benchmark.data.metadata_file=\"standardized_data_250924_250930.jsonl\" \\\n            llm=qwen-3 \\\n            llm.provider=$LLM_PROVIDER \\\n            llm.model_name=$LLM_MODEL \\\n            llm.base_url=$BASE_URL \\\n            llm.async_client=true \\\n            llm.temperature=$TEMPERATURE \\\n            llm.max_context_length=$MAX_CONTEXT_LENGTH \\\n            llm.api_key=$API_KEY \\\n            benchmark.execution.max_tasks=null \\\n            benchmark.execution.max_concurrent=$MAX_CONCURRENT \\\n            benchmark.execution.pass_at_k=$PASS_AT_K \\\n            benchmark.data.data_dir=../../data/futurex \\\n            agent=$AGENT_SET \\\n            hydra.run.dir=${RESULTS_DIR}/$RUN_ID \\\n            2>&1 | tee \"$RESULTS_DIR/${RUN_ID}_output.log\" \n        \n        # Check if run was successful. The command above is a pipeline, so $? would only\n        # report tee's exit status; PIPESTATUS[0] holds the benchmark's own exit status.\n        if [ ${PIPESTATUS[0]} -eq 0 ]; then\n            echo \"Run $i completed successfully\"\n            RESULT_FILE=$(find \"${RESULTS_DIR}/$RUN_ID\" -name \"*accuracy.txt\" 2>/dev/null | head -1)\n            if [ -f \"$RESULT_FILE\" ]; then\n                echo \"Results saved to $RESULT_FILE\"\n            else\n                echo \"Warning: Result file not found for run $i\"\n            fi\n        else\n            echo \"Run $i failed!\"\n        fi\n    ) &\n    \n    # Small delay between launches to avoid simultaneous requests\n    sleep 2\ndone\n\necho \"All $NUM_RUNS runs have been launched in parallel\"\necho \"Waiting for all runs to complete...\"\n\n# Wait for all background tasks to complete\nwait\n\necho \"==========================================\"\necho \"All $NUM_RUNS runs completed!\"\necho \"==========================================\"\n\n# Calculate average scores\n# echo \"Calculating average scores...\"\n# uv run python benchmarks/evaluators/calculate_average_score.py \"$RESULTS_DIR\"\necho \"Extracting predictions and formatting for FutureX submission...\"\nuv run python benchmarks/evaluators/extract_futurex_results.py \"$RESULTS_DIR\"\n\n# Check status and provide user-friendly message\nif [ $? -eq 0 ]; then\n    echo \"✅ Submission file generated: $RESULTS_DIR/futurex_submission.jsonl\"\n    echo \"You can now upload this file to the FutureX test server.\"\nelse\n    echo \"❌ Failed to generate submission file. Please check the logs for details.\"\nfi\n\necho \"==========================================\"\necho \"Multiple runs evaluation completed!\"\necho \"Check results in: $RESULTS_DIR\"\necho \"Check individual run logs: $RESULTS_DIR/run_*_output.log\"\necho \"==========================================\" \n"
  },
  {
    "path": "apps/miroflow-agent/scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh",
    "content": "#!/bin/bash\n#\n# Launches NUM_RUNS parallel runs of the gaia-validation-text-103 benchmark (benchmarks/common_benchmark.py),\n# tees each run's output to a log file under RESULTS_DIR, then runs calculate_average_score.py on RESULTS_DIR.\n\n# Parse environment variables, use defaults if not set\nLLM_MODEL=${LLM_MODEL:-\"MiroThinker-Models\"}\nBASE_URL=${BASE_URL:-\"https://your-api.com/v1\"}\n\n# Configuration parameters\nNUM_RUNS=${NUM_RUNS:-8}\nBENCHMARK_NAME=\"gaia-validation-text-103\"\nLLM_PROVIDER=${LLM_PROVIDER:-\"qwen\"}\nAGENT_SET=${AGENT_SET:-\"single_agent_keep5\"}\nMAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}\nMAX_CONCURRENT=${MAX_CONCURRENT:-10}\nPASS_AT_K=${PASS_AT_K:-1}\nTEMPERATURE=${TEMPERATURE:-1.0}\nAPI_KEY=${API_KEY:-\"xxx\"}\n\n# Set results directory\nRESULTS_DIR=\"../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}\"\n\necho \"Starting $NUM_RUNS runs of the evaluation...\"\necho \"Results will be saved in: $RESULTS_DIR\"\n\n# Create results directory\nmkdir -p \"$RESULTS_DIR\"\n\n# Launch all parallel tasks\nfor i in $(seq 1 $NUM_RUNS); do\n    echo \"==========================================\"\n    echo \"Launching experiment $i/$NUM_RUNS\"\n    echo \"Output log: please view $RESULTS_DIR/run_${i}_output.log\"\n    echo \"==========================================\"\n    \n    # Set specific identifier for this run\n    RUN_ID=\"run_$i\"\n    \n    # Run experiment (background execution)\n    (\n        uv run python benchmarks/common_benchmark.py \\\n            benchmark=$BENCHMARK_NAME \\\n            benchmark.data.metadata_file=\"standardized_data.jsonl\" \\\n            llm=qwen-3 \\\n            llm.provider=$LLM_PROVIDER \\\n            llm.model_name=$LLM_MODEL \\\n            llm.base_url=$BASE_URL \\\n            llm.async_client=true \\\n            llm.temperature=$TEMPERATURE \\\n            llm.max_context_length=$MAX_CONTEXT_LENGTH \\\n            llm.api_key=$API_KEY \\\n            benchmark.execution.max_tasks=null \\\n            benchmark.execution.max_concurrent=$MAX_CONCURRENT \\\n            benchmark.execution.pass_at_k=$PASS_AT_K \\\n            benchmark.data.data_dir=../../data/gaia-2023-validation-text-103 \\\n            agent=$AGENT_SET \\\n            hydra.run.dir=${RESULTS_DIR}/$RUN_ID \\\n            2>&1 | tee \"$RESULTS_DIR/${RUN_ID}_output.log\" \n        \n        # Check if run was successful. The command above is a pipeline, so $? would only\n        # report tee's exit status; PIPESTATUS[0] holds the benchmark's own exit status.\n        if [ ${PIPESTATUS[0]} -eq 0 ]; then\n            echo \"Run $i completed successfully\"\n            RESULT_FILE=$(find \"${RESULTS_DIR}/$RUN_ID\" -name \"*accuracy.txt\" 2>/dev/null | head -1)\n            if [ -f \"$RESULT_FILE\" ]; then\n                echo \"Results saved to $RESULT_FILE\"\n            else\n                echo \"Warning: Result file not found for run $i\"\n            fi\n        else\n            echo \"Run $i failed!\"\n        fi\n    ) &\n    \n    # Small delay between launches to avoid simultaneous requests\n    sleep 2\ndone\n\necho \"All $NUM_RUNS runs have been launched in parallel\"\necho \"Waiting for all runs to complete...\"\n\n# Wait for all background tasks to complete\nwait\n\necho \"==========================================\"\necho \"All $NUM_RUNS runs completed!\"\necho \"==========================================\"\n\n# Calculate average scores\necho \"Calculating average scores...\"\nuv run python benchmarks/evaluators/calculate_average_score.py \"$RESULTS_DIR\"\n\necho \"==========================================\"\necho \"Multiple runs evaluation completed!\"\necho \"Check results in: $RESULTS_DIR\"\necho \"Check individual run logs: $RESULTS_DIR/run_*_output.log\"\necho \"==========================================\" \n"
  },
  {
    "path": "apps/miroflow-agent/scripts/run_evaluate_multiple_runs_gaia-validation.sh",
    "content": "#!/bin/bash\n#\n# Launches NUM_RUNS parallel runs of the gaia-validation benchmark (benchmarks/common_benchmark.py),\n# tees each run's output to a log file under RESULTS_DIR, then runs calculate_average_score.py on RESULTS_DIR.\n\n# Parse environment variables, use defaults if not set\nLLM_MODEL=${LLM_MODEL:-\"MiroThinker-Models\"}\nBASE_URL=${BASE_URL:-\"https://your-api.com/v1\"}\n\n# Configuration parameters\nNUM_RUNS=${NUM_RUNS:-8}\nBENCHMARK_NAME=\"gaia-validation\"\nLLM_PROVIDER=${LLM_PROVIDER:-\"qwen\"}\nAGENT_SET=${AGENT_SET:-\"single_agent_keep5\"}\nMAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}\nMAX_CONCURRENT=${MAX_CONCURRENT:-10}\nPASS_AT_K=${PASS_AT_K:-1}\nTEMPERATURE=${TEMPERATURE:-1.0}\nAPI_KEY=${API_KEY:-\"xxx\"}\n\n# Set results directory\nRESULTS_DIR=\"../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}\"\n\necho \"Starting $NUM_RUNS runs of the evaluation...\"\necho \"Results will be saved in: $RESULTS_DIR\"\n\n# Create results directory\nmkdir -p \"$RESULTS_DIR\"\n\n# Launch all parallel tasks\nfor i in $(seq 1 $NUM_RUNS); do\n    echo \"==========================================\"\n    echo \"Launching experiment $i/$NUM_RUNS\"\n    echo \"Output log: please view $RESULTS_DIR/run_${i}_output.log\"\n    echo \"==========================================\"\n    \n    # Set specific identifier for this run\n    RUN_ID=\"run_$i\"\n    \n    # Run experiment (background execution)\n    (\n        uv run python benchmarks/common_benchmark.py \\\n            benchmark=$BENCHMARK_NAME \\\n            benchmark.data.metadata_file=\"standardized_data.jsonl\" \\\n            llm=qwen-3 \\\n            llm.provider=$LLM_PROVIDER \\\n            llm.model_name=$LLM_MODEL \\\n            llm.base_url=$BASE_URL \\\n            llm.async_client=true \\\n            llm.temperature=$TEMPERATURE \\\n            llm.max_context_length=$MAX_CONTEXT_LENGTH \\\n            llm.api_key=$API_KEY \\\n            benchmark.execution.max_tasks=null \\\n            benchmark.execution.max_concurrent=$MAX_CONCURRENT \\\n            benchmark.execution.pass_at_k=$PASS_AT_K \\\n            benchmark.data.data_dir=../../data/gaia-2023-validation \\\n            agent=$AGENT_SET \\\n            hydra.run.dir=${RESULTS_DIR}/$RUN_ID \\\n            2>&1 | tee \"$RESULTS_DIR/${RUN_ID}_output.log\" \n        \n        # Check if run was successful. The command above is a pipeline, so $? would only\n        # report tee's exit status; PIPESTATUS[0] holds the benchmark's own exit status.\n        if [ ${PIPESTATUS[0]} -eq 0 ]; then\n            echo \"Run $i completed successfully\"\n            RESULT_FILE=$(find \"${RESULTS_DIR}/$RUN_ID\" -name \"*accuracy.txt\" 2>/dev/null | head -1)\n            if [ -f \"$RESULT_FILE\" ]; then\n                echo \"Results saved to $RESULT_FILE\"\n            else\n                echo \"Warning: Result file not found for run $i\"\n            fi\n        else\n            echo \"Run $i failed!\"\n        fi\n    ) &\n    \n    # Small delay between launches to avoid simultaneous requests\n    sleep 2\ndone\n\necho \"All $NUM_RUNS runs have been launched in parallel\"\necho \"Waiting for all runs to complete...\"\n\n# Wait for all background tasks to complete\nwait\n\necho \"==========================================\"\necho \"All $NUM_RUNS runs completed!\"\necho \"==========================================\"\n\n# Calculate average scores\necho \"Calculating average scores...\"\nuv run python benchmarks/evaluators/calculate_average_score.py \"$RESULTS_DIR\"\n\necho \"==========================================\"\necho \"Multiple runs evaluation completed!\"\necho \"Check results in: $RESULTS_DIR\"\necho \"Check individual run logs: $RESULTS_DIR/run_*_output.log\"\necho \"==========================================\" \n"
  },
  {
    "path": "apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle-text-2158.sh",
    "content": "#!/bin/bash\n#\n# Launches NUM_RUNS parallel runs of the hle-text-2158 benchmark (benchmarks/common_benchmark.py),\n# tees each run's output to a log file under RESULTS_DIR, then runs calculate_average_score.py on RESULTS_DIR.\n\n# Parse environment variables, use defaults if not set\nLLM_MODEL=${LLM_MODEL:-\"MiroThinker-Models\"}\nBASE_URL=${BASE_URL:-\"https://your-api.com/v1\"}\n\n# Configuration parameters\nNUM_RUNS=${NUM_RUNS:-3}\nBENCHMARK_NAME=\"hle-text-2158\"\nLLM_PROVIDER=${LLM_PROVIDER:-\"qwen\"}\nAGENT_SET=${AGENT_SET:-\"single_agent_keep5\"}\nMAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}\nMAX_CONCURRENT=${MAX_CONCURRENT:-10}\nPASS_AT_K=${PASS_AT_K:-1}\nTEMPERATURE=${TEMPERATURE:-1.0}\nAPI_KEY=${API_KEY:-\"xxx\"}\n\n# Set results directory\nRESULTS_DIR=\"../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}\"\n\necho \"Starting $NUM_RUNS runs of the evaluation...\"\necho \"Results will be saved in: $RESULTS_DIR\"\n\n# Create results directory\nmkdir -p \"$RESULTS_DIR\"\n\n# Launch all parallel tasks\nfor i in $(seq 1 $NUM_RUNS); do\n    echo \"==========================================\"\n    echo \"Launching experiment $i/$NUM_RUNS\"\n    echo \"Output log: please view $RESULTS_DIR/run_${i}_output.log\"\n    echo \"==========================================\"\n    \n    # Set specific identifier for this run\n    RUN_ID=\"run_$i\"\n    \n    # Run experiment (background execution)\n    (\n        uv run python benchmarks/common_benchmark.py \\\n            benchmark=$BENCHMARK_NAME \\\n            benchmark.data.metadata_file=\"standardized_data_original.jsonl\" \\\n            llm=qwen-3 \\\n            llm.provider=$LLM_PROVIDER \\\n            llm.model_name=$LLM_MODEL \\\n            llm.base_url=$BASE_URL \\\n            llm.async_client=true \\\n            llm.temperature=$TEMPERATURE \\\n            llm.max_context_length=$MAX_CONTEXT_LENGTH \\\n            llm.api_key=$API_KEY \\\n            benchmark.execution.max_tasks=null \\\n            benchmark.execution.max_concurrent=$MAX_CONCURRENT \\\n            benchmark.execution.pass_at_k=$PASS_AT_K \\\n            benchmark.data.data_dir=../../data/hle-text-2158 \\\n            agent=$AGENT_SET \\\n            hydra.run.dir=${RESULTS_DIR}/$RUN_ID \\\n            2>&1 | tee \"$RESULTS_DIR/${RUN_ID}_output.log\" \n        \n        # Check if run was successful. The command above is a pipeline, so $? would only\n        # report tee's exit status; PIPESTATUS[0] holds the benchmark's own exit status.\n        if [ ${PIPESTATUS[0]} -eq 0 ]; then\n            echo \"Run $i completed successfully\"\n            RESULT_FILE=$(find \"${RESULTS_DIR}/$RUN_ID\" -name \"*accuracy.txt\" 2>/dev/null | head -1)\n            if [ -f \"$RESULT_FILE\" ]; then\n                echo \"Results saved to $RESULT_FILE\"\n            else\n                echo \"Warning: Result file not found for run $i\"\n            fi\n        else\n            echo \"Run $i failed!\"\n        fi\n    ) &\n    \n    # Small delay between launches to avoid simultaneous requests\n    sleep 2\ndone\n\necho \"All $NUM_RUNS runs have been launched in parallel\"\necho \"Waiting for all runs to complete...\"\n\n# Wait for all background tasks to complete\nwait\n\necho \"==========================================\"\necho \"All $NUM_RUNS runs completed!\"\necho \"==========================================\"\n\n# Calculate average scores\necho \"Calculating average scores...\"\nuv run python benchmarks/evaluators/calculate_average_score.py \"$RESULTS_DIR\"\n\necho \"==========================================\"\necho \"Multiple runs evaluation completed!\"\necho \"Check results in: $RESULTS_DIR\"\necho \"Check individual run logs: $RESULTS_DIR/run_*_output.log\"\necho \"==========================================\" \n"
  },
  {
    "path": "apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle-text-500.sh",
    "content": "#!/bin/bash\n#\n# Launches NUM_RUNS parallel runs of the hle-text-500 benchmark (benchmarks/common_benchmark.py),\n# tees each run's output to a log file under RESULTS_DIR, then runs calculate_average_score.py on RESULTS_DIR.\n\n# Parse environment variables, use defaults if not set\nLLM_MODEL=${LLM_MODEL:-\"MiroThinker-Models\"}\nBASE_URL=${BASE_URL:-\"https://your-api.com/v1\"}\n\n# Configuration parameters\nNUM_RUNS=${NUM_RUNS:-3}\nBENCHMARK_NAME=\"hle-text-500\"\nLLM_PROVIDER=${LLM_PROVIDER:-\"qwen\"}\nAGENT_SET=${AGENT_SET:-\"single_agent_keep5\"}\nMAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}\nMAX_CONCURRENT=${MAX_CONCURRENT:-10}\nPASS_AT_K=${PASS_AT_K:-1}\nTEMPERATURE=${TEMPERATURE:-1.0}\nAPI_KEY=${API_KEY:-\"xxx\"}\n\n# Set results directory\nRESULTS_DIR=\"../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}\"\n\necho \"Starting $NUM_RUNS runs of the evaluation...\"\necho \"Results will be saved in: $RESULTS_DIR\"\n\n# Create results directory\nmkdir -p \"$RESULTS_DIR\"\n\n# Launch all parallel tasks\nfor i in $(seq 1 $NUM_RUNS); do\n    echo \"==========================================\"\n    echo \"Launching experiment $i/$NUM_RUNS\"\n    echo \"Output log: please view $RESULTS_DIR/run_${i}_output.log\"\n    echo \"==========================================\"\n    \n    # Set specific identifier for this run\n    RUN_ID=\"run_$i\"\n    \n    # Run experiment (background execution)\n    (\n        uv run python benchmarks/common_benchmark.py \\\n            benchmark=$BENCHMARK_NAME \\\n            benchmark.data.metadata_file=\"standardized_data_original.jsonl\" \\\n            llm=qwen-3 \\\n            llm.provider=$LLM_PROVIDER \\\n            llm.model_name=$LLM_MODEL \\\n            llm.base_url=$BASE_URL \\\n            llm.async_client=true \\\n            llm.temperature=$TEMPERATURE \\\n            llm.max_context_length=$MAX_CONTEXT_LENGTH \\\n            llm.api_key=$API_KEY \\\n            benchmark.execution.max_tasks=null \\\n            benchmark.execution.max_concurrent=$MAX_CONCURRENT \\\n            benchmark.execution.pass_at_k=$PASS_AT_K \\\n            benchmark.data.data_dir=../../data/hle-text-500 \\\n            agent=$AGENT_SET \\\n            hydra.run.dir=${RESULTS_DIR}/$RUN_ID \\\n            2>&1 | tee \"$RESULTS_DIR/${RUN_ID}_output.log\" \n        \n        # Check if run was successful. The command above is a pipeline, so $? would only\n        # report tee's exit status; PIPESTATUS[0] holds the benchmark's own exit status.\n        if [ ${PIPESTATUS[0]} -eq 0 ]; then\n            echo \"Run $i completed successfully\"\n            RESULT_FILE=$(find \"${RESULTS_DIR}/$RUN_ID\" -name \"*accuracy.txt\" 2>/dev/null | head -1)\n            if [ -f \"$RESULT_FILE\" ]; then\n                echo \"Results saved to $RESULT_FILE\"\n            else\n                echo \"Warning: Result file not found for run $i\"\n            fi\n        else\n            echo \"Run $i failed!\"\n        fi\n    ) &\n    \n    # Small delay between launches to avoid simultaneous requests\n    sleep 2\ndone\n\necho \"All $NUM_RUNS runs have been launched in parallel\"\necho \"Waiting for all runs to complete...\"\n\n# Wait for all background tasks to complete\nwait\n\necho \"==========================================\"\necho \"All $NUM_RUNS runs completed!\"\necho \"==========================================\"\n\n# Calculate average scores\necho \"Calculating average scores...\"\nuv run python benchmarks/evaluators/calculate_average_score.py \"$RESULTS_DIR\"\n\necho \"==========================================\"\necho \"Multiple runs evaluation completed!\"\necho \"Check results in: $RESULTS_DIR\"\necho \"Check individual run logs: $RESULTS_DIR/run_*_output.log\"\necho \"==========================================\" \n"
  },
  {
    "path": "apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle.sh",
    "content": "#!/bin/bash\n\n# Parse environment variables, use defaults if not set\nLLM_MODEL=${LLM_MODEL:-\"MiroThinker-Models\"}\nBASE_URL=${BASE_URL:-\"https://your-api.com/v1\"}\n\n# Configuration parameters\nNUM_RUNS=${NUM_RUNS:-3}\nBENCHMARK_NAME=\"hle\"\nLLM_PROVIDER=${LLM_PROVIDER:-\"qwen\"}\nAGENT_SET=${AGENT_SET:-\"single_agent_keep5\"}\nMAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}\nMAX_CONCURRENT=${MAX_CONCURRENT:-10}\nPASS_AT_K=${PASS_AT_K:-1}\nTEMPERATURE=${TEMPERATURE:-1.0}\nAPI_KEY=${API_KEY:-\"xxx\"}\n\n# Set results directory\nRESULTS_DIR=\"../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}\"\n\necho \"Starting $NUM_RUNS runs of the evaluation...\"\necho \"Results will be saved in: $RESULTS_DIR\"\n\n# Create results directory\nmkdir -p \"$RESULTS_DIR\"\n\n# Launch all parallel tasks\nfor i in $(seq 1 $NUM_RUNS); do\n    echo \"==========================================\"\n    echo \"Launching experiment $i/$NUM_RUNS\"\n    echo \"Output log: please view $RESULTS_DIR/run_${i}_output.log\"\n    echo \"==========================================\"\n    \n    # Set specific identifier for this run\n    RUN_ID=\"run_$i\"\n    \n    # Run experiment (background execution)\n    (\n        uv run python benchmarks/common_benchmark.py \\\n            benchmark=$BENCHMARK_NAME \\\n            benchmark.data.metadata_file=\"standardized_data.jsonl\" \\\n            llm=qwen-3 \\\n            llm.provider=$LLM_PROVIDER \\\n            llm.model_name=$LLM_MODEL \\\n            llm.base_url=$BASE_URL \\\n            llm.async_client=true \\\n            llm.temperature=$TEMPERATURE \\\n            llm.max_context_length=$MAX_CONTEXT_LENGTH \\\n            llm.api_key=$API_KEY \\\n            benchmark.execution.max_tasks=null \\\n            benchmark.execution.max_concurrent=$MAX_CONCURRENT \\\n            benchmark.execution.pass_at_k=$PASS_AT_K \\\n            benchmark.data.data_dir=../../data/hle \\\n            agent=$AGENT_SET 
\\\n            hydra.run.dir=${RESULTS_DIR}/$RUN_ID \\\n            2>&1 | tee \"$RESULTS_DIR/${RUN_ID}_output.log\" \n        \n        # Check if run was successful. PIPESTATUS[0] is the exit status of the\n        # benchmark command itself; a plain $? here would report tee's status.\n        if [ ${PIPESTATUS[0]} -eq 0 ]; then\n            echo \"Run $i completed successfully\"\n            RESULT_FILE=$(find \"${RESULTS_DIR}/$RUN_ID\" -name \"*accuracy.txt\" 2>/dev/null | head -1)\n            if [ -f \"$RESULT_FILE\" ]; then\n                echo \"Results saved to $RESULT_FILE\"\n            else\n                echo \"Warning: Result file not found for run $i\"\n            fi\n        else\n            echo \"Run $i failed!\"\n        fi\n    ) &\n    \n    # Small delay between launches to avoid simultaneous requests\n    sleep 2\ndone\n\necho \"All $NUM_RUNS runs have been launched in parallel\"\necho \"Waiting for all runs to complete...\"\n\n# Wait for all background tasks to complete\nwait\n\necho \"==========================================\"\necho \"All $NUM_RUNS runs completed!\"\necho \"==========================================\"\n\n# Calculate average scores\necho \"Calculating average scores...\"\nuv run python benchmarks/evaluators/calculate_average_score.py \"$RESULTS_DIR\"\n\necho \"==========================================\"\necho \"Multiple runs evaluation completed!\"\necho \"Check results in: $RESULTS_DIR\"\necho \"Check individual run logs: $RESULTS_DIR/run_*_output.log\"\necho \"==========================================\" \n"
  },
  {
    "path": "apps/miroflow-agent/scripts/run_evaluate_multiple_runs_seal-0.sh",
    "content": "#!/bin/bash\n\n# Parse environment variables, use defaults if not set\nLLM_MODEL=${LLM_MODEL:-\"MiroThinker-Models\"}\nBASE_URL=${BASE_URL:-\"https://your-api.com/v1\"}\n\n# Configuration parameters\nNUM_RUNS=${NUM_RUNS:-8}\nBENCHMARK_NAME=\"seal-0\"\nLLM_PROVIDER=${LLM_PROVIDER:-\"qwen\"}\nAGENT_SET=${AGENT_SET:-\"single_agent_keep5\"}\nMAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}\nMAX_CONCURRENT=${MAX_CONCURRENT:-10}\nPASS_AT_K=${PASS_AT_K:-1}\nTEMPERATURE=${TEMPERATURE:-1.0}\nAPI_KEY=${API_KEY:-\"xxx\"}\n\n# Set results directory\nRESULTS_DIR=\"../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}\"\n\necho \"Starting $NUM_RUNS runs of the evaluation...\"\necho \"Results will be saved in: $RESULTS_DIR\"\n\n# Create results directory\nmkdir -p \"$RESULTS_DIR\"\n\n# Launch all parallel tasks\nfor i in $(seq 1 $NUM_RUNS); do\n    echo \"==========================================\"\n    echo \"Launching experiment $i/$NUM_RUNS\"\n    echo \"Output log: please view $RESULTS_DIR/run_${i}_output.log\"\n    echo \"==========================================\"\n    \n    # Set specific identifier for this run\n    RUN_ID=\"run_$i\"\n    \n    # Run experiment (background execution)\n    (\n        uv run python benchmarks/common_benchmark.py \\\n            benchmark=$BENCHMARK_NAME \\\n            benchmark.data.metadata_file=\"standardized_data.jsonl\" \\\n            llm=qwen-3 \\\n            llm.provider=$LLM_PROVIDER \\\n            llm.model_name=$LLM_MODEL \\\n            llm.base_url=$BASE_URL \\\n            llm.async_client=true \\\n            llm.temperature=$TEMPERATURE \\\n            llm.max_context_length=$MAX_CONTEXT_LENGTH \\\n            llm.api_key=$API_KEY \\\n            benchmark.execution.max_tasks=null \\\n            benchmark.execution.max_concurrent=$MAX_CONCURRENT \\\n            benchmark.execution.pass_at_k=$PASS_AT_K \\\n            benchmark.data.data_dir=../../data/seal-0 \\\n            
agent=$AGENT_SET \\\n            hydra.run.dir=${RESULTS_DIR}/$RUN_ID \\\n            2>&1 | tee \"$RESULTS_DIR/${RUN_ID}_output.log\" \n        \n        # Check if run was successful. PIPESTATUS[0] is the exit status of the\n        # benchmark command itself; a plain $? here would report tee's status.\n        if [ ${PIPESTATUS[0]} -eq 0 ]; then\n            echo \"Run $i completed successfully\"\n            RESULT_FILE=$(find \"${RESULTS_DIR}/$RUN_ID\" -name \"*accuracy.txt\" 2>/dev/null | head -1)\n            if [ -f \"$RESULT_FILE\" ]; then\n                echo \"Results saved to $RESULT_FILE\"\n            else\n                echo \"Warning: Result file not found for run $i\"\n            fi\n        else\n            echo \"Run $i failed!\"\n        fi\n    ) &\n    \n    # Small delay between launches to avoid simultaneous requests\n    sleep 2\ndone\n\necho \"All $NUM_RUNS runs have been launched in parallel\"\necho \"Waiting for all runs to complete...\"\n\n# Wait for all background tasks to complete\nwait\n\necho \"==========================================\"\necho \"All $NUM_RUNS runs completed!\"\necho \"==========================================\"\n\n# Calculate average scores\necho \"Calculating average scores...\"\nuv run python benchmarks/evaluators/calculate_average_score.py \"$RESULTS_DIR\"\n\necho \"==========================================\"\necho \"Multiple runs evaluation completed!\"\necho \"Check results in: $RESULTS_DIR\"\necho \"Check individual run logs: $RESULTS_DIR/run_*_output.log\"\necho \"==========================================\" \n"
  },
  {
    "path": "apps/miroflow-agent/scripts/run_evaluate_multiple_runs_webwalkerqa.sh",
    "content": "#!/bin/bash\n\n# Parse environment variables, use defaults if not set\nLLM_MODEL=${LLM_MODEL:-\"MiroThinker-Models\"}\nBASE_URL=${BASE_URL:-\"https://your-api.com/v1\"}\n\n# Configuration parameters\nNUM_RUNS=${NUM_RUNS:-3}\nBENCHMARK_NAME=\"webwalkerqa\"\nLLM_PROVIDER=${LLM_PROVIDER:-\"qwen\"}\nAGENT_SET=${AGENT_SET:-\"single_agent_keep5\"}\nMAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}\nMAX_CONCURRENT=${MAX_CONCURRENT:-10}\nPASS_AT_K=${PASS_AT_K:-1}\nTEMPERATURE=${TEMPERATURE:-1.0}\nAPI_KEY=${API_KEY:-\"xxx\"}\n\n# Set results directory\nRESULTS_DIR=\"../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}\"\n\necho \"Starting $NUM_RUNS runs of the evaluation...\"\necho \"Results will be saved in: $RESULTS_DIR\"\n\n# Create results directory\nmkdir -p \"$RESULTS_DIR\"\n\n# Launch all parallel tasks\nfor i in $(seq 1 $NUM_RUNS); do\n    echo \"==========================================\"\n    echo \"Launching experiment $i/$NUM_RUNS\"\n    echo \"Output log: please view $RESULTS_DIR/run_${i}_output.log\"\n    echo \"==========================================\"\n    \n    # Set specific identifier for this run\n    RUN_ID=\"run_$i\"\n    \n    # Run experiment (background execution)\n    (\n        uv run python benchmarks/common_benchmark.py \\\n            benchmark=$BENCHMARK_NAME \\\n            benchmark.data.metadata_file=\"standardized_data.jsonl\" \\\n            llm=qwen-3 \\\n            llm.provider=$LLM_PROVIDER \\\n            llm.model_name=$LLM_MODEL \\\n            llm.base_url=$BASE_URL \\\n            llm.async_client=true \\\n            llm.temperature=$TEMPERATURE \\\n            llm.max_context_length=$MAX_CONTEXT_LENGTH \\\n            llm.api_key=$API_KEY \\\n            benchmark.execution.max_tasks=null \\\n            benchmark.execution.max_concurrent=$MAX_CONCURRENT \\\n            benchmark.execution.pass_at_k=$PASS_AT_K \\\n            benchmark.data.data_dir=../../data/webwalkerqa \\\n            
agent=$AGENT_SET \\\n            hydra.run.dir=${RESULTS_DIR}/$RUN_ID \\\n            2>&1 | tee \"$RESULTS_DIR/${RUN_ID}_output.log\" \n        \n        # Check if run was successful. PIPESTATUS[0] is the exit status of the\n        # benchmark command itself; a plain $? here would report tee's status.\n        if [ ${PIPESTATUS[0]} -eq 0 ]; then\n            echo \"Run $i completed successfully\"\n            RESULT_FILE=$(find \"${RESULTS_DIR}/$RUN_ID\" -name \"*accuracy.txt\" 2>/dev/null | head -1)\n            if [ -f \"$RESULT_FILE\" ]; then\n                echo \"Results saved to $RESULT_FILE\"\n            else\n                echo \"Warning: Result file not found for run $i\"\n            fi\n        else\n            echo \"Run $i failed!\"\n        fi\n    ) &\n    \n    # Small delay between launches to avoid simultaneous requests\n    sleep 2\ndone\n\necho \"All $NUM_RUNS runs have been launched in parallel\"\necho \"Waiting for all runs to complete...\"\n\n# Wait for all background tasks to complete\nwait\n\necho \"==========================================\"\necho \"All $NUM_RUNS runs completed!\"\necho \"==========================================\"\n\n# Calculate average scores\necho \"Calculating average scores...\"\nuv run python benchmarks/evaluators/calculate_average_score.py \"$RESULTS_DIR\"\n\necho \"==========================================\"\necho \"Multiple runs evaluation completed!\"\necho \"Check results in: $RESULTS_DIR\"\necho \"Check individual run logs: $RESULTS_DIR/run_*_output.log\"\necho \"==========================================\" \n"
  },
  {
    "path": "apps/miroflow-agent/scripts/run_evaluate_multiple_runs_xbench_deepsearch.sh",
    "content": "#!/bin/bash\n\n# Parse environment variables, use defaults if not set\nLLM_MODEL=${LLM_MODEL:-\"MiroThinker-Models\"}\nBASE_URL=${BASE_URL:-\"https://your-api.com/v1\"}\n\n# Configuration parameters\nNUM_RUNS=${NUM_RUNS:-8}\nBENCHMARK_NAME=\"xbench_deepsearch\"\nLLM_PROVIDER=${LLM_PROVIDER:-\"qwen\"}\nAGENT_SET=${AGENT_SET:-\"single_agent_keep5\"}\nMAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}\nMAX_CONCURRENT=${MAX_CONCURRENT:-10}\nPASS_AT_K=${PASS_AT_K:-1}\nTEMPERATURE=${TEMPERATURE:-1.0}\nAPI_KEY=${API_KEY:-\"xxx\"}\n\n# Set results directory\nRESULTS_DIR=\"../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}\"\n\necho \"Starting $NUM_RUNS runs of the evaluation...\"\necho \"Results will be saved in: $RESULTS_DIR\"\n\n# Create results directory\nmkdir -p \"$RESULTS_DIR\"\n\n# Launch all parallel tasks\nfor i in $(seq 1 $NUM_RUNS); do\n    echo \"==========================================\"\n    echo \"Launching experiment $i/$NUM_RUNS\"\n    echo \"Output log: please view $RESULTS_DIR/run_${i}_output.log\"\n    echo \"==========================================\"\n    \n    # Set specific identifier for this run\n    RUN_ID=\"run_$i\"\n    \n    # Run experiment (background execution)\n    (\n        uv run python benchmarks/common_benchmark.py \\\n            benchmark=$BENCHMARK_NAME \\\n            benchmark.data.metadata_file=\"standardized_data.jsonl\" \\\n            llm=qwen-3 \\\n            llm.provider=$LLM_PROVIDER \\\n            llm.model_name=$LLM_MODEL \\\n            llm.base_url=$BASE_URL \\\n            llm.async_client=true \\\n            llm.temperature=$TEMPERATURE \\\n            llm.max_context_length=$MAX_CONTEXT_LENGTH \\\n            llm.api_key=$API_KEY \\\n            benchmark.execution.max_tasks=null \\\n            benchmark.execution.max_concurrent=$MAX_CONCURRENT \\\n            benchmark.execution.pass_at_k=$PASS_AT_K \\\n            benchmark.data.data_dir=../../data/xbench_deepsearch \\\n    
        agent=$AGENT_SET \\\n            hydra.run.dir=${RESULTS_DIR}/$RUN_ID \\\n            2>&1 | tee \"$RESULTS_DIR/${RUN_ID}_output.log\" \n        \n        # Check if run was successful. PIPESTATUS[0] is the exit status of the\n        # benchmark command itself; a plain $? here would report tee's status.\n        if [ ${PIPESTATUS[0]} -eq 0 ]; then\n            echo \"Run $i completed successfully\"\n            RESULT_FILE=$(find \"${RESULTS_DIR}/$RUN_ID\" -name \"*accuracy.txt\" 2>/dev/null | head -1)\n            if [ -f \"$RESULT_FILE\" ]; then\n                echo \"Results saved to $RESULT_FILE\"\n            else\n                echo \"Warning: Result file not found for run $i\"\n            fi\n        else\n            echo \"Run $i failed!\"\n        fi\n    ) &\n    \n    # Small delay between launches to avoid simultaneous requests\n    sleep 2\ndone\n\necho \"All $NUM_RUNS runs have been launched in parallel\"\necho \"Waiting for all runs to complete...\"\n\n# Wait for all background tasks to complete\nwait\n\necho \"==========================================\"\necho \"All $NUM_RUNS runs completed!\"\necho \"==========================================\"\n\n# Calculate average scores\necho \"Calculating average scores...\"\nuv run python benchmarks/evaluators/calculate_average_score.py \"$RESULTS_DIR\"\n\necho \"==========================================\"\necho \"Multiple runs evaluation completed!\"\necho \"Check results in: $RESULTS_DIR\"\necho \"Check individual run logs: $RESULTS_DIR/run_*_output.log\"\necho \"==========================================\" \n"
  },
  {
    "path": "apps/miroflow-agent/src/__init__.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"MiroFlow Agent - A modular agent framework for task execution.\"\"\"\n\nfrom .core.orchestrator import Orchestrator\nfrom .core.pipeline import create_pipeline_components, execute_task_pipeline\nfrom .io.output_formatter import OutputFormatter\nfrom .llm.factory import ClientFactory\nfrom .logging.task_logger import TaskLog, bootstrap_logger\n\n__all__ = [\n    \"Orchestrator\",\n    \"create_pipeline_components\",\n    \"execute_task_pipeline\",\n    \"OutputFormatter\",\n    \"ClientFactory\",\n    \"TaskLog\",\n    \"bootstrap_logger\",\n]\n"
  },
  {
    "path": "apps/miroflow-agent/src/config/__init__.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"Configuration module for MiroFlow Agent.\"\"\"\n\nfrom .settings import (\n    create_mcp_server_parameters,\n    expose_sub_agents_as_tools,\n    get_env_info,\n)\n\n__all__ = [\n    \"create_mcp_server_parameters\",\n    \"expose_sub_agents_as_tools\",\n    \"get_env_info\",\n]\n"
  },
  {
    "path": "apps/miroflow-agent/src/config/settings.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nConfiguration settings and MCP server parameter management.\n\nThis module handles:\n- Loading environment variables for API keys and service URLs\n- Creating MCP server configurations for different tools\n- Exposing sub-agents as callable tools\n- Collecting environment information for logging\n\"\"\"\n\nimport os\nimport sys\n\nfrom dotenv import load_dotenv\nfrom mcp import StdioServerParameters\nfrom omegaconf import DictConfig\n\n# Load environment variables from .env file\nload_dotenv()\n\n# API for Google Search\nSERPER_API_KEY = os.environ.get(\"SERPER_API_KEY\")\nSERPER_BASE_URL = os.environ.get(\"SERPER_BASE_URL\", \"https://google.serper.dev\")\n\n# API for Web Scraping\nJINA_API_KEY = os.environ.get(\"JINA_API_KEY\")\nJINA_BASE_URL = os.environ.get(\"JINA_BASE_URL\", \"https://r.jina.ai\")\n\n# API for Linux Sandbox\nE2B_API_KEY = os.environ.get(\"E2B_API_KEY\")\n\n# API for Open-Source Audio Transcription Tool\nWHISPER_BASE_URL = os.environ.get(\"WHISPER_BASE_URL\")\nWHISPER_API_KEY = os.environ.get(\"WHISPER_API_KEY\")\nWHISPER_MODEL_NAME = os.environ.get(\"WHISPER_MODEL_NAME\")\n\n# API for Open-Source VQA Tool\nVISION_API_KEY = os.environ.get(\"VISION_API_KEY\")\nVISION_BASE_URL = os.environ.get(\"VISION_BASE_URL\")\nVISION_MODEL_NAME = os.environ.get(\"VISION_MODEL_NAME\")\n\n# API for Open-Source Reasoning Tool\nREASONING_API_KEY = os.environ.get(\"REASONING_API_KEY\")\nREASONING_BASE_URL = os.environ.get(\"REASONING_BASE_URL\")\nREASONING_MODEL_NAME = os.environ.get(\"REASONING_MODEL_NAME\")\n\n# API for Claude Sonnet 3.7 as Commercial Tools\nANTHROPIC_API_KEY = os.environ.get(\"ANTHROPIC_API_KEY\")\nANTHROPIC_BASE_URL = os.environ.get(\"ANTHROPIC_BASE_URL\", \"https://api.anthropic.com\")\n\n# API Keys for Commercial Tools\nOPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\")\nOPENAI_BASE_URL = os.environ.get(\"OPENAI_BASE_URL\", 
\"https://api.openai.com/v1\")\n\n# API for Sogou Search\nTENCENTCLOUD_SECRET_ID = os.environ.get(\"TENCENTCLOUD_SECRET_ID\")\nTENCENTCLOUD_SECRET_KEY = os.environ.get(\"TENCENTCLOUD_SECRET_KEY\")\n\n# API for Summary LLM\nSUMMARY_LLM_API_KEY = os.environ.get(\"SUMMARY_LLM_API_KEY\")\nSUMMARY_LLM_BASE_URL = os.environ.get(\"SUMMARY_LLM_BASE_URL\")\nSUMMARY_LLM_MODEL_NAME = os.environ.get(\"SUMMARY_LLM_MODEL_NAME\")\n\n\n# MCP server configuration generation function\ndef create_mcp_server_parameters(cfg: DictConfig, agent_cfg: DictConfig):\n    \"\"\"\n    Create MCP server configurations based on agent configuration.\n\n    Dynamically generates StdioServerParameters for each tool specified in the\n    agent configuration. Each tool type (search, python, vqa, etc.) has its own\n    MCP server with appropriate environment variables.\n\n    Args:\n        cfg: Global Hydra configuration object\n        agent_cfg: Agent-specific configuration containing 'tools' and 'tool_blacklist'\n\n    Returns:\n        Tuple of (configs, blacklist) where:\n        - configs: List of dicts with 'name' and 'params' (StdioServerParameters)\n        - blacklist: Set of (server_name, tool_name) tuples to exclude\n    \"\"\"\n    configs = []\n\n    if (\n        agent_cfg.get(\"tools\", None) is not None\n        and \"tool-google-search\" in agent_cfg[\"tools\"]\n    ):\n        if not SERPER_API_KEY:\n            raise ValueError(\n                \"SERPER_API_KEY not set, tool-google-search will be unavailable.\"\n            )\n\n        configs.append(\n            {\n                \"name\": \"tool-google-search\",\n                \"params\": StdioServerParameters(\n                    command=sys.executable,\n                    args=[\n                        \"-m\",\n                        \"miroflow_tools.mcp_servers.searching_google_mcp_server\",\n                    ],\n                    env={\n                        \"SERPER_API_KEY\": SERPER_API_KEY,\n              
          \"SERPER_BASE_URL\": SERPER_BASE_URL,\n                        \"JINA_API_KEY\": JINA_API_KEY,\n                        \"JINA_BASE_URL\": JINA_BASE_URL,\n                    },\n                ),\n            }\n        )\n\n    if (\n        agent_cfg.get(\"tools\", None) is not None\n        and \"tool-sogou-search\" in agent_cfg[\"tools\"]\n    ):\n        configs.append(\n            {\n                \"name\": \"tool-sogou-search\",\n                \"params\": StdioServerParameters(\n                    command=sys.executable,\n                    args=[\n                        \"-m\",\n                        \"miroflow_tools.mcp_servers.searching_sogou_mcp_server\",\n                    ],\n                    env={\n                        \"TENCENTCLOUD_SECRET_ID\": TENCENTCLOUD_SECRET_ID,\n                        \"TENCENTCLOUD_SECRET_KEY\": TENCENTCLOUD_SECRET_KEY,\n                        \"JINA_API_KEY\": JINA_API_KEY,\n                        \"JINA_BASE_URL\": JINA_BASE_URL,\n                    },\n                ),\n            }\n        )\n\n    if agent_cfg.get(\"tools\", None) is not None and \"tool-python\" in agent_cfg[\"tools\"]:\n        configs.append(\n            {\n                \"name\": \"tool-python\",\n                \"params\": StdioServerParameters(\n                    command=sys.executable,\n                    args=[\"-m\", \"miroflow_tools.mcp_servers.python_mcp_server\"],\n                    env={\"E2B_API_KEY\": E2B_API_KEY},\n                ),\n            }\n        )\n\n    if agent_cfg.get(\"tools\", None) is not None and \"tool-vqa\" in agent_cfg[\"tools\"]:\n        configs.append(\n            {\n                \"name\": \"tool-vqa\",\n                \"params\": StdioServerParameters(\n                    command=sys.executable,\n                    args=[\"-m\", \"miroflow_tools.mcp_servers.vision_mcp_server\"],\n                    env={\n                        \"OPENAI_API_KEY\": 
OPENAI_API_KEY,\n                        \"OPENAI_BASE_URL\": OPENAI_BASE_URL,\n                    },\n                ),\n            }\n        )\n\n    if agent_cfg.get(\"tools\", None) is not None and \"tool-vqa-os\" in agent_cfg[\"tools\"]:\n        configs.append(\n            {\n                \"name\": \"tool-vqa-os\",\n                \"params\": StdioServerParameters(\n                    command=sys.executable,\n                    args=[\"-m\", \"miroflow_tools.mcp_servers.vision_mcp_server_os\"],\n                    env={\n                        \"VISION_API_KEY\": VISION_API_KEY,\n                        \"VISION_BASE_URL\": VISION_BASE_URL,\n                        \"VISION_MODEL_NAME\": VISION_MODEL_NAME,\n                    },\n                ),\n            }\n        )\n\n    if (\n        agent_cfg.get(\"tools\", None) is not None\n        and \"tool-transcribe\" in agent_cfg[\"tools\"]\n    ):\n        configs.append(\n            {\n                \"name\": \"tool-transcribe\",\n                \"params\": StdioServerParameters(\n                    command=sys.executable,\n                    args=[\"-m\", \"miroflow_tools.mcp_servers.audio_mcp_server\"],\n                    env={\n                        \"OPENAI_API_KEY\": OPENAI_API_KEY,\n                        \"OPENAI_BASE_URL\": OPENAI_BASE_URL,\n                    },\n                ),\n            }\n        )\n\n    if (\n        agent_cfg.get(\"tools\", None) is not None\n        and \"tool-transcribe-os\" in agent_cfg[\"tools\"]\n    ):\n        configs.append(\n            {\n                \"name\": \"tool-transcribe-os\",\n                \"params\": StdioServerParameters(\n                    command=sys.executable,\n                    args=[\"-m\", \"miroflow_tools.mcp_servers.audio_mcp_server_os\"],\n                    env={\n                        \"WHISPER_BASE_URL\": WHISPER_BASE_URL,\n                        \"WHISPER_API_KEY\": WHISPER_API_KEY,\n           
             \"WHISPER_MODEL_NAME\": WHISPER_MODEL_NAME,\n                    },\n                ),\n            }\n        )\n\n    if (\n        agent_cfg.get(\"tools\", None) is not None\n        and \"tool-reasoning\" in agent_cfg[\"tools\"]\n    ):\n        configs.append(\n            {\n                \"name\": \"tool-reasoning\",\n                \"params\": StdioServerParameters(\n                    command=sys.executable,\n                    args=[\n                        \"-m\",\n                        \"miroflow_tools.mcp_servers.reasoning_mcp_server\",\n                    ],\n                    env={\n                        \"ANTHROPIC_API_KEY\": ANTHROPIC_API_KEY,\n                        \"ANTHROPIC_BASE_URL\": ANTHROPIC_BASE_URL,\n                    },\n                ),\n            }\n        )\n\n    if (\n        agent_cfg.get(\"tools\", None) is not None\n        and \"tool-reasoning-os\" in agent_cfg[\"tools\"]\n    ):\n        configs.append(\n            {\n                \"name\": \"tool-reasoning-os\",\n                \"params\": StdioServerParameters(\n                    command=sys.executable,\n                    args=[\n                        \"-m\",\n                        \"miroflow_tools.mcp_servers.reasoning_mcp_server_os\",\n                    ],\n                    env={\n                        \"REASONING_API_KEY\": REASONING_API_KEY,\n                        \"REASONING_BASE_URL\": REASONING_BASE_URL,\n                        \"REASONING_MODEL_NAME\": REASONING_MODEL_NAME,\n                    },\n                ),\n            }\n        )\n\n    # reader\n    if agent_cfg.get(\"tools\", None) is not None and \"tool-reader\" in agent_cfg[\"tools\"]:\n        configs.append(\n            {\n                \"name\": \"tool-reader\",\n                \"params\": StdioServerParameters(\n                    command=sys.executable,\n                    args=[\"-m\", \"markitdown_mcp\"],\n                ),\n     
       }\n        )\n\n    if (\n        agent_cfg.get(\"tools\", None) is not None\n        and \"tool-reading\" in agent_cfg[\"tools\"]\n    ):\n        configs.append(\n            {\n                \"name\": \"tool-reading\",\n                \"params\": StdioServerParameters(\n                    command=sys.executable,\n                    args=[\"-m\", \"miroflow_tools.mcp_servers.reading_mcp_server\"],\n                ),\n            }\n        )\n\n    if (\n        agent_cfg.get(\"tools\", None) is not None\n        and \"search_and_scrape_webpage\" in agent_cfg[\"tools\"]\n    ):\n        configs.append(\n            {\n                \"name\": \"search_and_scrape_webpage\",\n                \"params\": StdioServerParameters(\n                    command=sys.executable,\n                    args=[\n                        \"-m\",\n                        \"miroflow_tools.dev_mcp_servers.search_and_scrape_webpage\",\n                    ],\n                    env={\n                        \"SERPER_API_KEY\": SERPER_API_KEY,\n                        \"SERPER_BASE_URL\": SERPER_BASE_URL,\n                        \"TENCENTCLOUD_SECRET_ID\": TENCENTCLOUD_SECRET_ID,\n                        \"TENCENTCLOUD_SECRET_KEY\": TENCENTCLOUD_SECRET_KEY,\n                    },\n                ),\n            }\n        )\n\n    if (\n        agent_cfg.get(\"tools\", None) is not None\n        and \"jina_scrape_llm_summary\" in agent_cfg[\"tools\"]\n    ):\n        configs.append(\n            {\n                \"name\": \"jina_scrape_llm_summary\",\n                \"params\": StdioServerParameters(\n                    command=sys.executable,\n                    args=[\n                        \"-m\",\n                        \"miroflow_tools.dev_mcp_servers.jina_scrape_llm_summary\",\n                    ],\n                    env={\n                        \"JINA_API_KEY\": JINA_API_KEY,\n                        \"JINA_BASE_URL\": JINA_BASE_URL,\n            
            \"SUMMARY_LLM_BASE_URL\": SUMMARY_LLM_BASE_URL,\n                        \"SUMMARY_LLM_MODEL_NAME\": SUMMARY_LLM_MODEL_NAME,\n                        \"SUMMARY_LLM_API_KEY\": SUMMARY_LLM_API_KEY,\n                    },\n                ),\n            }\n        )\n\n    if (\n        agent_cfg.get(\"tools\", None) is not None\n        and \"stateless_python\" in agent_cfg[\"tools\"]\n    ):\n        configs.append(\n            {\n                \"name\": \"stateless_python\",\n                \"params\": StdioServerParameters(\n                    command=sys.executable,\n                    args=[\n                        \"-m\",\n                        \"miroflow_tools.dev_mcp_servers.stateless_python_server\",\n                    ],\n                    env={\"E2B_API_KEY\": E2B_API_KEY},\n                ),\n            }\n        )\n\n    if (\n        agent_cfg.get(\"tools\", None) is not None\n        and \"task_planner\" in agent_cfg[\"tools\"]\n    ):\n        # Generate a random UUID for each MCP server instance to ensure isolation\n        # Each time create_mcp_server_parameters is called, a new UUID is generated\n        # This automatically isolates todo lists for concurrent tasks\n        import uuid\n\n        todo_task_id = str(uuid.uuid4())\n        configs.append(\n            {\n                \"name\": \"task_planner\",\n                \"params\": StdioServerParameters(\n                    command=sys.executable,\n                    args=[\n                        \"-m\",\n                        \"miroflow_tools.dev_mcp_servers.task_planner\",\n                    ],\n                    env={\"TASK_ID\": todo_task_id},\n                ),\n            }\n        )\n\n    blacklist = set()\n    for black_list_item in agent_cfg.get(\"tool_blacklist\", []):\n        blacklist.add((black_list_item[0], black_list_item[1]))\n    return configs, blacklist\n\n\ndef expose_sub_agents_as_tools(sub_agents_cfg: DictConfig):\n    
\"\"\"\n    Convert sub-agent configurations into tool definitions for the main agent.\n\n    This allows the main agent to invoke sub-agents (like the browsing agent)\n    as if they were regular MCP tools, enabling a hierarchical agent architecture.\n\n    Args:\n        sub_agents_cfg: Configuration containing sub-agent definitions\n\n    Returns:\n        List of server parameter dicts, each with 'name' and 'tools' keys.\n        Each tool includes 'name', 'description', and 'schema' for the sub-agent.\n    \"\"\"\n    sub_agents_server_params = []\n    for sub_agent in sub_agents_cfg.keys():\n        if \"agent-browsing\" in sub_agent:\n            sub_agents_server_params.append(\n                dict(\n                    name=\"agent-browsing\",\n                    tools=[\n                        dict(\n                            name=\"search_and_browse\",\n                            description=\"This tool is an agent that performs the subtask of searching and browsing the web for specific missing information and generating the desired answer. The subtask should be clearly defined, include relevant background, and focus on factual gaps. It does not perform vague or speculative subtasks. \\nArgs: \\n\\tsubtask: the subtask to be performed. \\nReturns: \\n\\tthe result of the subtask. 
\",\n                            schema={\n                                \"type\": \"object\",\n                                \"properties\": {\n                                    \"subtask\": {\"title\": \"Subtask\", \"type\": \"string\"}\n                                },\n                                \"required\": [\"subtask\"],\n                                \"title\": \"search_and_browseArguments\",\n                            },\n                        )\n                    ],\n                )\n            )\n    return sub_agents_server_params\n\n\ndef get_env_info(cfg: DictConfig) -> dict:\n    \"\"\"\n    Collect current configuration and environment information for logging.\n\n    Gathers LLM settings, agent configuration, API key availability (masked),\n    and base URLs. Used for debugging and task log enrichment.\n\n    Args:\n        cfg: Hydra configuration object\n\n    Returns:\n        Dictionary containing:\n        - LLM configuration (provider, model, temperature, etc.)\n        - Agent configuration (max turns for main/sub agents)\n        - API key availability flags (boolean, not actual keys)\n        - Service base URLs\n    \"\"\"\n    return {\n        # LLM Configuration\n        \"llm_provider\": cfg.llm.provider,\n        \"llm_base_url\": cfg.llm.base_url,\n        \"llm_model_name\": cfg.llm.model_name,\n        \"llm_temperature\": cfg.llm.temperature,\n        \"llm_top_p\": cfg.llm.top_p,\n        \"llm_min_p\": cfg.llm.min_p,\n        \"llm_top_k\": cfg.llm.top_k,\n        \"llm_max_tokens\": cfg.llm.max_tokens,\n        \"llm_repetition_penalty\": cfg.llm.repetition_penalty,\n        \"llm_async_client\": cfg.llm.async_client,\n        \"keep_tool_result\": cfg.agent.keep_tool_result,\n        # Agent Configuration\n        \"main_agent_max_turns\": cfg.agent.main_agent.max_turns,\n        **(\n            {\n                f\"sub_{sub_agent}_max_turns\": cfg.agent.sub_agents[sub_agent].max_turns\n               
 for sub_agent in cfg.agent.sub_agents\n            }\n            if cfg.agent.sub_agents is not None\n            else {}\n        ),\n        # API key availability flags (booleans only; key values are never included)\n        \"has_serper_api_key\": bool(SERPER_API_KEY),\n        \"has_jina_api_key\": bool(JINA_API_KEY),\n        \"has_anthropic_api_key\": bool(ANTHROPIC_API_KEY),\n        \"has_openai_api_key\": bool(OPENAI_API_KEY),\n        \"has_e2b_api_key\": bool(E2B_API_KEY),\n        \"has_tencent_secret_id\": bool(TENCENTCLOUD_SECRET_ID),\n        \"has_tencent_secret_key\": bool(TENCENTCLOUD_SECRET_KEY),\n        \"has_summary_llm_api_key\": bool(SUMMARY_LLM_API_KEY),\n        # Base URLs\n        \"openai_base_url\": OPENAI_BASE_URL,\n        \"anthropic_base_url\": ANTHROPIC_BASE_URL,\n        \"jina_base_url\": JINA_BASE_URL,\n        \"serper_base_url\": SERPER_BASE_URL,\n        \"whisper_base_url\": WHISPER_BASE_URL,\n        \"vision_base_url\": VISION_BASE_URL,\n        \"reasoning_base_url\": REASONING_BASE_URL,\n        \"summary_llm_base_url\": SUMMARY_LLM_BASE_URL,\n    }\n"
  },
  {
    "path": "apps/miroflow-agent/src/core/__init__.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"Core module containing orchestrator and pipeline components.\"\"\"\n\nfrom .answer_generator import AnswerGenerator\nfrom .orchestrator import Orchestrator\nfrom .pipeline import create_pipeline_components, execute_task_pipeline\nfrom .stream_handler import StreamHandler\nfrom .tool_executor import ToolExecutor\n\n__all__ = [\n    \"AnswerGenerator\",\n    \"Orchestrator\",\n    \"StreamHandler\",\n    \"ToolExecutor\",\n    \"create_pipeline_components\",\n    \"execute_task_pipeline\",\n]\n"
  },
  {
    "path": "apps/miroflow-agent/src/core/answer_generator.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nAnswer generator module for final answer generation and context management.\n\nThis module provides the AnswerGenerator class that handles:\n- LLM call processing\n- Failure summary generation for context compression\n- Final answer generation with retries\n- Context management fallback strategies\n\"\"\"\n\nimport logging\nfrom typing import Any, Dict, List, Optional, Tuple\n\nfrom omegaconf import DictConfig\n\nfrom ..io.output_formatter import OutputFormatter\nfrom ..llm.base_client import BaseClient\nfrom ..logging.task_logger import TaskLog\nfrom ..utils.parsing_utils import extract_failure_experience_summary\nfrom ..utils.prompt_utils import (\n    FAILURE_SUMMARY_ASSISTANT_PREFIX,\n    FAILURE_SUMMARY_PROMPT,\n    FORMAT_ERROR_MESSAGE,\n    generate_agent_summarize_prompt,\n)\nfrom ..utils.wrapper_utils import ErrorBox, ResponseBox\nfrom .stream_handler import StreamHandler\n\nlogger = logging.getLogger(__name__)\n\n# Safety limits for retry loops\nDEFAULT_MAX_FINAL_ANSWER_RETRIES = 3\n\n\nclass AnswerGenerator:\n    \"\"\"\n    Generator for final answers with context management support.\n\n    Handles the generation of final answers, failure summaries for retry,\n    and various fallback strategies based on context management settings.\n    \"\"\"\n\n    def __init__(\n        self,\n        llm_client: BaseClient,\n        output_formatter: OutputFormatter,\n        task_log: TaskLog,\n        stream_handler: StreamHandler,\n        cfg: DictConfig,\n        intermediate_boxed_answers: List[str],\n    ):\n        \"\"\"\n        Initialize the answer generator.\n\n        Args:\n            llm_client: The LLM client for API calls\n            output_formatter: Formatter for output processing\n            task_log: Logger for task execution\n            stream_handler: Handler for streaming events\n            cfg: Configuration object\n  
          intermediate_boxed_answers: List to track intermediate answers\n        \"\"\"\n        self.llm_client = llm_client\n        self.output_formatter = output_formatter\n        self.task_log = task_log\n        self.stream = stream_handler\n        self.cfg = cfg\n        self.intermediate_boxed_answers = intermediate_boxed_answers\n\n        # Context management settings\n        self.context_compress_limit = cfg.agent.get(\"context_compress_limit\", 0)\n        self.max_final_answer_retries = (\n            DEFAULT_MAX_FINAL_ANSWER_RETRIES if cfg.agent.keep_tool_result == -1 else 1\n        )\n        self.retry_with_summary = cfg.agent.get(\"retry_with_summary\", True)\n\n    async def handle_llm_call(\n        self,\n        system_prompt: str,\n        message_history: List[Dict[str, Any]],\n        tool_definitions: List[Dict],\n        step_id: int,\n        purpose: str = \"\",\n        agent_type: str = \"main\",\n    ) -> Tuple[Optional[str], bool, Optional[Any], List[Dict[str, Any]]]:\n        \"\"\"\n        Unified LLM call and logging processing.\n\n        Args:\n            system_prompt: System prompt for the LLM\n            message_history: Conversation history\n            tool_definitions: Available tool definitions\n            step_id: Current step ID for logging\n            purpose: Description of the call purpose\n            agent_type: Type of agent making the call\n\n        Returns:\n            Tuple of (response_text, should_break, tool_calls_info, message_history)\n        \"\"\"\n        original_message_history = message_history\n        try:\n            response, message_history = await self.llm_client.create_message(\n                system_prompt=system_prompt,\n                message_history=message_history,\n                tool_definitions=tool_definitions,\n                keep_tool_result=self.cfg.agent.keep_tool_result,\n                step_id=step_id,\n                task_log=self.task_log,\n                
agent_type=agent_type,\n            )\n\n            if ErrorBox.is_error_box(response):\n                await self.stream.show_error(str(response))\n                response = None\n\n            if ResponseBox.is_response_box(response):\n                if response.has_extra_info():\n                    extra_info = response.get_extra_info()\n                    if extra_info.get(\"warning_msg\"):\n                        await self.stream.show_error(\n                            extra_info.get(\"warning_msg\", \"Empty warning message\")\n                        )\n                response = response.get_response()\n\n            # Check if response is None (indicating an error occurred)\n            if response is None:\n                self.task_log.log_step(\n                    \"error\",\n                    f\"{purpose} | LLM Call Failed\",\n                    f\"{purpose} failed - no response received\",\n                )\n                return \"\", False, None, original_message_history\n\n            # Use client's response processing method\n            assistant_response_text, should_break, message_history = (\n                self.llm_client.process_llm_response(\n                    response, message_history, agent_type\n                )\n            )\n\n            # Use client's tool call information extraction method\n            tool_calls_info = self.llm_client.extract_tool_calls_info(\n                response, assistant_response_text\n            )\n\n            self.task_log.log_step(\n                \"info\",\n                f\"{purpose} | LLM Call\",\n                \"completed successfully\",\n            )\n            return (\n                assistant_response_text,\n                should_break,\n                tool_calls_info,\n                message_history,\n            )\n\n        except Exception as e:\n            self.task_log.log_step(\n                \"error\",\n                f\"{purpose} | LLM Call ERROR\",\n 
               f\"{purpose} error: {str(e)}\",\n            )\n            # Return empty response with should_break=False, need to retry\n            return \"\", False, None, original_message_history\n\n    async def generate_failure_summary(\n        self,\n        system_prompt: str,\n        message_history: List[Dict[str, Any]],\n        tool_definitions: List[Dict],\n        turn_count: int,\n    ) -> Optional[str]:\n        \"\"\"\n        Generate a failure experience summary for context compression.\n\n        This is the core of the context management mechanism. When a task attempt fails\n        (i.e., the task is not completed within the given turns and context window),\n        we compress the entire conversation history into a structured summary containing:\n        - Failure type: incomplete / blocked / misdirected / format_missed\n        - What happened: the approach taken and why a final answer was not reached\n        - Useful findings: facts, intermediate results, or conclusions to be reused\n\n        Args:\n            system_prompt: The system prompt used in the conversation\n            message_history: The full conversation history to be compressed\n            tool_definitions: Available tool definitions\n            turn_count: Current turn count for step ID\n\n        Returns:\n            The compressed failure experience summary, or None if generation failed\n        \"\"\"\n        self.task_log.log_step(\n            \"info\",\n            \"Main Agent | Failure Summary\",\n            \"Generating failure experience summary for potential retry...\",\n        )\n\n        # Build failure summary history\n        failure_summary_history = message_history.copy()\n        if failure_summary_history and failure_summary_history[-1][\"role\"] == \"user\":\n            failure_summary_history.pop()\n\n        # Add failure summary prompt and assistant prefix for structured output\n        failure_summary_history.append(\n            
{\"role\": \"user\", \"content\": FAILURE_SUMMARY_PROMPT}\n        )\n        failure_summary_history.append(\n            {\"role\": \"assistant\", \"content\": FAILURE_SUMMARY_ASSISTANT_PREFIX}\n        )\n\n        # Call LLM to generate failure summary\n        (\n            failure_summary_text,\n            _,\n            _,\n            _,\n        ) = await self.handle_llm_call(\n            system_prompt,\n            failure_summary_history,\n            tool_definitions,\n            turn_count + 10,  # Use a different step id\n            \"Main Agent | Failure Experience Summary\",\n            agent_type=\"main\",\n        )\n\n        # Prepend the assistant prefix to the response for complete output\n        if failure_summary_text:\n            failure_summary_text = (\n                FAILURE_SUMMARY_ASSISTANT_PREFIX + failure_summary_text\n            )\n            failure_experience_summary = extract_failure_experience_summary(\n                failure_summary_text\n            )\n            # Truncate for logging, but only add \"...\" if actually truncated\n            log_preview = failure_experience_summary[:500]\n            if len(failure_experience_summary) > 500:\n                log_preview += \"...\"\n            self.task_log.log_step(\n                \"info\",\n                \"Main Agent | Failure Summary\",\n                f\"Generated failure experience summary:\\n{log_preview}\",\n            )\n            return failure_experience_summary\n        else:\n            self.task_log.log_step(\n                \"warning\",\n                \"Main Agent | Failure Summary\",\n                \"Failed to generate failure experience summary\",\n            )\n            return None\n\n    async def generate_final_answer_with_retries(\n        self,\n        system_prompt: str,\n        message_history: List[Dict[str, Any]],\n        tool_definitions: List[Dict],\n        turn_count: int,\n        task_description: str,\n    ) -> 
Tuple[Optional[str], str, Optional[str], str, List[Dict[str, Any]]]:\n        \"\"\"\n        Generate final answer with retry mechanism.\n\n        Args:\n            system_prompt: System prompt for the LLM\n            message_history: Conversation history\n            tool_definitions: Available tool definitions\n            turn_count: Current turn count\n            task_description: Original task description\n\n        Returns:\n            Tuple of (final_answer_text, final_summary, final_boxed_answer, usage_log, message_history)\n        \"\"\"\n        # Generate summary prompt\n        summary_prompt = generate_agent_summarize_prompt(\n            task_description,\n            agent_type=\"main\",\n        )\n\n        if message_history[-1][\"role\"] == \"user\":\n            message_history.pop(-1)\n        message_history.append({\"role\": \"user\", \"content\": summary_prompt})\n\n        final_answer_text = None\n        final_boxed_answer = None\n        final_summary = \"\"\n        usage_log = \"\"\n\n        for retry_idx in range(self.max_final_answer_retries):\n            (\n                final_answer_text,\n                should_break,\n                tool_calls_info,\n                message_history,\n            ) = await self.handle_llm_call(\n                system_prompt,\n                message_history,\n                tool_definitions,\n                turn_count + 1 + retry_idx,\n                f\"Main agent | Final Summary (attempt {retry_idx + 1}/{self.max_final_answer_retries})\",\n                agent_type=\"main\",\n            )\n\n            if final_answer_text:\n                final_summary, final_boxed_answer, usage_log = (\n                    self.output_formatter.format_final_summary_and_log(\n                        final_answer_text, self.llm_client\n                    )\n                )\n\n                if final_boxed_answer != FORMAT_ERROR_MESSAGE:\n                    self.task_log.log_step(\n        
                \"info\",\n                        \"Main Agent | Final Answer\",\n                        f\"Boxed answer found on attempt {retry_idx + 1}\",\n                    )\n                    break\n                else:\n                    self.task_log.log_step(\n                        \"warning\",\n                        \"Main Agent | Final Answer\",\n                        f\"No boxed answer on attempt {retry_idx + 1}, retrying...\",\n                    )\n                    if retry_idx < self.max_final_answer_retries - 1:\n                        if (\n                            message_history\n                            and message_history[-1][\"role\"] == \"assistant\"\n                        ):\n                            message_history.pop()\n            else:\n                self.task_log.log_step(\n                    \"warning\",\n                    \"Main Agent | Final Answer\",\n                    f\"Failed to generate answer on attempt {retry_idx + 1}\",\n                )\n                if retry_idx < self.max_final_answer_retries - 1:\n                    if message_history and message_history[-1][\"role\"] == \"assistant\":\n                        message_history.pop()\n\n        # Ensure final_boxed_answer is never None\n        if final_boxed_answer is None:\n            final_boxed_answer = FORMAT_ERROR_MESSAGE\n\n        return (\n            final_answer_text,\n            final_summary,\n            final_boxed_answer,\n            usage_log,\n            message_history,\n        )\n\n    def handle_no_context_management_fallback(\n        self,\n        final_answer_text: Optional[str],\n        final_summary: str,\n        final_boxed_answer: Optional[str],\n    ) -> Tuple[str, str, str]:\n        \"\"\"\n        Handle fallback when context_compress_limit == 0 (no context management).\n\n        In this mode, the model has only one chance to answer.\n        We should try to use intermediate answers as 
fallback to maximize accuracy.\n\n        Args:\n            final_answer_text: The generated final answer text\n            final_summary: The final summary\n            final_boxed_answer: The extracted boxed answer\n\n        Returns:\n            Tuple of (final_answer_text, final_summary, final_boxed_answer)\n        \"\"\"\n        # Validate final_answer_text\n        if not final_answer_text:\n            final_answer_text = \"No final answer generated.\"\n            final_summary = final_answer_text\n            final_boxed_answer = FORMAT_ERROR_MESSAGE\n            self.task_log.log_step(\n                \"error\",\n                \"Main Agent | Final Answer\",\n                \"Unable to generate final answer after all retries\",\n            )\n        else:\n            self.task_log.log_step(\n                \"info\",\n                \"Main Agent | Final Answer\",\n                f\"Final answer content:\\n\\n{final_answer_text}\",\n            )\n\n        # Fallback to intermediate answer if no valid boxed answer\n        if (\n            final_boxed_answer == FORMAT_ERROR_MESSAGE or final_boxed_answer is None\n        ) and self.intermediate_boxed_answers:\n            final_boxed_answer = self.intermediate_boxed_answers[-1]\n            self.task_log.log_step(\n                \"info\",\n                \"Main Agent | Final Answer (No Context Management)\",\n                f\"Using intermediate boxed answer as fallback: {final_boxed_answer}\",\n            )\n\n        # Ensure final_boxed_answer is never None\n        if final_boxed_answer is None:\n            final_boxed_answer = FORMAT_ERROR_MESSAGE\n\n        return final_answer_text, final_summary, final_boxed_answer\n\n    def handle_context_management_no_fallback(\n        self,\n        final_answer_text: Optional[str],\n        final_summary: str,\n        final_boxed_answer: Optional[str],\n    ) -> Tuple[str, str, str]:\n        \"\"\"\n        Handle failure when 
context_compress_limit > 0 (context management enabled).\n\n        In this mode, the model has multiple chances to retry with context management.\n        We should NOT guess or use intermediate answers, because:\n        - A wrong guess can reduce accuracy\n        - The model will have another chance to answer with failure experience\n\n        Args:\n            final_answer_text: The generated final answer text\n            final_summary: The final summary\n            final_boxed_answer: The extracted boxed answer\n\n        Returns:\n            Tuple of (final_answer_text, final_summary, final_boxed_answer)\n        \"\"\"\n        # Validate final_answer_text\n        if not final_answer_text:\n            final_answer_text = \"No final answer generated.\"\n            final_summary = final_answer_text\n            final_boxed_answer = FORMAT_ERROR_MESSAGE\n            self.task_log.log_step(\n                \"error\",\n                \"Main Agent | Final Answer\",\n                \"Unable to generate final answer after all retries\",\n            )\n        else:\n            self.task_log.log_step(\n                \"info\",\n                \"Main Agent | Final Answer\",\n                f\"Final answer content:\\n\\n{final_answer_text}\",\n            )\n\n        # Ensure final_boxed_answer is never None\n        if final_boxed_answer is None:\n            final_boxed_answer = FORMAT_ERROR_MESSAGE\n\n        # With context management, do NOT fallback to intermediate answers\n        if final_boxed_answer == FORMAT_ERROR_MESSAGE:\n            self.task_log.log_step(\n                \"info\",\n                \"Main Agent | Final Answer (Context Management Mode)\",\n                \"No valid boxed answer found. 
Not using intermediate fallback - will generate failure summary for retry.\",\n            )\n\n        return final_answer_text, final_summary, final_boxed_answer\n\n    async def generate_and_finalize_answer(\n        self,\n        system_prompt: str,\n        message_history: List[Dict[str, Any]],\n        tool_definitions: List[Dict],\n        turn_count: int,\n        task_description: str,\n        reached_max_turns: bool = False,\n        is_final_retry: bool = False,\n        save_callback=None,\n    ) -> Tuple[str, str, Optional[str], str, List[Dict[str, Any]]]:\n        \"\"\"\n        Generate final answer and handle fallback based on context management settings.\n\n        Context Management (context_compress_limit > 0) is essentially a context compression\n        mechanism that enables multi-attempt problem solving.\n\n        Decision table based on (context_management, reached_max_turns):\n\n        | Context Management | Reached Max Turns | Behavior                                    |\n        |--------------------|-------------------|---------------------------------------------|\n        | OFF (limit=0)      | No                | Generate answer → fallback to intermediate  |\n        | OFF (limit=0)      | Yes               | Generate answer → fallback to intermediate  |\n        | ON  (limit>0)      | No                | Generate answer → no fallback, fail summary |\n        | ON  (limit>0)      | Yes               | SKIP generation → fail summary directly     |\n\n        Args:\n            system_prompt: System prompt for the LLM\n            message_history: Conversation history\n            tool_definitions: Available tool definitions\n            turn_count: Current turn count\n            task_description: Original task description\n            reached_max_turns: Whether the main loop ended due to reaching max turns\n            is_final_retry: Whether this is the final retry attempt. When True, an answer is\n                always generated (even after reaching max turns), intermediate boxed answers\n                are used as a fallback, and no failure summary is produced\n            save_callback: Optional callback to save message history\n\n        Returns:\n            Tuple of 
(final_summary, final_boxed_answer, failure_experience_summary, usage_log, message_history)\n        \"\"\"\n        context_management_enabled = self.context_compress_limit > 0\n        failure_experience_summary = None\n        usage_log = \"\"\n\n        # CASE: Context management ON + reached max turns + NOT final retry\n        # Skip answer generation entirely - any answer would be a blind guess\n        # But if this is the final retry, we still try to generate an answer (last chance)\n        if context_management_enabled and reached_max_turns and not is_final_retry:\n            self.task_log.log_step(\n                \"info\",\n                \"Main Agent | Final Answer (Context Management Mode)\",\n                \"Reached max turns. Skipping answer generation to avoid blind guessing.\",\n            )\n\n            if save_callback:\n                save_callback(system_prompt, message_history)\n\n            if self.retry_with_summary:\n                failure_experience_summary = await self.generate_failure_summary(\n                    system_prompt, message_history, tool_definitions, turn_count\n                )\n\n            return (\n                \"Task incomplete - reached maximum turns. 
Will retry with failure experience.\",\n                FORMAT_ERROR_MESSAGE,\n                failure_experience_summary,\n                usage_log,\n                message_history,\n            )\n\n        # ALL OTHER CASES: Generate final answer first\n        # (including final retry with reached_max_turns - last chance to get an answer)\n        (\n            final_answer_text,\n            final_summary,\n            final_boxed_answer,\n            usage_log,\n            message_history,\n        ) = await self.generate_final_answer_with_retries(\n            system_prompt=system_prompt,\n            message_history=message_history,\n            tool_definitions=tool_definitions,\n            turn_count=turn_count,\n            task_description=task_description,\n        )\n\n        if save_callback:\n            save_callback(system_prompt, message_history)\n\n        # CASE: Context management OFF or final retry\n        # Try to use intermediate answers as fallback to maximize accuracy\n        # For final retry, there's no more retry opportunity, so we use fallback\n        if not context_management_enabled or is_final_retry:\n            final_answer_text, final_summary, final_boxed_answer = (\n                self.handle_no_context_management_fallback(\n                    final_answer_text, final_summary, final_boxed_answer\n                )\n            )\n            if is_final_retry:\n                self.task_log.log_step(\n                    \"info\",\n                    \"Main Agent | Final Answer (Final Retry)\",\n                    \"This is the final retry. 
Using intermediate fallback if available.\",\n                )\n            return (\n                final_summary,\n                final_boxed_answer,\n                None,\n                usage_log,\n                message_history,\n            )\n\n        # CASE: Context management ON + normal completion (not reached max turns, not final retry)\n        # Don't use fallback - wrong guess would reduce accuracy\n        final_answer_text, final_summary, final_boxed_answer = (\n            self.handle_context_management_no_fallback(\n                final_answer_text, final_summary, final_boxed_answer\n            )\n        )\n\n        if final_boxed_answer == FORMAT_ERROR_MESSAGE and self.retry_with_summary:\n            failure_experience_summary = await self.generate_failure_summary(\n                system_prompt, message_history, tool_definitions, turn_count\n            )\n\n        return (\n            final_summary,\n            final_boxed_answer,\n            failure_experience_summary,\n            usage_log,\n            message_history,\n        )\n"
  },
  {
    "path": "apps/miroflow-agent/src/core/orchestrator.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nOrchestrator module for coordinating agent task execution.\n\nThis module contains the main Orchestrator class that manages the execution of tasks\nby coordinating between the main agent, sub-agents, and various tools.\n\"\"\"\n\nimport asyncio\nimport gc\nimport logging\nimport time\nimport uuid\nfrom collections import defaultdict\nfrom datetime import date\nfrom typing import Any, Dict, List, Optional\n\nfrom miroflow_tools.manager import ToolManager\nfrom omegaconf import DictConfig\n\nfrom ..config.settings import expose_sub_agents_as_tools\nfrom ..io.input_handler import process_input\nfrom ..io.output_formatter import OutputFormatter\nfrom ..llm.base_client import BaseClient\nfrom ..logging.task_logger import TaskLog, get_utc_plus_8_time\nfrom ..utils.parsing_utils import extract_llm_response_text\nfrom ..utils.prompt_utils import (\n    generate_agent_specific_system_prompt,\n    generate_agent_summarize_prompt,\n    mcp_tags,\n    refusal_keywords,\n)\nfrom .answer_generator import AnswerGenerator\nfrom .stream_handler import StreamHandler\nfrom .tool_executor import ToolExecutor\n\nlogger = logging.getLogger(__name__)\n\n\n# =============================================================================\n# Constants\n# =============================================================================\n\n# Default timeout for LLM calls in seconds\nDEFAULT_LLM_TIMEOUT = 600\n\n# Safety limits for retry loops\nDEFAULT_MAX_CONSECUTIVE_ROLLBACKS = 5\n\n# Additional attempts beyond max_turns for total loop protection\nEXTRA_ATTEMPTS_BUFFER = 200\n\n\ndef _list_tools(sub_agent_tool_managers: Dict[str, ToolManager]):\n    \"\"\"\n    Create a cached async function for fetching sub-agent tool definitions.\n\n    This factory function returns an async closure that lazily fetches and caches\n    tool definitions from all sub-agent tool managers. 
The cache ensures that\n    tool definitions are only fetched once per orchestrator instance.\n\n    Args:\n        sub_agent_tool_managers: Dictionary mapping sub-agent names to their ToolManager instances.\n\n    Returns:\n        An async function that returns a dictionary of tool definitions for each sub-agent.\n    \"\"\"\n    cache = None\n\n    async def wrapped():\n        nonlocal cache\n        if cache is None:\n            # Only fetch tool definitions if not already cached\n            result = {\n                name: await tool_manager.get_all_tool_definitions()\n                for name, tool_manager in sub_agent_tool_managers.items()\n            }\n            cache = result\n        return cache\n\n    return wrapped\n\n\nclass Orchestrator:\n    \"\"\"\n    Main orchestrator for coordinating agent task execution.\n\n    Manages the execution loop for main and sub-agents, coordinating\n    LLM calls, tool execution, streaming events, and context management.\n    \"\"\"\n\n    def __init__(\n        self,\n        main_agent_tool_manager: ToolManager,\n        sub_agent_tool_managers: Dict[str, ToolManager],\n        llm_client: BaseClient,\n        output_formatter: OutputFormatter,\n        cfg: DictConfig,\n        task_log: Optional[\"TaskLog\"] = None,\n        stream_queue: Optional[Any] = None,\n        tool_definitions: Optional[List[Dict[str, Any]]] = None,\n        sub_agent_tool_definitions: Optional[Dict[str, List[Dict[str, Any]]]] = None,\n    ):\n        \"\"\"\n        Initialize the orchestrator.\n\n        Args:\n            main_agent_tool_manager: Tool manager for main agent\n            sub_agent_tool_managers: Dictionary of tool managers for sub-agents\n            llm_client: The LLM client for API calls\n            output_formatter: Formatter for output processing\n            cfg: Configuration object\n            task_log: Logger for task execution\n            stream_queue: Optional async queue for streaming events\n     
       tool_definitions: Pre-fetched tool definitions (optional)\n            sub_agent_tool_definitions: Pre-fetched sub-agent tool definitions (optional)\n        \"\"\"\n        self.main_agent_tool_manager = main_agent_tool_manager\n        self.sub_agent_tool_managers = sub_agent_tool_managers\n        self.llm_client = llm_client\n        self.output_formatter = output_formatter\n        self.cfg = cfg\n        self.task_log = task_log\n        self.stream_queue = stream_queue\n        self.tool_definitions = tool_definitions\n        self.sub_agent_tool_definitions = sub_agent_tool_definitions\n\n        # Initialize sub-agent tool list function\n        self._list_sub_agent_tools = None\n        if sub_agent_tool_managers:\n            self._list_sub_agent_tools = _list_tools(sub_agent_tool_managers)\n\n        # Pass task_log to llm_client\n        if self.llm_client and task_log:\n            self.llm_client.task_log = task_log\n\n        # Track boxed answers extracted during main loop turns\n        self.intermediate_boxed_answers: List[str] = []\n\n        # Record used subtask / q / Query to detect duplicates\n        self.used_queries: Dict[str, Dict[str, int]] = {}\n\n        # Retry loop protection limits\n        self.MAX_CONSECUTIVE_ROLLBACKS = DEFAULT_MAX_CONSECUTIVE_ROLLBACKS\n\n        # Context management settings\n        self.context_compress_limit = cfg.agent.get(\"context_compress_limit\", 0)\n\n        # Initialize helper components\n        self.stream = StreamHandler(stream_queue)\n        self.tool_executor = ToolExecutor(\n            main_agent_tool_manager=main_agent_tool_manager,\n            sub_agent_tool_managers=sub_agent_tool_managers,\n            output_formatter=output_formatter,\n            task_log=task_log,\n            stream_handler=self.stream,\n            max_consecutive_rollbacks=DEFAULT_MAX_CONSECUTIVE_ROLLBACKS,\n        )\n        self.answer_generator = AnswerGenerator(\n            llm_client=llm_client,\n   
         output_formatter=output_formatter,\n            task_log=task_log,\n            stream_handler=self.stream,\n            cfg=cfg,\n            intermediate_boxed_answers=self.intermediate_boxed_answers,\n        )\n\n    def _save_message_history(\n        self, system_prompt: str, message_history: List[Dict[str, Any]]\n    ):\n        \"\"\"Save message history to task log.\"\"\"\n        self.task_log.main_agent_message_history = {\n            \"system_prompt\": system_prompt,\n            \"message_history\": message_history,\n        }\n        self.task_log.save()\n\n    async def _handle_response_format_issues(\n        self,\n        assistant_response_text: str,\n        message_history: List[Dict[str, Any]],\n        turn_count: int,\n        consecutive_rollbacks: int,\n        total_attempts: int,\n        max_attempts: int,\n        agent_name: str,\n    ) -> tuple:\n        \"\"\"\n        Handle MCP tag format errors and refusal keywords.\n\n        Args:\n            assistant_response_text: The LLM response text\n            message_history: Current message history\n            turn_count: Current turn count\n            consecutive_rollbacks: Current consecutive rollback count\n            total_attempts: Total attempts made\n            max_attempts: Maximum allowed attempts\n            agent_name: Name of the agent for logging\n\n        Returns:\n            Tuple of (should_continue, should_break, turn_count, consecutive_rollbacks, message_history)\n        \"\"\"\n        # Check for MCP tags in response (format error)\n        if any(mcp_tag in assistant_response_text for mcp_tag in mcp_tags):\n            if consecutive_rollbacks < self.MAX_CONSECUTIVE_ROLLBACKS - 1:\n                turn_count -= 1\n                consecutive_rollbacks += 1\n                if message_history[-1][\"role\"] == \"assistant\":\n                    message_history.pop()\n                self.task_log.log_step(\n                    \"warning\",\n     
               f\"{agent_name} | Turn: {turn_count} | Rollback\",\n                    f\"Tool call format incorrect - found MCP tags in response. \"\n                    f\"Consecutive rollbacks: {consecutive_rollbacks}/{self.MAX_CONSECUTIVE_ROLLBACKS}, \"\n                    f\"Total attempts: {total_attempts}/{max_attempts}\",\n                )\n                return True, False, turn_count, consecutive_rollbacks, message_history\n            else:\n                self.task_log.log_step(\n                    \"warning\",\n                    f\"{agent_name} | Turn: {turn_count} | End After Max Rollbacks\",\n                    f\"Ending agent loop after {consecutive_rollbacks} consecutive MCP format errors\",\n                )\n                return False, True, turn_count, consecutive_rollbacks, message_history\n\n        # Check for refusal keywords\n        if any(keyword in assistant_response_text for keyword in refusal_keywords):\n            matched_keywords = [\n                kw for kw in refusal_keywords if kw in assistant_response_text\n            ]\n            if consecutive_rollbacks < self.MAX_CONSECUTIVE_ROLLBACKS - 1:\n                turn_count -= 1\n                consecutive_rollbacks += 1\n                if message_history[-1][\"role\"] == \"assistant\":\n                    message_history.pop()\n                self.task_log.log_step(\n                    \"warning\",\n                    f\"{agent_name} | Turn: {turn_count} | Rollback\",\n                    f\"LLM refused to answer - found refusal keywords: {matched_keywords}. 
\"\n                    f\"Consecutive rollbacks: {consecutive_rollbacks}/{self.MAX_CONSECUTIVE_ROLLBACKS}, \"\n                    f\"Total attempts: {total_attempts}/{max_attempts}\",\n                )\n                return True, False, turn_count, consecutive_rollbacks, message_history\n            else:\n                self.task_log.log_step(\n                    \"warning\",\n                    f\"{agent_name} | Turn: {turn_count} | End After Max Rollbacks\",\n                    f\"Ending agent loop after {consecutive_rollbacks} consecutive refusals with keywords: {matched_keywords}\",\n                )\n                return False, True, turn_count, consecutive_rollbacks, message_history\n\n        # No format issues - normal end without tool calls\n        return False, True, turn_count, consecutive_rollbacks, message_history\n\n    async def _check_duplicate_query(\n        self,\n        tool_name: str,\n        arguments: dict,\n        cache_name: str,\n        consecutive_rollbacks: int,\n        turn_count: int,\n        total_attempts: int,\n        max_attempts: int,\n        message_history: List[Dict[str, Any]],\n        agent_name: str,\n    ) -> tuple:\n        \"\"\"\n        Check for duplicate queries and handle rollback if needed.\n\n        Args:\n            tool_name: Name of the tool being called\n            arguments: Tool arguments\n            cache_name: Name of the query cache to use\n            consecutive_rollbacks: Current consecutive rollback count\n            turn_count: Current turn count\n            total_attempts: Total attempts made\n            max_attempts: Maximum allowed attempts\n            message_history: Current message history\n            agent_name: Name of the agent for logging\n\n        Returns:\n            Tuple of (is_duplicate, should_rollback, turn_count, consecutive_rollbacks, message_history)\n        \"\"\"\n        query_str = self.tool_executor.get_query_str_from_tool_call(\n            
tool_name, arguments\n        )\n        if not query_str:\n            return False, False, turn_count, consecutive_rollbacks, message_history\n\n        self.used_queries.setdefault(cache_name, defaultdict(int))\n        count = self.used_queries[cache_name][query_str]\n\n        if count > 0:\n            if consecutive_rollbacks < self.MAX_CONSECUTIVE_ROLLBACKS - 1:\n                message_history.pop()\n                turn_count -= 1\n                consecutive_rollbacks += 1\n                self.task_log.log_step(\n                    \"warning\",\n                    f\"{agent_name} | Turn: {turn_count} | Rollback\",\n                    f\"Duplicate query detected - tool: {tool_name}, query: '{query_str}', \"\n                    f\"previous count: {count}. Consecutive rollbacks: {consecutive_rollbacks}/\"\n                    f\"{self.MAX_CONSECUTIVE_ROLLBACKS}, Total attempts: {total_attempts}/{max_attempts}\",\n                )\n                return True, True, turn_count, consecutive_rollbacks, message_history\n            else:\n                self.task_log.log_step(\n                    \"warning\",\n                    f\"{agent_name} | Turn: {turn_count} | Allow Duplicate\",\n                    f\"Allowing duplicate query after {consecutive_rollbacks} rollbacks - \"\n                    f\"tool: {tool_name}, query: '{query_str}', previous count: {count}\",\n                )\n\n        return False, False, turn_count, consecutive_rollbacks, message_history\n\n    async def _record_query(self, cache_name: str, tool_name: str, arguments: dict):\n        \"\"\"Record a successful query execution.\"\"\"\n        query_str = self.tool_executor.get_query_str_from_tool_call(\n            tool_name, arguments\n        )\n        if query_str:\n            self.used_queries.setdefault(cache_name, defaultdict(int))\n            self.used_queries[cache_name][query_str] += 1\n\n    async def run_sub_agent(\n        self,\n        sub_agent_name: str,\n 
       task_description: str,\n    ):\n        \"\"\"\n        Run a sub-agent to handle a subtask.\n\n        Args:\n            sub_agent_name: Name of the sub-agent to run\n            task_description: Description of the subtask\n\n        Returns:\n            The final answer text from the sub-agent\n        \"\"\"\n        task_description += \"\\n\\nPlease provide the answer and detailed supporting information of the subtask given to you.\"\n        self.task_log.log_step(\n            \"info\",\n            f\"{sub_agent_name} | Task Description\",\n            f\"Subtask: {task_description}\",\n        )\n\n        # Stream sub-agent start\n        display_name = sub_agent_name.replace(\"agent-\", \"\")\n        sub_agent_id = await self.stream.start_agent(display_name)\n        await self.stream.start_llm(display_name)\n\n        # Start new sub-agent session\n        self.task_log.start_sub_agent_session(sub_agent_name, task_description)\n\n        # Initialize message history\n        message_history = [{\"role\": \"user\", \"content\": task_description}]\n\n        # Get sub-agent tool definitions\n        if not self.sub_agent_tool_definitions:\n            tool_definitions = await self._list_sub_agent_tools()\n            tool_definitions = tool_definitions.get(sub_agent_name, {})\n        else:\n            tool_definitions = self.sub_agent_tool_definitions[sub_agent_name]\n\n        if not tool_definitions:\n            self.task_log.log_step(\n                \"warning\",\n                f\"{sub_agent_name} | No Tools\",\n                \"No tool definitions available.\",\n            )\n\n        # Generate sub-agent system prompt\n        system_prompt = self.llm_client.generate_agent_system_prompt(\n            date=date.today(),\n            mcp_servers=tool_definitions,\n        ) + generate_agent_specific_system_prompt(agent_type=sub_agent_name)\n\n        # Limit sub-agent turns\n        if self.cfg.agent.sub_agents:\n            
max_turns = self.cfg.agent.sub_agents[sub_agent_name].max_turns\n        else:\n            max_turns = 0\n        turn_count = 0\n        total_attempts = 0\n        max_attempts = max_turns + EXTRA_ATTEMPTS_BUFFER\n        consecutive_rollbacks = 0\n\n        while turn_count < max_turns and total_attempts < max_attempts:\n            turn_count += 1\n            total_attempts += 1\n\n            if consecutive_rollbacks >= self.MAX_CONSECUTIVE_ROLLBACKS:\n                self.task_log.log_step(\n                    \"error\",\n                    f\"{sub_agent_name} | Too Many Rollbacks\",\n                    f\"Reached {consecutive_rollbacks} consecutive rollbacks, breaking loop.\",\n                )\n                break\n\n            self.task_log.save()\n\n            # Reset 'last_call_tokens'\n            self.llm_client.last_call_tokens = {\n                \"prompt_tokens\": 0,\n                \"completion_tokens\": 0,\n            }\n\n            # LLM call using answer generator\n            (\n                assistant_response_text,\n                should_break,\n                tool_calls,\n                message_history,\n            ) = await self.answer_generator.handle_llm_call(\n                system_prompt,\n                message_history,\n                tool_definitions,\n                turn_count,\n                f\"{sub_agent_name} | Turn: {turn_count}\",\n                agent_type=sub_agent_name,\n            )\n\n            if should_break:\n                self.task_log.log_step(\n                    \"info\",\n                    f\"{sub_agent_name} | Turn: {turn_count} | LLM Call\",\n                    \"should break is True, breaking the loop\",\n                )\n                break\n\n            if assistant_response_text:\n                text_response = extract_llm_response_text(assistant_response_text)\n                if text_response:\n                    await self.stream.tool_call(\"show_text\", 
{\"text\": text_response})\n            else:\n                self.task_log.log_step(\n                    \"info\",\n                    f\"{sub_agent_name} | Turn: {turn_count} | LLM Call\",\n                    \"LLM call failed\",\n                )\n                await asyncio.sleep(5)\n                continue\n\n            # Handle no tool calls case\n            if not tool_calls:\n                (\n                    should_continue,\n                    should_break_loop,\n                    turn_count,\n                    consecutive_rollbacks,\n                    message_history,\n                ) = await self._handle_response_format_issues(\n                    assistant_response_text,\n                    message_history,\n                    turn_count,\n                    consecutive_rollbacks,\n                    total_attempts,\n                    max_attempts,\n                    sub_agent_name,\n                )\n                if should_continue:\n                    continue\n                if should_break_loop:\n                    if not any(\n                        mcp_tag in assistant_response_text for mcp_tag in mcp_tags\n                    ) and not any(\n                        keyword in assistant_response_text\n                        for keyword in refusal_keywords\n                    ):\n                        self.task_log.log_step(\n                            \"info\",\n                            f\"{sub_agent_name} | Turn: {turn_count} | LLM Call\",\n                            f\"No tool calls found in {sub_agent_name}, ending on turn {turn_count}\",\n                        )\n                    break\n\n            # Execute tool calls\n            tool_calls_data = []\n            all_tool_results_content_with_id = []\n            should_rollback_turn = False\n\n            for call in tool_calls:\n                server_name = call[\"server_name\"]\n                tool_name = call[\"tool_name\"]\n    
            arguments = call[\"arguments\"]\n                call_id = call[\"id\"]\n\n                # Fix common parameter name mistakes\n                arguments = self.tool_executor.fix_tool_call_arguments(\n                    tool_name, arguments\n                )\n\n                self.task_log.log_step(\n                    \"info\",\n                    f\"{sub_agent_name} | Turn: {turn_count} | Tool Call\",\n                    f\"Executing {tool_name} on {server_name}\",\n                )\n\n                call_start_time = time.time()\n                try:\n                    # Check for duplicate query\n                    cache_name = sub_agent_id + \"_\" + tool_name\n                    (\n                        is_duplicate,\n                        should_rollback,\n                        turn_count,\n                        consecutive_rollbacks,\n                        message_history,\n                    ) = await self._check_duplicate_query(\n                        tool_name,\n                        arguments,\n                        cache_name,\n                        consecutive_rollbacks,\n                        turn_count,\n                        total_attempts,\n                        max_attempts,\n                        message_history,\n                        sub_agent_name,\n                    )\n                    if should_rollback:\n                        should_rollback_turn = True\n                        break\n\n                    # Send stream event\n                    tool_call_id = await self.stream.tool_call(tool_name, arguments)\n\n                    # Execute tool call\n                    tool_result = await self.sub_agent_tool_managers[\n                        sub_agent_name\n                    ].execute_tool_call(server_name, tool_name, arguments)\n\n                    # Update query count if successful\n                    if \"error\" not in tool_result:\n                        await 
self._record_query(cache_name, tool_name, arguments)\n\n                    # Post-process result\n                    tool_result = self.tool_executor.post_process_tool_call_result(\n                        tool_name, tool_result\n                    )\n                    result = (\n                        tool_result.get(\"result\")\n                        if tool_result.get(\"result\")\n                        else tool_result.get(\"error\")\n                    )\n\n                    # Check for errors that should trigger rollback\n                    if self.tool_executor.should_rollback_result(\n                        tool_name, result, tool_result\n                    ):\n                        if consecutive_rollbacks < self.MAX_CONSECUTIVE_ROLLBACKS - 1:\n                            message_history.pop()\n                            turn_count -= 1\n                            consecutive_rollbacks += 1\n                            should_rollback_turn = True\n                            self.task_log.log_step(\n                                \"warning\",\n                                f\"{sub_agent_name} | Turn: {turn_count} | Rollback\",\n                                f\"Tool result error - tool: {tool_name}, result: '{str(result)[:200]}'\",\n                            )\n                            break\n\n                    await self.stream.tool_call(\n                        tool_name, {\"result\": result}, tool_call_id=tool_call_id\n                    )\n                    call_end_time = time.time()\n                    call_duration_ms = int((call_end_time - call_start_time) * 1000)\n\n                    self.task_log.log_step(\n                        \"info\",\n                        f\"{sub_agent_name} | Turn: {turn_count} | Tool Call\",\n                        f\"Tool {tool_name} completed in {call_duration_ms}ms\",\n                    )\n\n                    tool_calls_data.append(\n                        {\n            
                \"server_name\": server_name,\n                            \"tool_name\": tool_name,\n                            \"arguments\": arguments,\n                            \"result\": tool_result,\n                            \"duration_ms\": call_duration_ms,\n                            \"call_time\": get_utc_plus_8_time(),\n                        }\n                    )\n\n                except Exception as e:\n                    call_end_time = time.time()\n                    call_duration_ms = int((call_end_time - call_start_time) * 1000)\n\n                    tool_calls_data.append(\n                        {\n                            \"server_name\": server_name,\n                            \"tool_name\": tool_name,\n                            \"arguments\": arguments,\n                            \"error\": str(e),\n                            \"duration_ms\": call_duration_ms,\n                            \"call_time\": get_utc_plus_8_time(),\n                        }\n                    )\n                    tool_result = {\n                        \"error\": f\"Tool call failed: {str(e)}\",\n                        \"server_name\": server_name,\n                        \"tool_name\": tool_name,\n                    }\n                    self.task_log.log_step(\n                        \"error\",\n                        f\"{sub_agent_name} | Turn: {turn_count} | Tool Call\",\n                        f\"Tool {tool_name} failed to execute: {str(e)}\",\n                    )\n\n                tool_result_for_llm = self.output_formatter.format_tool_result_for_user(\n                    tool_result\n                )\n                all_tool_results_content_with_id.append((call_id, tool_result_for_llm))\n\n            if should_rollback_turn:\n                continue\n\n            # Reset consecutive rollbacks on successful execution\n            if consecutive_rollbacks > 0:\n                self.task_log.log_step(\n           
         \"info\",\n                    f\"{sub_agent_name} | Turn: {turn_count} | Recovery\",\n                    f\"Successfully recovered after {consecutive_rollbacks} consecutive rollbacks\",\n                )\n            consecutive_rollbacks = 0\n\n            # Update message history\n            message_history = self.llm_client.update_message_history(\n                message_history, all_tool_results_content_with_id\n            )\n\n            # Check context length\n            temp_summary_prompt = generate_agent_summarize_prompt(\n                task_description,\n                agent_type=sub_agent_name,\n            )\n\n            pass_length_check, message_history = self.llm_client.ensure_summary_context(\n                message_history, temp_summary_prompt\n            )\n\n            if not pass_length_check:\n                turn_count = max_turns\n                self.task_log.log_step(\n                    \"info\",\n                    f\"{sub_agent_name} | Turn: {turn_count} | Context Limit Reached\",\n                    \"Context limit reached, triggering summary\",\n                )\n                break\n\n        # Log loop end\n        if turn_count >= max_turns:\n            self.task_log.log_step(\n                \"info\",\n                f\"{sub_agent_name} | Max Turns Reached / Context Limit Reached\",\n                f\"Reached maximum turns ({max_turns}) or context limit reached\",\n            )\n        else:\n            self.task_log.log_step(\n                \"info\",\n                f\"{sub_agent_name} | Main Loop Completed\",\n                f\"Main loop completed after {turn_count} turns\",\n            )\n\n        # Generate final summary\n        self.task_log.log_step(\n            \"info\",\n            f\"{sub_agent_name} | Final Summary\",\n            f\"Generating {sub_agent_name} final summary\",\n        )\n\n        summary_prompt = generate_agent_summarize_prompt(\n            
task_description,\n            agent_type=sub_agent_name,\n        )\n\n        if message_history[-1][\"role\"] == \"user\":\n            message_history.pop()\n        message_history.append({\"role\": \"user\", \"content\": summary_prompt})\n\n        await self.stream.tool_call(\n            \"Partial Summary\", {}, tool_call_id=str(uuid.uuid4())\n        )\n\n        # Generate final answer\n        (\n            final_answer_text,\n            should_break,\n            tool_calls_info,\n            message_history,\n        ) = await self.answer_generator.handle_llm_call(\n            system_prompt,\n            message_history,\n            tool_definitions,\n            turn_count + 1,\n            f\"{sub_agent_name} | Final summary\",\n            agent_type=sub_agent_name,\n        )\n\n        if final_answer_text:\n            self.task_log.log_step(\n                \"info\",\n                f\"{sub_agent_name} | Final Answer\",\n                \"Final answer generated successfully\",\n            )\n        else:\n            final_answer_text = (\n                f\"No final answer generated by sub agent {sub_agent_name}.\"\n            )\n            self.task_log.log_step(\n                \"error\",\n                f\"{sub_agent_name} | Final Answer\",\n                \"Unable to generate final answer\",\n            )\n\n        # Save session history\n        self.task_log.sub_agent_message_history_sessions[\n            self.task_log.current_sub_agent_session_id\n        ] = {\"system_prompt\": system_prompt, \"message_history\": message_history}\n\n        self.task_log.save()\n        self.task_log.end_sub_agent_session(sub_agent_name)\n\n        # Remove thinking content\n        final_answer_text = final_answer_text.split(\"<think>\")[-1].strip()\n        final_answer_text = final_answer_text.split(\"</think>\")[-1].strip()\n\n        # Stream sub-agent end\n        await self.stream.end_llm(display_name)\n        await 
self.stream.end_agent(display_name, sub_agent_id)\n\n        return final_answer_text\n\n    async def run_main_agent(\n        self,\n        task_description,\n        task_file_name=None,\n        task_id=\"default_task\",\n        is_final_retry=False,\n    ):\n        \"\"\"\n        Execute the main end-to-end task.\n\n        Args:\n            task_description: Description of the task to execute\n            task_file_name: Optional file associated with the task\n            task_id: Unique identifier for the task\n\n        Returns:\n            Tuple of (final_summary, final_boxed_answer, failure_experience_summary)\n        \"\"\"\n        workflow_id = await self.stream.start_workflow(task_description)\n\n        self.task_log.log_step(\"info\", \"Main Agent\", f\"Start task with id: {task_id}\")\n        self.task_log.log_step(\n            \"info\", \"Main Agent\", f\"Task description: {task_description}\"\n        )\n        if task_file_name:\n            self.task_log.log_step(\n                \"info\", \"Main Agent\", f\"Associated file: {task_file_name}\"\n            )\n\n        # Process input\n        initial_user_content, processed_task_desc = process_input(\n            task_description, task_file_name\n        )\n        message_history = [{\"role\": \"user\", \"content\": initial_user_content}]\n\n        # Record initial user input\n        user_input = processed_task_desc\n        if task_file_name:\n            user_input += f\"\\n[Attached file: {task_file_name}]\"\n\n        # Get tool definitions\n        if not self.tool_definitions:\n            tool_definitions = (\n                await self.main_agent_tool_manager.get_all_tool_definitions()\n            )\n            if self.cfg.agent.sub_agents is not None:\n                tool_definitions += expose_sub_agents_as_tools(\n                    self.cfg.agent.sub_agents\n                )\n        else:\n            tool_definitions = self.tool_definitions\n\n        if not 
tool_definitions:\n            self.task_log.log_step(\n                \"warning\",\n                \"Main Agent | Tool Definitions\",\n                \"Warning: No tool definitions found. LLM cannot use any tools.\",\n            )\n\n        # Generate system prompt\n        system_prompt = self.llm_client.generate_agent_system_prompt(\n            date=date.today(),\n            mcp_servers=tool_definitions,\n        ) + generate_agent_specific_system_prompt(agent_type=\"main\")\n        system_prompt = system_prompt.strip()\n\n        # Main loop configuration\n        max_turns = self.cfg.agent.main_agent.max_turns\n        turn_count = 0\n        total_attempts = 0\n        max_attempts = max_turns + EXTRA_ATTEMPTS_BUFFER\n        consecutive_rollbacks = 0\n\n        self.current_agent_id = await self.stream.start_agent(\"main\")\n        await self.stream.start_llm(\"main\")\n\n        while turn_count < max_turns and total_attempts < max_attempts:\n            turn_count += 1\n            total_attempts += 1\n\n            if consecutive_rollbacks >= self.MAX_CONSECUTIVE_ROLLBACKS:\n                self.task_log.log_step(\n                    \"error\",\n                    \"Main Agent | Too Many Rollbacks\",\n                    f\"Reached {consecutive_rollbacks} consecutive rollbacks, breaking loop.\",\n                )\n                break\n\n            self.task_log.save()\n\n            # LLM call\n            (\n                assistant_response_text,\n                should_break,\n                tool_calls,\n                message_history,\n            ) = await self.answer_generator.handle_llm_call(\n                system_prompt,\n                message_history,\n                tool_definitions,\n                turn_count,\n                f\"Main agent | Turn: {turn_count}\",\n                agent_type=\"main\",\n            )\n\n            # Process LLM response\n            if assistant_response_text:\n                
text_response = extract_llm_response_text(assistant_response_text)\n                if text_response:\n                    await self.stream.tool_call(\"show_text\", {\"text\": text_response})\n\n                # Extract boxed content\n                boxed_content = self.output_formatter._extract_boxed_content(\n                    assistant_response_text\n                )\n                if boxed_content:\n                    self.intermediate_boxed_answers.append(boxed_content)\n\n                if should_break:\n                    self.task_log.log_step(\n                        \"info\",\n                        f\"Main Agent | Turn: {turn_count} | LLM Call\",\n                        \"should break is True, breaking the loop\",\n                    )\n                    break\n            else:\n                turn_count -= 1\n                self.task_log.log_step(\n                    \"warning\",\n                    f\"Main Agent | Turn: {turn_count} | LLM Call\",\n                    \"No valid response from LLM, retrying\",\n                )\n                await asyncio.sleep(5)\n                continue\n\n            # Handle no tool calls case\n            if not tool_calls:\n                (\n                    should_continue,\n                    should_break_loop,\n                    turn_count,\n                    consecutive_rollbacks,\n                    message_history,\n                ) = await self._handle_response_format_issues(\n                    assistant_response_text,\n                    message_history,\n                    turn_count,\n                    consecutive_rollbacks,\n                    total_attempts,\n                    max_attempts,\n                    \"Main Agent\",\n                )\n                if should_continue:\n                    continue\n                if should_break_loop:\n                    if not any(\n                        mcp_tag in assistant_response_text for mcp_tag in 
mcp_tags\n                    ) and not any(\n                        keyword in assistant_response_text\n                        for keyword in refusal_keywords\n                    ):\n                        self.task_log.log_step(\n                            \"info\",\n                            f\"Main Agent | Turn: {turn_count} | LLM Call\",\n                            \"LLM did not request tool usage, ending process.\",\n                        )\n                    break\n\n            # Execute tool calls\n            tool_calls_data = []\n            all_tool_results_content_with_id = []\n            should_rollback_turn = False\n            main_agent_last_call_tokens = self.llm_client.last_call_tokens\n\n            for call in tool_calls:\n                server_name = call[\"server_name\"]\n                tool_name = call[\"tool_name\"]\n                arguments = call[\"arguments\"]\n                call_id = call[\"id\"]\n\n                # Fix common parameter name mistakes\n                arguments = self.tool_executor.fix_tool_call_arguments(\n                    tool_name, arguments\n                )\n\n                call_start_time = time.time()\n                try:\n                    if server_name.startswith(\"agent-\") and self.cfg.agent.sub_agents:\n                        # Sub-agent execution\n                        cache_name = \"main_\" + tool_name\n                        (\n                            is_duplicate,\n                            should_rollback,\n                            turn_count,\n                            consecutive_rollbacks,\n                            message_history,\n                        ) = await self._check_duplicate_query(\n                            tool_name,\n                            arguments,\n                            cache_name,\n                            consecutive_rollbacks,\n                            turn_count,\n                            total_attempts,\n      
                      max_attempts,\n                            message_history,\n                            \"Main Agent\",\n                        )\n                        if should_rollback:\n                            should_rollback_turn = True\n                            break\n\n                        # Stream events\n                        await self.stream.end_llm(\"main\")\n                        await self.stream.end_agent(\"main\", self.current_agent_id)\n\n                        # Execute sub-agent\n                        sub_agent_result = await self.run_sub_agent(\n                            server_name,\n                            arguments[\"subtask\"],\n                        )\n\n                        # Update query count\n                        await self._record_query(cache_name, tool_name, arguments)\n\n                        tool_result = {\n                            \"server_name\": server_name,\n                            \"tool_name\": tool_name,\n                            \"result\": sub_agent_result,\n                        }\n                        self.current_agent_id = await self.stream.start_agent(\n                            \"main\", display_name=\"Summarizing\"\n                        )\n                        await self.stream.start_llm(\"main\", display_name=\"Summarizing\")\n                    else:\n                        # Regular tool execution\n                        cache_name = \"main_\" + tool_name\n                        (\n                            is_duplicate,\n                            should_rollback,\n                            turn_count,\n                            consecutive_rollbacks,\n                            message_history,\n                        ) = await self._check_duplicate_query(\n                            tool_name,\n                            arguments,\n                            cache_name,\n                            consecutive_rollbacks,\n       
                     turn_count,\n                            total_attempts,\n                            max_attempts,\n                            message_history,\n                            \"Main Agent\",\n                        )\n                        if should_rollback:\n                            should_rollback_turn = True\n                            break\n\n                        # Send stream event\n                        tool_call_id = await self.stream.tool_call(tool_name, arguments)\n\n                        # Execute tool call\n                        tool_result = (\n                            await self.main_agent_tool_manager.execute_tool_call(\n                                server_name=server_name,\n                                tool_name=tool_name,\n                                arguments=arguments,\n                            )\n                        )\n\n                        # Update query count if successful\n                        if \"error\" not in tool_result:\n                            await self._record_query(cache_name, tool_name, arguments)\n\n                        # Post-process result\n                        tool_result = self.tool_executor.post_process_tool_call_result(\n                            tool_name, tool_result\n                        )\n                        result = (\n                            tool_result.get(\"result\")\n                            if tool_result.get(\"result\")\n                            else tool_result.get(\"error\")\n                        )\n\n                        # Check for errors that should trigger rollback\n                        if self.tool_executor.should_rollback_result(\n                            tool_name, result, tool_result\n                        ):\n                            if (\n                                consecutive_rollbacks\n                                < self.MAX_CONSECUTIVE_ROLLBACKS - 1\n                            
):\n                                message_history.pop()\n                                turn_count -= 1\n                                consecutive_rollbacks += 1\n                                should_rollback_turn = True\n                                self.task_log.log_step(\n                                    \"warning\",\n                                    f\"Main Agent | Turn: {turn_count} | Rollback\",\n                                    f\"Tool result error - tool: {tool_name}, result: '{str(result)[:200]}'\",\n                                )\n                                break\n\n                        await self.stream.tool_call(\n                            tool_name, {\"result\": result}, tool_call_id=tool_call_id\n                        )\n\n                    call_end_time = time.time()\n                    call_duration_ms = int((call_end_time - call_start_time) * 1000)\n\n                    tool_calls_data.append(\n                        {\n                            \"server_name\": server_name,\n                            \"tool_name\": tool_name,\n                            \"arguments\": arguments,\n                            \"result\": tool_result,\n                            \"duration_ms\": call_duration_ms,\n                            \"call_time\": get_utc_plus_8_time(),\n                        }\n                    )\n                    self.task_log.log_step(\n                        \"info\",\n                        f\"Main Agent | Turn: {turn_count} | Tool Call\",\n                        f\"Tool {tool_name} completed in {call_duration_ms}ms\",\n                    )\n\n                except Exception as e:\n                    call_end_time = time.time()\n                    call_duration_ms = int((call_end_time - call_start_time) * 1000)\n\n                    tool_calls_data.append(\n                        {\n                            \"server_name\": server_name,\n                            
\"tool_name\": tool_name,\n                            \"arguments\": arguments,\n                            \"error\": str(e),\n                            \"duration_ms\": call_duration_ms,\n                            \"call_time\": get_utc_plus_8_time(),\n                        }\n                    )\n                    tool_result = {\n                        \"server_name\": server_name,\n                        \"tool_name\": tool_name,\n                        \"error\": str(e),\n                    }\n                    self.task_log.log_step(\n                        \"error\",\n                        f\"Main Agent | Turn: {turn_count} | Tool Call\",\n                        f\"Tool {tool_name} failed to execute: {str(e)}\",\n                    )\n\n                # Format results for LLM\n                tool_result_for_llm = self.output_formatter.format_tool_result_for_user(\n                    tool_result\n                )\n                all_tool_results_content_with_id.append((call_id, tool_result_for_llm))\n\n            if should_rollback_turn:\n                continue\n\n            # Reset consecutive rollbacks on successful execution\n            if consecutive_rollbacks > 0:\n                self.task_log.log_step(\n                    \"info\",\n                    f\"Main Agent | Turn: {turn_count} | Recovery\",\n                    f\"Successfully recovered after {consecutive_rollbacks} consecutive rollbacks\",\n                )\n            consecutive_rollbacks = 0\n\n            # Update 'last_call_tokens'\n            self.llm_client.last_call_tokens = main_agent_last_call_tokens\n\n            # Update message history\n            message_history = self.llm_client.update_message_history(\n                message_history, all_tool_results_content_with_id\n            )\n\n            self.task_log.main_agent_message_history = {\n                \"system_prompt\": system_prompt,\n                \"message_history\": 
message_history,\n            }\n            self.task_log.save()\n\n            # Check context length\n            temp_summary_prompt = generate_agent_summarize_prompt(\n                task_description,\n                agent_type=\"main\",\n            )\n\n            pass_length_check, message_history = self.llm_client.ensure_summary_context(\n                message_history, temp_summary_prompt\n            )\n\n            if not pass_length_check:\n                turn_count = max_turns\n                self.task_log.log_step(\n                    \"warning\",\n                    f\"Main Agent | Turn: {turn_count} | Context Limit Reached\",\n                    \"Context limit reached, triggering summary\",\n                )\n                break\n\n        await self.stream.end_llm(\"main\")\n        await self.stream.end_agent(\"main\", self.current_agent_id)\n\n        # Determine if max turns was reached\n        reached_max_turns = turn_count >= max_turns\n        if reached_max_turns:\n            self.task_log.log_step(\n                \"warning\",\n                \"Main Agent | Max Turns Reached / Context Limit Reached\",\n                f\"Reached maximum turns ({max_turns}) or context limit reached\",\n            )\n        else:\n            self.task_log.log_step(\n                \"info\",\n                \"Main Agent | Main Loop Completed\",\n                f\"Main loop completed after {turn_count} turns\",\n            )\n\n        # Final summary\n        self.task_log.log_step(\n            \"info\", \"Main Agent | Final Summary\", \"Generating final summary\"\n        )\n\n        self.current_agent_id = await self.stream.start_agent(\"Final Summary\")\n        await self.stream.start_llm(\"Final Summary\")\n\n        # Generate final answer using answer generator\n        (\n            final_summary,\n            final_boxed_answer,\n            failure_experience_summary,\n            usage_log,\n            
message_history,\n        ) = await self.answer_generator.generate_and_finalize_answer(\n            system_prompt=system_prompt,\n            message_history=message_history,\n            tool_definitions=tool_definitions,\n            turn_count=turn_count,\n            task_description=task_description,\n            reached_max_turns=reached_max_turns,\n            is_final_retry=is_final_retry,\n            save_callback=self._save_message_history,\n        )\n\n        await self.stream.tool_call(\"show_text\", {\"text\": final_boxed_answer})\n        await self.stream.end_llm(\"Final Summary\")\n        await self.stream.end_agent(\"Final Summary\", self.current_agent_id)\n        await self.stream.end_workflow(workflow_id)\n\n        self.task_log.log_step(\n            \"info\", \"Main Agent | Usage Calculation\", f\"Usage log: {usage_log}\"\n        )\n\n        self.task_log.log_step(\n            \"info\",\n            \"Main Agent | Final boxed answer\",\n            f\"Final boxed answer:\\n\\n{final_boxed_answer}\",\n        )\n\n        self.task_log.log_step(\n            \"info\",\n            \"Main Agent | Task Completed\",\n            f\"Main agent task {task_id} completed successfully\",\n        )\n        gc.collect()\n        return final_summary, final_boxed_answer, failure_experience_summary\n"
  },
  {
    "path": "apps/miroflow-agent/src/core/pipeline.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nTask execution pipeline module.\n\nThis module provides:\n- execute_task_pipeline: Main function to run a complete task from start to finish\n- create_pipeline_components: Factory function to initialize all pipeline components\n\nThe pipeline orchestrates the interaction between LLM clients, tool managers,\nand the orchestrator to execute complex multi-turn agent tasks.\n\"\"\"\n\nimport traceback\nimport uuid\nfrom typing import Any, Dict, List, Optional\n\nfrom miroflow_tools.manager import ToolManager\nfrom omegaconf import DictConfig\n\nfrom ..config.settings import (\n    create_mcp_server_parameters,\n    get_env_info,\n)\nfrom ..io.output_formatter import OutputFormatter\nfrom ..llm.factory import ClientFactory\nfrom ..logging.task_logger import (\n    TaskLog,\n    get_utc_plus_8_time,\n)\nfrom .orchestrator import Orchestrator\n\n\nasync def execute_task_pipeline(\n    cfg: DictConfig,\n    task_id: str,\n    task_description: str,\n    task_file_name: str,\n    main_agent_tool_manager: ToolManager,\n    sub_agent_tool_managers: Dict[str, ToolManager],\n    output_formatter: OutputFormatter,\n    ground_truth: Optional[Any] = None,\n    log_dir: str = \"logs\",\n    stream_queue: Optional[Any] = None,\n    tool_definitions: Optional[List[Dict[str, Any]]] = None,\n    sub_agent_tool_definitions: Optional[Dict[str, List[Dict[str, Any]]]] = None,\n    is_final_retry: bool = False,\n):\n    \"\"\"\n    Executes the full pipeline for a single task.\n\n    Args:\n        cfg: The Hydra configuration object.\n        task_id: A unique identifier for this task run (used for logging).\n        task_description: The description of the task for the LLM.\n        task_file_name: The path to an associated file (empty string if none).\n        main_agent_tool_manager: An initialized main agent ToolManager instance.\n        sub_agent_tool_managers: 
Dictionary mapping sub-agent names to their ToolManager instances.\n        output_formatter: An initialized OutputFormatter instance.\n        ground_truth: The ground truth for the task (optional).\n        log_dir: The directory to save the task log (default: \"logs\").\n        stream_queue: A queue for streaming the task execution (optional).\n        tool_definitions: The definitions of the tools for the main agent (optional).\n        sub_agent_tool_definitions: The definitions of the tools for the sub-agents (optional).\n\n    Returns:\n        A tuple of (final_summary, final_boxed_answer, log_file_path, failure_experience_summary):\n        - final_summary: A string with the final execution summary, or an error message.\n        - final_boxed_answer: The extracted boxed answer from the LLM response.\n        - log_file_path: The path to the saved task log file.\n        - failure_experience_summary: Summary of failure experience for retry (None if successful).\n    \"\"\"\n    # Create task log\n    task_log = TaskLog(\n        log_dir=log_dir,\n        task_id=task_id,\n        start_time=get_utc_plus_8_time(),\n        input={\"task_description\": task_description, \"task_file_name\": task_file_name},\n        env_info=get_env_info(cfg),\n        ground_truth=ground_truth,\n    )\n\n    # Log task start\n    task_log.log_step(\n        \"info\", \"Main | Task Start\", f\"--- Starting Task Execution: {task_id} ---\"\n    )\n\n    # Set task_log for all ToolManager instances\n    main_agent_tool_manager.set_task_log(task_log)\n    if sub_agent_tool_managers:\n        for sub_agent_tool_manager in sub_agent_tool_managers.values():\n            sub_agent_tool_manager.set_task_log(task_log)\n\n    try:\n        # Initialize LLM client\n        random_uuid = str(uuid.uuid4())\n        unique_id = f\"{task_id}-{random_uuid}\"\n        llm_client = ClientFactory(task_id=unique_id, cfg=cfg, task_log=task_log)\n\n        # Initialize orchestrator\n        
orchestrator = Orchestrator(\n            main_agent_tool_manager=main_agent_tool_manager,\n            sub_agent_tool_managers=sub_agent_tool_managers,\n            llm_client=llm_client,\n            output_formatter=output_formatter,\n            cfg=cfg,\n            task_log=task_log,\n            stream_queue=stream_queue,\n            tool_definitions=tool_definitions,\n            sub_agent_tool_definitions=sub_agent_tool_definitions,\n        )\n\n        (\n            final_summary,\n            final_boxed_answer,\n            failure_experience_summary,\n        ) = await orchestrator.run_main_agent(\n            task_description=task_description,\n            task_file_name=task_file_name,\n            task_id=task_id,\n            is_final_retry=is_final_retry,\n        )\n\n        llm_client.close()\n\n        task_log.final_boxed_answer = final_boxed_answer\n        task_log.status = \"success\"\n\n        # Store failure experience summary in task log if available\n        if failure_experience_summary:\n            task_log.trace_data[\"failure_experience_summary\"] = (\n                failure_experience_summary\n            )\n\n        log_file_path = task_log.save()\n        return (\n            final_summary,\n            final_boxed_answer,\n            log_file_path,\n            failure_experience_summary,\n        )\n\n    except Exception as e:\n        error_details = traceback.format_exc()\n        task_log.log_step(\n            \"warning\",\n            \"task_error_notification\",\n            f\"An error occurred during task {task_id}\",\n        )\n        task_log.log_step(\"error\", \"task_error_details\", error_details)\n\n        error_message = (\n            f\"Error executing task {task_id}:\\n\"\n            f\"Description: {task_description}\\n\"\n            f\"File: {task_file_name}\\n\"\n            f\"Error Type: {type(e).__name__}\\n\"\n            f\"Error Details:\\n{error_details}\"\n        )\n\n        
task_log.status = \"failed\"\n        task_log.error = error_details\n\n        log_file_path = task_log.save()\n\n        return error_message, \"\", log_file_path, None\n\n    finally:\n        task_log.end_time = get_utc_plus_8_time()\n\n        # Record task summary to structured log\n        task_log.log_step(\n            \"info\",\n            \"task_execution_finished\",\n            f\"Task {task_id} execution completed with status: {task_log.status}\",\n        )\n        task_log.save()\n\n\ndef create_pipeline_components(cfg: DictConfig):\n    \"\"\"\n    Creates and initializes the core components of the agent pipeline.\n\n    Args:\n        cfg: The Hydra configuration object.\n\n    Returns:\n        Tuple of (main_agent_tool_manager, sub_agent_tool_managers, output_formatter)\n    \"\"\"\n    # Create ToolManagers for main agent and sub-agents\n    main_agent_mcp_server_configs, main_agent_blacklist = create_mcp_server_parameters(\n        cfg, cfg.agent.main_agent\n    )\n    main_agent_tool_manager = ToolManager(\n        main_agent_mcp_server_configs,\n        tool_blacklist=main_agent_blacklist,\n    )\n\n    # Create OutputFormatter\n    output_formatter = OutputFormatter()\n    sub_agent_tool_managers = {}\n\n    # For single agent mode\n    if not cfg.agent.sub_agents:\n        return main_agent_tool_manager, {}, output_formatter\n\n    for sub_agent in cfg.agent.sub_agents:\n        sub_agent_mcp_server_configs, sub_agent_blacklist = (\n            create_mcp_server_parameters(cfg, cfg.agent.sub_agents[sub_agent])\n        )\n        sub_agent_tool_manager = ToolManager(\n            sub_agent_mcp_server_configs,\n            tool_blacklist=sub_agent_blacklist,\n        )\n        sub_agent_tool_managers[sub_agent] = sub_agent_tool_manager\n\n    return main_agent_tool_manager, sub_agent_tool_managers, output_formatter\n"
  },
  {
    "path": "apps/miroflow-agent/src/core/stream_handler.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nStream handler module for SSE (Server-Sent Events) protocol.\n\nThis module provides the StreamHandler class that manages all streaming events\nfor real-time communication with clients during agent task execution.\n\"\"\"\n\nimport logging\nimport uuid\nfrom typing import Any, Optional\n\nlogger = logging.getLogger(__name__)\n\n\nclass StreamHandler:\n    \"\"\"\n    Handler for streaming events in SSE protocol format.\n\n    Manages the sending of various event types including workflow lifecycle,\n    agent lifecycle, LLM interactions, and tool calls.\n    \"\"\"\n\n    def __init__(self, stream_queue: Optional[Any] = None):\n        \"\"\"\n        Initialize the stream handler.\n\n        Args:\n            stream_queue: Optional async queue for sending stream messages.\n                         If None, streaming is disabled.\n        \"\"\"\n        self.stream_queue = stream_queue\n\n    async def update(self, event_type: str, data: dict):\n        \"\"\"\n        Send a streaming update in SSE protocol format.\n\n        Args:\n            event_type: The type of event (e.g., 'start_of_workflow', 'tool_call')\n            data: The event payload data\n        \"\"\"\n        if self.stream_queue:\n            try:\n                stream_message = {\n                    \"event\": event_type,\n                    \"data\": data,\n                }\n                await self.stream_queue.put(stream_message)\n            except Exception as e:\n                logger.warning(f\"Failed to send stream update: {e}\")\n\n    async def start_workflow(self, user_input: str) -> str:\n        \"\"\"\n        Send start_of_workflow event.\n\n        Args:\n            user_input: The initial user input for the workflow\n\n        Returns:\n            The generated workflow ID\n        \"\"\"\n        workflow_id = str(uuid.uuid4())\n        await 
self.update(\n            \"start_of_workflow\",\n            {\n                \"workflow_id\": workflow_id,\n                \"input\": [\n                    {\n                        \"role\": \"user\",\n                        \"content\": user_input,\n                    }\n                ],\n            },\n        )\n        return workflow_id\n\n    async def end_workflow(self, workflow_id: str):\n        \"\"\"\n        Send end_of_workflow event.\n\n        Args:\n            workflow_id: The workflow ID to end\n        \"\"\"\n        await self.update(\n            \"end_of_workflow\",\n            {\n                \"workflow_id\": workflow_id,\n            },\n        )\n\n    async def show_error(self, error: str):\n        \"\"\"\n        Send show_error event and signal stream end.\n\n        Args:\n            error: The error message to display\n        \"\"\"\n        await self.tool_call(\"show_error\", {\"error\": error})\n        if self.stream_queue:\n            try:\n                await self.stream_queue.put(None)\n            except Exception as e:\n                logger.warning(f\"Failed to send show_error: {e}\")\n\n    async def start_agent(self, agent_name: str, display_name: str = None) -> str:\n        \"\"\"\n        Send start_of_agent event.\n\n        Args:\n            agent_name: Internal name of the agent\n            display_name: Optional display name for UI\n\n        Returns:\n            The generated agent ID\n        \"\"\"\n        agent_id = str(uuid.uuid4())\n        await self.update(\n            \"start_of_agent\",\n            {\n                \"agent_name\": agent_name,\n                \"display_name\": display_name,\n                \"agent_id\": agent_id,\n            },\n        )\n        return agent_id\n\n    async def end_agent(self, agent_name: str, agent_id: str):\n        \"\"\"\n        Send end_of_agent event.\n\n        Args:\n            agent_name: Internal name of the agent\n          
  agent_id: The agent ID to end\n        \"\"\"\n        await self.update(\n            \"end_of_agent\",\n            {\n                \"agent_name\": agent_name,\n                \"agent_id\": agent_id,\n            },\n        )\n\n    async def start_llm(self, agent_name: str, display_name: str = None):\n        \"\"\"\n        Send start_of_llm event.\n\n        Args:\n            agent_name: Name of the agent making the LLM call\n            display_name: Optional display name for UI\n        \"\"\"\n        await self.update(\n            \"start_of_llm\",\n            {\n                \"agent_name\": agent_name,\n                \"display_name\": display_name,\n            },\n        )\n\n    async def end_llm(self, agent_name: str):\n        \"\"\"\n        Send end_of_llm event.\n\n        Args:\n            agent_name: Name of the agent that finished LLM call\n        \"\"\"\n        await self.update(\n            \"end_of_llm\",\n            {\n                \"agent_name\": agent_name,\n            },\n        )\n\n    async def message(self, message_id: str, delta_content: str):\n        \"\"\"\n        Send message event with streaming content.\n\n        Args:\n            message_id: Unique identifier for the message\n            delta_content: The content delta to send\n        \"\"\"\n        await self.update(\n            \"message\",\n            {\n                \"message_id\": message_id,\n                \"delta\": {\n                    \"content\": delta_content,\n                },\n            },\n        )\n\n    async def tool_call(\n        self,\n        tool_name: str,\n        payload: dict,\n        streaming: bool = False,\n        tool_call_id: str = None,\n    ) -> str:\n        \"\"\"\n        Send tool_call event.\n\n        Args:\n            tool_name: Name of the tool being called\n            payload: Tool call arguments or results\n            streaming: If True, send payload keys as deltas\n            
tool_call_id: Optional existing tool call ID\n\n        Returns:\n            The tool call ID (generated if not provided)\n        \"\"\"\n        if not tool_call_id:\n            tool_call_id = str(uuid.uuid4())\n\n        if streaming:\n            for key, value in payload.items():\n                await self.update(\n                    \"tool_call\",\n                    {\n                        \"tool_call_id\": tool_call_id,\n                        \"tool_name\": tool_name,\n                        \"delta_input\": {key: value},\n                    },\n                )\n        else:\n            # Send complete tool call\n            await self.update(\n                \"tool_call\",\n                {\n                    \"tool_call_id\": tool_call_id,\n                    \"tool_name\": tool_name,\n                    \"tool_input\": payload,\n                },\n            )\n\n        return tool_call_id\n"
  },
  {
    "path": "apps/miroflow-agent/src/core/tool_executor.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nTool executor module for handling tool call execution.\n\nThis module provides the ToolExecutor class that manages tool call execution,\nincluding argument fixing, duplicate detection, result processing, and error handling.\n\"\"\"\n\nimport json\nimport logging\nimport os\nimport time\nfrom collections import defaultdict\nfrom typing import Any, Dict, List, Optional, Tuple\n\nfrom miroflow_tools.manager import ToolManager\n\nfrom ..io.output_formatter import OutputFormatter\nfrom ..logging.task_logger import TaskLog, get_utc_plus_8_time\nfrom .stream_handler import StreamHandler\n\nlogger = logging.getLogger(__name__)\n\n# Maximum length for scrape results in demo mode (to support more conversation turns)\nDEMO_SCRAPE_MAX_LENGTH = 20_000\n\n\nclass ToolExecutor:\n    \"\"\"\n    Executor for tool calls with support for duplicate detection and result processing.\n\n    Handles the execution of tool calls, including parameter fixing, duplicate query\n    detection, result truncation in demo mode, and error handling.\n    \"\"\"\n\n    def __init__(\n        self,\n        main_agent_tool_manager: ToolManager,\n        sub_agent_tool_managers: Dict[str, ToolManager],\n        output_formatter: OutputFormatter,\n        task_log: TaskLog,\n        stream_handler: StreamHandler,\n        max_consecutive_rollbacks: int = 5,\n    ):\n        \"\"\"\n        Initialize the tool executor.\n\n        Args:\n            main_agent_tool_manager: Tool manager for main agent\n            sub_agent_tool_managers: Dictionary of tool managers for sub-agents\n            output_formatter: Formatter for tool results\n            task_log: Logger for task execution\n            stream_handler: Handler for streaming events\n            max_consecutive_rollbacks: Maximum allowed consecutive rollbacks\n        \"\"\"\n        self.main_agent_tool_manager = 
main_agent_tool_manager\n        self.sub_agent_tool_managers = sub_agent_tool_managers\n        self.output_formatter = output_formatter\n        self.task_log = task_log\n        self.stream = stream_handler\n        self.max_consecutive_rollbacks = max_consecutive_rollbacks\n\n        # Track used queries to detect duplicates\n        self.used_queries: Dict[str, Dict[str, int]] = {}\n\n    def fix_tool_call_arguments(self, tool_name: str, arguments: dict) -> dict:\n        \"\"\"\n        Fix common parameter name mistakes made by LLM.\n\n        Args:\n            tool_name: Name of the tool being called\n            arguments: Original arguments dictionary\n\n        Returns:\n            Fixed arguments dictionary\n        \"\"\"\n        # Create a copy to avoid modifying the original\n        fixed_args = arguments.copy()\n\n        # Fix scrape_and_extract_info parameter names\n        if tool_name == \"scrape_and_extract_info\":\n            # Map common mistakes to the correct parameter name\n            mistake_names = [\"description\", \"introduction\"]\n            if \"info_to_extract\" not in fixed_args:\n                for mistake_name in mistake_names:\n                    if mistake_name in fixed_args:\n                        fixed_args[\"info_to_extract\"] = fixed_args.pop(mistake_name)\n                        break\n\n        # Fix run_python_code parameter names: 'code' -> 'code_block'\n        # Also add default sandbox_id if missing (will trigger stateless fallback)\n        if tool_name == \"run_python_code\":\n            if \"code_block\" not in fixed_args and \"code\" in fixed_args:\n                fixed_args[\"code_block\"] = fixed_args.pop(\"code\")\n            if \"sandbox_id\" not in fixed_args:\n                fixed_args[\"sandbox_id\"] = \"default\"\n\n        return fixed_args\n\n    def get_query_str_from_tool_call(\n        self, tool_name: str, arguments: dict\n    ) -> Optional[str]:\n        \"\"\"\n        Extract the 
query string from tool call arguments based on tool_name.\n\n        Supports search_and_browse, google_search, sogou_search, scrape_website,\n        and scrape_and_extract_info.\n\n        Args:\n            tool_name: Name of the tool\n            arguments: Tool arguments dictionary\n\n        Returns:\n            Query string for duplicate detection, or None if not applicable\n        \"\"\"\n        if tool_name == \"search_and_browse\":\n            return tool_name + \"_\" + arguments.get(\"subtask\", \"\")\n        elif tool_name == \"google_search\":\n            return tool_name + \"_\" + arguments.get(\"q\", \"\")\n        elif tool_name == \"sogou_search\":\n            return tool_name + \"_\" + arguments.get(\"Query\", \"\")\n        elif tool_name == \"scrape_website\":\n            return tool_name + \"_\" + arguments.get(\"url\", \"\")\n        elif tool_name == \"scrape_and_extract_info\":\n            return (\n                tool_name\n                + \"_\"\n                + arguments.get(\"url\", \"\")\n                + \"_\"\n                + arguments.get(\"info_to_extract\", \"\")\n            )\n        return None\n\n    def is_duplicate_query(self, cache_name: str, query_str: str) -> Tuple[bool, int]:\n        \"\"\"\n        Check if a query has been executed before.\n\n        Args:\n            cache_name: Name of the cache (e.g., \"main_google_search\")\n            query_str: The query string to check\n\n        Returns:\n            Tuple of (is_duplicate, previous_count)\n        \"\"\"\n        self.used_queries.setdefault(cache_name, defaultdict(int))\n        count = self.used_queries[cache_name][query_str]\n        return count > 0, count\n\n    def record_query(self, cache_name: str, query_str: str):\n        \"\"\"\n        Record that a query has been executed.\n\n        Args:\n            cache_name: Name of the cache\n            query_str: The query string to record\n        \"\"\"\n        
self.used_queries.setdefault(cache_name, defaultdict(int))\n        self.used_queries[cache_name][query_str] += 1\n\n    def is_google_search_empty_result(self, tool_name: str, tool_result: dict) -> bool:\n        \"\"\"\n        Check if google_search result has empty organic results.\n\n        This indicates a poor search query that should be retried.\n\n        Args:\n            tool_name: Name of the tool\n            tool_result: The tool execution result\n\n        Returns:\n            True if the result is empty and should trigger retry\n        \"\"\"\n        if tool_name != \"google_search\":\n            return False\n\n        result = tool_result.get(\"result\")\n        if not result:\n            return False\n\n        try:\n            if isinstance(result, str):\n                result_dict = json.loads(result)\n            else:\n                result_dict = result\n\n            organic = result_dict.get(\"organic\", [])\n            return len(organic) == 0\n        except (json.JSONDecodeError, TypeError, AttributeError):\n            return False\n\n    def get_scrape_result(self, result: str) -> str:\n        \"\"\"\n        Process scrape result and truncate if too long.\n\n        Args:\n            result: Raw scrape result string (JSON or plain text)\n\n        Returns:\n            Processed result, truncated to DEMO_SCRAPE_MAX_LENGTH if necessary\n        \"\"\"\n        try:\n            scrape_result_dict = json.loads(result)\n            text = scrape_result_dict.get(\"text\")\n            if text and len(text) > DEMO_SCRAPE_MAX_LENGTH:\n                text = text[:DEMO_SCRAPE_MAX_LENGTH]\n            return json.dumps({\"text\": text}, ensure_ascii=False)\n        except json.JSONDecodeError:\n            if isinstance(result, str) and len(result) > DEMO_SCRAPE_MAX_LENGTH:\n                result = result[:DEMO_SCRAPE_MAX_LENGTH]\n            return result\n\n    def post_process_tool_call_result(\n        self, tool_name: 
str, tool_call_result: dict\n    ) -> dict:\n        \"\"\"\n        Process tool call results.\n\n        Only in demo mode: truncate scrape results to 20,000 chars\n        to support more conversation turns.\n\n        Args:\n            tool_name: Name of the tool\n            tool_call_result: The tool execution result\n\n        Returns:\n            Processed tool result\n        \"\"\"\n        if os.environ.get(\"DEMO_MODE\") == \"1\":\n            if \"result\" in tool_call_result and tool_name in [\n                \"scrape\",\n                \"scrape_website\",\n            ]:\n                tool_call_result[\"result\"] = self.get_scrape_result(\n                    tool_call_result[\"result\"]\n                )\n        return tool_call_result\n\n    def should_rollback_result(\n        self, tool_name: str, result: Any, tool_result: dict\n    ) -> bool:\n        \"\"\"\n        Check if a tool result should trigger a rollback.\n\n        Args:\n            tool_name: Name of the tool\n            result: The result value\n            tool_result: Full tool result dictionary\n\n        Returns:\n            True if the result indicates an error that should trigger rollback\n        \"\"\"\n        return (\n            str(result).startswith(\"Unknown tool:\")\n            or str(result).startswith(\"Error executing tool\")\n            or self.is_google_search_empty_result(tool_name, tool_result)\n        )\n\n    async def execute_single_tool_call(\n        self,\n        tool_manager: ToolManager,\n        server_name: str,\n        tool_name: str,\n        arguments: dict,\n        agent_name: str,\n        turn_count: int,\n    ) -> Tuple[dict, int, List[dict]]:\n        \"\"\"\n        Execute a single tool call.\n\n        Args:\n            tool_manager: The tool manager to use\n            server_name: Name of the MCP server\n            tool_name: Name of the tool\n            arguments: Tool arguments\n            agent_name: Name of the 
agent making the call\n            turn_count: Current turn count\n\n        Returns:\n            Tuple of (tool_result, duration_ms, tool_calls_data)\n        \"\"\"\n        call_start_time = time.time()\n        tool_calls_data = []\n\n        try:\n            # Execute tool call\n            tool_result = await tool_manager.execute_tool_call(\n                server_name, tool_name, arguments\n            )\n\n            # Post-process result\n            tool_result = self.post_process_tool_call_result(tool_name, tool_result)\n\n            call_end_time = time.time()\n            call_duration_ms = int((call_end_time - call_start_time) * 1000)\n\n            self.task_log.log_step(\n                \"info\",\n                f\"{agent_name} | Turn: {turn_count} | Tool Call\",\n                f\"Tool {tool_name} completed in {call_duration_ms}ms\",\n            )\n\n            tool_calls_data.append(\n                {\n                    \"server_name\": server_name,\n                    \"tool_name\": tool_name,\n                    \"arguments\": arguments,\n                    \"result\": tool_result,\n                    \"duration_ms\": call_duration_ms,\n                    \"call_time\": get_utc_plus_8_time(),\n                }\n            )\n\n            return tool_result, call_duration_ms, tool_calls_data\n\n        except Exception as e:\n            call_end_time = time.time()\n            call_duration_ms = int((call_end_time - call_start_time) * 1000)\n\n            tool_calls_data.append(\n                {\n                    \"server_name\": server_name,\n                    \"tool_name\": tool_name,\n                    \"arguments\": arguments,\n                    \"error\": str(e),\n                    \"duration_ms\": call_duration_ms,\n                    \"call_time\": get_utc_plus_8_time(),\n                }\n            )\n\n            tool_result = {\n                \"error\": f\"Tool call failed: {str(e)}\",\n          
      \"server_name\": server_name,\n                \"tool_name\": tool_name,\n            }\n\n            self.task_log.log_step(\n                \"error\",\n                f\"{agent_name} | Turn: {turn_count} | Tool Call\",\n                f\"Tool {tool_name} failed to execute: {str(e)}\",\n            )\n\n            return tool_result, call_duration_ms, tool_calls_data\n\n    def format_tool_result_for_llm(self, tool_result: dict) -> dict:\n        \"\"\"\n        Format tool result for feeding back to LLM.\n\n        Args:\n            tool_result: The tool execution result\n\n        Returns:\n            Formatted result suitable for LLM message\n        \"\"\"\n        return self.output_formatter.format_tool_result_for_user(tool_result)\n"
  },
  {
    "path": "apps/miroflow-agent/src/io/__init__.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"Input/Output module for processing task inputs and formatting outputs.\"\"\"\n\nfrom .input_handler import process_input\nfrom .output_formatter import OutputFormatter\n\n__all__ = [\n    \"process_input\",\n    \"OutputFormatter\",\n]\n"
  },
  {
    "path": "apps/miroflow-agent/src/io/input_handler.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nInput handler module for processing various file types.\n\nThis module provides functions for:\n- Processing task inputs with associated files\n- Converting documents (PDF, DOCX, PPTX, XLSX) to markdown\n- Generating captions for images, audio, and video files\n- Extracting task-relevant information from media files\n\nSupported file formats:\n- Documents: PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, HTML\n- Images: JPG, JPEG, PNG, GIF, WEBP\n- Audio: WAV, MP3, M4A\n- Video: MP4, MOV, AVI, MKV, WEBM\n- Data: JSON, JSONLD, CSV, YAML, TOML\n- Code: PY, SH, MD, TXT\n- Archives: ZIP\n\"\"\"\n\nimport base64\nimport html\nimport json\nimport os\nimport re\nimport shutil\nimport tempfile\nimport traceback\nfrom typing import Any, Tuple, Union\nfrom urllib.parse import quote, unquote, urlparse, urlunparse\n\nimport mammoth\nimport markdownify\nimport openpyxl\nimport pdfminer\nimport pdfminer.high_level\nimport pptx\nfrom bs4 import BeautifulSoup\nfrom dotenv import load_dotenv\nfrom markitdown import MarkItDown\nfrom openai import OpenAI\nfrom openpyxl.utils import get_column_letter\n\n# Ensure .env file is loaded\nload_dotenv()\n\n# File extension constants for different media types\nIMAGE_EXTENSIONS = {\"jpg\", \"jpeg\", \"png\", \"gif\", \"webp\"}\nAUDIO_EXTENSIONS = {\"wav\", \"mp3\", \"m4a\"}\nVIDEO_EXTENSIONS = {\"mp4\", \"mov\", \"avi\", \"mkv\", \"webm\"}\nMEDIA_EXTENSIONS = IMAGE_EXTENSIONS | AUDIO_EXTENSIONS | VIDEO_EXTENSIONS\n# Extensions that should skip MarkItDown fallback processing\nSKIP_MARKITDOWN_EXTENSIONS = MEDIA_EXTENSIONS | {\"pdb\"}\n\n\ndef _generate_image_caption(image_path: str) -> str:\n    \"\"\"\n    Generate a caption for an image using OpenAI's GPT-4o vision model.\n\n    Args:\n        image_path: Path to the image file\n\n    Returns:\n        Caption string, or error message if failed\n    \"\"\"\n    try:\n        
OPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\")\n        OPENAI_BASE_URL = os.environ.get(\"OPENAI_BASE_URL\", \"https://api.openai.com/v1\")\n\n        if not OPENAI_API_KEY:\n            return \"[Caption unavailable: OPENAI_API_KEY not set]\"\n\n        client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)\n\n        # Read and encode image\n        with open(image_path, \"rb\") as image_file:\n            image_data = base64.b64encode(image_file.read()).decode(\"utf-8\")\n\n        # Guess MIME type\n        _, ext = os.path.splitext(image_path)\n        ext = ext.lower()\n        mime_type = {\n            \".jpg\": \"image/jpeg\",\n            \".jpeg\": \"image/jpeg\",\n            \".png\": \"image/png\",\n            \".gif\": \"image/gif\",\n            \".webp\": \"image/webp\",\n        }.get(ext, \"image/jpeg\")\n\n        # Call OpenAI API\n        response = client.chat.completions.create(\n            model=\"gpt-4o\",\n            messages=[\n                {\n                    \"role\": \"user\",\n                    \"content\": [\n                        {\n                            \"type\": \"text\",\n                            \"text\": \"Please provide a detailed description of this image. 
Include key objects, people, text, colors, and any other relevant details.\",\n                        },\n                        {\n                            \"type\": \"image_url\",\n                            \"image_url\": {\n                                \"url\": f\"data:{mime_type};base64,{image_data}\"\n                            },\n                        },\n                    ],\n                }\n            ],\n            max_tokens=2048,\n            temperature=0,\n        )\n\n        content = response.choices[0].message.content\n        return content if content else \"[Caption unavailable: Empty response]\"\n\n    except Exception as e:\n        return f\"[Caption generation failed: {str(e)}]\"\n\n\ndef _generate_audio_caption(audio_path: str) -> str:\n    \"\"\"\n    Generate a caption for an audio file using OpenAI's audio transcription.\n\n    Args:\n        audio_path: Path to the audio file\n\n    Returns:\n        Caption string (transcription), or error message if failed\n    \"\"\"\n    try:\n        OPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\")\n        OPENAI_BASE_URL = os.environ.get(\"OPENAI_BASE_URL\", \"https://api.openai.com/v1\")\n\n        if not OPENAI_API_KEY:\n            return \"[Caption unavailable: OPENAI_API_KEY not set]\"\n\n        client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)\n\n        # Transcribe audio\n        with open(audio_path, \"rb\") as audio_file:\n            transcription = client.audio.transcriptions.create(\n                model=\"gpt-4o-transcribe\", file=audio_file\n            )\n\n        text = transcription.text\n        return text if text else \"[Transcription unavailable: Empty response]\"\n\n    except Exception as e:\n        return f\"[Caption generation failed: {str(e)}]\"\n\n\ndef _generate_video_caption(video_path: str) -> str:\n    \"\"\"\n    Generate a caption for a video using OpenAI's GPT-4o vision model.\n\n    Args:\n        video_path: Path to 
the video file\n\n    Returns:\n        Caption string, or error message if failed\n    \"\"\"\n    try:\n        OPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\")\n        OPENAI_BASE_URL = os.environ.get(\"OPENAI_BASE_URL\", \"https://api.openai.com/v1\")\n\n        if not OPENAI_API_KEY:\n            return \"[Caption unavailable: OPENAI_API_KEY not set]\"\n\n        client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)\n\n        # Read and encode video\n        with open(video_path, \"rb\") as video_file:\n            video_data = base64.b64encode(video_file.read()).decode(\"utf-8\")\n\n        # Guess MIME type\n        _, ext = os.path.splitext(video_path)\n        ext = ext.lower()\n        mime_type = {\n            \".mp4\": \"video/mp4\",\n            \".mov\": \"video/quicktime\",\n            \".avi\": \"video/x-msvideo\",\n            \".mkv\": \"video/x-matroska\",\n            \".webm\": \"video/webm\",\n        }.get(ext, \"video/mp4\")\n\n        # Call OpenAI API\n        response = client.chat.completions.create(\n            model=\"gpt-4o\",\n            messages=[\n                {\n                    \"role\": \"user\",\n                    \"content\": [\n                        {\n                            \"type\": \"text\",\n                            \"text\": \"Please provide a detailed description of this video. 
Include key events, people, objects, actions, audio information, and any text visible in the video.\",\n                        },\n                        {\n                            \"type\": \"image_url\",\n                            \"image_url\": {\n                                \"url\": f\"data:{mime_type};base64,{video_data}\"\n                            },\n                        },\n                    ],\n                }\n            ],\n            max_tokens=2048,\n            temperature=0,\n        )\n\n        content = response.choices[0].message.content\n        return content if content else \"[Caption unavailable: Empty response]\"\n\n    except Exception as e:\n        return f\"[Caption generation failed: {str(e)}]\"\n\n\ndef _extract_task_relevant_info_from_image(\n    image_path: str, task_description: str\n) -> str:\n    \"\"\"\n    Extract task-relevant information directly from an image based on the task description.\n\n    Args:\n        image_path: Path to the image file\n        task_description: The user's task description\n\n    Returns:\n        Extracted relevant information, or empty string if extraction fails\n    \"\"\"\n    try:\n        OPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\")\n        OPENAI_BASE_URL = os.environ.get(\"OPENAI_BASE_URL\", \"https://api.openai.com/v1\")\n\n        if not OPENAI_API_KEY:\n            return \"\"\n\n        client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)\n\n        # Read and encode image\n        with open(image_path, \"rb\") as image_file:\n            image_data = base64.b64encode(image_file.read()).decode(\"utf-8\")\n\n        # Guess MIME type\n        _, ext = os.path.splitext(image_path)\n        ext = ext.lower()\n        mime_type = {\n            \".jpg\": \"image/jpeg\",\n            \".jpeg\": \"image/jpeg\",\n            \".png\": \"image/png\",\n            \".gif\": \"image/gif\",\n            \".webp\": \"image/webp\",\n        }.get(ext, 
\"image/jpeg\")\n\n        # Call OpenAI API with task-specific prompt\n        response = client.chat.completions.create(\n            model=\"gpt-4o\",\n            messages=[\n                {\n                    \"role\": \"user\",\n                    \"content\": [\n                        {\n                            \"type\": \"text\",\n                            \"text\": f\"\"\"Based on the following task, analyze this image and extract only the information that is directly relevant to completing the task.\n\nTask: {task_description}\n\nPlease provide a concise summary of the relevant information from the image that would help in completing this task. Focus only on what's pertinent to the task. If nothing is particularly relevant, state \"No specific task-relevant details identified in the image.\" Keep the response brief and focused.\"\"\",\n                        },\n                        {\n                            \"type\": \"image_url\",\n                            \"image_url\": {\n                                \"url\": f\"data:{mime_type};base64,{image_data}\"\n                            },\n                        },\n                    ],\n                }\n            ],\n            max_tokens=1024,\n            temperature=0,\n        )\n\n        return response.choices[0].message.content.strip()\n\n    except Exception as e:\n        print(f\"Warning: Failed to extract task-relevant info from image: {str(e)}\")\n        return \"\"\n\n\ndef _extract_task_relevant_info_from_audio(\n    audio_path: str, task_description: str\n) -> str:\n    \"\"\"\n    Extract task-relevant information directly from an audio file based on the task description.\n\n    Args:\n        audio_path: Path to the audio file\n        task_description: The user's task description\n\n    Returns:\n        Extracted relevant information, or empty string if extraction fails\n    \"\"\"\n    try:\n        OPENAI_API_KEY = 
os.environ.get(\"OPENAI_API_KEY\")\n        OPENAI_BASE_URL = os.environ.get(\"OPENAI_BASE_URL\", \"https://api.openai.com/v1\")\n\n        if not OPENAI_API_KEY:\n            return \"\"\n\n        client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)\n\n        # Read and encode audio file\n        with open(audio_path, \"rb\") as audio_file:\n            audio_data = base64.b64encode(audio_file.read()).decode(\"utf-8\")\n\n        # Detect audio format\n        _, ext = os.path.splitext(audio_path)\n        ext = ext.lower()\n        audio_format = {\n            \".mp3\": \"mp3\",\n            \".wav\": \"wav\",\n            \".m4a\": \"m4a\",\n        }.get(ext, \"mp3\")\n\n        # Use gpt-4o-audio-preview for direct audio question answering\n        text_prompt = f\"\"\"Based on the following task, analyze this audio and extract only the information that is directly relevant to completing the task.\n\nTask: {task_description}\n\nPlease provide a concise summary of the relevant information from the audio that would help in completing this task. Focus only on what's pertinent to the task. 
If nothing is particularly relevant, state \"No specific task-relevant details identified in the audio.\" Keep the response brief and focused.\"\"\"\n\n        response = client.chat.completions.create(\n            model=\"gpt-4o-audio-preview\",\n            messages=[\n                {\n                    \"role\": \"system\",\n                    \"content\": \"You are a helpful assistant specializing in audio analysis.\",\n                },\n                {\n                    \"role\": \"user\",\n                    \"content\": [\n                        {\"type\": \"text\", \"text\": text_prompt},\n                        {\n                            \"type\": \"input_audio\",\n                            \"input_audio\": {\n                                \"data\": audio_data,\n                                \"format\": audio_format,\n                            },\n                        },\n                    ],\n                },\n            ],\n            max_tokens=1024,\n            temperature=0,\n        )\n\n        return response.choices[0].message.content.strip()\n\n    except Exception as e:\n        print(f\"Warning: Failed to extract task-relevant info from audio: {str(e)}\")\n        return \"\"\n\n\ndef _extract_task_relevant_info_from_video(\n    video_path: str, task_description: str\n) -> str:\n    \"\"\"\n    Extract task-relevant information directly from a video based on the task description.\n\n    Args:\n        video_path: Path to the video file\n        task_description: The user's task description\n\n    Returns:\n        Extracted relevant information, or empty string if extraction fails\n    \"\"\"\n    try:\n        OPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\")\n        OPENAI_BASE_URL = os.environ.get(\"OPENAI_BASE_URL\", \"https://api.openai.com/v1\")\n\n        if not OPENAI_API_KEY:\n            return \"\"\n\n        client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)\n\n        # Read 
and encode video\n        with open(video_path, \"rb\") as video_file:\n            video_data = base64.b64encode(video_file.read()).decode(\"utf-8\")\n\n        # Guess MIME type\n        _, ext = os.path.splitext(video_path)\n        ext = ext.lower()\n        mime_type = {\n            \".mp4\": \"video/mp4\",\n            \".mov\": \"video/quicktime\",\n            \".avi\": \"video/x-msvideo\",\n            \".mkv\": \"video/x-matroska\",\n            \".webm\": \"video/webm\",\n        }.get(ext, \"video/mp4\")\n\n        # Call OpenAI API with task-specific prompt\n        response = client.chat.completions.create(\n            model=\"gpt-4o\",\n            messages=[\n                {\n                    \"role\": \"user\",\n                    \"content\": [\n                        {\n                            \"type\": \"text\",\n                            \"text\": f\"\"\"Based on the following task, analyze this video and extract only the information that is directly relevant to completing the task.\n\nTask: {task_description}\n\nPlease provide a concise summary of the relevant information from the video that would help in completing this task. Focus only on what's pertinent to the task. 
If nothing is particularly relevant, state \"No specific task-relevant details identified in the video.\" Keep the response brief and focused.\"\"\",\n                        },\n                        {\n                            \"type\": \"image_url\",\n                            \"image_url\": {\n                                \"url\": f\"data:{mime_type};base64,{video_data}\"\n                            },\n                        },\n                    ],\n                }\n            ],\n            max_tokens=1024,\n            temperature=0,\n        )\n\n        return response.choices[0].message.content.strip()\n\n    except Exception as e:\n        print(f\"Warning: Failed to extract task-relevant info from video: {str(e)}\")\n        return \"\"\n\n\ndef process_input(task_description: str, task_file_name: str) -> Tuple[str, str]:\n    \"\"\"\n    Process user input and associated files.\n\n    Extracts content from the task file (if provided) and appends it to the\n    task description in a format suitable for the LLM.\n\n    Args:\n        task_description: The original task description\n        task_file_name: Path to an associated file, or empty string if none\n\n    Returns:\n        Tuple of (updated_task_description, updated_task_description)\n        Both values are the same - the task description with file content appended\n    \"\"\"\n    updated_task_description = task_description\n    file_content_section = \"\"  # Collect file content to append at the end\n\n    if task_file_name:\n        try:\n            file_extension = task_file_name.rsplit(\".\", maxsplit=1)[-1].lower()\n            parsing_result = None\n\n            if file_extension in IMAGE_EXTENSIONS:\n                # Generate unconditional image caption\n                caption = _generate_image_caption(task_file_name)\n\n                # Extract task-relevant information directly from the image\n                relevant_info = 
_extract_task_relevant_info_from_image(\n                    task_file_name, task_description\n                )\n\n                # Format as Markdown\n                file_content_section += f\"\\n\\nNote: An image file '{task_file_name}' is associated with this task. The content has been extracted as a detailed caption below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\\n\\n\"\n                file_content_section += f\"## Image Content\\nFile: {task_file_name}\\n\\n\"\n                file_content_section += f\"> {caption}\\n\\n\"\n\n                if relevant_info:\n                    file_content_section += \"Task-Relevant Information:\\n\\n\"\n                    file_content_section += f\"{relevant_info}\\n\\n\"\n\n            elif file_extension == \"py\":\n                # Python files - read directly\n                with open(task_file_name, \"r\", encoding=\"utf-8\") as f:\n                    parsing_result = DocumentConverterResult(\n                        title=None, text_content=f.read()\n                    )\n                file_content_section += f\"\\n\\nNote: A Python file '{task_file_name}' is associated with this task. The content has been extracted as text below. You may use available tools to process its content if necessary. 
If you need to further process this file in the sandbox, please upload it to the sandbox first.\\n\\n\"\n                file_content_section += f\"## Python File\\nFile: {task_file_name}\\n\\n\"\n\n            elif file_extension in [\"txt\", \"md\", \"sh\", \"yaml\", \"yml\", \"toml\", \"csv\"]:\n                # Text-based files - read directly\n                with open(task_file_name, \"r\", encoding=\"utf-8\") as f:\n                    parsing_result = DocumentConverterResult(\n                        title=None, text_content=f.read()\n                    )\n                file_type_name = {\n                    \"txt\": \"Text\",\n                    \"md\": \"Markdown\",\n                    \"sh\": \"Shell Script\",\n                    \"yaml\": \"YAML\",\n                    \"yml\": \"YAML\",\n                    \"toml\": \"TOML\",\n                    \"csv\": \"CSV\",\n                }.get(file_extension, \"Text\")\n                file_content_section += f\"\\n\\nNote: A {file_type_name.lower()} file '{task_file_name}' is associated with this task. The content has been extracted as text below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\\n\\n\"\n                file_content_section += (\n                    f\"## {file_type_name} File\\nFile: {task_file_name}\\n\\n\"\n                )\n\n            elif file_extension in [\"jsonld\", \"json\"]:\n                with open(task_file_name, \"r\", encoding=\"utf-8\") as f:\n                    parsing_result = DocumentConverterResult(\n                        title=None,\n                        text_content=json.dumps(\n                            json.load(f), ensure_ascii=False, indent=2\n                        ),\n                    )\n                file_content_section += f\"\\n\\nNote: A JSON file '{task_file_name}' is associated with this task. 
The content has been extracted as JSON format below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\\n\\n\"\n                file_content_section += f\"## JSON File\\nFile: {task_file_name}\\n\\n\"\n\n            elif file_extension in [\"xlsx\", \"xls\"]:\n                parsing_result = XlsxConverter(local_path=task_file_name)\n                file_content_section += f\"\\n\\nNote: An Excel file '{task_file_name}' is associated with this task. The content has been extracted as a markdown table below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\\n\\n\"\n                file_content_section += f\"## Excel File\\nFile: {task_file_name}\\n\\n\"\n\n            elif file_extension == \"pdf\":\n                parsing_result = DocumentConverterResult(\n                    title=None,\n                    text_content=pdfminer.high_level.extract_text(task_file_name),\n                )\n                file_content_section += f\"\\n\\nNote: A PDF file '{task_file_name}' is associated with this task. The content has been extracted as text below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\\n\\n\"\n                file_content_section += f\"## PDF File\\nFile: {task_file_name}\\n\\n\"\n\n            elif file_extension in [\"docx\", \"doc\"]:\n                parsing_result = DocxConverter(local_path=task_file_name)\n                file_content_section += f\"\\n\\nNote: A Word document '{task_file_name}' is associated with this task. The content has been extracted as markdown below. You may use available tools to process its content if necessary. 
If you need to further process this file in the sandbox, please upload it to the sandbox first.\\n\\n\"\n                file_content_section += f\"## Word Document\\nFile: {task_file_name}\\n\\n\"\n\n            elif file_extension in [\"html\", \"htm\"]:\n                parsing_result = HtmlConverter(local_path=task_file_name)\n                file_content_section += f\"\\n\\nNote: An HTML file '{task_file_name}' is associated with this task. The content has been extracted as markdown below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\\n\\n\"\n                file_content_section += f\"## HTML File\\nFile: {task_file_name}\\n\\n\"\n\n            elif file_extension in [\"pptx\", \"ppt\"]:\n                parsing_result = PptxConverter(local_path=task_file_name)\n                file_content_section += f\"\\n\\nNote: A PowerPoint presentation '{task_file_name}' is associated with this task. The content has been extracted as markdown below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\\n\\n\"\n                file_content_section += (\n                    f\"## PowerPoint Presentation\\nFile: {task_file_name}\\n\\n\"\n                )\n\n            elif file_extension in AUDIO_EXTENSIONS:\n                # Generate unconditional audio transcription\n                caption = _generate_audio_caption(task_file_name)\n\n                # Extract task-relevant information directly from the audio\n                relevant_info = _extract_task_relevant_info_from_audio(\n                    task_file_name, task_description\n                )\n\n                # Format as Markdown\n                file_content_section += f\"\\n\\nNote: An audio file '{task_file_name}' is associated with this task. 
The content has been extracted as a transcription below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\\n\\n\"\n                file_content_section += f\"## Audio Content\\nFile: {task_file_name}\\n\\n\"\n                file_content_section += f\"> {caption}\\n\\n\"\n\n                if relevant_info:\n                    file_content_section += \"Task-Relevant Information:\\n\\n\"\n                    file_content_section += f\"{relevant_info}\\n\\n\"\n\n            elif file_extension in VIDEO_EXTENSIONS:\n                # Generate unconditional video caption\n                caption = _generate_video_caption(task_file_name)\n\n                # Extract task-relevant information directly from the video\n                relevant_info = _extract_task_relevant_info_from_video(\n                    task_file_name, task_description\n                )\n\n                # Format as Markdown\n                file_content_section += f\"\\n\\nNote: A video file '{task_file_name}' is associated with this task. The content has been extracted as a detailed caption below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\\n\\n\"\n                file_content_section += f\"## Video Content\\nFile: {task_file_name}\\n\\n\"\n                file_content_section += f\"> {caption}\\n\\n\"\n\n                if relevant_info:\n                    file_content_section += \"Task-Relevant Information:\\n\\n\"\n                    file_content_section += f\"{relevant_info}\\n\\n\"\n\n            elif file_extension in [\"zip\"]:\n                parsing_result = ZipConverter(local_path=task_file_name)\n                file_content_section += f\"\\n\\nNote: A ZIP archive '{task_file_name}' is associated with this task. 
The content has been extracted as file list and contents below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\\n\\n\"\n                file_content_section += f\"## ZIP Archive\\nFile: {task_file_name}\\n\\n\"\n\n            elif file_extension == \"pdb\":\n                # PDB files (protein database) - only add note\n                file_content_section += f\"\\n\\nNote: A PDB file '{task_file_name}' is associated with this task. You may use available tools to read its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\\n\\n\"\n\n            else:\n                # For other file types, let MarkItDown try to handle it\n                pass  # MarkItDown will be tried below\n\n            #### markitdown process - ONLY if no specialized converter handled it ####\n            if parsing_result is None:\n                try:\n                    if file_extension not in SKIP_MARKITDOWN_EXTENSIONS:\n                        md = MarkItDown(enable_plugins=True)\n                        parsing_result = md.convert(task_file_name)\n                        print(\n                            f\"Info: Used MarkItDown as fallback to process file {task_file_name}\"\n                        )\n                        # Add prompt for files processed by MarkItDown\n                        file_content_section += f\"\\n\\nNote: A file '{task_file_name}' is associated with this task. The content has been extracted as markdown below. You may use available tools to process its content if necessary. 
If you need to further process this file in the sandbox, please upload it to the sandbox first.\\n\\n\"\n                        file_content_section += (\n                            f\"## File Content\\nFile: {task_file_name}\\n\\n\"\n                        )\n                except Exception as e:\n                    print(\n                        f\"Warning: MarkItDown failed to process {task_file_name}: {e}\"\n                    )\n                    pass\n            ############################\n\n            # Collect the content and title (if has) to append later\n            if getattr(parsing_result, \"title\", None):\n                file_content_section += \"Title:\\n\\n{}\\n\\n\".format(parsing_result.title)\n                file_content_section += \"Content:\\n\\n```\\n{}\\n```\\n\".format(\n                    parsing_result.text_content\n                )\n            elif getattr(parsing_result, \"text_content\", None):\n                content = parsing_result.text_content\n                max_len = 200_000  # Limit the length of results returned to LLM\n                if len(content) > max_len:\n                    content = content[:max_len] + \"\\n... 
[File truncated]\"\n                file_content_section += \"```\\n{}\\n```\\n\".format(content)\n            else:\n                pass  # for image, audio, video files that already have their content formatted\n\n        except FileNotFoundError:\n            print(f\"Error: File not found {task_file_name}\")\n            file_content_section += (\n                f\"\\nWarning: The specified file '{task_file_name}' was not found.\"\n            )\n        except Exception as e:\n            print(f\"Error: Error processing file {task_file_name}: {e}\")\n            traceback.print_exc()\n            file_content_section += f\"\\nWarning: There was an error processing the file '{task_file_name}': {str(e)}\"\n\n    # output format requirement\n    updated_task_description += \"\\nYou should follow the format instruction in the request strictly and wrap the final answer in \\\\boxed{}.\"\n\n    # Append file content at the end\n    updated_task_description += file_content_section\n    updated_task_description = updated_task_description.strip()\n\n    return updated_task_description, updated_task_description\n\n\nclass _CustomMarkdownify(markdownify.MarkdownConverter):\n    \"\"\"\n    A custom version of markdownify's MarkdownConverter. 
Changes include:\n\n    - Altering the default heading style to use '#', '##', etc.\n    - Removing javascript hyperlinks.\n    - Truncating images with large data:uri sources.\n    - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax\n    \"\"\"\n\n    def __init__(self, **options: Any):\n        options[\"heading_style\"] = options.get(\"heading_style\", markdownify.ATX)\n        # Explicitly cast options to the expected type if necessary\n        super().__init__(**options)\n\n    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:\n        \"\"\"Same as usual, but be sure to start with a new line\"\"\"\n        if not convert_as_inline:\n            if not re.search(r\"^\\n\", text):\n                return \"\\n\" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore\n\n        return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore\n\n    def convert_a(self, el: Any, text: str, convert_as_inline: bool):\n        \"\"\"Same as usual converter, but removes Javascript links and escapes URIs.\"\"\"\n        prefix, suffix, text = markdownify.chomp(text)  # type: ignore\n        if not text:\n            return \"\"\n        href = el.get(\"href\")\n        title = el.get(\"title\")\n\n        # Escape URIs and skip non-http or file schemes\n        if href:\n            try:\n                parsed_url = urlparse(href)  # type: ignore\n                if parsed_url.scheme and parsed_url.scheme.lower() not in [\n                    \"http\",\n                    \"https\",\n                    \"file\",\n                ]:  # type: ignore\n                    return \"%s%s%s\" % (prefix, text, suffix)\n                href = urlunparse(\n                    parsed_url._replace(path=quote(unquote(parsed_url.path)))\n                )  # type: ignore\n            except ValueError:  # It's not clear if this ever gets thrown\n                return \"%s%s%s\" % (prefix, text, 
suffix)\n\n        # For the replacement see #29: text nodes underscores are escaped\n        if (\n            self.options[\"autolinks\"]\n            and text.replace(r\"\\_\", \"_\") == href\n            and not title\n            and not self.options[\"default_title\"]\n        ):\n            # Shortcut syntax\n            return \"<%s>\" % href\n        if self.options[\"default_title\"] and not title:\n            title = href\n        title_part = ' \"%s\"' % title.replace('\"', r\"\\\"\") if title else \"\"\n        return (\n            \"%s[%s](%s%s)%s\" % (prefix, text, href, title_part, suffix)\n            if href\n            else text\n        )\n\n    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:\n        \"\"\"Same as usual converter, but removes data URIs\"\"\"\n\n        alt = el.attrs.get(\"alt\", None) or \"\"\n        src = el.attrs.get(\"src\", None) or \"\"\n        title = el.attrs.get(\"title\", None) or \"\"\n        title_part = ' \"%s\"' % title.replace('\"', r\"\\\"\") if title else \"\"\n        if (\n            convert_as_inline\n            and el.parent.name not in self.options[\"keep_inline_images_in\"]\n        ):\n            return alt\n\n        # Remove dataURIs\n        if src.startswith(\"data:\"):\n            src = src.split(\",\")[0] + \"...\"\n\n        return \"![%s](%s%s)\" % (alt, src, title_part)\n\n    def convert_soup(self, soup: Any) -> str:\n        return super().convert_soup(soup)  # type: ignore\n\n\nclass DocumentConverterResult:\n    \"\"\"The result of converting a document to text.\"\"\"\n\n    def __init__(self, title: Union[str, None] = None, text_content: str = \"\"):\n        self.title: Union[str, None] = title\n        self.text_content: str = text_content\n\n\ndef convert_html_to_md(html_content):\n    \"\"\"\n    Placeholder for HTML to Markdown conversion function\n    In the original class, this would call self._convert()\n    \"\"\"\n    soup = 
BeautifulSoup(html_content, \"html.parser\")\n    for script in soup([\"script\", \"style\"]):\n        script.extract()\n\n    # Print only the main content\n    body_elm = soup.find(\"body\")\n    webpage_text = \"\"\n    if body_elm:\n        webpage_text = _CustomMarkdownify().convert_soup(body_elm)\n    else:\n        webpage_text = _CustomMarkdownify().convert_soup(soup)\n\n    assert isinstance(webpage_text, str)\n\n    return DocumentConverterResult(\n        title=None if soup.title is None else soup.title.string,\n        text_content=webpage_text,\n    )\n\n\ndef HtmlConverter(local_path: str):\n    \"\"\"\n    Convert an HTML file to Markdown format.\n\n    Args:\n        local_path: Path to the HTML file to convert.\n\n    Returns:\n        DocumentConverterResult containing the converted Markdown text.\n    \"\"\"\n    with open(local_path, \"rt\", encoding=\"utf-8\") as fh:\n        html_content = fh.read()\n\n        return convert_html_to_md(html_content)\n\n\ndef DocxConverter(local_path: str):\n    \"\"\"\n    Convert a DOCX file to Markdown format.\n\n    Uses mammoth library to first convert DOCX to HTML, then converts\n    the HTML to Markdown.\n\n    Args:\n        local_path: Path to the DOCX file to convert.\n\n    Returns:\n        DocumentConverterResult containing the converted Markdown text.\n    \"\"\"\n    with open(local_path, \"rb\") as docx_file:\n        result = mammoth.convert_to_html(docx_file)\n        html_content = result.value\n    return convert_html_to_md(html_content)\n\n\ndef XlsxConverter(local_path: str):\n    \"\"\"\n    Converts Excel files to Markdown using openpyxl.\n    Preserves color formatting and other cell styling information.\n\n    Args:\n        local_path: Path to the Excel file\n\n    Returns:\n        DocumentConverterResult with the Markdown representation of the Excel file\n    \"\"\"\n    # Load the workbook\n    wb = openpyxl.load_workbook(local_path, data_only=True)\n    md_content = \"\"\n\n    # 
Helper function to convert RGB color to hex\n    def rgb_to_hex(rgb_value):\n        if not rgb_value:\n            return None\n\n        # Convert RGB value to string for processing\n        rgb_string = str(rgb_value)\n\n        # Handle RGB format like 'RGB(255, 255, 255)'\n        if isinstance(rgb_value, str) and rgb_string.startswith(\"RGB\"):\n            rgb_match = re.match(r\"RGB\\((\\d+), (\\d+), (\\d+)\\)\", rgb_string)\n            if rgb_match:\n                r, g, b = map(int, rgb_match.groups())\n                return f\"#{r:02x}{g:02x}{b:02x}\"\n\n        # Special handling for FFFFFFFF (white) and 00000000 (transparent/none)\n        if rgb_string in [\"FFFFFFFF\", \"00000000\", \"none\", \"auto\"]:\n            return None\n\n        # Handle ARGB format (common in openpyxl)\n        if len(rgb_string) == 8:  # ARGB format like 'FF5733FF'\n            return f\"#{rgb_string[2:]}\"  # Strip alpha channel\n\n        # Handle direct hex values like 'FF5733'\n        if isinstance(rgb_value, str):\n            return f\"#{rgb_string}\" if not rgb_string.startswith(\"#\") else rgb_string\n\n        return None  # Return None for unrecognized formats\n\n    # Helper function to detect and format cell styling\n    def get_cell_format_info(cell):\n        info = {}\n\n        # Get background color if it exists\n        if cell.fill and hasattr(cell.fill, \"fgColor\") and cell.fill.fgColor:\n            # Get the RGB value - in openpyxl this can be stored in different attributes\n            rgb_value = None\n            if hasattr(cell.fill.fgColor, \"rgb\") and cell.fill.fgColor.rgb:\n                rgb_value = cell.fill.fgColor.rgb\n            elif hasattr(cell.fill.fgColor, \"value\") and cell.fill.fgColor.value:\n                rgb_value = cell.fill.fgColor.value\n\n            if rgb_value:\n                bg_color = rgb_to_hex(rgb_value)\n                if bg_color:  # Skip transparent or white (handled in rgb_to_hex)\n                    
info[\"bg_color\"] = bg_color\n\n        # Get font color if it exists\n        if cell.font and hasattr(cell.font, \"color\") and cell.font.color:\n            # Get the RGB value - in openpyxl this can be stored in different attributes\n            rgb_value = None\n            if hasattr(cell.font.color, \"rgb\") and cell.font.color.rgb:\n                rgb_value = cell.font.color.rgb\n            elif hasattr(cell.font.color, \"value\") and cell.font.color.value:\n                rgb_value = cell.font.color.value\n\n            if rgb_value:\n                font_color = rgb_to_hex(rgb_value)\n                if font_color:  # Skip transparent (handled in rgb_to_hex)\n                    info[\"font_color\"] = font_color\n\n        # Get font weight (bold)\n        if cell.font and cell.font.bold:\n            info[\"bold\"] = True\n\n        # Get font style (italic)\n        if cell.font and cell.font.italic:\n            info[\"italic\"] = True\n\n        # Get font underline\n        if cell.font and cell.font.underline and cell.font.underline != \"none\":\n            info[\"underline\"] = True\n\n        return info\n\n    # Process each sheet in the workbook\n    for sheet_name in wb.sheetnames:\n        try:\n            sheet = wb[sheet_name]\n            md_content += f\"## {sheet_name}\\n\\n\"\n\n            # Get the dimensions of the used part of the sheet\n            min_row, min_col = 1, 1\n            max_row = max(\n                (cell.row for cell in sheet._cells.values() if cell.value is not None),\n                default=0,\n            )\n            max_col = max(\n                (\n                    cell.column\n                    for cell in sheet._cells.values()\n                    if cell.value is not None\n                ),\n                default=0,\n            )\n\n            if max_row == 0 or max_col == 0:\n                md_content += \"This sheet is empty.\\n\\n\"\n                continue\n        except 
Exception as e:\n            error_msg = f\"Error processing sheet '{sheet_name}': {str(e)}\"\n            print(error_msg)\n            md_content += (\n                f\"## {sheet_name}\\n\\nError processing this sheet: {str(e)}\\n\\n\"\n            )\n            continue\n\n        try:\n            # First, determine column widths\n            col_widths = {}\n            for col_idx in range(min_col, max_col + 1):\n                max_length = 0\n                # col_letter = get_column_letter(col_idx)\n                _ = get_column_letter(col_idx)\n                for row_idx in range(min_row, max_row + 1):\n                    try:\n                        cell = sheet.cell(row=row_idx, column=col_idx)\n                        cell_value = str(cell.value) if cell.value is not None else \"\"\n                        max_length = max(max_length, len(cell_value))\n                    except Exception as e:\n                        print(\n                            f\"Warning: Error processing cell at row {row_idx}, column {col_idx}: {str(e)}\"\n                        )\n                        max_length = max(max_length, 10)  # Use reasonable default\n                col_widths[col_idx] = max(max_length + 2, 5)  # Min width of 5\n\n            # Start building the table\n            # Header row with column separators\n            md_content += \"|\"\n            for col_idx in range(min_col, max_col + 1):\n                md_content += \" \" + \" \" * col_widths[col_idx] + \" |\"\n            md_content += \"\\n\"\n\n            # Separator row\n            md_content += \"|\"\n            for col_idx in range(min_col, max_col + 1):\n                md_content += \":\" + \"-\" * col_widths[col_idx] + \":|\"\n            md_content += \"\\n\"\n\n            # Data rows\n            for row_idx in range(min_row, max_row + 1):\n                md_content += \"|\"\n                for col_idx in range(min_col, max_col + 1):\n                    try:\n      
                  cell = sheet.cell(row=row_idx, column=col_idx)\n                        cell_value = str(cell.value) if cell.value is not None else \"\"\n\n                        # Get formatting info\n                        try:\n                            format_info = get_cell_format_info(cell)\n                        except Exception as e:\n                            print(\n                                f\"Warning: Error getting formatting for cell at row {row_idx}, column {col_idx}: {str(e)}\"\n                            )\n                            format_info = {}\n\n                        formatted_value = cell_value\n\n                        # Add HTML-style formatting if needed\n                        if format_info:\n                            style_parts = []\n\n                            if \"bg_color\" in format_info:\n                                style_parts.append(\n                                    f\"background-color:{format_info['bg_color']}\"\n                                )\n\n                            if \"font_color\" in format_info:\n                                style_parts.append(f\"color:{format_info['font_color']}\")\n\n                            span_attributes = []\n                            if style_parts:\n                                span_attributes.append(\n                                    f'style=\"{\"; \".join(style_parts)}\"'\n                                )\n\n                            # Format with bold/italic/underline if needed\n                            inner_value = cell_value\n                            if \"bold\" in format_info:\n                                inner_value = f\"<strong>{inner_value}</strong>\"\n                            if \"italic\" in format_info:\n                                inner_value = f\"<em>{inner_value}</em>\"\n                            if \"underline\" in format_info:\n                                inner_value = 
f\"<u>{inner_value}</u>\"\n\n                            # Only add a span if we have style attributes\n                            if span_attributes:\n                                formatted_value = f\"<span {' '.join(span_attributes)}>{inner_value}</span>\"\n                            else:\n                                formatted_value = inner_value\n\n                        # Pad to column width and add to markdown\n                        padding = col_widths[col_idx] - len(cell_value)\n                        padded_value = \" \" + formatted_value + \" \" * (padding + 1)\n                        md_content += padded_value + \"|\"\n                    except Exception as e:\n                        print(\n                            f\"Error processing cell at row {row_idx}, column {col_idx}: {str(e)}\"\n                        )\n                        # Add a placeholder for the failed cell\n                        padded_value = \" [Error] \" + \" \" * (col_widths[col_idx] - 7)\n                        md_content += padded_value + \" |\"\n\n                md_content += \"\\n\"\n        except Exception as e:\n            error_msg = f\"Error generating table for sheet '{sheet_name}': {str(e)}\\n{traceback.format_exc()}\"\n            print(error_msg)\n            md_content += f\"Error generating table: {str(e)}\\n\\n\"\n\n        # Add formatting legend\n        has_formatting = False\n        for row_idx in range(min_row, max_row + 1):\n            for col_idx in range(min_col, max_col + 1):\n                cell = sheet.cell(row=row_idx, column=col_idx)\n                if get_cell_format_info(cell):\n                    has_formatting = True\n                    break\n            if has_formatting:\n                break\n\n        if has_formatting:\n            md_content += \"\\n### Formatting Information\\n\"\n            md_content += \"The table above includes HTML formatting to represent colors and styles from the original Excel 
file.\\n\"\n            md_content += \"This formatting may not display in all Markdown viewers.\\n\"\n\n        md_content += \"\\n\\n\"  # Extra newlines between sheets\n\n    return DocumentConverterResult(\n        title=None,\n        text_content=md_content.strip(),\n    )\n\n\ndef PptxConverter(local_path: str) -> DocumentConverterResult:\n    \"\"\"\n    Converts PPTX files to Markdown. Supports headings, tables and images with alt text.\n\n    Args:\n        local_path: Path to the PPTX file\n\n    Returns:\n        DocumentConverterResult containing the converted Markdown text\n    \"\"\"\n\n    def is_picture(shape):\n        \"\"\"Check if a shape is a picture\"\"\"\n        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:\n            return True\n        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:\n            if hasattr(shape, \"image\"):\n                return True\n        return False\n\n    def is_table(shape):\n        \"\"\"Check if a shape is a table\"\"\"\n        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:\n            return True\n        return False\n\n    if not local_path.endswith(\".pptx\"):\n        return DocumentConverterResult(\n            title=None,\n            text_content=f\"Error: Expected .pptx file, got: {local_path}\",\n        )\n\n    md_content = \"\"\n    presentation = pptx.Presentation(local_path)\n    slide_num = 0\n\n    for slide in presentation.slides:\n        slide_num += 1\n        md_content += f\"\\n\\n<!-- Slide number: {slide_num} -->\\n\"\n        title = slide.shapes.title\n\n        for shape in slide.shapes:\n            # Pictures\n            if is_picture(shape):\n                # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069\n                alt_text = \"\"\n                try:\n                    alt_text = shape._element._nvXxPr.cNvPr.attrib.get(\"descr\", \"\")\n                except Exception:\n              
      pass\n                # A placeholder name\n                filename = re.sub(r\"\\W\", \"\", shape.name) + \".jpg\"\n                md_content += (\n                    \"\\n![\"\n                    + (alt_text if alt_text else shape.name)\n                    + \"](\"\n                    + filename\n                    + \")\\n\"\n                )\n\n            # Tables\n            if is_table(shape):\n                html_table = \"<html><body><table>\"\n                first_row = True\n                for row in shape.table.rows:\n                    html_table += \"<tr>\"\n                    for cell in row.cells:\n                        if first_row:\n                            html_table += \"<th>\" + html.escape(cell.text) + \"</th>\"\n                        else:\n                            html_table += \"<td>\" + html.escape(cell.text) + \"</td>\"\n                    html_table += \"</tr>\"\n                    first_row = False\n                html_table += \"</table></body></html>\"\n\n                # Note: This would require a separate HTML to Markdown converter function\n                # In this version, I'm assuming a convert_html_to_md function exists\n                md_content += (\n                    \"\\n\" + convert_html_to_md(html_table).text_content.strip() + \"\\n\"\n                )\n\n            # Text areas\n            elif shape.has_text_frame:\n                if shape == title:\n                    md_content += \"# \" + shape.text.lstrip() + \"\\n\"\n                else:\n                    md_content += shape.text + \"\\n\"\n\n        md_content = md_content.strip()\n        if slide.has_notes_slide:\n            md_content += \"\\n\\n### Notes:\\n\"\n            notes_frame = slide.notes_slide.notes_text_frame\n            if notes_frame is not None:\n                md_content += notes_frame.text\n            md_content = md_content.strip()\n\n    return DocumentConverterResult(\n        title=None,\n  
      text_content=md_content.strip(),\n    )\n\n\ndef ZipConverter(local_path: str, **kwargs):\n    \"\"\"\n    Extracts ZIP files to a temporary directory and processes each file according to its extension.\n    Returns a combined result of all processed files.\n    \"\"\"\n    import zipfile\n\n    temp_dir = tempfile.mkdtemp(prefix=\"zip_extract_\")\n    md_content = f\"# Extracted from ZIP: {os.path.basename(local_path)}\\n\\n\"\n\n    try:\n        with zipfile.ZipFile(local_path, \"r\") as zip_ref:\n            zip_ref.extractall(temp_dir)\n\n        # Get all extracted files\n        extracted_files = []\n        for root, dirs, files in os.walk(temp_dir):\n            for file in files:\n                file_path = os.path.join(root, file)\n                rel_path = os.path.relpath(file_path, temp_dir)\n                extracted_files.append((file_path, rel_path))\n\n        if not extracted_files:\n            md_content += \"The ZIP file is empty or contains no files.\\n\"\n        else:\n            md_content += f\"Total files extracted: {len(extracted_files)}\\n\\n\"\n\n            for file_path, rel_path in extracted_files:\n                md_content += f\"## File: {rel_path}\\n\\n\"\n\n                # Process each file based on its extension\n                file_extension = (\n                    file_path.rsplit(\".\", maxsplit=1)[-1].lower()\n                    if \".\" in file_path\n                    else \"\"\n                )\n                file_result = None\n\n                try:\n                    # Use the same processing logic as process_input\n                    if file_extension == \"py\":\n                        with open(file_path, \"r\", encoding=\"utf-8\") as f:\n                            file_result = DocumentConverterResult(\n                                title=None, text_content=f.read()\n                            )\n\n                    elif file_extension in [\n                        \"txt\",\n            
            \"md\",\n                        \"sh\",\n                        \"yaml\",\n                        \"yml\",\n                        \"toml\",\n                        \"csv\",\n                    ]:\n                        with open(file_path, \"r\", encoding=\"utf-8\") as f:\n                            file_result = DocumentConverterResult(\n                                title=None, text_content=f.read()\n                            )\n\n                    elif file_extension in [\"jsonld\", \"json\"]:\n                        with open(file_path, \"r\", encoding=\"utf-8\") as f:\n                            file_result = DocumentConverterResult(\n                                title=None,\n                                text_content=json.dumps(\n                                    json.load(f), ensure_ascii=False, indent=2\n                                ),\n                            )\n\n                    elif file_extension in [\"xlsx\", \"xls\"]:\n                        file_result = XlsxConverter(local_path=file_path)\n\n                    elif file_extension == \"pdf\":\n                        file_result = DocumentConverterResult(\n                            title=None,\n                            text_content=pdfminer.high_level.extract_text(file_path),\n                        )\n\n                    elif file_extension in [\"docx\", \"doc\"]:\n                        file_result = DocxConverter(local_path=file_path)\n\n                    elif file_extension in [\"html\", \"htm\"]:\n                        file_result = HtmlConverter(local_path=file_path)\n\n                    elif file_extension in [\"pptx\", \"ppt\"]:\n                        file_result = PptxConverter(local_path=file_path)\n\n                    elif file_extension in IMAGE_EXTENSIONS:\n                        # Generate image caption for files in ZIP\n                        caption = _generate_image_caption(file_path)\n                        
md_content += \"[Image file]\\n\\n\"\n                        md_content += f\"> {caption}\\n\\n\"\n                        continue\n\n                    elif file_extension in AUDIO_EXTENSIONS:\n                        # Generate audio caption for files in ZIP\n                        caption = _generate_audio_caption(file_path)\n                        md_content += \"[Audio file]\\n\\n\"\n                        md_content += f\"> {caption}\\n\\n\"\n                        continue\n\n                    elif file_extension in VIDEO_EXTENSIONS:\n                        # Generate video caption for files in ZIP\n                        caption = _generate_video_caption(file_path)\n                        md_content += \"[Video file]\\n\\n\"\n                        md_content += f\"> {caption}\\n\\n\"\n                        continue\n\n                    elif file_extension == \"pdb\":\n                        md_content += \"[PDB file - specialized format]\\n\\n\"\n                        continue\n\n                    else:\n                        # Try MarkItDown as fallback\n                        try:\n                            md_tool = MarkItDown(enable_plugins=True)\n                            file_result = md_tool.convert(file_path)\n                        except Exception:\n                            md_content += (\n                                f\"[Unsupported file type: {file_extension}]\\n\\n\"\n                            )\n                            continue\n\n                    # Add the processed content\n                    if file_result and getattr(file_result, \"text_content\", None):\n                        content = file_result.text_content\n                        # Limit length for each file\n                        max_len = 50_000\n                        if len(content) > max_len:\n                            content = content[:max_len] + \"\\n... 
[Content truncated]\"\n                        md_content += f\"```\\n{content}\\n```\\n\\n\"\n\n                except Exception as e:\n                    md_content += f\"[Error processing file: {str(e)}]\\n\\n\"\n                    print(f\"Warning: Error processing {rel_path} from ZIP: {e}\")\n\n    finally:\n        # Clean up temporary directory\n        try:\n            shutil.rmtree(temp_dir)\n        except Exception as e:\n            print(f\"Warning: Could not remove temporary directory {temp_dir}: {e}\")\n\n    return DocumentConverterResult(\n        title=\"ZIP Archive Contents\", text_content=md_content.strip()\n    )\n"
  },
  {
    "path": "apps/miroflow-agent/src/io/output_formatter.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"Output formatting utilities for agent responses.\"\"\"\n\nimport re\nfrom typing import Tuple\n\nfrom ..utils.prompt_utils import FORMAT_ERROR_MESSAGE\n\n# Maximum length for tool results before truncation (100k chars ≈ 25k tokens)\nTOOL_RESULT_MAX_LENGTH = 100_000\n\n\nclass OutputFormatter:\n    \"\"\"Formatter for processing and formatting agent outputs.\"\"\"\n\n    def _extract_boxed_content(self, text: str) -> str:\n        r\"\"\"\n        Extract the content of the last \\boxed{...} occurrence in the given text.\n\n        Supports:\n          - Arbitrary levels of nested braces\n          - Escaped braces (\\{ and \\})\n          - Whitespace between \\boxed and the opening brace\n          - Empty content inside braces\n          - Incomplete boxed expressions (extracts to end of string as fallback)\n\n        Args:\n            text: Input text that may contain \\boxed{...} expressions\n\n        Returns:\n            The extracted boxed content, or empty string if no match is found.\n        \"\"\"\n        if not text:\n            return \"\"\n\n        _BOXED_RE = re.compile(r\"\\\\boxed\\b\", re.DOTALL)\n\n        last_result = None  # Track the last boxed content (complete or incomplete)\n        i = 0\n        n = len(text)\n\n        while True:\n            # Find the next \\boxed occurrence\n            m = _BOXED_RE.search(text, i)\n            if not m:\n                break\n            j = m.end()\n\n            # Skip any whitespace after \\boxed\n            while j < n and text[j].isspace():\n                j += 1\n\n            # Require that the next character is '{'\n            if j >= n or text[j] != \"{\":\n                i = j\n                continue\n\n            # Parse the brace content manually to handle nesting and escapes\n            depth = 0\n            k = j\n            escaped = False\n            
found_closing = False\n            while k < n:\n                ch = text[k]\n                if escaped:\n                    escaped = False\n                elif ch == \"\\\\\":\n                    escaped = True\n                elif ch == \"{\":\n                    depth += 1\n                elif ch == \"}\":\n                    depth -= 1\n                    # When depth returns to zero, the boxed content ends\n                    if depth == 0:\n                        last_result = text[j + 1 : k]\n                        i = k + 1\n                        found_closing = True\n                        break\n                k += 1\n\n            # If we didn't find a closing brace, this is an incomplete boxed\n            # Store it as the last result (will be overwritten if we find more boxed later)\n            if not found_closing and depth > 0:\n                last_result = text[j + 1 : n]\n                i = k  # Continue from where we stopped\n            elif not found_closing:\n                i = j + 1  # Move past this invalid boxed\n\n        # Return the last boxed content found (complete or incomplete)\n        black_list = [\"?\", \"??\", \"???\", \"？\", \"……\", \"…\", \"...\", \"unknown\", None]\n        return last_result.strip() if last_result not in black_list else \"\"\n\n    def format_tool_result_for_user(self, tool_call_execution_result: dict) -> dict:\n        \"\"\"\n        Format tool execution results to be fed back to LLM as user messages.\n\n        Only includes necessary information (results or errors). 
Long results\n        are truncated to TOOL_RESULT_MAX_LENGTH to prevent context overflow.\n\n        Args:\n            tool_call_execution_result: Dict containing server_name, tool_name,\n                and either 'result' or 'error'.\n\n        Returns:\n            Dict with 'type' and 'text' keys suitable for LLM message content.\n        \"\"\"\n        server_name = tool_call_execution_result[\"server_name\"]\n        tool_name = tool_call_execution_result[\"tool_name\"]\n\n        if \"error\" in tool_call_execution_result:\n            # Provide concise error information to LLM\n            content = f\"Tool call to {tool_name} on {server_name} failed. Error: {tool_call_execution_result['error']}\"\n        elif \"result\" in tool_call_execution_result:\n            # Provide the original output result of the tool\n            content = tool_call_execution_result[\"result\"]\n            # Truncate overly long results to prevent context overflow\n            if len(content) > TOOL_RESULT_MAX_LENGTH:\n                content = content[:TOOL_RESULT_MAX_LENGTH] + \"\\n... 
[Result truncated]\"\n        else:\n            content = f\"Tool call to {tool_name} on {server_name} completed, but produced no specific output or result.\"\n\n        return {\"type\": \"text\", \"text\": content}\n\n    def format_final_summary_and_log(\n        self, final_answer_text: str, client=None\n    ) -> Tuple[str, str, str]:\n        \"\"\"\n        Format final summary information, including answers and token statistics.\n\n        Args:\n            final_answer_text: The final answer text from the agent\n            client: Optional LLM client for token usage statistics\n\n        Returns:\n            Tuple of (summary_text, boxed_result, usage_log)\n        \"\"\"\n        summary_lines = []\n        summary_lines.append(\"\\n\" + \"=\" * 30 + \" Final Answer \" + \"=\" * 30)\n        summary_lines.append(final_answer_text)\n\n        # Extract boxed result - find the last match using safer regex patterns\n        boxed_result = self._extract_boxed_content(final_answer_text)\n\n        # Add extracted result section\n        summary_lines.append(\"\\n\" + \"-\" * 20 + \" Extracted Result \" + \"-\" * 20)\n\n        if boxed_result:\n            summary_lines.append(boxed_result)\n        elif final_answer_text:\n            summary_lines.append(\"No \\\\boxed{} content found.\")\n            boxed_result = FORMAT_ERROR_MESSAGE\n\n        # Token usage statistics and cost estimation - use client method\n        if client and hasattr(client, \"format_token_usage_summary\"):\n            token_summary_lines, log_string = client.format_token_usage_summary()\n            summary_lines.extend(token_summary_lines)\n        else:\n            # If no client or client doesn't support it, use default format\n            summary_lines.append(\"\\n\" + \"-\" * 20 + \" Token Usage & Cost \" + \"-\" * 20)\n            summary_lines.append(\"Token usage information not available.\")\n            summary_lines.append(\"-\" * (40 + len(\" Token Usage & Cost 
\")))\n            log_string = \"Token usage information not available.\"\n\n        return \"\\n\".join(summary_lines), boxed_result, log_string\n"
  },
  {
    "path": "apps/miroflow-agent/src/llm/__init__.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nfrom .base_client import BaseClient\nfrom .factory import ClientFactory\nfrom .providers import (\n    AnthropicClient,\n    OpenAIClient,\n)\n\n__all__ = [\n    \"BaseClient\",\n    \"ClientFactory\",\n    \"AnthropicClient\",\n    \"OpenAIClient\",\n]\n"
  },
  {
    "path": "apps/miroflow-agent/src/llm/base_client.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nBase client module for LLM providers.\n\nThis module defines the abstract base class and common utilities for LLM clients,\nsupporting both OpenAI and Anthropic API formats.\n\"\"\"\n\nimport asyncio\nimport dataclasses\nfrom abc import ABC\nfrom typing import (\n    Any,\n    Dict,\n    List,\n    Optional,\n    Tuple,\n    TypedDict,\n)\n\nfrom omegaconf import DictConfig\n\nfrom ..logging.task_logger import TaskLog\nfrom .util import with_timeout\n\n# Default timeout for LLM API calls (10 minutes)\nDEFAULT_LLM_TIMEOUT_SECONDS = 600\n\n\nclass TokenUsage(TypedDict, total=True):\n    \"\"\"\n    Unified token usage tracking across different LLM providers.\n\n    We unify OpenAI and Anthropic formats. There are four usage types:\n    - input/output tokens: Standard input and output token counts\n    - cache write/read tokens: Tokens involved in caching operations\n\n    Provider-specific notes:\n    - OpenAI: Cache write is free, cache read is cheaper\n    - Anthropic: Cache write has a small cost, cache read is cheaper\n    \"\"\"\n\n    total_input_tokens: int\n    total_output_tokens: int\n    total_cache_read_input_tokens: int\n    total_cache_write_input_tokens: int\n\n\n@dataclasses.dataclass\nclass BaseClient(ABC):\n    \"\"\"\n    Abstract base class for LLM provider clients.\n\n    This class provides the common interface and utilities for interacting with\n    different LLM providers (OpenAI, Anthropic, etc.). 
Concrete implementations\n    should override _create_client() and provider-specific methods.\n\n    Attributes:\n        task_id: Unique identifier for the current task (used for tracking)\n        cfg: Hydra configuration containing LLM settings\n        task_log: Optional logger for recording task execution details\n    \"\"\"\n\n    # Required arguments (no default value)\n    task_id: str\n    cfg: DictConfig\n\n    # Optional arguments (with default value)\n    task_log: Optional[\"TaskLog\"] = None\n\n    # Initialized in __post_init__\n    client: Any = dataclasses.field(init=False)\n    token_usage: TokenUsage = dataclasses.field(init=False)\n    last_call_tokens: Dict[str, int] = dataclasses.field(init=False)\n\n    def __post_init__(self):\n        # Initialize last_call_tokens before other operations\n        self.last_call_tokens: Dict[str, int] = {\n            \"prompt_tokens\": 0,\n            \"completion_tokens\": 0,\n        }\n\n        # Explicitly assign from cfg object\n        self.provider: str = self.cfg.llm.provider\n        self.model_name: str = self.cfg.llm.model_name\n        self.temperature: float = self.cfg.llm.temperature\n        self.top_p: float = self.cfg.llm.top_p\n        self.min_p: float = self.cfg.llm.min_p\n        self.top_k: int = self.cfg.llm.top_k\n        self.max_context_length: int = self.cfg.llm.max_context_length\n        self.max_tokens: int = self.cfg.llm.max_tokens\n        self.async_client: bool = self.cfg.llm.async_client\n        self.keep_tool_result: int = self.cfg.agent.keep_tool_result\n        self.api_key: Optional[str] = self.cfg.llm.get(\"api_key\")\n        self.base_url: Optional[str] = self.cfg.llm.get(\"base_url\")\n        self.use_tool_calls: Optional[bool] = self.cfg.llm.get(\"use_tool_calls\")\n        self.repetition_penalty: float = self.cfg.llm.get(\"repetition_penalty\", 1.0)\n\n        self.token_usage = self._reset_token_usage()\n        self.client = self._create_client()\n\n        
self.task_log.log_step(\n            \"info\",\n            \"LLM | Initialization\",\n            f\"LLMClient {self.provider} {self.model_name} initialization completed.\",\n        )\n\n    def _reset_token_usage(self) -> TokenUsage:\n        \"\"\"\n        Reset token usage counter to zero.\n\n        Returns:\n            A new TokenUsage dict with all counters set to zero.\n        \"\"\"\n        return TokenUsage(\n            total_input_tokens=0,\n            total_output_tokens=0,\n            total_cache_write_input_tokens=0,\n            total_cache_read_input_tokens=0,\n        )\n\n    def _remove_tool_result_from_messages(\n        self, messages, keep_tool_result\n    ) -> List[Dict]:\n        \"\"\"Remove tool results from messages\n\n        Args:\n            messages: List of message dictionaries\n            keep_tool_result: Number of tool results to keep. -1 means keep all.\n\n        Returns:\n            List of messages with tool results filtered according to keep_tool_result\n        \"\"\"\n        messages_copy = [m.copy() for m in messages]\n\n        if keep_tool_result == -1:\n            # No processing needed, keep all messages\n            return messages_copy\n\n        # Find indices of all user/tool messages (these are tool results)\n        user_indices = [\n            i\n            for i, msg in enumerate(messages_copy)\n            if msg.get(\"role\") == \"user\" or msg.get(\"role\") == \"tool\"\n        ]\n\n        if len(user_indices) == 0:\n            # No user/tool messages found\n            self.task_log.log_step(\n                \"info\",\n                \"LLM | Message Retention\",\n                \"No user/tool messages found in the history.\",\n            )\n            return messages_copy\n\n        # The first user message is the initial task, not a tool result\n        # Tool results start from the second user message onwards\n        if len(user_indices) == 1:\n            # Only one user message 
(the initial task), no tool results to filter\n            self.task_log.log_step(\n                \"info\",\n                \"LLM | Message Retention\",\n                \"Only 1 user message found (initial task). Keeping it as is.\",\n            )\n            return messages_copy\n\n        # Tool result indices (excluding the first user message which is the initial task)\n        tool_result_indices = user_indices[1:]\n        first_user_idx = user_indices[\n            0\n        ]  # Always keep the first user message (initial task)\n\n        # Calculate how many tool results to keep from the end\n        if keep_tool_result == 0:\n            # Keep 0 tool results, only keep the initial task\n            num_tool_results_to_keep = 0\n        else:\n            # Keep the last keep_tool_result tool results\n            num_tool_results_to_keep = min(keep_tool_result, len(tool_result_indices))\n\n        # Get indices of tool results to keep from the end\n        tool_result_indices_to_keep = (\n            tool_result_indices[-num_tool_results_to_keep:]\n            if num_tool_results_to_keep > 0\n            else []\n        )\n\n        # Combine first message (initial task) and tool results to keep\n        indices_to_keep = [first_user_idx] + tool_result_indices_to_keep\n\n        self.task_log.log_step(\n            \"info\",\n            \"LLM | Message Retention\",\n            f\"Message retention summary: Total user/tool messages: {len(user_indices)}, \"\n            f\"Initial task at index: {first_user_idx}, \"\n            f\"Keeping last {num_tool_results_to_keep} tool results at indices: {tool_result_indices_to_keep}, \"\n            f\"Total messages to keep: {len(indices_to_keep)}\",\n        )\n\n        # Replace content of tool results that should be omitted\n        for i, msg in enumerate(messages_copy):\n            if (\n                msg.get(\"role\") == \"user\" or msg.get(\"role\") == \"tool\"\n            ) and i not in 
indices_to_keep:\n                # Preserve the message structure but replace content\n                if isinstance(msg.get(\"content\"), list):\n                    # For Anthropic format\n                    msg[\"content\"] = [\n                        {\n                            \"type\": \"text\",\n                            \"text\": \"Tool result is omitted to save tokens.\",\n                        }\n                    ]\n                else:\n                    # For OpenAI format\n                    msg[\"content\"] = \"Tool result is omitted to save tokens.\"\n\n        return messages_copy\n\n    @with_timeout(DEFAULT_LLM_TIMEOUT_SECONDS)\n    async def create_message(\n        self,\n        system_prompt: str,\n        message_history: List[Dict],\n        tool_definitions: List[Dict],\n        keep_tool_result: int = -1,\n        step_id: int = 1,\n        task_log: Optional[\"TaskLog\"] = None,\n        agent_type: str = \"main\",\n    ) -> Tuple[Any, List[Dict]]:\n        \"\"\"\n        Call LLM to generate a response with optional tool call support.\n\n        This is the main entry point for LLM interactions. 
It handles:\n        - Message history management\n        - Tool result filtering based on keep_tool_result\n        - Error handling and logging\n\n        Args:\n            system_prompt: System prompt to guide the LLM's behavior\n            message_history: List of previous messages in the conversation\n            tool_definitions: List of available tool definitions\n            keep_tool_result: Number of recent tool results to keep (-1 = keep all)\n            step_id: Current step identifier for logging\n            task_log: Optional logger for task execution\n            agent_type: Type of agent making the call (\"main\" or sub-agent name)\n\n        Returns:\n            Tuple of (response, message_history). response is None when\n            _create_message raised an exception (the error is logged instead\n            of being propagated).\n        \"\"\"\n        # Unified LLM call processing\n        try:\n            response, message_history = await self._create_message(\n                system_prompt,\n                message_history,\n                tool_definitions,\n                keep_tool_result=keep_tool_result,\n            )\n\n        except Exception as e:\n            self.task_log.log_step(\n                \"error\",\n                f\"FATAL ERROR | {agent_type} | LLM Call ERROR\",\n                f\"{agent_type} failed: {str(e)}\",\n            )\n            response = None\n\n        return response, message_history\n\n    @staticmethod\n    async def convert_tool_definition_to_tool_call(tools_definitions):\n        \"\"\"\n        Convert MCP tool definitions to OpenAI function call format.\n\n        Transforms the internal tool definition format used by MCP servers into\n        the format expected by OpenAI's function calling API.\n\n        Args:\n            tools_definitions: List of server definitions, each containing a 'name'\n                and 'tools' list with tool specifications.\n\n        Returns:\n            List of tool definitions in OpenAI function call format, where each\n            tool name is prefixed with its server name 
(e.g., \"server-name-tool-name\").\n        \"\"\"\n        tool_list = []\n        for server in tools_definitions:\n            if \"tools\" in server and len(server[\"tools\"]) > 0:\n                for tool in server[\"tools\"]:\n                    tool_def = dict(\n                        type=\"function\",\n                        function=dict(\n                            name=f\"{server['name']}-{tool['name']}\",\n                            description=tool[\"description\"],\n                            parameters=tool[\"schema\"],\n                        ),\n                    )\n                    tool_list.append(tool_def)\n        return tool_list\n\n    def close(self):\n        \"\"\"Close client connection.\n\n        Note: For async clients (AsyncOpenAI, AsyncAnthropic), the connection\n        will be closed when the client object is garbage collected.\n        For proper async cleanup, use `await client.aclose()` in an async context.\n        \"\"\"\n        if hasattr(self.client, \"close\"):\n            if asyncio.iscoroutinefunction(self.client.close):\n                # For async clients, we cannot call close() synchronously.\n                # The async HTTP client will be closed when garbage collected.\n                # For explicit async cleanup, call aclose() from an async context.\n                if hasattr(self.client, \"_client\"):\n                    # Try to close the underlying httpx client if available\n                    try:\n                        self.client._client.close()\n                    except Exception:\n                        pass  # Ignore errors during cleanup\n            else:\n                self.client.close()\n        elif hasattr(self.client, \"_client\") and hasattr(self.client._client, \"close\"):\n            # Some clients may have internal _client attribute\n            self.client._client.close()\n\n    def _format_response_for_log(self, response) -> Dict:\n        \"\"\"Format response for 
logging\"\"\"\n        if not response:\n            return {}\n\n        # Basic response information\n        formatted = {\n            \"response_type\": type(response).__name__,\n        }\n\n        # Anthropic response\n        if hasattr(response, \"content\"):\n            formatted[\"content\"] = []\n            for block in response.content:\n                if hasattr(block, \"type\"):\n                    if block.type == \"text\":\n                        formatted[\"content\"].append(\n                            {\n                                \"type\": \"text\",\n                                \"text\": block.text[:500] + \"...\"\n                                if len(block.text) > 500\n                                else block.text,\n                            }\n                        )\n                    elif block.type == \"tool_use\":\n                        formatted[\"content\"].append(\n                            {\n                                \"type\": \"tool_use\",\n                                \"id\": block.id,\n                                \"name\": block.name,\n                                \"input\": str(block.input)[:200] + \"...\"\n                                if len(str(block.input)) > 200\n                                else str(block.input),\n                            }\n                        )\n\n        # OpenAI response\n        if hasattr(response, \"choices\"):\n            formatted[\"choices\"] = []\n            for choice in response.choices:\n                choice_data = {\"finish_reason\": choice.finish_reason}\n                if hasattr(choice, \"message\"):\n                    message = choice.message\n                    choice_data[\"message\"] = {\n                        \"role\": message.role,\n                        \"content\": message.content[:500] + \"...\"\n                        if message.content and len(message.content) > 500\n                        else 
message.content,\n                    }\n                    if hasattr(message, \"tool_calls\") and message.tool_calls:\n                        choice_data[\"message\"][\"tool_calls_count\"] = len(\n                            message.tool_calls\n                        )\n                formatted[\"choices\"].append(choice_data)\n\n        return formatted\n"
  },
  {
    "path": "apps/miroflow-agent/src/llm/factory.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nLLM Client Factory module.\n\nThis module provides a factory function for creating LLM clients based on\nconfiguration. It supports multiple providers including OpenAI, Anthropic,\nand Qwen (via OpenAI-compatible API).\n\"\"\"\n\nfrom typing import Optional, Union\n\nfrom omegaconf import DictConfig, OmegaConf\n\nfrom ..logging.task_logger import TaskLog\nfrom .providers.anthropic_client import AnthropicClient\nfrom .providers.openai_client import OpenAIClient\n\n# Supported LLM providers\nSUPPORTED_PROVIDERS = {\"anthropic\", \"openai\", \"qwen\"}\n\n\ndef ClientFactory(\n    task_id: str, cfg: DictConfig, task_log: Optional[TaskLog] = None, **kwargs\n) -> Union[OpenAIClient, AnthropicClient]:\n    \"\"\"\n    Create an LLM client based on the provider specified in configuration.\n\n    This factory function automatically selects and instantiates the appropriate\n    client class based on the `llm.provider` field in the configuration.\n\n    Args:\n        task_id: Unique identifier for the current task (used for tracking)\n        cfg: Hydra configuration object containing LLM settings\n        task_log: Optional logger for recording task execution details\n        **kwargs: Additional keyword arguments to merge into configuration\n\n    Returns:\n        An instance of the appropriate LLM client (OpenAIClient or AnthropicClient)\n\n    Example:\n        >>> client = ClientFactory(\n        ...     task_id=\"task_001\",\n        ...     cfg=cfg,\n        ...     task_log=task_log\n        ... 
)\n    \"\"\"\n    provider = cfg.llm.provider\n    config = OmegaConf.merge(cfg, kwargs)\n\n    client_creators = {\n        \"anthropic\": lambda: AnthropicClient(\n            task_id=task_id, task_log=task_log, cfg=config\n        ),\n        \"qwen\": lambda: OpenAIClient(task_id=task_id, task_log=task_log, cfg=config),\n        \"openai\": lambda: OpenAIClient(task_id=task_id, task_log=task_log, cfg=config),\n    }\n\n    factory = client_creators.get(provider)\n    if not factory:\n        raise ValueError(\n            f\"Unsupported provider: '{provider}'. \"\n            f\"Supported providers are: {', '.join(sorted(SUPPORTED_PROVIDERS))}\"\n        )\n\n    return factory()\n"
  },
  {
    "path": "apps/miroflow-agent/src/llm/providers/__init__.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nfrom .anthropic_client import AnthropicClient\nfrom .openai_client import OpenAIClient\n\n__all__ = [\n    \"AnthropicClient\",\n    \"OpenAIClient\",\n]\n"
  },
  {
    "path": "apps/miroflow-agent/src/llm/providers/anthropic_client.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nAnthropic Claude LLM client implementation.\n\nThis module provides the AnthropicClient class for interacting with Anthropic's\nClaude API, with support for prompt caching and extended thinking.\n\nFeatures:\n- Async and sync API support\n- Prompt caching with ephemeral cache control\n- Token usage tracking including cache statistics\n- MCP tool call parsing and response processing\n\"\"\"\n\nimport asyncio\nimport dataclasses\nimport logging\nfrom typing import Any, Dict, List, Tuple, Union\n\nimport tiktoken\nfrom anthropic import (\n    NOT_GIVEN,\n    Anthropic,\n    AsyncAnthropic,\n    DefaultAsyncHttpxClient,\n    DefaultHttpxClient,\n)\nfrom tenacity import retry, stop_after_attempt, wait_fixed\n\nfrom ...utils.prompt_utils import generate_mcp_system_prompt\nfrom ..base_client import BaseClient\n\nlogger = logging.getLogger(\"miroflow_agent\")\n\n\n@dataclasses.dataclass\nclass AnthropicClient(BaseClient):\n    def __post_init__(self):\n        super().__post_init__()\n\n        # Anthropic-specific token counters\n        self.input_tokens: int = 0\n        self.output_tokens: int = 0\n        self.cache_creation_tokens: int = 0\n        self.cache_read_tokens: int = 0\n\n    def _create_client(self) -> Union[AsyncAnthropic, Anthropic]:\n        \"\"\"Create LLM client\"\"\"\n        http_client_args = {\"headers\": {\"x-upstream-session-id\": self.task_id}}\n        if self.async_client:\n            return AsyncAnthropic(\n                api_key=self.api_key,\n                base_url=self.base_url,\n                http_client=DefaultAsyncHttpxClient(**http_client_args),\n            )\n        else:\n            return Anthropic(\n                api_key=self.api_key,\n                base_url=self.base_url,\n                http_client=DefaultHttpxClient(**http_client_args),\n            )\n\n    def _update_token_usage(self, 
usage_data: Any) -> None:\n        \"\"\"Update cumulative token usage\"\"\"\n        if usage_data:\n            # Update based on actual field names returned by Anthropic API\n            self.token_usage[\"total_cache_write_input_tokens\"] += (\n                getattr(usage_data, \"cache_creation_input_tokens\", 0) or 0\n            )\n            self.token_usage[\"total_cache_read_input_tokens\"] += (\n                getattr(usage_data, \"cache_read_input_tokens\", 0) or 0\n            )\n            self.token_usage[\"total_input_tokens\"] += (\n                getattr(usage_data, \"input_tokens\", 0) or 0\n            )\n            self.token_usage[\"total_output_tokens\"] += (\n                getattr(usage_data, \"output_tokens\", 0) or 0\n            )\n            self.task_log.log_step(\n                \"info\",\n                \"LLM | Token Usage\",\n                f\"Input: {getattr(usage_data, 'input_tokens', 0)}, \"\n                f\"Cache: {getattr(usage_data, 'cache_creation_input_tokens', 0)}+{getattr(usage_data, 'cache_read_input_tokens', 0)}, \"\n                f\"Output: {getattr(usage_data, 'output_tokens', 0)}\",\n            )\n\n            self.last_call_tokens = {\n                \"input_tokens\": getattr(usage_data, \"input_tokens\", 0)\n                + getattr(usage_data, \"cache_creation_input_tokens\", 0)\n                + getattr(usage_data, \"cache_read_input_tokens\", 0),\n                \"output_tokens\": getattr(usage_data, \"output_tokens\", 0),\n            }\n        else:\n            self.task_log.log_step(\n                \"warning\", \"LLM | Token Usage\", \"Warning: No valid usage_data received.\"\n            )\n\n    @retry(wait=wait_fixed(10), stop=stop_after_attempt(5))\n    async def _create_message(\n        self,\n        system_prompt: str,\n        messages_history: List[Dict[str, Any]],\n        tools_definitions,\n        keep_tool_result: int = -1,\n    ):\n        \"\"\"\n        Send message 
to Anthropic API.\n        :param system_prompt: System prompt string.\n        :param messages_history: Message history list.\n        :return: Tuple of (Anthropic API response object, original messages_history).\n            Errors are logged and re-raised rather than returned as None.\n        \"\"\"\n        self.task_log.log_step(\n            \"info\",\n            \"LLM | Call Start\",\n            f\"Calling LLM ({'async' if self.async_client else 'sync'})\",\n        )\n\n        # Create a filtered copy for sending to LLM (to save tokens)\n        # But keep the original messages_history for returning (for complete log)\n        messages_for_llm = self._remove_tool_result_from_messages(\n            messages_history, keep_tool_result\n        )\n\n        # Apply cache control\n        processed_messages = self._apply_cache_control(messages_for_llm)\n\n        try:\n            # Note: Anthropic API does not support repetition_penalty parameter\n            if self.async_client:\n                response = await self.client.messages.create(\n                    model=self.model_name,\n                    temperature=self.temperature,\n                    top_p=self.top_p if self.top_p != 1.0 else NOT_GIVEN,\n                    top_k=self.top_k if self.top_k != -1 else NOT_GIVEN,\n                    max_tokens=self.max_tokens,\n                    system=[\n                        {\n                            \"type\": \"text\",\n                            \"text\": system_prompt,\n                            \"cache_control\": {\"type\": \"ephemeral\"},\n                        }\n                    ],\n                    messages=processed_messages,\n                    stream=False,\n                )\n            else:\n                response = self.client.messages.create(\n                    model=self.model_name,\n                    temperature=self.temperature,\n                    top_p=self.top_p if self.top_p != 1.0 else NOT_GIVEN,\n                    top_k=self.top_k if self.top_k != -1 else NOT_GIVEN,\n                
    max_tokens=self.max_tokens,\n                    system=[\n                        {\n                            \"type\": \"text\",\n                            \"text\": system_prompt,\n                            \"cache_control\": {\"type\": \"ephemeral\"},\n                        }\n                    ],\n                    messages=processed_messages,\n                    stream=False,\n                )\n            self._update_token_usage(getattr(response, \"usage\", None))\n            self.task_log.log_step(\n                \"info\",\n                \"LLM | Call Status\",\n                f\"LLM call status: {getattr(response, 'stop_reason', 'N/A')}\",\n            )\n            # Return the original messages_history (not the filtered copy)\n            # This ensures that the complete conversation history is preserved in logs\n            return response, messages_history\n        except asyncio.CancelledError:\n            self.task_log.log_step(\n                \"warning\",\n                \"LLM | Call Cancelled\",\n                \"⚠️ LLM API call was cancelled during execution\",\n            )\n            raise  # Re-raise to allow decorator to log it\n        except Exception as e:\n            self.task_log.log_step(\n                \"error\", \"LLM | Call Failed\", f\"Anthropic LLM call failed: {str(e)}\"\n            )\n            raise e\n\n    def process_llm_response(\n        self, llm_response: Any, message_history: List[Dict], agent_type: str = \"main\"\n    ) -> tuple[str, bool, List[Dict]]:\n        \"\"\"Process LLM response\"\"\"\n        if not llm_response:\n            self.task_log.log_step(\n                \"error\",\n                \"LLM | Response Processing\",\n                \"❌ LLM call failed, skipping this response.\",\n            )\n            return \"\", True, message_history\n\n        if not hasattr(llm_response, \"content\") or not llm_response.content:\n            self.task_log.log_step(\n     
           \"error\",\n                \"LLM | Response Processing\",\n                \"❌ LLM response is empty or contains no content.\",\n            )\n            return \"\", True, message_history\n\n        # Extract response content\n        assistant_response_text = \"\"\n        assistant_response_content = []\n\n        from ...utils.parsing_utils import fix_server_name_in_text\n\n        for block in llm_response.content:\n            if block.type == \"text\":\n                assistant_response_text += block.text + \"\\n\"\n                assistant_response_content.append({\"type\": \"text\", \"text\": block.text})\n            elif block.type == \"tool_use\":\n                assistant_response_content.append(\n                    {\n                        \"type\": \"tool_use\",\n                        \"id\": block.id,\n                        \"name\": block.name,\n                        \"input\": block.input,\n                    }\n                )\n\n        # Fix server_name in text content\n        assistant_response_text = fix_server_name_in_text(assistant_response_text)\n        for item in assistant_response_content:\n            if item.get(\"type\") == \"text\":\n                item[\"text\"] = fix_server_name_in_text(item[\"text\"])\n\n        # Add assistant response to history\n        message_history.append(\n            {\"role\": \"assistant\", \"content\": assistant_response_content}\n        )\n\n        self.task_log.log_step(\n            \"info\", \"LLM | Response\", f\"LLM Response: {assistant_response_text}\"\n        )\n\n        return assistant_response_text, False, message_history\n\n    def extract_tool_calls_info(\n        self, llm_response: Any, assistant_response_text: str\n    ) -> List[Dict]:\n        \"\"\"Extract tool call information from LLM response\"\"\"\n        from ...utils.parsing_utils import parse_llm_response_for_tool_calls\n\n        return 
parse_llm_response_for_tool_calls(assistant_response_text)\n\n    def update_message_history(\n        self, message_history: List[Dict], all_tool_results_content_with_id: List[Tuple]\n    ) -> List[Dict]:\n        \"\"\"Update message history with tool calls data (llm client specific)\"\"\"\n\n        merged_text = \"\\n\".join(\n            [\n                item[1][\"text\"]\n                for item in all_tool_results_content_with_id\n                if item[1][\"type\"] == \"text\"\n            ]\n        )\n\n        message_history.append(\n            {\n                \"role\": \"user\",\n                \"content\": [{\"type\": \"text\", \"text\": merged_text}],\n            }\n        )\n\n        return message_history\n\n    def generate_agent_system_prompt(self, date: Any, mcp_servers: List[Dict]) -> str:\n        from ...utils.parsing_utils import set_tool_server_mapping\n\n        prompt = generate_mcp_system_prompt(date, mcp_servers)\n        set_tool_server_mapping(prompt)\n        return prompt\n\n    def _estimate_tokens(self, text: str) -> int:\n        \"\"\"Use tiktoken to estimate the number of tokens in text\"\"\"\n        if not hasattr(self, \"encoding\"):\n            # Initialize tiktoken encoder\n            try:\n                self.encoding = tiktoken.get_encoding(\"o200k_base\")\n            except Exception:\n                # If o200k_base is not available, use cl100k_base as fallback\n                self.encoding = tiktoken.get_encoding(\"cl100k_base\")\n\n        try:\n            return len(self.encoding.encode(text))\n        except Exception as e:\n            # If encoding fails, use simple estimation: approximately 1 token per 4 characters\n            self.task_log.log_step(\n                \"error\",\n                \"LLM | Token Estimation Error\",\n                f\"Error: {str(e)}\",\n            )\n            return len(text) // 4\n\n    def ensure_summary_context(\n        self, message_history: list, 
summary_prompt: str\n    ) -> tuple[bool, list]:\n        \"\"\"\n        Check if current message_history + summary_prompt will exceed context\n        If it will exceed, remove the last assistant-user pair and return False\n        Return True to continue, False if messages have been rolled back\n        \"\"\"\n        # Get token usage from the last LLM call\n        last_input_tokens = self.last_call_tokens.get(\"input_tokens\", 0)\n        last_output_tokens = self.last_call_tokens.get(\"output_tokens\", 0)\n        buffer_factor = 1.5\n\n        # Calculate token count for summary prompt\n        summary_tokens = int(self._estimate_tokens(str(summary_prompt)) * buffer_factor)\n\n        # Calculate token count for the last user message in message_history\n        last_user_tokens = 0\n        if message_history[-1][\"role\"] == \"user\":\n            content = message_history[-1][\"content\"]\n            last_user_tokens = int(self._estimate_tokens(str(content)) * buffer_factor)\n\n        # Calculate total token count: last input + output + last user message + summary + reserved response space\n        estimated_total = (\n            last_input_tokens\n            + last_output_tokens\n            + last_user_tokens\n            + summary_tokens\n            + self.max_tokens\n            + 1000  # Add 1000 tokens as buffer\n        )\n\n        if estimated_total >= self.max_context_length:\n            self.task_log.log_step(\n                \"info\",\n                \"LLM | Context Limit Reached\",\n                \"Context limit reached, proceeding to step back and summarize the conversation\",\n            )\n\n            # Remove the last user message (tool call results)\n            if message_history[-1][\"role\"] == \"user\":\n                message_history.pop()\n\n            # Remove the second-to-last assistant message (tool call request)\n            if message_history[-1][\"role\"] == \"assistant\":\n                
message_history.pop()\n\n            self.task_log.log_step(\n                \"info\",\n                \"LLM | Context Limit Reached\",\n                f\"Removed the last assistant-user pair, current message_history length: {len(message_history)}\",\n            )\n\n            return False, message_history\n\n        self.task_log.log_step(\n            \"info\",\n            \"LLM | Context Limit Not Reached\",\n            f\"{estimated_total}/{self.max_context_length}\",\n        )\n        return True, message_history\n\n    def format_token_usage_summary(self) -> tuple[List[str], str]:\n        \"\"\"Format token usage statistics, return summary_lines for format_final_summary and log string\"\"\"\n        token_usage = self.get_token_usage()\n\n        total_input = token_usage.get(\"total_input_tokens\", 0)\n        total_output = token_usage.get(\"total_output_tokens\", 0)\n        total_cache_creation = token_usage.get(\"total_cache_write_input_tokens\", 0)\n        total_cache_read = token_usage.get(\"total_cache_read_input_tokens\", 0)\n\n        summary_lines = []\n        summary_lines.append(\"\\n\" + \"-\" * 20 + \" Token Usage \" + \"-\" * 20)\n        summary_lines.append(f\"Total Input Tokens (non-cache): {total_input}\")\n        summary_lines.append(\n            f\"Total Cache Creation Input Tokens: {total_cache_creation}\"\n        )\n        summary_lines.append(f\"Total Cache Read Input Tokens: {total_cache_read}\")\n        summary_lines.append(f\"Total Output Tokens: {total_output}\")\n        summary_lines.append(\"-\" * (40 + len(\" Token Usage \")))\n        summary_lines.append(\"Pricing is disabled - no cost information available\")\n        summary_lines.append(\"-\" * (40 + len(\" Token Usage \")))\n\n        # Generate log string\n        log_string = (\n            f\"[{self.model_name}] Total Input: {total_input}, \"\n            f\"Cache Creation: {total_cache_creation}, \"\n            f\"Cache Read: {total_cache_read}, 
\"\n            f\"Output: {total_output}\"\n        )\n\n        return summary_lines, log_string\n\n    def get_token_usage(self):\n        return self.token_usage.copy()\n\n    def _apply_cache_control(self, messages: List[Dict]) -> List[Dict]:\n        \"\"\"Apply cache control to the last user message and system message (if applicable)\"\"\"\n        cached_messages = []\n        user_turns_processed = 0\n        for turn in reversed(messages):\n            if turn[\"role\"] == \"user\" and user_turns_processed < 1:\n                # Add ephemeral cache control to the text part of the last user message\n                new_content = []\n                processed_text = False\n                # Check if content is a list\n                if isinstance(turn[\"content\"], str):\n                    turn[\"content\"] = [{\"type\": \"text\", \"text\": turn[\"content\"]}]\n                if isinstance(turn.get(\"content\"), list):\n                    # see example here\n                    # https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching\n                    for item in turn[\"content\"]:\n                        if (\n                            item.get(\"type\") == \"text\"\n                            and len(item.get(\"text\")) > 0\n                            and not processed_text\n                        ):\n                            # Copy and add cache control\n                            text_item = item.copy()\n                            text_item[\"cache_control\"] = {\"type\": \"ephemeral\"}\n                            new_content.append(text_item)\n                            processed_text = True\n                        else:\n                            # Other types of content (like image) copied directly\n                            new_content.append(item.copy())\n                    cached_messages.append({\"role\": \"user\", \"content\": new_content})\n                else:\n                    # If content is not a 
list (e.g., plain text), add as is without cache control\n                    # Or adjust logic as needed\n                    self.task_log.log_step(\n                        \"warning\",\n                        \"LLM | Cache Control\",\n                        \"Warning: User message content is not in expected list format, cache control not applied.\",\n                    )\n                    cached_messages.append(turn)\n\n                user_turns_processed += 1\n            else:\n                # Add other messages directly\n                cached_messages.append(turn)\n        return list(reversed(cached_messages))\n"
  },
  {
    "path": "apps/miroflow-agent/src/llm/providers/openai_client.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nOpenAI-compatible LLM client implementation.\n\nThis module provides the OpenAIClient class for interacting with OpenAI's API\nand OpenAI-compatible endpoints (such as vLLM, Qwen, DeepSeek, etc.).\n\nFeatures:\n- Async and sync API support\n- Automatic retry with exponential backoff\n- Token usage tracking and context length management\n- MCP tool call parsing and response processing\n\"\"\"\n\nimport asyncio\nimport dataclasses\nimport logging\nfrom typing import Any, Dict, List, Tuple, Union\n\nimport tiktoken\nfrom openai import AsyncOpenAI, DefaultAsyncHttpxClient, DefaultHttpxClient, OpenAI\n\nfrom ...utils.prompt_utils import generate_mcp_system_prompt\nfrom ..base_client import BaseClient\n\nlogger = logging.getLogger(\"miroflow_agent\")\n\n\n@dataclasses.dataclass\nclass OpenAIClient(BaseClient):\n    def _create_client(self) -> Union[AsyncOpenAI, OpenAI]:\n        \"\"\"Create LLM client\"\"\"\n        http_client_args = {\"headers\": {\"x-upstream-session-id\": self.task_id}}\n        if self.async_client:\n            return AsyncOpenAI(\n                api_key=self.api_key,\n                base_url=self.base_url,\n                http_client=DefaultAsyncHttpxClient(**http_client_args),\n            )\n        else:\n            return OpenAI(\n                api_key=self.api_key,\n                base_url=self.base_url,\n                http_client=DefaultHttpxClient(**http_client_args),\n            )\n\n    def _update_token_usage(self, usage_data: Any) -> None:\n        \"\"\"Update cumulative token usage\"\"\"\n        if usage_data:\n            input_tokens = getattr(usage_data, \"prompt_tokens\", 0)\n            output_tokens = getattr(usage_data, \"completion_tokens\", 0)\n            prompt_tokens_details = getattr(usage_data, \"prompt_tokens_details\", None)\n            if prompt_tokens_details:\n                
cached_tokens = (\n                    getattr(prompt_tokens_details, \"cached_tokens\", None) or 0\n                )\n            else:\n                cached_tokens = 0\n\n            # Record token usage for the most recent call\n            self.last_call_tokens = {\n                \"prompt_tokens\": input_tokens,\n                \"completion_tokens\": output_tokens,\n            }\n\n            # OpenAI does not provide cache_creation_input_tokens\n            self.token_usage[\"total_input_tokens\"] += input_tokens\n            self.token_usage[\"total_output_tokens\"] += output_tokens\n            self.token_usage[\"total_cache_read_input_tokens\"] += cached_tokens\n\n            self.task_log.log_step(\n                \"info\",\n                \"LLM | Token Usage\",\n                f\"Input: {self.token_usage['total_input_tokens']}, \"\n                f\"Output: {self.token_usage['total_output_tokens']}\",\n            )\n\n    async def _create_message(\n        self,\n        system_prompt: str,\n        messages_history: List[Dict[str, Any]],\n        tools_definitions,\n        keep_tool_result: int = -1,\n    ):\n        \"\"\"\n        Send message to OpenAI API.\n        :param system_prompt: System prompt string.\n        :param messages_history: Message history list.\n        :param tools_definitions: Tool definitions (not referenced in this method body).\n        :param keep_tool_result: Forwarded to _remove_tool_result_from_messages when filtering the copy sent to the LLM.\n        :return: Tuple of (OpenAI API response object, original messages_history). Failures are raised as exceptions, not returned as None.\n        \"\"\"\n\n        # Create a copy for sending to LLM (to avoid modifying the original)\n        messages_for_llm = [m.copy() for m in messages_history]\n\n        # Put the system prompt in the first message, since the OpenAI API does not support a separate system prompt parameter\n        if system_prompt:\n            # Check if there's already a system or developer message\n            if messages_for_llm and messages_for_llm[0][\"role\"] in [\n                \"system\",\n                \"developer\",\n            ]:\n                messages_for_llm[0] = {\n                    \"role\": \"system\",\n               
     \"content\": system_prompt,\n                }\n\n            else:\n                messages_for_llm.insert(\n                    0,\n                    {\n                        \"role\": \"system\",\n                        \"content\": system_prompt,\n                    },\n                )\n\n        # Filter tool results to save tokens (only affects messages sent to LLM)\n        messages_for_llm = self._remove_tool_result_from_messages(\n            messages_for_llm, keep_tool_result\n        )\n\n        # Retry loop with dynamic max_tokens adjustment\n        max_retries = 10\n        base_wait_time = 30\n        current_max_tokens = self.max_tokens\n\n        for attempt in range(max_retries):\n            params = {\n                \"model\": self.model_name,\n                \"temperature\": self.temperature,\n                \"messages\": messages_for_llm,\n                \"stream\": False,\n                \"top_p\": self.top_p,\n                \"extra_body\": {},\n            }\n            # Check if the model is GPT-5, and adjust the parameter accordingly\n            if \"gpt-5\" in self.model_name:\n                # Use 'max_completion_tokens' for GPT-5\n                params[\"max_completion_tokens\"] = current_max_tokens\n            else:\n                # Use 'max_tokens' for GPT-4 and other models\n                params[\"max_tokens\"] = current_max_tokens\n\n            # Add repetition_penalty if it's not the default value\n            if self.repetition_penalty != 1.0:\n                params[\"extra_body\"][\"repetition_penalty\"] = self.repetition_penalty\n\n            if \"deepseek-v3-1\" in self.model_name:\n                params[\"extra_body\"][\"thinking\"] = {\"type\": \"enabled\"}\n\n            # auto-detect if we need to continue from the last assistant message\n            if messages_for_llm and messages_for_llm[-1].get(\"role\") == \"assistant\":\n                
params[\"extra_body\"][\"continue_final_message\"] = True\n                params[\"extra_body\"][\"add_generation_prompt\"] = False\n\n            try:\n                if self.async_client:\n                    response = await self.client.chat.completions.create(**params)\n                else:\n                    response = self.client.chat.completions.create(**params)\n                # Update token count\n                self._update_token_usage(getattr(response, \"usage\", None))\n                self.task_log.log_step(\n                    \"info\",\n                    \"LLM | Response Status\",\n                    f\"{getattr(response.choices[0], 'finish_reason', 'N/A')}\",\n                )\n\n                # Check if response was truncated due to length limit\n                finish_reason = getattr(response.choices[0], \"finish_reason\", None)\n                if finish_reason == \"length\":\n                    # If this is not the last retry, increase max_tokens and retry\n                    if attempt < max_retries - 1:\n                        # Increase max_tokens by 10%\n                        current_max_tokens = int(current_max_tokens * 1.1)\n                        self.task_log.log_step(\n                            \"warning\",\n                            \"LLM | Length Limit Reached\",\n                            f\"Response was truncated due to length limit (attempt {attempt + 1}/{max_retries}). 
Increasing max_tokens to {current_max_tokens} and retrying...\",\n                        )\n                        await asyncio.sleep(base_wait_time)\n                        continue\n                    else:\n                        # Last retry, return the truncated response instead of raising exception\n                        self.task_log.log_step(\n                            \"warning\",\n                            \"LLM | Length Limit Reached - Returning Truncated Response\",\n                            f\"Response was truncated after {max_retries} attempts. Returning truncated response to allow ReAct loop to continue.\",\n                        )\n                        # Return the truncated response and let the orchestrator handle it\n                        return response, messages_history\n\n                # Check if the last 50 characters of the response appear more than 5 times in the response content.\n                # If so, treat it as a severe repeat and trigger a retry.\n                if hasattr(response.choices[0], \"message\") and hasattr(\n                    response.choices[0].message, \"content\"\n                ):\n                    resp_content = response.choices[0].message.content or \"\"\n                else:\n                    resp_content = getattr(response.choices[0], \"text\", \"\")\n\n                if resp_content and len(resp_content) >= 50:\n                    tail_50 = resp_content[-50:]\n                    repeat_count = resp_content.count(tail_50)\n                    if repeat_count > 5:\n                        # If this is not the last retry, retry\n                        if attempt < max_retries - 1:\n                            self.task_log.log_step(\n                                \"warning\",\n                                \"LLM | Repeat Detected\",\n                                f\"Severe repeat: the last 50 chars appeared over 5 times (attempt {attempt + 1}/{max_retries}), 
retrying...\",\n                            )\n                            await asyncio.sleep(base_wait_time)\n                            continue\n                        else:\n                            # Last retry, return anyway\n                            self.task_log.log_step(\n                                \"warning\",\n                                \"LLM | Repeat Detected - Returning Anyway\",\n                                f\"Severe repeat detected after {max_retries} attempts. Returning response anyway.\",\n                            )\n\n                # Success - return the original messages_history (not the filtered copy)\n                # This ensures that the complete conversation history is preserved in logs\n                return response, messages_history\n\n            except asyncio.TimeoutError as e:\n                if attempt < max_retries - 1:\n                    self.task_log.log_step(\n                        \"warning\",\n                        \"LLM | Timeout Error\",\n                        f\"Timeout error (attempt {attempt + 1}/{max_retries}): {str(e)}, retrying...\",\n                    )\n                    await asyncio.sleep(base_wait_time)\n                    continue\n                else:\n                    self.task_log.log_step(\n                        \"error\",\n                        \"LLM | Timeout Error\",\n                        f\"Timeout error after {max_retries} attempts: {str(e)}\",\n                    )\n                    raise e\n            except asyncio.CancelledError as e:\n                self.task_log.log_step(\n                    \"error\",\n                    \"LLM | Request Cancelled\",\n                    f\"Request was cancelled: {str(e)}\",\n                )\n                raise e\n            except Exception as e:\n                if \"Error code: 400\" in str(e) and \"longer than the model\" in str(e):\n                    self.task_log.log_step(\n                 
       \"error\",\n                        \"LLM | Context Length Error\",\n                        f\"Error: {str(e)}\",\n                    )\n                    raise e\n                else:\n                    if attempt < max_retries - 1:\n                        self.task_log.log_step(\n                            \"warning\",\n                            \"LLM | API Error\",\n                            f\"Error (attempt {attempt + 1}/{max_retries}): {str(e)}, retrying...\",\n                        )\n                        await asyncio.sleep(base_wait_time)\n                        continue\n                    else:\n                        self.task_log.log_step(\n                            \"error\",\n                            \"LLM | API Error\",\n                            f\"Error after {max_retries} attempts: {str(e)}\",\n                        )\n                        raise e\n\n        # Should never reach here, but just in case\n        raise Exception(\"Unexpected error: retry loop completed without returning\")\n\n    def process_llm_response(\n        self, llm_response: Any, message_history: List[Dict], agent_type: str = \"main\"\n    ) -> tuple[str, bool, List[Dict]]:\n        \"\"\"Process LLM response\"\"\"\n        if not llm_response or not llm_response.choices:\n            error_msg = \"LLM did not return a valid response.\"\n            self.task_log.log_step(\n                \"error\", \"LLM | Response Error\", f\"Error: {error_msg}\"\n            )\n            return \"\", True, message_history  # Exit loop, return message_history\n\n        # Extract LLM response text\n        from ...utils.parsing_utils import fix_server_name_in_text\n\n        if llm_response.choices[0].finish_reason == \"stop\":\n            assistant_response_text = llm_response.choices[0].message.content or \"\"\n            assistant_response_text = fix_server_name_in_text(assistant_response_text)\n\n            message_history.append(\n        
        {\"role\": \"assistant\", \"content\": assistant_response_text}\n            )\n\n        elif llm_response.choices[0].finish_reason == \"length\":\n            assistant_response_text = llm_response.choices[0].message.content or \"\"\n            assistant_response_text = fix_server_name_in_text(assistant_response_text)\n            if assistant_response_text == \"\":\n                assistant_response_text = \"LLM response is empty.\"\n            elif \"Context length exceeded\" in assistant_response_text:\n                # This is the case where context length is exceeded, needs special handling\n                self.task_log.log_step(\n                    \"warning\",\n                    \"LLM | Context Length\",\n                    \"Detected context length exceeded, returning error status\",\n                )\n                message_history.append(\n                    {\"role\": \"assistant\", \"content\": assistant_response_text}\n                )\n                return (\n                    assistant_response_text,\n                    True,\n                    message_history,\n                )  # Return True to indicate need to exit loop\n\n            # Add assistant response to history\n            message_history.append(\n                {\"role\": \"assistant\", \"content\": assistant_response_text}\n            )\n\n        else:\n            raise ValueError(\n                f\"Unsupported finish reason: {llm_response.choices[0].finish_reason}\"\n            )\n\n        return assistant_response_text, False, message_history\n\n    def extract_tool_calls_info(\n        self, llm_response: Any, assistant_response_text: str\n    ) -> List[Dict]:\n        \"\"\"Extract tool call information from LLM response\"\"\"\n        from ...utils.parsing_utils import parse_llm_response_for_tool_calls\n\n        return parse_llm_response_for_tool_calls(assistant_response_text)\n\n    def update_message_history(\n        self, 
message_history: List[Dict], all_tool_results_content_with_id: List[Tuple]\n    ) -> List[Dict]:\n        \"\"\"Update message history with tool calls data (llm client specific)\"\"\"\n\n        merged_text = \"\\n\".join(\n            [\n                item[1][\"text\"]\n                for item in all_tool_results_content_with_id\n                if item[1][\"type\"] == \"text\"\n            ]\n        )\n\n        message_history.append(\n            {\n                \"role\": \"user\",\n                \"content\": merged_text,\n            }\n        )\n\n        return message_history\n\n    def generate_agent_system_prompt(self, date: Any, mcp_servers: List[Dict]) -> str:\n        from ...utils.parsing_utils import set_tool_server_mapping\n\n        prompt = generate_mcp_system_prompt(date, mcp_servers)\n        set_tool_server_mapping(prompt)\n        return prompt\n\n    def _estimate_tokens(self, text: str) -> int:\n        \"\"\"Use tiktoken to estimate the number of tokens in text\"\"\"\n        if not hasattr(self, \"encoding\"):\n            # Initialize tiktoken encoder\n            try:\n                self.encoding = tiktoken.get_encoding(\"o200k_base\")\n            except Exception:\n                # If o200k_base is not available, use cl100k_base as fallback\n                self.encoding = tiktoken.get_encoding(\"cl100k_base\")\n\n        try:\n            return len(self.encoding.encode(text))\n        except Exception as e:\n            # If encoding fails, use simple estimation: approximately 1 token per 4 characters\n            self.task_log.log_step(\n                \"error\",\n                \"LLM | Token Estimation Error\",\n                f\"Error: {str(e)}\",\n            )\n            return len(text) // 4\n\n    def ensure_summary_context(\n        self, message_history: list, summary_prompt: str\n    ) -> tuple[bool, list]:\n        \"\"\"\n        Check if current message_history + summary_prompt will exceed context\n     
   If it will exceed, remove the last assistant-user pair and return False\n        Return True to continue, False if messages have been rolled back\n        \"\"\"\n        # Get token usage from the last LLM call\n        last_prompt_tokens = self.last_call_tokens.get(\"prompt_tokens\", 0)\n        last_completion_tokens = self.last_call_tokens.get(\"completion_tokens\", 0)\n        buffer_factor = 1.5\n\n        # Calculate token count for summary prompt\n        summary_tokens = int(self._estimate_tokens(summary_prompt) * buffer_factor)\n\n        # Calculate token count for the last user message in message_history\n        last_user_tokens = 0\n        if message_history[-1][\"role\"] == \"user\":\n            content = message_history[-1][\"content\"]\n            last_user_tokens = int(self._estimate_tokens(str(content)) * buffer_factor)\n\n        # Calculate total token count: last prompt + completion + last user message + summary + reserved response space\n        estimated_total = (\n            last_prompt_tokens\n            + last_completion_tokens\n            + last_user_tokens\n            + summary_tokens\n            + self.max_tokens\n            + 1000  # Add 1000 tokens as buffer\n        )\n\n        if estimated_total >= self.max_context_length:\n            self.task_log.log_step(\n                \"info\",\n                \"LLM | Context Limit Reached\",\n                \"Context limit reached, proceeding to step back and summarize the conversation\",\n            )\n\n            # Remove the last user message (tool call results)\n            if message_history[-1][\"role\"] == \"user\":\n                message_history.pop()\n\n            # Remove the second-to-last assistant message (tool call request)\n            if message_history[-1][\"role\"] == \"assistant\":\n                message_history.pop()\n\n            self.task_log.log_step(\n                \"info\",\n                \"LLM | Context Limit Reached\",\n                
f\"Removed the last assistant-user pair, current message_history length: {len(message_history)}\",\n            )\n\n            return False, message_history\n\n        self.task_log.log_step(\n            \"info\",\n            \"LLM | Context Limit Not Reached\",\n            f\"{estimated_total}/{self.max_context_length}\",\n        )\n        return True, message_history\n\n    def format_token_usage_summary(self) -> tuple[List[str], str]:\n        \"\"\"Format token usage statistics, return summary_lines for format_final_summary and log string\"\"\"\n        token_usage = self.get_token_usage()\n\n        total_input = token_usage.get(\"total_input_tokens\", 0)\n        total_output = token_usage.get(\"total_output_tokens\", 0)\n        cache_input = token_usage.get(\"total_cache_read_input_tokens\", 0)\n\n        summary_lines = []\n        summary_lines.append(\"\\n\" + \"-\" * 20 + \" Token Usage \" + \"-\" * 20)\n        summary_lines.append(f\"Total Input Tokens: {total_input}\")\n        summary_lines.append(f\"Total Cache Input Tokens: {cache_input}\")\n        summary_lines.append(f\"Total Output Tokens: {total_output}\")\n        summary_lines.append(\"-\" * (40 + len(\" Token Usage \")))\n        summary_lines.append(\"Pricing is disabled - no cost information available\")\n        summary_lines.append(\"-\" * (40 + len(\" Token Usage \")))\n\n        # Generate log string\n        log_string = (\n            f\"[{self.model_name}] Total Input: {total_input}, \"\n            f\"Cache Input: {cache_input}, \"\n            f\"Output: {total_output}\"\n        )\n\n        return summary_lines, log_string\n\n    def get_token_usage(self):\n        return self.token_usage.copy()\n"
  },
  {
    "path": "apps/miroflow-agent/src/llm/util.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nUtility decorators and helpers for LLM client operations.\n\nThis module provides:\n- Timeout decorator for async LLM API calls\n- Other common utilities shared across LLM providers\n\"\"\"\n\nimport asyncio\nimport functools\nfrom typing import Awaitable, Callable, TypeVar\n\nT = TypeVar(\"T\")\n\n\ndef with_timeout(\n    timeout_s: float = 300.0,\n) -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:\n    \"\"\"\n    Decorator: wraps any *async* function in asyncio.wait_for().\n    Usage:\n        @with_timeout(20)\n        async def create_message_foo(...): ...\n    \"\"\"\n\n    def decorator(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:\n        @functools.wraps(func)\n        async def wrapper(*args, **kwargs) -> T:\n            return await asyncio.wait_for(func(*args, **kwargs), timeout=timeout_s)\n\n        return wrapper\n\n    return decorator\n"
  },
  {
    "path": "apps/miroflow-agent/src/logging/__init__.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"Logging module for task execution tracking.\"\"\"\n\nfrom .task_logger import (\n    LLMCallLog,\n    StepLog,\n    TaskLog,\n    ToolCallLog,\n    bootstrap_logger,\n    get_utc_plus_8_time,\n)\n\n__all__ = [\n    \"TaskLog\",\n    \"StepLog\",\n    \"LLMCallLog\",\n    \"ToolCallLog\",\n    \"bootstrap_logger\",\n    \"get_utc_plus_8_time\",\n]\n"
  },
  {
    "path": "apps/miroflow-agent/src/logging/summary_time_cost.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport json\nfrom collections import defaultdict\nfrom pathlib import Path\n\nfrom .task_logger import logger\n\n\ndef _get_summary_template():\n    \"\"\"Returns a template for the summary data structure.\"\"\"\n    return {\n        \"total_tasks\": 0,\n        \"total_wall_time\": 0.0,\n        \"primary_breakdown\": {\n            \"main_agent\": defaultdict(float),\n            \"browsing_agent\": defaultdict(float),\n        },\n        \"cross_cutting_breakdown\": defaultdict(float),\n        \"tool_workload_breakdown\": defaultdict(float),\n    }\n\n\ndef _update_summary_data(summary_block, perf_summary, tool_workload):\n    \"\"\"Updates a summary block with data from a single result.\"\"\"\n    summary_block[\"total_tasks\"] += 1\n    summary_block[\"total_wall_time\"] += perf_summary.get(\"total_wall_time\", 0.0)\n\n    # Update primary breakdown\n    primary_breakdown = perf_summary.get(\"primary_breakdown\", {})\n    for agent, data in primary_breakdown.items():\n        if agent in summary_block[\"primary_breakdown\"]:\n            for key, value in data.items():\n                summary_block[\"primary_breakdown\"][agent][key] += value\n\n    # Update cross-cutting breakdown\n    cross_cutting_breakdown = perf_summary.get(\"cross_cutting_breakdown\", {})\n    for key, value in cross_cutting_breakdown.items():\n        summary_block[\"cross_cutting_breakdown\"][key] += value\n\n    # Update tool workload breakdown\n    for key, value in tool_workload.items():\n        summary_block[\"tool_workload_breakdown\"][key] += value\n\n\ndef _calculate_averages(summary_block):\n    \"\"\"Calculates and adds average values to a summary block.\"\"\"\n    num_tasks = summary_block[\"total_tasks\"]\n    if num_tasks == 0:\n        return\n\n    summary_block[\"average_wall_time\"] = summary_block[\"total_wall_time\"] / num_tasks\n\n    # Calculate 
averages for primary breakdown\n    for agent, data in summary_block[\"primary_breakdown\"].items():\n        summary_block[\"primary_breakdown\"][agent] = dict(data)  # Convert back to dict\n        avg_data = {f\"avg_{k}\": v / num_tasks for k, v in data.items()}\n        summary_block[\"primary_breakdown\"][agent].update(avg_data)\n\n    # Calculate averages for cross-cutting breakdown\n    summary_block[\"cross_cutting_breakdown\"] = dict(\n        summary_block[\"cross_cutting_breakdown\"]\n    )\n    avg_cross_cutting = {\n        f\"avg_{k}\": v / num_tasks\n        for k, v in summary_block[\"cross_cutting_breakdown\"].items()\n    }\n    summary_block[\"cross_cutting_breakdown\"].update(avg_cross_cutting)\n\n    # Calculate averages for tool workload breakdown\n    summary_block[\"tool_workload_breakdown\"] = dict(\n        summary_block[\"tool_workload_breakdown\"]\n    )\n    avg_tool_workload = {\n        f\"avg_{k}\": v / num_tasks\n        for k, v in summary_block[\"tool_workload_breakdown\"].items()\n    }\n    summary_block[\"tool_workload_breakdown\"].update(avg_tool_workload)\n\n\ndef generate_summary(log_dir: Path):\n    \"\"\"\n    Generates a summary of benchmark results by reading log files from a directory,\n    calculating total and average trace data, both overall and grouped by\n    final_judge_result.\n\n    Args:\n        log_dir: The directory where the individual result log files are and where\n                 the summary file will be saved.\n    \"\"\"\n    results = []\n    for log_file in log_dir.glob(\"*.json\"):\n        if log_file.name == \"summary.json\":\n            continue\n        try:\n            with open(log_file, \"r\", encoding=\"utf-8\") as f:\n                results.append(json.load(f))\n        except json.JSONDecodeError:\n            logger.info(f\"Warning: Could not decode JSON from {log_file}. 
Skipping.\")\n        except Exception as e:\n            logger.info(f\"Warning: Could not read file {log_file}: {e}. Skipping.\")\n\n    overall_summary = _get_summary_template()\n    summary_by_judge = defaultdict(_get_summary_template)\n\n    for result in results:\n        trace_data = result.get(\"trace_data\")\n        if not trace_data or \"performance_summary\" not in trace_data:\n            continue\n\n        perf_summary = trace_data[\"performance_summary\"]\n        tool_workload = trace_data.get(\"tool_workload_breakdown\", {})\n\n        # Update overall summary\n        _update_summary_data(overall_summary, perf_summary, tool_workload)\n\n        # Update summary by judge result\n        judge_result = result.get(\"final_judge_result\", \"unknown\")\n        _update_summary_data(\n            summary_by_judge[judge_result], perf_summary, tool_workload\n        )\n\n    # Calculate averages for all summary blocks\n    _calculate_averages(overall_summary)\n    for judge_result in summary_by_judge:\n        _calculate_averages(summary_by_judge[judge_result])\n\n    summary_data = {\n        \"overall_summary\": overall_summary,\n        \"summary_by_final_judge_result\": dict(summary_by_judge),\n    }\n\n    summary_file = log_dir / \"summary_time_cost.json\"\n    with open(summary_file, \"w\", encoding=\"utf-8\") as f:\n        json.dump(summary_data, f, indent=4, ensure_ascii=False)\n"
  },
  {
    "path": "apps/miroflow-agent/src/logging/task_logger.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nTask logging and structured output module.\n\nThis module provides:\n- TaskLog: Main dataclass for tracking task execution state and history\n- StepLog: Individual step logging with timestamps and metadata\n- ColoredFormatter: Console output formatting with color-coded log levels\n- Utility functions for time handling and logger configuration\n\nAll logs are persisted to JSON files for later analysis and debugging.\n\"\"\"\n\nimport json\nimport logging\nimport os\nfrom dataclasses import asdict, dataclass, field\nfrom datetime import datetime, timedelta, timezone\nfrom pathlib import Path\nfrom typing import Any, Dict, List, Literal, Optional\n\n# Import colorama for cross-platform colored output\nfrom colorama import Fore, Style, init\n\n# Initialize colorama\ninit(autoreset=True, strip=False)\n\n# This will be set to the configured logger instance\nlogger = None\n\n\ndef get_color_for_level(level: str) -> str:\n    \"\"\"Get color code based on log level for better visual distinction\"\"\"\n    if level == \"ERROR\":\n        return f\"{Fore.RED}{Style.BRIGHT}\"\n    elif level == \"WARNING\":\n        return f\"{Fore.YELLOW}{Style.BRIGHT}\"\n    elif level == \"INFO\":\n        return f\"{Fore.GREEN}{Style.BRIGHT}\"\n    elif level == \"DEBUG\":\n        return f\"{Fore.CYAN}{Style.BRIGHT}\"\n    else:\n        return f\"{Fore.WHITE}{Style.BRIGHT}\"\n\n\nclass ColoredFormatter(logging.Formatter):\n    \"\"\"Custom formatter that adds colors for better developer visualization\"\"\"\n\n    def format(self, record):\n        # Get timestamp and format it\n        timestamp = self.formatTime(record, self.datefmt)\n\n        # Color the level name based on severity\n        level_color = get_color_for_level(record.levelname)\n        level_reset = Style.RESET_ALL\n\n        # Color the logger name (miroflow_agent)\n        name_color = 
f\"{Fore.BLUE}{Style.BRIGHT}\"\n        name_reset = Style.RESET_ALL\n\n        # Get the message as is (icons are already added in log_step)\n        message = record.getMessage()\n\n        # Format with selective coloring\n        formatted = f\"[{timestamp}][{name_color}{record.name}{name_reset}][{level_color}{record.levelname}{level_reset}] - {message}\"\n\n        return formatted\n\n\ndef bootstrap_logger() -> logging.Logger:\n    \"\"\"Configure the miroflow_agent logger with consistent formatting\"\"\"\n\n    global logger\n\n    # Configure miroflow_agent logger\n    miroflow_agent_logger = logging.getLogger(\"miroflow_agent\")\n\n    # Check if logger already has handlers to prevent duplicate configuration\n    if miroflow_agent_logger.handlers:\n        logger = miroflow_agent_logger\n        return miroflow_agent_logger\n\n    # Create formatter with consistent format\n    formatter = ColoredFormatter(\n        \"%(asctime)s,%(msecs)03d\",\n        datefmt=\"%Y-%m-%d %H:%M:%S\",\n    )\n\n    # Add our handler with the specified formatter\n    handler = logging.StreamHandler()\n    handler.setFormatter(formatter)\n    miroflow_agent_logger.addHandler(handler)\n    miroflow_agent_logger.setLevel(logging.DEBUG)\n\n    # Disable propagation to prevent duplicate logging from root logger\n    miroflow_agent_logger.propagate = False\n\n    # Set the global logger variable\n    logger = miroflow_agent_logger\n\n    return miroflow_agent_logger\n\n\ndef get_utc_plus_8_time() -> str:\n    \"\"\"Get UTC+8 timezone current time string\"\"\"\n    utc_plus_8 = timezone(timedelta(hours=8))\n    return datetime.now(utc_plus_8).strftime(\"%Y-%m-%d %H:%M:%S\")\n\n\n@dataclass\nclass LLMCallLog:\n    \"\"\"Record technical details of LLM calls\"\"\"\n\n    provider: str\n    model: str\n    input_tokens: int = 0\n    output_tokens: int = 0\n    cache_creation_tokens: int = 0\n    cache_read_tokens: int = 0\n    error: Optional[str] = None\n\n\n@dataclass\nclass 
ToolCallLog:\n    \"\"\"Record detailed information of tool calls\"\"\"\n\n    server_name: str\n    tool_name: str\n    arguments: Dict[str, Any] = field(default_factory=dict)\n    result: Any = None\n    error: Optional[str] = None\n    call_time: Optional[str] = None\n\n\n@dataclass\nclass StepLog:\n    \"\"\"Record detailed information of task execution steps\"\"\"\n\n    step_name: str\n    message: str\n    timestamp: str\n    info_level: Literal[\"info\", \"warning\", \"error\", \"debug\"] = \"info\"\n    metadata: Dict[str, Any] = field(default_factory=dict)\n\n    def __post_init__(self):\n        \"\"\"Validate info_level after initialization\"\"\"\n        valid_levels = {\"info\", \"warning\", \"error\", \"debug\"}\n        if self.info_level not in valid_levels:\n            raise ValueError(\n                f\"info_level must be one of {valid_levels}, got '{self.info_level}'\"\n            )\n\n\n@dataclass\nclass TaskLog:\n    status: str = \"running\"\n    start_time: str = \"\"\n    end_time: str = \"\"\n\n    task_id: str = \"\"\n    input: Any = None\n    ground_truth: str = \"\"\n    final_boxed_answer: str = \"\"\n    final_judge_result: str = \"\"\n    judge_type: str = \"\"\n    eval_details: Optional[Dict[str, Any]] = None  # For DeepSearchQA metrics\n    error: str = \"\"\n\n    # Main records: main agent conversation turns\n    current_main_turn_id: int = 0\n    current_sub_agent_turn_id: int = 0\n    sub_agent_counter: int = 0\n    current_sub_agent_session_id: Optional[str] = None\n\n    env_info: Optional[dict] = field(default_factory=dict)\n    log_dir: str = \"logs\"\n\n    main_agent_message_history: List[Dict[str, Any]] = field(default_factory=list)\n    sub_agent_message_history_sessions: Dict[str, List[Dict[str, Any]]] = field(\n        default_factory=dict\n    )\n\n    step_logs: List[StepLog] = field(default_factory=list)\n    trace_data: Dict[str, Any] = field(default_factory=dict)\n\n    def start_sub_agent_session(\n        
self, sub_agent_name: str, subtask_description: str\n    ) -> str:\n        \"\"\"Start a new sub-agent session\"\"\"\n        self.sub_agent_counter += 1\n        session_id = f\"{sub_agent_name}_{self.sub_agent_counter}\"\n        self.current_sub_agent_session_id = session_id\n\n        # Record sub-agent session start\n        self.log_step(\n            \"info\",\n            f\"{sub_agent_name} | Session Start\",\n            f\"Starting {session_id} for subtask: {subtask_description[:100]}{'...' if len(subtask_description) > 100 else ''}\",\n            metadata={\"session_id\": session_id, \"subtask\": subtask_description},\n        )\n\n        return session_id\n\n    def end_sub_agent_session(self, sub_agent_name: str) -> Optional[str]:\n        \"\"\"End the current sub-agent session\"\"\"\n        self.log_step(\n            \"info\",\n            f\"{sub_agent_name} | Session End\",\n            f\"Ending {self.current_sub_agent_session_id}\",\n            metadata={\"session_id\": self.current_sub_agent_session_id},\n        )\n        self.current_sub_agent_session_id = None\n        return None\n\n    def log_step(\n        self,\n        info_level: Literal[\"info\", \"warning\", \"error\", \"debug\"],\n        step_name: str,\n        message: str,\n        metadata: Optional[Dict[str, Any]] = None,\n    ):\n        \"\"\"Record execution step\"\"\"\n        # Add icons to step_name based on content\n        icon = \"\"\n        if \"Tool Call Start\" in step_name:\n            icon = \"▶️ \"\n        elif \"Tool Call Success\" in step_name:\n            icon = \"✅ \"\n        elif \"Tool Call Error\" in step_name or (\n            \"error\" in info_level and \"tool\" in step_name.lower()\n        ):\n            icon = \"❌ \"\n        elif \"agent-\" in step_name:\n            icon = \"🤖 \"\n        elif \"Main Agent\" in step_name:\n            icon = \"👑 \"\n        elif \"LLM\" in step_name:\n            icon = \"🧠 \"\n        elif 
\"ToolManager\" in step_name or \"Tool Call\" in step_name:\n            icon = \"🔧 \"\n        elif \"tool-python\" in step_name.lower():\n            icon = \"🐍 \"\n        elif \"tool-google-search\" in step_name.lower():\n            icon = \"🔍 \"\n        elif \"tool-browser\" in step_name.lower() or \"playwright\" in step_name.lower():\n            icon = \"🌐 \"\n\n        # Add icon to step_name\n        step_name_with_icon = f\"{icon}{step_name}\"\n\n        step_log = StepLog(\n            step_name=step_name_with_icon,\n            message=message,\n            timestamp=get_utc_plus_8_time(),\n            info_level=info_level,\n            metadata=metadata or {},\n        )\n\n        self.step_logs.append(step_log)\n\n        # Print the structured log to console using the configured logger\n        log_message = f\"{step_name_with_icon}: {message}\"\n\n        # Ensure logger is configured\n        global logger\n        if logger is None:\n            logger = bootstrap_logger()\n\n        if info_level == \"error\":\n            logger.error(log_message)\n        elif info_level == \"warning\":\n            logger.warning(log_message)\n        elif info_level == \"debug\":\n            logger.debug(log_message)\n        else:  # info\n            logger.info(log_message)\n\n    def serialize_for_json(self, obj):\n        \"\"\"Convert objects to JSON-serializable format\"\"\"\n        if isinstance(obj, Path):\n            return str(obj)\n        elif isinstance(obj, dict):\n            return {k: self.serialize_for_json(v) for k, v in obj.items()}\n        elif isinstance(obj, list):\n            return [self.serialize_for_json(item) for item in obj]\n        elif hasattr(obj, \"__dict__\"):\n            return self.serialize_for_json(obj.__dict__)\n        else:\n            return obj\n\n    def to_json(self) -> str:\n        \"\"\"\n        Serialize the TaskLog to a JSON string.\n\n        Converts the dataclass to a dictionary, handles 
non-JSON-serializable\n        objects (like Path), and returns a formatted JSON string.\n\n        Returns:\n            A JSON string representation of the task log with 2-space indentation.\n\n        Note:\n            Falls back to ASCII encoding if Unicode encoding fails.\n        \"\"\"\n        # Convert to dict first\n        data_dict = asdict(self)\n        # Serialize any non-JSON-serializable objects\n        serialized_dict = self.serialize_for_json(data_dict)\n        try:\n            return json.dumps(serialized_dict, ensure_ascii=False, indent=2)\n        except UnicodeEncodeError as e:\n            # Fallback: try with ASCII encoding if Unicode fails\n            print(f\"Warning: Unicode encoding failed, falling back to ASCII: {e}\")\n            return json.dumps(serialized_dict, ensure_ascii=True, indent=2)\n\n    def save(self):\n        \"\"\"Save as a single JSON file\"\"\"\n        os.makedirs(self.log_dir, exist_ok=True)\n        timestamp = (\n            self.start_time.replace(\":\", \"-\").replace(\".\", \"-\").replace(\" \", \"-\")\n        )\n\n        filename = f\"{self.log_dir}/task_{self.task_id}_{timestamp}.json\"\n        try:\n            with open(filename, \"w\", encoding=\"utf-8\") as f:\n                f.write(self.to_json())\n        except UnicodeEncodeError as e:\n            # Fallback: try with different encoding if UTF-8 fails\n            print(f\"Warning: UTF-8 encoding failed, trying with system default: {e}\")\n            with open(filename, \"w\") as f:\n                f.write(self.to_json())\n        return filename\n\n    @classmethod\n    def from_dict(cls, d: dict) -> \"TaskLog\":\n        \"\"\"\n        Create a TaskLog instance from a dictionary.\n\n        Args:\n            d: Dictionary containing TaskLog field values.\n\n        Returns:\n            A new TaskLog instance initialized with the dictionary values.\n\n        Note:\n            The dictionary keys should match the TaskLog field 
names.\n        \"\"\"\n        return cls(**d)\n"
  },
  {
    "path": "apps/miroflow-agent/src/utils/__init__.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"Utility functions for parsing, prompts, and wrappers.\"\"\"\n\nfrom .parsing_utils import (\n    extract_failure_experience_summary,\n    extract_llm_response_text,\n    fix_server_name_in_text,\n    parse_llm_response_for_tool_calls,\n    safe_json_loads,\n    set_tool_server_mapping,\n)\nfrom .prompt_utils import (\n    FORMAT_ERROR_MESSAGE,\n    generate_agent_specific_system_prompt,\n    generate_agent_summarize_prompt,\n    generate_mcp_system_prompt,\n)\nfrom .wrapper_utils import ErrorBox, ResponseBox\n\n__all__ = [\n    # parsing_utils\n    \"parse_llm_response_for_tool_calls\",\n    \"extract_llm_response_text\",\n    \"extract_failure_experience_summary\",\n    \"fix_server_name_in_text\",\n    \"set_tool_server_mapping\",\n    \"safe_json_loads\",\n    # prompt_utils\n    \"FORMAT_ERROR_MESSAGE\",\n    \"generate_mcp_system_prompt\",\n    \"generate_agent_specific_system_prompt\",\n    \"generate_agent_summarize_prompt\",\n    # wrapper_utils\n    \"ErrorBox\",\n    \"ResponseBox\",\n]\n"
  },
  {
    "path": "apps/miroflow-agent/src/utils/parsing_utils.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nParsing utilities for LLM responses and tool calls.\n\nThis module provides functions for:\n- Parsing tool calls from LLM responses (both OpenAI and MCP formats)\n- Extracting text content from responses\n- Safe JSON parsing with automatic repair\n- Failure experience summary extraction\n\"\"\"\n\nimport json\nimport logging\nimport re\nfrom typing import Any, Dict, List, Union\n\nfrom json_repair import repair_json\n\nlogger = logging.getLogger(\"miroflow_agent\")\n\n\ndef parse_tool_server_mapping(system_prompt: str) -> dict:\n    \"\"\"\n    Parse system prompt to extract tool_name → server_name mapping.\n\n    Parses patterns like:\n        ## Server name: tool-python\n        ### Tool name: run_python_code\n\n    Only extracts mappings for the 3 target tools that models commonly get wrong:\n    run_python_code, google_search, scrape_and_extract_info.\n\n    Args:\n        system_prompt: The system prompt containing MCP tool definitions\n\n    Returns:\n        Dict mapping tool_name to correct server_name, e.g.\n        {\"run_python_code\": \"tool-python\", \"google_search\": \"search_and_scrape_webpage\", ...}\n    \"\"\"\n    TARGET_TOOLS = {\"run_python_code\", \"google_search\", \"scrape_and_extract_info\"}\n    mapping = {}\n    current_server = None\n    for line in system_prompt.split(\"\\n\"):\n        server_match = re.match(r\"## Server name:\\s*(.+)\", line)\n        if server_match:\n            current_server = server_match.group(1).strip()\n            continue\n        tool_match = re.match(r\"### Tool name:\\s*(.+)\", line)\n        if tool_match and current_server:\n            tool_name = tool_match.group(1).strip()\n            if tool_name in TARGET_TOOLS:\n                mapping[tool_name] = current_server\n    return mapping\n\n\n# Module-level cache for tool_server_mapping\n_tool_server_mapping: dict = {}\n\n\ndef 
set_tool_server_mapping(system_prompt: str) -> None:\n    \"\"\"\n    Parse system prompt and cache the tool_name → server_name mapping.\n\n    Should be called once when system prompt is available.\n\n    Args:\n        system_prompt: The system prompt containing MCP tool definitions\n    \"\"\"\n    global _tool_server_mapping\n    _tool_server_mapping = parse_tool_server_mapping(system_prompt)\n\n\ndef fix_server_name_in_text(text: str) -> str:\n    \"\"\"\n    Fix incorrect server_name and tool_name in MCP XML tool calls.\n\n    Uses the cached tool_server_mapping (parsed from system prompt) to determine\n    the correct server_name for each tool. Only fixes the 3 target tools:\n    run_python_code, google_search, scrape_and_extract_info.\n\n    Also handles the special case where model outputs tool_name=python\n    (should be run_python_code).\n\n    Args:\n        text: The LLM response text containing MCP tool calls\n\n    Returns:\n        Text with corrected server_name and tool_name if needed\n    \"\"\"\n    if not isinstance(text, str):\n        return text\n\n    mapping = _tool_server_mapping\n    if not mapping:\n        return text\n\n    # Special case: tool_name=python or python_code → rename to run_python_code\n    # Only apply if system prompt defines run_python_code (not python)\n    if \"run_python_code\" in mapping:\n        for wrong_name in (\"python\", \"python_code\"):\n            tag = f\"<tool_name>{wrong_name}</tool_name>\"\n            if tag in text:\n                text = text.replace(tag, \"<tool_name>run_python_code</tool_name>\")\n\n    # Fix server_name for each target tool using the mapping from system prompt\n    for tool_name, correct_server in mapping.items():\n        tool_tag = f\"<tool_name>{tool_name}</tool_name>\"\n        if tool_tag not in text:\n            continue\n        correct_server_tag = f\"<server_name>{correct_server}</server_name>\"\n        if correct_server_tag in text:\n            continue\n        
text = re.sub(\n            r\"<server_name>[^<]+</server_name>(\\s*\" + re.escape(tool_tag) + r\")\",\n            correct_server_tag + r\"\\1\",\n            text,\n        )\n\n    return text\n\n\ndef filter_none_values(arguments: Union[Dict, Any]) -> Union[Dict, Any]:\n    \"\"\"\n    Filter out keys with None values from arguments dictionary.\n\n    Args:\n        arguments: A dictionary to filter, or any other value\n\n    Returns:\n        The filtered dictionary, or the original value if not a dict\n    \"\"\"\n    if not isinstance(arguments, dict):\n        return arguments\n    return {k: v for k, v in arguments.items() if v is not None}\n\n\ndef _fix_backslash_escapes(json_str: str) -> str:\n    \"\"\"\n    Fix common backslash escape issues in JSON strings.\n    This handles cases where backslashes in string values are not properly escaped.\n\n    Common issues:\n    - Unescaped backslashes before non-escape characters\n\n    Note: This is a conservative fix that preserves valid escape sequences\n    (\\\\, \\\", \\/, \\b, \\f, \\n, \\r, \\t) and only fixes clearly problematic cases.\n    \"\"\"\n    fixed_str = json_str\n\n    # Fix backslashes that are not part of valid escape sequences\n    # Valid JSON escape sequences: \\\\, \\\", \\/, \\b, \\f, \\n, \\r, \\t, \\uXXXX\n    # Pattern: backslash not followed by a valid escape character\n    # This regex matches \\ followed by anything except valid escape chars\n    # But we need to be careful not to match already-escaped backslashes (\\\\)\n\n    # Strategy: Find all backslashes, but skip those that are:\n    # 1. Already escaped (\\\\)\n    # 2. 
Part of valid escape sequences (\\\", \\/, \\b, \\f, \\n, \\r, \\t, \\u)\n\n    # More conservative approach: Only fix backslashes before uppercase letters\n    # (common in Windows paths) and other clearly problematic patterns\n    # This avoids breaking valid JSON escape sequences\n\n    # Fix backslashes before uppercase letters (Windows paths like C:\\Users)\n    fixed_str = re.sub(\n        r\"(?<!\\\\)\\\\([A-Z])\",  # Backslash before uppercase letter, not already escaped\n        r\"\\\\\\\\\\1\",\n        fixed_str,\n    )\n\n    # Fix backslashes before digits (common in paths like \\1, \\2)\n    fixed_str = re.sub(\n        r\"(?<!\\\\)\\\\([0-9])\",  # Backslash before digit, not already escaped\n        r\"\\\\\\\\\\1\",\n        fixed_str,\n    )\n\n    # Fix other unescaped backslashes that are not part of valid escape sequences\n    # This is more aggressive but should be safe after json_repair fails\n    # Valid escape chars: \\\\, \", /, b, f, n, r, t, u\n    # Use a capturing group to preserve the character after backslash\n    fixed_str = re.sub(\n        r'(?<!\\\\)\\\\([^\\\\\"/bfnrtu])',  # Backslash followed by invalid escape char\n        r\"\\\\\\\\\\1\",  # Escape it and preserve the character\n        fixed_str,\n    )\n\n    return fixed_str\n\n\ndef safe_json_loads(arguments_str: str) -> Dict[str, Any]:\n    \"\"\"\n    Safely parse a JSON string with multiple fallbacks.\n\n    Parsing strategy:\n    1. Try standard json.loads()\n    2. If it fails, try json_repair to fix common issues\n    3. 
If all attempts fail, return an error object\n\n    Args:\n        arguments_str: JSON string to parse\n\n    Returns:\n        Parsed dictionary, or error dict with 'error' and 'raw' keys\n    \"\"\"\n    # Step 1: Try standard JSON parsing\n    try:\n        return json.loads(arguments_str)\n    except json.JSONDecodeError:\n        pass\n\n    # Step 2: Try json_repair to fix common issues\n    try:\n        repaired = repair_json(arguments_str, ensure_ascii=False)\n        return json.loads(repaired)\n    except Exception:\n        logger.warning(f\"Unable to parse JSON: {arguments_str}\")\n\n    # Step 3: Give up and return error information\n    return {\n        \"error\": \"Failed to parse arguments\",\n        \"raw\": arguments_str,\n    }\n\n\ndef extract_failure_experience_summary(text: str) -> str:\n    \"\"\"\n    Extract failure experience summary from LLM response text.\n\n    The text may contain:\n    - <think>...</think> block (thinking content)\n    - Main content after </think> and before <use_mcp_tool>\n    - <use_mcp_tool>...</use_mcp_tool> block (tool call, ignored)\n\n    Examples:\n        \"<think>\\n{xxx}\\n</think>\\n\\n{content}\\n\\n<use_mcp_tool>...\"\n        \"<think>\\n{xxx}\\n</think>\\n\\n{content}\"\n        \"{content}\"  (no think block)\n\n    Returns:\n        - If content is empty after strip, return think_content\n        - If both think_content and content are non-empty, return content\n        - mcp_block is never used\n    \"\"\"\n    if not text:\n        return \"\"\n\n    think_content = \"\"\n    content = \"\"\n\n    # Extract think content\n    think_match = re.search(r\"<think>([\\s\\S]*?)</think>\", text)\n    if think_match:\n        think_content = think_match.group(1).strip()\n        # Get content after </think>\n        after_think = text[think_match.end() :]\n    else:\n        # No think block, entire text is potential content\n        after_think = text\n\n    # Remove <use_mcp_tool>...</use_mcp_tool> 
block from content\n    mcp_match = re.search(r\"<use_mcp_tool>[\\s\\S]*\", after_think)\n    if mcp_match:\n        content = after_think[: mcp_match.start()].strip()\n    else:\n        content = after_think.strip()\n\n    # Apply the rules:\n    # - If content is empty, use think_content\n    # - If both are non-empty, use content\n    if content:\n        return content\n    else:\n        return think_content\n\n\ndef extract_llm_response_text(llm_response: Union[str, Dict]) -> str:\n    \"\"\"\n    Extract text from LLM response, excluding <use_mcp_tool> tags.\n\n    Stops immediately when <use_mcp_tool> tag is encountered, returning\n    only the content before it.\n\n    Args:\n        llm_response: Either a string or a dict with 'content' key\n\n    Returns:\n        Extracted text content, stripped of leading and trailing whitespace\n    \"\"\"\n    # If it's a dictionary type, extract the content field\n    if isinstance(llm_response, dict):\n        content = llm_response.get(\"content\", \"\")\n    else:\n        # If it's a string type, use directly\n        content = str(llm_response)\n\n    # Find the position of <use_mcp_tool> tag\n    tool_start_pattern = r\"<use_mcp_tool>\"\n    match = re.search(tool_start_pattern, content)\n\n    if match:\n        # If <use_mcp_tool> tag is found, only return content before the tag\n        return content[: match.start()].strip()\n    else:\n        # If no tag is found, return the complete content\n        return content.strip()\n\n\ndef parse_llm_response_for_tool_calls(\n    llm_response_content_text: Union[str, Dict, List],\n) -> List[Dict[str, Any]]:\n    \"\"\"\n    Parse tool calls from LLM response content.\n\n    Supports multiple formats:\n    - OpenAI Response API format (dict with 'output' containing function_call items)\n    - OpenAI Completion API format (list of tool_call objects)\n    - MCP format (<use_mcp_tool> XML tags in text)\n\n    Args:\n        llm_response_content_text: Response content in any 
supported format\n\n    Returns:\n        List of tool call dicts with keys: server_name, tool_name, arguments, id\n    \"\"\"\n    # tool_calls or MCP response are handled differently\n    # for openai response api, the tool_calls are in the response text\n    if isinstance(llm_response_content_text, dict):\n        tool_calls = []\n        for item in llm_response_content_text.get(\"output\") or []:\n            if item.get(\"type\") == \"function_call\":\n                name = item.get(\"name\", \"\")\n                if \"-\" in name:\n                    server_name, tool_name = name.rsplit(\"-\", maxsplit=1)\n                else:\n                    server_name = \"unknown\"\n                    tool_name = name\n                arguments_str = item.get(\"arguments\")\n                arguments = safe_json_loads(arguments_str)\n                arguments = filter_none_values(arguments)\n                tool_calls.append(\n                    dict(\n                        server_name=server_name,\n                        tool_name=tool_name,\n                        arguments=arguments,\n                        id=item.get(\"call_id\"),\n                    )\n                )\n        return tool_calls\n\n    # for openai completion api, the tool_calls are in the response text\n    if isinstance(llm_response_content_text, list):\n        tool_calls = []\n        for tool_call in llm_response_content_text:\n            name = tool_call.function.name\n            if \"-\" in name:\n                server_name, tool_name = name.rsplit(\"-\", maxsplit=1)\n            else:\n                server_name = \"unknown\"\n                tool_name = name\n            arguments_str = tool_call.function.arguments\n\n            # Parse JSON string to dictionary\n            try:\n                # Try to handle possible newlines and escape characters\n                arguments = json.loads(arguments_str)\n            except json.JSONDecodeError:\n                
logger.info(\n                    f\"Warning: Unable to parse tool arguments JSON: {arguments_str}\"\n                )\n                # Try more lenient parsing or log error\n                try:\n                    # Try to replace some common error formats, such as Python dict strings\n                    arguments_str_fixed = (\n                        arguments_str.replace(\"'\", '\"')\n                        .replace(\"None\", \"null\")\n                        .replace(\"True\", \"true\")\n                        .replace(\"False\", \"false\")\n                    )\n                    arguments = json.loads(arguments_str_fixed)\n                    logger.info(\n                        \"Info: Successfully parsed arguments after attempting to fix.\"\n                    )\n                except json.JSONDecodeError:\n                    logger.info(\n                        f\"Error: Still unable to parse tool arguments JSON after fixing: {arguments_str}\"\n                    )\n                    arguments = {\n                        \"error\": \"Failed to parse arguments\",\n                        \"raw\": arguments_str,\n                    }\n\n            arguments = filter_none_values(arguments)\n            tool_calls.append(\n                dict(\n                    server_name=server_name,\n                    tool_name=tool_name,\n                    arguments=arguments,\n                    id=tool_call.id,\n                )\n            )\n        return tool_calls\n\n    # for other clients, such as qwen and anthropic, we use MCP instead of tool calls\n    tool_calls = []\n    # Find all <use_mcp_tool> tags\n    tool_call_patterns = re.findall(\n        r\"<use_mcp_tool>\\s*<server_name>(.*?)</server_name>\\s*<tool_name>(.*?)</tool_name>\\s*<arguments>\\s*([\\s\\S]*?)\\s*</arguments>\\s*</use_mcp_tool>\",\n        llm_response_content_text,\n        re.DOTALL,\n    )\n\n    for match in tool_call_patterns:\n        server_name = 
match[0].strip()\n        tool_name = match[1].strip()\n        arguments_str = match[2].strip()\n\n        # Parse JSON string to dictionary\n        arguments = safe_json_loads(arguments_str)\n        arguments = filter_none_values(arguments)\n\n        tool_calls.append(\n            {\n                \"server_name\": server_name,\n                \"tool_name\": tool_name,\n                \"arguments\": arguments,\n                \"id\": None,\n            }\n        )\n\n    return tool_calls\n"
  },
  {
    "path": "apps/miroflow-agent/src/utils/prompt_utils.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nPrompt templates and utilities for agent system prompts.\n\nThis module provides:\n- System prompt generation for MCP tool usage\n- Agent-specific prompt generation (main agent, browsing agent)\n- Summary prompt templates for final answer generation\n- Failure experience templates for retry mechanisms\n\"\"\"\n\n# ============================================================================\n# Format Error Messages\n# ============================================================================\n\nFORMAT_ERROR_MESSAGE = \"No \\\\boxed{} content found in the final answer.\"\n\n# ============================================================================\n# Failure Experience Templates (for format error retry)\n# ============================================================================\n\n# Header that appears once before all failure experiences\nFAILURE_EXPERIENCE_HEADER = \"\"\"\n\n=== Previous Attempts Analysis ===\nThe following summarizes what was tried before and why it didn't work. Use this to guide a NEW approach.\n\n\"\"\"\n\n# Template for each individual failure experience (used multiple times)\nFAILURE_EXPERIENCE_ITEM = \"\"\"[Attempt {attempt_number}]\n{failure_summary}\n\n\"\"\"\n\n# Footer that appears once after all failure experiences\nFAILURE_EXPERIENCE_FOOTER = \"\"\"=== End of Analysis ===\n\nBased on the above, you should try a different strategy this time.\n\"\"\"\n\nFAILURE_SUMMARY_PROMPT = \"\"\"The task was not completed successfully. Do NOT call any tools. 
Provide a summary:\n\nFailure type: [incomplete / blocked / misdirected / format_missed]\n  - incomplete: ran out of turns before finishing\n  - blocked: got stuck due to tool failure or missing information\n  - misdirected: went down the wrong path\n  - format_missed: found the answer but forgot to use \\\\boxed{}\nWhat happened: [describe the approach taken and why a final answer was not reached]\nUseful findings: [list any facts, intermediate results, or conclusions discovered that should be reused]\"\"\"\n\n# Assistant prefix for failure summary generation (guides model to follow structured format)\nFAILURE_SUMMARY_THINK_CONTENT = \"\"\"We need to write a structured post-mortem style summary **without calling any tools**, explaining why the task was not completed, using these required sections:\n\n* **Failure type**: pick one from **incomplete / blocked / misdirected / format_missed**\n* **What happened**: describe the approach taken and why it didn't reach a final answer\n* **Useful findings**: list any facts, intermediate results, or conclusions that can be reused\"\"\"\n\nFAILURE_SUMMARY_ASSISTANT_PREFIX = (\n    f\"<think>\\n{FAILURE_SUMMARY_THINK_CONTENT}\\n</think>\\n\\n\"\n)\n\n# ============================================================================\n# MCP Tags for Parsing\n# ============================================================================\n\nmcp_tags = [\n    \"<use_mcp_tool>\",\n    \"</use_mcp_tool>\",\n    \"<server_name>\",\n    \"</server_name>\",\n    \"<arguments>\",\n    \"</arguments>\",\n]\n\nrefusal_keywords = [\n    \"time constraint\",\n    \"I’m sorry, but I can’t\",\n    \"I'm sorry, I cannot solve\",\n]\n\n\ndef generate_mcp_system_prompt(date, mcp_servers):\n    \"\"\"\n    Generate the MCP (Model Context Protocol) system prompt for LLM.\n\n    Creates a structured prompt that instructs the LLM on how to use available\n    MCP tools. 
Includes tool definitions, XML formatting instructions, and\n    general task-solving guidelines.\n\n    Args:\n        date: Current date object for timestamp inclusion\n        mcp_servers: List of server definitions, each containing 'name' and 'tools'\n\n    Returns:\n        Complete system prompt string with tool definitions and usage instructions\n    \"\"\"\n    formatted_date = date.strftime(\"%Y-%m-%d\")\n\n    # Start building the template, now follows https://docs.anthropic.com/en/docs/build-with-claude/tool-use/overview#tool-use-system-prompt\n    template = f\"\"\"In this environment you have access to a set of tools you can use to answer the user's question. \n\nYou only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use. Today is: {formatted_date}\n\n# Tool-Use Formatting Instructions \n\nTool-use is formatted using XML-style tags. The tool-use is enclosed in <use_mcp_tool></use_mcp_tool> and each parameter is similarly enclosed within its own set of tags.\n\nThe Model Context Protocol (MCP) connects to servers that provide additional tools and resources to extend your capabilities. You can use the server's tools via the `use_mcp_tool`.\n\nDescription: \nRequest to use a tool provided by a MCP server. Each MCP server can provide multiple tools with different capabilities. 
Tools have defined input schemas that specify required and optional parameters.\n\nParameters:\n- server_name: (required) The name of the MCP server providing the tool\n- tool_name: (required) The name of the tool to execute\n- arguments: (required) A JSON object containing the tool's input parameters, following the tool's input schema, quotes within string must be properly escaped, ensure it's valid JSON\n\nUsage:\n<use_mcp_tool>\n<server_name>server name here</server_name>\n<tool_name>tool name here</tool_name>\n<arguments>\n{{\n\"param1\": \"value1\",\n\"param2\": \"value2 \\\\\"escaped string\\\\\"\"\n}}\n</arguments>\n</use_mcp_tool>\n\nImportant Notes:\n- Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags.\n- Always adhere to this format for the tool use to ensure proper parsing and execution.\n\nString and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. 
The output is not expected to be valid XML and is parsed with regular expressions.\nHere are the functions available in JSONSchema format:\n\n\"\"\"\n\n    # Add MCP servers section\n    if mcp_servers and len(mcp_servers) > 0:\n        for server in mcp_servers:\n            template += f\"\\n## Server name: {server['name']}\\n\"\n\n            if \"tools\" in server and len(server[\"tools\"]) > 0:\n                for tool in server[\"tools\"]:\n                    # Skip tools that failed to load (they only have 'error' key)\n                    if \"error\" in tool and \"name\" not in tool:\n                        continue\n                    template += f\"### Tool name: {tool['name']}\\n\"\n                    template += f\"Description: {tool['description']}\\n\"\n                    template += f\"Input JSON schema: {tool['schema']}\\n\"\n\n    # Add the full objective system prompt\n    template += \"\"\"\n# General Objective\n\nYou accomplish a given task iteratively, breaking it down into clear steps and working through them methodically.\n\n\"\"\"\n\n    return template\n\n\ndef generate_no_mcp_system_prompt(date):\n    \"\"\"\n    Generate a minimal system prompt without MCP tool definitions.\n\n    Used when no tools are available or when running in tool-less mode.\n\n    Args:\n        date: Current date object for timestamp inclusion\n\n    Returns:\n        Basic system prompt string without tool definitions\n    \"\"\"\n    formatted_date = date.strftime(\"%Y-%m-%d\")\n\n    # Start building the template, now follows https://docs.anthropic.com/en/docs/build-with-claude/tool-use/overview#tool-use-system-prompt\n    template = \"\"\"In this environment you have access to a set of tools you can use to answer the user's question. 
\"\"\"\n\n    template += f\" Today is: {formatted_date}\\n\"\n\n    template += \"\"\"\nImportant Notes:\n- Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags.\n- Always adhere to this format for the tool use to ensure proper parsing and execution.\n\nString and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. The output is not expected to be valid XML and is parsed with regular expressions.\n\"\"\"\n\n    # Add the full objective system prompt\n    template += \"\"\"\n# General Objective\n\nYou accomplish a given task iteratively, breaking it down into clear steps and working through them methodically.\n\n\"\"\"\n    return template\n\n\ndef generate_agent_specific_system_prompt(agent_type=\"\"):\n    \"\"\"\n    Generate agent-specific objective prompts based on agent type.\n\n    Different agent types have different objectives:\n    - main: Task-solving agent that uses tools to answer questions\n    - agent-browsing: Web search and browsing agent for information retrieval\n\n    Args:\n        agent_type: Type of agent (\"main\", \"agent-browsing\", or \"browsing-agent\")\n\n    Returns:\n        Agent-specific objective prompt string\n    \"\"\"\n    if agent_type == \"main\":\n        system_prompt = \"\"\"\\n\n# Agent Specific Objective\n\nYou are a task-solving agent that uses tools step-by-step to answer the user's question. Your goal is to provide complete, accurate and well-reasoned answers using additional tools.\n\n\"\"\"\n    elif agent_type == \"agent-browsing\" or agent_type == \"browsing-agent\":\n        system_prompt = \"\"\"# Agent Specific Objective\n\nYou are an agent that performs the task of searching and browsing the web for specific information and generating the desired answer. 
Your task is to retrieve reliable, factual, and verifiable information that fills in knowledge gaps.\nDo not infer, speculate, summarize broadly, or attempt to fill in missing parts yourself. Only return factual content.\n\"\"\"\n    else:\n        raise ValueError(f\"Unknown agent type: {agent_type}\")\n    return system_prompt.strip()\n\n\ndef generate_agent_summarize_prompt(task_description, agent_type=\"\"):\n    \"\"\"\n    Generate the final summarization prompt for an agent.\n\n    Creates prompts that instruct agents to summarize their work and provide\n    final answers. Different agent types have different summarization formats:\n    - main: Must wrap answer in \\\\boxed{} with strict formatting rules\n    - agent-browsing: Provides structured report of findings\n\n    Args:\n        task_description: The original task/question to reference in the summary\n        agent_type: Type of agent (\"main\" or \"agent-browsing\")\n\n    Returns:\n        Summarization prompt string with formatting instructions\n    \"\"\"\n    if agent_type == \"main\":\n        summarize_prompt = (\n            \"Summarize the above conversation, and output the FINAL ANSWER to the original question.\\n\\n\"\n            \"If a clear answer has already been provided earlier in the conversation, do not rethink or recalculate it — \"\n            \"simply extract that answer and reformat it to match the required format below.\\n\"\n            \"If a definitive answer could not be determined, make a well-informed educated guess based on the conversation.\\n\\n\"\n            \"The original question is repeated here for reference:\\n\\n\"\n            f'\"{task_description}\"\\n\\n'\n            \"Wrap your final answer in \\\\boxed{}.\\n\"\n            \"Your final answer should be:\\n\"\n            \"- a number, OR\\n\"\n            \"- as few words as possible, OR\\n\"\n            \"- a comma-separated list of numbers and/or strings.\\n\\n\"\n            \"ADDITIONALLY, your 
final answer MUST strictly follow any formatting instructions in the original question — \"\n            \"such as alphabetization, sequencing, units, rounding, decimal places, etc.\\n\"\n            \"If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and DO NOT INCLUDE UNITS such as $ or USD or percent signs unless specified otherwise.\\n\"\n            \"If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.\\n\"\n            \"If you are asked for a comma-separated list, apply the above rules depending on whether the elements are numbers or strings.\\n\"\n            \"Do NOT include any punctuation such as '.', '!', or '?' at the end of the answer.\\n\"\n            \"Do NOT include any invisible or non-printable characters in the answer output.\\n\\n\"\n            \"You must absolutely not perform any MCP tool call, tool invocation, search, scrape, code execution, or similar actions.\\n\"\n            \"You can only answer the original question based on the information already retrieved and your own internal knowledge.\\n\"\n            \"If you attempt to call any tool, it will be considered a mistake.\"\n        )\n    elif agent_type == \"agent-browsing\":\n        summarize_prompt = (\n            \"This is a direct instruction to you (the assistant), not the result of a tool call.\\n\\n\"\n            \"We are now ending this session, and your conversation history will be deleted. \"\n            \"You must NOT initiate any further tool use. This is your final opportunity to report \"\n            \"*all* of the information gathered during the session.\\n\\n\"\n            \"The original task is repeated here for reference:\\n\\n\"\n            f'\"{task_description}\"\\n\\n'\n            \"Summarize the above search and browsing history. 
Output the FINAL RESPONSE and detailed supporting information of the task given to you.\\n\\n\"\n            \"If you found any useful facts, data, quotes, or answers directly relevant to the original task, include them clearly and completely.\\n\"\n            \"If you reached a conclusion or answer, include it as part of the response.\\n\"\n            \"If the task could not be fully answered, do NOT make up any content. Instead, return all partially relevant findings, \"\n            \"Search results, quotes, and observations that might help a downstream agent solve the problem.\\n\"\n            \"If partial, conflicting, or inconclusive information was found, clearly indicate this in your response.\\n\\n\"\n            \"Your final response should be a clear, complete, and structured report.\\n\"\n            \"Organize the content into logical sections with appropriate headings.\\n\"\n            \"Do NOT include any tool call instructions, speculative filler, or vague summaries.\\n\"\n            \"Focus on factual, specific, and well-organized information.\"\n        )\n    else:\n        raise ValueError(f\"Unknown agent type: {agent_type}\")\n\n    return summarize_prompt.strip()\n"
  },
  {
    "path": "apps/miroflow-agent/src/utils/wrapper_utils.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"Wrapper utilities for handling responses and errors in a type-safe manner.\"\"\"\n\nfrom typing import Any, Dict, Optional\n\n\nclass ErrorBox:\n    \"\"\"\n    A wrapper class for error messages.\n\n    Use this to wrap error messages that should be distinguishable from normal responses.\n\n    Example:\n        >>> error = ErrorBox(\"Connection failed\")\n        >>> if ErrorBox.is_error_box(error):\n        ...     print(f\"Error: {error}\")\n    \"\"\"\n\n    def __init__(self, error_msg: str) -> None:\n        self.error_msg = error_msg\n\n    def __str__(self) -> str:\n        return self.error_msg\n\n    def __repr__(self) -> str:\n        return f\"ErrorBox({self.error_msg!r})\"\n\n    @staticmethod\n    def is_error_box(something: Any) -> bool:\n        \"\"\"Check if the given object is an ErrorBox instance.\"\"\"\n        return isinstance(something, ErrorBox)\n\n\nclass ResponseBox:\n    \"\"\"\n    A wrapper class for responses with optional extra information.\n\n    Use this to wrap responses that may include additional metadata.\n\n    Example:\n        >>> response = ResponseBox({\"data\": \"value\"}, {\"warning_msg\": \"Rate limited\"})\n        >>> if response.has_extra_info():\n        ...     
print(response.get_extra_info())\n    \"\"\"\n\n    def __init__(\n        self, response: Any, extra_info: Optional[Dict[str, Any]] = None\n    ) -> None:\n        self.response = response\n        self.extra_info = extra_info\n\n    def __str__(self) -> str:\n        return str(self.response)\n\n    def __repr__(self) -> str:\n        return f\"ResponseBox({self.response!r}, extra_info={self.extra_info!r})\"\n\n    @staticmethod\n    def is_response_box(something: Any) -> bool:\n        \"\"\"Check if the given object is a ResponseBox instance.\"\"\"\n        return isinstance(something, ResponseBox)\n\n    def has_extra_info(self) -> bool:\n        \"\"\"Check if this response has extra information attached.\"\"\"\n        return self.extra_info is not None\n\n    def get_extra_info(self) -> Optional[Dict[str, Any]]:\n        \"\"\"Get the extra information attached to this response.\"\"\"\n        return self.extra_info\n\n    def get_response(self) -> Any:\n        \"\"\"Get the wrapped response object.\"\"\"\n        return self.response\n"
  },
  {
    "path": "apps/visualize-trace/.python-version",
    "content": "3.11 "
  },
  {
    "path": "apps/visualize-trace/README.md",
    "content": "# Trace Analysis Web Demo\n\nAn interactive web interface for analyzing and visualizing trace JSON files.\n\n## Installation and Running\n\n### Method 1: Using Python (Recommended)\n\n```bash\npip install -r requirements.txt\npython run.py\n```\n\nThe startup script will automatically check and install dependencies, then start the web application. Once it is running, open `http://127.0.0.1:5000` in your browser (5000 is the default port; pass `-p`/`--port` to `run.py` to use a different one).\n\n### Method 2: Using uv\n\n```bash\nuv run run.py\n```\n\n## Usage\n\n1. **Start the application**: Once the server is running, visit `http://127.0.0.1:5000` in your browser\n\n1. **Load files**:\n\n   - Select the trace JSON file to analyze from the dropdown menu in the top navigation bar\n   - Click the \"Load\" button to load the file\n\n1. **View analysis results**:\n\n   - **Left panel**: Shows basic information, execution summary, and performance statistics\n   - **Right panel**: Displays the detailed execution flow\n   - **Bottom panel**: Shows span statistics and step-log statistics\n\n1. **Interactive operations**:\n\n   - Click on execution steps to expand/collapse detailed information\n   - Use the \"Expand All\"/\"Collapse All\" buttons to control all steps\n   - Click the \"View Details\" button to see the complete message content\n"
  },
  {
    "path": "apps/visualize-trace/app.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport os\n\nfrom flask import Flask, jsonify, render_template, request\nfrom trace_analyzer import TraceAnalyzer\n\napp = Flask(__name__)\n\n# Global variable to store analyzer instance\nanalyzer = None\n\n\n@app.route(\"/\")\ndef index():\n    \"\"\"Main page\"\"\"\n    return render_template(\"index.html\")\n\n\n@app.route(\"/api/list_files\", methods=[\"GET\"])\ndef list_files():\n    \"\"\"List available JSON files\"\"\"\n    try:\n        directory = request.args.get(\"directory\", \"\")\n\n        if not directory:\n            # Default behavior: check parent directory\n            directory = os.path.abspath(\"..\")\n\n        # Expand path (handle ~ and other symbols)\n        directory = os.path.expanduser(directory)\n\n        # Convert to absolute path\n        directory = os.path.abspath(directory)\n\n        if not os.path.exists(directory):\n            return jsonify({\"error\": f\"Directory does not exist: {directory}\"}), 404\n\n        if not os.path.isdir(directory):\n            return jsonify({\"error\": f\"Path is not a directory: {directory}\"}), 400\n\n        try:\n            json_files = []\n            for file in os.listdir(directory):\n                if file.endswith(\".json\"):\n                    file_path = os.path.join(directory, file)\n                    try:\n                        # Get file size and modification time\n                        stat = os.stat(file_path)\n                        json_files.append(\n                            {\n                                \"name\": file,\n                                \"path\": file_path,\n                                \"size\": stat.st_size,\n                                \"modified\": stat.st_mtime,\n                            }\n                        )\n                    except Exception:\n                        json_files.append(\n                
            {\"name\": file, \"path\": file_path, \"size\": 0, \"modified\": 0}\n                        )\n\n            # Sort by filename\n            json_files.sort(key=lambda x: x[\"name\"])\n\n            return jsonify(\n                {\n                    \"files\": json_files,\n                    \"directory\": directory,\n                    \"message\": f'Found {len(json_files)} JSON files in directory \"{directory}\"',\n                }\n            )\n        except PermissionError:\n            return jsonify(\n                {\"error\": f\"No permission to access directory: {directory}\"}\n            ), 403\n        except Exception as e:\n            return jsonify({\"error\": f\"Failed to read directory: {str(e)}\"}), 500\n\n    except Exception as e:\n        return jsonify({\"error\": str(e)}), 500\n\n\n@app.route(\"/api/load_trace\", methods=[\"POST\"])\ndef load_trace():\n    \"\"\"Load trace file\"\"\"\n    global analyzer\n\n    data = request.get_json()\n    file_path = data.get(\"file_path\")\n\n    if not file_path:\n        return jsonify({\"error\": \"Please provide file path\"}), 400\n\n    # If it's a relative path, convert to absolute path\n    if not os.path.isabs(file_path):\n        file_path = os.path.abspath(file_path)\n\n    if not os.path.exists(file_path):\n        return jsonify({\"error\": f\"File does not exist: {file_path}\"}), 404\n\n    try:\n        analyzer = TraceAnalyzer(file_path)\n        return jsonify(\n            {\n                \"message\": \"File loaded successfully\",\n                \"file_path\": file_path,\n                \"file_name\": os.path.basename(file_path),\n            }\n        )\n    except Exception as e:\n        return jsonify({\"error\": f\"Failed to load file: {str(e)}\"}), 500\n\n\n@app.route(\"/api/basic_info\")\ndef get_basic_info():\n    \"\"\"Get basic information\"\"\"\n    if not analyzer:\n        return jsonify({\"error\": \"Please load trace file first\"}), 400\n\n  
  try:\n        return jsonify(analyzer.get_basic_info())\n    except Exception as e:\n        return jsonify({\"error\": str(e)}), 500\n\n\n@app.route(\"/api/performance_summary\")\ndef get_performance_summary():\n    \"\"\"Get performance summary\"\"\"\n    if not analyzer:\n        return jsonify({\"error\": \"Please load trace file first\"}), 400\n\n    try:\n        return jsonify(analyzer.get_performance_summary())\n    except Exception as e:\n        return jsonify({\"error\": str(e)}), 500\n\n\n@app.route(\"/api/execution_flow\")\ndef get_execution_flow():\n    \"\"\"Get execution flow\"\"\"\n    if not analyzer:\n        return jsonify({\"error\": \"Please load trace file first\"}), 400\n\n    try:\n        return jsonify(analyzer.analyze_conversation_flow())\n    except Exception as e:\n        return jsonify({\"error\": str(e)}), 500\n\n\n@app.route(\"/api/execution_summary\")\ndef get_execution_summary():\n    \"\"\"Get execution summary\"\"\"\n    if not analyzer:\n        return jsonify({\"error\": \"Please load trace file first\"}), 400\n\n    try:\n        return jsonify(analyzer.get_execution_summary())\n    except Exception as e:\n        return jsonify({\"error\": str(e)}), 500\n\n\n@app.route(\"/api/spans_summary\")\ndef get_spans_summary():\n    \"\"\"Get spans summary\"\"\"\n    if not analyzer:\n        return jsonify({\"error\": \"Please load trace file first\"}), 400\n\n    try:\n        return jsonify(analyzer.get_spans_summary())\n    except Exception as e:\n        return jsonify({\"error\": str(e)}), 500\n\n\n@app.route(\"/api/step_logs_summary\")\ndef get_step_logs_summary():\n    \"\"\"Get step logs summary\"\"\"\n    if not analyzer:\n        return jsonify({\"error\": \"Please load trace file first\"}), 400\n\n    try:\n        return jsonify(analyzer.get_step_logs_summary())\n    except Exception as e:\n        return jsonify({\"error\": str(e)}), 500\n\n\n@app.route(\"/api/debug/raw_messages\")\ndef get_raw_messages():\n    
\"\"\"Get raw message data for debugging\"\"\"\n    if not analyzer:\n        return jsonify({\"error\": \"Please load trace file first\"}), 400\n\n    try:\n        main_history = analyzer.get_main_agent_history()\n        browser_sessions = analyzer.get_browser_agent_sessions()\n\n        # Get message structure overview\n        main_messages = analyzer.get_main_agent_messages()\n        message_structure = []\n\n        for i, message in enumerate(main_messages):\n            message_structure.append(\n                {\n                    \"index\": i,\n                    \"role\": message.get(\"role\"),\n                    \"content_length\": len(str(message.get(\"content\", \"\"))),\n                    \"has_timestamp\": \"timestamp\" in message,\n                    \"content_preview\": str(message.get(\"content\", \"\"))[:100] + \"...\"\n                    if len(str(message.get(\"content\", \"\"))) > 100\n                    else str(message.get(\"content\", \"\")),\n                }\n            )\n\n        return jsonify(\n            {\n                \"main_agent_history_structure\": {\n                    \"total_messages\": len(main_messages),\n                    \"messages\": message_structure,\n                },\n                \"browser_sessions\": list(browser_sessions.keys()),\n                \"raw_main_history\": main_history,\n                \"raw_browser_sessions\": {\n                    k: v for k, v in list(browser_sessions.items())[:2]\n                },  # Only show first two sessions\n            }\n        )\n    except Exception as e:\n        return jsonify({\"error\": str(e)}), 500\n\n\nif __name__ == \"__main__\":\n    app.run(debug=True, host=\"0.0.0.0\", port=5000)\n"
  },
  {
    "path": "apps/visualize-trace/pyproject.toml",
    "content": "[project]\nname = \"trace-dashboard\"\nversion = \"1.0.0\"\ndescription = \"A web dashboard for analyzing trace JSON files\"\nrequires-python = \">=3.8\"\ndependencies = [\n    \"flask>=2.3.3\",\n    \"werkzeug>=2.3.7\",\n]\n\n[tool.uv]\ndev-dependencies = [] "
  },
  {
    "path": "apps/visualize-trace/requirements.txt",
    "content": "flask==2.3.3\nwerkzeug==2.3.7 "
  },
  {
    "path": "apps/visualize-trace/run.py",
    "content": "#!/usr/bin/env python3\n# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport os\nimport subprocess\nimport sys\n\n\ndef check_dependencies():\n    \"\"\"Check if dependencies are installed\"\"\"\n    try:\n        import importlib.util\n\n        if importlib.util.find_spec(\"flask\") is not None:\n            print(\"✓ Flask is installed\")\n            return True\n        else:\n            raise ImportError(\"Flask not found\")\n    except ImportError:\n        print(\"✗ Flask is not installed\")\n        print(\"Please use the following commands to install dependencies:\")\n        print(\"  uv sync\")\n        print(\"or:\")\n        print(\"  uv pip install -r requirements.txt\")\n        return False\n\n\ndef install_dependencies():\n    \"\"\"Install dependencies (recommended to use uv)\"\"\"\n    print(\"Installing dependencies...\")\n    try:\n        # Try using uv first\n        try:\n            subprocess.check_call([\"uv\", \"sync\"])\n            print(\"✓ Dependencies installed successfully using uv\")\n            return True\n        except (subprocess.CalledProcessError, FileNotFoundError):\n            # Fallback to pip\n            subprocess.check_call(\n                [sys.executable, \"-m\", \"pip\", \"install\", \"-r\", \"requirements.txt\"]\n            )\n            print(\"✓ Dependencies installed successfully using pip\")\n            return True\n    except subprocess.CalledProcessError:\n        print(\"✗ Failed to install dependencies\")\n        print(\"Please manually run: uv sync or pip install -r requirements.txt\")\n        return False\n\n\ndef main():\n    \"\"\"Main function\"\"\"\n    import argparse\n\n    # Parse command line arguments\n    parser = argparse.ArgumentParser(description=\"Trace Analysis Web Demo\")\n    parser.add_argument(\n        \"-p\",\n        \"--port\",\n        type=int,\n        default=5000,\n        help=\"Specify port number 
(default: 5000)\",\n    )\n    args = parser.parse_args()\n\n    print(\"=\" * 50)\n    print(\"Trace Analysis Web Demo\")\n    print(\"=\" * 50)\n\n    # Check dependencies\n    if not check_dependencies():\n        print(\"\\nInstalling dependencies...\")\n        if not install_dependencies():\n            print(\n                \"Please manually install dependencies: pip install -r requirements.txt\"\n            )\n            return\n\n    # Check JSON files\n    parent_dir = os.path.dirname(os.path.abspath(__file__))\n    json_files = [\n        f for f in os.listdir(os.path.join(parent_dir, \"..\")) if f.endswith(\".json\")\n    ]\n\n    if not json_files:\n        print(\"\\nWarning: No JSON files found in parent directory\")\n        print(\"Please ensure trace JSON files are in the trace_analyze/ directory\")\n    else:\n        print(f\"\\nFound {len(json_files)} JSON files:\")\n        for file in json_files[:5]:  # Only show first 5\n            print(f\"  - {file}\")\n        if len(json_files) > 5:\n            print(f\"  ... and {len(json_files) - 5} other files\")\n\n    # Start application\n    print(\"\\nStarting web application...\")\n    print(f\"Application will run at http://localhost:{args.port}\")\n    print(\"Press Ctrl+C to stop the application\")\n    print(\"=\" * 50)\n\n    try:\n        from app import app\n\n        app.run(debug=True, host=\"0.0.0.0\", port=args.port)\n    except KeyboardInterrupt:\n        print(\"\\nApplication stopped\")\n    except Exception as e:\n        print(f\"\\nFailed to start application: {e}\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "apps/visualize-trace/static/css/style.css",
    "content": "/* Global styles */\nbody {\n    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;\n    background-color: #f8f9fa;\n}\n\n/* Set special font for non-tool call content */\n.rendered-content, .preview-text, .browser-agent-content {\n    font-family: 'Courier New', 'Monaco', 'Menlo', monospace;\n    font-size: 14px;\n    line-height: 1.6;\n}\n\n/* Keep MCP tool calls using original font */\n.mcp-tool-call {\n    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;\n}\n\n/* Ensure MCP tool call content uses original font */\n.mcp-tool-call * {\n    font-family: inherit;\n}\n\n/* Navigation button styles */\n.nav-btn {\n    transition: all 0.3s ease;\n}\n\n.nav-btn:hover:not(:disabled) {\n    background-color: rgba(255, 255, 255, 0.2);\n}\n\n.nav-btn:disabled {\n    opacity: 0.5;\n    cursor: not-allowed;\n}\n\n/* File selection input group styles */\n.file-navigation {\n    display: flex;\n    align-items: center;\n    gap: 0;\n}\n\n.file-navigation .form-select {\n    border-radius: 0;\n    border-left: 0;\n    border-right: 0;\n}\n\n.file-navigation .btn:first-child {\n    border-top-right-radius: 0;\n    border-bottom-right-radius: 0;\n}\n\n.file-navigation .btn:last-child {\n    border-top-left-radius: 0;\n    border-bottom-left-radius: 0;\n}\n\n/* Loading overlay */\n.loading-overlay {\n    position: fixed;\n    top: 0;\n    left: 0;\n    width: 100%;\n    height: 100%;\n    background-color: rgba(0, 0, 0, 0.5);\n    display: flex;\n    justify-content: center;\n    align-items: center;\n    z-index: 9999;\n}\n\n/* Card styles */\n.card {\n    box-shadow: 0 2px 4px rgba(0,0,0,0.1);\n    border: none;\n    border-radius: 8px;\n}\n\n.card-header {\n    background-color: #f8f9fa;\n    border-bottom: 1px solid #dee2e6;\n    font-weight: 500;\n}\n\n/* Top summary panel styles */\n.summary-panel {\n    background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);\n    border: none;\n    box-shadow: 0 2px 10px 
rgba(0,0,0,0.1);\n}\n\n.summary-panel h6 {\n    color: #495057;\n    font-weight: 600;\n    margin-bottom: 15px;\n    padding-bottom: 8px;\n    border-bottom: 2px solid #dee2e6;\n}\n\n.summary-panel .answer-box {\n    background: #fff;\n    border: 1px solid #dee2e6;\n    border-radius: 6px;\n    padding: 8px 12px;\n    margin-bottom: 10px;\n    display: flex;\n    align-items: center;\n    gap: 10px;\n}\n\n.summary-panel .answer-label {\n    font-weight: 600;\n    color: #6c757d;\n    font-size: 12px;\n    margin-bottom: 0;\n    white-space: nowrap;\n}\n\n.summary-panel .answer-content {\n    font-size: 14px;\n    line-height: 1.4;\n    flex: 1;\n}\n\n.summary-panel .final-answer {\n    border-left: 4px solid #007bff;\n}\n\n.summary-panel .ground-truth {\n    border-left: 4px solid #28a745;\n}\n\n.summary-panel .stat-item {\n    background: #fff;\n    border: 1px solid #dee2e6;\n    border-radius: 6px;\n    padding: 8px 12px;\n    margin-bottom: 8px;\n    display: flex;\n    justify-content: space-between;\n    align-items: center;\n}\n\n.summary-panel .stat-label {\n    font-size: 12px;\n    color: #6c757d;\n    font-weight: 500;\n}\n\n.summary-panel .stat-value {\n    font-size: 14px;\n    font-weight: 600;\n    color: #495057;\n}\n\n/* Navigation panel styles */\n.navigation-panel {\n    position: sticky;\n    top: 20px;\n    max-height: calc(100vh - 40px);\n    overflow-y: auto;\n}\n\n.navigation-list {\n    max-height: calc(100vh - 120px);\n    overflow-y: auto;\n}\n\n.nav-item {\n    padding: 8px 12px;\n    border-bottom: 1px solid #f1f1f1;\n    cursor: pointer;\n    transition: all 0.2s ease;\n    font-size: 13px;\n}\n\n.nav-item:hover {\n    /* Remove background color change, can add other subtle visual feedback */\n}\n\n.nav-item.active {\n    background-color: #007bff;\n    color: white;\n}\n\n.nav-item .step-number {\n    font-weight: bold;\n    color: #6c757d;\n}\n\n.nav-item.active .step-number {\n    color: white;\n}\n\n.nav-item .step-role {\n    
font-size: 11px;\n    padding: 2px 6px;\n    border-radius: 3px;\n    margin-left: 8px;\n}\n\n.nav-item .step-role.user {\n    background-color: #28a745;\n    color: white;\n}\n\n.nav-item .step-role.assistant {\n    background-color: #007bff;\n    color: white;\n}\n\n.nav-item .step-role.tool {\n    background-color: #fd7e14;\n    color: white;\n}\n\n.nav-item .step-role.system {\n    background-color: #6c757d;\n    color: white;\n}\n\n.nav-item .step-summary {\n    color: #6c757d;\n    font-size: 12px;\n    margin-top: 4px;\n    display: -webkit-box;\n    -webkit-line-clamp: 2;\n    -webkit-box-orient: vertical;\n    overflow: hidden;\n}\n\n.nav-item.active .step-summary {\n    color: #e9ecef;\n}\n\n/* Browser sub-step navigation styles */\n.nav-item.browser-sub-step {\n    padding-left: 24px;\n    font-size: 12px;\n    border-left: 2px solid #dee2e6;\n    margin-left: 8px;\n}\n\n.nav-item.browser-sub-step .step-number {\n    font-size: 11px;\n    color: #6c757d;\n}\n\n.nav-item.browser-sub-step .step-role {\n    font-size: 10px;\n    padding: 1px 4px;\n}\n\n.nav-item.browser-sub-step .step-summary {\n    font-size: 11px;\n    -webkit-line-clamp: 1;\n}\n\n.nav-item.browser-sub-step.active {\n    border-left-color: #007bff;\n}\n\n.nav-item .browser-toggle {\n    margin-left: auto;\n    cursor: pointer;\n    font-size: 12px;\n    color: #6c757d;\n    padding: 2px 4px;\n    border-radius: 2px;\n    transition: all 0.2s ease;\n}\n\n.nav-item .browser-toggle:hover {\n    background-color: #e9ecef;\n}\n\n.nav-item.active .browser-toggle {\n    color: #fff;\n}\n\n.nav-item.active .browser-toggle:hover {\n    background-color: rgba(255, 255, 255, 0.2);\n}\n\n.browser-sub-steps {\n    display: none;\n}\n\n.browser-sub-steps.expanded {\n    display: block;\n}\n\n/* Execution flow styles */\n.execution-steps-container {\n    display: flex;\n    flex-direction: column;\n    gap: 16px;\n}\n\n.execution-step {\n    border: 1px solid #dee2e6;\n    border-radius: 6px;\n    
margin-bottom: 0;  /* Remove bottom margin, use gap instead */\n    background-color: white;\n    transition: all 0.3s ease;\n    position: relative;\n}\n\n.execution-step:hover {\n    box-shadow: 0 4px 8px rgba(0,0,0,0.1);\n}\n\n/* Ensure main agent steps have clear visual separation */\n.execution-step[data-agent*=\"main_agent\"] {\n    border-left: 4px solid #007bff;\n    z-index: 2;\n}\n\n/* Browser session should be indented inside main agent steps */\n.browser-session {\n    position: relative;\n    margin-left: 20px;\n    margin-top: 12px;\n}\n\n.step-header {\n    padding: 12px 16px;\n    cursor: pointer;\n    position: relative;\n    border-radius: 6px 6px 0 0;\n}\n\n.step-header:hover {\n    background-color: #f8f9fa;\n}\n\n.step-header.user-message {\n    background-color: #e3f2fd;\n    border-left: 4px solid #2196f3;\n}\n\n.step-header.assistant-message {\n    background-color: #f3e5f5;\n    border-left: 4px solid #9c27b0;\n}\n\n.step-header.user-message.browser-agent {\n    background-color: #e8f5e8;\n    border-left: 4px solid #4caf50;\n}\n\n.step-header.assistant-message.browser-agent {\n    background-color: #fff3e0;\n    border-left: 4px solid #ff9800;\n}\n\n.step-header.tool-message {\n    background-color: #fff3e0;\n    border-left: 4px solid #fd7e14;\n}\n\n.step-header.system-message {\n    background-color: #f8f9fa;\n    border-left: 4px solid #6c757d;\n}\n\n.step-content {\n    padding: 16px;\n    border-top: 1px solid #dee2e6;\n    background-color: #f8f9fa;\n}\n\n.step-toggle {\n    position: absolute;\n    right: 16px;\n    top: 50%;\n    transform: translateY(-50%);\n    font-size: 14px;\n    color: #6c757d;\n}\n\n/* Tool call styles */\n.tool-call {\n    background-color: #fff3cd;\n    border: 1px solid #ffeaa7;\n    border-radius: 4px;\n    padding: 10px;\n    margin: 8px 0;\n}\n\n.tool-call-header {\n    font-weight: 500;\n    color: #856404;\n    margin-bottom: 5px;\n}\n\n.tool-call.browser-agent {\n    background-color: #d4edda;\n    
border-color: #c3e6cb;\n}\n\n.tool-call.browser-agent .tool-call-header {\n    color: #155724;\n}\n\n/* Browser session styles */\n.browser-session {\n    background-color: #f8f9fa;\n    border: 1px solid #dee2e6;\n    border-radius: 4px;\n    margin-top: 10px;\n    padding: 12px;\n}\n\n.browser-session-header {\n    font-weight: 500;\n    color: #495057;\n    margin-bottom: 10px;\n    padding-bottom: 8px;\n    border-bottom: 1px solid #dee2e6;\n}\n\n.browser-step {\n    background-color: white;\n    border: 1px solid #e9ecef;\n    border-radius: 4px;\n    margin-bottom: 8px;\n    padding: 8px 12px;\n}\n\n.browser-step.user {\n    background-color: #f0f8ff;\n}\n\n.browser-step.assistant {\n    background-color: #fdf6e3;\n}\n\n.browser-step.tool {\n    background-color: #fff3e0;\n    border-left: 3px solid #fd7e14;\n}\n\n.browser-step.system {\n    background-color: #f8f9fa;\n    border-left: 3px solid #6c757d;\n}\n\n/* Statistics styles */\n.stat-item {\n    display: flex;\n    justify-content: space-between;\n    align-items: center;\n    padding: 8px 0;\n    border-bottom: 1px solid #f0f0f0;\n}\n\n.stat-item:last-child {\n    border-bottom: none;\n}\n\n.stat-label {\n    font-weight: 500;\n    color: #495057;\n}\n\n.stat-value {\n    font-weight: 600;\n    color: #007bff;\n}\n\n/* Badge styles */\n.badge-role {\n    font-size: 11px;\n    padding: 4px 8px;\n    border-radius: 12px;\n    font-weight: 500;\n    text-transform: uppercase;\n}\n\n.badge-user {\n    background-color: #007bff;\n    color: white;\n}\n\n.badge-assistant {\n    background-color: #6f42c1;\n    color: white;\n}\n\n.badge-tool {\n    background-color: #fd7e14;\n    color: white;\n}\n\n.badge-system {\n    background-color: #6c757d;\n    color: white;\n}\n\n.badge-browser {\n    background-color: #28a745;\n    color: white;\n}\n\n/* Timestamp styles */\n.timestamp {\n    font-size: 11px;\n    color: #6c757d;\n    font-family: monospace;\n}\n\n/* Content preview styles */\n.content-preview {\n   
 background-color: white;\n    border-radius: 4px;\n    padding: 8px;\n    margin: 8px 0;\n}\n\n.content-preview .preview-text {\n    line-height: 1.5;\n}\n\n.expand-preview-btn {\n    color: #007bff !important;\n    font-size: 12px;\n    text-decoration: none;\n}\n\n.expand-preview-btn:hover {\n    text-decoration: underline !important;\n}\n\n/* Step content area style adjustments */\n.step-content {\n    padding: 16px;\n    border-top: 1px solid #dee2e6;\n    background-color: #f8f9fa;\n}\n\n.step-content h6 {\n    color: #495057;\n    font-weight: 600;\n    margin-bottom: 8px;\n    font-size: 14px;\n}\n\n/* Button styles */\n.btn-sm {\n    font-size: 12px;\n    padding: 4px 12px;\n}\n\n/* Responsive styles */\n@media (max-width: 768px) {\n    .container-fluid {\n        padding: 0 10px;\n    }\n    \n    .col-md-3 {\n        order: 2;\n    }\n    \n    .col-md-9 {\n        order: 1;\n    }\n    \n    .step-header {\n        padding: 10px 12px;\n    }\n    \n    .step-content {\n        padding: 12px;\n    }\n}\n\n/* Animation effects */\n.collapse {\n    transition: height 0.3s ease;\n}\n\n.fade-in {\n    animation: fadeIn 0.3s ease-in;\n}\n\n@keyframes fadeIn {\n    from {\n        opacity: 0;\n        transform: translateY(10px);\n    }\n    to {\n        opacity: 1;\n        transform: translateY(0);\n    }\n}\n\n/* Tooltip styles */\n.tooltip {\n    font-size: 12px;\n}\n\n/* Code styles */\n.code-block {\n    background-color: #f8f9fa;\n    border: 1px solid #e9ecef;\n    border-radius: 6px;\n    padding: 12px;\n    font-family: 'Courier New', monospace;\n    font-size: 13px;\n    white-space: pre-wrap;\n    margin: 8px 0;\n    overflow-x: auto;\n    line-height: 1.4;\n}\n\n.code-block pre {\n    margin: 0;\n    padding: 0;\n    background: none;\n    border: none;\n    font-family: inherit;\n    font-size: inherit;\n    white-space: pre-wrap;\n}\n\n.code-block code {\n    background: none;\n    border: none;\n    padding: 0;\n    font-family: inherit;\n    
font-size: inherit;\n    color: inherit;\n}\n\n/* Error styles */\n.error-message {\n    color: #dc3545;\n    font-size: 14px;\n    margin-top: 8px;\n}\n\n.success-message {\n    color: #28a745;\n    font-size: 14px;\n    margin-top: 8px;\n}\n\n/* Scrollbar styles */\n::-webkit-scrollbar {\n    width: 8px;\n}\n\n::-webkit-scrollbar-track {\n    background: #f1f1f1;\n}\n\n::-webkit-scrollbar-thumb {\n    background: #c1c1c1;\n    border-radius: 4px;\n}\n\n::-webkit-scrollbar-thumb:hover {\n    background: #a8a8a8;\n}\n\n/* MCP tool call styles */\n.mcp-tool-call {\n    background-color: #ffffff;\n    border: 2px solid #007bff;\n    border-radius: 8px;\n    padding: 16px;\n    margin: 16px 0;\n    box-shadow: 0 2px 8px rgba(0,123,255,0.1);\n    overflow: hidden;\n}\n\n.mcp-tool-call.browser-agent {\n    border-color: #28a745;\n    background-color: #ffffff;\n    box-shadow: 0 2px 8px rgba(40,167,69,0.1);\n}\n\n.mcp-tool-header {\n    display: flex;\n    align-items: center;\n    font-weight: 600;\n    color: #007bff;\n    margin-bottom: 12px;\n    font-size: 14px;\n    padding-bottom: 8px;\n    border-bottom: 1px solid #e9ecef;\n}\n\n.mcp-tool-call.browser-agent .mcp-tool-header {\n    color: #28a745;\n}\n\n.mcp-tool-header i {\n    margin-right: 8px;\n    font-size: 16px;\n}\n\n.mcp-tool-name {\n    font-family: 'Courier New', monospace;\n    background-color: rgba(0,123,255,0.1);\n    padding: 4px 8px;\n    border-radius: 4px;\n    margin-left: 4px;\n    font-size: 13px;\n}\n\n.mcp-tool-call.browser-agent .mcp-tool-name {\n    background-color: rgba(40,167,69,0.1);\n}\n\n.mcp-tool-content {\n    margin-top: 8px;\n}\n\n.mcp-xml-structure {\n    font-family: 'Courier New', monospace;\n    background-color: #f8f9fa;\n    border: 1px solid #e9ecef;\n    border-radius: 4px;\n    padding: 16px;\n    line-height: 1.6;\n    font-size: 13px;\n}\n\n.xml-tag {\n    color: #0066cc;\n    font-weight: 500;\n    margin: 2px 0;\n}\n\n.xml-content {\n    margin-left: 20px;\n    
margin: 8px 0 8px 20px;\n}\n\n.xml-arguments {\n    background-color: #ffffff;\n    border: 1px solid #dee2e6;\n    border-radius: 4px;\n    padding: 12px;\n    margin: 8px 0 8px 20px;\n    white-space: pre-wrap;\n    color: #2c3e50;\n    font-family: 'Courier New', monospace;\n    font-size: 12px;\n    line-height: 1.5;\n    overflow-x: auto;\n}\n\n.mcp-tool-args {\n    margin-top: 8px;\n}\n\n.mcp-args-label {\n    font-weight: 500;\n    color: #495057;\n    margin-bottom: 6px;\n    font-size: 13px;\n}\n\n/* Format badge styles */\n.badge-format {\n    font-size: 10px;\n    padding: 2px 6px;\n    border-radius: 3px;\n    font-weight: normal;\n}\n\n.badge-format {\n    background-color: #6c757d;\n    color: white;\n}\n\n/* Format badge default styles, can be extended as needed */\n\n/* Tool ID styles */\n.tool-id {\n    margin-top: 8px;\n    padding-top: 8px;\n    border-top: 1px solid #e9ecef;\n}\n\n/* Rendered content styles - white background */\n.rendered-content {\n    background-color: white;\n    padding: 12px;\n    border-radius: 4px;\n    border: 1px solid #e9ecef;\n    margin: 8px 0;\n    line-height: 1.6;\n}\n\n.rendered-content h1 {\n    color: #2c3e50;\n    border-bottom: 2px solid #3498db;\n    padding-bottom: 8px;\n    margin-bottom: 16px;\n    font-size: 1.5em;\n}\n\n.rendered-content h2 {\n    color: #34495e;\n    border-bottom: 1px solid #bdc3c7;\n    padding-bottom: 6px;\n    margin-bottom: 12px;\n    font-size: 1.3em;\n}\n\n.rendered-content h3 {\n    color: #2c3e50;\n    margin-bottom: 10px;\n    font-size: 1.1em;\n}\n\n.rendered-content strong {\n    color: #2c3e50;\n    font-weight: 600;\n}\n\n.rendered-content em {\n    color: #7f8c8d;\n    font-style: italic;\n}\n\n.rendered-content ul, .rendered-content ol {\n    margin: 10px 0;\n    padding-left: 20px;\n}\n\n.rendered-content li {\n    margin: 4px 0;\n}\n\n.rendered-content a {\n    color: #3498db;\n    text-decoration: none;\n}\n\n.rendered-content a:hover {\n    text-decoration: 
underline;\n}\n\n.rendered-content .inline-code {\n    background-color: #f8f9fa;\n    color: #e83e8c;\n    padding: 2px 4px;\n    border-radius: 3px;\n    font-family: 'Courier New', monospace;\n    font-size: 0.9em;\n}\n\n.rendered-content .code-block {\n    background-color: #f8f9fa;\n    border: 1px solid #e9ecef;\n    border-radius: 4px;\n    margin: 8px 0;\n    overflow-x: auto;\n}\n\n.rendered-content .code-block pre {\n    margin: 0;\n    padding: 12px;\n    background: none;\n    border: none;\n    font-family: 'Courier New', monospace;\n    font-size: 0.9em;\n    line-height: 1.4;\n    color: #2c3e50;\n}\n\n.rendered-content .code-block code {\n    background: none;\n    padding: 0;\n    color: inherit;\n    font-family: inherit;\n}\n\n/* Improve browser agent content styles */\n.browser-agent-content {\n    background-color: #f8fff8;\n    border: 1px solid #d4edda;\n    border-radius: 4px;\n    padding: 12px;\n    margin: 8px 0;\n}\n\n/* Improve content display in modal */\n.modal-body .rendered-content {\n    max-height: 400px;\n    overflow-y: auto;\n} "
  },
  {
    "path": "apps/visualize-trace/static/js/script.js",
    "content": "// Global variables\nlet currentFlowData = null;\nlet currentBasicInfo = null;\nlet currentFileList = [];\nlet currentFileIndex = -1;\n\n// DOM elements\nconst elements = {\n    directoryInput: document.getElementById('directoryInput'),\n    browseDirectoryBtn: document.getElementById('browseDirectoryBtn'),\n    fileSelect: document.getElementById('fileSelect'),\n    prevFileBtn: document.getElementById('prevFileBtn'),\n    nextFileBtn: document.getElementById('nextFileBtn'),\n    loadBtn: document.getElementById('loadBtn'),\n    refreshBtn: document.getElementById('refreshBtn'),\n    expandAllBtn: document.getElementById('expandAllBtn'),\n    collapseAllBtn: document.getElementById('collapseAllBtn'),\n    basicInfo: document.getElementById('basicInfo'),\n    executionSummary: document.getElementById('executionSummary'),\n    performanceSummary: document.getElementById('performanceSummary'),\n    executionFlow: document.getElementById('executionFlow'),\n    spansStats: document.getElementById('spansStats'),\n    stepLogsStats: document.getElementById('stepLogsStats'),\n    loadingOverlay: document.getElementById('loadingOverlay'),\n    errorToast: document.getElementById('errorToast'),\n    successToast: document.getElementById('successToast'),\n    errorMessage: document.getElementById('errorMessage'),\n    successMessage: document.getElementById('successMessage'),\n    messageModal: document.getElementById('messageModal'),\n    messageContent: document.getElementById('messageContent'),\n    navigationList: document.getElementById('navigationList')\n};\n\n// Initialize\ndocument.addEventListener('DOMContentLoaded', function() {\n    initializeApp();\n});\n\nfunction initializeApp() {\n    // Bind event listeners\n    elements.browseDirectoryBtn.addEventListener('click', browseDirectory);\n    elements.directoryInput.addEventListener('keypress', function(e) {\n        if (e.key === 'Enter') {\n            browseDirectory();\n        }\n    });\n    
elements.fileSelect.addEventListener('change', onFileSelect);\n    elements.prevFileBtn.addEventListener('click', gotoPrevFile);\n    elements.nextFileBtn.addEventListener('click', gotoNextFile);\n    elements.loadBtn.addEventListener('click', loadTraceFile);\n    elements.refreshBtn.addEventListener('click', refreshFileList);\n    elements.expandAllBtn.addEventListener('click', expandAllSteps);\n    elements.collapseAllBtn.addEventListener('click', collapseAllSteps);\n    \n    // Set default directory path\n    setDefaultDirectory();\n    \n    // Initialize button states\n    updateNavigationButtons();\n    \n    // Add keyboard shortcut support\n    document.addEventListener('keydown', handleKeyboardShortcuts);\n}\n\n// Utility functions\nfunction showLoading() {\n    elements.loadingOverlay.classList.remove('d-none');\n}\n\nfunction hideLoading() {\n    elements.loadingOverlay.classList.add('d-none');\n}\n\nfunction showError(message) {\n    elements.errorMessage.textContent = message;\n    const toast = new bootstrap.Toast(elements.errorToast);\n    toast.show();\n}\n\nfunction showSuccess(message) {\n    elements.successMessage.textContent = message;\n    const toast = new bootstrap.Toast(elements.successToast);\n    toast.show();\n}\n\nfunction formatTimestamp(timestamp) {\n    if (!timestamp) return '';\n    try {\n        const date = new Date(timestamp);\n        return date.toLocaleString('zh-CN');\n    } catch (e) {\n        return timestamp;\n    }\n}\n\nfunction truncateText(text, maxLength = 100) {\n    if (!text) return '';\n    if (text.length <= maxLength) return text;\n    return text.substring(0, maxLength) + '...';\n}\n\nfunction formatFileSize(bytes) {\n    if (bytes === 0) return '0 B';\n    const k = 1024;\n    const sizes = ['B', 'KB', 'MB', 'GB'];\n    const i = Math.floor(Math.log(bytes) / Math.log(k));\n    return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];\n}\n\n// Handle MCP tool call display\nfunction 
formatMcpToolCallWithPlaceholders(text, placeholders) {\n    if (!text || typeof text !== 'string') return text;\n    \n    // MCP tool call regex - more lenient matching, including newlines\n    const mcpPattern = /<use_mcp_tool>\\s*<server_name>(.*?)<\\/server_name>\\s*<tool_name>(.*?)<\\/tool_name>\\s*<arguments>\\s*(.*?)\\s*<\\/arguments>\\s*<\\/use_mcp_tool>/gs;\n    \n    let placeholderCounter = 0;\n    \n    return text.replace(mcpPattern, (match, serverName, toolName, args) => {\n        // Clean and format arguments\n        let formattedArgs = args.trim();\n        \n        // First convert escaped newlines to actual newlines\n        formattedArgs = formattedArgs.replace(/\\\\n/g, '\\n');\n        \n        try {\n            // Try to format JSON arguments\n            const parsed = JSON.parse(formattedArgs);\n            formattedArgs = JSON.stringify(parsed, null, 2);\n        } catch (e) {\n            // If not JSON, keep as is but ensure newlines are correct\n            formattedArgs = formattedArgs.replace(/\\n/g, '\\n');\n        }\n        \n        const isBrowserAgent = serverName.trim() === 'browsing-agent';\n        const toolClass = isBrowserAgent ? 'browser-agent' : '';\n        const iconClass = isBrowserAgent ? 
'globe' : 'cog';\n        \n        // Create complete MCP tool call HTML structure\n        const mcpHtml = `<div class=\"mcp-tool-call ${toolClass}\">\n    <div class=\"mcp-tool-header\">\n        <i class=\"fas fa-${iconClass}\"></i>\n        <span class=\"mcp-tool-name\">${serverName.trim()}.${toolName.trim()}</span>\n    </div>\n    <div class=\"mcp-tool-content\">\n        <div class=\"mcp-xml-structure\">\n            <div class=\"xml-tag\">&lt;use_mcp_tool&gt;</div>\n            <div class=\"xml-content\">\n                <div class=\"xml-tag\">&lt;server_name&gt;${serverName.trim()}&lt;/server_name&gt;</div>\n                <div class=\"xml-tag\">&lt;tool_name&gt;${toolName.trim()}&lt;/tool_name&gt;</div>\n                <div class=\"xml-tag\">&lt;arguments&gt;</div>\n                <div class=\"xml-arguments\">${formattedArgs}</div>\n                <div class=\"xml-tag\">&lt;/arguments&gt;</div>\n            </div>\n            <div class=\"xml-tag\">&lt;/use_mcp_tool&gt;</div>\n        </div>\n    </div>\n</div>`;\n        \n        // Use simple placeholder ID to avoid complex JSON strings\n        const placeholderId = `MCP_PLACEHOLDER_${placeholderCounter++}`;\n        placeholders.set(placeholderId, mcpHtml);\n        \n        return `[${placeholderId}]`;\n    });\n}\n\n// Create new format tool call HTML\nfunction createNewFormatToolCallHTML(tool) {\n    const isBeowserAgent = tool.server_name.includes('browsing') || tool.server_name.includes('agent');\n    const toolClass = isBeowserAgent ? 'browser-agent' : '';\n    const iconClass = isBeowserAgent ? 
'globe' : 'cog';\n    \n    // Format arguments\n    let formattedArgs;\n    try {\n        if (typeof tool.arguments === 'string') {\n            formattedArgs = tool.arguments;\n        } else {\n            formattedArgs = JSON.stringify(tool.arguments, null, 2);\n        }\n    } catch (e) {\n        formattedArgs = String(tool.arguments);\n    }\n    \n    return `<div class=\"mcp-tool-call ${toolClass}\">\n    <div class=\"mcp-tool-header\">\n        <i class=\"fas fa-${iconClass}\"></i>\n        <span class=\"mcp-tool-name\">${tool.server_name}.${tool.tool_name}</span>\n        <span class=\"badge badge-format ms-2\">${tool.format || 'new'}</span>\n    </div>\n    <div class=\"mcp-tool-content\">\n        <div class=\"mcp-tool-args\">\n            <div class=\"mcp-args-label\">Arguments:</div>\n            <div class=\"xml-arguments\">${formattedArgs}</div>\n        </div>\n        ${tool.id ? `<div class=\"tool-id\"><small class=\"text-muted\">ID: ${tool.id}</small></div>` : ''}\n    </div>\n</div>`;\n}\n\n// Modified markdown rendering support - preserve markdown syntax, only handle newlines and MCP tool calls\nfunction renderMarkdown(text) {\n    if (!text || typeof text !== 'string') return '';\n    \n    let html = text;\n    let placeholders = new Map();\n    \n    // First process MCP tool calls, before HTML escaping\n    html = formatMcpToolCallWithPlaceholders(html, placeholders);\n    \n    // Escape HTML special characters, but protect MCP tool call placeholders\n    html = html.replace(/&/g, '&amp;')\n               .replace(/</g, '&lt;')\n               .replace(/>/g, '&gt;')\n               .replace(/\"/g, '&quot;')\n               .replace(/'/g, '&#39;');\n    \n    // Only handle newlines, preserve all markdown syntax\n    html = html.replace(/\\n/g, '<br>');\n    \n    // Finally process MCP tool call placeholders, insert HTML directly\n    placeholders.forEach((htmlContent, placeholderId) => {\n        html = 
html.replace(`[${placeholderId}]`, htmlContent);\n    });\n    \n    return html;\n}\n\n// 增强的内容渲染函数\nfunction isJsonString(str) {\n    try {\n        const trimmed = str.trim();\n        if ((trimmed.startsWith('{') && trimmed.endsWith('}')) || \n            (trimmed.startsWith('[') && trimmed.endsWith(']'))) {\n            JSON.parse(trimmed);\n            return true;\n        }\n        return false;\n    } catch (e) {\n        return false;\n    }\n}\n\nfunction formatJsonContent(content) {\n    try {\n        const trimmed = content.trim();\n        const parsed = JSON.parse(trimmed);\n        const formatted = JSON.stringify(parsed, null, 4);\n        return `<div class=\"code-block\"><pre><code>${formatted}</code></pre></div>`;\n    } catch (e) {\n        return content;\n    }\n}\n\nfunction renderContent(content, isBrowserAgent = false) {\n    if (!content) return '';\n    \n    // 检查是否为纯JSON字符串\n    if (isJsonString(content)) {\n        return formatJsonContent(content);\n    }\n    \n    // 直接渲染Markdown（已包含MCP工具调用处理）\n    let processedContent = renderMarkdown(content);\n    \n    // 如果是browser agent，添加特殊样式\n    if (isBrowserAgent) {\n        processedContent = `<div class=\"browser-agent-content\">${processedContent}</div>`;\n    }\n    \n    return processedContent;\n}\n\n// API调用函数\nasync function apiCall(url, options = {}) {\n    try {\n        const response = await fetch(url, {\n            headers: {\n                'Content-Type': 'application/json',\n                ...options.headers\n            },\n            ...options\n        });\n        \n        if (!response.ok) {\n            throw new Error(`HTTP error! 
status: ${response.status}`);\n        }\n        \n        return await response.json();\n    } catch (error) {\n        console.error('API call failed:', error);\n        throw error;\n    }\n}\n\n// 文件管理\nfunction setDefaultDirectory() {\n    // 设置默认目录为上级目录\n    elements.directoryInput.value = '../';\n    // 自动加载文件列表\n    refreshFileList();\n}\n\nasync function browseDirectory() {\n    const directory = elements.directoryInput.value.trim();\n    if (!directory) {\n        showError('请输入目录路径');\n        return;\n    }\n    \n    await refreshFileList(directory);\n}\n\nasync function refreshFileList(directory = null) {\n    try {\n        const targetDirectory = directory || elements.directoryInput.value.trim();\n        if (!targetDirectory) {\n            elements.fileSelect.innerHTML = '<option value=\"\">请先输入目录路径...</option>';\n            currentFileList = [];\n            currentFileIndex = -1;\n            updateNavigationButtons();\n            return;\n        }\n        \n        showLoading();\n        \n        const url = `/api/list_files?directory=${encodeURIComponent(targetDirectory)}`;\n        const data = await apiCall(url);\n        \n        elements.fileSelect.innerHTML = '<option value=\"\">选择Trace文件...</option>';\n        \n        if (data.files.length === 0) {\n            elements.fileSelect.innerHTML = '<option value=\"\">该目录下没有JSON文件</option>';\n            currentFileList = [];\n            currentFileIndex = -1;\n            showSuccess(`目录 \"${targetDirectory}\" 下没有找到JSON文件`);\n            updateNavigationButtons();\n            return;\n        }\n        \n        // 保存文件列表到全局变量\n        currentFileList = data.files;\n        currentFileIndex = -1;\n        \n        data.files.forEach((file, index) => {\n            const option = document.createElement('option');\n            option.value = file.path;\n            option.dataset.index = index;\n            const fileSize = formatFileSize(file.size);\n            const 
modifiedDate = new Date(file.modified * 1000).toLocaleString('zh-CN');\n            option.textContent = `${file.name} (${fileSize}, ${modifiedDate})`;\n            elements.fileSelect.appendChild(option);\n        });\n        \n        showSuccess(`在目录 \"${targetDirectory}\" 中找到 ${data.files.length} 个JSON文件`);\n        updateNavigationButtons();\n        \n    } catch (error) {\n        showError('获取文件列表失败: ' + error.message);\n        elements.fileSelect.innerHTML = '<option value=\"\">获取文件列表失败</option>';\n        currentFileList = [];\n        currentFileIndex = -1;\n        updateNavigationButtons();\n    } finally {\n        hideLoading();\n    }\n}\n\n// 文件切换功能\nfunction onFileSelect() {\n    const selectedOption = elements.fileSelect.options[elements.fileSelect.selectedIndex];\n    if (selectedOption && selectedOption.dataset.index !== undefined) {\n        currentFileIndex = parseInt(selectedOption.dataset.index);\n        updateNavigationButtons();\n    }\n}\n\nfunction gotoPrevFile() {\n    if (currentFileIndex > 0) {\n        currentFileIndex--;\n        selectFileByIndex(currentFileIndex);\n        loadTraceFile();\n    }\n}\n\nfunction gotoNextFile() {\n    if (currentFileIndex < currentFileList.length - 1) {\n        currentFileIndex++;\n        selectFileByIndex(currentFileIndex);\n        loadTraceFile();\n    }\n}\n\nfunction selectFileByIndex(index) {\n    if (index >= 0 && index < currentFileList.length) {\n        elements.fileSelect.selectedIndex = index + 1; // +1 因为第一个选项是\"选择Trace文件...\"\n        currentFileIndex = index;\n        updateNavigationButtons();\n    }\n}\n\nfunction updateNavigationButtons() {\n    const hasPrev = currentFileIndex > 0;\n    const hasNext = currentFileIndex >= 0 && currentFileIndex < currentFileList.length - 1;\n    \n    elements.prevFileBtn.disabled = !hasPrev;\n    elements.nextFileBtn.disabled = !hasNext;\n    \n    // 更新按钮提示文本\n    if (currentFileIndex >= 0 && currentFileList.length > 0) {\n        const 
prevFile = hasPrev ? currentFileList[currentFileIndex - 1] : null;\n        const nextFile = hasNext ? currentFileList[currentFileIndex + 1] : null;\n        \n        elements.prevFileBtn.title = prevFile ? `上一个: ${prevFile.name}` : '没有上一个文件';\n        elements.nextFileBtn.title = nextFile ? `下一个: ${nextFile.name}` : '没有下一个文件';\n    } else {\n        elements.prevFileBtn.title = '上一个文件';\n        elements.nextFileBtn.title = '下一个文件';\n    }\n}\n\n// 键盘快捷键处理\nfunction handleKeyboardShortcuts(event) {\n    // 只有在没有焦点在输入框时才处理快捷键\n    if (event.target.tagName === 'INPUT' || event.target.tagName === 'TEXTAREA' || event.target.tagName === 'SELECT') {\n        return;\n    }\n    \n    // 防止在模态框打开时触发\n    if (elements.messageModal.classList.contains('show')) {\n        return;\n    }\n    \n    switch (event.key) {\n        case 'ArrowLeft':\n            event.preventDefault();\n            if (!elements.prevFileBtn.disabled) {\n                gotoPrevFile();\n            }\n            break;\n        case 'ArrowRight':\n            event.preventDefault();\n            if (!elements.nextFileBtn.disabled) {\n                gotoNextFile();\n            }\n            break;\n        case 'Enter':\n            event.preventDefault();\n            if (elements.fileSelect.value) {\n                loadTraceFile();\n            }\n            break;\n        case 'r':\n        case 'R':\n            if (event.ctrlKey) {\n                event.preventDefault();\n                refreshFileList();\n            }\n            break;\n    }\n}\n\nasync function loadTraceFile() {\n    const selectedFile = elements.fileSelect.value;\n    if (!selectedFile) {\n        showError('请选择一个trace文件');\n        return;\n    }\n    \n    showLoading();\n    \n    try {\n        // 加载文件\n        await apiCall('/api/load_trace', {\n            method: 'POST',\n            body: JSON.stringify({ file_path: selectedFile })\n        });\n        \n        // 并行加载所有数据\n        const [basicInfo, 
executionSummary, performanceSummary, executionFlow, spansStats, stepLogsStats] = await Promise.all([\n            apiCall('/api/basic_info'),\n            apiCall('/api/execution_summary'),\n            apiCall('/api/performance_summary'),\n            apiCall('/api/execution_flow'),\n            apiCall('/api/spans_summary'),\n            apiCall('/api/step_logs_summary')\n        ]);\n        \n        // 更新界面\n        updateBasicInfo(basicInfo);\n        updateExecutionSummary(executionSummary);\n        updatePerformanceSummary(performanceSummary);\n        updateExecutionFlow(executionFlow);\n        updateSpansStats(spansStats);\n        updateStepLogsStats(stepLogsStats);\n        \n        // 显示当前文件信息\n        const currentFile = currentFileList[currentFileIndex];\n        if (currentFile) {\n            showSuccess(`文件加载成功: ${currentFile.name} (${currentFileIndex + 1}/${currentFileList.length})`);\n        } else {\n            showSuccess('文件加载成功');\n        }\n        \n    } catch (error) {\n        showError('加载文件失败: ' + error.message);\n    } finally {\n        hideLoading();\n    }\n}\n\n// 界面更新函数\nfunction updateBasicInfo(data) {\n    currentBasicInfo = data;\n    \n    const finalAnswer = data.final_boxed_answer || '暂无答案';\n    const groundTruth = data.ground_truth || '暂无正确答案';\n    \n    const html = `\n        <div class=\"stat-item\">\n            <span class=\"stat-label\">任务ID:</span>\n            <span class=\"stat-value\">${data.task_id || 'N/A'}</span>\n        </div>\n        <div class=\"answer-box final-answer\">\n            <div class=\"answer-label\">最终答案</div>\n            <div class=\"answer-content\">${finalAnswer}</div>\n        </div>\n        <div class=\"answer-box ground-truth\">\n            <div class=\"answer-label\">正确答案</div>\n            <div class=\"answer-content\">${groundTruth}</div>\n        </div>\n        <div class=\"stat-item\">\n            <span class=\"stat-label\">判断结果:</span>\n            <span 
class=\"stat-value badge ${data.final_judge_result === 'CORRECT' ? 'bg-success' : 'bg-danger'}\">${data.final_judge_result || 'N/A'}</span>\n        </div>\n        <div class=\"stat-item\">\n            <span class=\"stat-label\">判断类型:</span>\n            <span class=\"stat-value\">${data.judge_type || 'N/A'}</span>\n        </div>\n    `;\n    \n    elements.basicInfo.innerHTML = html;\n}\n\nfunction updateExecutionSummary(data) {\n    const html = `\n        <div class=\"stat-item\">\n            <span class=\"stat-label\">总步骤数:</span>\n            <span class=\"stat-value\">${data.total_steps}</span>\n        </div>\n        <div class=\"stat-item\">\n            <span class=\"stat-label\">工具调用次数:</span>\n            <span class=\"stat-value\">${data.total_tool_calls}</span>\n        </div>\n        <div class=\"stat-item\">\n            <span class=\"stat-label\">Browser会话数:</span>\n            <span class=\"stat-value\">${data.browser_sessions_count}</span>\n        </div>\n        <div class=\"stat-item\">\n            <span class=\"stat-label\">browsing-agent.search_and_browse:</span>\n            <span class=\"stat-value\">${data.tool_usage_distribution['browsing-agent.search_and_browse'] || 0}</span>\n        </div>\n    `;\n    \n    elements.executionSummary.innerHTML = html;\n}\n\nfunction updatePerformanceSummary(data) {\n    if (!data || Object.keys(data).length === 0) {\n        elements.performanceSummary.innerHTML = '<p class=\"text-muted\">无性能数据</p>';\n        return;\n    }\n    \n    const html = `\n        <div class=\"stat-item\">\n            <span class=\"stat-label\">总执行时间:</span>\n            <span class=\"stat-value\">${(data.total_wall_time || 0).toFixed(2)}s</span>\n        </div>\n        <div class=\"stat-item\">\n            <span class=\"stat-label\">browsing_agent:</span>\n            <span class=\"stat-value\">${data.primary_breakdown?.browsing_agent ? 
(data.primary_breakdown.browsing_agent.total || 0).toFixed(2) : 0}s</span>\n        </div>\n        <div class=\"stat-item\">\n            <span class=\"stat-label\">main_agent:</span>\n            <span class=\"stat-value\">${data.primary_breakdown?.main_agent ? (data.primary_breakdown.main_agent.total || 0).toFixed(2) : 0}s</span>\n        </div>\n    `;\n    \n    elements.performanceSummary.innerHTML = html;\n}\n\nfunction updateExecutionFlow(data) {\n    currentFlowData = data;\n    \n    if (!data || data.length === 0) {\n        elements.executionFlow.innerHTML = '<p class=\"text-muted\">无执行流程数据</p>';\n        updateNavigationList([]);\n        return;\n    }\n    \n    // 确保每个步骤都是独立的顶级元素\n    const stepsContainer = document.createElement('div');\n    stepsContainer.className = 'execution-steps-container';\n    \n    data.forEach((step, index) => {\n        const stepElement = document.createElement('div');\n        stepElement.innerHTML = createStepHTML(step, index);\n        stepsContainer.appendChild(stepElement.firstElementChild);\n    });\n    \n    elements.executionFlow.innerHTML = '';\n    elements.executionFlow.appendChild(stepsContainer);\n    \n    // 更新导航列表\n    updateNavigationList(data);\n    \n    // 绑定事件监听器\n    bindStepEventListeners();\n}\n\nfunction createStepHTML(step, index) {\n    const roleClass = step.role === 'user' ? 'user-message' : \n                     step.role === 'tool' ? 'tool-message' : \n                     step.role === 'system' ? 'system-message' : \n                     'assistant-message';\n    const agentClass = step.agent.includes('browser') ? 
'browser-agent' : '';\n    \n    // 渲染内容\n    const renderedPreview = renderContent(step.content_preview);\n    const renderedFullContent = renderContent(step.full_content);\n    \n    return `\n        <div class=\"execution-step fade-in\" data-step-id=\"${step.step_id}\" data-agent=\"${step.agent}\" id=\"step-${index}\">\n            <div class=\"step-header ${roleClass} ${agentClass}\" data-toggle=\"collapse\" data-target=\"#step-content-${index}\">\n                <div class=\"d-flex justify-content-between align-items-center\">\n                    <div>\n                        <span class=\"badge badge-role badge-${step.role}\">${step.role}</span>\n                        <span class=\"badge badge-browser ms-2\">${step.agent}</span>\n                        ${step.tool_calls.length > 0 ? `<span class=\"badge bg-warning text-dark ms-2\">${step.tool_calls.length} 工具调用</span>` : ''}\n                        ${step.browser_session ? `<span class=\"badge bg-success ms-2\">Browser会话</span>` : ''}\n                    </div>\n                    <div class=\"d-flex align-items-center\">\n                        <span class=\"timestamp me-2\">${formatTimestamp(step.timestamp)}</span>\n                        <span class=\"step-toggle\">\n                            <i class=\"fas fa-chevron-down\"></i>\n                        </span>\n                    </div>\n                </div>\n                <div class=\"content-preview mt-2\">\n                    <div class=\"preview-text\">\n                        ${renderedPreview}\n                    </div>\n                </div>\n            </div>\n            \n            <div class=\"step-content collapse\" id=\"step-content-${index}\">\n                <div class=\"mb-3\">\n                    <h6>完整内容:</h6>\n                    <div class=\"rendered-content\">${renderedFullContent}</div>\n                </div>\n                \n                ${step.tool_calls.length > 0 ? 
`\n                    <div class=\"mb-3\">\n                        <h6>工具调用:</h6>\n                        ${step.tool_calls.map(tool => createToolCallHTML(tool)).join('')}\n                    </div>\n                ` : ''}\n                \n                ${step.browser_flow && step.browser_flow.length > 0 ? `\n                    <div class=\"mb-3\">\n                        <h6>Browser会话流程:</h6>\n                        <div class=\"browser-session\">\n                            <div class=\"browser-session-header\">\n                                <i class=\"fas fa-globe\"></i> ${step.browser_session} (${step.browser_flow.length} 步骤)\n                            </div>\n                            ${step.browser_flow.map(browserStep => createBrowserStepHTML(browserStep, index)).join('')}\n                        </div>\n                    </div>\n                ` : ''}\n                \n                <div class=\"d-flex justify-content-end\">\n                    <button class=\"btn btn-outline-primary btn-sm\" onclick=\"showFullMessage(${step.step_id})\">\n                        <i class=\"fas fa-expand\"></i> 查看详情\n                    </button>\n                </div>\n            </div>\n        </div>\n    `;\n}\n\nfunction createToolCallHTML(tool) {\n    // 如果是新格式的工具调用，使用新的渲染方式\n    if (tool.format === 'new') {\n        return createNewFormatToolCallHTML(tool);\n    }\n    \n    // 旧格式（MCP或其他）使用原有的渲染方式\n    const isBeowserAgent = tool.server_name === 'browsing-agent' || tool.server_name.includes('agent');\n    const toolClass = isBeowserAgent ? 'browser-agent' : '';\n    \n    return `\n        <div class=\"tool-call ${toolClass}\">\n            <div class=\"tool-call-header\">\n                <i class=\"fas fa-${isBeowserAgent ? 
'globe' : 'wrench'}\"></i>\n                ${tool.server_name}.${tool.tool_name}\n                <span class=\"badge badge-format ms-2\">${tool.format || 'mcp'}</span>\n            </div>\n            <div class=\"tool-arguments\">\n                <strong>参数:</strong>\n                <div class=\"code-block\">${JSON.stringify(tool.arguments, null, 2)}</div>\n            </div>\n        </div>\n    `;\n}\n\nfunction createBrowserStepHTML(step, parentIndex) {\n    // 为browser step创建唯一的ID\n    const browserId = `browser-${parentIndex}-${step.step_id}`;\n    \n    // 判断内容是否被截断\n    const isContentTruncated = step.full_content && step.content_preview.length < step.full_content.length;\n    \n    // 渲染内容\n    const renderedPreview = renderContent(step.content_preview);\n    const renderedFullContent = renderContent(step.full_content);\n    \n    return `\n        <div class=\"browser-step ${step.role}\" id=\"browser-step-${parentIndex}-${step.step_id}\">\n            <div class=\"d-flex justify-content-between align-items-center mb-2\">\n                <div>\n                    <span class=\"badge badge-role badge-${step.role}\">${step.role}</span>\n                    ${step.tool_calls.length > 0 ? `<span class=\"badge bg-warning text-dark ms-2\">${step.tool_calls.length} 工具</span>` : ''}\n                </div>\n                <span class=\"timestamp\">${formatTimestamp(step.timestamp)}</span>\n            </div>\n            <div class=\"content-preview\" id=\"browser-preview-${browserId}\">\n                <div class=\"preview-text\">\n                    ${renderedPreview}\n                    ${isContentTruncated ? 
`\n                        <span class=\"text-muted\">...</span>\n                        <button class=\"btn btn-link btn-sm p-0 ms-2 expand-preview-btn\" onclick=\"toggleBrowserPreview('${browserId}', ${parentIndex}, ${step.step_id})\" data-expanded=\"false\">\n                            <i class=\"fas fa-chevron-down\"></i> 展开\n                        </button>\n                    ` : ''}\n                </div>\n            </div>\n            ${step.tool_calls.length > 0 ? `\n                <div class=\"mt-2\">\n                    <h7>工具调用:</h7>\n                    ${step.tool_calls.map(tool => createToolCallHTML(tool)).join('')}\n                </div>\n            ` : ''}\n        </div>\n    `;\n}\n\nfunction updateSpansStats(data) {\n    if (!data || Object.keys(data).length === 0) {\n        elements.spansStats.innerHTML = '<p class=\"text-muted\">无Spans数据</p>';\n        return;\n    }\n    \n    const html = `\n        <div class=\"stat-item\">\n            <span class=\"stat-label\">总Spans数:</span>\n            <span class=\"stat-value\">${data.total_spans}</span>\n        </div>\n        <div class=\"stat-item\">\n            <span class=\"stat-label\">总时长:</span>\n            <span class=\"stat-value\">${(data.total_duration || 0).toFixed(2)}s</span>\n        </div>\n        <div class=\"mt-3\">\n            <h6>Agent统计:</h6>\n            ${Object.entries(data.agent_stats || {}).map(([agent, stats]) => `\n                <div class=\"mb-2\">\n                    <strong>${agent}:</strong>\n                    <div class=\"stat-item\">\n                        <span class=\"stat-label\">数量:</span>\n                        <span class=\"stat-value\">${stats.count}</span>\n                    </div>\n                    <div class=\"stat-item\">\n                        <span class=\"stat-label\">时长:</span>\n                        <span class=\"stat-value\">${(stats.total_duration || 0).toFixed(2)}s</span>\n                    </div>\n              
  </div>\n            `).join('')}\n        </div>\n    `;\n    \n    elements.spansStats.innerHTML = html;\n}\n\nfunction updateStepLogsStats(data) {\n    if (!data || Object.keys(data).length === 0) {\n        elements.stepLogsStats.innerHTML = '<p class=\"text-muted\">无步骤日志数据</p>';\n        return;\n    }\n    \n    const html = `\n        <div class=\"stat-item\">\n            <span class=\"stat-label\">总日志数:</span>\n            <span class=\"stat-value\">${data.total_logs}</span>\n        </div>\n        <div class=\"mt-3\">\n            <h6>状态分布:</h6>\n            ${Object.entries(data.status_distribution || {}).map(([status, count]) => `\n                <div class=\"stat-item\">\n                    <span class=\"stat-label\">${status}:</span>\n                    <span class=\"stat-value\">${count}</span>\n                </div>\n            `).join('')}\n        </div>\n        <div class=\"mt-3\">\n            <h6>步骤类型分布:</h6>\n            ${Object.entries(data.step_type_distribution || {}).map(([type, count]) => `\n                <div class=\"stat-item\">\n                    <span class=\"stat-label\">${type}:</span>\n                    <span class=\"stat-value\">${count}</span>\n                </div>\n            `).join('')}\n        </div>\n    `;\n    \n    elements.stepLogsStats.innerHTML = html;\n}\n\n// 事件处理函数\nfunction bindStepEventListeners() {\n    // 步骤折叠/展开\n    document.querySelectorAll('.step-header').forEach(header => {\n        header.addEventListener('click', function() {\n            const target = this.getAttribute('data-target');\n            const content = document.querySelector(target);\n            const icon = this.querySelector('.step-toggle i');\n            \n            if (content.classList.contains('show')) {\n                content.classList.remove('show');\n                icon.className = 'fas fa-chevron-down';\n            } else {\n                content.classList.add('show');\n                icon.className = 
'fas fa-chevron-up';\n            }\n        });\n    });\n}\n\nfunction expandAllSteps() {\n    // 展开main agent的步骤\n    document.querySelectorAll('.step-content').forEach(content => {\n        content.classList.add('show');\n    });\n    document.querySelectorAll('.step-toggle i').forEach(icon => {\n        icon.className = 'fas fa-chevron-up';\n    });\n    \n    // 展开browser agent的预览内容\n    document.querySelectorAll('.expand-preview-btn').forEach(button => {\n        const isExpanded = button.getAttribute('data-expanded') === 'true';\n        if (!isExpanded) {\n            button.click();\n        }\n    });\n}\n\nfunction collapseAllSteps() {\n    // 收起main agent的步骤\n    document.querySelectorAll('.step-content').forEach(content => {\n        content.classList.remove('show');\n    });\n    document.querySelectorAll('.step-toggle i').forEach(icon => {\n        icon.className = 'fas fa-chevron-down';\n    });\n    \n    // 收起browser agent的预览内容\n    document.querySelectorAll('.expand-preview-btn').forEach(button => {\n        const isExpanded = button.getAttribute('data-expanded') === 'true';\n        if (isExpanded) {\n            button.click();\n        }\n    });\n}\n\n// 切换内容预览展开/收起\n// 切换browser预览展开/收起\nfunction toggleBrowserPreview(browserId, parentIndex, browserStepId) {\n    const previewElement = document.getElementById(`browser-preview-${browserId}`);\n    const button = previewElement.querySelector('.expand-preview-btn');\n    const isExpanded = button.getAttribute('data-expanded') === 'true';\n    \n    if (!currentFlowData) return;\n    \n    const parentStep = currentFlowData[parentIndex];\n    if (!parentStep || !parentStep.browser_flow) return;\n    \n    const browserStep = parentStep.browser_flow.find(step => step.step_id === browserStepId);\n    if (!browserStep) return;\n    \n    if (isExpanded) {\n        // 收起\n        const renderedPreview = renderContent(browserStep.content_preview);\n        
previewElement.querySelector('.preview-text').innerHTML = `\n            ${renderedPreview}\n            <span class=\"text-muted\">...</span>\n            <button class=\"btn btn-link btn-sm p-0 ms-2 expand-preview-btn\" onclick=\"toggleBrowserPreview('${browserId}', ${parentIndex}, ${browserStepId})\" data-expanded=\"false\">\n                <i class=\"fas fa-chevron-down\"></i> 展开\n            </button>\n        `;\n    } else {\n        // 展开\n        const renderedFullContent = renderContent(browserStep.full_content);\n        previewElement.querySelector('.preview-text').innerHTML = `\n            ${renderedFullContent}\n            <button class=\"btn btn-link btn-sm p-0 ms-2 expand-preview-btn\" onclick=\"toggleBrowserPreview('${browserId}', ${parentIndex}, ${browserStepId})\" data-expanded=\"true\">\n                <i class=\"fas fa-chevron-up\"></i> 收起\n            </button>\n        `;\n    }\n}\n\nfunction showFullMessage(stepId) {\n    if (!currentFlowData) return;\n    \n    const step = currentFlowData.find(s => s.step_id === stepId);\n    if (!step) return;\n    \n    const renderedFullContent = renderContent(step.full_content);\n    \n    const modal = new bootstrap.Modal(elements.messageModal);\n    elements.messageContent.innerHTML = `\n        <div class=\"mb-3\">\n            <h6>步骤信息:</h6>\n            <div class=\"row\">\n                <div class=\"col-md-4\"><strong>Step ID:</strong> ${step.step_id}</div>\n                <div class=\"col-md-4\"><strong>Agent:</strong> ${step.agent}</div>\n                <div class=\"col-md-4\"><strong>Role:</strong> ${step.role}</div>\n            </div>\n            <div class=\"row mt-2\">\n                <div class=\"col-md-6\"><strong>时间:</strong> ${formatTimestamp(step.timestamp)}</div>\n                <div class=\"col-md-6\"><strong>工具调用:</strong> ${step.tool_calls.length}</div>\n            </div>\n        </div>\n        \n        <div class=\"mb-3\">\n            <h6>完整内容:</h6>\n            
<div class=\"rendered-content\">${renderedFullContent}</div>\n        </div>\n        \n        ${step.tool_calls.length > 0 ? `\n            <div class=\"mb-3\">\n                <h6>工具调用详情:</h6>\n                ${step.tool_calls.map(tool => `\n                    <div class=\"card mb-2\">\n                        <div class=\"card-body\">\n                            <h7 class=\"card-title\">${tool.server_name}.${tool.tool_name}</h7>\n                            <div class=\"code-block\">${JSON.stringify(tool.arguments, null, 2)}</div>\n                        </div>\n                    </div>\n                `).join('')}\n            </div>\n        ` : ''}\n        \n        ${step.browser_flow && step.browser_flow.length > 0 ? `\n            <div class=\"mb-3\">\n                <h6>Browser会话详情:</h6>\n                <div class=\"accordion\" id=\"browserAccordion\">\n                    ${step.browser_flow.map((browserStep, index) => {\n                        const renderedBrowserContent = renderContent(browserStep.full_content);\n                        return `\n                            <div class=\"accordion-item\">\n                                <h2 class=\"accordion-header\">\n                                    <button class=\"accordion-button collapsed\" type=\"button\" data-bs-toggle=\"collapse\" data-bs-target=\"#browserStep${index}\">\n                                        Browser Step ${index + 1}: ${browserStep.role}\n                                        ${browserStep.tool_calls.length > 0 ? 
`(${browserStep.tool_calls.length} 工具调用)` : ''}\n                                    </button>\n                                </h2>\n                                <div id=\"browserStep${index}\" class=\"accordion-collapse collapse\">\n                                    <div class=\"accordion-body\">\n                                        <div class=\"rendered-content\">${renderedBrowserContent}</div>\n                                        ${browserStep.tool_calls.length > 0 ? `\n                                            <div class=\"mt-2\">\n                                                <strong>工具调用:</strong>\n                                                ${browserStep.tool_calls.map(tool => `\n                                                    <div class=\"small text-muted\">\n                                                        ${tool.server_name}.${tool.tool_name}\n                                                    </div>\n                                                `).join('')}\n                                            </div>\n                                        ` : ''}\n                                    </div>\n                                </div>\n                            </div>\n                        `;\n                    }).join('')}\n                </div>\n            </div>\n        ` : ''}\n    `;\n    \n    modal.show();\n} \n\n// ==================== 导航功能 ====================\n\nfunction updateNavigationList(data) {\n    if (!data || data.length === 0) {\n        elements.navigationList.innerHTML = '<p class=\"text-muted p-3 mb-0\">暂无步骤</p>';\n        return;\n    }\n    \n    const navigationHTML = data.map((step, index) => {\n        const summary = truncateText(step.content_preview, 50);\n        const toolsInfo = step.tool_calls.length > 0 ? ` (${step.tool_calls.length}工具)` : '';\n        const browserInfo = step.browser_session ? 
' [浏览器]' : '';\n        \n        let html = `\n            <div class=\"nav-item\" data-step-index=\"${index}\" onclick=\"scrollToStep(${index})\">\n                <div class=\"d-flex align-items-center\">\n                    <span class=\"step-number\">${index + 1}</span>\n                    <span class=\"step-role ${step.role}\">${step.role}</span>\n                    ${step.browser_flow && step.browser_flow.length > 0 ? `\n                        <span class=\"browser-toggle\" onclick=\"toggleBrowserNav(${index}, event)\">\n                            <i class=\"fas fa-chevron-down\"></i>\n                        </span>\n                    ` : ''}\n                </div>\n                <div class=\"step-summary\">${summary}${toolsInfo}${browserInfo}</div>\n            </div>\n        `;\n        \n        // 添加browser子步骤\n        if (step.browser_flow && step.browser_flow.length > 0) {\n            html += `\n                <div class=\"browser-sub-steps\" id=\"browser-nav-${index}\">\n                    ${step.browser_flow.map((browserStep, browserIndex) => {\n                        const browserSummary = truncateText(browserStep.content_preview, 40);\n                        const browserToolsInfo = browserStep.tool_calls.length > 0 ? 
` (${browserStep.tool_calls.length}工具)` : '';\n                        \n                        return `\n                            <div class=\"nav-item browser-sub-step\" data-step-index=\"${index}\" data-browser-step-id=\"${browserStep.step_id}\" onclick=\"scrollToBrowserStep(${index}, ${browserStep.step_id})\">\n                                <div class=\"d-flex align-items-center\">\n                                    <span class=\"step-number\">${index + 1}.${browserIndex + 1}</span>\n                                    <span class=\"step-role ${browserStep.role}\">${browserStep.role}</span>\n                                </div>\n                                <div class=\"step-summary\">${browserSummary}${browserToolsInfo}</div>\n                            </div>\n                        `;\n                    }).join('')}\n                </div>\n            `;\n        }\n        \n        return html;\n    }).join('');\n    \n    elements.navigationList.innerHTML = navigationHTML;\n}\n\nfunction scrollToStep(stepIndex) {\n    const stepElement = document.getElementById(`step-${stepIndex}`);\n    if (stepElement) {\n        stepElement.scrollIntoView({ \n            behavior: 'smooth', \n            block: 'start' \n        });\n        \n        // 更新活跃的导航项\n        updateActiveNavItem(stepIndex);\n        \n        // 如果步骤是收起的，自动展开\n        const stepContent = document.getElementById(`step-content-${stepIndex}`);\n        if (stepContent && !stepContent.classList.contains('show')) {\n            const collapseInstance = new bootstrap.Collapse(stepContent, {\n                toggle: false\n            });\n            collapseInstance.show();\n        }\n    }\n}\n\nfunction scrollToBrowserStep(parentIndex, browserStepId) {\n    const browserStepElement = document.getElementById(`browser-step-${parentIndex}-${browserStepId}`);\n    if (browserStepElement) {\n        browserStepElement.scrollIntoView({ \n            behavior: 'smooth', \n         
   block: 'start' \n        });\n        \n        // 更新活跃的导航项\n        updateActiveNavItem(parentIndex, browserStepId);\n        \n        // 确保父步骤是展开的\n        const stepContent = document.getElementById(`step-content-${parentIndex}`);\n        if (stepContent && !stepContent.classList.contains('show')) {\n            const collapseInstance = new bootstrap.Collapse(stepContent, {\n                toggle: false\n            });\n            collapseInstance.show();\n        }\n    }\n}\n\nfunction toggleBrowserNav(stepIndex, event) {\n    event.stopPropagation(); // 阻止事件冒泡\n    \n    const browserNavElement = document.getElementById(`browser-nav-${stepIndex}`);\n    const toggleIcon = event.target.closest('.browser-toggle').querySelector('i');\n    \n    if (browserNavElement.classList.contains('expanded')) {\n        browserNavElement.classList.remove('expanded');\n        toggleIcon.className = 'fas fa-chevron-down';\n    } else {\n        browserNavElement.classList.add('expanded');\n        toggleIcon.className = 'fas fa-chevron-up';\n    }\n}\n\nfunction updateActiveNavItem(activeIndex, browserStepId = null) {\n    // 移除所有活跃状态\n    const navItems = elements.navigationList.querySelectorAll('.nav-item');\n    navItems.forEach(item => item.classList.remove('active'));\n    \n    if (browserStepId) {\n        // 激活browser子步骤\n        const browserNavItem = elements.navigationList.querySelector(`[data-step-index=\"${activeIndex}\"][data-browser-step-id=\"${browserStepId}\"]`);\n        if (browserNavItem) {\n            browserNavItem.classList.add('active');\n        }\n    } else {\n        // 激活主步骤\n        const activeItem = elements.navigationList.querySelector(`[data-step-index=\"${activeIndex}\"]:not([data-browser-step-id])`);\n        if (activeItem) {\n            activeItem.classList.add('active');\n        }\n    }\n}\n\n// 监听滚动事件，自动更新导航激活状态\nlet scrollTimeout;\nfunction handleScroll() {\n    clearTimeout(scrollTimeout);\n    scrollTimeout = 
setTimeout(() => {\n        if (!currentFlowData) return;\n        \n        const steps = document.querySelectorAll('.execution-step');\n        const browserSteps = document.querySelectorAll('.browser-step');\n        const scrollTop = window.pageYOffset || document.documentElement.scrollTop;\n        const windowHeight = window.innerHeight;\n        \n        let activeIndex = 0;\n        let activeBrowserStepId = null;\n        let minDistance = Infinity;\n        \n        // 检查browser子步骤\n        browserSteps.forEach((browserStep) => {\n            const rect = browserStep.getBoundingClientRect();\n            const distance = Math.abs(rect.top - windowHeight / 3);\n            \n            if (distance < minDistance && rect.top < windowHeight * 0.7) {\n                minDistance = distance;\n                const id = browserStep.id;\n                const matches = id.match(/browser-step-(\\d+)-(\\d+)/);\n                if (matches) {\n                    activeIndex = parseInt(matches[1]);\n                    activeBrowserStepId = parseInt(matches[2]);\n                }\n            }\n        });\n        \n        // 如果没有找到活跃的browser步骤，检查主步骤\n        if (!activeBrowserStepId) {\n            steps.forEach((step, index) => {\n                const rect = step.getBoundingClientRect();\n                const distance = Math.abs(rect.top - windowHeight / 3);\n                \n                if (distance < minDistance && rect.top < windowHeight * 0.7) {\n                    minDistance = distance;\n                    activeIndex = index;\n                    activeBrowserStepId = null;\n                }\n            });\n        }\n        \n        updateActiveNavItem(activeIndex, activeBrowserStepId);\n    }, 100);\n}\n\n// 绑定滚动事件\nwindow.addEventListener('scroll', handleScroll);"
  },
  {
    "path": "apps/visualize-trace/templates/index.html",
    "content": "<!DOCTYPE html>\n<html lang=\"en\">\n<head>\n    <meta charset=\"UTF-8\">\n    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n    <title>Trace Analysis Dashboard</title>\n    <link href=\"https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css\" rel=\"stylesheet\">\n    <link href=\"https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css\" rel=\"stylesheet\">\n    <link href=\"{{ url_for('static', filename='css/style.css') }}\" rel=\"stylesheet\">\n</head>\n<body>\n    <div class=\"container-fluid\">\n        <!-- Header -->\n        <div class=\"row\">\n            <div class=\"col-12\">\n                <nav class=\"navbar navbar-expand-lg navbar-dark bg-primary\">\n                    <div class=\"container-fluid\">\n                        <a class=\"navbar-brand\" href=\"#\">\n                            <i class=\"fas fa-chart-line\"></i> Trace Analysis Dashboard\n                        </a>\n                        <div class=\"navbar-nav ms-auto\">\n                            <div class=\"nav-item me-2\">\n                                <div class=\"input-group input-group-sm\">\n                                    <span class=\"input-group-text\">Directory:</span>\n                                    <input type=\"text\" class=\"form-control\" id=\"directoryInput\" placeholder=\"Enter directory path...\" style=\"width: 200px;\">\n                                    <button class=\"btn btn-outline-light\" type=\"button\" id=\"browseDirectoryBtn\">\n                                        <i class=\"fas fa-folder-open\"></i>\n                                    </button>\n                                </div>\n                            </div>\n                            <div class=\"nav-item\">\n                                <div class=\"input-group input-group-sm file-navigation\">\n                                    <button class=\"btn btn-outline-light nav-btn\" 
type=\"button\" id=\"prevFileBtn\" title=\"Previous file\">\n                                        <i class=\"fas fa-chevron-left\"></i>\n                                    </button>\n                                    <select class=\"form-select form-select-sm\" id=\"fileSelect\" style=\"min-width: 250px;\">\n                                        <option value=\"\">Select Trace file...</option>\n                                    </select>\n                                    <button class=\"btn btn-outline-light nav-btn\" type=\"button\" id=\"nextFileBtn\" title=\"Next file\">\n                                        <i class=\"fas fa-chevron-right\"></i>\n                                    </button>\n                                </div>\n                            </div>\n                            <button class=\"btn btn-outline-light btn-sm ms-2\" id=\"loadBtn\">\n                                <i class=\"fas fa-upload\"></i> Load\n                            </button>\n                            <button class=\"btn btn-outline-light btn-sm ms-2\" id=\"refreshBtn\">\n                                <i class=\"fas fa-sync\"></i> Refresh\n                            </button>\n                        </div>\n                    </div>\n                </nav>\n            </div>\n        </div>\n\n        <!-- Top summary information -->\n        <div class=\"row mt-3\">\n            <div class=\"col-12\">\n                <div class=\"card summary-panel\">\n                    <div class=\"card-body\">\n                        <div class=\"row\">\n                            <!-- Basic information -->\n                            <div class=\"col-md-4\">\n                                <h6><i class=\"fas fa-info-circle\"></i> Basic Information</h6>\n                                <div id=\"basicInfo\">\n                                    <p class=\"text-muted\">Please load a trace file first</p>\n                                </div>\n          
                  </div>\n                            \n                            <!-- Execution summary -->\n                            <div class=\"col-md-4\">\n                                <h6><i class=\"fas fa-chart-pie\"></i> Execution Summary</h6>\n                                <div id=\"executionSummary\">\n                                    <p class=\"text-muted\">Please load a trace file first</p>\n                                </div>\n                            </div>\n                            \n                            <!-- Performance summary -->\n                            <div class=\"col-md-4\">\n                                <h6><i class=\"fas fa-clock\"></i> Performance Summary</h6>\n                                <div id=\"performanceSummary\">\n                                    <p class=\"text-muted\">Please load a trace file first</p>\n                                </div>\n                            </div>\n                        </div>\n                    </div>\n                </div>\n            </div>\n        </div>\n\n        <!-- Main content -->\n        <div class=\"row mt-3\">\n            <!-- Left navigation directory -->\n            <div class=\"col-md-2\">\n                <div class=\"card navigation-panel\">\n                    <div class=\"card-header\">\n                        <h6><i class=\"fas fa-list\"></i> Step Navigation</h6>\n                    </div>\n                    <div class=\"card-body p-0\">\n                        <div class=\"navigation-list\" id=\"navigationList\">\n                            <p class=\"text-muted p-3 mb-0\">Please load a trace file first</p>\n                        </div>\n                    </div>\n                </div>\n            </div>\n\n            <!-- Right panel - Execution flow -->\n            <div class=\"col-md-10\">\n                <div class=\"card\">\n                    <div class=\"card-header d-flex justify-content-between 
align-items-center\">\n                        <h5><i class=\"fas fa-project-diagram\"></i> Execution Flow</h5>\n                        <div>\n                            <button class=\"btn btn-outline-primary btn-sm\" id=\"expandAllBtn\">\n                                <i class=\"fas fa-expand\"></i> Expand All\n                            </button>\n                            <button class=\"btn btn-outline-primary btn-sm\" id=\"collapseAllBtn\">\n                                <i class=\"fas fa-compress\"></i> Collapse All\n                            </button>\n                        </div>\n                    </div>\n                    <div class=\"card-body\" id=\"executionFlow\">\n                        <p class=\"text-muted\">Please load a trace file first</p>\n                    </div>\n                </div>\n            </div>\n        </div>\n\n        <!-- Bottom statistics -->\n        <div class=\"row mt-3\">\n            <div class=\"col-md-6\">\n                <div class=\"card\">\n                    <div class=\"card-header\">\n                        <h5><i class=\"fas fa-layer-group\"></i> Spans Statistics</h5>\n                    </div>\n                    <div class=\"card-body\" id=\"spansStats\">\n                        <p class=\"text-muted\">Please load a trace file first</p>\n                    </div>\n                </div>\n            </div>\n            <div class=\"col-md-6\">\n                <div class=\"card\">\n                    <div class=\"card-header\">\n                        <h5><i class=\"fas fa-list-ul\"></i> Step Logs Statistics</h5>\n                    </div>\n                    <div class=\"card-body\" id=\"stepLogsStats\">\n                        <p class=\"text-muted\">Please load a trace file first</p>\n                    </div>\n                </div>\n            </div>\n        </div>\n    </div>\n\n    <!-- Keyboard shortcuts hint -->\n    <div class=\"position-fixed bottom-0 start-0 p-3\" 
style=\"z-index: 10;\">\n        <div class=\"card border-0 shadow-sm\" style=\"background-color: rgba(0,0,0,0.8); color: white; font-size: 12px;\">\n            <div class=\"card-body p-2\">\n                <div class=\"text-center\">\n                    <strong>Shortcuts:</strong> \n                    <span class=\"badge bg-secondary mx-1\">←→</span> Switch files \n                    <span class=\"badge bg-secondary mx-1\">Enter</span> Load \n                    <span class=\"badge bg-secondary mx-1\">Ctrl+R</span> Refresh\n                </div>\n            </div>\n        </div>\n    </div>\n\n    <!-- Message details modal -->\n    <div class=\"modal fade\" id=\"messageModal\" tabindex=\"-1\">\n        <div class=\"modal-dialog modal-lg\">\n            <div class=\"modal-content\">\n                <div class=\"modal-header\">\n                    <h5 class=\"modal-title\">Message Details</h5>\n                    <button type=\"button\" class=\"btn-close\" data-bs-dismiss=\"modal\"></button>\n                </div>\n                <div class=\"modal-body\">\n                    <div id=\"messageContent\"></div>\n                </div>\n            </div>\n        </div>\n    </div>\n\n    <!-- Loading overlay -->\n    <div class=\"loading-overlay d-none\" id=\"loadingOverlay\">\n        <div class=\"spinner-border text-primary\" role=\"status\">\n            <span class=\"visually-hidden\">Loading...</span>\n        </div>\n    </div>\n\n    <!-- Toast notifications -->\n    <div class=\"toast-container position-fixed top-0 end-0 p-3\">\n        <div id=\"errorToast\" class=\"toast\" role=\"alert\">\n            <div class=\"toast-header bg-danger text-white\">\n                <strong class=\"me-auto\">Error</strong>\n                <button type=\"button\" class=\"btn-close btn-close-white\" data-bs-dismiss=\"toast\"></button>\n            </div>\n            <div class=\"toast-body\" id=\"errorMessage\"></div>\n        </div>\n        <div 
id=\"successToast\" class=\"toast\" role=\"alert\">\n            <div class=\"toast-header bg-success text-white\">\n                <strong class=\"me-auto\">Success</strong>\n                <button type=\"button\" class=\"btn-close btn-close-white\" data-bs-dismiss=\"toast\"></button>\n            </div>\n            <div class=\"toast-body\" id=\"successMessage\"></div>\n        </div>\n    </div>\n\n    <script src=\"https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js\"></script>\n    <script src=\"{{ url_for('static', filename='js/script.js') }}\"></script>\n</body>\n</html> "
  },
  {
    "path": "apps/visualize-trace/trace_analyzer.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport json\nimport re\nfrom typing import Any, Dict, List, Optional\n\n\nclass TraceAnalyzer:\n    \"\"\"\n    Class for analyzing trace JSON files, convenient for reading and accessing important information\n\n    Supports two tool call formats:\n    1. Old format (MCP): Tool calls using XML tag format in content\n    2. New format: Tool calls using tool_calls field directly in message\n    \"\"\"\n\n    def __init__(self, json_file_path: str):\n        \"\"\"\n        Initialize analyzer\n\n        Args:\n            json_file_path: Path to the JSON file\n        \"\"\"\n        self.json_file_path = json_file_path\n        self.data = self._load_json()\n\n    def _load_json(self) -> Dict[str, Any]:\n        \"\"\"Load JSON file\"\"\"\n        try:\n            with open(self.json_file_path, \"r\", encoding=\"utf-8\") as f:\n                return json.load(f)\n        except Exception as e:\n            raise Exception(f\"Failed to load JSON file: {e}\")\n\n    def _parse_new_format_tool_name(self, tool_name: str) -> tuple[str, str]:\n        \"\"\"\n        Parse new format tool name\n\n        Args:\n            tool_name: New format tool name, for example:\n                      - \"tool-server_name-tool_name\" format\n                      - \"agent-browsing-search_and_browse\" format (browser agent)\n\n        Returns:\n            tuple: (server_name, actual_tool_name)\n        \"\"\"\n        # Handle agent-browsing-* format (browser agent calls)\n        if tool_name.startswith(\"agent-browsing-\"):\n            server_name = \"agent-browsing\"\n            actual_tool_name = tool_name[len(\"agent-browsing-\") :]\n            return server_name, actual_tool_name\n\n        # Handle other agent-* formats\n        elif tool_name.startswith(\"agent-\"):\n            # Find the last '-' to split server_name and tool_name\n            last_dash = 
tool_name.rfind(\"-\")\n            if last_dash > 6:  # There's content after \"agent-\"\n                server_name = tool_name[:last_dash]\n                actual_tool_name = tool_name[last_dash + 1 :]\n            else:\n                server_name = tool_name\n                actual_tool_name = \"\"\n            return server_name, actual_tool_name\n\n        # Handle tool-server_name-tool_name format\n        elif tool_name.startswith(\"tool-\"):\n            parts = tool_name.split(\"-\", 2)\n            if len(parts) >= 3:\n                server_name = parts[1]\n                actual_tool_name = parts[2]\n            else:\n                server_name = \"unknown\"\n                actual_tool_name = tool_name\n            return server_name, actual_tool_name\n\n        # Other formats\n        else:\n            server_name = \"unknown\"\n            actual_tool_name = tool_name\n            return server_name, actual_tool_name\n\n    # ==================== Basic Information ====================\n\n    def get_basic_info(self) -> Dict[str, Any]:\n        \"\"\"Get basic information of the task\"\"\"\n        return {\n            \"status\": self.data.get(\"status\"),\n            \"task_id\": self.data.get(\"task_id\"),\n            \"start_time\": self.data.get(\"start_time\"),\n            \"end_time\": self.data.get(\"end_time\"),\n            \"final_boxed_answer\": self.data.get(\"final_boxed_answer\"),\n            \"ground_truth\": self.data.get(\"ground_truth\"),\n            \"final_judge_result\": self.data.get(\"final_judge_result\"),\n            \"judge_type\": self.data.get(\"judge_type\"),\n            \"error\": self.data.get(\"error\", \"\"),\n        }\n\n    def get_performance_summary(self) -> Dict[str, Any]:\n        \"\"\"Get performance summary information\"\"\"\n        trace_data = self.data.get(\"trace_data\", {})\n        return trace_data.get(\"performance_summary\", {})\n\n    # ==================== Main Agent Message 
History ====================\n\n    def get_main_agent_history(self) -> Dict[str, Any]:\n        \"\"\"Get main agent message history\"\"\"\n        return self.data.get(\"main_agent_message_history\", {})\n\n    def get_main_agent_messages(self) -> List[Dict[str, Any]]:\n        \"\"\"Get main agent message list\"\"\"\n        history = self.get_main_agent_history()\n        return history.get(\"message_history\", [])\n\n    # ==================== Browser Agent Message History ====================\n\n    def get_browser_agent_sessions(self) -> Dict[str, Any]:\n        \"\"\"Get all browser agent sessions\"\"\"\n        # Try two possible key names\n        browser_sessions = self.data.get(\"browser_agent_message_history_sessions\", {})\n        if not browser_sessions:\n            browser_sessions = self.data.get(\"sub_agent_message_history_sessions\", {})\n        return browser_sessions\n\n    def get_browser_agent_session_messages(\n        self, session_id: str\n    ) -> List[Dict[str, Any]]:\n        \"\"\"Get message list for specified session\"\"\"\n        sessions = self.get_browser_agent_sessions()\n        session = sessions.get(session_id, {})\n        return session.get(\"message_history\", [])\n\n    # ==================== MCP Tool Call Parsing ====================\n\n    def parse_mcp_tool_call(self, text: str) -> Optional[Dict[str, Any]]:\n        \"\"\"Parse MCP tool call\"\"\"\n        pattern = r\"<use_mcp_tool>\\s*<server_name>(.*?)</server_name>\\s*<tool_name>(.*?)</tool_name>\\s*<arguments>\\s*(.*?)\\s*</arguments>\\s*</use_mcp_tool>\"\n\n        match = re.search(pattern, text, re.DOTALL)\n        if match:\n            server_name = match.group(1).strip()\n            tool_name = match.group(2).strip()\n            arguments_str = match.group(3).strip()\n\n            try:\n                arguments = json.loads(arguments_str)\n            except json.JSONDecodeError:\n                arguments = arguments_str\n\n            return {\n     
           \"server_name\": server_name,\n                \"tool_name\": tool_name,\n                \"arguments\": arguments,\n            }\n\n        return None\n\n    def extract_text_content(self, content) -> str:\n        \"\"\"Extract text from message content\"\"\"\n        if isinstance(content, list):\n            text_parts = []\n            for item in content:\n                if isinstance(item, dict) and item.get(\"type\") == \"text\":\n                    text_parts.append(item.get(\"text\", \"\"))\n            return \"\".join(text_parts)\n        return str(content)\n\n    def analyze_conversation_flow(self) -> List[Dict[str, Any]]:\n        \"\"\"Analyze conversation flow, including tool calls\"\"\"\n        flow_steps = []\n        main_messages = self.get_main_agent_messages()\n        sub_agent_sessions = self.get_browser_agent_sessions()\n\n        sub_agent_call_count = 0\n\n        for i, message in enumerate(main_messages):\n            role = message.get(\"role\")\n            content = message.get(\"content\", [])\n\n            text_content = self.extract_text_content(content)\n\n            step = {\n                \"step_id\": i,\n                \"agent\": \"main_agent\",\n                \"role\": role,\n                \"content_preview\": text_content[:200] + \"...\"\n                if len(text_content) > 200\n                else text_content,\n                \"full_content\": text_content,\n                \"tool_calls\": [],\n                \"browser_session\": None,\n                \"timestamp\": message.get(\"timestamp\", \"\"),\n                \"browser_flow\": [],\n            }\n\n            # If it's an assistant message, check for tool calls\n            if role == \"assistant\":\n                # Check for new format tool_calls\n                if \"tool_calls\" in message and message[\"tool_calls\"]:\n                    for tool_call in message[\"tool_calls\"]:\n                        # Convert new format to 
unified format\n                        if \"function\" in tool_call:\n                            function_info = tool_call[\"function\"]\n                            tool_name = function_info.get(\"name\", \"\")\n                            arguments = function_info.get(\"arguments\", \"\")\n\n                            # Parse arguments string as JSON (if it's a string)\n                            if isinstance(arguments, str):\n                                try:\n                                    arguments = json.loads(arguments)\n                                except json.JSONDecodeError:\n                                    pass\n\n                            # Extract server_name from tool_name (if available)\n                            server_name, actual_tool_name = (\n                                self._parse_new_format_tool_name(tool_name)\n                            )\n\n                            parsed_tool_call = {\n                                \"server_name\": server_name,\n                                \"tool_name\": actual_tool_name,\n                                \"arguments\": arguments,\n                                \"id\": tool_call.get(\"id\", \"\"),\n                                \"type\": tool_call.get(\"type\", \"function\"),\n                                \"format\": \"new\",\n                            }\n                            step[\"tool_calls\"].append(parsed_tool_call)\n\n                            # Handle browser agent calls - maintain complete consistency with MCP format logic\n                            if server_name.startswith(\"agent-\"):\n                                sub_agent_call_count += 1\n                                session_id = f\"{server_name}_{sub_agent_call_count}\"\n                                step[\"browser_session\"] = session_id\n\n                                # Analyze browser session conversation flow\n                                if session_id in 
sub_agent_sessions:\n                                    browser_flow = self.analyze_browser_session_flow(\n                                        session_id\n                                    )\n                                    step[\"browser_flow\"] = browser_flow\n                            elif server_name.startswith(\"browsing-agent\"):\n                                sub_agent_call_count += 1\n                                session_id = f\"browser_agent_{sub_agent_call_count}\"\n                                step[\"browser_session\"] = session_id\n\n                                # Analyze browser session conversation flow\n                                if session_id in sub_agent_sessions:\n                                    browser_flow = self.analyze_browser_session_flow(\n                                        session_id\n                                    )\n                                    step[\"browser_flow\"] = browser_flow\n\n                # Check for old format MCP tool calls (maintain compatibility)\n                mcp_tool_call = self.parse_mcp_tool_call(text_content)\n                if mcp_tool_call:\n                    mcp_tool_call[\"format\"] = \"mcp\"  # Mark as old format\n                    step[\"tool_calls\"].append(mcp_tool_call)\n\n                    # If browsing-agent is called, associate browser session\n                    if mcp_tool_call[\"server_name\"].startswith(\"agent-\"):\n                        sub_agent_call_count += 1\n                        session_id = (\n                            f\"{mcp_tool_call['server_name']}_{sub_agent_call_count}\"\n                        )\n                        step[\"browser_session\"] = session_id\n\n                        # Analyze browser session conversation flow\n                        if session_id in sub_agent_sessions:\n                            browser_flow = self.analyze_browser_session_flow(session_id)\n                            
step[\"browser_flow\"] = browser_flow\n                    elif mcp_tool_call[\"server_name\"].startswith(\"browsing-agent\"):\n                        sub_agent_call_count += 1\n                        session_id = f\"browser_agent_{sub_agent_call_count}\"\n                        step[\"browser_session\"] = session_id\n\n                        # Analyze browser session conversation flow\n                        if session_id in sub_agent_sessions:\n                            browser_flow = self.analyze_browser_session_flow(session_id)\n                            step[\"browser_flow\"] = browser_flow\n            flow_steps.append(step)\n\n        return flow_steps\n\n    def analyze_browser_session_flow(self, session_id: str) -> List[Dict[str, Any]]:\n        \"\"\"Analyze browser session conversation flow\"\"\"\n        browser_messages = self.get_browser_agent_session_messages(session_id)\n        browser_flow = []\n\n        for i, message in enumerate(browser_messages):\n            role = message.get(\"role\")\n            content = message.get(\"content\", [])\n\n            text_content = self.extract_text_content(content)\n\n            step = {\n                \"step_id\": i,\n                \"agent\": session_id,\n                \"role\": role,\n                \"content_preview\": text_content[:200] + \"...\"\n                if len(text_content) > 200\n                else text_content,\n                \"full_content\": text_content,\n                \"tool_calls\": [],\n                \"timestamp\": message.get(\"timestamp\", \"\"),\n            }\n\n            # If it's an assistant message, check for tool calls\n            if role == \"assistant\":\n                # Check for new format tool_calls\n                if \"tool_calls\" in message and message[\"tool_calls\"]:\n                    for tool_call in message[\"tool_calls\"]:\n                        # Convert new format to unified format\n                        if \"function\" 
in tool_call:\n                            function_info = tool_call[\"function\"]\n                            tool_name = function_info.get(\"name\", \"\")\n                            arguments = function_info.get(\"arguments\", \"\")\n\n                            # Parse arguments string as JSON (if it's a string)\n                            if isinstance(arguments, str):\n                                try:\n                                    arguments = json.loads(arguments)\n                                except json.JSONDecodeError:\n                                    pass\n\n                            # Extract server_name from tool_name (if available)\n                            server_name, actual_tool_name = (\n                                self._parse_new_format_tool_name(tool_name)\n                            )\n\n                            parsed_tool_call = {\n                                \"server_name\": server_name,\n                                \"tool_name\": actual_tool_name,\n                                \"arguments\": arguments,\n                                \"id\": tool_call.get(\"id\", \"\"),\n                                \"type\": tool_call.get(\"type\", \"function\"),\n                                \"format\": \"new\",\n                            }\n                            step[\"tool_calls\"].append(parsed_tool_call)\n\n                # Check for old format MCP tool calls (maintain compatibility)\n                mcp_tool_call = self.parse_mcp_tool_call(text_content)\n                if mcp_tool_call:\n                    mcp_tool_call[\"format\"] = \"mcp\"  # Mark as old format\n                    step[\"tool_calls\"].append(mcp_tool_call)\n\n            browser_flow.append(step)\n\n        return browser_flow\n\n    def get_execution_summary(self) -> Dict[str, Any]:\n        \"\"\"Get execution summary information\"\"\"\n        flow_steps = self.analyze_conversation_flow()\n\n        total_steps = 
len(flow_steps)\n        tool_calls = []\n        browser_sessions = []\n\n        for step in flow_steps:\n            if step[\"tool_calls\"]:\n                tool_calls.extend(step[\"tool_calls\"])\n            if step.get(\"browser_session\"):\n                browser_sessions.append(step[\"browser_session\"])\n\n            # Collect tool calls from browser sessions\n            if step.get(\"browser_flow\"):\n                for browser_step in step[\"browser_flow\"]:\n                    if browser_step.get(\"tool_calls\"):\n                        tool_calls.extend(browser_step[\"tool_calls\"])\n\n        # Tool usage statistics\n        tool_usage = {}\n        for tool in tool_calls:\n            # Choose appropriate key name generation method based on format\n            if tool.get(\"format\") == \"new\":\n                # New format: use server_name.tool_name, if server_name is unknown then use only tool_name\n                if tool.get(\"server_name\") != \"unknown\":\n                    key = f\"{tool['server_name']}.{tool['tool_name']}\"\n                else:\n                    key = tool[\"tool_name\"]\n            else:\n                # Old format (MCP): maintain original method\n                key = f\"{tool['server_name']}.{tool['tool_name']}\"\n            tool_usage[key] = tool_usage.get(key, 0) + 1\n\n        return {\n            \"total_steps\": total_steps,\n            \"total_tool_calls\": len(tool_calls),\n            \"browser_sessions_count\": len(browser_sessions),\n            \"tool_usage_distribution\": tool_usage,\n            \"browser_sessions\": browser_sessions,\n        }\n\n    def get_spans_summary(self) -> Dict[str, Any]:\n        \"\"\"Get spans statistical summary\"\"\"\n        trace_data = self.data.get(\"trace_data\", {})\n        spans = trace_data.get(\"spans\", [])\n\n        agent_stats = {}\n        for span in spans:\n            agent = span.get(\"agent_context\", \"unknown\")\n            if agent 
not in agent_stats:\n                agent_stats[agent] = {\n                    \"count\": 0,\n                    \"total_duration\": 0,\n                    \"span_types\": set(),\n                }\n            agent_stats[agent][\"count\"] += 1\n            agent_stats[agent][\"total_duration\"] += span.get(\"duration_seconds\", 0)\n            agent_stats[agent][\"span_types\"].add(span.get(\"name\", \"unknown\"))\n\n        # Convert set to list\n        for agent in agent_stats:\n            agent_stats[agent][\"span_types\"] = list(agent_stats[agent][\"span_types\"])\n\n        return {\n            \"total_spans\": len(spans),\n            \"total_duration\": sum(span.get(\"duration_seconds\", 0) for span in spans),\n            \"agent_stats\": agent_stats,\n        }\n\n    def get_step_logs_summary(self) -> Dict[str, Any]:\n        \"\"\"Get step logs summary statistics\"\"\"\n        logs = self.data.get(\"step_logs\", [])\n\n        status_count = {}\n        step_type_count = {}\n\n        for log in logs:\n            status = log.get(\"status\", \"unknown\")\n            step_name = log.get(\"step_name\", \"unknown\")\n\n            status_count[status] = status_count.get(status, 0) + 1\n            step_type_count[step_name] = step_type_count.get(step_name, 0) + 1\n\n        return {\n            \"total_logs\": len(logs),\n            \"status_distribution\": status_count,\n            \"step_type_distribution\": step_type_count,\n        }\n"
  },
  {
    "path": "assets/LOCAL-TOOL-DEPLOYMENT.md",
    "content": "# Local Tool Deployment Guide\n\nThis guide explains how to deploy open-source tools locally for use with MiroThinker. These tools are optional enhancements that can replace commercial alternatives in your agent configuration.\n\n## Overview\n\nMiroThinker supports several optional open-source tools that you can deploy locally:\n\n- **Audio Transcription**: Whisper-Large-v3-Turbo for transcribing audio files\n- **Visual Question Answering**: Qwen2.5-VL-72B-Instruct for answering questions about images\n- **Reasoning Engine**: Qwen3-235B-A22B-Thinking-2507 for complex reasoning tasks\n\nThese tools are used when you configure your agent with `tool-transcribe-os`, `tool-vqa-os`, or `tool-reasoning-os` in your agent configuration file.\n\n## Prerequisites\n\n- **GPU**: NVIDIA GPU with sufficient VRAM\n- **Python 3.10+**\n- **CUDA**: Compatible CUDA toolkit installed\n- **Model Storage**: Sufficient disk space to download model checkpoints\n\n## Tool Deployment\n\n### 1. Audio Transcription Tool (`tool-transcribe-os`)\n\n**Model**: [Whisper-Large-v3-Turbo](https://huggingface.co/openai/whisper-large-v3-turbo)\n\n**Description**: Transcribes audio files (MP3, WAV, M4A, AAC, OGG, FLAC, WMA) to text. Supports both local files and remote URLs.\n\n**Deployment with vLLM**:\n\n```bash\n# Install vLLM with audio support\npip install vllm==0.10.0\npip install vllm[audio]\n\n# Start the server\nvllm serve openai/whisper-large-v3-turbo \\\n  --served-model-name whisper-large-v3-turbo \\\n  --task transcription \\\n  --host 0.0.0.0 \\\n  --port 8000\n```\n\n**Configuration in `.env`**:\n\n```bash\nWHISPER_MODEL_NAME=\"openai/whisper-large-v3-turbo\"\nWHISPER_API_KEY=your_api_key  # Optional, if your server requires authentication\nWHISPER_BASE_URL=\"http://0.0.0.0:8000/v1\"\n```\n\n### 2. 
Visual Question Answering Tool (`tool-vqa-os`)\n\n**Model**: [Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct)\n\n**Description**: Answers questions about images. Supports local image files and URLs. Automatically encodes local images to Base64 for API requests. Compatible with JPEG, PNG, GIF formats.\n\n**Deployment with SGLang**:\n\n```bash\n# Install SGLang\npip install sglang[all]\n\n# Start the server\npython3 -m sglang.launch_server \\\n  --model-path Qwen/Qwen2.5-VL-72B-Instruct \\\n  --tp 8 \\\n  --host 0.0.0.0 \\\n  --port 8001 \\\n  --trust-remote-code \\\n  --enable-metrics\n```\n\n**Configuration in `.env`**:\n\n```bash\nVISION_MODEL_NAME=\"Qwen/Qwen2.5-VL-72B-Instruct\"\nVISION_API_KEY=your_api_key  # Optional, if your server requires authentication\nVISION_BASE_URL=\"http://0.0.0.0:8001/v1/chat/completions\"\n```\n\n### 3. Reasoning Engine Tool (`tool-reasoning-os`)\n\n**Model**: [Qwen3-235B-A22B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507)\n\n**Description**: A reasoning service for solving complex analytical problems, such as advanced mathematics, puzzles, and riddles. Supports long-context reasoning tasks (up to 131K tokens).\n\n**Deployment with SGLang**:\n\n```bash\n# Install SGLang\npip install sglang[all]\n\n# Start the server\npython3 -m sglang.launch_server \\\n  --model-path Qwen/Qwen3-235B-A22B-Thinking-2507 \\\n  --tp 8 \\\n  --host 0.0.0.0 \\\n  --port 8002 \\\n  --trust-remote-code \\\n  --context-length 131072 \\\n  --enable-metrics\n```\n\n**Configuration in `.env`**:\n\n```bash\nREASONING_MODEL_NAME=\"Qwen/Qwen3-235B-A22B-Thinking-2507\"\nREASONING_API_KEY=your_api_key  # Optional, if your server requires authentication\nREASONING_BASE_URL=\"http://0.0.0.0:8002/v1/chat/completions\"\n```\n\n## Using Deployed Tools\n\nOnce you have deployed the tools, configure your agent to use them:\n\n1. 
**Edit your agent configuration** (e.g., `apps/miroflow-agent/conf/agent/my_custom_config.yaml`):\n\n```yaml\nmain_agent:\n  tools:\n    - tool-python\n    - search_and_scrape_webpage\n    - jina_scrape_llm_summary\n    - tool-transcribe-os    # Use local Whisper deployment\n    - tool-vqa-os           # Use local Qwen2.5-VL deployment\n    - tool-reasoning-os     # Use local Qwen3-235B deployment\n  max_turns: 400\n```\n\n2. **Configure environment variables** in `apps/miroflow-agent/.env` as shown in each tool's deployment section above.\n\n1. **Run your agent**:\n\n```bash\ncd apps/miroflow-agent\nuv run main.py llm=qwen-3 agent=my_custom_config llm.base_url=https://your_base_url/v1\n```\n\n## Commercial Alternatives\n\nIf you prefer not to deploy these tools locally, you can use commercial alternatives:\n\n- **`tool-transcribe`**: Uses OpenAI's GPT-4o mini Transcribe API\n- **`tool-vqa`**: Uses Claude Sonnet 3.7 API\n- **`tool-reasoning`**: Uses Claude Sonnet 3.7 API\n\nSimply replace `-os` versions with commercial versions in your agent configuration and configure the corresponding API keys (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`).\n\n## Additional Resources\n\n- **SGLang Documentation**: [https://sglang.readthedocs.io/](https://sglang.readthedocs.io/)\n- **vLLM Documentation**: [https://docs.vllm.ai/](https://docs.vllm.ai/)\n- **Model Cards**: Check HuggingFace model pages for specific requirements and recommendations\n"
  },
  {
    "path": "assets/QA.md",
    "content": "# MiroFlow QA Documentation\n\n## Q1: Can I extract GAIA-Text-103 results from existing GAIA-Validation evaluations?\n\n**Answer:** Yes! If you have completed GAIA-Validation evaluations, you can extract and re-grade the GAIA-Text-103 subset using our specialized tools.\n\n### Step-by-Step Process\n\n1. **Extract GAIA-Text-103 Tasks**\n\n   ```bash\n   # Extract text-103 tasks to a separate directory\n   uv run benchmarks/subset_extraction/gaia-to-text-103-mover.py ../../logs/gaia-validation/0806/qwen_MiroThinker-32B-SFT_evaluation\n   ```\n\n   This creates a new directory: `gaia-text-103-extraction/qwen_MiroThinker-32B-SFT_evaluation`\n\n1. **Re-grade with GAIA-Text-103 Evaluator**\n\n   ```bash\n   # Apply GAIA-Text-103 specific grading\n   uv run benchmarks/subset_extraction/gaia-text-103-grader.py ../../logs/gaia-validation/0806/gaia-text-103-extraction\n   ```\n\n1. **Verify Results**\n\n   ```bash\n   # Check accuracy and generate statistics\n   uv run benchmarks/check_progress/check_progress_gaia-validation-text-103.py ../../logs/gaia-validation/0806/gaia-text-103-extraction\n   ```\n\n## Q2: Does the choice of judgment model affect evaluation performance?\n\n**Answer:** Yes, there is a measurable difference in evaluation outcomes between the two judgment models.\n\nWe have standardized on GPT-4.1-2025-04-14 as our primary judgment model for several practical reasons:\n\n- **Ease of deployment:** No need to host additional GPU-intensive models\n- **Consistency:** Aligns with evaluation standards used in other benchmarks (SimpleQA, BrowseComp)\n- **Reproducibility:** Provides a consistent baseline for cross-evaluation comparisons\n\n## Code Quality Checks\n\nBefore submitting a pull request, ensure your code meets our quality standards:\n\n```bash\n# Fix linting issues automatically\nuv tool run ruff@0.8.0 check --fix .\n\n# Format code according to our style guidelines\nuv tool run ruff@0.8.0 format .\n```\n\n## Know Issues\n\n- The context 
management component before the summary requires further refinement to improve accuracy and reliability. I guess this is because the length estimation is not accurate.\n"
  },
  {
    "path": "assets/qwen3_nonthinking.jinja",
    "content": " {%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0].role == 'system' %}\n        {{- messages[0].content + '\\n\\n' }}\n    {%- endif %}\n    {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0].role == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n    {%- set index = (messages|length - 1) - loop.index0 %}\n    {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n        {%- set ns.multi_step_tool = false %}\n        {%- set ns.last_query_index = index %}\n    {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {%- set content = message.content %}\n        {%- set reasoning_content = '' %}\n        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n            {%- set reasoning_content = message.reasoning_content %}\n        {%- else %}\n            {%- if '</think>' in message.content %}\n                
{%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n            {%- endif %}\n        {%- endif %}\n        {%- if loop.index0 > ns.last_query_index %}\n            {%- if loop.last or (not loop.last and reasoning_content) %}\n                {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n            {%- else %}\n                {{- '<|im_start|>' + message.role + '\\n' + content }}\n            {%- endif %}\n        {%- else %}\n            {{- '<|im_start|>' + message.role + '\\n' + content }}\n        {%- endif %}\n        {%- if message.tool_calls %}\n            {%- for tool_call in message.tool_calls %}\n                {%- if (loop.first and content) or (not loop.first) %}\n                    {{- '\\n' }}\n                {%- endif %}\n                {%- if tool_call.function %}\n                    {%- set tool_call = tool_call.function %}\n                {%- endif %}\n                {{- '<tool_call>\\n{\"name\": \"' }}\n                {{- tool_call.name }}\n                {{- '\", \"arguments\": ' }}\n                {%- if tool_call.arguments is string %}\n                    {{- tool_call.arguments }}\n                {%- else %}\n                    {{- tool_call.arguments | tojson }}\n                {%- endif %}\n                {{- '}\\n</tool_call>' }}\n            {%- endfor %}\n        {%- endif %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n           
 {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n<think>\\n\\n</think>\\n\\n' }}\n{%- endif %}"
  },
  {
    "path": "justfile",
    "content": "default:\n    just --list\n\n# lint monorepo\n[group('precommit')]\nlint:\n    uv tool run ruff@0.8.0 check --fix .\n\n# sort imports\n[group('precommit')]\nsort-imports:\n    uv tool run ruff@0.8.0 check --select I --fix .\n\n# format monorepo\n[group('precommit')]\nformat:\n    uv tool run ruff@0.8.0 format .\n\n# check license\n[group('precommit')]\ncheck-license:\n    uv run reuse lint\n\n# insert license for contributor\ninsert-license:\n    # https://reuse.readthedocs.io/en/stable/scripts.html#add-headers-to-staged-files-based-on-git-settings\n    git diff --name-only --cached | xargs -I {} reuse annotate -c \"$(git config --get user.name) <$(git config --get user.email)>\" \"{}\"\n\n# format markdown files\n[group('precommit')]\nformat-md:\n    find . -name \"*.md\" -type f | xargs uv tool run mdformat@0.7.17\n\n# run precommit before PR\n[group('precommit')]\nprecommit: lint sort-imports format-md format\n"
  },
  {
    "path": "libs/miroflow-tools/README.md",
    "content": "# 🛠️ MiroFlow Tools\n\n> A comprehensive tool management system and MCP (Model Context Protocol) server collection for MiroFlow, providing a unified interface to various AI capabilities including code execution, vision processing, audio transcription, web searching, reasoning, and document reading.\n\n## ✨ Features\n\n- **🔧 Unified Tool Management**: Centralized `ToolManager` for managing multiple MCP servers\n- **🌐 Multiple Transport Protocols**: Support for both stdio and SSE (HTTP) connections\n- **📦 Rich Tool Ecosystem**: Pre-built MCP servers for common AI tasks\n- **⚙️ Flexible Configuration**: Tool blacklisting, timeout management, and custom server configurations\n- **🛡️ Error Handling**: Robust retry logic and fallback mechanisms\n\n## 📦 Installation\n\nThis package is a local dependency that is automatically installed when you run `uv sync` in the `apps/miroflow-agent` directory. No separate installation is required.\n\nFor standalone usage or development:\n\n```bash\ncd libs/miroflow-tools\nuv sync\n```\n\n## 📋 MCP Servers Overview\n\nQuick reference tables of all available MCP servers and their tools. 
Click on \"Details\" to jump to the full documentation.\n\n### 📊 Tools Used in MiroThinker v1.0 and v1.5\n\nThe following tools were used in the MiroThinker v1.0 and v1.5 evaluation:\n\n| Category                   | Server Name                 | Tools                                                                                                                | Key Environment Variables                                                                 | Link                                     |\n|----------------------------|-----------------------------|----------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|------------------------------------------|\n| **Execution Environment**  | `tool-python`               | `create_sandbox`, `run_command`, `run_python_code`                                                                   | `E2B_API_KEY`, `LOGS_DIR`                                                                 | [Details](#server-tool-python)           |\n| **File Management**        | `tool-python`               | `upload_file_from_local_to_sandbox`, `download_file_from_sandbox_to_local`, `download_file_from_internet_to_sandbox` | `E2B_API_KEY`, `LOGS_DIR`                                                                 | [Details](#server-tool-python)           |\n| **Information Retrieval**  | `search_and_scrape_webpage` | `google_search`                                                                                                      | `SERPER_API_KEY`, `SERPER_BASE_URL`                                                        | [Details](#server-search_and_scrape_webpage) |\n| **Information Retrieval**  | `jina_scrape_llm_summary`   | `scrape_and_extract_info`                                                                                            | `JINA_API_KEY`, `JINA_BASE_URL`, 
`SUMMARY_LLM_BASE_URL`, `SUMMARY_LLM_MODEL_NAME`, `SUMMARY_LLM_API_KEY` | [Details](#server-jina_scrape_llm_summary) |\n\n### 🔧 Additional Available Tools\n\nThe following tools are implemented but were not used in the MiroThinker v1.0/v1.5 evaluation:\n\n| Category                    | Server Name          | Tools                                             | Key Environment Variables                                           | Link                           |\n|-----------------------------|----------------------|---------------------------------------------------|---------------------------------------------------------------------|--------------------------------|\n| **Web Searching**           | `tool-google-search` | `google_search`, `scrape_website`                 | `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL` | [Details](#server-tool-google-search) |\n| **Web Searching (Sogou)**  | `tool-sogou-search` | `sogou_search`, `scrape_website`                 | `TENCENTCLOUD_SECRET_ID`, `TENCENTCLOUD_SECRET_KEY`, `JINA_API_KEY`, `JINA_BASE_URL` | [Details](#server-tool-sogou-search) |\n| **Vision Processing**       | `tool-vqa`           | `visual_question_answering`                       | `ANTHROPIC_API_KEY`, `ANTHROPIC_BASE_URL`                            | [Details](#server-tool-vqa)    |\n| **Vision Processing**       | `tool-vqa-os`        | `visual_question_answering`                       | `VISION_API_KEY`, `VISION_BASE_URL`, `VISION_MODEL_NAME`            | [Details](#server-tool-vqa-os) |\n| **Audio Processing**        | `tool-transcribe`    | `audio_transcription`, `audio_question_answering` | `OPENAI_API_KEY`, `OPENAI_BASE_URL`                                  | [Details](#server-tool-transcribe) |\n| **Audio Processing**        | `tool-transcribe-os` | `audio_transcription`                             | `WHISPER_API_KEY`, `WHISPER_BASE_URL`, `WHISPER_MODEL_NAME`         | [Details](#server-tool-transcribe-os) |\n| **Document Reading**        | 
`tool-reading`       | `convert_to_markdown`                             | None required                                                       | [Details](#server-tool-reading) |\n| **Reasoning Engine**        | `tool-reasoning`     | `reasoning`                                       | `ANTHROPIC_API_KEY`, `ANTHROPIC_BASE_URL`                            | [Details](#server-tool-reasoning) |\n| **Reasoning Engine**        | `tool-reasoning-os`  | `reasoning`                                       | `REASONING_API_KEY`, `REASONING_BASE_URL`, `REASONING_MODEL_NAME`   | [Details](#server-tool-reasoning-os) |\n\n## 🚀 Quick Start\n\n<details>\n<summary>Click to expand code example</summary>\n\n```python\nimport asyncio\nfrom miroflow_tools import ToolManager\nfrom mcp import StdioServerParameters\n\nasync def main():\n    # Initialize tool manager with server configurations\n    server_configs = [\n        {\n            \"name\": \"tool-python\",\n            \"params\": StdioServerParameters(\n                command=\"python\",\n                args=[\"-m\", \"miroflow_tools.mcp_servers.python_mcp_server\"],\n                env={\"E2B_API_KEY\": \"your_e2b_api_key\"}  # Required for Python execution\n            )\n        },\n        # Add more server configurations...\n    ]\n\n    tool_manager = ToolManager(server_configs)\n\n    # Get all available tool definitions\n    tool_definitions = await tool_manager.get_all_tool_definitions()\n\n    # Create a sandbox first\n    sandbox_result = await tool_manager.execute_tool_call(\n        server_name=\"tool-python\",\n        tool_name=\"create_sandbox\",\n        arguments={\"timeout\": 600}\n    )\n\n    # Extract sandbox_id from result\n    sandbox_id = sandbox_result['result'].split('sandbox_id:')[-1].strip()\n\n    # Execute a tool call\n    result = await tool_manager.execute_tool_call(\n        server_name=\"tool-python\",\n        tool_name=\"run_python_code\",\n        arguments={\"code_block\": \"print('Hello, World!')\", 
\"sandbox_id\": sandbox_id}\n    )\n    print(result)\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n```\n\n</details>\n\n## 🔧 ToolManager\n\nThe `ToolManager` class is the central component for managing and executing tools across multiple MCP servers.\n\n### Key Features\n\n- **🔌 Multi-Server Support**: Manage tools from multiple MCP servers simultaneously\n- **🔗 Connection Management**: Automatic connection handling for stdio and SSE transports\n- **🚫 Tool Blacklisting**: Filter out specific tools from specific servers\n- **📝 Structured Logging**: Optional task logging integration\n- **🔄 Error Recovery**: Automatic retry logic and fallback mechanisms\n\n### Methods\n\n- `get_all_tool_definitions()`: Retrieve tool schemas from all configured servers\n- `execute_tool_call(server_name, tool_name, arguments)`: Execute a specific tool\n- `set_task_log(task_log)`: Enable structured logging\n- `get_server_params(server_name)`: Get configuration for a specific server\n\n### Example Usage\n\n<details>\n<summary>Click to expand code example</summary>\n\n```python\nimport asyncio\nfrom miroflow_tools import ToolManager\nfrom mcp import StdioServerParameters\n\nasync def main():\n    # Configure servers\n    server_configs = [\n        {\n            \"name\": \"python-server\",\n            \"params\": StdioServerParameters(\n                command=\"python\",\n                args=[\"-m\", \"miroflow_tools.mcp_servers.python_mcp_server\"],\n                env={\"E2B_API_KEY\": \"your_key\"}\n            )\n        }\n    ]\n\n    # Initialize with optional blacklist\n    tool_blacklist = {(\"python-server\", \"some_tool\")}\n    manager = ToolManager(server_configs, tool_blacklist=tool_blacklist)\n\n    # Enable logging\n    # manager.set_task_log(your_task_logger)\n\n    # Get tools\n    tools = await manager.get_all_tool_definitions()\n\n    # Create a sandbox first (required before running code)\n    sandbox_result = await manager.execute_tool_call(\n       
 server_name=\"python-server\",\n        tool_name=\"create_sandbox\",\n        arguments={\"timeout\": 600}\n    )\n    sandbox_id = sandbox_result['result'].split('sandbox_id:')[-1].strip()\n\n    # Execute tool\n    result = await manager.execute_tool_call(\n        server_name=\"python-server\",\n        tool_name=\"run_python_code\",\n        arguments={\"code_block\": \"1 + 1\", \"sandbox_id\": sandbox_id}\n    )\n    print(result)\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n```\n\n</details>\n\n## 🔌 MCP Servers\n\n### Server: tool-python\n\nExecute Python code in isolated E2B sandboxes with persistent sessions.\n\n**Tools**:\n\n- 🔨 `create_sandbox(timeout=600)`: Create a new Linux sandbox\n- 🐍 `run_python_code(code_block, sandbox_id)`: Execute Python code\n- 💻 `run_command(command, sandbox_id)`: Run shell commands\n- ⬆️ `upload_file_from_local_to_sandbox(sandbox_id, local_file_path, sandbox_file_path)`: Upload a local file into the sandbox\n- ⬇️ `download_file_from_internet_to_sandbox(sandbox_id, url, sandbox_file_path)`: Download a file from a URL into the sandbox\n- 💾 `download_file_from_sandbox_to_local(sandbox_id, sandbox_file_path, local_filename)`: Download a file from the sandbox to the local machine\n\n**Environment Variables**:\n\n- 🔑 `E2B_API_KEY`: E2B API key (required)\n- 📁 `LOGS_DIR`: Directory for temporary files (default: `../../logs`)\n\n**Example**:\n\n<details>\n<summary>Click to expand code example</summary>\n\n```python\nimport asyncio\nfrom miroflow_tools import ToolManager\nfrom mcp import StdioServerParameters\n\nasync def main():\n    # Configure server with environment variables\n    server_configs = [\n        {\n            \"name\": \"tool-python\",\n            \"params\": StdioServerParameters(\n                command=\"python\",\n                args=[\"-m\", \"miroflow_tools.mcp_servers.python_mcp_server\"],\n                env={\"E2B_API_KEY\": \"your_e2b_api_key\"}\n            )\n        }\n    ]\n\n    manager = ToolManager(server_configs)\n\n    # Create sandbox\n    result = await 
manager.execute_tool_call(\n        server_name=\"tool-python\",\n        tool_name=\"create_sandbox\",\n        arguments={\"timeout\": 600}\n    )\n\n    # Extract sandbox_id from result\n    sandbox_id = result['result'].split('sandbox_id:')[-1].strip()\n\n    # Run code\n    result = await manager.execute_tool_call(\n        server_name=\"tool-python\",\n        tool_name=\"run_python_code\",\n        arguments={\"code_block\": \"import numpy as np; print(np.array([1,2,3]))\", \"sandbox_id\": sandbox_id}\n    )\n    print(result)\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n```\n\n</details>\n\n### Server: tool-vqa\n\nAnalyze images and answer questions about visual content using Anthropic Claude.\n\n**Tools**:\n\n- 👁️ `visual_question_answering(image_path_or_url, question)`: Answer questions about images\n\n**Environment Variables**:\n\n- 🔑 `ANTHROPIC_API_KEY`: Anthropic API key (required)\n- 🌐 `ANTHROPIC_BASE_URL`: API base URL (default: `https://api.anthropic.com`)\n\n**Example**:\n\n<details>\n<summary>Click to expand code example</summary>\n\n```python\nimport asyncio\nfrom miroflow_tools import ToolManager\nfrom mcp import StdioServerParameters\n\nasync def main():\n    server_configs = [\n        {\n            \"name\": \"tool-vqa\",\n            \"params\": StdioServerParameters(\n                command=\"python\",\n                args=[\"-m\", \"miroflow_tools.mcp_servers.vision_mcp_server\"],\n                env={\n                    \"ANTHROPIC_API_KEY\": \"your_anthropic_api_key\",\n                    \"ANTHROPIC_BASE_URL\": \"https://api.anthropic.com\"\n                }\n            )\n        }\n    ]\n\n    manager = ToolManager(server_configs)\n\n    result = await manager.execute_tool_call(\n        server_name=\"tool-vqa\",\n        tool_name=\"visual_question_answering\",\n        arguments={\n            \"image_path_or_url\": \"https://example.com/image.jpg\",\n            \"question\": \"What is in this image?\"\n      
  }\n    )\n    print(result)\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n```\n\n</details>\n\n### Server: tool-vqa-os\n\nAnalyze images and answer questions about visual content using open-source compatible models.\n\n**Tools**:\n\n- 👁️ `visual_question_answering(image_path_or_url, question)`: Answer questions about images\n\n**Environment Variables**:\n\n- 🔑 `VISION_API_KEY`: API key (required)\n- 🌐 `VISION_BASE_URL`: API endpoint URL (required)\n- 🤖 `VISION_MODEL_NAME`: Model name (required)\n\n**Example**:\n\n<details>\n<summary>Click to expand code example</summary>\n\n```python\nimport asyncio\nfrom miroflow_tools import ToolManager\nfrom mcp import StdioServerParameters\n\nasync def main():\n    server_configs = [\n        {\n            \"name\": \"tool-vqa-os\",\n            \"params\": StdioServerParameters(\n                command=\"python\",\n                args=[\"-m\", \"miroflow_tools.mcp_servers.vision_mcp_server_os\"],\n                env={\n                    \"VISION_API_KEY\": \"your_vision_api_key\",\n                    \"VISION_BASE_URL\": \"your_vision_base_url\",\n                    \"VISION_MODEL_NAME\": \"your_vision_model_name\"\n                }\n            )\n        }\n    ]\n\n    manager = ToolManager(server_configs)\n\n    result = await manager.execute_tool_call(\n        server_name=\"tool-vqa-os\",\n        tool_name=\"visual_question_answering\",\n        arguments={\n            \"image_path_or_url\": \"https://example.com/image.jpg\",\n            \"question\": \"What is in this image?\"\n        }\n    )\n    print(result)\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n```\n\n</details>\n\n### Server: tool-transcribe\n\nTranscribe audio files and answer questions about audio content using OpenAI Whisper.\n\n**Tools**:\n\n- 🎤 `audio_transcription(audio_path_or_url)`: Transcribe audio to text\n- 🎧 `audio_question_answering(audio_path_or_url, question)`: Answer questions about 
audio\n\n**Environment Variables**:\n\n- 🔑 `OPENAI_API_KEY`: OpenAI API key (required)\n- 🌐 `OPENAI_BASE_URL`: API base URL (default: `https://api.openai.com/v1`)\n\n**Supported Formats**: 🎵 MP3, WAV, M4A, AAC, OGG, FLAC\n\n**Example**:\n\n<details>\n<summary>Click to expand code example</summary>\n\n```python\nimport asyncio\nfrom miroflow_tools import ToolManager\nfrom mcp import StdioServerParameters\n\nasync def main():\n    server_configs = [\n        {\n            \"name\": \"tool-transcribe\",\n            \"params\": StdioServerParameters(\n                command=\"python\",\n                args=[\"-m\", \"miroflow_tools.mcp_servers.audio_mcp_server\"],\n                env={\n                    \"OPENAI_API_KEY\": \"your_openai_api_key\",\n                    \"OPENAI_BASE_URL\": \"https://api.openai.com/v1\"\n                }\n            )\n        }\n    ]\n\n    manager = ToolManager(server_configs)\n\n    # Transcribe audio\n    result = await manager.execute_tool_call(\n        server_name=\"tool-transcribe\",\n        tool_name=\"audio_transcription\",\n        arguments={\"audio_path_or_url\": \"/path/to/audio.mp3\"}\n    )\n    print(result)\n\n    # Answer questions about audio\n    result = await manager.execute_tool_call(\n        server_name=\"tool-transcribe\",\n        tool_name=\"audio_question_answering\",\n        arguments={\n            \"audio_path_or_url\": \"/path/to/audio.mp3\",\n            \"question\": \"What is the main topic discussed?\"\n        }\n    )\n    print(result)\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n```\n\n</details>\n\n### Server: tool-transcribe-os\n\nTranscribe audio files using open-source compatible models.\n\n**Tools**:\n\n- 🎤 `audio_transcription(audio_path_or_url)`: Transcribe audio to text\n\n**Environment Variables**:\n\n- 🔑 `WHISPER_API_KEY`: API key (required)\n- 🌐 `WHISPER_BASE_URL`: API endpoint URL (required)\n- 🤖 `WHISPER_MODEL_NAME`: Model name (required)\n\n**Supported 
Formats**: 🎵 MP3, WAV, M4A, AAC, OGG, FLAC\n\n**Example**:\n\n<details>\n<summary>Click to expand code example</summary>\n\n```python\nimport asyncio\nfrom miroflow_tools import ToolManager\nfrom mcp import StdioServerParameters\n\nasync def main():\n    server_configs = [\n        {\n            \"name\": \"tool-transcribe-os\",\n            \"params\": StdioServerParameters(\n                command=\"python\",\n                args=[\"-m\", \"miroflow_tools.mcp_servers.audio_mcp_server_os\"],\n                env={\n                    \"WHISPER_API_KEY\": \"your_whisper_api_key\",\n                    \"WHISPER_BASE_URL\": \"your_whisper_base_url\",\n                    \"WHISPER_MODEL_NAME\": \"your_whisper_model_name\"\n                }\n            )\n        }\n    ]\n\n    manager = ToolManager(server_configs)\n\n    result = await manager.execute_tool_call(\n        server_name=\"tool-transcribe-os\",\n        tool_name=\"audio_transcription\",\n        arguments={\"audio_path_or_url\": \"/path/to/audio.mp3\"}\n    )\n    print(result)\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n```\n\n</details>\n\n### Server: tool-reading\n\nConvert various document formats to Markdown using MarkItDown.\n\n**Tools**:\n\n- 📄 `convert_to_markdown(uri)`: Convert documents (PDF, DOC, PPT, Excel, CSV, ZIP, etc.) to Markdown. 
URI must start with `file:`, `data:`, `http:`, or `https:` scheme.\n\n**Supported Formats**: 📄 PDF, DOC, DOCX, PPT, PPTX, XLS, XLSX, CSV, ZIP, and more\n\n**Example**:\n\n<details>\n<summary>Click to expand code example</summary>\n\n```python\nimport asyncio\nfrom miroflow_tools import ToolManager\nfrom mcp import StdioServerParameters\n\nasync def main():\n    # Configure server (no additional environment variables required)\n    server_configs = [\n        {\n            \"name\": \"tool-reading\",\n            \"params\": StdioServerParameters(\n                command=\"python\",\n                args=[\"-m\", \"miroflow_tools.mcp_servers.reading_mcp_server\"]\n            )\n        }\n    ]\n\n    manager = ToolManager(server_configs)\n\n    result = await manager.execute_tool_call(\n        server_name=\"tool-reading\",\n        tool_name=\"convert_to_markdown\",\n        arguments={\"uri\": \"file:///path/to/document.pdf\"}\n    )\n    print(result)\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n```\n\n</details>\n\n### Server: tool-reasoning\n\nSolve complex reasoning problems requiring chain-of-thought using Anthropic Claude with thinking.\n\n**Tools**:\n\n- 🧠 `reasoning(question)`: Solve hard math problems, puzzles, riddles, and IQ test questions\n\n**Environment Variables**:\n\n- 🔑 `ANTHROPIC_API_KEY`: Anthropic API key (required)\n- 🌐 `ANTHROPIC_BASE_URL`: API base URL (default: `https://api.anthropic.com`)\n\n**Example**:\n\n<details>\n<summary>Click to expand code example</summary>\n\n```python\nimport asyncio\nfrom miroflow_tools import ToolManager\nfrom mcp import StdioServerParameters\n\nasync def main():\n    server_configs = [\n        {\n            \"name\": \"tool-reasoning\",\n            \"params\": StdioServerParameters(\n                command=\"python\",\n                args=[\"-m\", \"miroflow_tools.mcp_servers.reasoning_mcp_server\"],\n                env={\n                    \"ANTHROPIC_API_KEY\": 
\"your_anthropic_api_key\",\n                    \"ANTHROPIC_BASE_URL\": \"https://api.anthropic.com\"\n                }\n            )\n        }\n    ]\n\n    manager = ToolManager(server_configs)\n\n    result = await manager.execute_tool_call(\n        server_name=\"tool-reasoning\",\n        tool_name=\"reasoning\",\n        arguments={\"question\": \"Solve: If a train travels 60 mph for 2 hours, then 80 mph for 1 hour, what's the average speed?\"}\n    )\n    print(result)\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n```\n\n</details>\n\n### Server: tool-reasoning-os\n\nSolve complex reasoning problems requiring chain-of-thought using open-source compatible models.\n\n**Tools**:\n\n- 🧠 `reasoning(question)`: Solve hard math problems, puzzles, riddles, and IQ test questions\n\n**Environment Variables**:\n\n- 🔑 `REASONING_API_KEY`: API key (required)\n- 🌐 `REASONING_BASE_URL`: API endpoint URL (required)\n- 🤖 `REASONING_MODEL_NAME`: Model name (required)\n\n**Example**:\n\n<details>\n<summary>Click to expand code example</summary>\n\n```python\nimport asyncio\nfrom miroflow_tools import ToolManager\nfrom mcp import StdioServerParameters\n\nasync def main():\n    server_configs = [\n        {\n            \"name\": \"tool-reasoning-os\",\n            \"params\": StdioServerParameters(\n                command=\"python\",\n                args=[\"-m\", \"miroflow_tools.mcp_servers.reasoning_mcp_server_os\"],\n                env={\n                    \"REASONING_API_KEY\": \"your_reasoning_api_key\",\n                    \"REASONING_BASE_URL\": \"your_reasoning_base_url\",\n                    \"REASONING_MODEL_NAME\": \"your_reasoning_model_name\"\n                }\n            )\n        }\n    ]\n\n    manager = ToolManager(server_configs)\n\n    result = await manager.execute_tool_call(\n        server_name=\"tool-reasoning-os\",\n        tool_name=\"reasoning\",\n        arguments={\"question\": \"Solve: If a train travels 60 mph for 2 hours, 
then 80 mph for 1 hour, what's the average speed?\"}\n    )\n    print(result)\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n```\n\n</details>\n\n### Server: search_and_scrape_webpage\n\nGoogle search via Serper API. Used in MiroThinker v1.0/v1.5 evaluation.\n\n**Tools**:\n\n- 🔍 `google_search(q, gl=\"us\", hl=\"en\", location=None, num=None, tbs=None, page=None, autocorrect=None)`: Perform web searches via Serper API and retrieve rich results\n\n**Environment Variables**:\n\n- 🔑 `SERPER_API_KEY`: Serper API key (required)\n- 🌐 `SERPER_BASE_URL`: Serper API base URL (default: `https://google.serper.dev`)\n\n**Example**:\n\n<details>\n<summary>Click to expand code example</summary>\n\n```python\nimport asyncio\nfrom miroflow_tools import ToolManager\nfrom mcp import StdioServerParameters\n\nasync def main():\n    server_configs = [\n        {\n            \"name\": \"search_and_scrape_webpage\",\n            \"params\": StdioServerParameters(\n                command=\"python\",\n                args=[\"-m\", \"miroflow_tools.dev_mcp_servers.search_and_scrape_webpage\"],\n                env={\n                    \"SERPER_API_KEY\": \"your_serper_api_key\",\n                    \"SERPER_BASE_URL\": \"https://google.serper.dev\"\n                }\n            )\n        }\n    ]\n\n    manager = ToolManager(server_configs)\n\n    result = await manager.execute_tool_call(\n        server_name=\"search_and_scrape_webpage\",\n        tool_name=\"google_search\",\n        arguments={\n            \"q\": \"Python async programming\",\n            \"gl\": \"us\",\n            \"hl\": \"en\",\n            \"num\": 10\n        }\n    )\n    print(result)\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n```\n\n</details>\n\n### Server: jina_scrape_llm_summary\n\nScrape content from URLs and extract meaningful information using an LLM. 
Used in MiroThinker v1.0/v1.5 evaluation.\n\n**Tools**:\n\n- 🔎 `scrape_and_extract_info(url, info_to_extract, custom_headers=None)`: Scrape content from a URL (web pages, PDFs, code files, etc.) and extract meaningful information using an LLM\n\n**Environment Variables**:\n\n- 🔑 `JINA_API_KEY`: Jina.ai API key (required)\n- 🌐 `JINA_BASE_URL`: Jina.ai API base URL (default: `https://r.jina.ai`)\n- 🔗 `SUMMARY_LLM_BASE_URL`: LLM API base URL for summarization (required)\n- 🤖 `SUMMARY_LLM_MODEL_NAME`: LLM model name for summarization (required)\n- 🔑 `SUMMARY_LLM_API_KEY`: LLM API key for summarization (optional, depends on LLM provider)\n\n**Example**:\n\n<details>\n<summary>Click to expand code example</summary>\n\n```python\nimport asyncio\nfrom miroflow_tools import ToolManager\nfrom mcp import StdioServerParameters\n\nasync def main():\n    server_configs = [\n        {\n            \"name\": \"jina_scrape_llm_summary\",\n            \"params\": StdioServerParameters(\n                command=\"python\",\n                args=[\"-m\", \"miroflow_tools.dev_mcp_servers.jina_scrape_llm_summary\"],\n                env={\n                    \"JINA_API_KEY\": \"your_jina_api_key\",\n                    \"JINA_BASE_URL\": \"https://r.jina.ai\",\n                    \"SUMMARY_LLM_BASE_URL\": \"your_llm_base_url\",\n                    \"SUMMARY_LLM_MODEL_NAME\": \"your_llm_model_name\",\n                    \"SUMMARY_LLM_API_KEY\": \"your_llm_api_key\"\n                }\n            )\n        }\n    ]\n\n    manager = ToolManager(server_configs)\n\n    result = await manager.execute_tool_call(\n        server_name=\"jina_scrape_llm_summary\",\n        tool_name=\"scrape_and_extract_info\",\n        arguments={\n            \"url\": \"https://example.com/article\",\n            \"info_to_extract\": \"What is the main topic of this article?\"\n        }\n    )\n    print(result)\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n```\n\n</details>\n\n### Server: 
tool-google-search\n\nGoogle search via Serper API with website scraping capabilities.\n\n**Tools**:\n\n- 🔍 `google_search(q, gl=\"us\", hl=\"en\", location=None, num=10, tbs=None, page=1)`: Google search\n- 🌐 `scrape_website(url)`: Scrape website content using Jina.ai\n\n**Environment Variables**:\n\n- 🔑 `SERPER_API_KEY`: Serper API key (required for Google search)\n- 🌐 `SERPER_BASE_URL`: Serper API base URL (default: `https://google.serper.dev`)\n- 🔑 `JINA_API_KEY`: Jina.ai API key (required for scraping)\n- 🌐 `JINA_BASE_URL`: Jina.ai API base URL (default: `https://r.jina.ai`)\n\n**Filtering Options** (via environment variables):\n\n- 🚫 `REMOVE_SNIPPETS`: Remove snippets from search results\n- 🚫 `REMOVE_KNOWLEDGE_GRAPH`: Remove knowledge graph from results\n- 🚫 `REMOVE_ANSWER_BOX`: Remove answer box from results\n\n**Example**:\n\n<details>\n<summary>Click to expand code example</summary>\n\n```python\nimport asyncio\nfrom miroflow_tools import ToolManager\nfrom mcp import StdioServerParameters\n\nasync def main():\n    server_configs = [\n        {\n            \"name\": \"tool-google-search\",\n            \"params\": StdioServerParameters(\n                command=\"python\",\n                args=[\"-m\", \"miroflow_tools.mcp_servers.searching_google_mcp_server\"],\n                env={\n                    \"SERPER_API_KEY\": \"your_serper_api_key\",\n                    \"SERPER_BASE_URL\": \"https://google.serper.dev\",\n                    \"JINA_API_KEY\": \"your_jina_api_key\",\n                    \"JINA_BASE_URL\": \"https://r.jina.ai\"\n                }\n            )\n        }\n    ]\n\n    manager = ToolManager(server_configs)\n\n    # Google search\n    result = await manager.execute_tool_call(\n        server_name=\"tool-google-search\",\n        tool_name=\"google_search\",\n        arguments={\n            \"q\": \"Python async programming\",\n            \"gl\": \"us\",\n            \"hl\": \"en\",\n            \"num\": 10\n        }\n    
)\n    print(result)\n\n    # Scrape website\n    result = await manager.execute_tool_call(\n        server_name=\"tool-google-search\",\n        tool_name=\"scrape_website\",\n        arguments={\"url\": \"https://example.com/article\"}\n    )\n    print(result)\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n```\n\n</details>\n\n### Server: tool-sogou-search\n\nSogou search (optimized for Chinese) with website scraping capabilities. *Optional: Not used in the MiroThinker v1.0/v1.5 evaluation*\n\n**Tools**:\n\n- 🔍 `sogou_search(Query, Cnt=10)`: Sogou search (Chinese)\n- 🌐 `scrape_website(url)`: Scrape website content using Jina.ai\n\n**Environment Variables**:\n\n- 🔑 `TENCENTCLOUD_SECRET_ID`: Tencent Cloud secret ID (required)\n- 🔑 `TENCENTCLOUD_SECRET_KEY`: Tencent Cloud secret key (required)\n- 🔑 `JINA_API_KEY`: Jina.ai API key (required for scraping)\n- 🌐 `JINA_BASE_URL`: Jina.ai API base URL (default: `https://r.jina.ai`)\n\n**Example**:\n\n<details>\n<summary>Click to expand code example</summary>\n\n```python\nimport asyncio\nfrom miroflow_tools import ToolManager\nfrom mcp import StdioServerParameters\n\nasync def main():\n    server_configs = [\n        {\n            \"name\": \"tool-sogou-search\",\n            \"params\": StdioServerParameters(\n                command=\"python\",\n                args=[\"-m\", \"miroflow_tools.mcp_servers.searching_sogou_mcp_server\"],\n                env={\n                    \"TENCENTCLOUD_SECRET_ID\": \"your_tencent_secret_id\",\n                    \"TENCENTCLOUD_SECRET_KEY\": \"your_tencent_secret_key\",\n                    \"JINA_API_KEY\": \"your_jina_api_key\",\n                    \"JINA_BASE_URL\": \"https://r.jina.ai\"\n                }\n            )\n        }\n    ]\n\n    manager = ToolManager(server_configs)\n\n    # Sogou search\n    result = await manager.execute_tool_call(\n        server_name=\"tool-sogou-search\",\n        tool_name=\"sogou_search\",\n        arguments={\n            
\"Query\": \"Python 异步编程\",\n            \"Cnt\": 10\n        }\n    )\n    print(result)\n\n    # Scrape website\n    result = await manager.execute_tool_call(\n        server_name=\"tool-sogou-search\",\n        tool_name=\"scrape_website\",\n        arguments={\"url\": \"https://example.com/article\"}\n    )\n    print(result)\n\nif __name__ == \"__main__\":\n    asyncio.run(main())\n```\n\n</details>\n\n## 🚀 Development\n\n### Adding a New MCP Server\n\n1. Create a new server file in `mcp_servers/`\n1. Use `FastMCP` to define tools:\n   ```python\n   from fastmcp import FastMCP\n   mcp = FastMCP(\"server-name\")\n\n   @mcp.tool()\n   async def my_tool(arg: str) -> str:\n       \"\"\"Tool description.\"\"\"\n       return \"result\"\n\n   if __name__ == \"__main__\":\n       mcp.run(transport=\"stdio\")\n   ```\n1. Add server configuration to your application\n1. Update this README with server documentation\n"
  },
  {
    "path": "libs/miroflow-tools/pyproject.toml",
    "content": "[project]\nname = \"miroflow-tools\"\nversion = \"0.1.0\"\ndescription = \"Tool management and MCP server utilities for MiroFlow\"\nreadme = \"README.md\"\nauthors = [\n    { name = \"MiroMind Team\", email = \"service@miromind.ai\" }\n]\nrequires-python = \">=3.12\"\ndependencies = [\n    \"mcp>=1.0.0\",\n    \"fastmcp>=0.1.0\",\n    \"playwright>=1.40.0\",\n    \"requests>=2.32.0\",\n    \"e2b-code-interpreter==1.2.1\",\n    \"wikipedia\",\n    \"mutagen\",\n    \"markitdown-mcp>=0.0.1a3\",\n    \"google-genai\",\n    \"aiohttp\",\n    \"redis\"\n]\n\n[build-system]\nrequires = [\"hatchling\"]\nbuild-backend = \"hatchling.build\"\n\n[tool.hatch.build.targets.wheel]\npackages = [\"src/miroflow_tools\"]\n\n[dependency-groups]\ndev = [\n    \"pytest>=8.4.1\",\n    \"pytest-asyncio>=1.0.0\",\n    \"pytest-cov>=6.2.1\",\n    \"pytest-html>=4.1.1\",\n    \"pytest-xdist>=3.7.0\",\n    \"pytest-mock>=3.10.0\",\n    \"pytest-timeout>=2.1.0\",\n    \"inline-snapshot>=0.23.2\",\n]\n\n[tool.pytest.ini_options]\nminversion = \"8.3.5\"\ntestpaths = [\"src/test\"]\nasyncio_default_fixture_loop_scope = \"function\"\naddopts = [\n    \"-rA\",\n    \"--show-capture=stderr\",\n    \"-n=auto\",\n    \"--html=report.html\",\n    \"--self-contained-html\",\n    \"--cov=miroflow_tools\",\n    \"--cov-report=html\",\n    \"--strict-markers\",\n    \"-v\",\n]\nmarkers = [\n    \"integration: marks tests as integration tests (may be slow)\",\n    \"unit: marks tests as unit tests\",\n    \"slow: marks tests as slow (deselect with '-m \\\"not slow\\\"')\",\n    \"requires_api_key: marks tests that require real API credentials\",\n] "
  },
  {
    "path": "libs/miroflow-tools/src/__init__.py",
    "content": ""
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/__init__.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nfrom .manager import ToolManager\n\n__all__ = [\"ToolManager\"]\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/jina_scrape_llm_summary.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport asyncio\nimport json\nimport logging\nimport os\nfrom typing import Any, Dict\n\nimport httpx\nfrom mcp.server.fastmcp import FastMCP\n\n# Configure logging\nlogger = logging.getLogger(\"miroflow\")\n\nSUMMARY_LLM_BASE_URL = os.environ.get(\"SUMMARY_LLM_BASE_URL\")\nSUMMARY_LLM_MODEL_NAME = os.environ.get(\"SUMMARY_LLM_MODEL_NAME\")\nSUMMARY_LLM_API_KEY = os.environ.get(\"SUMMARY_LLM_API_KEY\")\n\nJINA_API_KEY = os.environ.get(\"JINA_API_KEY\", \"\")\nJINA_BASE_URL = os.environ.get(\"JINA_BASE_URL\", \"https://r.jina.ai\")\n\n# Initialize FastMCP server\nmcp = FastMCP(\"jina_scrape_llm_summary\")\n\n\n@mcp.tool()\nasync def scrape_and_extract_info(\n    url: str, info_to_extract: str, custom_headers: Dict[str, str] = None\n):\n    \"\"\"\n    Scrape content from a URL, including web pages, PDFs, code files, and other supported resources, and extract meaningful information using an LLM.\n    If you need to extract information from a PDF, please use this tool.\n\n    Args:\n        url (str): The URL to scrape content from. 
Supports various types of URLs such as web pages, PDFs, raw text/code files (e.g., GitHub, Gist), and similar sources.\n        info_to_extract (str): The specific types of information to extract (usually a question)\n        custom_headers (Dict[str, str]): Additional headers to include in the scraping request\n\n    Returns:\n        Dict[str, Any]: A dictionary containing:\n            - success (bool): Whether the operation was successful\n            - url (str): The original URL\n            - extracted_info (str): The extracted information\n            - error (str): Error message if the operation failed\n            - scrape_stats (Dict): Statistics about the scraped content\n            - model_used (str): The model used for summarization\n            - tokens_used (int): Number of tokens used (if available)\n    \"\"\"\n    if _is_huggingface_dataset_or_space_url(url):\n        return json.dumps(\n            {\n                \"success\": False,\n                \"url\": url,\n                \"extracted_info\": \"\",\n                \"error\": \"You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose.\",\n                \"scrape_stats\": {},\n                \"tokens_used\": 0,\n            },\n            ensure_ascii=False,\n        )\n\n    # First, scrape the content with Jina\n    scrape_result = await scrape_url_with_jina(url, custom_headers)\n\n    # If Jina fails, try direct Python scraping as fallback\n    if not scrape_result[\"success\"]:\n        logger.warning(\n            f\"Jina Scrape and Extract Info: Jina scraping failed: {scrape_result['error']}, trying direct Python scraping as fallback\"\n        )\n        scrape_result = await scrape_url_with_python(url, custom_headers)\n\n        if not scrape_result[\"success\"]:\n            logger.error(\n                f\"Jina Scrape and Extract Info: Both Jina and Python scraping failed: {scrape_result['error']}\"\n            )\n  
          return json.dumps(\n                {\n                    \"success\": False,\n                    \"url\": url,\n                    \"extracted_info\": \"\",\n                    \"error\": f\"Scraping failed (both Jina and Python): {scrape_result['error']}\",\n                    \"scrape_stats\": {},\n                    \"tokens_used\": 0,\n                },\n                ensure_ascii=False,\n            )\n        else:\n            logger.info(\n                f\"Jina Scrape and Extract Info: Python fallback scraping succeeded for URL: {url}\"\n            )\n\n    # Then, summarize the content\n    extracted_result = await extract_info_with_llm(\n        url=url,\n        content=scrape_result[\"content\"],\n        info_to_extract=info_to_extract,\n        model=SUMMARY_LLM_MODEL_NAME,\n        max_tokens=8192,\n    )\n\n    # Combine results\n    return json.dumps(\n        {\n            \"success\": extracted_result[\"success\"],\n            \"url\": url,\n            \"extracted_info\": extracted_result[\"extracted_info\"],\n            \"error\": extracted_result[\"error\"],\n            \"scrape_stats\": {\n                \"line_count\": scrape_result[\"line_count\"],\n                \"char_count\": scrape_result[\"char_count\"],\n                \"last_char_line\": scrape_result[\"last_char_line\"],\n                \"all_content_displayed\": scrape_result[\"all_content_displayed\"],\n            },\n            \"model_used\": extracted_result[\"model_used\"],\n            \"tokens_used\": extracted_result[\"tokens_used\"],\n        },\n        ensure_ascii=False,\n    )\n\n\ndef _is_huggingface_dataset_or_space_url(url):\n    \"\"\"\n    Check if the URL is a HuggingFace dataset or space URL.\n    :param url: The URL to check\n    :return: True if it's a HuggingFace dataset or space URL, False otherwise\n    \"\"\"\n    if not url:\n        return False\n    return \"huggingface.co/datasets\" in url or \"huggingface.co/spaces\" 
in url\n\n\nasync def scrape_url_with_jina(\n    url: str, custom_headers: Dict[str, str] = None, max_chars: int = 102400 * 4\n) -> Dict[str, Any]:\n    \"\"\"\n    Scrape content from a URL through the Jina.ai Reader API and return it directly (no temporary file is written).\n\n\n    Args:\n        url (str): The URL to scrape content from\n        custom_headers (Dict[str, str]): Additional headers to include in the request\n        max_chars (int): Maximum number of characters to reserve for the scraped content\n\n    Returns:\n        Dict[str, Any]: A dictionary containing:\n            - success (bool): Whether the operation was successful\n            - filename (str): Always an empty string; only present in error results\n            - content (str): The first max_chars characters of the scraped content\n            - error (str): Error message if the operation failed\n            - line_count (int): Number of lines in the scraped content\n            - char_count (int): Number of characters in the scraped content\n            - last_char_line (int): Line number where the last displayed character is located\n            - all_content_displayed (bool): Signal indicating if all content was displayed (True if content length <= max_chars)\n    \"\"\"\n\n    # Validate input\n    if not url or not url.strip():\n        return {\n            \"success\": False,\n            \"filename\": \"\",\n            \"content\": \"\",\n            \"error\": \"URL cannot be empty\",\n            \"line_count\": 0,\n            \"char_count\": 0,\n            \"last_char_line\": 0,\n            \"all_content_displayed\": False,\n        }\n\n    # Get API key from environment\n    if not JINA_API_KEY:\n        return {\n            \"success\": False,\n            \"filename\": \"\",\n            \"content\": \"\",\n            \"error\": \"JINA_API_KEY environment variable is not set\",\n            \"line_count\": 0,\n            \"char_count\": 0,\n            
\"last_char_line\": 0,\n            \"all_content_displayed\": False,\n        }\n\n    # Avoid duplicate Jina URL prefix\n    if url.startswith(\"https://r.jina.ai/\") and url.count(\"http\") >= 2:\n        url = url[len(\"https://r.jina.ai/\") :]\n\n    # Construct the Jina.ai API URL\n    jina_url = f\"{JINA_BASE_URL}/{url}\"\n\n    try:\n        # Prepare headers\n        headers = {\n            \"Authorization\": f\"Bearer {JINA_API_KEY}\",\n        }\n\n        # Add custom headers if provided\n        if custom_headers:\n            headers.update(custom_headers)\n\n        # Retry configuration\n        retry_delays = [1, 2, 4, 8]\n\n        for attempt, delay in enumerate(retry_delays, 1):\n            try:\n                # Make the request using httpx library\n                async with httpx.AsyncClient() as client:\n                    response = await client.get(\n                        jina_url,\n                        headers=headers,\n                        timeout=httpx.Timeout(None, connect=20, read=60),\n                        follow_redirects=True,  # Follow redirects (equivalent to curl -L)\n                    )\n\n                # Check if request was successful\n                response.raise_for_status()\n                break  # Success, exit retry loop\n\n            except httpx.ConnectTimeout as e:\n                # connection timeout, retry\n                if attempt < len(retry_delays):\n                    logger.info(\n                        f\"Jina Scrape: Connection timeout, {delay}s before next attempt (attempt {attempt + 1})\"\n                    )\n                    await asyncio.sleep(delay)\n                    continue\n                else:\n                    logger.error(\n                        f\"Jina Scrape: Connection retry attempts exhausted, url: {url}\"\n                    )\n                    raise e\n\n            except httpx.ConnectError as e:\n                # connection error, retry\n      
          if attempt < len(retry_delays):\n                    logger.info(\n                        f\"Jina Scrape: Connection error: {e}, {delay}s before next attempt\"\n                    )\n                    await asyncio.sleep(delay)\n                    continue\n                else:\n                    logger.error(\n                        f\"Jina Scrape: Connection retry attempts exhausted, url: {url}\"\n                    )\n                    raise e\n\n            except httpx.ReadTimeout as e:\n                # read timeout, retry\n                if attempt < len(retry_delays):\n                    logger.info(\n                        f\"Jina Scrape: Read timeout, {delay}s before next attempt (attempt {attempt + 1})\"\n                    )\n                    await asyncio.sleep(delay)\n                    continue\n                else:\n                    logger.error(\n                        f\"Jina Scrape: Read timeout retry attempts exhausted, url: {url}\"\n                    )\n                    raise e\n\n            except httpx.HTTPStatusError as e:\n                status_code = e.response.status_code\n\n                # Retryable: 5xx (server errors) + specific 4xx (408, 409, 425, 429)\n                should_retry = status_code >= 500 or status_code in [408, 409, 425, 429]\n\n                if should_retry and attempt < len(retry_delays):\n                    logger.info(\n                        f\"Jina Scrape: HTTP {status_code} (retryable), retry in {delay}s, url: {url}\"\n                    )\n                    await asyncio.sleep(delay)\n                    continue\n                elif should_retry:\n                    logger.error(\n                        f\"Jina Scrape: HTTP {status_code} retry exhausted, url: {url}\"\n                    )\n                    raise e\n                else:\n                    logger.error(\n                        f\"Jina Scrape: HTTP {status_code} (non-retryable), url: 
{url}\"\n                    )\n                    raise e\n\n            except httpx.RequestError as e:\n                if attempt < len(retry_delays):\n                    logger.info(\n                        f\"Jina Scrape: Unknown request exception: {e}, url: {url}, {delay}s before next attempt (attempt {attempt + 1})\"\n                    )\n                    await asyncio.sleep(delay)\n                    continue\n                else:\n                    logger.error(\n                        f\"Jina Scrape: Unknown request exception retry attempts exhausted, url: {url}\"\n                    )\n                    raise e\n\n    except Exception as e:\n        error_msg = f\"Jina Scrape: Unexpected error occurred: {str(e)}\"\n        logger.error(error_msg)\n        return {\n            \"success\": False,\n            \"filename\": \"\",\n            \"content\": \"\",\n            \"error\": error_msg,\n            \"line_count\": 0,\n            \"char_count\": 0,\n            \"last_char_line\": 0,\n            \"all_content_displayed\": False,\n        }\n\n    # Get the scraped content\n    content = response.text\n\n    if not content:\n        return {\n            \"success\": False,\n            \"filename\": \"\",\n            \"content\": \"\",\n            \"error\": \"No content returned from Jina.ai API\",\n            \"line_count\": 0,\n            \"char_count\": 0,\n            \"last_char_line\": 0,\n            \"all_content_displayed\": False,\n        }\n\n    # handle insufficient balance error\n    try:\n        content_dict = json.loads(content)\n    except json.JSONDecodeError:\n        content_dict = None\n    if (\n        isinstance(content_dict, dict)\n        and content_dict.get(\"name\") == \"InsufficientBalanceError\"\n    ):\n        return {\n            \"success\": False,\n            \"filename\": \"\",\n            \"content\": \"\",\n            \"error\": \"Insufficient balance\",\n            
\"line_count\": 0,\n            \"char_count\": 0,\n            \"last_char_line\": 0,\n            \"all_content_displayed\": False,\n        }\n\n    # Get content statistics\n    total_char_count = len(content)\n    total_line_count = content.count(\"\\n\") + 1 if content else 0\n\n    # Extract first max_chars characters\n    displayed_content = content[:max_chars]\n    all_content_displayed = total_char_count <= max_chars\n\n    # Calculate the line number of the last character displayed\n    if displayed_content:\n        # Count newlines up to the last displayed character\n        last_char_line = displayed_content.count(\"\\n\") + 1\n    else:\n        last_char_line = 0\n\n    return {\n        \"success\": True,\n        \"content\": displayed_content,\n        \"error\": \"\",\n        \"line_count\": total_line_count,\n        \"char_count\": total_char_count,\n        \"last_char_line\": last_char_line,\n        \"all_content_displayed\": all_content_displayed,\n    }\n\n\nasync def scrape_url_with_python(\n    url: str, custom_headers: Dict[str, str] = None, max_chars: int = 102400 * 4\n) -> Dict[str, Any]:\n    \"\"\"\n    Fallback scraping method using Python's httpx library directly.\n\n    Args:\n        url (str): The URL to scrape content from\n        custom_headers (Dict[str, str]): Additional headers to include in the request\n        max_chars (int): Maximum number of characters to reserve for the scraped content\n\n    Returns:\n        Dict[str, Any]: A dictionary containing:\n            - success (bool): Whether the operation was successful\n            - content (str): The scraped content\n            - error (str): Error message if the operation failed\n            - line_count (int): Number of lines in the scraped content\n            - char_count (int): Number of characters in the scraped content\n            - last_char_line (int): Line number where the last displayed character is located\n            - all_content_displayed (bool): 
Signal indicating if all content was displayed\n    \"\"\"\n    # Validate input\n    if not url or not url.strip():\n        return {\n            \"success\": False,\n            \"content\": \"\",\n            \"error\": \"URL cannot be empty\",\n            \"line_count\": 0,\n            \"char_count\": 0,\n            \"last_char_line\": 0,\n            \"all_content_displayed\": False,\n        }\n\n    try:\n        # Prepare headers\n        headers = {\n            \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36\"\n        }\n\n        # Add custom headers if provided\n        if custom_headers:\n            headers.update(custom_headers)\n\n        # Retry configuration\n        retry_delays = [1, 2, 4]\n\n        for attempt, delay in enumerate(retry_delays, 1):\n            try:\n                # Make the request using httpx library\n                async with httpx.AsyncClient() as client:\n                    response = await client.get(\n                        url,\n                        headers=headers,\n                        timeout=httpx.Timeout(None, connect=20, read=60),\n                        follow_redirects=True,\n                    )\n\n                # Check if request was successful\n                response.raise_for_status()\n                break  # Success, exit retry loop\n\n            except httpx.ConnectTimeout as e:\n                if attempt < len(retry_delays):\n                    logger.info(\n                        f\"Python Scrape: Connection timeout, {delay}s before next attempt (attempt {attempt + 1})\"\n                    )\n                    await asyncio.sleep(delay)\n                    continue\n                else:\n                    logger.error(\n                        f\"Python Scrape: Connection retry attempts exhausted, url: {url}\"\n                    )\n                    raise e\n\n            except 
httpx.ConnectError as e:\n                if attempt < len(retry_delays):\n                    logger.info(\n                        f\"Python Scrape: Connection error: {e}, {delay}s before next attempt\"\n                    )\n                    await asyncio.sleep(delay)\n                    continue\n                else:\n                    logger.error(\n                        f\"Python Scrape: Connection retry attempts exhausted, url: {url}\"\n                    )\n                    raise e\n\n            except httpx.ReadTimeout as e:\n                if attempt < len(retry_delays):\n                    logger.info(\n                        f\"Python Scrape: Read timeout, {delay}s before next attempt (attempt {attempt + 1})\"\n                    )\n                    await asyncio.sleep(delay)\n                    continue\n                else:\n                    logger.error(\n                        f\"Python Scrape: Read timeout retry attempts exhausted, url: {url}\"\n                    )\n                    raise e\n\n            except httpx.HTTPStatusError as e:\n                status_code = e.response.status_code\n\n                # Retryable: 5xx (server errors) + specific 4xx (408, 409, 425, 429)\n                should_retry = status_code >= 500 or status_code in [408, 409, 425, 429]\n\n                if should_retry and attempt < len(retry_delays):\n                    logger.info(\n                        f\"Python Scrape: HTTP {status_code} (retryable), retry in {delay}s, url: {url}\"\n                    )\n                    await asyncio.sleep(delay)\n                    continue\n                elif should_retry:\n                    logger.error(\n                        f\"Python Scrape: HTTP {status_code} retry exhausted, url: {url}\"\n                    )\n                    raise e\n                else:\n                    logger.error(\n                        f\"Python Scrape: HTTP {status_code} (non-retryable), 
url: {url}\"\n                    )\n                    raise e\n\n            except httpx.RequestError as e:\n                if attempt < len(retry_delays):\n                    logger.info(\n                        f\"Python Scrape: Unknown request exception: {e}, url: {url}, {delay}s before next attempt (attempt {attempt + 1})\"\n                    )\n                    await asyncio.sleep(delay)\n                    continue\n                else:\n                    logger.error(\n                        f\"Python Scrape: Unknown request exception retry attempts exhausted, url: {url}\"\n                    )\n                    raise e\n\n    except Exception as e:\n        error_msg = f\"Python Scrape: Unexpected error occurred: {str(e)}\"\n        logger.error(error_msg)\n        return {\n            \"success\": False,\n            \"content\": \"\",\n            \"error\": error_msg,\n            \"line_count\": 0,\n            \"char_count\": 0,\n            \"last_char_line\": 0,\n            \"all_content_displayed\": False,\n        }\n\n    # Get the scraped content\n    content = response.text\n\n    if not content:\n        return {\n            \"success\": False,\n            \"content\": \"\",\n            \"error\": \"No content returned from URL\",\n            \"line_count\": 0,\n            \"char_count\": 0,\n            \"last_char_line\": 0,\n            \"all_content_displayed\": False,\n        }\n\n    # Get content statistics\n    total_char_count = len(content)\n    total_line_count = content.count(\"\\n\") + 1 if content else 0\n\n    # Extract first max_chars characters\n    displayed_content = content[:max_chars]\n    all_content_displayed = total_char_count <= max_chars\n\n    # Calculate the line number of the last character displayed\n    if displayed_content:\n        last_char_line = displayed_content.count(\"\\n\") + 1\n    else:\n        last_char_line = 0\n\n    return {\n        \"success\": True,\n        
\"content\": displayed_content,\n        \"error\": \"\",\n        \"line_count\": total_line_count,\n        \"char_count\": total_char_count,\n        \"last_char_line\": last_char_line,\n        \"all_content_displayed\": all_content_displayed,\n    }\n\n\nEXTRACT_INFO_PROMPT = \"\"\"You are given a piece of content and the requirement of information to extract. Your task is to extract the information specifically requested. Be precise and focus exclusively on the requested information.\n\nINFORMATION TO EXTRACT:\n{}\n\nINSTRUCTIONS:\n1. Extract the information relevant to the focus above.\n2. If the exact information is not found, extract the most closely related details.\n3. Be specific and include exact details when available.\n4. Clearly organize the extracted information for easy understanding.\n5. Do not include general summaries or unrelated content.\n\nCONTENT TO ANALYZE:\n{}\n\nEXTRACTED INFORMATION:\"\"\"\n\n\ndef get_prompt_with_truncation(\n    info_to_extract: str, content: str, truncate_last_num_chars: int = -1\n) -> str:\n    if truncate_last_num_chars > 0:\n        content = content[:-truncate_last_num_chars] + \"[...truncated]\"\n\n    # Prepare the prompt\n    prompt = EXTRACT_INFO_PROMPT.format(info_to_extract, content)\n    return prompt\n\n\nasync def extract_info_with_llm(\n    url: str,\n    content: str,\n    info_to_extract: str,\n    model: str = \"LLM\",\n    max_tokens: int = 4096,\n) -> Dict[str, Any]:\n    \"\"\"\n    Summarize content using an LLM API.\n\n    Args:\n        content (str): The content to summarize\n        info_to_extract (str): The specific types of information to extract (usually a question)\n        model (str): The model to use for summarization\n        max_tokens (int): Maximum tokens for the response\n\n    Returns:\n        Dict[str, Any]: A dictionary containing:\n            - success (bool): Whether the operation was successful\n            - extracted_info (str): The extracted information\n            - 
error (str): Error message if the operation failed\n            - model_used (str): The model used for summarization\n            - tokens_used (int): Number of tokens used (if available)\n    \"\"\"\n\n    # Validate input\n    if not content or not content.strip():\n        return {\n            \"success\": False,\n            \"extracted_info\": \"\",\n            \"error\": \"Content cannot be empty\",\n            \"model_used\": model,\n            \"tokens_used\": 0,\n        }\n\n    prompt = get_prompt_with_truncation(info_to_extract, content)\n\n    # Prepare the payload\n    if \"gpt\" in model:\n        payload = {\n            \"model\": model,\n            \"max_completion_tokens\": max_tokens,\n            \"messages\": [\n                {\"role\": \"user\", \"content\": prompt},\n            ],\n        }\n        # Add cost-saving parameters for GPT-5 models\n        if \"gpt-5\" in model.lower() or \"gpt5\" in model.lower():\n            payload[\"service_tier\"] = \"flex\"\n            payload[\"reasoning_effort\"] = \"minimal\"\n    else:\n        payload = {\n            \"model\": model,\n            \"max_tokens\": max_tokens,\n            \"messages\": [\n                {\"role\": \"user\", \"content\": prompt},\n            ],\n            \"temperature\": 1.0,\n            # \"top_p\": 0.8,\n            # \"top_k\": 20,\n        }\n\n    # Validate LLM endpoint configuration early for clearer errors\n    if not SUMMARY_LLM_BASE_URL or not SUMMARY_LLM_BASE_URL.strip():\n        return {\n            \"success\": False,\n            \"extracted_info\": \"\",\n            \"error\": \"SUMMARY_LLM_BASE_URL environment variable is not set\",\n            \"model_used\": model,\n            \"tokens_used\": 0,\n        }\n\n    # Prepare headers (add Authorization if API key is available)\n    headers = {\"Content-Type\": \"application/json\"}\n    if SUMMARY_LLM_API_KEY:\n        headers[\"Authorization\"] = f\"Bearer 
{SUMMARY_LLM_API_KEY}\"\n\n    try:\n        # Retry configuration\n        connect_retry_delays = [1, 2, 4, 8]\n\n        for attempt, delay in enumerate(connect_retry_delays, 1):\n            try:\n                # Make the API request using httpx\n                async with httpx.AsyncClient() as client:\n                    response = await client.post(\n                        SUMMARY_LLM_BASE_URL,\n                        headers=headers,\n                        json=payload,\n                        timeout=httpx.Timeout(None, connect=30, read=300),\n                    )\n                    if response.text and len(response.text) >= 50:\n                        tail_50 = response.text[-50:]\n                        repeat_count = response.text.count(tail_50)\n                        if repeat_count > 5:\n                            logger.info(\"Repeat detected in extract_info_with_llm\")\n                            continue\n\n                # Check if the request was successful\n                if (\n                    \"Requested token count exceeds the model's maximum context length\"\n                    in response.text\n                    or \"longer than the model's context length\" in response.text\n                ):\n                    prompt = get_prompt_with_truncation(\n                        info_to_extract,\n                        content,\n                        truncate_last_num_chars=40960 * attempt,\n                    )  # remove 40k * num_attempts chars from the end of the content\n                    payload[\"messages\"][0][\"content\"] = prompt\n                    continue  # no need to raise error here, just try again\n\n                response.raise_for_status()\n                break  # Success, exit retry loop\n\n            except httpx.ConnectTimeout as e:\n                # connection timeout, retry\n                if attempt < len(connect_retry_delays):\n                    logger.info(\n                       
 f\"Jina Scrape and Extract Info: Connection timeout, {delay}s before next attempt (attempt {attempt + 1})\"\n                    )\n                    await asyncio.sleep(delay)\n                    continue\n                else:\n                    logger.error(\n                        \"Jina Scrape and Extract Info: Connection retry attempts exhausted\"\n                    )\n                    raise e\n\n            except httpx.ConnectError as e:\n                # connection error, retry\n                if attempt < len(connect_retry_delays):\n                    logger.info(\n                        f\"Jina Scrape and Extract Info: Connection error: {e}, {delay}s before next attempt\"\n                    )\n                    await asyncio.sleep(delay)\n                    continue\n                else:\n                    logger.error(\n                        \"Jina Scrape and Extract Info: Connection retry attempts exhausted\"\n                    )\n                    raise e\n\n            except httpx.ReadTimeout as e:\n                # read timeout, LLM API is too slow, no need to retry\n                if attempt < len(connect_retry_delays):\n                    logger.info(\n                        f\"Jina Scrape and Extract Info: LLM API attempt {attempt} read timeout\"\n                    )\n                    continue\n                else:\n                    logger.error(\n                        f\"Jina Scrape and Extract Info: LLM API read timeout retry attempts exhausted, please check the request complexity, information to extract: {info_to_extract}, length of content: {len(content)}, url: {url}\"\n                    )\n                    raise e\n\n            except httpx.HTTPStatusError as e:\n                status_code = e.response.status_code\n\n                # Special case: GPT-5 service_tier parameter compatibility issue\n                if (\n                    \"gpt-5\" in model.lower() or \"gpt5\" in 
model.lower()\n                ) and \"service_tier\" in payload:\n                    logger.info(\n                        \"Extract Info: GPT-5 service_tier error, removing and retrying\"\n                    )\n                    payload.pop(\"service_tier\", None)\n                    if attempt < len(connect_retry_delays):\n                        await asyncio.sleep(delay)\n                        continue\n\n                # Retryable: 5xx (server errors) + specific 4xx (408, 409, 425, 429)\n                should_retry = status_code >= 500 or status_code in [408, 409, 425, 429]\n\n                if should_retry and attempt < len(connect_retry_delays):\n                    logger.info(\n                        f\"Extract Info: HTTP {status_code} (retryable), retry in {delay}s\"\n                    )\n                    await asyncio.sleep(delay)\n                    continue\n                elif should_retry:\n                    logger.error(f\"Extract Info: HTTP {status_code} retry exhausted\")\n                    raise e\n                else:\n                    logger.error(f\"Extract Info: HTTP {status_code} (non-retryable)\")\n                    raise httpx.HTTPStatusError(\n                        f\"response.text: {response.text}\",\n                        request=e.request,\n                        response=e.response,\n                    ) from e\n\n            except httpx.RequestError as e:\n                logger.error(\n                    f\"Jina Scrape and Extract Info: Unknown request exception: {e}\"\n                )\n                raise e\n\n    except Exception as e:\n        error_msg = f\"Jina Scrape and Extract Info: Unexpected error during LLM API call: {str(e)}\"\n        logger.error(error_msg)\n        return {\n            \"success\": False,\n            \"extracted_info\": \"\",\n            \"error\": error_msg,\n            \"model_used\": model,\n            \"tokens_used\": 0,\n        }\n\n    # Parse the 
response\n    try:\n        response_data = response.json()\n\n    except json.JSONDecodeError as e:\n        error_msg = (\n            f\"Jina Scrape and Extract Info: Failed to parse LLM API response: {str(e)}\"\n        )\n        logger.error(error_msg)\n        logger.error(f\"Raw response: {response.text}\")\n        return {\n            \"success\": False,\n            \"extracted_info\": \"\",\n            \"error\": error_msg,\n            \"model_used\": model,\n            \"tokens_used\": 0,\n        }\n\n    # Extract summary from response\n    if \"choices\" in response_data and len(response_data[\"choices\"]) > 0:\n        try:\n            summary = response_data[\"choices\"][0][\"message\"][\"content\"]\n        except Exception as e:\n            error_msg = f\"Jina Scrape and Extract Info: Failed to get summary from LLM API response: {str(e)}\"\n            logger.error(error_msg)\n            return {\n                \"success\": False,\n                \"extracted_info\": \"\",\n                \"error\": error_msg,\n                \"model_used\": model,\n                \"tokens_used\": 0,\n            }\n\n        # Extract token usage if available\n        tokens_used = 0\n        if \"usage\" in response_data:\n            tokens_used = response_data[\"usage\"].get(\"total_tokens\", 0)\n\n        return {\n            \"success\": True,\n            \"extracted_info\": summary,\n            \"error\": \"\",\n            \"model_used\": model,\n            \"tokens_used\": tokens_used,\n        }\n    elif \"error\" in response_data:\n        error_msg = (\n            f\"Jina Scrape and Extract Info: LLM API error: {response_data['error']}\"\n        )\n        logger.error(error_msg)\n        return {\n            \"success\": False,\n            \"extracted_info\": \"\",\n            \"error\": error_msg,\n            \"model_used\": model,\n            \"tokens_used\": 0,\n        }\n    else:\n        error_msg = f\"Jina Scrape and 
Extract Info: No valid response from LLM API, response data: {response_data}\"\n        logger.error(error_msg)\n        return {\n            \"success\": False,\n            \"extracted_info\": \"\",\n            \"error\": error_msg,\n            \"model_used\": model,\n            \"tokens_used\": 0,\n        }\n\n\nif __name__ == \"__main__\":\n    # Example usage and testing\n\n    # Run the MCP server\n    mcp.run(transport=\"stdio\")\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/search_and_scrape_webpage.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport json\nimport logging\nimport os\nfrom typing import Any, Dict\n\nimport httpx\nfrom mcp.server.fastmcp import FastMCP\nfrom tenacity import (\n    retry,\n    retry_if_exception_type,\n    stop_after_attempt,\n    wait_exponential,\n)\nfrom tencentcloud.common import credential\nfrom tencentcloud.common.common_client import CommonClient\nfrom tencentcloud.common.exception.tencent_cloud_sdk_exception import (\n    TencentCloudSDKException,\n)\nfrom tencentcloud.common.profile.client_profile import ClientProfile\nfrom tencentcloud.common.profile.http_profile import HttpProfile\n\nfrom ..mcp_servers.utils.url_unquote import decode_http_urls_in_dict\n\n# Configure logging\nlogger = logging.getLogger(\"miroflow\")\n\nSERPER_BASE_URL = os.getenv(\"SERPER_BASE_URL\", \"https://google.serper.dev\")\nSERPER_API_KEY = os.getenv(\"SERPER_API_KEY\", \"\")\n\nTENCENTCLOUD_SECRET_ID = os.getenv(\"TENCENTCLOUD_SECRET_ID\", \"\")\nTENCENTCLOUD_SECRET_KEY = os.getenv(\"TENCENTCLOUD_SECRET_KEY\", \"\")\n\n# Initialize FastMCP server\nmcp = FastMCP(\"search_and_scrape_webpage\")\n\n\n@retry(\n    stop=stop_after_attempt(3),\n    wait=wait_exponential(multiplier=1, min=4, max=10),\n    retry=retry_if_exception_type(\n        (httpx.ConnectError, httpx.TimeoutException, httpx.HTTPStatusError)\n    ),\n)\nasync def make_serper_request(\n    payload: Dict[str, Any], headers: Dict[str, str]\n) -> httpx.Response:\n    \"\"\"Make HTTP request to Serper API with retry logic.\"\"\"\n    async with httpx.AsyncClient() as client:\n        response = await client.post(\n            f\"{SERPER_BASE_URL}/search\",\n            json=payload,\n            headers=headers,\n        )\n        response.raise_for_status()\n        return response\n\n\ndef _is_banned_url(url: str) -> bool:\n    \"\"\"\n    Check if the URL is a banned URL.\n    :param url: The URL to check\n    :return: 
True if it's a banned URL, False otherwise\n    \"\"\"\n    banned_list = [\n        \"unifuncs\",\n        \"huggingface.co/datasets\",\n        \"huggingface.co/spaces\",\n    ]\n    if not url:\n        return False\n    return any(banned in url for banned in banned_list)\n\n\n@mcp.tool()\nasync def google_search(\n    q: str,\n    gl: str = \"us\",\n    hl: str = \"en\",\n    location: str = None,\n    num: int = None,\n    tbs: str = None,\n    page: int = None,\n    autocorrect: bool = None,\n):\n    \"\"\"\n    Tool to perform web searches via Serper API and retrieve rich results.\n\n    It is able to retrieve organic search results, people also ask,\n    related searches, and knowledge graph.\n\n    Args:\n        q: Search query string\n        gl: Optional region code for search results in ISO 3166-1 alpha-2 format (e.g., 'us')\n        hl: Optional language code for search results in ISO 639-1 format (e.g., 'en')\n        location: Optional location for search results (e.g., 'SoHo, New York, United States', 'California, United States')\n        num: Number of results to return (default: 10)\n        tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week, 'qdr:m' for past month, 'qdr:y' for past year)\n        page: Page number of results to return (default: 1)\n        autocorrect: Whether to autocorrect spelling in query\n\n    Returns:\n        Dictionary containing search results and metadata.\n    \"\"\"\n    # Check for API key\n    if not SERPER_API_KEY:\n        return json.dumps(\n            {\n                \"success\": False,\n                \"error\": \"SERPER_API_KEY environment variable not set\",\n                \"results\": [],\n            },\n            ensure_ascii=False,\n        )\n\n    # Validate required parameter\n    if not q or not q.strip():\n        return json.dumps(\n            {\n                \"success\": False,\n                \"error\": \"Search query 'q' is required 
and cannot be empty\",\n                \"results\": [],\n            },\n            ensure_ascii=False,\n        )\n\n    try:\n        # Helper function to perform a single search\n        async def perform_search(search_query: str) -> tuple[list, dict]:\n            \"\"\"Perform a search and return organic results and search parameters.\"\"\"\n            # Build payload with all supported parameters\n            payload: dict[str, Any] = {\n                \"q\": search_query.strip(),\n                \"gl\": gl,\n                \"hl\": hl,\n            }\n\n            # Add optional parameters if provided\n            if location:\n                payload[\"location\"] = location\n            if num is not None:\n                payload[\"num\"] = num\n            else:\n                payload[\"num\"] = 10  # Default\n            if tbs:\n                payload[\"tbs\"] = tbs\n            if page is not None:\n                payload[\"page\"] = page\n            if autocorrect is not None:\n                payload[\"autocorrect\"] = autocorrect\n\n            # Set up headers\n            headers = {\n                \"X-API-KEY\": SERPER_API_KEY,\n                \"Content-Type\": \"application/json\",\n            }\n\n            # Make the API request\n            response = await make_serper_request(payload, headers)\n            data = response.json()\n\n            # filter out HuggingFace dataset or space urls\n            organic_results = []\n            if \"organic\" in data:\n                for item in data[\"organic\"]:\n                    if _is_banned_url(item.get(\"link\", \"\")):\n                        continue\n                    organic_results.append(item)\n\n            return organic_results, data.get(\"searchParameters\", {})\n\n        # Perform initial search\n        original_query = q.strip()\n        organic_results, search_params = await perform_search(original_query)\n\n        # If no results and query contains 
quotes, retry without quotes\n        if not organic_results and '\"' in original_query:\n            # Remove all types of quotes\n            query_without_quotes = original_query.replace('\"', \"\").strip()\n            if query_without_quotes:  # Make sure we still have a valid query\n                organic_results, search_params = await perform_search(\n                    query_without_quotes\n                )\n\n        # Build comprehensive response\n        response_data = {\n            \"organic\": organic_results,\n            \"searchParameters\": search_params,\n        }\n        response_data = decode_http_urls_in_dict(response_data)\n\n        return json.dumps(response_data, ensure_ascii=False)\n\n    except Exception as e:\n        return json.dumps(\n            {\n                \"success\": False,\n                \"error\": f\"Unexpected error: {str(e)}\",\n                \"results\": [],\n            },\n            ensure_ascii=False,\n        )\n\n\n@retry(\n    stop=stop_after_attempt(3),\n    wait=wait_exponential(multiplier=1, min=4, max=10),\n    retry=retry_if_exception_type(TencentCloudSDKException),\n)\nasync def make_sogou_request(query: str, cnt: int) -> Dict[str, Any]:\n    \"\"\"Make request to Tencent Cloud SearchPro API with retry logic.\"\"\"\n    cred = credential.Credential(TENCENTCLOUD_SECRET_ID, TENCENTCLOUD_SECRET_KEY)\n    httpProfile = HttpProfile()\n    httpProfile.endpoint = \"wsa.tencentcloudapi.com\"\n    clientProfile = ClientProfile()\n    clientProfile.httpProfile = httpProfile\n\n    params = f'{{\"Query\":\"{query}\",\"Mode\":0, \"Cnt\":{cnt}}}'\n    common_client = CommonClient(\"wsa\", \"2025-05-08\", cred, \"\", profile=clientProfile)\n    result = common_client.call_json(\"SearchPro\", json.loads(params))[\"Response\"]\n    return result\n\n\n@mcp.tool()\nasync def sogou_search(\n    q: str,\n    num: int = 10,\n) -> str:\n    \"\"\"\n    Tool to perform web searches via Tencent Cloud SearchPro API 
(Sogou search engine).\n\n    Sogou search offers superior results for Chinese-language queries compared to Google.\n\n    Args:\n        q: Search query string (Required)\n        num: Number of search results to return (Can only be 10/20/30/40/50, default: 10)\n\n    Returns:\n        JSON string containing search results with the following fields:\n        - Query: The original search query\n        - Pages: Array of search results, each containing title, url, passage, date, and site\n    \"\"\"\n    # Check for API credentials\n    if not TENCENTCLOUD_SECRET_ID or not TENCENTCLOUD_SECRET_KEY:\n        return json.dumps(\n            {\n                \"success\": False,\n                \"error\": \"TENCENTCLOUD_SECRET_ID or TENCENTCLOUD_SECRET_KEY environment variable not set\",\n                \"results\": [],\n            },\n            ensure_ascii=False,\n        )\n\n    # Validate required parameter\n    if not q or not q.strip():\n        return json.dumps(\n            {\n                \"success\": False,\n                \"error\": \"Search query 'q' is required and cannot be empty\",\n                \"results\": [],\n            },\n            ensure_ascii=False,\n        )\n\n    # Validate num parameter\n    if num not in [10, 20, 30, 40, 50]:\n        return json.dumps(\n            {\n                \"success\": False,\n                \"error\": f\"Invalid num value: {num}. 
Must be one of 10, 20, 30, 40, 50\",\n                \"results\": [],\n            },\n            ensure_ascii=False,\n        )\n\n    try:\n        # Make the API request\n        result = await make_sogou_request(q.strip(), num)\n\n        # Remove RequestId from response\n        if \"RequestId\" in result:\n            del result[\"RequestId\"]\n\n        # Process and simplify the Pages field\n        pages = []\n        if \"Pages\" in result:\n            for page in result[\"Pages\"]:\n                page_json = json.loads(page)\n                new_page = {\n                    \"title\": page_json.get(\"title\", \"\"),\n                    \"url\": page_json.get(\"url\", \"\"),\n                    \"passage\": page_json.get(\"passage\", \"\"),\n                    \"date\": page_json.get(\"date\", \"\"),\n                    \"site\": page_json.get(\"site\", \"\"),\n                }\n                pages.append(new_page)\n            result[\"Pages\"] = pages\n\n        # Decode URLs in the response\n        result = decode_http_urls_in_dict(result)\n\n        return json.dumps(result, ensure_ascii=False)\n\n    except TencentCloudSDKException as e:\n        return json.dumps(\n            {\n                \"success\": False,\n                \"error\": f\"Tencent Cloud API error: {str(e)}\",\n                \"results\": [],\n            },\n            ensure_ascii=False,\n        )\n\n    except Exception as e:\n        return json.dumps(\n            {\n                \"success\": False,\n                \"error\": f\"Unexpected error: {str(e)}\",\n                \"results\": [],\n            },\n            ensure_ascii=False,\n        )\n\n\nif __name__ == \"__main__\":\n    mcp.run()\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/stateless_python_server.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport os\n\nfrom e2b_code_interpreter import Sandbox\nfrom mcp.server.fastmcp import FastMCP\n\n# Initialize FastMCP server\nmcp = FastMCP(\"stateless-python-server\")\n\n# API keys\nE2B_API_KEY = os.environ.get(\"E2B_API_KEY\")\n\n# DEFAULT CONFS\nDEFAULT_TIMEOUT = 300  # seconds\n\n\n@mcp.tool()\nasync def python(code: str) -> str:\n    \"\"\"Use this tool to execute STATELESS Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n    When you send a message containing python code to python, it will be executed in a stateless docker container, and the stdout of that process will be returned to you. You have to use print statements to access the output.\n    IMPORTANT: Your python environment is not shared between calls. You will have to pass your entire code each time.\n\n        Args:\n            code: The python code to run.\n\n        Returns:\n            A string containing the execution result including stdout and stderr.\n    \"\"\"\n    sandbox = Sandbox.create(\n        timeout=DEFAULT_TIMEOUT, api_key=E2B_API_KEY, template=\"1av7fdjfvcparqo8efq6\"\n    )\n\n    max_attempts = 2\n    for attempt in range(1, max_attempts + 1):\n        try:\n            execution = sandbox.run_code(code)\n            break\n        except Exception as e:\n            if attempt == max_attempts:\n                raise e\n    execution = sandbox.run_code(code)\n\n    sandbox.kill()\n\n    return str(execution)\n\n\nif __name__ == \"__main__\":\n    mcp.run(transport=\"stdio\")\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/task_planner.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport json\nimport logging\nimport os\nfrom datetime import datetime\nfrom pathlib import Path\nfrom typing import Any, Dict, List\nfrom uuid import uuid4\n\nfrom mcp.server.fastmcp import FastMCP\n\n# Configure logging\nlogger = logging.getLogger(\"miroflow\")\n\n# Initialize FastMCP server\nmcp = FastMCP(\"task_planner\")\n\n# Configuration\nTODO_DATA_DIR = os.environ.get(\"TODO_DATA_DIR\", \"../../logs/todo_lists\")\n\n# TASK_ID is required for task isolation\n# Without TASK_ID, task planner operations will fail\nTASK_ID = os.environ.get(\"TASK_ID\")\nif not TASK_ID:\n    raise ValueError(\n        \"TASK_ID environment variable is required for task_planner tool. \"\n        \"This tool must have a unique task identifier to prevent data conflicts in concurrent execution.\"\n    )\n\nTODO_DATA_FILE = os.path.join(TODO_DATA_DIR, f\"todos_{TASK_ID}.json\")\n\n# Ensure data directory exists\nPath(TODO_DATA_DIR).mkdir(parents=True, exist_ok=True)\n\n\ndef load_todos() -> List[Dict[str, Any]]:\n    \"\"\"Load task plan from the JSON file.\"\"\"\n    if not os.path.exists(TODO_DATA_FILE):\n        return []\n\n    try:\n        with open(TODO_DATA_FILE, \"r\", encoding=\"utf-8\") as f:\n            return json.load(f)\n    except Exception as e:\n        logger.error(f\"Failed to load task plan: {str(e)}\")\n        return []\n\n\ndef save_todos(todos: List[Dict[str, Any]]) -> bool:\n    \"\"\"Save task plan to the JSON file.\"\"\"\n    try:\n        with open(TODO_DATA_FILE, \"w\", encoding=\"utf-8\") as f:\n            json.dump(todos, f, ensure_ascii=False, indent=2)\n        return True\n    except Exception as e:\n        logger.error(f\"Failed to save task plan: {str(e)}\")\n        return False\n\n\ndef format_todos_as_markdown(todos: List[Dict[str, Any]], message: str = \"\") -> str:\n    \"\"\"\n    Format task plan as markdown checklist.\n\n    
Args:\n        todos: List of task items\n        message: Optional message to display at the top\n\n    Returns:\n        Markdown formatted string\n    \"\"\"\n    # Calculate statistics\n    total = len(todos)\n    completed = sum(1 for t in todos if t.get(\"completed\", False))\n    pending = total - completed\n\n    # Build markdown\n    lines = []\n    if message:\n        lines.append(f\"{message}\\n\")\n\n    lines.append(\"# Task Plan\\n\")\n    lines.append(f\"Total: {total} | Pending: {pending} | Completed: {completed}\\n\")\n    lines.append(\"\")\n\n    if not todos:\n        lines.append(\"No tasks planned yet.\")\n    else:\n        for todo in todos:\n            checkbox = \"[x]\" if todo.get(\"completed\", False) else \"[ ]\"\n            title = todo[\"title\"]\n            todo_id = todo[\"id\"][:8]  # Show first 8 chars of ID\n            lines.append(f\"- {checkbox} {title} ({todo_id})\")\n\n    return \"\\n\".join(lines)\n\n\n@mcp.tool()\nasync def add_todo(titles: List[str]) -> str:\n    \"\"\"\n    Create a task plan by adding one or more task items.\n\n    CRITICAL: Before starting to work on ANY task, you MUST first create a complete task plan.\n    This is the foundation of effective task execution:\n    - Break down the main goal into clear, actionable steps\n    - Identify all necessary subtasks upfront\n    - Create a roadmap that guides your work\n    - Ensure nothing is overlooked or forgotten\n\n    Good task planning prevents confusion and ensures systematic progress toward your goal.\n\n    Args:\n        titles: List of task item titles. 
For example:\n                - Single task: [\"Complete project report\"]\n                - Multiple tasks: [\"Complete project report\", \"Fix bug #123\", \"Update documentation\"]\n                - Complex project: [\"Research requirements\", \"Design architecture\", \"Implement core features\", \"Write tests\", \"Document API\"]\n\n    Returns:\n        Markdown formatted string showing the success message and current task plan.\n    \"\"\"\n    if not titles:\n        return \"❌ Error: Task titles list cannot be empty.\"\n\n    # Filter out empty titles\n    title_list = [t.strip() for t in titles if t and t.strip()]\n\n    if not title_list:\n        return \"❌ Error: No valid task titles provided.\"\n\n    todos = load_todos()\n    added_todos = []\n\n    # Add all tasks\n    for title in title_list:\n        new_todo = {\n            \"id\": str(uuid4()),\n            \"title\": title,\n            \"completed\": False,\n            \"created_at\": datetime.now().isoformat(),\n        }\n        todos.append(new_todo)\n        added_todos.append(title)\n\n    if not save_todos(todos):\n        return \"❌ Error: Failed to save task plan.\"\n\n    # Build success message\n    if len(added_todos) == 1:\n        message = f'✅ Task added: \"{added_todos[0]}\"'\n    else:\n        message = f\"✅ Added {len(added_todos)} tasks:\\n\" + \"\\n\".join(\n            f\"  - {t}\" for t in added_todos\n        )\n\n    return format_todos_as_markdown(todos, message)\n\n\n@mcp.tool()\nasync def list_todos() -> str:\n    \"\"\"\n    Display the complete task plan with all items and their status.\n\n    Use this to review your overall progress, see what's done and what remains,\n    and understand where you are in the execution of your plan.\n\n    Returns:\n        Markdown formatted string showing all tasks with their completion status.\n    \"\"\"\n    todos = load_todos()\n    return format_todos_as_markdown(todos)\n\n\n@mcp.tool()\nasync def complete_todo(todo_ids: 
List[str]) -> str:\n    \"\"\"\n    Mark one or more tasks as completed in your plan.\n\n    Use this after finishing a task to track your progress and maintain an\n    accurate view of what's done and what's remaining.\n\n    Args:\n        todo_ids: List of task IDs to mark as completed (full ID or first 8 characters).\n                  For example: [\"a7f3b2c1\"] or [\"a7f3b2c1\", \"b8e4c3d2\"]\n\n    Returns:\n        Markdown formatted string showing the success message and updated task plan.\n    \"\"\"\n    if not todo_ids:\n        return \"❌ Error: Task IDs list cannot be empty.\"\n\n    # Filter out empty IDs\n    id_list = [tid.strip() for tid in todo_ids if tid and tid.strip()]\n\n    if not id_list:\n        return \"❌ Error: No valid task IDs provided.\"\n\n    todos = load_todos()\n    completed_todos = []\n    not_found_ids = []\n\n    # Complete all matching tasks\n    for todo_id in id_list:\n        found = False\n        for todo in todos:\n            if todo[\"id\"] == todo_id or todo[\"id\"].startswith(todo_id):\n                if not todo.get(\n                    \"completed\", False\n                ):  # Only mark if not already completed\n                    todo[\"completed\"] = True\n                    completed_todos.append(todo[\"title\"])\n                found = True\n                break\n        if not found:\n            not_found_ids.append(todo_id)\n\n    if not completed_todos and not_found_ids:\n        return f\"❌ Error: Task IDs not found: {', '.join(not_found_ids)}\"\n\n    if not save_todos(todos):\n        return \"❌ Error: Failed to save changes.\"\n\n    # Build success message\n    if len(completed_todos) == 1:\n        message = f'✅ Completed: \"{completed_todos[0]}\"'\n    else:\n        message = f\"✅ Completed {len(completed_todos)} tasks:\\n\" + \"\\n\".join(\n            f\"  - {t}\" for t in completed_todos\n        )\n\n    if not_found_ids:\n        message += f'\\n⚠️  Not found: {\", 
\".join(not_found_ids)}'\n\n    return format_todos_as_markdown(todos, message)\n\n\n@mcp.tool()\nasync def delete_todo(todo_ids: List[str]) -> str:\n    \"\"\"\n    Remove one or more tasks from your plan.\n\n    Use this to adjust your plan when tasks become irrelevant, duplicated,\n    or no longer needed. This helps keep your plan focused and accurate.\n\n    Args:\n        todo_ids: List of task IDs to remove (full ID or first 8 characters).\n                  For example: [\"a7f3b2c1\"] or [\"a7f3b2c1\", \"b8e4c3d2\"]\n\n    Returns:\n        Markdown formatted string showing the success message and remaining task plan.\n    \"\"\"\n    if not todo_ids:\n        return \"❌ Error: Task IDs list cannot be empty.\"\n\n    # Filter out empty IDs\n    id_list = [tid.strip() for tid in todo_ids if tid and tid.strip()]\n\n    if not id_list:\n        return \"❌ Error: No valid task IDs provided.\"\n\n    todos = load_todos()\n    deleted_todos = []\n    not_found_ids = []\n    ids_to_delete = set()\n\n    # Find all tasks to delete\n    for todo_id in id_list:\n        found = False\n        for todo in todos:\n            if todo[\"id\"] == todo_id or todo[\"id\"].startswith(todo_id):\n                deleted_todos.append(todo[\"title\"])\n                ids_to_delete.add(todo[\"id\"])\n                found = True\n                break\n        if not found:\n            not_found_ids.append(todo_id)\n\n    if not deleted_todos and not_found_ids:\n        return f\"❌ Error: Task IDs not found: {', '.join(not_found_ids)}\"\n\n    # Remove the tasks\n    todos = [t for t in todos if t[\"id\"] not in ids_to_delete]\n\n    if not save_todos(todos):\n        return \"❌ Error: Failed to save changes.\"\n\n    # Build success message\n    if len(deleted_todos) == 1:\n        message = f'🗑️ Deleted: \"{deleted_todos[0]}\"'\n    else:\n        message = f\"🗑️ Deleted {len(deleted_todos)} tasks:\\n\" + \"\\n\".join(\n            f\"  - {t}\" for t in deleted_todos\n       
 )\n\n    if not_found_ids:\n        message += f'\\n⚠️  Not found: {\", \".join(not_found_ids)}'\n\n    return format_todos_as_markdown(todos, message)\n\n\nif __name__ == \"__main__\":\n    mcp.run(transport=\"stdio\")\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/manager.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport asyncio\nimport functools\nfrom typing import Any, Awaitable, Callable, Protocol, TypeVar\n\nfrom mcp import ClientSession, StdioServerParameters  # (already imported in config.py)\nfrom mcp.client.sse import sse_client\nfrom mcp.client.stdio import stdio_client\n\nfrom .mcp_servers.browser_session import PlaywrightSession\n\n# logger = logging.getLogger(\"miroflow_agent\")\n\nR = TypeVar(\"R\")\n\n\ndef with_timeout(timeout_s: float = 300.0):\n    \"\"\"\n    Decorator: wraps any *async* function in asyncio.wait_for().\n    Usage:\n        @with_timeout(20)\n        async def create_message_foo(...): ...\n    \"\"\"\n\n    def decorator(\n        func: Callable[..., Awaitable[R]],\n    ) -> Callable[..., Awaitable[R]]:\n        @functools.wraps(func)\n        async def wrapper(*args, **kwargs) -> R:\n            return await asyncio.wait_for(func(*args, **kwargs), timeout=timeout_s)\n\n        return wrapper\n\n    return decorator\n\n\nclass ToolManagerProtocol(Protocol):\n    \"\"\"this enables other kinds of tool manager.\"\"\"\n\n    async def get_all_tool_definitions(self) -> Any: ...\n    async def execute_tool_call(\n        self, *, server_name: str, tool_name: str, arguments: dict[str, Any]\n    ) -> Any: ...\n\n\nclass ToolManager(ToolManagerProtocol):\n    def __init__(self, server_configs, tool_blacklist=None):\n        \"\"\"\n        Initialize ToolManager.\n        :param server_configs: List returned by create_server_parameters()\n        \"\"\"\n        self.server_configs = server_configs\n        self.server_dict = {\n            config[\"name\"]: config[\"params\"] for config in server_configs\n        }\n        self.browser_session = None\n        self.tool_blacklist = tool_blacklist if tool_blacklist else set()\n        self.task_log = None\n\n    def set_task_log(self, task_log):\n        \"\"\"Set the task logger for 
structured logging.\"\"\"\n        self.task_log = task_log\n\n        self._log(\n            \"info\",\n            \"ToolManager | Initialization\",\n            f\"ToolManager initialized, loaded servers: {list(self.server_dict.keys())}\",\n        )\n\n    def _log(self, level, step_name, message, metadata=None):\n        \"\"\"Helper method to log using task_log if available, otherwise skip logging.\"\"\"\n        if self.task_log:\n            self.task_log.log_step(level, step_name, message, metadata)\n\n    def _is_huggingface_dataset_or_space_url(self, url):\n        \"\"\"\n        Check if the URL is a Hugging Face dataset or space URL.\n        :param url: The URL to check\n        :return: True if it's a HuggingFace dataset or space URL, False otherwise\n        \"\"\"\n        if not url:\n            return False\n        return \"huggingface.co/datasets\" in url or \"huggingface.co/spaces\" in url\n\n    def _should_block_hf_scraping(self, tool_name, arguments):\n        \"\"\"\n        Check if we should block scraping of Hugging Face datasets/spaces.\n        :param tool_name: The name of the tool being called\n        :param arguments: The arguments passed to the tool\n        :return: True if scraping should be blocked, False otherwise\n        \"\"\"\n        return (\n            tool_name in [\"scrape\", \"scrape_website\"]\n            and arguments.get(\"url\")\n            and self._is_huggingface_dataset_or_space_url(arguments[\"url\"])\n        )\n\n    def get_server_params(self, server_name):\n        \"\"\"Get parameters for the specified server\"\"\"\n        return self.server_dict.get(server_name)\n\n    async def get_all_tool_definitions(self):\n        \"\"\"\n        Connect to all configured servers and get their tool definitions.\n        Returns a list suitable for passing to the Prompt generator.\n        \"\"\"\n        all_servers_for_prompt = []\n        # Process remote server tools\n        for config in 
self.server_configs:\n            server_name = config[\"name\"]\n            server_params = config[\"params\"]\n            one_server_for_prompt = {\"name\": server_name, \"tools\": []}\n            self._log(\n                \"info\",\n                \"ToolManager | Get Tool Definitions\",\n                f\"Getting tool definitions for server '{server_name}'...\",\n            )\n\n            try:\n                if isinstance(server_params, StdioServerParameters):\n                    async with stdio_client(server_params) as (read, write):\n                        async with ClientSession(\n                            read, write, sampling_callback=None\n                        ) as session:\n                            await session.initialize()\n                            tools_response = await session.list_tools()\n                            # black list some tools\n                            for tool in tools_response.tools:\n                                if (server_name, tool.name) in self.tool_blacklist:\n                                    self._log(\n                                        \"info\",\n                                        \"ToolManager | Tool Blacklisted\",\n                                        f\"Tool '{tool.name}' in server '{server_name}' is blacklisted, skipping.\",\n                                    )\n                                    continue\n                                one_server_for_prompt[\"tools\"].append(\n                                    {\n                                        \"name\": tool.name,\n                                        \"description\": tool.description,\n                                        \"schema\": tool.inputSchema,\n                                    }\n                                )\n                elif isinstance(server_params, str) and server_params.startswith(\n                    (\"http://\", \"https://\")\n                ):\n                    # SSE 
endpoint\n                    async with sse_client(server_params) as (read, write):\n                        async with ClientSession(\n                            read, write, sampling_callback=None\n                        ) as session:\n                            await session.initialize()\n                            tools_response = await session.list_tools()\n                            for tool in tools_response.tools:\n                                # Can add specific tool filtering logic here (if needed)\n                                # if server_name == \"tool-excel\" and tool.name not in [\"get_workbook_metadata\", \"read_data_from_excel\"]:\n                                #     continue\n                                one_server_for_prompt[\"tools\"].append(\n                                    {\n                                        \"name\": tool.name,\n                                        \"description\": tool.description,\n                                        \"schema\": tool.inputSchema,\n                                    }\n                                )\n                else:\n                    self._log(\n                        \"error\",\n                        \"ToolManager | Unknown Parameter Type\",\n                        f\"Error: Unknown parameter type for server '{server_name}': {type(server_params)}\",\n                    )\n                    raise TypeError(\n                        f\"Unknown server params type for {server_name}: {type(server_params)}\"\n                    )\n\n                self._log(\n                    \"info\",\n                    \"ToolManager | Tool Definitions Success\",\n                    f\"Successfully obtained {len(one_server_for_prompt['tools'])} tool definitions from server '{server_name}'.\",\n                )\n                all_servers_for_prompt.append(one_server_for_prompt)\n\n            except Exception as e:\n                self._log(\n                    
\"error\",\n                    \"ToolManager | Connection Error\",\n                    f\"Error: Unable to connect or get tools from server '{server_name}': {e}\",\n                )\n                # Still add server entry, but mark tool list as empty or include error information\n                one_server_for_prompt[\"tools\"] = [\n                    {\"error\": f\"Unable to fetch tools: {e}\"}\n                ]\n                all_servers_for_prompt.append(one_server_for_prompt)\n\n        return all_servers_for_prompt\n\n    @with_timeout(1200)\n    async def execute_tool_call(self, server_name, tool_name, arguments) -> Any:\n        \"\"\"\n        Execute a single tool call.\n        :param server_name: Server name\n        :param tool_name: Tool name\n        :param arguments: Tool arguments dictionary\n        :return: Dictionary containing result or error\n        \"\"\"\n\n        # Original remote server call logic\n        server_params = self.get_server_params(server_name)\n        if not server_params:\n            self._log(\n                \"error\",\n                \"ToolManager | Server Not Found\",\n                f\"Error: Attempting to call server '{server_name}' not found\",\n            )\n            return {\n                \"server_name\": server_name,\n                \"tool_name\": tool_name,\n                \"error\": f\"Server '{server_name}' not found.\",\n            }\n\n        self._log(\n            \"info\",\n            \"ToolManager | Tool Call Start\",\n            f\"Connecting to server '{server_name}' to call tool '{tool_name}'\",\n            metadata={\"arguments\": arguments},\n        )\n\n        if server_name == \"playwright\":\n            try:\n                if self.browser_session is None:\n                    self.browser_session = PlaywrightSession(server_params)\n                    await self.browser_session.connect()\n                tool_result = await self.browser_session.call_tool(\n         
           tool_name, arguments=arguments\n                )\n                return {\n                    \"server_name\": server_name,\n                    \"tool_name\": tool_name,\n                    \"result\": tool_result,\n                }\n            except Exception as e:\n                return {\n                    \"server_name\": server_name,\n                    \"tool_name\": tool_name,\n                    \"error\": f\"Tool call failed: {str(e)}\",\n                }\n        else:\n            try:\n                result_content = None\n                if isinstance(server_params, StdioServerParameters):\n                    async with stdio_client(server_params) as (read, write):\n                        async with ClientSession(\n                            read, write, sampling_callback=None\n                        ) as session:\n                            await session.initialize()\n                            try:\n                                tool_result = await session.call_tool(\n                                    tool_name, arguments=arguments\n                                )\n                                result_content = (\n                                    tool_result.content[-1].text\n                                    if tool_result.content\n                                    else \"\"\n                                )\n                                # post hoc check for browsing agent reading answers from hf datsets\n                                if self._should_block_hf_scraping(tool_name, arguments):\n                                    result_content = \"You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose.\"\n                            except Exception as tool_error:\n                                self._log(\n                                    \"error\",\n                                    \"ToolManager | Tool Execution Error\",\n             
                       f\"Tool execution error: {tool_error}\",\n                                )\n                                return {\n                                    \"server_name\": server_name,\n                                    \"tool_name\": tool_name,\n                                    \"error\": f\"Tool execution failed: {str(tool_error)}\",\n                                }\n                elif isinstance(server_params, str) and server_params.startswith(\n                    (\"http://\", \"https://\")\n                ):\n                    async with sse_client(server_params) as (read, write):\n                        async with ClientSession(\n                            read, write, sampling_callback=None\n                        ) as session:\n                            await session.initialize()\n                            try:\n                                tool_result = await session.call_tool(\n                                    tool_name, arguments=arguments\n                                )\n                                result_content = (\n                                    tool_result.content[-1].text\n                                    if tool_result.content\n                                    else \"\"\n                                )\n                                # post hoc check for browsing agent reading answers from hf datsets\n                                if self._should_block_hf_scraping(tool_name, arguments):\n                                    result_content = \"You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose.\"\n                            except Exception as tool_error:\n                                self._log(\n                                    \"error\",\n                                    \"ToolManager | Tool Execution Error\",\n                                    f\"Tool execution error: {tool_error}\",\n                    
            )\n                                return {\n                                    \"server_name\": server_name,\n                                    \"tool_name\": tool_name,\n                                    \"error\": f\"Tool execution failed: {str(tool_error)}\",\n                                }\n                else:\n                    raise TypeError(\n                        f\"Unknown server params type for {server_name}: {type(server_params)}\"\n                    )\n\n                self._log(\n                    \"info\",\n                    \"ToolManager | Tool Call Success\",\n                    f\"Tool '{tool_name}' (server: '{server_name}') called successfully.\",\n                )\n\n                return {\n                    \"server_name\": server_name,\n                    \"tool_name\": tool_name,\n                    \"result\": result_content,  # Return extracted text content\n                }\n\n            except Exception as outer_e:  # Rename this to outer_e to avoid shadowing\n                self._log(\n                    \"error\",\n                    \"ToolManager | Tool Call Failed\",\n                    f\"Error: Failed to call tool '{tool_name}' (server: '{server_name}'): {outer_e}\",\n                )\n\n                # Store the original error message for later use\n                error_message = str(outer_e)\n\n                if (\n                    tool_name in [\"scrape\", \"scrape_website\"]\n                    and \"unhandled errors\" in error_message\n                    and \"url\" in arguments\n                    and arguments[\"url\"] is not None\n                ):\n                    try:\n                        self._log(\n                            \"info\",\n                            \"ToolManager | Fallback Attempt\",\n                            \"Attempting fallback using MarkItDown...\",\n                        )\n                        from markitdown import 
MarkItDown\n\n                        md = MarkItDown(\n                            docintel_endpoint=\"<document_intelligence_endpoint>\"\n                        )\n                        result = md.convert(arguments[\"url\"])\n                        self._log(\n                            \"info\",\n                            \"ToolManager | Fallback Success\",\n                            \"MarkItDown fallback successful\",\n                        )\n                        return {\n                            \"server_name\": server_name,\n                            \"tool_name\": tool_name,\n                            \"result\": result.text_content,  # Return extracted text content\n                        }\n                    except (\n                        Exception\n                    ) as inner_e:  # Use a different name to avoid shadowing\n                        # Log the inner exception if needed\n                        self._log(\n                            \"error\",\n                            \"ToolManager | Fallback Failed\",\n                            f\"Fallback also failed: {inner_e}\",\n                        )\n                        # No need for pass here as we'll continue to the return statement\n\n                # Always use the outer exception for the final error response\n                return {\n                    \"server_name\": server_name,\n                    \"tool_name\": tool_name,\n                    \"error\": f\"Tool call failed: {error_message}\",\n                }\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/mcp_servers/__init__.py",
    "content": ""
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/mcp_servers/audio_mcp_server.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport asyncio\nimport base64\nimport contextlib\nimport mimetypes\nimport os\nimport tempfile\nimport wave\nfrom urllib.parse import urlparse\n\nimport requests\nfrom fastmcp import FastMCP\nfrom mutagen import File as MutagenFile\nfrom openai import OpenAI\n\nOPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\")\nOPENAI_BASE_URL = os.environ.get(\"OPENAI_BASE_URL\", \"https://api.openai.com/v1\")\n\n# Initialize FastMCP server\nmcp = FastMCP(\"audio-mcp-server\")\n\n\ndef _get_audio_extension(url: str, content_type: str = None) -> str:\n    \"\"\"\n    Determine the appropriate audio file extension from URL or content type.\n\n    Args:\n        url: The URL of the audio file\n        content_type: The content type from HTTP headers\n\n    Returns:\n        File extension (with dot) to use for temporary file\n    \"\"\"\n    # First try to get extension from URL\n    parsed_url = urlparse(url)\n    path = parsed_url.path.lower()\n\n    # Common audio extensions\n    audio_extensions = [\".mp3\", \".wav\", \".m4a\", \".aac\", \".ogg\", \".flac\", \".wma\"]\n    for ext in audio_extensions:\n        if path.endswith(ext):\n            return ext\n\n    # If no extension found in URL, try content type\n    if content_type:\n        content_type = content_type.lower()\n        if \"mp3\" in content_type or \"mpeg\" in content_type:\n            return \".mp3\"\n        elif \"wav\" in content_type:\n            return \".wav\"\n        elif \"m4a\" in content_type:\n            return \".m4a\"\n        elif \"aac\" in content_type:\n            return \".aac\"\n        elif \"ogg\" in content_type:\n            return \".ogg\"\n        elif \"flac\" in content_type:\n            return \".flac\"\n\n    # Default fallback to mp3\n    return \".mp3\"\n\n\ndef _get_audio_duration(audio_path: str) -> float:\n    \"\"\"\n    Get audio duration in seconds.\n\n    
Tries to use wave (for .wav), then falls back to mutagen (for mp3, etc).\n    Returns 0.0 if duration cannot be determined.\n    \"\"\"\n    # Try using wave for .wav files\n    try:\n        with contextlib.closing(wave.open(audio_path, \"rb\")) as f:\n            frames = f.getnframes()\n            rate = f.getframerate()\n            duration = frames / float(rate)\n            if duration > 0:\n                return duration\n    except Exception:\n        pass  # Not a wav file or failed\n\n    # Try using mutagen for other audio formats (mp3, etc)\n    try:\n        audio = MutagenFile(audio_path)\n        if (\n            audio is not None\n            and hasattr(audio, \"info\")\n            and hasattr(audio.info, \"length\")\n        ):\n            duration = float(audio.info.length)\n            if duration > 0:\n                return duration\n    except Exception:\n        pass  # Failed to get duration\n\n    # Return 0.0 if all methods failed\n    return 0.0\n\n\ndef _encode_audio_file(audio_path: str) -> tuple[str, str]:\n    \"\"\"Encode audio file to base64 and determine format.\"\"\"\n    with open(audio_path, \"rb\") as audio_file:\n        audio_data = audio_file.read()\n        encoded_string = base64.b64encode(audio_data).decode(\"utf-8\")\n\n    # Determine file format from file extension\n    mime_type, _ = mimetypes.guess_type(audio_path)\n    if mime_type and mime_type.startswith(\"audio/\"):\n        mime_format = mime_type.split(\"/\")[-1]\n        # Map MIME type formats to OpenAI supported formats\n        format_mapping = {\n            \"mpeg\": \"mp3\",  # audio/mpeg -> mp3\n            \"wav\": \"wav\",  # audio/wav -> wav\n            \"wave\": \"wav\",  # audio/wave -> wav\n        }\n        file_format = format_mapping.get(mime_format, \"mp3\")\n    else:\n        # Default to mp3 if we can't determine\n        file_format = \"mp3\"\n\n    return encoded_string, file_format\n\n\n@mcp.tool()\nasync def 
audio_transcription(audio_path_or_url: str) -> str:\n    \"\"\"\n    Transcribe audio file to text and return the transcription.\n    Args:\n        audio_path_or_url: The path of the audio file locally or its URL. Path from sandbox is not supported. YouTube URL is not supported.\n\n    Returns:\n        The transcription of the audio file.\n    \"\"\"\n    max_retries = 3\n    retry = 0\n    transcription = None\n\n    # Create client once outside the retry loop\n    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)\n\n    while retry < max_retries:\n        try:\n            if os.path.exists(audio_path_or_url):  # Check if the file exists locally\n                with open(audio_path_or_url, \"rb\") as audio_file:\n                    transcription = client.audio.transcriptions.create(\n                        model=\"gpt-4o-transcribe\", file=audio_file\n                    )\n            elif \"home/user\" in audio_path_or_url:\n                return \"[ERROR]: The audio_transcription tool cannot access to sandbox file, please use the local path provided by original instruction\"\n            else:\n                # download the audio file from the URL\n                response = requests.get(audio_path_or_url)\n                response.raise_for_status()  # Raise an exception for bad status codes\n\n                # Basic content validation - check if response has content\n                if not response.content:\n                    return (\n                        \"[ERROR]: Audio transcription failed: Downloaded file is empty\"\n                    )\n\n                # Check content type if available\n                content_type = response.headers.get(\"content-type\", \"\").lower()\n\n                # Get proper extension for the temporary file\n                file_extension = _get_audio_extension(audio_path_or_url, content_type)\n\n                # Use proper temporary file handling with correct extension\n                with 
tempfile.NamedTemporaryFile(\n                    delete=False, suffix=file_extension\n                ) as temp_file:\n                    temp_file.write(response.content)\n                    temp_audio_path = temp_file.name\n\n                try:\n                    with open(temp_audio_path, \"rb\") as audio_file:\n                        transcription = client.audio.transcriptions.create(\n                            model=\"gpt-4o-transcribe\", file=audio_file\n                        )\n                finally:\n                    # Clean up the temp file\n                    if os.path.exists(temp_audio_path):\n                        os.remove(temp_audio_path)\n            break\n\n        except requests.RequestException as e:\n            retry += 1\n            if retry >= max_retries:\n                return f\"[ERROR]: Audio transcription failed: Failed to download audio file - {e}.\\nNote: Files from sandbox are not available. You should use local path given in the instruction. \\nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3, WAV, or M4A.\\nNote: YouTube video URL is not supported.\"\n            await asyncio.sleep(5 * (2**retry))\n        except Exception as e:\n            retry += 1\n            if retry >= max_retries:\n                return f\"[ERROR]: Audio transcription failed: {e}\\nNote: Files from sandbox are not available. You should use local path given in the instruction. The file should be in a common audio format such as MP3, WAV, or M4A.\\nNote: YouTube video URL is not supported.\"\n            await asyncio.sleep(5 * (2**retry))\n\n    return transcription.text\n\n\n@mcp.tool()\nasync def audio_question_answering(audio_path_or_url: str, question: str) -> str:\n    \"\"\"\n    Answer the question based on the given audio information.\n\n    Args:\n        audio_path_or_url: The path of the audio file locally or its URL. 
Path from sandbox is not supported. YouTube URL is not supported.\n        question: The question to answer.\n\n    Returns:\n        The answer to the question, and the duration of the audio file.\n    \"\"\"\n    max_retries = 3\n    retry = 0\n\n    # Create client once outside the retry loop\n    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)\n\n    # Initialize variables to avoid scope issues\n    encoded_string = None\n    file_format = None\n    duration = 0.0\n\n    while retry < max_retries:\n        try:\n            text_prompt = f\"\"\"Answer the following question based on the given \\\n            audio information:\\n\\n{question}\"\"\"\n\n            if os.path.exists(audio_path_or_url):  # Check if the file exists locally\n                encoded_string, file_format = _encode_audio_file(audio_path_or_url)\n                duration = _get_audio_duration(audio_path_or_url)\n            elif \"home/user\" in audio_path_or_url:\n                return \"[ERROR]: The audio_question_answering tool cannot access to sandbox file, please use the local path provided by original instruction\"\n            else:\n                # download the audio file from the URL\n                response = requests.get(\n                    audio_path_or_url,\n                    headers={\n                        \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36\"\n                    },\n                )\n                response.raise_for_status()  # Raise an exception for bad status codes\n\n                # Basic content validation - check if response has content\n                if not response.content:\n                    return \"[ERROR]: Audio question answering failed: Downloaded file is empty.\\nNote: Files from sandbox are not available. You should use local path given in the instruction. 
\\nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3.\\nNote: YouTube video URL is not supported.\"\n\n                # Check content type if available\n                content_type = response.headers.get(\"content-type\", \"\").lower()\n\n                # Get proper extension for the temporary file\n                file_extension = _get_audio_extension(audio_path_or_url, content_type)\n\n                # Use proper temporary file handling with correct extension\n                with tempfile.NamedTemporaryFile(\n                    delete=False, suffix=file_extension\n                ) as temp_file:\n                    temp_file.write(response.content)\n                    temp_audio_path = temp_file.name\n\n                try:\n                    encoded_string, file_format = _encode_audio_file(temp_audio_path)\n                    duration = _get_audio_duration(temp_audio_path)\n                finally:\n                    # Clean up the temp file\n                    if os.path.exists(temp_audio_path):\n                        os.remove(temp_audio_path)\n\n            if encoded_string is None or file_format is None:\n                return \"[ERROR]: Audio question answering failed: Failed to encode audio file.\\nNote: Files from sandbox are not available. You should use local path given in the instruction. \\nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. 
The file should be in a common audio format such as MP3.\\nNote: YouTube video URL is not supported.\"\n\n            response = client.chat.completions.create(\n                model=\"gpt-4o-audio-preview\",\n                messages=[\n                    {\n                        \"role\": \"system\",\n                        \"content\": \"You are a helpful assistant specializing in audio analysis.\",\n                    },\n                    {\n                        \"role\": \"user\",\n                        \"content\": [\n                            {\"type\": \"text\", \"text\": text_prompt},\n                            {\n                                \"type\": \"input_audio\",\n                                \"input_audio\": {\n                                    \"data\": encoded_string,\n                                    \"format\": file_format,\n                                },\n                            },\n                        ],\n                    },\n                ],\n            )\n\n            # If we reach here, the API call was successful\n            break\n\n        except requests.RequestException as e:\n            retry += 1\n            if retry >= max_retries:\n                return f\"[ERROR]: Audio question answering failed: Failed to download audio file - {e}.\\nNote: Files from sandbox are not available. You should use local path given in the instruction. \\nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3, WAV, or M4A.\\nNote: YouTube video URL is not supported.\"\n            await asyncio.sleep(5 * (2**retry))\n        except Exception as e:\n            retry += 1\n            if retry >= max_retries:\n                return f\"[ERROR]: Audio question answering failed when calling OpenAI API: {e}\\nNote: Files from sandbox are not available. You should use local path given in the instruction. 
The file should be in a common audio format such as MP3, WAV, or M4A.\\nNote: YouTube video URL is not supported.\"\n            await asyncio.sleep(5 * (2**retry))\n\n    response_text = response.choices[0].message.content\n    response_text += f\"\\n\\nAudio duration: {duration} seconds\"\n\n    return response_text\n\n\nif __name__ == \"__main__\":\n    mcp.run(transport=\"stdio\")\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/mcp_servers/audio_mcp_server_os.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport asyncio\nimport base64\nimport contextlib\nimport mimetypes\nimport os\nimport tempfile\nimport wave\nfrom urllib.parse import urlparse\n\nimport requests\nfrom fastmcp import FastMCP\nfrom mutagen import File as MutagenFile\nfrom openai import OpenAI\n\nWHISPER_API_KEY = os.environ.get(\"WHISPER_API_KEY\")\nWHISPER_BASE_URL = os.environ.get(\"WHISPER_BASE_URL\")\nWHISPER_MODEL_NAME = os.environ.get(\"WHISPER_MODEL_NAME\")\n\n# Initialize FastMCP server\nmcp = FastMCP(\"audio-mcp-server-os\")\n\n\ndef _get_audio_extension(url: str, content_type: str = None) -> str:\n    \"\"\"\n    Determine the appropriate audio file extension from URL or content type.\n\n    Args:\n        url: The URL of the audio file\n        content_type: The content type from HTTP headers\n\n    Returns:\n        File extension (with dot) to use for temporary file\n    \"\"\"\n    # First try to get extension from URL\n    parsed_url = urlparse(url)\n    path = parsed_url.path.lower()\n\n    # Common audio extensions\n    audio_extensions = [\".mp3\", \".wav\", \".m4a\", \".aac\", \".ogg\", \".flac\", \".wma\"]\n    for ext in audio_extensions:\n        if path.endswith(ext):\n            return ext\n\n    # If no extension found in URL, try content type\n    if content_type:\n        content_type = content_type.lower()\n        if \"mp3\" in content_type or \"mpeg\" in content_type:\n            return \".mp3\"\n        elif \"wav\" in content_type:\n            return \".wav\"\n        elif \"m4a\" in content_type:\n            return \".m4a\"\n        elif \"aac\" in content_type:\n            return \".aac\"\n        elif \"ogg\" in content_type:\n            return \".ogg\"\n        elif \"flac\" in content_type:\n            return \".flac\"\n\n    # Default fallback to mp3\n    return \".mp3\"\n\n\ndef _get_audio_duration(audio_path: str) -> float:\n    \"\"\"\n    Get 
audio duration in seconds.\n\n    Tries to use wave (for .wav), then falls back to mutagen (for mp3, etc).\n    \"\"\"\n    # Try using wave for .wav files\n    try:\n        with contextlib.closing(wave.open(audio_path, \"rb\")) as f:\n            frames = f.getnframes()\n            rate = f.getframerate()\n            duration = frames / float(rate)\n            if duration > 0:\n                return duration\n    except Exception:\n        pass  # Not a wav file or failed\n\n    # Try using mutagen for other audio formats (mp3, etc)\n    try:\n        audio = MutagenFile(audio_path)\n        if (\n            audio is not None\n            and hasattr(audio, \"info\")\n            and hasattr(audio.info, \"length\")\n        ):\n            duration = float(audio.info.length)\n            if duration > 0:\n                return duration\n    except Exception as e:\n        return f\"[ERROR]: Failed to get audio duration: {e}\"\n\n\ndef _encode_audio_file(audio_path: str) -> tuple[str, str]:\n    \"\"\"Encode audio file to base64 and determine format.\"\"\"\n    with open(audio_path, \"rb\") as audio_file:\n        audio_data = audio_file.read()\n        encoded_string = base64.b64encode(audio_data).decode(\"utf-8\")\n\n    # Determine file format from file extension\n    mime_type, _ = mimetypes.guess_type(audio_path)\n    if mime_type and mime_type.startswith(\"audio/\"):\n        mime_format = mime_type.split(\"/\")[-1]\n        # Map MIME type formats to OpenAI supported formats\n        format_mapping = {\n            \"mpeg\": \"mp3\",  # audio/mpeg -> mp3\n            \"wav\": \"wav\",  # audio/wav -> wav\n            \"wave\": \"wav\",  # audio/wave -> wav\n        }\n        file_format = format_mapping.get(mime_format, \"mp3\")\n    else:\n        # Default to mp3 if we can't determine\n        file_format = \"mp3\"\n\n    return encoded_string, file_format\n\n\n@mcp.tool()\nasync def audio_transcription(audio_path_or_url: str) -> str:\n    \"\"\"\n 
   Transcribe audio file to text and return the transcription.\n    Args:\n        audio_path_or_url: The path of the audio file locally or its URL. Path from sandbox is not supported. YouTube URL is not supported.\n\n    Returns:\n        The transcription of the audio file.\n    \"\"\"\n    max_retries = 3\n    retry = 0\n    transcription = None\n\n    while retry < max_retries:\n        try:\n            client = OpenAI(base_url=WHISPER_BASE_URL, api_key=WHISPER_API_KEY)\n            if os.path.exists(audio_path_or_url):  # Check if the file exists locally\n                with open(audio_path_or_url, \"rb\") as audio_file:\n                    transcription = client.audio.transcriptions.create(\n                        model=WHISPER_MODEL_NAME, file=audio_file\n                    )\n            elif \"home/user\" in audio_path_or_url:\n                return \"[ERROR]: The audio_transcription tool cannot access to sandbox file, please use the local path provided by original instruction\"\n            else:\n                # download the audio file from the URL\n                response = requests.get(audio_path_or_url)\n                response.raise_for_status()  # Raise an exception for bad status codes\n\n                # Basic content validation - check if response has content\n                if not response.content:\n                    return (\n                        \"[ERROR]: Audio transcription failed: Downloaded file is empty\"\n                    )\n\n                # Check content type if available\n                content_type = response.headers.get(\"content-type\", \"\").lower()\n                if content_type and not any(\n                    media_type in content_type\n                    for media_type in [\"audio\", \"video\", \"application/octet-stream\"]\n                ):\n                    return f\"[ERROR]: Audio transcription failed: Invalid content type '{content_type}'. 
Expected audio file.\"\n\n                # Get proper extension for the temporary file\n                file_extension = _get_audio_extension(audio_path_or_url, content_type)\n\n                # Use proper temporary file handling with correct extension\n                with tempfile.NamedTemporaryFile(\n                    delete=False, suffix=file_extension\n                ) as temp_file:\n                    temp_file.write(response.content)\n                    temp_audio_path = temp_file.name\n\n                try:\n                    with open(temp_audio_path, \"rb\") as audio_file:\n                        transcription = client.audio.transcriptions.create(\n                            model=WHISPER_MODEL_NAME, file=audio_file\n                        )\n                finally:\n                    # Clean up the temp file\n                    if os.path.exists(temp_audio_path):\n                        os.remove(temp_audio_path)\n            break\n\n        except requests.RequestException as e:\n            retry += 1\n            if retry >= max_retries:\n                return f\"[ERROR]: Audio transcription failed: Failed to download audio file - {e}.\\nNote: Files from sandbox are not available. You should use local path given in the instruction. \\nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3, WAV, or M4A.\\nNote: YouTube video URL is not supported.\"\n            await asyncio.sleep(5 * (2**retry))\n        except Exception as e:\n            retry += 1\n            if retry >= max_retries:\n                return f\"[ERROR]: Audio transcription failed: {e}\\nNote: Files from sandbox are not available. You should use local path given in the instruction. 
The file should be in a common audio format such as MP3, WAV, or M4A.\\nNote: YouTube video URL is not supported.\"\n            await asyncio.sleep(5 * (2**retry))\n\n    return transcription.text\n\n\nif __name__ == \"__main__\":\n    mcp.run(transport=\"stdio\")\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/mcp_servers/browser_session.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport asyncio\nimport json\nimport logging\n\nfrom mcp import StdioServerParameters\nfrom mcp.client.session import ClientSession\nfrom mcp.client.sse import sse_client\nfrom mcp.client.stdio import stdio_client\n\nlogger = logging.getLogger(\"miroflow\")\n\n\nclass PlaywrightSession:\n    \"\"\"Class to maintain a persistent Playwright MCP session.\"\"\"\n\n    def __init__(self, server_params):\n        self.server_params = server_params\n        self.read = None\n        self.write = None\n        self.session = None\n        self._client = None\n\n    async def connect(self):\n        \"\"\"Connect to the MCP server and initialize the session.\"\"\"\n        if self.session is None:\n            if isinstance(self.server_params, StdioServerParameters):\n                self._client = stdio_client(self.server_params)\n            else:\n                self._client = sse_client(self.server_params)\n            self.read, self.write = await self._client.__aenter__()\n            self.session = ClientSession(self.read, self.write, sampling_callback=None)\n            await self.session.__aenter__()\n            await self.session.initialize()\n            logger.info(\"Connected to MCP server and initialized session\")\n\n    async def call_tool(self, tool_name, arguments=None):\n        \"\"\"Call a tool while maintaining the session.\"\"\"\n        if self.session is None:\n            await self.connect()\n\n        logger.info(f\"Calling tool '{tool_name}'\")\n        tool_result = await self.session.call_tool(tool_name, arguments=arguments)\n        result_content = tool_result.content[0].text if tool_result.content else \"\"\n        return result_content\n\n    async def close(self):\n        \"\"\"Close the session and connection.\"\"\"\n        if self.session:\n            await self.session.__aexit__(None, None, None)\n            self.session 
= None\n\n        if self._client:\n            await self._client.__aexit__(None, None, None)\n            self._client = None\n            self.read = None\n            self.write = None\n            logger.info(\"Closed MCP session\")\n\n\n# Example usage:\nasync def test_persistent_session():\n    # Create a persistent session\n    mcp_session = PlaywrightSession(\"http://localhost:8931\")\n\n    try:\n        # First call: Navigate to a website\n        await mcp_session.call_tool(\"browser_navigate\", {\"url\": \"https://example.com\"})\n        logger.info(\"Navigation complete\")\n\n        # Wait a moment for the page to load\n        await asyncio.sleep(2)\n\n        # Second call: Take a snapshot of the current page\n        snapshot_result = await mcp_session.call_tool(\"browser_snapshot\", {})\n\n        # Process and save the snapshot\n        snapshot_json = json.loads(snapshot_result)\n        logger.info(f\"Snapshot taken of page: {snapshot_json.get('url')}\")\n        logger.info(f\"Page title: {snapshot_json.get('title')}\")\n\n        with open(\"snapshot.json\", \"w\") as f:\n            json.dump(snapshot_json, f, indent=2, ensure_ascii=False)\n\n        logger.info(\"Snapshot saved to snapshot.json\")\n\n    finally:\n        # Close the session when done with all tool calls\n        await mcp_session.close()\n\n\nif __name__ == \"__main__\":\n    asyncio.run(test_persistent_session())\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/mcp_servers/python_mcp_server.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport asyncio\nimport os\nimport shlex\nfrom urllib.parse import urlparse\n\nfrom e2b_code_interpreter import Sandbox\nfrom fastmcp import FastMCP\n\n# Initialize FastMCP server\nmcp = FastMCP(\"e2b-python-interpreter\")\n\n# API keys\nE2B_API_KEY = os.environ.get(\"E2B_API_KEY\")\nLOGS_DIR = os.environ.get(\n    \"LOGS_DIR\", \"../../logs\"\n)  # Directory where benchmark logs are stored\n\n# DEFAULT TEMPLATE ID\nDEFAULT_TEMPLATE_ID = \"1av7fdjfvcparqo8efq6\"\n\n# DEFAULT CONFS\nDEFAULT_TIMEOUT = 600  # seconds\n# Maximum number of tokens that can be returned by the Python tool\nMAX_RESULT_LEN = 20_000\n# Maximum number of tokens allowed in an error message\nMAX_ERROR_LEN = 4_000\n# Invalid sandbox IDs that are not allowed to be used\nINVALID_SANDBOX_IDS = {\n    \"default\",\n    \"sandbox1\",\n    \"sandbox\",\n    \"some_id\",\n    \"new_sandbox\",\n    \"python\",\n    \"create_sandbox\",\n    \"sandbox123\",\n    \"temp\",\n    \"sandbox-0\",\n    \"sandbox-1\",\n    \"sandbox_0\",\n    \"sandbox_1\",\n    \"new\",\n    \"0\",\n    \"auto\",\n    \"default_sandbox\",\n    \"none\",\n    \"sandbox_12345\",\n    \"dummy\",\n    \"sandbox_01\",\n}\n\n\ndef looks_like_dir(path: str) -> bool:\n    \"\"\"\n    Return True if the given path either:\n      - exists and is a directory, OR\n      - does not exist but looks like a directory (e.g., ends with '/', or has no file extension)\n    \"\"\"\n    # If it exists, trust the filesystem\n    if os.path.isdir(path):\n        return True\n\n    # If it ends with '/' or has no extension, treat as directory\n    if path.endswith(os.path.sep) or not os.path.splitext(path)[1]:\n        return True\n\n    return False\n\n\ndef truncate_result(result: str) -> str:\n    \"\"\"\n    Truncate result to MAX_RESULT_LEN.\n\n    Args:\n        result: The full result string to potentially truncate\n\n    Returns:\n       
 Truncated result string\n    \"\"\"\n    if len(result) > MAX_RESULT_LEN:\n        result = result[:MAX_RESULT_LEN] + \" [Result truncated due to length limit]\"\n\n    return result\n\n\n@mcp.tool()\nasync def create_sandbox(timeout: int = DEFAULT_TIMEOUT) -> str:\n    \"\"\"Create a linux sandbox.\n\n    Args:\n        timeout: Time in seconds before the sandbox is automatically shutdown. The default is 600 seconds.\n\n    Returns:\n        The sandbox_id of the newly created sandbox. You should use this sandbox_id to run other tools in the sandbox.\n    \"\"\"\n    max_retries = 5\n    timeout = min(timeout, DEFAULT_TIMEOUT)\n    for attempt in range(1, max_retries + 1):\n        sandbox = None\n        try:\n            sandbox = Sandbox(\n                template=DEFAULT_TEMPLATE_ID,\n                timeout=timeout,\n                api_key=E2B_API_KEY,\n            )\n            info = sandbox.get_info()\n\n            tmpfiles_dir = os.path.join(LOGS_DIR, \"tmpfiles\")\n            os.makedirs(tmpfiles_dir, exist_ok=True)\n\n            return f\"Sandbox created with sandbox_id: {info.sandbox_id}\"\n        except Exception as e:\n            if attempt == max_retries:\n                error_details = str(e)[:MAX_ERROR_LEN]\n                return f\"[ERROR]: Failed to create sandbox after {max_retries} attempts: {error_details}, please retry later.\"\n            await asyncio.sleep(attempt**2)  # Exponential backoff\n        finally:\n            # Set timeout before exit to prevent timeout after function exits\n            try:\n                sandbox.set_timeout(timeout)\n            except Exception:\n                pass  # Ignore timeout setting errors\n\n\n@mcp.tool()\nasync def run_command(command: str, sandbox_id: str) -> str:\n    \"\"\"Execute a lightweight shell command in the linux sandbox (no long-running, blocking, or resource-heavy processes).\n\n    Args:\n        command: The command to execute.\n        sandbox_id: The id of the 
sandbox to execute the command in. To create a new sandbox, use tool `create_sandbox`.\n\n    Returns:\n        A CommandResult object containing the result of the command execution, format like CommandResult(stderr=..., stdout=..., exit_code=..., error=...)\n    \"\"\"\n    if sandbox_id in INVALID_SANDBOX_IDS:\n        return f\"[ERROR]: '{sandbox_id}' is not a valid sandbox_id. Please create a real sandbox first using the create_sandbox tool.\"\n\n    try:\n        sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY)\n    except Exception:\n        return f\"[ERROR]: Failed to connect to sandbox {sandbox_id}. Make sure the sandbox is created and the sandbox_id is correct.\"\n\n    max_retries = 3\n    for attempt in range(1, max_retries + 1):\n        try:\n            sandbox.set_timeout(\n                DEFAULT_TIMEOUT\n            )  # refresh the timeout for each command execution\n            result = sandbox.commands.run(command)\n\n            result_str = str(result)\n            return truncate_result(result_str)\n        except Exception as e:\n            if attempt == max_retries:\n                # Build error message\n                error_details = str(e)[:MAX_ERROR_LEN]\n                error_msg = f\"[ERROR]: Failed to run command after {max_retries} attempts.\\n\\nException type: {type(e).__name__}\\nDetails: {error_details}\"\n                return error_msg\n            await asyncio.sleep(attempt**2)  # Exponential backoff\n        finally:\n            # Set timeout before exit to prevent timeout after function exits\n            try:\n                sandbox.set_timeout(DEFAULT_TIMEOUT)\n            except Exception:\n                pass  # Ignore timeout setting errors\n\n\n@mcp.tool()\nasync def run_python_code(code_block: str, sandbox_id: str) -> str:\n    \"\"\"Run short, safe python code in a sandbox and return the execution result (avoid long loops or heavy tasks; must finish quickly).\n\n    Args:\n        code_block: The 
python code to run.\n        sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. To create a new sandbox, use tool `create_sandbox`.\n\n    Returns:\n        A CommandResult object containing the result of the command execution, format like CommandResult(stderr=..., stdout=..., exit_code=..., error=...)\n    \"\"\"\n    # If sandbox_id is invalid, fallback to stateless execution\n    if not sandbox_id or sandbox_id in INVALID_SANDBOX_IDS:\n        try:\n            sandbox = Sandbox(\n                template=DEFAULT_TEMPLATE_ID,\n                timeout=DEFAULT_TIMEOUT,\n                api_key=E2B_API_KEY,\n            )\n            try:\n                execution = sandbox.run_code(code_block)\n                return truncate_result(str(execution))\n            finally:\n                sandbox.kill()\n        except Exception as e:\n            error_details = str(e)[:MAX_ERROR_LEN]\n            return f\"[ERROR]: Failed to run code in stateless mode. Exception type: {type(e).__name__}, Details: {error_details}\"\n\n    try:\n        sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY)\n    except Exception:\n        return f\"[ERROR]: Failed to connect to sandbox {sandbox_id}. Make sure the sandbox is created and the sandbox_id is correct.\"\n\n    max_retries = 3\n    for attempt in range(1, max_retries + 1):\n        try:\n            sandbox.set_timeout(\n                DEFAULT_TIMEOUT\n            )  # refresh the timeout for each command execution\n\n            execution = sandbox.run_code(code_block)\n            result_str = str(execution)\n            return truncate_result(result_str)\n        except Exception as e:\n            if attempt == max_retries:\n                error_details = str(e)[:MAX_ERROR_LEN]\n                error_msg = f\"[ERROR]: Failed to run code in sandbox {sandbox_id} after {max_retries} attempts. 
Exception type: {type(e).__name__}, Details: {error_details}\"\n                return error_msg\n            await asyncio.sleep(attempt**2)  # Exponential backoff\n        finally:\n            # Set timeout before exit to prevent timeout after function exits\n            try:\n                sandbox.set_timeout(DEFAULT_TIMEOUT)\n            except Exception:\n                pass  # Ignore timeout setting errors\n\n\n@mcp.tool()\nasync def upload_file_from_local_to_sandbox(\n    sandbox_id: str, local_file_path: str, sandbox_file_path: str = \"/home/user\"\n) -> str:\n    \"\"\"Upload a local file to the `/home/user` dir of the remote python interpreter.\n\n    Args:\n        sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. To create a new sandbox, use tool `create_sandbox`.\n        local_file_path: The path of the file on local machine to upload.\n        sandbox_file_path: The path of directory to upload the file to in the sandbox. Default is `/home/user/`.\n\n    Returns:\n        The path of the uploaded file in the remote python interpreter if the upload is successful.\n    \"\"\"\n    if sandbox_id in INVALID_SANDBOX_IDS:\n        return f\"[ERROR]: '{sandbox_id}' is not a valid sandbox_id. Please create a real sandbox first using the create_sandbox tool.\"\n\n    try:\n        sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY)\n    except Exception:\n        return f\"[ERROR]: Failed to connect to sandbox {sandbox_id}. 
Make sure the sandbox is created and the sandbox_id is correct.\"\n\n    try:\n        sandbox.set_timeout(\n            DEFAULT_TIMEOUT\n        )  # refresh the timeout for each command execution\n\n        # Check if local file exists and is readable\n        if not os.path.exists(local_file_path):\n            return f\"[ERROR]: Local file does not exist: {local_file_path}\"\n        if not os.path.isfile(local_file_path):\n            return f\"[ERROR]: Path is not a file: {local_file_path}\"\n\n        # Get the uploaded file path\n        uploaded_file_path = os.path.join(\n            sandbox_file_path, os.path.basename(local_file_path)\n        )\n        # Normalize the path\n        uploaded_file_path = os.path.normpath(uploaded_file_path)\n\n        # Ensure the parent directory exists in sandbox\n        parent_dir = os.path.dirname(uploaded_file_path)\n        if parent_dir and parent_dir != \"/\":\n            mkdir_result = sandbox.commands.run(f\"mkdir -p {shlex.quote(parent_dir)}\")\n            if mkdir_result.exit_code != 0:\n                mkdir_result_str = str(mkdir_result)[:MAX_ERROR_LEN]\n                return f\"[ERROR]: Failed to create directory {parent_dir} in sandbox {sandbox_id}: {mkdir_result_str}\"\n\n        # Upload the file\n        with open(local_file_path, \"rb\") as f:\n            sandbox.files.write(uploaded_file_path, f)\n\n        return f\"File uploaded to {uploaded_file_path}\"\n    except Exception as e:\n        error_details = str(e)[:MAX_ERROR_LEN]\n        return f\"[ERROR]: Failed to upload file {local_file_path} to sandbox {sandbox_id}: {error_details}\"\n    finally:\n        # Set timeout before exit to prevent timeout after function exits\n        try:\n            sandbox.set_timeout(DEFAULT_TIMEOUT)\n        except Exception:\n            pass  # Ignore timeout setting errors\n\n\n@mcp.tool()\nasync def download_file_from_internet_to_sandbox(\n    sandbox_id: str, url: str, sandbox_file_path: str = 
\"/home/user\"\n) -> str:\n    \"\"\"Download a file from the internet to the `/home/user` dir of the sandbox (avoid large or slow URLs).\n\n    Args:\n        sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. To create a new sandbox, use tool `create_sandbox`.\n        url: The URL of the file to download.\n        sandbox_file_path: The path of directory to download the file to in the sandbox. Default is `/home/user/`.\n\n    Returns:\n        The path of the downloaded file in the sandbox if the download is successful.\n    \"\"\"\n    if sandbox_id in INVALID_SANDBOX_IDS:\n        return f\"[ERROR]: '{sandbox_id}' is not a valid sandbox_id. Please create a real sandbox first using the create_sandbox tool.\"\n\n    try:\n        sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY)\n    except Exception:\n        return f\"[ERROR]: Failed to connect to sandbox {sandbox_id}. Make sure the sandbox is created and the sandbox_id is correct.\"\n\n    try:\n        sandbox.set_timeout(\n            DEFAULT_TIMEOUT\n        )  # refresh the timeout for each command execution\n\n        # Extract basename from URL properly (handle query parameters)\n        parsed_url = urlparse(url)\n        basename = os.path.basename(parsed_url.path) or \"downloaded_file\"\n        # Remove any query parameters or fragments from basename\n        if \"?\" in basename:\n            basename = basename.split(\"?\")[0]\n        if \"#\" in basename:\n            basename = basename.split(\"#\")[0]\n\n        # Check whether sandbox_file_path looks like a directory\n        if looks_like_dir(sandbox_file_path):\n            # It's a directory — join with the filename\n            downloaded_file_path = os.path.join(sandbox_file_path, basename)\n        else:\n            # It's a file path — use it directly\n            downloaded_file_path = sandbox_file_path\n\n        # Normalize the path\n        downloaded_file_path = 
os.path.normpath(downloaded_file_path)\n\n        # Ensure the parent directory exists in sandbox\n        parent_dir = os.path.dirname(downloaded_file_path)\n        if parent_dir and parent_dir != \"/\":\n            mkdir_result = sandbox.commands.run(f\"mkdir -p {shlex.quote(parent_dir)}\")\n            if mkdir_result.exit_code != 0:\n                mkdir_result_str = str(mkdir_result)[:MAX_ERROR_LEN]\n                return f\"[ERROR]: Failed to create directory {parent_dir} in sandbox {sandbox_id}: {mkdir_result_str}\"\n\n        # Download the file with retry logic\n        max_retries = 3\n        for attempt in range(1, max_retries + 1):\n            safe_url = shlex.quote(url)\n            safe_path = shlex.quote(downloaded_file_path)\n            cmd = f\"wget {safe_url} -O {safe_path}\"\n            try:\n                result = sandbox.commands.run(cmd)\n                if result.exit_code == 0:\n                    return f\"File downloaded to {safe_path}\"\n                elif attempt < max_retries:\n                    await asyncio.sleep(4**attempt)\n                    continue  # Retry\n                else:\n                    # Extract detailed error information\n                    error_details = \"\"\n                    if hasattr(result, \"stderr\") and result.stderr:\n                        error_details = f\"stderr: {result.stderr}\"[:MAX_ERROR_LEN]\n                    error_msg = (\n                        f\"[ERROR]: Failed to download file from {url} to {downloaded_file_path} after {max_retries} attempts.\\n\\n\"\n                        f\"exit_code: {result.exit_code}\\n\\n\"\n                        f\"Details: {error_details}\"\n                    )\n                    return error_msg\n            except Exception as e:\n                if attempt == max_retries:\n                    error_details = str(e)[:MAX_ERROR_LEN]\n                    error_msg = f\"[ERROR]: Failed to download file from {url} to 
{downloaded_file_path}. Exception: {error_details}\"\n                    return error_msg\n                await asyncio.sleep(4**attempt)\n    except Exception as e:\n        error_details = str(e)[:MAX_ERROR_LEN]\n        return f\"[ERROR]: Failed to download file from {url}: {error_details}\"\n    finally:\n        # Set timeout before exit to prevent timeout after function exits\n        try:\n            sandbox.set_timeout(DEFAULT_TIMEOUT)\n        except Exception:\n            pass  # Ignore timeout setting errors\n\n\n@mcp.tool()\nasync def download_file_from_sandbox_to_local(\n    sandbox_id: str, sandbox_file_path: str, local_filename: str = None\n) -> str:\n    \"\"\"Download a file from the sandbox to local system. Files in sandbox cannot be processed by tools from other servers - only local files and internet URLs can be processed by them.\n\n    Args:\n        sandbox_id: The id of the sandbox to download the file from. To have a sandbox, use tool `create_sandbox`.\n        sandbox_file_path: The path of the file to download on the sandbox.\n        local_filename: Optional filename to save as. If not provided, uses the original filename from sandbox_file_path.\n\n    Returns:\n        The local path of the downloaded file if successful, otherwise error message.\n    \"\"\"\n    if sandbox_id in INVALID_SANDBOX_IDS:\n        return f\"[ERROR]: '{sandbox_id}' is not a valid sandbox_id. Please create a real sandbox first using the create_sandbox tool.\"\n\n    try:\n        sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY)\n    except Exception:\n        return f\"[ERROR]: Failed to connect to sandbox {sandbox_id}. 
Make sure the sandbox is created and the sandbox_id is correct.\"\n\n    try:\n        sandbox.set_timeout(\n            DEFAULT_TIMEOUT\n        )  # refresh the timeout for each command execution\n\n        # Create tmpfiles directory if it doesn't exist\n        if not LOGS_DIR:\n            return \"[ERROR]: LOGS_DIR environment variable is not set. Cannot determine where to save the file.\"\n\n        tmpfiles_dir = os.path.join(LOGS_DIR, \"tmpfiles\")\n        os.makedirs(tmpfiles_dir, exist_ok=True)\n\n        # Check if the path is a directory (before attempting to read)\n        check_result = sandbox.commands.run(\n            f'test -d {shlex.quote(sandbox_file_path)} && echo \"is_directory\" || echo \"not_directory\"'\n        )\n        if check_result.stdout and \"is_directory\" in check_result.stdout:\n            return f\"[ERROR]: Cannot download '{sandbox_file_path}' from sandbox {sandbox_id}: path is a directory, not a file.\"\n\n        # Check if the file exists\n        check_file_result = sandbox.commands.run(\n            f'test -f {shlex.quote(sandbox_file_path)} && echo \"exists\" || echo \"not_exists\"'\n        )\n        if check_file_result.stdout and \"not_exists\" in check_file_result.stdout:\n            # Check if it exists at all (might be a symlink or other type)\n            check_any_result = sandbox.commands.run(\n                f'test -e {shlex.quote(sandbox_file_path)} && echo \"exists\" || echo \"not_exists\"'\n            )\n            if check_any_result.stdout and \"not_exists\" in check_any_result.stdout:\n                error_msg = f\"[ERROR]: Cannot download '{sandbox_file_path}' from sandbox {sandbox_id}: file does not exist.\"\n                return error_msg\n\n        # Determine local filename\n        if local_filename is None or local_filename.strip() == \"\":\n            local_filename = os.path.basename(sandbox_file_path)\n            # If basename is empty or just '/', use a default name\n            if 
not local_filename or local_filename == \"/\":\n                local_filename = \"downloaded_file\"\n\n        local_file_path = os.path.join(\n            tmpfiles_dir, f\"sandbox_{sandbox_id}_{local_filename}\"\n        )\n\n        # Download the file\n        try:\n            with open(local_file_path, \"wb\") as f:\n                content = sandbox.files.read(sandbox_file_path, format=\"bytes\")\n                f.write(content)\n        except Exception as read_error:\n            error_msg = str(read_error).lower()\n            if \"directory\" in error_msg or \"is a directory\" in error_msg:\n                return f\"[ERROR]: Cannot download '{sandbox_file_path}' from sandbox {sandbox_id}: path is a directory, not a file.\"\n            else:\n                read_error_details = str(read_error)[:MAX_ERROR_LEN]\n                return f\"[ERROR]: Failed to read file '{sandbox_file_path}' from sandbox {sandbox_id}: {read_error_details}\"\n\n        return f\"File downloaded successfully to: {local_file_path}\"\n    except Exception as e:\n        error_details = str(e)[:MAX_ERROR_LEN]\n        return f\"[ERROR]: Failed to download file '{sandbox_file_path}' from sandbox {sandbox_id}: {error_details}\"\n    finally:\n        # Set timeout before exit to prevent timeout after function exits\n        try:\n            sandbox.set_timeout(DEFAULT_TIMEOUT)\n        except Exception:\n            pass  # Ignore timeout setting errors\n\n\nif __name__ == \"__main__\":\n    mcp.run(transport=\"stdio\")\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/mcp_servers/reading_mcp_server.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport argparse\nimport logging\nimport sys\n\nfrom fastmcp import FastMCP\nfrom mcp import ClientSession, StdioServerParameters\nfrom mcp.client.stdio import stdio_client\n\nlogger = logging.getLogger(\"miroflow\")\n\n# Initialize FastMCP server\nmcp = FastMCP(\"reading-mcp-server\")\n\n\n@mcp.tool()\nasync def convert_to_markdown(uri: str) -> str:\n    \"\"\"Convert various types of resources (doc, ppt, pdf, excel, csv, zip file etc.)\n    described by an file: or data: URI to markdown.\n\n    Args:\n        uri: Required. The URI of the resource to convert. Need to start with 'file:' or 'data:' schemes.\n\n    Returns:\n        str: The converted markdown content, or an error message if conversion fails.\n    \"\"\"\n    if not uri or not uri.strip():\n        return \"Error: URI parameter is required and cannot be empty.\"\n\n    # Validate URI scheme\n    valid_schemes = [\"http:\", \"https:\", \"file:\", \"data:\"]\n    if not any(uri.lower().startswith(scheme) for scheme in valid_schemes):\n        return f\"Error: Invalid URI scheme. 
Supported schemes are: {', '.join(valid_schemes)}\"\n\n    tool_name = \"convert_to_markdown\"\n    arguments = {\"uri\": uri}\n\n    server_params = StdioServerParameters(\n        command=sys.executable,\n        args=[\"-m\", \"markitdown_mcp\"],\n    )\n\n    result_content = \"\"\n    try:\n        async with stdio_client(server_params) as (read, write):\n            async with ClientSession(read, write, sampling_callback=None) as session:\n                await session.initialize()\n                try:\n                    tool_result = await session.call_tool(\n                        tool_name, arguments=arguments\n                    )\n                    result_content = (\n                        tool_result.content[-1].text if tool_result.content else \"\"\n                    )\n                except Exception as tool_error:\n                    logger.info(f\"Tool execution error: {tool_error}\")\n                    return f\"Error: Tool execution failed: {str(tool_error)}\"\n    except Exception as session_error:\n        logger.info(f\"Session error: {session_error}\")\n        return (\n            f\"Error: Failed to connect to markitdown-mcp server: {str(session_error)}\"\n        )\n\n    return result_content\n\n\nif __name__ == \"__main__\":\n    # Set up argument parser\n    parser = argparse.ArgumentParser(description=\"Reading MCP Server\")\n    parser.add_argument(\n        \"--transport\",\n        choices=[\"stdio\", \"http\"],\n        default=\"stdio\",\n        help=\"Transport method: 'stdio' or 'http' (default: stdio)\",\n    )\n    parser.add_argument(\n        \"--port\",\n        type=int,\n        default=8080,\n        help=\"Port to use when running with HTTP transport (default: 8080)\",\n    )\n    parser.add_argument(\n        \"--path\",\n        type=str,\n        default=\"/mcp\",\n        help=\"URL path to use when running with HTTP transport (default: /mcp)\",\n    )\n\n    # Parse command line arguments\n    args 
= parser.parse_args()\n\n    # Run the server with the specified transport method\n    if args.transport == \"stdio\":\n        mcp.run(transport=\"stdio\")\n    else:\n        # For HTTP transport, include port and path options\n        mcp.run(transport=\"streamable-http\", port=args.port, path=args.path)\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/mcp_servers/reasoning_mcp_server.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport logging\nimport os\n\nfrom anthropic import Anthropic\nfrom fastmcp import FastMCP\n\nlogger = logging.getLogger(\"miroflow\")\n\nANTHROPIC_API_KEY = os.environ.get(\"ANTHROPIC_API_KEY\", \"\")\nANTHROPIC_BASE_URL = os.environ.get(\"ANTHROPIC_BASE_URL\", \"https://api.anthropic.com\")\n\n# Initialize FastMCP server\nmcp = FastMCP(\"reasoning-mcp-server\")\n\n\n@mcp.tool()\nasync def reasoning(question: str) -> str:\n    \"\"\"You can use this tool to solve hard math problem, puzzle, riddle and IQ test question that requires a lot of chain of thought efforts.\n    DO NOT use this tool for simple and obvious question.\n\n    Args:\n        question: The hard question.\n\n    Returns:\n        The answer to the question.\n    \"\"\"\n    messages_for_llm = [\n        {\n            \"role\": \"user\",\n            \"content\": [\n                {\n                    \"type\": \"text\",\n                    \"text\": question,\n                }\n            ],\n        }\n    ]\n\n    client = Anthropic(api_key=ANTHROPIC_API_KEY, base_url=ANTHROPIC_BASE_URL)\n    response = client.messages.create(\n        model=\"claude-3-7-sonnet-20250219\",\n        max_tokens=21000,\n        thinking={\n            \"type\": \"enabled\",\n            \"budget_tokens\": 19000,\n        },\n        messages=messages_for_llm,\n        stream=False,\n    )\n\n    try:\n        return response.content[-1].text\n    except Exception:\n        logger.info(\"Reasoning Error: only thinking content is returned\")\n        return response.content[-1].thinking\n\n\nif __name__ == \"__main__\":\n    mcp.run(transport=\"stdio\")\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/mcp_servers/reasoning_mcp_server_os.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport logging\nimport os\nimport random\nimport time\n\nimport requests\nfrom fastmcp import FastMCP\n\nlogger = logging.getLogger(\"miroflow\")\n\nREASONING_API_KEY = os.environ.get(\"REASONING_API_KEY\")\nREASONING_BASE_URL = os.environ.get(\"REASONING_BASE_URL\")\nREASONING_MODEL_NAME = os.environ.get(\"REASONING_MODEL_NAME\")\n\n# Initialize FastMCP server\nmcp = FastMCP(\"reasoning-mcp-server-os\")\n\n# Retry configuration\nMAX_RETRIES = 10\nBACKOFF_BASE = 1.0  # initial backoff in seconds\nBACKOFF_MAX = 30.0  # maximum backoff in seconds\n\n\ndef post_with_retry(url, json, headers):\n    \"\"\"Send POST request with retry and exponential backoff.\n    Returns response object if success, otherwise None.\"\"\"\n    for attempt in range(1, MAX_RETRIES + 1):\n        try:\n            resp = requests.post(url, json=json, headers=headers, timeout=600)\n            if resp.status_code == 200:\n                return resp\n            else:\n                logger.warning(\n                    f\"HTTP {resp.status_code} on attempt {attempt}: {resp.text[:200]}\"\n                )\n        except requests.exceptions.RequestException as e:\n            logger.warning(f\"Request failed on attempt {attempt}: {e}\")\n\n        # Backoff before next retry\n        if attempt < MAX_RETRIES:\n            sleep_time = min(BACKOFF_BASE * (2 ** (attempt - 1)), BACKOFF_MAX)\n            # Add jitter to avoid thundering herd\n            sleep_time *= 0.8 + 0.4 * random.random()\n            logger.info(f\"Retrying in {sleep_time:.1f}s...\")\n            time.sleep(sleep_time)\n\n    logger.warning(f\"All {MAX_RETRIES} retries failed for {url}\")\n    return None\n\n\n@mcp.tool()\nasync def reasoning(question: str) -> str:\n    \"\"\"You can use this tool use solve hard math problem, puzzle, riddle and IQ test question that requires a lot of chain of thought efforts.\n 
   DO NOT use this tool for simple and obvious question.\n\n    Args:\n        question: The hard question.\n\n    Returns:\n        The answer to the question.\n    \"\"\"\n    payload = {\n        \"model\": REASONING_MODEL_NAME,\n        \"messages\": [{\"role\": \"user\", \"content\": question}],\n        \"temperature\": 0.6,\n        \"top_p\": 0.95,\n    }\n    headers = {\n        \"Authorization\": f\"Bearer {REASONING_API_KEY}\",\n        \"Content-Type\": \"application/json\",\n    }\n\n    response = post_with_retry(REASONING_BASE_URL, json=payload, headers=headers)\n    if response is None:\n        return \"Reasoning service unavailable. Please try again later.\"\n\n    json_response = response.json()\n    try:\n        content = json_response[\"choices\"][0][\"message\"][\"content\"]\n        if \"</think>\" in content:\n            content = content.split(\"</think>\", 1)[1].strip()\n        return content\n    except Exception:\n        logger.info(\"Reasoning Error: only thinking content is returned\")\n        return json_response[\"choices\"][0][\"message\"][\"reasoning_content\"]\n\n\nif __name__ == \"__main__\":\n    mcp.run(transport=\"stdio\")\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/mcp_servers/searching_google_mcp_server.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport asyncio\nimport calendar\nimport datetime\nimport json\nimport os\nimport sys\n\nimport requests\nimport wikipedia\nfrom fastmcp import FastMCP\nfrom mcp import ClientSession, StdioServerParameters  # (already imported in config.py)\nfrom mcp.client.stdio import stdio_client\n\nfrom .utils import strip_markdown_links\n\nSERPER_API_KEY = os.environ.get(\"SERPER_API_KEY\", \"\")\nSERPER_BASE_URL = os.environ.get(\"SERPER_BASE_URL\", \"https://google.serper.dev\")\nJINA_API_KEY = os.environ.get(\"JINA_API_KEY\", \"\")\nJINA_BASE_URL = os.environ.get(\"JINA_BASE_URL\", \"https://r.jina.ai\")\n\n# Google search result filtering environment variables\nREMOVE_SNIPPETS = os.environ.get(\"REMOVE_SNIPPETS\", \"\").lower() in (\"true\", \"1\", \"yes\")\nREMOVE_KNOWLEDGE_GRAPH = os.environ.get(\"REMOVE_KNOWLEDGE_GRAPH\", \"\").lower() in (\n    \"true\",\n    \"1\",\n    \"yes\",\n)\nREMOVE_ANSWER_BOX = os.environ.get(\"REMOVE_ANSWER_BOX\", \"\").lower() in (\n    \"true\",\n    \"1\",\n    \"yes\",\n)\n\n# Initialize FastMCP server\nmcp = FastMCP(\"searching-google-mcp-server\")\n\n\ndef filter_google_search_result(result_content: str) -> str:\n    \"\"\"Filter google search result content based on environment variables.\n\n    Args:\n        result_content: The JSON string result from google search\n\n    Returns:\n        Filtered JSON string result\n    \"\"\"\n    try:\n        # Parse JSON\n        data = json.loads(result_content)\n\n        # Remove knowledgeGraph if requested\n        if REMOVE_KNOWLEDGE_GRAPH and \"knowledgeGraph\" in data:\n            del data[\"knowledgeGraph\"]\n\n        # Remove answerBox if requested\n        if REMOVE_ANSWER_BOX and \"answerBox\" in data:\n            del data[\"answerBox\"]\n\n        # Remove snippets if requested\n        if REMOVE_SNIPPETS:\n            # Remove snippets from organic results\n            
if \"organic\" in data:\n                for item in data[\"organic\"]:\n                    if \"snippet\" in item:\n                        del item[\"snippet\"]\n\n            # Remove snippets from peopleAlsoAsk\n            if \"peopleAlsoAsk\" in data:\n                for item in data[\"peopleAlsoAsk\"]:\n                    if \"snippet\" in item:\n                        del item[\"snippet\"]\n\n        # Return filtered JSON\n        return json.dumps(data, ensure_ascii=False, indent=None)\n\n    except (json.JSONDecodeError, Exception):\n        # If filtering fails, return original content\n        return result_content\n\n\n@mcp.tool()\nasync def google_search(\n    q: str,\n    gl: str = \"us\",\n    hl: str = \"en\",\n    location: str = None,\n    num: int = 10,\n    tbs: str = None,\n    page: int = 1,\n) -> str:\n    \"\"\"Perform google searches via Serper API and retrieve rich results.\n    It is able to retrieve organic search results, people also ask, related searches, and knowledge graph.\n\n    Args:\n        q: Search query string.\n        gl: Country context for search (e.g., 'us' for United States, 'cn' for China, 'uk' for United Kingdom). Influences regional results priority. Default is 'us'.\n        hl: Google interface language (e.g., 'en' for English, 'zh' for Chinese, 'es' for Spanish). Affects snippet language preference. 
Default is 'en'.\n        location: City-level location for search results (e.g., 'SoHo, New York, United States', 'California, United States').\n        num: The number of results to return (default: 10).\n        tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week, 'qdr:m' for past month, 'qdr:y' for past year).\n        page: The page number of results to return (default: 1).\n\n    Returns:\n        The search results.\n    \"\"\"\n    if SERPER_API_KEY == \"\":\n        return (\n            \"[ERROR]: SERPER_API_KEY is not set, google_search tool is not available.\"\n        )\n\n    tool_name = \"google_search\"\n    arguments = {\n        \"q\": q,\n        \"gl\": gl,\n        \"hl\": hl,\n        \"num\": num,\n        \"page\": page,\n        \"autocorrect\": False,\n    }\n    if location:\n        arguments[\"location\"] = location\n    if tbs:\n        arguments[\"tbs\"] = tbs\n    server_params = StdioServerParameters(\n        command=sys.executable,\n        args=[\"-m\", \"miroflow_tools.mcp_servers.serper_mcp_server\"],\n        env={\"SERPER_API_KEY\": SERPER_API_KEY, \"SERPER_BASE_URL\": SERPER_BASE_URL},\n    )\n    result_content = \"\"\n\n    retry_count = 0\n    max_retries = 3\n\n    while retry_count < max_retries:\n        try:\n            async with stdio_client(server_params) as (read, write):\n                async with ClientSession(\n                    read, write, sampling_callback=None\n                ) as session:\n                    await session.initialize()\n                    tool_result = await session.call_tool(\n                        tool_name, arguments=arguments\n                    )\n                    result_content = (\n                        tool_result.content[-1].text if tool_result.content else \"\"\n                    )\n                    assert (\n                        result_content is not None and result_content.strip() != \"\"\n                    
), \"Empty result from google_search tool, please try again.\"\n                    # Apply filtering based on environment variables\n                    filtered_result = filter_google_search_result(result_content)\n                    return filtered_result  # Success, exit retry loop\n        except Exception as error:\n            retry_count += 1\n            if retry_count >= max_retries:\n                return f\"[ERROR]: google_search tool execution failed after {max_retries} attempts: {str(error)}\"\n            # Wait before retrying\n            await asyncio.sleep(min(2**retry_count, 60))\n\n    return \"[ERROR]: Unknown error occurred in google_search tool, please try again.\"\n\n\n# @mcp.tool()\nasync def wiki_get_page_content(entity: str, first_sentences: int = 10) -> str:\n    \"\"\"Get specific Wikipedia page content for the specific entity (people, places, concepts, events) and return structured information.\n\n    This tool searches Wikipedia for the given entity and returns either the first few sentences\n    (which typically contain the summary/introduction) or full page content based on parameters.\n    It handles disambiguation pages and provides clean, structured output.\n\n    Args:\n        entity: The entity to search for in Wikipedia.\n        first_sentences: Number of first sentences to return from the page. Set to 0 to return full content. 
Defaults to 10.\n\n    Returns:\n        str: Formatted search results containing title, first sentences/full content, and URL.\n             Returns error message if page not found or other issues occur.\n    \"\"\"\n    try:\n        # Try to get the Wikipedia page directly\n        page = wikipedia.page(title=entity, auto_suggest=False)\n\n        # Prepare the result\n        result_parts = [f\"Page Title: {page.title}\"]\n\n        if first_sentences > 0:\n            # Get summary with specified number of sentences\n            try:\n                summary = wikipedia.summary(\n                    entity, sentences=first_sentences, auto_suggest=False\n                )\n                result_parts.append(\n                    f\"First {first_sentences} sentences (introduction): {summary}\"\n                )\n            except Exception:\n                # Fallback to page summary if direct summary fails\n                content_sentences = page.content.split(\". \")[:first_sentences]\n                summary = (\n                    \". 
\".join(content_sentences) + \".\"\n                    if content_sentences\n                    else page.content[:5000] + \"...\"\n                )\n                result_parts.append(\n                    f\"First {first_sentences} sentences (introduction): {summary}\"\n                )\n        else:\n            # Return full content if first_sentences is 0\n            # TODO: Context Engineering Needed\n            result_parts.append(f\"Content: {page.content}\")\n\n        result_parts.append(f\"URL: {page.url}\")\n\n        return \"\\n\\n\".join(result_parts)\n\n    except wikipedia.exceptions.DisambiguationError as e:\n        options_list = \"\\n\".join(\n            [f\"- {option}\" for option in e.options[:10]]\n        )  # Limit to first 10\n        output = (\n            f\"Disambiguation Error: Multiple pages found for '{entity}'.\\n\\n\"\n            f\"Available options:\\n{options_list}\\n\\n\"\n            f\"Please be more specific in your search query.\"\n        )\n\n        try:\n            search_results = wikipedia.search(entity, results=5)\n            if search_results:\n                output += f\"Try to search {entity} in Wikipedia: {search_results}\"\n            return output\n        except Exception:\n            pass\n\n        return output\n\n    except wikipedia.exceptions.PageError:\n        # Try a search if direct page lookup fails\n        try:\n            search_results = wikipedia.search(entity, results=5)\n            if search_results:\n                suggestion_list = \"\\n\".join(\n                    [f\"- {result}\" for result in search_results[:5]]\n                )\n                return (\n                    f\"Page Not Found: No Wikipedia page found for '{entity}'.\\n\\n\"\n                    f\"Similar pages found:\\n{suggestion_list}\\n\\n\"\n                    f\"Try searching for one of these suggestions instead.\"\n                )\n            else:\n                return (\n             
       f\"Page Not Found: No Wikipedia page found for '{entity}' \"\n                    f\"and no similar pages were found. Please try a different search term.\"\n                )\n        except Exception as search_error:\n            return (\n                f\"Page Not Found: No Wikipedia page found for '{entity}'. \"\n                f\"Search for alternatives also failed: {str(search_error)}\"\n            )\n\n    except wikipedia.exceptions.RedirectError:\n        return f\"Redirect Error: Failed to follow redirect for '{entity}'\"\n\n    except requests.exceptions.RequestException as e:\n        return f\"Network Error: Failed to connect to Wikipedia: {str(e)}\"\n\n    except wikipedia.exceptions.WikipediaException as e:\n        return f\"Wikipedia Error: An error occurred while searching Wikipedia: {str(e)}\"\n\n    except Exception as e:\n        return f\"Unexpected Error: An unexpected error occurred: {str(e)}\"\n\n\n# @mcp.tool()\nasync def search_wiki_revision(\n    entity: str, year: int, month: int, max_revisions: int = 50\n) -> str:\n    \"\"\"Search for an entity in Wikipedia and return the revision history for a specific month.\n\n    Args:\n        entity: The entity to search for in Wikipedia.\n        year: The year of the revision (e.g. 2024).\n        month: The month of the revision (1-12).\n        max_revisions: Maximum number of revisions to return. 
Defaults to 50.\n\n    Returns:\n        str: Formatted revision history with timestamps, revision IDs, and URLs.\n             Returns error message if page not found or other issues occur.\n    \"\"\"\n    # Auto-adjust date values and track changes\n    adjustments = []\n    original_year, original_month = year, month\n    current_year = datetime.datetime.now().year\n\n    # Adjust year to valid range\n    if year < 2000:\n        year = 2000\n        adjustments.append(\n            f\"Year adjusted from {original_year} to 2000 (minimum supported)\"\n        )\n    elif year > current_year:\n        year = current_year\n        adjustments.append(\n            f\"Year adjusted from {original_year} to {current_year} (current year)\"\n        )\n\n    # Adjust month to valid range\n    if month < 1:\n        month = 1\n        adjustments.append(f\"Month adjusted from {original_month} to 1\")\n    elif month > 12:\n        month = 12\n        adjustments.append(f\"Month adjusted from {original_month} to 12\")\n\n    # Prepare adjustment message if any changes were made\n    if adjustments:\n        adjustment_msg = (\n            \"Date auto-adjusted: \"\n            + \"; \".join(adjustments)\n            + f\". 
Using {year}-{month:02d} instead.\\n\\n\"\n        )\n    else:\n        adjustment_msg = \"\"\n\n    base_url = \"https://en.wikipedia.org/w/api.php\"\n\n    try:\n        # Construct the time range\n        start_date = datetime.datetime(year, month, 1)\n        last_day = calendar.monthrange(year, month)[1]\n        end_date = datetime.datetime(year, month, last_day, 23, 59, 59)\n\n        # Convert to ISO format (UTC time)\n        start_iso = start_date.strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n        end_iso = end_date.strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n\n        # API parameters configuration\n        params = {\n            \"action\": \"query\",\n            \"format\": \"json\",\n            \"titles\": entity,\n            \"prop\": \"revisions\",\n            \"rvlimit\": min(max_revisions, 500),  # Wikipedia API limit\n            \"rvstart\": start_iso,\n            \"rvend\": end_iso,\n            \"rvdir\": \"newer\",\n            \"rvprop\": \"timestamp|ids\",\n        }\n\n        response = requests.get(base_url, params=params)\n        response.raise_for_status()\n\n        data = response.json()\n\n        # Check for API errors\n        if \"error\" in data:\n            return f\"[ERROR]: Wikipedia API Error: {data['error'].get('info', 'Unknown error')}\"\n\n        # Process the response\n        pages = data.get(\"query\", {}).get(\"pages\", {})\n\n        if not pages:\n            return f\"[ERROR]: No results found for entity '{entity}'\"\n\n        # Check if page exists\n        page_id = list(pages.keys())[0]\n        if page_id == \"-1\":\n            return f\"[ERROR]: Page Not Found: No Wikipedia page found for '{entity}'\"\n\n        page_info = pages[page_id]\n        page_title = page_info.get(\"title\", entity)\n\n        if \"revisions\" not in page_info or not page_info[\"revisions\"]:\n            return (\n                adjustment_msg + f\"Page Title: {page_title}\\n\\n\"\n                f\"No revisions found for '{entity}' in 
{year}-{month:02d}.\\n\\n\"\n                f\"The page may not have been edited during this time period.\"\n            )\n\n        # Format the results\n        result_parts = [\n            f\"Page Title: {page_title}\",\n            f\"Revision Period: {year}-{month:02d}\",\n            f\"Total Revisions Found: {len(page_info['revisions'])}\",\n        ]\n\n        # Add revision details\n        revisions_details = []\n        for i, rev in enumerate(page_info[\"revisions\"], 1):\n            revision_id = rev[\"revid\"]\n            timestamp = rev[\"timestamp\"]\n\n            # Format timestamp for better readability\n            try:\n                dt = datetime.datetime.fromisoformat(timestamp.replace(\"Z\", \"+00:00\"))\n                formatted_time = dt.strftime(\"%Y-%m-%d %H:%M:%S UTC\")\n            except Exception:\n                formatted_time = timestamp\n\n            # Construct revision URL\n            rev_url = f\"https://en.wikipedia.org/w/index.php?title={entity}&oldid={revision_id}\"\n\n            revisions_details.append(\n                f\"{i}. 
Revision ID: {revision_id}\\n\"\n                f\"   Timestamp: {formatted_time}\\n\"\n                f\"   URL: {rev_url}\"\n            )\n\n        if revisions_details:\n            result_parts.append(\"Revisions:\\n\" + \"\\n\\n\".join(revisions_details))\n\n        return (\n            adjustment_msg\n            + \"\\n\\n\".join(result_parts)\n            + \"\\n\\nHint: You can use the `scrape_website` tool to get the webpage content of a URL.\"\n        )\n\n    except requests.exceptions.Timeout:\n        return f\"[ERROR]: Network Error: Request timed out while fetching revision history for '{entity}'\"\n\n    except requests.exceptions.RequestException as e:\n        return f\"[ERROR]: Network Error: Failed to connect to Wikipedia: {str(e)}\"\n\n    except ValueError as e:\n        return f\"[ERROR]: Date Error: Invalid date values - {str(e)}\"\n\n    except Exception as e:\n        return f\"[ERROR]: Unexpected Error: An unexpected error occurred: {str(e)}\"\n\n\n# @mcp.tool()\nasync def search_archived_webpage(url: str, year: int, month: int, day: int) -> str:\n    \"\"\"Search the Wayback Machine (archive.org) for archived versions of a webpage, optionally for a specific date.\n\n    Args:\n        url: The URL to search for in the Wayback Machine.\n        year: The target year (e.g., 2023).\n        month: The target month (1-12).\n        day: The target day (1-31).\n\n    Returns:\n        str: Formatted archive information including archived URL, timestamp, and status.\n             Returns error message if URL not found or other issues occur.\n    \"\"\"\n    # Handle empty URL\n    if not url:\n        return f\"[ERROR]: Invalid URL: '{url}'. 
URL cannot be empty.\"\n\n    # Auto-add https:// if no protocol is specified\n    protocol_hint = \"\"\n    if not url.startswith((\"http://\", \"https://\")):\n        original_url = url\n        url = f\"https://{url}\"\n        protocol_hint = f\"[NOTE]: Automatically added 'https://' to URL '{original_url}' -> '{url}'\\n\\n\"\n\n    hint_message = \"\"\n    if \".wikipedia.org\" in url:\n        hint_message = \"Note: You are trying to search a Wikipedia page, you can also use the `search_wiki_revision` tool to get the revision content of a Wikipedia page.\\n\\n\"\n\n    # Check if specific date is requested\n    date = \"\"\n    adjustment_msg = \"\"\n    if year > 0 and month > 0:\n        # Auto-adjust date values and track changes\n        adjustments = []\n        original_year, original_month, original_day = year, month, day\n        current_year = datetime.datetime.now().year\n\n        # Adjust year to valid range\n        if year < 1995:\n            year = 1995\n            adjustments.append(\n                f\"Year adjusted from {original_year} to 1995 (minimum supported)\"\n            )\n        elif year > current_year:\n            year = current_year\n            adjustments.append(\n                f\"Year adjusted from {original_year} to {current_year} (current year)\"\n            )\n\n        # Adjust month to valid range\n        if month < 1:\n            month = 1\n            adjustments.append(f\"Month adjusted from {original_month} to 1\")\n        elif month > 12:\n            month = 12\n            adjustments.append(f\"Month adjusted from {original_month} to 12\")\n\n        # Adjust day to valid range for the given month/year\n        max_day = calendar.monthrange(year, month)[1]\n        if day < 1:\n            day = 1\n            adjustments.append(f\"Day adjusted from {original_day} to 1\")\n        elif day > max_day:\n            day = max_day\n            adjustments.append(\n                f\"Day adjusted from 
{original_day} to {max_day} (max for {year}-{month:02d})\"\n            )\n\n        # Update the date string with adjusted values\n        date = f\"{year:04d}{month:02d}{day:02d}\"\n\n        try:\n            # Validate the final adjusted date\n            datetime.datetime(year, month, day)\n        except ValueError as e:\n            return f\"[ERROR]: Invalid date: {year}-{month:02d}-{day:02d}. {str(e)}\"\n\n        # Prepare adjustment message if any changes were made\n        if adjustments:\n            adjustment_msg = (\n                \"Date auto-adjusted: \"\n                + \"; \".join(adjustments)\n                + f\". Using {date} instead.\\n\\n\"\n            )\n\n    try:\n        base_url = \"https://archive.org/wayback/available\"\n        # Search with specific date if provided\n        if date:\n            retry_count = 0\n            # retry 5 times if the response is not valid\n            while retry_count < 5:\n                response = requests.get(f\"{base_url}?url={url}&timestamp={date}\")\n                response.raise_for_status()\n                data = response.json()\n                if (\n                    \"archived_snapshots\" in data\n                    and \"closest\" in data[\"archived_snapshots\"]\n                ):\n                    break\n                retry_count += 1\n                await asyncio.sleep(min(2**retry_count, 60))\n\n            if \"archived_snapshots\" in data and \"closest\" in data[\"archived_snapshots\"]:\n                closest = data[\"archived_snapshots\"][\"closest\"]\n                archived_url = closest[\"url\"]\n                archived_timestamp = closest[\"timestamp\"]\n                available = closest.get(\"available\", True)\n\n                if not available:\n                    return (\n                        hint_message\n                        + adjustment_msg\n                        + (\n                            f\"Archive Status: Snapshot exists but is 
not available\\n\\n\"\n                            f\"Original URL: {url}\\n\"\n                            f\"Requested Date: {year:04d}-{month:02d}-{day:02d}\\n\"\n                            f\"Closest Snapshot: {archived_timestamp}\\n\\n\"\n                            f\"Try a different date\"\n                        )\n                    )\n\n                # Format timestamp for better readability\n                try:\n                    dt = datetime.datetime.strptime(archived_timestamp, \"%Y%m%d%H%M%S\")\n                    formatted_time = dt.strftime(\"%Y-%m-%d %H:%M:%S UTC\")\n                except Exception:\n                    formatted_time = archived_timestamp\n\n                return (\n                    protocol_hint\n                    + hint_message\n                    + adjustment_msg\n                    + (\n                        f\"Archive Found: Archived version located\\n\\n\"\n                        f\"Original URL: {url}\\n\"\n                        f\"Requested Date: {year:04d}-{month:02d}-{day:02d}\\n\"\n                        f\"Archived URL: {archived_url}\\n\"\n                        f\"Archived Timestamp: {formatted_time}\\n\"\n                    )\n                    + \"\\n\\nHint: You can also use the `scrape_website` tool to get the webpage content of a URL.\"\n                )\n\n        # Search without specific date (most recent)\n        retry_count = 0\n        # retry 5 times if the response is not valid\n        while retry_count < 5:\n            response = requests.get(f\"{base_url}?url={url}\")\n            response.raise_for_status()\n            data = response.json()\n            if \"archived_snapshots\" in data and \"closest\" in data[\"archived_snapshots\"]:\n                break\n            retry_count += 1\n            await asyncio.sleep(min(2**retry_count, 60))\n\n        if \"archived_snapshots\" in data and \"closest\" in data[\"archived_snapshots\"]:\n            closest = 
data[\"archived_snapshots\"][\"closest\"]\n            archived_url = closest[\"url\"]\n            archived_timestamp = closest[\"timestamp\"]\n            available = closest.get(\"available\", True)\n\n            if not available:\n                return (\n                    protocol_hint\n                    + hint_message\n                    + (\n                        f\"Archive Status: Most recent snapshot exists but is not available\\n\\n\"\n                        f\"Original URL: {url}\\n\"\n                        f\"Most Recent Snapshot: {archived_timestamp}\\n\\n\"\n                        f\"The URL may have been archived but access is restricted\"\n                    )\n                )\n\n            # Format timestamp for better readability\n            try:\n                dt = datetime.datetime.strptime(archived_timestamp, \"%Y%m%d%H%M%S\")\n                formatted_time = dt.strftime(\"%Y-%m-%d %H:%M:%S UTC\")\n            except Exception:\n                formatted_time = archived_timestamp\n\n            return (\n                protocol_hint\n                + hint_message\n                + (\n                    f\"Archive Found: Most recent archived version\\n\\n\"\n                    f\"Original URL: {url}\\n\"\n                    f\"Archived URL: {archived_url}\\n\"\n                    f\"Archived Timestamp: {formatted_time}\\n\"\n                )\n                + \"\\n\\nHint: You can also use the `scrape_website` tool to get the webpage content of a URL.\"\n            )\n        else:\n            return (\n                protocol_hint\n                + hint_message\n                + (\n                    f\"Archive Not Found: No archived versions available\\n\\n\"\n                    f\"Original URL: {url}\\n\\n\"\n                    f\"The URL '{url}' has not been archived by the Wayback Machine.\\n\"\n                    f\"You may want to:\\n\"\n                    f\"- Check if the URL is correct\\n\"\n     
               f\"- Try a different URL and date\\n\"\n                )\n            )\n\n    except requests.exceptions.RequestException as e:\n        return f\"[ERROR]: Network Error: Failed to connect to Wayback Machine: {str(e)}\"\n\n    except ValueError as e:\n        return f\"[ERROR]: Data Error: Failed to parse response from Wayback Machine: {str(e)}\"\n\n    except Exception as e:\n        return f\"[ERROR]: Unexpected Error: An unexpected error occurred: {str(e)}\"\n\n\n@mcp.tool()\nasync def scrape_website(url: str) -> str:\n    \"\"\"This tool is used to scrape a website for its content. Search engines are not supported by this tool. This tool can also be used to get YouTube video non-visual information (however, it may be incomplete), such as video subtitles, titles, descriptions, key moments, etc.\n\n    Args:\n        url: The URL of the website to scrape.\n    Returns:\n        The scraped website content.\n    \"\"\"\n    # Validate URL format\n    if not url or not url.startswith((\"http://\", \"https://\")):\n        return f\"Invalid URL: '{url}'. 
URL must start with http:// or https://\"\n\n    # Avoid duplicate Jina URL prefix\n    if url.startswith(\"https://r.jina.ai/\") and url.count(\"http\") >= 2:\n        url = url[len(\"https://r.jina.ai/\") :]\n\n    # Check for restricted domains\n    if \"huggingface.co/datasets\" in url or \"huggingface.co/spaces\" in url:\n        return \"You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose.\"\n\n    if JINA_API_KEY == \"\":\n        return \"JINA_API_KEY is not set, scrape_website tool is not available.\"\n\n    try:\n        # Use Jina.ai reader API to convert URL to LLM-friendly text\n        jina_url = f\"{JINA_BASE_URL}/{url}\"\n\n        # Make request with proper headers\n        headers = {\"Authorization\": f\"Bearer {JINA_API_KEY}\"}\n\n        response = requests.get(jina_url, headers=headers, timeout=60)\n        response.raise_for_status()\n\n        # Get the content\n        content = response.text.strip()\n        content = strip_markdown_links(content)\n\n        if not content:\n            return f\"No content retrieved from URL: {url}\"\n\n        return content\n\n    except requests.exceptions.Timeout:\n        return f\"[ERROR]: Timeout Error: Request timed out while scraping '{url}'. The website may be slow or unresponsive.\"\n\n    except requests.exceptions.ConnectionError:\n        return f\"[ERROR]: Connection Error: Failed to connect to '{url}'. 
Please check if the URL is correct and accessible.\"\n\n    except requests.exceptions.HTTPError as e:\n        status_code = e.response.status_code if e.response else \"unknown\"\n        if status_code == 404:\n            return f\"[ERROR]: Page Not Found (404): The page at '{url}' does not exist.\"\n        elif status_code == 403:\n            return f\"[ERROR]: Access Forbidden (403): Access to '{url}' is forbidden.\"\n        elif status_code == 500:\n            return f\"[ERROR]: Server Error (500): The server at '{url}' encountered an internal error.\"\n        else:\n            return f\"[ERROR]: HTTP Error ({status_code}): Failed to scrape '{url}'. {str(e)}\"\n\n    except requests.exceptions.RequestException as e:\n        return f\"[ERROR]: Request Error: Failed to scrape '{url}'. {str(e)}\"\n\n    except Exception as e:\n        return f\"[ERROR]: Unexpected Error: An unexpected error occurred while scraping '{url}': {str(e)}\"\n\n\nif __name__ == \"__main__\":\n    mcp.run(transport=\"stdio\")\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/mcp_servers/searching_sogou_mcp_server.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport asyncio\nimport json\nimport os\n\nimport requests\nfrom fastmcp import FastMCP\nfrom tencentcloud.common import credential\nfrom tencentcloud.common.common_client import CommonClient\nfrom tencentcloud.common.exception.tencent_cloud_sdk_exception import (\n    TencentCloudSDKException,\n)\nfrom tencentcloud.common.profile.client_profile import ClientProfile\nfrom tencentcloud.common.profile.http_profile import HttpProfile\n\nfrom .utils import strip_markdown_links\n\nTENCENTCLOUD_SECRET_ID = os.environ.get(\"TENCENTCLOUD_SECRET_ID\", \"\")\nTENCENTCLOUD_SECRET_KEY = os.environ.get(\"TENCENTCLOUD_SECRET_KEY\", \"\")\nJINA_API_KEY = os.environ.get(\"JINA_API_KEY\", \"\")\nJINA_BASE_URL = os.environ.get(\"JINA_BASE_URL\", \"https://r.jina.ai\")\n\n# Initialize FastMCP server\nmcp = FastMCP(\"searching-sogou-mcp-server\")\n\n\n@mcp.tool()\nasync def sogou_search(Query: str, Cnt: int = 10) -> str:\n    \"\"\"Performs web searches using the Tencent Cloud SearchPro API to retrieve comprehensive information, with Sogou search offering superior results for Chinese-language queries.\n\n    Args:\n        Query: The core search query string. Be specific to improve result relevance (e.g., \"2024 World Cup final results\"). (Required, no default value)\n        Cnt: Number of search results to return (Can only be 10/20/30/40/50). 
(Optional, default: 10)\n\n    Returns:\n        The search results in JSON format, including the following core fields:\n        - Query: The original search query (consistent with the input Query, for request verification)\n        - Pages: Array of JSON strings, each containing details of a single search result (e.g., title, url, passage, date, site, favicon)\n    \"\"\"\n    if TENCENTCLOUD_SECRET_ID == \"\" or TENCENTCLOUD_SECRET_KEY == \"\":\n        return \"[ERROR]: TENCENTCLOUD_SECRET_ID or TENCENTCLOUD_SECRET_KEY is not set, sogou_search tool is not available.\"\n\n    retry_count = 0\n    max_retries = 3\n\n    while retry_count < max_retries:\n        try:\n            cred = credential.Credential(\n                TENCENTCLOUD_SECRET_ID, TENCENTCLOUD_SECRET_KEY\n            )\n            httpProfile = HttpProfile()\n            httpProfile.endpoint = \"wsa.tencentcloudapi.com\"\n            clientProfile = ClientProfile()\n            clientProfile.httpProfile = httpProfile\n\n            params = f'{{\"Query\":\"{Query}\",\"Mode\":0, \"Cnt\":{Cnt}}}'\n            common_client = CommonClient(\n                \"wsa\", \"2025-05-08\", cred, \"\", profile=clientProfile\n            )\n            result = common_client.call_json(\"SearchPro\", json.loads(params))[\n                \"Response\"\n            ]\n            del result[\"RequestId\"]\n            pages = []\n            for page in result[\"Pages\"]:\n                page_json = json.loads(page)\n                new_page = {}\n                new_page[\"title\"] = page_json[\"title\"]\n                new_page[\"url\"] = page_json[\"url\"]\n                new_page[\"passage\"] = page_json[\"passage\"]\n                new_page[\"date\"] = page_json[\"date\"]\n                # new_page[\"content\"] = page_json[\"content\"]\n                new_page[\"site\"] = page_json[\"site\"]\n                # new_page[\"favicon\"] = page_json[\"favicon\"]\n                pages.append(new_page)\n     
       result[\"Pages\"] = pages\n            return json.dumps(result, ensure_ascii=False)\n        except TencentCloudSDKException:\n            retry_count += 1\n            if retry_count >= max_retries:\n                return f\"[ERROR]: sogou_search tool execution failed after {max_retries} attempts: Unexpected error occurred.\"\n            # Wait before retrying\n            await asyncio.sleep(min(2**retry_count, 60))\n\n    return \"[ERROR]: Unknown error occurred in sogou_search tool, please try again.\"\n\n\n@mcp.tool()\nasync def scrape_website(url: str) -> str:\n    \"\"\"This tool is used to scrape a website for its content. Search engines are not supported by this tool. This tool can also be used to get YouTube video non-visual information (however, it may be incomplete), such as video subtitles, titles, descriptions, key moments, etc.\n\n    Args:\n        url: The URL of the website to scrape.\n    Returns:\n        The scraped website content.\n    \"\"\"\n    # Validate URL format\n    if not url or not url.startswith((\"http://\", \"https://\")):\n        return f\"Invalid URL: '{url}'. 
URL must start with http:// or https://\"\n\n    # Avoid duplicate Jina URL prefix\n    if url.startswith(\"https://r.jina.ai/\") and url.count(\"http\") >= 2:\n        url = url[len(\"https://r.jina.ai/\") :]\n\n    # Check for restricted domains\n    if \"huggingface.co/datasets\" in url or \"huggingface.co/spaces\" in url:\n        return \"You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose.\"\n\n    if JINA_API_KEY == \"\":\n        return \"JINA_API_KEY is not set, scrape_website tool is not available.\"\n\n    try:\n        # Use Jina.ai reader API to convert URL to LLM-friendly text\n        jina_url = f\"{JINA_BASE_URL}/{url}\"\n\n        # Make request with proper headers\n        headers = {\"Authorization\": f\"Bearer {JINA_API_KEY}\"}\n\n        response = requests.get(jina_url, headers=headers, timeout=60)\n        response.raise_for_status()\n\n        # Get the content\n        content = response.text.strip()\n        content = strip_markdown_links(content)\n\n        if not content:\n            return f\"No content retrieved from URL: {url}\"\n\n        return content\n\n    except requests.exceptions.Timeout:\n        return f\"[ERROR]: Timeout Error: Request timed out while scraping '{url}'. The website may be slow or unresponsive.\"\n\n    except requests.exceptions.ConnectionError:\n        return f\"[ERROR]: Connection Error: Failed to connect to '{url}'. 
Please check if the URL is correct and accessible.\"\n\n    except requests.exceptions.HTTPError as e:\n        status_code = e.response.status_code if e.response else \"unknown\"\n        if status_code == 404:\n            return f\"[ERROR]: Page Not Found (404): The page at '{url}' does not exist.\"\n        elif status_code == 403:\n            return f\"[ERROR]: Access Forbidden (403): Access to '{url}' is forbidden.\"\n        elif status_code == 500:\n            return f\"[ERROR]: Server Error (500): The server at '{url}' encountered an internal error.\"\n        else:\n            return f\"[ERROR]: HTTP Error ({status_code}): Failed to scrape '{url}'. {str(e)}\"\n\n    except requests.exceptions.RequestException as e:\n        return f\"[ERROR]: Request Error: Failed to scrape '{url}'. {str(e)}\"\n\n    except Exception as e:\n        return f\"[ERROR]: Unexpected Error: An unexpected error occurred while scraping '{url}': {str(e)}\"\n\n\nif __name__ == \"__main__\":\n    mcp.run(transport=\"stdio\")\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/mcp_servers/serper_mcp_server.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\n\"\"\"\nadapted from\nhttps://github.com/MiroMindAI/MiroRL/blob/5073693549ffe05a157a1886e87650ef3be6606e/mirorl/tools/serper_search.py#L1\n\"\"\"\n\nimport json\nimport os\nfrom typing import Any, Dict\n\nimport requests\nfrom mcp.server.fastmcp import FastMCP\nfrom tenacity import (\n    retry,\n    retry_if_exception_type,\n    stop_after_attempt,\n    wait_exponential,\n)\n\nfrom .utils import decode_http_urls_in_dict\n\nSERPER_BASE_URL = os.getenv(\"SERPER_BASE_URL\", \"https://google.serper.dev\")\nSERPER_API_KEY = os.getenv(\"SERPER_API_KEY\", \"\")\n\n# Initialize FastMCP server\nmcp = FastMCP(\"serper-mcp-server\")\n\n\n@retry(\n    stop=stop_after_attempt(3),\n    wait=wait_exponential(multiplier=1, min=4, max=10),\n    retry=retry_if_exception_type(\n        (requests.ConnectionError, requests.Timeout, requests.HTTPError)\n    ),\n)\ndef make_serper_request(\n    payload: Dict[str, Any], headers: Dict[str, str]\n) -> requests.Response:\n    \"\"\"Make HTTP request to Serper API with retry logic.\"\"\"\n    response = requests.post(f\"{SERPER_BASE_URL}/search\", json=payload, headers=headers)\n    response.raise_for_status()\n    return response\n\n\ndef _is_huggingface_dataset_or_space_url(url):\n    \"\"\"\n    Check if the URL is a HuggingFace dataset or space URL.\n    :param url: The URL to check\n    :return: True if it's a HuggingFace dataset or space URL, False otherwise\n    \"\"\"\n    if not url:\n        return False\n    return \"huggingface.co/datasets\" in url or \"huggingface.co/spaces\" in url\n\n\n@mcp.tool()\ndef google_search(\n    q: str,\n    gl: str = \"us\",\n    hl: str = \"en\",\n    location: str | None = None,\n    num: int | None = None,\n    tbs: str | None = None,\n    page: int | None = None,\n    autocorrect: bool | None = None,\n):\n    \"\"\"\n    Tool to perform web searches via Serper API and retrieve rich 
results.\n\n    It is able to retrieve organic search results, people also ask,\n    related searches, and knowledge graph.\n\n    Args:\n        q: Search query string\n        gl: Optional region code for search results in ISO 3166-1 alpha-2 format (e.g., 'us')\n        hl: Optional language code for search results in ISO 639-1 format (e.g., 'en')\n        location: Optional location for search results (e.g., 'SoHo, New York, United States', 'California, United States')\n        num: Number of results to return (default: 10)\n        tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week,\n            'qdr:m' for past month, 'qdr:y' for past year)\n        page: Page number of results to return (default: 1)\n        autocorrect: Whether to autocorrect spelling in query\n\n    Returns:\n        Dictionary containing search results and metadata.\n    \"\"\"\n    # Check for API key\n    if not SERPER_API_KEY:\n        return json.dumps(\n            {\n                \"success\": False,\n                \"error\": \"SERPER_API_KEY environment variable not set\",\n                \"results\": [],\n            },\n            ensure_ascii=False,\n        )\n\n    # Validate required parameter\n    if not q or not q.strip():\n        return json.dumps(\n            {\n                \"success\": False,\n                \"error\": \"Search query 'q' is required and cannot be empty\",\n                \"results\": [],\n            },\n            ensure_ascii=False,\n        )\n\n    try:\n        # Build payload with all supported parameters\n        payload: dict[str, Any] = {\n            \"q\": q.strip(),\n            \"gl\": gl,\n            \"hl\": hl,\n        }\n\n        # Add optional parameters if provided\n        if location:\n            payload[\"location\"] = location\n        if num is not None:\n            payload[\"num\"] = num\n        else:\n            payload[\"num\"] = 10  # Default\n        if tbs:\n 
           payload[\"tbs\"] = tbs\n        if page is not None:\n            payload[\"page\"] = page\n        if autocorrect is not None:\n            payload[\"autocorrect\"] = autocorrect\n\n        # Set up headers\n        headers = {\"X-API-KEY\": SERPER_API_KEY, \"Content-Type\": \"application/json\"}\n\n        # Make the API request\n        response = make_serper_request(payload, headers)\n        data = response.json()\n\n        # filter out HuggingFace dataset or space urls\n        organic_results = []\n        if \"organic\" in data:\n            for item in data[\"organic\"]:\n                if _is_huggingface_dataset_or_space_url(item.get(\"link\", \"\")):\n                    continue\n                organic_results.append(item)\n\n        # Keep all original fields, but overwrite \"organic\"\n        response_data = dict(data)\n        response_data[\"organic\"] = organic_results\n        response_data = decode_http_urls_in_dict(response_data)\n\n        return json.dumps(response_data, ensure_ascii=False)\n\n    except Exception as e:\n        return json.dumps(\n            {\"success\": False, \"error\": f\"Unexpected error: {str(e)}\", \"results\": []},\n            ensure_ascii=False,\n        )\n\n\nif __name__ == \"__main__\":\n    mcp.run()\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/mcp_servers/utils/__init__.py",
    "content": "from .url_unquote import decode_http_urls_in_dict, safe_unquote, strip_markdown_links\n\n__all__ = [\n    \"safe_unquote\",\n    \"decode_http_urls_in_dict\",\n    \"strip_markdown_links\",\n]\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/mcp_servers/utils/url_unquote.py",
    "content": "import re\nfrom urllib.parse import unquote\n\nfrom markdown_it import MarkdownIt\n\n# RFC 3986 reserved characters percent-encoding (decoding these would alter URL semantics/structure)\n# gen-delims: : / ? # [ ] @\n# sub-delims: ! $ & ' ( ) * + , ; =\nRESERVED_PERCENT_ENCODINGS = frozenset(\n    {\n        \"%2f\",\n        \"%2F\",  # /  path separator\n        \"%3f\",\n        \"%3F\",  # ?  query string start\n        \"%23\",  # #  fragment start\n        \"%26\",  # &  query parameter separator\n        \"%3d\",\n        \"%3D\",  # =  key-value separator\n        \"%40\",  # @\n        \"%3a\",\n        \"%3A\",  # :\n        \"%5b\",\n        \"%5B\",  # [\n        \"%5d\",\n        \"%5D\",  # ]\n        \"%21\",  # !\n        \"%24\",  # $\n        \"%27\",  # '\n        \"%28\",  # (\n        \"%29\",  # )\n        \"%2a\",\n        \"%2A\",  # *\n        \"%2b\",\n        \"%2B\",  # +\n        \"%2c\",\n        \"%2C\",  # ,\n        \"%3b\",\n        \"%3B\",  # ;\n        \"%25\",  # %  percent sign itself (prevents double-encoding issues)\n        \"%20\",  # space (keep encoded to avoid URL semantic changes)\n    }\n)\n\n\ndef safe_unquote(url: str) -> str:\n    \"\"\"\n    Safely decode URL-encoded strings, only decoding characters that won't alter URL semantics.\n\n    Preserve the following encodings (because decoding would change URL structure/semantics):\n    - %2F (/) - path separator, decoding would alter path hierarchy\n    - %3F (?) - query string start marker\n    - %23 (#) - fragment start marker (not sent to server)\n    - %26 (&) - query parameter separator\n    - %3D (=) - key-value separator\n    - %25 (%) - percent sign itself (prevents double-encoding issues, e.g. %252F -> %2F -> /)\n    - %20 ( ) - space (keep encoded to avoid URL semantic changes)\n    - and other RFC 3986 reserved characters\n\n    Only decode unreserved characters and UTF-8 encoded international characters (e.g. 
Chinese).\n    \"\"\"\n    if not url:\n        return url\n\n    result = []\n    i = 0\n    n = len(url)\n\n    while i < n:\n        # Check if this is a percent-encoded sequence %XX\n        if url[i] == \"%\" and i + 2 < n:\n            hex_chars = url[i + 1 : i + 3]\n            # Validate it's a valid hexadecimal\n            if all(c in \"0123456789ABCDEFabcdef\" for c in hex_chars):\n                percent_encoded = url[i : i + 3]\n\n                # Check if this is a reserved character encoding that should be preserved\n                if percent_encoded in RESERVED_PERCENT_ENCODINGS:\n                    # Keep the encoding, don't decode\n                    result.append(percent_encoded)\n                    i += 3\n                    continue\n\n                # Try to decode (may be a UTF-8 multi-byte sequence)\n                # Collect consecutive percent-encoded sequences\n                encoded_sequence = percent_encoded\n                j = i + 3\n                while j + 2 < n and url[j] == \"%\":\n                    next_hex = url[j + 1 : j + 3]\n                    if all(c in \"0123456789ABCDEFabcdef\" for c in next_hex):\n                        next_encoded = url[j : j + 3]\n                        # Stop collecting if we encounter a reserved character\n                        if next_encoded in RESERVED_PERCENT_ENCODINGS:\n                            break\n                        encoded_sequence += next_encoded\n                        j += 3\n                    else:\n                        break\n\n                # Decode the collected sequence\n                try:\n                    decoded = unquote(encoded_sequence)\n                    result.append(decoded)\n                    i = j\n                    continue\n                except Exception:\n                    # Decoding failed, keep the original encoding\n                    result.append(percent_encoded)\n                    i += 3\n                    
continue\n\n        result.append(url[i])\n        i += 1\n\n    return \"\".join(result)\n\n\ndef decode_http_urls_in_dict(data):\n    \"\"\"\n    Traverse all values in the data structure:\n    - If it's a string starting with http, apply urllib.parse.unquote\n    - If it's a list, recursively process each element\n    - If it's a dict, recursively process each value\n    - Other types remain unchanged\n    \"\"\"\n    if isinstance(data, str):\n        if \"%\" in data and \"http\" in data:\n            return safe_unquote(data)\n        else:\n            return data\n    elif isinstance(data, list):\n        return [decode_http_urls_in_dict(item) for item in data]\n    elif isinstance(data, dict):\n        return {key: decode_http_urls_in_dict(value) for key, value in data.items()}\n    else:\n        return data\n\n\nmd = MarkdownIt(\"commonmark\")\n\n\ndef strip_markdown_links(markdown: str) -> str:\n    tokens = md.parse(markdown)\n\n    def render(ts):\n        out = []\n        for tok in ts:\n            t = tok.type\n\n            # 1) Links: drop the wrapper, keep inner text (children will be rendered)\n            if t == \"link_open\" or t == \"link_close\":\n                continue\n\n            # 2) Images: skip the entire image block\n            if t == \"image\":\n                continue\n\n            # 3) Line breaks and block closings\n            if t == \"softbreak\":  # inline single line break\n                out.append(\"\\n\")\n                continue\n            if (\n                t == \"hardbreak\"\n            ):  # explicit line break (two spaces + newline in Markdown)\n                out.append(\"\\n\")\n                continue\n            if t in (\"paragraph_close\", \"heading_close\", \"blockquote_close\"):\n                out.append(\"\\n\\n\")\n                continue\n            if t in (\"list_item_close\", \"bullet_list_close\", \"ordered_list_close\"):\n                out.append(\"\\n\")\n                
continue\n            if t == \"hr\":\n                out.append(\"\\n\\n\")\n                continue\n\n            # 4) Inline or nested tokens\n            if tok.children:\n                out.append(render(tok.children))\n                continue\n\n            # Preserve inline code style\n            if t == \"code_inline\":\n                out.append(f\"`{tok.content}`\")\n            else:\n                out.append(tok.content or \"\")\n\n        return \"\".join(out)\n\n    text = render(tokens)\n\n    # normalize excessive blank lines (avoid more than 2 consecutive newlines)\n    text = re.sub(r\"\\n{3,}\", \"\\n\\n\", text).rstrip() + \"\\n\"\n\n    return text.strip()\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/mcp_servers/vision_mcp_server.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport asyncio\nimport base64\nimport os\n\nfrom fastmcp import FastMCP\nfrom openai import OpenAI\n\nOPENAI_API_KEY = os.environ.get(\"OPENAI_API_KEY\")\nOPENAI_BASE_URL = os.environ.get(\"OPENAI_BASE_URL\", \"https://api.openai.com/v1\")\n\n# Initialize FastMCP server\nmcp = FastMCP(\"vision-mcp-server\")\n\n# Maximum file size for vision processing (20MB for images, 50MB for videos)\nMAX_IMAGE_SIZE = 20 * 1024 * 1024  # 20MB\nMAX_VIDEO_SIZE = 50 * 1024 * 1024  # 50MB\n\n\ndef guess_mime_media_type_from_extension(file_path: str) -> tuple[str, str]:\n    \"\"\"\n    Guess the MIME type and media category based on the file extension.\n\n    Returns:\n        Tuple of (mime_type, media_category) where media_category is 'image' or 'video'\n    \"\"\"\n    _, ext = os.path.splitext(file_path)\n    ext = ext.lower()\n\n    # Image formats\n    if ext in [\".jpg\", \".jpeg\"]:\n        return \"image/jpeg\", \"image\"\n    elif ext == \".png\":\n        return \"image/png\", \"image\"\n    elif ext == \".gif\":\n        return \"image/gif\", \"image\"\n    elif ext == \".webp\":\n        return \"image/webp\", \"image\"\n    elif ext == \".bmp\":\n        return \"image/bmp\", \"image\"\n    elif ext == \".tiff\" or ext == \".tif\":\n        return \"image/tiff\", \"image\"\n\n    # Video formats\n    elif ext == \".mp4\":\n        return \"video/mp4\", \"video\"\n    elif ext == \".mov\":\n        return \"video/quicktime\", \"video\"\n    elif ext == \".avi\":\n        return \"video/x-msvideo\", \"video\"\n    elif ext == \".mkv\":\n        return \"video/x-matroska\", \"video\"\n    elif ext == \".webm\":\n        return \"video/webm\", \"video\"\n\n    # Default to JPEG for unknown formats\n    return \"image/jpeg\", \"image\"\n\n\ndef _validate_file_size(file_path: str, media_category: str) -> tuple[bool, str]:\n    \"\"\"\n    Validate file size based on 
media category.\n\n    Returns:\n        Tuple of (is_valid, error_message)\n    \"\"\"\n    try:\n        file_size = os.path.getsize(file_path)\n        max_size = MAX_VIDEO_SIZE if media_category == \"video\" else MAX_IMAGE_SIZE\n        max_size_mb = max_size / (1024 * 1024)\n\n        if file_size > max_size:\n            return (\n                False,\n                f\"[ERROR]: File size ({file_size / (1024 * 1024):.2f}MB) exceeds maximum allowed size ({max_size_mb}MB) for {media_category}\",\n            )\n\n        if file_size == 0:\n            return False, \"[ERROR]: File is empty\"\n\n        return True, \"\"\n    except Exception as e:\n        return False, f\"[ERROR]: Failed to check file size: {e}\"\n\n\n@mcp.tool()\nasync def visual_question_answering(media_path_or_url: str, question: str) -> str:\n    \"\"\"Ask question about an image or a video and get the answer with GPT-4o vision model.\n\n    Args:\n        media_path_or_url: The path of the image/video file locally or its URL. 
Supports images (jpg, png, gif, webp, bmp, tiff) and videos (mp4, mov, avi, mkv, webm).\n        question: The question to ask about the image or video.\n\n    Returns:\n        The answer to the media-related question.\n    \"\"\"\n    max_retries = 3\n    retry = 0\n\n    # Create client once outside the retry loop\n    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)\n\n    # Initialize variables\n    response = None\n    media_data = None\n    mime_type = None\n    media_category = None\n\n    while retry < max_retries:\n        try:\n            # Build message content\n            content = [{\"type\": \"text\", \"text\": question}]\n\n            if os.path.exists(media_path_or_url):  # Check if the file exists locally\n                # Get media type and validate\n                mime_type, media_category = guess_mime_media_type_from_extension(\n                    media_path_or_url\n                )\n\n                # Validate file size\n                is_valid, error_msg = _validate_file_size(\n                    media_path_or_url, media_category\n                )\n                if not is_valid:\n                    return error_msg\n\n                # Read and encode file\n                with open(media_path_or_url, \"rb\") as media_file:\n                    media_data = base64.b64encode(media_file.read()).decode(\"utf-8\")\n\n                # Add image_url content (works for both images and videos in OpenAI API)\n                content.append(\n                    {\n                        \"type\": \"image_url\",\n                        \"image_url\": {\"url\": f\"data:{mime_type};base64,{media_data}\"},\n                    }\n                )\n\n            elif \"home/user\" in media_path_or_url:\n                return \"[ERROR]: The visual_question_answering tool cannot access sandbox files, please use the local path provided by original instruction\"\n\n            else:  # Otherwise, assume it's a URL\n          
      # Basic URL validation\n                if not media_path_or_url.startswith((\"http://\", \"https://\")):\n                    return \"[ERROR]: Invalid URL format. URLs must start with http:// or https://\"\n\n                content.append(\n                    {\"type\": \"image_url\", \"image_url\": {\"url\": media_path_or_url}}\n                )\n\n            # Make API call\n            response = client.chat.completions.create(\n                model=\"gpt-4o\",\n                messages=[{\"role\": \"user\", \"content\": content}],\n                max_tokens=1024,\n            )\n\n            # If we reach here, the API call was successful\n            break\n\n        except FileNotFoundError:\n            return f\"[ERROR]: File not found: {media_path_or_url}\"\n        except PermissionError:\n            return f\"[ERROR]: Permission denied when reading file: {media_path_or_url}\"\n        except Exception as e:\n            retry += 1\n            if retry >= max_retries:\n                error_type = (\n                    \"API call\"\n                    if media_data is not None or not os.path.exists(media_path_or_url)\n                    else \"file processing\"\n                )\n                return f\"[ERROR]: Visual question answering failed during {error_type}: {e}\\nNote: Files from sandbox are not available. 
You should use local path given in the instruction.\\nSupported image formats: jpg, png, gif, webp, bmp, tiff\\nSupported video formats: mp4, mov, avi, mkv, webm\\nURLs must be publicly accessible and start with http:// or https://\"\n            await asyncio.sleep(5 * (2**retry))\n\n    # Extract and return response\n    try:\n        if response and response.choices and len(response.choices) > 0:\n            return response.choices[0].message.content\n        else:\n            return \"[ERROR]: Received empty response from API\"\n    except (AttributeError, IndexError) as e:\n        return f\"[ERROR]: Failed to parse API response: {e}\"\n\n\nif __name__ == \"__main__\":\n    mcp.run(transport=\"stdio\")\n"
  },
  {
    "path": "libs/miroflow-tools/src/miroflow_tools/mcp_servers/vision_mcp_server_os.py",
    "content": "# Copyright (c) 2025 MiroMind\n# This source code is licensed under the Apache 2.0 License.\n\nimport base64\nimport os\n\nimport aiohttp\nimport requests\nfrom fastmcp import FastMCP\n\nVISION_API_KEY = os.environ.get(\"VISION_API_KEY\")\nVISION_BASE_URL = os.environ.get(\"VISION_BASE_URL\")\nVISION_MODEL_NAME = os.environ.get(\"VISION_MODEL_NAME\")\n\n# Initialize FastMCP server\nmcp = FastMCP(\"vision-mcp-server-os\")\n\n\ndef guess_mime_media_type_from_extension(file_path: str) -> str:\n    \"\"\"Guess the MIME type based on the file extension.\"\"\"\n    _, ext = os.path.splitext(file_path)\n    ext = ext.lower()\n    if ext in [\".jpg\", \".jpeg\"]:\n        return \"image/jpeg\"\n    elif ext == \".png\":\n        return \"image/png\"\n    elif ext == \".gif\":\n        return \"image/gif\"\n    else:\n        return \"image/jpeg\"  # Default to JPEG if unknown\n\n\n@mcp.tool()\nasync def visual_question_answering(image_path_or_url: str, question: str) -> str:\n    \"\"\"Ask question about an image or a video and get the answer with a vision language model.\n\n    Args:\n        image_path_or_url: The path of the image file locally or its URL.\n        question: The question to ask about the image.\n\n    Returns:\n        The answer to the image-related question.\n    \"\"\"\n    messages_for_llm = [\n        {\n            \"role\": \"user\",\n            \"content\": [\n                {\"type\": \"image_url\", \"image_url\": {\"url\": None}},\n                {\n                    \"type\": \"text\",\n                    \"text\": question,\n                },\n            ],\n        }\n    ]\n\n    headers = {\n        \"Authorization\": f\"Bearer {VISION_API_KEY}\",\n        \"Content-Type\": \"application/json\",\n    }\n\n    try:\n        if os.path.exists(image_path_or_url):  # Check if the file exists locally\n            with open(image_path_or_url, \"rb\") as image_file:\n                image_data = 
base64.b64encode(image_file.read()).decode(\"utf-8\")\n                mime_type = guess_mime_media_type_from_extension(image_path_or_url)\n                messages_for_llm[0][\"content\"][0][\"image_url\"][\"url\"] = (\n                    f\"data:{mime_type};base64,{image_data}\"\n                )\n        elif image_path_or_url.startswith((\"http://\", \"https://\")):\n            async with aiohttp.ClientSession() as session:\n                async with session.get(image_path_or_url) as resp:\n                    if resp.status == 200:\n                        image_bytes = await resp.read()\n                        mime_type = resp.headers.get(\n                            \"Content-Type\", \"image/png\"\n                        )  # fallback MIME type\n                        image_data = base64.b64encode(image_bytes).decode(\"utf-8\")\n                        messages_for_llm[0][\"content\"][0][\"image_url\"][\"url\"] = (\n                            f\"data:{mime_type};base64,{image_data}\"\n                        )\n                    else:\n                        return f\"Failed to fetch image from URL: {image_path_or_url}\"\n        else:\n            messages_for_llm[0][\"content\"][0][\"image_url\"][\"url\"] = image_path_or_url\n\n        payload = {\"model\": VISION_MODEL_NAME, \"messages\": messages_for_llm}\n\n        response = requests.post(VISION_BASE_URL, json=payload, headers=headers)\n\n    except Exception as e:\n        return f\"Error: {e}\"\n\n    try:\n        return response.json()[\"choices\"][0][\"message\"][\"content\"]\n    except (AttributeError, IndexError):\n        return response.json()\n\n\nif __name__ == \"__main__\":\n    mcp.run(transport=\"stdio\")\n"
  }
]