Repository: MiroMindAI/MiroThinker
Branch: main
Commit: 40a9faef2efd
Files: 169
Total size: 1.1 MB

Directory structure:
gitextract_qqy1lifh/

├── .github/
│   └── workflows/
│       └── run-ruff.yml
├── .gitignore
├── LICENSE
├── README.md
├── apps/
│   ├── collect-trace/
│   │   ├── README.md
│   │   ├── pyproject.toml
│   │   ├── scripts/
│   │   │   ├── collect_trace_claude37.sh
│   │   │   ├── collect_trace_gpt41.sh
│   │   │   ├── collect_trace_gpt5.sh
│   │   │   └── collect_trace_qwen3.sh
│   │   └── utils/
│   │       ├── converters/
│   │       │   ├── __init__.py
│   │       │   ├── convert_non_oai_to_chatml.py
│   │       │   ├── convert_oai_to_chatml.py
│   │       │   ├── convert_to_chatml_auto_batch.py
│   │       │   ├── example_usage.py
│   │       │   └── system_prompts.py
│   │       ├── merge_chatml_msgs_to_one_json.py
│   │       └── process_logs.py
│   ├── gradio-demo/
│   │   ├── README.md
│   │   ├── main.py
│   │   ├── prompt_patch.py
│   │   ├── pyproject.toml
│   │   └── utils.py
│   ├── lobehub-compatibility/
│   │   ├── MiroThinkerToolParser.py
│   │   ├── README.md
│   │   ├── chat_template.jinja
│   │   ├── requirements.txt
│   │   ├── test_tool_parser.py
│   │   └── unit_test.py
│   ├── miroflow-agent/
│   │   ├── README.md
│   │   ├── benchmarks/
│   │   │   ├── __init__.py
│   │   │   ├── check_progress/
│   │   │   │   ├── check_progress_aime2025.py
│   │   │   │   ├── check_progress_browsecomp.py
│   │   │   │   ├── check_progress_browsecomp_zh.py
│   │   │   │   ├── check_progress_deepsearchqa.py
│   │   │   │   ├── check_progress_frames.py
│   │   │   │   ├── check_progress_gaia-validation-text-103.py
│   │   │   │   ├── check_progress_gaia-validation.py
│   │   │   │   ├── check_progress_hle-text-2158.py
│   │   │   │   ├── check_progress_hle-text-500.py
│   │   │   │   ├── check_progress_hle.py
│   │   │   │   ├── check_progress_seal-0.py
│   │   │   │   ├── check_progress_webwalkerqa.py
│   │   │   │   ├── check_progress_xbench_deepsearch.py
│   │   │   │   └── common.py
│   │   │   ├── common_benchmark.py
│   │   │   ├── evaluators/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── calculate_average_score.py
│   │   │   │   ├── eval_utils.py
│   │   │   │   └── extract_futurex_results.py
│   │   │   └── subset_extraction/
│   │   │       ├── gaia-text-103-grader.py
│   │   │       └── gaia-to-text-103-mover.py
│   │   ├── conf/
│   │   │   ├── __init__.py
│   │   │   ├── agent/
│   │   │   │   ├── default.yaml
│   │   │   │   ├── demo.yaml
│   │   │   │   ├── mirothinker_1.7_keep5_max200.yaml
│   │   │   │   ├── mirothinker_1.7_keep5_max300.yaml
│   │   │   │   ├── mirothinker_v1.0.yaml
│   │   │   │   ├── mirothinker_v1.0_keep5.yaml
│   │   │   │   ├── mirothinker_v1.5.yaml
│   │   │   │   ├── mirothinker_v1.5_keep5_max200.yaml
│   │   │   │   ├── mirothinker_v1.5_keep5_max400.yaml
│   │   │   │   ├── multi_agent.yaml
│   │   │   │   ├── multi_agent_os.yaml
│   │   │   │   ├── single_agent.yaml
│   │   │   │   └── single_agent_keep5.yaml
│   │   │   ├── benchmark/
│   │   │   │   ├── aime2025.yaml
│   │   │   │   ├── browsecomp.yaml
│   │   │   │   ├── browsecomp_zh.yaml
│   │   │   │   ├── collect_trace.yaml
│   │   │   │   ├── debug.yaml
│   │   │   │   ├── deepsearchqa.yaml
│   │   │   │   ├── default.yaml
│   │   │   │   ├── frames.yaml
│   │   │   │   ├── futurex.yaml
│   │   │   │   ├── gaia-validation-text-103.yaml
│   │   │   │   ├── gaia-validation.yaml
│   │   │   │   ├── hle-text-2158.yaml
│   │   │   │   ├── hle-text-500.yaml
│   │   │   │   ├── hle.yaml
│   │   │   │   ├── seal-0.yaml
│   │   │   │   ├── webwalkerqa.yaml
│   │   │   │   └── xbench_deepsearch.yaml
│   │   │   ├── config.yaml
│   │   │   └── llm/
│   │   │       ├── claude-3-7.yaml
│   │   │       ├── default.yaml
│   │   │       ├── gpt-5.yaml
│   │   │       └── qwen-3.yaml
│   │   ├── main.py
│   │   ├── pyproject.toml
│   │   ├── scripts/
│   │   │   ├── run_evaluate_multiple_runs_aime2025.sh
│   │   │   ├── run_evaluate_multiple_runs_browsecomp.sh
│   │   │   ├── run_evaluate_multiple_runs_browsecomp_zh.sh
│   │   │   ├── run_evaluate_multiple_runs_debug.sh
│   │   │   ├── run_evaluate_multiple_runs_deepsearchqa.sh
│   │   │   ├── run_evaluate_multiple_runs_frames.sh
│   │   │   ├── run_evaluate_multiple_runs_futurex.sh
│   │   │   ├── run_evaluate_multiple_runs_gaia-validation-text-103.sh
│   │   │   ├── run_evaluate_multiple_runs_gaia-validation.sh
│   │   │   ├── run_evaluate_multiple_runs_hle-text-2158.sh
│   │   │   ├── run_evaluate_multiple_runs_hle-text-500.sh
│   │   │   ├── run_evaluate_multiple_runs_hle.sh
│   │   │   ├── run_evaluate_multiple_runs_seal-0.sh
│   │   │   ├── run_evaluate_multiple_runs_webwalkerqa.sh
│   │   │   └── run_evaluate_multiple_runs_xbench_deepsearch.sh
│   │   └── src/
│   │       ├── __init__.py
│   │       ├── config/
│   │       │   ├── __init__.py
│   │       │   └── settings.py
│   │       ├── core/
│   │       │   ├── __init__.py
│   │       │   ├── answer_generator.py
│   │       │   ├── orchestrator.py
│   │       │   ├── pipeline.py
│   │       │   ├── stream_handler.py
│   │       │   └── tool_executor.py
│   │       ├── io/
│   │       │   ├── __init__.py
│   │       │   ├── input_handler.py
│   │       │   └── output_formatter.py
│   │       ├── llm/
│   │       │   ├── __init__.py
│   │       │   ├── base_client.py
│   │       │   ├── factory.py
│   │       │   ├── providers/
│   │       │   │   ├── __init__.py
│   │       │   │   ├── anthropic_client.py
│   │       │   │   └── openai_client.py
│   │       │   └── util.py
│   │       ├── logging/
│   │       │   ├── __init__.py
│   │       │   ├── summary_time_cost.py
│   │       │   └── task_logger.py
│   │       └── utils/
│   │           ├── __init__.py
│   │           ├── parsing_utils.py
│   │           ├── prompt_utils.py
│   │           └── wrapper_utils.py
│   └── visualize-trace/
│       ├── .python-version
│       ├── README.md
│       ├── app.py
│       ├── pyproject.toml
│       ├── requirements.txt
│       ├── run.py
│       ├── static/
│       │   ├── css/
│       │   │   └── style.css
│       │   └── js/
│       │       └── script.js
│       ├── templates/
│       │   └── index.html
│       └── trace_analyzer.py
├── assets/
│   ├── LOCAL-TOOL-DEPLOYMENT.md
│   ├── QA.md
│   └── qwen3_nonthinking.jinja
├── justfile
└── libs/
    └── miroflow-tools/
        ├── README.md
        ├── pyproject.toml
        └── src/
            ├── __init__.py
            └── miroflow_tools/
                ├── __init__.py
                ├── dev_mcp_servers/
                │   ├── jina_scrape_llm_summary.py
                │   ├── search_and_scrape_webpage.py
                │   ├── stateless_python_server.py
                │   └── task_planner.py
                ├── manager.py
                └── mcp_servers/
                    ├── __init__.py
                    ├── audio_mcp_server.py
                    ├── audio_mcp_server_os.py
                    ├── browser_session.py
                    ├── python_mcp_server.py
                    ├── reading_mcp_server.py
                    ├── reasoning_mcp_server.py
                    ├── reasoning_mcp_server_os.py
                    ├── searching_google_mcp_server.py
                    ├── searching_sogou_mcp_server.py
                    ├── serper_mcp_server.py
                    ├── utils/
                    │   ├── __init__.py
                    │   └── url_unquote.py
                    ├── vision_mcp_server.py
                    └── vision_mcp_server_os.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/run-ruff.yml
================================================
# Lint workflow: runs ruff's static checks and a formatting check on every
# pull request that targets the main branch.
name: lint

on:
  pull_request:
    branches: [ "main" ]

jobs:
  lint:
    # Skip the job when the workflow runs in a repository that is not owned
    # by MiroMindAI.
    if: github.repository_owner == 'MiroMindAI'
    name: lint pull request
    runs-on: ubuntu-latest
    steps:
    - name: checkout code
      uses: actions/checkout@v4

    - name: Install uv
      uses: astral-sh/setup-uv@v5

    # ruff is pinned to 0.8.0 here and in the format step below so both steps
    # use the same tool version.
    - name: Check static error
      run: |
        uv tool run ruff@0.8.0 check --show-fixes --output-format=github

    # Despite the step name, nothing is rewritten: `ruff format --diff` only
    # prints the changes it would make and exits non-zero when there are any.
    # The diff is echoed to the job log and, on failure, also appended to the
    # step summary inside a fenced diff block before the step exits with 1.
    - name: Reformat code style
      run: |
        echo '## Reformat summary' >> "$GITHUB_STEP_SUMMARY"
        if diff_output="$(uv tool run ruff@0.8.0 format --diff 2>&1)"; then
          echo "$diff_output"
          echo '✅ Format check passed.' >> "$GITHUB_STEP_SUMMARY"
        else
          echo "$diff_output"
          echo '❌ Format issues detected.' >> "$GITHUB_STEP_SUMMARY"
          {
            echo '```diff'
            echo "$diff_output"
            echo '```'
          } >> "$GITHUB_STEP_SUMMARY"
          exit 1
        fi

================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# UV
#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#uv.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
#poetry.toml

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
#pdm.lock
#pdm.toml
.pdm-python
.pdm-build/

# pixi
#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
#pixi.lock
#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
#   in the .venv directory. It is recommended not to include this directory in version control.
.pixi

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/

# Visual Studio Code
#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 
#  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
#  and can be added to the global gitignore or merged into this file. However, if you prefer, 
#  you could uncomment the following to ignore the entire vscode folder
# .vscode/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

# Cursor
#  Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
#  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
#  refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore

# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/


# -- ADDED --
# Log files
logs/

# Data directory - ignored entirely (note: this pattern also ignores any README inside data/)
data/


.idea/

.DS_Store

apps/collect-trace/scripts/*/*.sh

================================================
FILE: LICENSE
================================================

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

================================================
FILE: README.md
================================================
<div align="center">
  <img src="assets/mirothinker_logo.png" width="55%" alt="MiroThinker" />
</div>

<br>

<div align="center">

[![MODEL](https://img.shields.io/badge/Model-FFD21E?style=for-the-badge&logo=huggingface&logoColor=white)](https://huggingface.co/collections/miromind-ai/mirothinker-17)
[![Blog](https://img.shields.io/badge/Blog-4285F4?style=for-the-badge&logo=google-chrome&logoColor=white)](https://miromind.ai/#blog)
[![DATA](https://img.shields.io/badge/Data-0040A1?style=for-the-badge&logo=huggingface&logoColor=ffffff&labelColor)](https://huggingface.co/datasets/miromind-ai/MiroVerse-v0.1)

[![GITHUB](https://img.shields.io/badge/Github-24292F?style=for-the-badge&logo=github&logoColor=white)](https://github.com/MiroMindAI)
[![WEBSITE](https://img.shields.io/badge/Website-4285F4?style=for-the-badge&logo=google-chrome&logoColor=white)](https://miromind.ai/)
[![DISCORD](https://img.shields.io/badge/Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white)](https://discord.com/invite/GPqEnkzQZd)

</div>

<div align="center">

### 🚀 [Try MiroThinker!](https://dr.miromind.ai/)

</div>

**MiroThinker**: A deep research agent optimized for research and prediction. It achieves a score of 88.2 on the challenging BrowseComp benchmark. See [Quick Start](#-quick-start).


## 📋 Table of Contents

- 📰 [News & Updates](#-news--updates)
- 📝 [Introduction](#-introduction)
- ✨ [Key Features](#-key-features)
- 📈 [Performance on Benchmarks](#-performance-on-benchmarks)
- 🚀 [Quick Start](#-quick-start)
- 📊 [Benchmark Evaluation](#-benchmark-evaluation)
- 🔬 [Trace Collection](#-trace-collection)
- ❓ [FAQ & Troubleshooting](#-faq--troubleshooting)
- 📄 [License](#-license)
- 🙏 [Acknowledgments](#-acknowledgments)

## 📰 News & Updates
- **[2026-03-11]** 🎉🎉🎉 Introducing [MiroThinker-1.7](https://huggingface.co/collections/miromind-ai/mirothinker-17), including [MiroThinker-1.7-mini](https://huggingface.co/miromind-ai/MiroThinker-1.7-mini) and [MiroThinker-1.7](https://huggingface.co/miromind-ai/MiroThinker-1.7). MiroThinker-1.7-mini achieves 72.3 on BrowseComp-ZH, setting a new SOTA among open-source models while using only 30B parameters. Our proprietary agent MiroThinker-H1 achieves leading performance on BrowseComp and BrowseComp-ZH among open-source and commercial models.
- **\[2026-01-23\]** 🎉 We have brought two important updates to [MiroThinker online](http://dr.miromind.ai): (a) Core Research Report Generation: Deep Research online reports now support generation, preview, and sharing. (b) Extended Document Upload Types: Now supports the upload of various file formats, such as `.pdf`, `.doc`, `.ppt`, `.xls`, and `.jpg`. Welcome to try it out! MiroThinker will continue to be maintained and iteratively upgraded, with the goal of becoming the best Research Agent you'll ever use!
- **\[2026-01-05\]** 🎉🎉 We release [MiroThinker-v1.5](https://huggingface.co/collections/miromind-ai/mirothinker-v15), a series of open-source deep research agents optimized for financial prediction. [MiroThinker-v1.5-30B](https://huggingface.co/miromind-ai/MiroThinker-v1.5-30B) surpasses Kimi-K2-Thinking on BrowseComp-ZH at much lower cost, using only 1/30 of the parameters. [MiroThinker-v1.5-235B](https://huggingface.co/miromind-ai/MiroThinker-v1.5-235B) scores 39.2% on HLE-Text, 69.8% on BrowseComp, 71.5% on BrowseComp-ZH, and 80.8% on GAIA-Val-165, setting a new state-of-the-art among search agents.


<details>
  <summary>📜 Click to expand older updates</summary>

- **\[2025-11-13\]** 🎉 [MiroThinker-v1.0](https://huggingface.co/collections/miromind-ai/mirothinker-v10) is now released! Introducing **interactive scaling** as a third dimension of performance improvement, MiroThinker v1.0 supports 256K context window and up to 600 tool calls per task. Available in 8B, 30B, and 72B parameter scales, achieving 37.7%, 47.1%, 55.6%, and 81.9% on HLE-Text, BrowseComp, BrowseComp-ZH, and GAIA-Text-103, respectively. See [Technical Report](https://arxiv.org/abs/2511.11793) for more details.
- **\[2025-09-11\]** MiroThinker-72B-Preview ranked 4th in this week's FutureX benchmark. See [FutureX](https://futurex-ai.github.io/).
- **\[2025-09-08\]** [MiroThinker-v0.2](https://huggingface.co/collections/miromind-ai/mirothinker-v02) is now released, achieving open-source SOTA performance across multiple benchmarks, including HLE (17.8%), HLE-Text-Only (19.1%), BrowseComp-EN (17.2%), BrowseComp-ZH (29.4%), XBench-DeepSearch (56.0%), and Frames (74.8%).
- **\[2025-09-07\]** We supported more benchmarks, including [BrowseComp-ZH](https://arxiv.org/abs/2504.19314), [XBench-DeepSearch](https://xbench.org/agi/aisearch), and [FutureX](https://futurex-ai.github.io/). We plan to add more benchmarks in the future.
- **\[2025-08-22\]** Introducing streamlined deployment options for MiroThinker with optimized resource usage and faster startup times. Experience the interactive demo: [🚀 Try Gradio Demo](apps/gradio-demo)
- **\[2025-08-08\]** [MiroThinker-v0.1](https://huggingface.co/collections/miromind-ai/mirothinker-v01-689301b6d0563321862d44a1) released.

</details>

## 📝 Introduction

### MiroThinker-1.7
Our new MiroThinker family represents a significant leap in building reliable agents for long-chain tasks. Engineered with an enhanced post-training pipeline, the MiroThinker-1.7 family achieves SOTA performance in deep research tasks among open-source models.


**Key Features**

- 🚀 MiroThinker-1.7 supports a 256K context window, long-horizon reasoning, and deep multi-step analysis.
- 🔧 Handles up to 300 tool interactions per task, now with more accurate stepwise reasoning and decision-making.
- 📦 Released in 30B and 235B parameter scales, accompanied by a comprehensive suite of tools and workflows to flexibly support diverse research settings and compute budgets.
- Our proprietary agent, MiroThinker-H1, provides promising evidence for long-chain verifiable reasoning — reasoning processes that are step-verifiable and globally verifiable, improving the performance of complex agentic workflows.

<div align="center">

|      Model Name       |         Parameters            | Max Context | Max Tool Calls |                              HF Link                               |
|:---------------------:|:-----------------------------:|:-----------:|:--------------:|:------------------------------------------------------------------:|
| MiroThinker-1.7-mini  | 30B   |    256K     |      300       | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-1.7-mini) |
| MiroThinker-1.7 | 235B |    256K     |      300       | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-1.7) |

</div>

MiroThinker-1.7 demonstrates strong general-research performance across a broad range of benchmarks, achieving 74.0%, 75.3%, 82.7%, and 42.9% on BrowseComp, BrowseComp-ZH, GAIA-Val-165, and HLE-Text, respectively. MiroThinker-1.7 achieves SOTA performance on BrowseComp-ZH.

![image](/assets/1.7_main_results.png)


### MiroThinker-v1.5

<details>
  <summary>📦 Click to expand MiroThinker-v1.5 details</summary>

MiroThinker v1.5 is the world-leading open-source search agent that advances tool-augmented reasoning through **interactive scaling** — training the agent to handle deeper and more frequent agent-environment interactions as a third dimension of performance improvement, beyond model size and context length.

![image](https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/mirothinker_v1.5_framework.png)

**Key Features**

- 🚀 MiroThinker v1.5 supports a 256K context window, long-horizon reasoning, and deep multi-step analysis.
- 🔧 Handles up to 400 tool calls per task — a substantial improvement over previous open-source research agents.
- 📦 Released in 30B and 235B parameter scales, accompanied by a comprehensive suite of tools and workflows to flexibly support diverse research settings and compute budgets.

<div align="center">

|      Agent Name       |         Base Agent            | Max Context | Max Tool Calls |                              HF Link                               |
|:---------------------:|:-----------------------------:|:-----------:|:--------------:|:------------------------------------------------------------------:|
| MiroThinker-v1.5-30B  | Qwen3-30B-A3B-Thinking-2507   |    256K     |      400       | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-v1.5-30B) |
| MiroThinker-v1.5-235B | Qwen3-235B-A22B-Thinking-2507 |    256K     |      400       | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-v1.5-235B) |

</div>

MiroThinker v1.5 demonstrates strong general-research performance across a broad range of benchmarks, achieving 39.2%, 69.8%, 71.5%, and 80.8% on HLE-Text, BrowseComp, BrowseComp-ZH, and GAIA-Val-165, respectively. These results surpass previous open-source agents and set a new world-leading result on BrowseComp.

![image](https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/mirothinker_v1.5_browsecomp.png)

</details>

### MiroThinker-v1.0

<details>
  <summary>📦 Click to expand MiroThinker-v1.0 details</summary>

Unlike previous agents that scale only model size or context length, MiroThinker v1.0 introduces **interactive scaling** at the agent level, systematically training the agent to handle deeper and more frequent agent–environment interactions as a third dimension of performance improvement. Interactive scaling leverages environment feedback and external information acquisition to correct errors and refine trajectories.

![image](https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/MiroThinker_v1.0_Overall.png)

### ✨ Key Features

- 🚀 **256K Context Window**: Supports long-horizon reasoning and deep multi-step analysis
- 🔧 **600 Tool Calls**: Handles up to 600 tool calls per task — a substantial improvement over previous open-source research agents
- 📦 **Multiple Scales**: Released in 8B, 30B, and 72B parameter scales, accompanied by a comprehensive suite of tools and workflows to flexibly support diverse research settings and compute budgets

<div align="center">

|      Agent Name      |         Base Agent          | Max Context | Max Tool Calls |                              HF Link                               |
|:--------------------:|:---------------------------:|:-----------:|:--------------:|:------------------------------------------------------------------:|
| MiroThinker-v1.0-8B  |        Qwen3-8B             |    256K     |      600       | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-v1.0-8B)  |
| MiroThinker-v1.0-30B | Qwen3-30B-A3B-Thinking-2507 |    256K    |      600       | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-v1.0-30B) |
| MiroThinker-v1.0-72B |    Qwen2.5-72B-Instruct     |    256K    |      600       | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-v1.0-72B) |

</div>

MiroThinker v1.0 demonstrates strong general-research performance across a broad range of benchmarks, achieving **37.7%**, **47.1%**, **55.6%**, and **81.9%** on HLE-Text, BrowseComp, BrowseComp-ZH, and GAIA-Text-103, respectively. These results surpass previous open-source agents and narrow the gap with commercial counterparts such as **GPT-5-high**.

<div align="center">
  <img src="https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/MiroThinker_v1.0_Performance_1.png" width="100%" alt="MiroThinker" />
</div>

</details>

### MiroThinker-v0.2

<details>
  <summary>📦 Click to expand MiroThinker-v0.2 details</summary>

In this new version, we introduced three key improvements:

- 📚 **Richer training data** from both English and Chinese sources, yielding significant gains in benchmark performance and generalization
- 🎯 **Unified DPO training** with a single preference dataset across all agents
- 📏 **Extended context length** from 40k to 64k for more challenging multi-turn tool-use tasks

Compared to v0.1, MiroThinker v0.2 delivers consistent gains across benchmarks. For example, scores improved from **57.3 → 64.1** on **GAIA-Text-103** and from **17.0 → 29.4** on **BrowseComp-ZH**, reflecting substantial advancements in the model’s general research agent capabilities.

<div align="center">

|        Agent Name        |      Base Agent       | Max Context |                                HF Link                                 |
|:------------------------:|:---------------------:|:-----------:|:----------------------------------------------------------------------:|
| MiroThinker-4B-SFT-v0.2  |       Qwen3-4B        |    64K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-4B-SFT-v0.2)  |
| MiroThinker-4B-DPO-v0.2  |       Qwen3-4B        |    64K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-4B-DPO-v0.2)  |
| MiroThinker-8B-SFT-v0.2  |       Qwen3-8B        |    64K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-8B-SFT-v0.2)  |
| MiroThinker-8B-DPO-v0.2  |       Qwen3-8B        |    64K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-8B-DPO-v0.2)  |
| MiroThinker-14B-SFT-v0.2 |       Qwen3-14B       |    64K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-14B-SFT-v0.2) |
| MiroThinker-14B-DPO-v0.2 |       Qwen3-14B       |    64K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-14B-DPO-v0.2) |
| MiroThinker-32B-SFT-v0.2 |       Qwen3-32B       |    64K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-32B-SFT-v0.2) |
| MiroThinker-32B-DPO-v0.2 |       Qwen3-32B       |    64K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-32B-DPO-v0.2) |

</div>

</details>

### MiroThinker-v0.1

<details>
  <summary>📦 Click to expand MiroThinker-v0.1 details</summary>

<div align="center">
  <img src="assets/gaia_text_103.png" width="98%" alt="MiroFlow Performance on GAIA-Validation" />
  <p><strong>Performance of Open-Source Agents on GAIA-Validation Benchmark.</strong></p>
</div>

We have released the **MiroThinker v0.1** series, including both SFT and DPO variants at parameter scales of **8B**, **14B**, and **32B**. Notably, MiroThinker v0.1 achieves **state-of-the-art performance** among open-source models on the [GAIA benchmark](https://huggingface.co/datasets/gaia-benchmark/GAIA), a rigorous evaluation suite for advanced agentic capabilities, demonstrating its strength in long-context, decision-intensive, and real-world task scenarios.

<div align="center">

| Agent Name                | Base Agent | Max Context | HF Link                                                               |
| :-----------------------: |:----------:|:-----------:| :--------------------------------------------------------------------:|
| MiroThinker-8B-SFT-v0.1   |  Qwen3-8B  |    40K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-8B-SFT-v0.1)  |
| MiroThinker-8B-DPO-v0.1   |  Qwen3-8B  |    40K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-8B-DPO-v0.1)  |
| MiroThinker-14B-SFT-v0.1  | Qwen3-14B  |    40K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-14B-SFT-v0.1) |
| MiroThinker-14B-DPO-v0.1  | Qwen3-14B  |    40K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-14B-DPO-v0.1) |
| MiroThinker-32B-SFT-v0.1  | Qwen3-32B  |    40K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-32B-SFT-v0.1) |
| MiroThinker-32B-DPO-v0.1  | Qwen3-32B  |    40K     | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-32B-DPO-v0.1) |

</div>

</details>

## ✨ Key Features

### 🤖 **MiroThinker-Optimized Framework**

- 🔓 **Fully Open-Source Agent Framework**: Complete transparency with open framework and open agents
- 🔗 **Tool Integration**: Seamless integration with external tools and APIs
- 📝 **Trace Collection**: Comprehensive logging and analysis of agent interactions with elapsed time and estimated completion time displayed in minutes. Ready for SFT and DPO
- 📊 **Benchmark Evaluation**: Extensive testing across multiple benchmark datasets

### 📊 **Comprehensive Benchmark Suite**

<details open>
  <summary>📋 Click to expand benchmark list</summary>

- **GAIA Validation**: A benchmark for General AI Assistants. ([paper](https://arxiv.org/abs/2311.12983))
- **GAIA-Text-103**: A subset of GAIA Validation for text-only tasks. ([paper](https://arxiv.org/abs/2505.22648))
- **HLE**: Humanity's Last Exam. ([paper](https://arxiv.org/abs/2501.14249))
- **HLE-Text-2158**: A subset of HLE for text-only tasks. ([paper](https://arxiv.org/abs/2501.14249))
- **HLE-Text-500**: A subset of HLE for text-only tasks, created by [WebThinker](https://arxiv.org/pdf/2504.21776). ([paper](https://arxiv.org/pdf/2504.21776))
- **BrowseComp-EN**: Web browsing and comprehension tasks. ([paper](https://arxiv.org/abs/2504.12516))
- **BrowseComp-ZH**: A Chinese version of BrowseComp. ([paper](https://arxiv.org/abs/2504.19314))
- **WebWalkerQA**: Web navigation and question answering. ([paper](https://arxiv.org/abs/2501.07572))
- **Frames**: Factuality, Retrieval, And reasoning MEasurement Set. ([paper](https://arxiv.org/abs/2409.12941))
- **XBench-DeepSearch**: A benchmark for deep research agents. ([website](https://xbench.org/agi/aisearch))
- **FutureX**: A live benchmark designed for predicting unknown future. ([website](https://futurex-ai.github.io/))
- **SEAL-0**: A benchmark for evaluating LLMs on conflicting-evidence web questions. ([paper](https://arxiv.org/abs/2506.01062))
- **AIME2025**: American Invitational Mathematics Examination 2025. ([website](https://artificialanalysis.ai/evaluations/aime-2025))
- **DeepSearchQA**: Google's Deep Search Question Answering benchmark. ([paper](https://arxiv.org/abs/2505.20827))

</details>

## 📈 Performance on Benchmarks

### MiroThinker-1.7

> To prevent potential information leakage (e.g., retrieving benchmark answers from HuggingFace), we blocked access to certain websites during evaluation.

<div>
  <img src="assets/17_table.png" width="100%" alt="MiroThinker" />
</div>


### MiroThinker-v1.5

<details>
  <summary>📦 Click to expand MiroThinker-v1.5 details</summary>

> To prevent potential information leakage (e.g., searching benchmark answers from HuggingFace), access to HuggingFace has been explicitly disabled in these tools.

> We further perform canary string testing on the tool outputs of all trajectories and disregard any trajectory found to be contaminated, treating it as an incorrect answer.

<div>
  <img src="https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/mirothinker_v1.5_performance.png" width="100%" alt="MiroThinker" />
</div>

</details>

### MiroThinker-v1.0

<details>
  <summary>📦 Click to expand MiroThinker-v1.0 details</summary>

<div align="center">
  <img src="https://github.com/user-attachments/assets/108a2105-4e1d-499e-a001-4713a03fd8ac" width="100%" alt="MiroThinker" />
</div>

</details>

### MiroThinker-v0.2

<details>
  <summary>📦 Click to expand MiroThinker-v0.2 details</summary>

#### Comparison with SOTA Research Agents

<div align="center">
  <img src="https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/MiroThinker_v0.2_Performance_2.png" width="90%" alt="MiroThinker" />
</div>

#### GAIA Benchmark

<div align="center">
  <img src="https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/MiroThinker_v0.2_Performance_1.png" width="80%" alt="MiroThinker" />
</div>

</details>

### MiroThinker-v0.1

<details>
  <summary>📦 Click to expand MiroThinker-v0.1 details</summary>

#### GAIA Benchmark

<div align="center">

| **Method**                   | Text-103<br>Best Pass@1 | Text-103<br>Pass@1 (Avg@8) | Val-165<br>Best Pass@1 | Val-165<br>Pass@1 (Avg@8) |
|------------------------------|:-----------------------:|:--------------------------:|:----------------------:|:-------------------------:|
| **🔹—— 7B/8B Agents ——**     |                         |                            |                        |                           |
| Search-o1-7B                 |          17.5           |             -              |           -            |             -             |
| R1-Searcher-7B               |          20.4           |             -              |           -            |             -             |
| WebDancer-7B                 |          31.0           |             -              |           -            |             -             |
| WebSailor-7B                 |          37.9           |             -              |           -            |             -             |
| CK-Pro-8B                    |          40.3           |             -              |          32.7          |             -             |
| **MiroThinker-8B-SFT-v0.1**  |          44.7           |            40.1            |          34.6          |           31.8            |
|     + Commercial Tools       |          46.6           |            42.1            |          37.6          |           33.9            |
| **MiroThinker-8B-DPO-v0.1**  |          46.6           |            44.8            |          37.0          |           35.4            |
|     + Commercial Tools       |        **50.5**         |          **46.7**          |        **38.2**        |         **35.9**          |
| **🔹—— 14B Agents ——**       |                         |                            |                        |                           |
| **MiroThinker-14B-SFT-v0.1** |          47.6           |            44.4            |          37.0          |           34.4            |
|     + Commercial Tools       |          49.5           |            47.5            |          41.8          |           39.8            |
| **MiroThinker-14B-DPO-v0.1** |          48.5           |            46.6            |          42.4          |           39.2            |
|     + Commercial Tools       |        **52.4**         |          **48.5**          |        **45.5**        |         **42.0**          |
| **🔹—— 32B Agents ——**       |                         |                            |                        |                           |
| Qwen3-32B                    |          31.1           |            26.7            |          29.7          |           26.4            |
| Search-o1-32B                |          28.2           |             -              |           -            |             -             |
| WebThinker-32B-RL            |          48.5           |             -              |           -            |             -             |
| WebDancer-QwQ-32B            |          51.5           |             -              |           -            |             -             |
| WebSailor-32B                |          53.2           |             -              |           -            |             -             |
| WebShaper-QwQ-32B            |          53.3           |             -              |           -            |             -             |
| **MiroThinker-32B-SFT-v0.1** |          55.3           |            51.3            |          44.9          |           42.7            |
|     + Commercial Tools       |          58.3           |            54.2            |          48.5          |           45.8            |
| **MiroThinker-32B-DPO-v0.1** |          57.3           |            54.1            |          48.5          |           45.9            |
|     + Commercial Tools       |        **60.2**         |          **57.9**          |        **50.9**        |         **48.9**          |

</div>

1. Following the practices of WebThinker, WebAgents, and CognitiveKernel, we report the Best Pass@1, the highest score across three runs, which often reflects stronger performance, though it may exhibit some variability. To provide a more stable measure, we additionally report Pass@1 (Avg@8), which offers greater consistency at the cost of slightly lower scores.

1. For consistency with prior open-source works, we evaluate GAIA-Text-103 using the WebAgents LLM-as-a-Judge template, and report results on GAIA-Val-165 using the official GAIA scorer script.

1. By default, we use open-source tools wherever possible, except for the code tool [E2B](https://github.com/e2b-dev/E2B) and the Google search tool [Serper](https://serper.dev/). We use [Whisper](https://huggingface.co/openai/whisper-large-v3-turbo), [Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct), and [Qwen3-235B-A22B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507) in our implementation. The framework can be easily extended to other open-source tools of your choice.

1. Replacing these open-source tools with commercial alternatives can yield performance gains. Commercial tools were mainly used for multimodal capabilities and certain complex reasoning subtasks. The majority of tasks, including planning, browsing, refinement, navigation, and more, were handled by our agents.

#### More Benchmarks

<div align="center">

| Method                       | HLE<br>Pass@1 | Frames<br>Pass@1 | BrowseComp<br>Pass@1 | BrowseComp-ZH<br>Pass@1 | WebWalkerQA<br>Pass@1 |
|------------------------------|:-------------:|:----------------:|:--------------------:|:-----------------------:|:---------------------:|
| OpenAI Deep Research         |     26.6      |        -         |         51.5         |          42.9           |           -           |
| Gemini Deep Research         |     26.9      |        -         |          -           |            -            |           -           |
| Kimi-Researcher              |     26.9      |       78.8       |          -           |            -            |           -           |
|                              |               |                  |                      |                         |                       |
| WebDancer-7B                 |       -       |        -         |          -           |            -            |         36.0          |
| WebSailor-7B                 |       -       |        -         |         6.7          |          14.2           |           -           |
| **MiroThinker-8B-SFT-v0.1**  |       -       |       58.0       |         5.5          |           9.3           |         41.3          |
| **MiroThinker-8B-DPO-v0.1**  |       -       |       64.4       |         8.7          |          13.6           |         45.7          |
|                              |               |                  |                      |                         |                       |
| WebThinker-32B-RL            |       -       |        -         |          -           |            -            |         46.5          |
| WebDancer-QwQ-32B            |       -       |        -         |         3.8          |          18.0           |         47.9          |
| WebSailor-32B                |       -       |        -         |         10.5         |          25.5           |           -           |
| WebShaper-32B                |       -       |        -         |          -           |            -            |         51.4          |
| **MiroThinker-32B-SFT-v0.1** |     10.2      |       70.4       |         10.6         |          13.8           |         45.7          |
| **MiroThinker-32B-DPO-v0.1** |     11.8      |       71.7       |         13.0         |          17.0           |         49.3          |

</div>

1. MiroThinker’s performance was tested with this repository and open-source tools; other agents’ results are from their papers and official sites.

1. As [MiroVerse-v0.1](https://huggingface.co/datasets/miromind-ai/MiroVerse-v0.1) mainly contains English data, the agent’s Chinese capability is limited. We plan to add more Chinese data to improve performance in the next version.

</details>

## 🚀 Quick Start

For optimal usage, we recommend using MiroThinker with this tool-enabled agent framework and thinking mode enabled.

### Prerequisites

- 🐍 **Python 3.10+**
- 📦 **uv package manager** ([Installation guide](https://github.com/astral-sh/uv))
- 🔑 **Required API keys** (see configuration section below)

### Installation

```bash
# Clone the repository
git clone https://github.com/MiroMindAI/MiroThinker
cd MiroThinker

# Setup environment
cd apps/miroflow-agent
uv sync

# Configure API keys
cp .env.example .env
# Edit .env with your API keys (SERPER_API_KEY, JINA_API_KEY, E2B_API_KEY, etc.)
```

> **📝 Environment Variables**: See [Tool Configuration](#tool-configuration) section for required API keys.

### Tool Configuration

#### Minimal Configuration for MiroThinker-1.7

| Server | Description | Tools Provided | Required Environment Variables |
|:-------|:------------|:---------------|:-------------------------------|
| **`tool-python`** | Execution environment and file management (E2B sandbox) | `create_sandbox`, `run_command`, `run_python_code`, `upload_file_from_local_to_sandbox`, `download_file_from_sandbox_to_local`, `download_file_from_internet_to_sandbox` | `E2B_API_KEY` |
| **`search_and_scrape_webpage`** | Google search via Serper API | `google_search` | `SERPER_API_KEY`, `SERPER_BASE_URL` |
| **`jina_scrape_llm_summary`** | Web scraping with LLM-based information extraction | `scrape_and_extract_info` | `JINA_API_KEY`, `JINA_BASE_URL`, `SUMMARY_LLM_BASE_URL`, `SUMMARY_LLM_MODEL_NAME`, `SUMMARY_LLM_API_KEY` |

**Minimal `.env` configuration example:**

```bash
# Required for MiroThinker 1.7, v1.5, and v1.0 (minimal setup)
SERPER_API_KEY=your_serper_key
SERPER_BASE_URL="https://google.serper.dev"
JINA_API_KEY=your_jina_key
JINA_BASE_URL="https://r.jina.ai"
E2B_API_KEY=your_e2b_key

# Required for jina_scrape_llm_summary
# Note: Summary LLM can be a small model (e.g., Qwen3-14B or GPT-5-Nano)
# The choice has minimal impact on performance, use what's most convenient
SUMMARY_LLM_BASE_URL="https://your_summary_llm_base_url/v1/chat/completions"
SUMMARY_LLM_MODEL_NAME=your_llm_model_name  # e.g., "Qwen/Qwen3-14B" or "gpt-5-nano"
SUMMARY_LLM_API_KEY=your_llm_api_key  # Optional, depends on LLM provider

# Required for benchmark evaluation (LLM-as-a-Judge)
OPENAI_API_KEY=your_openai_key  # Required for running benchmark evaluations
OPENAI_BASE_URL="https://api.openai.com/v1"  # Optional, defaults to OpenAI's API
```

> **💡 Why this is minimal**: These 3 MCP servers cover the core capabilities needed for research tasks: web search, content extraction, and code execution. All other servers are optional enhancements.
>
> **🤖 Summary LLM**: The `SUMMARY_LLM` can be a small model like Qwen3-14B or GPT-5-Nano. The choice has minimal impact on overall performance, so use whichever is most convenient for your setup.
>
> **📊 For Benchmark Evaluation**: If you plan to run benchmark evaluations, you also need `OPENAI_API_KEY` (and optionally `OPENAI_BASE_URL`) for LLM-as-a-Judge functionality used in evaluation scripts.
>
> **🖼️ For GAIA Multimodal Tasks**: GAIA-Val-165 includes tasks with image/audio/video files. Since MiroThinker is a text-only LLM, GPT-4o is used to pre-process these files into text descriptions. The same `OPENAI_API_KEY` is used for both this preprocessing and LLM-as-a-Judge.
>
> **📖 For more details**: See [MiroFlow Tools README](libs/miroflow-tools/README.md) for complete documentation of all available tools.

<details>
  <summary>🔧 Click to expand additional available tools</summary>

The following optional tools are available but were not used in MiroThinker v1.0-1.7 evaluation:

| Server Name          | Type         | Description                                 |
|:---------------------|:-------------|:--------------------------------------------|
| `tool-vqa`           | Commercial   | Vision processing using Claude              |
| `tool-vqa-os`        | Open-Source  | Vision processing (open-source alternative) |
| `tool-transcribe`    | Commercial   | Audio transcription using OpenAI            |
| `tool-transcribe-os` | Open-Source  | Audio transcription using Whisper           |
| `tool-reasoning`     | Commercial   | Reasoning engine using Claude               |
| `tool-reasoning-os`  | Open-Source  | Reasoning engine (open-source alternative)  |
| `tool-reading`       | Open-Source  | Document reading using MarkItDown           |
| `tool-google-search` | Commercial   | Web search using Google + scraping          |
| `tool-sogou-search` | Commercial   | Web search using Sogou (Chinese)           |

> **📖 Local Deployment**: For instructions on deploying open-source tools (`tool-vqa-os`, `tool-transcribe-os`, `tool-reasoning-os`) locally, see [Local Tool Deployment Guide](assets/LOCAL-TOOL-DEPLOYMENT.md).

See the [MiroFlow Tools README](libs/miroflow-tools/README.md) for complete documentation of all available tools.

</details>

#### Pre-configured Agent Settings

The `apps/miroflow-agent/conf/agent/` directory contains several pre-configured agent settings. Each configuration uses different tools and requires corresponding environment variables in your `.env` file.

> **💡 Recommended**: For MiroThinker-1.7, use `mirothinker_1.7_keep5_max200` (with context management, recommended for most tasks) or `mirothinker_1.7_keep5_max300` (only used for BrowseComp and BrowseComp-ZH).

| Configuration                          | Description | Max Turns | Context Retention | Required Environment Variables                                                                                                                               | Recommended For |
|:---------------------------------------|:------------|:----------|:------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------|
| **`mirothinker_1.7_keep5_max200`** ⭐  | Single-agent with context management | 200 | Keep 5 most recent | `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL`, `E2B_API_KEY`, `SUMMARY_LLM_BASE_URL`, `SUMMARY_LLM_MODEL_NAME`, `SUMMARY_LLM_API_KEY` | **1.7 (recommended for most tasks)** |
| **`mirothinker_1.7_keep5_max300`** ⭐  | Single-agent with context management | 300 | Keep 5 most recent | Same as above                                                                                                                              | **1.7 (for BrowseComp & BrowseComp-ZH)** |


<details>
  <summary>📦 Click to expand legacy configurations (v0.1–v1.5)</summary>

| Configuration            | Description | Max Turns | Context Retention | Required Environment Variables | Recommended For |
|:-------------------------|:------------|:----------|:------------------|:-------------------------------|:----------------|
| **`mirothinker_v1.5_keep5_max200`**  | Single-agent with context management | 200 | Keep 5 most recent | `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL`, `E2B_API_KEY`, `SUMMARY_LLM_BASE_URL`, `SUMMARY_LLM_MODEL_NAME`, `SUMMARY_LLM_API_KEY` | **v1.5 (recommended for most tasks)** |
| **`mirothinker_v1.5_keep5_max400`**  | Single-agent with context management | 400 | Keep 5 most recent | Same as above                                                                                                                              | **v1.5 (for BrowseComp & BrowseComp-ZH)** |
| **`mirothinker_v1.5`**                 | Single-agent for MiroThinker v1.5 | 600 | Keep all results | Same as above | **v1.5** |
| **`mirothinker_v1.0_keep5`**           | Single-agent with context management | 600 | Keep 5 most recent | Same as above                                                                                                                                   | **v1.0** |
| **`mirothinker_v1.0`**                 | Single-agent for MiroThinker v1.0 | 600 | Keep all results | Same as above | **v1.0** |
| **`multi_agent`**        | Multi-agent with commercial tools (v0.1/v0.2) | 50 | Keep all results | `E2B_API_KEY`, `ANTHROPIC_API_KEY`, `ANTHROPIC_BASE_URL`, `OPENAI_API_KEY`, `OPENAI_BASE_URL`, `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL` | v0.1/v0.2 |
| **`multi_agent_os`**     | Multi-agent with open-source tools (v0.1/v0.2) | 50 | Keep all results | `E2B_API_KEY`, `VISION_API_KEY`, `VISION_BASE_URL`, `VISION_MODEL_NAME`, `WHISPER_API_KEY`, `WHISPER_BASE_URL`, `WHISPER_MODEL_NAME`, `REASONING_API_KEY`, `REASONING_BASE_URL`, `REASONING_MODEL_NAME`, `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL` | v0.1/v0.2 |

</details>

> **💡 Note**: All environment variables are listed in `apps/miroflow-agent/.env.example`. Copy it to `.env` and fill in the values for the tools you plan to use.

#### Creating Custom Tool Configurations

<details>
  <summary>🔧 Click to expand custom tool configuration guide</summary>

You can create your own YAML configuration file to freely combine MCP servers. Here's how:

1. **Create a new YAML file** in `apps/miroflow-agent/conf/agent/`:

```yaml
# conf/agent/my_custom_config.yaml
defaults:
  - default
  - _self_

main_agent:
  tools:
    - tool-python                    # Execution environment
    - search_and_scrape_webpage      # Google search
    - jina_scrape_llm_summary        # Web scraping with LLM
    - tool-vqa                       # Vision processing (optional)
    - tool-transcribe                # Audio processing (optional)
    - tool-reasoning                 # Reasoning engine (optional)
    - tool-reading                   # Document reading (optional)
  max_turns: 300  # Maximum number of turns

sub_agents:
  agent-browsing:  # Optional sub-agent
    tools:
      - tool-google-search
      - tool-vqa
      - tool-reading
      - tool-python
    max_turns: 50

keep_tool_result: -1  # Context retention budget: -1 keeps all tool results, or specify K to keep only the K most recent tool responses
```

> **💡 Context Retention Strategy**: The `keep_tool_result` parameter implements a **recency-based context retention** strategy. In the standard ReAct paradigm, all tool outputs are retained in the message history, which can lead to inefficient context utilization. Empirically, we observe that the agent's subsequent actions depend primarily on recent observations rather than distant ones. This strategy retains only the most recent K tool responses (where K is the `keep_tool_result` value) while preserving the complete sequence of thoughts and actions.
>
> **Benefits:**
>
> - ✅ Preserves the reasoning and action trace
> - ✅ Focuses the agent's attention on the most contextually relevant observations
> - ✅ Frees additional context space for extended reasoning and deeper tool-use trajectories
> - ✅ Does not lead to performance degradation while allowing more context space for interactive scaling
>
> **Usage:** Set `keep_tool_result: -1` to keep all tool results, or specify a positive integer K (e.g., `keep_tool_result: 5`) to keep only the K most recent tool responses.

2. **Use your custom configuration** when running evaluations:

```bash
cd apps/miroflow-agent
uv run main.py llm=qwen-3 agent=my_custom_config llm.base_url=https://your_base_url/v1
```

3. **Configure environment variables** in `.env` based on the tools you use.

   All available environment variables are listed in `apps/miroflow-agent/.env.example`. Copy it to `.env` and configure the variables according to your chosen configuration:

   ```bash
   cd apps/miroflow-agent
   cp .env.example .env
   # Edit .env with your actual API keys
   ```

   **For MiroThinker v1.5** (`mirothinker_v1.5_keep5_max200.yaml`, `mirothinker_v1.5_keep5_max400.yaml`, or `mirothinker_v1.5.yaml`) and **v1.0** (`mirothinker_v1.0_keep5.yaml` or `mirothinker_v1.0.yaml`), see the [Minimal Configuration](#minimal-configuration-for-mirothinker-17) section above for the complete configuration example.

   **For other configurations**, refer to the [Pre-configured Agent Settings](#pre-configured-agent-settings) table above to see which environment variables are required.

</details>

<details>
  <summary>🔑 Click to expand optional API keys</summary>

```bash
# API for LLM-as-a-Judge (for benchmark testing, required for benchmark evaluation)
OPENAI_API_KEY=your_openai_key
OPENAI_BASE_URL="https://api.openai.com/v1"  # Optional, defaults to OpenAI's API

# API for Open-Source Audio Transcription Tool (for benchmark testing, optional)
WHISPER_MODEL_NAME="openai/whisper-large-v3-turbo"
WHISPER_API_KEY=your_whisper_key
WHISPER_BASE_URL="https://your_whisper_base_url/v1"

# API for Open-Source VQA Tool (for benchmark testing, optional)
VISION_MODEL_NAME="Qwen/Qwen2.5-VL-72B-Instruct"
VISION_API_KEY=your_vision_key
VISION_BASE_URL="https://your_vision_base_url/v1/chat/completions"

# API for Open-Source Reasoning Tool (for benchmark testing, optional)
REASONING_MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507"
REASONING_API_KEY=your_reasoning_key
REASONING_BASE_URL="https://your_reasoning_base_url/v1/chat/completions"

# API for Claude Sonnet 3.7 as Commercial Tools (optional)
ANTHROPIC_API_KEY=your_anthropic_key

# API for Sogou Search (optional)
TENCENTCLOUD_SECRET_ID=your_tencent_cloud_secret_id
TENCENTCLOUD_SECRET_KEY=your_tencent_cloud_secret_key

# API for Summary LLM (can use small models like Qwen3-14B or GPT-5-Nano)
SUMMARY_LLM_BASE_URL="https://your_summary_llm_base_url/v1/chat/completions"
SUMMARY_LLM_MODEL_NAME=your_summary_llm_model_name  # e.g., "Qwen/Qwen3-14B" or "gpt-5-nano"
SUMMARY_LLM_API_KEY=your_summary_llm_api_key
```

</details>

### Serve the MiroThinker Agent

#### Option 1 (Recommended): Serve with SGLang or vLLM

Use SGLang to serve MiroThinker models at port 61002:

```bash
NUM_GPUS=4
PORT=61002

# Downloading agent from HF 
AGENT_PATH=miromind-ai/MiroThinker-1.7-mini


python3 -m sglang.launch_server \
    --model-path $AGENT_PATH \
    --tp $NUM_GPUS \
    --dp 1 \
    --host 0.0.0.0 \
    --port $PORT \
    --trust-remote-code
```

> **📍 Server URL**: This will start a server at `http://0.0.0.0:$PORT`. Use this as your server base URL (e.g., `http://0.0.0.0:61002/v1`).

#### Option 2: Quantized Light-Weight Options

We also provide comprehensive guidance for serving MiroThinker agents using CPU-optimized and GPU-accelerated quantization techniques, along with detailed analysis and guidelines for deployment with llama.cpp, Ollama, SGLang, and other inference frameworks.

> **📖 Complete Guide**: See [Deployment Documentation](apps/gradio-demo/) for detailed deployment instructions.

### Run Your First Task

After setting up the environment and starting your server, run `main.py` to test with a default question: *"What is the title of today's arxiv paper in computer science?"*

```bash
cd apps/miroflow-agent

# Using MiroThinker agents (requires your own server)
uv run python main.py llm=qwen-3 agent=mirothinker_1.7_keep5_max200 llm.base_url=http://localhost:61002/v1

# Or using Claude (requires ANTHROPIC_API_KEY in .env)
uv run python main.py llm=claude-3-7 agent=single_agent_keep5

# Or using GPT-5 (requires OPENAI_API_KEY in .env)
uv run python main.py llm=gpt-5 agent=single_agent_keep5
```

**To customize your question**, edit `main.py` line 32:

```python
task_description = "Your custom question here"
```

The agent will search the web, execute code if needed, and provide an answer with sources.

> **📖 More details**: See [apps/miroflow-agent/README.md](apps/miroflow-agent/README.md) for available configurations and troubleshooting.

## 📊 Benchmark Evaluation

> For researchers who want to reproduce our benchmark results or evaluate on standard benchmarks.

### Download Benchmark Data

```bash
cd MiroThinker  # Back to project root
wget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/data_20251115_password_protected.zip
unzip data_20251115_password_protected.zip
# Password: pf4*
rm data_20251115_password_protected.zip
```

### Run Benchmark Evaluation

> **Note:** For MiroThinker-1.7, use `mirothinker_1.7_keep5_max200` or `mirothinker_1.7_keep5_max300` (both with context management).

**Available Parameters:**

You can customize the evaluation by setting the following environment variables before running the script:

| Parameter | Default | Description |
|:----------|:--------|:------------|
| `LLM_MODEL` | `"MiroThinker-Agents"` | Agent name identifier |
| `BASE_URL` | `"https://your-api.com/v1"` | Base URL of your server |
| `NUM_RUNS` | Varies by benchmark | Number of evaluation runs (3 for most benchmarks, 8 for GAIA/XBench/FutureX/SEAL-0, 32 for AIME2025) |
| `LLM_PROVIDER` | `"qwen"` | LLM provider (e.g., `qwen`, `openai`, `anthropic`) |
| `AGENT_SET` | `"mirothinker_1.7_keep5_max200"` | Agent configuration (e.g., `mirothinker_1.7_keep5_max200`, `mirothinker_1.7_keep5_max300`) |
| `MAX_CONTEXT_LENGTH` | `262144` | Maximum context length (256K) |
| `MAX_CONCURRENT` | `10` | Maximum concurrent tasks |
| `PASS_AT_K` | `1` | Pass@K evaluation metric |
| `TEMPERATURE` | `1.0` | Sampling temperature |
| `API_KEY` | `"xxx"` | API key for the server |

**Example Usage:**

```bash
# Navigate to the miroflow-agent directory first
cd apps/miroflow-agent

# Basic usage with MiroThinker-1.7 (recommended)
NUM_RUNS=8 LLM_MODEL="MiroThinker-1.7-mini" BASE_URL="https://your-api.com/v1" bash scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh

# Or with v1.0
# NUM_RUNS=8 LLM_MODEL="MiroThinker-v1.0-30B" BASE_URL="https://your-api.com/v1" bash scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh

# Customize number of runs and agent configuration (MiroThinker-1.7 with context management)
LLM_MODEL="MiroThinker-1.7-mini" \
BASE_URL="https://your-api.com/v1" \
NUM_RUNS=8 \
AGENT_SET="mirothinker_1.7_keep5_max200" \
bash scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh

```

<details open>
  <summary>📋 Click to expand all benchmark commands</summary>

> **⚠️ Important for MiroThinker-1.7**: To reproduce our reported results, you must set the correct `AGENT_SET`:
>
> - **BrowseComp & BrowseComp-ZH**: Use `AGENT_SET="mirothinker_1.7_keep5_max300"`
> - **All other benchmarks**: Use `AGENT_SET="mirothinker_1.7_keep5_max200"`

```bash
# Navigate to the miroflow-agent directory first
cd apps/miroflow-agent

# HLE
NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_hle.sh

# HLE-Text-2158
NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_hle-text-2158.sh

# HLE-Text-500
NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_hle-text-500.sh

# GAIA-Text-103
NUM_RUNS=8 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh

# GAIA-Validation (GAIA-Val-165)
NUM_RUNS=8 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_gaia-validation.sh

# BrowseComp-EN (⚠️ use max300)
NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max300" bash scripts/run_evaluate_multiple_runs_browsecomp.sh

# BrowseComp-ZH (⚠️ use max300)
NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max300" bash scripts/run_evaluate_multiple_runs_browsecomp_zh.sh

# WebWalkerQA
NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_webwalkerqa.sh

# XBench-DeepSearch
NUM_RUNS=8 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_xbench_deepsearch.sh

# FRAMES
NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_frames.sh

# SEAL-0
NUM_RUNS=8 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_seal-0.sh

# FutureX
NUM_RUNS=8 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_futurex.sh

# AIME2025
NUM_RUNS=32 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_aime2025.sh

# DeepSearchQA
NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_deepsearchqa.sh
```

</details>

#### 3. **Monitor evaluation progress**

<details>
  <summary>📊 Click to expand progress monitoring commands</summary>

```bash
# Navigate to the miroflow-agent directory first
cd apps/miroflow-agent

# For HLE
python benchmarks/check_progress/check_progress_hle.py /path/to/evaluation/logs

# For HLE-Text-2158
python benchmarks/check_progress/check_progress_hle-text-2158.py /path/to/evaluation/logs

# For HLE-Text-500
python benchmarks/check_progress/check_progress_hle-text-500.py /path/to/evaluation/logs

# For BrowseComp-EN
python benchmarks/check_progress/check_progress_browsecomp.py /path/to/evaluation/logs

# For BrowseComp-ZH
python benchmarks/check_progress/check_progress_browsecomp_zh.py /path/to/evaluation/logs

# For GAIA-Validation
python benchmarks/check_progress/check_progress_gaia-validation.py /path/to/evaluation/logs

# For GAIA-Text-103
python benchmarks/check_progress/check_progress_gaia-validation-text-103.py /path/to/evaluation/logs

# For WebWalkerQA
python benchmarks/check_progress/check_progress_webwalkerqa.py /path/to/evaluation/logs

# For Frames
python benchmarks/check_progress/check_progress_frames.py /path/to/evaluation/logs

# For XBench-DeepSearch
python benchmarks/check_progress/check_progress_xbench_deepsearch.py /path/to/evaluation/logs

# For SEAL-0
python benchmarks/check_progress/check_progress_seal-0.py /path/to/evaluation/logs

# For AIME2025
python benchmarks/check_progress/check_progress_aime2025.py /path/to/evaluation/logs

# For DeepSearchQA
python benchmarks/check_progress/check_progress_deepsearchqa.py /path/to/evaluation/logs
```

</details>

## 🔬 Trace Collection

<details>
<summary>📋 Click to expand trace collection commands</summary>

```bash
cd apps/collect-trace

# Collect Traces for SFT
bash scripts/collect_trace_claude37.sh
bash scripts/collect_trace_gpt5.sh

# Collect Traces for DPO
bash scripts/collect_trace_qwen3.sh
```

</details>

## ❓ FAQ & Troubleshooting

### Common Issues

<details>
  <summary>🔧 Click to expand troubleshooting guide</summary>

#### **Q: Which version should I use?**

**A:** We recommend **MiroThinker-1.7** ⭐ with the minimal configuration:

- **v1.7** ⭐: Latest version with 256K context, world-leading performance. Use config (with context management):
  - `mirothinker_1.7_keep5_max200` (up to 200 turns, recommended for most tasks)
  - `mirothinker_1.7_keep5_max300` (up to 300 turns, only used for BrowseComp and BrowseComp-ZH)

#### **Q: How do I get API keys?**

**A:** You need these keys for minimal setup:

- **SERPER_API_KEY**: Get from [Serper.dev](https://serper.dev/) (Google search API)
- **JINA_API_KEY**: Get from [Jina.ai](https://jina.ai/) (Web scraping)
- **E2B_API_KEY**: Get from [E2B.dev](https://e2b.dev/) (Code execution sandbox)
- **SUMMARY_LLM_API_KEY**: Your LLM API credentials (for content summarization). Can be a small model like Qwen3-14B or GPT-5-Nano—the choice has minimal impact on performance.
- **OPENAI_API_KEY**: Get from [OpenAI](https://platform.openai.com/) (Required for benchmark evaluation, used for LLM-as-a-Judge)
- **OPENAI_BASE_URL**: Optional, defaults to `https://api.openai.com/v1`. Can be changed to use OpenAI-compatible APIs.

#### **Q: Agent server connection errors**

**A:** Common issues:

- **Check base URL format**: Should end with `/v1` (e.g., `https://your-api.com/v1`)
- **Verify API key**: Ensure `API_KEY` is set correctly in environment or script
- **Check server status**: Make sure your server is running and accessible
- **Network issues**: Verify firewall/network settings allow connections

#### **Q: Evaluation script fails to run**

**A:** Troubleshooting steps:

1. **Check working directory**: Make sure you're in `apps/miroflow-agent` directory
1. **Verify environment**: Run `uv sync` to ensure dependencies are installed
1. **Check .env file**: Ensure all required environment variables are set
1. **Review logs**: Check `logs/` directory for detailed error messages
1. **Verify data path**: Ensure benchmark data is downloaded and in correct location

#### **Q: Out of memory errors**

**A:** Solutions:

- **Reduce context length**: Set `MAX_CONTEXT_LENGTH` to a smaller value (e.g., 131072 for 128K)
- **Use context management with fewer turns**:
  - For v1.7: Use `mirothinker_1.7_keep5_max200` or `mirothinker_1.7_keep5_max300` (with context management)
- **Reduce concurrent tasks**: Set `MAX_CONCURRENT` to a smaller number (e.g., 5)
- **Use smaller agents**:
  - For v1.5: Try 30B instead of 235B
  - For v1.0: Try 8B or 30B instead of 72B

#### **Q: Tool execution errors**

**A:** Common fixes:

- **E2B errors**: Verify `E2B_API_KEY` is valid and account has credits
- **Serper errors**: Check `SERPER_API_KEY` and rate limits
- **Jina errors**: Verify `JINA_API_KEY` and `JINA_BASE_URL` are correct
- **LLM summarization errors**: Check `SUMMARY_LLM_*` variables and agent availability

#### **Q: How to monitor long-running evaluations?**

**A:** Use the progress monitoring scripts:

```bash
cd apps/miroflow-agent
python benchmarks/check_progress/check_progress_<benchmark_name>.py /path/to/logs
```

The scripts show completion status, elapsed time, and estimated remaining time.

</details>

### Getting Help

- 📖 **Documentation**: Check [MiroFlow Tools README](libs/miroflow-tools/README.md) for tool details
- 💬 **Discord**: Join our [Discord community](https://discord.com/invite/GPqEnkzQZd)
- 🐛 **Issues**: Report bugs on [GitHub Issues](https://github.com/MiroMindAI/MiroThinker/issues)
- 📧 **Contact**: Visit [our website](https://miromind.ai/) for more information

## 📄 License

This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.

## 🙏 Acknowledgments

We extend our sincere gratitude to:

- 🏆 **Benchmark Contributors** for the comprehensive evaluation datasets
- 🌍 **Open Source Community** for the tools and libraries that make this possible
- 👥 **All Contributors** who have helped make MiroThinker better

<div align="center">
  <a href="https://github.com/MiroMindAI/MiroThinker/graphs/contributors">
    <img src="https://contrib.rocks/image?repo=MiroMindAI/MiroThinker" />
  </a>
</div>

Join our community and help us build the future of AI agents!

### References

If you find this project useful in your research, please consider citing:

**MiroThinker** (Model & Method)
```bibtex
@article{miromind2026mirothinker,
  title={MiroThinker-1.7 \& H1: Towards Heavy-Duty Research Agents via Verification},
  author={MiroMind Team and Bai, S. and Bing, L. and Lei, L. and Li, R. and Li, X. and Lin, X. and Min, E. and Su, L. and Wang, B. and Wang, L. and Wang, L. and Wang, S. and Wang, X. and Zhang, Y. and Zhang, Z. and others},
  journal={arXiv preprint arXiv:2603.15726},
  year={2026}
}
```

**MiroFlow** (Framework)
```bibtex
@article{miromind2026miroflow,
  title={MiroFlow: Towards High-Performance and Robust Open-Source Agent Framework for General Deep Research Tasks},
  author={Su, Shiqian and Xing, Sen and Dong, Xuan and Zhong, Muyan and Wang, Bin and Zhu, Xizhou and Chen, Yuntao and Wang, Wenhai and Deng, Yue and Zhu, Pengxiang and others},
  journal={arXiv preprint arXiv:2602.22808},
  year={2026}
}
```

[![Star History Chart](https://api.star-history.com/svg?repos=MiroMindAI/MiroThinker&type=Date)](https://star-history.com/#MiroMindAI/MiroThinker&Date)


================================================
FILE: apps/collect-trace/README.md
================================================
# Collect Trace

> TL;DR: Treat an RLVR-format dataset (Question + verifiable answer) as a benchmark. Run the evaluation pipeline; use LLM-as-a-Judge to verify correctness; then harvest the correct interaction traces as training data (for SFT / DPO).

## 📝 Overview

Collect Trace is a key component in the MiroThinker training pipeline. Instead of hand-curating training samples, it reuses RLVR datasets as test sets, and collects multi-turn interaction traces only from items judged correct.

Workflow:

1. Load each RLVR item’s question and verifiable answer.

1. Run the agent in the evaluation pipeline (with tool use / browsing as needed).

1. Verify the model's answer with an LLM-as-a-Judge against the RLVR reference answer.

1. Only for items judged correct, collect the full multi-turn trace and convert it into SFT / DPO-ready samples.

## 🚀 Quick Start

### Prerequisites

- Python 3.12+
- [uv](https://github.com/astral-sh/uv) package manager
- OpenAI API key (for LLM-based validation)
- RLVR dataset (JSONL; contains question and a verifiable answer)

### Installation

1. **Navigate to the collect-trace directory**:

   ```bash
   cd apps/collect-trace
   ```

1. **Install dependencies**:

   ```bash
   uv sync
   ```

1. **Set up environment variables**:

   ```bash
   # Create .env if missing (safe; won't overwrite existing file)
   [ -f ../miroflow-agent/.env ] || cp ../miroflow-agent/.env.example ../miroflow-agent/.env
   # (Alternative on macOS/Linux) cp -n ../miroflow-agent/.env.example ../miroflow-agent/.env || true

   # Edit .env and fill in your keys
   # Required: OPENAI_API_KEY (for LLM-as-a-Judge)
   # Optional: other keys for specific tools
   ```

### Basic Usage

Run a benchmark evaluation to collect traces:

```bash
# Using Claude-3.7 for trace collection
bash scripts/collect_trace_claude37.sh

# Using GPT-5 for trace collection  
bash scripts/collect_trace_gpt5.sh

# Using Qwen-3 for trace collection  
bash scripts/collect_trace_qwen3.sh
```


================================================
FILE: apps/collect-trace/pyproject.toml
================================================
# Package metadata for the collect-trace app.
[project]
name = "collect-trace"
version = "0.1.0"
description = "Executes a user-defined agent loop for capturing multi-turn interaction traces"
readme = "README.md"
# Minimum supported interpreter version for this app.
requires-python = ">=3.12"
authors = [{ name = "MiroMind Team", email = "service@miromind.ai" }]
dependencies = [
    "miroflow-tools>=0.1.0",
    "dotenv>=0.9.9",
    "openai>=1.90.0",
]

# miroflow-tools is taken from the local path ../../libs/miroflow-tools
# as an editable install.
[tool.uv.sources]
miroflow-tools = { path = "../../libs/miroflow-tools", editable = true }


================================================
FILE: apps/collect-trace/scripts/collect_trace_claude37.sh
================================================
# Collect agent traces with Claude 3.7 Sonnet and then process the resulting
# benchmark log. Run with bash (the script relies on BASH_SOURCE).

# Check if ANTHROPIC_API_KEY is set
if [ -z "$ANTHROPIC_API_KEY" ]; then
    echo "Error: ANTHROPIC_API_KEY is not set."
    exit 1
else
    echo "ANTHROPIC_API_KEY detected."
fi

# Get the directory where the current script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "Current script directory: $SCRIPT_DIR"


# Enter the apps/miroflow-agent directory.
# Quoted so checkout paths with spaces work; abort if the directory is missing
# so the following commands never run from an unexpected location.
TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent"
echo "Target directory: $TARGET_DIR"
cd "$TARGET_DIR" || exit 1

# LOG_DIR is relative to the current directory (apps/miroflow-agent).
mkdir -p ../../logs
LOG_DIR="../../logs/collect_trace_claude37"
echo "Log directory: $LOG_DIR"
mkdir -p "$LOG_DIR"

# Collect traces
uv run python benchmarks/common_benchmark.py \
    benchmark=collect_trace \
    benchmark.data.data_dir="../../data/debug" \
    benchmark.data.metadata_file="standardized_data.jsonl" \
    llm=claude-3-7 \
    llm.provider=anthropic \
    llm.model_name=claude-3-7-sonnet-20250219 \
    llm.api_key="$ANTHROPIC_API_KEY" \
    llm.base_url=https://api.anthropic.com \
    llm.async_client=true \
    benchmark.execution.max_tasks=null \
    benchmark.execution.max_concurrent=10 \
    benchmark.execution.pass_at_k=1 \
    agent=single_agent \
    hydra.run.dir="$LOG_DIR" \
    2>&1 | tee "$LOG_DIR/output.log"

# Enter the apps/collect-trace directory.
# It sits next to apps/miroflow-agent, so the relative LOG_DIR still points
# at the same logs directory after this cd.
TARGET_DIR="$SCRIPT_DIR/../"
echo "Target directory: $TARGET_DIR"
cd "$TARGET_DIR" || exit 1

# Process traces
uv run python "$TARGET_DIR/utils/process_logs.py" "$LOG_DIR/benchmark_results.jsonl"


================================================
FILE: apps/collect-trace/scripts/collect_trace_gpt41.sh
================================================
# Collect agent traces with GPT-4.1-mini and then process the resulting
# benchmark log. Run with bash (the script relies on BASH_SOURCE).

# Check if OPENAI_API_KEY is set
if [ -z "$OPENAI_API_KEY" ]; then
    echo "Error: OPENAI_API_KEY is not set."
    exit 1
else
    echo "OPENAI_API_KEY detected."
fi

# Get the directory where the current script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "Current script directory: $SCRIPT_DIR"


# Enter the apps/miroflow-agent directory.
# Quoted so checkout paths with spaces work; abort if the directory is missing
# so the following commands never run from an unexpected location.
TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent"
echo "Target directory: $TARGET_DIR"
cd "$TARGET_DIR" || exit 1

# LOG_DIR is relative to the current directory (apps/miroflow-agent).
mkdir -p ../../logs
LOG_DIR="../../logs/collect_trace_gpt41"
echo "Log directory: $LOG_DIR"
mkdir -p "$LOG_DIR"

# Collect traces
uv run python benchmarks/common_benchmark.py \
    benchmark=collect_trace \
    benchmark.data.data_dir="../../data/debug" \
    benchmark.data.metadata_file="standardized_data.jsonl" \
    llm=gpt-5 \
    llm.provider=openai \
    llm.model_name=gpt-4.1-mini \
    llm.api_key="$OPENAI_API_KEY" \
    llm.base_url=https://api.openai.com/v1 \
    llm.async_client=true \
    benchmark.execution.max_tasks=null \
    benchmark.execution.max_concurrent=10 \
    benchmark.execution.pass_at_k=1 \
    agent=single_agent \
    hydra.run.dir="$LOG_DIR" \
    2>&1 | tee "$LOG_DIR/output.log"

# Enter the apps/collect-trace directory.
# It sits next to apps/miroflow-agent, so the relative LOG_DIR still points
# at the same logs directory after this cd.
TARGET_DIR="$SCRIPT_DIR/../"
echo "Target directory: $TARGET_DIR"
cd "$TARGET_DIR" || exit 1

# Process traces
uv run python "$TARGET_DIR/utils/process_logs.py" "$LOG_DIR/benchmark_results.jsonl"


================================================
FILE: apps/collect-trace/scripts/collect_trace_gpt5.sh
================================================
# Collect agent traces with GPT-5 and then process the resulting
# benchmark log. Run with bash (the script relies on BASH_SOURCE).

# Check if OPENAI_API_KEY is set
if [ -z "$OPENAI_API_KEY" ]; then
    echo "Error: OPENAI_API_KEY is not set."
    exit 1
else
    echo "OPENAI_API_KEY detected."
fi

# Get the directory where the current script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "Current script directory: $SCRIPT_DIR"


# Enter the apps/miroflow-agent directory.
# Quoted so checkout paths with spaces work; abort if the directory is missing
# so the following commands never run from an unexpected location.
TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent"
echo "Target directory: $TARGET_DIR"
cd "$TARGET_DIR" || exit 1

# LOG_DIR is relative to the current directory (apps/miroflow-agent).
mkdir -p ../../logs
LOG_DIR="../../logs/collect_trace_gpt5"
echo "Log directory: $LOG_DIR"
mkdir -p "$LOG_DIR"

# Collect traces
uv run python benchmarks/common_benchmark.py \
    benchmark=collect_trace \
    benchmark.data.data_dir="../../data/debug" \
    benchmark.data.metadata_file="standardized_data.jsonl" \
    llm=gpt-5 \
    llm.provider=openai \
    llm.model_name=gpt-5-2025-08-07 \
    llm.api_key="$OPENAI_API_KEY" \
    llm.base_url=https://api.openai.com/v1 \
    llm.async_client=true \
    benchmark.execution.max_tasks=null \
    benchmark.execution.max_concurrent=10 \
    benchmark.execution.pass_at_k=1 \
    agent=single_agent \
    hydra.run.dir="$LOG_DIR" \
    2>&1 | tee "$LOG_DIR/output.log"

# Enter the apps/collect-trace directory.
# It sits next to apps/miroflow-agent, so the relative LOG_DIR still points
# at the same logs directory after this cd.
TARGET_DIR="$SCRIPT_DIR/../"
echo "Target directory: $TARGET_DIR"
cd "$TARGET_DIR" || exit 1

# Process traces
uv run python "$TARGET_DIR/utils/process_logs.py" "$LOG_DIR/benchmark_results.jsonl"


================================================
FILE: apps/collect-trace/scripts/collect_trace_qwen3.sh
================================================
# Collect agent traces with a Qwen-3 model served behind an OpenAI-style
# endpoint and then process the resulting benchmark log.
# Run with bash (the script relies on BASH_SOURCE).

# Get the directory where the current script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "Current script directory: $SCRIPT_DIR"


# Enter the apps/miroflow-agent directory.
# Quoted so checkout paths with spaces work; abort if the directory is missing
# so the following commands never run from an unexpected location.
TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent"
echo "Target directory: $TARGET_DIR"
cd "$TARGET_DIR" || exit 1

# LOG_DIR is relative to the current directory (apps/miroflow-agent).
mkdir -p ../../logs
LOG_DIR="../../logs/collect_trace_qwen3"
echo "Log directory: $LOG_DIR"
mkdir -p "$LOG_DIR"

# Collect traces
# NOTE: llm.api_key and llm.base_url below are placeholders to be edited
# for your own deployment.
uv run python benchmarks/common_benchmark.py \
    benchmark=collect_trace \
    benchmark.data.data_dir="../../data/debug" \
    benchmark.data.metadata_file="standardized_data.jsonl" \
    llm=qwen-3 \
    llm.provider=qwen \
    llm.model_name=qwen-3-32b \
    llm.api_key="" \
    llm.base_url=https://your-api.com/v1 \
    llm.async_client=true \
    llm.temperature=1.0 \
    llm.max_context_length=131072 \
    benchmark.execution.max_tasks=null \
    benchmark.execution.max_concurrent=10 \
    benchmark.execution.pass_at_k=1 \
    agent=single_agent \
    hydra.run.dir="$LOG_DIR" \
    2>&1 | tee "$LOG_DIR/output.log"

# Enter the apps/collect-trace directory.
# It sits next to apps/miroflow-agent, so the relative LOG_DIR still points
# at the same logs directory after this cd.
TARGET_DIR="$SCRIPT_DIR/../"
echo "Target directory: $TARGET_DIR"
cd "$TARGET_DIR" || exit 1

# Process traces
uv run python "$TARGET_DIR/utils/process_logs.py" "$LOG_DIR/benchmark_results.jsonl"


================================================
FILE: apps/collect-trace/utils/converters/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

from .convert_non_oai_to_chatml import (
    convert_to_json_chatml,
    extract_and_save_chat_history,
)
from .convert_oai_to_chatml import (
    extract_message_history_from_log,
    oai_tool_message_to_chat_message,
    process_log_file,
    save_chatml_to_files,
)
from .convert_to_chatml_auto_batch import (
    batch_process_files,
    determine_conversion_method,
    get_llm_provider,
    process_single_file,
)

# Names re-exported from the converter submodules imported above; this list
# defines what `from <package> import *` exposes.
__all__ = [
    # OAI conversion functions
    "oai_tool_message_to_chat_message",
    "extract_message_history_from_log",
    "save_chatml_to_files",
    "process_log_file",
    # Non-OAI conversion functions
    "convert_to_json_chatml",
    "extract_and_save_chat_history",
    # Auto batch conversion functions
    "get_llm_provider",
    "determine_conversion_method",
    "process_single_file",
    "batch_process_files",
]


================================================
FILE: apps/collect-trace/utils/converters/convert_non_oai_to_chatml.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import json
import sys
from pathlib import Path
from typing import Any, Dict, List


def convert_to_json_chatml(messages: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """
    Flatten a message history into a list of ``{"role", "content"}`` dicts.

    Messages whose role is ``"tool"`` or ``"system"`` are dropped. The content
    of every remaining message is normalised to a string:

    - ``None`` becomes an empty string;
    - a list keeps only the ``text`` of dict items typed ``"text"``, joined
      with single spaces;
    - a string is kept unchanged;
    - any other value is passed through ``str()``.

    Args:
        messages: Message dictionaries, each optionally carrying ``role`` and
            ``content`` keys (both default to an empty string).

    Returns:
        The converted messages, in their original order.
    """
    skipped_roles = ("tool", "system")
    converted = []
    for msg in messages:
        role = msg.get("role", "")
        if role in skipped_roles:
            continue

        raw = msg.get("content", "")
        if raw is None:
            text = ""
        elif isinstance(raw, list):
            # Only textual parts survive; other part types are discarded.
            text = " ".join(
                part.get("text", "")
                for part in raw
                if isinstance(part, dict) and part.get("type") == "text"
            )
        elif isinstance(raw, str):
            text = raw
        else:
            text = str(raw)

        converted.append({"role": role, "content": text})
    return converted


def extract_and_save_chat_history(
    log_data: Dict[str, Any], output_dir: Path, input_filename: str
):
    """
    Write the agent conversations found in a log record as ChatML JSON files.

    The main agent history is saved to
    ``<input_filename>_main_agent_chatml.json`` and every sub-agent session to
    ``<input_filename>_<session_name>_chatml.json``. Each file starts with a
    system message taken from the history's ``system_prompt`` entry (empty
    string when absent), followed by the converted messages. Histories that
    are missing or empty produce no file.

    Args:
        log_data: Log data dictionary
        output_dir: Output directory (created if it does not exist)
        input_filename: Input filename (without extension)
    """

    def _dump(history: Dict[str, Any], destination: Path) -> None:
        # Convert the turns first, then put the system prompt in front of them.
        records = convert_to_json_chatml(history["message_history"])
        records.insert(
            0, {"role": "system", "content": history.get("system_prompt", "")}
        )
        with open(destination, "w", encoding="utf-8") as out:
            json.dump(records, out, ensure_ascii=False, indent=2)

    output_dir.mkdir(parents=True, exist_ok=True)

    # Main agent conversation.
    main_history = log_data.get("main_agent_message_history", {})
    if (
        main_history
        and "message_history" in main_history
        and main_history["message_history"]
    ):
        main_path = output_dir / f"{input_filename}_main_agent_chatml.json"
        _dump(main_history, main_path)
        print(f"✓ Saved main agent chat records: {main_path}")

    # Sub-agent conversations, one output file per session.
    sessions = log_data.get("sub_agent_message_history_sessions", {})
    if not sessions:
        return
    for session_name, session_data in sessions.items():
        if "message_history" not in session_data:
            continue
        if not session_data["message_history"]:
            continue
        session_path = output_dir / f"{input_filename}_{session_name}_chatml.json"
        _dump(session_data, session_path)
        print(f"✓ Saved sub agent chat records: {session_path}")


def main():
    """
    Command-line entry point.

    Reads the JSON log file named by the first argument and writes the
    extracted chat histories into the directory given by the optional second
    argument (``extracted_chats`` by default). Exits with status 1 on missing
    arguments, a missing input file, or any error while reading or converting.
    """
    argv = sys.argv
    if len(argv) < 2:
        usage_lines = (
            "Usage: python convert_non_oai_to_chatml.py <log_file_path> [output_dir]",
            "Example: python convert_non_oai_to_chatml.py logs/debug_logs/task_1.json",
            "Example: python convert_non_oai_to_chatml.py logs/debug_logs/task_1.json ./extracted_chats",
        )
        for line in usage_lines:
            print(line)
        sys.exit(1)

    log_file_path = Path(argv[1])
    output_dir = Path("extracted_chats") if len(argv) <= 2 else Path(argv[2])

    # Bail out early when there is nothing to read.
    if not log_file_path.exists():
        print(f"Error: Log file does not exist: {log_file_path}")
        sys.exit(1)

    try:
        print(f"Reading log file: {log_file_path}")
        log_data = json.loads(log_file_path.read_text(encoding="utf-8"))

        # The file stem (name without extension) prefixes every output file.
        print(f"Extracting chat history to: {output_dir}")
        extract_and_save_chat_history(log_data, output_dir, log_file_path.stem)

        print("\n✓ Chat history extraction completed!")
        print(f"Output directory: {output_dir.absolute()}")
    except json.JSONDecodeError as e:
        print(f"Error: Cannot parse JSON file: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()


================================================
FILE: apps/collect-trace/utils/converters/convert_oai_to_chatml.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import ast
import json
import os
import sys
from copy import deepcopy
from datetime import datetime
from pathlib import Path
from typing import Any, Dict

from system_prompts import (
    main_system_prompt_foreword,
    sub_agent_system_prompt_foreword,
    system_prompt_tool_instrcutions,
)

# Initialize creation_time_str with current time
creation_time_str = datetime.now().strftime("%Y-%m-%d")


def oai_tool_message_to_chat_message(oai_messages, agent_type, tool_definition):
    """Convert an OpenAI-style message trace into MCP-flavoured chat messages.

    The system/developer message is rebuilt from the project forewords, the
    trace date, the tool-use instructions and the tool definitions.
    Consecutive user/tool messages are merged into one user turn, and
    assistant tool calls are rendered as ``<use_mcp_tool>`` blocks appended to
    the assistant text.

    Args:
        oai_messages: List of OpenAI-style message dicts. The list is deep
            copied, so the caller's data is never modified.
        agent_type: ``"main"`` or ``"sub_agent"``; selects the foreword.
        tool_definition: Iterable of server dicts (``name`` plus optional
            ``tools``), or a falsy value when no definitions are available.

    Returns:
        List of ``{"role": ..., "content": ...}`` dicts.

    Raises:
        ValueError: For any problem found while converting (unknown role,
            misplaced system message, malformed tool call, trace ending on a
            user/tool turn, ...). The original exception is attached as
            ``__cause__``.
    """

    def convert_oai_tool_call_to_mcp_tool_call_str(oai_tool_call):
        """Render one or more OpenAI tool calls as ``<use_mcp_tool>`` blocks."""
        if isinstance(oai_tool_call, str):
            # A JSON-encoded single call is normalised to a one-element list.
            oai_tool_call = [json.loads(oai_tool_call)]
        elif isinstance(oai_tool_call, list) and not oai_tool_call:
            raise ValueError("tool_calls must contain at least one tool call")

        mcp_tool_call_templates = []
        for each_oai_tool_call in oai_tool_call:
            if not isinstance(each_oai_tool_call, dict):
                raise TypeError(
                    f"oai_tool_call should be a dict, but got {type(each_oai_tool_call)}"
                )

            function_spec = each_oai_tool_call["function"]
            # The function name is "<server>-<tool>"; split on the LAST dash
            # only so that everything before it is kept as the server name.
            server_name, tool_name = function_spec["name"].rsplit("-", maxsplit=1)
            arguments = json.loads(function_spec["arguments"])
            mcp_tool_call_templates.append(
                f"<use_mcp_tool>\n<server_name>{server_name}</server_name>\n<tool_name>{tool_name}</tool_name>\n<arguments>\n{json.dumps(arguments)}\n</arguments>\n</use_mcp_tool>"
            )

        return "\n\n".join(mcp_tool_call_templates)

    def safe_get_text(content):
        """Safely extract text content, handling different content formats"""
        if isinstance(content, list) and content:
            # Only the first content part is used.
            first_part = content[0]
            if isinstance(first_part, dict) and "text" in first_part:
                return first_part["text"]
            if isinstance(first_part, str):
                return first_part
            return str(first_part)
        if isinstance(content, str):
            return content
        if content is None:
            return ""
        return str(content)

    def generate_mcp_servers_str(tool_definition):
        """Format the server/tool definitions as a markdown-like listing."""
        lines = []
        for server in tool_definition or []:
            lines.append(f"## Server name: {server['name']}\n")
            for tool in server.get("tools") or []:
                # Skip tools that failed to load (they only have 'error' key)
                if "error" in tool and "name" not in tool:
                    continue
                lines.append(f"### Tool name: {tool['name']}\n")
                lines.append(f"Description: {tool['description']}\n")
                lines.append(f"Input JSON schema: {tool['schema']}\n")
        return "".join(lines)

    # Merge pending_user_tool_contents into a single user message and add to chat_messages
    def flush_pending(pending_user_tool_contents, chat_messages):
        if pending_user_tool_contents:
            chat_messages.append(
                {
                    "role": "user",
                    "content": "\n\n".join(pending_user_tool_contents),
                }
            )
        return []  # Always return a new empty list

    forewords = {
        "main": "main_system_prompt_foreword",
        "sub_agent": "sub_agent_system_prompt_foreword",
    }

    oai_messages = deepcopy(oai_messages)
    chat_messages = []
    pending_user_tool_contents = []

    try:
        for idx, msg in enumerate(oai_messages):
            role = msg["role"]

            if role in ("developer", "system"):
                if idx != 0:
                    raise ValueError("System messages should be the first message")
                if agent_type not in forewords:
                    raise ValueError(f"Unknown agent type: {agent_type}")

                time_str = f" Today is: {creation_time_str}\n"
                tool_definition_str = generate_mcp_servers_str(tool_definition)
                ori_system_prompt = safe_get_text(msg["content"])

                # str.find returns -1 when the marker is absent; slicing from
                # -1 would silently keep only the last character of the
                # prompt, so fall back to the whole prompt in that case.
                marker_pos = ori_system_prompt.find("# General Objective")
                if marker_pos == -1:
                    system_prompt_after_general_objective = ori_system_prompt
                else:
                    system_prompt_after_general_objective = ori_system_prompt[
                        marker_pos:
                    ]

                if agent_type == "main":
                    foreword = main_system_prompt_foreword
                else:
                    foreword = sub_agent_system_prompt_foreword

                chat_messages.append(
                    {
                        "role": "system",
                        "content": foreword
                        + time_str
                        + system_prompt_tool_instrcutions
                        + tool_definition_str
                        + system_prompt_after_general_objective,
                    }
                )

            elif role in ("user", "tool"):
                # Buffered so that consecutive user/tool messages become one turn.
                pending_user_tool_contents.append(safe_get_text(msg["content"]))

            elif role == "assistant":
                # Flush pending user/tool messages
                pending_user_tool_contents = flush_pending(
                    pending_user_tool_contents, chat_messages
                )

                if "tool_calls" in msg:
                    content = safe_get_text(msg.get("content", ""))
                    if content != "":
                        content += "\n\n"  # Concatenate thinking text with tool call
                    content += convert_oai_tool_call_to_mcp_tool_call_str(
                        msg["tool_calls"]
                    )
                else:
                    content = safe_get_text(msg["content"])

                chat_messages.append({"role": "assistant", "content": content})

            else:
                raise ValueError(f"Unknown role: {msg['role']}")

        # Explicit check instead of `assert` so it still runs under `python -O`.
        if pending_user_tool_contents:
            raise ValueError(
                "Error: Trace ends with user/tool round. Pending user/tool contents should be empty."
            )

    except Exception as e:
        # Keep the single ValueError contract but preserve the root cause.
        raise ValueError(f"Error processing messages: {e}") from e

    return chat_messages


def extract_message_history_from_log(
    log_data: Dict[str, Any],
):
    """
    Build ChatML message lists for the main agent and every sub-agent session.

    Args:
        log_data: Parsed log dictionary. The keys read here are
            ``main_agent_message_history`` and
            ``sub_agent_message_history_sessions``; tool definitions are
            looked up through ``extract_step_message``.

    Returns:
        ``{"main_agent": [...], "sub_agents": {session_name: [...]}}``.
        Agents or sessions without a non-empty ``message_history`` are left
        out (the main agent entry stays an empty list).
    """
    converted = {"main_agent": [], "sub_agents": {}}

    # --- main agent ---------------------------------------------------
    main_history = log_data.get("main_agent_message_history", {})
    main_trace = None
    if main_history and "message_history" in main_history:
        main_trace = main_history["message_history"]
    if main_trace:
        main_tools = extract_step_message(log_data, "get_main_tool_definitions")
        converted["main_agent"] = oai_tool_message_to_chat_message(
            main_trace, "main", main_tools
        )

    # --- sub agents ---------------------------------------------------
    sessions = log_data.get("sub_agent_message_history_sessions", {})
    for session_name, session_data in (sessions or {}).items():
        if "message_history" not in session_data:
            continue
        session_trace = session_data["message_history"]
        if not session_trace:
            continue

        # The text before the first underscore of the session name selects
        # which tool-definition step to look up.
        agent_kind = session_name.split("_")[0]
        session_tools = extract_step_message(
            log_data, f"get_sub_{agent_kind}_tool_definitions"
        )
        converted["sub_agents"][session_name] = oai_tool_message_to_chat_message(
            session_trace, "sub_agent", session_tools
        )

    return converted


def save_chatml_to_files(
    chatml_data: Dict[str, Any],
    output_dir: Path,
    input_filename: str,
):
    """
    Write the converted messages to one JSON file per agent/session.

    Args:
        chatml_data: Mapping with a ``main_agent`` message list and a
            ``sub_agents`` dict of ``session_name -> message list``
        output_dir: Directory to write into (created when missing)
        input_filename: Stem of the source log, used as file name prefix
    """

    def _write_json(target, payload):
        # UTF-8 output with non-ASCII characters kept readable.
        with open(target, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)

    output_dir.mkdir(parents=True, exist_ok=True)

    # The main agent file is only written when there is something to save.
    main_messages = chatml_data["main_agent"]
    if main_messages:
        main_path = output_dir / f"{input_filename}_main_agent_chatml.json"
        _write_json(main_path, main_messages)
        print(f"✓ Saved main agent ChatML: {main_path}")

    # Every sub-agent session gets its own file, even an empty one.
    for session_name, session_messages in chatml_data["sub_agents"].items():
        session_path = output_dir / f"{input_filename}_{session_name}_chatml.json"
        _write_json(session_path, session_messages)
        print(f"✓ Saved sub agent {session_name} ChatML: {session_path}")


def extract_step_message(data, target_step_name):
    """Return the parsed ``message`` of the first step named *target_step_name*.

    The message is stored as a Python literal string and is decoded with
    ``ast.literal_eval``. ``None`` is returned (after printing a diagnostic)
    when ``step_logs`` is missing, no step matches, or anything goes wrong
    while searching or decoding.
    """
    try:
        if "step_logs" not in data:
            print("step_logs field not found in log file")
            return None

        # First entry whose step_name matches, or None when there is none.
        matching_step = next(
            (
                entry
                for entry in data["step_logs"]
                if entry.get("step_name") == target_step_name
            ),
            None,
        )
        if matching_step is None:
            print(f"No record found with step_name '{target_step_name}'")
            return None

        return ast.literal_eval(matching_step.get("message"))

    except Exception as e:
        # Best-effort lookup: report the problem and signal "not available".
        print(f"Error processing file: {e}")
        return None


def process_log_file(log_file_path: str, output_dir: str = "extracted_chatml") -> bool:
    """
    Process a single log file, extract message history and convert to ChatML format

    Args:
        log_file_path: Log file path
        output_dir: Output directory

    Returns:
        True when the log was converted and saved, False when the file is
        missing, is not valid JSON, or the conversion failed. Problems are
        printed rather than raised.
    """
    global creation_time_str

    log_path = Path(log_file_path)
    output_path = Path(output_dir)

    if not log_path.exists():
        print(f"Error: Log file does not exist: {log_file_path}")
        return False

    # Derive the date that is embedded in the system prompt from the file.
    # NOTE(review): st_ctime is the creation time on Windows but the time of
    # the last metadata change on Unix, so copied/moved logs may get a later
    # date than the original run -- confirm this is acceptable.
    try:
        stat_info = os.stat(log_path)
        creation_time = datetime.fromtimestamp(stat_info.st_ctime)
        creation_time_str = creation_time.strftime("%Y-%m-%d")
        print(f"File creation time: {creation_time_str}")
    except Exception as e:
        # Keep whatever date is already set and carry on.
        print(f"Warning: Could not get file creation time: {e}")

    try:
        # Read log file
        print(f"Reading log file: {log_path}")
        with open(log_path, "r", encoding="utf-8") as f:
            log_data = json.load(f)

        # Extract input filename (without extension)
        input_filename = log_path.stem

        # Extract message history and convert to ChatML format
        print("Extracting message history...")
        chatml_data = extract_message_history_from_log(log_data)

        # Save to files
        print(f"Saving ChatML files to: {output_path}")
        save_chatml_to_files(chatml_data, output_path, input_filename)

        print("\n✓ Processing completed!")
        print(f"Output directory: {output_path.absolute()}")
        return True

    except json.JSONDecodeError as e:
        print(f"Error: Cannot parse JSON file: {e}")
    except Exception as e:
        print(f"Error: {e}")
    return False


def main():
    """Command-line entry point; exits with status 1 when conversion fails."""
    if len(sys.argv) < 2:
        print("Usage: python convert_oai_to_chatml.py <log_file_path> [output_dir]")
        print("Example: python convert_oai_to_chatml.py logs/debug_logs/task_1.json")
        print(
            "Example: python convert_oai_to_chatml.py logs/debug_logs/task_1.json ./extracted_chatml"
        )
        sys.exit(1)

    log_file_path = sys.argv[1]
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "extracted_chatml"

    # A non-zero exit status lets callers that run this script as a
    # subprocess detect failed conversions instead of counting them as OK.
    if not process_log_file(log_file_path, output_dir):
        sys.exit(1)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()


================================================
FILE: apps/collect-trace/utils/converters/convert_to_chatml_auto_batch.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import json
import subprocess
import sys
from pathlib import Path
from typing import Dict, List


def get_llm_provider(json_file_path: str) -> str:
    """
    Read ``env_info.llm_provider`` from a JSON log file.

    Args:
        json_file_path: Path to JSON file

    Returns:
        The provider string, ``'unknown'`` when the field is missing or
        empty, or ``'error'`` when the file cannot be read or parsed (the
        problem is printed).
    """
    try:
        with open(json_file_path, "r", encoding="utf-8") as handle:
            payload = json.load(handle)

        env_info = payload.get("env_info", {})
        # Missing and empty values both map to the 'unknown' placeholder.
        return env_info.get("llm_provider") or "unknown"
    except Exception as e:
        print(f"Error reading JSON file {json_file_path}: {e}")
        return "error"


def determine_conversion_method(provider: str) -> str:
    """
    Map an LLM provider name to the converter that handles its logs.

    Args:
        provider: LLM provider name (compared case-insensitively)

    Returns:
        'oai' for 'openai', 'claude_newapi' and 'deepseek_newapi';
        'non-oai' for every other provider
    """
    oai_style_providers = ("openai", "claude_newapi", "deepseek_newapi")
    return "oai" if provider.lower() in oai_style_providers else "non-oai"


def get_script_paths() -> tuple:
    """
    Locate the two converter scripts that live next to this file.

    Returns:
        Tuple of (oai_script_path, non_oai_script_path) as strings

    Raises:
        FileNotFoundError: When either script is missing (the OAI script is
            checked first)
    """
    here = Path(__file__).parent
    expected_scripts = (
        ("OAI", "convert_oai_to_chatml.py"),
        ("Non-OAI", "convert_non_oai_to_chatml.py"),
    )

    located = []
    for label, filename in expected_scripts:
        candidate = here / filename
        if not candidate.exists():
            raise FileNotFoundError(
                f"{label} conversion script not found: {candidate}"
            )
        located.append(str(candidate))

    return tuple(located)


def process_single_file(json_file_path: str, output_dir: str) -> bool:
    """
    Convert one JSON log by running the matching converter script.

    The provider recorded in the log decides which converter is used; the
    converter runs in a subprocess with the current Python interpreter.

    Args:
        json_file_path: Path to JSON file
        output_dir: Output directory

    Returns:
        True if the converter exited with status 0, False otherwise
        (including when the provider could not be read or an unexpected
        error occurred; the reason is printed).
    """
    try:
        # Get llm_provider
        provider = get_llm_provider(json_file_path)

        if provider == "error":
            print(f"❌ Failed to read provider from: {json_file_path}")
            return False

        # Determine conversion method
        conversion_method = determine_conversion_method(provider)

        # Get script paths
        oai_script, non_oai_script = get_script_paths()

        # Choose script based on conversion method
        if conversion_method == "oai":
            script_path = oai_script
            print(f"🔧 Using OAI conversion for provider: {provider}")
        else:
            script_path = non_oai_script
            print(f"🔧 Using Non-OAI conversion for provider: {provider}")

        # Run conversion script
        result = subprocess.run(
            [sys.executable, script_path, json_file_path, output_dir],
            capture_output=True,
            text=True,
        )

        if result.returncode == 0:
            print(f"✅ Successfully processed: {json_file_path}")
            return True

        # The converter scripts report their own errors with print(), i.e. on
        # stdout, so fall back to stdout when stderr carries no details.
        failure_details = (result.stderr or result.stdout or "").strip()
        print(f"❌ Failed to process {json_file_path}: {failure_details}")
        return False

    except Exception as e:
        print(f"❌ Error processing {json_file_path}: {e}")
        return False


def find_json_files(input_paths: List[str]) -> List[str]:
    """
    Find JSON files from input paths

    Each input is handled as, in order of preference: an existing file (kept
    when its suffix is ``.json``, case-insensitively), an existing directory
    (its direct ``*.json`` children), or a glob pattern. Relative patterns
    are expanded from the current directory, absolute patterns from their
    filesystem root. Directory and pattern matches are returned in sorted
    order so repeated runs process files in the same sequence.

    Args:
        input_paths: List of file paths, directories, or patterns

    Returns:
        List of JSON file paths
    """
    json_files = []

    for path in input_paths:
        path_obj = Path(path)

        if path_obj.is_file():
            # Single file
            if path_obj.suffix.lower() == ".json":
                json_files.append(str(path_obj))
        elif path_obj.is_dir():
            # Directory - find all JSON files (non-recursive)
            json_files.extend(str(found) for found in sorted(path_obj.glob("*.json")))
        else:
            # Pattern matching
            try:
                if path_obj.is_absolute():
                    # Path.glob() only accepts relative patterns, so anchor
                    # the search at the root and glob the remainder.
                    search_root = Path(path_obj.anchor)
                    pattern = str(path_obj.relative_to(search_root))
                else:
                    search_root = Path(".")
                    pattern = path
                matches = sorted(search_root.glob(pattern))
            except Exception:
                print(f"Warning: Could not process pattern: {path}")
                continue

            json_files.extend(
                str(found) for found in matches if found.suffix.lower() == ".json"
            )

    return json_files


def batch_process_files(input_paths: List[str], output_dir: str) -> Dict[str, int]:
    """
    Convert every JSON file found under the given inputs.

    Args:
        input_paths: List of input paths
        output_dir: Output directory (created when at least one file is found)

    Returns:
        Dictionary with the ``total``, ``success`` and ``failed`` counts
    """
    discovered = find_json_files(input_paths)

    # Nothing to do: report and return an all-zero summary.
    if not discovered:
        print("❌ No JSON files found in the specified paths")
        return {"total": 0, "success": 0, "failed": 0}

    print(f"📁 Found {len(discovered)} JSON files to process")

    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Files are converted one after another, in discovery order.
    outcomes = [process_single_file(json_file, output_dir) for json_file in discovered]
    succeeded = sum(1 for outcome in outcomes if outcome)

    return {
        "total": len(discovered),
        "success": succeeded,
        "failed": len(outcomes) - succeeded,
    }


def show_help():
    """Print the command-line usage text for the auto conversion script."""
    # The "Conversion Logic" section lists every provider that
    # determine_conversion_method() routes to the OAI converter.
    help_text = """
Auto ChatML Conversion Script
============================

Automatically determines conversion method based on llm_provider field in JSON files

Usage:
  python convert_to_chatml_auto_batch.py <input_paths...> [output_dir]
  python convert_to_chatml_auto_batch.py <log_dir> [output_dir]
  python convert_to_chatml_auto_batch.py <log_file_pattern> [output_dir]

Parameters:
  input_paths: JSON files, directories, or patterns
  output_dir: Output directory (optional, default: extracted_chatml)

Examples:
  python convert_to_chatml_auto_batch.py logs/debug_logs/
  python convert_to_chatml_auto_batch.py logs/debug_logs/*.json
  python convert_to_chatml_auto_batch.py logs/debug_logs/ ./my_output
  python convert_to_chatml_auto_batch.py task_1.json task_2.json

Conversion Logic:
  - If llm_provider is 'openai', 'claude_newapi' or 'deepseek_newapi'
    (case-insensitive): Use convert_oai_to_chatml.py
  - If llm_provider = anything else: Use convert_non_oai_to_chatml.py

Features:
  1. Auto-detect conversion method per file
  2. Batch process log files
  3. Extract main_agent_message_history
  4. Extract browser_agent_message_history_sessions
  5. Convert to OpenAI ChatML format
  6. Save as separate files
  7. Generate processing summary
"""
    print(help_text)


def main():
    """Parse the command line, run the batch conversion and print a summary.

    Exits with status 1 when a converter script is missing, when any file
    fails to convert, or on an unexpected error.
    """
    cli_args = sys.argv[1:]

    # No arguments or an explicit help flag: show usage and stop.
    if not cli_args or cli_args[0] in ("-h", "--help"):
        show_help()
        return

    # By default everything is an input; the last argument is taken as the
    # output directory only when there are several arguments and it looks
    # like a directory name rather than an option or a file.
    output_dir = "extracted_chatml"
    input_paths = cli_args
    if len(cli_args) > 1:
        tail = cli_args[-1]
        if not tail.startswith("-") and (
            tail.endswith("/")
            or not Path(tail).suffix
            or tail == "extracted_chatml"
            or tail.startswith("./")
        ):
            output_dir = tail
            input_paths = cli_args[:-1]

    print("🚀 Starting auto ChatML conversion")
    print(f"📂 Input paths: {input_paths}")
    print(f"📁 Output directory: {output_dir}")

    try:
        # Fail early when a converter script is missing.
        get_script_paths()

        stats = batch_process_files(input_paths, output_dir)

        separator = "=" * 50
        print("\n" + separator)
        print("📊 Processing Summary")
        print(separator)
        print(f"Total files: {stats['total']}")
        print(f"Successfully processed: {stats['success']}")
        print(f"Failed: {stats['failed']}")
        print(f"Output directory: {Path(output_dir).absolute()}")

        if stats["failed"] > 0:
            print(f"\n⚠️  {stats['failed']} files failed to process")
            sys.exit(1)

        print("\n✅ All files processed successfully!")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        sys.exit(1)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()


================================================
FILE: apps/collect-trace/utils/converters/example_usage.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import json
import os
import sys
import tempfile
from pathlib import Path

# Add parent directory to Python path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from utils.converters import (
    extract_and_save_chat_history,
    extract_message_history_from_log,
)


def example_1_basic_conversion():
    """Example 1: convert one in-memory sample log with both converters.

    The OAI path is called directly through the Python API; the Non-OAI path
    writes its output into a temporary directory, which is then inspected.
    """
    print("=== Example 1: Basic Conversion ===")

    def text_message(role, text):
        # One message with a single text content part.
        return {"role": role, "content": [{"type": "text", "text": text}]}

    main_system_prompt = "You are a helpful assistant."
    browser_system_prompt = "You are a browsing agent."

    # Sample log data
    log_data = {
        "main_agent_message_history": {
            "system_prompt": main_system_prompt,
            "message_history": [
                text_message("developer", main_system_prompt),
                text_message("user", "Hello, how are you?"),
                text_message("assistant", "I'm doing well, thank you!"),
            ],
        },
        "browser_agent_message_history_sessions": {
            "browser_agent_1": {
                "system_prompt": browser_system_prompt,
                "message_history": [
                    text_message("developer", browser_system_prompt),
                    text_message("user", "Search for something"),
                    text_message("assistant", "I found it."),
                ],
            }
        },
        "env_info": {"llm_provider": "openai"},
    }

    # Convert using OAI method
    # NOTE(review): this reads a 'browser_agents' result key and feeds a
    # 'browser_agent_message_history_sessions' log key; confirm both match
    # what the extract_message_history_from_log exported by utils.converters
    # actually returns and reads.
    chatml_data = extract_message_history_from_log(log_data)
    main_count = len(chatml_data["main_agent"])
    print(f"OAI conversion result: {main_count} messages in main agent")
    browser_count = len(chatml_data["browser_agents"]["browser_agent_1"])
    print(f"OAI conversion result: {browser_count} messages in browser agent")

    # Convert using Non-OAI method
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
        extract_and_save_chat_history(log_data, temp_path, "example")

        # Check generated files (each one is reported only if it exists)
        generated_files = (
            ("main agent", temp_path / "example_main_agent_chatml.json"),
            ("browser agent", temp_path / "example_browser_agent_1_chatml.json"),
        )
        for agent_label, chatml_file in generated_files:
            if not chatml_file.exists():
                continue
            with open(chatml_file, "r") as f:
                saved_messages = json.load(f)
                print(
                    f"Non-OAI conversion result: {len(saved_messages)} messages in {agent_label}"
                )


# When executed as a script: print a banner, run the single example, then
# print a closing banner pointing at the README.
if __name__ == "__main__":
    print("ChatML Conversion Utilities - Usage Examples")
    print("=" * 50)

    example_1_basic_conversion()

    print("\n" + "=" * 50)
    print("Examples completed successfully!")
    print("\nFor more information, see the README.md file.")


================================================
FILE: apps/collect-trace/utils/converters/system_prompts.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

# Opening paragraph of the rebuilt system prompt for the main agent.
main_system_prompt_foreword = """In this environment you have access to a set of tools you can use to answer the user's question. \n    \nYou only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use."""

# Opening paragraph of the rebuilt system prompt for sub agents; currently
# the same text as the main-agent foreword.
sub_agent_system_prompt_foreword = """In this environment you have access to a set of tools you can use to answer the user's question. \n    \nYou only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use."""

# Instructions describing the <use_mcp_tool> XML call format; the tool
# definitions are appended right after this text when the prompt is built.
# NOTE: the identifier is misspelled ("instrcutions") but is imported under
# this exact name by convert_oai_to_chatml.py, so it must not be renamed here
# alone.
system_prompt_tool_instrcutions = """# Tool-Use Formatting Instructions \n\nTool-use is formatted using XML-style tags. The tool-use is enclosed in <use_mcp_tool></use_mcp_tool> and each parameter is similarly enclosed within its own set of tags.\n\nThe Model Context Protocol (MCP) connects to servers that provide additional tools and resources to extend your capabilities. You can use the server's tools via the `use_mcp_tool`.\n\nDescription: \nRequest to use a tool provided by a MCP server. Each MCP server can provide multiple tools with different capabilities. Tools have defined input schemas that specify required and optional parameters.\n\nParameters:\n- server_name: (required) The name of the MCP server providing the tool\n- tool_name: (required) The name of the tool to execute\n- arguments: (required) A JSON object containing the tool's input parameters, following the tool's input schema, quotes within string must be properly escaped, ensure it's valid JSON\n\nUsage:\n<use_mcp_tool>\n<server_name>server name here</server_name>\n<tool_name>tool name here</tool_name>\n<arguments>\n{\n\"param1\": \"value1\",\n\"param2\": \"value2 \\\"escaped string\\\"\"\n}\n</arguments>\n</use_mcp_tool>\n\nImportant Notes:\n- Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags.\n- Always adhere to this format for the tool use to ensure proper parsing and execution.\n\nString and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. The output is not expected to be valid XML and is parsed with regular expressions.\nHere are the functions available in JSONSchema format:\n\n"""


================================================
FILE: apps/collect-trace/utils/merge_chatml_msgs_to_one_json.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import argparse
import glob
import json
import os


def merge_json_files(input_dir, type="main"):
    """Merge all ``*{type}*.json`` files in *input_dir* into one JSON file.

    Each input file becomes one ``{"messages": <file content>}`` entry of the
    output list, which is written to ``{input_dir}/{type}_merged.json``.
    Files are merged in sorted path order. A merged file left over from a
    previous run is never treated as an input, so re-running is safe.

    Args:
        input_dir: Directory that contains the per-conversation JSON files.
        type: Substring a file name must contain to be merged; also the
            prefix of the output file name.
    """
    output_file = os.path.join(input_dir, f"{type}_merged.json")
    output_abspath = os.path.abspath(output_file)

    # The glob also matches "{type}_merged.json" itself, so drop it
    # explicitly; sorting makes the merge order reproducible.
    json_files = sorted(
        candidate
        for candidate in glob.glob(os.path.join(input_dir, f"*{type}*.json"))
        if os.path.abspath(candidate) != output_abspath
    )

    # List to store all conversations
    all_conversations = []

    # Read each JSON file and merge its content
    for json_file in json_files:
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception as e:
            # Unreadable or invalid files are reported and skipped.
            print(f"Error processing {json_file}: {str(e)}")
            continue
        all_conversations.append({"messages": data})
        print(f"Successfully processed: {json_file}")

    # Write the merged data to a new JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_conversations, f, ensure_ascii=False, indent=2)

    print(
        f"\nMerging complete! All {type} JSON files have been merged into {output_file}"
    )
    print(f"Total number of files processed: {len(json_files)}")
    print(f"Total number of messages: {len(all_conversations)}")


def main():
    """Parse ``--input_dir`` and merge the main-agent and browsing-agent files.

    Two merged files are written into the input directory: one for files
    whose name contains ``main_agent`` and one for ``agent-browsing``.
    """
    parser = argparse.ArgumentParser(
        description="Merge multiple JSON files which contain chat messages into a single file"
    )
    parser.add_argument(
        "--input_dir",
        type=str,
        required=True,
        # The value is joined with a file-name pattern by merge_json_files,
        # so it has to be a directory, not a wildcard pattern.
        help="Directory containing the ChatML JSON files to merge (the merged files are written into the same directory)",
    )

    args = parser.parse_args()

    merge_json_files(args.input_dir, type="main_agent")
    merge_json_files(args.input_dir, type="agent-browsing")


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()


================================================
FILE: apps/collect-trace/utils/process_logs.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import argparse
import json
import os
import shutil


def get_successful_log_paths(jsonl_file_path: str) -> list:
    """
    Collects the paths of successful log files from a dataset.

    This function extracts log file paths of successful records based on
    the value of `final_judge_result`. If the dataset has been fully
    processed, it reads from a `benchmark_results.jsonl` file. Otherwise,
    if processing was interrupted, it falls back to scanning individual
    `.json` files in the given directory.

    Success is determined by:
    - `PASS_AT_K_SUCCESS` for records in JSONL files.
    - `CORRECT` for records in individual JSON files.

    Unreadable files, invalid JSON and records that are not JSON objects
    are skipped rather than aborting the scan.

    Args:
        jsonl_file_path (str): Path to a JSONL file or a directory of JSON files.

    Returns:
        list: A list of log file paths for successful records. In directory
        mode the paths are returned in sorted file-name order.
    """
    log_paths = []

    if jsonl_file_path.endswith(".jsonl"):
        with open(jsonl_file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # A valid JSON line may still be a list/number/string.
                if not isinstance(data, dict):
                    continue
                if data.get("final_judge_result") == "PASS_AT_K_SUCCESS":
                    log_path = data.get("log_file_path")
                    if log_path:
                        log_paths.append(log_path)
        return log_paths

    for filename in sorted(os.listdir(jsonl_file_path)):
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(jsonl_file_path, filename)
        try:
            # Context manager closes the handle; explicit encoding avoids
            # depending on the platform locale.
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            continue
        if not isinstance(data, dict) or "final_judge_result" not in data:
            print(f"Skipping {filepath}: no 'final_judge_result' field")
            continue
        if data["final_judge_result"] == "CORRECT":
            log_paths.append(filepath)

    return log_paths


# Usage example
if __name__ == "__main__":
    # Local imports: only the script entry point needs these modules.
    import glob
    import subprocess

    parser = argparse.ArgumentParser(
        description="Extract successful log paths from JSONL file"
    )
    parser.add_argument(
        "file_path",
        help="Path to the JSONL file containing benchmark results, or to a directory of per-task JSON logs",
    )
    args = parser.parse_args()

    result = get_successful_log_paths(args.file_path)

    # Get the parent directory of args.file_path
    parent_dir = os.path.abspath(os.path.dirname(args.file_path))

    # Create successful logs directory
    success_log_dir = os.path.join(parent_dir, "successful_logs")
    success_chatml_log_dir = os.path.join(parent_dir, "successful_chatml_logs")
    os.makedirs(success_log_dir, exist_ok=True)
    print(f"Successful logs directory: {success_log_dir}")

    for path in result:
        destination = os.path.join(success_log_dir, os.path.basename(path))
        print(f"Copying file: {path} to {destination}")
        shutil.copy(path, destination)

    # Expand the *.json pattern here instead of relying on a shell, and pass
    # arguments as a list so paths containing spaces or shell metacharacters
    # are forwarded intact.
    log_files = sorted(
        glob.glob(os.path.join(glob.escape(success_log_dir), "*.json"))
    )
    if not log_files:
        print("No successful logs found; skipping ChatML conversion.")
    else:
        # check=False keeps the previous best-effort behavior: a failing step
        # does not abort the script.
        subprocess.run(
            [
                "uv",
                "run",
                "utils/converters/convert_to_chatml_auto_batch.py",
                *log_files,
                "-o",
                success_chatml_log_dir,
            ],
            check=False,
        )
        subprocess.run(
            [
                "uv",
                "run",
                "utils/merge_chatml_msgs_to_one_json.py",
                "--input_dir",
                success_chatml_log_dir,
            ],
            check=False,
        )


================================================
FILE: apps/gradio-demo/README.md
================================================
# Local Deep Research Demo with Gradio Web UI

Host your own Deep Research demo using our [MiroThinker v1.5](https://huggingface.co/miromind-ai/MiroThinker-v1.5-30B) models and a lightweight Gradio-based web interface.

## 🖥️ Hardware Requirements

- **GPU**: NVIDIA RTX 40xx/50xx series or equivalent
- **VRAM**:
  - **16GB minimum** (with Q4 quantization via llama.cpp)
  - **48GB+ recommended** (for FP8 quantization or longer context)
  - MiroThinker-v1.5-30B is a 30B MoE model with 3B active parameters

## ⚙️ LLM Server Deployment

### Download Model Checkpoints

Download the full checkpoint from Hugging Face:

```python
from huggingface_hub import snapshot_download
snapshot_download(repo_id="miromind-ai/MiroThinker-v1.5-30B", local_dir="model/MiroThinker-v1.5-30B")
```

### Option 1: SGLang Server (Recommended)

FP8 is a highly efficient 8-bit floating point format that significantly reduces memory usage while maintaining model quality. This approach provides excellent performance for inference workloads on modern GPUs.

Please install [SGLang](https://github.com/sgl-project/sglang) first. Then initialize fast inference with FP8 precision:

```bash
MODEL_PATH=model/MiroThinker-v1.5-30B

python3 -m sglang.launch_server \
    --model-path $MODEL_PATH \
    --mem-fraction-static 0.9 \
    --quantization fp8 \
    --tp 1 \
    --dp 1 \
    --host 0.0.0.0 \
    --port 61005 \
    --trust-remote-code
```

This will start an OpenAI-compatible server with BASE_URL=`http://0.0.0.0:61005/v1`.

### Option 2: llama.cpp (Quantized)

For memory-efficient inference, download the pre-quantized GGUF version from the community:

**Note**: Thanks to the community for providing quantized versions: [mradermacher](https://huggingface.co/mradermacher)

```bash
# Download Q4_K_M quantized model (recommended balance)
wget https://huggingface.co/mradermacher/MiroThinker-v1.5-30B-GGUF/resolve/main/MiroThinker-v1.5-30B.Q4_K_M.gguf
```

Follow the [official llama.cpp installation guide](https://github.com/ggml-org/llama.cpp) to set up the environment. After that:

```bash
# Set up model path
MODEL_PATH=model/MiroThinker-v1.5-30B.Q4_K_M.gguf

# Start the server
llama-server -m $MODEL_PATH \
    --port 61005 \
    -ngl 99 \
    -v
```

This will start an OpenAI-compatible server at `http://0.0.0.0:61005/v1`.

### Other Options

You can also leverage other frameworks for model serving like Ollama, vLLM, and Text Generation Inference (TGI) for different deployment scenarios.

## 🚀 Quick Start Guide

### 1. **Environment Setup**

Get your API keys:

- [Serper](https://serper.dev/): 2,500 free search credits for new accounts (required for web search)
- [E2B](https://e2b.dev/): Free tier available (required for Python code execution)
- [Jina](https://jina.ai/): Free tier available (required for web scraping)

Edit the `apps/miroflow-agent/.env` file with your API keys:

```bash
# Required - Web Search
SERPER_API_KEY=your_serper_key

# Required - Python Code Execution (E2B Cloud Sandbox)
E2B_API_KEY=your_e2b_key

# Required - Web Scraping
JINA_API_KEY=your_jina_key

# Required - Summary LLM (for webpage summarization)
# Option 1: Use OpenAI GPT-5-Nano (recommended, cost-effective)
SUMMARY_LLM_BASE_URL=https://api.openai.com/v1
SUMMARY_LLM_MODEL_NAME=gpt-5-nano
SUMMARY_LLM_API_KEY=your_openai_key

# Option 2: Use MiroThinker itself (if you have enough VRAM)
# SUMMARY_LLM_BASE_URL=http://0.0.0.0:61005/v1
# SUMMARY_LLM_MODEL_NAME=MiroThinker
# SUMMARY_LLM_API_KEY=none
```

### 2. **Install Dependencies**

We use [uv](https://github.com/astral-sh/uv) to manage all dependencies.

```bash
cd apps/gradio-demo
uv sync
```

### 3. **Configure API Endpoint**

Set your LLM API endpoint and API key:

```bash
export BASE_URL=http://your-sglang-address:your-sglang-port/v1
export API_KEY=your_api_key  # Optional, required if your endpoint needs authentication
```

### 4. **Launch the Application**

```bash
uv run main.py
```

### 5. **Access the Web Interface**

Open your browser and navigate to: `http://localhost:8080`

### 📝 Notes

- Ensure your LLM server is up and running before launching the demo
- The demo will use your local CPU/GPU for inference while leveraging external APIs for search and code execution
- Monitor your API usage through the respective provider dashboards


================================================
FILE: apps/gradio-demo/main.py
================================================
import asyncio
import json
import logging
import os
import threading
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import AsyncGenerator, List, Optional

import gradio as gr
from dotenv import load_dotenv
from hydra import compose, initialize_config_dir
from omegaconf import DictConfig
from prompt_patch import apply_prompt_patch
from src.config.settings import expose_sub_agents_as_tools
from src.core.pipeline import create_pipeline_components, execute_task_pipeline
from utils import replace_chinese_punctuation

# Apply custom system prompt patch (adds MiroThinker identity)
# NOTE: this runs at import time, before any configuration is loaded.
apply_prompt_patch()

# Create global cleanup thread pool for operations that won't be affected by asyncio.cancel
cleanup_executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="cleanup")

# Module-level logger used by every function in this file
logger = logging.getLogger(__name__)

# Set DEMO_MODE for simplified tool configuration
os.environ["DEMO_MODE"] = "1"

# Load environment variables from .env file
load_dotenv()

# Global Hydra initialization flag
# (set to True by load_miroflow_config once initialize_config_dir succeeds)
_hydra_initialized = False


def load_miroflow_config(config_overrides: Optional[dict] = None) -> DictConfig:
    """
    Load the full MiroFlow configuration using Hydra, similar to how benchmarks work.

    The configuration is composed from the ``conf`` directory of the sibling
    ``miroflow-agent`` app. LLM settings are taken from the environment
    variables ``DEFAULT_LLM_PROVIDER``, ``DEFAULT_MODEL_NAME``,
    ``DEFAULT_AGENT_SET``, ``BASE_URL`` and ``API_KEY``.

    Args:
        config_overrides: Optional extra overrides. A plain value becomes
            ``key=value``; a nested dict becomes one ``key.subkey=value``
            override per entry (one nesting level only).

    Returns:
        The composed Hydra configuration.

    Raises:
        FileNotFoundError: If the MiroFlow config directory does not exist.
        RuntimeError: If Hydra fails to compose the configuration.
    """
    global _hydra_initialized

    # Get the path to the miroflow agent config directory
    miroflow_config_dir = Path(__file__).parent.parent / "miroflow-agent" / "conf"
    miroflow_config_dir = miroflow_config_dir.resolve()
    logger.debug(f"Config dir: {miroflow_config_dir}")

    if not miroflow_config_dir.exists():
        raise FileNotFoundError(
            f"MiroFlow config directory not found: {miroflow_config_dir}"
        )

    # Initialize Hydra if not already done
    if not _hydra_initialized:
        try:
            initialize_config_dir(
                config_dir=str(miroflow_config_dir), version_base=None
            )
            _hydra_initialized = True
        except Exception as e:
            # Best effort: compose() below will fail loudly if Hydra is
            # genuinely unusable.
            logger.warning(f"Hydra already initialized or error: {e}")

    # Compose configuration with environment variable overrides
    overrides = []

    # Add environment variable based overrides (refer to scripts/debug.sh)
    llm_provider = os.getenv(
        "DEFAULT_LLM_PROVIDER", "qwen"
    )  # debug.sh defaults to qwen
    model_name = os.getenv(
        "DEFAULT_MODEL_NAME", "MiroThinker"
    )  # debug.sh default model
    agent_set = os.getenv("DEFAULT_AGENT_SET", "demo")  # Use demo config
    base_url = os.getenv("BASE_URL", "http://localhost:11434")
    api_key = os.getenv("API_KEY", "")  # API key for LLM endpoint
    logger.debug(f"LLM base_url: {base_url}")

    # Map provider names to config files
    # Available configs: default.yaml, claude-3-7.yaml, gpt-5.yaml, qwen-3.yaml
    provider_config_map = {
        "anthropic": "claude-3-7",
        "openai": "gpt-5",
        "qwen": "qwen-3",
    }

    llm_config = provider_config_map.get(
        llm_provider, "qwen-3"
    )  # fallback to qwen-3 config
    overrides.extend(
        [
            f"llm={llm_config}",
            f"llm.provider={llm_provider}",
            f"llm.model_name={model_name}",
            f"llm.base_url={base_url}",
            f"llm.api_key={api_key}",
            f"agent={agent_set}",
            "agent.main_agent.max_turns=50",  # Limit max turns for gradio demo
            "benchmark=gaia-validation",  # refer to debug.sh
        ]
    )

    # Add config overrides from request
    if config_overrides:
        for key, value in config_overrides.items():
            if isinstance(value, dict):
                for subkey, subvalue in value.items():
                    overrides.append(f"{key}.{subkey}={subvalue}")
            else:
                overrides.append(f"{key}={value}")

    try:
        return compose(config_name="config", overrides=overrides)
    except Exception as e:
        logger.error(f"Failed to compose Hydra config: {e}")
        # Raise a regular exception instead of calling exit(): SystemExit
        # reports status 0 and is not caught by ``except Exception`` handlers,
        # so the failure would otherwise go unreported to callers.
        raise RuntimeError(f"Failed to compose Hydra config: {e}") from e


# Lazy loading for tool definitions to speed up page load
# Tools will be loaded on first request instead of blocking startup
# Every slot starts empty; _ensure_preloaded() fills them and flips "loaded".
_preload_cache = {
    "cfg": None,
    "main_agent_tool_manager": None,
    "sub_agent_tool_managers": None,
    "output_formatter": None,
    "tool_definitions": None,
    "sub_agent_tool_definitions": None,
    "loaded": False,
}
# Serializes the one-time population of _preload_cache across threads
_preload_lock = threading.Lock()


def _ensure_preloaded():
    """Populate ``_preload_cache`` once, on the first request that needs it.

    The "loaded" flag is checked before and after taking ``_preload_lock`` so
    that only one caller performs the load. The flag is written last, after
    every other cache slot has been filled.
    """
    if _preload_cache["loaded"]:
        return

    with _preload_lock:
        # Another thread may have finished loading while we waited.
        if _preload_cache["loaded"]:
            return

        logger.info("Loading pipeline components (first request)...")
        cfg = load_miroflow_config(None)
        components = create_pipeline_components(cfg)
        main_agent_tool_manager, sub_agent_tool_managers, output_formatter = components

        tool_definitions = asyncio.run(
            main_agent_tool_manager.get_all_tool_definitions()
        )
        if cfg.agent.sub_agents:
            tool_definitions += expose_sub_agents_as_tools(cfg.agent.sub_agents)

        sub_agent_tool_definitions = {}
        for name, sub_agent_tool_manager in sub_agent_tool_managers.items():
            sub_agent_tool_definitions[name] = asyncio.run(
                sub_agent_tool_manager.get_all_tool_definitions()
            )

        loaded_values = (
            ("cfg", cfg),
            ("main_agent_tool_manager", main_agent_tool_manager),
            ("sub_agent_tool_managers", sub_agent_tool_managers),
            ("output_formatter", output_formatter),
            ("tool_definitions", tool_definitions),
            ("sub_agent_tool_definitions", sub_agent_tool_definitions),
        )
        for slot, value in loaded_values:
            _preload_cache[slot] = value
        _preload_cache["loaded"] = True
        logger.info("Pipeline components loaded successfully.")


class ThreadSafeAsyncQueue:
    """An ``asyncio.Queue`` that can also be fed from other threads.

    Producers on the owning event loop use ``put``; producers on other
    threads use ``put_nowait_threadsafe``, which hands the insertion to the
    loop registered via ``set_loop``. After ``close`` every put is ignored.
    """

    def __init__(self):
        self._queue = asyncio.Queue()
        self._loop = None
        self._closed = False

    def set_loop(self, loop):
        """Register the event loop that owns the underlying queue."""
        self._loop = loop

    async def put(self, item):
        """Enqueue ``item`` from a coroutine; a no-op once closed."""
        if not self._closed:
            await self._queue.put(item)

    def put_nowait_threadsafe(self, item):
        """Enqueue ``item`` from another thread; a no-op if closed or no loop is set."""
        if self._closed or not self._loop:
            return
        # Schedule the non-blocking put on the owning loop rather than
        # creating a task, which keeps latency low.
        self._loop.call_soon_threadsafe(self._queue.put_nowait, item)

    async def get(self):
        """Wait for and return the next queued item."""
        item = await self._queue.get()
        return item

    def close(self):
        """Mark the queue closed so later puts are dropped."""
        self._closed = True


def filter_google_search_organic(organic: List[dict]) -> List[dict]:
    """
    Reduce each organic search result to its "title" and "link" fields.

    Missing fields default to an empty string.
    """
    return [
        {"title": entry.get("title", ""), "link": entry.get("link", "")}
        for entry in organic
    ]


def is_scrape_error(result: str) -> bool:
    """
    Report whether a scrape result is an error.

    A result counts as an error when it cannot be parsed as JSON.
    """
    try:
        json.loads(result)
    except json.JSONDecodeError:
        return True
    else:
        return False


def filter_message(message: dict) -> dict:
    """
    Filter message to remove unnecessary information

    Only ``tool_call`` events whose ``tool_input`` is a dict containing a
    ``"result"`` entry are touched; the message is modified in place and
    returned.

    - ``google_search``: the JSON result is reduced to the title/link of each
      organic entry. A result that is not a JSON object (for example a
      plain-text error from the search tool) is left unchanged.
    - ``scrape`` / ``scrape_website``: ``tool_input`` is replaced by
      ``{"error": <result>}`` when the result is not JSON, otherwise by ``{}``.

    Args:
        message: Event dict with at least an ``"event"`` key, plus a ``"data"``
            dict for ``tool_call`` events.

    Returns:
        The same ``message`` object.
    """
    if message["event"] != "tool_call":
        return message

    data = message["data"]
    tool_name = data.get("tool_name")
    tool_input = data.get("tool_input")
    if not (isinstance(tool_input, dict) and "result" in tool_input):
        return message

    raw_result = tool_input["result"]

    if tool_name == "google_search":
        try:
            result_dict = json.loads(raw_result)
        except (TypeError, ValueError):
            # Not JSON (error text, None, ...): forward it untouched instead
            # of raising inside the streaming path.
            return message
        if isinstance(result_dict, dict) and "organic" in result_dict:
            new_result = {
                "organic": filter_google_search_organic(result_dict["organic"])
            }
            tool_input["result"] = json.dumps(new_result, ensure_ascii=False)
    elif tool_name in ("scrape", "scrape_website"):
        # if error, it can not be json
        if is_scrape_error(raw_result):
            data["tool_input"] = {"error": raw_result}
        else:
            data["tool_input"] = {}
    return message


async def stream_events_optimized(
    task_id: str, query: str, _: Optional[dict] = None, disconnect_check=None
) -> AsyncGenerator[dict, None]:
    """Optimized event stream generator that directly outputs structured events, no longer wrapped as SSE strings.

    The task pipeline runs in a dedicated worker thread with its own event
    loop. Events it emits pass through ``filter_message`` and are handed to
    this generator through a ``ThreadSafeAsyncQueue``; a ``None`` item marks
    the end of the stream.

    Args:
        task_id: Identifier used as the workflow id in emitted events.
        query: Task description passed to the pipeline.
        _: Unused.
        disconnect_check: Optional awaitable callable; when it returns a
            truthy value the pipeline is cancelled and the stream stops.

    Yields:
        Event dicts produced by the pipeline, plus ``heartbeat`` events while
        the stream is idle and an ``error`` event on failure.
    """
    workflow_id = task_id
    last_send_time = time.time()
    last_heartbeat_time = time.time()

    # Create thread-safe queue
    stream_queue = ThreadSafeAsyncQueue()
    stream_queue.set_loop(asyncio.get_event_loop())

    # Set by the consumer side to ask the worker thread to stop
    cancel_event = threading.Event()

    def run_pipeline_in_thread():
        # Executed in the worker thread: owns a private event loop for the pipeline.
        try:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

            class ThreadQueueWrapper:
                # Queue-like object given to the pipeline; forwards filtered
                # items to the consumer's queue on the other event loop.
                def __init__(self, thread_queue, cancel_event):
                    self.thread_queue = thread_queue
                    self.cancel_event = cancel_event

                async def put(self, item):
                    if self.cancel_event.is_set():
                        logger.info("Pipeline cancelled, stopping execution")
                        return
                    self.thread_queue.put_nowait_threadsafe(filter_message(item))

            wrapper_queue = ThreadQueueWrapper(stream_queue, cancel_event)

            # Ensure pipeline components are loaded (lazy loading)
            _ensure_preloaded()

            async def pipeline_with_cancellation():
                pipeline_task = asyncio.create_task(
                    execute_task_pipeline(
                        cfg=_preload_cache["cfg"],
                        task_id=workflow_id,
                        task_description=query,
                        task_file_name=None,
                        main_agent_tool_manager=_preload_cache[
                            "main_agent_tool_manager"
                        ],
                        sub_agent_tool_managers=_preload_cache[
                            "sub_agent_tool_managers"
                        ],
                        output_formatter=_preload_cache["output_formatter"],
                        stream_queue=wrapper_queue,
                        log_dir=os.getenv("LOG_DIR", "logs/api-server"),
                        tool_definitions=_preload_cache["tool_definitions"],
                        sub_agent_tool_definitions=_preload_cache[
                            "sub_agent_tool_definitions"
                        ],
                    )
                )

                async def check_cancellation():
                    # Poll the threading.Event every 0.5 s and cancel the
                    # pipeline task once it is set.
                    while not cancel_event.is_set():
                        await asyncio.sleep(0.5)
                    logger.info("Cancel event detected, cancelling pipeline")
                    pipeline_task.cancel()

                cancel_task = asyncio.create_task(check_cancellation())

                try:
                    # Whichever finishes first (pipeline or watcher) ends the wait;
                    # the other task is then cancelled.
                    done, pending = await asyncio.wait(
                        [pipeline_task, cancel_task],
                        return_when=asyncio.FIRST_COMPLETED,
                    )
                    for task in pending:
                        task.cancel()
                    for task in done:
                        if task == pipeline_task:
                            try:
                                # Re-await to surface any exception raised by the pipeline
                                await task
                            except asyncio.CancelledError:
                                logger.info("Pipeline task was cancelled")
                except Exception as e:
                    logger.error(f"Pipeline execution error: {e}")
                    pipeline_task.cancel()
                    cancel_task.cancel()

            loop.run_until_complete(pipeline_with_cancellation())
        except Exception as e:
            # Errors are reported to the client only if the run was not cancelled
            if not cancel_event.is_set():
                logger.error(f"Pipeline error: {e}", exc_info=True)
                stream_queue.put_nowait_threadsafe(
                    {
                        "event": "error",
                        "data": {"error": str(e), "workflow_id": workflow_id},
                    }
                )
        finally:
            # None is the end-of-stream sentinel for the consumer loop below
            stream_queue.put_nowait_threadsafe(None)
            # "loop" is unbound if new_event_loop() itself failed
            if "loop" in locals():
                loop.close()

    executor = ThreadPoolExecutor(max_workers=1)
    future = executor.submit(run_pipeline_in_thread)

    try:
        while True:
            try:
                if disconnect_check and await disconnect_check():
                    logger.info("Client disconnected, stopping pipeline")
                    cancel_event.set()
                    break
                # Short timeout so disconnects, heartbeats and worker completion
                # are re-checked about every 0.1 s while idle
                message = await asyncio.wait_for(stream_queue.get(), timeout=0.1)
                if message is None:
                    logger.info("Pipeline completed")
                    break
                yield message
                last_send_time = time.time()
            except asyncio.TimeoutError:
                current_time = time.time()
                # Give up after 300 s without any pipeline message
                if current_time - last_send_time > 300:
                    logger.info("Stream timeout")
                    break
                if future.done():
                    # Worker finished: drain one pending item if present; an
                    # empty queue (get_nowait raising) ends the stream.
                    try:
                        message = stream_queue._queue.get_nowait()
                        if message is not None:
                            yield message
                            continue
                    except Exception:
                        break
                # Emit a heartbeat at most once every 15 s while idle
                if current_time - last_heartbeat_time >= 15:
                    yield {
                        "event": "heartbeat",
                        "data": {"timestamp": current_time, "workflow_id": workflow_id},
                    }
                    last_heartbeat_time = current_time
    except Exception as e:
        logger.error(f"Stream error: {e}", exc_info=True)
        yield {
            "event": "error",
            "data": {"workflow_id": workflow_id, "error": f"Stream error: {str(e)}"},
        }
    finally:
        # Always signal the worker to stop and drop any further queue puts
        cancel_event.set()
        stream_queue.close()
        try:
            # Wait at most 1 s for the worker; any failure or timeout is ignored
            future.result(timeout=1.0)
        except Exception:
            pass
        executor.shutdown(wait=False)


# ========================= Gradio Integration =========================


def _init_render_state():
    return {
        "agent_order": [],
        "agents": {},  # agent_id -> {"agent_name": str, "tool_call_order": [], "tools": {tool_call_id: {...}}}
        "current_agent_id": None,
        "errors": [],
    }


def _format_think_content(text: str) -> str:
    """Convert <think> tags to readable markdown format."""
    import re

    # Replace <think> tags with blockquote format (no label)
    text = re.sub(r"<think>\s*", "\n> ", text)
    text = re.sub(r"\s*</think>", "\n", text)
    # Convert newlines within thinking to blockquote continuation
    lines = text.split("\n")
    result = []
    in_thinking = False
    for line in lines:
        if line.strip().startswith(">") and not in_thinking:
            in_thinking = True
            result.append(line)
        elif in_thinking and line.strip() and not line.startswith(">"):
            result.append(f"> {line}")
        else:
            if line.strip() == "" and in_thinking:
                in_thinking = False
            result.append(line)
    return "\n".join(result)


def _append_show_text(tool_entry: dict, delta: str):
    existing = tool_entry.get("content", "")
    # Skip "Final boxed answer" content (already shown in main response)
    if "Final boxed answer" in delta:
        return
    # Format think tags for display
    formatted_delta = _format_think_content(delta)
    tool_entry["content"] = existing + formatted_delta


def _is_empty_payload(value) -> bool:
    if value is None:
        return True
    if isinstance(value, str):
        stripped = value.strip()
        return stripped == "" or stripped in ("{}", "[]")
    if isinstance(value, (dict, list, tuple, set)):
        return len(value) == 0
    return False


def _format_search_results(tool_input: dict, tool_output: dict) -> str:
    """Format google_search results in a beautiful card layout."""
    lines = []

    # Get search query from input
    query = ""
    if isinstance(tool_input, dict):
        query = tool_input.get("q", "") or tool_input.get("query", "")

    # Parse results from output - handle multiple formats
    results = []
    if isinstance(tool_output, dict):
        # Case 1: output has "result" field containing JSON string
        result_str = tool_output.get("result", "")
        if isinstance(result_str, str) and result_str.strip():
            try:
                result_data = json.loads(result_str)
                if isinstance(result_data, dict):
                    results = result_data.get("organic", [])
            except json.JSONDecodeError:
                pass
        elif isinstance(result_str, dict):
            results = result_str.get("organic", [])

        # Case 2: output directly contains "organic" field
        if not results and "organic" in tool_output:
            results = tool_output.get("organic", [])

    if not results and not query:
        return ""

    # Build the card
    lines.append('<div class="search-card">')

    # Header with query
    if query:
        lines.append('<div class="search-header">')
        lines.append('<span class="search-icon">🔍</span>')
        lines.append(f'<span class="search-query">Search: "{query}"</span>')
        lines.append("</div>")

    # Results count
    if results:
        lines.append(f'<div class="search-count">≡ Found {len(results)} results</div>')

        # Results list
        lines.append('<div class="search-results">')
        for item in results[:10]:  # Limit to 10 results
            title = item.get("title", "Untitled")
            link = item.get("link", "#")

            lines.append(f"""<a href="{link}" target="_blank" class="search-result-item">
                <span class="result-icon">🌐</span>
                <span class="result-title">{title}</span>
            </a>""")
        lines.append("</div>")

    lines.append("</div>")

    return "\n".join(lines)


def _format_sogou_search_results(tool_input: dict, tool_output: dict) -> str:
    """Format sogou_search results in a beautiful card layout."""
    lines = []

    # Get search query from input
    query = ""
    if isinstance(tool_input, dict):
        query = tool_input.get("q", "") or tool_input.get("query", "")

    # Parse results from output - sogou uses "Pages" instead of "organic"
    results = []
    if isinstance(tool_output, dict):
        result_str = tool_output.get("result", "")
        if isinstance(result_str, str) and result_str.strip():
            try:
                result_data = json.loads(result_str)
                if isinstance(result_data, dict):
                    results = result_data.get("Pages", [])
            except json.JSONDecodeError:
                pass
        elif isinstance(result_str, dict):
            results = result_str.get("Pages", [])

        if not results and "Pages" in tool_output:
            results = tool_output.get("Pages", [])

    if not results and not query:
        return ""

    # Build the card
    lines.append('<div class="search-card">')

    # Header with query
    if query:
        lines.append('<div class="search-header">')
        lines.append('<span class="search-icon">🔍</span>')
        lines.append(f'<span class="search-query">Search: "{query}"</span>')
        lines.append("</div>")

    # Results count
    if results:
        lines.append(f'<div class="search-count">≡ Found {len(results)} results</div>')

        # Results list
        lines.append('<div class="search-results">')
        for item in results[:10]:  # Limit to 10 results
            title = item.get("title", "Untitled")
            link = item.get("url", item.get("link", "#"))

            lines.append(f"""<a href="{link}" target="_blank" class="search-result-item">
                <span class="result-icon">🌐</span>
                <span class="result-title">{title}</span>
            </a>""")
        lines.append("</div>")

    lines.append("</div>")

    return "\n".join(lines)


def _format_scrape_results(tool_input: dict, tool_output: dict) -> str:
    """Format scrape/webpage results in a card layout."""
    lines = []

    # Get URL
    url = ""
    if isinstance(tool_input, dict):
        url = tool_input.get("url", tool_input.get("link", ""))

    # Check for error
    if isinstance(tool_output, dict) and "error" in tool_output:
        lines.append('<div class="scrape-card scrape-error">')
        lines.append('<div class="scrape-header">')
        lines.append('<span class="scrape-icon">🌐</span>')
        lines.append(
            f'<span class="scrape-url">{url[:60]}{"..." if len(url) > 60 else ""}</span>'
        )
        lines.append("</div>")
        lines.append('<div class="scrape-status error">❌ Failed</div>')
        lines.append("</div>")
        return "\n".join(lines)

    # Success case
    lines.append('<div class="scrape-card">')
    if url:
        lines.append('<div class="scrape-header">')
        lines.append('<span class="scrape-icon">🌐</span>')
        lines.append(
            f'<span class="scrape-url">{url[:60]}{"..." if len(url) > 60 else ""}</span>'
        )
        lines.append("</div>")
        lines.append('<div class="scrape-status success">✓ Done</div>')
    lines.append("</div>")

    return "\n".join(lines)


def _render_markdown(state: dict) -> str:
    lines = []
    final_summary_lines = []  # Collect final summary content separately

    # Render errors first if any
    if state.get("errors"):
        for err in state["errors"]:
            lines.append(f'<div class="error-block">❌ {err}</div>')

    # Render all agents' content
    for agent_id in state.get("agent_order", []):
        agent = state["agents"].get(agent_id, {})
        agent_name = agent.get("agent_name", "")
        is_final_summary = agent_name == "Final Summary"

        for call_id in agent.get("tool_call_order", []):
            call = agent["tools"].get(call_id, {})
            tool_name = call.get("tool_name", "unknown_tool")

            # Show text / message - display directly
            if tool_name in ("show_text", "message"):
                content = call.get("content", "")
                if content:
                    if is_final_summary:
                        final_summary_lines.append(content)
                    else:
                        lines.append(content)
                continue

            tool_input = call.get("input", {})
            tool_output = call.get("output", {})
            has_input = not _is_empty_payload(tool_input)
            has_output = not _is_empty_payload(tool_output)

            # Special formatting for google_search
            if tool_name == "google_search" and (has_input or has_output):
                formatted = _format_search_results(tool_input, tool_output)
                if formatted:
                    lines.append(formatted)
                continue

            # Special formatting for sogou_search
            if tool_name == "sogou_search" and (has_input or has_output):
                formatted = _format_sogou_search_results(tool_input, tool_output)
                if formatted:
                    lines.append(formatted)
                continue

            # Special formatting for scrape/webpage tools
            if tool_name in (
                "scrape",
                "scrape_website",
                "scrape_webpage",
                "scrape_and_extract_info",
            ) and (has_input or has_output):
                formatted = _format_scrape_results(tool_input, tool_output)
                if formatted:
                    lines.append(formatted)
                continue

            # Special formatting for code execution tools
            if tool_name in ("python", "run_python_code") and (has_input or has_output):
                # Use pure Markdown to avoid HTML wrapper blocking Markdown rendering
                lines.append("\n---\n")
                lines.append("#### 💻 Code Execution\n")
                # Show code input - try multiple possible keys
                code = ""
                if isinstance(tool_input, dict):
                    code = tool_input.get("code") or tool_input.get("code_block") or ""
                elif isinstance(tool_input, str):
                    code = tool_input
                if code:
                    lines.append(f"\n```python\n{code}\n```\n")
                # Show output if available
                if has_output:
                    output = ""
                    if isinstance(tool_output, dict):
                        output = (
                            tool_output.get("result")
                            or tool_output.get("output")
                            or tool_output.get("stdout")
                            or ""
                        )
                    elif isinstance(tool_output, str):
                        output = tool_output
                    if isinstance(output, str) and output.strip():
                        lines.append("\n**Output:**\n")
                        lines.append(
                            f'\n```text\n{output[:1000]}{"..." if len(output) > 1000 else ""}\n```\n'
                        )
                lines.append("\n✅ Executed\n")
                continue

            # Other tools - show as compact card
            if has_input or has_output:
                target_lines = final_summary_lines if is_final_summary else lines
                target_lines.append('<div class="tool-card">')
                target_lines.append(f'<div class="tool-header">🔧 {tool_name}</div>')
                if has_input:
                    # Show brief input summary
                    if isinstance(tool_input, dict):
                        brief = ", ".join(
                            f"{k}: {str(v)[:30]}..."
                            if len(str(v)) > 30
                            else f"{k}: {v}"
                            for k, v in list(tool_input.items())[:2]
                        )
                        target_lines.append(f'<div class="tool-brief">{brief}</div>')
                if has_output:
                    target_lines.append('<div class="tool-status">✓ Done</div>')
                target_lines.append("</div>")

    # Add final summary with Markdown-based styling (no HTML wrapper to preserve Markdown rendering)
    if final_summary_lines:
        lines.append("\n\n---\n\n")  # Markdown horizontal rule as divider
        lines.append("## 📋 Research Summary\n\n")
        lines.extend(final_summary_lines)

    return "\n".join(lines) if lines else "*Waiting to start research...*"


def _update_state_with_event(state: dict, message: dict):
    event = message.get("event")
    data = message.get("data", {})
    if event == "start_of_agent":
        agent_id = data.get("agent_id")
        agent_name = data.get("agent_name", "unknown")
        if agent_id and agent_id not in state["agents"]:
            state["agents"][agent_id] = {
                "agent_name": agent_name,
                "tool_call_order": [],
                "tools": {},
            }
            state["agent_order"].append(agent_id)
        state["current_agent_id"] = agent_id
    elif event == "end_of_agent":
        # End marker, no special handling needed, keep structure
        state["current_agent_id"] = None
    elif event == "tool_call":
        tool_call_id = data.get("tool_call_id")
        tool_name = data.get("tool_name", "unknown_tool")
        agent_id = state.get("current_agent_id") or (
            state["agent_order"][-1] if state["agent_order"] else None
        )
        if not agent_id:
            return state
        agent = state["agents"].setdefault(
            agent_id, {"agent_name": "unknown", "tool_call_order": [], "tools": {}}
        )
        tools = agent["tools"]
        if tool_call_id not in tools:
            tools[tool_call_id] = {"tool_name": tool_name}
            agent["tool_call_order"].append(tool_call_id)
        entry = tools[tool_call_id]
        if tool_name == "show_text" and "delta_input" in data:
            delta = data.get("delta_input", {}).get("text", "")
            _append_show_text(entry, delta)
        elif tool_name == "show_text" and "tool_input" in data:
            ti = data.get("tool_input")
            text = ""
            if isinstance(ti, dict):
                text = ti.get("text", "") or (
                    (ti.get("result") or {}).get("text")
                    if isinstance(ti.get("result"), dict)
                    else ""
                )
            elif isinstance(ti, str):
                text = ti
            if text:
                _append_show_text(entry, text)
        else:
            # Distinguish between input and output:
            if "tool_input" in data:
                # Could be input (first time) or output with result (second time)
                ti = data["tool_input"]
                # If contains result, assign to output; otherwise assign to input
                if isinstance(ti, dict) and "result" in ti:
                    entry["output"] = ti
                else:
                    # Only update input if we don't already have valid input data, or if the new data is not empty
                    if "input" not in entry or not _is_empty_payload(ti):
                        entry["input"] = ti
    elif event == "message":
        # Same incremental text display as show_text, aggregated by message_id
        message_id = data.get("message_id")
        agent_id = state.get("current_agent_id") or (
            state["agent_order"][-1] if state["agent_order"] else None
        )
        if not agent_id:
            return state
        agent = state["agents"].setdefault(
            agent_id, {"agent_name": "unknown", "tool_call_order": [], "tools": {}}
        )
        tools = agent["tools"]
        if message_id not in tools:
            tools[message_id] = {"tool_name": "message"}
            agent["tool_call_order"].append(message_id)
        entry = tools[message_id]
        delta_content = (data.get("delta") or {}).get("content", "")
        if isinstance(delta_content, str) and delta_content:
            _append_show_text(entry, delta_content)
    elif event == "error":
        # Collect errors, display uniformly during rendering
        err_text = data.get("error") if isinstance(data, dict) else None
        if not err_text:
            try:
                err_text = json.dumps(data, ensure_ascii=False)
            except Exception:
                err_text = str(data)
        state.setdefault("errors", []).append(err_text)
    else:
        # Ignore heartbeat or other events
        pass

    return state


_CANCEL_FLAGS = {}
_CANCEL_LOCK = threading.Lock()


def _set_cancel_flag(task_id: str):
    with _CANCEL_LOCK:
        _CANCEL_FLAGS[task_id] = True


def _reset_cancel_flag(task_id: str):
    with _CANCEL_LOCK:
        _CANCEL_FLAGS[task_id] = False


async def _disconnect_check_for_task(task_id: str):
    with _CANCEL_LOCK:
        return _CANCEL_FLAGS.get(task_id, False)


def _spinner_markup(running: bool) -> str:
    if not running:
        return ""
    return (
        '\n\n<div style="display:flex;align-items:center;gap:8px;color:#555;margin-top:8px;">'
        '<div style="width:16px;height:16px;border:2px solid #ddd;border-top-color:#3b82f6;border-radius:50%;animation:spin 0.8s linear infinite;"></div>'
        "<span>Generating...</span>"
        "</div>\n<style>@keyframes spin{to{transform:rotate(360deg)}}</style>\n"
    )


async def gradio_run(query: str, ui_state: Optional[dict]):
    """Run one research task and stream UI frames back to Gradio.

    Each yielded frame is a 4-tuple: rendered markdown, an update for the Run
    button, an update for the Stop button, and the UI state (which carries the
    ``task_id`` of this run so the Stop handler can target it).

    Args:
        query: The user's question; ``None`` is treated as an empty string.
        ui_state: Previous per-session state, if any.
    """
    query = replace_chinese_punctuation(query or "")
    task_id = str(uuid.uuid4())
    _reset_cancel_flag(task_id)
    # Carry over whatever the session already stored, but point it at this run.
    ui_state = {**(ui_state or {}), "task_id": task_id}
    state = _init_render_state()

    def _frame(running: bool):
        # While running: Run disabled, Stop enabled, spinner appended.
        # When finished: the reverse, and no spinner.
        body = _render_markdown(state)
        if running:
            body = body + _spinner_markup(True)
        return (
            body,
            gr.update(interactive=not running),
            gr.update(interactive=running),
            ui_state,
        )

    # Initial frame, before any event has arrived.
    yield _frame(True)

    events = stream_events_optimized(
        task_id, query, None, lambda: _disconnect_check_for_task(task_id)
    )
    async for message in events:
        # Heartbeats carry nothing to display, so no frame is emitted for them.
        if message.get("event", "unknown") == "heartbeat":
            continue

        state = _update_state_with_event(state, message)
        yield _frame(True)
        # Small delay to allow Gradio to process the update
        await asyncio.sleep(0.01)

    # Final frame: re-enable Run, disable Stop, drop the spinner.
    yield _frame(False)


def stop_current(ui_state: Optional[dict]):
    """Request cancellation of the session's current task.

    Args:
        ui_state: Per-session state; its ``task_id`` (if set) identifies the
            run to cancel.

    Returns:
        Updates for ``(run button, stop button)``: Run is re-enabled and Stop
        is disabled straight away, without waiting for the stream to end.
    """
    session = ui_state if ui_state else {}
    task_id = session.get("task_id")
    if task_id:
        _set_cancel_flag(task_id)

    run_button_update = gr.update(interactive=True)
    stop_button_update = gr.update(interactive=False)
    return run_button_update, stop_button_update


def build_demo():
    """Build the Gradio Blocks UI for the MiroThinker demo.

    The page consists of a top navigation bar, a hero banner, the question
    input with Stop / Start buttons, a Markdown output pane, and a footer.
    The Start button is wired to ``gradio_run`` and the Stop button to
    ``stop_current``; both share a per-session ``gr.State`` holding the
    current ``task_id``.

    Returns:
        The constructed ``gr.Blocks`` instance. It is not launched here.
    """
    # Use remote logo from dr.miromind.ai for faster page load

    # Page-wide stylesheet. Element ids used below (#input-section,
    # #question-input, #btn-row, #run-btn, #stop-btn, #output-section,
    # #log-view) match the elem_id values given to the components.
    custom_css = """
    /* ========== MiroThinker - Modern Clean Design ========== */
    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
    
    /* Base */
    .gradio-container {
        max-width: 100% !important;
        margin: 0 !important;
        padding: 0 !important;
        font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
        background: #ffffff !important;
        min-height: 100vh;
    }
    
    footer { display: none !important; }
    
    /* ===== Top Navigation ===== */
    .top-nav {
        display: flex;
        align-items: center;
        justify-content: space-between;
        padding: 16px 32px;
        border-bottom: 1px solid #f0f0f0;
        background: #ffffff;
    }
    
    .nav-left {
        display: flex;
        align-items: center;
        gap: 20px;
    }
    
    .nav-brand {
        display: flex;
        align-items: center;
        gap: 10px;
        font-weight: 600;
        font-size: 1.1em;
        color: #18181b;
    }
    
    .brand-logo {
        width: 32px;
        height: 32px;
        border-radius: 6px;
    }
    
    .nav-links {
        display: flex;
        align-items: center;
        gap: 12px;
    }
    
    .nav-links a {
        color: #71717a;
        font-size: 1.1em;
        text-decoration: none;
        transition: color 0.2s;
    }
    
    .nav-links a:hover {
        color: #18181b;
    }
    
    .nav-right {
        display: flex;
        align-items: center;
        gap: 16px;
    }
    
    .nav-right a {
        color: #52525b;
        text-decoration: none;
        font-size: 0.9em;
    }
    
    /* ===== Hero Section ===== */
    .hero-section {
        text-align: center;
        padding: 60px 24px 40px;
        max-width: 900px;
        margin: 0 auto;
    }
    
    .hero-title {
        font-size: 3em;
        font-weight: 700;
        background: linear-gradient(135deg, #10b981 0%, #14b8a6 50%, #06b6d4 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        background-clip: text;
        margin: 0 0 16px 0;
        letter-spacing: -0.02em;
    }
    
    .hero-subtitle {
        display: flex;
        align-items: center;
        justify-content: center;
        gap: 16px;
        color: #71717a;
        font-size: 1em;
    }
    
    .hero-line {
        width: 40px;
        height: 1px;
        background: #d4d4d8;
    }
    
    /* ===== Input Section ===== */
    #input-section {
        max-width: 720px !important;
        margin: 0 auto 40px !important;
        background: #ffffff;
        border: 1px solid #e0e0e0;
        border-radius: 16px;
        box-shadow: 0 2px 8px rgba(0,0,0,0.04);
    }
    
    #question-input {
        padding: 20px 24px !important;
        background: #ffffff !important;
        border: none !important;
    }
    
    #question-input textarea {
        background: #ffffff !important;
        border: none !important;
        font-size: 1.05em !important;
        line-height: 1.7 !important;
        color: #18181b !important;
        box-shadow: none !important;
    }
    
    #question-input textarea:focus {
        outline: none !important;
        box-shadow: none !important;
    }
    
    #question-input textarea::placeholder {
        color: #9ca3af !important;
    }
    
    #btn-row {
        padding: 16px 24px !important;
        border-top: 1px solid #f0f0f0;
        gap: 12px !important;
    }
    
    #run-btn {
        background: linear-gradient(135deg, #10b981 0%, #14b8a6 100%) !important;
        color: #ffffff !important;
        border: none !important;
        border-radius: 10px !important;
        padding: 12px 24px !important;
        font-size: 0.95em !important;
        font-weight: 500 !important;
        cursor: pointer !important;
        transition: opacity 0.2s, transform 0.2s !important;
    }
    
    #run-btn:hover {
        opacity: 0.9 !important;
        transform: translateY(-1px) !important;
    }
    
    #stop-btn {
        background: #ffffff !important;
        color: #71717a !important;
        border: 1px solid #e5e5e5 !important;
        border-radius: 10px !important;
        padding: 12px 20px !important;
        font-size: 0.95em !important;
        font-weight: 500 !important;
        cursor: pointer !important;
        transition: all 0.2s !important;
    }
    
    #stop-btn:hover {
        color: #ef4444 !important;
        border-color: #fecaca !important;
        background: #fef2f2 !important;
    }
    
    /* ===== Output Section ===== */
    #output-section {
        max-width: 900px !important;
        margin: 0 auto !important;
        padding: 0 24px 60px !important;
    }
    
    .output-label {
        font-size: 0.85em;
        font-weight: 500;
        color: #71717a;
        text-transform: uppercase;
        letter-spacing: 0.05em;
        margin-bottom: 12px;
        padding: 0 4px;
    }
    
    #log-view {
        padding: 24px !important;
        min-height: 400px;
        max-height: 70vh;
        overflow-y: auto;
        background: #ffffff !important;
        border: 1px solid #e5e5e5 !important;
        border-radius: 16px !important;
    }
    
    #log-view h3 {
        font-size: 0.95em;
        font-weight: 600;
        color: #18181b;
        margin: 24px 0 16px 0;
        padding-bottom: 8px;
        border-bottom: 1px solid #f4f4f5;
    }
    
    #log-view h3:first-child {
        margin-top: 0;
    }
    
    /* Error block */
    .error-block {
        background: #fef2f2;
        border: 1px solid #fecaca;
        border-radius: 10px;
        padding: 12px 16px;
        margin: 12px 0;
        color: #dc2626;
        font-size: 0.9em;
    }
    
    /* Tool card */
    .tool-card {
        background: #fafafa;
        border: 1px solid #e5e5e5;
        border-radius: 10px;
        padding: 12px 16px;
        margin: 12px 0;
    }
    
    .tool-header {
        font-size: 0.9em;
        font-weight: 500;
        color: #3f3f46;
        margin-bottom: 4px;
    }
    
    .tool-brief {
        font-size: 0.8em;
        color: #71717a;
        margin-top: 4px;
    }
    
    .tool-status {
        font-size: 0.8em;
        color: #10b981;
        margin-top: 6px;
    }
    
    #log-view blockquote {
        background: linear-gradient(135deg, #f0fdf4 0%, #ecfeff 100%);
        border: none;
        border-left: 3px solid #10b981;
        padding: 16px 20px;
        margin: 16px 0;
        border-radius: 0 12px 12px 0;
        font-style: normal;
        color: #065f46;
        font-size: 0.9em;
        line-height: 1.7;
    }
    
    #log-view pre {
        background: #f8f9fa !important;
        color: #1e293b !important;
        border-radius: 8px !important;
        padding: 16px !important;
        font-size: 0.85em !important;
        line-height: 1.6 !important;
        overflow-x: auto;
        margin: 12px 0;
        border: 1px solid #e2e8f0;
    }
    
    #log-view pre code {
        background: transparent !important;
        color: #1e293b !important;
        font-family: 'SF Mono', 'Fira Code', 'JetBrains Mono', Consolas, monospace !important;
        font-size: inherit !important;
        padding: 0 !important;
        white-space: pre-wrap;
        word-break: break-word;
    }
    
    #log-view code {
        font-family: 'SF Mono', 'Fira Code', 'JetBrains Mono', Consolas, monospace !important;
        background: #f1f5f9 !important;
        color: #1e293b !important;
        padding: 2px 6px !important;
        border-radius: 4px !important;
        font-size: 0.9em !important;
    }
    
    #log-view p {
        line-height: 1.7;
        color: #3f3f46;
    }
    
    #log-view::-webkit-scrollbar {
        width: 6px;
    }
    
    #log-view::-webkit-scrollbar-track {
        background: transparent;
    }
    
    #log-view::-webkit-scrollbar-thumb {
        background: #e5e5e5;
        border-radius: 3px;
    }
    
    #log-view::-webkit-scrollbar-thumb:hover {
        background: #d4d4d8;
    }
    
    /* ===== Footer ===== */
    .app-footer {
        text-align: center;
        padding: 24px;
        color: #a1a1aa;
        font-size: 0.85em;
        border-top: 1px solid #f0f0f0;
    }
    
    /* ===== Loading Spinner ===== */
    @keyframes spin {
        to { transform: rotate(360deg); }
    }
    
    .loading-indicator {
        display: inline-flex;
        align-items: center;
        gap: 10px;
        color: #10b981;
        font-size: 0.9em;
        padding: 12px 0;
    }
    
    .loading-indicator::before {
        content: '';
        width: 16px;
        height: 16px;
        border: 2px solid #d1fae5;
        border-top-color: #10b981;
        border-radius: 50%;
        animation: spin 0.8s linear infinite;
    }
    
    /* ===== Search Results Card ===== */
    .search-card {
        background: #ffffff;
        border: 1px solid #e5e5e5;
        border-radius: 12px;
        margin: 16px 0;
        overflow: hidden;
    }
    
    .search-header {
        display: flex;
        align-items: center;
        gap: 10px;
        padding: 14px 18px;
        background: #fafafa;
        border-bottom: 1px solid #f0f0f0;
    }
    
    .search-icon {
        font-size: 1em;
        color: #10b981;
    }
    
    .search-query {
        font-size: 0.9em;
        color: #3f3f46;
        font-weight: 500;
    }
    
    .search-count {
        padding: 10px 18px;
        font-size: 0.8em;
        color: #71717a;
        background: #fafafa;
        border-bottom: 1px solid #f0f0f0;
    }
    
    .search-results {
        padding: 8px 0;
    }
    
    .search-result-item {
        display: flex;
        align-items: center;
        gap: 12px;
        padding: 12px 18px;
        text-decoration: none;
        color: #3f3f46;
        font-size: 0.9em;
        transition: background 0.15s;
        border-left: 3px solid transparent;
    }
    
    .search-result-item:hover {
        background: #f9fafb;
        border-left-color: #10b981;
    }
    
    .result-icon {
        font-size: 1em;
        flex-shrink: 0;
        opacity: 0.6;
    }
    
    .result-title {
        flex: 1;
        overflow: hidden;
        text-overflow: ellipsis;
        white-space: nowrap;
    }
    
    /* ===== Scrape Card ===== */
    .scrape-card {
        background: #ffffff;
        border: 1px solid #e5e5e5;
        border-radius: 10px;
        margin: 12px 0;
        padding: 12px 16px;
        display: flex;
        align-items: center;
        justify-content: space-between;
        gap: 12px;
    }
    
    .scrape-card.scrape-error {
        border-color: #fecaca;
        background: #fef2f2;
    }
    
    .scrape-header {
        display: flex;
        align-items: center;
        gap: 10px;
        flex: 1;
        min-width: 0;
    }
    
    .scrape-icon {
        font-size: 1em;
        opacity: 0.6;
    }
    
    .scrape-url {
        font-size: 0.85em;
        color: #52525b;
        overflow: hidden;
        text-overflow: ellipsis;
        white-space: nowrap;
    }
    
    .scrape-status {
        font-size: 0.8em;
        padding: 4px 10px;
        border-radius: 6px;
        flex-shrink: 0;
    }
    
    .scrape-status.success {
        background: #ecfdf5;
        color: #059669;
    }
    
    .scrape-status.error {
        background: #fef2f2;
        color: #dc2626;
    }
    
    /* ===== Final Summary Section ===== */
    .final-summary-divider {
        height: 1px;
        background: linear-gradient(to right, transparent, #e5e5e5, transparent);
        margin: 32px 0;
    }
    
    .final-summary-section {
        background: linear-gradient(135deg, #f8fafc 0%, #f1f5f9 100%);
        border: 1px solid #e2e8f0;
        border-radius: 16px;
        padding: 24px;
        margin-top: 16px;
    }
    
    .final-summary-header {
        font-size: 1.1em;
        font-weight: 600;
        color: #1e293b;
        margin-bottom: 16px;
        padding-bottom: 12px;
        border-bottom: 2px solid #3b82f6;
        display: inline-block;
    }
    
    .final-summary-content {
        color: #334155;
        line-height: 1.8;
    }
    
    .final-summary-content h1,
    .final-summary-content h2,
    .final-summary-content h3 {
        color: #1e293b;
        margin-top: 1.5em;
        margin-bottom: 0.5em;
    }
    
    .final-summary-content h1 { font-size: 1.4em; }
    .final-summary-content h2 { font-size: 1.2em; }
    .final-summary-content h3 { font-size: 1.1em; }
    
    .final-summary-content p {
        margin: 0.8em 0;
    }
    
    .final-summary-content ul,
    .final-summary-content ol {
        margin: 0.8em 0;
        padding-left: 1.5em;
    }
    
    .final-summary-content li {
        margin: 0.4em 0;
    }
    
    .final-summary-content a {
        color: #3b82f6;
        text-decoration: none;
    }
    
    .final-summary-content a:hover {
        text-decoration: underline;
    }
    
    .final-summary-content code {
        background: #e2e8f0;
        padding: 2px 6px;
        border-radius: 4px;
        font-family: 'SF Mono', 'Fira Code', monospace;
        font-size: 0.9em;
    }
    
    .final-summary-content pre {
        background: #1e293b;
        color: #e2e8f0;
        padding: 16px;
        border-radius: 8px;
        overflow-x: auto;
    }
    
    .final-summary-content pre code {
        background: transparent;
        padding: 0;
        color: inherit;
    }
    
    .final-summary-content table {
        width: 100%;
        border-collapse: collapse;
        margin: 1em 0;
    }
    
    .final-summary-content th,
    .final-summary-content td {
        padding: 10px 12px;
        border: 1px solid #e2e8f0;
        text-align: left;
    }
    
    .final-summary-content th {
        background: #f1f5f9;
        font-weight: 600;
    }
    
    .final-summary-content blockquote {
        border-left: 4px solid #3b82f6;
        margin: 1em 0;
        padding: 0.5em 1em;
        background: #f8fafc;
        color: #475569;
    }
    
    /* ===== Code Execution Card ===== */
    .code-card {
        background: #1e1e2e;
        border: 1px solid #313244;
        border-radius: 12px;
        margin: 12px 0;
        padding: 16px;
        overflow: hidden;
    }
    
    .code-header {
        font-size: 0.9em;
        font-weight: 600;
        color: #cdd6f4;
        margin-bottom: 12px;
        display: flex;
        align-items: center;
        gap: 8px;
    }
    
    .code-card pre {
        background: #11111b !important;
        border-radius: 8px;
        padding: 12px 16px;
        margin: 8px 0;
        overflow-x: auto;
        font-family: 'SF Mono', 'Fira Code', 'JetBrains Mono', Consolas, monospace !important;
        font-size: 0.85em;
        line-height: 1.5;
    }
    
    .code-card code {
        background: transparent !important;
        color: #cdd6f4 !important;
        font-family: 'SF Mono', 'Fira Code', 'JetBrains Mono', Consolas, monospace !important;
    }
    
    .code-output-label {
        font-size: 0.8em;
        color: #a6adc8;
        margin-top: 12px;
        margin-bottom: 4px;
    }
    
    .code-status {
        font-size: 0.8em;
        color: #a6e3a1;
        margin-top: 8px;
        text-align: right;
    }
    
    /* ===== Responsive ===== */
    @media (max-width: 768px) {
        .hero-title {
            font-size: 2em;
        }
        
        .hero-section {
            padding: 40px 16px 24px;
        }
        
        .input-wrapper, .output-wrapper {
            padding: 0 16px;
        }
        
        #log-view {
            max-height: 50vh;
        }
    }
    """

    # Favicon head content
    favicon_head = '<link rel="icon" href="https://dr.miromind.ai/favicon.ico?v=2">'

    with gr.Blocks(
        css=custom_css,
        title="MiroThinker - Deep Research",
        theme=gr.themes.Base(),
        head=favicon_head,
    ) as demo:
        # Top Navigation
        # The GitHub link targets the MiroMindAI organisation, where this
        # repository is hosted.
        # NOTE(review): confirm the Hugging Face organisation URL as well.
        gr.HTML("""
            <nav class="top-nav">
                <div class="nav-left">
                    <div class="nav-brand">
                        <img src="https://dr.miromind.ai/favicon.png" class="brand-logo" alt="MiroThinker" />
                        MiroThinker
                    </div>
                    <div class="nav-links">
                        <a href="https://huggingface.co/MiroMind" target="_blank">🤗</a>
                        <a href="https://github.com/MiroMindAI/MiroThinker" target="_blank">
                            <svg width="20" height="20" viewBox="0 0 24 24" fill="currentColor">
                                <path d="M12 0c-6.626 0-12 5.373-12 12 0 5.302 3.438 9.8 8.207 11.387.599.111.793-.261.793-.577v-2.234c-3.338.726-4.033-1.416-4.033-1.416-.546-1.387-1.333-1.756-1.333-1.756-1.089-.745.083-.729.083-.729 1.205.084 1.839 1.237 1.839 1.237 1.07 1.834 2.807 1.304 3.492.997.107-.775.418-1.305.762-1.604-2.665-.305-5.467-1.334-5.467-5.931 0-1.311.469-2.381 1.236-3.221-.124-.303-.535-1.524.117-3.176 0 0 1.008-.322 3.301 1.23.957-.266 1.983-.399 3.003-.404 1.02.005 2.047.138 3.006.404 2.291-1.552 3.297-1.23 3.297-1.23.653 1.653.242 2.874.118 3.176.77.84 1.235 1.911 1.235 3.221 0 4.609-2.807 5.624-5.479 5.921.43.372.823 1.102.823 2.222v3.293c0 .319.192.694.801.576 4.765-1.589 8.199-6.086 8.199-11.386 0-6.627-5.373-12-12-12z"/>
                            </svg>
                        </a>
                    </div>
                </div>
                <div class="nav-right">
                    <a href="https://miromind.ai" target="_blank">Visit Website</a>
                </div>
            </nav>
        """)

        # Hero Section
        gr.HTML("""
            <div class="hero-section">
                <h1 class="hero-title">Research Deep. Uncover the Future</h1>
                <div class="hero-subtitle">
                    <span class="hero-line"></span>
                    Don't just chat. Predict, verify, and discover with science-based AI.
                    <span class="hero-line"></span>
                </div>
            </div>
        """)

        # Input Section
        with gr.Column(elem_id="input-section"):
            inp = gr.Textbox(
                lines=4,
                placeholder="Enter your research question...",
                show_label=False,
                elem_id="question-input",
            )
            with gr.Row(elem_id="btn-row"):
                # Stop starts disabled; gradio_run enables it while a task runs.
                stop_btn = gr.Button(
                    "⏹ Stop",
                    elem_id="stop-btn",
                    variant="stop",
                    interactive=False,
                    scale=1,
                )
                run_btn = gr.Button(
                    "Start Research ➤", elem_id="run-btn", variant="primary", scale=2
                )

        # Output Section
        with gr.Column(elem_id="output-section"):
            gr.HTML('<div class="output-label">Research Progress</div>')
            out_md = gr.Markdown("*Waiting to start research...*", elem_id="log-view")

        # State: per-session dict carrying the task_id of the current run
        ui_state = gr.State({"task_id": None})

        # Event handlers
        # gradio_run yields (markdown, run button, stop button, state) frames,
        # matching the four outputs listed here.
        run_btn.click(
            fn=gradio_run,
            inputs=[inp, ui_state],
            outputs=[out_md, run_btn, stop_btn, ui_state],
        )
        stop_btn.click(fn=stop_current, inputs=[ui_state], outputs=[run_btn, stop_btn])

        # Footer
        gr.HTML("""
            <div class="app-footer">
                Content generated by MiroMind AI. Please verify important information.
            </div>
        """)

    return demo


if __name__ == "__main__":
    # Script entry point: build the UI, then serve it.
    demo = build_demo()
    # Bind address and port are taken from the HOST / PORT environment
    # variables, defaulting to 0.0.0.0 and 8080. A non-numeric PORT raises
    # ValueError here.
    host = os.getenv("HOST", "0.0.0.0")
    port = int(os.getenv("PORT", "8080"))
    # Enable Gradio's queue before launching the server.
    demo.queue().launch(server_name=host, server_port=port)


================================================
FILE: apps/gradio-demo/prompt_patch.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
Custom Prompt Override (Monkey Patching)

This module allows customizing prompts without modifying miroflow-agent code.

Patches applied:
1. `generate_mcp_system_prompt` - Prepends custom identity prompt
2. `process_input` - Removes the boxed format requirement suffix
3. `generate_agent_summarize_prompt` - Uses user-friendly summary prompt for demo
4. `format_final_summary_and_log` - Disables boxed format check to prevent retry

Usage:
    from prompt_patch import apply_prompt_patch
    apply_prompt_patch()
"""

import re

# ============================================================================
# Custom Identity Prompt
# ============================================================================

# Identity preamble for demo mode. `_patch_system_prompt` concatenates it
# directly in front of the generated MCP system prompt, so the trailing blank
# line is what separates it from the original prompt text.
CUSTOM_IDENTITY_PROMPT = """You are MiroThinker, a specialized deep research AI assistant developed by MiroMind.

IMPORTANT IDENTITY REMINDER:
- You are NOT ChatGPT, Claude, or any other AI assistant

"""

# ============================================================================
# Strings to Remove from Input Processing
# ============================================================================

# This string is appended to task descriptions in input_handler.py.
# We remove it for demo mode since we don't need the strict boxed format.
# Removal is done with an exact `str.replace`, so this literal has to match
# the appended text character for character (including the leading newline).
BOXED_FORMAT_SUFFIX = "\nYou should follow the format instruction in the request strictly and wrap the final answer in \\boxed{}."

# ============================================================================
# Custom Summarize Prompt for Demo Mode
# ============================================================================


def get_demo_summarize_prompt(target_language: str, task_description: str) -> str:
    """
    Build the final-summary prompt used for the main agent in demo mode.

    The prompt asks for a Markdown answer to the original question (with
    numeric `[ID]` citations and a trailing references section) instead of a
    strict boxed answer, and states that no further tool calls are allowed.

    Args:
        target_language: Language description interpolated into the prompt,
            both in the "Language" requirement and in the references
            instruction.
        task_description: The original user question; inserted verbatim at the
            end of the prompt under "Original Question (for reference)".

    Returns:
        The summarize prompt string
    """
    return f"""Please provide the final research summary based only on the information already gathered.
No further tool calls are allowed.

## Requirements
- **Language**: Write the entire response in **{target_language}**.
- **Focus**: Directly answer the original question above. Do not just summarize gathered information — provide a clear, actionable answer.
- **Response Length**: Match the complexity of your response to the question. For simple or short questions, provide a concise and direct answer without unnecessary elaboration. For complex questions, provide a detailed and structured report.
- Use clear and structured Markdown formatting when appropriate.
- Use appropriate Markdown headings (e.g., #, ##, ###) only when the content warrants structure.
- Present key findings in an organized, concise, and readable way.
- Use tables only when they genuinely improve clarity.
- **Currency Format**: Use `\\$` instead of `$` for currency amounts (e.g., `\\$100`, `\\$1,000`) to avoid conflicts with inline math syntax.
- **Citation Format**:
  - **In-Text**: Use the format `[ID]`, where `ID` is a **numeric identifier only** (digits 0–9), e.g. `[1]`, `[2]`.
  - **References Section(if has any sources)**: At the very end, add "References" (or equivalent in {target_language}). Format: [ID] TITLE/SECTION_TITLE. <URL>/<FILENAME>.
- Do NOT mention tools, tool calls, or internal reasoning steps.
- Focus solely on delivering a professional, easy-to-read response that answers the user's original question.

## Original Question (for reference)
{task_description}"""


def _detect_language(text: str) -> str:
    """
    Simple language detection based on character analysis.

    Returns a language description suitable for the summarize prompt.
    """
    # Count characters by script
    chinese_chars = sum(1 for c in text if "\u4e00" <= c <= "\u9fff")
    japanese_chars = sum(
        1 for c in text if "\u3040" <= c <= "\u30ff" or "\u31f0" <= c <= "\u31ff"
    )
    korean_chars = sum(1 for c in text if "\uac00" <= c <= "\ud7af")

    total_chars = len(text.replace(" ", ""))
    if total_chars == 0:
        return "English"

    # Determine primary language
    if chinese_chars / total_chars > 0.1:
        return "Chinese (Simplified)"
    elif japanese_chars / total_chars > 0.1:
        return "Japanese"
    elif korean_chars / total_chars > 0.1:
        return "Korean"
    else:
        return "the same language as the user's question"


# ============================================================================
# Monkey Patching
# ============================================================================

_patched = False


def apply_prompt_patch():
    """
    Install all demo-mode monkey patches, once per process.

    The individual patches, in the order they are installed:
    1. `generate_mcp_system_prompt` - custom identity prompt is prepended
    2. `process_input` - boxed format requirement is stripped from the task
    3. `generate_agent_summarize_prompt` - user-friendly summary prompt
    4. `format_final_summary_and_log` - boxed format check disabled (no retry)

    A module-level flag records that the patches were installed, so repeated
    calls are no-ops.
    """
    global _patched

    if not _patched:
        installers = (
            _patch_system_prompt,
            _patch_input_handler,
            _patch_summarize_prompt,
            _patch_output_formatter,
        )
        for install in installers:
            install()

        # Only reached when every installer returned without raising.
        _patched = True


def _patch_system_prompt():
    """Wrap system prompt generation so the custom identity text comes first."""
    from src.llm.providers import anthropic_client, openai_client
    from src.utils import prompt_utils

    # Capture the unpatched generator before any module attribute is rebound.
    base_generate = prompt_utils.generate_mcp_system_prompt

    def patched_generate_mcp_system_prompt(date, mcp_servers):
        """Return the original system prompt with the identity prompt in front."""
        return CUSTOM_IDENTITY_PROMPT + base_generate(date, mcp_servers)

    # Rebind the name in every module that imports and uses this function,
    # not only in the module that defines it.
    for module in (prompt_utils, openai_client, anthropic_client):
        module.generate_mcp_system_prompt = patched_generate_mcp_system_prompt


def _patch_input_handler():
    """Wrap `process_input` so its results no longer carry the boxed suffix."""
    from src.core import orchestrator
    from src.io import input_handler

    # Keep a handle on the unpatched implementation for delegation.
    wrapped_process_input = input_handler.process_input

    def patched_process_input(task_description: str, task_file_name: str):
        """Delegate to the original, then drop BOXED_FORMAT_SUFFIX from both results."""
        first, second = wrapped_process_input(task_description, task_file_name)
        return (
            first.replace(BOXED_FORMAT_SUFFIX, ""),
            second.replace(BOXED_FORMAT_SUFFIX, ""),
        )

    # Rebind in the defining module and in orchestrator, which imports it.
    for module in (input_handler, orchestrator):
        module.process_input = patched_process_input


def _patch_summarize_prompt():
    """Replace summarize prompt generation with a demo-friendly variant."""
    from src.core import answer_generator, orchestrator
    from src.utils import prompt_utils

    def patched_generate_agent_summarize_prompt(
        task_description: str, agent_type: str = ""
    ) -> str:
        """
        Build the summarize prompt for the given agent type.

        The main agent gets the Markdown-oriented demo prompt, written in the
        language detected from the task description. Browsing sub-agents get a
        structured-report prompt. Any other agent type raises ValueError.
        """
        if agent_type == "main":
            # Language is inferred from the question itself.
            return get_demo_summarize_prompt(
                _detect_language(task_description), task_description
            )

        if agent_type not in ("agent-browsing", "browsing-agent"):
            raise ValueError(f"Unknown agent type: {agent_type}")

        # Sub-agent prompt: report everything gathered, without tool calls.
        sub_agent_prompt = (
            "This is a direct instruction to you (the assistant), not the result of a tool call.\n\n"
            "We are now ending this session, and your conversation history will be deleted. "
            "You must NOT initiate any further tool use. This is your final opportunity to report "
            "*all* of the information gathered during the session.\n\n"
            "The original task is repeated here for reference:\n\n"
            f'"{task_description}"\n\n'
            "Summarize the above search and browsing history. Output the FINAL RESPONSE and detailed supporting information of the task given to you.\n\n"
            "If you found any useful facts, data, quotes, or answers directly relevant to the original task, include them clearly and completely.\n"
            "If you reached a conclusion or answer, include it as part of the response.\n"
            "If the task could not be fully answered, do NOT make up any content. Instead, return all partially relevant findings, "
            "Search results, quotes, and observations that might help a downstream agent solve the problem.\n"
            "If partial, conflicting, or inconclusive information was found, clearly indicate this in your response.\n\n"
            "Your final response should be a clear, complete, and structured report.\n"
            "Organize the content into logical sections with appropriate headings.\n"
            "Do NOT include any tool call instructions, speculative filler, or vague summaries.\n"
            "Focus on factual, specific, and well-organized information."
        )
        return sub_agent_prompt.strip()

    # Rebind the name in every module that imports and uses this function.
    for module in (prompt_utils, orchestrator, answer_generator):
        module.generate_agent_summarize_prompt = patched_generate_agent_summarize_prompt


def _patch_output_formatter():
    r"""
    Patch output formatter to disable boxed format check.

    In demo mode, we don't require \boxed{} format, so we patch the
    format_final_summary_and_log method to always return a valid result
    instead of FORMAT_ERROR_MESSAGE, which would trigger retry logic.

    The docstrings in this function are raw strings on purpose: in a regular
    string literal the sequence ``\b`` is the backspace escape.
    """
    from src.io import output_formatter

    # Get the OutputFormatter class
    OutputFormatter = output_formatter.OutputFormatter

    def patched_format_final_summary_and_log(self, final_answer_text: str, client=None):
        r"""
        Patched version that doesn't return FORMAT_ERROR_MESSAGE.

        Instead of requiring \boxed{} content, the entire answer (with
        <think>...</think> sections removed) is used as the result. If boxed
        content is present it still takes precedence, for compatibility.

        Args:
            final_answer_text: The raw final answer text.
            client: Optional object; when it provides
                ``format_token_usage_summary()`` that summary is appended.

        Returns:
            A tuple ``(summary_text, result, log_string)`` where ``result`` is
            never empty: it falls back to a fixed demo-mode placeholder.
        """
        summary_lines = [
            "\n" + "=" * 30 + " Final Answer " + "=" * 30,
            final_answer_text,
        ]

        # In demo mode, use the full answer text (minus thinking) as the result
        # Remove <think>...</think> tags for the extracted result
        boxed_result = re.sub(
            r"<think>.*?</think>", "", final_answer_text, flags=re.DOTALL
        ).strip()

        # If there's actual boxed content, extract it (for compatibility)
        actual_boxed = self._extract_boxed_content(final_answer_text)
        if actual_boxed:
            boxed_result = actual_boxed

        # Add extracted result section; if stripping left nothing, show the
        # raw answer rather than an empty section.
        summary_lines.append("\n" + "-" * 20 + " Extracted Result " + "-" * 20)
        summary_lines.append(boxed_result if boxed_result else final_answer_text)

        # Token usage statistics and cost estimation
        if client and hasattr(client, "format_token_usage_summary"):
            token_summary_lines, log_string = client.format_token_usage_summary()
            summary_lines.extend(token_summary_lines)
        else:
            summary_lines.append("\n" + "-" * 20 + " Token Usage & Cost " + "-" * 20)
            summary_lines.append("Token usage information not available.")
            summary_lines.append("-" * (40 + len(" Token Usage & Cost ")))
            log_string = "Token usage information not available."

        # Return boxed_result (never FORMAT_ERROR_MESSAGE in demo mode)
        # This ensures no retry is triggered
        return (
            "\n".join(summary_lines),
            boxed_result or "Demo mode - no boxed format required",
            log_string,
        )

    # Apply patch
    OutputFormatter.format_final_summary_and_log = patched_format_final_summary_and_log


def get_custom_identity_prompt() -> str:
    """Return the module-level CUSTOM_IDENTITY_PROMPT string unchanged."""
    return CUSTOM_IDENTITY_PROMPT


================================================
FILE: apps/gradio-demo/pyproject.toml
================================================
[project]
name = "gradio-demo"
version = "0.1.0"
description = "Gradio Demo"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "pydantic>=2.10.0",
    "python-dotenv>=1.0.0",
    "hydra-core>=1.3.0",
    "miroflow-agent",
    "aiohttp>=3.12.15",
    "gradio>=5.42.0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["./"]

[tool.uv.sources]
miroflow-agent = { path = "../miroflow-agent", editable = true }

[dependency-groups]
dev = [
    "pytest>=8.4.1",
    "pytest-asyncio>=1.0.0",
    "httpx>=0.28.1",
]


================================================
FILE: apps/gradio-demo/utils.py
================================================
import re


def contains_chinese(text):
    """
    Report whether ``text`` has at least one CJK character or CJK/fullwidth
    punctuation mark.

    Args:
        text (str): The string to inspect

    Returns:
        bool: True as soon as one matching character exists, False otherwise
    """
    # Character class, range by range:
    #   \u4e00-\u9fff  CJK Unified Ideographs
    #   \u3400-\u4dbf  CJK Extension A
    #   \uf900-\ufaff  CJK Compatibility Ideographs
    #   \u3000-\u303f  CJK Symbols and Punctuation
    #   \uff00-\uffef  Fullwidth ASCII and fullwidth punctuation
    cjk_class = r"[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff\u3000-\u303f\uff00-\uffef]"
    return re.search(cjk_class, text) is not None


def replace_chinese_punctuation(text):
    """Return ``text`` with common Chinese punctuation swapped for ASCII marks."""
    # Position i in the first string is replaced by position i in the second.
    chinese_marks = "，。！？；：“”‘’（）【】《》、—"
    ascii_marks = ",.!?;:\"\"''()[]<>,-"
    mark_table = str.maketrans(chinese_marks, ascii_marks)

    # The doubled ellipsis is a two-character sequence, which a translation
    # table cannot express, so it is rewritten separately first.
    without_ellipsis = text.replace("……", "...")
    return without_ellipsis.translate(mark_table)


================================================
FILE: apps/lobehub-compatibility/MiroThinkerToolParser.py
================================================
"""
Tool parser plugin for vLLM that makes the MiroThinker MCP format compatible with OpenAI's tool-calling interface.
MCP format:
    <use_mcp_tool>
        <server_name>server name</server_name>
        <tool_name>tool name</tool_name>
        <arguments>
        {...}
        </arguments>
    </use_mcp_tool>
"""

import json
from collections.abc import Sequence

import json_repair
import regex as re
from vllm.entrypoints.chat_utils import make_tool_call_id
from vllm.entrypoints.openai.protocol import (
    ChatCompletionRequest,
    DeltaFunctionCall,
    DeltaMessage,
    DeltaToolCall,
    ExtractedToolCallInformation,
    FunctionCall,
    ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
    ToolParser,
    ToolParserManager,
)
from vllm.logger import init_logger

logger = init_logger(__name__)


class MirothinkerToolParser(ToolParser):
    def __init__(self, tokenizer):
        """Initialise streaming state, tag tokens and the parsing regexes.

        Args:
            tokenizer: Passed straight through to the base class constructor.
        """
        super().__init__(tokenizer)

        # State tracking for streaming.
        # Within this class these five attributes are only reset at the start
        # of a stream (and two of them padded by `_ensure_tool_id_valid`).
        # NOTE(review): they may be read by the base class or the serving
        # layer — confirm before removing.
        self.current_tool_name_sent: bool = False
        self.prev_tool_call_arr: list[dict] = []
        self.current_tool_id: int = -1
        self.streamed_args_for_tool: list[str] = []
        self.buffer: str = ""  # Buffer for potential tool call tags
        # Memo for `_resolve_tool_name`, keyed by (server_name, tool_name).
        self._resolved_tool_name_cache: dict[tuple[str, str], str] = {}

        # Correctness-first streaming state (incremental state machine)
        self._stream_mode: str = "text"  # "text" | "tool"
        self._text_token_prefix: str = ""  # possible prefix of <use_mcp_tool>
        self._tool_end_token_prefix: str = ""  # possible prefix of </use_mcp_tool>
        self._tool_block_buffer: str = (
            ""  # accumulates between <use_mcp_tool> and </use_mcp_tool>
        )
        # One generated call id per tool call emitted in the current stream;
        # its length is used as the index of the next emitted tool call.
        self._stream_tool_call_ids: list[str] = []

        # Token definitions
        self.tool_call_start_token: str = "<use_mcp_tool>"
        self.tool_call_end_token: str = "</use_mcp_tool>"

        # Regex patterns
        # Strict pattern for non-streaming extraction: server name, tool name
        # and arguments are all required (groups 1-3).
        self.tool_call_regex = re.compile(
            r"<use_mcp_tool>\s*"
            r"<server_name>(.*?)</server_name>\s*"
            r"<tool_name>(.*?)</tool_name>\s*"
            r"<arguments>\s*(.*?)\s*</arguments>\s*"
            r"</use_mcp_tool>",
            re.DOTALL,
        )

        # For streaming partial tool calls
        # IMPORTANT: Use GREEDY matching (.*) for arguments to capture all content
        # in streaming mode. We'll clean up </arguments> tag in the code if present.
        # The outer ()? makes the whole <arguments> section optional
        # The inner (.*) will match empty string if <arguments> exists but has no content yet
        # (This pattern is not referenced by any method of this class.)
        self.partial_tool_regex = re.compile(
            r"<use_mcp_tool>\s*"
            r"(?:<server_name>(.*?)</server_name>\s*)?"
            r"(?:<tool_name>(.*?)</tool_name>\s*)?"
            r"(?:<arguments>(\s*.*))?",  # Move \s* inside capture group so empty match returns ""
            re.DOTALL,
        )

        # For correctness-first parsing on COMPLETE tool blocks only.
        # Lenient variant used by the streaming path: every inner section is
        # optional, and the closing </arguments> tag may be missing.
        self._complete_tool_block_regex = re.compile(
            r"<use_mcp_tool>\s*"
            r"(?:<server_name>(.*?)</server_name>\s*)?"
            r"(?:<tool_name>(.*?)</tool_name>\s*)?"
            r"(?:<arguments>\s*(.*?)\s*(?:</arguments>\s*)?)?"
            r"</use_mcp_tool>",
            re.DOTALL,
        )

    def _resolve_tool_name(
        self, server_name: str, tool_name: str, request: ChatCompletionRequest
    ) -> str:
        """
        Map the model's (server_name, tool_name) pair onto a tool name that is
        declared in the request.

        ``tool_name`` is returned unchanged when the server is empty or
        'default', when the request declares no tools, or when no declared
        tool can be singled out. Successful resolutions are cached per
        (server_name, tool_name).
        """
        if not server_name or server_name == "default":
            return tool_name
        if not request or not request.tools:
            return tool_name

        cache_key = (server_name, tool_name)
        cached = self._resolved_tool_name_cache.get(cache_key)
        if cached:
            return cached

        # Declared function names that contain the requested tool name.
        declared_names = (
            tool.function.name
            for tool in request.tools
            if hasattr(tool, "function") and hasattr(tool.function, "name")
        )
        candidates = [name for name in declared_names if tool_name in name]

        if len(candidates) == 1:
            # A single candidate is accepted without looking at the server.
            resolved = candidates[0]
        else:
            # Otherwise take the first candidate that also contains the
            # server name; with none, fall back to the bare tool name.
            resolved = next(
                (name for name in candidates if server_name in name), None
            )
            if resolved is None:
                return tool_name
            logger.debug(
                "Resolved tool %s -> %s (server: %s)",
                tool_name,
                resolved,
                server_name,
            )

        self._resolved_tool_name_cache[cache_key] = resolved
        return resolved

    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        """Apply the base adjustments, then keep special tokens when tools are in use."""
        adjusted = super().adjust_request(request)
        tools_in_use = adjusted.tools and adjusted.tool_choice != "none"
        if tools_in_use:
            # Special tokens must not be skipped for proper tool parsing.
            adjusted.skip_special_tokens = False
        return adjusted

    def _ensure_tool_id_valid(self, tool_id: int) -> bool:
        """
        Make ``tool_id`` a usable index into the per-tool tracking lists.

        Returns False without touching anything for a negative id. Otherwise
        both lists are padded (with "" and {} respectively) up to length
        ``tool_id + 1`` and True is returned.
        """
        if tool_id < 0:
            return False

        required_len = tool_id + 1

        args_missing = required_len - len(self.streamed_args_for_tool)
        if args_missing > 0:
            self.streamed_args_for_tool.extend([""] * args_missing)

        calls_missing = required_len - len(self.prev_tool_call_arr)
        if calls_missing > 0:
            # A fresh dict per slot, so entries are never shared.
            self.prev_tool_call_arr.extend({} for _ in range(calls_missing))

        return True

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """
        Extract every complete tool call from a finished model output.

        The whole output is returned as plain content (no tool calls) when
        tools are disabled, when no start tag is present, when nothing
        matches the strict pattern, or when the arguments of any matched
        call cannot be parsed even after JSON repair. Otherwise the text
        before the first start tag becomes the content and all parsed calls
        are returned.
        """

        def _plain_text() -> ExtractedToolCallInformation:
            # Nothing is truncated: the caller gets the full output back.
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

        def _make_call(name: str, parsed_args) -> ToolCall:
            # Arguments are re-serialised so the emitted JSON is normalised.
            return ToolCall(
                type="function",
                function=FunctionCall(
                    name=name,
                    arguments=json.dumps(parsed_args, ensure_ascii=False),
                ),
            )

        # Sanity check; avoid unnecessary processing
        if logger.isEnabledFor(10):  # DEBUG
            logger.debug("model_output len=%s", len(model_output))
        if (
            self.tool_call_start_token not in model_output
            or request.tool_choice == "none"
            or not request.tools
        ):
            return _plain_text()

        try:
            parsed_calls = []
            parse_failed = False

            for found in self.tool_call_regex.finditer(model_output):
                server_name, raw_tool_name, arguments_str = (
                    part.strip() for part in found.groups()
                )
                tool_name = self._resolve_tool_name(
                    server_name, raw_tool_name, request
                )

                try:
                    parsed_calls.append(
                        _make_call(tool_name, json.loads(arguments_str))
                    )
                except json.JSONDecodeError:
                    # Strict parsing failed; try once more on repaired JSON.
                    try:
                        repaired = json_repair.repair_json(arguments_str)
                        if not repaired:
                            parse_failed = True
                            logger.warning(
                                "Failed to repair tool arguments JSON: %s",
                                arguments_str,
                            )
                            continue
                        parsed_calls.append(
                            _make_call(tool_name, json.loads(repaired))
                        )
                    except Exception:
                        parse_failed = True
                        logger.warning(
                            "Failed to parse tool arguments after repair: %s",
                            arguments_str,
                        )

            # Any failure, or no successfully parsed call at all (which also
            # covers "nothing matched"), means the output stays untouched.
            if parse_failed or not parsed_calls:
                return _plain_text()

            # Content is whatever precedes the first tool call.
            leading_text = model_output[
                : model_output.find(self.tool_call_start_token)
            ]
            return ExtractedToolCallInformation(
                tools_called=True,
                tool_calls=parsed_calls,
                content=leading_text if leading_text else None,
            )

        except Exception:
            logger.exception("Error in extracting tool call from response.")
            return _plain_text()

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """
        Incrementally split streamed output into plain text and tool calls.

        The method is a two-state machine driven by ``delta_text``:

        * "text" mode forwards text, but holds back a trailing fragment that
          could be the beginning of ``<use_mcp_tool>`` until the next delta
          decides whether it really is one.
        * "tool" mode buffers silently until ``</use_mcp_tool>`` arrives. The
          complete block is then parsed once and emitted as a single tool
          call carrying its full name and arguments. A block that cannot be
          parsed (no match, no tool name, unrepairable arguments, or an
          exception) is emitted as plain text instead.

        Only ``previous_text``, ``delta_text`` and ``request`` are read; the
        remaining parameters are unused here.

        NOTE(review): held-back fragments and an unterminated tool block are
        only released by a later delta; nothing in this method flushes them
        when the stream ends — confirm whether the caller handles that.

        Returns:
            A DeltaMessage with the text and/or tool calls produced by this
            delta, or None when there is nothing to emit yet.
        """
        # Reset state if this is the start of a new request
        if not previous_text:
            self.current_tool_name_sent = False
            self.prev_tool_call_arr = []
            self.current_tool_id = -1
            self.streamed_args_for_tool = []
            self.buffer = ""
            self._resolved_tool_name_cache = {}

            self._stream_mode = "text"
            self._text_token_prefix = ""
            self._tool_end_token_prefix = ""
            self._tool_block_buffer = ""
            self._stream_tool_call_ids = []

        # If tools are disabled for this request, do not suppress tags or parse tool calls.
        # Flush any internal buffers as plain text so we never drop output.
        if request.tool_choice == "none" or not request.tools:
            out = ""
            if self.buffer:
                out += self.buffer
                self.buffer = ""
            if self._text_token_prefix:
                out += self._text_token_prefix
                self._text_token_prefix = ""
            if self._tool_block_buffer:
                # The start tag was consumed when tool mode was entered, so it
                # is put back in front of the buffered block content.
                out += self.tool_call_start_token + self._tool_block_buffer
                self._tool_block_buffer = ""
            if self._tool_end_token_prefix:
                out += self._tool_end_token_prefix
                self._tool_end_token_prefix = ""
            out += delta_text
            return DeltaMessage(content=out) if out else None

        def _longest_token_prefix_at_end(s: str, token: str) -> str:
            # Longest suffix of `s` that is a proper prefix of `token`
            # (never the whole token), or "" when there is none.
            max_len = min(len(token) - 1, len(s))
            for i in range(max_len, 0, -1):
                if token.startswith(s[-i:]):
                    return s[-i:]
            return ""

        emitted_text_parts: list[str] = []
        emitted_tool_calls: list[DeltaToolCall] = []

        chunk = delta_text

        # One delta may contain several mode switches, so keep consuming
        # `chunk` until it is exhausted or the rest has to wait for more input.
        while chunk:
            if self._stream_mode == "text":
                # Re-attach the fragment held back from the previous delta.
                if self._text_token_prefix:
                    chunk = self._text_token_prefix + chunk
                    self._text_token_prefix = ""

                start_idx = chunk.find(self.tool_call_start_token)
                if start_idx < 0:
                    # No complete start tag: emit everything except a tail
                    # that might still grow into one.
                    prefix = _longest_token_prefix_at_end(
                        chunk, self.tool_call_start_token
                    )
                    if prefix:
                        safe = chunk[: -len(prefix)]
                        if safe:
                            emitted_text_parts.append(safe)
                        self._text_token_prefix = prefix
                    else:
                        emitted_text_parts.append(chunk)
                    break

                # Start tag found: text before it is emitted, the tag itself is
                # consumed, and the remainder is handled in tool mode.
                before = chunk[:start_idx]
                if before:
                    emitted_text_parts.append(before)
                chunk = chunk[start_idx + len(self.tool_call_start_token) :]
                self._stream_mode = "tool"
                self._tool_block_buffer = ""
                self._tool_end_token_prefix = ""
                continue

            # tool mode
            # Re-attach a possible partial end tag held back earlier.
            if self._tool_end_token_prefix:
                chunk = self._tool_end_token_prefix + chunk
                self._tool_end_token_prefix = ""

            end_idx = chunk.find(self.tool_call_end_token)
            if end_idx < 0:
                # Block still open: buffer the content, keeping a possible
                # partial end tag separate so it can be completed later.
                prefix = _longest_token_prefix_at_end(chunk, self.tool_call_end_token)
                if prefix:
                    self._tool_block_buffer += chunk[: -len(prefix)]
                    self._tool_end_token_prefix = prefix
                else:
                    self._tool_block_buffer += chunk
                break

            # Complete tool block
            self._tool_block_buffer += chunk[:end_idx]
            tool_block = (
                self.tool_call_start_token
                + self._tool_block_buffer
                + self.tool_call_end_token
            )
            remainder = chunk[end_idx + len(self.tool_call_end_token) :]

            # Reset tool buffers before parsing
            self._stream_mode = "text"
            self._tool_block_buffer = ""
            self._tool_end_token_prefix = ""

            try:
                m = self._complete_tool_block_regex.search(tool_block)
                if not m:
                    # Not a recognisable tool call: pass the block through as text.
                    emitted_text_parts.append(tool_block)
                    chunk = remainder
                    continue

                server_name = (m.group(1) or "").strip()
                tool_name = (m.group(2) or "").strip()
                arguments_str = (m.group(3) or "").strip()

                if not tool_name:
                    # A call without a tool name cannot be dispatched.
                    emitted_text_parts.append(tool_block)
                    chunk = remainder
                    continue

                resolved_name = (
                    self._resolve_tool_name(server_name, tool_name, request)
                    if server_name
                    else tool_name
                )

                # Finalize arguments strictly at end of the block
                if not arguments_str:
                    arguments_json_str = "{}"
                else:
                    try:
                        arguments_obj = json.loads(arguments_str)
                    except Exception:
                        # Strict parsing failed; try once more on repaired
                        # JSON. A failure of the second json.loads is handled
                        # by the outer except below.
                        repaired = json_repair.repair_json(arguments_str)
                        if not repaired:
                            emitted_text_parts.append(tool_block)
                            chunk = remainder
                            continue
                        arguments_obj = json.loads(repaired)
                    arguments_json_str = json.dumps(arguments_obj, ensure_ascii=False)

                # The index counts tool calls emitted so far in this stream.
                tool_index = len(self._stream_tool_call_ids)
                tool_call_id = make_tool_call_id()
                self._stream_tool_call_ids.append(tool_call_id)

                emitted_tool_calls.append(
                    DeltaToolCall(
                        index=tool_index,
                        type="function",
                        id=tool_call_id,
                        function=DeltaFunctionCall(
                            name=resolved_name,
                            arguments=arguments_json_str,
                        ).model_dump(exclude_none=True),
                    )
                )

            except Exception:
                logger.exception(
                    "Error parsing complete tool block in streaming; falling back to plain text."
                )
                emitted_text_parts.append(tool_block)

            # Continue with whatever followed the end tag (back in text mode).
            chunk = remainder

        # Normalise "no text" to None so an empty delta is not sent.
        emitted_text = "".join(emitted_text_parts) if emitted_text_parts else None
        if emitted_text is not None and emitted_text == "":
            emitted_text = None
        if emitted_text is None and not emitted_tool_calls:
            return None

        # vLLM's DeltaMessage.tool_calls is validated as a list; do not pass None explicitly.
        if emitted_tool_calls:
            return DeltaMessage(content=emitted_text, tool_calls=emitted_tool_calls)
        return DeltaMessage(content=emitted_text)


# Register the tool parser with ToolParserManager under the name "mirothinker".
# NOTE(review): the second positional argument is presumably the `force`
# flag — confirm against the vLLM ToolParserManager.register_module signature.
ToolParserManager.register_module("mirothinker", True, MirothinkerToolParser)


================================================
FILE: apps/lobehub-compatibility/README.md
================================================
# LobeChat Integration Guide

This guide describes how to integrate the MiroThinker model with [LobeChat](https://github.com/lobehub/lobe-chat), an open-source, modern LLM UI framework supporting tool usage (function calling).

## Before You Start

MiroThinker is a reasoning model. When generating responses, it first outputs its reasoning process inside `<think>...</think>` tags, then provides the final answer. For agentic tasks (multi-step tool use), the model performs better when it can see its previous reasoning in the conversation history.

However, LobeChat does not preserve reasoning content in conversation history. When sending messages back to the API, LobeChat strips the `<think>...</think>` content from previous assistant messages. This means the model cannot see its prior reasoning steps.

- For general chat: This works fine.
- For agentic workflows: Performance may be degraded since the model cannot reference its previous reasoning.

If you need full reasoning preservation for agentic use cases, consider modifying LobeChat's source code to return `reasoning_content` in conversation history.

## 1. Start the Inference Service

First, launch the MiroThinker model using vLLM with the OpenAI-compatible API adapter. We use vLLM because it supports loading custom tool parsers from external Python files, while SGLang does not. Ensure you include the tool parser plugin.

```bash
# Configuration
PORT=61002
MODEL_PATH=miromind-ai/MiroThinker-v1.5-30B

# Start vLLM server
vllm serve $MODEL_PATH \
    --served-model-name mirothinker \
    --port $PORT \
    --trust-remote-code \
    --chat-template chat_template.jinja \
    --tool-parser-plugin MiroThinkerToolParser.py \
    --tool-call-parser mirothinker \
    --enable-auto-tool-choice
```

## 2. Configure LobeChat

You can use either the self-hosted version or the [web application](https://lobechat.com/chat).

### Step 1: Access Settings

Navigate to **Settings** -> **AI Service Provider** to add a custom AI service provider.

![Settings Navigation](img/settings.png)

### Step 2: Add Custom AI Provider

Click the `+` button to add a new provider and configure it as follows:

![Add AI Provider](img/AI-provider.png)

| Field | Value | Description |
| :--- | :--- | :--- |
| **Provider ID** | `miromind` | Or any identifier you prefer. |
| **Request Format** | `OPENAI` |  |
| **API Key** | `your-api-key` | Use any string if auth is disabled. |
| **API Proxy Address** | `http://localhost:61002/v1` | Replace with your actual service address. |

### Step 3: Configure the Model

After adding the provider, add the model you deployed to the provider's model list:

1. Add a new model with the ID `mirothinker` (must match `--served-model-name`).
1. **Crucial**: Enable the **Function Calling** capability toggle.
1. Click "Check" to verify connectivity.

![Model Configuration](img/model.png)

## 3. Usage Demo

Once configured, you can use MiroThinker in LobeChat with full tool-calling capabilities.

![Presentation Demo](img/presentation.gif)


================================================
FILE: apps/lobehub-compatibility/chat_template.jinja
================================================
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "In this environment you have access to a set of tools you can use to answer the user's question.\n\nYou only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use.\n\nToday is: " + strftime_now('%Y-%m-%d') + ". For time-dependent questions, answer based on the world as it would reasonably be today.\n\n# Tool-Use Formatting Instructions\n\nTool-use is formatted using XML-style tags. The tool-use is enclosed in <use_mcp_tool></use_mcp_tool> and each parameter is similarly enclosed within its own set of tags.\n\nThe Model Context Protocol (MCP) connects to servers that provide additional tools and resources to extend your capabilities. You can use the server's tools via the `use_mcp_tool`.\n\nDescription:\nRequest to use a tool provided by a MCP server. Each MCP server can provide multiple tools with different capabilities. 
Tools have defined input schemas that specify required and optional parameters.\n\nParameters:\n- server_name: (required) The name of the MCP server providing the tool\n- tool_name: (required) The name of the tool to execute\n- arguments: (required) A JSON object containing the tool's input parameters, following the tool's input schema, quotes within string must be properly escaped, ensure it's valid JSON\n\nUsage:\n<use_mcp_tool>\n<server_name>server name here</server_name>\n<tool_name>tool name here</tool_name>\n<arguments>\n{\n  \"param1\": \"value1\",\n  \"param2\": \"value2 \\\"escaped string\\\"\"\n}\n</arguments>\n</use_mcp_tool>\n\nImportant Notes:\n- Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags.\n- Always adhere to this format for the tool use to ensure proper parsing and execution.\n\nString and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. The output is not expected to be valid XML and is parsed with regular expressions.\nHere are the functions available in JSONSchema format:\n\n## Server name: default" }}
    {#- Emit one "### Tool name" section per tool. A tool may be the OpenAI wrapper (with a "function" key) or a bare function object. #}
    {%- for tool in tools %}
        {%- set func = tool.function if tool.function is defined else tool %}
        {#- "parameters" is optional: fall back to an empty object schema so tojson never receives an undefined or null value. #}
        {%- set schema = func.parameters if (func.parameters is defined and func.parameters is not none) else {"type": "object", "properties": {}} %}
        {{- "\n### Tool name: " + func.name + "\n" }}
        {{- "Description:\n" }}
        {%- set desc = func.description if func.description else '' %}
        {#- Keep descriptions indented by four spaces without double-indenting ones that already are. #}
        {%- if desc[:4] == '    ' %}
            {{- desc }}
        {%- else %}
            {{- "    " + desc }}
        {%- endif %}
        {#- Synthesize an "Args:" list from the schema only when the description does not already contain one. #}
        {%- if "Args:" not in desc and func.parameters is defined and func.parameters.properties is defined %}
            {{- "\n\n    Args:" }}
            {%- for prop_name, prop_value in func.parameters.properties.items() %}
                {%- if prop_value.description is defined %}
                    {{- "\n        " + prop_name + ": " + prop_value.description }}
                {%- else %}
                    {{- "\n        " + prop_name + ": " + (prop_value.type if prop_value.type is defined else "any") }}
                {%- endif %}
            {%- endfor %}
        {%- endif %}
        {{- "\n\nInput JSON schema: " + (schema | tojson) + "\n" }}
    {%- endfor %}
    {{- "\n# General Objective\n\nYou accomplish a given task iteratively, breaking it down into clear steps and working through them methodically.<|im_end|>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{#- Render the conversation turn by turn in ChatML form. #}
{%- for message in messages %}
    {#- User turns and any system message that is not the first message are emitted verbatim; a leading system message matches no branch here because it was already rendered in the header. #}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {%- set content = message.content if message.content is not none else '' %}
        {%- set reasoning_content = '' %}
        {#- Prefer the explicit reasoning_content field; otherwise recover the reasoning from an inline think block inside content. #}
        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
            {%- set reasoning_content = message.reasoning_content %}
        {%- else %}
            {%- if '</think>' in content %}
                {#- Reasoning is the text before the first closing tag and after the last opening tag; the visible answer is whatever follows the last closing tag. #}
                {%- set reasoning_content = (content.split('</think>')[0]).rstrip('\n') %}
                {%- set reasoning_content = (reasoning_content.split('<think>')[-1]).lstrip('\n') %}
                {%- set content = (content.split('</think>')[-1]).lstrip('\n') %}
            {%- endif %}
        {%- endif %}
        {#- A think block is always written for assistant turns, even when the reasoning is empty. #}
        {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
        {%- if message.tool_calls %}
            {%- for tool_call in message.tool_calls %}
                {#- Separate each tool call from preceding text, or from the previous tool call, with a blank line. #}
                {%- if (loop.first and content) or (not loop.first) %}
                    {{- '\n\n' }}
                {%- endif %}
                {#- Unwrap the OpenAI-style wrapper so name and arguments can be read directly. #}
                {%- if tool_call.function %}
                    {%- set tool_call = tool_call.function %}
                {%- endif %}
                {#- The server name is always written as "default". #}
                {{- '<use_mcp_tool>\n<server_name>default</server_name>\n<tool_name>' }}
                {{- tool_call.name }}
                {{- '</tool_name>\n<arguments>\n' }}
                {#- String arguments are passed through unchanged; anything else is serialized as JSON. #}
                {%- if tool_call.arguments is string %}
                    {{- tool_call.arguments }}
                {%- else %}
                    {{- tool_call.arguments | tojson }}
                {%- endif %}
                {{- '\n</arguments>\n</use_mcp_tool>' }}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {#- Tool results are presented as a user turn; consecutive tool messages share one turn and are separated by a blank line. #}
        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user\n' }}
        {%- else %}
            {{- '\n\n' }}
        {%- endif %}
        {{- message.content }}
        {#- Close the shared user turn only after the last tool message of the run. #}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{#- Open the assistant turn for generation. When enable_thinking is explicitly false, pre-fill an empty think block. #}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
    {%- if enable_thinking is defined and enable_thinking is false %}
        {{- '<think>\n\n</think>\n\n' }}
    {%- endif %}
{%- endif %}

================================================
FILE: apps/lobehub-compatibility/requirements.txt
================================================
vllm>=0.11.0
json-repair
regex

================================================
FILE: apps/lobehub-compatibility/test_tool_parser.py
================================================
#!/usr/bin/env python3
"""
Test MiroThinkerToolParser for correctness.
"""

import json
import sys
from types import SimpleNamespace
from unittest.mock import MagicMock

import regex as re

# Mock vLLM imports for testing without vLLM installed
# Create mock modules
# NOTE(review): no test in this file imports the parser module itself (each
# test recompiles its own copy of the regex), so these stubs only matter if
# such an import is added later — confirm whether they are still needed.
mock_vllm = MagicMock()
mock_vllm.entrypoints = MagicMock()
mock_vllm.entrypoints.chat_utils = MagicMock()
# Fixed id so any tool-call id generated through the stub is predictable.
mock_vllm.entrypoints.chat_utils.make_tool_call_id = lambda: "call_test_123"

# Stand-in for `vllm.entrypoints.openai.protocol`: every protocol class is
# replaced by MagicMock, so constructing one accepts arbitrary arguments.
mock_protocol = SimpleNamespace(
    ChatCompletionRequest=MagicMock,
    DeltaFunctionCall=MagicMock,
    DeltaMessage=MagicMock,
    DeltaToolCall=MagicMock,
    ExtractedToolCallInformation=MagicMock,
    FunctionCall=MagicMock,
    ToolCall=MagicMock,
)

# Stand-in for the abstract tool parser module: `ToolParser` becomes plain
# `object` and the manager is a MagicMock instance.
mock_tool_parser = SimpleNamespace(
    ToolParser=object,
    ToolParserManager=MagicMock(),
)

# Stand-in for `vllm.logger`: `init_logger` returns a mock logger whose
# `isEnabledFor` always reports False.
mock_logger = SimpleNamespace(
    init_logger=lambda x: MagicMock(isEnabledFor=lambda _: False),
)

# Install the stubs under the dotted module paths listed below so that
# `import` statements for those paths resolve to them instead of real vLLM.
sys.modules["vllm"] = mock_vllm
sys.modules["vllm.entrypoints"] = mock_vllm.entrypoints
sys.modules["vllm.entrypoints.chat_utils"] = mock_vllm.entrypoints.chat_utils
sys.modules["vllm.entrypoints.openai"] = MagicMock()
sys.modules["vllm.entrypoints.openai.protocol"] = mock_protocol
sys.modules["vllm.entrypoints.openai.tool_parsers"] = MagicMock()
sys.modules["vllm.entrypoints.openai.tool_parsers.abstract_tool_parser"] = (
    mock_tool_parser
)
sys.modules["vllm.logger"] = mock_logger


def test_tool_call_regex():
    """Exercise the strict ``<use_mcp_tool>`` pattern on complete blocks.

    The pattern is compiled locally in this function. Six scenarios are
    checked in order: a basic call, a call preceded by prose, two calls in
    one text, nested JSON arguments, an empty argument object and a block
    written without any whitespace. A confirmation line is printed after
    each scenario.
    """
    pattern = re.compile(
        r"<use_mcp_tool>\s*"
        r"<server_name>(.*?)</server_name>\s*"
        r"<tool_name>(.*?)</tool_name>\s*"
        r"<arguments>\s*(.*?)\s*</arguments>\s*"
        r"</use_mcp_tool>",
        re.DOTALL,
    )

    # Scenario 1: a single, well-formed tool call.
    basic_call = """<use_mcp_tool>
<server_name>my_mcp_server</server_name>
<tool_name>web_search</tool_name>
<arguments>
{"query": "AI news"}
</arguments>
</use_mcp_tool>"""

    found = pattern.search(basic_call)
    assert found is not None, "Should match basic tool call"
    server, tool, raw_args = (part.strip() for part in found.groups())
    assert server == "my_mcp_server"
    assert tool == "web_search"
    assert json.loads(raw_args) == {"query": "AI news"}
    print("✅ Test 1: Basic tool call - PASSED")

    # Scenario 2: free text ahead of the tool call must not prevent a match.
    call_after_prose = """Let me search for that.

<use_mcp_tool>
<server_name>my_mcp_server</server_name>
<tool_name>search</tool_name>
<arguments>
{"q": "test"}
</arguments>
</use_mcp_tool>"""

    found = pattern.search(call_after_prose)
    assert found is not None, "Should match tool call with content before"
    print("✅ Test 2: Tool call with content before - PASSED")

    # Scenario 3: two consecutive calls are found as two separate matches.
    two_calls = """<use_mcp_tool>
<server_name>server1</server_name>
<tool_name>tool1</tool_name>
<arguments>{"a": 1}</arguments>
</use_mcp_tool>

<use_mcp_tool>
<server_name>server2</server_name>
<tool_name>tool2</tool_name>
<arguments>{"b": 2}</arguments>
</use_mcp_tool>"""

    all_found = list(pattern.finditer(two_calls))
    assert len(all_found) == 2, f"Should find 2 tool calls, found {len(all_found)}"
    first_call, second_call = all_found
    assert first_call.group(2).strip() == "tool1"
    assert second_call.group(2).strip() == "tool2"
    print("✅ Test 3: Multiple tool calls - PASSED")

    # Scenario 4: multi-line JSON with nested containers.
    nested_json_call = """<use_mcp_tool>
<server_name>my_mcp_server</server_name>
<tool_name>complex_tool</tool_name>
<arguments>
{
  "query": "test with quotes and apostrophes",
  "options": {"nested": true},
  "list": [1, 2, 3]
}
</arguments>
</use_mcp_tool>"""

    found = pattern.search(nested_json_call)
    assert found is not None, "Should match complex JSON"
    parsed = json.loads(found.group(3).strip())
    assert parsed["query"] == "test with quotes and apostrophes"
    assert parsed["options"]["nested"] is True
    print("✅ Test 4: Complex JSON arguments - PASSED")

    # Scenario 5: an empty JSON object as arguments.
    empty_args_call = """<use_mcp_tool>
<server_name>my_mcp_server</server_name>
<tool_name>no_args_tool</tool_name>
<arguments>
{}
</arguments>
</use_mcp_tool>"""

    found = pattern.search(empty_args_call)
    assert found is not None, "Should match empty arguments"
    assert json.loads(found.group(3).strip()) == {}
    print("✅ Test 5: Empty arguments - PASSED")

    # Scenario 6: everything on a single line with no separators.
    compact_call = "<use_mcp_tool><server_name>s</server_name><tool_name>t</tool_name><arguments>{}</arguments></use_mcp_tool>"
    found = pattern.search(compact_call)
    assert found is not None, "Should match minimal whitespace"
    print("✅ Test 6: Minimal whitespace - PASSED")


def test_partial_tool_regex():
    """Check the lenient pattern meant for incomplete (streamed) tool blocks.

    Every inner tag is optional in this pattern, so it must match a bare
    opening tag, a block that stops after ``<server_name>`` and a block whose
    JSON arguments are cut off mid-string.
    """
    pattern = re.compile(
        r"<use_mcp_tool>\s*"
        r"(?:<server_name>(.*?)</server_name>\s*)?"
        r"(?:<tool_name>(.*?)</tool_name>\s*)?"
        r"(?:<arguments>(\s*.*))?",
        re.DOTALL,
    )

    # Only the opening tag has arrived so far.
    opening_only = "<use_mcp_tool>\n"
    found = pattern.search(opening_only)
    assert found is not None
    print("✅ Partial test 1: Only opening tag - PASSED")

    # The server name is complete but the tool name has not been seen yet.
    server_only = "<use_mcp_tool>\n<server_name>my_server</server_name>\n"
    found = pattern.search(server_only)
    assert found is not None
    assert found.group(1).strip() == "my_server"
    assert found.group(2) is None
    print("✅ Partial test 2: Server name only - PASSED")

    # The arguments section is open and its JSON is truncated.
    truncated_args = """<use_mcp_tool>
<server_name>my_server</server_name>
<tool_name>my_tool</tool_name>
<arguments>
{"query": "incomp"""

    found = pattern.search(truncated_args)
    assert found is not None
    server, tool, args_so_far = found.groups()
    assert server.strip() == "my_server"
    assert tool.strip() == "my_tool"
    assert '{"query": "incomp' in args_so_far
    print("✅ Partial test 3: Incomplete arguments - PASSED")


def test_complete_tool_block_regex():
    """Check the pattern that recognizes a closed tool block while streaming.

    The block must end with ``</use_mcp_tool>``, but the inner tags are
    optional; a block without an ``<arguments>`` section therefore still
    matches and leaves the third capture group unset.
    """
    pattern = re.compile(
        r"<use_mcp_tool>\s*"
        r"(?:<server_name>(.*?)</server_name>\s*)?"
        r"(?:<tool_name>(.*?)</tool_name>\s*)?"
        r"(?:<arguments>\s*(.*?)\s*(?:</arguments>\s*)?)?"
        r"</use_mcp_tool>",
        re.DOTALL,
    )

    # A block that contains all three inner sections.
    full_block = """<use_mcp_tool>
<server_name>my_mcp_server</server_name>
<tool_name>search</tool_name>
<arguments>
{"q": "test"}
</arguments>
</use_mcp_tool>"""

    found = pattern.search(full_block)
    assert found is not None
    server, tool, raw_args = found.groups()
    assert server.strip() == "my_mcp_server"
    assert tool.strip() == "search"
    assert json.loads(raw_args.strip()) == {"q": "test"}
    print("✅ Complete block test 1: Full block - PASSED")

    # A block that omits the arguments section entirely.
    block_without_args = """<use_mcp_tool>
<server_name>my_mcp_server</server_name>
<tool_name>simple_tool</tool_name>
</use_mcp_tool>"""

    found = pattern.search(block_without_args)
    assert found is not None
    assert found.group(2).strip() == "simple_tool"
    assert found.group(3) is None
    print("✅ Complete block test 2: Without arguments - PASSED")


def test_edge_cases():
    """Run the strict tool-call pattern against awkward argument payloads.

    Covers non-ASCII text, JSON-escaped newlines and angle-bracket markup
    inside a JSON string; in each case the captured arguments must still
    decode with ``json.loads``.
    """
    pattern = re.compile(
        r"<use_mcp_tool>\s*"
        r"<server_name>(.*?)</server_name>\s*"
        r"<tool_name>(.*?)</tool_name>\s*"
        r"<arguments>\s*(.*?)\s*</arguments>\s*"
        r"</use_mcp_tool>",
        re.DOTALL,
    )

    # CJK characters and an emoji inside the query string.
    unicode_call = """<use_mcp_tool>
<server_name>my_mcp_server</server_name>
<tool_name>search</tool_name>
<arguments>
{"query": "你好世界 🎉"}
</arguments>
</use_mcp_tool>"""

    found = pattern.search(unicode_call)
    assert found is not None
    decoded = json.loads(found.group(3).strip())
    assert decoded["query"] == "你好世界 🎉"
    print("✅ Edge case 1: Unicode in arguments - PASSED")

    # The payload carries JSON "\n" escapes, which decode to real newlines.
    escaped_newline_call = """<use_mcp_tool>
<server_name>my_mcp_server</server_name>
<tool_name>search</tool_name>
<arguments>
{
  "query": "line1\\nline2\\nline3"
}
</arguments>
</use_mcp_tool>"""

    found = pattern.search(escaped_newline_call)
    assert found is not None
    decoded = json.loads(found.group(3).strip())
    assert "line1\nline2" in decoded["query"]
    print("✅ Edge case 2: Newlines in JSON - PASSED")

    # HTML-looking tags inside the arguments must not confuse the pattern.
    html_payload_call = """<use_mcp_tool>
<server_name>my_mcp_server</server_name>
<tool_name>search</tool_name>
<arguments>
{"query": "<html><body>test</body></html>"}
</arguments>
</use_mcp_tool>"""

    found = pattern.search(html_payload_call)
    assert found is not None
    decoded = json.loads(found.group(3).strip())
    assert "<html>" in decoded["query"]
    print("✅ Edge case 3: HTML tags in arguments - PASSED")


def check_unused_code():
    """Print a hand-maintained list of clean-up notes about the parser.

    Nothing is inspected at runtime: the findings and recommendations are
    static text and must be kept in sync with the parser by hand.

    The earlier finding about a server-name mismatch (and its matching
    recommendation) has been removed. It claimed that chat_template.jinja
    uses 'my_mcp_server', but the template writes ``default`` as the server
    name, which is the value that finding said ``_resolve_tool_name`` checks.
    """
    rule = "=" * 60

    print("\n" + rule)
    print("CODE ANALYSIS - Potential Issues")
    print(rule)

    unused_vars = [
        "self.current_tool_name_sent",
        "self.prev_tool_call_arr",
        "self.current_tool_id",
        "self.streamed_args_for_tool",
        "self.buffer",
    ]
    issues = [
        # Issue 1: Unused variables
        f"⚠️  Unused instance variables (defined but never used in main logic):\n   {', '.join(unused_vars)}",
        # Issue 2: Unused method
        "⚠️  `_ensure_tool_id_valid` method is defined but never called",
        # Issue 3: Unused regex
        "⚠️  `partial_tool_regex` is defined but never used",
    ]

    for issue in issues:
        print(f"\n{issue}")

    print("\n" + rule)
    print("RECOMMENDATIONS")
    print(rule)
    print("""
1. Remove unused variables and methods to clean up the code
2. Either use `partial_tool_regex` or remove it
3. The streaming implementation looks correct with the state machine approach
4. The main `extract_tool_calls` and `extract_tool_calls_streaming` logic appears sound
""")


def main():
    """Run every regex test group in order, then print the static notes.

    Any failing assertion propagates out of this function, so the closing
    success banner is printed only when all groups pass.
    """
    banner = "=" * 60

    print(banner)
    print("MiroThinkerToolParser Test Suite")
    print(banner)

    # Each entry pairs the heading printed before a group with its runner.
    groups = (
        ("\n--- Testing Main Tool Call Regex ---", test_tool_call_regex),
        ("\n--- Testing Partial Tool Regex ---", test_partial_tool_regex),
        ("\n--- Testing Complete Tool Block Regex ---", test_complete_tool_block_regex),
        ("\n--- Testing Edge Cases ---", test_edge_cases),
    )
    for heading, run_group in groups:
        print(heading)
        run_group()

    check_unused_code()

    print("\n" + banner)
    print("ALL REGEX TESTS PASSED ✅")
    print(banner)

if __name__ == "__main__":
    main()


================================================
FILE: apps/lobehub-compatibility/unit_test.py
================================================
#!/usr/bin/env python3
"""
Unit tests for MiroThinker chat template.

Run with: pytest unit_test.py -v
"""

from datetime import datetime
from pathlib import Path

import pytest
from jinja2 import BaseLoader, Environment

# ============================================================================
# Fixtures
# ============================================================================


def strftime_now(format_str: str) -> str:
    """Format the current local time with ``format_str``.

    Test double for the ``strftime_now`` helper that the chat template calls.
    """
    current = datetime.now()
    return current.strftime(format_str)


@pytest.fixture
def template():
    """Compile ``chat_template.jinja`` (located next to this file).

    ``strftime_now`` is registered as an environment global because the
    template calls it to render today's date.

    Returns:
        The compiled Jinja2 template, ready for ``render(...)``.
    """
    template_path = Path(__file__).parent / "chat_template.jinja"
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    template_str = template_path.read_text(encoding="utf-8")

    env = Environment(loader=BaseLoader())
    env.globals["strftime_now"] = strftime_now
    return env.from_string(template_str)


@pytest.fixture
def today_date():
    """Provide the current local date formatted as YYYY-MM-DD."""
    return f"{datetime.now():%Y-%m-%d}"


# ============================================================================
# Test: Basic Message Formatting
# ============================================================================


class TestBasicMessageFormatting:
    """Tests for basic message formatting without tools.

    Every test renders the ``template`` fixture with a small message list
    and inspects the resulting ChatML text.
    """

    def test_user_message_format(self, template):
        """User message should be wrapped in <|im_start|>user ... <|im_end|>."""
        messages = [{"role": "user", "content": "Hello!"}]
        result = template.render(messages=messages, add_generation_prompt=False)

        assert "<|im_start|>user\nHello!<|im_end|>" in result

    def test_system_message_format(self, template):
        """System message should be wrapped correctly."""
        messages = [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "Hi"},
        ]
        result = template.render(messages=messages, add_generation_prompt=False)

        assert "<|im_start|>system\nYou are helpful.<|im_end|>" in result

    def test_assistant_message_format(self, template):
        """Assistant message should be wrapped correctly with <think> tags."""
        messages = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"},
        ]
        result = template.render(messages=messages, add_generation_prompt=False)

        # Assistant always outputs <think> tags (even if empty)
        assert (
            "<|im_start|>assistant\n<think>\n\n</think>\n\nHi there!<|im_end|>"
            in result
        )

    def test_add_generation_prompt(self, template):
        """add_generation_prompt should add <|im_start|>assistant at the end."""
        messages = [{"role": "user", "content": "Hello"}]
        result = template.render(messages=messages, add_generation_prompt=True)

        assert result.endswith("<|im_start|>assistant\n")

    def test_multi_turn_conversation(self, template):
        """Multi-turn conversation should maintain correct order."""
        messages = [
            {"role": "system", "content": "System prompt"},
            {"role": "user", "content": "User 1"},
            {"role": "assistant", "content": "Assistant 1"},
            {"role": "user", "content": "User 2"},
        ]
        result = template.render(messages=messages, add_generation_prompt=True)

        # Check order
        sys_pos = result.find("System prompt")
        user1_pos = result.find("User 1")
        asst1_pos = result.find("Assistant 1")
        user2_pos = result.find("User 2")

        # str.find returns -1 for a missing substring, which would satisfy a
        # plain "<" chain; anchoring the chain at 0 makes every message's
        # presence part of the assertion.
        assert 0 <= sys_pos < user1_pos < asst1_pos < user2_pos


# ============================================================================
# Test: Thinking/Reasoning Content
# ============================================================================


class TestThinkingContent:
    """Checks on how reasoning text ends up inside ``<think>`` blocks."""

    def test_reasoning_content_field(self, template):
        """A separate ``reasoning_content`` value is rendered inside <think>."""
        assistant_turn = {
            "role": "assistant",
            "content": "The answer is 4.",
            "reasoning_content": "2+2=4 by basic arithmetic.",
        }
        conversation = [{"role": "user", "content": "What is 2+2?"}, assistant_turn]

        rendered = template.render(messages=conversation, add_generation_prompt=False)

        assert "<think>\n2+2=4 by basic arithmetic.\n</think>" in rendered
        assert "The answer is 4." in rendered

    def test_think_tags_in_content(self, template):
        """Inline <think> text in ``content`` is split out and re-emitted."""
        assistant_turn = {
            "role": "assistant",
            "content": "<think>\nMy reasoning here.\n</think>\n\nMy answer here.",
        }
        conversation = [{"role": "user", "content": "Question"}, assistant_turn]

        rendered = template.render(messages=conversation, add_generation_prompt=False)

        assert "<think>\nMy reasoning here.\n</think>" in rendered
        assert "My answer here." in rendered

    def test_think_preserved_in_history(self, template):
        """Reasoning of earlier assistant turns stays in the rendered prompt."""
        conversation = [
            {"role": "user", "content": "First question"},
            {
                "role": "assistant",
                "content": "First answer",
                "reasoning_content": "First reasoning",
            },
            {"role": "user", "content": "Second question"},
        ]

        rendered = template.render(messages=conversation, add_generation_prompt=True)

        # The first turn's reasoning must still be there after a follow-up.
        assert "<think>\nFirst reasoning\n</think>" in rendered

    def test_enable_thinking_false(self, template):
        """With ``enable_thinking=False`` the prompt ends in an empty think block."""
        conversation = [{"role": "user", "content": "Hello"}]

        rendered = template.render(
            messages=conversation,
            add_generation_prompt=True,
            enable_thinking=False,
        )

        assert rendered.endswith("<|im_start|>assistant\n<think>\n\n</think>\n\n")

    def test_enable_thinking_true(self, template):
        """With ``enable_thinking=True`` no empty think block is pre-filled."""
        conversation = [{"role": "user", "content": "Hello"}]

        rendered = template.render(
            messages=conversation,
            add_generation_prompt=True,
            enable_thinking=True,
        )

        assert rendered.endswith("<|im_start|>assistant\n")
        assert "<think>\n\n</think>" not in rendered


# ============================================================================
# Test: Tool Definitions in System Prompt
# ============================================================================


class TestToolDefinitions:
    """Tests for tool definition formatting in system prompt."""

    @staticmethod
    def _function_tool(name, description, properties=None):
        """Build a ``{"type": "function", "function": {...}}`` tool definition.

        Args:
            name: Value stored under ``function.name``.
            description: Value stored under ``function.description``
                (``None`` and ``""`` are passed through unchanged).
            properties: Mapping stored under ``parameters.properties``;
                an empty dict is used when omitted.

        Returns:
            The tool definition dict.
        """
        return {
            "type": "function",
            "function": {
                "name": name,
                "description": description,
                "parameters": {
                    "type": "object",
                    "properties": {} if properties is None else properties,
                },
            },
        }

    @staticmethod
    def _render(template, tools, messages=None):
        """Render ``messages`` together with ``tools`` and a generation prompt.

        Args:
            template: The template fixture under test.
            tools: List of tool definitions passed as ``tools``.
            messages: Conversation to render; defaults to a single user
                turn whose content is ``"Test"``.

        Returns:
            The rendered prompt string.
        """
        if messages is None:
            messages = [{"role": "user", "content": "Test"}]
        return template.render(
            messages=messages, tools=tools, add_generation_prompt=True
        )

    def test_tools_trigger_system_prompt(self, template, today_date):
        """When tools are provided, a special system prompt should be generated."""
        result = self._render(
            template,
            [self._function_tool("web_search", "Search the web")],
            messages=[{"role": "user", "content": "Search something"}],
        )

        assert "In this environment you have access to a set of tools" in result
        assert f"Today is: {today_date}" in result
        assert "# Tool-Use Formatting Instructions" in result

    def test_tool_name_format(self, template):
        """Tool should be formatted with ### Tool name: header."""
        result = self._render(
            template, [self._function_tool("my_tool", "My description")]
        )

        assert "### Tool name: my_tool" in result

    def test_tool_server_name(self, template):
        """Tools should be listed under the ``default`` server name."""
        result = self._render(template, [self._function_tool("test_tool", "Test")])

        assert "## Server name: default" in result

    def test_tool_description_indentation(self, template):
        """Tool description should be indented with 4 spaces."""
        result = self._render(
            template, [self._function_tool("test_tool", "My tool description")]
        )

        assert "Description:\n    My tool description" in result

    def test_tool_args_auto_generated(self, template):
        """Args section should be auto-generated from parameters.properties."""
        properties = {
            "query": {"type": "string", "description": "Search query"},
            "limit": {"type": "integer", "description": "Max results"},
        }
        result = self._render(
            template, [self._function_tool("search", "Search function", properties)]
        )

        assert "Args:" in result
        assert "query: Search query" in result
        assert "limit: Max results" in result

    def test_tool_args_not_duplicated(self, template):
        """If description already has Args:, don't add another."""
        tool = self._function_tool(
            "search",
            "Search function\n\nArgs:\n    query: The query",
            {"query": {"type": "string", "description": "Search query"}},
        )
        result = self._render(template, [tool])

        # Should only have one Args: section
        assert result.count("Args:") == 1

    def test_tool_json_schema_included(self, template):
        """Input JSON schema should be included."""
        tool = self._function_tool("test", "Test", {"x": {"type": "string"}})
        result = self._render(template, [tool])

        assert "Input JSON schema:" in result
        assert '"type": "object"' in result or '"type":"object"' in result

    def test_tool_without_function_wrapper(self, template):
        """Tools can be passed without the function wrapper."""
        # Built by hand on purpose: this tool has no {"type", "function"} envelope.
        tools = [
            {
                "name": "direct_tool",
                "description": "Direct tool format",
                "parameters": {"type": "object", "properties": {}},
            }
        ]
        result = self._render(template, tools)

        assert "### Tool name: direct_tool" in result

    def test_tool_none_description(self, template):
        """Tool with None description should not crash."""
        # Should not raise an exception
        result = self._render(template, [self._function_tool("test", None)])
        assert "### Tool name: test" in result

    def test_tool_empty_description(self, template):
        """Tool with empty description should not crash."""
        result = self._render(template, [self._function_tool("test", "")])
        assert "### Tool name: test" in result

    def test_system_message_prepended_with_tools(self, template):
        """Custom system message should be prepended when tools are present."""
        messages = [
            {"role": "system", "content": "You are MiroThinker."},
            {"role": "user", "content": "Hi"},
        ]
        result = self._render(
            template, [self._function_tool("test", "Test")], messages=messages
        )

        # System message should come first, then tool instructions
        sys_idx = result.find("You are MiroThinker.")
        tools_idx = result.find("In this environment you have access")
        # str.find returns -1 when the text is absent; without these guards a
        # missing system message (-1) would satisfy the ordering check below.
        assert sys_idx != -1
        assert tools_idx != -1
        assert sys_idx < tools_idx


# ============================================================================
# Test: Tool Calls in Assistant Messages
# ============================================================================


class TestToolCalls:
    """Checks how assistant ``tool_calls`` entries are rendered by the template."""

    @staticmethod
    def _call(call_id, name, arguments):
        """Return a tool-call entry without a ``type`` key.

        ``arguments`` is stored as given (a JSON string or a dict).
        """
        return {"id": call_id, "function": {"name": name, "arguments": arguments}}

    @staticmethod
    def _render_turn(template, user_text, assistant_turn, tool_name):
        """Render one user turn followed by ``assistant_turn``.

        A single function tool named ``tool_name`` is supplied and no
        generation prompt is requested.
        """
        tool_spec = {
            "type": "function",
            "function": {
                "name": tool_name,
                "description": "Search",
                "parameters": {"type": "object", "properties": {}},
            },
        }
        conversation = [{"role": "user", "content": user_text}, assistant_turn]
        return template.render(
            messages=conversation, tools=[tool_spec], add_generation_prompt=False
        )

    def test_tool_call_format(self, template):
        """A tool call is rendered as a <use_mcp_tool> element with its parts."""
        # This call carries an explicit "type" key, unlike the ones from _call().
        typed_call = {
            "id": "call_1",
            "type": "function",
            "function": {
                "name": "web_search",
                "arguments": '{"query": "AI news"}',
            },
        }
        assistant_turn = {
            "role": "assistant",
            "content": "Let me search.",
            "tool_calls": [typed_call],
        }
        rendered = self._render_turn(
            template, "Search for AI", assistant_turn, "web_search"
        )

        expected_fragments = (
            "<use_mcp_tool>",
            "<server_name>default</server_name>",
            "<tool_name>web_search</tool_name>",
            "<arguments>",
            '{"query": "AI news"}',
            "</arguments>",
            "</use_mcp_tool>",
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_tool_call_no_content(self, template):
        """A tool call whose assistant content is None still renders."""
        assistant_turn = {
            "role": "assistant",
            "content": None,
            "tool_calls": [self._call("call_1", "search", '{"q": "test"}')],
        }
        rendered = self._render_turn(template, "Search", assistant_turn, "search")

        # With no content, an empty think block directly precedes the tool call.
        expected = "<|im_start|>assistant\n<think>\n\n</think>\n\n<use_mcp_tool>"
        assert expected in rendered

    def test_multiple_tool_calls(self, template):
        """Two tool calls in one assistant turn yield two <use_mcp_tool> elements."""
        assistant_turn = {
            "role": "assistant",
            "content": "I'll search both.",
            "tool_calls": [
                self._call("call_1", "search", '{"q": "Tokyo"}'),
                self._call("call_2", "search", '{"q": "Osaka"}'),
            ],
        }
        rendered = self._render_turn(
            template, "Compare Tokyo and Osaka", assistant_turn, "search"
        )

        # Only look at the text from the last assistant header onwards.
        assistant_section = rendered[rendered.rfind("<|im_start|>assistant") :]

        opening_tags = assistant_section.count("<use_mcp_tool>")
        closing_tags = assistant_section.count("</use_mcp_tool>")
        assert opening_tags == 2
        assert closing_tags == 2

    def test_tool_call_arguments_dict(self, template):
        """Dict-valued arguments are rendered inside an <arguments> element."""
        dict_arguments = {"q": "test", "limit": 5}  # a dict, not a JSON string
        assistant_turn = {
            "role": "assistant",
            "content": "",
            "tool_calls": [self._call("call_1", "search", dict_arguments)],
        }
        rendered = self._render_turn(template, "Search", assistant_turn, "search")

        assert "<arguments>" in rendered
        # The key must appear quoted, with either quote style.
        assert any(quoted_key in rendered for quoted_key in ('"q"', "'q'"))


# ============================================================================
# Test: Tool Responses
# ============================================================================


class TestToolResponses:
    """Checks how ``tool`` role messages are rendered by the template."""

    @staticmethod
    def _render_exchange(template, user_text, assistant_text, calls, outputs):
        """Render a user turn, an assistant tool-call turn and tool results.

        Args:
            template: The template fixture under test.
            user_text: Content of the opening user message.
            assistant_text: Content of the assistant message.
            calls: Sequence of ``(call_id, arguments_json)`` pairs; each
                becomes a call to the ``search`` tool.
            outputs: Sequence of ``(call_id, content)`` pairs; each becomes
                one ``tool`` message, in order.

        Returns:
            The rendered prompt, with a generation prompt appended.
        """
        conversation = [
            {"role": "user", "content": user_text},
            {
                "role": "assistant",
                "content": assistant_text,
                "tool_calls": [
                    {
                        "id": call_id,
                        "function": {"name": "search", "arguments": arguments},
                    }
                    for call_id, arguments in calls
                ],
            },
        ]
        conversation.extend(
            {"role": "tool", "tool_call_id": call_id, "content": content}
            for call_id, content in outputs
        )
        search_tool = {
            "type": "function",
            "function": {
                "name": "search",
                "description": "Search",
                "parameters": {"type": "object", "properties": {}},
            },
        }
        return template.render(
            messages=conversation, tools=[search_tool], add_generation_prompt=True
        )

    def test_tool_response_in_user_message(self, template):
        """A tool result is rendered as the body of a user message."""
        rendered = self._render_exchange(
            template,
            "Search",
            "Searching...",
            calls=[("call_1", '{"q": "test"}')],
            outputs=[("call_1", "Search results here")],
        )

        expected = "<|im_start|>user\nSearch results here<|im_end|>"
        assert expected in rendered

    def test_multiple_tool_responses_merged(self, template):
        """Consecutive tool results are merged into a single user message."""
        rendered = self._render_exchange(
            template,
            "Compare",
            "Searching...",
            calls=[("call_1", '{"q": "A"}'), ("call_2", '{"q": "B"}')],
            outputs=[("call_1", "Result A"), ("call_2", "Result B")],
        )

        # Both results sit in one message, separated by a blank line.
        assert "Result A\n\nResult B" in rendered

        # Exactly two user headers: the original question and the merged results.
        user_headers = rendered.count("<|im_start|>user")
        assert user_headers == 2

    def test_tool_response_no_wrapper_tags(self, template):
        """Tool results are not wrapped in <tool_response> tags."""
        rendered = self._render_exchange(
            template,
            "Search",
            "",
            calls=[("call_1", '{"q": "test"}')],
            outputs=[("call_1", "Results")],
        )

        for wrapper_tag in ("<tool_response>", "</tool_response>"):
            assert wrapper_tag not in rendered


# ============================================================================
# Test: Edge Cases
# ============================================================================


class TestEdgeCases:
    """Edge-case rendering checks: minimal conversations and unusual text."""

    def test_only_system_message(self, template):
        """A conversation holding just a system message renders that message."""
        rendered = template.render(
            messages=[{"role": "system", "content": "You are helpful."}],
            add_generation_prompt=False,
        )
        expected = "<|im_start|>system\nYou are helpful.<|im_end|>"
        assert expected in rendered

    def test_assistant_empty_content(self, template):
        """An assistant turn with empty-string content still renders."""
        conversation = [
            {"role": "user", "content": "Hi"},
            {"role": "assistant", "content": ""},
        ]
        rendered = template.render(messages=conversation, add_generation_prompt=False)
        # The think block is emitted even though there is no content to follow it.
        expected = "<|im_start|>assistant\n<think>\n\n</think>\n\n<|im_end|>"
        assert expected in rendered

    def test_unicode_content(self, template):
        """Non-ASCII text in user and assistant turns appears unchanged."""
        user_text = "你好！🎉"
        assistant_text = "こんにちは！"
        rendered = template.render(
            messages=[
                {"role": "user", "content": user_text},
                {"role": "assistant", "content": assistant_text},
            ],
            add_generation_prompt=False,
        )
        for text in (user_text, assistant_text):
            assert text in rendered

    def test_special_characters_in_content(self, template):
        """Angle brackets, ampersands and double quotes appear unchanged."""
        conversation = [
            {"role": "user", "content": "Test <tag> & \"quotes\" 'apostrophe'"},
        ]
        rendered = template.render(messages=conversation, add_generation_prompt=False)
        assert '<tag> & "quotes"' in rendered

    def test_newlines_preserved(self, template):
        """Single and double newlines inside content appear unchanged."""
        multiline_text = "Line 1\nLine 2\n\nLine 4"
        rendered = template.render(
            messages=[{"role": "user", "content": multiline_text}],
            add_generation_prompt=False,
        )
        assert multiline_text in rendered


# ============================================================================
# Test: Complete Flow
# ============================================================================


class TestCompleteFlow:
    """End-to-end rendering checks over multi-turn conversations with tools."""

    def test_full_tool_use_flow(self, template, today_date):
        """System, user, tool call, tool result, answer and follow-up all render."""
        weather_call = {
            "id": "call_1",
            "function": {
                "name": "weather",
                "arguments": '{"city": "Tokyo"}',
            },
        }
        conversation = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What's the weather?"},
            {
                "role": "assistant",
                "content": "Let me check.",
                "tool_calls": [weather_call],
            },
            {"role": "tool", "tool_call_id": "call_1", "content": "Sunny, 25°C"},
            {
                "role": "assistant",
                "content": "It's sunny and 25°C in Tokyo!",
            },
            {"role": "user", "content": "Thanks!"},
        ]
        weather_tool = {
            "type": "function",
            "function": {
                "name": "weather",
                "description": "Get weather info",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "city": {"type": "string", "description": "City name"}
                    },
                },
            },
        }
        rendered = template.render(
            messages=conversation, tools=[weather_tool], add_generation_prompt=True
        )

        # Every piece of the conversation must show up somewhere in the prompt.
        expected_fragments = (
            "<|im_start|>system",
            "You are a helpful assistant.",
            f"Today is: {today_date}",
            "### Tool name: weather",
            "<use_mcp_tool>",
            "<server_name>default</server_name>",
            "Sunny, 25°C",
            "It's sunny and 25°C in Tokyo!",
        )
        for fragment in expected_fragments:
            assert fragment in rendered
        # The prompt ends with an open assistant header.
        assert rendered.endswith("<|im_start|>assistant\n")

    def test_reasoning_with_tool_use(self, template):
        """An assistant turn with reasoning_content and a tool call renders both."""
        search_call = {
            "id": "call_1",
            "function": {
                "name": "search",
                "arguments": '{"q": "Python tutorials"}',
            },
        }
        conversation = [
            {"role": "user", "content": "Search for Python tutorials"},
            {
                "role": "assistant",
                "content": "I'll search for Python tutorials.",
                "reasoning_content": "User wants Python tutorials. I should use web search.",
                "tool_calls": [search_call],
            },
        ]
        search_tool = {
            "type": "function",
            "function": {
                "name": "search",
                "description": "Search",
                "parameters": {"type": "object", "properties": {}},
            },
        }
        rendered = template.render(
            messages=conversation, tools=[search_tool], add_generation_prompt=False
        )

        # Think tags, the reasoning text and the tool call must all be present.
        expected_fragments = (
            "<think>",
            "User wants Python tutorials",
            "</think>",
            "<use_mcp_tool>",
        )
        for fragment in expected_fragments:
            assert fragment in rendered


# ============================================================================
# Run tests
# ============================================================================

if __name__ == "__main__":
    # Allow running this file directly. pytest.main() returns the run's exit
    # code; propagate it so a failing run does not exit with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))


================================================
FILE: apps/miroflow-agent/README.md
================================================
# MiroFlow Agent

> For comprehensive documentation, installation guide, and tool configuration, see the [main README](../../README.md).

## Prerequisites

Before running the agent, ensure you have:

1. **Installed dependencies**: Run `uv sync` in this directory
1. **Configured environment variables**: Copy `.env.example` to `.env` and fill in your API keys
   ```bash
   cp .env.example .env
   # Edit .env with your actual API keys (SERPER_API_KEY, JINA_API_KEY, E2B_API_KEY, etc.)
   ```
1. **Started your model server** (for MiroThinker models): See the [Serve the MiroThinker Model](../../README.md#serve-the-mirothinker-model) section

## Quick Start

### Run a Single Task

The simplest way to test the agent is running `main.py` directly. It will execute a default task: *"What is the title of today's arxiv paper in computer science?"*

```bash
# Using MiroThinker models (requires your own model server)
uv run python main.py llm=qwen-3 agent=mirothinker_v1.5_keep5_max200 llm.base_url=http://localhost:61002/v1

# Using Claude (requires ANTHROPIC_API_KEY in .env)
uv run python main.py llm=claude-3-7 agent=single_agent_keep5

# Using GPT-5 (requires OPENAI_API_KEY in .env)
uv run python main.py llm=gpt-5 agent=single_agent_keep5
```

### Customize Your Task

To ask a different question, edit the `task_description` assignment in `main.py` (around line 32):

```python
task_description = "Your custom question here"
```

Then run the agent again. It will search the web, execute code, and provide an answer.

### Run Benchmark Evaluation

For systematic evaluation on standard benchmarks, add the `benchmark=` parameter:

```bash
# Run on debug benchmark (quick test)
uv run python main.py llm=qwen-3 agent=mirothinker_v1.5_keep5_max200 benchmark=debug llm.base_url=http://localhost:61002/v1

# Run on specific benchmarks
uv run python main.py llm=qwen-3 agent=mirothinker_v1.5_keep5_max200 benchmark=gaia-validation-text-103 llm.base_url=http://localhost:61002/v1
```

## Available Configurations

### LLM Models

| Model | Config Name | Requirements |
|-------|-------------|--------------|
| MiroThinker (self-hosted) | `qwen-3` | Model server + `llm.base_url` |
| Claude 3.7 Sonnet | `claude-3-7` | `ANTHROPIC_API_KEY` in .env |
| GPT-5 | `gpt-5` | `OPENAI_API_KEY` in .env |

### Agent Configurations

**MiroThinker v1.5:**

- `mirothinker_v1.5_keep5_max200` ⭐ (recommended) - context management, up to 200 turns
- `mirothinker_v1.5_keep5_max400` - context management, up to 400 turns (for BrowseComp)
- `mirothinker_v1.5` - no context management, up to 600 turns

**MiroThinker v1.0:**

- `mirothinker_v1.0_keep5` (recommended) - context management, up to 600 turns
- `mirothinker_v1.0` - no context management, up to 600 turns

**General (for closed-source models like Claude, GPT-5):**

- `single_agent_keep5` (recommended) - single agent with context management
- `single_agent` - single agent without context management

**Multi-Agent (Legacy for v0.1/v0.2):**

- `multi_agent` - multi-agent with commercial tools
- `multi_agent_os` - multi-agent with open-source tools

### Benchmark Configs

`debug`, `browsecomp`, `browsecomp_zh`, `hle`, `hle-text-2158`, `hle-text-500`, `gaia-validation-text-103`, `gaia-validation`, `frames`, `xbench_deepsearch`, `futurex`, `seal-0`, `aime2025`, `deepsearchqa`, `webwalkerqa`

## Output

The agent will:

1. Execute the task using available tools (search, code execution, etc.)
1. Generate a final summary and boxed answer
1. Save detailed logs to the `../../logs/` directory
1. Display the results in the terminal

## Troubleshooting

| Problem | Solution |
|---------|----------|
| API key errors | Check `.env` file has correct keys |
| Model connection failed | Verify `llm.base_url` is accessible |
| Tool execution errors | Check E2B/Serper/Jina API keys and quotas |
| Out of memory | Use `mirothinker_v1.5_keep5_max200` config |

For detailed logs, check the `../../logs/` directory (the `logs/` folder at the repository root).


================================================
FILE: apps/miroflow-agent/benchmarks/__init__.py
================================================


================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_aime2025.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import argparse
import os

from common import ProgressChecker

# Benchmark configuration
BENCHMARK_NAME = "aime2025"  # directory key used to build DATA_PATH
BENCHMARK_NAME_STD = "AIME2025"  # display name used in the CLI help and the analysis
TASKS_PER_RUN = 30  # handed to ProgressChecker as task_per_run
TASK_ID_PATTERN = r"task_([a-f0-9]+)"  # regex handed to run_analysis as task_id_pattern
# Relative path to the benchmark's standardized data file.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
FILENAME = os.path.basename(__file__)  # base name of this script


def parse_args():
    """Parse the command line, which takes one positional ``path`` argument.

    Returns:
        The ``argparse.Namespace`` produced by the parser; its ``path``
        attribute holds the directory given on the command line.
    """
    cli = argparse.ArgumentParser(
        description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    )
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"
    cli.add_argument("path", help=path_help)
    namespace = cli.parse_args()
    return namespace


if __name__ == "__main__":
    args = parse_args()

    try:
        # Build the checker for the requested directory and analyse its runs.
        checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Report the two "nothing to show" cases; the exit status is left untouched.
        notice = None
        if summary.total_tasks == 0:
            notice = "No task files found in any run directories"
        elif summary.total_completed == 0:
            notice = "No tasks completed yet"
        if notice is not None:
            print(notice)

    except (FileNotFoundError, PermissionError, ValueError) as err:
        # These three error types are reported the same way.
        print(f"Error: {err}")
    except Exception as err:
        # Anything else is reported as unexpected instead of raising a traceback.
        print(f"Unexpected error: {err}")


================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_browsecomp.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import argparse
import os

from common import ProgressChecker

# Benchmark configuration
BENCHMARK_NAME = "browsecomp"  # directory key used to build DATA_PATH
BENCHMARK_NAME_STD = "BrowseComp-EN"  # display name used in the CLI help and the analysis
TASKS_PER_RUN = 1266  # handed to ProgressChecker as task_per_run
TASK_ID_PATTERN = r"task_([a-f0-9]+)"  # regex handed to run_analysis as task_id_pattern
# Relative path to the benchmark's standardized data file.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
FILENAME = os.path.basename(__file__)  # base name of this script


def parse_args():
    """Parse the command line, which takes one positional ``path`` argument.

    Returns:
        The ``argparse.Namespace`` produced by the parser; its ``path``
        attribute holds the directory given on the command line.
    """
    cli = argparse.ArgumentParser(
        description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    )
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"
    cli.add_argument("path", help=path_help)
    namespace = cli.parse_args()
    return namespace


if __name__ == "__main__":
    args = parse_args()

    try:
        # Build the checker for the requested directory and analyse its runs.
        checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Report the two "nothing to show" cases; the exit status is left untouched.
        notice = None
        if summary.total_tasks == 0:
            notice = "No task files found in any run directories"
        elif summary.total_completed == 0:
            notice = "No tasks completed yet"
        if notice is not None:
            print(notice)

    except (FileNotFoundError, PermissionError, ValueError) as err:
        # These three error types are reported the same way.
        print(f"Error: {err}")
    except Exception as err:
        # Anything else is reported as unexpected instead of raising a traceback.
        print(f"Unexpected error: {err}")


================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_browsecomp_zh.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import argparse
import os

from common import ProgressChecker

# Benchmark configuration
# File name of this script; not referenced again in this file.
FILENAME = os.path.basename(__file__)
# Benchmark identifier; interpolated into DATA_PATH below.
BENCHMARK_NAME = "browsecomp_zh"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "BrowseComp-ZH"
# Passed to ProgressChecker as task_per_run (presumably the number of tasks in
# one full run; confirm against common.py).
TASKS_PER_RUN = 289
# Relative path of the standardized dataset, passed to ProgressChecker as
# data_path; the base it is resolved against is not visible in this script.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis() as task_id_pattern; its group captures one or
# more lowercase hex characters following "task_".
TASK_ID_PATTERN = r"task_([a-f0-9]+)"


def parse_args():
    """Parse the command-line arguments of this progress checker.

    Returns:
        argparse.Namespace whose ``path`` attribute holds the single
        positional argument (the benchmark directory).
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: analyse the benchmark directory named on the command
    # line. Every failure is reported with print() instead of being re-raised.
    args = parse_args()

    try:
        progress_checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = progress_checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Add a short hint when there is nothing to report yet.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three error types share the same plain "Error:" report.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")


================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_deepsearchqa.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import argparse
import glob
import json
import os
from pathlib import Path

from common import ProgressChecker

# Benchmark configuration
# File name of this script; not referenced again in this file.
FILENAME = os.path.basename(__file__)
# Benchmark identifier; interpolated into DATA_PATH below.
BENCHMARK_NAME = "deepsearchqa"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "DeepSearchQA"
# Passed to ProgressChecker as task_per_run (presumably the number of tasks in
# one full run; confirm against common.py).
TASKS_PER_RUN = 900
# Relative path of the standardized dataset, passed to ProgressChecker as
# data_path; the base it is resolved against is not visible in this script.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis() as task_id_pattern; its group captures one or
# more lowercase hex characters following "task_".
TASK_ID_PATTERN = r"task_([a-f0-9]+)"


# Text that introduces a judge verdict line in legacy logs.
_DEEPSEARCHQA_JUDGE_MARKER = "DeepSearchQA Judge - Correct:"
# Keys that together make up one complete evaluation record.
_EVAL_DETAIL_KEYS = ("num_correct", "num_expected", "num_excessive")


def _parse_deepsearchqa_judge_output(text) -> dict:
    """
    Parse the first well-formed judge verdict line found in ``text``.

    A verdict line looks like ``DeepSearchQA Judge - Correct: X/Y, Excessive: Z``.
    Lines that carry the marker but cannot be parsed are skipped so that a
    later, well-formed line can still be used.

    Returns:
        Dict with num_correct, num_expected, num_excessive, or empty dict if
        ``text`` is not a string or holds no parsable verdict line
    """
    if not isinstance(text, str):
        return {}

    for line in text.split("\n"):
        if _DEEPSEARCHQA_JUDGE_MARKER not in line:
            continue
        try:
            # Parse "Correct: X/Y, Excessive: Z"
            counts = line.split("Correct:")[1].strip()
            correct_part, excessive_part = counts.split(", Excessive:")
            num_correct, num_expected = map(int, correct_part.split("/"))
            num_excessive = int(excessive_part.strip())
        except (IndexError, ValueError):
            # Malformed verdict line: keep scanning instead of giving up.
            continue

        return {
            "num_correct": num_correct,
            "num_expected": num_expected,
            "num_excessive": num_excessive,
        }

    return {}


def extract_eval_details_from_log(log_file: str) -> dict:
    """
    Extract evaluation details from a completed task log file.

    Three sources are tried in order:
      1. the ``eval_details`` field of a JSON log (new format),
      2. a judge verdict line inside the ``llm_response`` field of a JSON log
         (legacy format),
      3. a judge verdict line in a log that is not JSON at all (legacy format).

    This is a best-effort reader: unreadable files and unrecognised content
    produce an empty dict rather than an exception.

    Args:
        log_file: Path of the task log file to read.

    Returns:
        Dict with num_correct, num_expected, num_excessive, or empty dict if not found
    """
    try:
        # Read as UTF-8 explicitly so the result does not depend on the locale.
        with open(log_file, "r", encoding="utf-8") as f:
            content = f.read()
    except (OSError, ValueError):
        # Missing/unreadable file or undecodable bytes (UnicodeDecodeError is
        # a ValueError).
        return {}

    try:
        log_data = json.loads(content)
    except json.JSONDecodeError:
        # Not JSON, try as plain text (legacy format)
        return _parse_deepsearchqa_judge_output(content)

    if not isinstance(log_data, dict):
        return {}

    # Method 1: eval_details field (new format - saved directly)
    eval_details = log_data.get("eval_details")
    if isinstance(eval_details, dict) and all(
        key in eval_details for key in _EVAL_DETAIL_KEYS
    ):
        return {key: eval_details[key] for key in _EVAL_DETAIL_KEYS}

    # Method 2: evaluation output embedded in llm_response (legacy format)
    return _parse_deepsearchqa_judge_output(log_data.get("llm_response"))


def _score_deepsearchqa_item(num_correct, num_expected, num_excessive):
    """
    Score a single evaluated item.

    Returns:
        Tuple ``(f1, category)`` where category is "fully_correct",
        "fully_incorrect", "correct_with_extraneous", or None when the item
        falls into none of the three categories (partially correct).
    """
    true_positives = num_correct
    false_negatives = num_expected - num_correct
    false_positives = num_excessive

    # Precision and recall default to 0.0 when their denominator is not positive.
    precision = 0.0
    if (true_positives + false_positives) > 0:
        precision = true_positives / (true_positives + false_positives)

    recall = 0.0
    if (true_positives + false_negatives) > 0:
        recall = true_positives / (true_positives + false_negatives)

    f1 = 0.0
    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)

    all_expected_correct = num_correct == num_expected
    has_extraneous = num_excessive > 0

    # Branch order matters: "fully correct" is checked before "fully incorrect".
    if all_expected_correct and not has_extraneous:
        category = "fully_correct"
    elif num_correct == 0:
        category = "fully_incorrect"
    elif all_expected_correct and has_extraneous:
        category = "correct_with_extraneous"
    else:
        category = None

    return f1, category


def _summarize_deepsearchqa_items(items):
    """
    Aggregate ``(num_correct, num_expected, num_excessive)`` triples.

    Returns:
        Dict with counts, percentages and average F1, or None when ``items``
        is empty.
    """
    tallies = {
        "fully_correct": 0,
        "fully_incorrect": 0,
        "correct_with_extraneous": 0,
    }
    f1_list = []

    for num_correct, num_expected, num_excessive in items:
        f1, category = _score_deepsearchqa_item(
            num_correct, num_expected, num_excessive
        )
        f1_list.append(f1)
        if category is not None:
            tallies[category] += 1

    num_valid = len(f1_list)
    if num_valid == 0:
        return None

    return {
        "num_valid": num_valid,
        "fully_correct": tallies["fully_correct"],
        "fully_incorrect": tallies["fully_incorrect"],
        "correct_with_extraneous": tallies["correct_with_extraneous"],
        "pct_fully_correct": tallies["fully_correct"] / num_valid,
        "pct_fully_incorrect": tallies["fully_incorrect"] / num_valid,
        "pct_correct_with_extraneous": tallies["correct_with_extraneous"]
        / num_valid,
        "avg_f1": sum(f1_list) / len(f1_list),
    }


def calculate_deepsearchqa_metrics_from_logs(base_path: str) -> dict:
    """
    Calculate metrics from individual task log files (for in-progress runs).

    Every ``run_*/task_*.json`` file under ``base_path`` is read with
    extract_eval_details_from_log(); files without evaluation details are
    ignored.

    Args:
        base_path: Benchmark directory containing the ``run_*`` directories.

    Returns:
        Dict with metrics or None if no completed tasks found (None is also
        returned when anything goes wrong while reading or scoring the logs)
    """
    try:
        # Find all completed task log files
        log_files = glob.glob(os.path.join(base_path, "run_*/task_*.json"))
        if not log_files:
            return None

        items = []
        for log_file in log_files:
            details = extract_eval_details_from_log(log_file)
            if not details:
                continue
            items.append(
                (
                    details["num_correct"],
                    details["num_expected"],
                    details["num_excessive"],
                )
            )

        return _summarize_deepsearchqa_items(items)

    except Exception:
        # Best-effort interim view: any failure just means "no metrics yet".
        return None


def calculate_deepsearchqa_metrics(results_file: str) -> dict:
    """
    Calculate DeepSearchQA-specific metrics from results file.
    Following the official Google DeepSearchQA evaluation metrics:
    1. Fully Correct: All expected answers correct + no extraneous answers
    2. Fully Incorrect: No correct answers
    3. Correct with Extraneous Answers: All expected answers correct + has extraneous
    4. F1 Score: Harmonic mean of precision and recall

    Only results whose status is "success" are counted, and for each of them
    only the first attempt that carries eval_details is used.

    Args:
        results_file: Path of a JSON Lines results file (one result per line).

    Returns:
        Dict with the 4 core metrics, or ``{"num_valid": 0}`` when no result
        could be scored or the file could not be processed
    """
    try:
        results = []
        # Read as UTF-8 explicitly so the result does not depend on the locale.
        with open(results_file, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    results.append(json.loads(line))

        items = []
        for result in results:
            if result.get("status") != "success":
                continue
            if not ("attempts" in result and result["attempts"]):
                continue

            for attempt in result["attempts"]:
                if "eval_details" in attempt and attempt["eval_details"]:
                    details = attempt["eval_details"]
                    items.append(
                        (
                            details.get("num_correct", 0),
                            details.get("num_expected", 0),
                            details.get("num_excessive", 0),
                        )
                    )
                    break  # Only use first attempt with details

        metrics = _summarize_deepsearchqa_items(items)
        if metrics is None:
            return {"num_valid": 0}
        return metrics

    except Exception as e:
        print(f"Warning: Could not calculate DeepSearchQA metrics: {e}")
        return {"num_valid": 0}


def show_deepsearchqa_metrics(base_path: str):
    """
    Print the DeepSearchQA metrics of every run that has a results file.

    For each ``run_*/benchmark_results.jsonl`` under ``base_path`` (in sorted
    order) the four metrics are printed:
    1. Fully Correct
    2. Fully Incorrect
    3. Correct with Extraneous Answers
    4. F1 Score

    Runs without any valid item are left out. When at least one run was
    printed, the unweighted mean of each metric across those runs follows.
    """
    rule = "=" * 80

    print("\n" + rule)
    print("DeepSearchQA Metrics (Official Google Metrics)")
    print(rule)

    # Find all benchmark_results.jsonl files
    results_files = glob.glob(os.path.join(base_path, "run_*/benchmark_results.jsonl"))
    if not results_files:
        print("(Metrics will be available after tasks complete)")
        return

    # One (fully correct, fully incorrect, with extraneous, f1) tuple per run.
    per_run_scores = []

    for results_file in sorted(results_files):
        run_name = Path(results_file).parent.name
        metrics = calculate_deepsearchqa_metrics(results_file)
        if not metrics["num_valid"] > 0:
            continue

        pct_correct = metrics["pct_fully_correct"]
        pct_incorrect = metrics["pct_fully_incorrect"]
        pct_extraneous = metrics["pct_correct_with_extraneous"]
        run_f1 = metrics["avg_f1"]
        per_run_scores.append((pct_correct, pct_incorrect, pct_extraneous, run_f1))

        print(f"\n{run_name} ({metrics['num_valid']} items):")
        print(
            f"  Fully Correct:              {pct_correct:6.2%}  ({metrics['fully_correct']} items)"
        )
        print(
            f"  Fully Incorrect:            {pct_incorrect:6.2%}  ({metrics['fully_incorrect']} items)"
        )
        print(
            f"  Correct w/ Extraneous:      {pct_extraneous:6.2%}  ({metrics['correct_with_extraneous']} items)"
        )
        print(f"  F1 Score:                   {run_f1:6.2%}")

    if per_run_scores:
        run_count = len(per_run_scores)
        print("\n" + rule)
        print(f"Average across {run_count} runs:")
        print(rule)

        # Transpose the per-run tuples into one column per metric.
        avg_fully_correct, avg_fully_incorrect, avg_correct_with_extraneous, avg_f1 = (
            sum(column) / len(column) for column in zip(*per_run_scores)
        )

        print(f"  Fully Correct:              {avg_fully_correct:6.2%}")
        print(f"  Fully Incorrect:            {avg_fully_incorrect:6.2%}")
        print(f"  Correct w/ Extraneous:      {avg_correct_with_extraneous:6.2%}")
        print(f"  F1 Score:                   {avg_f1:6.2%}")

    print(rule)


def parse_args():
    """Parse the command-line arguments of this progress checker.

    Returns:
        argparse.Namespace whose ``path`` attribute holds the single
        positional argument (the benchmark directory).
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: run the generic progress analysis, then print the
    # DeepSearchQA-specific metrics. Every failure is reported with print()
    # instead of being re-raised.
    args = parse_args()

    try:
        progress_checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = progress_checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # A run is treated as complete once its benchmark_results.jsonl exists.
        has_complete_run = any(
            os.path.exists(os.path.join(run_dir, "benchmark_results.jsonl"))
            for run_dir in glob.glob(os.path.join(args.path, "run_*"))
        )

        if has_complete_run:
            show_deepsearchqa_metrics(args.path)
        elif summary.total_completed > 0:
            # No finished run yet: fall back to the per-task logs.
            interim_metrics = calculate_deepsearchqa_metrics_from_logs(args.path)
            rule = "=" * 80

            print("\n" + rule)
            print("DeepSearchQA Metrics (Official Google Metrics)")
            print(rule)

            if interim_metrics and interim_metrics.get("num_valid", 0) > 0:
                num_with_details = interim_metrics["num_valid"]
                print(
                    f"⚠️  INTERIM RESULTS (based on {num_with_details}/{summary.total_completed} tasks with eval_details)"
                )
                if num_with_details < summary.total_completed:
                    print(
                        f"    Note: {summary.total_completed - num_with_details} completed tasks don't have eval_details (likely ran before the update)"
                    )
                print("-" * 80)

                pct_correct = interim_metrics["pct_fully_correct"]
                pct_incorrect = interim_metrics["pct_fully_incorrect"]
                pct_extraneous = interim_metrics["pct_correct_with_extraneous"]
                interim_f1 = interim_metrics["avg_f1"]

                print(
                    f"  Fully Correct:              {pct_correct:6.2%}  ({interim_metrics['fully_correct']} items)"
                )
                print(
                    f"  Fully Incorrect:            {pct_incorrect:6.2%}  ({interim_metrics['fully_incorrect']} items)"
                )
                print(
                    f"  Correct w/ Extraneous:      {pct_extraneous:6.2%}  ({interim_metrics['correct_with_extraneous']} items)"
                )
                print(f"  F1 Score:                   {interim_f1:6.2%}")
                print()
                print(
                    f"Note: Based on {interim_metrics['num_valid']} completed tasks. Final metrics may differ."
                )
            else:
                print(f"Tasks in progress... ({summary.total_completed} completed)")
                print("Detailed metrics will be available when runs complete.")

            print(rule)

        # Add a short hint when there is nothing to report yet.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three error types share the same plain "Error:" report.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")


================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_frames.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import argparse
import os

from common import ProgressChecker

# Benchmark configuration
# File name of this script; not referenced again in this file.
FILENAME = os.path.basename(__file__)
# Benchmark identifier; interpolated into DATA_PATH below.
BENCHMARK_NAME = "frames"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "Frames"
# Passed to ProgressChecker as task_per_run (presumably the number of tasks in
# one full run; confirm against common.py).
TASKS_PER_RUN = 824
# Relative path of the standardized dataset, passed to ProgressChecker as
# data_path; the base it is resolved against is not visible in this script.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis() as task_id_pattern; its group captures one or
# more lowercase hex characters following "task_".
TASK_ID_PATTERN = r"task_([a-f0-9]+)"


def parse_args():
    """Parse the command-line arguments of this progress checker.

    Returns:
        argparse.Namespace whose ``path`` attribute holds the single
        positional argument (the benchmark directory).
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: analyse the benchmark directory named on the command
    # line. Every failure is reported with print() instead of being re-raised.
    args = parse_args()

    try:
        progress_checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = progress_checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Add a short hint when there is nothing to report yet.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three error types share the same plain "Error:" report.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")


================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_gaia-validation-text-103.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import argparse
import os

from common import GAIAProgressChecker as ProgressChecker

# Benchmark configuration
# File name of this script; not referenced again in this file.
FILENAME = os.path.basename(__file__)
# Benchmark identifier; interpolated into DATA_PATH below.
BENCHMARK_NAME = "gaia-2023-validation-text-103"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "GAIA-Text-103"
# Passed to ProgressChecker as task_per_run (presumably the number of tasks in
# one full run; confirm against common.py).
TASKS_PER_RUN = 103
# Relative path of the standardized dataset, passed to ProgressChecker as
# data_path; the base it is resolved against is not visible in this script.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis() as task_id_pattern; its group captures the
# non-underscore characters (hyphens included) that follow "task_".
TASK_ID_PATTERN = r"task_([^_]+(?:-[^_]+)*)"


def parse_args():
    """Parse the command-line arguments of this progress checker.

    Returns:
        argparse.Namespace whose ``path`` attribute holds the single
        positional argument (the benchmark directory).
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: analyse the benchmark directory named on the command
    # line. Every failure is reported with print() instead of being re-raised.
    args = parse_args()

    try:
        progress_checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = progress_checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Add a short hint when there is nothing to report yet.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three error types share the same plain "Error:" report.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")


================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_gaia-validation.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import argparse
import os

from common import GAIAProgressChecker as ProgressChecker

# Benchmark configuration
# File name of this script; not referenced again in this file.
FILENAME = os.path.basename(__file__)
# Benchmark identifier; interpolated into DATA_PATH below.
BENCHMARK_NAME = "gaia-2023-validation"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "GAIA-Val-165"
# Passed to ProgressChecker as task_per_run (presumably the number of tasks in
# one full run; confirm against common.py).
TASKS_PER_RUN = 165
# Relative path of the standardized dataset, passed to ProgressChecker as
# data_path; the base it is resolved against is not visible in this script.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis() as task_id_pattern; its group captures the
# non-underscore characters (hyphens included) that follow "task_".
TASK_ID_PATTERN = r"task_([^_]+(?:-[^_]+)*)"


def parse_args():
    """Parse the command-line arguments of this progress checker.

    Returns:
        argparse.Namespace whose ``path`` attribute holds the single
        positional argument (the benchmark directory).
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: analyse the benchmark directory named on the command
    # line. Every failure is reported with print() instead of being re-raised.
    args = parse_args()

    try:
        progress_checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = progress_checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Add a short hint when there is nothing to report yet.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three error types share the same plain "Error:" report.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")


================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_hle-text-2158.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import argparse
import os

from common import ProgressChecker

# Benchmark configuration
# File name of this script; not referenced again in this file.
FILENAME = os.path.basename(__file__)
# Benchmark identifier; interpolated into DATA_PATH below.
BENCHMARK_NAME = "hle-text-2158"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "HLE-Text-2158"
# Passed to ProgressChecker as task_per_run (presumably the number of tasks in
# one full run; confirm against common.py).
TASKS_PER_RUN = 2158
# Relative path of the standardized dataset, passed to ProgressChecker as
# data_path; the base it is resolved against is not visible in this script.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis() as task_id_pattern; its group captures one or
# more lowercase hex characters following "task_".
TASK_ID_PATTERN = r"task_([a-f0-9]+)"


def parse_args():
    """Parse the command-line arguments of this progress checker.

    Returns:
        argparse.Namespace whose ``path`` attribute holds the single
        positional argument (the benchmark directory).
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: analyse the benchmark directory named on the command
    # line. Every failure is reported with print() instead of being re-raised.
    args = parse_args()

    try:
        progress_checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = progress_checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Add a short hint when there is nothing to report yet.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three error types share the same plain "Error:" report.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")


================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_hle-text-500.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import argparse
import os

from common import ProgressChecker

# Benchmark configuration
# File name of this script; not referenced again in this file.
FILENAME = os.path.basename(__file__)
# Benchmark identifier; interpolated into DATA_PATH below.
BENCHMARK_NAME = "hle-text-500"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "HLE-Text-500"
# Passed to ProgressChecker as task_per_run (presumably the number of tasks in
# one full run; confirm against common.py).
TASKS_PER_RUN = 500
# Relative path of the standardized dataset, passed to ProgressChecker as
# data_path; the base it is resolved against is not visible in this script.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis() as task_id_pattern; its group captures one or
# more lowercase hex characters following "task_".
TASK_ID_PATTERN = r"task_([a-f0-9]+)"


def parse_args():
    """Parse the command-line arguments of this progress checker.

    Returns:
        argparse.Namespace whose ``path`` attribute holds the single
        positional argument (the benchmark directory).
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: analyse the benchmark directory named on the command
    # line. Every failure is reported with print() instead of being re-raised.
    args = parse_args()

    try:
        progress_checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = progress_checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Add a short hint when there is nothing to report yet.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three error types share the same plain "Error:" report.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")


================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_hle.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import argparse
import os

from common import ProgressChecker

# Benchmark configuration
# File name of this script; not referenced again in this file.
FILENAME = os.path.basename(__file__)
# Benchmark identifier; interpolated into DATA_PATH below.
BENCHMARK_NAME = "hle"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "HLE-2500"
# Passed to ProgressChecker as task_per_run (presumably the number of tasks in
# one full run; confirm against common.py).
TASKS_PER_RUN = 2500
# Relative path of the standardized dataset, passed to ProgressChecker as
# data_path; the base it is resolved against is not visible in this script.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis() as task_id_pattern; its group captures one or
# more lowercase hex characters following "task_".
TASK_ID_PATTERN = r"task_([a-f0-9]+)"


def parse_args():
    """Parse the command-line arguments of this progress checker.

    Returns:
        argparse.Namespace whose ``path`` attribute holds the single
        positional argument (the benchmark directory).
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: analyse the benchmark directory named on the command
    # line. Every failure is reported with print() instead of being re-raised.
    args = parse_args()

    try:
        progress_checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = progress_checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Add a short hint when there is nothing to report yet.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three error types share the same plain "Error:" report.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")


================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_seal-0.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import argparse
import os

from common import ProgressChecker

# Benchmark configuration
# File name of this script; not referenced again in this file.
FILENAME = os.path.basename(__file__)
# Benchmark identifier; interpolated into DATA_PATH below.
BENCHMARK_NAME = "seal-0"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "SEAL-0"
# Passed to ProgressChecker as task_per_run (presumably the number of tasks in
# one full run; confirm against common.py).
TASKS_PER_RUN = 111
# Relative path of the standardized dataset, passed to ProgressChecker as
# data_path; the base it is resolved against is not visible in this script.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis() as task_id_pattern; its group captures one or
# more lowercase hex characters following "task_".
TASK_ID_PATTERN = r"task_([a-f0-9]+)"


def parse_args():
    """Parse the command-line arguments of this progress checker.

    Returns:
        argparse.Namespace whose ``path`` attribute holds the single
        positional argument (the benchmark directory).
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: analyse the benchmark directory named on the command
    # line. Every failure is reported with print() instead of being re-raised.
    args = parse_args()

    try:
        progress_checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = progress_checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Add a short hint when there is nothing to report yet.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three error types share the same plain "Error:" report.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")


================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_webwalkerqa.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import argparse
import os

from common import ProgressChecker

# Benchmark configuration
# File name of this script; not referenced again in this file.
FILENAME = os.path.basename(__file__)
# Benchmark identifier; interpolated into DATA_PATH below.
BENCHMARK_NAME = "webwalkerqa"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "WebWalkerQA"
# Passed to ProgressChecker as task_per_run (presumably the number of tasks in
# one full run; confirm against common.py).
TASKS_PER_RUN = 680
# Relative path of the standardized dataset, passed to ProgressChecker as
# data_path; the base it is resolved against is not visible in this script.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis() as task_id_pattern; its group captures the
# digits that follow "task_task_id_".
TASK_ID_PATTERN = r"task_task_id_(\d+)"


def parse_args():
    """Parse the command-line arguments of this progress checker.

    Returns:
        argparse.Namespace whose ``path`` attribute holds the single
        positional argument (the benchmark directory).
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: analyse the benchmark directory named on the command
    # line. Every failure is reported with print() instead of being re-raised.
    args = parse_args()

    try:
        progress_checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = progress_checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Add a short hint when there is nothing to report yet.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three error types share the same plain "Error:" report.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")


================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_xbench_deepsearch.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import argparse
import os

from common import ProgressChecker

# Benchmark configuration
# Basename of this script file; not referenced anywhere else in this script.
FILENAME = os.path.basename(__file__)
# Dataset directory name, interpolated into DATA_PATH below.
BENCHMARK_NAME = "xbench_deepsearch"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "XBench-DeepSearch"
# Number of tasks expected in one run; passed to ProgressChecker as task_per_run.
TASKS_PER_RUN = 100
# Relative path to the standardized dataset; passed to ProgressChecker as data_path.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis() as task_id_pattern; its single capture group
# holds a lowercase-hex task ID.
TASK_ID_PATTERN = r"task_([a-f0-9]+)"


def parse_args():
    """Build the argument parser and parse the command line.

    The only argument is the positional ``path`` to the benchmark directory.
    """
    arg_parser = argparse.ArgumentParser(
        description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs.",
    )
    directory_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"
    arg_parser.add_argument("path", help=directory_help)

    parsed = arg_parser.parse_args()
    return parsed


if __name__ == "__main__":
    args = parse_args()

    try:
        # Analyse every run found under the given path
        checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Print an extra note for the two "nothing to show" situations
        nothing_found = summary.total_tasks == 0
        nothing_done = summary.total_completed == 0
        if nothing_found:
            print("No task files found in any run directories")
        elif nothing_done:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # Expected failure modes share one message format
        print(f"Error: {e}")
    except Exception as e:
        # Catch-all so the script reports the problem instead of a traceback
        print(f"Unexpected error: {e}")


================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/common.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import glob
import json
import math
import os
import re
from dataclasses import dataclass
from datetime import datetime
from io import StringIO
from typing import Dict, List, Optional, Tuple

# Time estimation constants
# Fallback per-task duration used by estimate_completion_time() when no
# start time can be read from the completed task files.
DEFAULT_TASK_TIME_MINUTES = 3.5
MINUTES_PER_HOUR = 60
HOURS_PER_DAY = 24
MINUTES_PER_DAY = MINUTES_PER_HOUR * HOURS_PER_DAY

# Progress bar configuration
# Default bar width, in characters, for create_progress_bar().
PROGRESS_BAR_WIDTH = 20
# Inclusive lower bounds (in percent) for the green / yellow / orange bar
# colours; anything below ORANGE_THRESHOLD is drawn in red.
GREEN_THRESHOLD = 80
YELLOW_THRESHOLD = 60
ORANGE_THRESHOLD = 40

# Judge result patterns for correctness
# String verdicts are upper-cased before comparison: CORRECT_RESULTS entries
# must match the whole verdict, SUCCESS_PATTERNS entries may appear anywhere in it.
CORRECT_RESULTS = ["CORRECT", "SUCCESS"]
SUCCESS_PATTERNS = ["PASS_AT_K_SUCCESS"]

# Log file configuration
# Log files are named "<LOG_FILE_PREFIX><timestamp>.log", with the timestamp
# rendered via strftime(LOG_FILE_TIMESTAMP_FORMAT).
LOG_FILE_PREFIX = "progress_analysis_"
LOG_FILE_TIMESTAMP_FORMAT = "%Y%m%d_%H%M%S"


def create_progress_bar(percentage: float, width: int = PROGRESS_BAR_WIDTH) -> str:
    """Create a coloured, fixed-width progress bar for a percentage.

    Args:
        percentage: Value to display, nominally in the range 0-100.
        width: Number of bar cells.

    Returns:
        An ANSI-coloured string of the form ``[████░░░░] 42.0%``. The numeric
        label always shows ``percentage`` as given; only the bar is clamped.
    """
    # Clamp the filled cell count so that out-of-range percentages (e.g. more
    # results than expected tasks, giving > 100%) cannot produce a bar that is
    # longer or shorter than `width` cells.
    filled = max(0, min(width, int(width * percentage / 100)))
    bar = "█" * filled + "░" * (width - filled)

    # Add color based on percentage
    if percentage >= GREEN_THRESHOLD:
        color = "\033[92m"  # Green
    elif percentage >= YELLOW_THRESHOLD:
        color = "\033[93m"  # Yellow
    elif percentage >= ORANGE_THRESHOLD:
        color = "\033[33m"  # Orange
    else:
        color = "\033[91m"  # Red

    reset = "\033[0m"
    return f"{color}[{bar}] {percentage:.1f}%{reset}"


def find_earliest_start_time(completed_files: List[str]) -> Optional[datetime]:
    """Find the earliest ``start_time`` recorded in the given task files.

    Args:
        completed_files: Paths of JSON task files to inspect.

    Returns:
        The earliest start time as a naive datetime expressed in UTC, or
        ``None`` when no file contains a usable ``start_time``.
    """
    # Local import: only `datetime` is imported at module level.
    from datetime import timezone

    earliest_time = None

    for file_path in completed_files:
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Ignore payloads that are not objects or whose value is not text
            start_time_str = data.get("start_time") if isinstance(data, dict) else None
            if not isinstance(start_time_str, str):
                continue

            # fromisoformat() only accepts a trailing "Z" from Python 3.11 on
            if start_time_str.endswith("Z"):
                start_time_str = start_time_str[:-1] + "+00:00"
            start_time = datetime.fromisoformat(start_time_str)

            # Convert aware timestamps to UTC *before* dropping the offset, so
            # that values recorded with different offsets compare correctly.
            # Naive timestamps are taken as already being in UTC.
            if start_time.tzinfo is not None:
                start_time = start_time.astimezone(timezone.utc).replace(tzinfo=None)

            if earliest_time is None or start_time < earliest_time:
                earliest_time = start_time

        except (json.JSONDecodeError, ValueError, OverflowError, OSError):
            continue  # Skip files with invalid timing data

    return earliest_time


def find_latest_end_time(completed_files: List[str]) -> Optional[datetime]:
    """Find the latest ``end_time`` recorded in the given task files.

    Args:
        completed_files: Paths of JSON task files to inspect.

    Returns:
        The latest end time as a naive datetime expressed in UTC. When no
        file contains a usable ``end_time`` the current UTC time is returned
        instead, so the result is never ``None`` in practice.
    """
    # Local import: only `datetime` is imported at module level.
    from datetime import timezone

    latest_time = None

    for file_path in completed_files:
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Ignore payloads that are not objects or whose value is not text
            end_time_str = data.get("end_time") if isinstance(data, dict) else None
            if not isinstance(end_time_str, str):
                continue

            # fromisoformat() only accepts a trailing "Z" from Python 3.11 on
            if end_time_str.endswith("Z"):
                end_time_str = end_time_str[:-1] + "+00:00"
            end_time = datetime.fromisoformat(end_time_str)

            # Convert aware timestamps to UTC *before* dropping the offset;
            # naive timestamps are taken as already being in UTC.
            if end_time.tzinfo is not None:
                end_time = end_time.astimezone(timezone.utc).replace(tzinfo=None)

            if latest_time is None or end_time > latest_time:
                latest_time = end_time

        except (json.JSONDecodeError, ValueError, OverflowError, OSError):
            continue  # Skip files with invalid timing data

    # Fall back to the current time in UTC (not local time) so the value is
    # comparable with the UTC-based timestamps parsed above.
    return latest_time or datetime.now(timezone.utc).replace(tzinfo=None)


def calculate_mean_and_std(values: List[float]) -> Tuple[float, float]:
    """Return ``(mean, sample standard deviation)`` for ``values``.

    An empty list yields ``(0.0, 0.0)``; a single value yields that value
    with a deviation of ``0.0``. The deviation uses the ``n - 1`` divisor.
    """
    count = len(values)
    if count == 0:
        return 0.0, 0.0

    average = sum(values) / count
    if count == 1:
        return average, 0.0

    squared_deviations = [(item - average) ** 2 for item in values]
    spread = math.sqrt(sum(squared_deviations) / (count - 1))
    return average, spread


def estimate_completion_time(
    total_tasks: int, completed_tasks: int, completed_files: List[str]
) -> str:
    """Return a human-readable estimate of the time left to finish all tasks.

    The estimate multiplies the remaining task count by the average wall-clock
    time per completed task (span from the earliest start to the latest end
    divided by ``completed_tasks``). When no start time is available,
    ``DEFAULT_TASK_TIME_MINUTES`` per task is assumed instead.
    """
    if completed_tasks == 0:
        return "Cannot estimate (no completed tasks)"
    if completed_tasks >= total_tasks:
        return "All tasks completed"

    tasks_left = total_tasks - completed_tasks

    # Both ends of the observed time window come from the completed task files
    window_start = find_earliest_start_time(completed_files)
    window_end = find_latest_end_time(completed_files)

    if window_start is not None:
        minutes_elapsed = (window_end - window_start).total_seconds() / 60
        if minutes_elapsed <= 0:
            return "Cannot estimate (time interval too short)"

        minutes_per_task = minutes_elapsed / completed_tasks
        if minutes_per_task <= 0:
            return "Cannot estimate (invalid time per task)"

        estimated_minutes = tasks_left * minutes_per_task
    else:
        # No usable timing data: assume the default duration for every task
        estimated_minutes = tasks_left * DEFAULT_TASK_TIME_MINUTES

    # The estimate is truncated to whole minutes
    return f"~{int(estimated_minutes)} minutes"


@dataclass
class TaskStats:
    """Counters describing the tasks of one run, plus derived percentages."""

    # Task counts by outcome; `total` is the expected number of tasks
    completed: int = 0
    running: int = 0
    failed: int = 0
    judge_correct: int = 0
    total: int = 0

    # Paths of completed task files, kept for timing analysis.
    # Defaults to None and is replaced by a fresh list in __post_init__.
    completed_files: List[str] = None

    # Sum of turns, and how many completed tasks contributed to that sum
    total_turns: int = 0
    completed_tasks_with_turns: int = 0

    # Completed tasks whose answer reported missing boxed content
    no_boxed_found: int = 0

    def __post_init__(self):
        # Give every instance its own list rather than sharing a default
        if self.completed_files is None:
            self.completed_files = []

    @property
    def judge_accuracy(self) -> float:
        """Percentage of completed tasks judged correct (0.0 if none completed)."""
        if self.completed > 0:
            return self.judge_correct / self.completed * 100
        return 0.0

    @property
    def completion_rate(self) -> float:
        """Percentage of expected tasks that completed (0.0 if total is zero)."""
        if self.total > 0:
            return self.completed / self.total * 100
        return 0.0

    @property
    def average_turns(self) -> float:
        """Mean turns over completed tasks that reported turns (0.0 if none)."""
        if self.completed_tasks_with_turns > 0:
            return self.total_turns / self.completed_tasks_with_turns
        return 0.0


@dataclass
class GAIATaskStats(TaskStats):
    """Run statistics extended with per-difficulty-level counters."""

    # Completed / judged-correct counts for each difficulty level
    level1_completed: int = 0
    level1_correct: int = 0
    level2_completed: int = 0
    level2_correct: int = 0
    level3_completed: int = 0
    level3_correct: int = 0

    @property
    def level1_accuracy(self) -> float:
        """Percentage of completed Level 1 tasks judged correct (0.0 if none)."""
        if self.level1_completed > 0:
            return self.level1_correct / self.level1_completed * 100
        return 0.0

    @property
    def level2_accuracy(self) -> float:
        """Percentage of completed Level 2 tasks judged correct (0.0 if none)."""
        if self.level2_completed > 0:
            return self.level2_correct / self.level2_completed * 100
        return 0.0

    @property
    def level3_accuracy(self) -> float:
        """Percentage of completed Level 3 tasks judged correct (0.0 if none)."""
        if self.level3_completed > 0:
            return self.level3_correct / self.level3_completed * 100
        return 0.0


@dataclass
class SummaryStats:
    """Totals accumulated over every analysed run, plus derived percentages."""

    total_tasks: int = 0
    total_completed: int = 0
    total_running: int = 0
    total_failed: int = 0
    total_judge_correct: int = 0
    total_no_boxed_found: int = 0

    @property
    def total_judge_accuracy(self) -> float:
        """Percentage of all completed tasks judged correct (0.0 if none)."""
        if self.total_completed > 0:
            return self.total_judge_correct / self.total_completed * 100
        return 0.0

    def average_run_accuracy(
        self, run_stats_list: List[Tuple[str, TaskStats]]
    ) -> Tuple[float, float]:
        """Return the overall accuracy and the spread of per-run accuracies.

        The first element is ``total_judge_accuracy`` (i.e. weighted by each
        run's completed count), not the mean of the per-run values. The second
        is the standard deviation over runs with at least one completed task.
        An empty ``run_stats_list`` yields ``(0.0, 0.0)``.
        """
        if not run_stats_list:
            return 0.0, 0.0

        overall = self.total_judge_accuracy

        # Runs without any completed task have no meaningful accuracy
        per_run = [
            run.judge_accuracy for _name, run in run_stats_list if run.completed > 0
        ]
        if not per_run:
            return overall, 0.0

        spread = calculate_mean_and_std(per_run)[1]
        return overall, spread

    @property
    def total_completion_rate(self) -> float:
        """Percentage of all expected tasks that completed (0.0 if none expected)."""
        if self.total_tasks > 0:
            return self.total_completed / self.total_tasks * 100
        return 0.0


@dataclass
class GAIASummaryStats(SummaryStats):
    """Cross-run totals extended with per-difficulty-level counters."""

    # Completed / judged-correct totals for each difficulty level
    level1_completed: int = 0
    level1_correct: int = 0
    level2_completed: int = 0
    level2_correct: int = 0
    level3_completed: int = 0
    level3_correct: int = 0

    @property
    def level1_accuracy(self) -> float:
        """Overall Level 1 accuracy in percent (0.0 when nothing completed)."""
        finished = self.level1_completed
        if finished > 0:
            return self.level1_correct / finished * 100
        return 0.0

    @property
    def level2_accuracy(self) -> float:
        """Overall Level 2 accuracy in percent (0.0 when nothing completed)."""
        finished = self.level2_completed
        if finished > 0:
            return self.level2_correct / finished * 100
        return 0.0

    @property
    def level3_accuracy(self) -> float:
        """Overall Level 3 accuracy in percent (0.0 when nothing completed)."""
        finished = self.level3_completed
        if finished > 0:
            return self.level3_correct / finished * 100
        return 0.0


class ProgressChecker:
    """Main class for checking benchmark progress"""

    def __init__(self, target_path: str, task_per_run: int, data_path: str):
        self.target_path = target_path
        self.run_dirs: List[str] = []
        self.total_tasks_per_run = task_per_run

        # Load benchmark data
        self._load_benchmark_data(data_path)

    def _load_benchmark_data(self, data_path) -> None:
        """Load benchmark data and configuration"""
        try:
            # Load benchmark data if available
            if os.path.exists(data_path):
                with open(data_path) as f:
                    benchmark_data = [json.loads(line) for line in f.readlines()]
                print(f"Loaded {len(benchmark_data)} tasks from {data_path}")
        except Exception as e:
            print(f"Warning: Could not load data: {e}")

    def find_run_directories(self) -> List[str]:
        """Find all run directories in the target path"""
        run_dirs = []

        if not os.path.exists(self.target_path):
            raise FileNotFoundError(f"Path '{self.target_path}' does not exist")

        # Check if target_path itself is a run directory
        if os.path.basename(self.target_path).startswith("run_"):
            run_dirs.append(self.target_path)
        else:
            # Find run_* directories under target_path
            try:
                for item in os.listdir(self.target_path):
                    item_path = os.path.join(self.target_path, item)
                    if os.path.isdir(item_path) and item.startswith("run_"):
                        run_dirs.append(item_path)
            except PermissionError:
                raise PermissionError(
                    f"No permission to access directory '{self.target_path}'"
                )

        # Sort by run number
        run_dirs.sort(key=lambda x: self._extract_run_number(x))

        if not run_dirs:
            raise ValueError(f"No run directories found in '{self.target_path}'")

        return run_dirs

    def _extract_run_number(self, path: str) -> int:
        """Extract run number from directory path for sorting"""
        basename = os.path.basename(path)
        parts = basename.split("_")
        if len(parts) > 1 and parts[1].isdigit():
            return int(parts[1])
        return 0

    def _extract_task_id(self, filename: str, task_id_pattern: str) -> Optional[str]:
        """Extract task ID from filename"""
        match = re.match(task_id_pattern, filename)
        return match.group(1) if match else None

    def _get_latest_task_files(self, run_dir: str, task_id_pattern: str) -> List[str]:
        """Get the latest task file for each task ID in a run directory"""
        json_files = glob.glob(os.path.join(run_dir, "task_*.json"))

        if not json_files:
            return []

        # Group by task ID, keep only the latest file for each task
        task_groups: Dict[str, Dict] = {}

        for json_file in json_files:
            filename = os.path.basename(json_file)
            task_id = self._extract_task_id(filename, task_id_pattern)

            if task_id:
                try:
                    # Read the JSON file to get the start_time
                    with open(json_file, "r", encoding="utf-8") as f:
                        data = json.load(f)

                    start_time_str = data.get("start_time", "")
                    if start_time_str:
                        # Parse the ISO format timestamp
                        from datetime import datetime

                        start_time = datetime.fromisoformat(
                            start_time_str.replace("Z", "+00:00")
                        )
                        start_timestamp = start_time.timestamp()
                    else:
                        # Fallback to file modification time if start_time is not available
                        start_timestamp = os.path.getmtime(json_file)

                    if (
                        task_id not in task_groups
                        or start_timestamp > task_groups[task_id]["timestamp"]
                    ):
                        task_groups[task_id] = {
                            "file": json_file,
                            "timestamp": start_timestamp,
                        }
                except (json.JSONDecodeError, ValueError, OSError) as e:
                    # Fallback to file modification time if JSON parsing fails
                    print(f"Warning: Could not parse {json_file}: {e}")
                    file_mtime = os.path.getmtime(json_file)
                    if (
                        task_id not in task_groups
                        or file_mtime > task_groups[task_id]["timestamp"]
                    ):
                        task_groups[task_id] = {
                            "file": json_file,
                            "timestamp": file_mtime,
                        }

        return [info["file"] for info in task_groups.values()]

    def _is_task_completed(self, data: Dict) -> bool:
        """Check if a task is completed based on its data"""
        end_time = data.get("end_time", "")
        error = data.get("error", "")
        status = data.get("status", "")
        final_answer = data.get("final_boxed_answer", "")

        return (
            (end_time != "" and error == "")
            or (status == "completed")
            or (final_answer != "" and error == "")
        )

    def _is_judge_correct(self, judge_result) -> bool:
        """Determine if LLM judge result indicates correct answer"""
        if isinstance(judge_result, bool):
            return judge_result
        elif isinstance(judge_result, str):
            result_str = judge_result.upper()
            return (
                result_str in CORRECT_RESULTS
                or any(pattern in result_str for pattern in SUCCESS_PATTERNS)
                or result_str.lower() in ["true", "1", "yes", "pass"]
            )
        elif isinstance(judge_result, (int, float)):
            return judge_result > 0
        elif isinstance(judge_result, dict):
            return judge_result.get("correct", False) or judge_result.get(
                "is_correct", False
            )
        return False

    def _calculate_turns(self, data: Dict) -> int:
        """Calculate number of turns from task data (excluding system prompt)"""
        try:
            main_agent_history = data.get("main_agent_message_history", {})
            message_history = main_agent_history.get("message_history", [])

            if not message_history:
                return 0

            # Filter out system messages and count total messages, then divide by 2
            # Turn count = (total messages excluding system) / 2
            non_system_messages = [
                msg for msg in message_history if msg.get("role") != "system"
            ]

            # Each turn consists of user + assistant, so divide by 2
            turn_count = len(non_system_messages) // 2

            return turn_count
        except (KeyError, TypeError, IndexError):
            return 0

    def analyze_run_directory(
        self, run_dir: str, task_id_pattern: str
    ) -> Tuple[TaskStats, Dict[str, bool]]:
        """Collect statistics for the latest task files of one run directory.

        Each file is classified as running, completed or failed. Completed
        files additionally contribute to the judge, boxed-answer and turn
        counters. Files that cannot be read or parsed count as failed, except
        for JSON errors that look like an empty or half-written file, which
        are skipped without being counted.

        Returns:
            Tuple[TaskStats, Dict[str, bool]]: The run statistics and a
            mapping of task_id -> is_correct for the completed tasks.
        """
        task_files = self._get_latest_task_files(run_dir, task_id_pattern)

        # The total is the configured task count, not the number of files found
        stats = TaskStats(total=self.total_tasks_per_run)
        finished_paths = []
        correctness_by_task = {}

        for task_path in task_files:
            try:
                with open(task_path, "r", encoding="utf-8") as handle:
                    record = json.load(handle)

                # A "running" status takes precedence over every other signal
                if record.get("status", "") == "running":
                    stats.running += 1
                    continue

                if not self._is_task_completed(record):
                    stats.failed += 1
                    continue

                stats.completed += 1
                finished_paths.append(task_path)

                # A missing judge result counts as not correct
                verdict = record.get("final_judge_result", None)
                answered_correctly = verdict is not None and self._is_judge_correct(
                    verdict
                )
                if answered_correctly:
                    stats.judge_correct += 1

                # Remember the outcome under the task ID taken from the file name
                task_id = self._extract_task_id(
                    os.path.basename(task_path), task_id_pattern
                )
                if task_id:
                    correctness_by_task[task_id] = answered_correctly

                # Count answers that carry the "no boxed content" marker
                boxed_answer = record.get("final_boxed_answer", "")
                if (
                    isinstance(boxed_answer, str)
                    and "No \\boxed{} content found" in boxed_answer
                ):
                    stats.no_boxed_found += 1

                # Only tasks reporting at least one turn enter the turn average
                turn_count = self._calculate_turns(record)
                if turn_count > 0:
                    stats.total_turns += turn_count
                    stats.completed_tasks_with_turns += 1

            except (json.JSONDecodeError, IOError) as err:
                # Empty / partially written files are ignored entirely
                message = str(err)
                if "Expecting value" in message or "line 1 column 1" in message:
                    continue
                print(f"Warning: Could not parse {task_path}: {err}")
                stats.failed += 1
            except Exception as err:
                print(f"Warning: Unexpected error processing {task_path}: {err}")
                stats.failed += 1

        # Hand the completed file paths back for timing analysis
        stats.completed_files = finished_paths
        return stats, correctness_by_task

    def run_analysis(
        self, benchmark_name_std: str, task_id_pattern: str
    ) -> SummaryStats:
        """Analyse every run directory, print the report and return the totals.

        Args:
            benchmark_name_std: Display name of the benchmark, forwarded to
                the summary output.
            task_id_pattern: Regex whose group 1 captures the task ID in a
                task file name.

        Returns:
            The statistics accumulated over all analysed runs.
        """
        self.run_dirs = self.find_run_directories()
        summary = SummaryStats()
        per_run_stats = []  # (run name, stats) pairs in processing order
        timing_files = []  # completed task files of all runs
        results_by_task = {}  # task_id -> one is_correct flag per run

        divider = "=" * 80
        print()
        print(divider)
        print(f"Analyzing benchmark progress for: {self.target_path}")
        print(f"Generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(divider)

        for run_dir in self.run_dirs:
            run_name = os.path.basename(run_dir)
            stats, task_results = self.analyze_run_directory(run_dir, task_id_pattern)

            if stats.total == 0:
                print(f"{run_name}: No task files found")
                print()
                continue

            # One status line per run, assembled from " | "-separated segments
            segments = [
                f"[{run_name}] Completed: {stats.completed}",
                f"Running: {stats.running}",
                f"Failed: {stats.failed}",
            ]
            if stats.completed > 0:
                segments.append(
                    f"Accuracy: {stats.judge_correct}/{stats.completed} ({stats.judge_accuracy:.1f}%)"
                )
                # Shown even while other tasks of the run are still running
                if stats.completed_tasks_with_turns > 0:
                    segments.append(f"Avg Turns: {stats.average_turns:.1f}")

            print(" | ".join(segments))
            print()

            per_run_stats.append((run_name, stats))
            timing_files.extend(stats.completed_files)

            # Gather per-task outcomes across runs for the Pass@n figure
            for task_id, is_correct in task_results.items():
                results_by_task.setdefault(task_id, []).append(is_correct)

            # Fold this run into the overall totals
            summary.total_tasks += stats.total
            summary.total_completed += stats.completed
            summary.total_running += stats.running
            summary.total_failed += stats.failed
            summary.total_judge_correct += stats.judge_correct
            summary.total_no_boxed_found += stats.no_boxed_found

        # The summary is printed once, after every run has been processed
        self._display_summary(
            summary,
            per_run_stats,
            timing_files,
            benchmark_name_std,
            results_by_task,
        )

        return summary

    def _calculate_pass_at_n(
        self, all_task_results: Dict[str, List[bool]], total_tasks: int
    ) -> Tuple[int, float]:
        """Calculate Pass@n: number of tasks with at least one correct answer across all runs

        Returns:
            Tuple[int, float]: (pass_at_n_count, pass_at_n_percentage)
        """
        if not all_task_results or total_tasks == 0:
            return 0, 0.0

        pass_at_n_count = 0
        for task_id, results in all_task_results.items():
            # If at least one run got it correct, this task passes
            if any(results):
                pass_at_n_count += 1

        pass_at_n_percentage = (
            (pass_at_n_count / total_tasks * 100) if total_tasks > 0 else 0.0
        )
        return pass_at_n_count, pass_at_n_percentage

    def _display_summary(
        self,
        summary: SummaryStats,
        run_stats_list: List[Tuple[str, TaskStats]],
        completed_files: List[str],
        benchmark_name_std: str,
        all_task_results: Optional[Dict[str, List[bool]]] = None,
    ) -> None:
        """Print the cross-run summary and then save it to a log file.

        Args:
            summary: Totals accumulated over all runs.
            run_stats_list: (run name, stats) pairs, one per analysed run.
            completed_files: Completed task files of all runs; their
                timestamps drive the elapsed-time and estimate lines.
            benchmark_name_std: Display name, forwarded to the log writer.
            all_task_results: task_id -> per-run correctness flags, used for
                the Pass@n line; the line is omitted when this is empty or None.
        """
        print("=" * 80)
        print("SUMMARY STATISTICS")
        print("=" * 80)
        print(
            f"Total Tasks: {summary.total_tasks} ({summary.total_completed} completed, {summary.total_running} running)"
        )

        # Estimate completion time using overall progress rate
        if summary.total_tasks > 0 and summary.total_completed > 0:
            remaining_tasks = summary.total_tasks - summary.total_completed
            # Each helper re-reads every completed file, and
            # estimate_completion_time() calls both helpers again internally.
            earliest_start = find_earliest_start_time(completed_files)
            latest_end = find_latest_end_time(completed_files)
            completion_estimate = estimate_completion_time(
                summary.total_tasks, summary.total_completed, completed_files
            )

            print(f"Remaining Tasks: {remaining_tasks}")
            if earliest_start:
                # latest_end falls back to the current time when no end_time
                # was found, so the subtraction always has two datetimes.
                elapsed_time = latest_end - earliest_start
                elapsed_minutes = elapsed_time.total_seconds() / 60
                tasks_per_minute = (
                    summary.total_completed / elapsed_minutes
                    if elapsed_minutes > 0
                    else 0
                )
                print(f"Elapsed Time: {elapsed_minutes:.1f} minutes")
                # "Completion Rate" here is a throughput, not a percentage
                print(f"Completion Rate: {tasks_per_minute:.1f} tasks/minute")
            print(f"Estimated Time to Complete: {completion_estimate}")

        if summary.total_completed > 0:
            accuracy_bar = create_progress_bar(summary.total_judge_accuracy)
            print(
                f"Judge Accuracy: {summary.total_judge_correct}/{summary.total_completed} {accuracy_bar}"
            )

            # Calculate and display overall average turns
            total_turns = sum(stats.total_turns for _, stats in run_stats_list)
            total_tasks_with_turns = sum(
                stats.completed_tasks_with_turns for _, stats in run_stats_list
            )
            if total_tasks_with_turns > 0:
                overall_avg_turns = total_turns / total_tasks_with_turns
                print(f"Overall Average Turns: {overall_avg_turns:.1f}")

        # Display each run's correct percentage
        if run_stats_list:
            print()
            print("INDIVIDUAL RUN ACCURACIES:")
            for run_name, stats in run_stats_list:
                if stats.completed > 0:
                    accuracy_bar = create_progress_bar(stats.judge_accuracy)
                    print(
                        f"  {run_name}: {stats.judge_correct}/{stats.completed} {accuracy_bar}"
                    )
                else:
                    print(
                        f"  {run_name}: {stats.judge_correct}/{stats.completed} (N/A)"
                    )

            # Display mean accuracy and standard deviation (Pass@1 Acc (Avg@n))
            num_runs = len(run_stats_list)
            mean_acc, std_acc = summary.average_run_accuracy(run_stats_list)
            # Note: an overall accuracy of exactly 0% suppresses this line
            if mean_acc > 0:
                print()
                if num_runs > 1:
                    print(
                        f"Pass@1 Acc (Avg@{num_runs}): {mean_acc:.1f}% ± {std_acc:.1f}%"
                    )
                else:
                    print(f"MEAN ACCURACY: {mean_acc:.1f}% ± {std_acc:.1f}%")

            # Display Pass@n if multiple runs
            if num_runs > 1 and all_task_results:
                # Calculate total unique tasks (use the first run's total as reference)
                # run_stats_list is known to be non-empty here, so the
                # summary.total_tasks alternative is never taken.
                first_run_total = (
                    run_stats_list[0][1].total
                    if run_stats_list
                    else summary.total_tasks
                )
                pass_at_n_count, pass_at_n_percentage = self._calculate_pass_at_n(
                    all_task_results, first_run_total
                )
                pass_at_n_bar = create_progress_bar(pass_at_n_percentage)
                print(
                    f"Pass@{num_runs}: {pass_at_n_count}/{first_run_total} {pass_at_n_bar}"
                )

            # Display no boxed content found statistics
            if summary.total_completed > 0:
                print(
                    f"No \\boxed{{}} content found: {summary.total_no_boxed_found}/{summary.total_completed} ({summary.total_no_boxed_found / summary.total_completed * 100:.1f}%)"
                )

        print("=" * 80)
        print()

        # Save analysis results to log file
        self._save_analysis_log(
            summary,
            run_stats_list,
            completed_files,
            benchmark_name_std,
            all_task_results,
        )

    def _save_analysis_log(
        self,
        summary: SummaryStats,
        run_stats_list: List[Tuple[str, TaskStats]],
        completed_files: List[str],
        benchmark_name_std: str,
        all_task_results: Dict[str, List[bool]] = None,
    ) -> None:
        """Save the analysis report as a text log file in the target directory.

        The file is named ``<LOG_FILE_PREFIX><timestamp>.log`` and written to
        ``self.target_path``. Saving is best-effort: any exception is reported
        as a warning on stdout and is never propagated to the caller.

        Args:
            summary: Aggregated statistics across all analyzed runs.
            run_stats_list: ``(run_name, stats)`` pairs, one per analyzed run.
            completed_files: Files of completed tasks; passed to the timing
                helpers to derive elapsed time and the completion estimate.
            benchmark_name_std: Benchmark name written into the report header.
            all_task_results: Optional mapping of task id to its per-run
                correctness flags. The Pass@n line is only written when this
                is non-empty and there is more than one run.
        """
        output_buffer = StringIO()
        write = output_buffer.write
        separator = "=" * 80 + "\n"

        try:
            # Take a single timestamp so the file name and the header agree.
            generated_at = datetime.now()
            timestamp = generated_at.strftime(LOG_FILE_TIMESTAMP_FORMAT)
            log_filename = f"{LOG_FILE_PREFIX}{timestamp}.log"
            log_path = os.path.join(self.target_path, log_filename)

            # Write header
            write(separator)
            write(f"{benchmark_name_std} Progress Analysis\n")
            write(f"Generated at: {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n")
            write(f"Target Path: {self.target_path}\n")
            write(separator + "\n")

            # Write run statistics
            for run_name, stats in run_stats_list:
                write(
                    f"{run_name}: Status: {stats.completed} completed, {stats.running} running, {stats.failed} failed\n"
                )
                if stats.completed > 0:
                    accuracy = stats.judge_correct / stats.completed * 100
                    write(
                        f"  Overall Accuracy: {stats.judge_correct}/{stats.completed} ({accuracy:.1f}%)\n"
                    )
                else:
                    write(
                        f"  Overall Accuracy: {stats.judge_correct}/{stats.completed} (N/A)\n"
                    )
                write("\n")

            # Write summary statistics
            write(separator)
            write("SUMMARY STATISTICS\n")
            write(separator)
            write(
                f"Total Tasks: {summary.total_tasks} ({summary.total_completed} completed, {summary.total_running} running)\n"
            )

            # Write timing information
            if summary.total_tasks > 0 and summary.total_completed > 0:
                remaining_tasks = summary.total_tasks - summary.total_completed
                earliest_start = find_earliest_start_time(completed_files)
                latest_end = find_latest_end_time(completed_files)
                completion_estimate = estimate_completion_time(
                    summary.total_tasks, summary.total_completed, completed_files
                )

                write(f"Remaining Tasks: {remaining_tasks}\n")
                # Both endpoints are required for the subtraction; if either
                # is missing, skip the elapsed-time lines instead of letting a
                # TypeError discard the whole log.
                if earliest_start and latest_end:
                    elapsed_time = latest_end - earliest_start
                    elapsed_minutes = elapsed_time.total_seconds() / 60
                    tasks_per_minute = (
                        summary.total_completed / elapsed_minutes
                        if elapsed_minutes > 0
                        else 0
                    )
                    write(f"Elapsed Time: {elapsed_minutes:.1f} minutes\n")
                    write(f"Completion Rate: {tasks_per_minute:.1f} tasks/minute\n")
                write(f"Estimated Time to Complete: {completion_estimate}\n")

            if summary.total_completed > 0:
                accuracy = summary.total_judge_correct / summary.total_completed * 100
                write(
                    f"Judge Accuracy: {summary.total_judge_correct}/{summary.total_completed} ({accuracy:.1f}%)\n"
                )
                no_boxed_percentage = (
                    summary.total_no_boxed_found / summary.total_completed * 100
                )
                # This is the only place the statistic is written; it used to
                # be repeated after the Pass@n line, duplicating it in the log.
                write(
                    f"No \\boxed{{}} content found: {summary.total_no_boxed_found}/{summary.total_completed} ({no_boxed_percentage:.1f}%)\n"
                )

            # Write individual run accuracies
            if run_stats_list:
                write("\nINDIVIDUAL RUN ACCURACIES:\n")
                for run_name, stats in run_stats_list:
                    if stats.completed > 0:
                        accuracy = stats.judge_correct / stats.completed * 100
                        write(
                            f"  {run_name}: {stats.judge_correct}/{stats.completed} ({accuracy:.1f}%)\n"
                        )
                    else:
                        write(
                            f"  {run_name}: {stats.judge_correct}/{stats.completed} (N/A)\n"
                        )

                # Write mean accuracy and standard deviation (Pass@1 Acc (Avg@n))
                num_runs = len(run_stats_list)
                mean_acc, std_acc = summary.average_run_accuracy(run_stats_list)
                if mean_acc > 0:
                    if num_runs > 1:
                        write(
                            f"\nPass@1 Acc (Avg@{num_runs}): {mean_acc:.1f}% ± {std_acc:.1f}%\n"
                        )
                    else:
                        write(f"\nMEAN ACCURACY: {mean_acc:.1f}% ± {std_acc:.1f}%\n")

                # Write Pass@n if multiple runs
                if num_runs > 1 and all_task_results:
                    # run_stats_list is non-empty here, so the first run's
                    # total is always available as the reference denominator.
                    first_run_total = run_stats_list[0][1].total
                    pass_at_n_count, pass_at_n_percentage = self._calculate_pass_at_n(
                        all_task_results, first_run_total
                    )
                    write(
                        f"Pass@{num_runs}: {pass_at_n_count}/{first_run_total} ({pass_at_n_percentage:.1f}%)\n"
                    )

            write(separator)

            # Write to file
            with open(log_path, "w", encoding="utf-8") as f:
                f.write(output_buffer.getvalue())

            print(f"Analysis results saved to: {log_path}")

        except Exception as e:
            # Best-effort: a failed log write must not abort the analysis.
            print(f"Warning: Could not save analysis log: {e}")
        finally:
            # Release the buffer on both the success and the error path.
            output_buffer.close()


class GAIAProgressChecker(ProgressChecker):
    """Progress checker for the GAIA benchmark.

    Extends ``ProgressChecker`` with per-difficulty-level accuracy tracking.
    The difficulty of each task is read from the ``Level``/``level`` entry of
    the task metadata in the benchmark JSONL file.
    """

    DIFFICULTY_LEVELS = [1, 2, 3]

    def __init__(self, target_path: str, task_per_run: int, data_path: str):
        """Create the checker and load the task difficulty mapping.

        Args:
            target_path: Directory containing the run directories to analyze.
            task_per_run: Expected number of tasks in a single run; used to
                compute the remaining-task count and the completion estimate.
            data_path: Path to the benchmark JSONL file with task metadata.
        """
        # Call the parent class constructor.
        super().__init__(target_path, task_per_run=0, data_path="")

        # Difficulty level mapping
        self.task_difficulty_map: Dict[str, int] = {}
        self.total_tasks_per_run = task_per_run

        # Load GAIA data if this is a GAIA validation directory
        self._load_benchmark_data(data_path)

    @staticmethod
    def _coerce_difficulty_level(raw_level):
        """Return ``raw_level`` as an int when it is a numeric string.

        Non-string values are returned unchanged so they keep their original
        membership semantics; unparsable strings map to ``0`` (unknown level).
        """
        if isinstance(raw_level, str):
            try:
                return int(raw_level.strip())
            except ValueError:
                return 0
        return raw_level

    def _count_tasks_per_level(self) -> Dict[int, int]:
        """Count the tasks in ``task_difficulty_map`` for each difficulty level."""
        counts = {level: 0 for level in self.DIFFICULTY_LEVELS}
        for level in self.task_difficulty_map.values():
            counts[level] = counts.get(level, 0) + 1
        return counts

    def _load_benchmark_data(self, data_path) -> None:
        """Load GAIA-specific data and fill ``task_difficulty_map``.

        Each non-blank line of ``data_path`` is parsed as one JSON object with
        a ``task_id`` and an optional ``metadata`` mapping. Only tasks whose
        level is one of ``DIFFICULTY_LEVELS`` are recorded. Loading is
        best-effort: a missing file is ignored silently and any other error
        is reported as a warning on stdout.

        Args:
            data_path: Path to the benchmark JSONL file.
        """
        try:
            if not os.path.exists(data_path):
                return

            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(data_path, encoding="utf-8") as f:
                # Skip blank lines so a stray empty line does not abort the
                # whole load via the broad handler below.
                benchmark_data = [json.loads(line) for line in f if line.strip()]

            print(f"Loaded {len(benchmark_data)} tasks from {data_path}")

            for record in benchmark_data:
                task_id = record["task_id"]
                # "metadata" may be present but null.
                metadata = record.get("metadata") or {}
                difficulty_level = self._coerce_difficulty_level(
                    metadata.get("Level") or metadata.get("level") or 0
                )
                if difficulty_level in self.DIFFICULTY_LEVELS:
                    self.task_difficulty_map[task_id] = difficulty_level

            level_counts = self._count_tasks_per_level()
            print(f"Difficulty level distribution: {level_counts}")

        except Exception as e:
            print(f"Warning: Could not load GAIA data: {e}")

    def _update_difficulty_stats(
        self, stats: GAIATaskStats, task_id: str, is_correct: bool
    ) -> None:
        """Update the per-level counters of ``stats`` for one completed task.

        Tasks that are not present in ``task_difficulty_map`` are ignored.

        Args:
            stats: Run statistics to update in place.
            task_id: Identifier of the completed task.
            is_correct: Whether the judge marked the task as correct.
        """
        if task_id not in self.task_difficulty_map:
            return
        difficulty_level = self.task_difficulty_map[task_id]
        if difficulty_level == 1:
            stats.level1_completed += 1
            if is_correct:
                stats.level1_correct += 1
        elif difficulty_level == 2:
            stats.level2_completed += 1
            if is_correct:
                stats.level2_correct += 1
        elif difficulty_level == 3:
            stats.level3_completed += 1
            if is_correct:
                stats.level3_correct += 1

    def analyze_run_directory(
        self, run_dir: str, task_id_pattern: str
    ) -> Tuple[GAIATaskStats, Dict[str, bool]]:
        """Analyze a single run directory and return statistics (GAIA-specific)

        Args:
            run_dir: Path of the run directory to scan.
            task_id_pattern: Pattern forwarded to the task-file lookup and the
                task-id extraction helpers.

        Returns:
            Tuple[GAIATaskStats, Dict[str, bool]]: Statistics and a mapping of task_id -> is_correct
        """
        # Use the parent class implementation directly.
        latest_files = self._get_latest_task_files(run_dir, task_id_pattern)
        stats = GAIATaskStats(total=len(latest_files))
        completed_files = []
        task_results = {}  # Track task_id -> is_correct mapping

        for json_file in latest_files:
            try:
                with open(json_file, "r", encoding="utf-8") as f:
                    data = json.load(f)

                status = data.get("status", "")
                if status == "running":
                    stats.running += 1
                elif self._is_task_completed(data):
                    stats.completed += 1
                    completed_files.append(json_file)

                    judge_result = data.get("final_judge_result", None)
                    is_correct = judge_result is not None and self._is_judge_correct(
                        judge_result
                    )
                    if is_correct:
                        stats.judge_correct += 1

                    # Check if final_boxed_answer contains "No \\boxed{} content found"
                    final_boxed_answer = data.get("final_boxed_answer", "")
                    if (
                        isinstance(final_boxed_answer, str)
                        and "No \\boxed{} content found" in final_boxed_answer
                    ):
                        stats.no_boxed_found += 1

                    task_id = self._extract_task_id(
                        os.path.basename(json_file), task_id_pattern
                    )
                    if task_id:
                        self._update_difficulty_stats(stats, task_id, is_correct)
                        task_results[task_id] = is_correct

                    # Calculate turns for completed tasks
                    turns = self._calculate_turns(data)
                    if turns > 0:
                        stats.total_turns += turns
                        stats.completed_tasks_with_turns += 1
                else:
                    stats.failed += 1
            except Exception as e:
                # An unreadable or malformed task file counts as failed.
                print(f"Warning: Could not process {json_file}: {e}")
                stats.failed += 1

        stats.completed_files = completed_files
        return stats, task_results

    def run_analysis(
        self, benchmark_name_std: str, task_id_pattern: str
    ) -> GAIASummaryStats:
        """Run the complete analysis and return summary statistics

        Args:
            benchmark_name_std: Benchmark name forwarded to the summary display.
            task_id_pattern: Pattern forwarded to ``analyze_run_directory``.

        Returns:
            GAIASummaryStats: Totals accumulated over all runs that contain
            at least one task file.
        """
        self.run_dirs = self.find_run_directories()
        summary = GAIASummaryStats()
        run_stats_list = []  # Store statistics for each run
        all_completed_files = []  # Collect all completed files for timing analysis
        all_task_results = {}  # Collect task_id -> list of is_correct across all runs

        print()
        print("=" * 80)
        print(f"Analyzing benchmark progress for: {self.target_path}")
        print(f"Generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print("=" * 80)

        # Analyze each run directory
        for run_dir in self.run_dirs:
            run_name = os.path.basename(run_dir)
            stats, task_results = self.analyze_run_directory(run_dir, task_id_pattern)

            if stats.total == 0:
                print(f"{run_name}: No task files found")
                print()
                continue

            # Display run statistics in a single line
            run_info = f"[{run_name}] Completed: {stats.completed} | Running: {stats.running} | Failed: {stats.failed}"

            # Add accuracy information
            if stats.completed > 0:
                run_info += f" | Accuracy: {stats.judge_correct}/{stats.completed} ({stats.judge_accuracy:.1f}%)"

                # Add average turns information (show even if some tasks are still running)
                if stats.completed_tasks_with_turns > 0:
                    run_info += f" | Avg Turns: {stats.average_turns:.1f}"

            print(run_info)
            print()

            # Store run statistics for later display
            run_stats_list.append((run_name, stats))

            # Collect completed files for timing analysis
            all_completed_files.extend(stats.completed_files)

            # Collect task results for Pass@n calculation
            for task_id, is_correct in task_results.items():
                all_task_results.setdefault(task_id, []).append(is_correct)

            # Update summary statistics
            self._update_summary_stats(summary, stats)

        # Display summary after all runs are processed
        self._display_summary(
            summary,
            run_stats_list,
            all_completed_files,
            benchmark_name_std,
            all_task_results,
        )

        return summary

    def _update_summary_stats(
        self, summary: GAIASummaryStats, stats: GAIATaskStats
    ) -> None:
        """Add the counters of a single run to the overall summary.

        Args:
            summary: Summary to update in place.
            stats: Statistics of the run being merged.
        """
        summary.total_tasks += stats.total
        summary.total_completed += stats.completed
        summary.total_running += stats.running
        summary.total_failed += stats.failed
        summary.total_judge_correct += stats.judge_correct
        summary.total_no_boxed_found += stats.no_boxed_found

        # Update difficulty level summary stats
        summary.level1_completed += stats.level1_completed
        summary.level1_correct += stats.level1_correct
        summary.level2_completed += stats.level2_completed
        summary.level2_correct += stats.level2_correct
        summary.level3_completed += stats.level3_completed
        summary.level3_correct += stats.level3_correct

    def _display_summary(
        self,
        summary: GAIASummaryStats,
        run_stats_list: List[Tuple[str, GAIATaskStats]],
        completed_files: List[str],
        benchmark_name_std: str,
        all_task_results: Dict[str, List[bool]] = None,
    ):
        """Print the summary report, including per-difficulty-level results.

        Args:
            summary: Aggregated statistics across all runs.
            run_stats_list: ``(run_name, stats)`` pairs, one per analyzed run.
            completed_files: Files of completed tasks, used for timing.
            benchmark_name_std: Benchmark name (not used by this method).
            all_task_results: Optional mapping of task id to its per-run
                correctness flags; needed for the Pass@n line.
        """
        # NOTE(review): this override only prints; it does not call
        # ``_save_analysis_log`` - confirm whether a log file is expected
        # for GAIA runs.
        print("=" * 80)
        print("SUMMARY STATISTICS")
        print("=" * 80)

        # Expected number of tasks per difficulty level. The mapping does not
        # change while printing, so it is computed once for all sections.
        level_totals = self._count_tasks_per_level()
        total_level1 = level_totals.get(1, 0)
        total_level2 = level_totals.get(2, 0)
        total_level3 = level_totals.get(3, 0)

        # Estimate completion time using overall progress rate
        if summary.total_completed > 0:
            num_runs = len(run_stats_list) if run_stats_list else 1
            expected_total_tasks = self.total_tasks_per_run * num_runs
            remaining_tasks = expected_total_tasks - summary.total_completed
            earliest_start = find_earliest_start_time(completed_files)
            last_end = find_latest_end_time(completed_files)
            completion_estimate = estimate_completion_time(
                expected_total_tasks, summary.total_completed, completed_files
            )

            print(
                f"Current Tasks: {summary.total_tasks} ({summary.total_completed} completed, {summary.total_running} running)"
            )
            print(f"Remaining Tasks to Complete: {remaining_tasks}")
            # Both endpoints are required for the subtraction; a missing end
            # time would otherwise raise and abort the whole report.
            if earliest_start and last_end:
                elapsed_time = last_end - earliest_start
                elapsed_minutes = elapsed_time.total_seconds() / 60
                overall_rate = (
                    summary.total_completed / elapsed_minutes
                    if elapsed_minutes > 0
                    else 0
                )
                print(f"Elapsed Time: {elapsed_minutes:.1f} minutes")
                print(f"Completion Rate: {overall_rate:.2f} tasks/minute")

            print(f"Estimated Time to Complete: {completion_estimate}")

        # Display each run's correct percentage
        if run_stats_list:
            print()
            print("INDIVIDUAL RUN ACCURACIES:")
            for run_name, stats in run_stats_list:
                if stats.completed > 0:
                    accuracy_bar = create_progress_bar(stats.judge_accuracy)
                    print(
                        f"  {run_name}: {stats.judge_correct}/{stats.completed} {accuracy_bar}"
                    )

                    # Add difficulty level information for each run
                    if (
                        stats.level1_completed > 0
                        or stats.level2_completed > 0
                        or stats.level3_completed > 0
                    ):
                        # Format per level: correct/completed/expected (accuracy)
                        difficulty_info = (
                            f"    L1: {stats.level1_correct}/{stats.level1_completed}/{total_level1} ({stats.level1_accuracy:.1f}%) | "
                            f"L2: {stats.level2_correct}/{stats.level2_completed}/{total_level2} ({stats.level2_accuracy:.1f}%) | "
                            f"L3: {stats.level3_correct}/{stats.level3_completed}/{total_level3} ({stats.level3_accuracy:.1f}%)"
                        )
                        print(f"    {difficulty_info}")
                        print()
                else:
                    print(
                        f"  {run_name}: {stats.judge_correct}/{stats.completed} (N/A)"
                    )

            # Display mean accuracy and standard deviation (Pass@1 Acc (Avg@n))
            num_runs = len(run_stats_list)
            mean_acc, std_acc = summary.average_run_accuracy(run_stats_list)
            if mean_acc > 0:
                print()
                if num_runs > 1:
                    print(
                        f"Pass@1 Acc (Avg@{num_runs}): {mean_acc:.1f}% ± {std_acc:.1f}%"
                    )
                else:
                    print(f"MEAN ACCURACY: {mean_acc:.1f}% ± {std_acc:.1f}%")

            # Display Pass@n if multiple runs
            if num_runs > 1 and all_task_results:
                # Use the first run's total as reference; run_stats_list is
                # non-empty in this branch.
                first_run_total = run_stats_list[0][1].total
                pass_at_n_count, pass_at_n_percentage = self._calculate_pass_at_n(
                    all_task_results, first_run_total
                )
                pass_at_n_bar = create_progress_bar(pass_at_n_percentage)
                print(
                    f"Pass@{num_runs}: {pass_at_n_count}/{first_run_total} {pass_at_n_bar}"
                )

            # Display no boxed content found statistics
            if summary.total_completed > 0:
                print(
                    f"No \\boxed{{}} content found: {summary.total_no_boxed_found}/{summary.total_completed} ({summary.total_no_boxed_found / summary.total_completed * 100:.1f}%)"
                )

        # Display overall judge accuracy after individual runs
        if summary.total_completed > 0:
            print()
            accuracy_bar = create_progress_bar(summary.total_judge_accuracy)
            print(
                f"OVERALL JUDGE ACCURACY: {summary.total_judge_correct}/{summary.total_completed} {accuracy_bar}"
            )

            # Calculate and display overall average turns
            total_turns = sum(stats.total_turns for _, stats in run_stats_list)
            total_tasks_with_turns = sum(
                stats.completed_tasks_with_turns for _, stats in run_stats_list
            )
            if total_tasks_with_turns > 0:
                overall_avg_turns = total_turns / total_tasks_with_turns
                print(f"OVERALL AVERAGE TURNS: {overall_avg_turns:.1f}")

        # Display difficulty level summary if available
        if (
            summary.level1_completed > 0
            or summary.level2_completed > 0
            or summary.level3_completed > 0
        ):
            print()
            print("DIFFICULTY LEVEL SUMMARY:")
            print(
                f"  L1: {summary.level1_correct}/{summary.level1_completed}/{total_level1} ({summary.level1_accuracy:.1f}%) | L2: {summary.level2_correct}/{summary.level2_completed}/{total_level2} ({summary.level2_accuracy:.1f}%) | L3: {summary.level3_correct}/{summary.level3_completed}/{total_level3} ({summary.level3_accuracy:.1f}%)"
            )

        print("=" * 80)
        print()


================================================
FILE: apps/miroflow-agent/benchmarks/common_benchmark.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import asyncio
import gc
import json
import os
import random
import re
from abc import ABC
from concurrent.futures import ProcessPoolExecutor
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import hydra

# Import from the new modular structure
from evaluators.eval_utils import verify_answer_for_datasets
from omegaconf import DictConfig, OmegaConf
from src.core.pipeline import (
    create_pipeline_components,
    execute_task_pipeline,
)
from src.logging.summary_time_cost import generate_summary
from src.utils.prompt_utils import (
    FAILURE_EXPERIENCE_FOOTER,
    FAILURE_EXPERIENCE_HEADER,
    FAILURE_EXPERIENCE_ITEM,
    FORMAT_ERROR_MESSAGE,
)


def _task_worker(task_dict, cfg_dict, evaluator_kwargs):
    """
    Execute one benchmark task inside a worker process.

    Everything is passed in as plain dicts and rebuilt locally: the config
    via ``OmegaConf.create``, the task as a ``BenchmarkTask`` and a fresh
    ``GenericEvaluator``. The task then runs on a private event loop and the
    result is returned as a dict (``asdict``) so it can be sent back to the
    parent process. Must stay at module level so ProcessPoolExecutor can
    reference it.
    """
    import asyncio

    from omegaconf import OmegaConf

    def _ignore_loop_errors(_loop, _context):
        # Swallow asyncio's internal reports (e.g. "Task exception was never
        # retrieved") to keep the worker output clean.
        return None

    # Rebuild the config object for this process
    config = OmegaConf.create(cfg_dict)

    # Rebuild the task from its dict form
    benchmark_task = BenchmarkTask(
        task_id=task_dict["task_id"],
        task_question=task_dict["task_question"],
        ground_truth=task_dict["ground_truth"],
        file_path=task_dict.get("file_path"),
        metadata=task_dict.get("metadata", {}),
    )

    # Each worker process builds its own evaluator
    optional = evaluator_kwargs.get
    worker_evaluator = GenericEvaluator(
        data_dir=evaluator_kwargs["data_dir"],
        benchmark_name=evaluator_kwargs["benchmark_name"],
        cfg=config,
        metadata_file=optional("metadata_file", "metadata.jsonl"),
        task_id_field=optional("task_id_field", "task_id"),
        question_field=optional("question_field", "task_question"),
        ground_truth_field=optional("ground_truth_field", "ground_truth"),
        file_name_field=optional("file_name_field"),
    )

    # Drive the coroutine on a dedicated event loop
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    event_loop.set_exception_handler(_ignore_loop_errors)

    try:
        outcome = event_loop.run_until_complete(
            worker_evaluator.run_single_task(benchmark_task)
        )
        # Dataclass -> dict so the result can be serialized
        return asdict(outcome)
    finally:
        event_loop.close()


@dataclass
class BenchmarkTask:
    """Generic benchmark task data structure.

    Holds the question, the reference answer and the mutable per-task state
    (model answer and status) for one benchmark item.
    """

    # Identifier of the task
    task_id: str
    # Question text of the task
    task_question: str
    # Reference answer
    ground_truth: str
    # Optional path associated with the task
    # (presumably an attached input file - TODO confirm)
    file_path: Optional[str] = None
    # Free-form extra data; each instance gets its own dict
    metadata: Dict[str, Any] = field(default_factory=dict)
    # Model answer; empty by default
    model_boxed_answer: str = ""
    status: str = "pending"  # pending, success, failed


@dataclass
class BenchmarkResult:
    """Generic benchmark evaluation result structure.

    The first five fields have no default and must be supplied; the
    remaining fields default to empty/``None`` values.
    """

    # Task description fields (same names as on BenchmarkTask)
    task_id: str
    task_question: str
    ground_truth: str
    file_path: Optional[str]
    # Status string of the task
    status: str
    # Model answer; empty by default
    model_boxed_answer: str = ""
    # Free-form extra data; each instance gets its own dict
    metadata: Dict[str, Any] = field(default_factory=dict)
    # Error description; empty by default
    error_message: str = ""
    # Judge verdict and judge kind; None by default
    final_judge_result: Optional[str] = None
    judge_type: Optional[str] = None
    # Path of the task log file; None by default
    log_file_path: Optional[str] = None
    # Pass@K support fields
    attempts: List[Dict[str, Any]] = field(default_factory=list)  # Store all attempts
    pass_at_k_success: bool = False  # Whether task passed using pass@k evaluation
    k_value: int = 1  # The k value used for this evaluation


class BenchmarkEvaluator(ABC):
    """Abstract base class for benchmark evaluators"""

    def __init__(self, data_dir: str, benchmark_name: str, cfg: DictConfig):
        """
        Set up shared evaluator state and build the pipeline components

        Args:
            data_dir: Path to benchmark data directory
            benchmark_name: Name of the benchmark
            cfg: The Hydra configuration object
        """
        self.data_dir = Path(data_dir)
        self.benchmark_name = benchmark_name
        self.cfg = cfg

        # Number of attempts allowed per task; defaults to a single attempt
        execution_cfg = cfg.benchmark.execution
        self.pass_at_k = execution_cfg.get("pass_at_k", 1)

        # Containers for loaded tasks and collected results
        self.tasks: List[BenchmarkTask] = []
        self.results: List[BenchmarkResult] = []

        # Upper bound on format-error retries; lives in the agent config
        # because it belongs to context management
        agent_cfg = cfg.agent
        self.context_compress_limit = agent_cfg.get("context_compress_limit", 0)

        # LLM provider and model name come straight from the config
        llm_cfg = cfg.llm
        self.llm_provider = llm_cfg.provider
        self.llm_model = llm_cfg.model_name

        # Build the tool managers and output formatter once per evaluator
        print("Initializing pipeline components...")
        components = create_pipeline_components(cfg)
        (
            self.main_agent_tool_manager,
            self.sub_agent_tool_managers,
            self.output_formatter,
        ) = components
        print(
            f"Pipeline components initialized successfully! Using pass@{self.pass_at_k}"
        )

    def get_log_dir(self) -> Path:
        """Return the run directory reported by the active Hydra config as a Path."""
        hydra_cfg = hydra.core.hydra_config.HydraConfig.get()
        run_dir = hydra_cfg.run.dir
        return Path(run_dir)

    async def run_single_task(self, task: BenchmarkTask) -> BenchmarkResult:
        """
        Run inference for a single benchmark task with pass@k support

        For each attempt from 1 to ``self.pass_at_k`` the method:

        1. Looks in the log directory for an existing log of that attempt and,
           if one is found, reuses its answer and judge result.
        2. Runs ``execute_task_pipeline`` when there is no usable answer (empty
           or equal to ``FORMAT_ERROR_MESSAGE``), retrying on format errors up
           to ``self.context_compress_limit`` times and feeding the collected
           failure summaries back into the task description.
        3. Verifies the answer with ``verify_answer_for_datasets`` when a ground
           truth is present and no judge result is cached, then writes the
           verdict back to the attempt's log file.

        The loop stops early once an attempt is judged "CORRECT".

        Exceptions raised while processing are caught: the result is marked
        "failed" with the error message, and the result is still returned.

        Args:
            task: BenchmarkTask object

        Returns:
            BenchmarkResult object with one entry in ``attempts`` per attempt
            that was processed
        """
        print(f"Processing task {task.task_id} with pass@{self.pass_at_k}")

        result = BenchmarkResult(
            task_id=task.task_id,
            task_question=task.task_question,
            ground_truth=task.ground_truth,
            file_path=task.file_path,
            model_boxed_answer="",
            status="pending",
            metadata=task.metadata.copy(),
            k_value=self.pass_at_k,
        )

        logs_dir = self.get_log_dir()
        found_correct_answer = False

        # Print debug info about log directory
        print(f"  Current log directory: {logs_dir}")

        try:
            # Prepare task
            task_description, task_file_path = self.prepare_task_description(task)

            # Run up to k attempts (with early stopping when correct answer found)
            for attempt in range(1, self.pass_at_k + 1):
                print(f"  Attempt {attempt}/{self.pass_at_k} for task {task.task_id}")
                # Retry index used for the next pipeline run of this attempt;
                # may be advanced below from an existing log file
                format_retry_count = 0

                # Check if log file exists for this specific attempt in current directory
                log_pattern = f"task_{task.task_id}_attempt-{attempt}_*.json"
                matching_logs = []

                # Search only in current log directory
                if logs_dir.exists():
                    dir_logs = sorted(list(logs_dir.glob(log_pattern)))
                    if dir_logs:
                        matching_logs.extend(dir_logs)

                if matching_logs:
                    # Sort by timestamp in filename to get the most recent
                    def extract_timestamp(file_path):
                        filename = file_path.name
                        # Extract timestamp from filename like: task_xxx_attempt-1_format-retry-0_2025-08-13-10-13-20.json
                        # The timestamp is the last part before .json
                        if "_" in filename and filename.endswith(".json"):
                            timestamp_part = filename.split("_")[-1].replace(
                                ".json", ""
                            )
                            # Convert timestamp to datetime for proper sorting
                            from datetime import datetime

                            # strptime raises ValueError when the last "_" segment
                            # is not in this format; that propagates to the outer
                            # except and fails the whole task
                            return datetime.strptime(
                                timestamp_part, "%Y-%m-%d-%H-%M-%S"
                            )
                        # Every match of log_pattern contains "_" and ends with
                        # ".json", so this fallback is not expected to be reached
                        return filename

                    matching_logs = sorted(matching_logs, key=extract_timestamp)

                # Per-attempt record; appended to result.attempts at the end of
                # the iteration ("error_message" / "eval_details" keys are added
                # only when applicable)
                attempt_result = {
                    "attempt_number": attempt,
                    "model_boxed_answer": "",
                    "status": "pending",
                    "log_file_path": None,
                    "final_judge_result": None,
                    "judge_type": None,
                    "is_correct": False,
                }

                # Try to load existing result for this attempt
                if matching_logs:
                    # Latest log by timestamp (list is sorted ascending)
                    log_file = matching_logs[-1]
                    attempt_result["log_file_path"] = str(log_file)
                    print(
                        f"    Found existing log for attempt {attempt}: {log_file.name}"
                    )

                    # Recover the retry index encoded in the file name
                    # ("..._format-retry-<N>_...")
                    match = re.search(r"retry-(\d+)", os.path.basename(str(log_file)))
                    if match:
                        format_retry_count = int(match.group(1))
                    else:
                        # Raised inside the outer try, so it is caught below and
                        # marks the whole task as failed
                        raise ValueError(
                            f"Failed to extract retry number from log file: {log_file}"
                        )

                    try:
                        with open(log_file) as f:
                            log_data = json.loads(f.read())
                            # A logged status of "success" bumps the counter, so any
                            # new pipeline run uses the next retry index and the
                            # recovery loop below also reads this log
                            if log_data.get("status") == "success":
                                format_retry_count += 1
                            # Answer and status are only taken over when the log
                            # holds a non-empty final answer
                            if log_data.get("final_boxed_answer"):
                                attempt_result["model_boxed_answer"] = log_data[
                                    "final_boxed_answer"
                                ]
                                attempt_result["status"] = log_data.get("status")
                                # Check if we already have judge result in log
                                if log_data.get("final_judge_result"):
                                    attempt_result["final_judge_result"] = log_data[
                                        "final_judge_result"
                                    ]
                                    attempt_result["judge_type"] = log_data.get(
                                        "judge_type", ""
                                    )
                                    attempt_result["is_correct"] = (
                                        log_data["final_judge_result"] == "CORRECT"
                                    )
                                    # Load evaluation details if available
                                    if log_data.get("eval_details"):
                                        attempt_result["eval_details"] = log_data[
                                            "eval_details"
                                        ]
                                print(
                                    f"    Loaded existing result: {attempt_result['model_boxed_answer']}"
                                )
                    except Exception as e:
                        # Best effort: a log that cannot be read or parsed is
                        # reported and the attempt continues
                        print(f"    Error loading log file {log_file}: {e}")

                # Run inference if no existing result or if we have a format error
                if (
                    not attempt_result["model_boxed_answer"]
                    or attempt_result["model_boxed_answer"] == FORMAT_ERROR_MESSAGE
                ):
                    # Try to get a valid response with format retry
                    print(f"TASK ID: {task.task_id}, ATTEMPT: {attempt}")

                    # The retry budget is the configured context_compress_limit
                    max_format_retries = self.context_compress_limit

                    # Track accumulated failure experiences for this attempt
                    # Start with the original task description
                    current_task_description = task_description
                    failure_experiences = []

                    # Resume: Recover failure experiences from previous retry logs
                    if format_retry_count > 0 and logs_dir.exists():
                        print(
                            f"    Resuming from retry {format_retry_count}, recovering previous failure experiences..."
                        )
                        for prev_retry in range(format_retry_count):
                            prev_log_pattern = f"task_{task.task_id}_attempt-{attempt}_format-retry-{prev_retry}_*.json"
                            prev_logs = sorted(list(logs_dir.glob(prev_log_pattern)))
                            if prev_logs:
                                prev_log_file = prev_logs[-1]  # Get the latest one
                                try:
                                    with open(
                                        prev_log_file, "r", encoding="utf-8"
                                    ) as f:
                                        prev_log_data = json.load(f)
                                        # Extract failure experience from trace_data
                                        trace_data = prev_log_data.get("trace_data", {})
                                        prev_failure_exp = trace_data.get(
                                            "failure_experience_summary"
                                        )
                                        if prev_failure_exp:
                                            failure_experiences.append(prev_failure_exp)
                                            print(
                                                f"      Recovered failure experience from retry {prev_retry}"
                                            )
                                except Exception as e:
                                    # A broken earlier log only costs its summary
                                    print(
                                        f"      Warning: Failed to load previous log {prev_log_file}: {e}"
                                    )

                        # Rebuild enhanced task description with recovered failure experiences
                        if failure_experiences:
                            current_task_description += FAILURE_EXPERIENCE_HEADER
                            for idx, exp in enumerate(failure_experiences, 1):
                                current_task_description += (
                                    FAILURE_EXPERIENCE_ITEM.format(
                                        attempt_number=idx,
                                        failure_summary=exp,
                                    )
                                )
                            current_task_description += FAILURE_EXPERIENCE_FOOTER
                            print(
                                f"    Recovered {len(failure_experiences)} failure experience(s) from previous retries"
                            )

                    # Not entered at all when a resumed counter already exceeds
                    # the budget; the cached answer is then left as loaded
                    while format_retry_count <= max_format_retries:
                        try:
                            # Check if this is the final retry (no more chances after this)
                            is_final_retry = format_retry_count == max_format_retries

                            (
                                response,
                                final_boxed_answer,
                                log_file_path,
                                failure_experience_summary,
                            ) = await execute_task_pipeline(
                                cfg=self.cfg,
                                task_id=f"{task.task_id}_attempt-{attempt}_format-retry-{format_retry_count}",
                                task_file_name=task_file_path,
                                task_description=current_task_description,
                                main_agent_tool_manager=self.main_agent_tool_manager,
                                sub_agent_tool_managers=self.sub_agent_tool_managers,
                                output_formatter=self.output_formatter,
                                ground_truth=task.ground_truth,
                                log_dir=str(self.get_log_dir()),
                                is_final_retry=is_final_retry,
                            )

                            attempt_result["model_boxed_answer"] = (
                                final_boxed_answer if final_boxed_answer else ""
                            )
                            attempt_result["log_file_path"] = log_file_path

                            # Check for format error
                            if (
                                attempt_result["model_boxed_answer"]
                                == FORMAT_ERROR_MESSAGE
                            ):
                                format_retry_count += 1
                                if format_retry_count <= max_format_retries:
                                    # Use the model-generated failure experience summary
                                    print(
                                        f"    Format error detected, using model-generated failure summary for retry {format_retry_count}..."
                                    )

                                    if failure_experience_summary:
                                        failure_experiences.append(
                                            failure_experience_summary
                                        )

                                        # Build enhanced task description with accumulated failure experiences
                                        # Start fresh from original task_description each time
                                        current_task_description = task_description
                                        current_task_description += (
                                            FAILURE_EXPERIENCE_HEADER
                                        )
                                        for idx, exp in enumerate(
                                            failure_experiences, 1
                                        ):
                                            current_task_description += (
                                                FAILURE_EXPERIENCE_ITEM.format(
                                                    attempt_number=idx,
                                                    failure_summary=exp,
                                                )
                                            )
                                        current_task_description += (
                                            FAILURE_EXPERIENCE_FOOTER
                                        )

                                        print(
                                            f"    Enhanced task description with {len(failure_experiences)} failure experience(s)"
                                        )
                                    else:
                                        print(
                                            "    No failure experience summary generated, retrying without enhancement..."
                                        )
                                    continue
                                else:
                                    # Exceeded format retry limit
                                    # NOTE: status is still "success" here, and the
                                    # annotated error text becomes the (non-empty)
                                    # answer, so it goes through verification below
                                    attempt_result["status"] = "success"
                                    attempt_result["model_boxed_answer"] = (
                                        f"{FORMAT_ERROR_MESSAGE} (after {max_format_retries} retries)"
                                    )
                                    attempt_result["error_message"] = (
                                        f"Exceeded format error retry limit ({max_format_retries})"
                                    )
                                    break
                            else:
                                # Got valid response, success
                                attempt_result["status"] = "success"
                                break

                        except Exception as e:
                            # Any pipeline error ends the retry loop for this
                            # attempt; remaining retries are not used
                            attempt_result["status"] = "failed"
                            attempt_result["error_message"] = str(e)
                            print(
                                f"    Error in attempt {attempt}, format retry {format_retry_count}: {e}"
                            )
                            break

                # Perform LLM verification if we have an answer and haven't verified yet
                if (
                    attempt_result["model_boxed_answer"]
                    and attempt_result["final_judge_result"] is None
                    and task.ground_truth is not None
                ):
                    print(f"    Verifying answer for attempt {attempt}...")
                    try:
                        (
                            evaluation_result,
                            judge_type,
                            eval_details,
                        ) = await verify_answer_for_datasets(
                            benchmark_name=self.benchmark_name,
                            question=task.task_question,
                            target=task.ground_truth,
                            predicted_answer=attempt_result["model_boxed_answer"],
                            metadata=task.metadata,
                        )
                        attempt_result["final_judge_result"] = evaluation_result
                        attempt_result["judge_type"] = judge_type
                        # Only the exact verdict "CORRECT" counts as a pass
                        attempt_result["is_correct"] = evaluation_result == "CORRECT"

                        # Store evaluation details (e.g., for DeepSearchQA metrics)
                        if eval_details:
                            attempt_result["eval_details"] = eval_details

                        # Update the log file with verification result
                        if attempt_result["log_file_path"]:
                            self._update_log_file_with_evaluation(
                                attempt_result["model_boxed_answer"],
                                attempt_result["log_file_path"],
                                evaluation_result,
                                judge_type,
                                eval_details,  # Pass eval_details to save in log file
                            )

                        if attempt_result["is_correct"]:
                            print(f"    ✅ Attempt {attempt}: CORRECT!")
                            found_correct_answer = True
                        else:
                            print(
                                f"    ❌ Attempt {attempt}: INCORRECT ({evaluation_result})"
                            )

                    except Exception as e:
                        # A failing judge is recorded as an "ERROR" verdict
                        # instead of aborting the task
                        print(f"    Error verifying attempt {attempt}: {e}")
                        attempt_result["final_judge_result"] = "ERROR"
                        attempt_result["judge_type"] = "error"
                        attempt_result["is_correct"] = False

                # The remaining branches only report verdicts loaded from the log
                elif attempt_result["is_correct"]:
                    print(f"    ✅ Attempt {attempt}: CORRECT (cached)")
                    found_correct_answer = True

                elif attempt_result["final_judge_result"]:
                    print(
                        f"    ❌ Attempt {attempt}: INCORRECT (cached: {attempt_result['final_judge_result']})"
                    )
                else:
                    print(f"    ⚠️  Attempt {attempt}: No valid answer to verify")

                result.attempts.append(attempt_result)

                # Update main result with the first successful attempt or best attempt so far
                # (attempt 1 always fills the main result; a later attempt only
                # does so when it succeeded and the main answer is still empty)
                if attempt == 1 or (
                    attempt_result["status"] == "success"
                    and not result.model_boxed_answer
                ):
                    result.model_boxed_answer = attempt_result["model_boxed_answer"]
                    result.log_file_path = attempt_result["log_file_path"]
                    result.status = attempt_result["status"]
                    if "error_message" in attempt_result:
                        result.error_message = attempt_result["error_message"]

                # Early stopping: if we found a correct answer, we can stop
                if found_correct_answer:
                    print(
                        f"    🎯 Found correct answer! Stopping early after {attempt} attempts."
                    )
                    break

        except Exception as e:
            # Anything not handled per attempt fails the whole task, but the
            # result object is still returned to the caller
            result.error_message = str(e)
            result.status = "failed"
            print(f"Error processing task {task.task_id}: {e}")

        finally:
            # Runs on success and on failure, so the pass@k fields are always set
            result.pass_at_k_success = found_correct_answer

            # Set main result judge result based on pass@k outcome
            if found_correct_answer:
                result.final_judge_result = "PASS_AT_K_SUCCESS"
                result.judge_type = "pass_at_k"
            else:
                # Without a ground truth there is nothing to pass or fail
                if result.ground_truth is None:
                    result.final_judge_result = "TEST_SET_MODE"
                else:
                    result.final_judge_result = "PASS_AT_K_FAILED"
                result.judge_type = "pass_at_k"

            print(f"Task {task.task_id} completed with {len(result.attempts)} attempts")
            if result.ground_truth is not None:
                print(
                    f"    Pass@{self.pass_at_k} result: {'✅ SUCCESS' if found_correct_answer else '❌ FAILED'}"
                )

        # Explicit collection after each task
        gc.collect()
        return result

    def _run_single_task_sync(self, task: BenchmarkTask) -> BenchmarkResult:
        """Sync wrapper for run_single_task to be used in threads"""
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        # Set exception handler to suppress "Task exception was never retrieved" warnings
        def exception_handler(loop, context):
            # Suppress all asyncio internal warnings for cleaner output
            pass

        loop.set_exception_handler(exception_handler)

        try:
            # Direct await is simpler and cleaner than gather for single task
            return loop.run_until_complete(self.run_single_task(task))
        finally:
            loop.close()

    def run_parallel_inference(
        self, tasks: List[BenchmarkTask], max_concurrent: int = 3
    ) -> List[BenchmarkResult]:
        """
        Run inference on multiple tasks in parallel using multiprocessing

        Every task is turned into a plain dict and submitted, together with the
        resolved config and the evaluator's constructor arguments, to
        ``_task_worker`` in a ``ProcessPoolExecutor``. A task whose future raises
        is reported as a "failed" result carrying the error message.

        Args:
            tasks: Tasks to run
            max_concurrent: Maximum number of worker processes

        Returns:
            One BenchmarkResult per task, ordered like ``tasks``. The list is
            also stored in ``self.results``.

        Raises:
            KeyboardInterrupt: Re-raised after pending futures were cancelled
                and the worker processes were terminated.
        """
        print(
            f"Running inference on {len(tasks)} tasks with max_concurrent={max_concurrent} (multiprocessing)"
        )

        # Serialize config
        cfg_dict = OmegaConf.to_container(self.cfg, resolve=True)

        # Shuffle tasks to avoid order bias and improve balancing
        shuffled_tasks = tasks.copy()
        random.shuffle(shuffled_tasks)

        # Prepare evaluator kwargs for worker processes
        evaluator_kwargs = {
            "data_dir": str(self.data_dir),
            "benchmark_name": self.benchmark_name,
        }
        # Add GenericEvaluator specific kwargs if available
        if hasattr(self, "metadata_file"):
            # Only the file name is sent; the worker gets data_dir separately
            evaluator_kwargs["metadata_file"] = str(self.metadata_file.name)
        for attr_name in (
            "task_id_field",
            "question_field",
            "ground_truth_field",
            "file_name_field",
        ):
            if hasattr(self, attr_name):
                evaluator_kwargs[attr_name] = getattr(self, attr_name)

        # Prepare serializable arguments for worker processes
        worker_args = [
            (
                {
                    "task_id": task.task_id,
                    "task_question": task.task_question,
                    "ground_truth": task.ground_truth,
                    "file_path": task.file_path,
                    "metadata": task.metadata,
                },
                cfg_dict,
                evaluator_kwargs,
            )
            for task in shuffled_tasks
        ]

        # Original task objects by id, used to build error results
        tasks_by_id = {task.task_id: task for task in shuffled_tasks}
        results_dict = {}  # Store results by task_id to maintain order
        # Bound before the try block so the interrupt handler can always
        # iterate it, even if the interrupt arrives before any submission
        future_to_task_id = {}

        # Use ProcessPoolExecutor for true parallelism (bypasses GIL)
        executor = None
        try:
            executor = ProcessPoolExecutor(max_workers=max_concurrent)
            # Submit all tasks
            for args in worker_args:
                future = executor.submit(_task_worker, *args)
                future_to_task_id[future] = args[0]["task_id"]

            # Collect results as they complete
            from concurrent.futures import as_completed

            for future in as_completed(future_to_task_id):
                task_id = future_to_task_id[future]
                try:
                    # Reconstruct BenchmarkResult from the dict the worker returned
                    results_dict[task_id] = BenchmarkResult(**future.result())
                    print(
                        f"Progress: {len(results_dict)}/{len(shuffled_tasks)} tasks completed"
                    )
                except Exception as e:
                    print(f"Exception in task {task_id}: {e}")
                    # Build a failed result from the original task
                    original_task = tasks_by_id[task_id]
                    results_dict[task_id] = BenchmarkResult(
                        task_id=original_task.task_id,
                        task_question=original_task.task_question,
                        ground_truth=original_task.ground_truth,
                        file_path=original_task.file_path,
                        model_boxed_answer="",
                        status="failed",
                        metadata=original_task.metadata.copy(),
                        error_message=str(e),
                        # Keep k consistent with results from run_single_task
                        k_value=self.pass_at_k,
                    )
        except KeyboardInterrupt:
            print("\n⚠️  Received interrupt signal, shutting down gracefully...")
            if executor is not None:
                print("  Cancelling pending tasks and terminating worker processes...")
                # Cancel all pending futures
                for future in future_to_task_id:
                    future.cancel()

                # Forcefully terminate worker processes.
                # NOTE: `_processes` is a private attribute of the executor. It is
                # snapshotted first because the executor may modify the mapping
                # while its workers exit.
                worker_processes = list(
                    (getattr(executor, "_processes", None) or {}).items()
                )
                if worker_processes:
                    for pid, process in worker_processes:
                        try:
                            if process.is_alive():
                                print(f"    Terminating worker process {pid}...")
                                process.terminate()
                        except Exception as e:
                            print(
                                f"    Warning: Failed to terminate process {pid}: {e}"
                            )

                    # Give processes a short time to terminate gracefully
                    import time

                    time.sleep(0.5)

                    # Force kill any remaining processes
                    for pid, process in worker_processes:
                        try:
                            if process.is_alive():
                                print(f"    Force killing worker process {pid}...")
                                process.kill()
                        except Exception as e:
                            print(f"    Warning: Failed to kill process {pid}: {e}")

                # Shutdown executor without waiting for pending tasks
                executor.shutdown(wait=False, cancel_futures=True)
            print("  Shutdown complete.")
            raise
        finally:
            # Ensure executor is properly cleaned up
            if executor is not None:
                try:
                    executor.shutdown(wait=True)
                except Exception:
                    pass  # Ignore errors during cleanup

        # Restore the original task order (tasks ran in shuffled order)
        task_id_to_index = {task.task_id: i for i, task in enumerate(tasks)}
        processed_results = sorted(
            (results_dict[task.task_id] for task in shuffled_tasks),
            key=lambda r: task_id_to_index.get(r.task_id, len(tasks)),
        )

        self.results = processed_results
        return processed_results

    def save_results(self, output_file: str) -> str:
        """Save evaluation results to JSONL file"""
        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            for result in self.results:
                f.write(json.dumps(asdict(result), ensure_ascii=False) + "\n")

        print(f"Results saved to {output_path}")
        return str(output_path)

    def evaluate_accuracy(self) -> float:
        """Evaluate pass@k accuracy (verification already done in run_single_task)"""
        if not self.results:
            print("No results to evaluate")
            return 0.0

        print(
            f"Calculating pass@{self.pass_at_k} accuracy for {len(self.results)} results..."
        )

        correct_count = 0
        total_count = 0

        for result in self.results:
            total_count += 1

            # Display task results
            print(f"\nTask {result.task_id}:")
            print(f"  Attempts: {len(result.attempts)}")
            if result.ground_truth is not None:
                print(
                    f"  Pass@{self.pass_at_k}: {'✅ SUCCESS' if result.pass_at_k_success else '❌ FAILED'}"
                )

            print("  " + "=" * 50)
            print(f"  Reference: {result.ground_truth}")
            print("  " + "=" * 50)

            if result.pass_at_k_success:
                correct_count += 1

        pass_at_k_accuracy = correct_count / total_count if total_count > 0 else 0.0

        print(f"\nPass@{self.pass_at_k} Final Results:")
        print(f"Tasks passed: {correct_count}/{total_count}")
        print(f"Pass@{self.pass_at_k} Accuracy: {pass_at_k_accuracy:.2%}")

        return pass_at_k_accuracy

    def _update_log_file_with_evaluation(
        self,
        model_boxed_answer: str,
        log_file_path: str,
        evaluation_result: str,
        judge_type: str,
        eval_details: Optional[Dict[str, Any]] = None,
    ):
        """Helper method to update log file with evaluation result"""
        try:
            log_file = Path(log_file_path)
            # Read existing data
            with open(log_file, "r", encoding="utf-8") as f:
                log_data = json.load(f)

            # Update with evaluation result
            log_data["final_boxed_answer"] = model_boxed_answer
            log_data["final_judge_result"] = evaluation_result
            log_data["judge_type"] = judge_type

            # Store evaluation details (e.g., for DeepSearchQA metrics)
            if eval_details:
                log_data["eval_details"] = eval_details

            # Write to a temporary file and then atomically replace
            temp_log_file = log_file.with_suffix(f"{log_file.suffix}.tmp")
            with open(temp_log_file, "w", encoding="utf-8") as f:
                json.dump(log_data, f, indent=2, ensure_ascii=False)

            os.replace(temp_log_file, log_file)
            print(f"    Updated log file {log_file.name} with evaluation result.")
        except Exception as e:
            print(f"    Error updating log file {log_file_path}: {e}")


class GenericEvaluator(BenchmarkEvaluator):
    """Generic benchmark evaluator for JSONL format"""

    def __init__(
        self,
        data_dir: str,
        benchmark_name: str,
        cfg: DictConfig,
        metadata_file: str = "metadata.jsonl",
        task_id_field: str = "task_id",
        question_field: str = "task_question",
        ground_truth_field: str = "ground_truth",
        # NOTE(review): the default is the literal string "file_name_field" —
        # confirm that this is the intended key name.
        file_name_field: Optional[str] = "file_name_field",
    ):
        """
        Initialize generic evaluator

        Args:
            data_dir: Path to benchmark data directory
            benchmark_name: Name of the benchmark
            cfg: The Hydra configuration object
            metadata_file: Name of the metadata file, relative to the data directory
            task_id_field: Field name for task ID in the data
            question_field: Field name for task question in the data
            ground_truth_field: Field name for ground truth answer in the data
            file_name_field: Field name for file name in the data (optional)
        """
        super().__init__(data_dir=data_dir, benchmark_name=benchmark_name, cfg=cfg)
        # self.data_dir is provided by the base class initializer; it is
        # combined with "/" here, so it is presumably a pathlib.Path.
        self.metadata_file = self.data_dir / metadata_file
        self.task_id_field = task_id_field
        self.question_field = question_field
        self.ground_truth_field = ground_truth_field
        self.file_name_field = file_name_field
        self.tasks: List[BenchmarkTask] = []
        self.results: List[BenchmarkResult] = []

    def load_tasks(self, limit: Optional[int] = None) -> List[BenchmarkTask]:
        """
        Load benchmark tasks from the metadata JSONL file

        Lines that cannot be parsed, or that lack one of the required fields,
        are reported with a warning and skipped. Blank lines are skipped
        silently.

        Args:
            limit: Maximum number of file lines to read (None or 0 for all).
                The limit counts physical lines, including blank or
                unparsable ones, not successfully loaded tasks.

        Returns:
            List of BenchmarkTask objects (also stored on ``self.tasks``)

        Raises:
            FileNotFoundError: If the metadata file does not exist
        """
        print(f"Loading tasks from {self.metadata_file}")

        if not self.metadata_file.exists():
            raise FileNotFoundError(f"Metadata file not found: {self.metadata_file}")

        # Keys that map onto dedicated BenchmarkTask arguments; everything else
        # in a record is passed through as metadata. Built once, not per key.
        reserved_fields = (
            self.task_id_field,
            self.question_field,
            self.ground_truth_field,
            self.file_name_field,
        )

        tasks = []
        with open(self.metadata_file, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if limit and i >= limit:
                    break

                record = line.strip()
                if not record:
                    # A blank line carries no task; do not report it as a
                    # parse failure.
                    continue

                try:
                    data = json.loads(record)

                    # Extract file path if specified
                    file_path = None
                    if self.file_name_field and self.file_name_field in data:
                        file_path = data[self.file_name_field]

                    # Create metadata dict with all remaining fields
                    metadata = {
                        k: v for k, v in data.items() if k not in reserved_fields
                    }

                    task = BenchmarkTask(
                        task_id=data[self.task_id_field],
                        task_question=data[self.question_field],
                        ground_truth=data[self.ground_truth_field],
                        file_path=file_path,
                        metadata=metadata,
                    )
                    tasks.append(task)

                except Exception as e:
                    print(f"Warning: Failed to parse line {i + 1}: {e}")
                    continue

        gc.collect()
        self.tasks = tasks
        print(f"Loaded {len(tasks)} tasks")
        return tasks

    def prepare_task_description(
        self, task: BenchmarkTask
    ) -> Tuple[str, Optional[str]]:
        """
        Prepare task description and file path for the agent

        Args:
            task: BenchmarkTask object

        Returns:
            Tuple of (task_description, task_file_path); the file path is None
            when the task has no attached file
        """
        if not task.file_path:
            return task.task_question, None

        # Build complete file path: data directory + relative path, then
        # convert to an absolute path with symbolic links resolved.
        full_file_path = self.data_dir / task.file_path
        return task.task_question, str(full_file_path.resolve())


class CommonBenchmark:
    """Main class to run a benchmark"""

    def __init__(self, cfg: DictConfig):
        """
        Initialize the benchmark run

        Builds a GenericEvaluator from ``cfg.benchmark``. Keyword arguments
        come from ``cfg.benchmark.evaluator_kwargs`` (when present) and are
        overridden by the legacy ``cfg.benchmark.data.metadata_file`` and
        ``cfg.benchmark.data.field_mapping`` entries.

        Args:
            cfg: Hydra configuration object
        """
        self.cfg = cfg
        self.benchmark_name = cfg.benchmark.name
        # NOTE(review): when "evaluator_kwargs" exists in cfg, the assignments
        # below write into that config node in place — confirm this is
        # intended and permitted by the config's struct/read-only flags.
        evaluator_kwargs = cfg.benchmark.get("evaluator_kwargs", OmegaConf.create({}))
        # Support for legacy config structure
        if "metadata_file" in cfg.benchmark.data:
            evaluator_kwargs["metadata_file"] = cfg.benchmark.data.metadata_file
        if "field_mapping" in cfg.benchmark.data:
            mapping = cfg.benchmark.data.field_mapping
            if "task_id_field" in mapping:
                evaluator_kwargs["task_id_field"] = mapping.task_id_field
            if "task_question_field" in mapping:
                # The legacy key is "task_question_field"; the evaluator
                # argument is named "question_field".
                evaluator_kwargs["question_field"] = mapping.task_question_field
            if "ground_truth_field" in mapping:
                evaluator_kwargs["ground_truth_field"] = mapping.ground_truth_field
            if "file_name_field" in mapping:
                evaluator_kwargs["file_name_field"] = mapping.file_name_field

        self.evaluator = GenericEvaluator(
            data_dir=cfg.benchmark.data.data_dir,
            benchmark_name=self.benchmark_name,
            cfg=cfg,
            **evaluator_kwargs,
        )

    def run_evaluation(self) -> float:
        """
        Run the full benchmark evaluation process

        Loads the tasks, runs parallel inference, evaluates accuracy, then
        writes ``benchmark_results.jsonl``, a
        ``benchmark_results_pass_at_<k>_accuracy.txt`` file and a summary into
        the evaluator's log directory.

        Returns:
            The pass@k accuracy as a fraction, or 0.0 when no tasks were loaded
        """
        print(f"Starting evaluation for benchmark: {self.benchmark_name}")
        print(f"LLM Provider: {self.evaluator.llm_provider}")
        print(f"LLM Model: {self.evaluator.llm_model}")

        # Load tasks
        self.evaluator.load_tasks(limit=self.cfg.benchmark.execution.max_tasks)
        if not self.evaluator.tasks:
            print("No tasks loaded. Exiting.")
            return 0.0

        # Run inference
        print(
            f"\nStarting parallel inference with {self.cfg.benchmark.execution.max_concurrent} concurrent tasks..."
        )
        print(f"Using pass@{self.evaluator.pass_at_k} evaluation...")

        self.evaluator.run_parallel_inference(
            self.evaluator.tasks,
            max_concurrent=self.cfg.benchmark.execution.max_concurrent,
        )

        # Evaluate accuracy
        print("Evaluating accuracy...")
        accuracy = self.evaluator.evaluate_accuracy()
        print(f"\nOverall pass@{self.evaluator.pass_at_k} accuracy: {accuracy:.2%}")

        # Save results
        # Construct the full path in the correct log directory
        log_dir = self.evaluator.get_log_dir()
        results_path = log_dir / "benchmark_results.jsonl"

        self.evaluator.save_results(str(results_path))
        print(f"\nEvaluation completed! Results saved to {results_path}")

        # save accuracy to a file
        # Build the sibling file name directly instead of string-replacing
        # ".jsonl" in the full path, which would also rewrite any directory
        # component that happens to contain ".jsonl".
        accuracy_file = log_dir / (
            f"benchmark_results_pass_at_{self.evaluator.pass_at_k}_accuracy.txt"
        )
        with open(accuracy_file, "w") as f:
            f.write(f"{accuracy:.2%}")
        # Generate and save summary
        generate_summary(log_dir)
        return accuracy


@hydra.main(config_path="../conf", config_name="config", version_base=None)
def run_benchmark(cfg: DictConfig) -> None:
    """
    Hydra entry point: print the benchmark section of the loaded config,
    then build a CommonBenchmark from it and run the evaluation.

    Args:
        cfg: Configuration object supplied by Hydra
    """
    benchmark_yaml = OmegaConf.to_yaml(cfg.benchmark)
    print("Benchmark configuration:\n", benchmark_yaml)

    CommonBenchmark(cfg).run_evaluation()


# Script entry point: start the Hydra-decorated benchmark run only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    run_benchmark()


================================================
FILE: apps/miroflow-agent/benchmarks/evaluators/__init__.py
================================================


================================================
FILE: apps/miroflow-agent/benchmarks/evaluators/calculate_average_score.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import glob
import os
import re
import statistics
import sys


def detect_pass_at_k(results_dir: str) -> tuple:
    """Detect the pass_at_k value used in the results directory.

    Looks for ``run_*/benchmark_results_pass_at_<k>_accuracy.txt`` files below
    ``results_dir``. The value of k is taken from the first matching file in
    sorted path order, so the result is deterministic even when runs with
    different k values are mixed in one directory.

    Args:
        results_dir: Directory containing the ``run_*`` sub-directories.

    Returns:
        A ``(k, accuracy_files)`` tuple where ``accuracy_files`` is the sorted
        list of accuracy files for that k, or ``(None, [])`` when no accuracy
        file exists or k cannot be parsed from the first file name.
    """

    # Find all possible pass_at_k files. glob returns paths in arbitrary
    # order, so sort to make the choice of the "first" file reproducible.
    pattern = os.path.join(
        results_dir, "run_*", "benchmark_results_pass_at_*_accuracy.txt"
    )
    all_files = sorted(glob.glob(pattern))

    if not all_files:
        print(f"No accuracy files found in {results_dir}")
        print(f"Expected pattern: {pattern}")
        return None, []

    # Extract pass_at_k value from the first file
    filename = os.path.basename(all_files[0])
    match = re.search(r"pass_at_(\d+)_accuracy\.txt", filename)

    if not match:
        print(f"Cannot extract pass_at_k from filename: {filename}")
        return None, []

    k = int(match.group(1))

    # Get all files with this k value
    accuracy_files = sorted(
        glob.glob(
            os.path.join(
                results_dir, "run_*", f"benchmark_results_pass_at_{k}_accuracy.txt"
            )
        )
    )

    return k, accuracy_files


def calculate_average_scores(results_dir: str) -> dict:
    """Aggregate the per-run accuracy files found below ``results_dir``.

    The pass_at_k value and the matching files come from ``detect_pass_at_k``.
    Each file is expected to hold a single percentage such as ``85.00%``;
    files that cannot be read or parsed are reported and skipped.

    Returns a dict with the keys ``pass_at_k``, ``num_runs``,
    ``individual_scores``, ``average_score``, ``std_dev``, ``min_score`` and
    ``max_score``, or None when no accuracy file or no valid score was found.
    """
    k_value, score_paths = detect_pass_at_k(results_dir)
    if k_value is None:
        return None

    print(f"Detected pass_at_{k_value} files")
    print(f"Found {len(score_paths)} accuracy files")

    collected = []
    run_number = 0
    for path in sorted(score_paths):
        # Numbering follows the sorted file order, starting at 1, and also
        # advances for files that fail to parse.
        run_number += 1
        try:
            with open(path, "r") as handle:
                raw_text = handle.read().strip()
                # Drop the percent sign before converting to a number.
                value = float(raw_text.replace("%", ""))
                collected.append(value)
                print(f"Run {run_number}: {value:.2f}%")
        except Exception as err:
            print(f"Error reading {path}: {err}")

    if not collected:
        print("No valid scores found")
        return None

    # A sample standard deviation needs at least two data points.
    spread = statistics.stdev(collected) if len(collected) > 1 else 0

    return {
        "pass_at_k": k_value,
        "num_runs": len(collected),
        "individual_scores": collected,
        "average_score": statistics.mean(collected),
        "std_dev": spread,
        "min_score": min(collected),
        "max_score": max(collected),
    }


def print_results(stats: dict):
    """Write a human-readable report of ``stats`` to stdout.

    ``stats`` is the dict produced by ``calculate_average_scores``; the keys
    read here are ``pass_at_k``, ``num_runs``, ``individual_scores``,
    ``std_dev``, ``min_score``, ``max_score`` and ``average_score``.
    """
    divider = "=" * 50

    print("\n" + divider)
    print("EVALUATION RESULTS")
    print(divider)

    print(f"Pass@{stats['pass_at_k']} Results:")
    print(f"Number of runs: {stats['num_runs']}")
    formatted_scores = [f"{score:.2f}%" for score in stats["individual_scores"]]
    print(f"Individual scores: {formatted_scores}")
    print()

    # Summary figures, in the order they appear in the report.
    summary_rows = (
        ("Standard deviation", "std_dev"),
        ("Min score", "min_score"),
        ("Max score", "max_score"),
        ("Average score", "average_score"),
    )
    for label, key in summary_rows:
        print(f"{label}: {stats[key]:.2f}%")
    print(divider)


def main():
    """Command-line entry point.

    Expects the results directory as the first argument, prints the aggregated
    statistics and stores them in
    ``<results_dir>/average_scores_pass_at_<k>.txt``. Exits with status 1 on a
    missing argument, a missing directory, or when no statistics could be
    computed.
    """
    argv = sys.argv
    if len(argv) < 2:
        print("Usage: python calculate_average_score.py <results_directory>")
        print("Example: python calculate_average_score.py logs/gaia-validation/mytest")
        sys.exit(1)

    results_dir = argv[1]
    if not os.path.exists(results_dir):
        print(f"Results directory does not exist: {results_dir}")
        sys.exit(1)

    print(f"Analyzing results from: {results_dir}")

    stats = calculate_average_scores(results_dir)
    if not stats:
        print("Failed to calculate statistics")
        sys.exit(1)

    print_results(stats)

    # Save simple statistics results
    report_name = f"average_scores_pass_at_{stats['pass_at_k']}.txt"
    output_file = os.path.join(results_dir, report_name)
    rule_line = "=" * 50 + "\n"
    formatted_scores = [f"{s:.2f}%" for s in stats["individual_scores"]]
    report_lines = [
        "EVALUATION RESULTS\n",
        rule_line,
        f"Pass@{stats['pass_at_k']} Results:\n",
        f"Number of runs: {stats['num_runs']}\n",
        f"Individual scores: {formatted_scores}\n",
        f"Standard deviation: {stats['std_dev']:.2f}%\n",
        f"Min score: {stats['min_score']:.2f}%\n",
        f"Max score: {stats['max_score']:.2f}%\n",
        f"Average score: {stats['average_score']:.2f}%\n",
        rule_line,
    ]
    with open(output_file, "w") as f:
        f.writelines(report_lines)

    print(f"\nResults saved to: {output_file}")


# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()


================================================
FILE: apps/miroflow-agent/benchmarks/evaluators/eval_utils.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import asyncio
import json
import os
import re
import string
import warnings
from typing import Any, Dict, Literal, Optional

from dotenv import load_dotenv
from openai import AsyncOpenAI, OpenAI
from pydantic import BaseModel

# Load variables from a .env file (if one is found) into the process
# environment before the credentials below are read.
load_dotenv()

# Either value is None when the corresponding environment variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL")

# Async client awaited by the LLM-judge coroutines defined in this module.
evaluation_llm_client = AsyncOpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
# Synchronous client built from the same credentials.
model_as_a_judge_client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)


# ================================================
# verify_answer_simpleqa
# ================================================

# Grading prompt used by verify_answer_simpleqa. The three bare "{}"
# placeholders are filled positionally by str.format with, in order, the
# question, the gold target and the predicted answer. The text is sent to the
# judge model as-is, so any wording change here changes grading behavior.
EVALUATION_PROMPT_SIMPLEQA = """
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
First, I will give examples of each grade, and then you will grade a new example.


The following are examples of CORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia Obama and Sasha Obama
Predicted answer 1: sasha and malia obama
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
```
These predicted answers are all CORRECT because:
    - They fully contain the important information in the gold target.
    - They do not contain any information that contradicts the gold target.
    - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
    - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.


The following are examples of INCORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: Malia.
Predicted answer 2: Malia, Sasha, and Susan.
Predicted answer 3: Barack Obama does not have any children.
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
```
These predicted answers are all INCORRECT because:
    - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.


The following are examples of NOT_ATTEMPTED predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: I don't know.
Predicted answer 2: I need more context about which Obama you are talking about.
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
```
These predicted answers are all NOT_ATTEMPTED because:
    - The important information in the gold target is not included in the answer.
    - No statements in the answer contradict the gold target.


Also note the following things:
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k". 
    - Predicted answers "120k", "124k", and 115k" are all CORRECT. 
    - Predicted answers "100k" and "113k" are INCORRECT. 
    - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
    - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
    - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
    - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
    - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
    - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
- Do not punish for typos in people's name if it's clearly the same name. 
    - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".


Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
```
Question: {}
Gold target: {}
Predicted answer: {}
```

Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
C: NOT_ATTEMPTED

Just return the letters "A", "B", or "C", with no text around it.
""".strip()


async def verify_answer_simpleqa(
    question: str, target: str, predicted_answer: str
) -> str:
    """
    Grade a predicted answer with the SimpleQA-style LLM judge.

    The judge is asked to reply with a single letter: A (CORRECT),
    B (INCORRECT) or C (NOT_ATTEMPTED). The first such letter found in the
    reply decides the grade.

    Args:
        question: The question being answered
        target: The gold target answer
        predicted_answer: The model's predicted answer

    Returns:
        "CORRECT", "INCORRECT" or "NOT_ATTEMPTED"; "NOT_ATTEMPTED" is also
        returned when the request fails or the reply contains no A/B/C letter
    """
    grade_by_letter = {"A": "CORRECT", "B": "INCORRECT", "C": "NOT_ATTEMPTED"}

    grading_prompt = EVALUATION_PROMPT_SIMPLEQA.format(
        question, target, predicted_answer
    )
    messages = [{"role": "user", "content": grading_prompt}]

    try:
        llm_response = await evaluation_llm_client.chat.completions.create(
            model="gpt-4.1-2025-04-14", messages=messages, max_completion_tokens=2
        )
        reply = llm_response.choices[0].message.content
        letter = re.search(r"(A|B|C)", reply)
        if letter is not None:
            return grade_by_letter[letter.group(0)]
    except Exception as e:
        print(f"LLM evaluation failed: {e}")

    # Reached on a failed request or an unparseable reply.
    return "NOT_ATTEMPTED"


# ================================================
# verify_answer_hle
# ================================================

# Judge prompt used by verify_answer_hle; str.format fills in `question`,
# `response` and `correct_answer`. The backslashes in "|\%|" are written as
# "\\" in the source so the literal keeps its text without relying on the
# invalid escape sequence "\%".
HLE_JUDGE_PROMPT = """Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.

[question]: {question}

[response]: {response}

Your judgement must be in the format and criteria specified below:

extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.

[correct_answer]: {correct_answer}

reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.

correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.

confidence: The extracted confidence score between 0|\\%| and 100|\\%| from [response]. Put 100 if there is no confidence score available."""


# Structured-output schema for the HLE judge: verify_answer_hle passes this
# class as `response_format` and reads `reasoning`, `correct` and `confidence`
# from the parsed reply. Documented with comments instead of a class docstring
# because a docstring would presumably be picked up as part of the generated
# schema — TODO confirm.
class HLEExtractedAnswer(BaseModel):
    # Final answer the judge extracted from the response being graded.
    extracted_final_answer: str
    # Judge's explanation of why the extracted answer matches or does not.
    reasoning: str
    # Verdict; "yes" is mapped to "CORRECT" and "no" to "INCORRECT".
    correct: Literal["yes", "no"]
    # Confidence value reported by the judge; printed with a "%" suffix.
    confidence: int
    strict: Literal[True] = True  # 100% reliability


async def verify_answer_hle(question: str, target: str, predicted_answer: str) -> str:
    """
    Use HLE-style LLM judge to verify if the predicted answer is correct.
    Returns the evaluation result as a string: "CORRECT", "INCORRECT", or "NOT_ATTEMPTED".

    Args:
        question: The question being answered
        target: The correct/target answer
        predicted_answer: The model's predicted answer

    Returns:
        "CORRECT" when the judge answers "yes", "INCORRECT" when it answers
        "no", and "NOT_ATTEMPTED" when the request or response parsing fails

    Raises:
        SystemExit: With status 1 when the API rejects the configured key,
            since no later judge call can succeed
    """
    prompt = HLE_JUDGE_PROMPT.format(
        question=question, correct_answer=target, response=predicted_answer
    )

    try:
        response = await evaluation_llm_client.beta.chat.completions.parse(
            model="o3-mini-2025-01-31",
            max_completion_tokens=4096,
            messages=[{"role": "user", "content": prompt}],
            response_format=HLEExtractedAnswer,
        )

        content = response.choices[0].message.parsed

        # Print HLE reasoning
        print(f"LLM as Judge Reasoning: {content.reasoning}")
        print(f"LLM as Judge Result: {content.correct}")
        print(f"LLM as Judge Confidence: {content.confidence}%")

        # Convert HLE format to eval_utils format
        return "CORRECT" if content.correct == "yes" else "INCORRECT"

    except Exception as e:
        print(f"LLM evaluation failed: {e}")
        if "Incorrect API key provided" in str(e):
            # A bad key is fatal. Raise SystemExit directly instead of calling
            # the interactive `exit()` helper, and use a non-zero status so
            # the failure is visible to the calling shell.
            raise SystemExit(1) from e
        return "NOT_ATTEMPTED"


# ================================================
# verify_answer_gaia
# ================================================


async def verify_answer_gaia(question: str, target: str, predicted_answer: str) -> str:
    """
    Use GAIA-style judge to verify if the predicted answer is correct.
    """

    def normalize_number_str(number_str: str) -> float | None:
        # we replace these common units and commas to allow
        # conversion to float
        for char in ["$", "%", ","]:
            number_str = number_str.replace(char, "")
        try:
            return float(number_str)
        except ValueError:
            print(f"String {number_str} cannot be normalized to number str.")
            return None  # Return None instead of inf to handle gracefully

    def split_string(
        s: str,
        char_list: list[str] = [",", ";"],
    ) -> list[str]:
        pattern = f"[{''.join(char_list)}]"
        return re.split(pattern, s)

    def normalize_str(input_str, remove_punct=True) -> str:
        """
        Normalize a string by:
        - Removing all white spaces
        - Optionally removing punctuation (if remove_punct is True)
        - Converting to lowercase
        Parameters:
        - input_str: str, the string to normalize
        - remove_punct: bool, whether to remove punctuation (default: True)
        Returns:
        - str, the normalized string
        """
        # Remove all white spaces. Required e.g for seagull vs. sea gull
        no_spaces = re.sub(r"\s", "", input_str)

        # Remove punctuation, if specified.
        if remove_punct:
            translator = str.maketrans("", "", string.punctuation)
            return no_spaces.lower().translate(translator)
        else:
            return no_spaces.lower()

    def question_scorer(
        model_answer: str,
        ground_truth: str,
    ) -> bool:
        def is_float(element: any) -> bool:
            try:
                float(element)
                return True
            except ValueError:
                return False

        if model_answer is None:
            model_answer = "None"

        # if gt is a number
        if is_float(ground_truth):
            print(f"Evaluating {model_answer} as a number.")
            normalized_answer = normalize_number_str(model_answer)
            # If normalization failed, the answer is incorrect
            if normalized_answer is None:
                return False
            return normalized_answer == float(ground_truth)

        # if gt is a list
        elif any(char in ground_truth for char in [",", ";"]):
            print(f"Evaluating {model_answer} as a comma separated list.")
            # question with the fish: normalization removes punct

            gt_elems = split_string(ground_truth)
            ma_elems = split_string(model_answer)

            # check length is the same
            if len(gt_elems) != len(ma_elems):
                warnings.warn(
                    "Answer lists have different lengths, returning False.", UserWarning
                )
                return False

            # compare each element as float or str
            comparisons = []
            for ma_elem, gt_elem in zip(ma_elems, gt_elems):
                if is_float(gt_elem):
                    normalized_ma_elem = normalize_number_str(ma_elem)
                    # If normalization failed, this element is incorrect
                    if normalized_ma_elem is None:
                        comparisons.append(False)
                    else:
                        comparisons.append(normalized_ma_elem == float(gt_elem))
                else:
                    # we do not remove punct since comparisons can include punct
                    comparisons.append(
                        normalize_str(ma_elem, remove_punct=False)
                        == normalize_str(gt_elem, remove_punct=False)
                    )
            return all(comparisons)

        # if gt is a str
        else:
            print(f"Evaluating {model_answer} as a string.")
            return normalize_str(model_answer) == normalize_str(ground_truth)

    # Use the question_scorer to evaluate the answer
    try:
        is_correct = question_scorer(predicted_answer, target)
        return "CORRECT" if is_correct else "INCORRECT"
    except Exception as e:
        print(f"GAIA evaluation failed: {e}")
        raise e

        # use raise error instead, later we could judge it as NOT_ATTEMPTED.
        # return "NOT_ATTEMPTED"


# ================================================
# verify_answer_gaia_validation_text_103

# Prompt from WebAgent
# https://github.com/Alibaba-NLP/WebAgent/blob/f25dae54daf0ce2874ffd5ed5ffb20feca7c4c4e/WebSailor/src/prompt.py#L98
# ================================================

# Equivalence-judge prompt used by verify_answer_gaia_validation_text_103,
# which fills `question`, `correct_answer` and `response` via str.format and
# expects the judge to reply with exactly "Correct" or "Incorrect".
GAIA_VALIDATION_TEXT_103_SCORER_PROMPT = """You are an evaluation assistant. Please determine if the predicted answer is equivalent to the labeled answer.

Question: {question}

Labeled Answer: {correct_answer}

Predicted Answer: {response}

Did the model give an answer **equivalent** to the labeled answer? Please respond with "Correct" if they are equivalent, or "Incorrect" if they are not equivalent. Do not include any other text.
"""


async def verify_answer_gaia_validation_text_103(
    question: str, target: str, predicted_answer: str
) -> str:
    """
    Ask the LLM judge whether the predicted answer is equivalent to the target.

    The judge call is retried up to ``max_tries`` times when it raises; the
    exception from the final attempt is propagated to the caller.

    Args:
        question: The question being answered
        target: The labeled (ground-truth) answer
        predicted_answer: The model's predicted answer

    Returns:
        "CORRECT" or "INCORRECT" when the judge's reply can be parsed, otherwise
        "NOT_ATTEMPTED" (empty or unrecognized reply).

    Raises:
        Exception: Whatever the judge client raised on the last failed attempt.
    """
    prompt = GAIA_VALIDATION_TEXT_103_SCORER_PROMPT.format(
        question=question, correct_answer=target, response=predicted_answer
    )

    max_tries = 10
    content = None
    for attempt in range(max_tries):
        try:
            response = await evaluation_llm_client.chat.completions.create(
                model="gpt-4.1-2025-04-14",
                messages=[{"role": "user", "content": prompt}],
            )
            content = response.choices[0].message.content
            print("LLM Judge Response: ", content)
            break
        except Exception as e:
            # Only the final failure is fatal; earlier ones are logged and retried.
            if attempt == (max_tries - 1):
                raise
            print(f"LLM judge call failed (attempt {attempt + 1}/{max_tries}): {e}")

    # The API may return a message without text content; treat that as
    # "no verdict" instead of crashing on None.strip().
    if content is None:
        print("Warning: LLM judge returned no content")
        return "NOT_ATTEMPTED"

    # Use case-insensitive matching and strip whitespace/trailing period
    content_normalized = content.strip().rstrip(".").lower()
    if content_normalized == "correct":
        return "CORRECT"
    if content_normalized == "incorrect":
        return "INCORRECT"

    # If we can't parse the response, default to NOT_ATTEMPTED to trigger retry
    print(f"Warning: Could not parse judge response: {content}")
    return "NOT_ATTEMPTED"


# ================================================
# verify_answer_browsecomp

# Prompt from Tongyi DeepResearch
# https://github.com/Alibaba-NLP/DeepResearch/blob/main/WebAgent/WebWatcher/infer/evaluation/prompt.py#L110
# ================================================

# Chinese-language BrowseComp judge prompt. It is filled in with str.format()
# using `question`, `correct_answer` and `response`, and instructs the judge to
# reply with a single letter: A (correct) or B (incorrect).
JUDGE_PROMPT_BC_zh = """
请根据给定问题、标准答案和模型预测的答案来评估模型的回答是否正确。您的任务是将结果评定为：【正确】、【错误】。

首先，我们将列出每个评定类别的示例，然后请您对新问题的预测答案进行评定。
以下是【正确】的答复示例：
```
问题：贝拉克·奥巴马的孩子叫什么名字？
标准答案：玛丽亚·奥巴马和萨莎·奥巴马
模型预测1：Malia Obama and Sasha Obama
模型预测2：玛丽亚和萨沙
模型预测3：大多数人会说是玛丽亚和萨莎，但我不确定，需要再确认
模型预测4：巴拉克·奥巴马有两个女儿，她们分别是玛丽亚·安和娜塔莎·玛丽安，但通常称作玛丽亚·奥巴马和萨莎·奥巴马。
```
这些答复均为【正确】，因为：
    - 完整地包含了标准答案中的重要信息。
    - 不包含任何与标准答案矛盾的信息。
    - 只关注语义内容，中英文，大小写、标点、语法和顺序不重要。
    - 答复中出现模糊语句或猜测是可以接受的，前提是包含了标准答案且不含有不正确信息或矛盾。

以下是【错误】的答复示例：
```
问题：巴拉克·奥巴马的孩子叫什么名字？
标准答案：玛丽亚·奥巴马和萨莎·奥巴马
模型预测1：玛丽亚
模型预测2：玛丽亚、萨莎和苏珊和萨莎·奥巴马或玛丽亚·奥巴马，或娜塔莎·玛丽安，或爱因斯坦
模型预测3：虽然我不知道他们的确切名字，但能说出巴拉克·奥巴马有两个孩子。
模型预测4：你可能是想说贝茜和奥利维亚。不过您应通过最新的参考资料确认详细信息。那是正确的答案吗？
模型预测5：巴拉克·奥巴马的孩子
```
这些答复均为【错误】，因为：
    - 答复中包含与标准答案矛盾的事实陈述。
    - 答案为空、重复表述问题。
    - 答案枚举了多个答案，重复表述答案。

需要格外注意的是：
- 标准答案中包含对于问题中多个方面的回答，并且在同一个方面的答案中可能会有多种不同的描述，这些描述均是正确的，并且在同一个括号中给出，通过逗号连接。例如，考虑问题"抖音自己的人工智能大模型叫什么名字？"，标准答案为"【【豆包，云雀】】"：
    - 预测答案"豆包"、"豆包、云雀"、"云雀"等均为【正确】。
- 对于标准答案中包含的不同方面的回答，模型需要同时给出所有方面的回答才可以算是正确，否则直接判断为【错误】，不存在【部分正确】这种输出方式，这些答案会在不同的括号中给出。例如，考虑问题"TFBOYS组合中的成员有哪些？"，标准答案为"【【王俊凯】【王源】【易洋千玺】】"：
    - 预测答案"王俊凯、王源、易洋千玺"等同时包含所有答案，才可以算为【正确】。
    - 预测答案为"王俊凯、易洋千玺"等没有同时包含所有答案，会被算为【错误】。

另外注意以下几点：
- 对于标准答案为数字的问题，预测答案应和标准答案一致。例如，考虑问题"金山铁路黄浦江特大桥的全长是多少米？"，标准答案为"3518.17"：
    - 预测答案"3518"、"3518.1"、"3518.17"均为【正确】。
    - 预测答案"3520"和"3600"均为【错误】。
- 如果模型预测并没有直接回答问题，模型试图绕过或未能直接给出标准答案视为【错误】答案。
    - 例如：问题"林宥嘉的老婆是谁"，标准答案为"丁文琪"。模型预测"林宥嘉的老婆"、"林宥嘉的老婆应该很优秀"、"林宥嘉的老婆可能是某个公众人物"均为【错误】。
- 如果标准答案包含比问题更多的信息，预测答案只需包含问题中提到的信息。
    - 例如，考虑问题"菱镁矿的主要化学成分是什么？"标准答案为"碳酸镁（MgCO3）"。"碳酸镁"或"MgCO3"均视为【正确】答案。
- 如果从问题中明显可以推断出预测答案省略的信息，那么算作正确。
    - 例如，问题"巴鲁米尼的努拉吉遗迹在1997年被联合国教科文组织列为世界文化遗产，那么这遗址在哪个地区？"标准答案为"意大利撒丁岛"，预测答案"撒丁岛"被视为【正确】。
- 如果能明显看出名字翻译版本不同但是是同一个人也认为正确。
    - 例如，如果标准答案是"Robinson"，那么回答鲁滨逊或者鲁滨孙均正确。
- 你应该更关注标准答案和模型预测的匹配度，而不是关心标准答案是否是正确的。

下面是一个新的问题示例。请只回复【正确】、【错误】之一，不要道歉或纠正自己的错误，只需要评估该回答。
```
问题: {question}
标准答案: {correct_answer}
预测答案: {response}
```

将此新问题的预测答案评定为以下之一：
A.【正确】
B.【错误】

只返回【正确】、【错误】所代表的选项即可，即仅返回A或B即可，无须添加任何其他的文本。
""".strip()


# English-language BrowseComp judge prompt. It is filled in with str.format()
# using `question`, `correct_answer` and `response`, and instructs the judge to
# reply with a single letter: A ([CORRECT]) or B ([INCORRECT]).
JUDGE_PROMPT_BC_en = """
Based on the given question, standard answer, and model-predicted answer, evaluate whether the model's response is correct. Your task is to classify the result as: [CORRECT] or [INCORRECT].

First, we'll list examples for each category, then you'll evaluate a new question's predicted answer.
Here are examples of [CORRECT] responses:
```
Question: What are the names of Barack Obama's children?
Standard Answer: Malia Obama and Sasha Obama
Model Prediction 1: Malia Obama and Sasha Obama
Model Prediction 2: Malia and Sasha
Model Prediction 3: Most would say Malia and Sasha, but I'm not sure, I should verify
Model Prediction 4: Barack Obama has two daughters, Malia Ann and Natasha Marian, commonly known as Malia Obama and Sasha Obama.
```
These responses are all [CORRECT] because they:
    - Fully include the important information from the standard answer.
    - Don't contain any information that contradicts the standard answer.
    - Focus only on semantic content; language, capitalization, punctuation, grammar, and order aren't important.
    - Vague statements or guesses are acceptable as long as they include the standard answer and don't contain incorrect information or contradictions.

Here are examples of [INCORRECT] responses:
```
Question: What are the names of Barack Obama's children?
Standard Answer: Malia Obama and Sasha Obama
Model Prediction 1: Malia
Model Prediction 2: Malia, Sasha and Susan or Sasha Obama or Malia Obama, or Natasha Marian, or Einstein
Model Prediction 3: While I don't know their exact names, I can tell you Barack Obama has two children.
Model Prediction 4: You might be thinking of Betsy and Olivia. But you should verify the details with the latest references. Is that the correct answer?
Model Prediction 5: Barack Obama's children
```
These responses are all [INCORRECT] because they:
    - Contain factual statements that contradict the standard answer.
    - Are empty or merely repeat the question.
    - Enumerate multiple answers or repeat the answer.

Pay special attention to the following:
- The standard answer may contain responses to multiple aspects of the question, and within the same aspect, there might be different descriptions, all of which are correct and are given in the same bracket, connected by commas. For example, for the question "What is the name of ByteDance's AI model?", the standard answer is "[[Doubao, Skylark]]":
    - Predicted answers "Doubao", "Doubao, Skylark", "Skylark", etc. are all [CORRECT].
- For standard answers containing responses to different aspects, the model needs to provide answers to all aspects to be considered correct; otherwise, it's directly judged as [INCORRECT]. There is no [PARTIALLY CORRECT] output option. These answers will be given in different brackets. For example, for the question "Who are the members of TFBOYS?", the standard answer is "[[Wang Junkai][Wang Yuan][Yi Yangqianxi]]":
    - Predicted answers like "Wang Junkai, Wang Yuan, Yi Yangqianxi" that include all answers are [CORRECT].
    - Predicted answers like "Wang Junkai, Yi Yangqianxi" that don't include all answers are [INCORRECT].

Also note the following points:
- For questions with numerical standard answers, the predicted answer should match the standard answer. For example, for the question "What is the total length in meters of the Huangpu River Bridge on the Jinshan Railway?", the standard answer is "3518.17":
    - Predicted answers "3518", "3518.1", "3518.17" are all [CORRECT].
    - Predicted answers "3520" and "3600" are [INCORRECT].
- If the model prediction doesn't directly answer the question, attempts to circumvent or fails to directly provide the standard answer, it's considered an [INCORRECT] answer.
    - For example, for the question "Who is JJ Lin's wife?", with the standard answer "Ding Wenqi", model predictions like "JJ Lin's wife", "JJ Lin's wife should be excellent", "JJ Lin's wife might be a public figure" are all [INCORRECT].
- If the standard answer contains more information than the question asks for, the predicted answer only needs to include the information mentioned in the question.
    - For example, for the question "What is the main chemical component of magnesite?", with the standard answer "Magnesium carbonate (MgCO3)", "Magnesium carbonate" or "MgCO3" are both considered [CORRECT] answers.
- If information omitted in the predicted answer can be clearly inferred from the question, it's considered correct.
    - For example, for the question "The Nuragic ruins of Barumini were listed as a World Cultural Heritage by UNESCO in 1997, so where is this site located?", with the standard answer "Sardinia, Italy", the predicted answer "Sardinia" is considered [CORRECT].
- If it's clear that different translations of a name refer to the same person, it's considered correct.
    - For example, if the standard answer is "Robinson", answers like "Lubinson" or "Lubinsun" are both correct.
- You should focus more on the match between the standard answer and the model prediction, rather than whether the standard answer itself is correct.

Below is a new question example. Please reply with only [CORRECT] or [INCORRECT], without apologies or corrections to your own errors, just evaluate the answer.
```
Question: {question}
Standard Answer: {correct_answer}
Predicted Answer: {response}
```

Evaluate this new question's predicted answer as one of the following:
A. [CORRECT]
B. [INCORRECT]

Return only the option representing [CORRECT] or [INCORRECT], i.e., just return A or B, without adding any other text.
""".strip()


async def verify_answer_browsecomp(
    question: str, target: str, predicted_answer: str
) -> str:
    """
    Use BrowseComp judge (English version) to verify if the predicted answer is correct.
    Expects the LLM to return A (correct) or B (incorrect).

    Args:
        question: The question being answered
        target: The correct/target answer
        predicted_answer: The model's predicted answer

    Returns:
        "CORRECT" for A, "INCORRECT" for B, or "NOT_ATTEMPTED" when the reply is
        empty or contains neither letter.

    Raises:
        Exception: Re-raised unchanged when the judge request itself fails.
    """

    prompt = JUDGE_PROMPT_BC_en.format(
        question=question, correct_answer=target, response=predicted_answer
    )

    # Only the network call is guarded; parsing below cannot raise once the
    # empty-content case is handled.
    try:
        response = await evaluation_llm_client.chat.completions.create(
            model="gpt-4.1-2025-04-14",
            messages=[{"role": "user", "content": prompt}],
            max_completion_tokens=2,
        )
        content = response.choices[0].message.content
    except Exception as e:
        print(f"BrowseComp evaluation failed: {e}")
        raise

    print(f"BrowseComp Judge Response: {content}")

    # Extract the first A or B from the response. A missing/empty reply is
    # treated as unparseable rather than passed to re.search (which would
    # raise TypeError on None).
    match = re.search(r"[AB]", content) if content else None
    if match:
        return "CORRECT" if match.group(0) == "A" else "INCORRECT"

    # If no clear A or B is found, return NOT_ATTEMPTED to trigger retry
    print(f"Warning: Could not parse BrowseComp judge response: {content}")
    return "NOT_ATTEMPTED"


async def verify_answer_browsecomp_zh(
    question: str, target: str, predicted_answer: str
) -> str:
    """
    Use BrowseComp judge (Chinese version) to verify if the predicted answer is correct.
    Expects the LLM to return A (correct) or B (incorrect).

    Args:
        question: The question being answered
        target: The correct/target answer
        predicted_answer: The model's predicted answer

    Returns:
        "CORRECT" for A, "INCORRECT" for B, or "NOT_ATTEMPTED" when the reply is
        empty or contains neither letter.

    Raises:
        Exception: Re-raised unchanged when the judge request itself fails.
    """

    prompt = JUDGE_PROMPT_BC_zh.format(
        question=question, correct_answer=target, response=predicted_answer
    )

    # Only the network call is guarded; parsing below cannot raise once the
    # empty-content case is handled.
    try:
        response = await evaluation_llm_client.chat.completions.create(
            model="gpt-4.1-2025-04-14",
            messages=[{"role": "user", "content": prompt}],
            max_completion_tokens=2,
        )
        content = response.choices[0].message.content
    except Exception as e:
        print(f"BrowseComp-ZH evaluation failed: {e}")
        raise

    print(f"BrowseComp-ZH Judge Response: {content}")

    # Extract the first A or B from the response. A missing/empty reply is
    # treated as unparseable rather than passed to re.search (which would
    # raise TypeError on None).
    match = re.search(r"[AB]", content) if content else None
    if match:
        return "CORRECT" if match.group(0) == "A" else "INCORRECT"

    # If no clear A or B is found, return NOT_ATTEMPTED to trigger retry
    print(f"Warning: Could not parse BrowseComp-ZH judge response: {content}")
    return "NOT_ATTEMPTED"


# ================================================
# verify_answer_xbench_deepsearch

# Prompt from XBench-Evals
# https://github.com/xbench-ai/xbench-evals/blob/main/eval_grader.py#L25
# ================================================

# Chinese-language XBench judge prompt. It is filled in with str.format() using
# `question`, `correct_answer` and `response`. The judge is asked to answer in
# three labeled fields: "最终答案" (extracted final answer), "解释" (explanation)
# and "结论" (conclusion, either '正确' = correct or '错误' = incorrect).
JUDGE_PROMPT_XBENCH = """
你是一个通用人工智能助手。根据下面给出的[正确答案], 判断以下对[原问题]的[回答]的回答是否正确。

[原问题]: {question}

[正确答案]: {correct_answer}

[回答]:{response}

你的判断必须按照以下格式和标准进行:

最终答案: 从[回答]中提取出的最终准确答案。如果[回答]中没有明确的最终答案, 则填写'无'。

解释: 根据[正确答案]解释为什么[最终答案]是正确的或错误的。只关注[最终答案]与[正确答案]之间是否存在实质性差异, 不要评论题目的背景, 不要尝试重新解题, 不要为任何不同于[正确答案]的答案辩护, 只专注于判断答案是否一致。

结论: 如果[最终答案]与上方给出的[正确答案]一致, 或者在数值题目中处于可接受的微小误差范围内, 则填写'正确'; 否则（即存在任何不一致、歧义、不等价或提取出的答案错误的情况）填写'错误'。
""".strip()


async def verify_answer_xbench_deepsearch(
    question: str, target: str, predicted_answer: str
) -> str:
    """
    Use XBench-DeepSearch judge to verify if the predicted answer is correct.

    The judge replies with labeled fields; only the "结论" (conclusion) field
    decides the verdict. The "最终答案" (final answer) field is extracted for the
    debug log only.

    Args:
        question: The question being answered
        target: The correct/target answer
        predicted_answer: The model's predicted answer

    Returns:
        "INCORRECT" immediately (no judge call) when predicted_answer is None;
        "CORRECT" / "INCORRECT" according to the judge's conclusion;
        "NOT_ATTEMPTED" when the judge call fails, returns no content, or the
        conclusion cannot be parsed.
    """

    if predicted_answer is None:
        return "INCORRECT"

    judge_prompt = JUDGE_PROMPT_XBENCH.format(
        question=question,
        correct_answer=target,
        response=predicted_answer,
    )
    try:
        response = await evaluation_llm_client.chat.completions.create(
            model="gpt-4.1-2025-04-14",
            messages=[{"role": "user", "content": judge_prompt}],
        )
        judge_response = response.choices[0].message.content
    except Exception as e:
        # Best-effort: a failed judge call is reported as NOT_ATTEMPTED, but the
        # cause is logged instead of being dropped silently.
        print(f"XBench judge failed: {e}")
        judge_response = None
    if judge_response is None:
        return "NOT_ATTEMPTED"

    # Between a label and its value, tolerate whitespace, ASCII or full-width
    # colons, and markdown asterisks (e.g. "结论: 正确", "结论：正确", "**结论:** 正确").
    # Values are taken from the capture group, so a missing colon or a colon
    # inside the value no longer breaks extraction.
    extract_match = re.search(r"最终答案[\s:：*]*(.*)", judge_response)
    extracted_answer = extract_match.group(1).strip() if extract_match else None

    correct_match = re.search(r"结论[\s:：*]*(正确|错误)", judge_response)
    verdict = correct_match.group(1) if correct_match else None

    # Print debug info
    print(f"XBench Judge - Extract: {extracted_answer}, Correct: {verdict}")

    if verdict == "正确":
        return "CORRECT"
    if verdict == "错误":
        return "INCORRECT"

    # If we can't parse the result, return NOT_ATTEMPTED to trigger retry
    print(f"Warning: Could not parse XBench judge response, correct_match={verdict}")
    return "NOT_ATTEMPTED"


# ================================================
# verify_answer_deepsearchqa
#
# Official prompt from DeepSearchQA benchmark
# https://www.kaggle.com/code/andrewmingwang/deepsearchqa-starter-code
# ================================================

# DeepSearchQA judge prompt template. It is filled in with str.format() using
# `prompt_type`, `prompt`, `answer` and `response`; the doubled braces in the
# JSON example are escapes so that they survive formatting as literal braces.
# The judge is asked to reply with a JSON object under "Answer Correctness".
JUDGE_PROMPT_DEEPSEARCHQA = """Your task is to evaluate whether a given "AI Response" for a specific "User Prompt" arrived at the correct answer.

**Answer Correctness Task**

*   **Purpose:** Assess whether the AI response provides the correct answer(s) based on the provided "Correct Answer" and "Prompt Type".
*   **Process:**
    *   Identify the "Prompt Type": "<prompt_type>".
    *   Refer to the "Correct Answer": "<answer>".
    *   Based on the "Prompt Type", determine if the "AI Response" contains the expected answer(s).
        *   **'Single Answer'**: Check if the response provides the answer that addresses the user's question. It does not have to match the exact wording of the provided answer.
        *   **'Set Answer'**: Check if the response includes *each* item from the provided ground truth answers. The order might not matter unless specified otherwise. The response might include more answers than the list. Determine the correctness *only* based on the list first and then check if the response includes answers not in the list.
    *   **Explanation:** Provide a brief explanation justifying your assessment of answer correctness, referencing specific parts of the AI response and the correct answer.
    *   **Correctness Details:** Provide a dictionary, one key for each expected answer part, and value is a boolean indicating whether each expected answer part was found.
        *   For 'Set Answer', this will be a list of attributes, one for each item/part in the "Correct Answer". Each key will be a string indicating the expected answer part, and the value will be a boolean indicating whether that part was found in the response.
    *   **Excessive Answers:** Provide a list of strings, each indicating an excessive answer part. If the response provides answers that are **not** in the "Correct Answer" list, add these answers as excessive answers. Return an empty list when there's no excessive answers in the response.


**Output Format:**

Your evaluation *must* be structured as a nested JSON dictionary with the following top-level keys: `"Answer Correctness"`. Please return NULL if any of "Prompt", "AI Response" or "Correct Answer" is empty.
The value for `"Answer Correctness"` should be a dictionary containing `"Explanation"` (a string), `"Correctness Details"` (a dictionary where each key is the expected correct answer, and the value is a boolean indicating whether the response contains the correct answer), and `"Excessive Answers"` (a list of strings indicating the excessive answers).

Make sure you return a valid JSON string. Pay special attention to quotes, commas and special characters in the JSON string. Make sure to escape all special characters and quotes in the JSON string.


**Example (Partial):**

"```json
{{
  "Answer Correctness": {{
    "Explanation": "The response correctly identified Belgium and France but also includes an excessive answer, Italy.",
    "Correctness Details": {{
      "Belgium": true,
      "France": true,
    }},
    "Excessive Answers": [ "Italy" ]
  }}
}}
```"

**Now, proceed with the evaluation using the provided User Prompt, AI Response, and Correct Answer.**

User Prompt (Wrapped in <prompt> and </prompt>):
<prompt>
{prompt}
</prompt>
--------------------
**  Correct Answer (Wrapped in <answer> and </answer>):
Prompt Type: {prompt_type}
<answer>
{answer}
</answer>
--------------------
AI assistant response (Wrapped in <response> and </response>):
<response>
{response}
</response>

--------------------
Rating:"""


def _load_deepsearchqa_judge_json(judge_response: str) -> Optional[Dict[str, Any]]:
    """
    Locate and decode the JSON object inside a DeepSearchQA judge reply.

    Args:
        judge_response: Raw text returned by the judge

    Returns:
        The decoded JSON object, or None when no ``{...}`` span is present.

    Raises:
        json.JSONDecodeError: If the located span is not valid JSON even after
            the trailing-comma repair.
    """
    # Prefer a fenced ```json block; otherwise take the outermost {...} span.
    fenced = re.search(r"```json\s*(\{.*?\})\s*```", judge_response, re.DOTALL)
    if fenced:
        json_str = fenced.group(1)
    else:
        bare = re.search(r"\{.*\}", judge_response, re.DOTALL)
        if bare is None:
            return None
        json_str = bare.group(0)

    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        # The judge prompt's own example ends a dict with a trailing comma, and
        # a judge that copies it produces invalid JSON. Retry once with commas
        # that directly precede a closing brace/bracket removed; a second
        # failure propagates to the caller.
        return json.loads(re.sub(r",(\s*[}\]])", r"\1", json_str))


async def verify_answer_deepsearchqa(
    question: str,
    target: str,
    predicted_answer: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> tuple[str, str, Optional[Dict[str, Any]]]:
    """
    Use DeepSearchQA-specific judge to verify if the predicted answer is correct.
    Uses the official DeepSearchQA evaluation prompt with JSON output format.

    Args:
        question: The question being answered
        target: The correct/target answer
        predicted_answer: The model's predicted answer
        metadata: Optional metadata dict with additional context (e.g., problem_category, answer_type)

    Returns:
        Tuple of (result, judge_type, details_dict):
        - result: "CORRECT", "INCORRECT", or "NOT_ATTEMPTED"
        - judge_type: "deepsearchqa_judge"
        - details_dict: None unless result is "CORRECT" or "INCORRECT" from the
          judge; otherwise a dict with keys:
            - correctness_details: Dict[str, bool] mapping answer parts to correctness
            - excessive_answers: List[str] of extra answers not in ground truth
            - explanation: str explaining the judgment
            - num_correct: int number of correct answer parts
            - num_expected: int total number of expected answer parts
            - num_excessive: int number of excessive answers
    """
    judge_type = "deepsearchqa_judge"

    if predicted_answer is None:
        return "INCORRECT", judge_type, None

    # Determine prompt_type from metadata; anything other than "Set Answer"
    # falls back to the default "Single Answer".
    prompt_type = "Single Answer"
    if metadata and metadata.get("answer_type") == "Set Answer":
        prompt_type = "Set Answer"

    judge_prompt = JUDGE_PROMPT_DEEPSEARCHQA.format(
        prompt_type=prompt_type,
        prompt=question,
        answer=target,
        response=predicted_answer,
    )

    try:
        response = await evaluation_llm_client.chat.completions.create(
            model="gpt-4.1-2025-04-14",
            messages=[{"role": "user", "content": judge_prompt}],
        )
        judge_response = response.choices[0].message.content
    except Exception as e:
        print(f"DeepSearchQA judge failed: {e}")
        return "NOT_ATTEMPTED", judge_type, None

    if judge_response is None:
        return "NOT_ATTEMPTED", judge_type, None

    # Parse JSON response
    try:
        result = _load_deepsearchqa_judge_json(judge_response)
        if result is None:
            print("Warning: Could not find JSON in DeepSearchQA judge response")
            return "NOT_ATTEMPTED", judge_type, None

        # `or` (rather than a .get default) also covers fields the judge set to
        # JSON null, which would otherwise raise in len()/.values() below.
        answer_correctness = result.get("Answer Correctness") or {}
        explanation = answer_correctness.get("Explanation") or ""
        correctness_details = answer_correctness.get("Correctness Details") or {}
        excessive_answers = answer_correctness.get("Excessive Answers") or []

        # Calculate statistics
        num_expected = len(correctness_details)
        num_correct = sum(1 for found in correctness_details.values() if found)
        num_excessive = len(excessive_answers)

        # Print debug info
        print(
            f"DeepSearchQA Judge - Correct: {num_correct}/{num_expected}, Excessive: {num_excessive}"
        )
        print(f"DeepSearchQA Judge - Explanation: {explanation}")

        # No correctness details, can't determine
        if not correctness_details:
            return "NOT_ATTEMPTED", judge_type, None

        details = {
            "correctness_details": correctness_details,
            "excessive_answers": excessive_answers,
            "explanation": explanation,
            "num_correct": num_correct,
            "num_expected": num_expected,
            "num_excessive": num_excessive,
        }

        # Following official logic: all expected parts must be found, and no
        # excessive answers.
        if all(correctness_details.values()) and not excessive_answers:
            return "CORRECT", judge_type, details
        return "INCORRECT", judge_type, details

    except json.JSONDecodeError as e:
        print(f"Warning: Failed to parse JSON from DeepSearchQA judge: {e}")
        print(f"Response: {judge_response[:200]}...")
        return "NOT_ATTEMPTED", judge_type, None
    except Exception as e:
        # Unexpected payload shape (e.g. a list where a dict was expected).
        print(f"Warning: Error processing DeepSearchQA judge response: {e}")
        return "NOT_ATTEMPTED", judge_type, None


# ================================================
# verify_answer_for_datasets
# ================================================


async def _verify_answer_for_datasets_core(
    benchmark_name: str,
    question: str,
    target: str,
    predicted_answer: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> tuple[str, str, Optional[Dict[str, Any]]]:
    """
    Route one (question, target, prediction) triple to the judge for its benchmark.

    Args:
        benchmark_name: Name of the benchmark dataset
        question: The question being answered
        target: The correct/target answer
        predicted_answer: The model's predicted answer
        metadata: Optional metadata dict; only forwarded to the DeepSearchQA judge

    Returns:
        A tuple of (result, judge_type, details_dict).
        details_dict is whatever the DeepSearchQA judge returned for "deepsearchqa"
        and None for every other benchmark.
    """

    # Exact string equality short-circuits the judge, except for DeepSearchQA,
    # which always goes through its own detailed evaluation.
    if benchmark_name != "deepsearchqa" and predicted_answer == target:
        return "CORRECT", "exact_match", None

    # DeepSearchQA: metadata-aware judge that also returns per-part details.
    if benchmark_name == "deepsearchqa":
        verdict, judge_name, extra = await verify_answer_deepsearchqa(
            question, target, predicted_answer, metadata
        )
        return verdict, judge_name, extra

    # BrowseComp has separate English and Chinese judges.
    if benchmark_name == "browsecomp":
        verdict = await verify_answer_browsecomp(question, target, predicted_answer)
        return verdict, "browsecomp_judge", None

    if benchmark_name == "browsecomp_zh":
        verdict = await verify_answer_browsecomp_zh(
            question, target, predicted_answer
        )
        return verdict, "browsecomp_zh_judge", None

    if benchmark_name == "xbench_deepsearch":
        verdict = await verify_answer_xbench_deepsearch(
            question, target, predicted_answer
        )
        return verdict, "xbench_deepsearch_judge", None

    # simpleqa and collect_trace share the SimpleQA judge.
    if benchmark_name in ("simpleqa", "collect_trace"):
        verdict = await verify_answer_simpleqa(question, target, predicted_answer)
        return verdict, "simpleqa_judge", None

    # Any benchmark whose name contains "hle" (hle, hle-text-500, hle-text-2158).
    if "hle" in benchmark_name:
        verdict = await verify_answer_hle(question, target, predicted_answer)
        return verdict, "hle_judge", None

    # Everything else -- gaia-validation, gaia-validation-text-103, webwalkerqa,
    # frames, seal-0 and any unlisted benchmark -- uses the LLM-as-judge
    # equivalence prompt. gaia-validation deliberately does not use the
    # rule-based verify_answer_gaia scorer: it was found to label many correct
    # answers as incorrect, so the LLM judge is considered more representative.
    verdict = await verify_answer_gaia_validation_text_103(
        question, target, predicted_answer
    )
    return verdict, "gaia_validation_text_103_judge", None


async def verify_answer_for_datasets(
    benchmark_name: str,
    question: str,
    target: str,
    predicted_answer: str,
    metadata: Optional[Dict[str, Any]] = None,
    max_retries: int = 10,
    retry_interval: int = 5,
) -> tuple[str, str, Optional[Dict[str, Any]]]:
    """
    Run the benchmark judge, repeating while it answers NOT_ATTEMPTED.

    Args:
        benchmark_name: Name of the benchmark dataset
        question: The question being answered
        target: The correct/target answer
        predicted_answer: The model's predicted answer
        metadata: Optional metadata dict with additional context
        max_retries: Maximum number of judge invocations
        retry_interval: Seconds to sleep between invocations

    Returns:
        A tuple of (result, judge_type, details_dict) from the first invocation
        whose result is not "NOT_ATTEMPTED". If every invocation is
        NOT_ATTEMPTED (or none is made), returns
        ("NOT_ATTEMPTED", "retry_wrapper", None).
    """
    final_attempt = max_retries
    for attempt_no in range(1, max_retries + 1):
        verdict, judge_name, extra = await _verify_answer_for_datasets_core(
            benchmark_name, question, target, predicted_answer, metadata
        )

        # Any definitive verdict ends the loop immediately.
        if verdict != "NOT_ATTEMPTED":
            return verdict, judge_name, extra

        # No point announcing or sleeping after the last try.
        if attempt_no == final_attempt:
            break

        print(
            f"[Retry {attempt_no}/{max_retries}] Got NOT_ATTEMPTED, retrying in {retry_interval}s..."
        )
        await asyncio.sleep(retry_interval)

    # Ran out of attempts without a definitive verdict.
    print(f"All {max_retries} attempts resulted in NOT_ATTEMPTED.")
    return "NOT_ATTEMPTED", "retry_wrapper", None


================================================
FILE: apps/miroflow-agent/benchmarks/evaluators/extract_futurex_results.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import argparse
import json
import os
from collections import Counter, defaultdict
from typing import Dict, List, Tuple


def majority_vote(
    preds: List[str], first_seen_order: Dict[str, int]
) -> Tuple[str, Dict[str, int]]:
    """
    Pick the most frequent prediction from a list of candidates.

    Ties on frequency are resolved deterministically:
      1) The candidate with the smallest index in first_seen_order wins
         (candidates missing from the mapping rank last).
      2) If that still ties, the lexicographically smallest candidate wins.

    Returns:
      (chosen_prediction, counts_dict), where counts_dict maps every distinct
      prediction to its number of votes.
    """
    tally = Counter(preds)
    top_votes = max(tally.values())
    leaders = [cand for cand, votes in tally.items() if votes == top_votes]

    if len(leaders) > 1:
        # Rank tied candidates by (first-seen index, text) and keep the smallest.
        winner = min(
            leaders,
            key=lambda cand: (first_seen_order.get(cand, float("inf")), cand),
        )
    else:
        winner = leaders[0]

    # The raw counts are returned for optional debugging/inspection.
    return winner, dict(tally)


def discover_runs(results_dir: str) -> List[str]:
    """
    List the immediate subdirectories of results_dir that contain a
    'benchmark_results.jsonl' file.

    Subdirectory names are not required to start with 'run_'. Entries are
    visited in sorted name order so that downstream processing is deterministic.
    """
    candidates = (
        os.path.join(results_dir, entry) for entry in sorted(os.listdir(results_dir))
    )
    return [
        run_path
        for run_path in candidates
        if os.path.isdir(run_path)
        and os.path.isfile(os.path.join(run_path, "benchmark_results.jsonl"))
    ]


def parse_args() -> argparse.Namespace:
    """
    Parse the command line: a required results directory and an optional
    -o/--output path (None when not given).
    """
    cli = argparse.ArgumentParser(
        description="Aggregate multiple run_*/benchmark_results.jsonl files and produce a FutureX submission with majority voting."
    )

    # Positional: where the per-run result folders live.
    cli.add_argument(
        "results_dir",
        help="Path to results dir containing run_*/benchmark_results.jsonl",
    )

    # Optional: explicit destination for the aggregated submission file.
    cli.add_argument(
        "-o",
        "--output",
        default=None,
        help="Output JSONL file path (default: <results_dir>/futurex_submission.jsonl)",
    )

    namespace = cli.parse_args()
    return namespace


def main() -> None:
    """
    Aggregate predictions from every run directory and write the submission.

    Steps:
      1) Resolve the results directory and the output path (defaults to
         <results_dir>/futurex_submission.jsonl).
      2) Read each run's 'benchmark_results.jsonl' line by line, collecting
         the "model_boxed_answer" of every record under its "task_id".
      3) For each task_id (in sorted order) pick a prediction via
         majority_vote and write {"id", "prediction"} as one JSON line.

    Lines that are blank, are not valid JSON, or decode to something other
    than a JSON object are skipped. Records with a missing/empty task_id or
    a missing/blank prediction are skipped as well.

    Raises:
      FileNotFoundError: if the results directory does not exist or contains
        no run directory with a 'benchmark_results.jsonl'.
    """
    args = parse_args()

    results_dir = os.path.abspath(args.results_dir)
    if not os.path.isdir(results_dir):
        raise FileNotFoundError(f"Results dir not found: {results_dir}")

    output_file = (
        os.path.abspath(args.output)
        if args.output
        else os.path.join(results_dir, "futurex_submission.jsonl")
    )

    # Maps task_id -> list of predictions collected across runs
    preds_by_task: Dict[str, List[str]] = defaultdict(list)

    # Track first-seen order index for each distinct prediction string across all runs.
    # This enables deterministic tie-breaking.
    first_seen_order: Dict[str, int] = {}
    next_order_idx = 0

    runs = discover_runs(results_dir)
    if not runs:
        raise FileNotFoundError(
            f"No run directories with 'benchmark_results.jsonl' found under: {results_dir}"
        )

    total_lines = 0
    used_lines = 0

    # Read and aggregate predictions
    for run_dir in runs:
        fpath = os.path.join(run_dir, "benchmark_results.jsonl")
        print(f"Reading: {fpath}")
        with open(fpath, "r", encoding="utf-8") as fin:
            for line in fin:
                # Every physical line counts, including blank/invalid ones.
                total_lines += 1
                line = line.strip()
                if not line:
                    continue
                try:
                    rec = json.loads(line)
                except json.JSONDecodeError:
                    # Skip malformed JSON lines, but keep going
                    continue

                # A line can be valid JSON without being an object (e.g. a
                # list, string, number or null); such values have no .get()
                # and would otherwise abort the whole aggregation.
                if not isinstance(rec, dict):
                    continue

                task_id = rec.get("task_id")
                pred = rec.get("model_boxed_answer")

                # Only accept non-empty strings; coerce to str for safety
                if task_id and pred is not None and str(pred).strip():
                    pred_str = str(pred).strip()
                    preds_by_task[task_id].append(pred_str)
                    if pred_str not in first_seen_order:
                        first_seen_order[pred_str] = next_order_idx
                        next_order_idx += 1
                    used_lines += 1

    # Write submission JSONL
    # We sort task_ids to keep output reproducible.
    num_tasks = 0
    with open(output_file, "w", encoding="utf-8") as out:
        for task_id in sorted(preds_by_task.keys()):
            voted_pred, _counts = majority_vote(
                preds_by_task[task_id], first_seen_order
            )
            out.write(
                json.dumps(
                    {"id": task_id, "prediction": voted_pred}, ensure_ascii=False
                )
                + "\n"
            )
            num_tasks += 1

    # Optional: small summary to stdout
    print(f"Collected from {len(runs)} run(s).")
    print(f"Read {total_lines} line(s), accepted {used_lines} record(s).")
    print(f"Aggregated {num_tasks} unique task_id(s).")
    print(f"✅ Submission saved to {output_file}")


# Run the aggregation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


================================================
FILE: apps/miroflow-agent/benchmarks/subset_extraction/gaia-text-103-grader.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
GAIA-Text-103 Task Grader

This script:
1. Loads extracted GAIA-Text-103 tasks from the extraction directory
2. Grades each task using the GAIA-Text-103 evaluator (LLM judgement)
3. Updates the original task files with grading results

Usage:
    uv run benchmarks/subset_extraction/gaia-text-103-grader.py /path/to/extraction/directory
"""

import argparse
import asyncio
import json
import os
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional

# Add the benchmarks directory to the path to import evaluators
sys.path.append(str(Path(__file__).parent.parent))
from evaluators.eval_utils import verify_answer_gaia_validation_text_103


@dataclass
class GradingResult:
    """Result of grading a single task.

    Instances are created by ``GAIAText103Grader.grade_single_task`` from the
    dictionary produced by ``GAIAText103Grader.extract_task_info`` and are
    later written back to the task file by ``update_original_files``.
    """

    # Value of the task file's "task_id" field ("" when absent).
    task_id: str
    # Value of the task file's "run_name" field ("" when absent).
    run_name: str
    # Path of the task JSON file this result belongs to; the file is rewritten in place.
    file_path: str
    # Task description taken from the file's input.task_description field.
    question: str
    # Reference answer taken from the file's "ground_truth" field.
    ground_truth: str
    # Model answer taken from the file's "final_boxed_answer" field.
    predicted_answer: str
    # Value returned by the evaluator, or "ERROR" when the evaluator raised.
    judge_result: str
    # Identifier of the scorer; also used as the "already graded" marker in task files.
    judge_type: str = "gaia_validation_text_103_scorer"
    # Wall-clock duration of the grading call, in seconds (time.time() difference).
    grading_time: float = 0.0
    # str() of the exception raised during grading; empty when grading succeeded.
    error_message: str = ""


class GAIAText103Grader:
    """Grader for GAIA-Text-103 tasks using LLM judgement.

    Typical flow: ``grade_all_tasks`` discovers and grades the task files,
    ``update_original_files`` writes the verdicts back into those files and
    ``print_summary`` reports the counters kept in ``self.stats``.
    """

    # Marker written to a task file's "judge_type" field once this grader processed it.
    JUDGE_TYPE = "gaia_validation_text_103_scorer"
    # Judge result recorded when the evaluator raised instead of returning a verdict.
    ERROR_RESULT = "ERROR"

    def __init__(self, extraction_dir: str):
        """
        Initialize the grader

        Args:
            extraction_dir: Directory containing extracted GAIA-Text-103 tasks
        """
        self.extraction_dir = Path(extraction_dir)
        self.results: List[GradingResult] = []
        self.stats = {
            "total_tasks": 0,
            "graded_tasks": 0,
            "errors": 0,
            "total_grading_time": 0.0,
        }

    def find_task_files(self) -> List[Path]:
        """Find all task JSON files in the extraction directory.

        Returns:
            Sorted list of every ``task_*.json`` path found by walking the
            extraction directory recursively.
        """
        task_files = [
            Path(root) / name
            for root, _dirs, names in os.walk(self.extraction_dir)
            for name in names
            if name.startswith("task_") and name.endswith(".json")
        ]
        return sorted(task_files)

    def extract_task_info(self, task_file: Path) -> Optional[Dict]:
        """Extract task information from a task file.

        Args:
            task_file: Path of the task JSON file to read.

        Returns:
            A dict with the keys ``task_id``, ``run_name``, ``file_path``,
            ``question``, ``ground_truth`` and ``predicted_answer``, or None
            when the file is unreadable, lacks a required field, or already
            carries a verdict from this scorer.
        """
        try:
            with open(task_file, "r", encoding="utf-8") as f:
                task_data = json.load(f)

            # Skip files that already hold a verdict from this scorer. A file
            # whose previous attempt ended in ERROR has no verdict yet, so it
            # is graded again instead of being skipped forever.
            already_processed = task_data.get("judge_type") == self.JUDGE_TYPE
            previous_failed = (
                task_data.get("final_judge_result") == self.ERROR_RESULT
            )
            if already_processed and not previous_failed:
                print(f"Skipping already graded task: {task_file.name}")
                return None

            # Extract basic information ("input" may be missing or null)
            task_info = {
                "task_id": task_data.get("task_id", ""),
                "run_name": task_data.get("run_name", ""),
                "file_path": str(task_file),
                "question": (task_data.get("input") or {}).get(
                    "task_description", ""
                ),
                "ground_truth": task_data.get("ground_truth", ""),
                "predicted_answer": task_data.get("final_boxed_answer", ""),
            }

            # Validate required fields
            if not all(
                [
                    task_info["question"],
                    task_info["ground_truth"],
                    task_info["predicted_answer"],
                ]
            ):
                print(f"Warning: Missing required fields in {task_file}")
                print(f"  question: {task_info['question']}")
                print(f"  ground_truth: {task_info['ground_truth']}")
                print(f"  predicted_answer: {task_info['predicted_answer']}")
                return None

            return task_info

        except Exception as e:
            # Best effort: one unreadable file must not stop the whole run.
            print(f"Error reading task file {task_file}: {e}")
            return None

    async def grade_single_task(self, task_info: Dict) -> GradingResult:
        """Grade a single task using GAIA-Text-103 evaluator.

        Args:
            task_info: Dict as produced by ``extract_task_info``.

        Returns:
            A GradingResult. When the evaluator raises, the exception is not
            propagated: ``judge_result`` is set to "ERROR", ``error_message``
            holds the exception text and the "errors" counter is incremented.
        """
        start_time = time.time()

        result = GradingResult(
            task_id=task_info["task_id"],
            run_name=task_info["run_name"],
            file_path=task_info["file_path"],
            question=task_info["question"],
            ground_truth=task_info["ground_truth"],
            predicted_answer=task_info["predicted_answer"],
            judge_result="",
            judge_type=self.JUDGE_TYPE,
        )

        try:
            # Use the GAIA-Text-103 evaluator
            judge_result = await verify_answer_gaia_validation_text_103(
                question=task_info["question"],
                target=task_info["ground_truth"],
                predicted_answer=task_info["predicted_answer"],
            )

            result.judge_result = judge_result
            result.grading_time = time.time() - start_time

            print(
                f"Task {task_info['task_id']} ({task_info['run_name']}): {judge_result}"
            )

        except Exception as e:
            result.error_message = str(e)
            result.judge_result = self.ERROR_RESULT
            result.grading_time = time.time() - start_time
            self.stats["errors"] += 1
            print(f"Error grading task {task_info['task_id']}: {e}")

        return result

    async def grade_all_tasks(self, max_concurrent: int = 5) -> List[GradingResult]:
        """Grade all tasks with concurrent processing.

        Args:
            max_concurrent: Maximum number of grading calls in flight at once.

        Returns:
            The GradingResult objects of all tasks that were processed,
            including those whose grading ended in "ERROR". The same list is
            stored in ``self.results``.

        Raises:
            ValueError: if there are tasks to grade and ``max_concurrent`` is
                smaller than 1.
        """
        task_files = self.find_task_files()
        print(f"Found {len(task_files)} task files to grade")

        # Extract task information
        task_infos = []
        for task_file in task_files:
            task_info = self.extract_task_info(task_file)
            if task_info:
                task_infos.append(task_info)

        self.stats["total_tasks"] = len(task_infos)
        print(f"Extracted {len(task_infos)} valid tasks for grading")

        if not task_infos:
            print("No valid tasks found for grading")
            return []

        # A semaphore created with 0 permits would block every grading
        # coroutine forever, so reject such values up front.
        if max_concurrent < 1:
            raise ValueError(f"max_concurrent must be >= 1, got {max_concurrent}")

        # Grade tasks with concurrency control
        semaphore = asyncio.Semaphore(max_concurrent)

        async def grade_with_semaphore(task_info):
            async with semaphore:
                return await self.grade_single_task(task_info)

        # Create tasks for concurrent execution
        tasks = [grade_with_semaphore(task_info) for task_info in task_infos]

        # Execute all grading tasks
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Filter out exceptions and collect valid results
        valid_results = []
        for i, result in enumerate(results):
            # BaseException also covers asyncio.CancelledError, which gather()
            # can hand back here and which is not an Exception subclass.
            if isinstance(result, BaseException):
                print(f"Exception in task {i}: {result}")
                self.stats["errors"] += 1
                continue

            valid_results.append(result)
            self.stats["total_grading_time"] += result.grading_time
            # Failed gradings were already counted under "errors" by
            # grade_single_task; do not report them as successfully graded.
            if result.judge_result != self.ERROR_RESULT:
                self.stats["graded_tasks"] += 1

        self.results = valid_results
        return valid_results

    def update_original_files(self):
        """Update original task files with grading results.

        For every entry in ``self.results`` the task file is re-read, the
        fields ``final_judge_result``, ``judge_type`` and ``grading_time`` are
        set (plus ``grading_error`` when grading failed) and the file is
        rewritten. Failures are reported per file and do not stop the loop.
        """
        updated_count = 0

        for result in self.results:
            try:
                # Read original file
                with open(result.file_path, "r", encoding="utf-8") as f:
                    task_data = json.load(f)

                # Add grading information
                task_data["final_judge_result"] = result.judge_result
                task_data["judge_type"] = result.judge_type
                task_data["grading_time"] = result.grading_time

                if result.error_message:
                    task_data["grading_error"] = result.error_message
                else:
                    # Drop the error left behind by an earlier failed attempt.
                    task_data.pop("grading_error", None)

                # Serialize before opening for writing: opening with "w"
                # truncates the file, so a serialization failure afterwards
                # would destroy the original task data.
                serialized = json.dumps(task_data, indent=2, ensure_ascii=False)

                # Write back to file
                with open(result.file_path, "w", encoding="utf-8") as f:
                    f.write(serialized)

                updated_count += 1

            except Exception as e:
                print(f"Error updating file {result.file_path}: {e}")

        print(f"Updated {updated_count} original task files with grading results")

    def print_summary(self):
        """Print grading summary (task, success and error counters)."""
        print("\n" + "=" * 60)
        print("GAIA-Text-103 Grading Summary")
        print("=" * 60)

        print(f"Total tasks found: {self.stats['total_tasks']}")
        print(f"Successfully graded: {self.stats['graded_tasks']}")
        print(f"Errors: {self.stats['errors']}")
        print("=" * 60)


async def main():
    """Command-line entry point of the GAIA-Text-103 grader.

    Parses the arguments, grades every task found under the extraction
    directory and writes the verdicts back into the task files.

    Returns:
        0 when at least one task was graded and the files were updated,
        1 when the directory is missing, nothing was graded, the run was
        interrupted or an unexpected error occurred.
    """
    cli = argparse.ArgumentParser(
        description="Grade GAIA-Text-103 tasks using LLM judgement"
    )
    cli.add_argument(
        "extraction_dir", help="Directory containing extracted GAIA-Text-103 tasks"
    )
    cli.add_argument(
        "--max-concurrent",
        type=int,
        default=5,
        help="Maximum number of concurrent grading tasks (default: 5)",
    )
    options = cli.parse_args()
    extraction_dir = options.extraction_dir
    concurrency = options.max_concurrent

    # Bail out early when the input directory does not exist
    if not os.path.exists(extraction_dir):
        print(f"Error: Extraction directory not found: {extraction_dir}")
        return 1

    print(f"Extraction directory: {extraction_dir}")
    print(f"Max concurrent tasks: {concurrency}")
    print()

    grader = GAIAText103Grader(extraction_dir)

    try:
        print("Starting grading process...")
        graded = await grader.grade_all_tasks(max_concurrent=concurrency)

        if not graded:
            print("❌ No tasks were graded successfully")
            return 1

        # Persist the verdicts into the task files, then report
        grader.update_original_files()
        grader.print_summary()

        print("\n✅ Grading completed successfully!")
        print("📝 Original task files updated with grading results")

    except KeyboardInterrupt:
        print("\nGrading interrupted by user")
        return 1
    except Exception as e:
        print(f"Error during grading: {e}")
        return 1

    return 0


if __name__ == "__main__":
    # asyncio.run drives the async main(); its integer return value is then
    # used as the process exit status.
    exit_code = asyncio.run(main())
    sys.exit(exit_code)


================================================
FILE: apps/miroflow-agent/benchmarks/subset_extraction/gaia-to-text-103-mover.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
GAIA to Text-103 Task Copier

This script:
1. Loads GAIA validation logs from a specified directory
2. Identifies tasks that belong to GAIA-Text-103 dataset
3. Copies those tasks to a new directory structure maintaining the original layout
"""

import argparse
import json
import os
import shutil
from pathlib import Path
from typing import Set


class GAIAtoText103Copier:
    """Copy GAIA-Text-103 tasks from GAIA validation logs.

    The set of GAIA-Text-103 task IDs is loaded from a JSONL data file at
    construction time; ``copy_gaia_text_103_tasks`` then copies every matching
    ``task_<id>...json`` log file into the output directory.
    """

    def __init__(self, gaia_text_103_data_path: str, output_dir: str):
        """
        Initialize the copier

        Args:
            gaia_text_103_data_path: Path to GAIA-Text-103 standardized data file
            output_dir: Directory to save copied tasks

        Raises:
            FileNotFoundError: if the data file does not exist
        """
        self.gaia_text_103_data_path = gaia_text_103_data_path
        self.output_dir = Path(output_dir)
        self.gaia_text_103_task_ids: Set[str] = set()
        self.copied_count = 0

        # Load GAIA-Text-103 task IDs
        self._load_gaia_text_103_tasks()

    def _load_gaia_text_103_tasks(self):
        """Load task IDs from GAIA-Text-103 dataset.

        Reads the data file as JSONL (one JSON object per non-blank line) and
        collects every truthy "task_id" value into
        ``self.gaia_text_103_task_ids``.
        """
        print(f"Loading GAIA-Text-103 task IDs from {self.gaia_text_103_data_path}")

        if not os.path.exists(self.gaia_text_103_data_path):
            raise FileNotFoundError(
                f"GAIA-Text-103 data file not found: {self.gaia_text_103_data_path}"
            )

        with open(self.gaia_text_103_data_path, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    task_data = json.loads(line)
                    task_id = task_data.get("task_id")
                    if task_id:
                        self.gaia_text_103_task_ids.add(task_id)

        print(f"Loaded {len(self.gaia_text_103_task_ids)} GAIA-Text-103 task IDs")

    @staticmethod
    def _task_id_from_filename(filename: str) -> str:
        """Return the ID segment of a ``task_<id>[_...].json`` name, or ""."""
        prefix, suffix = "task_", ".json"
        if not (filename.startswith(prefix) and filename.endswith(suffix)):
            return ""
        # Example: task_5188369a-3bbe-43d8-8b94-11558f909a08_attempt_1_format_retry_0_2025-08-06T21-14-23-770872Z.json
        # The ID is everything between the prefix and the next underscore
        # (or the ".json" suffix when no further underscore follows).
        return filename[len(prefix) : -len(suffix)].split("_", 1)[0]

    def copy_gaia_text_103_tasks(self, gaia_logs_dir: str) -> int:
        """
        Copy GAIA-Text-103 tasks from GAIA validation logs

        The path of each copied file relative to the parent of
        ``gaia_logs_dir`` is preserved under the output directory, so the
        logs directory's own name is the top-level folder of the copy.

        Args:
            gaia_logs_dir: Directory containing GAIA validation logs

        Returns:
            Number of copied tasks
        """
        print(f"Copying GAIA-Text-103 tasks from {gaia_logs_dir}")

        # Find all task JSON files in the logs directory (including in run subdirectories)
        task_files = [
            os.path.join(root, name)
            for root, _dirs, names in os.walk(gaia_logs_dir)
            for name in names
            if name.startswith("task_") and name.endswith(".json")
        ]

        print(f"Found {len(task_files)} task files to process")

        # Normalize first: os.path.dirname("logs/gaia-validation/") would
        # return the logs directory itself, which silently drops the top-level
        # folder from the copied layout whenever the argument ends with a
        # path separator.
        source_root = os.path.dirname(os.path.normpath(gaia_logs_dir)) or os.curdir

        copied_count = 0

        for task_file in task_files:
            try:
                task_id = self._task_id_from_filename(os.path.basename(task_file))
                if not task_id or task_id not in self.gaia_text_103_task_ids:
                    continue

                # Preserve the original directory structure in the output
                relative_path = os.path.relpath(task_file, source_root)
                output_file = self.output_dir / relative_path
                output_file.parent.mkdir(parents=True, exist_ok=True)

                # Copy the file (content and metadata)
                shutil.copy2(task_file, output_file)

                # Count only after the copy succeeded so that the reported
                # total is not inflated by failed copies.
                copied_count += 1

                if copied_count % 50 == 0:
                    print(f"Copied {copied_count} tasks...")

            except Exception as e:
                # Best effort: report the failing file and keep going.
                print(f"Error processing {task_file}: {e}")
                continue

        print(f"Successfully copied {copied_count} GAIA-Text-103 tasks")
        self.copied_count = copied_count
        return copied_count

    def print_summary(self):
        """Print copying summary to console"""
        print("\n" + "=" * 60)
        print("GAIA-Text-103 Task Copying Summary")
        print("=" * 60)
        print(f"Total Tasks Copied: {self.copied_count}")
        print(f"Output Directory: {self.output_dir}")
        print("=" * 60)


def main():
    """Command-line entry point of the GAIA-Text-103 task copier.

    Returns:
        0 on success (including the case where no matching task was found),
        1 when an input path is missing or an error occurred while copying.
    """
    cli = argparse.ArgumentParser(
        description="Copy GAIA-Text-103 tasks from GAIA validation logs"
    )
    cli.add_argument("gaia_logs_dir", help="Directory containing GAIA validation logs")
    cli.add_argument(
        "--gaia_text_103_data",
        default="../../data/gaia-2023-validation-text-103/standardized_data.jsonl",
        help="Path to GAIA-Text-103 standardized data file",
    )
    cli.add_argument(
        "--output-dir",
        help="Output directory for copied tasks (default: side by side with gaia-validation)",
    )
    options = cli.parse_args()

    logs_dir = options.gaia_logs_dir
    data_file = options.gaia_text_103_data

    # Default output: a "gaia-text-103-extraction" directory next to the logs
    # directory, whatever the logs directory is called.
    output_dir = options.output_dir
    if not output_dir:
        output_dir = str(Path(logs_dir).parent / "gaia-text-103-extraction")

    # Validate inputs
    if not os.path.exists(logs_dir):
        print(f"Error: GAIA logs directory not found: {logs_dir}")
        return 1

    if not os.path.exists(data_file):
        print(f"Error: GAIA-Text-103 data file not found: {data_file}")
        return 1

    print(f"Input GAIA logs directory: {logs_dir}")
    print(f"Output directory: {output_dir}")
    print(f"GAIA-Text-103 data file: {data_file}")
    print()

    try:
        copier = GAIAtoText103Copier(data_file, output_dir)
        copied = copier.copy_gaia_text_103_tasks(logs_dir)

        if copied == 0:
            print("No GAIA-Text-103 tasks found in the logs directory")
        else:
            copier.print_summary()

        return 0

    except Exception as e:
        print(f"Error: {e}")
        return 1


if __name__ == "__main__":
    exit_code = main()
    # Raise SystemExit directly instead of calling the exit() helper: that
    # helper is injected by the site module for interactive use and is not
    # available when Python runs without site (e.g. "python -S").
    raise SystemExit(exit_code)


================================================
FILE: apps/miroflow-agent/conf/__init__.py
================================================
# This file makes the conf directory a Python package


================================================
FILE: apps/miroflow-agent/conf/agent/default.yaml
================================================
# conf/agent/default.yaml
# The names of tools and sub-agents are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
main_agent:
  tools:
    - tool-python
    - tool-vqa
    - tool-transcribe
    - tool-reasoning
    - tool-reader
  max_turns: 20  # Maximum number of turns for main agent execution

sub_agents:
  agent-browsing:
    tools:
      - tool-google-search
      - tool-vqa
      - tool-reader
      - tool-python
    max_turns: 20

# Settings for context management
keep_tool_result: -1
context_compress_limit: 0  # Enable context compression (>0 = enabled, 0 = disabled).

================================================
FILE: apps/miroflow-agent/conf/agent/demo.yaml
================================================
# conf/agent/demo.yaml
# The names of tools and sub-agents are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
  - default
  - _self_

main_agent:
  tools:
    - search_and_scrape_webpage
    - jina_scrape_llm_summary
    - tool-python
  tool_blacklist:
    - [ "search_and_scrape_webpage", "sogou_search" ]
    - [ "tool-python", "download_file_from_sandbox_to_local" ]
  max_turns: 20  # Maximum number of turns for main agent execution

sub_agents:

# Settings for context management
keep_tool_result: -1
context_compress_limit: 0  # Enable context compression (>0 = enabled, 0 = disabled).

================================================
FILE: apps/miroflow-agent/conf/agent/mirothinker_1.7_keep5_max200.yaml
================================================
# conf/agent/mirothinker_1.7_keep5_max200.yaml
# The names of tools and sub-agents are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
  - default
  - _self_

main_agent:
  tools:
    - search_and_scrape_webpage
    - jina_scrape_llm_summary
    - tool-python
  tool_blacklist:
    - [ "search_and_scrape_webpage", "sogou_search" ]
    - [ "tool-python", "download_file_from_sandbox_to_local" ]
  max_turns: 200  # Maximum number of turns for main agent execution

sub_agents:

# Settings for context management
keep_tool_result: 5
context_compress_limit: 5  # Enable context compression (>0 = enabled, 0 = disabled).
retry_with_summary: False # default is true

================================================
FILE: apps/miroflow-agent/conf/agent/mirothinker_1.7_keep5_max300.yaml
================================================
# conf/agent/mirothinker_1.7_keep5_max300.yaml
# The names of tools and sub-agents are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
  - default
  - _self_

main_agent:
  tools:
    - search_and_scrape_webpage
    - jina_scrape_llm_summary
    - tool-python
  tool_blacklist:
    - [ "search_and_scrape_webpage", "sogou_search" ]
    - [ "tool-python", "download_file_from_sandbox_to_local" ]
  max_turns: 300  # Maximum number of turns for main agent execution

sub_agents:

# Settings for context management
keep_tool_result: 5
context_compress_limit: 5  # Enable context compression (>0 = enabled, 0 = disabled).
retry_with_summary: False # default is true

================================================
FILE: apps/miroflow-agent/conf/agent/mirothinker_v1.0.yaml
================================================
# conf/agent/mirothinker_v1.0.yaml
# The names of tools and sub-agents are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
  - default
  - _self_

main_agent:
  tools:
    - search_and_scrape_webpage
    - jina_scrape_llm_summary
    - tool-python
  tool_blacklist:
    - [ "search_and_scrape_webpage", "sogou_search" ]
    - [ "tool-python", "download_file_from_sandbox_to_local" ]
  max_turns: 600  # Maximum number of turns for main agent execution

sub_agents:

# Settings for context management
keep_tool_result: -1
context_compress_limit: 0  # Enable context compression (>0 = enabled, 0 = disabled).

================================================
FILE: apps/miroflow-agent/conf/agent/mirothinker_v1.0_keep5.yaml
================================================
# conf/agent/mirothinker_v1.0_keep5.yaml
# The names of tools and sub-agents are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
  - default
  - _self_

main_agent:
  tools:
    - search_and_scrape_webpage
    - jina_scrape_llm_summary
    - tool-python
  tool_blacklist:
    - [ "search_and_scrape_webpage", "sogou_search" ]
    - [ "tool-python", "download_file_from_sandbox_to_local" ]
  max_turns: 600  # Maximum number of turns for main agent execution

sub_agents:

# Settings for context management
keep_tool_result: 5
context_compress_limit: 5  # Enable context compression (>0 = enabled, 0 = disabled).

================================================
FILE: apps/miroflow-agent/conf/agent/mirothinker_v1.5.yaml
================================================
# conf/agent/mirothinker_v1.5.yaml
# The names of tools and sub-agents are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
  - default
  - _self_

main_agent:
  tools:
    - search_and_scrape_webpage
    - jina_scrape_llm_summary
    - tool-python
  tool_blacklist:
    - [ "search_and_scrape_webpage", "sogou_search" ]
    - [ "tool-python", "download_file_from_sandbox_to_local" ]
  max_turns: 600  # Maximum number of turns for main agent execution

sub_agents:

# Settings for context management
keep_tool_result: -1
context_compress_limit: 0  # Enable context compression (>0 = enabled, 0 = disabled).

================================================
FILE: apps/miroflow-agent/conf/agent/mirothinker_v1.5_keep5_max200.yaml
================================================
# conf/agent/mirothinker_v1.5_keep5_max200.yaml
# The names of tools and sub-agents are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
  - default
  - _self_

main_agent:
  tools:
    - search_and_scrape_webpage
    - jina_scrape_llm_summary
    - tool-python
  tool_blacklist:
    - [ "search_and_scrape_webpage", "sogou_search" ]
    - [ "tool-python", "download_file_from_sandbox_to_local" ]
  max_turns: 200  # Maximum number of turns for main agent execution

sub_agents:

# Settings for context management
keep_tool_result: 5
context_compress_limit: 5  # Enable context compression (>0 = enabled, 0 = disabled).

================================================
FILE: apps/miroflow-agent/conf/agent/mirothinker_v1.5_keep5_max400.yaml
================================================
# conf/agent/mirothinker_v1.5_keep5_max400.yaml
# The names of tools and sub-agents are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
  - default
  - _self_

main_agent:
  tools:
    - search_and_scrape_webpage
    - jina_scrape_llm_summary
    - tool-python
  tool_blacklist:
    - [ "search_and_scrape_webpage", "sogou_search" ]
    - [ "tool-python", "download_file_from_sandbox_to_local" ]
  max_turns: 400  # Maximum number of turns for main agent execution

sub_agents:

# Settings for context management
keep_tool_result: 5
context_compress_limit: 5  # Enable context compression (>0 = enabled, 0 = disabled).

================================================
FILE: apps/miroflow-agent/conf/agent/multi_agent.yaml
================================================
# conf/agent/multi_agent.yaml
# The names of tools and sub-agents are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
  - default
  - _self_

main_agent:
  tools:
    - tool-python
    - tool-vqa
    - tool-transcribe
    - tool-reasoning
    - tool-reader
  max_turns: 50  # Maximum number of turns for main agent execution

sub_agents:
  agent-browsing:
    tools:
      - tool-google-search
      - tool-vqa
      - tool-reader
      - tool-python
    max_turns: 50

# Settings for context management
keep_tool_result: -1
context_compress_limit: 0  # Enable context compression (>0 = enabled, 0 = disabled).


================================================
FILE: apps/miroflow-agent/conf/agent/multi_agent_os.yaml
================================================
# conf/agent/multi_agent_os.yaml
# The names of tools and sub-agents are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
  - default
  - _self_

main_agent:
  tools:
    - tool-python
    - tool-vqa-os
    - tool-transcribe-os
    - tool-reasoning-os
    - tool-reader
  max_turns: 50  # Maximum number of turns for main agent execution

sub_agents:
  agent-browsing:
    tools:
      - tool-google-search
      - tool-vqa-os
      - tool-reader
      - tool-python
    max_turns: 50

# Settings for context management
keep_tool_result: -1
context_compress_limit: 0  # Enable context compression (>0 = enabled, 0 = disabled).


================================================
FILE: apps/miroflow-agent/conf/agent/single_agent.yaml
================================================
# conf/agent/single_agent.yaml
# The names of tools and sub-agents are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
  - default
  - _self_

main_agent:
  tools:
    - search_and_scrape_webpage
    - jina_scrape_llm_summary
    - tool-python
  tool_blacklist:
    - [ "search_and_scrape_webpage", "sogou_search" ]
    - [ "tool-python", "download_file_from_sandbox_to_local" ]
  max_turns: 600  # Maximum number of turns for main agent execution

sub_agents:

# Settings for context management
keep_tool_result: -1
context_compress_limit: 0  # Enable context compression (>0 = enabled, 0 = disabled).

================================================
FILE: apps/miroflow-agent/conf/agent/single_agent_keep5.yaml
================================================
# conf/agent/single_agent_keep5.yaml
# The names of tools and sub-agents are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
  - default
  - _self_

main_agent:
  tools:
    - search_and_scrape_webpage
    - jina_scrape_llm_summary
    - tool-python
  tool_blacklist:
    - [ "search_and_scrape_webpage", "sogou_search" ]
    - [ "tool-python", "download_file_from_sandbox_to_local" ]
  max_turns: 600  # Maximum number of turns for main agent execution

sub_agents:

# Settings for context management
keep_tool_result: 5
context_compress_limit: 5  # Enable context compression (>0 = enabled, 0 = disabled).

================================================
FILE: apps/miroflow-agent/conf/benchmark/aime2025.yaml
================================================
# conf/benchmark/aime2025.yaml
# Benchmark preset "aime2025": layers a dataset location on top of the base benchmark config.
defaults:
  - default  # conf/benchmark/default.yaml (metadata file name, field mapping)
  - _self_   # listed last, so the values in this file win over the base

name: "aime2025"

data:
  # Relative path to the dataset directory.
  data_dir: "../../data/aime2025"

# Same values as the base config, restated explicitly.
execution:
  max_tasks: null  # null means no limit
  max_concurrent: 5
  pass_at_k: 1

================================================
FILE: apps/miroflow-agent/conf/benchmark/browsecomp.yaml
================================================
# conf/benchmark/browsecomp.yaml
# Benchmark preset "browsecomp": layers a dataset location on top of the base benchmark config.
defaults:
  - default  # conf/benchmark/default.yaml (metadata file name, field mapping)
  - _self_   # listed last, so the values in this file win over the base

name: "browsecomp"

data:
  # Relative path to the dataset directory.
  data_dir: "../../data/browsecomp"

# Same values as the base config, restated explicitly.
execution:
  max_tasks: null  # null means no limit
  max_concurrent: 5
  pass_at_k: 1


================================================
FILE: apps/miroflow-agent/conf/benchmark/browsecomp_zh.yaml
================================================
# conf/benchmark/browsecomp_zh.yaml
# Benchmark preset "browsecomp_zh": layers a dataset location on top of the base benchmark config.
defaults:
  - default  # conf/benchmark/default.yaml (metadata file name, field mapping)
  - _self_   # listed last, so the values in this file win over the base

name: "browsecomp_zh"

data:
  # Relative path to the dataset directory.
  data_dir: "../../data/browsecomp_zh"

# Same values as the base config, restated explicitly.
execution:
  max_tasks: null  # null means no limit
  max_concurrent: 5
  pass_at_k: 1


================================================
FILE: apps/miroflow-agent/conf/benchmark/collect_trace.yaml
================================================
# conf/benchmark/collect_trace.yaml
# Benchmark preset "collect_trace": layers a dataset location on top of the base benchmark config.
defaults:
  - default  # conf/benchmark/default.yaml (metadata file name, field mapping)
  - _self_   # listed last, so the values in this file win over the base

name: "collect_trace"

data:
  # Note: this is the same directory that debug.yaml points at.
  data_dir: "../../data/debug"

# Same values as the base config, restated explicitly.
execution:
  max_tasks: null  # null means no limit
  max_concurrent: 5
  pass_at_k: 1


================================================
FILE: apps/miroflow-agent/conf/benchmark/debug.yaml
================================================
# conf/benchmark/debug.yaml
# Benchmark preset "debug": layers a dataset location on top of the base benchmark config.
defaults:
  - default  # conf/benchmark/default.yaml (metadata file name, field mapping)
  - _self_   # listed last, so the values in this file win over the base

name: "debug"

data:
  # Relative path to the dataset directory.
  data_dir: "../../data/debug"

# Same values as the base config, restated explicitly.
execution:
  max_tasks: null  # null means no limit
  max_concurrent: 5
  pass_at_k: 1


================================================
FILE: apps/miroflow-agent/conf/benchmark/deepsearchqa.yaml
================================================
# conf/benchmark/deepsearchqa.yaml
# Benchmark preset "deepsearchqa": layers a dataset location on top of the base benchmark config.
defaults:
  - default  # conf/benchmark/default.yaml (metadata file name, field mapping)
  - _self_   # listed last, so the values in this file win over the base

name: "deepsearchqa"

data:
  # Relative path to the dataset directory.
  data_dir: "../../data/deepsearchqa"

# Same values as the base config, restated explicitly.
execution:
  max_tasks: null  # null means no limit
  max_concurrent: 5
  pass_at_k: 1


================================================
FILE: apps/miroflow-agent/conf/benchmark/default.yaml
================================================
# conf/benchmark/default.yaml - Default benchmark configuration
# This is a base configuration for benchmarks. Specific benchmarks can override these defaults.
name: "default"

data:
  # File name only; no data_dir is defined at this level (each benchmark preset adds its own).
  metadata_file: "standardized_data.jsonl"
  # Field names used for each task record.
  # Presumably these are keys of the records in metadata_file; TODO confirm against the loader.
  field_mapping:
    task_id_field: "task_id"
    task_question_field: "task_question"
    ground_truth_field: "ground_truth"
    file_name_field: "file_name"

# Execution limits; the evaluation scripts override all three on the command line.
execution:
  max_tasks: null  # null means no limit
  max_concurrent: 5 
  pass_at_k: 1

================================================
FILE: apps/miroflow-agent/conf/benchmark/frames.yaml
================================================
# conf/benchmark/frames.yaml
# Benchmark preset "frames": layers a dataset location on top of the base benchmark config.
defaults:
  - default  # conf/benchmark/default.yaml (metadata file name, field mapping)
  - _self_   # listed last, so the values in this file win over the base

name: "frames"

data:
  # Relative path to the dataset directory.
  data_dir: "../../data/frames"

# Same values as the base config, restated explicitly.
execution:
  max_tasks: null  # null means no limit
  max_concurrent: 5
  pass_at_k: 1


================================================
FILE: apps/miroflow-agent/conf/benchmark/futurex.yaml
================================================
# conf/benchmark/futurex.yaml
# Benchmark preset "futurex": layers a dataset location on top of the base benchmark config.
defaults:
  - default  # conf/benchmark/default.yaml (metadata file name, field mapping)
  - _self_   # listed last, so the values in this file win over the base

name: "futurex"

data:
  # Relative path to the dataset directory.
  data_dir: "../../data/futurex"

# Same values as the base config, restated explicitly.
execution:
  max_tasks: null  # null means no limit
  max_concurrent: 5
  pass_at_k: 1


================================================
FILE: apps/miroflow-agent/conf/benchmark/gaia-validation-text-103.yaml
================================================
# conf/benchmark/gaia-validation-text-103.yaml
# Benchmark preset "gaia-validation-text-103": layers a dataset location on top of the base benchmark config.
defaults:
  - default  # conf/benchmark/default.yaml (metadata file name, field mapping)
  - _self_   # listed last, so the values in this file win over the base

name: "gaia-validation-text-103"

data:
  # Note: the directory name carries a "gaia-2023-" prefix that the preset name does not.
  data_dir: "../../data/gaia-2023-validation-text-103"

# Same values as the base config, restated explicitly.
execution:
  max_tasks: null  # null means no limit
  max_concurrent: 5
  pass_at_k: 1


================================================
FILE: apps/miroflow-agent/conf/benchmark/gaia-validation.yaml
================================================
# conf/benchmark/gaia-validation.yaml
# Benchmark preset "gaia-validation": layers a dataset location on top of the base benchmark config.
defaults:
  - default  # conf/benchmark/default.yaml (metadata file name, field mapping)
  - _self_   # listed last, so the values in this file win over the base

name: "gaia-validation"

data:
  # Note: the directory name carries a "gaia-2023-" prefix that the preset name does not.
  data_dir: "../../data/gaia-2023-validation"

# Same values as the base config, restated explicitly.
execution:
  max_tasks: null  # null means no limit
  max_concurrent: 5
  pass_at_k: 1


================================================
FILE: apps/miroflow-agent/conf/benchmark/hle-text-2158.yaml
================================================
# conf/benchmark/hle-text-2158.yaml
# Benchmark preset "hle-text-2158": layers a dataset location on top of the base benchmark config.
defaults:
  - default  # conf/benchmark/default.yaml (metadata file name, field mapping)
  - _self_   # listed last, so the values in this file win over the base

name: "hle-text-2158"

data:
  # Relative path to the dataset directory.
  data_dir: "../../data/hle-text-2158"

# Same values as the base config, restated explicitly.
execution:
  max_tasks: null  # null means no limit
  max_concurrent: 5
  pass_at_k: 1

================================================
FILE: apps/miroflow-agent/conf/benchmark/hle-text-500.yaml
================================================
# conf/benchmark/hle-text-500.yaml
# Benchmark preset "hle-text-500": layers a dataset location on top of the base benchmark config.
defaults:
  - default  # conf/benchmark/default.yaml (metadata file name, field mapping)
  - _self_   # listed last, so the values in this file win over the base

name: "hle-text-500"

data:
  # Relative path to the dataset directory.
  data_dir: "../../data/hle-text-500"

# Same values as the base config, restated explicitly.
execution:
  max_tasks: null  # null means no limit
  max_concurrent: 5
  pass_at_k: 1

================================================
FILE: apps/miroflow-agent/conf/benchmark/hle.yaml
================================================
# conf/benchmark/hle.yaml
# Benchmark preset "hle": layers a dataset location on top of the base benchmark config.
defaults:
  - default  # conf/benchmark/default.yaml (metadata file name, field mapping)
  - _self_   # listed last, so the values in this file win over the base

name: "hle"

data:
  # Relative path to the dataset directory.
  data_dir: "../../data/hle"

# Same values as the base config, restated explicitly.
execution:
  max_tasks: null  # null means no limit
  max_concurrent: 5
  pass_at_k: 1


================================================
FILE: apps/miroflow-agent/conf/benchmark/seal-0.yaml
================================================
# conf/benchmark/seal-0.yaml
# Benchmark preset "seal-0": layers a dataset location on top of the base benchmark config.
defaults:
  - default  # conf/benchmark/default.yaml (metadata file name, field mapping)
  - _self_   # listed last, so the values in this file win over the base

name: "seal-0"

data:
  # Relative path to the dataset directory.
  data_dir: "../../data/seal-0"

# Same values as the base config, restated explicitly.
execution:
  max_tasks: null  # null means no limit
  max_concurrent: 5
  pass_at_k: 1

================================================
FILE: apps/miroflow-agent/conf/benchmark/webwalkerqa.yaml
================================================
# conf/benchmark/webwalkerqa.yaml
# Benchmark preset "webwalkerqa": layers a dataset location on top of the base benchmark config.
defaults:
  - default  # conf/benchmark/default.yaml (metadata file name, field mapping)
  - _self_   # listed last, so the values in this file win over the base

name: "webwalkerqa"

data:
  # Relative path to the dataset directory.
  data_dir: "../../data/webwalkerqa"

# Same values as the base config, restated explicitly.
execution:
  max_tasks: null  # null means no limit
  max_concurrent: 5
  pass_at_k: 1


================================================
FILE: apps/miroflow-agent/conf/benchmark/xbench_deepsearch.yaml
================================================
# conf/benchmark/xbench_deepsearch.yaml
# Benchmark preset "xbench_deepsearch": layers a dataset location on top of the base benchmark config.
defaults:
  - default  # conf/benchmark/default.yaml (metadata file name, field mapping)
  - _self_   # listed last, so the values in this file win over the base

name: "xbench_deepsearch"

data:
  # Relative path to the dataset directory.
  data_dir: "../../data/xbench_deepsearch"

# Same values as the base config, restated explicitly.
execution:
  max_tasks: null  # null means no limit
  max_concurrent: 5
  pass_at_k: 1


================================================
FILE: apps/miroflow-agent/conf/config.yaml
================================================
# conf/config.yaml
# Root Hydra config: picks one preset from each config group (llm, agent,
# benchmark) and adds a few top-level settings.
defaults:
  - llm: default
  - agent: default
  - benchmark: default
  - _self_  # Listed last, so values defined in this file override the group defaults above

hydra:
  run:
    # Hydra's per-run output directory; the evaluation scripts override it per run.
    dir: ../../logs/debug

# You can define some top-level or default parameters here
project_name: "miroflow-agent"
# Passed as log_dir to the task pipeline by main.py; same path as hydra.run.dir above.
debug_dir: "../../logs/debug"


================================================
FILE: apps/miroflow-agent/conf/llm/claude-3-7.yaml
================================================
# conf/llm/claude-3-7.yaml
# LLM preset for the Anthropic model "claude-3-7-sonnet-20250219".
defaults:
  - default  # conf/llm/default.yaml supplies the sampling parameters and api_key
  - _self_   # listed last, so the values in this file win over the base

# provider, model_name and base_url repeat the values already in default.yaml.
provider: "anthropic"
model_name: "claude-3-7-sonnet-20250219"
base_url: https://api.anthropic.com
# Not defined in default.yaml; each model preset sets its own value.
max_context_length: 65536


================================================
FILE: apps/miroflow-agent/conf/llm/default.yaml
================================================
# conf/llm/default.yaml - Default LLM configuration
# Base values for the model presets in this directory, which list this file
# first in their `defaults` and override selected keys.
# NOTE(review): max_context_length is set by the model presets but has no
# default here; confirm that consumers tolerate its absence when llm=default.
provider: "anthropic" # openai, anthropic, qwen
model_name: "claude-3-7-sonnet-20250219"
async_client: false  # the evaluation scripts pass llm.async_client=true
temperature: 0.3
top_p: 1.0
min_p: 0.0
top_k: -1  # NOTE(review): -1 presumably disables top-k filtering; confirm in the client code
max_tokens: 4096
api_key: ""  # empty by default; the evaluation scripts pass llm.api_key on the command line
base_url: https://api.anthropic.com
repetition_penalty: 1.0


================================================
FILE: apps/miroflow-agent/conf/llm/gpt-5.yaml
================================================
# conf/llm/gpt-5.yaml
# LLM preset for the OpenAI model "gpt-5-2025-08-07".
defaults:
  - default  # conf/llm/default.yaml supplies the sampling parameters and api_key
  - _self_   # listed last, so the values in this file win over the base

provider: "openai"
model_name: "gpt-5-2025-08-07"
base_url: https://api.openai.com/v1
# Not defined in default.yaml; each model preset sets its own value.
max_context_length: 65536


================================================
FILE: apps/miroflow-agent/conf/llm/qwen-3.yaml
================================================
# conf/llm/qwen-3.yaml
# LLM preset for provider "qwen"; this is the preset the evaluation scripts
# select (llm=qwen-3) before overriding provider, model name, URL and key.
defaults:
  - default  # conf/llm/default.yaml supplies the remaining keys
  - _self_   # listed last, so the values in this file win over the base

provider: "qwen"
model_name: "qwen-3"
# Placeholder endpoint; replace it or override llm.base_url.
base_url: "https://your-api.com/v1"
# Not defined in default.yaml; each model preset sets its own value.
max_context_length: 262144
# The four values below override the defaults (4096, 1.0, 1.0 and 0.3 respectively).
max_tokens: 16384
top_p: 0.95
repetition_penalty: 1.05
temperature: 1.0

================================================
FILE: apps/miroflow-agent/main.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import asyncio

import hydra
from omegaconf import DictConfig, OmegaConf

# Import from the new modular structure
from src.core.pipeline import (
    create_pipeline_components,
    execute_task_pipeline,
)
from src.logging.task_logger import bootstrap_logger

# Module-level logger: bootstrap_logger() runs once at import time and the
# instance it returns is used by the functions below.
logger = bootstrap_logger()


async def amain(cfg: DictConfig) -> None:
    """Run the built-in example task once through the task pipeline.

    The composed configuration is logged as YAML, the pipeline components are
    built from it, and one hard-coded task is executed with ``cfg.debug_dir``
    as the log directory. The values returned by the pipeline are not used.

    Args:
        cfg: Composed Hydra/OmegaConf configuration.
    """
    logger.info(OmegaConf.to_yaml(cfg))

    # One factory call yields both tool managers and the output formatter.
    components = create_pipeline_components(cfg)
    main_tools, sub_agent_tools, formatter = components

    # The hard-coded example task; it has no attached file.
    example_id = "task_example"
    example_question = "What is the title of today's arxiv paper in computer science?"
    example_file = ""

    # The pipeline returns four values; this demo entry point discards them.
    _summary, _boxed_answer, _log_path, _ = await execute_task_pipeline(
        cfg=cfg,
        task_id=example_id,
        task_file_name=example_file,
        task_description=example_question,
        main_agent_tool_manager=main_tools,
        sub_agent_tool_managers=sub_agent_tools,
        output_formatter=formatter,
        log_dir=cfg.debug_dir,
    )


@hydra.main(config_path="conf", config_name="config", version_base=None)
def main(cfg: DictConfig) -> None:
    """Synchronous Hydra entry point: run ``amain`` to completion.

    Args:
        cfg: Configuration composed by Hydra from ``conf/config.yaml``.
    """
    coroutine = amain(cfg)
    asyncio.run(coroutine)


# Script entry point. main() is called without arguments because the
# @hydra.main decorator composes the configuration and passes it in as cfg.
if __name__ == "__main__":
    main()


================================================
FILE: apps/miroflow-agent/pyproject.toml
================================================
# Package metadata and runtime dependencies of the miroflow-agent app.
[project]
name = "miroflow-agent"
version = "0.1.0"
description = "An agent framework for complex task solving with LLM and MCP tools"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    # Resolved from the in-repo path declared under [tool.uv.sources].
    "miroflow-tools>=0.1.0",
    "huggingface-hub>=0.28.0",
    "requests>=2.32.3",
    "rich>=13.9.4",
    "jinja2>=3.1.4",
    "pillow>=11.0.0",
    "markdownify>=0.14.1",
    "duckduckgo-search>=6.3.7",
    "python-dotenv",
    "pdfminer-six",
    "python-pptx",
    "puremagic",
    "pydub",
    "SpeechRecognition",
    "youtube_transcript_api",
    "mcp",
    "fastmcp",
    "anthropic",
    # The only dependency pinned to an exact version.
    "e2b-code-interpreter==1.2.1",
    "jsonlines>=4.0.0",
    "mammoth>=1.9.0",
    "numpy>=2.2.5",
    "ipdb>=0.13.13",
    "datasets>=3.5.0",
    "openpyxl>=3.1.5",
    "markitdown-mcp>=0.0.1a3",
    "markitdown>=0.1.1",
    "regex>=2024.11.6",
    "openai>=1.78.1",
    "tenacity>=9.1.2",
    "transformers>=4.51.3",
    "omegaconf>=2.3.0",
    "wikipedia",
    "mutagen",
    "hydra-core",
    "google-genai",
    "tiktoken>=0.9.0",
    "aiohttp",
    "colorama>=0.4.6",
    "json-repair>=0.49.0",
    "tencentcloud-sdk-python>=3.0.1451"
]

# Build backend: hatchling.
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# The wheel ships the `src` directory as its package, so the importable
# top-level module is `src` (main.py imports `src.core.pipeline`).
[tool.hatch.build.targets.wheel]
packages = ["src"]


# uv resolves miroflow-tools from the sibling library in this repository,
# installed in editable mode, instead of from a package index.
[tool.uv.sources]
miroflow-tools = { path = "../../libs/miroflow-tools", editable = true }

# Development-only dependencies (testing, type checking). The pytest plugins
# listed here are the ones the addopts in [tool.pytest.ini_options] rely on:
# pytest-xdist (-n), pytest-html (--html) and pytest-cov (--cov).
[dependency-groups]
dev = [
    "inline-snapshot>=0.23.2",
    "pyright>=1.1.403",
    "pytest>=8.4.1",
    "pytest-asyncio>=1.0.0",
    "pytest-cov>=6.2.1",
    "pytest-html>=4.1.1",
    "pytest-xdist>=3.7.0",
    "ty>=0.0.1a14",
]

# pytest configuration. The options in addopts need the pytest-xdist,
# pytest-html and pytest-cov plugins from the dev dependency group.
[tool.pytest.ini_options]
# see https://docs.pytest.org/en/stable/reference/customize.html#pyproject-toml
minversion = "8.3.5"
testpaths = ["tests"]
# make warning go away
# https://github.com/pytest-dev/pytest-asyncio/issues/924#issuecomment-2321921915
asyncio_default_fixture_loop_scope = "function"
addopts = [
    # summary for failed AND passed tests
    "-rA",
    # only show captured stderr for failed tests; captured stdout and log
    # output can contain sensitive information
    "--show-capture=stderr",
    # use `pytest-xdist` to run tests in parallel
    "-n=auto",
    # use `pytest-html` to generate test report in html format
    "--html=report.html",
    "--self-contained-html",
    # use `pytest-testmon` to run tests on changed files only
    # "--testmon",
    # use `pytest-cov` to generate test coverage report.
    # The importable package is `src` (see [tool.hatch.build.targets.wheel]);
    # no `miroflow_agent` module exists, so targeting it would collect no data.
    "--cov=src",
    "--cov-report=html",
]


================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_aime2025.sh
================================================
#!/bin/bash
#
# Run the "aime2025" benchmark NUM_RUNS times in parallel, then average the scores.
#
# Settings written as ${VAR:-default} can be overridden from the environment.
# All paths are relative (benchmarks/..., ../../logs, ../../data), so the
# script must be started from the directory they resolve against.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-32}
BENCHMARK_NAME="aime2025"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution). Each run gets its own subshell so
    # the status check below belongs to that run only.
    (
        # Overrides are quoted so that a value containing spaces or glob
        # characters still reaches Hydra as a single argument.
        uv run python benchmarks/common_benchmark.py \
            "benchmark=$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            "llm.provider=$LLM_PROVIDER" \
            "llm.model_name=$LLM_MODEL" \
            "llm.base_url=$BASE_URL" \
            llm.async_client=true \
            "llm.temperature=$TEMPERATURE" \
            "llm.max_context_length=$MAX_CONTEXT_LENGTH" \
            "llm.api_key=$API_KEY" \
            benchmark.execution.max_tasks=null \
            "benchmark.execution.max_concurrent=$MAX_CONCURRENT" \
            "benchmark.execution.pass_at_k=$PASS_AT_K" \
            benchmark.data.data_dir=../../data/aime2025 \
            "agent=$AGENT_SET" \
            "hydra.run.dir=${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"

        # After a pipeline, $? is the status of the last command (tee), which
        # succeeds even when the benchmark crashed. PIPESTATUS[0] holds the
        # benchmark's own status and must be read right after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="


================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_browsecomp.sh
================================================
#!/bin/bash
#
# Run the "browsecomp" benchmark NUM_RUNS times in parallel, then average the scores.
#
# Settings written as ${VAR:-default} can be overridden from the environment.
# All paths are relative (benchmarks/..., ../../logs, ../../data), so the
# script must be started from the directory they resolve against.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-3}
BENCHMARK_NAME="browsecomp"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution). Each run gets its own subshell so
    # the status check below belongs to that run only.
    (
        # Overrides are quoted so that a value containing spaces or glob
        # characters still reaches Hydra as a single argument.
        uv run python benchmarks/common_benchmark.py \
            "benchmark=$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            "llm.provider=$LLM_PROVIDER" \
            "llm.model_name=$LLM_MODEL" \
            "llm.base_url=$BASE_URL" \
            llm.async_client=true \
            "llm.temperature=$TEMPERATURE" \
            "llm.max_context_length=$MAX_CONTEXT_LENGTH" \
            "llm.api_key=$API_KEY" \
            benchmark.execution.max_tasks=null \
            "benchmark.execution.max_concurrent=$MAX_CONCURRENT" \
            "benchmark.execution.pass_at_k=$PASS_AT_K" \
            benchmark.data.data_dir=../../data/browsecomp \
            "agent=$AGENT_SET" \
            "hydra.run.dir=${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"

        # After a pipeline, $? is the status of the last command (tee), which
        # succeeds even when the benchmark crashed. PIPESTATUS[0] holds the
        # benchmark's own status and must be read right after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="


================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_browsecomp_zh.sh
================================================
#!/bin/bash
#
# Run the "browsecomp_zh" benchmark NUM_RUNS times in parallel, then average the scores.
#
# Settings written as ${VAR:-default} can be overridden from the environment.
# All paths are relative (benchmarks/..., ../../logs, ../../data), so the
# script must be started from the directory they resolve against.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-3}
BENCHMARK_NAME="browsecomp_zh"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution). Each run gets its own subshell so
    # the status check below belongs to that run only.
    (
        # Overrides are quoted so that a value containing spaces or glob
        # characters still reaches Hydra as a single argument.
        uv run python benchmarks/common_benchmark.py \
            "benchmark=$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            "llm.provider=$LLM_PROVIDER" \
            "llm.model_name=$LLM_MODEL" \
            "llm.base_url=$BASE_URL" \
            llm.async_client=true \
            "llm.temperature=$TEMPERATURE" \
            "llm.max_context_length=$MAX_CONTEXT_LENGTH" \
            "llm.api_key=$API_KEY" \
            benchmark.execution.max_tasks=null \
            "benchmark.execution.max_concurrent=$MAX_CONCURRENT" \
            "benchmark.execution.pass_at_k=$PASS_AT_K" \
            benchmark.data.data_dir=../../data/browsecomp_zh \
            "agent=$AGENT_SET" \
            "hydra.run.dir=${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"

        # After a pipeline, $? is the status of the last command (tee), which
        # succeeds even when the benchmark crashed. PIPESTATUS[0] holds the
        # benchmark's own status and must be read right after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="


================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_debug.sh
================================================
#!/bin/bash
#
# Run the "debug" benchmark NUM_RUNS times in parallel, then average the scores.
# Unlike the other evaluation scripts, the defaults here are a single run with
# a single concurrent task.
#
# Settings written as ${VAR:-default} can be overridden from the environment.
# All paths are relative (benchmarks/..., ../../logs, ../../data), so the
# script must be started from the directory they resolve against.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-1}
BENCHMARK_NAME="debug"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-1}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution). Each run gets its own subshell so
    # the status check below belongs to that run only.
    (
        # Overrides are quoted so that a value containing spaces or glob
        # characters still reaches Hydra as a single argument.
        uv run python benchmarks/common_benchmark.py \
            "benchmark=$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            "llm.provider=$LLM_PROVIDER" \
            "llm.model_name=$LLM_MODEL" \
            "llm.base_url=$BASE_URL" \
            llm.async_client=true \
            "llm.temperature=$TEMPERATURE" \
            "llm.max_context_length=$MAX_CONTEXT_LENGTH" \
            "llm.api_key=$API_KEY" \
            benchmark.execution.max_tasks=null \
            "benchmark.execution.max_concurrent=$MAX_CONCURRENT" \
            "benchmark.execution.pass_at_k=$PASS_AT_K" \
            benchmark.data.data_dir=../../data/debug \
            "agent=$AGENT_SET" \
            "hydra.run.dir=${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"

        # After a pipeline, $? is the status of the last command (tee), which
        # succeeds even when the benchmark crashed. PIPESTATUS[0] holds the
        # benchmark's own status and must be read right after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="


================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_deepsearchqa.sh
================================================
#!/bin/bash
#
# Run the "deepsearchqa" benchmark NUM_RUNS times in parallel, then average the scores.
#
# Settings written as ${VAR:-default} can be overridden from the environment.
# All paths are relative (benchmarks/..., ../../logs, ../../data), so the
# script must be started from the directory they resolve against.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-3}
BENCHMARK_NAME="deepsearchqa"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution). Each run gets its own subshell so
    # the status check below belongs to that run only.
    (
        # Overrides are quoted so that a value containing spaces or glob
        # characters still reaches Hydra as a single argument.
        uv run python benchmarks/common_benchmark.py \
            "benchmark=$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            "llm.provider=$LLM_PROVIDER" \
            "llm.model_name=$LLM_MODEL" \
            "llm.base_url=$BASE_URL" \
            llm.async_client=true \
            "llm.temperature=$TEMPERATURE" \
            "llm.max_context_length=$MAX_CONTEXT_LENGTH" \
            "llm.api_key=$API_KEY" \
            benchmark.execution.max_tasks=null \
            "benchmark.execution.max_concurrent=$MAX_CONCURRENT" \
            "benchmark.execution.pass_at_k=$PASS_AT_K" \
            benchmark.data.data_dir=../../data/deepsearchqa \
            "agent=$AGENT_SET" \
            "hydra.run.dir=${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"

        # After a pipeline, $? is the status of the last command (tee), which
        # succeeds even when the benchmark crashed. PIPESTATUS[0] holds the
        # benchmark's own status and must be read right after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="


================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_frames.sh
================================================
#!/bin/bash
#
# Run the "frames" benchmark NUM_RUNS times in parallel, then average the scores.
#
# Settings written as ${VAR:-default} can be overridden from the environment.
# All paths are relative (benchmarks/..., ../../logs, ../../data), so the
# script must be started from the directory they resolve against.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-3}
BENCHMARK_NAME="frames"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution). Each run gets its own subshell so
    # the status check below belongs to that run only.
    (
        # Overrides are quoted so that a value containing spaces or glob
        # characters still reaches Hydra as a single argument.
        uv run python benchmarks/common_benchmark.py \
            "benchmark=$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            "llm.provider=$LLM_PROVIDER" \
            "llm.model_name=$LLM_MODEL" \
            "llm.base_url=$BASE_URL" \
            llm.async_client=true \
            "llm.temperature=$TEMPERATURE" \
            "llm.max_context_length=$MAX_CONTEXT_LENGTH" \
            "llm.api_key=$API_KEY" \
            benchmark.execution.max_tasks=null \
            "benchmark.execution.max_concurrent=$MAX_CONCURRENT" \
            "benchmark.execution.pass_at_k=$PASS_AT_K" \
            benchmark.data.data_dir=../../data/frames \
            "agent=$AGENT_SET" \
            "hydra.run.dir=${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"

        # After a pipeline, $? is the status of the last command (tee), which
        # succeeds even when the benchmark crashed. PIPESTATUS[0] holds the
        # benchmark's own status and must be read right after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="


================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_futurex.sh
================================================
#!/bin/bash
#
# Run the "futurex" benchmark NUM_RUNS times in parallel, then convert the
# collected predictions into a FutureX submission file.
#
# Relative paths (benchmarks/, ../../logs, ../../data) are resolved against
# the current working directory, so start this script from the directory that
# contains benchmarks/common_benchmark.py.
#
# Every ${VAR:-default} setting below can be overridden from the environment.

# Parse environment variables, use defaults if not set
# (the default URL is a placeholder, not a working endpoint)
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-8}
BENCHMARK_NAME="futurex"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
# Forwarded unchanged to every run; each run is a separate process
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
# Placeholder default; supply a real key through the environment
API_KEY=${API_KEY:-"xxx"}

# All runs of one benchmark/provider/model/agent combination share this directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all runs; each one executes in its own background subshell
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Identifier used for this run's Hydra directory and log file
    RUN_ID="run_$i"

    (
        # Overrides are quoted so that a value containing spaces or glob
        # characters still reaches the benchmark as a single argument.
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data_250924_250930.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/futurex \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? would report tee's exit status here; PIPESTATUS[0] holds the
        # benchmark command's and must be read right after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -n 1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Block until every background subshell has exited
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Average-score calculation is disabled for this benchmark (kept for reference):
# echo "Calculating average scores..."
# uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"
echo "Extracting predictions and formatting for FutureX submission..."

# Branch on the extractor's own exit status and report the outcome
if uv run python benchmarks/evaluators/extract_futurex_results.py "$RESULTS_DIR"; then
    echo "✅ Submission file generated: $RESULTS_DIR/futurex_submission.jsonl"
    echo "You can now upload this file to the FutureX test server."
else
    echo "❌ Failed to generate submission file. Please check the logs for details."
fi

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="


================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh
================================================
#!/bin/bash
#
# Run the "gaia-validation-text-103" benchmark NUM_RUNS times in parallel,
# then compute the average score across runs.
#
# Relative paths (benchmarks/, ../../logs, ../../data) are resolved against
# the current working directory, so start this script from the directory that
# contains benchmarks/common_benchmark.py.
#
# Every ${VAR:-default} setting below can be overridden from the environment.

# Parse environment variables, use defaults if not set
# (the default URL is a placeholder, not a working endpoint)
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-8}
BENCHMARK_NAME="gaia-validation-text-103"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
# Forwarded unchanged to every run; each run is a separate process
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
# Placeholder default; supply a real key through the environment
API_KEY=${API_KEY:-"xxx"}

# All runs of one benchmark/provider/model/agent combination share this directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all runs; each one executes in its own background subshell
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Identifier used for this run's Hydra directory and log file
    RUN_ID="run_$i"

    (
        # Overrides are quoted so that a value containing spaces or glob
        # characters still reaches the benchmark as a single argument.
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/gaia-2023-validation-text-103 \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? would report tee's exit status here; PIPESTATUS[0] holds the
        # benchmark command's and must be read right after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -n 1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Block until every background subshell has exited
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="


================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_gaia-validation.sh
================================================
#!/bin/bash
#
# Run the "gaia-validation" benchmark NUM_RUNS times in parallel, then compute
# the average score across runs.
#
# Relative paths (benchmarks/, ../../logs, ../../data) are resolved against
# the current working directory, so start this script from the directory that
# contains benchmarks/common_benchmark.py.
#
# Every ${VAR:-default} setting below can be overridden from the environment.

# Parse environment variables, use defaults if not set
# (the default URL is a placeholder, not a working endpoint)
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-8}
BENCHMARK_NAME="gaia-validation"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
# Forwarded unchanged to every run; each run is a separate process
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
# Placeholder default; supply a real key through the environment
API_KEY=${API_KEY:-"xxx"}

# All runs of one benchmark/provider/model/agent combination share this directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all runs; each one executes in its own background subshell
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Identifier used for this run's Hydra directory and log file
    RUN_ID="run_$i"

    (
        # Overrides are quoted so that a value containing spaces or glob
        # characters still reaches the benchmark as a single argument.
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/gaia-2023-validation \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? would report tee's exit status here; PIPESTATUS[0] holds the
        # benchmark command's and must be read right after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -n 1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Block until every background subshell has exited
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="


================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle-text-2158.sh
================================================
#!/bin/bash
#
# Run the "hle-text-2158" benchmark NUM_RUNS times in parallel, then compute
# the average score across runs.
#
# Relative paths (benchmarks/, ../../logs, ../../data) are resolved against
# the current working directory, so start this script from the directory that
# contains benchmarks/common_benchmark.py.
#
# Every ${VAR:-default} setting below can be overridden from the environment.

# Parse environment variables, use defaults if not set
# (the default URL is a placeholder, not a working endpoint)
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-3}
BENCHMARK_NAME="hle-text-2158"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
# Forwarded unchanged to every run; each run is a separate process
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
# Placeholder default; supply a real key through the environment
API_KEY=${API_KEY:-"xxx"}

# All runs of one benchmark/provider/model/agent combination share this directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all runs; each one executes in its own background subshell
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Identifier used for this run's Hydra directory and log file
    RUN_ID="run_$i"

    (
        # Overrides are quoted so that a value containing spaces or glob
        # characters still reaches the benchmark as a single argument.
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data_original.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/hle-text-2158 \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? would report tee's exit status here; PIPESTATUS[0] holds the
        # benchmark command's and must be read right after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -n 1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Block until every background subshell has exited
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="


================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle-text-500.sh
================================================
#!/bin/bash
#
# Run the "hle-text-500" benchmark NUM_RUNS times in parallel, then compute
# the average score across runs.
#
# Relative paths (benchmarks/, ../../logs, ../../data) are resolved against
# the current working directory, so start this script from the directory that
# contains benchmarks/common_benchmark.py.
#
# Every ${VAR:-default} setting below can be overridden from the environment.

# Parse environment variables, use defaults if not set
# (the default URL is a placeholder, not a working endpoint)
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-3}
BENCHMARK_NAME="hle-text-500"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
# Forwarded unchanged to every run; each run is a separate process
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
# Placeholder default; supply a real key through the environment
API_KEY=${API_KEY:-"xxx"}

# All runs of one benchmark/provider/model/agent combination share this directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all runs; each one executes in its own background subshell
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Identifier used for this run's Hydra directory and log file
    RUN_ID="run_$i"

    (
        # Overrides are quoted so that a value containing spaces or glob
        # characters still reaches the benchmark as a single argument.
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data_original.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/hle-text-500 \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? would report tee's exit status here; PIPESTATUS[0] holds the
        # benchmark command's and must be read right after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -n 1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Block until every background subshell has exited
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="


================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle.sh
================================================
#!/bin/bash
#
# Run the "hle" benchmark NUM_RUNS times in parallel, then compute the average
# score across runs.
#
# Relative paths (benchmarks/, ../../logs, ../../data) are resolved against
# the current working directory, so start this script from the directory that
# contains benchmarks/common_benchmark.py.
#
# Every ${VAR:-default} setting below can be overridden from the environment.

# Parse environment variables, use defaults if not set
# (the default URL is a placeholder, not a working endpoint)
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-3}
BENCHMARK_NAME="hle"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
# Forwarded unchanged to every run; each run is a separate process
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
# Placeholder default; supply a real key through the environment
API_KEY=${API_KEY:-"xxx"}

# All runs of one benchmark/provider/model/agent combination share this directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all runs; each one executes in its own background subshell
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Identifier used for this run's Hydra directory and log file
    RUN_ID="run_$i"

    (
        # Overrides are quoted so that a value containing spaces or glob
        # characters still reaches the benchmark as a single argument.
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/hle \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? would report tee's exit status here; PIPESTATUS[0] holds the
        # benchmark command's and must be read right after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -n 1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Block until every background subshell has exited
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="


================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_seal-0.sh
================================================
#!/bin/bash
#
# Run the "seal-0" benchmark NUM_RUNS times in parallel, then compute the
# average score across runs.
#
# Relative paths (benchmarks/, ../../logs, ../../data) are resolved against
# the current working directory, so start this script from the directory that
# contains benchmarks/common_benchmark.py.
#
# Every ${VAR:-default} setting below can be overridden from the environment.

# Parse environment variables, use defaults if not set
# (the default URL is a placeholder, not a working endpoint)
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-8}
BENCHMARK_NAME="seal-0"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
# Forwarded unchanged to every run; each run is a separate process
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
# Placeholder default; supply a real key through the environment
API_KEY=${API_KEY:-"xxx"}

# All runs of one benchmark/provider/model/agent combination share this directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all runs; each one executes in its own background subshell
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Identifier used for this run's Hydra directory and log file
    RUN_ID="run_$i"

    (
        # Overrides are quoted so that a value containing spaces or glob
        # characters still reaches the benchmark as a single argument.
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/seal-0 \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? would report tee's exit status here; PIPESTATUS[0] holds the
        # benchmark command's and must be read right after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -n 1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Block until every background subshell has exited
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="


================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_webwalkerqa.sh
================================================
#!/bin/bash
#
# Run the "webwalkerqa" benchmark NUM_RUNS times in parallel, then compute the
# average score across runs.
#
# Relative paths (benchmarks/, ../../logs, ../../data) are resolved against
# the current working directory, so start this script from the directory that
# contains benchmarks/common_benchmark.py.
#
# Every ${VAR:-default} setting below can be overridden from the environment.

# Parse environment variables, use defaults if not set
# (the default URL is a placeholder, not a working endpoint)
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-3}
BENCHMARK_NAME="webwalkerqa"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
# Forwarded unchanged to every run; each run is a separate process
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
# Placeholder default; supply a real key through the environment
API_KEY=${API_KEY:-"xxx"}

# All runs of one benchmark/provider/model/agent combination share this directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all runs; each one executes in its own background subshell
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Identifier used for this run's Hydra directory and log file
    RUN_ID="run_$i"

    (
        # Overrides are quoted so that a value containing spaces or glob
        # characters still reaches the benchmark as a single argument.
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/webwalkerqa \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? would report tee's exit status here; PIPESTATUS[0] holds the
        # benchmark command's and must be read right after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -n 1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Block until every background subshell has exited
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="


================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_xbench_deepsearch.sh
================================================
#!/bin/bash
#
# Run the "xbench_deepsearch" benchmark NUM_RUNS times in parallel, then
# compute the average score across runs.
#
# Relative paths (benchmarks/, ../../logs, ../../data) are resolved against
# the current working directory, so start this script from the directory that
# contains benchmarks/common_benchmark.py.
#
# Every ${VAR:-default} setting below can be overridden from the environment.

# Parse environment variables, use defaults if not set
# (the default URL is a placeholder, not a working endpoint)
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-8}
BENCHMARK_NAME="xbench_deepsearch"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
# Forwarded unchanged to every run; each run is a separate process
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
# Placeholder default; supply a real key through the environment
API_KEY=${API_KEY:-"xxx"}

# All runs of one benchmark/provider/model/agent combination share this directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all runs; each one executes in its own background subshell
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Identifier used for this run's Hydra directory and log file
    RUN_ID="run_$i"

    (
        # Overrides are quoted so that a value containing spaces or glob
        # characters still reaches the benchmark as a single argument.
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/xbench_deepsearch \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? would report tee's exit status here; PIPESTATUS[0] holds the
        # benchmark command's and must be read right after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -n 1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Block until every background subshell has exited
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="


================================================
FILE: apps/miroflow-agent/src/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""MiroFlow Agent - A modular agent framework for task execution."""

from .core.orchestrator import Orchestrator
from .core.pipeline import create_pipeline_components, execute_task_pipeline
from .io.output_formatter import OutputFormatter
from .llm.factory import ClientFactory
from .logging.task_logger import TaskLog, bootstrap_logger

# Public names of this package; each one is re-exported from the submodule
# imports above, and this list is what a star-import of the package exposes.
__all__ = [
    "Orchestrator",
    "create_pipeline_components",
    "execute_task_pipeline",
    "OutputFormatter",
    "ClientFactory",
    "TaskLog",
    "bootstrap_logger",
]


================================================
FILE: apps/miroflow-agent/src/config/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""Configuration module for MiroFlow Agent."""

from .settings import (
    create_mcp_server_parameters,
    expose_sub_agents_as_tools,
    get_env_info,
)

# Public names of the config package, all re-exported from .settings.
__all__ = [
    "create_mcp_server_parameters",
    "expose_sub_agents_as_tools",
    "get_env_info",
]


================================================
FILE: apps/miroflow-agent/src/config/settings.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
Configuration settings and MCP server parameter management.

This module handles:
- Loading environment variables for API keys and service URLs
- Creating MCP server configurations for different tools
- Exposing sub-agents as callable tools
- Collecting environment information for logging
"""

import os
import sys

from dotenv import load_dotenv
from mcp import StdioServerParameters
from omegaconf import DictConfig

# Load environment variables from .env file
load_dotenv()

# Google Search (Serper): key plus endpoint, with the public endpoint as default
SERPER_API_KEY = os.getenv("SERPER_API_KEY")
SERPER_BASE_URL = os.getenv("SERPER_BASE_URL", "https://google.serper.dev")

# Web scraping (Jina): key plus endpoint, with the public endpoint as default
JINA_API_KEY = os.getenv("JINA_API_KEY")
JINA_BASE_URL = os.getenv("JINA_BASE_URL", "https://r.jina.ai")

# Linux sandbox (E2B)
E2B_API_KEY = os.getenv("E2B_API_KEY")

# Open-source audio transcription tool (no defaults: None when unset)
WHISPER_BASE_URL = os.getenv("WHISPER_BASE_URL")
WHISPER_API_KEY = os.getenv("WHISPER_API_KEY")
WHISPER_MODEL_NAME = os.getenv("WHISPER_MODEL_NAME")

# Open-source VQA tool (no defaults: None when unset)
VISION_API_KEY = os.getenv("VISION_API_KEY")
VISION_BASE_URL = os.getenv("VISION_BASE_URL")
VISION_MODEL_NAME = os.getenv("VISION_MODEL_NAME")

# Open-source reasoning tool (no defaults: None when unset)
REASONING_API_KEY = os.getenv("REASONING_API_KEY")
REASONING_BASE_URL = os.getenv("REASONING_BASE_URL")
REASONING_MODEL_NAME = os.getenv("REASONING_MODEL_NAME")

# Anthropic (Claude Sonnet 3.7 as a commercial tool)
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
ANTHROPIC_BASE_URL = os.getenv("ANTHROPIC_BASE_URL", "https://api.anthropic.com")

# OpenAI (commercial tools)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")

# Sogou Search (Tencent Cloud credentials)
TENCENTCLOUD_SECRET_ID = os.getenv("TENCENTCLOUD_SECRET_ID")
TENCENTCLOUD_SECRET_KEY = os.getenv("TENCENTCLOUD_SECRET_KEY")

# Summary LLM (no defaults: None when unset)
SUMMARY_LLM_API_KEY = os.getenv("SUMMARY_LLM_API_KEY")
SUMMARY_LLM_BASE_URL = os.getenv("SUMMARY_LLM_BASE_URL")
SUMMARY_LLM_MODEL_NAME = os.getenv("SUMMARY_LLM_MODEL_NAME")


def _stdio_server_config(name, module, env=None):
    """
    Build one MCP server entry that launches ``module`` via ``python -m``.

    Args:
        name: Server name stored under the 'name' key
        module: Dotted module path run with the current interpreter
        env: Optional environment mapping handed to StdioServerParameters;
            the argument is omitted entirely when None

    Returns:
        Dict with 'name' and 'params' (StdioServerParameters)
    """
    server_kwargs = {"command": sys.executable, "args": ["-m", module]}
    if env is not None:
        server_kwargs["env"] = env
    return {"name": name, "params": StdioServerParameters(**server_kwargs)}


# MCP server configuration generation function
def create_mcp_server_parameters(cfg: DictConfig, agent_cfg: DictConfig):
    """
    Create MCP server configurations based on agent configuration.

    Generates one StdioServerParameters entry for every known tool name found
    in ``agent_cfg["tools"]``, in a fixed order, each with the environment
    variables that tool's server is given.

    NOTE(review): environment values that are unset are forwarded as None;
    confirm that StdioServerParameters accepts None values in ``env``.

    Args:
        cfg: Global Hydra configuration object (not read by this function)
        agent_cfg: Agent-specific configuration containing 'tools' and 'tool_blacklist'

    Returns:
        Tuple of (configs, blacklist) where:
        - configs: List of dicts with 'name' and 'params' (StdioServerParameters)
        - blacklist: Set of (server_name, tool_name) tuples to exclude

    Raises:
        ValueError: If 'tool-google-search' is requested but SERPER_API_KEY is not set
    """
    # Look the tool list up once; a missing or null entry enables nothing.
    enabled_tools = agent_cfg.get("tools", None)
    if enabled_tools is None:
        enabled_tools = ()

    configs = []

    if "tool-google-search" in enabled_tools:
        if not SERPER_API_KEY:
            raise ValueError(
                "SERPER_API_KEY not set, tool-google-search will be unavailable."
            )
        configs.append(
            _stdio_server_config(
                "tool-google-search",
                "miroflow_tools.mcp_servers.searching_google_mcp_server",
                {
                    "SERPER_API_KEY": SERPER_API_KEY,
                    "SERPER_BASE_URL": SERPER_BASE_URL,
                    "JINA_API_KEY": JINA_API_KEY,
                    "JINA_BASE_URL": JINA_BASE_URL,
                },
            )
        )

    if "tool-sogou-search" in enabled_tools:
        configs.append(
            _stdio_server_config(
                "tool-sogou-search",
                "miroflow_tools.mcp_servers.searching_sogou_mcp_server",
                {
                    "TENCENTCLOUD_SECRET_ID": TENCENTCLOUD_SECRET_ID,
                    "TENCENTCLOUD_SECRET_KEY": TENCENTCLOUD_SECRET_KEY,
                    "JINA_API_KEY": JINA_API_KEY,
                    "JINA_BASE_URL": JINA_BASE_URL,
                },
            )
        )

    if "tool-python" in enabled_tools:
        configs.append(
            _stdio_server_config(
                "tool-python",
                "miroflow_tools.mcp_servers.python_mcp_server",
                {"E2B_API_KEY": E2B_API_KEY},
            )
        )

    if "tool-vqa" in enabled_tools:
        configs.append(
            _stdio_server_config(
                "tool-vqa",
                "miroflow_tools.mcp_servers.vision_mcp_server",
                {
                    "OPENAI_API_KEY": OPENAI_API_KEY,
                    "OPENAI_BASE_URL": OPENAI_BASE_URL,
                },
            )
        )

    if "tool-vqa-os" in enabled_tools:
        configs.append(
            _stdio_server_config(
                "tool-vqa-os",
                "miroflow_tools.mcp_servers.vision_mcp_server_os",
                {
                    "VISION_API_KEY": VISION_API_KEY,
                    "VISION_BASE_URL": VISION_BASE_URL,
                    "VISION_MODEL_NAME": VISION_MODEL_NAME,
                },
            )
        )

    if "tool-transcribe" in enabled_tools:
        configs.append(
            _stdio_server_config(
                "tool-transcribe",
                "miroflow_tools.mcp_servers.audio_mcp_server",
                {
                    "OPENAI_API_KEY": OPENAI_API_KEY,
                    "OPENAI_BASE_URL": OPENAI_BASE_URL,
                },
            )
        )

    if "tool-transcribe-os" in enabled_tools:
        configs.append(
            _stdio_server_config(
                "tool-transcribe-os",
                "miroflow_tools.mcp_servers.audio_mcp_server_os",
                {
                    "WHISPER_BASE_URL": WHISPER_BASE_URL,
                    "WHISPER_API_KEY": WHISPER_API_KEY,
                    "WHISPER_MODEL_NAME": WHISPER_MODEL_NAME,
                },
            )
        )

    if "tool-reasoning" in enabled_tools:
        configs.append(
            _stdio_server_config(
                "tool-reasoning",
                "miroflow_tools.mcp_servers.reasoning_mcp_server",
                {
                    "ANTHROPIC_API_KEY": ANTHROPIC_API_KEY,
                    "ANTHROPIC_BASE_URL": ANTHROPIC_BASE_URL,
                },
            )
        )

    if "tool-reasoning-os" in enabled_tools:
        configs.append(
            _stdio_server_config(
                "tool-reasoning-os",
                "miroflow_tools.mcp_servers.reasoning_mcp_server_os",
                {
                    "REASONING_API_KEY": REASONING_API_KEY,
                    "REASONING_BASE_URL": REASONING_BASE_URL,
                    "REASONING_MODEL_NAME": REASONING_MODEL_NAME,
                },
            )
        )

    # reader (launched without an explicit environment)
    if "tool-reader" in enabled_tools:
        configs.append(_stdio_server_config("tool-reader", "markitdown_mcp"))

    if "tool-reading" in enabled_tools:
        configs.append(
            _stdio_server_config(
                "tool-reading",
                "miroflow_tools.mcp_servers.reading_mcp_server",
            )
        )

    if "search_and_scrape_webpage" in enabled_tools:
        configs.append(
            _stdio_server_config(
                "search_and_scrape_webpage",
                "miroflow_tools.dev_mcp_servers.search_and_scrape_webpage",
                {
                    "SERPER_API_KEY": SERPER_API_KEY,
                    "SERPER_BASE_URL": SERPER_BASE_URL,
                    "TENCENTCLOUD_SECRET_ID": TENCENTCLOUD_SECRET_ID,
                    "TENCENTCLOUD_SECRET_KEY": TENCENTCLOUD_SECRET_KEY,
                },
            )
        )

    if "jina_scrape_llm_summary" in enabled_tools:
        configs.append(
            _stdio_server_config(
                "jina_scrape_llm_summary",
                "miroflow_tools.dev_mcp_servers.jina_scrape_llm_summary",
                {
                    "JINA_API_KEY": JINA_API_KEY,
                    "JINA_BASE_URL": JINA_BASE_URL,
                    "SUMMARY_LLM_BASE_URL": SUMMARY_LLM_BASE_URL,
                    "SUMMARY_LLM_MODEL_NAME": SUMMARY_LLM_MODEL_NAME,
                    "SUMMARY_LLM_API_KEY": SUMMARY_LLM_API_KEY,
                },
            )
        )

    if "stateless_python" in enabled_tools:
        configs.append(
            _stdio_server_config(
                "stateless_python",
                "miroflow_tools.dev_mcp_servers.stateless_python_server",
                {"E2B_API_KEY": E2B_API_KEY},
            )
        )

    if "task_planner" in enabled_tools:
        # Generate a random UUID for each MCP server instance to ensure isolation
        # Each time create_mcp_server_parameters is called, a new UUID is generated
        # This automatically isolates todo lists for concurrent tasks
        import uuid

        configs.append(
            _stdio_server_config(
                "task_planner",
                "miroflow_tools.dev_mcp_servers.task_planner",
                {"TASK_ID": str(uuid.uuid4())},
            )
        )

    # `or []` covers a 'tool_blacklist' key that is present but null, which
    # .get() may return as None instead of the default.
    blacklist = {
        (black_list_item[0], black_list_item[1])
        for black_list_item in (agent_cfg.get("tool_blacklist", []) or [])
    }
    return configs, blacklist


def expose_sub_agents_as_tools(sub_agents_cfg: DictConfig):
    """
    Convert sub-agent configurations into tool definitions for the main agent.

    Every key of ``sub_agents_cfg`` whose name contains "agent-browsing" yields
    one 'agent-browsing' server entry exposing a single 'search_and_browse'
    tool. Other keys are ignored.

    Args:
        sub_agents_cfg: Configuration containing sub-agent definitions, or None
            when no sub-agents are configured

    Returns:
        List of server parameter dicts, each with 'name' and 'tools' keys.
        Each tool includes 'name', 'description', and 'schema' for the sub-agent.
        The list is empty when ``sub_agents_cfg`` is None or has no matching key.
    """
    # get_env_info treats cfg.agent.sub_agents as possibly None; accept the
    # same input here instead of failing on `.keys()`.
    if sub_agents_cfg is None:
        return []

    sub_agents_server_params = []
    for sub_agent in sub_agents_cfg.keys():
        # Substring match: any key containing "agent-browsing" qualifies.
        if "agent-browsing" not in sub_agent:
            continue
        sub_agents_server_params.append(
            {
                "name": "agent-browsing",
                "tools": [
                    {
                        "name": "search_and_browse",
                        "description": "This tool is an agent that performs the subtask of searching and browsing the web for specific missing information and generating the desired answer. The subtask should be clearly defined, include relevant background, and focus on factual gaps. It does not perform vague or speculative subtasks. \nArgs: \n\tsubtask: the subtask to be performed. \nReturns: \n\tthe result of the subtask. ",
                        "schema": {
                            "type": "object",
                            "properties": {
                                "subtask": {"title": "Subtask", "type": "string"}
                            },
                            "required": ["subtask"],
                            "title": "search_and_browseArguments",
                        },
                    }
                ],
            }
        )
    return sub_agents_server_params


def get_env_info(cfg: DictConfig) -> dict:
    """
    Snapshot the active configuration and environment for logging.

    Args:
        cfg: Hydra configuration object; ``cfg.llm`` and ``cfg.agent`` are read

    Returns:
        Dictionary holding, in this order:
        - LLM settings (provider, base URL, model, sampling parameters, ...)
        - ``keep_tool_result`` and the max-turn limits of the main agent and
          of each configured sub-agent
        - ``has_*`` booleans telling whether each API key is set (the key
          values themselves are never included)
        - Service base URLs
    """
    llm_cfg = cfg.llm
    agent_cfg = cfg.agent

    # LLM and agent configuration
    env_info = {
        "llm_provider": llm_cfg.provider,
        "llm_base_url": llm_cfg.base_url,
        "llm_model_name": llm_cfg.model_name,
        "llm_temperature": llm_cfg.temperature,
        "llm_top_p": llm_cfg.top_p,
        "llm_min_p": llm_cfg.min_p,
        "llm_top_k": llm_cfg.top_k,
        "llm_max_tokens": llm_cfg.max_tokens,
        "llm_repetition_penalty": llm_cfg.repetition_penalty,
        "llm_async_client": llm_cfg.async_client,
        "keep_tool_result": agent_cfg.keep_tool_result,
        "main_agent_max_turns": agent_cfg.main_agent.max_turns,
    }

    # One max-turns entry per sub-agent; skipped when none are configured
    sub_agents = agent_cfg.sub_agents
    if sub_agents is not None:
        for sub_agent in sub_agents:
            env_info[f"sub_{sub_agent}_max_turns"] = sub_agents[sub_agent].max_turns

    # API keys are reported only as presence flags
    env_info.update(
        {
            "has_serper_api_key": bool(SERPER_API_KEY),
            "has_jina_api_key": bool(JINA_API_KEY),
            "has_anthropic_api_key": bool(ANTHROPIC_API_KEY),
            "has_openai_api_key": bool(OPENAI_API_KEY),
            "has_e2b_api_key": bool(E2B_API_KEY),
            "has_tencent_secret_id": bool(TENCENTCLOUD_SECRET_ID),
            "has_tencent_secret_key": bool(TENCENTCLOUD_SECRET_KEY),
            "has_summary_llm_api_key": bool(SUMMARY_LLM_API_KEY),
        }
    )

    # Base URLs
    env_info.update(
        {
            "openai_base_url": OPENAI_BASE_URL,
            "anthropic_base_url": ANTHROPIC_BASE_URL,
            "jina_base_url": JINA_BASE_URL,
            "serper_base_url": SERPER_BASE_URL,
            "whisper_base_url": WHISPER_BASE_URL,
            "vision_base_url": VISION_BASE_URL,
            "reasoning_base_url": REASONING_BASE_URL,
            "summary_llm_base_url": SUMMARY_LLM_BASE_URL,
        }
    )
    return env_info


================================================
FILE: apps/miroflow-agent/src/core/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""Core module containing orchestrator and pipeline components."""

from .answer_generator import AnswerGenerator
from .orchestrator import Orchestrator
from .pipeline import create_pipeline_components, execute_task_pipeline
from .stream_handler import StreamHandler
from .tool_executor import ToolExecutor

# Public names of the core package; each is re-exported from the submodule
# imports above.
__all__ = [
    "AnswerGenerator",
    "Orchestrator",
    "StreamHandler",
    "ToolExecutor",
    "create_pipeline_components",
    "execute_task_pipeline",
]


================================================
FILE: apps/miroflow-agent/src/core/answer_generator.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
Answer generator module for final answer generation and context management.

This module provides the AnswerGenerator class that handles:
- LLM call processing
- Failure summary generation for context compression
- Final answer generation with retries
- Context management fallback strategies
"""

import logging
from typing import Any, Dict, List, Optional, Tuple

from omegaconf import DictConfig

from ..io.output_formatter import OutputFormatter
from ..llm.base_client import BaseClient
from ..logging.task_logger import TaskLog
from ..utils.parsing_utils import extract_failure_experience_summary
from ..utils.prompt_utils import (
    FAILURE_SUMMARY_ASSISTANT_PREFIX,
    FAILURE_SUMMARY_PROMPT,
    FORMAT_ERROR_MESSAGE,
    generate_agent_summarize_prompt,
)
from ..utils.wrapper_utils import ErrorBox, ResponseBox
from .stream_handler import StreamHandler

logger = logging.getLogger(__name__)

# Safety limit for the final-answer retry loop: AnswerGenerator allows this many
# attempts when cfg.agent.keep_tool_result == -1 and a single attempt otherwise.
DEFAULT_MAX_FINAL_ANSWER_RETRIES = 3


class AnswerGenerator:
    """
    Generator for final answers with context management support.

    Handles the generation of final answers, failure summaries for retry,
    and various fallback strategies based on context management settings.
    """

    def __init__(
        self,
        llm_client: BaseClient,
        output_formatter: OutputFormatter,
        task_log: TaskLog,
        stream_handler: StreamHandler,
        cfg: DictConfig,
        intermediate_boxed_answers: List[str],
    ):
        """
        Initialize the answer generator.

        Args:
            llm_client: The LLM client for API calls
            output_formatter: Formatter for output processing
            task_log: Logger for task execution
            stream_handler: Handler for streaming events
            cfg: Configuration object
            intermediate_boxed_answers: List to track intermediate answers
        """
        self.llm_client = llm_client
        self.output_formatter = output_formatter
        self.task_log = task_log
        self.stream = stream_handler
        self.cfg = cfg
        self.intermediate_boxed_answers = intermediate_boxed_answers

        # Context management settings
        self.context_compress_limit = cfg.agent.get("context_compress_limit", 0)
        self.max_final_answer_retries = (
            DEFAULT_MAX_FINAL_ANSWER_RETRIES if cfg.agent.keep_tool_result == -1 else 1
        )
        self.retry_with_summary = cfg.agent.get("retry_with_summary", True)

    async def handle_llm_call(
        self,
        system_prompt: str,
        message_history: List[Dict[str, Any]],
        tool_definitions: List[Dict],
        step_id: int,
        purpose: str = "",
        agent_type: str = "main",
    ) -> Tuple[Optional[str], bool, Optional[Any], List[Dict[str, Any]]]:
        """
        Unified LLM call and logging processing.

        Args:
            system_prompt: System prompt for the LLM
            message_history: Conversation history
            tool_definitions: Available tool definitions
            step_id: Current step ID for logging
            purpose: Description of the call purpose
            agent_type: Type of agent making the call

        Returns:
            Tuple of (response_text, should_break, tool_calls_info, message_history)
        """
        original_message_history = message_history
        try:
            response, message_history = await self.llm_client.create_message(
                system_prompt=system_prompt,
                message_history=message_history,
                tool_definitions=tool_definitions,
                keep_tool_result=self.cfg.agent.keep_tool_result,
                step_id=step_id,
                task_log=self.task_log,
                agent_type=agent_type,
            )

            if ErrorBox.is_error_box(response):
                await self.stream.show_error(str(response))
                response = None

            if ResponseBox.is_response_box(response):
                if response.has_extra_info():
                    extra_info = response.get_extra_info()
                    if extra_info.get("warning_msg"):
                        await self.stream.show_error(
                            extra_info.get("warning_msg", "Empty warning message")
                        )
                response = response.get_response()

            # Check if response is None (indicating an error occurred)
            if response is None:
                self.task_log.log_step(
                    "error",
                    f"{purpose} | LLM Call Failed",
                    f"{purpose} failed - no response received",
                )
                return "", False, None, original_message_history

            # Use client's response processing method
            assistant_response_text, should_break, message_history = (
                self.llm_client.process_llm_response(
                    response, message_history, agent_type
                )
            )

            # Use client's tool call information extraction method
            tool_calls_info = self.llm_client.extract_tool_calls_info(
                response, assistant_response_text
            )

            self.task_log.log_step(
                "info",
                f"{purpose} | LLM Call",
                "completed successfully",
            )
            return (
                assistant_response_text,
                should_break,
                tool_calls_info,
                message_history,
            )

        except Exception as e:
            self.task_log.log_step(
                "error",
                f"{purpose} | LLM Call ERROR",
                f"{purpose} error: {str(e)}",
            )
            # Return empty response with should_break=False, need to retry
            return "", False, None, original_message_history

    async def generate_failure_summary(
        self,
        system_prompt: str,
        message_history: List[Dict[str, Any]],
        tool_definitions: List[Dict],
        turn_count: int,
    ) -> Optional[str]:
        """
        Compress a failed attempt into a failure experience summary.

        This is the core of the context management mechanism. When a task attempt
        fails (the task is not completed within the given turns and context
        window), the conversation is sent back to the LLM together with
        FAILURE_SUMMARY_PROMPT, and the reply is reduced to a structured summary
        covering:
        - Failure type: incomplete / blocked / misdirected / format_missed
        - What happened: the approach taken and why a final answer was not reached
        - Useful findings: facts, intermediate results, or conclusions to be reused

        Args:
            system_prompt: The system prompt used in the conversation
            message_history: The full conversation history to be compressed
                (a copy is extended; the list passed in is left untouched)
            tool_definitions: Available tool definitions
            turn_count: Current turn count for step ID

        Returns:
            The compressed failure experience summary, or None if generation failed
        """
        self.task_log.log_step(
            "info",
            "Main Agent | Failure Summary",
            "Generating failure experience summary for potential retry...",
        )

        # Copy the history, drop a trailing user turn, then append the summary
        # request followed by an assistant prefix that steers the output format.
        summary_history = list(message_history)
        if summary_history and summary_history[-1]["role"] == "user":
            del summary_history[-1]
        summary_history.extend(
            [
                {"role": "user", "content": FAILURE_SUMMARY_PROMPT},
                {"role": "assistant", "content": FAILURE_SUMMARY_ASSISTANT_PREFIX},
            ]
        )

        summary_body, _, _, _ = await self.handle_llm_call(
            system_prompt,
            summary_history,
            tool_definitions,
            turn_count + 10,  # Use a different step id
            "Main Agent | Failure Experience Summary",
            agent_type="main",
        )

        if not summary_body:
            self.task_log.log_step(
                "warning",
                "Main Agent | Failure Summary",
                "Failed to generate failure experience summary",
            )
            return None

        # The prefix is glued back onto the reply before the summary is extracted
        failure_experience_summary = extract_failure_experience_summary(
            FAILURE_SUMMARY_ASSISTANT_PREFIX + summary_body
        )

        # Log at most 500 characters, marking the cut with "..." only when one happened
        log_preview = failure_experience_summary[:500]
        if len(failure_experience_summary) > 500:
            log_preview = log_preview + "..."
        self.task_log.log_step(
            "info",
            "Main Agent | Failure Summary",
            f"Generated failure experience summary:\n{log_preview}",
        )
        return failure_experience_summary

    async def generate_final_answer_with_retries(
        self,
        system_prompt: str,
        message_history: List[Dict[str, Any]],
        tool_definitions: List[Dict],
        turn_count: int,
        task_description: str,
    ) -> Tuple[Optional[str], str, Optional[str], str, List[Dict[str, Any]]]:
        """
        Generate final answer with retry mechanism.

        A summary prompt is appended to the history (replacing a trailing user
        turn, if any) and the LLM is called up to ``self.max_final_answer_retries``
        times, stopping at the first attempt whose boxed answer is not
        FORMAT_ERROR_MESSAGE.

        Args:
            system_prompt: System prompt for the LLM
            message_history: Conversation history; modified in place
            tool_definitions: Available tool definitions
            turn_count: Current turn count
            task_description: Original task description

        Returns:
            Tuple of (final_answer_text, final_summary, final_boxed_answer, usage_log, message_history).
            final_boxed_answer is FORMAT_ERROR_MESSAGE when no attempt produced one.
        """
        # Generate summary prompt
        summary_prompt = generate_agent_summarize_prompt(
            task_description,
            agent_type="main",
        )

        # Replace a trailing user turn with the summary prompt. The emptiness
        # check avoids an IndexError when the history holds no messages.
        if message_history and message_history[-1]["role"] == "user":
            message_history.pop()
        message_history.append({"role": "user", "content": summary_prompt})

        final_answer_text = None
        final_boxed_answer = None
        final_summary = ""
        usage_log = ""

        for retry_idx in range(self.max_final_answer_retries):
            attempt = retry_idx + 1
            (
                final_answer_text,
                _,
                _,
                message_history,
            ) = await self.handle_llm_call(
                system_prompt,
                message_history,
                tool_definitions,
                turn_count + 1 + retry_idx,
                f"Main agent | Final Summary (attempt {attempt}/{self.max_final_answer_retries})",
                agent_type="main",
            )

            if final_answer_text:
                final_summary, final_boxed_answer, usage_log = (
                    self.output_formatter.format_final_summary_and_log(
                        final_answer_text, self.llm_client
                    )
                )

                if final_boxed_answer != FORMAT_ERROR_MESSAGE:
                    self.task_log.log_step(
                        "info",
                        "Main Agent | Final Answer",
                        f"Boxed answer found on attempt {attempt}",
                    )
                    break

                self.task_log.log_step(
                    "warning",
                    "Main Agent | Final Answer",
                    f"No boxed answer on attempt {attempt}, retrying...",
                )
            else:
                self.task_log.log_step(
                    "warning",
                    "Main Agent | Final Answer",
                    f"Failed to generate answer on attempt {attempt}",
                )

            # Common to both failure modes: if another attempt follows, drop the
            # rejected assistant turn so the retry starts from the summary prompt.
            if (
                retry_idx < self.max_final_answer_retries - 1
                and message_history
                and message_history[-1]["role"] == "assistant"
            ):
                message_history.pop()

        # Ensure final_boxed_answer is never None
        if final_boxed_answer is None:
            final_boxed_answer = FORMAT_ERROR_MESSAGE

        return (
            final_answer_text,
            final_summary,
            final_boxed_answer,
            usage_log,
            message_history,
        )

    def handle_no_context_management_fallback(
        self,
        final_answer_text: Optional[str],
        final_summary: str,
        final_boxed_answer: Optional[str],
    ) -> Tuple[str, str, str]:
        """
        Finalize the answer in the mode where intermediate answers may be used.

        Intended for context_compress_limit == 0 (no context management), where
        the model gets a single chance to answer. When no valid boxed answer is
        available, the most recent entry of ``self.intermediate_boxed_answers``
        is used instead of reporting a format error.

        Args:
            final_answer_text: The generated final answer text (may be empty/None)
            final_summary: The final summary
            final_boxed_answer: The extracted boxed answer (may be None)

        Returns:
            Tuple of (final_answer_text, final_summary, final_boxed_answer)
        """
        if final_answer_text:
            self.task_log.log_step(
                "info",
                "Main Agent | Final Answer",
                f"Final answer content:\n\n{final_answer_text}",
            )
        else:
            # Nothing was generated: report it and substitute placeholder values
            self.task_log.log_step(
                "error",
                "Main Agent | Final Answer",
                "Unable to generate final answer after all retries",
            )
            final_answer_text = "No final answer generated."
            final_summary = final_answer_text
            final_boxed_answer = FORMAT_ERROR_MESSAGE

        # Prefer the latest intermediate boxed answer over a missing/invalid one
        lacks_boxed_answer = (
            final_boxed_answer is None or final_boxed_answer == FORMAT_ERROR_MESSAGE
        )
        if lacks_boxed_answer and self.intermediate_boxed_answers:
            final_boxed_answer = self.intermediate_boxed_answers[-1]
            self.task_log.log_step(
                "info",
                "Main Agent | Final Answer (No Context Management)",
                f"Using intermediate boxed answer as fallback: {final_boxed_answer}",
            )

        # Callers always receive a string for the boxed answer
        if final_boxed_answer is None:
            final_boxed_answer = FORMAT_ERROR_MESSAGE

        return final_answer_text, final_summary, final_boxed_answer

    def handle_context_management_no_fallback(
        self,
        final_answer_text: Optional[str],
        final_summary: str,
        final_boxed_answer: Optional[str],
    ) -> Tuple[str, str, str]:
        """
        Finalize the answer in the mode where intermediate answers are NOT used.

        Intended for context_compress_limit > 0 (context management enabled).
        In that mode the model gets further attempts, so a missing boxed answer
        is reported as a format error rather than replaced by a guess:
        - A wrong guess can reduce accuracy
        - The next attempt can benefit from the failure experience

        Args:
            final_answer_text: The generated final answer text (may be empty/None)
            final_summary: The final summary
            final_boxed_answer: The extracted boxed answer (may be None)

        Returns:
            Tuple of (final_answer_text, final_summary, final_boxed_answer)
        """
        if final_answer_text:
            self.task_log.log_step(
                "info",
                "Main Agent | Final Answer",
                f"Final answer content:\n\n{final_answer_text}",
            )
        else:
            # Nothing was generated: report it and substitute placeholder values
            self.task_log.log_step(
                "error",
                "Main Agent | Final Answer",
                "Unable to generate final answer after all retries",
            )
            final_answer_text = "No final answer generated."
            final_summary = final_answer_text
            final_boxed_answer = FORMAT_ERROR_MESSAGE

        # Callers always receive a string for the boxed answer
        if final_boxed_answer is None:
            final_boxed_answer = FORMAT_ERROR_MESSAGE

        # Deliberately no intermediate fallback here; only record the outcome
        answer_is_invalid = final_boxed_answer == FORMAT_ERROR_MESSAGE
        if answer_is_invalid:
            self.task_log.log_step(
                "info",
                "Main Agent | Final Answer (Context Management Mode)",
                "No valid boxed answer found. Not using intermediate fallback - will generate failure summary for retry.",
            )

        return final_answer_text, final_summary, final_boxed_answer

    async def generate_and_finalize_answer(
        self,
        system_prompt: str,
        message_history: List[Dict[str, Any]],
        tool_definitions: List[Dict],
        turn_count: int,
        task_description: str,
        reached_max_turns: bool = False,
        is_final_retry: bool = False,
        save_callback=None,
    ) -> Tuple[str, str, Optional[str], str, List[Dict[str, Any]]]:
        """
        Produce the final answer and apply the fallback policy for the current mode.

        Context management is treated as enabled when context_compress_limit > 0.

        Decision table based on (context_management, reached_max_turns, is_final_retry):

        | Context Management | Reached Max Turns | Final Retry | Behavior                                    |
        |--------------------|-------------------|-------------|---------------------------------------------|
        | OFF (limit=0)      | any               | any         | Generate answer → fallback to intermediate  |
        | ON  (limit>0)      | any               | Yes         | Generate answer → fallback to intermediate  |
        | ON  (limit>0)      | No                | No          | Generate answer → no fallback, fail summary |
        | ON  (limit>0)      | Yes               | No          | SKIP generation → fail summary directly     |

        A failure summary is only generated when ``self.retry_with_summary`` is truthy.

        Args:
            system_prompt: System prompt for the LLM
            message_history: Conversation history
            tool_definitions: Available tool definitions
            turn_count: Current turn count
            task_description: Original task description
            reached_max_turns: Whether the main loop ended due to reaching max turns
            is_final_retry: Whether this is the last attempt (no further retry follows)
            save_callback: Optional callable invoked as
                save_callback(system_prompt, message_history) to save the history

        Returns:
            Tuple of (final_summary, final_boxed_answer, failure_experience_summary, usage_log, message_history)
        """
        failure_experience_summary = None
        context_management_enabled = self.context_compress_limit > 0

        # With context management on and another attempt still available, an answer
        # produced after hitting max turns would be a blind guess, so none is generated.
        skip_generation = (
            context_management_enabled and reached_max_turns and not is_final_retry
        )
        if skip_generation:
            self.task_log.log_step(
                "info",
                "Main Agent | Final Answer (Context Management Mode)",
                "Reached max turns. Skipping answer generation to avoid blind guessing.",
            )

            if save_callback:
                save_callback(system_prompt, message_history)

            if self.retry_with_summary:
                failure_experience_summary = await self.generate_failure_summary(
                    system_prompt, message_history, tool_definitions, turn_count
                )

            # No LLM answer call was made, hence the empty usage log
            return (
                "Task incomplete - reached maximum turns. Will retry with failure experience.",
                FORMAT_ERROR_MESSAGE,
                failure_experience_summary,
                "",
                message_history,
            )

        # Every other combination generates an answer first (including the final
        # retry after reaching max turns, which is the last chance to answer).
        generation_result = await self.generate_final_answer_with_retries(
            system_prompt=system_prompt,
            message_history=message_history,
            tool_definitions=tool_definitions,
            turn_count=turn_count,
            task_description=task_description,
        )
        (
            final_answer_text,
            final_summary,
            final_boxed_answer,
            usage_log,
            message_history,
        ) = generation_result

        if save_callback:
            save_callback(system_prompt, message_history)

        # Context management off, or no retry left: intermediate answers may be used
        allow_intermediate_fallback = is_final_retry or not context_management_enabled
        if allow_intermediate_fallback:
            _, final_summary, final_boxed_answer = (
                self.handle_no_context_management_fallback(
                    final_answer_text, final_summary, final_boxed_answer
                )
            )
            if is_final_retry:
                self.task_log.log_step(
                    "info",
                    "Main Agent | Final Answer (Final Retry)",
                    "This is the final retry. Using intermediate fallback if available.",
                )
            return (
                final_summary,
                final_boxed_answer,
                None,
                usage_log,
                message_history,
            )

        # Context management on with a retry still available: never guess
        _, final_summary, final_boxed_answer = (
            self.handle_context_management_no_fallback(
                final_answer_text, final_summary, final_boxed_answer
            )
        )

        needs_failure_summary = (
            final_boxed_answer == FORMAT_ERROR_MESSAGE and self.retry_with_summary
        )
        if needs_failure_summary:
            failure_experience_summary = await self.generate_failure_summary(
                system_prompt, message_history, tool_definitions, turn_count
            )

        return (
            final_summary,
            final_boxed_answer,
            failure_experience_summary,
            usage_log,
            message_history,
        )


================================================
FILE: apps/miroflow-agent/src/core/orchestrator.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
Orchestrator module for coordinating agent task execution.

This module contains the main Orchestrator class that manages the execution of tasks
by coordinating between the main agent, sub-agents, and various tools.
"""

import asyncio
import gc
import logging
import time
import uuid
from collections import defaultdict
from datetime import date
from typing import Any, Dict, List, Optional

from miroflow_tools.manager import ToolManager
from omegaconf import DictConfig

from ..config.settings import expose_sub_agents_as_tools
from ..io.input_handler import process_input
from ..io.output_formatter import OutputFormatter
from ..llm.base_client import BaseClient
from ..logging.task_logger import TaskLog, get_utc_plus_8_time
from ..utils.parsing_utils import extract_llm_response_text
from ..utils.prompt_utils import (
    generate_agent_specific_system_prompt,
    generate_agent_summarize_prompt,
    mcp_tags,
    refusal_keywords,
)
from .answer_generator import AnswerGenerator
from .stream_handler import StreamHandler
from .tool_executor import ToolExecutor

# Module-level logger named after this module
logger = logging.getLogger(__name__)


# =============================================================================
# Constants
# =============================================================================

# Default timeout for LLM calls in seconds
# NOTE(review): not referenced in the portion of this module reviewed here -
# confirm it is still used before relying on it.
DEFAULT_LLM_TIMEOUT = 600

# Safety limit for retry loops: the Orchestrator copies this value to
# MAX_CONSECUTIVE_ROLLBACKS and also passes it to ToolExecutor. A turn is only
# rolled back while consecutive_rollbacks < limit - 1.
DEFAULT_MAX_CONSECUTIVE_ROLLBACKS = 5

# Additional attempts beyond max_turns for total loop protection. Rollbacks
# decrement turn_count, so the agent loop is also capped at
# max_turns + EXTRA_ATTEMPTS_BUFFER total attempts.
EXTRA_ATTEMPTS_BUFFER = 200


def _list_tools(sub_agent_tool_managers: Dict[str, ToolManager]):
    """
    Build a memoizing async getter for sub-agent tool definitions.

    The returned coroutine function asks every tool manager for its tool
    definitions the first time it is awaited and keeps the resulting mapping.
    Later awaits return that same mapping without contacting the managers
    again. If a fetch raises, nothing is stored and the next await retries.

    Args:
        sub_agent_tool_managers: Dictionary mapping sub-agent names to their ToolManager instances.

    Returns:
        An async function that returns a dictionary of tool definitions for each sub-agent.
    """
    cached_definitions = None

    async def wrapped():
        nonlocal cached_definitions
        if cached_definitions is not None:
            return cached_definitions

        # First call: query the managers one after another, in mapping order
        definitions_by_agent = {}
        for agent_name, manager in sub_agent_tool_managers.items():
            definitions_by_agent[agent_name] = (
                await manager.get_all_tool_definitions()
            )

        cached_definitions = definitions_by_agent
        return cached_definitions

    return wrapped


class Orchestrator:
    """
    Main orchestrator for coordinating agent task execution.

    Manages the execution loop for main and sub-agents, coordinating
    LLM calls, tool execution, streaming events, and context management.
    """

    def __init__(
        self,
        main_agent_tool_manager: ToolManager,
        sub_agent_tool_managers: Dict[str, ToolManager],
        llm_client: BaseClient,
        output_formatter: OutputFormatter,
        cfg: DictConfig,
        task_log: Optional["TaskLog"] = None,
        stream_queue: Optional[Any] = None,
        tool_definitions: Optional[List[Dict[str, Any]]] = None,
        sub_agent_tool_definitions: Optional[Dict[str, List[Dict[str, Any]]]] = None,
    ):
        """
        Set up the orchestrator and its helper components.

        Args:
            main_agent_tool_manager: Tool manager for main agent
            sub_agent_tool_managers: Dictionary of tool managers for sub-agents
            llm_client: The LLM client for API calls
            output_formatter: Formatter for output processing
            cfg: Configuration object; ``cfg.agent.context_compress_limit`` is
                read here and defaults to 0 when absent
            task_log: Logger for task execution; when given together with an
                llm_client it is also attached to the client
            stream_queue: Optional async queue for streaming events
            tool_definitions: Pre-fetched tool definitions (optional)
            sub_agent_tool_definitions: Pre-fetched sub-agent tool definitions (optional)
        """
        # Collaborators and configuration supplied by the caller
        self.cfg = cfg
        self.llm_client = llm_client
        self.output_formatter = output_formatter
        self.task_log = task_log
        self.stream_queue = stream_queue
        self.main_agent_tool_manager = main_agent_tool_manager
        self.sub_agent_tool_managers = sub_agent_tool_managers
        self.tool_definitions = tool_definitions
        self.sub_agent_tool_definitions = sub_agent_tool_definitions

        # Cached fetcher for sub-agent tool definitions; None without sub-agents
        self._list_sub_agent_tools = (
            _list_tools(sub_agent_tool_managers) if sub_agent_tool_managers else None
        )

        # Let the LLM client write to the same task log
        if self.llm_client and task_log:
            self.llm_client.task_log = task_log

        # Boxed answers extracted during main loop turns; this same list object
        # is handed to the answer generator below
        self.intermediate_boxed_answers: List[str] = []

        # Per-cache counts of used subtask / q / Query strings (duplicate detection)
        self.used_queries: Dict[str, Dict[str, int]] = {}

        # Retry loop protection limit
        self.MAX_CONSECUTIVE_ROLLBACKS = DEFAULT_MAX_CONSECUTIVE_ROLLBACKS

        # Context management setting (0 when not configured)
        self.context_compress_limit = cfg.agent.get("context_compress_limit", 0)

        # Helper components: streaming first, since the other two receive it
        self.stream = StreamHandler(stream_queue)
        self.tool_executor = ToolExecutor(
            main_agent_tool_manager=self.main_agent_tool_manager,
            sub_agent_tool_managers=self.sub_agent_tool_managers,
            output_formatter=self.output_formatter,
            task_log=self.task_log,
            stream_handler=self.stream,
            max_consecutive_rollbacks=self.MAX_CONSECUTIVE_ROLLBACKS,
        )
        self.answer_generator = AnswerGenerator(
            llm_client=self.llm_client,
            output_formatter=self.output_formatter,
            task_log=self.task_log,
            stream_handler=self.stream,
            cfg=self.cfg,
            intermediate_boxed_answers=self.intermediate_boxed_answers,
        )

    def _save_message_history(
        self, system_prompt: str, message_history: List[Dict[str, Any]]
    ):
        """Save message history to task log."""
        self.task_log.main_agent_message_history = {
            "system_prompt": system_prompt,
            "message_history": message_history,
        }
        self.task_log.save()

    async def _handle_response_format_issues(
        self,
        assistant_response_text: str,
        message_history: List[Dict[str, Any]],
        turn_count: int,
        consecutive_rollbacks: int,
        total_attempts: int,
        max_attempts: int,
        agent_name: str,
    ) -> tuple:
        """
        Handle MCP tag format errors and refusal keywords in a response without tool calls.

        MCP tags are checked first; refusal keywords are only considered when no
        MCP tag is present. When an issue is found and the rollback budget
        (consecutive_rollbacks < MAX_CONSECUTIVE_ROLLBACKS - 1) is not exhausted,
        the turn is rolled back: turn_count is decremented, consecutive_rollbacks
        is incremented and a trailing assistant message is removed from
        message_history (in place). Otherwise the loop is asked to end.

        Args:
            assistant_response_text: The LLM response text
            message_history: Current message history (may be mutated)
            turn_count: Current turn count
            consecutive_rollbacks: Current consecutive rollback count
            total_attempts: Total attempts made (used for logging only)
            max_attempts: Maximum allowed attempts (used for logging only)
            agent_name: Name of the agent for logging

        Returns:
            Tuple of (should_continue, should_break, turn_count, consecutive_rollbacks, message_history).
            (True, False, ...) means the turn was rolled back and should be retried;
            (False, True, ...) means the loop should end, either because the
            rollback budget is exhausted or because the response had no format issue.
        """
        has_mcp_tags = any(mcp_tag in assistant_response_text for mcp_tag in mcp_tags)

        # Scan the refusal keywords once, and only when MCP tags did not already match
        matched_keywords = (
            []
            if has_mcp_tags
            else [kw for kw in refusal_keywords if kw in assistant_response_text]
        )

        # No format issues - normal end without tool calls
        if not has_mcp_tags and not matched_keywords:
            return False, True, turn_count, consecutive_rollbacks, message_history

        # Rollback budget exhausted: stop the loop instead of retrying again
        if consecutive_rollbacks >= self.MAX_CONSECUTIVE_ROLLBACKS - 1:
            if has_mcp_tags:
                ending_reason = f"Ending agent loop after {consecutive_rollbacks} consecutive MCP format errors"
            else:
                ending_reason = f"Ending agent loop after {consecutive_rollbacks} consecutive refusals with keywords: {matched_keywords}"
            self.task_log.log_step(
                "warning",
                f"{agent_name} | Turn: {turn_count} | End After Max Rollbacks",
                ending_reason,
            )
            return False, True, turn_count, consecutive_rollbacks, message_history

        # Roll the turn back so the caller can retry it
        turn_count -= 1
        consecutive_rollbacks += 1
        # Guard against an empty history so the rollback can never raise IndexError
        if message_history and message_history[-1]["role"] == "assistant":
            message_history.pop()

        if has_mcp_tags:
            rollback_reason = (
                "Tool call format incorrect - found MCP tags in response. "
            )
        else:
            rollback_reason = (
                f"LLM refused to answer - found refusal keywords: {matched_keywords}. "
            )
        # Logged after the counters are updated, so the message shows the new values
        self.task_log.log_step(
            "warning",
            f"{agent_name} | Turn: {turn_count} | Rollback",
            rollback_reason
            + f"Consecutive rollbacks: {consecutive_rollbacks}/{self.MAX_CONSECUTIVE_ROLLBACKS}, "
            f"Total attempts: {total_attempts}/{max_attempts}",
        )
        return True, False, turn_count, consecutive_rollbacks, message_history

    async def _check_duplicate_query(
        self,
        tool_name: str,
        arguments: dict,
        cache_name: str,
        consecutive_rollbacks: int,
        turn_count: int,
        total_attempts: int,
        max_attempts: int,
        message_history: List[Dict[str, Any]],
        agent_name: str,
    ) -> tuple:
        """
        Check for duplicate queries and handle rollback if needed.

        Args:
            tool_name: Name of the tool being called
            arguments: Tool arguments
            cache_name: Name of the query cache to use
            consecutive_rollbacks: Current consecutive rollback count
            turn_count: Current turn count
            total_attempts: Total attempts made
            max_attempts: Maximum allowed attempts
            message_history: Current message history
            agent_name: Name of the agent for logging

        Returns:
            Tuple of (is_duplicate, should_rollback, turn_count, consecutive_rollbacks, message_history)
        """
        query_str = self.tool_executor.get_query_str_from_tool_call(
            tool_name, arguments
        )
        if not query_str:
            return False, False, turn_count, consecutive_rollbacks, message_history

        self.used_queries.setdefault(cache_name, defaultdict(int))
        count = self.used_queries[cache_name][query_str]

        if count > 0:
            if consecutive_rollbacks < self.MAX_CONSECUTIVE_ROLLBACKS - 1:
                message_history.pop()
                turn_count -= 1
                consecutive_rollbacks += 1
                self.task_log.log_step(
                    "warning",
                    f"{agent_name} | Turn: {turn_count} | Rollback",
                    f"Duplicate query detected - tool: {tool_name}, query: '{query_str}', "
                    f"previous count: {count}. Consecutive rollbacks: {consecutive_rollbacks}/"
                    f"{self.MAX_CONSECUTIVE_ROLLBACKS}, Total attempts: {total_attempts}/{max_attempts}",
                )
                return True, True, turn_count, consecutive_rollbacks, message_history
            else:
                self.task_log.log_step(
                    "warning",
                    f"{agent_name} | Turn: {turn_count} | Allow Duplicate",
                    f"Allowing duplicate query after {consecutive_rollbacks} rollbacks - "
                    f"tool: {tool_name}, query: '{query_str}', previous count: {count}",
                )

        return False, False, turn_count, consecutive_rollbacks, message_history

    async def _record_query(self, cache_name: str, tool_name: str, arguments: dict):
        """Record a successful query execution."""
        query_str = self.tool_executor.get_query_str_from_tool_call(
            tool_name, arguments
        )
        if query_str:
            self.used_queries.setdefault(cache_name, defaultdict(int))
            self.used_queries[cache_name][query_str] += 1

    async def run_sub_agent(
        self,
        sub_agent_name: str,
        task_description: str,
    ):
        """
        Run a sub-agent to handle a subtask.

        Args:
            sub_agent_name: Name of the sub-agent to run
            task_description: Description of the subtask

        Returns:
            The final answer text from the sub-agent
        """
        task_description += "\n\nPlease provide the answer and detailed supporting information of the subtask given to you."
        self.task_log.log_step(
            "info",
            f"{sub_agent_name} | Task Description",
            f"Subtask: {task_description}",
        )

        # Stream sub-agent start
        display_name = sub_agent_name.replace("agent-", "")
        sub_agent_id = await self.stream.start_agent(display_name)
        await self.stream.start_llm(display_name)

        # Start new sub-agent session
        self.task_log.start_sub_agent_session(sub_agent_name, task_description)

        # Initialize message history
        message_history = [{"role": "user", "content": task_description}]

        # Get sub-agent tool definitions
        if not self.sub_agent_tool_definitions:
            tool_definitions = await self._list_sub_agent_tools()
            tool_definitions = tool_definitions.get(sub_agent_name, {})
        else:
            tool_definitions = self.sub_agent_tool_definitions[sub_agent_name]

        if not tool_definitions:
            self.task_log.log_step(
                "warning",
                f"{sub_agent_name} | No Tools",
                "No tool definitions available.",
            )

        # Generate sub-agent system prompt
        system_prompt = self.llm_client.generate_agent_system_prompt(
            date=date.today(),
            mcp_servers=tool_definitions,
        ) + generate_agent_specific_system_prompt(agent_type=sub_agent_name)

        # Limit sub-agent turns
        if self.cfg.agent.sub_agents:
            max_turns = self.cfg.agent.sub_agents[sub_agent_name].max_turns
        else:
            max_turns = 0
        turn_count = 0
        total_attempts = 0
        max_attempts = max_turns + EXTRA_ATTEMPTS_BUFFER
        consecutive_rollbacks = 0

        while turn_count < max_turns and total_attempts < max_attempts:
            turn_count += 1
            total_attempts += 1

            if consecutive_rollbacks >= self.MAX_CONSECUTIVE_ROLLBACKS:
                self.task_log.log_step(
                    "error",
                    f"{sub_agent_name} | Too Many Rollbacks",
                    f"Reached {consecutive_rollbacks} consecutive rollbacks, breaking loop.",
                )
                break

            self.task_log.save()

            # Reset 'last_call_tokens'
            self.llm_client.last_call_tokens = {
                "prompt_tokens": 0,
                "completion_tokens": 0,
            }

            # LLM call using answer generator
            (
                assistant_response_text,
                should_break,
                tool_calls,
                message_history,
            ) = await self.answer_generator.handle_llm_call(
                system_prompt,
                message_history,
                tool_definitions,
                turn_count,
                f"{sub_agent_name} | Turn: {turn_count}",
                agent_type=sub_agent_name,
            )

            if should_break:
                self.task_log.log_step(
                    "info",
                    f"{sub_agent_name} | Turn: {turn_count} | LLM Call",
                    "should break is True, breaking the loop",
                )
                break

            if assistant_response_text:
                text_response = extract_llm_response_text(assistant_response_text)
                if text_response:
                    await self.stream.tool_call("show_text", {"text": text_response})
            else:
                self.task_log.log_step(
                    "info",
                    f"{sub_agent_name} | Turn: {turn_count} | LLM Call",
                    "LLM call failed",
                )
                await asyncio.sleep(5)
                continue

            # Handle no tool calls case
            if not tool_calls:
                (
                    should_continue,
                    should_break_loop,
                    turn_count,
                    consecutive_rollbacks,
                    message_history,
                ) = await self._handle_response_format_issues(
                    assistant_response_text,
                    message_history,
                    turn_count,
                    consecutive_rollbacks,
                    total_attempts,
                    max_attempts,
                    sub_agent_name,
                )
                if should_continue:
                    continue
                if should_break_loop:
                    if not any(
                        mcp_tag in assistant_response_text for mcp_tag in mcp_tags
                    ) and not any(
                        keyword in assistant_response_text
                        for keyword in refusal_keywords
                    ):
                        self.task_log.log_step(
                            "info",
                            f"{sub_agent_name} | Turn: {turn_count} | LLM Call",
                            f"No tool calls found in {sub_agent_name}, ending on turn {turn_count}",
                        )
                    break

            # Execute tool calls
            tool_calls_data = []
            all_tool_results_content_with_id = []
            should_rollback_turn = False

            for call in tool_calls:
                server_name = call["server_name"]
                tool_name = call["tool_name"]
                arguments = call["arguments"]
                call_id = call["id"]

                # Fix common parameter name mistakes
                arguments = self.tool_executor.fix_tool_call_arguments(
                    tool_name, arguments
                )

                self.task_log.log_step(
                    "info",
                    f"{sub_agent_name} | Turn: {turn_count} | Tool Call",
                    f"Executing {tool_name} on {server_name}",
                )

                call_start_time = time.time()
                try:
                    # Check for duplicate query
                    cache_name = sub_agent_id + "_" + tool_name
                    (
                        is_duplicate,
                        should_rollback,
                        turn_count,
                        consecutive_rollbacks,
                        message_history,
                    ) = await self._check_duplicate_query(
                        tool_name,
                        arguments,
                        cache_name,
                        consecutive_rollbacks,
                        turn_count,
                        total_attempts,
                        max_attempts,
                        message_history,
                        sub_agent_name,
                    )
                    if should_rollback:
                        should_rollback_turn = True
                        break

                    # Send stream event
                    tool_call_id = await self.stream.tool_call(tool_name, arguments)

                    # Execute tool call
                    tool_result = await self.sub_agent_tool_managers[
                        sub_agent_name
                    ].execute_tool_call(server_name, tool_name, arguments)

                    # Update query count if successful
                    if "error" not in tool_result:
                        await self._record_query(cache_name, tool_name, arguments)

                    # Post-process result
                    tool_result = self.tool_executor.post_process_tool_call_result(
                        tool_name, tool_result
                    )
                    result = (
                        tool_result.get("result")
                        if tool_result.get("result")
                        else tool_result.get("error")
                    )

                    # Check for errors that should trigger rollback
                    if self.tool_executor.should_rollback_result(
                        tool_name, result, tool_result
                    ):
                        if consecutive_rollbacks < self.MAX_CONSECUTIVE_ROLLBACKS - 1:
                            message_history.pop()
                            turn_count -= 1
                            consecutive_rollbacks += 1
                            should_rollback_turn = True
                            self.task_log.log_step(
                                "warning",
                                f"{sub_agent_name} | Turn: {turn_count} | Rollback",
                                f"Tool result error - tool: {tool_name}, result: '{str(result)[:200]}'",
                            )
                            break

                    await self.stream.tool_call(
                        tool_name, {"result": result}, tool_call_id=tool_call_id
                    )
                    call_end_time = time.time()
                    call_duration_ms = int((call_end_time - call_start_time) * 1000)

                    self.task_log.log_step(
                        "info",
                        f"{sub_agent_name} | Turn: {turn_count} | Tool Call",
                        f"Tool {tool_name} completed in {call_duration_ms}ms",
                    )

                    tool_calls_data.append(
                        {
                            "server_name": server_name,
                            "tool_name": tool_name,
                            "arguments": arguments,
                            "result": tool_result,
                            "duration_ms": call_duration_ms,
                            "call_time": get_utc_plus_8_time(),
                        }
                    )

                except Exception as e:
                    call_end_time = time.time()
                    call_duration_ms = int((call_end_time - call_start_time) * 1000)

                    tool_calls_data.append(
                        {
                            "server_name": server_name,
                            "tool_name": tool_name,
                            "arguments": arguments,
                            "error": str(e),
                            "duration_ms": call_duration_ms,
                            "call_time": get_utc_plus_8_time(),
                        }
                    )
                    tool_result = {
                        "error": f"Tool call failed: {str(e)}",
                        "server_name": server_name,
                        "tool_name": tool_name,
                    }
                    self.task_log.log_step(
                        "error",
                        f"{sub_agent_name} | Turn: {turn_count} | Tool Call",
                        f"Tool {tool_name} failed to execute: {str(e)}",
                    )

                tool_result_for_llm = self.output_formatter.format_tool_result_for_user(
                    tool_result
                )
                all_tool_results_content_with_id.append((call_id, tool_result_for_llm))

            if should_rollback_turn:
                continue

            # Reset consecutive rollbacks on successful execution
            if consecutive_rollbacks > 0:
                self.task_log.log_step(
                    "info",
                    f"{sub_agent_name} | Turn: {turn_count} | Recovery",
                    f"Successfully recovered after {consecutive_rollbacks} consecutive rollbacks",
                )
            consecutive_rollbacks = 0

            # Update message history
            message_history = self.llm_client.update_message_history(
                message_history, all_tool_results_content_with_id
            )

            # Check context length
            temp_summary_prompt = generate_agent_summarize_prompt(
                task_description,
                agent_type=sub_agent_name,
            )

            pass_length_check, message_history = self.llm_client.ensure_summary_context(
                message_history, temp_summary_prompt
            )

            if not pass_length_check:
                turn_count = max_turns
                self.task_log.log_step(
                    "info",
                    f"{sub_agent_name} | Turn: {turn_count} | Context Limit Reached",
                    "Context limit reached, triggering summary",
                )
                break

        # Log loop end
        if turn_count >= max_turns:
            self.task_log.log_step(
                "info",
                f"{sub_agent_name} | Max Turns Reached / Context Limit Reached",
                f"Reached maximum turns ({max_turns}) or context limit reached",
            )
        else:
            self.task_log.log_step(
                "info",
                f"{sub_agent_name} | Main Loop Completed",
                f"Main loop completed after {turn_count} turns",
            )

        # Generate final summary
        self.task_log.log_step(
            "info",
            f"{sub_agent_name} | Final Summary",
            f"Generating {sub_agent_name} final summary",
        )

        summary_prompt = generate_agent_summarize_prompt(
            task_description,
            agent_type=sub_agent_name,
        )

        if message_history[-1]["role"] == "user":
            message_history.pop()
        message_history.append({"role": "user", "content": summary_prompt})

        await self.stream.tool_call(
            "Partial Summary", {}, tool_call_id=str(uuid.uuid4())
        )

        # Generate final answer
        (
            final_answer_text,
            should_break,
            tool_calls_info,
            message_history,
        ) = await self.answer_generator.handle_llm_call(
            system_prompt,
            message_history,
            tool_definitions,
            turn_count + 1,
            f"{sub_agent_name} | Final summary",
            agent_type=sub_agent_name,
        )

        if final_answer_text:
            self.task_log.log_step(
                "info",
                f"{sub_agent_name} | Final Answer",
                "Final answer generated successfully",
            )
        else:
            final_answer_text = (
                f"No final answer generated by sub agent {sub_agent_name}."
            )
            self.task_log.log_step(
                "error",
                f"{sub_agent_name} | Final Answer",
                "Unable to generate final answer",
            )

        # Save session history
        self.task_log.sub_agent_message_history_sessions[
            self.task_log.current_sub_agent_session_id
        ] = {"system_prompt": system_prompt, "message_history": message_history}

        self.task_log.save()
        self.task_log.end_sub_agent_session(sub_agent_name)

        # Remove thinking content
        final_answer_text = final_answer_text.split("<think>")[-1].strip()
        final_answer_text = final_answer_text.split("</think>")[-1].strip()

        # Stream sub-agent end
        await self.stream.end_llm(display_name)
        await self.stream.end_agent(display_name, sub_agent_id)

        return final_answer_text

    async def run_main_agent(
        self,
        task_description,
        task_file_name=None,
        task_id="default_task",
        is_final_retry=False,
    ):
        """
        Execute the main end-to-end task.

        Runs the main agent loop (LLM call -> tool or sub-agent execution ->
        message history update) until a break is signalled, the turn/attempt
        budget is used up, too many consecutive rollbacks occur, or the
        context length check fails. Final answer generation is then delegated
        to ``self.answer_generator.generate_and_finalize_answer``.

        Args:
            task_description: Description of the task to execute
            task_file_name: Optional file associated with the task
            task_id: Unique identifier for the task
            is_final_retry: Forwarded unchanged to
                ``answer_generator.generate_and_finalize_answer``.

        Returns:
            Tuple of (final_summary, final_boxed_answer, failure_experience_summary)
        """
        workflow_id = await self.stream.start_workflow(task_description)

        self.task_log.log_step("info", "Main Agent", f"Start task with id: {task_id}")
        self.task_log.log_step(
            "info", "Main Agent", f"Task description: {task_description}"
        )
        if task_file_name:
            self.task_log.log_step(
                "info", "Main Agent", f"Associated file: {task_file_name}"
            )

        # Process input
        initial_user_content, processed_task_desc = process_input(
            task_description, task_file_name
        )
        message_history = [{"role": "user", "content": initial_user_content}]

        # Record initial user input
        # NOTE(review): `user_input` is built here but never read again in this
        # method - confirm whether it is still needed.
        user_input = processed_task_desc
        if task_file_name:
            user_input += f"\n[Attached file: {task_file_name}]"

        # Get tool definitions
        # Definitions supplied on the instance take precedence; otherwise they
        # are fetched from the tool manager and extended with the sub-agents
        # exposed as tools.
        if not self.tool_definitions:
            tool_definitions = (
                await self.main_agent_tool_manager.get_all_tool_definitions()
            )
            if self.cfg.agent.sub_agents is not None:
                tool_definitions += expose_sub_agents_as_tools(
                    self.cfg.agent.sub_agents
                )
        else:
            tool_definitions = self.tool_definitions

        if not tool_definitions:
            self.task_log.log_step(
                "warning",
                "Main Agent | Tool Definitions",
                "Warning: No tool definitions found. LLM cannot use any tools.",
            )

        # Generate system prompt
        system_prompt = self.llm_client.generate_agent_system_prompt(
            date=date.today(),
            mcp_servers=tool_definitions,
        ) + generate_agent_specific_system_prompt(agent_type="main")
        system_prompt = system_prompt.strip()

        # Main loop configuration
        # `turn_count` can be decremented again (empty responses, rollbacks),
        # whereas `total_attempts` only ever grows, so `max_attempts` bounds
        # the total number of loop iterations.
        max_turns = self.cfg.agent.main_agent.max_turns
        turn_count = 0
        total_attempts = 0
        max_attempts = max_turns + EXTRA_ATTEMPTS_BUFFER
        consecutive_rollbacks = 0

        self.current_agent_id = await self.stream.start_agent("main")
        await self.stream.start_llm("main")

        while turn_count < max_turns and total_attempts < max_attempts:
            turn_count += 1
            total_attempts += 1

            if consecutive_rollbacks >= self.MAX_CONSECUTIVE_ROLLBACKS:
                self.task_log.log_step(
                    "error",
                    "Main Agent | Too Many Rollbacks",
                    f"Reached {consecutive_rollbacks} consecutive rollbacks, breaking loop.",
                )
                break

            self.task_log.save()

            # LLM call
            (
                assistant_response_text,
                should_break,
                tool_calls,
                message_history,
            ) = await self.answer_generator.handle_llm_call(
                system_prompt,
                message_history,
                tool_definitions,
                turn_count,
                f"Main agent | Turn: {turn_count}",
                agent_type="main",
            )

            # Process LLM response
            if assistant_response_text:
                text_response = extract_llm_response_text(assistant_response_text)
                if text_response:
                    await self.stream.tool_call("show_text", {"text": text_response})

                # Extract boxed content
                # Any boxed answer produced mid-run is remembered on the instance.
                boxed_content = self.output_formatter._extract_boxed_content(
                    assistant_response_text
                )
                if boxed_content:
                    self.intermediate_boxed_answers.append(boxed_content)

                if should_break:
                    self.task_log.log_step(
                        "info",
                        f"Main Agent | Turn: {turn_count} | LLM Call",
                        "should break is True, breaking the loop",
                    )
                    break
            else:
                # Undo the turn increment so an empty response does not consume
                # a turn; `total_attempts` still counts this iteration.
                turn_count -= 1
                self.task_log.log_step(
                    "warning",
                    f"Main Agent | Turn: {turn_count} | LLM Call",
                    "No valid response from LLM, retrying",
                )
                await asyncio.sleep(5)
                continue

            # Handle no tool calls case
            if not tool_calls:
                (
                    should_continue,
                    should_break_loop,
                    turn_count,
                    consecutive_rollbacks,
                    message_history,
                ) = await self._handle_response_format_issues(
                    assistant_response_text,
                    message_history,
                    turn_count,
                    consecutive_rollbacks,
                    total_attempts,
                    max_attempts,
                    "Main Agent",
                )
                if should_continue:
                    continue
                if should_break_loop:
                    # The loop ends either way; the "no tool usage" message is
                    # only logged when the response contains neither an MCP tag
                    # nor a refusal keyword.
                    if not any(
                        mcp_tag in assistant_response_text for mcp_tag in mcp_tags
                    ) and not any(
                        keyword in assistant_response_text
                        for keyword in refusal_keywords
                    ):
                        self.task_log.log_step(
                            "info",
                            f"Main Agent | Turn: {turn_count} | LLM Call",
                            "LLM did not request tool usage, ending process.",
                        )
                    break

            # Execute tool calls
            tool_calls_data = []
            all_tool_results_content_with_id = []
            should_rollback_turn = False
            # Snapshot of the client's token counter; it is written back after
            # the tool calls below. Presumably sub-agent runs overwrite it
            # through the shared client - TODO confirm.
            main_agent_last_call_tokens = self.llm_client.last_call_tokens

            for call in tool_calls:
                server_name = call["server_name"]
                tool_name = call["tool_name"]
                arguments = call["arguments"]
                call_id = call["id"]

                # Fix common parameter name mistakes
                arguments = self.tool_executor.fix_tool_call_arguments(
                    tool_name, arguments
                )

                call_start_time = time.time()
                try:
                    if server_name.startswith("agent-") and self.cfg.agent.sub_agents:
                        # Sub-agent execution
                        # NOTE(review): `is_duplicate` is unpacked but not used
                        # in this method; only `should_rollback` is acted on.
                        cache_name = "main_" + tool_name
                        (
                            is_duplicate,
                            should_rollback,
                            turn_count,
                            consecutive_rollbacks,
                            message_history,
                        ) = await self._check_duplicate_query(
                            tool_name,
                            arguments,
                            cache_name,
                            consecutive_rollbacks,
                            turn_count,
                            total_attempts,
                            max_attempts,
                            message_history,
                            "Main Agent",
                        )
                        if should_rollback:
                            should_rollback_turn = True
                            break

                        # Stream events
                        # The "main" stream agent is closed while the sub-agent
                        # runs and reopened as "Summarizing" afterwards.
                        # NOTE(review): if the sub-agent call (or the
                        # arguments["subtask"] lookup) raises, the except
                        # handler below does not reopen the "main" stream agent.
                        await self.stream.end_llm("main")
                        await self.stream.end_agent("main", self.current_agent_id)

                        # Execute sub-agent
                        sub_agent_result = await self.run_sub_agent(
                            server_name,
                            arguments["subtask"],
                        )

                        # Update query count
                        await self._record_query(cache_name, tool_name, arguments)

                        tool_result = {
                            "server_name": server_name,
                            "tool_name": tool_name,
                            "result": sub_agent_result,
                        }
                        self.current_agent_id = await self.stream.start_agent(
                            "main", display_name="Summarizing"
                        )
                        await self.stream.start_llm("main", display_name="Summarizing")
                    else:
                        # Regular tool execution
                        cache_name = "main_" + tool_name
                        (
                            is_duplicate,
                            should_rollback,
                            turn_count,
                            consecutive_rollbacks,
                            message_history,
                        ) = await self._check_duplicate_query(
                            tool_name,
                            arguments,
                            cache_name,
                            consecutive_rollbacks,
                            turn_count,
                            total_attempts,
                            max_attempts,
                            message_history,
                            "Main Agent",
                        )
                        if should_rollback:
                            should_rollback_turn = True
                            break

                        # Send stream event
                        tool_call_id = await self.stream.tool_call(tool_name, arguments)

                        # Execute tool call
                        tool_result = (
                            await self.main_agent_tool_manager.execute_tool_call(
                                server_name=server_name,
                                tool_name=tool_name,
                                arguments=arguments,
                            )
                        )

                        # Update query count if successful
                        if "error" not in tool_result:
                            await self._record_query(cache_name, tool_name, arguments)

                        # Post-process result
                        tool_result = self.tool_executor.post_process_tool_call_result(
                            tool_name, tool_result
                        )
                        # A falsy "result" value falls back to the "error" value.
                        result = (
                            tool_result.get("result")
                            if tool_result.get("result")
                            else tool_result.get("error")
                        )

                        # Check for errors that should trigger rollback
                        # When the rollback budget is almost used up the bad
                        # result is not rolled back: execution falls through
                        # and the result is streamed and recorded as usual.
                        if self.tool_executor.should_rollback_result(
                            tool_name, result, tool_result
                        ):
                            if (
                                consecutive_rollbacks
                                < self.MAX_CONSECUTIVE_ROLLBACKS - 1
                            ):
                                # Drops the last history entry - presumably the
                                # assistant message that issued this tool call;
                                # verify against handle_llm_call.
                                message_history.pop()
                                turn_count -= 1
                                consecutive_rollbacks += 1
                                should_rollback_turn = True
                                self.task_log.log_step(
                                    "warning",
                                    f"Main Agent | Turn: {turn_count} | Rollback",
                                    f"Tool result error - tool: {tool_name}, result: '{str(result)[:200]}'",
                                )
                                break

                        await self.stream.tool_call(
                            tool_name, {"result": result}, tool_call_id=tool_call_id
                        )

                    call_end_time = time.time()
                    call_duration_ms = int((call_end_time - call_start_time) * 1000)

                    tool_calls_data.append(
                        {
                            "server_name": server_name,
                            "tool_name": tool_name,
                            "arguments": arguments,
                            "result": tool_result,
                            "duration_ms": call_duration_ms,
                            "call_time": get_utc_plus_8_time(),
                        }
                    )
                    self.task_log.log_step(
                        "info",
                        f"Main Agent | Turn: {turn_count} | Tool Call",
                        f"Tool {tool_name} completed in {call_duration_ms}ms",
                    )

                except Exception as e:
                    # A failing tool call does not abort the turn: the error is
                    # recorded and handed to the LLM as this call's result.
                    call_end_time = time.time()
                    call_duration_ms = int((call_end_time - call_start_time) * 1000)

                    tool_calls_data.append(
                        {
                            "server_name": server_name,
                            "tool_name": tool_name,
                            "arguments": arguments,
                            "error": str(e),
                            "duration_ms": call_duration_ms,
                            "call_time": get_utc_plus_8_time(),
                        }
                    )
                    tool_result = {
                        "server_name": server_name,
                        "tool_name": tool_name,
                        "error": str(e),
                    }
                    self.task_log.log_step(
                        "error",
                        f"Main Agent | Turn: {turn_count} | Tool Call",
                        f"Tool {tool_name} failed to execute: {str(e)}",
                    )

                # Format results for LLM
                tool_result_for_llm = self.output_formatter.format_tool_result_for_user(
                    tool_result
                )
                all_tool_results_content_with_id.append((call_id, tool_result_for_llm))

            # A rollback discards everything gathered in this turn and retries.
            if should_rollback_turn:
                continue

            # Reset consecutive rollbacks on successful execution
            if consecutive_rollbacks > 0:
                self.task_log.log_step(
                    "info",
                    f"Main Agent | Turn: {turn_count} | Recovery",
                    f"Successfully recovered after {consecutive_rollbacks} consecutive rollbacks",
                )
            consecutive_rollbacks = 0

            # Update 'last_call_tokens'
            # Writes back the value captured before the tool calls ran.
            self.llm_client.last_call_tokens = main_agent_last_call_tokens

            # Update message history
            message_history = self.llm_client.update_message_history(
                message_history, all_tool_results_content_with_id
            )

            self.task_log.main_agent_message_history = {
                "system_prompt": system_prompt,
                "message_history": message_history,
            }
            self.task_log.save()

            # Check context length
            temp_summary_prompt = generate_agent_summarize_prompt(
                task_description,
                agent_type="main",
            )

            pass_length_check, message_history = self.llm_client.ensure_summary_context(
                message_history, temp_summary_prompt
            )

            if not pass_length_check:
                # Setting turn_count to max_turns makes `reached_max_turns`
                # below evaluate to True for the context-limit exit as well.
                turn_count = max_turns
                self.task_log.log_step(
                    "warning",
                    f"Main Agent | Turn: {turn_count} | Context Limit Reached",
                    "Context limit reached, triggering summary",
                )
                break

        await self.stream.end_llm("main")
        await self.stream.end_agent("main", self.current_agent_id)

        # Determine if max turns was reached
        reached_max_turns = turn_count >= max_turns
        if reached_max_turns:
            self.task_log.log_step(
                "warning",
                "Main Agent | Max Turns Reached / Context Limit Reached",
                f"Reached maximum turns ({max_turns}) or context limit reached",
            )
        else:
            self.task_log.log_step(
                "info",
                "Main Agent | Main Loop Completed",
                f"Main loop completed after {turn_count} turns",
            )

        # Final summary
        self.task_log.log_step(
            "info", "Main Agent | Final Summary", "Generating final summary"
        )

        self.current_agent_id = await self.stream.start_agent("Final Summary")
        await self.stream.start_llm("Final Summary")

        # Generate final answer using answer generator
        (
            final_summary,
            final_boxed_answer,
            failure_experience_summary,
            usage_log,
            message_history,
        ) = await self.answer_generator.generate_and_finalize_answer(
            system_prompt=system_prompt,
            message_history=message_history,
            tool_definitions=tool_definitions,
            turn_count=turn_count,
            task_description=task_description,
            reached_max_turns=reached_max_turns,
            is_final_retry=is_final_retry,
            save_callback=self._save_message_history,
        )

        await self.stream.tool_call("show_text", {"text": final_boxed_answer})
        await self.stream.end_llm("Final Summary")
        await self.stream.end_agent("Final Summary", self.current_agent_id)
        await self.stream.end_workflow(workflow_id)

        self.task_log.log_step(
            "info", "Main Agent | Usage Calculation", f"Usage log: {usage_log}"
        )

        self.task_log.log_step(
            "info",
            "Main Agent | Final boxed answer",
            f"Final boxed answer:\n\n{final_boxed_answer}",
        )

        self.task_log.log_step(
            "info",
            "Main Agent | Task Completed",
            f"Main agent task {task_id} completed successfully",
        )
        gc.collect()
        return final_summary, final_boxed_answer, failure_experience_summary


================================================
FILE: apps/miroflow-agent/src/core/pipeline.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
Task execution pipeline module.

This module provides:
- execute_task_pipeline: Main function to run a complete task from start to finish
- create_pipeline_components: Factory function to initialize all pipeline components

The pipeline orchestrates the interaction between LLM clients, tool managers,
and the orchestrator to execute complex multi-turn agent tasks.
"""

import traceback
import uuid
from typing import Any, Dict, List, Optional

from miroflow_tools.manager import ToolManager
from omegaconf import DictConfig

from ..config.settings import (
    create_mcp_server_parameters,
    get_env_info,
)
from ..io.output_formatter import OutputFormatter
from ..llm.factory import ClientFactory
from ..logging.task_logger import (
    TaskLog,
    get_utc_plus_8_time,
)
from .orchestrator import Orchestrator


async def execute_task_pipeline(
    cfg: DictConfig,
    task_id: str,
    task_description: str,
    task_file_name: str,
    main_agent_tool_manager: ToolManager,
    sub_agent_tool_managers: Dict[str, ToolManager],
    output_formatter: OutputFormatter,
    ground_truth: Optional[Any] = None,
    log_dir: str = "logs",
    stream_queue: Optional[Any] = None,
    tool_definitions: Optional[List[Dict[str, Any]]] = None,
    sub_agent_tool_definitions: Optional[Dict[str, List[Dict[str, Any]]]] = None,
    is_final_retry: bool = False,
):
    """
    Executes the full pipeline for a single task.

    Creates the task log, builds the LLM client and the orchestrator, runs the
    main agent and persists the log. Exceptions raised while doing so are
    caught, recorded in the task log and reported through the return value
    instead of being propagated. The LLM client is closed on every exit path.

    Args:
        cfg: The Hydra configuration object.
        task_id: A unique identifier for this task run (used for logging).
        task_description: The description of the task for the LLM.
        task_file_name: The path to an associated file (empty string if none).
        main_agent_tool_manager: An initialized main agent ToolManager instance.
        sub_agent_tool_managers: Dictionary mapping sub-agent names to their ToolManager instances.
        output_formatter: An initialized OutputFormatter instance.
        ground_truth: The ground truth for the task (optional).
        log_dir: The directory to save the task log (default: "logs").
        stream_queue: A queue for streaming the task execution (optional).
        tool_definitions: The definitions of the tools for the main agent (optional).
        sub_agent_tool_definitions: The definitions of the tools for the sub-agents (optional).
        is_final_retry: Forwarded unchanged to ``Orchestrator.run_main_agent``.

    Returns:
        A tuple of (final_summary, final_boxed_answer, log_file_path, failure_experience_summary):
        - final_summary: A string with the final execution summary, or an error message.
        - final_boxed_answer: The extracted boxed answer from the LLM response
          (empty string when the task failed with an exception).
        - log_file_path: The path to the saved task log file.
        - failure_experience_summary: Whatever the orchestrator returned for this
          slot (None when the task failed with an exception).
    """
    # Create task log
    task_log = TaskLog(
        log_dir=log_dir,
        task_id=task_id,
        start_time=get_utc_plus_8_time(),
        input={"task_description": task_description, "task_file_name": task_file_name},
        env_info=get_env_info(cfg),
        ground_truth=ground_truth,
    )

    # Log task start
    task_log.log_step(
        "info", "Main | Task Start", f"--- Starting Task Execution: {task_id} ---"
    )

    # Set task_log for all ToolManager instances
    main_agent_tool_manager.set_task_log(task_log)
    if sub_agent_tool_managers:
        for sub_agent_tool_manager in sub_agent_tool_managers.values():
            sub_agent_tool_manager.set_task_log(task_log)

    # Tracked outside the try block so the finally clause can release the
    # client when the run fails or is cancelled before the regular close().
    llm_client = None
    client_close_attempted = False

    try:
        # Initialize LLM client
        random_uuid = str(uuid.uuid4())
        unique_id = f"{task_id}-{random_uuid}"
        llm_client = ClientFactory(task_id=unique_id, cfg=cfg, task_log=task_log)

        # Initialize orchestrator
        orchestrator = Orchestrator(
            main_agent_tool_manager=main_agent_tool_manager,
            sub_agent_tool_managers=sub_agent_tool_managers,
            llm_client=llm_client,
            output_formatter=output_formatter,
            cfg=cfg,
            task_log=task_log,
            stream_queue=stream_queue,
            tool_definitions=tool_definitions,
            sub_agent_tool_definitions=sub_agent_tool_definitions,
        )

        (
            final_summary,
            final_boxed_answer,
            failure_experience_summary,
        ) = await orchestrator.run_main_agent(
            task_description=task_description,
            task_file_name=task_file_name,
            task_id=task_id,
            is_final_retry=is_final_retry,
        )

        # Flag first: if close() itself raises, the task is reported as failed
        # (as before) and the finally clause must not call close() again.
        client_close_attempted = True
        llm_client.close()

        task_log.final_boxed_answer = final_boxed_answer
        task_log.status = "success"

        # Store failure experience summary in task log if available
        if failure_experience_summary:
            task_log.trace_data["failure_experience_summary"] = (
                failure_experience_summary
            )

        log_file_path = task_log.save()
        return (
            final_summary,
            final_boxed_answer,
            log_file_path,
            failure_experience_summary,
        )

    except Exception as e:
        error_details = traceback.format_exc()
        task_log.log_step(
            "warning",
            "task_error_notification",
            f"An error occurred during task {task_id}",
        )
        task_log.log_step("error", "task_error_details", error_details)

        error_message = (
            f"Error executing task {task_id}:\n"
            f"Description: {task_description}\n"
            f"File: {task_file_name}\n"
            f"Error Type: {type(e).__name__}\n"
            f"Error Details:\n{error_details}"
        )

        task_log.status = "failed"
        task_log.error = error_details

        log_file_path = task_log.save()

        return error_message, "", log_file_path, None

    finally:
        # Release the LLM client on every exit that skipped the regular close,
        # including BaseException exits such as asyncio.CancelledError, which
        # bypass the ``except Exception`` handler above. Cleanup is best-effort:
        # a failing close() is logged and must not mask the original outcome.
        if llm_client is not None and not client_close_attempted:
            try:
                llm_client.close()
            except Exception as close_error:
                task_log.log_step(
                    "warning",
                    "task_cleanup",
                    f"Failed to close LLM client for task {task_id}: {close_error}",
                )

        task_log.end_time = get_utc_plus_8_time()

        # Record task summary to structured log
        task_log.log_step(
            "info",
            "task_execution_finished",
            f"Task {task_id} execution completed with status: {task_log.status}",
        )
        task_log.save()


def create_pipeline_components(cfg: DictConfig):
    """
    Build the tool managers and output formatter used by the agent pipeline.

    One ToolManager is created for the main agent and, when sub-agents are
    configured, one per entry in ``cfg.agent.sub_agents``.

    Args:
        cfg: The Hydra configuration object.

    Returns:
        Tuple of (main_agent_tool_manager, sub_agent_tool_managers, output_formatter).
        ``sub_agent_tool_managers`` is an empty dict in single agent mode.
    """
    # Main agent: resolve its MCP server parameters and blacklist first
    main_server_configs, main_blacklist = create_mcp_server_parameters(
        cfg, cfg.agent.main_agent
    )
    main_agent_tool_manager = ToolManager(
        main_server_configs, tool_blacklist=main_blacklist
    )

    output_formatter = OutputFormatter()

    # Single agent mode: nothing else to build
    sub_agents_cfg = cfg.agent.sub_agents
    if not sub_agents_cfg:
        return main_agent_tool_manager, {}, output_formatter

    # One ToolManager per configured sub-agent, keyed by the sub-agent name
    sub_agent_tool_managers = {}
    for sub_agent_name in sub_agents_cfg:
        server_configs, blacklist = create_mcp_server_parameters(
            cfg, sub_agents_cfg[sub_agent_name]
        )
        sub_agent_tool_managers[sub_agent_name] = ToolManager(
            server_configs, tool_blacklist=blacklist
        )

    return main_agent_tool_manager, sub_agent_tool_managers, output_formatter


================================================
FILE: apps/miroflow-agent/src/core/stream_handler.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
Stream handler module for SSE (Server-Sent Events) protocol.

This module provides the StreamHandler class that manages all streaming events
for real-time communication with clients during agent task execution.
"""

import logging
import uuid
from typing import Any, Optional

logger = logging.getLogger(__name__)


class StreamHandler:
    """
    Handler for streaming events in SSE protocol format.

    Manages the sending of various event types including workflow lifecycle,
    agent lifecycle, LLM interactions, and tool calls. Every event is placed
    on ``stream_queue`` as a dict of the form ``{"event": ..., "data": ...}``.
    """

    def __init__(self, stream_queue: Optional[Any] = None):
        """
        Initialize the stream handler.

        Args:
            stream_queue: Optional async queue for sending stream messages.
                         It must expose an awaitable ``put(item)``.
                         If None, streaming is disabled.
        """
        self.stream_queue = stream_queue

    async def update(self, event_type: str, data: dict):
        """
        Send a streaming update in SSE protocol format.

        Failures while enqueueing are logged as warnings and never raised.

        Args:
            event_type: The type of event (e.g., 'start_of_workflow', 'tool_call')
            data: The event payload data
        """
        # Compare against None explicitly: a queue-like object that defines
        # __len__ is falsy while empty, which must not disable streaming.
        if self.stream_queue is None:
            return

        try:
            await self.stream_queue.put({"event": event_type, "data": data})
        except Exception as e:
            logger.warning(f"Failed to send stream update: {e}")

    async def start_workflow(self, user_input: str) -> str:
        """
        Send start_of_workflow event.

        Args:
            user_input: The initial user input for the workflow

        Returns:
            The generated workflow ID
        """
        workflow_id = str(uuid.uuid4())
        await self.update(
            "start_of_workflow",
            {
                "workflow_id": workflow_id,
                "input": [
                    {
                        "role": "user",
                        "content": user_input,
                    }
                ],
            },
        )
        return workflow_id

    async def end_workflow(self, workflow_id: str):
        """
        Send end_of_workflow event.

        Args:
            workflow_id: The workflow ID to end
        """
        await self.update(
            "end_of_workflow",
            {
                "workflow_id": workflow_id,
            },
        )

    async def show_error(self, error: str):
        """
        Send show_error event and signal stream end.

        The error is delivered as a ``show_error`` tool call, after which a
        ``None`` item is put on the queue to mark the end of the stream.

        Args:
            error: The error message to display
        """
        await self.tool_call("show_error", {"error": error})
        if self.stream_queue is None:
            return

        try:
            await self.stream_queue.put(None)
        except Exception as e:
            logger.warning(f"Failed to send show_error: {e}")

    async def start_agent(
        self, agent_name: str, display_name: Optional[str] = None
    ) -> str:
        """
        Send start_of_agent event.

        Args:
            agent_name: Internal name of the agent
            display_name: Optional display name for UI

        Returns:
            The generated agent ID
        """
        agent_id = str(uuid.uuid4())
        await self.update(
            "start_of_agent",
            {
                "agent_name": agent_name,
                "display_name": display_name,
                "agent_id": agent_id,
            },
        )
        return agent_id

    async def end_agent(self, agent_name: str, agent_id: str):
        """
        Send end_of_agent event.

        Args:
            agent_name: Internal name of the agent
            agent_id: The agent ID to end
        """
        await self.update(
            "end_of_agent",
            {
                "agent_name": agent_name,
                "agent_id": agent_id,
            },
        )

    async def start_llm(self, agent_name: str, display_name: Optional[str] = None):
        """
        Send start_of_llm event.

        Args:
            agent_name: Name of the agent making the LLM call
            display_name: Optional display name for UI
        """
        await self.update(
            "start_of_llm",
            {
                "agent_name": agent_name,
                "display_name": display_name,
            },
        )

    async def end_llm(self, agent_name: str):
        """
        Send end_of_llm event.

        Args:
            agent_name: Name of the agent that finished LLM call
        """
        await self.update(
            "end_of_llm",
            {
                "agent_name": agent_name,
            },
        )

    async def message(self, message_id: str, delta_content: str):
        """
        Send message event with streaming content.

        Args:
            message_id: Unique identifier for the message
            delta_content: The content delta to send
        """
        await self.update(
            "message",
            {
                "message_id": message_id,
                "delta": {
                    "content": delta_content,
                },
            },
        )

    async def tool_call(
        self,
        tool_name: str,
        payload: dict,
        streaming: bool = False,
        tool_call_id: Optional[str] = None,
    ) -> str:
        """
        Send tool_call event.

        Args:
            tool_name: Name of the tool being called
            payload: Tool call arguments or results
            streaming: If True, send one event per payload key as ``delta_input``;
                       an empty payload then produces no event at all
            tool_call_id: Optional existing tool call ID

        Returns:
            The tool call ID (generated if not provided)
        """
        if not tool_call_id:
            tool_call_id = str(uuid.uuid4())

        if streaming:
            for key, value in payload.items():
                await self.update(
                    "tool_call",
                    {
                        "tool_call_id": tool_call_id,
                        "tool_name": tool_name,
                        "delta_input": {key: value},
                    },
                )
        else:
            # Send complete tool call
            await self.update(
                "tool_call",
                {
                    "tool_call_id": tool_call_id,
                    "tool_name": tool_name,
                    "tool_input": payload,
                },
            )

        return tool_call_id


================================================
FILE: apps/miroflow-agent/src/core/tool_executor.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
Tool executor module for handling tool call execution.

This module provides the ToolExecutor class that manages tool call execution,
including argument fixing, duplicate detection, result processing, and error handling.
"""

import json
import logging
import os
import time
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple

from miroflow_tools.manager import ToolManager

from ..io.output_formatter import OutputFormatter
from ..logging.task_logger import TaskLog, get_utc_plus_8_time
from .stream_handler import StreamHandler

logger = logging.getLogger(__name__)

# Maximum length for scrape results in demo mode (to support more conversation turns)
DEMO_SCRAPE_MAX_LENGTH = 20_000


class ToolExecutor:
    """
    Executor for tool calls with support for duplicate detection and result processing.

    Handles the execution of tool calls, including parameter fixing, duplicate query
    detection, result truncation in demo mode, and error handling.
    """

    # Argument keys that make up the duplicate-detection key of each supported tool
    _QUERY_ARGUMENT_KEYS: Dict[str, Tuple[str, ...]] = {
        "search_and_browse": ("subtask",),
        "google_search": ("q",),
        "sogou_search": ("Query",),
        "scrape_website": ("url",),
        "scrape_and_extract_info": ("url", "info_to_extract"),
    }

    # Result prefixes that mark a failed call which should be rolled back
    _ROLLBACK_PREFIXES: Tuple[str, ...] = ("Unknown tool:", "Error executing tool")

    def __init__(
        self,
        main_agent_tool_manager: ToolManager,
        sub_agent_tool_managers: Dict[str, ToolManager],
        output_formatter: OutputFormatter,
        task_log: TaskLog,
        stream_handler: StreamHandler,
        max_consecutive_rollbacks: int = 5,
    ):
        """
        Initialize the tool executor.

        Args:
            main_agent_tool_manager: Tool manager for main agent
            sub_agent_tool_managers: Dictionary of tool managers for sub-agents
            output_formatter: Formatter for tool results
            task_log: Logger for task execution
            stream_handler: Handler for streaming events
            max_consecutive_rollbacks: Maximum allowed consecutive rollbacks
        """
        self.main_agent_tool_manager = main_agent_tool_manager
        self.sub_agent_tool_managers = sub_agent_tool_managers
        self.output_formatter = output_formatter
        self.task_log = task_log
        self.stream = stream_handler
        self.max_consecutive_rollbacks = max_consecutive_rollbacks

        # Track used queries to detect duplicates: cache name -> query -> count
        self.used_queries: Dict[str, Dict[str, int]] = {}

    def fix_tool_call_arguments(self, tool_name: str, arguments: dict) -> dict:
        """
        Fix common parameter name mistakes made by LLM.

        Args:
            tool_name: Name of the tool being called
            arguments: Original arguments dictionary (left unmodified)

        Returns:
            Fixed arguments dictionary (a shallow copy of ``arguments``)
        """
        # Create a copy to avoid modifying the original
        fixed_args = arguments.copy()

        # Fix scrape_and_extract_info parameter names
        if tool_name == "scrape_and_extract_info":
            # Map common mistakes to the correct parameter name; only the
            # first mistaken name found is renamed.
            if "info_to_extract" not in fixed_args:
                for mistake_name in ("description", "introduction"):
                    if mistake_name in fixed_args:
                        fixed_args["info_to_extract"] = fixed_args.pop(mistake_name)
                        break

        # Fix run_python_code parameter names: 'code' -> 'code_block'
        # Also add default sandbox_id if missing (will trigger stateless fallback)
        if tool_name == "run_python_code":
            if "code_block" not in fixed_args and "code" in fixed_args:
                fixed_args["code_block"] = fixed_args.pop("code")
            if "sandbox_id" not in fixed_args:
                fixed_args["sandbox_id"] = "default"

        return fixed_args

    def get_query_str_from_tool_call(
        self, tool_name: str, arguments: dict
    ) -> Optional[str]:
        """
        Extract the query string from tool call arguments based on tool_name.

        Supports search_and_browse, google_search, sogou_search, scrape_website,
        and scrape_and_extract_info. The key is the tool name followed by the
        relevant argument values, joined with underscores. Missing or ``None``
        arguments contribute an empty string; non-string values are converted
        with ``str`` so that malformed LLM arguments cannot raise here.

        Args:
            tool_name: Name of the tool
            arguments: Tool arguments dictionary

        Returns:
            Query string for duplicate detection, or None if not applicable
        """
        argument_keys = self._QUERY_ARGUMENT_KEYS.get(tool_name)
        if argument_keys is None:
            return None

        parts = [tool_name]
        for key in argument_keys:
            value = arguments.get(key, "")
            parts.append("" if value is None else str(value))
        return "_".join(parts)

    def is_duplicate_query(self, cache_name: str, query_str: str) -> Tuple[bool, int]:
        """
        Check if a query has been executed before.

        Args:
            cache_name: Name of the cache (e.g., "main_google_search")
            query_str: The query string to check

        Returns:
            Tuple of (is_duplicate, previous_count)
        """
        cache = self.used_queries.setdefault(cache_name, defaultdict(int))
        # Read with .get so that a mere check does not insert a zero entry
        # into the defaultdict for every query that was only looked up.
        count = cache.get(query_str, 0)
        return count > 0, count

    def record_query(self, cache_name: str, query_str: str):
        """
        Record that a query has been executed.

        Args:
            cache_name: Name of the cache
            query_str: The query string to record
        """
        cache = self.used_queries.setdefault(cache_name, defaultdict(int))
        cache[query_str] += 1

    def is_google_search_empty_result(self, tool_name: str, tool_result: dict) -> bool:
        """
        Check if google_search result has empty organic results.

        This indicates a poor search query that should be retried.

        Args:
            tool_name: Name of the tool
            tool_result: The tool execution result

        Returns:
            True if the result is empty and should trigger retry. Results that
            are missing, unparsable, or not shaped like a mapping yield False.
        """
        if tool_name != "google_search":
            return False

        result = tool_result.get("result")
        if not result:
            return False

        try:
            # The result may arrive either as a JSON string or already parsed
            result_dict = json.loads(result) if isinstance(result, str) else result
            organic = result_dict.get("organic", [])
            return len(organic) == 0
        except (json.JSONDecodeError, TypeError, AttributeError):
            return False

    def get_scrape_result(self, result: str) -> str:
        """
        Process scrape result and truncate if too long.

        A JSON object is reduced to ``{"text": ...}`` with the text truncated.
        Anything else (plain text, or valid JSON that is not an object) is
        treated as raw text and truncated directly.

        Args:
            result: Raw scrape result string (JSON or plain text)

        Returns:
            Processed result, truncated to DEMO_SCRAPE_MAX_LENGTH if necessary.
            Non-string input that cannot be parsed is returned unchanged.
        """
        try:
            parsed = json.loads(result)
        except (json.JSONDecodeError, TypeError):
            # Not JSON at all (or not even a string): fall through to raw handling
            parsed = None

        if isinstance(parsed, dict):
            text = parsed.get("text")
            if isinstance(text, str) and len(text) > DEMO_SCRAPE_MAX_LENGTH:
                text = text[:DEMO_SCRAPE_MAX_LENGTH]
            return json.dumps({"text": text}, ensure_ascii=False)

        if isinstance(result, str) and len(result) > DEMO_SCRAPE_MAX_LENGTH:
            return result[:DEMO_SCRAPE_MAX_LENGTH]
        return result

    def post_process_tool_call_result(
        self, tool_name: str, tool_call_result: dict
    ) -> dict:
        """
        Process tool call results.

        Only in demo mode (environment variable ``DEMO_MODE`` set to "1"):
        truncate scrape results to 20,000 chars to support more conversation
        turns. The passed dictionary is updated in place.

        Args:
            tool_name: Name of the tool
            tool_call_result: The tool execution result

        Returns:
            Processed tool result (the same dictionary object)
        """
        if os.environ.get("DEMO_MODE") == "1":
            if "result" in tool_call_result and tool_name in (
                "scrape",
                "scrape_website",
            ):
                tool_call_result["result"] = self.get_scrape_result(
                    tool_call_result["result"]
                )
        return tool_call_result

    def should_rollback_result(
        self, tool_name: str, result: Any, tool_result: dict
    ) -> bool:
        """
        Check if a tool result should trigger a rollback.

        Args:
            tool_name: Name of the tool
            result: The result value
            tool_result: Full tool result dictionary

        Returns:
            True if the result indicates an error that should trigger rollback
        """
        if str(result).startswith(self._ROLLBACK_PREFIXES):
            return True
        return self.is_google_search_empty_result(tool_name, tool_result)

    async def execute_single_tool_call(
        self,
        tool_manager: ToolManager,
        server_name: str,
        tool_name: str,
        arguments: dict,
        agent_name: str,
        turn_count: int,
    ) -> Tuple[dict, int, List[dict]]:
        """
        Execute a single tool call.

        Exceptions raised while executing or post-processing the call are
        caught, logged to the task log, and turned into an error result
        instead of propagating to the caller.

        Args:
            tool_manager: The tool manager to use
            server_name: Name of the MCP server
            tool_name: Name of the tool
            arguments: Tool arguments
            agent_name: Name of the agent making the call
            turn_count: Current turn count

        Returns:
            Tuple of (tool_result, duration_ms, tool_calls_data)
        """
        # Monotonic clock: the duration must not jump if the wall clock is adjusted
        call_start_time = time.monotonic()
        tool_calls_data = []

        try:
            # Execute tool call
            tool_result = await tool_manager.execute_tool_call(
                server_name, tool_name, arguments
            )

            # Post-process result
            tool_result = self.post_process_tool_call_result(tool_name, tool_result)

            call_duration_ms = int((time.monotonic() - call_start_time) * 1000)

            self.task_log.log_step(
                "info",
                f"{agent_name} | Turn: {turn_count} | Tool Call",
                f"Tool {tool_name} completed in {call_duration_ms}ms",
            )

            tool_calls_data.append(
                {
                    "server_name": server_name,
                    "tool_name": tool_name,
                    "arguments": arguments,
                    "result": tool_result,
                    "duration_ms": call_duration_ms,
                    "call_time": get_utc_plus_8_time(),
                }
            )

            return tool_result, call_duration_ms, tool_calls_data

        except Exception as e:
            call_duration_ms = int((time.monotonic() - call_start_time) * 1000)

            tool_calls_data.append(
                {
                    "server_name": server_name,
                    "tool_name": tool_name,
                    "arguments": arguments,
                    "error": str(e),
                    "duration_ms": call_duration_ms,
                    "call_time": get_utc_plus_8_time(),
                }
            )

            # Error payload handed back to the caller in place of a real result
            tool_result = {
                "error": f"Tool call failed: {str(e)}",
                "server_name": server_name,
                "tool_name": tool_name,
            }

            self.task_log.log_step(
                "error",
                f"{agent_name} | Turn: {turn_count} | Tool Call",
                f"Tool {tool_name} failed to execute: {str(e)}",
            )

            return tool_result, call_duration_ms, tool_calls_data

    def format_tool_result_for_llm(self, tool_result: dict) -> dict:
        """
        Format tool result for feeding back to LLM.

        Delegates to the output formatter's ``format_tool_result_for_user``.

        Args:
            tool_result: The tool execution result

        Returns:
            Formatted result suitable for LLM message
        """
        return self.output_formatter.format_tool_result_for_user(tool_result)


================================================
FILE: apps/miroflow-agent/src/io/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""Input/Output module for processing task inputs and formatting outputs."""

from .input_handler import process_input
from .output_formatter import OutputFormatter

__all__ = [
    "process_input",
    "OutputFormatter",
]


================================================
FILE: apps/miroflow-agent/src/io/input_handler.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
Input handler module for processing various file types.

This module provides functions for:
- Processing task inputs with associated files
- Converting documents (PDF, DOCX, PPTX, XLSX) to markdown
- Generating captions for images, audio, and video files
- Extracting task-relevant information from media files

Supported file formats:
- Documents: PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, HTML
- Images: JPG, JPEG, PNG, GIF, WEBP
- Audio: WAV, MP3, M4A
- Video: MP4, MOV, AVI, MKV, WEBM
- Data: JSON, JSONLD, CSV, YAML, TOML
- Code: PY, SH, MD, TXT
- Archives: ZIP
"""

import base64
import html
import json
import os
import re
import shutil
import tempfile
import traceback
from typing import Any, Tuple, Union
from urllib.parse import quote, unquote, urlparse, urlunparse

import mammoth
import markdownify
import openpyxl
import pdfminer
import pdfminer.high_level
import pptx
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from markitdown import MarkItDown
from openai import OpenAI
from openpyxl.utils import get_column_letter

# Ensure .env file is loaded
load_dotenv()

# File extension constants for different media types.
# Entries are lowercase and written without a leading dot.
IMAGE_EXTENSIONS = {"jpg", "jpeg", "png", "gif", "webp"}
AUDIO_EXTENSIONS = {"wav", "mp3", "m4a"}
VIDEO_EXTENSIONS = {"mp4", "mov", "avi", "mkv", "webm"}
# Union of the image, audio, and video extension sets above
MEDIA_EXTENSIONS = IMAGE_EXTENSIONS | AUDIO_EXTENSIONS | VIDEO_EXTENSIONS
# Extensions that should skip MarkItDown fallback processing
# (all media extensions plus "pdb")
SKIP_MARKITDOWN_EXTENSIONS = MEDIA_EXTENSIONS | {"pdb"}


def _generate_image_caption(image_path: str) -> str:
    """
    Generate a caption for an image using OpenAI's GPT-4o vision model.

    Args:
        image_path: Path to the image file

    Returns:
        Caption string, or error message if failed
    """
    try:
        OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")

        if not OPENAI_API_KEY:
            return "[Caption unavailable: OPENAI_API_KEY not set]"

        client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

        # Read and encode image
        with open(image_path, "rb") as image_file:
            image_data = base64.b64encode(image_file.read()).decode("utf-8")

        # Guess MIME type
        _, ext = os.path.splitext(image_path)
        ext = ext.lower()
        mime_type = {
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".gif": "image/gif",
            ".webp": "image/webp",
        }.get(ext, "image/jpeg")

        # Call OpenAI API
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Please provide a detailed description of this image. Include key objects, people, text, colors, and any other relevant details.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image_data}"
                            },
                        },
                    ],
                }
            ],
            max_tokens=2048,
            temperature=0,
        )

        content = response.choices[0].message.content
        return content if content else "[Caption unavailable: Empty response]"

    except Exception as e:
        return f"[Caption generation failed: {str(e)}]"


def _generate_audio_caption(audio_path: str) -> str:
    """
    Generate a caption for an audio file using OpenAI's audio transcription.

    Args:
        audio_path: Path to the audio file

    Returns:
        Caption string (transcription), or error message if failed
    """
    try:
        OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")

        if not OPENAI_API_KEY:
            return "[Caption unavailable: OPENAI_API_KEY not set]"

        client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

        # Transcribe audio
        with open(audio_path, "rb") as audio_file:
            transcription = client.audio.transcriptions.create(
                model="gpt-4o-transcribe", file=audio_file
            )

        text = transcription.text
        return text if text else "[Transcription unavailable: Empty response]"

    except Exception as e:
        return f"[Caption generation failed: {str(e)}]"


def _generate_video_caption(video_path: str) -> str:
    """
    Generate a caption for a video using OpenAI's GPT-4o vision model.

    Args:
        video_path: Path to the video file

    Returns:
        Caption string, or error message if failed
    """
    try:
        OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")

        if not OPENAI_API_KEY:
            return "[Caption unavailable: OPENAI_API_KEY not set]"

        client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

        # Read and encode video
        with open(video_path, "rb") as video_file:
            video_data = base64.b64encode(video_file.read()).decode("utf-8")

        # Guess MIME type
        _, ext = os.path.splitext(video_path)
        ext = ext.lower()
        mime_type = {
            ".mp4": "video/mp4",
            ".mov": "video/quicktime",
            ".avi": "video/x-msvideo",
            ".mkv": "video/x-matroska",
            ".webm": "video/webm",
        }.get(ext, "video/mp4")

        # Call OpenAI API
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Please provide a detailed description of this video. Include key events, people, objects, actions, audio information, and any text visible in the video.",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{video_data}"
                            },
                        },
                    ],
                }
            ],
            max_tokens=2048,
            temperature=0,
        )

        content = response.choices[0].message.content
        return content if content else "[Caption unavailable: Empty response]"

    except Exception as e:
        return f"[Caption generation failed: {str(e)}]"


def _extract_task_relevant_info_from_image(
    image_path: str, task_description: str
) -> str:
    """
    Extract task-relevant information directly from an image based on the task description.

    Args:
        image_path: Path to the image file
        task_description: The user's task description

    Returns:
        Extracted relevant information, or empty string if extraction fails
    """
    try:
        OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")

        if not OPENAI_API_KEY:
            return ""

        client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

        # Read and encode image
        with open(image_path, "rb") as image_file:
            image_data = base64.b64encode(image_file.read()).decode("utf-8")

        # Guess MIME type
        _, ext = os.path.splitext(image_path)
        ext = ext.lower()
        mime_type = {
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".gif": "image/gif",
            ".webp": "image/webp",
        }.get(ext, "image/jpeg")

        # Call OpenAI API with task-specific prompt
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"""Based on the following task, analyze this image and extract only the information that is directly relevant to completing the task.

Task: {task_description}

Please provide a concise summary of the relevant information from the image that would help in completing this task. Focus only on what's pertinent to the task. If nothing is particularly relevant, state "No specific task-relevant details identified in the image." Keep the response brief and focused.""",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image_data}"
                            },
                        },
                    ],
                }
            ],
            max_tokens=1024,
            temperature=0,
        )

        return response.choices[0].message.content.strip()

    except Exception as e:
        print(f"Warning: Failed to extract task-relevant info from image: {str(e)}")
        return ""


def _extract_task_relevant_info_from_audio(
    audio_path: str, task_description: str
) -> str:
    """
    Extract task-relevant information directly from an audio file based on the task description.

    Args:
        audio_path: Path to the audio file
        task_description: The user's task description

    Returns:
        Extracted relevant information, or empty string if extraction fails
    """
    try:
        OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")

        if not OPENAI_API_KEY:
            return ""

        client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

        # Read and encode audio file
        with open(audio_path, "rb") as audio_file:
            audio_data = base64.b64encode(audio_file.read()).decode("utf-8")

        # Detect audio format
        _, ext = os.path.splitext(audio_path)
        ext = ext.lower()
        audio_format = {
            ".mp3": "mp3",
            ".wav": "wav",
            ".m4a": "m4a",
        }.get(ext, "mp3")

        # Use gpt-4o-audio-preview for direct audio question answering
        text_prompt = f"""Based on the following task, analyze this audio and extract only the information that is directly relevant to completing the task.

Task: {task_description}

Please provide a concise summary of the relevant information from the audio that would help in completing this task. Focus only on what's pertinent to the task. If nothing is particularly relevant, state "No specific task-relevant details identified in the audio." Keep the response brief and focused."""

        response = client.chat.completions.create(
            model="gpt-4o-audio-preview",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant specializing in audio analysis.",
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": text_prompt},
                        {
                            "type": "input_audio",
                            "input_audio": {
                                "data": audio_data,
                                "format": audio_format,
                            },
                        },
                    ],
                },
            ],
            max_tokens=1024,
            temperature=0,
        )

        return response.choices[0].message.content.strip()

    except Exception as e:
        print(f"Warning: Failed to extract task-relevant info from audio: {str(e)}")
        return ""


def _extract_task_relevant_info_from_video(
    video_path: str, task_description: str
) -> str:
    """
    Extract task-relevant information directly from a video based on the task description.

    Args:
        video_path: Path to the video file
        task_description: The user's task description

    Returns:
        Extracted relevant information, or empty string if extraction fails
    """
    try:
        OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")

        if not OPENAI_API_KEY:
            return ""

        client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

        # Read and encode video
        with open(video_path, "rb") as video_file:
            video_data = base64.b64encode(video_file.read()).decode("utf-8")

        # Guess MIME type
        _, ext = os.path.splitext(video_path)
        ext = ext.lower()
        mime_type = {
            ".mp4": "video/mp4",
            ".mov": "video/quicktime",
            ".avi": "video/x-msvideo",
            ".mkv": "video/x-matroska",
            ".webm": "video/webm",
        }.get(ext, "video/mp4")

        # Call OpenAI API with task-specific prompt
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"""Based on the following task, analyze this video and extract only the information that is directly relevant to completing the task.

Task: {task_description}

Please provide a concise summary of the relevant information from the video that would help in completing this task. Focus only on what's pertinent to the task. If nothing is particularly relevant, state "No specific task-relevant details identified in the video." Keep the response brief and focused.""",
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{video_data}"
                            },
                        },
                    ],
                }
            ],
            max_tokens=1024,
            temperature=0,
        )

        return response.choices[0].message.content.strip()

    except Exception as e:
        print(f"Warning: Failed to extract task-relevant info from video: {str(e)}")
        return ""


def process_input(task_description: str, task_file_name: str) -> Tuple[str, str]:
    """
    Process user input and associated files.

    Extracts content from the task file (if provided) and appends it to the
    task description in a format suitable for the LLM. The file type is chosen
    from the text after the last "." in ``task_file_name``; types without a
    dedicated branch fall back to MarkItDown unless the extension is listed in
    SKIP_MARKITDOWN_EXTENSIONS.

    Extracted text longer than 200,000 characters is truncated before it is
    appended, whether or not the parsed document carries a title.

    File problems never raise: a missing file or any processing error is
    reported on stdout and as a "Warning: ..." line in the returned text.

    Args:
        task_description: The original task description
        task_file_name: Path to an associated file, or empty string if none

    Returns:
        Tuple of (updated_task_description, updated_task_description)
        Both values are the same - the task description followed by the
        output-format instruction and the file content section
    """
    updated_task_description = task_description
    file_content_section = ""  # Collect file content to append at the end
    max_len = 200_000  # Limit the length of results returned to LLM

    if task_file_name:
        try:
            file_extension = task_file_name.rsplit(".", maxsplit=1)[-1].lower()
            parsing_result = None

            if file_extension in IMAGE_EXTENSIONS:
                # Generate unconditional image caption
                caption = _generate_image_caption(task_file_name)

                # Extract task-relevant information directly from the image
                relevant_info = _extract_task_relevant_info_from_image(
                    task_file_name, task_description
                )

                # Format as Markdown
                file_content_section += f"\n\nNote: An image file '{task_file_name}' is associated with this task. The content has been extracted as a detailed caption below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## Image Content\nFile: {task_file_name}\n\n"
                file_content_section += f"> {caption}\n\n"

                if relevant_info:
                    file_content_section += "Task-Relevant Information:\n\n"
                    file_content_section += f"{relevant_info}\n\n"

            elif file_extension == "py":
                # Python files - read directly
                with open(task_file_name, "r", encoding="utf-8") as f:
                    parsing_result = DocumentConverterResult(
                        title=None, text_content=f.read()
                    )
                file_content_section += f"\n\nNote: A Python file '{task_file_name}' is associated with this task. The content has been extracted as text below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## Python File\nFile: {task_file_name}\n\n"

            elif file_extension in ["txt", "md", "sh", "yaml", "yml", "toml", "csv"]:
                # Text-based files - read directly
                with open(task_file_name, "r", encoding="utf-8") as f:
                    parsing_result = DocumentConverterResult(
                        title=None, text_content=f.read()
                    )
                file_type_name = {
                    "txt": "Text",
                    "md": "Markdown",
                    "sh": "Shell Script",
                    "yaml": "YAML",
                    "yml": "YAML",
                    "toml": "TOML",
                    "csv": "CSV",
                }.get(file_extension, "Text")
                file_content_section += f"\n\nNote: A {file_type_name.lower()} file '{task_file_name}' is associated with this task. The content has been extracted as text below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += (
                    f"## {file_type_name} File\nFile: {task_file_name}\n\n"
                )

            elif file_extension in ["jsonld", "json"]:
                # Re-serialize so the LLM always sees consistently indented JSON
                with open(task_file_name, "r", encoding="utf-8") as f:
                    parsing_result = DocumentConverterResult(
                        title=None,
                        text_content=json.dumps(
                            json.load(f), ensure_ascii=False, indent=2
                        ),
                    )
                file_content_section += f"\n\nNote: A JSON file '{task_file_name}' is associated with this task. The content has been extracted as JSON format below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## JSON File\nFile: {task_file_name}\n\n"

            elif file_extension in ["xlsx", "xls"]:
                parsing_result = XlsxConverter(local_path=task_file_name)
                file_content_section += f"\n\nNote: An Excel file '{task_file_name}' is associated with this task. The content has been extracted as a markdown table below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## Excel File\nFile: {task_file_name}\n\n"

            elif file_extension == "pdf":
                parsing_result = DocumentConverterResult(
                    title=None,
                    text_content=pdfminer.high_level.extract_text(task_file_name),
                )
                file_content_section += f"\n\nNote: A PDF file '{task_file_name}' is associated with this task. The content has been extracted as text below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## PDF File\nFile: {task_file_name}\n\n"

            elif file_extension in ["docx", "doc"]:
                parsing_result = DocxConverter(local_path=task_file_name)
                file_content_section += f"\n\nNote: A Word document '{task_file_name}' is associated with this task. The content has been extracted as markdown below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## Word Document\nFile: {task_file_name}\n\n"

            elif file_extension in ["html", "htm"]:
                parsing_result = HtmlConverter(local_path=task_file_name)
                file_content_section += f"\n\nNote: An HTML file '{task_file_name}' is associated with this task. The content has been extracted as markdown below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## HTML File\nFile: {task_file_name}\n\n"

            elif file_extension in ["pptx", "ppt"]:
                parsing_result = PptxConverter(local_path=task_file_name)
                file_content_section += f"\n\nNote: A PowerPoint presentation '{task_file_name}' is associated with this task. The content has been extracted as markdown below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += (
                    f"## PowerPoint Presentation\nFile: {task_file_name}\n\n"
                )

            elif file_extension in AUDIO_EXTENSIONS:
                # Generate unconditional audio transcription
                caption = _generate_audio_caption(task_file_name)

                # Extract task-relevant information directly from the audio
                relevant_info = _extract_task_relevant_info_from_audio(
                    task_file_name, task_description
                )

                # Format as Markdown
                file_content_section += f"\n\nNote: An audio file '{task_file_name}' is associated with this task. The content has been extracted as a transcription below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## Audio Content\nFile: {task_file_name}\n\n"
                file_content_section += f"> {caption}\n\n"

                if relevant_info:
                    file_content_section += "Task-Relevant Information:\n\n"
                    file_content_section += f"{relevant_info}\n\n"

            elif file_extension in VIDEO_EXTENSIONS:
                # Generate unconditional video caption
                caption = _generate_video_caption(task_file_name)

                # Extract task-relevant information directly from the video
                relevant_info = _extract_task_relevant_info_from_video(
                    task_file_name, task_description
                )

                # Format as Markdown
                file_content_section += f"\n\nNote: A video file '{task_file_name}' is associated with this task. The content has been extracted as a detailed caption below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## Video Content\nFile: {task_file_name}\n\n"
                file_content_section += f"> {caption}\n\n"

                if relevant_info:
                    file_content_section += "Task-Relevant Information:\n\n"
                    file_content_section += f"{relevant_info}\n\n"

            elif file_extension in ["zip"]:
                parsing_result = ZipConverter(local_path=task_file_name)
                file_content_section += f"\n\nNote: A ZIP archive '{task_file_name}' is associated with this task. The content has been extracted as file list and contents below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## ZIP Archive\nFile: {task_file_name}\n\n"

            elif file_extension == "pdb":
                # PDB files (protein database) - only add note
                file_content_section += f"\n\nNote: A PDB file '{task_file_name}' is associated with this task. You may use available tools to read its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"

            else:
                # For other file types, let MarkItDown try to handle it
                pass  # MarkItDown will be tried below

            #### markitdown process - ONLY if no specialized converter handled it ####
            if parsing_result is None:
                try:
                    if file_extension not in SKIP_MARKITDOWN_EXTENSIONS:
                        md = MarkItDown(enable_plugins=True)
                        parsing_result = md.convert(task_file_name)
                        print(
                            f"Info: Used MarkItDown as fallback to process file {task_file_name}"
                        )
                        # Add prompt for files processed by MarkItDown
                        file_content_section += f"\n\nNote: A file '{task_file_name}' is associated with this task. The content has been extracted as markdown below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                        file_content_section += (
                            f"## File Content\nFile: {task_file_name}\n\n"
                        )
                except Exception as e:
                    # Best effort: a failed fallback leaves parsing_result as None
                    print(
                        f"Warning: MarkItDown failed to process {task_file_name}: {e}"
                    )
            ############################

            # Collect the content and title (if has) to append later.
            # parsing_result is None for image, audio, video files that already
            # have their content formatted, so both lookups default to None.
            title = getattr(parsing_result, "title", None)
            content = getattr(parsing_result, "text_content", None)

            # Apply the size cap up front so that titled documents are limited
            # exactly like untitled ones.
            if content and len(content) > max_len:
                content = content[:max_len] + "\n... [File truncated]"

            if title:
                file_content_section += "Title:\n\n{}\n\n".format(title)
                file_content_section += "Content:\n\n```\n{}\n```\n".format(content)
            elif content:
                file_content_section += "```\n{}\n```\n".format(content)

        except FileNotFoundError:
            print(f"Error: File not found {task_file_name}")
            file_content_section += (
                f"\nWarning: The specified file '{task_file_name}' was not found."
            )
        except Exception as e:
            print(f"Error: Error processing file {task_file_name}: {e}")
            traceback.print_exc()
            file_content_section += f"\nWarning: There was an error processing the file '{task_file_name}': {str(e)}"

    # output format requirement
    updated_task_description += "\nYou should follow the format instruction in the request strictly and wrap the final answer in \\boxed{}."

    # Append file content at the end
    updated_task_description += file_content_section
    updated_task_description = updated_task_description.strip()

    return updated_task_description, updated_task_description


class _CustomMarkdownify(markdownify.MarkdownConverter):
    """
    markdownify converter with a few adjustments for document extraction:

    - Headings default to the ATX style ('#', '##', ...) and always start on
      a new line when rendered as block content.
    - Links whose scheme is not http, https or file are reduced to their text.
    - The path component of link targets is percent-encoded so it cannot
      clash with Markdown syntax.
    - Image sources given as data: URIs are cut off at the first comma.
    """

    def __init__(self, **options: Any):
        # An explicitly passed heading_style wins; otherwise use ATX.
        options.setdefault("heading_style", markdownify.ATX)
        super().__init__(**options)

    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
        """Render a heading, prefixing a newline for block-level headings."""
        # Only block-level headings whose text lacks a leading newline get one.
        needs_leading_newline = (
            not convert_as_inline and re.match(r"\n", text) is None
        )
        heading = super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
        return "\n" + heading if needs_leading_newline else heading

    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
        """Render a link, dropping unsupported schemes and escaping the path."""
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
            return ""

        # What is emitted whenever the link target is discarded
        text_only = f"{prefix}{text}{suffix}"

        href = el.get("href")
        title = el.get("title")

        if href:
            try:
                url_parts = urlparse(href)  # type: ignore
                scheme = url_parts.scheme
                if scheme and scheme.lower() not in ("http", "https", "file"):
                    return text_only
                # Normalise the path: decode first so nothing is double-encoded
                safe_path = quote(unquote(url_parts.path))
                href = urlunparse(url_parts._replace(path=safe_path))  # type: ignore
            except ValueError:  # It's not clear if this ever gets thrown
                return text_only

        options = self.options

        # For the replacement see #29: text nodes underscores are escaped
        use_autolink = (
            options["autolinks"]
            and text.replace(r"\_", "_") == href
            and not title
            and not options["default_title"]
        )
        if use_autolink:
            # Shortcut syntax
            return f"<{href}>"

        if options["default_title"] and not title:
            title = href

        title_part = ""
        if title:
            title_part = ' "{}"'.format(title.replace('"', r"\""))

        if not href:
            return text
        return f"{prefix}[{text}]({href}{title_part}){suffix}"

    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
        """Render an image, shortening data: URIs to their header part."""
        attrs = el.attrs
        alt = attrs.get("alt", None) or ""
        src = attrs.get("src", None) or ""
        title = attrs.get("title", None) or ""

        title_part = ""
        if title:
            title_part = ' "{}"'.format(title.replace('"', r"\""))

        # Inline images collapse to their alt text unless the parent tag is
        # configured to keep them.
        keep_in = self.options["keep_inline_images_in"]
        if convert_as_inline and el.parent.name not in keep_in:
            return alt

        # Remove dataURIs: keep only what precedes the first comma
        if src.startswith("data:"):
            src = src.split(",")[0] + "..."

        return f"![{alt}]({src}{title_part})"

    def convert_soup(self, soup: Any) -> str:
        """Delegate to the base implementation unchanged."""
        return super().convert_soup(soup)  # type: ignore


class DocumentConverterResult:
    """Container for the outcome of a document-to-text conversion."""

    def __init__(self, title: Union[str, None] = None, text_content: str = ""):
        # Optional document title; None when no title is supplied
        self.title = title
        # Converted text of the document; empty string by default
        self.text_content = text_content


def convert_html_to_md(html_content):
    """
    Convert an HTML string to Markdown.

    The markup is parsed with BeautifulSoup's "html.parser", script and style
    elements are removed, and the <body> element (or the whole document when
    there is no body) is rendered through _CustomMarkdownify.

    Args:
        html_content: HTML markup to convert.

    Returns:
        DocumentConverterResult whose title is the string of the document's
        <title> element (None when there is no such element) and whose
        text_content is the generated Markdown.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Scripts and stylesheets carry no readable content
    for unwanted in soup(["script", "style"]):
        unwanted.extract()

    # Print only the main content
    body_elm = soup.find("body")
    root = body_elm if body_elm else soup
    webpage_text = _CustomMarkdownify().convert_soup(root)

    assert isinstance(webpage_text, str)

    page_title = soup.title.string if soup.title is not None else None
    return DocumentConverterResult(title=page_title, text_content=webpage_text)


def HtmlConverter(local_path: str):
    """
    Read an HTML file from disk and convert it to Markdown.

    The file is opened as UTF-8 text and its content is handed to
    convert_html_to_md.

    Args:
        local_path: Path to the HTML file to convert.

    Returns:
        DocumentConverterResult containing the converted Markdown text.
    """
    with open(local_path, "rt", encoding="utf-8") as html_file:
        markup = html_file.read()

    return convert_html_to_md(markup)


def DocxConverter(local_path: str):
    """
    Convert a DOCX file to Markdown format.

    The document is first turned into HTML with mammoth; that HTML is then
    passed to convert_html_to_md to obtain Markdown.

    Args:
        local_path: Path to the DOCX file to convert.

    Returns:
        DocumentConverterResult containing the converted Markdown text.
    """
    with open(local_path, "rb") as source:
        # mammoth returns a result object; the HTML is in its .value
        html_content = mammoth.convert_to_html(source).value

    return convert_html_to_md(html_content)


def XlsxConverter(local_path: str):
    """
    Converts Excel files to Markdown using openpyxl.
    Preserves color formatting and other cell styling information.

    Every sheet becomes a "## <sheet name>" section holding one Markdown table
    that spans row 1 / column 1 up to the last row and column containing a
    non-empty cell. Cell colors are rendered as inline <span style="...">
    markup and bold / italic / underline as <strong> / <em> / <u>. When at
    least one cell of a sheet carries such formatting, a short explanatory
    note is appended after that sheet's table.

    Problems with an individual sheet or cell are printed and recorded in the
    output instead of aborting the conversion.

    Args:
        local_path: Path to the Excel file

    Returns:
        DocumentConverterResult with the Markdown representation of the Excel file
    """
    # Load the workbook
    wb = openpyxl.load_workbook(local_path, data_only=True)
    # Output fragments; joined once at the end instead of growing a string
    parts = []

    # Helper function to convert RGB color to hex
    def rgb_to_hex(rgb_value):
        if not rgb_value:
            return None

        # Convert RGB value to string for processing
        rgb_string = str(rgb_value)

        # Handle RGB format like 'RGB(255, 255, 255)'
        if isinstance(rgb_value, str) and rgb_string.startswith("RGB"):
            rgb_match = re.match(r"RGB\((\d+), (\d+), (\d+)\)", rgb_string)
            if rgb_match:
                r, g, b = map(int, rgb_match.groups())
                return f"#{r:02x}{g:02x}{b:02x}"

        # Special handling for FFFFFFFF (white) and 00000000 (transparent/none)
        if rgb_string in ["FFFFFFFF", "00000000", "none", "auto"]:
            return None

        # Handle ARGB format (common in openpyxl)
        if len(rgb_string) == 8:  # ARGB format like 'FF5733FF'
            return f"#{rgb_string[2:]}"  # Strip alpha channel

        # Handle direct hex values like 'FF5733'
        if isinstance(rgb_value, str):
            return f"#{rgb_string}" if not rgb_string.startswith("#") else rgb_string

        return None  # Return None for unrecognized formats

    # Helper function to turn an openpyxl color object into a hex string
    def color_to_hex(color):
        if not color:
            return None
        # The RGB value can be stored in different attributes: prefer a
        # truthy .rgb, otherwise fall back to .value
        rgb_value = getattr(color, "rgb", None) or getattr(color, "value", None)
        return rgb_to_hex(rgb_value) if rgb_value else None

    # Helper function to detect and format cell styling
    def get_cell_format_info(cell):
        info = {}

        # Get background color if it exists
        if cell.fill and hasattr(cell.fill, "fgColor"):
            bg_color = color_to_hex(cell.fill.fgColor)
            if bg_color:  # Skip transparent or white (handled in rgb_to_hex)
                info["bg_color"] = bg_color

        # Get font color if it exists
        if cell.font and hasattr(cell.font, "color"):
            font_color = color_to_hex(cell.font.color)
            if font_color:  # Skip transparent (handled in rgb_to_hex)
                info["font_color"] = font_color

        # Get font weight (bold)
        if cell.font and cell.font.bold:
            info["bold"] = True

        # Get font style (italic)
        if cell.font and cell.font.italic:
            info["italic"] = True

        # Get font underline
        if cell.font and cell.font.underline and cell.font.underline != "none":
            info["underline"] = True

        return info

    # Process each sheet in the workbook
    for sheet_name in wb.sheetnames:
        # Emit the heading exactly once, before anything can fail, so the
        # error path below does not repeat it.
        parts.append(f"## {sheet_name}\n\n")

        try:
            sheet = wb[sheet_name]

            # Get the dimensions of the used part of the sheet
            min_row, min_col = 1, 1
            valued_cells = [
                cell for cell in sheet._cells.values() if cell.value is not None
            ]
            max_row = max((cell.row for cell in valued_cells), default=0)
            max_col = max((cell.column for cell in valued_cells), default=0)
        except Exception as e:
            print(f"Error processing sheet '{sheet_name}': {str(e)}")
            parts.append(f"Error processing this sheet: {str(e)}\n\n")
            continue

        if max_row == 0 or max_col == 0:
            parts.append("This sheet is empty.\n\n")
            continue

        # Set while rendering the table; avoids a second, unguarded scan of
        # every cell just to decide whether the legend is needed.
        has_formatting = False

        try:
            # First, determine column widths
            col_widths = {}
            for col_idx in range(min_col, max_col + 1):
                max_length = 0
                for row_idx in range(min_row, max_row + 1):
                    try:
                        cell = sheet.cell(row=row_idx, column=col_idx)
                        cell_value = str(cell.value) if cell.value is not None else ""
                        max_length = max(max_length, len(cell_value))
                    except Exception as e:
                        print(
                            f"Warning: Error processing cell at row {row_idx}, column {col_idx}: {str(e)}"
                        )
                        max_length = max(max_length, 10)  # Use reasonable default
                col_widths[col_idx] = max(max_length + 2, 5)  # Min width of 5

            columns = range(min_col, max_col + 1)

            # Start building the table
            # Header row with column separators (intentionally blank cells)
            parts.append("|")
            parts.extend(" " + " " * col_widths[col_idx] + " |" for col_idx in columns)
            parts.append("\n")

            # Separator row
            parts.append("|")
            parts.extend(":" + "-" * col_widths[col_idx] + ":|" for col_idx in columns)
            parts.append("\n")

            # Data rows
            for row_idx in range(min_row, max_row + 1):
                parts.append("|")
                for col_idx in columns:
                    try:
                        cell = sheet.cell(row=row_idx, column=col_idx)
                        cell_value = str(cell.value) if cell.value is not None else ""

                        # Get formatting info
                        try:
                            format_info = get_cell_format_info(cell)
                        except Exception as e:
                            print(
                                f"Warning: Error getting formatting for cell at row {row_idx}, column {col_idx}: {str(e)}"
                            )
                            format_info = {}

                        formatted_value = cell_value

                        # Add HTML-style formatting if needed
                        if format_info:
                            has_formatting = True
                            style_parts = []

                            if "bg_color" in format_info:
                                style_parts.append(
                                    f"background-color:{format_info['bg_color']}"
                                )

                            if "font_color" in format_info:
                                style_parts.append(f"color:{format_info['font_color']}")

                            span_attributes = []
                            if style_parts:
                                span_attributes.append(
                                    f'style="{"; ".join(style_parts)}"'
                                )

                            # Format with bold/italic/underline if needed
                            inner_value = cell_value
                            if "bold" in format_info:
                                inner_value = f"<strong>{inner_value}</strong>"
                            if "italic" in format_info:
                                inner_value = f"<em>{inner_value}</em>"
                            if "underline" in format_info:
                                inner_value = f"<u>{inner_value}</u>"

                            # Only add a span if we have style attributes
                            if span_attributes:
                                formatted_value = f"<span {' '.join(span_attributes)}>{inner_value}</span>"
                            else:
                                formatted_value = inner_value

                        # Pad to column width (based on the raw text, not the
                        # markup) and add to markdown
                        padding = col_widths[col_idx] - len(cell_value)
                        padded_value = " " + formatted_value + " " * (padding + 1)
                        parts.append(padded_value + "|")
                    except Exception as e:
                        print(
                            f"Error processing cell at row {row_idx}, column {col_idx}: {str(e)}"
                        )
                        # Add a placeholder for the failed cell
                        padded_value = " [Error] " + " " * (col_widths[col_idx] - 7)
                        parts.append(padded_value + " |")

                parts.append("\n")
        except Exception as e:
            error_msg = f"Error generating table for sheet '{sheet_name}': {str(e)}\n{traceback.format_exc()}"
            print(error_msg)
            parts.append(f"Error generating table: {str(e)}\n\n")

        # Add formatting legend
        if has_formatting:
            parts.append("\n### Formatting Information\n")
            parts.append(
                "The table above includes HTML formatting to represent colors and styles from the original Excel file.\n"
            )
            parts.append("This formatting may not display in all Markdown viewers.\n")

        parts.append("\n\n")  # Extra newlines between sheets

    return DocumentConverterResult(
        title=None,
        text_content="".join(parts).strip(),
    )


def PptxConverter(local_path: str) -> DocumentConverterResult:
    """
    Converts PPTX files to Markdown. Supports headings, tables and images with alt text.

    Each slide is introduced by an HTML comment carrying its 1-based number.
    Pictures become Markdown image links with a placeholder file name, tables
    are rendered through an intermediate HTML table, the slide title becomes a
    level-1 heading, other text frames are emitted verbatim, and speaker notes
    (when present) are appended under a "Notes" heading.

    Args:
        local_path: Path to the PPTX file. The ".pptx" suffix is matched
            case-insensitively.

    Returns:
        DocumentConverterResult containing the converted Markdown text, or an
        error message as text_content when the path does not end in ".pptx".
    """

    def is_picture(shape):
        """Check if a shape is a picture"""
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
            return True
        # A placeholder counts as a picture only when it exposes an image.
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
            if hasattr(shape, "image"):
                return True
        return False

    def is_table(shape):
        """Check if a shape is a table"""
        return shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE

    # Case-insensitive check: ZipConverter lower-cases the extension before
    # dispatching here, so "DECK.PPTX" must be accepted as well.
    if not local_path.lower().endswith(".pptx"):
        return DocumentConverterResult(
            title=None,
            text_content=f"Error: Expected .pptx file, got: {local_path}",
        )

    md_content = ""
    presentation = pptx.Presentation(local_path)

    for slide_num, slide in enumerate(presentation.slides, start=1):
        md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
        title = slide.shapes.title

        for shape in slide.shapes:
            # Pictures
            if is_picture(shape):
                # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
                alt_text = ""
                try:
                    alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
                except Exception:
                    # Best effort: fall back to the shape name when the
                    # description attribute cannot be read.
                    pass
                # A placeholder name
                filename = re.sub(r"\W", "", shape.name) + ".jpg"
                md_content += (
                    "\n!["
                    + (alt_text if alt_text else shape.name)
                    + "]("
                    + filename
                    + ")\n"
                )

            # Tables
            if is_table(shape):
                html_table = "<html><body><table>"
                first_row = True
                for row in shape.table.rows:
                    # The first row is rendered as header cells.
                    tag = "th" if first_row else "td"
                    html_table += "<tr>"
                    for cell in row.cells:
                        html_table += f"<{tag}>" + html.escape(cell.text) + f"</{tag}>"
                    html_table += "</tr>"
                    first_row = False
                html_table += "</table></body></html>"

                # The HTML table is converted to Markdown by convert_html_to_md,
                # which is defined elsewhere in this module.
                md_content += (
                    "\n" + convert_html_to_md(html_table).text_content.strip() + "\n"
                )

            # Text areas
            elif shape.has_text_frame:
                if shape == title:
                    md_content += "# " + shape.text.lstrip() + "\n"
                else:
                    md_content += shape.text + "\n"

        # Trim after every slide so slides are separated only by the
        # "\n\n<!-- Slide number ... -->" prefix added above.
        md_content = md_content.strip()
        if slide.has_notes_slide:
            md_content += "\n\n### Notes:\n"
            notes_frame = slide.notes_slide.notes_text_frame
            if notes_frame is not None:
                md_content += notes_frame.text
            md_content = md_content.strip()

    return DocumentConverterResult(
        title=None,
        text_content=md_content.strip(),
    )


def ZipConverter(local_path: str, **kwargs):
    """
    Extracts ZIP files to a temporary directory and processes each file according to its extension.
    Returns a combined result of all processed files.

    Each extracted file gets its own "## File: <relative path>" section. Text-like
    files are included verbatim, JSON is pretty-printed, office/PDF/HTML files go
    through their dedicated converters, media files are replaced by a generated
    caption, and anything else is handed to MarkItDown as a fallback. Per-file
    content is capped at 50,000 characters. Failures while processing a single
    file are reported inline and do not abort the archive.

    Args:
        local_path: Path to the ZIP archive.
        **kwargs: Accepted for interface compatibility; not used by this function.

    Returns:
        DocumentConverterResult titled "ZIP Archive Contents" with the combined
        Markdown as text_content.

    Note:
        Errors raised while opening or extracting the archive itself are not
        caught here; they propagate to the caller after the temporary directory
        has been cleaned up.
    """
    import zipfile

    # Extensions whose content is included verbatim as UTF-8 text.
    plain_text_extensions = ("py", "txt", "md", "sh", "yaml", "yml", "toml", "csv")
    # Limit length for each file
    max_len = 50_000

    temp_dir = tempfile.mkdtemp(prefix="zip_extract_")
    md_content = f"# Extracted from ZIP: {os.path.basename(local_path)}\n\n"

    try:
        with zipfile.ZipFile(local_path, "r") as zip_ref:
            zip_ref.extractall(temp_dir)

        # Get all extracted files
        extracted_files = []
        for root, _dirs, files in os.walk(temp_dir):
            for file in files:
                file_path = os.path.join(root, file)
                rel_path = os.path.relpath(file_path, temp_dir)
                extracted_files.append((file_path, rel_path))

        if not extracted_files:
            md_content += "The ZIP file is empty or contains no files.\n"
        else:
            md_content += f"Total files extracted: {len(extracted_files)}\n\n"

            for file_path, rel_path in extracted_files:
                md_content += f"## File: {rel_path}\n\n"

                # Derive the extension from the file name only. Using the full
                # path would mistake a dot in a directory name (e.g.
                # "v1.0/README") for an extension separator.
                file_name = os.path.basename(file_path)
                file_extension = (
                    file_name.rsplit(".", maxsplit=1)[-1].lower()
                    if "." in file_name
                    else ""
                )
                file_result = None

                try:
                    # Use the same processing logic as process_input
                    if file_extension in plain_text_extensions:
                        with open(file_path, "r", encoding="utf-8") as f:
                            file_result = DocumentConverterResult(
                                title=None, text_content=f.read()
                            )

                    elif file_extension in ["jsonld", "json"]:
                        with open(file_path, "r", encoding="utf-8") as f:
                            file_result = DocumentConverterResult(
                                title=None,
                                text_content=json.dumps(
                                    json.load(f), ensure_ascii=False, indent=2
                                ),
                            )

                    elif file_extension in ["xlsx", "xls"]:
                        file_result = XlsxConverter(local_path=file_path)

                    elif file_extension == "pdf":
                        file_result = DocumentConverterResult(
                            title=None,
                            text_content=pdfminer.high_level.extract_text(file_path),
                        )

                    elif file_extension in ["docx", "doc"]:
                        file_result = DocxConverter(local_path=file_path)

                    elif file_extension in ["html", "htm"]:
                        file_result = HtmlConverter(local_path=file_path)

                    elif file_extension in ["pptx", "ppt"]:
                        file_result = PptxConverter(local_path=file_path)

                    elif file_extension in IMAGE_EXTENSIONS:
                        # Generate image caption for files in ZIP
                        caption = _generate_image_caption(file_path)
                        md_content += "[Image file]\n\n"
                        md_content += f"> {caption}\n\n"
                        continue

                    elif file_extension in AUDIO_EXTENSIONS:
                        # Generate audio caption for files in ZIP
                        caption = _generate_audio_caption(file_path)
                        md_content += "[Audio file]\n\n"
                        md_content += f"> {caption}\n\n"
                        continue

                    elif file_extension in VIDEO_EXTENSIONS:
                        # Generate video caption for files in ZIP
                        caption = _generate_video_caption(file_path)
                        md_content += "[Video file]\n\n"
                        md_content += f"> {caption}\n\n"
                        continue

                    elif file_extension == "pdb":
                        md_content += "[PDB file - specialized format]\n\n"
                        continue

                    else:
                        # Try MarkItDown as fallback
                        try:
                            md_tool = MarkItDown(enable_plugins=True)
                            file_result = md_tool.convert(file_path)
                        except Exception:
                            md_content += (
                                f"[Unsupported file type: {file_extension}]\n\n"
                            )
                            continue

                    # Add the processed content
                    if file_result and getattr(file_result, "text_content", None):
                        content = file_result.text_content
                        if len(content) > max_len:
                            content = content[:max_len] + "\n... [Content truncated]"
                        md_content += f"```\n{content}\n```\n\n"

                except Exception as e:
                    # One bad file must not abort the whole archive: record the
                    # failure inline and keep going.
                    md_content += f"[Error processing file: {str(e)}]\n\n"
                    print(f"Warning: Error processing {rel_path} from ZIP: {e}")

    finally:
        # Clean up temporary directory
        try:
            shutil.rmtree(temp_dir)
        except Exception as e:
            print(f"Warning: Could not remove temporary directory {temp_dir}: {e}")

    return DocumentConverterResult(
        title="ZIP Archive Contents", text_content=md_content.strip()
    )


================================================
FILE: apps/miroflow-agent/src/io/output_formatter.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""Output formatting utilities for agent responses."""

import re
from typing import Tuple

from ..utils.prompt_utils import FORMAT_ERROR_MESSAGE

# Maximum length, in characters, of a tool result before it is truncated
# (100k chars ≈ 25k tokens). OutputFormatter.format_tool_result_for_user cuts
# longer results to this length and appends a "[Result truncated]" marker.
TOOL_RESULT_MAX_LENGTH = 100_000


class OutputFormatter:
    """Formatter for processing and formatting agent outputs."""

    # Matches the literal control sequence "\boxed" followed by a word boundary.
    _BOXED_RE = re.compile(r"\\boxed\b")

    # Boxed contents that are placeholders rather than answers; a boxed
    # expression holding one of these is treated as "no answer".
    _PLACEHOLDER_ANSWERS = frozenset(
        {"?", "??", "???", "？", "……", "…", "...", "unknown"}
    )

    def _extract_boxed_content(self, text: str) -> str:
        r"""
        Extract the content of the last \boxed{...} occurrence in the given text.

        Supports:
          - Arbitrary levels of nested braces
          - Escaped braces (\{ and \})
          - Whitespace between \boxed and the opening brace
          - Empty content inside braces
          - Incomplete boxed expressions (extracts to end of string as fallback)

        The extracted content is stripped of surrounding whitespace and then
        compared against a small set of placeholder answers ("?", "...",
        "unknown", ...); a placeholder is reported as no answer.

        Args:
            text: Input text that may contain \boxed{...} expressions

        Returns:
            The extracted boxed content, or empty string if no match is found
            or the content is only a placeholder.
        """
        if not text:
            return ""

        last_result = None  # Track the last boxed content (complete or incomplete)
        n = len(text)
        i = 0

        while True:
            # Find the next \boxed occurrence
            m = self._BOXED_RE.search(text, i)
            if not m:
                break
            j = m.end()

            # Skip any whitespace after \boxed
            while j < n and text[j].isspace():
                j += 1

            # Require that the next character is '{'
            if j >= n or text[j] != "{":
                i = j
                continue

            # Parse the brace content manually to handle nesting and escapes.
            # The scan starts on the opening brace itself, so depth is at least
            # 1 until the matching closing brace brings it back to zero.
            depth = 0
            escaped = False
            close_idx = -1
            for k in range(j, n):
                ch = text[k]
                if escaped:
                    # The character after a backslash never affects nesting.
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == "{":
                    depth += 1
                elif ch == "}":
                    depth -= 1
                    if depth == 0:
                        close_idx = k
                        break

            if close_idx == -1:
                # Incomplete boxed expression: take everything up to the end of
                # the text. Nothing is left to scan afterwards.
                last_result = text[j + 1 :]
                break

            last_result = text[j + 1 : close_idx]
            i = close_idx + 1

        if last_result is None:
            return ""

        # Strip before the placeholder check so that padded placeholders such
        # as "\boxed{ ? }" are rejected too.
        answer = last_result.strip()
        return "" if answer in self._PLACEHOLDER_ANSWERS else answer

    def format_tool_result_for_user(self, tool_call_execution_result: dict) -> dict:
        """
        Format tool execution results to be fed back to LLM as user messages.

        Only includes necessary information (results or errors). Long results
        are truncated to TOOL_RESULT_MAX_LENGTH to prevent context overflow.
        An 'error' entry takes precedence over a 'result' entry.

        Args:
            tool_call_execution_result: Dict containing server_name, tool_name,
                and either 'result' or 'error'.

        Returns:
            Dict with 'type' and 'text' keys suitable for LLM message content.
        """
        server_name = tool_call_execution_result["server_name"]
        tool_name = tool_call_execution_result["tool_name"]

        if "error" in tool_call_execution_result:
            # Provide concise error information to LLM
            content = f"Tool call to {tool_name} on {server_name} failed. Error: {tool_call_execution_result['error']}"
        elif "result" in tool_call_execution_result:
            # Provide the original output result of the tool
            content = tool_call_execution_result["result"]
            # Truncate overly long results to prevent context overflow
            if len(content) > TOOL_RESULT_MAX_LENGTH:
                content = content[:TOOL_RESULT_MAX_LENGTH] + "\n... [Result truncated]"
        else:
            content = f"Tool call to {tool_name} on {server_name} completed, but produced no specific output or result."

        return {"type": "text", "text": content}

    def format_final_summary_and_log(
        self, final_answer_text: str, client=None
    ) -> Tuple[str, str, str]:
        """
        Format final summary information, including answers and token statistics.

        When the answer text is non-empty but holds no usable \\boxed{} content,
        the returned boxed result is FORMAT_ERROR_MESSAGE.

        Args:
            final_answer_text: The final answer text from the agent
            client: Optional LLM client for token usage statistics; used only
                if it provides a format_token_usage_summary() method

        Returns:
            Tuple of (summary_text, boxed_result, usage_log)
        """
        summary_lines = [
            "\n" + "=" * 30 + " Final Answer " + "=" * 30,
            final_answer_text,
        ]

        # Extract boxed result (the last \boxed{...} in the answer)
        boxed_result = self._extract_boxed_content(final_answer_text)

        # Add extracted result section
        summary_lines.append("\n" + "-" * 20 + " Extracted Result " + "-" * 20)

        if boxed_result:
            summary_lines.append(boxed_result)
        elif final_answer_text:
            summary_lines.append("No \\boxed{} content found.")
            boxed_result = FORMAT_ERROR_MESSAGE

        # Token usage statistics and cost estimation - use client method
        if client and hasattr(client, "format_token_usage_summary"):
            token_summary_lines, log_string = client.format_token_usage_summary()
            summary_lines.extend(token_summary_lines)
        else:
            # If no client or client doesn't support it, use default format
            summary_lines.append("\n" + "-" * 20 + " Token Usage & Cost " + "-" * 20)
            summary_lines.append("Token usage information not available.")
            summary_lines.append("-" * (40 + len(" Token Usage & Cost ")))
            log_string = "Token usage information not available."

        return "\n".join(summary_lines), boxed_result, log_string


================================================
FILE: apps/miroflow-agent/src/llm/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

from .base_client import BaseClient
from .factory import ClientFactory
from .providers import (
    AnthropicClient,
    OpenAIClient,
)

# Names re-exported as the public API of this package; each one is imported
# from a submodule above.
__all__ = [
    "BaseClient",
    "ClientFactory",
    "AnthropicClient",
    "OpenAIClient",
]


================================================
FILE: apps/miroflow-agent/src/llm/base_client.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
Base client module for LLM providers.

This module defines the abstract base class and common utilities for LLM clients,
supporting both OpenAI and Anthropic API formats.
"""

import asyncio
import dataclasses
from abc import ABC
from typing import (
    Any,
    Dict,
    List,
    Optional,
    Tuple,
    TypedDict,
)

from omegaconf import DictConfig

from ..logging.task_logger import TaskLog
from .util import with_timeout

# Default timeout for LLM API calls (10 minutes).
# Passed to the @with_timeout decorator on BaseClient.create_message.
DEFAULT_LLM_TIMEOUT_SECONDS = 600


class TokenUsage(TypedDict):
    """
    Provider-independent running totals of token consumption.

    The OpenAI and Anthropic usage formats are folded into one structure with
    four counters, all of which are required keys:
    - input and output tokens: the standard prompt / completion counts
    - cache read and cache write tokens: tokens served from, or written to,
      the provider's prompt cache

    Provider-specific notes:
    - OpenAI: Cache write is free, cache read is cheaper
    - Anthropic: Cache write has a small cost, cache read is cheaper
    """

    # Standard prompt and completion token counts.
    total_input_tokens: int
    total_output_tokens: int
    # Tokens involved in prompt-cache reads and writes.
    total_cache_read_input_tokens: int
    total_cache_write_input_tokens: int


@dataclasses.dataclass
class BaseClient(ABC):
    """
    Abstract base class for LLM provider clients.

    This class provides the common interface and utilities for interacting with
    different LLM providers (OpenAI, Anthropic, etc.). Concrete implementations
    should override _create_client() and provider-specific methods such as
    _create_message(), which create_message() delegates to.

    Attributes:
        task_id: Unique identifier for the current task (used for tracking)
        cfg: Hydra configuration containing LLM settings
        task_log: Optional logger for recording task execution details; when
            it is None, log messages emitted by this base class are dropped
    """

    # Required arguments (no default value)
    task_id: str
    cfg: DictConfig

    # Optional arguments (with default value)
    task_log: Optional["TaskLog"] = None

    # Initialized in __post_init__
    client: Any = dataclasses.field(init=False)
    token_usage: TokenUsage = dataclasses.field(init=False)
    last_call_tokens: Dict[str, int] = dataclasses.field(init=False)

    def __post_init__(self):
        """Copy settings out of cfg, reset counters and build the provider client."""
        # Initialize last_call_tokens before other operations
        self.last_call_tokens: Dict[str, int] = {
            "prompt_tokens": 0,
            "completion_tokens": 0,
        }

        # Explicitly assign from cfg object
        llm_cfg = self.cfg.llm
        self.provider: str = llm_cfg.provider
        self.model_name: str = llm_cfg.model_name
        self.temperature: float = llm_cfg.temperature
        self.top_p: float = llm_cfg.top_p
        self.min_p: float = llm_cfg.min_p
        self.top_k: int = llm_cfg.top_k
        self.max_context_length: int = llm_cfg.max_context_length
        self.max_tokens: int = llm_cfg.max_tokens
        self.async_client: bool = llm_cfg.async_client
        self.keep_tool_result: int = self.cfg.agent.keep_tool_result
        # Optional settings: absent keys fall back to None / a neutral default.
        self.api_key: Optional[str] = llm_cfg.get("api_key")
        self.base_url: Optional[str] = llm_cfg.get("base_url")
        self.use_tool_calls: Optional[bool] = llm_cfg.get("use_tool_calls")
        self.repetition_penalty: float = llm_cfg.get("repetition_penalty", 1.0)

        self.token_usage = self._reset_token_usage()
        self.client = self._create_client()

        self._log_step(
            "info",
            "LLM | Initialization",
            f"LLMClient {self.provider} {self.model_name} initialization completed.",
        )

    def _log_step(self, level: str, title: str, message: str) -> None:
        """Forward a log entry to task_log, doing nothing when no logger is set.

        task_log is optional (defaults to None), so every log call made by this
        class goes through here instead of dereferencing task_log directly.
        """
        if self.task_log is not None:
            self.task_log.log_step(level, title, message)

    def _reset_token_usage(self) -> TokenUsage:
        """
        Reset token usage counter to zero.

        Returns:
            A new TokenUsage dict with all counters set to zero.
        """
        return TokenUsage(
            total_input_tokens=0,
            total_output_tokens=0,
            total_cache_write_input_tokens=0,
            total_cache_read_input_tokens=0,
        )

    def _remove_tool_result_from_messages(
        self, messages, keep_tool_result
    ) -> List[Dict]:
        """Remove tool results from messages

        Works on shallow copies of the message dicts; the input list and its
        dicts are not modified. Messages with role "user" or "tool" are treated
        as tool results, except the first one, which is the initial task and is
        always kept. Omitted results keep their position in the history but get
        a short placeholder as content.

        Args:
            messages: List of message dictionaries
            keep_tool_result: Number of tool results to keep. -1 means keep all.

        Returns:
            List of messages with tool results filtered according to keep_tool_result
        """
        messages_copy = [m.copy() for m in messages]

        if keep_tool_result == -1:
            # No processing needed, keep all messages
            return messages_copy

        # Find indices of all user/tool messages (these are tool results)
        user_indices = [
            i
            for i, msg in enumerate(messages_copy)
            if msg.get("role") in ("user", "tool")
        ]

        if not user_indices:
            # No user/tool messages found
            self._log_step(
                "info",
                "LLM | Message Retention",
                "No user/tool messages found in the history.",
            )
            return messages_copy

        # The first user message is the initial task, not a tool result
        # Tool results start from the second user message onwards
        if len(user_indices) == 1:
            # Only one user message (the initial task), no tool results to filter
            self._log_step(
                "info",
                "LLM | Message Retention",
                "Only 1 user message found (initial task). Keeping it as is.",
            )
            return messages_copy

        # Always keep the first user message (initial task); everything after
        # it in user_indices is a tool result.
        first_user_idx = user_indices[0]
        tool_result_indices = user_indices[1:]

        # Keep at most keep_tool_result results, counted from the end. A value
        # of 0 (or any non-positive value other than -1) keeps none.
        num_tool_results_to_keep = min(keep_tool_result, len(tool_result_indices))
        tool_result_indices_to_keep = (
            tool_result_indices[-num_tool_results_to_keep:]
            if num_tool_results_to_keep > 0
            else []
        )

        # Combine first message (initial task) and tool results to keep
        indices_to_keep = [first_user_idx] + tool_result_indices_to_keep

        self._log_step(
            "info",
            "LLM | Message Retention",
            f"Message retention summary: Total user/tool messages: {len(user_indices)}, "
            f"Initial task at index: {first_user_idx}, "
            f"Keeping last {num_tool_results_to_keep} tool results at indices: {tool_result_indices_to_keep}, "
            f"Total messages to keep: {len(indices_to_keep)}",
        )

        # Replace content of tool results that should be omitted
        kept = set(indices_to_keep)
        for i in user_indices:
            if i in kept:
                continue
            msg = messages_copy[i]
            # Preserve the message structure but replace content
            if isinstance(msg.get("content"), list):
                # For Anthropic format
                msg["content"] = [
                    {
                        "type": "text",
                        "text": "Tool result is omitted to save tokens.",
                    }
                ]
            else:
                # For OpenAI format
                msg["content"] = "Tool result is omitted to save tokens."

        return messages_copy

    @with_timeout(DEFAULT_LLM_TIMEOUT_SECONDS)
    async def create_message(
        self,
        system_prompt: str,
        message_history: List[Dict],
        tool_definitions: List[Dict],
        keep_tool_result: int = -1,
        step_id: int = 1,
        task_log: Optional["TaskLog"] = None,
        agent_type: str = "main",
    ) -> Tuple[Any, List[Dict]]:
        """
        Call LLM to generate a response with optional tool call support.

        This is the main entry point for LLM interactions. The actual request
        is delegated to the provider-specific _create_message(); any exception
        it raises is logged and converted into a None response.

        Args:
            system_prompt: System prompt to guide the LLM's behavior
            message_history: List of previous messages in the conversation
            tool_definitions: List of available tool definitions
            keep_tool_result: Number of recent tool results to keep (-1 = keep all)
            step_id: Current step identifier; not used by this base implementation
            task_log: Optional logger; not used by this base implementation
                (logging goes through the instance's own task_log)
            agent_type: Type of agent making the call ("main" or sub-agent name)

        Returns:
            Tuple of (response, updated_message_history). On failure the
            response is None and the message history is returned as passed in.
        """
        # Unified LLM call processing
        try:
            response, message_history = await self._create_message(
                system_prompt,
                message_history,
                tool_definitions,
                keep_tool_result=keep_tool_result,
            )

        except Exception as e:
            self._log_step(
                "error",
                f"FATAL ERROR | {agent_type} | LLM Call ERROR",
                f"{agent_type} failed: {str(e)}",
            )
            response = None

        return response, message_history

    @staticmethod
    async def convert_tool_definition_to_tool_call(tools_definitions):
        """
        Convert MCP tool definitions to OpenAI function call format.

        Transforms the internal tool definition format used by MCP servers into
        the format expected by OpenAI's function calling API. Servers without a
        'tools' entry, or with an empty one, contribute nothing.

        Args:
            tools_definitions: List of server definitions, each containing a 'name'
                and 'tools' list with tool specifications.

        Returns:
            List of tool definitions in OpenAI function call format, where each
            tool name is prefixed with its server name (e.g., "server-name-tool-name").
        """
        return [
            dict(
                type="function",
                function=dict(
                    name=f"{server['name']}-{tool['name']}",
                    description=tool["description"],
                    parameters=tool["schema"],
                ),
            )
            for server in tools_definitions
            if "tools" in server
            for tool in server["tools"]
        ]

    def close(self):
        """Close client connection.

        Note: For async clients (AsyncOpenAI, AsyncAnthropic), the connection
        will be closed when the client object is garbage collected.
        For proper async cleanup, use `await client.aclose()` in an async context.
        """
        if hasattr(self.client, "close"):
            if asyncio.iscoroutinefunction(self.client.close):
                # For async clients, we cannot call close() synchronously.
                # The async HTTP client will be closed when garbage collected.
                # For explicit async cleanup, call aclose() from an async context.
                if hasattr(self.client, "_client"):
                    # Try to close the underlying httpx client if available
                    try:
                        self.client._client.close()
                    except Exception:
                        pass  # Ignore errors during cleanup
            else:
                self.client.close()
        elif hasattr(self.client, "_client") and hasattr(self.client._client, "close"):
            # Some clients may have internal _client attribute
            self.client._client.close()

    def _format_response_for_log(self, response) -> Dict:
        """Format response for logging

        Builds a compact dict describing the response: its type name plus
        either the Anthropic-style content blocks or the OpenAI-style choices,
        with long text shortened. Returns an empty dict for a falsy response.
        """

        def shorten(text, limit):
            # Cut long text for the log; None and short text pass through.
            if text and len(text) > limit:
                return text[:limit] + "..."
            return text

        if not response:
            return {}

        # Basic response information
        formatted = {
            "response_type": type(response).__name__,
        }

        # Anthropic response
        if hasattr(response, "content"):
            formatted["content"] = []
            for block in response.content:
                if not hasattr(block, "type"):
                    continue
                if block.type == "text":
                    formatted["content"].append(
                        {
                            "type": "text",
                            "text": shorten(block.text, 500),
                        }
                    )
                elif block.type == "tool_use":
                    formatted["content"].append(
                        {
                            "type": "tool_use",
                            "id": block.id,
                            "name": block.name,
                            "input": shorten(str(block.input), 200),
                        }
                    )

        # OpenAI response
        if hasattr(response, "choices"):
            formatted["choices"] = []
            for choice in response.choices:
                choice_data = {"finish_reason": choice.finish_reason}
                if hasattr(choice, "message"):
                    message = choice.message
                    choice_data["message"] = {
                        "role": message.role,
                        "content": shorten(message.content, 500),
                    }
                    if hasattr(message, "tool_calls") and message.tool_calls:
                        choice_data["message"]["tool_calls_count"] = len(
                            message.tool_calls
                        )
                formatted["choices"].append(choice_data)

        return formatted


================================================
FILE: apps/miroflow-agent/src/llm/factory.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
LLM Client Factory module.

This module provides a factory function for creating LLM clients based on
configuration. It supports multiple providers including OpenAI, Anthropic,
and Qwen (via OpenAI-compatible API).
"""

from typing import Optional, Union

from omegaconf import DictConfig, OmegaConf

from ..logging.task_logger import TaskLog
from .providers.anthropic_client import AnthropicClient
from .providers.openai_client import OpenAIClient

# Supported LLM providers
SUPPORTED_PROVIDERS = {"anthropic", "openai", "qwen"}


def ClientFactory(
    task_id: str, cfg: DictConfig, task_log: Optional[TaskLog] = None, **kwargs
) -> Union[OpenAIClient, AnthropicClient]:
    """
    Create an LLM client based on the provider specified in configuration.

    The keyword overrides are merged into ``cfg`` first and the provider is read
    from the merged configuration, so the selected client class always matches
    the configuration object that the client receives.

    Args:
        task_id: Unique identifier for the current task (used for tracking)
        cfg: Hydra configuration object containing LLM settings
        task_log: Optional logger for recording task execution details
        **kwargs: Additional keyword arguments to merge into configuration

    Returns:
        An instance of the appropriate LLM client (OpenAIClient or AnthropicClient)

    Raises:
        ValueError: If ``llm.provider`` is not one of the supported providers.

    Example:
        >>> client = ClientFactory(
        ...     task_id="task_001",
        ...     cfg=cfg,
        ...     task_log=task_log
        ... )
    """
    # Merge before reading the provider: an override such as
    # llm={"provider": ...} must also drive the choice of client class.
    config = OmegaConf.merge(cfg, kwargs)
    provider = config.llm.provider

    # "qwen" is served through the OpenAI-compatible client.
    client_classes = {
        "anthropic": AnthropicClient,
        "qwen": OpenAIClient,
        "openai": OpenAIClient,
    }

    client_cls = client_classes.get(provider)
    if client_cls is None:
        raise ValueError(
            f"Unsupported provider: '{provider}'. "
            f"Supported providers are: {', '.join(sorted(SUPPORTED_PROVIDERS))}"
        )

    return client_cls(task_id=task_id, task_log=task_log, cfg=config)


================================================
FILE: apps/miroflow-agent/src/llm/providers/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

from .anthropic_client import AnthropicClient
from .openai_client import OpenAIClient

__all__ = [
    "AnthropicClient",
    "OpenAIClient",
]


================================================
FILE: apps/miroflow-agent/src/llm/providers/anthropic_client.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
Anthropic Claude LLM client implementation.

This module provides the AnthropicClient class for interacting with Anthropic's
Claude API, with support for prompt caching and extended thinking.

Features:
- Async and sync API support
- Prompt caching with ephemeral cache control
- Token usage tracking including cache statistics
- MCP tool call parsing and response processing
"""

import asyncio
import dataclasses
import logging
from typing import Any, Dict, List, Tuple, Union

import tiktoken
from anthropic import (
    NOT_GIVEN,
    Anthropic,
    AsyncAnthropic,
    DefaultAsyncHttpxClient,
    DefaultHttpxClient,
)
from tenacity import retry, stop_after_attempt, wait_fixed

from ...utils.prompt_utils import generate_mcp_system_prompt
from ..base_client import BaseClient

logger = logging.getLogger("miroflow_agent")


@dataclasses.dataclass
class AnthropicClient(BaseClient):
    def __post_init__(self):
        """Run the base-class post-init, then zero the Anthropic token counters."""
        super().__post_init__()

        # Anthropic-specific token counters, all starting at zero
        for counter_name in (
            "input_tokens",
            "output_tokens",
            "cache_creation_tokens",
            "cache_read_tokens",
        ):
            setattr(self, counter_name, 0)

    def _create_client(self) -> Union[AsyncAnthropic, Anthropic]:
        """Build the Anthropic SDK client, async or sync depending on ``self.async_client``.

        The task id is passed to the underlying HTTP client as the
        ``x-upstream-session-id`` header.
        """
        session_headers = {"x-upstream-session-id": self.task_id}

        # Pick the matching SDK client / HTTP client pair, then construct once.
        if self.async_client:
            sdk_client_cls, http_client_cls = AsyncAnthropic, DefaultAsyncHttpxClient
        else:
            sdk_client_cls, http_client_cls = Anthropic, DefaultHttpxClient

        return sdk_client_cls(
            api_key=self.api_key,
            base_url=self.base_url,
            http_client=http_client_cls(headers=session_headers),
        )

    def _update_token_usage(self, usage_data: Any) -> None:
        """Update cumulative token usage"""
        if usage_data:
            # Update based on actual field names returned by Anthropic API
            self.token_usage["total_cache_write_input_tokens"] += (
                getattr(usage_data, "cache_creation_input_tokens", 0) or 0
            )
            self.token_usage["total_cache_read_input_tokens"] += (
                getattr(usage_data, "cache_read_input_tokens", 0) or 0
            )
            self.token_usage["total_input_tokens"] += (
                getattr(usage_data, "input_tokens", 0) or 0
            )
            self.token_usage["total_output_tokens"] += (
                getattr(usage_data, "output_tokens", 0) or 0
            )
            self.task_log.log_step(
                "info",
                "LLM | Token Usage",
                f"Input: {getattr(usage_data, 'input_tokens', 0)}, "
                f"Cache: {getattr(usage_data, 'cache_creation_input_tokens', 0)}+{getattr(usage_data, 'cache_read_input_tokens', 0)}, "
                f"Output: {getattr(usage_data, 'output_tokens', 0)}",
            )

            self.last_call_tokens = {
                "input_tokens": getattr(usage_data, "input_tokens", 0)
                + getattr(usage_data, "cache_creation_input_tokens", 0)
                + getattr(usage_data, "cache_read_input_tokens", 0),
                "output_tokens": getattr(usage_data, "output_tokens", 0),
            }
        else:
            self.task_log.log_step(
                "warning", "LLM | Token Usage", "Warning: No valid usage_data received."
            )

    @retry(wait=wait_fixed(10), stop=stop_after_attempt(5))
    async def _create_message(
        self,
        system_prompt: str,
        messages_history: List[Dict[str, Any]],
        tools_definitions,
        keep_tool_result: int = -1,
    ):
        """
        Send one request to the Anthropic Messages API.

        :param system_prompt: System prompt string (sent with ephemeral cache control).
        :param messages_history: Message history list; returned unchanged.
        :param tools_definitions: Not used by this implementation.
        :param keep_tool_result: Forwarded to ``_remove_tool_result_from_messages``.
        :return: Tuple ``(response, messages_history)``.
        :raises Exception: Any error from the API call is logged and re-raised
            (the ``retry`` decorator then decides whether to try again).
        """
        call_mode = "async" if self.async_client else "sync"
        self.task_log.log_step(
            "info",
            "LLM | Call Start",
            f"Calling LLM ({call_mode})",
        )

        # Only the copy sent to the LLM is filtered (to save tokens);
        # the caller's messages_history is what gets returned for logging.
        trimmed_messages = self._remove_tool_result_from_messages(
            messages_history, keep_tool_result
        )
        request_messages = self._apply_cache_control(trimmed_messages)

        try:
            # Note: Anthropic API does not support repetition_penalty parameter
            request_kwargs = {
                "model": self.model_name,
                "temperature": self.temperature,
                "top_p": self.top_p if self.top_p != 1.0 else NOT_GIVEN,
                "top_k": self.top_k if self.top_k != -1 else NOT_GIVEN,
                "max_tokens": self.max_tokens,
                "system": [
                    {
                        "type": "text",
                        "text": system_prompt,
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
                "messages": request_messages,
                "stream": False,
            }

            # Same request either way; only the async client has to be awaited.
            if self.async_client:
                response = await self.client.messages.create(**request_kwargs)
            else:
                response = self.client.messages.create(**request_kwargs)

            self._update_token_usage(getattr(response, "usage", None))
            stop_reason = getattr(response, "stop_reason", "N/A")
            self.task_log.log_step(
                "info",
                "LLM | Call Status",
                f"LLM call status: {stop_reason}",
            )
            # Hand back the unfiltered history so logs keep the full conversation
            return response, messages_history
        except asyncio.CancelledError:
            self.task_log.log_step(
                "warning",
                "LLM | Call Cancelled",
                "⚠️ LLM API call was cancelled during execution",
            )
            raise  # Re-raise to allow decorator to log it
        except Exception as e:
            self.task_log.log_step(
                "error", "LLM | Call Failed", f"Anthropic LLM call failed: {e}"
            )
            raise

    def process_llm_response(
        self, llm_response: Any, message_history: List[Dict], agent_type: str = "main"
    ) -> tuple[str, bool, List[Dict]]:
        """Process LLM response"""
        if not llm_response:
            self.task_log.log_step(
                "error",
                "LLM | Response Processing",
                "❌ LLM call failed, skipping this response.",
            )
            return "", True, message_history

        if not hasattr(llm_response, "content") or not llm_response.content:
            self.task_log.log_step(
                "error",
                "LLM | Response Processing",
                "❌ LLM response is empty or contains no content.",
            )
            return "", True, message_history

        # Extract response content
        assistant_response_text = ""
        assistant_response_content = []

        from ...utils.parsing_utils import fix_server_name_in_text

        for block in llm_response.content:
            if block.type == "text":
                assistant_response_text += block.text + "\n"
                assistant_response_content.append({"type": "text", "text": block.text})
            elif block.type == "tool_use":
                assistant_response_content.append(
                    {
                        "type": "tool_use",
                        "id": block.id,
                        "name": block.name,
                        "input": block.input,
                    }
                )

        # Fix server_name in text content
        assistant_response_text = fix_server_name_in_text(assistant_response_text)
        for item in assistant_response_content:
            if item.get("type") == "text":
                item["text"] = fix_server_name_in_text(item["text"])

        # Add assistant response to history
        message_history.append(
            {"role": "assistant", "content": assistant_response_content}
        )

        self.task_log.log_step(
            "info", "LLM | Response", f"LLM Response: {assistant_response_text}"
        )

        return assistant_response_text, False, message_history

    def extract_tool_calls_info(
        self, llm_response: Any, assistant_response_text: str
    ) -> List[Dict]:
        """Parse tool calls out of the assistant text.

        Only ``assistant_response_text`` is inspected; ``llm_response`` is
        accepted but not used by this implementation.
        """
        from ...utils.parsing_utils import parse_llm_response_for_tool_calls

        tool_calls = parse_llm_response_for_tool_calls(assistant_response_text)
        return tool_calls

    def update_message_history(
        self, message_history: List[Dict], all_tool_results_content_with_id: List[Tuple]
    ) -> List[Dict]:
        """Append the tool results to the history as a single user message.

        The second element of every entry is a content dict; the ``text`` of
        those whose ``type`` is ``"text"`` are joined with newlines into one
        text block. The history list is modified in place and returned.
        """
        text_parts = []
        for entry in all_tool_results_content_with_id:
            payload = entry[1]
            if payload["type"] == "text":
                text_parts.append(payload["text"])

        user_turn = {
            "role": "user",
            "content": [{"type": "text", "text": "\n".join(text_parts)}],
        }
        message_history.append(user_turn)

        return message_history

    def generate_agent_system_prompt(self, date: Any, mcp_servers: List[Dict]) -> str:
        """Generate the MCP system prompt and hand it to ``set_tool_server_mapping``.

        Returns the prompt produced by ``generate_mcp_system_prompt``.
        """
        from ...utils.parsing_utils import set_tool_server_mapping

        system_prompt = generate_mcp_system_prompt(date, mcp_servers)
        # The same prompt text is registered with the parsing utilities
        set_tool_server_mapping(system_prompt)
        return system_prompt

    def _estimate_tokens(self, text: str) -> int:
        """Use tiktoken to estimate the number of tokens in text"""
        if not hasattr(self, "encoding"):
            # Initialize tiktoken encoder
            try:
                self.encoding = tiktoken.get_encoding("o200k_base")
            except Exception:
                # If o200k_base is not available, use cl100k_base as fallback
                self.encoding = tiktoken.get_encoding("cl100k_base")

        try:
            return len(self.encoding.encode(text))
        except Exception as e:
            # If encoding fails, use simple estimation: approximately 1 token per 4 characters
            self.task_log.log_step(
                "error",
                "LLM | Token Estimation Error",
                f"Error: {str(e)}",
            )
            return len(text) // 4

    def ensure_summary_context(
        self, message_history: list, summary_prompt: str
    ) -> tuple[bool, list]:
        """
        Check if current message_history + summary_prompt will exceed context
        If it will exceed, remove the last assistant-user pair and return False
        Return True to continue, False if messages have been rolled back
        """
        # Get token usage from the last LLM call
        last_input_tokens = self.last_call_tokens.get("input_tokens", 0)
        last_output_tokens = self.last_call_tokens.get("output_tokens", 0)
        buffer_factor = 1.5

        # Calculate token count for summary prompt
        summary_tokens = int(self._estimate_tokens(str(summary_prompt)) * buffer_factor)

        # Calculate token count for the last user message in message_history
        last_user_tokens = 0
        if message_history[-1]["role"] == "user":
            content = message_history[-1]["content"]
            last_user_tokens = int(self._estimate_tokens(str(content)) * buffer_factor)

        # Calculate total token count: last input + output + last user message + summary + reserved response space
        estimated_total = (
            last_input_tokens
            + last_output_tokens
            + last_user_tokens
            + summary_tokens
            + self.max_tokens
            + 1000  # Add 1000 tokens as buffer
        )

        if estimated_total >= self.max_context_length:
            self.task_log.log_step(
                "info",
                "LLM | Context Limit Reached",
                "Context limit reached, proceeding to step back and summarize the conversation",
            )

            # Remove the last user message (tool call results)
            if message_history[-1]["role"] == "user":
                message_history.pop()

            # Remove the second-to-last assistant message (tool call request)
            if message_history[-1]["role"] == "assistant":
                message_history.pop()

            self.task_log.log_step(
                "info",
                "LLM | Context Limit Reached",
                f"Removed the last assistant-user pair, current message_history length: {len(message_history)}",
            )

            return False, message_history

        self.task_log.log_step(
            "info",
            "LLM | Context Limit Not Reached",
            f"{estimated_total}/{self.max_context_length}",
        )
        return True, message_history

    def format_token_usage_summary(self) -> tuple[List[str], str]:
        """Format token usage statistics, return summary_lines for format_final_summary and log string"""
        token_usage = self.get_token_usage()

        total_input = token_usage.get("total_input_tokens", 0)
        total_output = token_usage.get("total_output_tokens", 0)
        total_cache_creation = token_usage.get("total_cache_write_input_tokens", 0)
        total_cache_read = token_usage.get("total_cache_read_input_tokens", 0)

        summary_lines = []
        summary_lines.append("\n" + "-" * 20 + " Token Usage " + "-" * 20)
        summary_lines.append(f"Total Input Tokens (non-cache): {total_input}")
        summary_lines.append(
            f"Total Cache Creation Input Tokens: {total_cache_creation}"
        )
        summary_lines.append(f"Total Cache Read Input Tokens: {total_cache_read}")
        summary_lines.append(f"Total Output Tokens: {total_output}")
        summary_lines.append("-" * (40 + len(" Token Usage ")))
        summary_lines.append("Pricing is disabled - no cost information available")
        summary_lines.append("-" * (40 + len(" Token Usage ")))

        # Generate log string
        log_string = (
            f"[{self.model_name}] Total Input: {total_input}, "
            f"Cache Creation: {total_cache_creation}, "
            f"Cache Read: {total_cache_read}, "
            f"Output: {total_output}"
        )

        return summary_lines, log_string

    def get_token_usage(self):
        return self.token_usage.copy()

    def _apply_cache_control(self, messages: List[Dict]) -> List[Dict]:
        """Apply cache control to the last user message and system message (if applicable)"""
        cached_messages = []
        user_turns_processed = 0
        for turn in reversed(messages):
            if turn["role"] == "user" and user_turns_processed < 1:
                # Add ephemeral cache control to the text part of the last user message
                new_content = []
                processed_text = False
                # Check if content is a list
                if isinstance(turn["content"], str):
                    turn["content"] = [{"type": "text", "text": turn["content"]}]
                if isinstance(turn.get("content"), list):
                    # see example here
                    # https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching
                    for item in turn["content"]:
                        if (
                            item.get("type") == "text"
                            and len(item.get("text")) > 0
                            and not processed_text
                        ):
                            # Copy and add cache control
                            text_item = item.copy()
                            text_item["cache_control"] = {"type": "ephemeral"}
                            new_content.append(text_item)
                            processed_text = True
                        else:
                            # Other types of content (like image) copied directly
                            new_content.append(item.copy())
                    cached_messages.append({"role": "user", "content": new_content})
                else:
                    # If content is not a list (e.g., plain text), add as is without cache control
                    # Or adjust logic as needed
                    self.task_log.log_step(
                        "warning",
                        "LLM | Cache Control",
                        "Warning: User message content is not in expected list format, cache control not applied.",
                    )
                    cached_messages.append(turn)

                user_turns_processed += 1
            else:
                # Add other messages directly
                cached_messages.append(turn)
        return list(reversed(cached_messages))


================================================
FILE: apps/miroflow-agent/src/llm/providers/openai_client.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
OpenAI-compatible LLM client implementation.

This module provides the OpenAIClient class for interacting with OpenAI's API
and OpenAI-compatible endpoints (such as vLLM, Qwen, DeepSeek, etc.).

Features:
- Async and sync API support
- Automatic retry with a fixed wait between attempts
- Token usage tracking and context length management
- MCP tool call parsing and response processing
"""

import asyncio
import dataclasses
import logging
from typing import Any, Dict, List, Tuple, Union

import tiktoken
from openai import AsyncOpenAI, DefaultAsyncHttpxClient, DefaultHttpxClient, OpenAI

from ...utils.prompt_utils import generate_mcp_system_prompt
from ..base_client import BaseClient

logger = logging.getLogger("miroflow_agent")


@dataclasses.dataclass
class OpenAIClient(BaseClient):
    def _create_client(self) -> Union[AsyncOpenAI, OpenAI]:
        """Build the OpenAI SDK client, async or sync depending on ``self.async_client``.

        The task id is passed to the underlying HTTP client as the
        ``x-upstream-session-id`` header.
        """
        session_headers = {"x-upstream-session-id": self.task_id}

        # Pick the matching SDK client / HTTP client pair, then construct once.
        if self.async_client:
            sdk_client_cls, http_client_cls = AsyncOpenAI, DefaultAsyncHttpxClient
        else:
            sdk_client_cls, http_client_cls = OpenAI, DefaultHttpxClient

        return sdk_client_cls(
            api_key=self.api_key,
            base_url=self.base_url,
            http_client=http_client_cls(headers=session_headers),
        )

    def _update_token_usage(self, usage_data: Any) -> None:
        """Update cumulative token usage"""
        if usage_data:
            input_tokens = getattr(usage_data, "prompt_tokens", 0)
            output_tokens = getattr(usage_data, "completion_tokens", 0)
            prompt_tokens_details = getattr(usage_data, "prompt_tokens_details", None)
            if prompt_tokens_details:
                cached_tokens = (
                    getattr(prompt_tokens_details, "cached_tokens", None) or 0
                )
            else:
                cached_tokens = 0

            # Record token usage for the most recent call
            self.last_call_tokens = {
                "prompt_tokens": input_tokens,
                "completion_tokens": output_tokens,
            }

            # OpenAI does not provide cache_creation_input_tokens
            self.token_usage["total_input_tokens"] += input_tokens
            self.token_usage["total_output_tokens"] += output_tokens
            self.token_usage["total_cache_read_input_tokens"] += cached_tokens

            self.task_log.log_step(
                "info",
                "LLM | Token Usage",
                f"Input: {self.token_usage['total_input_tokens']}, "
                f"Output: {self.token_usage['total_output_tokens']}",
            )

    async def _create_message(
        self,
        system_prompt: str,
        messages_history: List[Dict[str, Any]],
        tools_definitions,
        keep_tool_result: int = -1,
    ):
        """
        Send the conversation to the chat completions endpoint, with retries.

        Up to 10 attempts are made, sleeping 30 seconds between attempts. A
        retry is triggered when the response is truncated (finish_reason
        "length", max tokens raised by 10% each time), when the response ends
        in a heavily repeated 50-character tail, or when the call raises an
        error other than a context-length error or a cancellation.

        :param system_prompt: System prompt string; sent as the first message.
        :param messages_history: Message history list; never modified here.
        :param tools_definitions: Not used by this implementation.
        :param keep_tool_result: Forwarded to ``_remove_tool_result_from_messages``.
        :return: Tuple ``(response, messages_history)`` with the original,
            unfiltered history. After the last attempt a truncated or
            repetitive response is returned as-is.
        :raises Exception: Context-length errors (HTTP 400, "longer than the
            model") and cancellations immediately; any other error once all
            attempts are used up.
        """

        # Create a copy for sending to LLM (to avoid modifying the original)
        messages_for_llm = [m.copy() for m in messages_history]

        # The system prompt is sent as the first message (role "system"):
        # it replaces a leading system/developer message, or is inserted in front.
        if system_prompt:
            # Check if there's already a system or developer message
            if messages_for_llm and messages_for_llm[0]["role"] in [
                "system",
                "developer",
            ]:
                messages_for_llm[0] = {
                    "role": "system",
                    "content": system_prompt,
                }

            else:
                messages_for_llm.insert(
                    0,
                    {
                        "role": "system",
                        "content": system_prompt,
                    },
                )

        # Filter tool results to save tokens (only affects messages sent to LLM)
        messages_for_llm = self._remove_tool_result_from_messages(
            messages_for_llm, keep_tool_result
        )

        # Retry loop with dynamic max_tokens adjustment
        max_retries = 10
        base_wait_time = 30  # seconds slept before every retry (fixed, not growing)
        current_max_tokens = self.max_tokens

        for attempt in range(max_retries):
            # Parameters are rebuilt on every attempt so that a raised
            # current_max_tokens takes effect.
            params = {
                "model": self.model_name,
                "temperature": self.temperature,
                "messages": messages_for_llm,
                "stream": False,
                "top_p": self.top_p,
                "extra_body": {},
            }
            # Check if the model is GPT-5, and adjust the parameter accordingly
            if "gpt-5" in self.model_name:
                # Use 'max_completion_tokens' for GPT-5
                params["max_completion_tokens"] = current_max_tokens
            else:
                # Use 'max_tokens' for GPT-4 and other models
                params["max_tokens"] = current_max_tokens

            # Add repetition_penalty if it's not the default value
            if self.repetition_penalty != 1.0:
                params["extra_body"]["repetition_penalty"] = self.repetition_penalty

            # Models whose name contains "deepseek-v3-1" get thinking enabled
            if "deepseek-v3-1" in self.model_name:
                params["extra_body"]["thinking"] = {"type": "enabled"}

            # auto-detect if we need to continue from the last assistant message
            if messages_for_llm and messages_for_llm[-1].get("role") == "assistant":
                params["extra_body"]["continue_final_message"] = True
                params["extra_body"]["add_generation_prompt"] = False

            try:
                if self.async_client:
                    response = await self.client.chat.completions.create(**params)
                else:
                    response = self.client.chat.completions.create(**params)
                # Update token count (done for every attempt, including retried ones)
                self._update_token_usage(getattr(response, "usage", None))
                self.task_log.log_step(
                    "info",
                    "LLM | Response Status",
                    f"{getattr(response.choices[0], 'finish_reason', 'N/A')}",
                )

                # Check if response was truncated due to length limit
                finish_reason = getattr(response.choices[0], "finish_reason", None)
                if finish_reason == "length":
                    # If this is not the last retry, increase max_tokens and retry
                    if attempt < max_retries - 1:
                        # Increase max_tokens by 10%
                        current_max_tokens = int(current_max_tokens * 1.1)
                        self.task_log.log_step(
                            "warning",
                            "LLM | Length Limit Reached",
                            f"Response was truncated due to length limit (attempt {attempt + 1}/{max_retries}). Increasing max_tokens to {current_max_tokens} and retrying...",
                        )
                        await asyncio.sleep(base_wait_time)
                        continue
                    else:
                        # Last retry, return the truncated response instead of raising exception
                        self.task_log.log_step(
                            "warning",
                            "LLM | Length Limit Reached - Returning Truncated Response",
                            f"Response was truncated after {max_retries} attempts. Returning truncated response to allow ReAct loop to continue.",
                        )
                        # Return the truncated response and let the orchestrator handle it
                        return response, messages_history

                # Check if the last 50 characters of the response appear more than 5 times in the response content.
                # If so, treat it as a severe repeat and trigger a retry.
                if hasattr(response.choices[0], "message") and hasattr(
                    response.choices[0].message, "content"
                ):
                    resp_content = response.choices[0].message.content or ""
                else:
                    # No message.content on the choice: fall back to its "text" attribute
                    resp_content = getattr(response.choices[0], "text", "")

                if resp_content and len(resp_content) >= 50:
                    tail_50 = resp_content[-50:]
                    repeat_count = resp_content.count(tail_50)
                    if repeat_count > 5:
                        # If this is not the last retry, retry
                        if attempt < max_retries - 1:
                            self.task_log.log_step(
                                "warning",
                                "LLM | Repeat Detected",
                                f"Severe repeat: the last 50 chars appeared over 5 times (attempt {attempt + 1}/{max_retries}), retrying...",
                            )
                            await asyncio.sleep(base_wait_time)
                            continue
                        else:
                            # Last retry, return anyway
                            self.task_log.log_step(
                                "warning",
                                "LLM | Repeat Detected - Returning Anyway",
                                f"Severe repeat detected after {max_retries} attempts. Returning response anyway.",
                            )

                # Success - return the original messages_history (not the filtered copy)
                # This ensures that the complete conversation history is preserved in logs
                return response, messages_history

            except asyncio.TimeoutError as e:
                # Timeouts are retried until the attempts run out, then re-raised
                if attempt < max_retries - 1:
                    self.task_log.log_step(
                        "warning",
                        "LLM | Timeout Error",
                        f"Timeout error (attempt {attempt + 1}/{max_retries}): {str(e)}, retrying...",
                    )
                    await asyncio.sleep(base_wait_time)
                    continue
                else:
                    self.task_log.log_step(
                        "error",
                        "LLM | Timeout Error",
                        f"Timeout error after {max_retries} attempts: {str(e)}",
                    )
                    raise e
            except asyncio.CancelledError as e:
                # Cancellation is never retried
                self.task_log.log_step(
                    "error",
                    "LLM | Request Cancelled",
                    f"Request was cancelled: {str(e)}",
                )
                raise e
            except Exception as e:
                # A context-length error is recognised by its message text and
                # raised immediately, without further attempts.
                if "Error code: 400" in str(e) and "longer than the model" in str(e):
                    self.task_log.log_step(
                        "error",
                        "LLM | Context Length Error",
                        f"Error: {str(e)}",
                    )
                    raise e
                else:
                    if attempt < max_retries - 1:
                        self.task_log.log_step(
                            "warning",
                            "LLM | API Error",
                            f"Error (attempt {attempt + 1}/{max_retries}): {str(e)}, retrying...",
                        )
                        await asyncio.sleep(base_wait_time)
                        continue
                    else:
                        self.task_log.log_step(
                            "error",
                            "LLM | API Error",
                            f"Error after {max_retries} attempts: {str(e)}",
                        )
                        raise e

        # Should never reach here, but just in case
        raise Exception("Unexpected error: retry loop completed without returning")

    def process_llm_response(
        self, llm_response: Any, message_history: List[Dict], agent_type: str = "main"
    ) -> tuple[str, bool, List[Dict]]:
        """
        Convert a raw LLM response into assistant text and record it in the history.

        Args:
            llm_response: Response object exposing ``choices[0].finish_reason``
                and ``choices[0].message.content``.
            message_history: Conversation history; the assistant message is
                appended to this list in place.
            agent_type: Accepted for interface compatibility; not used here.

        Returns:
            ``(assistant_text, should_exit, message_history)``. ``should_exit``
            is True when the response is missing or has no choices, or when a
            length-truncated reply contains "Context length exceeded".

        Raises:
            ValueError: If the finish reason is neither "stop" nor "length".
        """
        # Guard: nothing usable came back from the model.
        if not llm_response or not llm_response.choices:
            problem = "LLM did not return a valid response."
            self.task_log.log_step(
                "error", "LLM | Response Error", f"Error: {problem}"
            )
            return "", True, message_history

        from ...utils.parsing_utils import fix_server_name_in_text

        first_choice = llm_response.choices[0]
        finish_reason = first_choice.finish_reason
        if finish_reason not in ("stop", "length"):
            raise ValueError(f"Unsupported finish reason: {finish_reason}")

        reply_text = fix_server_name_in_text(first_choice.message.content or "")

        # Only truncated ("length") replies get the empty-text placeholder and
        # the context-overflow check; "stop" replies are recorded as they are.
        must_exit = False
        if finish_reason == "length":
            if reply_text == "":
                reply_text = "LLM response is empty."
            elif "Context length exceeded" in reply_text:
                self.task_log.log_step(
                    "warning",
                    "LLM | Context Length",
                    "Detected context length exceeded, returning error status",
                )
                must_exit = True

        message_history.append({"role": "assistant", "content": reply_text})
        return reply_text, must_exit, message_history

    def extract_tool_calls_info(
        self, llm_response: Any, assistant_response_text: str
    ) -> List[Dict]:
        """
        Extract tool call information from the assistant response text.

        Parsing is delegated to ``parse_llm_response_for_tool_calls`` and is
        driven by ``assistant_response_text`` only; ``llm_response`` is accepted
        but not read by this implementation.
        """
        # Imported at call time rather than at module top.
        from ...utils.parsing_utils import parse_llm_response_for_tool_calls

        return parse_llm_response_for_tool_calls(assistant_response_text)

    def update_message_history(
        self, message_history: List[Dict], all_tool_results_content_with_id: List[Tuple]
    ) -> List[Dict]:
        """
        Append the tool results to the history as a single user message.

        Each entry's second element is a content dict; only those whose
        ``"type"`` is ``"text"`` contribute, and their ``"text"`` values are
        joined with newlines. ``message_history`` is mutated and returned.
        """
        text_chunks = []
        for entry in all_tool_results_content_with_id:
            payload = entry[1]
            if payload["type"] == "text":
                text_chunks.append(payload["text"])

        user_turn = {"role": "user", "content": "\n".join(text_chunks)}
        message_history.append(user_turn)
        return message_history

    def generate_agent_system_prompt(self, date: Any, mcp_servers: List[Dict]) -> str:
        """
        Build the system prompt and register its tool/server mapping.

        The prompt comes from ``generate_mcp_system_prompt(date, mcp_servers)``;
        it is then passed to ``set_tool_server_mapping`` before being returned.
        """
        # Imported at call time rather than at module top.
        from ...utils.parsing_utils import set_tool_server_mapping

        prompt = generate_mcp_system_prompt(date, mcp_servers)
        set_tool_server_mapping(prompt)
        return prompt

    def _estimate_tokens(self, text: str) -> int:
        """
        Estimate how many tokens ``text`` occupies.

        The tiktoken encoder is created on first use and stored on
        ``self.encoding`` ("o200k_base", falling back to "cl100k_base"). If
        encoding fails, the error is logged and ``len(text) // 4`` is returned.
        """
        if not hasattr(self, "encoding"):
            try:
                encoder = tiktoken.get_encoding("o200k_base")
            except Exception:
                # o200k_base unavailable: use cl100k_base instead
                encoder = tiktoken.get_encoding("cl100k_base")
            self.encoding = encoder

        try:
            token_count = len(self.encoding.encode(text))
        except Exception as exc:
            # Rough fallback: about one token per four characters
            self.task_log.log_step(
                "error",
                "LLM | Token Estimation Error",
                f"Error: {str(exc)}",
            )
            return len(text) // 4
        return token_count

    def ensure_summary_context(
        self, message_history: list, summary_prompt: str
    ) -> tuple[bool, list]:
        """
        Check whether ``message_history`` plus ``summary_prompt`` still fits in context.

        The estimate adds the prompt and completion token counts of the last
        LLM call, the trailing user message and the summary prompt (both scaled
        by a 1.5 safety factor), ``self.max_tokens`` reserved for the reply and
        a fixed 1000-token buffer.

        Args:
            message_history: Conversation history; when the limit is reached
                the trailing user message and then the trailing assistant
                message are popped from this list in place.
            summary_prompt: Prompt that will be appended to request the summary.

        Returns:
            ``(True, message_history)`` when the estimate is below
            ``self.max_context_length``; ``(False, message_history)`` after the
            last assistant-user pair has been rolled back.
        """
        # Get token usage from the last LLM call
        last_prompt_tokens = self.last_call_tokens.get("prompt_tokens", 0)
        last_completion_tokens = self.last_call_tokens.get("completion_tokens", 0)
        buffer_factor = 1.5

        # Calculate token count for summary prompt
        summary_tokens = int(self._estimate_tokens(summary_prompt) * buffer_factor)

        # Calculate token count for the last user message in message_history.
        # The emptiness guard avoids an IndexError on an empty history.
        last_user_tokens = 0
        if message_history and message_history[-1]["role"] == "user":
            content = message_history[-1]["content"]
            last_user_tokens = int(self._estimate_tokens(str(content)) * buffer_factor)

        # Total: last prompt + completion + last user message + summary
        # + reserved response space + fixed buffer
        estimated_total = (
            last_prompt_tokens
            + last_completion_tokens
            + last_user_tokens
            + summary_tokens
            + self.max_tokens
            + 1000  # Add 1000 tokens as buffer
        )

        if estimated_total >= self.max_context_length:
            self.task_log.log_step(
                "info",
                "LLM | Context Limit Reached",
                "Context limit reached, proceeding to step back and summarize the conversation",
            )

            # Remove the last user message (tool call results)
            if message_history and message_history[-1]["role"] == "user":
                message_history.pop()

            # Remove the preceding assistant message (tool call request). The
            # history may have become empty after the pop above, so re-check
            # before indexing.
            if message_history and message_history[-1]["role"] == "assistant":
                message_history.pop()

            self.task_log.log_step(
                "info",
                "LLM | Context Limit Reached",
                f"Removed the last assistant-user pair, current message_history length: {len(message_history)}",
            )

            return False, message_history

        self.task_log.log_step(
            "info",
            "LLM | Context Limit Not Reached",
            f"{estimated_total}/{self.max_context_length}",
        )
        return True, message_history

    def format_token_usage_summary(self) -> tuple[List[str], str]:
        """
        Render the accumulated token usage for reports and logs.

        Returns:
            ``(summary_lines, log_string)``: the lines of a banner-framed usage
            block, and a one-line digest prefixed with ``self.model_name``.
        """
        usage = self.get_token_usage()
        input_total = usage.get("total_input_tokens", 0)
        output_total = usage.get("total_output_tokens", 0)
        cached_input = usage.get("total_cache_input_tokens", 0)

        title = " Token Usage "
        side = "-" * 20
        rule = "-" * (40 + len(title))

        summary_lines = [
            "\n" + side + title + side,
            f"Total Input Tokens: {input_total}",
            f"Total Cache Input Tokens: {cached_input}",
            f"Total Output Tokens: {output_total}",
            rule,
            "Pricing is disabled - no cost information available",
            rule,
        ]

        # One-line form for the log
        log_string = (
            f"[{self.model_name}] Total Input: {input_total}, "
            f"Cache Input: {cached_input}, "
            f"Output: {output_total}"
        )
        return summary_lines, log_string

    def get_token_usage(self):
        """Return a copy of ``self.token_usage`` (made with its ``copy()`` method)."""
        return self.token_usage.copy()


================================================
FILE: apps/miroflow-agent/src/llm/util.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
Utility decorators and helpers for LLM client operations.

This module provides:
- Timeout decorator for async LLM API calls
- Other common utilities shared across LLM providers
"""

import asyncio
import functools
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")


def with_timeout(
    timeout_s: float = 300.0,
) -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:
    """
    Build a decorator that bounds an *async* function with asyncio.wait_for().

    Args:
        timeout_s: Timeout in seconds passed to ``asyncio.wait_for``.

    Usage:
        @with_timeout(20)
        async def create_message_foo(...): ...
    """

    def decorator(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
        @functools.wraps(func)
        async def timed_call(*args, **kwargs) -> T:
            pending = func(*args, **kwargs)
            return await asyncio.wait_for(pending, timeout=timeout_s)

        return timed_call

    return decorator


================================================
FILE: apps/miroflow-agent/src/logging/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""Logging module for task execution tracking."""

from .task_logger import (
    LLMCallLog,
    StepLog,
    TaskLog,
    ToolCallLog,
    bootstrap_logger,
    get_utc_plus_8_time,
)

# Public API of this package: exactly the names imported from .task_logger above.
__all__ = [
    "TaskLog",
    "StepLog",
    "LLMCallLog",
    "ToolCallLog",
    "bootstrap_logger",
    "get_utc_plus_8_time",
]


================================================
FILE: apps/miroflow-agent/src/logging/summary_time_cost.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import json
from collections import defaultdict
from pathlib import Path

from .task_logger import logger


def _get_summary_template():
    """Build a fresh, zeroed summary block whose breakdowns are float defaultdicts."""
    per_agent = {
        agent_name: defaultdict(float)
        for agent_name in ("main_agent", "browsing_agent")
    }
    template = {"total_tasks": 0, "total_wall_time": 0.0}
    template["primary_breakdown"] = per_agent
    template["cross_cutting_breakdown"] = defaultdict(float)
    template["tool_workload_breakdown"] = defaultdict(float)
    return template


def _update_summary_data(summary_block, perf_summary, tool_workload):
    """
    Fold one task's timing data into ``summary_block`` in place.

    Increments the task count and wall time, then adds the per-agent,
    cross-cutting and tool-workload values onto the running totals. Agents
    that are not already present in the block's primary breakdown are ignored.
    """
    summary_block["total_tasks"] += 1
    summary_block["total_wall_time"] += perf_summary.get("total_wall_time", 0.0)

    # Per-agent totals
    for agent_name, timings in perf_summary.get("primary_breakdown", {}).items():
        if agent_name not in summary_block["primary_breakdown"]:
            continue
        for metric, amount in timings.items():
            summary_block["primary_breakdown"][agent_name][metric] += amount

    # Cross-cutting totals
    for metric, amount in perf_summary.get("cross_cutting_breakdown", {}).items():
        summary_block["cross_cutting_breakdown"][metric] += amount

    # Tool workload totals
    for tool, amount in tool_workload.items():
        summary_block["tool_workload_breakdown"][tool] += amount


def _calculate_averages(summary_block):
    """
    Add per-task averages to ``summary_block`` in place.

    Does nothing when the block holds no tasks. Otherwise sets
    ``average_wall_time`` and replaces every breakdown mapping with a plain
    dict that holds the totals plus an ``avg_<key>`` entry for each total.
    """
    task_count = summary_block["total_tasks"]
    if task_count == 0:
        return

    def with_averages(totals):
        # Plain-dict copy of the totals, extended with avg_<key> entries.
        combined = dict(totals)
        combined.update(
            {f"avg_{name}": total / task_count for name, total in totals.items()}
        )
        return combined

    summary_block["average_wall_time"] = summary_block["total_wall_time"] / task_count

    # Per-agent breakdowns
    per_agent = summary_block["primary_breakdown"]
    for agent_name in list(per_agent):
        per_agent[agent_name] = with_averages(per_agent[agent_name])

    # Cross-cutting first, then tool workload
    for section in ("cross_cutting_breakdown", "tool_workload_breakdown"):
        summary_block[section] = with_averages(summary_block[section])


def generate_summary(log_dir: Path):
    """
    Generates a summary of benchmark results by reading log files from a directory,
    calculating total and average trace data, both overall and grouped by
    final_judge_result.

    Every ``*.json`` file in ``log_dir`` is read except summary artifacts
    (``summary.json`` and this function's own ``summary_time_cost.json``).
    Unreadable files, files whose top-level JSON value is not an object, and
    results without ``trace_data["performance_summary"]`` are skipped.

    Args:
        log_dir: The directory where the individual result log files are and where
                 the summary file will be saved.
    """
    # Resolve the logger at call time. A module-level
    # `from .task_logger import logger` copies whatever value the name holds
    # when this module is imported, which is None until bootstrap_logger() has
    # run, so the error paths below could otherwise fail on `None.info(...)`.
    from .task_logger import bootstrap_logger

    log = bootstrap_logger()

    summary_file = log_dir / "summary_time_cost.json"
    # Never ingest summary artifacts, including our own output from an earlier run.
    skipped_names = {"summary.json", summary_file.name}

    results = []
    for log_file in log_dir.glob("*.json"):
        if log_file.name in skipped_names:
            continue
        try:
            with open(log_file, "r", encoding="utf-8") as f:
                results.append(json.load(f))
        except json.JSONDecodeError:
            log.info(f"Warning: Could not decode JSON from {log_file}. Skipping.")
        except Exception as e:
            log.info(f"Warning: Could not read file {log_file}: {e}. Skipping.")

    overall_summary = _get_summary_template()
    summary_by_judge = defaultdict(_get_summary_template)

    for result in results:
        # A JSON file whose top level is not an object cannot be a task log.
        if not isinstance(result, dict):
            continue

        trace_data = result.get("trace_data")
        if not trace_data or "performance_summary" not in trace_data:
            continue

        perf_summary = trace_data["performance_summary"]
        tool_workload = trace_data.get("tool_workload_breakdown", {})

        # Update overall summary
        _update_summary_data(overall_summary, perf_summary, tool_workload)

        # Update summary by judge result
        judge_result = result.get("final_judge_result", "unknown")
        _update_summary_data(
            summary_by_judge[judge_result], perf_summary, tool_workload
        )

    # Calculate averages for all summary blocks
    _calculate_averages(overall_summary)
    for judge_result in summary_by_judge:
        _calculate_averages(summary_by_judge[judge_result])

    summary_data = {
        "overall_summary": overall_summary,
        "summary_by_final_judge_result": dict(summary_by_judge),
    }

    with open(summary_file, "w", encoding="utf-8") as f:
        json.dump(summary_data, f, indent=4, ensure_ascii=False)


================================================
FILE: apps/miroflow-agent/src/logging/task_logger.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
Task logging and structured output module.

This module provides:
- TaskLog: Main dataclass for tracking task execution state and history
- StepLog: Individual step logging with timestamps and metadata
- ColoredFormatter: Console output formatting with color-coded log levels
- Utility functions for time handling and logger configuration

All logs are persisted to JSON files for later analysis and debugging.
"""

import json
import logging
import os
from dataclasses import asdict, dataclass, field
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional

# Import colorama for cross-platform colored output
from colorama import Fore, Style, init

# Initialize colorama once, when this module is imported
init(autoreset=True, strip=False)

# Module-level logger handle: None until bootstrap_logger() assigns the
# configured "miroflow_agent" logger to it
logger = None


def get_color_for_level(level: str) -> str:
    """Return the bright colorama prefix for a log level name (white if unrecognized)."""
    level_colors = (
        ("ERROR", Fore.RED),
        ("WARNING", Fore.YELLOW),
        ("INFO", Fore.GREEN),
        ("DEBUG", Fore.CYAN),
    )
    for level_name, color in level_colors:
        if level == level_name:
            return f"{color}{Style.BRIGHT}"
    return f"{Fore.WHITE}{Style.BRIGHT}"


class ColoredFormatter(logging.Formatter):
    """Formatter that colors the logger name and level name of each record."""

    def format(self, record):
        """Render ``[timestamp][name][LEVEL] - message`` with colored name and level."""
        stamp = self.formatTime(record, self.datefmt)

        # Level color depends on severity; the logger name is always bright blue
        level_prefix = get_color_for_level(record.levelname)
        name_prefix = f"{Fore.BLUE}{Style.BRIGHT}"
        reset = Style.RESET_ALL

        # The message is used as-is (icons are already added in log_step)
        text = record.getMessage()

        name_part = f"{name_prefix}{record.name}{reset}"
        level_part = f"{level_prefix}{record.levelname}{reset}"
        return f"[{stamp}][{name_part}][{level_part}] - {text}"


def bootstrap_logger() -> logging.Logger:
    """
    Return the "miroflow_agent" logger, configuring it on first use.

    If the logger has no handlers yet, a stream handler with ColoredFormatter
    is attached, the level is set to DEBUG and propagation is disabled. In
    every case the module-level ``logger`` is pointed at the returned logger.
    """
    global logger

    agent_logger = logging.getLogger("miroflow_agent")

    # Existing handlers mean the logger is already configured; skip setup so
    # handlers are not duplicated
    if not agent_logger.handlers:
        console = logging.StreamHandler()
        console.setFormatter(
            ColoredFormatter(
                "%(asctime)s,%(msecs)03d",
                datefmt="%Y-%m-%d %H:%M:%S",
            )
        )
        agent_logger.addHandler(console)
        agent_logger.setLevel(logging.DEBUG)

        # No propagation, so the root logger does not emit the records again
        agent_logger.propagate = False

    logger = agent_logger
    return agent_logger


def get_utc_plus_8_time() -> str:
    """Return the current time at a fixed UTC+8 offset as ``YYYY-MM-DD HH:MM:SS``."""
    now = datetime.now(tz=timezone(timedelta(hours=8)))
    return f"{now:%Y-%m-%d %H:%M:%S}"


@dataclass
class LLMCallLog:
    """
    Record technical details of LLM calls.

    ``provider`` and ``model`` are required; every other field has a default.
    """

    provider: str
    model: str
    # Token counters, all defaulting to 0
    input_tokens: int = 0
    output_tokens: int = 0
    cache_creation_tokens: int = 0
    cache_read_tokens: int = 0
    # Error text; None by default (presumably meaning the call succeeded, confirm with callers)
    error: Optional[str] = None


@dataclass
class ToolCallLog:
    """
    Record detailed information of tool calls.

    ``server_name`` and ``tool_name`` are required; every other field has a default.
    """

    server_name: str
    tool_name: str
    # Each instance gets its own empty dict by default
    arguments: Dict[str, Any] = field(default_factory=dict)
    result: Any = None
    # Error text; None by default
    error: Optional[str] = None
    # Time of the call as a string; the format is not fixed here (TODO confirm with callers)
    call_time: Optional[str] = None


@dataclass
class StepLog:
    """
    One recorded execution step: name, message, timestamp, level and metadata.

    ``info_level`` is checked on construction and must be one of
    "info", "warning", "error" or "debug".
    """

    step_name: str
    message: str
    timestamp: str
    info_level: Literal["info", "warning", "error", "debug"] = "info"
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Reject any info_level outside the supported set."""
        valid_levels = {"info", "warning", "error", "debug"}
        if self.info_level in valid_levels:
            return
        raise ValueError(
            f"info_level must be one of {valid_levels}, got '{self.info_level}'"
        )


@dataclass
class TaskLog:
    """
    Mutable record of a single task run.

    Holds the task status and timestamps, inputs and results, turn and
    sub-agent session counters, message histories, step logs and trace data,
    and provides helpers to log steps and to persist the record as JSON.
    """

    status: str = "running"
    start_time: str = ""
    end_time: str = ""

    task_id: str = ""
    input: Any = None
    ground_truth: str = ""
    final_boxed_answer: str = ""
    final_judge_result: str = ""
    judge_type: str = ""
    eval_details: Optional[Dict[str, Any]] = None  # For DeepSearchQA metrics
    error: str = ""

    # Main records: main agent conversation turns
    current_main_turn_id: int = 0
    current_sub_agent_turn_id: int = 0
    sub_agent_counter: int = 0
    current_sub_agent_session_id: Optional[str] = None

    env_info: Optional[dict] = field(default_factory=dict)
    log_dir: str = "logs"

    main_agent_message_history: List[Dict[str, Any]] = field(default_factory=list)
    sub_agent_message_history_sessions: Dict[str, List[Dict[str, Any]]] = field(
        default_factory=dict
    )

    step_logs: List[StepLog] = field(default_factory=list)
    trace_data: Dict[str, Any] = field(default_factory=dict)

    def start_sub_agent_session(
        self, sub_agent_name: str, subtask_description: str
    ) -> str:
        """
        Start a new sub-agent session.

        Increments ``sub_agent_counter``, stores ``"<sub_agent_name>_<counter>"``
        as the current session id, logs the start and returns the id.
        """
        self.sub_agent_counter += 1
        session_id = f"{sub_agent_name}_{self.sub_agent_counter}"
        self.current_sub_agent_session_id = session_id

        # Record sub-agent session start (the description is truncated to 100
        # characters in the message; the full text goes into the metadata)
        self.log_step(
            "info",
            f"{sub_agent_name} | Session Start",
            f"Starting {session_id} for subtask: {subtask_description[:100]}{'...' if len(subtask_description) > 100 else ''}",
            metadata={"session_id": session_id, "subtask": subtask_description},
        )

        return session_id

    def end_sub_agent_session(self, sub_agent_name: str) -> Optional[str]:
        """End the current sub-agent session: log it, clear the session id, return None."""
        self.log_step(
            "info",
            f"{sub_agent_name} | Session End",
            f"Ending {self.current_sub_agent_session_id}",
            metadata={"session_id": self.current_sub_agent_session_id},
        )
        self.current_sub_agent_session_id = None
        return None

    def log_step(
        self,
        info_level: Literal["info", "warning", "error", "debug"],
        step_name: str,
        message: str,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """
        Record an execution step and echo it to the console logger.

        An icon chosen from the step name is prefixed to it, a StepLog is
        appended to ``step_logs``, and ``"<step name>: <message>"`` is emitted
        at the matching log level.

        Raises:
            ValueError: From StepLog validation when ``info_level`` is unsupported.
        """
        # Add icons to step_name based on content (first matching rule wins)
        icon = ""
        if "Tool Call Start" in step_name:
            icon = "▶️ "
        elif "Tool Call Success" in step_name:
            icon = "✅ "
        elif "Tool Call Error" in step_name or (
            "error" in info_level and "tool" in step_name.lower()
        ):
            icon = "❌ "
        elif "agent-" in step_name:
            icon = "🤖 "
        elif "Main Agent" in step_name:
            icon = "👑 "
        elif "LLM" in step_name:
            icon = "🧠 "
        elif "ToolManager" in step_name or "Tool Call" in step_name:
            icon = "🔧 "
        elif "tool-python" in step_name.lower():
            icon = "🐍 "
        elif "tool-google-search" in step_name.lower():
            icon = "🔍 "
        elif "tool-browser" in step_name.lower() or "playwright" in step_name.lower():
            icon = "🌐 "

        # Add icon to step_name
        step_name_with_icon = f"{icon}{step_name}"

        step_log = StepLog(
            step_name=step_name_with_icon,
            message=message,
            timestamp=get_utc_plus_8_time(),
            info_level=info_level,
            metadata=metadata or {},
        )

        self.step_logs.append(step_log)

        # Print the structured log to console using the configured logger
        log_message = f"{step_name_with_icon}: {message}"

        # Ensure logger is configured
        global logger
        if logger is None:
            logger = bootstrap_logger()

        if info_level == "error":
            logger.error(log_message)
        elif info_level == "warning":
            logger.warning(log_message)
        elif info_level == "debug":
            logger.debug(log_message)
        else:  # info
            logger.info(log_message)

    def serialize_for_json(self, obj):
        """
        Convert objects to JSON-serializable format.

        Paths become strings; dicts, lists, tuples and sets are converted
        recursively (tuples and sets become lists); objects with a ``__dict__``
        are replaced by their converted attribute dict; anything else is
        returned unchanged.
        """
        if isinstance(obj, Path):
            return str(obj)
        elif isinstance(obj, dict):
            return {k: self.serialize_for_json(v) for k, v in obj.items()}
        elif isinstance(obj, (list, tuple, set, frozenset)):
            # JSON has only arrays; converting here also lets nested values
            # (e.g. a Path inside a tuple) be serialized.
            return [self.serialize_for_json(item) for item in obj]
        elif hasattr(obj, "__dict__"):
            return self.serialize_for_json(obj.__dict__)
        else:
            return obj

    def to_json(self) -> str:
        """
        Serialize the TaskLog to a JSON string.

        Converts the dataclass to a dictionary, handles non-JSON-serializable
        objects (like Path), and returns a formatted JSON string.

        Returns:
            A JSON string representation of the task log with 2-space indentation.
            The returned text can always be encoded as UTF-8.

        Note:
            Falls back to ASCII-escaped output if the text cannot be encoded
            as UTF-8 (for example when it contains lone surrogates).
        """
        # Convert to dict first
        data_dict = asdict(self)
        # Serialize any non-JSON-serializable objects
        serialized_dict = self.serialize_for_json(data_dict)
        try:
            text = json.dumps(serialized_dict, ensure_ascii=False, indent=2)
            # json.dumps returns a str and does not encode it, so check
            # encodability explicitly; this is what makes the fallback below
            # reachable.
            text.encode("utf-8")
            return text
        except UnicodeEncodeError as e:
            # Fallback: escape every non-ASCII character
            print(f"Warning: Unicode encoding failed, falling back to ASCII: {e}")
            return json.dumps(serialized_dict, ensure_ascii=True, indent=2)

    def save(self):
        """
        Save as a single JSON file.

        The file is ``<log_dir>/task_<task_id>_<start_time>.json``, where
        ":", "." and " " in ``start_time`` are replaced by "-". ``log_dir`` is
        created if needed.

        Returns:
            The path of the written file.
        """
        os.makedirs(self.log_dir, exist_ok=True)
        timestamp = (
            self.start_time.replace(":", "-").replace(".", "-").replace(" ", "-")
        )

        filename = f"{self.log_dir}/task_{self.task_id}_{timestamp}.json"
        # Serialize before opening the file so that a serialization error
        # cannot leave an empty or truncated log behind.
        payload = self.to_json()
        try:
            with open(filename, "w", encoding="utf-8") as f:
                f.write(payload)
        except UnicodeEncodeError as e:
            # Only reachable if to_json() is overridden to return text that is
            # not UTF-8 encodable; ASCII-escaped JSON can always be written.
            print(f"Warning: UTF-8 encoding failed, writing ASCII-escaped JSON: {e}")
            ascii_payload = json.dumps(
                self.serialize_for_json(asdict(self)), ensure_ascii=True, indent=2
            )
            with open(filename, "w", encoding="utf-8") as f:
                f.write(ascii_payload)
        return filename

    @classmethod
    def from_dict(cls, d: dict) -> "TaskLog":
        """
        Create a TaskLog instance from a dictionary.

        Args:
            d: Dictionary containing TaskLog field values.

        Returns:
            A new TaskLog instance initialized with the dictionary values.

        Note:
            The dictionary keys should match the TaskLog field names; values
            are passed to the constructor unchanged.
        """
        return cls(**d)


================================================
FILE: apps/miroflow-agent/src/utils/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""Utility functions for parsing, prompts, and wrappers."""

from .parsing_utils import (
    extract_failure_experience_summary,
    extract_llm_response_text,
    fix_server_name_in_text,
    parse_llm_response_for_tool_calls,
    safe_json_loads,
    set_tool_server_mapping,
)
from .prompt_utils import (
    FORMAT_ERROR_MESSAGE,
    generate_agent_specific_system_prompt,
    generate_agent_summarize_prompt,
    generate_mcp_system_prompt,
)
from .wrapper_utils import ErrorBox, ResponseBox

# Public API of this package, grouped by the submodule each name is imported from
__all__ = [
    # parsing_utils
    "parse_llm_response_for_tool_calls",
    "extract_llm_response_text",
    "extract_failure_experience_summary",
    "fix_server_name_in_text",
    "set_tool_server_mapping",
    "safe_json_loads",
    # prompt_utils
    "FORMAT_ERROR_MESSAGE",
    "generate_mcp_system_prompt",
    "generate_agent_specific_system_prompt",
    "generate_agent_summarize_prompt",
    # wrapper_utils
    "ErrorBox",
    "ResponseBox",
]


================================================
FILE: apps/miroflow-agent/src/utils/parsing_utils.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
Parsing utilities for LLM responses and tool calls.

This module provides functions for:
- Parsing tool calls from LLM responses (both OpenAI and MCP formats)
- Extracting text content from responses
- Safe JSON parsing with automatic repair
- Failure experience summary extraction
"""

import json
import logging
import re
from typing import Any, Dict, List, Union

from json_repair import repair_json

logger = logging.getLogger("miroflow_agent")


def parse_tool_server_mapping(system_prompt: str) -> dict:
    """
    Build a tool_name → server_name mapping from the tool listing in a system prompt.

    The prompt is scanned line by line for headings such as:
        ## Server name: tool-python
        ### Tool name: run_python_code

    A tool heading is attributed to the most recent server heading above it.
    Only run_python_code, google_search and scrape_and_extract_info are
    recorded; every other tool name is ignored.

    Args:
        system_prompt: The system prompt containing MCP tool definitions

    Returns:
        Dict mapping tool_name to server_name, e.g.
        {"run_python_code": "tool-python", "google_search": "search_and_scrape_webpage", ...}
    """
    wanted_tools = {"run_python_code", "google_search", "scrape_and_extract_info"}
    server_heading = re.compile(r"## Server name:\s*(.+)")
    tool_heading = re.compile(r"### Tool name:\s*(.+)")

    resolved = {}
    active_server = None
    for raw_line in system_prompt.split("\n"):
        if (hit := server_heading.match(raw_line)) is not None:
            active_server = hit.group(1).strip()
            continue

        hit = tool_heading.match(raw_line)
        if hit is None or not active_server:
            continue

        found_tool = hit.group(1).strip()
        if found_tool in wanted_tools:
            resolved[found_tool] = active_server
    return resolved


# Module-level cache for tool_server_mapping: empty until set_tool_server_mapping()
# replaces it; read by fix_server_name_in_text()
_tool_server_mapping: dict = {}


def set_tool_server_mapping(system_prompt: str) -> None:
    """
    Parse system prompt and cache the tool_name → server_name mapping.

    Should be called once when system prompt is available. Each call replaces
    the module-level cache with the result of parse_tool_server_mapping();
    earlier entries are not merged.

    Args:
        system_prompt: The system prompt containing MCP tool definitions
    """
    # Rebind the module-level cache rather than a local name
    global _tool_server_mapping
    _tool_server_mapping = parse_tool_server_mapping(system_prompt)


def fix_server_name_in_text(text: str) -> str:
    """
    Fix incorrect server_name and tool_name in MCP XML tool calls.

    Uses the cached tool_server_mapping (parsed from system prompt) to determine
    the correct server_name for each tool. Only tools present in that mapping
    are touched.

    Also handles the special case where model outputs tool_name=python or
    python_code (should be run_python_code); this is applied only when the
    mapping defines run_python_code.

    Every ``<server_name>`` tag that directly precedes a mapped ``<tool_name>``
    tag (optionally separated by whitespace) is rewritten, so a response that
    mixes a correct and an incorrect call of the same tool is fixed too.

    Args:
        text: The LLM response text containing MCP tool calls

    Returns:
        Text with corrected server_name and tool_name if needed. Non-string
        input is returned unchanged, as is any text when no mapping is cached.
    """
    if not isinstance(text, str):
        return text

    mapping = _tool_server_mapping
    if not mapping:
        return text

    # Special case: tool_name=python or python_code → rename to run_python_code
    # Only apply if system prompt defines run_python_code (not python)
    if "run_python_code" in mapping:
        for wrong_name in ("python", "python_code"):
            text = text.replace(
                f"<tool_name>{wrong_name}</tool_name>",
                "<tool_name>run_python_code</tool_name>",
            )

    # Fix server_name for each target tool using the mapping from system prompt
    for tool_name, correct_server in mapping.items():
        tool_tag = f"<tool_name>{tool_name}</tool_name>"
        if tool_tag not in text:
            continue
        correct_server_tag = f"<server_name>{correct_server}</server_name>"
        # No early exit when the correct tag already appears somewhere: another
        # call of the same tool in this text may still carry a wrong server.
        # Rewriting an already-correct tag is a no-op.
        pattern = (
            r"<server_name>[^<]+</server_name>(\s*" + re.escape(tool_tag) + r")"
        )
        # A callable replacement inserts the server name literally, so
        # backslashes in it are not interpreted as regex template escapes.
        text = re.sub(
            pattern,
            lambda match, tag=correct_server_tag: tag + match.group(1),
            text,
        )

    return text


def filter_none_values(arguments: Union[Dict, Any]) -> Union[Dict, Any]:
    """
    Drop every entry whose value is None from a dictionary.

    Args:
        arguments: A dictionary to clean, or any other value

    Returns:
        A new dictionary without the None-valued keys; anything that is not
        a dict is handed back untouched.
    """
    if isinstance(arguments, dict):
        kept = {}
        for key, value in arguments.items():
            if value is not None:
                kept[key] = value
        return kept
    return arguments


def _fix_backslash_escapes(json_str: str) -> str:
    """
    Fix common backslash escape issues in JSON strings.

    Every backslash that does not start a valid JSON escape sequence
    (\\\\, \\", \\/, \\b, \\f, \\n, \\r, \\t, or \\u followed by four hex
    digits) is doubled so that it becomes a literal backslash. Valid escape
    sequences are left untouched.

    The text is scanned left to right in a single pass and each valid escape
    is consumed as a pair, so runs of backslashes are handled by parity:
    in ``\\\\\\A`` the first two characters form an escaped backslash and only
    the third one is repaired. A ``\\u`` that is not followed by four hex
    digits (e.g. the Windows path ``C:\\users``) is treated as invalid too.

    Args:
        json_str: JSON text that may contain unescaped backslashes

    Returns:
        The text with invalid backslashes doubled
    """

    def _keep_or_double(found):
        # Group 1 is only set when the backslash begins a valid escape.
        if found.group(1) is not None:
            return found.group(0)
        return "\\\\"

    # The optional group is tried first (greedy); when it cannot match, the
    # pattern falls back to the lone backslash without consuming what follows.
    return re.sub(
        r'\\(["\\/bfnrt]|u[0-9a-fA-F]{4})?',
        _keep_or_double,
        json_str,
    )


def safe_json_loads(arguments_str: str) -> Dict[str, Any]:
    """
    Safely parse a JSON string with multiple fallbacks.

    Parsing strategy:
    1. Reject input that is not str/bytes/bytearray with an error object
       (e.g. ``None`` when a tool call carries no arguments field)
    2. Try standard json.loads()
    3. If it fails, try json_repair to fix common issues
    4. If all attempts fail, return an error object

    Args:
        arguments_str: JSON string to parse

    Returns:
        The parsed value (whatever json.loads produces, normally a dict), or
        an error dict with 'error' and 'raw' keys
    """
    # Step 1: json.loads raises TypeError on non-text input; honour the
    # "safe" contract by reporting it through the error object instead.
    if not isinstance(arguments_str, (str, bytes, bytearray)):
        return {
            "error": "Failed to parse arguments",
            "raw": arguments_str,
        }

    # Step 2: Try standard JSON parsing.
    # ValueError covers json.JSONDecodeError and the UnicodeDecodeError that
    # undecodable bytes input can raise.
    try:
        return json.loads(arguments_str)
    except ValueError:
        pass

    # Step 3: Try json_repair to fix common issues
    try:
        repaired = repair_json(arguments_str, ensure_ascii=False)
        return json.loads(repaired)
    except Exception:
        logger.warning(f"Unable to parse JSON: {arguments_str}")

    # Step 4: Give up and return error information
    return {
        "error": "Failed to parse arguments",
        "raw": arguments_str,
    }


def extract_failure_experience_summary(text: str) -> str:
    """
    Pull the failure-experience summary out of an LLM response.

    A response is made of up to three parts, in this order:
    - a ``<think>...</think>`` block (reasoning)
    - the visible answer that follows it
    - a ``<use_mcp_tool>`` call, which is always discarded together with
      everything after it

    Examples:
        "<think>\n{xxx}\n</think>\n\n{content}\n\n<use_mcp_tool>..."
        "<think>\n{xxx}\n</think>\n\n{content}"
        "{content}"  (no think block)

    Returns:
        The stripped visible answer when it is non-empty; otherwise the
        stripped content of the first think block (empty string when there is
        none, or when ``text`` itself is empty).
    """
    if not text:
        return ""

    think_block = re.search(r"<think>([\s\S]*?)</think>", text)
    if think_block is None:
        # Without a think block the whole text is candidate answer text.
        reasoning, remainder = "", text
    else:
        reasoning = think_block.group(1).strip()
        remainder = text[think_block.end() :]

    # Keep only what precedes the first tool call, if there is one.
    visible_answer = remainder.partition("<use_mcp_tool>")[0].strip()

    # The visible answer wins; the reasoning is only a fallback.
    return visible_answer or reasoning


def extract_llm_response_text(llm_response: Union[str, Dict]) -> str:
    """
    Return the text of an LLM response up to the first <use_mcp_tool> tag.

    Args:
        llm_response: Either a dict (its 'content' entry is used, defaulting
            to an empty string) or any other value, which is converted with
            str()

    Returns:
        The text before the first <use_mcp_tool> tag, or the whole text when
        no tag is present, with surrounding whitespace stripped
    """
    if isinstance(llm_response, dict):
        content = llm_response.get("content", "")
    else:
        content = str(llm_response)

    tool_tag = re.search(r"<use_mcp_tool>", content)
    # Cut at the tag when there is one; otherwise keep everything.
    visible = content if tool_tag is None else content[: tool_tag.start()]
    return visible.strip()


def _split_qualified_tool_name(name):
    """Split "<server>-<tool>" on its last hyphen; without a hyphen the server is "unknown"."""
    if "-" not in name:
        return "unknown", name
    server_name, tool_name = name.rsplit("-", maxsplit=1)
    return server_name, tool_name


def _parse_completion_arguments(arguments_str):
    """Parse Completion API tool arguments, retrying once after a Python-literal cleanup."""
    try:
        # Try to handle possible newlines and escape characters
        return json.loads(arguments_str)
    except json.JSONDecodeError:
        logger.info(f"Warning: Unable to parse tool arguments JSON: {arguments_str}")

    # Second chance: rewrite common Python-dict spellings into JSON ones.
    try:
        arguments_str_fixed = (
            arguments_str.replace("'", '"')
            .replace("None", "null")
            .replace("True", "true")
            .replace("False", "false")
        )
        arguments = json.loads(arguments_str_fixed)
    except json.JSONDecodeError:
        logger.info(
            f"Error: Still unable to parse tool arguments JSON after fixing: {arguments_str}"
        )
        return {
            "error": "Failed to parse arguments",
            "raw": arguments_str,
        }

    logger.info("Info: Successfully parsed arguments after attempting to fix.")
    return arguments


def parse_llm_response_for_tool_calls(
    llm_response_content_text: Union[str, Dict, List],
) -> List[Dict[str, Any]]:
    """
    Turn an LLM response into a list of tool-call descriptions.

    The input type selects the format:
    - dict: OpenAI Response API payload; every entry of 'output' whose type is
      'function_call' becomes a tool call
    - list: OpenAI Completion API tool_call objects (read through
      ``.function.name``, ``.function.arguments`` and ``.id``)
    - anything else: text scanned for <use_mcp_tool> XML blocks

    For the two OpenAI formats the function name is split on its last hyphen
    into server and tool name; a name without a hyphen gets the server
    "unknown". None-valued arguments are dropped in every format.

    Args:
        llm_response_content_text: Response content in any supported format

    Returns:
        List of tool call dicts with keys: server_name, tool_name, arguments,
        id (always None for the MCP text format)
    """
    # OpenAI Response API: function calls are items of the "output" list.
    if isinstance(llm_response_content_text, dict):
        tool_calls = []
        for item in llm_response_content_text.get("output") or []:
            if item.get("type") != "function_call":
                continue
            server_name, tool_name = _split_qualified_tool_name(item.get("name", ""))
            arguments = filter_none_values(safe_json_loads(item.get("arguments")))
            tool_calls.append(
                {
                    "server_name": server_name,
                    "tool_name": tool_name,
                    "arguments": arguments,
                    "id": item.get("call_id"),
                }
            )
        return tool_calls

    # OpenAI Completion API: a list of tool_call objects.
    if isinstance(llm_response_content_text, list):
        tool_calls = []
        for tool_call in llm_response_content_text:
            server_name, tool_name = _split_qualified_tool_name(
                tool_call.function.name
            )
            arguments = filter_none_values(
                _parse_completion_arguments(tool_call.function.arguments)
            )
            tool_calls.append(
                {
                    "server_name": server_name,
                    "tool_name": tool_name,
                    "arguments": arguments,
                    "id": tool_call.id,
                }
            )
        return tool_calls

    # Other clients (such as qwen and anthropic) emit MCP XML inside the text.
    mcp_call_pattern = r"<use_mcp_tool>\s*<server_name>(.*?)</server_name>\s*<tool_name>(.*?)</tool_name>\s*<arguments>\s*([\s\S]*?)\s*</arguments>\s*</use_mcp_tool>"
    return [
        {
            "server_name": raw_server.strip(),
            "tool_name": raw_tool.strip(),
            "arguments": filter_none_values(safe_json_loads(raw_arguments.strip())),
            "id": None,
        }
        for raw_server, raw_tool, raw_arguments in re.findall(
            mcp_call_pattern, llm_response_content_text, re.DOTALL
        )
    ]


================================================
FILE: apps/miroflow-agent/src/utils/prompt_utils.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
Prompt templates and utilities for agent system prompts.

This module provides:
- System prompt generation for MCP tool usage
- Agent-specific prompt generation (main agent, browsing agent)
- Summary prompt templates for final answer generation
- Failure experience templates for retry mechanisms
"""

# ============================================================================
# Format Error Messages
# ============================================================================

# Message text stating that a final answer contains no \boxed{} content.
FORMAT_ERROR_MESSAGE = "No \\boxed{} content found in the final answer."

# ============================================================================
# Failure Experience Templates (for format error retry)
# ============================================================================

# Header that appears once before all failure experiences
FAILURE_EXPERIENCE_HEADER = """

=== Previous Attempts Analysis ===
The following summarizes what was tried before and why it didn't work. Use this to guide a NEW approach.

"""

# Template for each individual failure experience (used multiple times).
# Placeholders: {attempt_number} and {failure_summary}
# (presumably filled with str.format by the caller — confirm).
FAILURE_EXPERIENCE_ITEM = """[Attempt {attempt_number}]
{failure_summary}

"""

# Footer that appears once after all failure experiences
FAILURE_EXPERIENCE_FOOTER = """=== End of Analysis ===

Based on the above, you should try a different strategy this time.
"""

# Prompt asking the model for a tool-free summary of a failed attempt:
# one of four failure-type labels, what happened, and reusable findings.
FAILURE_SUMMARY_PROMPT = """The task was not completed successfully. Do NOT call any tools. Provide a summary:

Failure type: [incomplete / blocked / misdirected / format_missed]
  - incomplete: ran out of turns before finishing
  - blocked: got stuck due to tool failure or missing information
  - misdirected: went down the wrong path
  - format_missed: found the answer but forgot to use \\boxed{}
What happened: [describe the approach taken and why a final answer was not reached]
Useful findings: [list any facts, intermediate results, or conclusions discovered that should be reused]"""

# Assistant prefix for failure summary generation (guides model to follow structured format)
FAILURE_SUMMARY_THINK_CONTENT = """We need to write a structured post-mortem style summary **without calling any tools**, explaining why the task was not completed, using these required sections:

* **Failure type**: pick one from **incomplete / blocked / misdirected / format_missed**
* **What happened**: describe the approach taken and why it didn't reach a final answer
* **Useful findings**: list any facts, intermediate results, or conclusions that can be reused"""

# The think content above wrapped in <think>...</think>, followed by a blank line.
FAILURE_SUMMARY_ASSISTANT_PREFIX = (
    f"<think>\n{FAILURE_SUMMARY_THINK_CONTENT}\n</think>\n\n"
)

# ============================================================================
# MCP Tags for Parsing
# ============================================================================

# Opening and closing tags of the MCP tool-call XML format.
# NOTE(review): <tool_name> / </tool_name> are not listed although the
# tool-call format in the system prompt uses them — confirm this is intentional.
mcp_tags = [
    "<use_mcp_tool>",
    "</use_mcp_tool>",
    "<server_name>",
    "</server_name>",
    "<arguments>",
    "</arguments>",
]

# Phrases that presumably mark a refusal / give-up response; how they are
# matched is decided by the caller (not visible here).
# Note: the second entry uses typographic apostrophes (’), the third ASCII (').
refusal_keywords = [
    "time constraint",
    "I’m sorry, but I can’t",
    "I'm sorry, I cannot solve",
]


def generate_mcp_system_prompt(date, mcp_servers):
    """
    Build the system prompt that teaches the LLM how to call MCP tools.

    The prompt consists of three parts, concatenated in order:
    1. fixed instructions on the <use_mcp_tool> XML format, including the
       date formatted as YYYY-MM-DD
    2. one "## Server name" section per server, listing name, description
       and input schema of each of its tools
    3. the general objective paragraph

    Args:
        date: Object with a strftime() method, used for the "Today is" line
        mcp_servers: List of server definitions, each containing 'name' and
            optionally 'tools'; may be empty or None

    Returns:
        Complete system prompt string with tool definitions and usage instructions
    """
    formatted_date = date.strftime("%Y-%m-%d")

    # Start building the template, now follows https://docs.anthropic.com/en/docs/build-with-claude/tool-use/overview#tool-use-system-prompt
    template = f"""In this environment you have access to a set of tools you can use to answer the user's question. 

You only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use. Today is: {formatted_date}

# Tool-Use Formatting Instructions 

Tool-use is formatted using XML-style tags. The tool-use is enclosed in <use_mcp_tool></use_mcp_tool> and each parameter is similarly enclosed within its own set of tags.

The Model Context Protocol (MCP) connects to servers that provide additional tools and resources to extend your capabilities. You can use the server's tools via the `use_mcp_tool`.

Description: 
Request to use a tool provided by a MCP server. Each MCP server can provide multiple tools with different capabilities. Tools have defined input schemas that specify required and optional parameters.

Parameters:
- server_name: (required) The name of the MCP server providing the tool
- tool_name: (required) The name of the tool to execute
- arguments: (required) A JSON object containing the tool's input parameters, following the tool's input schema, quotes within string must be properly escaped, ensure it's valid JSON

Usage:
<use_mcp_tool>
<server_name>server name here</server_name>
<tool_name>tool name here</tool_name>
<arguments>
{{
"param1": "value1",
"param2": "value2 \\"escaped string\\""
}}
</arguments>
</use_mcp_tool>

Important Notes:
- Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags.
- Always adhere to this format for the tool use to ensure proper parsing and execution.

String and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. The output is not expected to be valid XML and is parsed with regular expressions.
Here are the functions available in JSONSchema format:

"""

    # Collect the pieces and join once at the end.
    pieces = [template]

    # MCP servers section
    if mcp_servers:
        for server in mcp_servers:
            pieces.append(f"\n## Server name: {server['name']}\n")

            tools = server["tools"] if "tools" in server else ()
            for tool in tools:
                # Tools that failed to load carry only an 'error' key: skip them.
                if "name" in tool or "error" not in tool:
                    pieces.append(f"### Tool name: {tool['name']}\n")
                    pieces.append(f"Description: {tool['description']}\n")
                    pieces.append(f"Input JSON schema: {tool['schema']}\n")

    # The full objective system prompt closes the template
    pieces.append(
        """
# General Objective

You accomplish a given task iteratively, breaking it down into clear steps and working through them methodically.

"""
    )

    return "".join(pieces)


def generate_no_mcp_system_prompt(date):
    """
    Build the short system prompt that carries no MCP tool definitions.

    The result is the introductory sentence, a "Today is" line with the date
    formatted as YYYY-MM-DD, the formatting notes and the general objective.

    Args:
        date: Object with a strftime() method, used for the "Today is" line

    Returns:
        Basic system prompt string without tool definitions
    """
    formatted_date = date.strftime("%Y-%m-%d")

    # Wording follows https://docs.anthropic.com/en/docs/build-with-claude/tool-use/overview#tool-use-system-prompt
    intro = """In this environment you have access to a set of tools you can use to answer the user's question. """

    today_line = f" Today is: {formatted_date}\n"

    formatting_notes = """
Important Notes:
- Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags.
- Always adhere to this format for the tool use to ensure proper parsing and execution.

String and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. The output is not expected to be valid XML and is parsed with regular expressions.
"""

    # The full objective system prompt
    general_objective = """
# General Objective

You accomplish a given task iteratively, breaking it down into clear steps and working through them methodically.

"""
    return "".join((intro, today_line, formatting_notes, general_objective))


def generate_agent_specific_system_prompt(agent_type=""):
    """
    Return the "Agent Specific Objective" section for the given agent type.

    Supported agent types:
    - "main": task-solving agent that uses tools to answer questions
    - "agent-browsing" / "browsing-agent": web search and browsing agent for
      information retrieval

    Args:
        agent_type: Type of agent ("main", "agent-browsing", or "browsing-agent")

    Returns:
        Agent-specific objective prompt string, stripped of surrounding
        whitespace

    Raises:
        ValueError: If agent_type is none of the supported values
    """
    if agent_type == "main":
        return """\n
# Agent Specific Objective

You are a task-solving agent that uses tools step-by-step to answer the user's question. Your goal is to provide complete, accurate and well-reasoned answers using additional tools.

""".strip()

    if agent_type in ("agent-browsing", "browsing-agent"):
        return """# Agent Specific Objective

You are an agent that performs the task of searching and browsing the web for specific information and generating the desired answer. Your task is to retrieve reliable, factual, and verifiable information that fills in knowledge gaps.
Do not infer, speculate, summarize broadly, or attempt to fill in missing parts yourself. Only return factual content.
""".strip()

    raise ValueError(f"Unknown agent type: {agent_type}")


def generate_agent_summarize_prompt(task_description, agent_type=""):
    """
    Generate the final summarization prompt for an agent.

    Creates prompts that instruct agents to summarize their work and provide
    final answers. Different agent types have different summarization formats:
    - main: Must wrap answer in \\boxed{} with strict formatting rules
    - agent-browsing / browsing-agent: Provides structured report of findings

    Both browsing spellings are accepted so that this function agrees with
    generate_agent_specific_system_prompt, which accepts the same two names.

    Args:
        task_description: The original task/question to reference in the summary
        agent_type: Type of agent ("main", "agent-browsing", or "browsing-agent")

    Returns:
        Summarization prompt string with formatting instructions, stripped of
        surrounding whitespace

    Raises:
        ValueError: If agent_type is none of the supported values
    """
    if agent_type == "main":
        summarize_prompt = (
            "Summarize the above conversation, and output the FINAL ANSWER to the original question.\n\n"
            "If a clear answer has already been provided earlier in the conversation, do not rethink or recalculate it — "
            "simply extract that answer and reformat it to match the required format below.\n"
            "If a definitive answer could not be determined, make a well-informed educated guess based on the conversation.\n\n"
            "The original question is repeated here for reference:\n\n"
            f'"{task_description}"\n\n'
            "Wrap your final answer in \\boxed{}.\n"
            "Your final answer should be:\n"
            "- a number, OR\n"
            "- as few words as possible, OR\n"
            "- a comma-separated list of numbers and/or strings.\n\n"
            "ADDITIONALLY, your final answer MUST strictly follow any formatting instructions in the original question — "
            "such as alphabetization, sequencing, units, rounding, decimal places, etc.\n"
            "If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and DO NOT INCLUDE UNITS such as $ or USD or percent signs unless specified otherwise.\n"
            "If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.\n"
            "If you are asked for a comma-separated list, apply the above rules depending on whether the elements are numbers or strings.\n"
            "Do NOT include any punctuation such as '.', '!', or '?' at the end of the answer.\n"
            "Do NOT include any invisible or non-printable characters in the answer output.\n\n"
            "You must absolutely not perform any MCP tool call, tool invocation, search, scrape, code execution, or similar actions.\n"
            "You can only answer the original question based on the information already retrieved and your own internal knowledge.\n"
            "If you attempt to call any tool, it will be considered a mistake."
        )
    elif agent_type in ("agent-browsing", "browsing-agent"):
        summarize_prompt = (
            "This is a direct instruction to you (the assistant), not the result of a tool call.\n\n"
            "We are now ending this session, and your conversation history will be deleted. "
            "You must NOT initiate any further tool use. This is your final opportunity to report "
            "*all* of the information gathered during the session.\n\n"
            "The original task is repeated here for reference:\n\n"
            f'"{task_description}"\n\n'
            "Summarize the above search and browsing history. Output the FINAL RESPONSE and detailed supporting information of the task given to you.\n\n"
            "If you found any useful facts, data, quotes, or answers directly relevant to the original task, include them clearly and completely.\n"
            "If you reached a conclusion or answer, include it as part of the response.\n"
            "If the task could not be fully answered, do NOT make up any content. Instead, return all partially relevant findings, "
            "Search results, quotes, and observations that might help a downstream agent solve the problem.\n"
            "If partial, conflicting, or inconclusive information was found, clearly indicate this in your response.\n\n"
            "Your final response should be a clear, complete, and structured report.\n"
            "Organize the content into logical sections with appropriate headings.\n"
            "Do NOT include any tool call instructions, speculative filler, or vague summaries.\n"
            "Focus on factual, specific, and well-organized information."
        )
    else:
        raise ValueError(f"Unknown agent type: {agent_type}")

    return summarize_prompt.strip()


================================================
FILE: apps/miroflow-agent/src/utils/wrapper_utils.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""Wrapper utilities for handling responses and errors in a type-safe manner."""

from typing import Any, Dict, Optional


class ErrorBox:
    """
    Marker wrapper around an error message.

    Wrapping a message in an ErrorBox lets callers tell an error apart from
    an ordinary response with a simple type check.

    Example:
        >>> error = ErrorBox("Connection failed")
        >>> if ErrorBox.is_error_box(error):
        ...     print(f"Error: {error}")
    """

    @staticmethod
    def is_error_box(something: Any) -> bool:
        """Tell whether ``something`` is an ErrorBox instance."""
        return isinstance(something, ErrorBox)

    def __init__(self, error_msg: str) -> None:
        # The wrapped message; returned as-is by str().
        self.error_msg = error_msg

    def __repr__(self) -> str:
        return "ErrorBox({!r})".format(self.error_msg)

    def __str__(self) -> str:
        return self.error_msg


class ResponseBox:
    """
    Wrapper pairing a response with optional extra information.

    The extra information is an optional dictionary of metadata that travels
    alongside the wrapped response.

    Example:
        >>> response = ResponseBox({"data": "value"}, {"warning_msg": "Rate limited"})
        >>> if response.has_extra_info():
        ...     print(response.get_extra_info())
    """

    @staticmethod
    def is_response_box(something: Any) -> bool:
        """Tell whether ``something`` is a ResponseBox instance."""
        return isinstance(something, ResponseBox)

    def __init__(
        self, response: Any, extra_info: Optional[Dict[str, Any]] = None
    ) -> None:
        self.extra_info = extra_info
        self.response = response

    def __repr__(self) -> str:
        return "ResponseBox({!r}, extra_info={!r})".format(
            self.response, self.extra_info
        )

    def __str__(self) -> str:
        return str(self.response)

    def get_response(self) -> Any:
        """Return the wrapped response object."""
        return self.response

    def get_extra_info(self) -> Optional[Dict[str, Any]]:
        """Return the attached extra information, or None when there is none."""
        return self.extra_info

    def has_extra_info(self) -> bool:
        """Tell whether extra information is attached (i.e. it is not None)."""
        return not (self.extra_info is None)


================================================
FILE: apps/visualize-trace/.python-version
================================================
3.11 

================================================
FILE: apps/visualize-trace/README.md
================================================
# Trace Analysis Web Demo

An interactive web interface for analyzing and visualizing trace JSON files.

## Installation and Running

### Method 1: Using Python (Recommended)

```bash
pip install -r requirements.txt
python run.py
```

The startup script automatically checks for and installs any missing dependencies, then starts the web application. Once it is running, visit `http://127.0.0.1:5000` in your browser.

### Method 2: Using uv

```bash
uv run run.py
```

## Usage

1. **Start the application**: After running, visit `http://127.0.0.1:5000` in your browser

1. **Load files**:

   - Select the trace JSON file to analyze from the dropdown menu in the top navigation bar
   - Click the "Load" button to load the file

1. **View analysis results**:

   - **Left panel**: Shows basic information, execution summary, and performance statistics
   - **Right panel**: Displays detailed execution flow
   - **Bottom panel**: Shows span statistics and step-log statistics

1. **Interactive operations**:

   - Click on execution steps to expand/collapse detailed information
   - Use "Expand All"/"Collapse All" buttons to control all steps
   - Click "View Details" button to see complete message content


================================================
FILE: apps/visualize-trace/app.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import os

from flask import Flask, jsonify, render_template, request
from trace_analyzer import TraceAnalyzer

# Flask application object; the route handlers in this file are registered on it.
app = Flask(__name__)

# Global variable to store analyzer instance.
# Starts as None; load_trace() rebinds it to a TraceAnalyzer, and the
# /api/* read endpoints answer 400 while it is still unset.
analyzer = None


@app.route("/")
def index():
    """Main page: render the ``index.html`` template."""
    return render_template("index.html")


@app.route("/api/list_files", methods=["GET"])
def list_files():
    """
    List the ``.json`` files of a directory.

    The directory comes from the ``directory`` query parameter; when it is
    missing or empty, the parent of the current working directory is used.
    ``~`` is expanded and the path is made absolute.

    Responses:
        200: {"files": [...], "directory": ..., "message": ...} with one
             entry per file (name, path, size, modified), sorted by name
        400: the path is not a directory
        403: the directory cannot be read (PermissionError)
        404: the directory does not exist
        500: any other failure
    """
    try:
        requested = request.args.get("directory", "")

        # Default behavior: fall back to the parent directory
        base = requested if requested else os.path.abspath("..")

        # Expand ~ and normalise to an absolute path
        directory = os.path.abspath(os.path.expanduser(base))

        if not os.path.exists(directory):
            return jsonify({"error": f"Directory does not exist: {directory}"}), 404

        if not os.path.isdir(directory):
            return jsonify({"error": f"Path is not a directory: {directory}"}), 400

        try:
            json_files = []
            for entry_name in os.listdir(directory):
                if not entry_name.endswith(".json"):
                    continue

                entry_path = os.path.join(directory, entry_name)
                try:
                    # File size and modification time
                    info = os.stat(entry_path)
                    size, modified = info.st_size, info.st_mtime
                except Exception:
                    # Still list the file, with placeholder metadata.
                    size, modified = 0, 0

                json_files.append(
                    {
                        "name": entry_name,
                        "path": entry_path,
                        "size": size,
                        "modified": modified,
                    }
                )

            # Sort by filename
            json_files = sorted(json_files, key=lambda entry: entry["name"])

            payload = {
                "files": json_files,
                "directory": directory,
                "message": f'Found {len(json_files)} JSON files in directory "{directory}"',
            }
            return jsonify(payload)
        except PermissionError:
            return jsonify(
                {"error": f"No permission to access directory: {directory}"}
            ), 403
        except Exception as exc:
            return jsonify({"error": f"Failed to read directory: {str(exc)}"}), 500

    except Exception as exc:
        return jsonify({"error": str(exc)}), 500


@app.route("/api/load_trace", methods=["POST"])
def load_trace():
    """Load a trace file and install it as the module-level ``analyzer``.

    Expects a JSON request body shaped like ``{"file_path": "<path>"}``.
    Relative paths are resolved against the current working directory.

    Returns:
        A JSON response with ``message``, ``file_path`` and ``file_name`` on
        success. Errors are reported as ``{"error": ...}`` with status
        400 (missing/invalid body or path, or path is not a file),
        404 (path does not exist) or 500 (the analyzer failed to load it).
    """
    global analyzer

    # silent=True returns None for a missing or malformed JSON body instead of
    # raising, so every bad request gets the same JSON error shape.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    file_path = data.get("file_path")

    # A non-string value (number, list, ...) would crash os.path below.
    if not file_path or not isinstance(file_path, str):
        return jsonify({"error": "Please provide file path"}), 400

    # If it's a relative path, convert to absolute path
    if not os.path.isabs(file_path):
        file_path = os.path.abspath(file_path)

    if not os.path.exists(file_path):
        return jsonify({"error": f"File does not exist: {file_path}"}), 404

    # Directories pass the exists() check but can never be loaded as a trace.
    if not os.path.isfile(file_path):
        return jsonify({"error": f"Path is not a file: {file_path}"}), 400

    try:
        analyzer = TraceAnalyzer(file_path)
        return jsonify(
            {
                "message": "File loaded successfully",
                "file_path": file_path,
                "file_name": os.path.basename(file_path),
            }
        )
    except Exception as e:
        return jsonify({"error": f"Failed to load file: {str(e)}"}), 500


@app.route("/api/basic_info")
def get_basic_info():
    """Serve the analyzer's basic information as JSON.

    Responds with HTTP 400 while no analyzer is set, and with HTTP 500 plus
    the exception text if producing the data fails.
    """
    if analyzer:
        try:
            payload = analyzer.get_basic_info()
            return jsonify(payload)
        except Exception as exc:
            return jsonify({"error": str(exc)}), 500

    return jsonify({"error": "Please load trace file first"}), 400


@app.route("/api/performance_summary")
def get_performance_summary():
    """Serve the analyzer's performance summary as JSON.

    Responds with HTTP 400 while no analyzer is set, and with HTTP 500 plus
    the exception text if producing the data fails.
    """
    if analyzer:
        try:
            payload = analyzer.get_performance_summary()
            return jsonify(payload)
        except Exception as exc:
            return jsonify({"error": str(exc)}), 500

    return jsonify({"error": "Please load trace file first"}), 400


@app.route("/api/execution_flow")
def get_execution_flow():
    """Serve the result of the analyzer's conversation-flow analysis as JSON.

    Responds with HTTP 400 while no analyzer is set, and with HTTP 500 plus
    the exception text if the analysis fails.
    """
    if analyzer:
        try:
            payload = analyzer.analyze_conversation_flow()
            return jsonify(payload)
        except Exception as exc:
            return jsonify({"error": str(exc)}), 500

    return jsonify({"error": "Please load trace file first"}), 400


@app.route("/api/execution_summary")
def get_execution_summary():
    """Serve the analyzer's execution summary as JSON.

    Responds with HTTP 400 while no analyzer is set, and with HTTP 500 plus
    the exception text if producing the data fails.
    """
    if analyzer:
        try:
            payload = analyzer.get_execution_summary()
            return jsonify(payload)
        except Exception as exc:
            return jsonify({"error": str(exc)}), 500

    return jsonify({"error": "Please load trace file first"}), 400


@app.route("/api/spans_summary")
def get_spans_summary():
    """Serve the analyzer's spans summary as JSON.

    Responds with HTTP 400 while no analyzer is set, and with HTTP 500 plus
    the exception text if producing the data fails.
    """
    if analyzer:
        try:
            payload = analyzer.get_spans_summary()
            return jsonify(payload)
        except Exception as exc:
            return jsonify({"error": str(exc)}), 500

    return jsonify({"error": "Please load trace file first"}), 400


@app.route("/api/step_logs_summary")
def get_step_logs_summary():
    """Serve the analyzer's step-logs summary as JSON.

    Responds with HTTP 400 while no analyzer is set, and with HTTP 500 plus
    the exception text if producing the data fails.
    """
    if analyzer:
        try:
            payload = analyzer.get_step_logs_summary()
            return jsonify(payload)
        except Exception as exc:
            return jsonify({"error": str(exc)}), 500

    return jsonify({"error": "Please load trace file first"}), 400


@app.route("/api/debug/raw_messages")
def get_raw_messages():
    """Debug endpoint exposing the raw message data held by the analyzer.

    The JSON response contains a per-message overview of the main agent
    messages (index, role, content length, timestamp presence and a preview of
    at most 100 characters), the browser session keys, the raw main history,
    and the raw data of the first two browser sessions.

    Responds with HTTP 400 while no analyzer is set, and with HTTP 500 plus
    the exception text if anything fails.
    """
    if not analyzer:
        return jsonify({"error": "Please load trace file first"}), 400

    preview_limit = 100

    try:
        raw_history = analyzer.get_main_agent_history()
        sessions = analyzer.get_browser_agent_sessions()
        messages = analyzer.get_main_agent_messages()

        # Build a compact overview entry for every main-agent message.
        overview = []
        for position, msg in enumerate(messages):
            content_text = str(msg.get("content", ""))
            if len(content_text) > preview_limit:
                preview = content_text[:preview_limit] + "..."
            else:
                preview = content_text

            overview.append(
                {
                    "index": position,
                    "role": msg.get("role"),
                    "content_length": len(content_text),
                    "has_timestamp": "timestamp" in msg,
                    "content_preview": preview,
                }
            )

        # Only the first two sessions are returned in raw form.
        first_sessions = dict(list(sessions.items())[:2])

        response_body = {
            "main_agent_history_structure": {
                "total_messages": len(messages),
                "messages": overview,
            },
            "browser_sessions": list(sessions.keys()),
            "raw_main_history": raw_history,
            "raw_browser_sessions": first_sessions,
        }
        return jsonify(response_body)
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500


if __name__ == "__main__":
    # Start the Flask server when this module is executed directly.
    # NOTE(review): debug=True together with host="0.0.0.0" makes Flask's
    # debug mode reachable from other machines on the network; confirm this is
    # only ever used on a trusted development machine.
    app.run(debug=True, host="0.0.0.0", port=5000)


================================================
FILE: apps/visualize-trace/pyproject.toml
================================================
[project]
name = "trace-dashboard"
version = "1.0.0"
description = "A web dashboard for analyzing trace JSON files"
requires-python = ">=3.8"
dependencies = [
    "flask>=2.3.3",
    "werkzeug>=2.3.7",
]

[tool.uv]
dev-dependencies = [] 

================================================
FILE: apps/visualize-trace/requirements.txt
================================================
flask==2.3.3
werkzeug==2.3.7 

================================================
FILE: apps/visualize-trace/run.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import os
import subprocess
import sys


def check_dependencies():
    """Report whether Flask is importable in the current environment.

    Prints a status line and, when Flask is missing, the commands that install
    the dependencies.

    Returns:
        bool: True when a module spec for ``flask`` is found, False otherwise.
    """
    try:
        import importlib.util

        flask_found = importlib.util.find_spec("flask") is not None
    except ImportError:
        flask_found = False

    if flask_found:
        print("✓ Flask is installed")
        return True

    for hint in (
        "✗ Flask is not installed",
        "Please use the following commands to install dependencies:",
        "  uv sync",
        "or:",
        "  uv pip install -r requirements.txt",
    ):
        print(hint)
    return False


def install_dependencies():
    """Install dependencies (recommended to use uv).

    Tries ``uv sync`` first and falls back to ``pip install -r
    requirements.txt`` using the current interpreter. Both commands are run
    against the directory containing this script, so the installation works
    regardless of the directory the script was launched from.

    Returns:
        bool: True if one of the installers succeeded, False otherwise.
    """
    print("Installing dependencies...")

    # Resolve project files relative to this script, not the caller's cwd.
    project_dir = os.path.dirname(os.path.abspath(__file__))
    requirements_file = os.path.join(project_dir, "requirements.txt")

    try:
        # Try using uv first
        try:
            subprocess.check_call(["uv", "sync"], cwd=project_dir)
            print("✓ Dependencies installed successfully using uv")
            return True
        except (subprocess.CalledProcessError, OSError):
            # uv is missing, not executable, or failed: fall back to pip
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "-r", requirements_file]
            )
            print("✓ Dependencies installed successfully using pip")
            return True
    except (subprocess.CalledProcessError, OSError):
        # OSError covers an interpreter that cannot be launched; report failure
        # through the return value instead of crashing the caller.
        print("✗ Failed to install dependencies")
        print("Please manually run: uv sync or pip install -r requirements.txt")
        return False


def main():
    """Parse CLI options, check dependencies, and start the web application.

    Command line options:
        -p / --port   Port to listen on (default: 5000).
        --host        Interface to bind (default: 0.0.0.0, i.e. all interfaces).
        --no-debug    Run Flask without debug mode (debug mode is on by default).

    The defaults reproduce the previous hard-coded behavior; ``--host
    127.0.0.1`` and/or ``--no-debug`` allow running the server without exposing
    Flask's debug mode to other machines.
    """
    import argparse

    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Trace Analysis Web Demo")
    parser.add_argument(
        "-p",
        "--port",
        type=int,
        default=5000,
        help="Specify port number (default: 5000)",
    )
    parser.add_argument(
        "--host",
        default="0.0.0.0",
        help="Interface to bind to (default: 0.0.0.0; use 127.0.0.1 for local only)",
    )
    parser.add_argument(
        "--no-debug",
        dest="debug",
        action="store_false",
        help="Disable Flask debug mode (enabled by default)",
    )
    args = parser.parse_args()

    print("=" * 50)
    print("Trace Analysis Web Demo")
    print("=" * 50)

    # Check dependencies
    if not check_dependencies():
        print("\nInstalling dependencies...")
        if not install_dependencies():
            print(
                "Please manually install dependencies: pip install -r requirements.txt"
            )
            return

    # Check JSON files in the directory above the one containing this script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    try:
        json_files = [
            f for f in os.listdir(os.path.join(script_dir, "..")) if f.endswith(".json")
        ]
    except OSError:
        # The listing is informational only; an unreadable directory must not
        # prevent the server from starting.
        json_files = []

    if not json_files:
        print("\nWarning: No JSON files found in parent directory")
        print("Please ensure trace JSON files are in the trace_analyze/ directory")
    else:
        print(f"\nFound {len(json_files)} JSON files:")
        for file in json_files[:5]:  # Only show first 5
            print(f"  - {file}")
        if len(json_files) > 5:
            print(f"  ... and {len(json_files) - 5} other files")

    # Start application
    print("\nStarting web application...")
    print(f"Application will run at http://localhost:{args.port}")
    print("Press Ctrl+C to stop the application")
    print("=" * 50)

    try:
        from app import app

        app.run(debug=args.debug, host=args.host, port=args.port)
    except KeyboardInterrupt:
        print("\nApplication stopped")
    except Exception as e:
        print(f"\nFailed to start application: {e}")


if __name__ == "__main__":
    main()


================================================
FILE: apps/visualize-trace/static/css/style.css
================================================
/* Global styles */
body {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    background-color: #f8f9fa;
}

/* Set special font for non-tool call content */
.rendered-content, .preview-text, .browser-agent-content {
    font-family: 'Courier New', 'Monaco', 'Menlo', monospace;
    font-size: 14px;
    line-height: 1.6;
}

/* Keep MCP tool calls using original font */
.mcp-tool-call {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}

/* Ensure MCP tool call content uses original font */
.mcp-tool-call * {
    font-family: inherit;
}

/* Navigation button styles */
.nav-btn {
    transition: all 0.3s ease;
}

.nav-btn:hover:not(:disabled) {
    background-color: rgba(255, 255, 255, 0.2);
}

.nav-btn:disabled {
    opacity: 0.5;
    cursor: not-allowed;
}

/* File selection input group styles */
.file-navigation {
    display: flex;
    align-items: center;
    gap: 0;
}

.file-navigation .form-select {
    border-radius: 0;
    border-left: 0;
    border-right: 0;
}

.file-navigation .btn:first-child {
    border-top-right-radius: 0;
    border-bottom-right-radius: 0;
}

.file-navigation .btn:last-child {
    border-top-left-radius: 0;
    border-bottom-left-radius: 0;
}

/* Loading overlay */
.loading-overlay {
    position: fixed;
    top: 0;
    left: 0;
    width: 100%;
    height: 100%;
    background-color: rgba(0, 0, 0, 0.5);
    display: flex;
    justify-content: center;
    align-items: center;
    z-index: 9999;
}

/* Card styles */
.card {
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    border: none;
    border-radius: 8px;
}

.card-header {
    background-color: #f8f9fa;
    border-bottom: 1px solid #dee2e6;
    font-weight: 500;
}

/* Top summary panel styles */
.summary-panel {
    background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
    border: none;
    box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}

.summary-panel h6 {
    color: #495057;
    font-weight: 600;
    margin-bottom: 15px;
    padding-bottom: 8px;
    border-bottom: 2px solid #dee2e6;
}

.summary-panel .answer-box {
    background: #fff;
    border: 1px solid #dee2e6;
    border-radius: 6px;
    padding: 8px 12px;
    margin-bottom: 10px;
    display: flex;
    align-items: center;
    gap: 10px;
}

.summary-panel .answer-label {
    font-weight: 600;
    color: #6c757d;
    font-size: 12px;
    margin-bottom: 0;
    white-space: nowrap;
}

.summary-panel .answer-content {
    font-size: 14px;
    line-height: 1.4;
    flex: 1;
}

.summary-panel .final-answer {
    border-left: 4px solid #007bff;
}

.summary-panel .ground-truth {
    border-left: 4px solid #28a745;
}

.summary-panel .stat-item {
    background: #fff;
    border: 1px solid #dee2e6;
    border-radius: 6px;
    padding: 8px 12px;
    margin-bottom: 8px;
    display: flex;
    justify-content: space-between;
    align-items: center;
}

.summary-panel .stat-label {
    font-size: 12px;
    color: #6c757d;
    font-weight: 500;
}

.summary-panel .stat-value {
    font-size: 14px;
    font-weight: 600;
    color: #495057;
}

/* Navigation panel styles */
.navigation-panel {
    position: sticky;
    top: 20px;
    max-height: calc(100vh - 40px);
    overflow-y: auto;
}

.navigation-list {
    max-height: calc(100vh - 120px);
    overflow-y: auto;
}

.nav-item {
    padding: 8px 12px;
    border-bottom: 1px solid #f1f1f1;
    cursor: pointer;
    transition: all 0.2s ease;
    font-size: 13px;
}

.nav-item:hover {
    /* Remove background color change, can add other subtle visual feedback */
}

.nav-item.active {
    background-color: #007bff;
    color: white;
}

.nav-item .step-number {
    font-weight: bold;
    color: #6c757d;
}

.nav-item.active .step-number {
    color: white;
}

.nav-item .step-role {
    font-size: 11px;
    padding: 2px 6px;
    border-radius: 3px;
    margin-left: 8px;
}

.nav-item .step-role.user {
    background-color: #28a745;
    color: white;
}

.nav-item .step-role.assistant {
    background-color: #007bff;
    color: white;
}

.nav-item .step-role.tool {
    background-color: #fd7e14;
    color: white;
}

.nav-item .step-role.system {
    background-color: #6c757d;
    color: white;
}

.nav-item .step-summary {
    color: #6c757d;
    font-size: 12px;
    margin-top: 4px;
    /* Clamp the summary to two lines. The -webkit- properties are kept for
       existing engines; the unprefixed line-clamp is the standard property. */
    display: -webkit-box;
    -webkit-line-clamp: 2;
    line-clamp: 2;
    -webkit-box-orient: vertical;
    overflow: hidden;
}

.nav-item.active .step-summary {
    color: #e9ecef;
}

/* Browser sub-step navigation styles */
.nav-item.browser-sub-step {
    padding-left: 24px;
    font-size: 12px;
    border-left: 2px solid #dee2e6;
    margin-left: 8px;
}

.nav-item.browser-sub-step .step-number {
    font-size: 11px;
    color: #6c757d;
}

.nav-item.browser-sub-step .step-role {
    font-size: 10px;
    padding: 1px 4px;
}

.nav-item.browser-sub-step .step-summary {
    font-size: 11px;
    /* Sub-step summaries are clamped to a single line; the standard
       line-clamp property accompanies the prefixed one. */
    -webkit-line-clamp: 1;
    line-clamp: 1;
}

.nav-item.browser-sub-step.active {
    border-left-color: #007bff;
}

.nav-item .browser-toggle {
    margin-left: auto;
    cursor: pointer;
    font-size: 12px;
    color: #6c757d;
    padding: 2px 4px;
    border-radius: 2px;
    transition: all 0.2s ease;
}

.nav-item .browser-toggle:hover {
    background-color: #e9ecef;
}

.nav-item.active .browser-toggle {
    color: #fff;
}

.nav-item.active .browser-toggle:hover {
    background-color: rgba(255, 255, 255, 0.2);
}

.browser-sub-steps {
    display: none;
}

.browser-sub-steps.expanded {
    display: block;
}

/* Execution flow styles */
.execution-steps-container {
    display: flex;
    flex-direction: column;
    gap: 16px;
}

.execution-step {
    border: 1px solid #dee2e6;
    border-radius: 6px;
    margin-bottom: 0;  /* Remove bottom margin, use gap instead */
    background-color: white;
    transition: all 0.3s ease;
    position: relative;
}

.execution-step:hover {
    box-shadow: 0 4px 8px rgba(0,0,0,0.1);
}

/* Ensure main agent steps have clear visual separation */
.execution-step[data-agent*="main_agent"] {
    border-left: 4px solid #007bff;
    z-index: 2;
}

/* Browser session should be indented inside main agent steps */
.browser-session {
    position: relative;
    margin-left: 20px;
    /* margin-top is set by the "Browser session styles" rule further down in
       this file (10px); the 12px formerly declared here was always
       overridden by it, so it has been dropped. */
}

.step-header {
    padding: 12px 16px;
    cursor: pointer;
    position: relative;
    border-radius: 6px 6px 0 0;
}

.step-header:hover {
    background-color: #f8f9fa;
}

.step-header.user-message {
    background-color: #e3f2fd;
    border-left: 4px solid #2196f3;
}

.step-header.assistant-message {
    background-color: #f3e5f5;
    border-left: 4px solid #9c27b0;
}

.step-header.user-message.browser-agent {
    background-color: #e8f5e8;
    border-left: 4px solid #4caf50;
}

.step-header.assistant-message.browser-agent {
    background-color: #fff3e0;
    border-left: 4px solid #ff9800;
}

.step-header.tool-message {
    background-color: #fff3e0;
    border-left: 4px solid #fd7e14;
}

.step-header.system-message {
    background-color: #f8f9fa;
    border-left: 4px solid #6c757d;
}

.step-content {
    padding: 16px;
    border-top: 1px solid #dee2e6;
    background-color: #f8f9fa;
}

.step-toggle {
    position: absolute;
    right: 16px;
    top: 50%;
    transform: translateY(-50%);
    font-size: 14px;
    color: #6c757d;
}

/* Tool call styles */
.tool-call {
    background-color: #fff3cd;
    border: 1px solid #ffeaa7;
    border-radius: 4px;
    padding: 10px;
    margin: 8px 0;
}

.tool-call-header {
    font-weight: 500;
    color: #856404;
    margin-bottom: 5px;
}

.tool-call.browser-agent {
    background-color: #d4edda;
    border-color: #c3e6cb;
}

.tool-call.browser-agent .tool-call-header {
    color: #155724;
}

/* Browser session styles */
.browser-session {
    background-color: #f8f9fa;
    border: 1px solid #dee2e6;
    border-radius: 4px;
    margin-top: 10px;
    padding: 12px;
}

.browser-session-header {
    font-weight: 500;
    color: #495057;
    margin-bottom: 10px;
    padding-bottom: 8px;
    border-bottom: 1px solid #dee2e6;
}

.browser-step {
    background-color: white;
    border: 1px solid #e9ecef;
    border-radius: 4px;
    margin-bottom: 8px;
    padding: 8px 12px;
}

.browser-step.user {
    background-color: #f0f8ff;
}

.browser-step.assistant {
    background-color: #fdf6e3;
}

.browser-step.tool {
    background-color: #fff3e0;
    border-left: 3px solid #fd7e14;
}

.browser-step.system {
    background-color: #f8f9fa;
    border-left: 3px solid #6c757d;
}

/* Statistics styles */
.stat-item {
    display: flex;
    justify-content: space-between;
    align-items: center;
    padding: 8px 0;
    border-bottom: 1px solid #f0f0f0;
}

.stat-item:last-child {
    border-bottom: none;
}

.stat-label {
    font-weight: 500;
    color: #495057;
}

.stat-value {
    font-weight: 600;
    color: #007bff;
}

/* Badge styles */
.badge-role {
    font-size: 11px;
    padding: 4px 8px;
    border-radius: 12px;
    font-weight: 500;
    text-transform: uppercase;
}

.badge-user {
    background-color: #007bff;
    color: white;
}

.badge-assistant {
    background-color: #6f42c1;
    color: white;
}

.badge-tool {
    background-color: #fd7e14;
    color: white;
}

.badge-system {
    background-color: #6c757d;
    color: white;
}

.badge-browser {
    background-color: #28a745;
    color: white;
}

/* Timestamp styles */
.timestamp {
    font-size: 11px;
    color: #6c757d;
    font-family: monospace;
}

/* Content preview styles */
.content-preview {
    background-color: white;
    border-radius: 4px;
    padding: 8px;
    margin: 8px 0;
}

.content-preview .preview-text {
    line-height: 1.5;
}

.expand-preview-btn {
    color: #007bff !important;
    font-size: 12px;
    text-decoration: none;
}

.expand-preview-btn:hover {
    text-decoration: underline !important;
}

/* Step content area style adjustments.
   NOTE(review): this rule repeats the declarations of the .step-content rule
   defined earlier in this file (next to .step-header); one of the two can
   presumably be removed. */
.step-content {
    padding: 16px;
    border-top: 1px solid #dee2e6;
    background-color: #f8f9fa;
}

.step-content h6 {
    color: #495057;
    font-weight: 600;
    margin-bottom: 8px;
    font-size: 14px;
}

/* Button styles */
.btn-sm {
    font-size: 12px;
    padding: 4px 12px;
}

/* Responsive styles */
@media (max-width: 768px) {
    .container-fluid {
        padding: 0 10px;
    }
    
    .col-md-3 {
        order: 2;
    }
    
    .col-md-9 {
        order: 1;
    }
    
    .step-header {
        padding: 10px 12px;
    }
    
    .step-content {
        padding: 12px;
    }
}

/* Animation effects */
.collapse {
    transition: height 0.3s ease;
}

.fade-in {
    animation: fadeIn 0.3s ease-in;
}

@keyframes fadeIn {
    from {
        opacity: 0;
        transform: translateY(10px);
    }
    to {
        opacity: 1;
        transform: translateY(0);
    }
}

/* Tooltip styles */
.tooltip {
    font-size: 12px;
}

/* Code styles */
.code-block {
    background-color: #f8f9fa;
    border: 1px solid #e9ecef;
    border-radius: 6px;
    padding: 12px;
    font-family: 'Courier New', monospace;
    font-size: 13px;
    white-space: pre-wrap;
    margin: 8px 0;
    overflow-x: auto;
    line-height: 1.4;
}

.code-block pre {
    margin: 0;
    padding: 0;
    background: none;
    border: none;
    font-family: inherit;
    font-size: inherit;
    white-space: pre-wrap;
}

.code-block code {
    background: none;
    border: none;
    padding: 0;
    font-family: inherit;
    font-size: inherit;
    color: inherit;
}

/* Error styles */
.error-message {
    color: #dc3545;
    font-size: 14px;
    margin-top: 8px;
}

.success-message {
    color: #28a745;
    font-size: 14px;
    margin-top: 8px;
}

/* Scrollbar styles */
::-webkit-scrollbar {
    width: 8px;
}

::-webkit-scrollbar-track {
    background: #f1f1f1;
}

::-webkit-scrollbar-thumb {
    background: #c1c1c1;
    border-radius: 4px;
}

::-webkit-scrollbar-thumb:hover {
    background: #a8a8a8;
}

/* MCP tool call styles */
.mcp-tool-call {
    background-color: #ffffff;
    border: 2px solid #007bff;
    border-radius: 8px;
    padding: 16px;
    margin: 16px 0;
    box-shadow: 0 2px 8px rgba(0,123,255,0.1);
    overflow: hidden;
}

.mcp-tool-call.browser-agent {
    border-color: #28a745;
    background-color: #ffffff;
    box-shadow: 0 2px 8px rgba(40,167,69,0.1);
}

.mcp-tool-header {
    display: flex;
    align-items: center;
    font-weight: 600;
    color: #007bff;
    margin-bottom: 12px;
    font-size: 14px;
    padding-bottom: 8px;
    border-bottom: 1px solid #e9ecef;
}

.mcp-tool-call.browser-agent .mcp-tool-header {
    color: #28a745;
}

.mcp-tool-header i {
    margin-right: 8px;
    font-size: 16px;
}

.mcp-tool-name {
    font-family: 'Courier New', monospace;
    background-color: rgba(0,123,255,0.1);
    padding: 4px 8px;
    border-radius: 4px;
    margin-left: 4px;
    font-size: 13px;
}

.mcp-tool-call.browser-agent .mcp-tool-name {
    background-color: rgba(40,167,69,0.1);
}

.mcp-tool-content {
    margin-top: 8px;
}

.mcp-xml-structure {
    font-family: 'Courier New', monospace;
    background-color: #f8f9fa;
    border: 1px solid #e9ecef;
    border-radius: 4px;
    padding: 16px;
    line-height: 1.6;
    font-size: 13px;
}

.xml-tag {
    color: #0066cc;
    font-weight: 500;
    margin: 2px 0;
}

.xml-content {
    /* The shorthand already indents the content by 20px on the left; the
       separate margin-left that used to precede it was overridden by it. */
    margin: 8px 0 8px 20px;
}

.xml-arguments {
    background-color: #ffffff;
    border: 1px solid #dee2e6;
    border-radius: 4px;
    padding: 12px;
    margin: 8px 0 8px 20px;
    white-space: pre-wrap;
    color: #2c3e50;
    font-family: 'Courier New', monospace;
    font-size: 12px;
    line-height: 1.5;
    overflow-x: auto;
}

.mcp-tool-args {
    margin-top: 8px;
}

.mcp-args-label {
    font-weight: 500;
    color: #495057;
    margin-bottom: 6px;
    font-size: 13px;
}

/* Format badge styles */
/* Size, shape and colors of the format badge, declared in a single rule
   (previously split across two consecutive rules with the same selector). */
.badge-format {
    font-size: 10px;
    padding: 2px 6px;
    border-radius: 3px;
    font-weight: normal;
    background-color: #6c757d;
    color: white;
}

/* Format badge default styles, can be extended as needed */

/* Tool ID styles */
.tool-id {
    margin-top: 8px;
    padding-top: 8px;
    border-top: 1px solid #e9ecef;
}

/* Rendered content styles - white background */
.rendered-content {
    background-color: white;
    padding: 12px;
    border-radius: 4px;
    border: 1px solid #e9ecef;
    margin: 8px 0;
    line-height: 1.6;
}

.rendered-content h1 {
    color: #2c3e50;
    border-bottom: 2px solid #3498db;
    padding-bottom: 8px;
    margin-bottom: 16px;
    font-size: 1.5em;
}

.rendered-content h2 {
    color: #34495e;
    border-bottom: 1px solid #bdc3c7;
    padding-bottom: 6px;
    margin-bottom: 12px;
    font-size: 1.3em;
}

.rendered-content h3 {
    color: #2c3e50;
    margin-bottom: 10px;
    font-size: 1.1em;
}

.rendered-content strong {
    color: #2c3e50;
    font-weight: 600;
}

.rendered-content em {
    color: #7f8c8d;
    font-style: italic;
}

.rendered-content ul, .rendered-content ol {
    margin: 10px 0;
    padding-left: 20px;
}

.rendered-content li {
    margin: 4px 0;
}

.rendered-content a {
    color: #3498db;
    text-decoration: none;
}

.rendered-content a:hover {
    text-decoration: underline;
}

.rendered-content .inline-code {
    background-color: #f8f9fa;
    color: #e83e8c;
    padding: 2px 4px;
    border-radius: 3px;
    font-family: 'Courier New', monospace;
    font-size: 0.9em;
}

.rendered-content .code-block {
    background-color: #f8f9fa;
    border: 1px solid #e9ecef;
    border-radius: 4px;
    margin: 8px 0;
    overflow-x: auto;
}

.rendered-content .code-block pre {
    margin: 0;
    padding: 12px;
    background: none;
    border: none;
    font-family: 'Courier New', monospace;
    font-size: 0.9em;
    line-height: 1.4;
    color: #2c3e50;
}

.rendered-content .code-block code {
    background: none;
    padding: 0;
    color: inherit;
    font-family: inherit;
}

/* Improve browser agent content styles */
.browser-agent-content {
    background-color: #f8fff8;
    border: 1px solid #d4edda;
    border-radius: 4px;
    padding: 12px;
    margin: 8px 0;
}

/* Improve content display in modal */
.modal-body .rendered-content {
    max-height: 400px;
    overflow-y: auto;
} 

================================================
FILE: apps/visualize-trace/static/js/script.js
================================================
// Global variables
let currentFlowData = null;
let currentBasicInfo = null;
let currentFileList = [];
let currentFileIndex = -1;

// DOM elements
// References to the page elements used by this script, looked up by id once
// when the script is evaluated and shared by the functions below.
// NOTE(review): the lookups run at script evaluation time, so this presumably
// relies on the script being loaded after the markup — confirm in the template.
const elements = {
    directoryInput: document.getElementById('directoryInput'),
    browseDirectoryBtn: document.getElementById('browseDirectoryBtn'),
    fileSelect: document.getElementById('fileSelect'),
    prevFileBtn: document.getElementById('prevFileBtn'),
    nextFileBtn: document.getElementById('nextFileBtn'),
    loadBtn: document.getElementById('loadBtn'),
    refreshBtn: document.getElementById('refreshBtn'),
    expandAllBtn: document.getElementById('expandAllBtn'),
    collapseAllBtn: document.getElementById('collapseAllBtn'),
    basicInfo: document.getElementById('basicInfo'),
    executionSummary: document.getElementById('executionSummary'),
    performanceSummary: document.getElementById('performanceSummary'),
    executionFlow: document.getElementById('executionFlow'),
    spansStats: document.getElementById('spansStats'),
    stepLogsStats: document.getElementById('stepLogsStats'),
    loadingOverlay: document.getElementById('loadingOverlay'),
    errorToast: document.getElementById('errorToast'),
    successToast: document.getElementById('successToast'),
    errorMessage: document.getElementById('errorMessage'),
    successMessage: document.getElementById('successMessage'),
    messageModal: document.getElementById('messageModal'),
    messageContent: document.getElementById('messageContent'),
    navigationList: document.getElementById('navigationList')
};

// Initialize
document.addEventListener('DOMContentLoaded', function() {
    initializeApp();
});

function initializeApp() {
    // Pressing Enter in the directory field behaves like clicking "browse".
    const browseOnEnter = (event) => {
        if (event.key === 'Enter') {
            browseDirectory();
        }
    };

    // [element, event name, handler] triples, registered in this order.
    const bindings = [
        [elements.browseDirectoryBtn, 'click', browseDirectory],
        [elements.directoryInput, 'keypress', browseOnEnter],
        [elements.fileSelect, 'change', onFileSelect],
        [elements.prevFileBtn, 'click', gotoPrevFile],
        [elements.nextFileBtn, 'click', gotoNextFile],
        [elements.loadBtn, 'click', loadTraceFile],
        [elements.refreshBtn, 'click', refreshFileList],
        [elements.expandAllBtn, 'click', expandAllSteps],
        [elements.collapseAllBtn, 'click', collapseAllSteps]
    ];
    for (const [target, eventName, handler] of bindings) {
        target.addEventListener(eventName, handler);
    }

    // Fill in the default directory, then sync the prev/next button states.
    setDefaultDirectory();
    updateNavigationButtons();

    // Keyboard shortcuts are handled at document level.
    document.addEventListener('keydown', handleKeyboardShortcuts);
}

// Utility functions
// Reveal the loading overlay by removing its 'd-none' class.
function showLoading() {
    const { loadingOverlay } = elements;
    loadingOverlay.classList.remove('d-none');
}

// Hide the loading overlay by adding the 'd-none' class back.
function hideLoading() {
    const { loadingOverlay } = elements;
    loadingOverlay.classList.add('d-none');
}

// Put `message` into the error toast and display it.
function showError(message) {
    const { errorMessage, errorToast } = elements;
    errorMessage.textContent = message;
    new bootstrap.Toast(errorToast).show();
}

// Put `message` into the success toast and display it.
function showSuccess(message) {
    const { successMessage, successToast } = elements;
    successMessage.textContent = message;
    new bootstrap.Toast(successToast).show();
}

// Format a timestamp for display using the 'zh-CN' locale.
// Returns '' for empty input. A value that cannot be parsed as a date is
// returned unchanged: `new Date(...)` does not throw on bad input, it yields
// an invalid Date, so validity has to be checked explicitly.
function formatTimestamp(timestamp) {
    if (!timestamp) return '';

    const date = new Date(timestamp);
    if (Number.isNaN(date.getTime())) {
        return timestamp;
    }

    try {
        return date.toLocaleString('zh-CN');
    } catch (e) {
        // Locale formatting failed; fall back to the raw value.
        return timestamp;
    }
}

// Shorten `text` to at most `maxLength` characters, appending '...' when
// something was cut off. Empty or missing text yields ''.
function truncateText(text, maxLength = 100) {
    if (!text) {
        return '';
    }
    const fits = text.length <= maxLength;
    return fits ? text : `${text.substring(0, maxLength)}...`;
}

// Convert a byte count into a human readable string such as '1.5 KB'.
// Values are scaled by 1024 and rounded to at most two decimals. Missing,
// non-numeric, zero or negative input is reported as '0 B'; sizes beyond the
// largest known unit stay expressed in that unit.
function formatFileSize(bytes) {
    const units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB'];
    const step = 1024;

    let value = Number(bytes);
    if (!Number.isFinite(value) || value <= 0) return '0 B';

    // Repeated division keeps the unit index inside `units` and avoids the
    // floating-point error of a log ratio at exact unit boundaries.
    let unitIndex = 0;
    while (value >= step && unitIndex < units.length - 1) {
        value /= step;
        unitIndex++;
    }

    return parseFloat(value.toFixed(2)) + ' ' + units[unitIndex];
}

// Handle MCP tool call display
// Replaces every <use_mcp_tool>...</use_mcp_tool> section in `text` with a
// `[MCP_PLACEHOLDER_n]` token and stores the HTML rendering of that tool call
// in the `placeholders` Map under the key `MCP_PLACEHOLDER_n`.
// Non-string or empty input is returned unchanged.
// The server name, tool name and arguments come from trace content, so they
// are HTML-escaped before being placed into the generated markup.
function formatMcpToolCallWithPlaceholders(text, placeholders) {
    if (!text || typeof text !== 'string') return text;

    // Escape text taken from the trace before embedding it in HTML.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    // MCP tool call regex - more lenient matching, including newlines
    const mcpPattern = /<use_mcp_tool>\s*<server_name>(.*?)<\/server_name>\s*<tool_name>(.*?)<\/tool_name>\s*<arguments>\s*(.*?)\s*<\/arguments>\s*<\/use_mcp_tool>/gs;

    let placeholderCounter = 0;

    return text.replace(mcpPattern, (match, serverName, toolName, args) => {
        // Clean arguments and convert escaped newlines to actual newlines
        let formattedArgs = args.trim().replace(/\\n/g, '\n');

        try {
            // Pretty-print the arguments when they are valid JSON
            formattedArgs = JSON.stringify(JSON.parse(formattedArgs), null, 2);
        } catch (e) {
            // Not JSON: keep the cleaned text as is
        }

        const server = serverName.trim();
        const tool = toolName.trim();

        const isBrowserAgent = server === 'browsing-agent';
        const toolClass = isBrowserAgent ? 'browser-agent' : '';
        const iconClass = isBrowserAgent ? 'globe' : 'cog';

        const safeServer = escapeHtml(server);
        const safeTool = escapeHtml(tool);
        const safeArgs = escapeHtml(formattedArgs);

        // Create complete MCP tool call HTML structure
        const mcpHtml = `<div class="mcp-tool-call ${toolClass}">
    <div class="mcp-tool-header">
        <i class="fas fa-${iconClass}"></i>
        <span class="mcp-tool-name">${safeServer}.${safeTool}</span>
    </div>
    <div class="mcp-tool-content">
        <div class="mcp-xml-structure">
            <div class="xml-tag">&lt;use_mcp_tool&gt;</div>
            <div class="xml-content">
                <div class="xml-tag">&lt;server_name&gt;${safeServer}&lt;/server_name&gt;</div>
                <div class="xml-tag">&lt;tool_name&gt;${safeTool}&lt;/tool_name&gt;</div>
                <div class="xml-tag">&lt;arguments&gt;</div>
                <div class="xml-arguments">${safeArgs}</div>
                <div class="xml-tag">&lt;/arguments&gt;</div>
            </div>
            <div class="xml-tag">&lt;/use_mcp_tool&gt;</div>
        </div>
    </div>
</div>`;

        // Use simple placeholder ID to avoid complex JSON strings
        const placeholderId = `MCP_PLACEHOLDER_${placeholderCounter++}`;
        placeholders.set(placeholderId, mcpHtml);

        return `[${placeholderId}]`;
    });
}

// Create new format tool call HTML
// Builds the HTML card for one tool call object with the fields
// `server_name`, `tool_name`, `arguments`, and optionally `format` and `id`.
// String arguments are shown as is; other values are pretty-printed as JSON.
// All values are HTML-escaped before being placed into the markup, and a
// missing `server_name` / `tool_name` is rendered as an empty name.
function createNewFormatToolCallHTML(tool) {
    // Escape values taken from the trace before embedding them in HTML.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    const serverName = tool.server_name == null ? '' : String(tool.server_name);
    const toolName = tool.tool_name == null ? '' : String(tool.tool_name);

    const isBrowserAgent = serverName.includes('browsing') || serverName.includes('agent');
    const toolClass = isBrowserAgent ? 'browser-agent' : '';
    const iconClass = isBrowserAgent ? 'globe' : 'cog';

    // Format arguments
    let formattedArgs;
    try {
        if (typeof tool.arguments === 'string') {
            formattedArgs = tool.arguments;
        } else {
            formattedArgs = JSON.stringify(tool.arguments, null, 2);
        }
    } catch (e) {
        // e.g. circular structures cannot be serialized
        formattedArgs = String(tool.arguments);
    }

    const idHtml = tool.id
        ? `<div class="tool-id"><small class="text-muted">ID: ${escapeHtml(tool.id)}</small></div>`
        : '';

    return `<div class="mcp-tool-call ${toolClass}">
    <div class="mcp-tool-header">
        <i class="fas fa-${iconClass}"></i>
        <span class="mcp-tool-name">${escapeHtml(serverName)}.${escapeHtml(toolName)}</span>
        <span class="badge badge-format ms-2">${escapeHtml(tool.format || 'new')}</span>
    </div>
    <div class="mcp-tool-content">
        <div class="mcp-tool-args">
            <div class="mcp-args-label">Arguments:</div>
            <div class="xml-arguments">${escapeHtml(formattedArgs)}</div>
        </div>
        ${idHtml}
    </div>
</div>`;
}

/**
 * Render message text as HTML while leaving markdown syntax untouched.
 *
 * MCP tool calls are first swapped for placeholders, then the remaining text
 * is HTML-escaped and newlines become <br>, and finally each placeholder is
 * replaced by its pre-built tool-call HTML.
 *
 * @param {string} text - Raw message content.
 * @returns {string} HTML string; '' for empty or non-string input.
 */
function renderMarkdown(text) {
    if (!text || typeof text !== 'string') return '';

    const placeholders = new Map();

    // Extract MCP tool calls before escaping so their markup survives intact.
    let html = formatMcpToolCallWithPlaceholders(text, placeholders);

    // Escape HTML special characters; the placeholder tokens contain none of them.
    html = html.replace(/&/g, '&amp;')
               .replace(/</g, '&lt;')
               .replace(/>/g, '&gt;')
               .replace(/"/g, '&quot;')
               .replace(/'/g, '&#39;');

    // Only newlines are converted; all markdown syntax is preserved as text.
    html = html.replace(/\n/g, '<br>');

    // Use a replacer function: with a plain string replacement, sequences such
    // as "$&", "$'" or "$1" inside the tool-call HTML (e.g. in tool arguments)
    // would be expanded as special replacement patterns and corrupt the output.
    placeholders.forEach((htmlContent, placeholderId) => {
        html = html.replace(`[${placeholderId}]`, () => htmlContent);
    });

    return html;
}

// Enhanced content rendering helpers
/**
 * Tell whether a value is a string holding a JSON object or array.
 *
 * Only text that (after trimming) is wrapped in {...} or [...] is parsed;
 * JSON scalars such as "1" or "\"x\"" are reported as false.
 *
 * @param {*} str - Candidate value; non-strings yield false.
 * @returns {boolean} True when the trimmed text parses as a JSON object/array.
 */
function isJsonString(str) {
    try {
        const candidate = str.trim();
        const first = candidate[0];
        const last = candidate[candidate.length - 1];
        const wrappedAsObject = first === '{' && last === '}';
        const wrappedAsArray = first === '[' && last === ']';

        if (!wrappedAsObject && !wrappedAsArray) {
            return false;
        }

        // Throws on malformed JSON, which the catch below turns into false.
        JSON.parse(candidate);
        return true;
    } catch (e) {
        return false;
    }
}

/**
 * Pretty-print a JSON string inside a code block.
 *
 * The formatted JSON is HTML-escaped so that values containing markup
 * (for example page HTML returned by a tool) are shown as text instead of
 * being parsed when the result is inserted into the page.
 *
 * @param {string} content - Text expected to contain JSON.
 * @returns {string} Code-block HTML, or `content` unchanged when it cannot
 *     be trimmed/parsed as JSON.
 */
function formatJsonContent(content) {
    try {
        const parsed = JSON.parse(content.trim());
        const formatted = JSON.stringify(parsed, null, 4)
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;')
            .replace(/'/g, '&#39;');
        return `<div class="code-block"><pre><code>${formatted}</code></pre></div>`;
    } catch (e) {
        // Not JSON (or not a string): hand the input back untouched.
        return content;
    }
}

/**
 * Turn raw step content into display HTML.
 *
 * Pure JSON strings are pretty-printed as a code block; everything else goes
 * through renderMarkdown (which also handles MCP tool calls).
 *
 * @param {string} content - Raw content of a step.
 * @param {boolean} [isBrowserAgent=false] - Wrap non-JSON output in the
 *     browser-agent container when true.
 * @returns {string} HTML string; '' when content is falsy.
 */
function renderContent(content, isBrowserAgent = false) {
    if (!content) {
        return '';
    }

    // Pure JSON gets the code-block treatment and is never wrapped.
    if (isJsonString(content)) {
        return formatJsonContent(content);
    }

    const markup = renderMarkdown(content);

    // Browser-agent content receives its own styling wrapper.
    return isBrowserAgent
        ? `<div class="browser-agent-content">${markup}</div>`
        : markup;
}

// API call helper
/**
 * Perform a fetch request and return the parsed JSON response.
 *
 * A JSON Content-Type header is sent by default. Caller-supplied headers are
 * merged on top of that default instead of replacing the whole header set.
 *
 * @param {string} url - Request URL.
 * @param {Object} [options={}] - fetch() init options (method, body, headers, ...).
 * @returns {Promise<*>} Parsed JSON body of the response.
 * @throws {Error} When the response status is not OK, or when fetch/JSON
 *     parsing fails; the error is logged and rethrown.
 */
async function apiCall(url, options = {}) {
    try {
        const { headers: callerHeaders, ...requestOptions } = options;

        // Merge headers explicitly. Spreading `options` after `headers` (as a
        // single object literal) would let options.headers overwrite the merged
        // headers and silently drop the default Content-Type.
        const headers = new Headers({ 'Content-Type': 'application/json' });
        new Headers(callerHeaders || {}).forEach((value, name) => {
            headers.set(name, value);
        });

        const response = await fetch(url, { ...requestOptions, headers });

        if (!response.ok) {
            throw new Error(`HTTP error! status: ${response.status}`);
        }

        return await response.json();
    } catch (error) {
        console.error('API call failed:', error);
        throw error;
    }
}

// File management
/**
 * Point the directory input at the parent directory and load its file list.
 */
function setDefaultDirectory() {
    const { directoryInput } = elements;

    // The default directory is the parent directory.
    directoryInput.value = '../';

    // Load the file list right away (not awaited).
    refreshFileList();
}

/**
 * Load the file list for the directory typed into the directory input.
 * Shows an error instead when the input is blank.
 *
 * @returns {Promise<void>}
 */
async function browseDirectory() {
    const requestedDirectory = elements.directoryInput.value.trim();

    if (requestedDirectory) {
        await refreshFileList(requestedDirectory);
        return;
    }

    showError('请输入目录路径');
}

/**
 * Fetch the JSON trace files of a directory and rebuild the file dropdown.
 *
 * Resets the global file list/index, repopulates the <select>, reports the
 * outcome through showSuccess/showError and refreshes the prev/next buttons.
 *
 * @param {?string} [directory=null] - Directory to list; falls back to the
 *     trimmed value of the directory input when omitted.
 * @returns {Promise<void>}
 */
async function refreshFileList(directory = null) {
    // Show a single placeholder option and forget any previously loaded list.
    const clearFileState = (placeholderOption) => {
        elements.fileSelect.innerHTML = placeholderOption;
        currentFileList = [];
        currentFileIndex = -1;
    };

    try {
        const targetDirectory = directory || elements.directoryInput.value.trim();
        if (!targetDirectory) {
            clearFileState('<option value="">请先输入目录路径...</option>');
            updateNavigationButtons();
            return;
        }

        showLoading();

        const listing = await apiCall(`/api/list_files?directory=${encodeURIComponent(targetDirectory)}`);

        elements.fileSelect.innerHTML = '<option value="">选择Trace文件...</option>';

        if (listing.files.length === 0) {
            clearFileState('<option value="">该目录下没有JSON文件</option>');
            showSuccess(`目录 "${targetDirectory}" 下没有找到JSON文件`);
            updateNavigationButtons();
            return;
        }

        // Keep the list in the module-level state; nothing is selected yet.
        currentFileList = listing.files;
        currentFileIndex = -1;

        for (const [position, file] of listing.files.entries()) {
            const option = document.createElement('option');
            option.value = file.path;
            option.dataset.index = position;
            const sizeLabel = formatFileSize(file.size);
            const modifiedLabel = new Date(file.modified * 1000).toLocaleString('zh-CN');
            option.textContent = `${file.name} (${sizeLabel}, ${modifiedLabel})`;
            elements.fileSelect.appendChild(option);
        }

        showSuccess(`在目录 "${targetDirectory}" 中找到 ${listing.files.length} 个JSON文件`);
        updateNavigationButtons();

    } catch (error) {
        showError('获取文件列表失败: ' + error.message);
        clearFileState('<option value="">获取文件列表失败</option>');
        updateNavigationButtons();
    } finally {
        hideLoading();
    }
}

// File switching
/**
 * Sync currentFileIndex with the option currently selected in the dropdown.
 * Options without a data-index (the placeholder entry) are ignored.
 */
function onFileSelect() {
    const { fileSelect } = elements;
    const chosenOption = fileSelect.options[fileSelect.selectedIndex];

    if (!chosenOption || chosenOption.dataset.index === undefined) {
        return;
    }

    currentFileIndex = parseInt(chosenOption.dataset.index);
    updateNavigationButtons();
}

/**
 * Select and load the file before the current one, if there is one.
 */
function gotoPrevFile() {
    // Negated form keeps a NaN index from passing the check.
    if (!(currentFileIndex > 0)) {
        return;
    }

    currentFileIndex -= 1;
    selectFileByIndex(currentFileIndex);
    loadTraceFile();
}

/**
 * Select and load the file after the current one, if there is one.
 */
function gotoNextFile() {
    // Negated form keeps a NaN index from passing the check.
    if (!(currentFileIndex < currentFileList.length - 1)) {
        return;
    }

    currentFileIndex += 1;
    selectFileByIndex(currentFileIndex);
    loadTraceFile();
}

/**
 * Select the dropdown entry for the file at `index` in currentFileList.
 * Out-of-range indexes are ignored.
 *
 * @param {number} index - Zero-based position in currentFileList.
 */
function selectFileByIndex(index) {
    const isValidIndex = index >= 0 && index < currentFileList.length;
    if (!isValidIndex) {
        return;
    }

    // Shift by one: the first <option> is the "选择Trace文件..." placeholder.
    elements.fileSelect.selectedIndex = index + 1;
    currentFileIndex = index;
    updateNavigationButtons();
}

/**
 * Enable/disable the previous/next file buttons and refresh their tooltips
 * according to currentFileIndex and currentFileList.
 */
function updateNavigationButtons() {
    const { prevFileBtn, nextFileBtn } = elements;
    const canGoBack = currentFileIndex > 0;
    const canGoForward = currentFileIndex >= 0 && currentFileIndex < currentFileList.length - 1;

    prevFileBtn.disabled = !canGoBack;
    nextFileBtn.disabled = !canGoForward;

    // Tooltips: generic text when nothing is selected, neighbour names otherwise.
    const hasSelection = currentFileIndex >= 0 && currentFileList.length > 0;
    if (!hasSelection) {
        prevFileBtn.title = '上一个文件';
        nextFileBtn.title = '下一个文件';
        return;
    }

    const previousFile = canGoBack ? currentFileList[currentFileIndex - 1] : null;
    const followingFile = canGoForward ? currentFileList[currentFileIndex + 1] : null;

    prevFileBtn.title = previousFile ? `上一个: ${previousFile.name}` : '没有上一个文件';
    nextFileBtn.title = followingFile ? `下一个: ${followingFile.name}` : '没有下一个文件';
}

// Keyboard shortcut handling
/**
 * Global keydown handler.
 *
 * ArrowLeft/ArrowRight switch files, Enter loads the selected file and
 * Ctrl+R refreshes the file list. Nothing happens while a form control has
 * focus or while the message modal is shown.
 *
 * @param {KeyboardEvent} event - The keyboard event being handled.
 */
function handleKeyboardShortcuts(event) {
    // Leave typing and dropdown navigation alone.
    const focusedTag = event.target.tagName;
    if (['INPUT', 'TEXTAREA', 'SELECT'].includes(focusedTag)) {
        return;
    }

    // Shortcuts are disabled while the modal is open.
    if (elements.messageModal.classList.contains('show')) {
        return;
    }

    const { key } = event;

    if (key === 'ArrowLeft') {
        event.preventDefault();
        if (!elements.prevFileBtn.disabled) {
            gotoPrevFile();
        }
    } else if (key === 'ArrowRight') {
        event.preventDefault();
        if (!elements.nextFileBtn.disabled) {
            gotoNextFile();
        }
    } else if (key === 'Enter') {
        event.preventDefault();
        if (elements.fileSelect.value) {
            loadTraceFile();
        }
    } else if ((key === 'r' || key === 'R') && event.ctrlKey) {
        event.preventDefault();
        refreshFileList();
    }
}

/**
 * Load the trace file selected in the dropdown and refresh every panel.
 *
 * Posts the file path to /api/load_trace, then requests all summary
 * endpoints in parallel and passes each result to its update function.
 *
 * @returns {Promise<void>}
 */
async function loadTraceFile() {
    const selectedFile = elements.fileSelect.value;
    if (!selectedFile) {
        showError('请选择一个trace文件');
        return;
    }

    showLoading();

    try {
        // Ask the backend to load the file first.
        await apiCall('/api/load_trace', {
            method: 'POST',
            body: JSON.stringify({ file_path: selectedFile })
        });

        // Then request every summary in parallel; order matches the destructuring below.
        const summaryEndpoints = [
            '/api/basic_info',
            '/api/execution_summary',
            '/api/performance_summary',
            '/api/execution_flow',
            '/api/spans_summary',
            '/api/step_logs_summary'
        ];
        const [
            basicInfo,
            executionSummary,
            performanceSummary,
            executionFlow,
            spansStats,
            stepLogsStats
        ] = await Promise.all(summaryEndpoints.map((endpoint) => apiCall(endpoint)));

        // Refresh the panels.
        updateBasicInfo(basicInfo);
        updateExecutionSummary(executionSummary);
        updatePerformanceSummary(performanceSummary);
        updateExecutionFlow(executionFlow);
        updateSpansStats(spansStats);
        updateStepLogsStats(stepLogsStats);

        // Report which file was loaded, with its position when it is known.
        const loadedFile = currentFileList[currentFileIndex];
        const successMessage = loadedFile
            ? `文件加载成功: ${loadedFile.name} (${currentFileIndex + 1}/${currentFileList.length})`
            : '文件加载成功';
        showSuccess(successMessage);

    } catch (error) {
        showError('加载文件失败: ' + error.message);
    } finally {
        hideLoading();
    }
}

// UI update functions
/**
 * Render the basic-info panel (task id, final answer, ground truth, verdict).
 *
 * All values come from the trace file and are HTML-escaped before being
 * written with innerHTML, so answers containing "<", ">" or "&" are shown
 * literally instead of being parsed as markup.
 *
 * @param {Object} data - Basic info payload; reads task_id,
 *     final_boxed_answer, ground_truth, final_judge_result and judge_type.
 */
function updateBasicInfo(data) {
    currentBasicInfo = data;

    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    const finalAnswer = escapeHtml(data.final_boxed_answer || '暂无答案');
    const groundTruth = escapeHtml(data.ground_truth || '暂无正确答案');
    const judgeBadgeClass = data.final_judge_result === 'CORRECT' ? 'bg-success' : 'bg-danger';

    const html = `
        <div class="stat-item">
            <span class="stat-label">任务ID:</span>
            <span class="stat-value">${escapeHtml(data.task_id || 'N/A')}</span>
        </div>
        <div class="answer-box final-answer">
            <div class="answer-label">最终答案</div>
            <div class="answer-content">${finalAnswer}</div>
        </div>
        <div class="answer-box ground-truth">
            <div class="answer-label">正确答案</div>
            <div class="answer-content">${groundTruth}</div>
        </div>
        <div class="stat-item">
            <span class="stat-label">判断结果:</span>
            <span class="stat-value badge ${judgeBadgeClass}">${escapeHtml(data.final_judge_result || 'N/A')}</span>
        </div>
        <div class="stat-item">
            <span class="stat-label">判断类型:</span>
            <span class="stat-value">${escapeHtml(data.judge_type || 'N/A')}</span>
        </div>
    `;

    elements.basicInfo.innerHTML = html;
}

/**
 * Render the execution summary panel: step count, tool-call count, browser
 * session count and the number of browsing-agent.search_and_browse calls.
 *
 * @param {Object} data - Summary payload; reads total_steps,
 *     total_tool_calls, browser_sessions_count and tool_usage_distribution.
 */
function updateExecutionSummary(data) {
    const {
        total_steps: totalSteps,
        total_tool_calls: totalToolCalls,
        browser_sessions_count: browserSessions
    } = data;
    // Missing entry means the tool was never called.
    const searchAndBrowseCalls = data.tool_usage_distribution['browsing-agent.search_and_browse'] || 0;

    elements.executionSummary.innerHTML = `
        <div class="stat-item">
            <span class="stat-label">总步骤数:</span>
            <span class="stat-value">${totalSteps}</span>
        </div>
        <div class="stat-item">
            <span class="stat-label">工具调用次数:</span>
            <span class="stat-value">${totalToolCalls}</span>
        </div>
        <div class="stat-item">
            <span class="stat-label">Browser会话数:</span>
            <span class="stat-value">${browserSessions}</span>
        </div>
        <div class="stat-item">
            <span class="stat-label">browsing-agent.search_and_browse:</span>
            <span class="stat-value">${searchAndBrowseCalls}</span>
        </div>
    `;
}

/**
 * Render the performance panel: total wall time plus the totals of the
 * browsing_agent and main_agent entries of primary_breakdown.
 *
 * @param {Object} data - Performance payload; an empty or missing payload
 *     shows a "no data" message instead.
 */
function updatePerformanceSummary(data) {
    const panel = elements.performanceSummary;

    if (!data || Object.keys(data).length === 0) {
        panel.innerHTML = '<p class="text-muted">无性能数据</p>';
        return;
    }

    // Two-decimal total of a breakdown entry; plain 0 when the entry is absent.
    const entryTotal = (entry) => (entry ? (entry.total || 0).toFixed(2) : 0);

    const wallTime = (data.total_wall_time || 0).toFixed(2);
    const browsingTotal = entryTotal(data.primary_breakdown?.browsing_agent);
    const mainTotal = entryTotal(data.primary_breakdown?.main_agent);

    panel.innerHTML = `
        <div class="stat-item">
            <span class="stat-label">总执行时间:</span>
            <span class="stat-value">${wallTime}s</span>
        </div>
        <div class="stat-item">
            <span class="stat-label">browsing_agent:</span>
            <span class="stat-value">${browsingTotal}s</span>
        </div>
        <div class="stat-item">
            <span class="stat-label">main_agent:</span>
            <span class="stat-value">${mainTotal}s</span>
        </div>
    `;
}

/**
 * Rebuild the execution-flow panel from the list of steps.
 *
 * Stores the data in currentFlowData, renders one element per step, then
 * refreshes the navigation list and binds the collapse click handlers.
 *
 * @param {Array<Object>} data - Steps of the execution flow.
 */
function updateExecutionFlow(data) {
    currentFlowData = data;

    if (!data || data.length === 0) {
        elements.executionFlow.innerHTML = '<p class="text-muted">无执行流程数据</p>';
        updateNavigationList([]);
        return;
    }

    // Each step becomes its own top-level child of the container.
    const container = document.createElement('div');
    container.className = 'execution-steps-container';

    for (const [position, step] of data.entries()) {
        // Parse the step markup in a scratch element and move its root node over.
        const scratch = document.createElement('div');
        scratch.innerHTML = createStepHTML(step, position);
        container.appendChild(scratch.firstElementChild);
    }

    elements.executionFlow.innerHTML = '';
    elements.executionFlow.appendChild(container);

    // Refresh the navigation list.
    updateNavigationList(data);

    // Attach the expand/collapse handlers to the new headers.
    bindStepEventListeners();
}

/**
 * Build the HTML for one execution step card.
 *
 * The card consists of a clickable header (role/agent badges, timestamp and
 * a content preview) and a collapsible body with the full content, the tool
 * calls, the browser session flow (when present) and a details button that
 * calls showFullMessage with the step id.
 *
 * @param {Object} step - Step record; reads role, agent, content_preview,
 *     full_content, tool_calls, browser_session, browser_flow, timestamp
 *     and step_id.
 * @param {number} index - Position of the step; used for the element ids
 *     `step-<index>` and `step-content-<index>`.
 * @returns {string} HTML markup for the step.
 */
function createStepHTML(step, index) {
    // Header colour class by role; any role other than user/tool/system is styled as assistant.
    const roleClass = step.role === 'user' ? 'user-message' : 
                     step.role === 'tool' ? 'tool-message' : 
                     step.role === 'system' ? 'system-message' : 
                     'assistant-message';
    const agentClass = step.agent.includes('browser') ? 'browser-agent' : '';
    
    // Render the preview and the full content
    const renderedPreview = renderContent(step.content_preview);
    const renderedFullContent = renderContent(step.full_content);
    
    // NOTE: step_id is interpolated unquoted into the inline onclick handler below.
    return `
        <div class="execution-step fade-in" data-step-id="${step.step_id}" data-agent="${step.agent}" id="step-${index}">
            <div class="step-header ${roleClass} ${agentClass}" data-toggle="collapse" data-target="#step-content-${index}">
                <div class="d-flex justify-content-between align-items-center">
                    <div>
                        <span class="badge badge-role badge-${step.role}">${step.role}</span>
                        <span class="badge badge-browser ms-2">${step.agent}</span>
                        ${step.tool_calls.length > 0 ? `<span class="badge bg-warning text-dark ms-2">${step.tool_calls.length} 工具调用</span>` : ''}
                        ${step.browser_session ? `<span class="badge bg-success ms-2">Browser会话</span>` : ''}
                    </div>
                    <div class="d-flex align-items-center">
                        <span class="timestamp me-2">${formatTimestamp(step.timestamp)}</span>
                        <span class="step-toggle">
                            <i class="fas fa-chevron-down"></i>
                        </span>
                    </div>
                </div>
                <div class="content-preview mt-2">
                    <div class="preview-text">
                        ${renderedPreview}
                    </div>
                </div>
            </div>
            
            <div class="step-content collapse" id="step-content-${index}">
                <div class="mb-3">
                    <h6>完整内容:</h6>
                    <div class="rendered-content">${renderedFullContent}</div>
                </div>
                
                ${step.tool_calls.length > 0 ? `
                    <div class="mb-3">
                        <h6>工具调用:</h6>
                        ${step.tool_calls.map(tool => createToolCallHTML(tool)).join('')}
                    </div>
                ` : ''}
                
                ${step.browser_flow && step.browser_flow.length > 0 ? `
                    <div class="mb-3">
                        <h6>Browser会话流程:</h6>
                        <div class="browser-session">
                            <div class="browser-session-header">
                                <i class="fas fa-globe"></i> ${step.browser_session} (${step.browser_flow.length} 步骤)
                            </div>
                            ${step.browser_flow.map(browserStep => createBrowserStepHTML(browserStep, index)).join('')}
                        </div>
                    </div>
                ` : ''}
                
                <div class="d-flex justify-content-end">
                    <button class="btn btn-outline-primary btn-sm" onclick="showFullMessage(${step.step_id})">
                        <i class="fas fa-expand"></i> 查看详情
                    </button>
                </div>
            </div>
        </div>
    `;
}

/**
 * Build the HTML for a single tool call.
 *
 * Calls recorded in the "new" format are delegated to
 * createNewFormatToolCallHTML; every other format (MCP or unspecified) uses
 * the legacy layout below. Values taken from the trace are HTML-escaped, as
 * the result is later inserted with innerHTML and tool arguments routinely
 * contain "<", ">" or "&" (code, HTML snippets, comparisons).
 *
 * @param {Object} tool - Tool call record; reads format, server_name,
 *     tool_name and arguments.
 * @returns {string} HTML markup for the tool call.
 */
function createToolCallHTML(tool) {
    // New-format tool calls have their own renderer.
    if (tool.format === 'new') {
        return createNewFormatToolCallHTML(tool);
    }

    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    // Legacy format (MCP or other). A missing server name must not throw.
    const serverName = String(tool.server_name ?? '');
    const isBrowserAgent = serverName === 'browsing-agent' || serverName.includes('agent');
    const toolClass = isBrowserAgent ? 'browser-agent' : '';

    return `
        <div class="tool-call ${toolClass}">
            <div class="tool-call-header">
                <i class="fas fa-${isBrowserAgent ? 'globe' : 'wrench'}"></i>
                ${escapeHtml(serverName)}.${escapeHtml(tool.tool_name)}
                <span class="badge badge-format ms-2">${escapeHtml(tool.format || 'mcp')}</span>
            </div>
            <div class="tool-arguments">
                <strong>参数:</strong>
                <div class="code-block">${escapeHtml(JSON.stringify(tool.arguments, null, 2))}</div>
            </div>
        </div>
    `;
}

/**
 * Build the HTML for one step of a browser session nested inside a main step.
 *
 * Shows the role badge, tool-call count, timestamp and the content preview.
 * When the preview is shorter than the full content, an expand button that
 * calls toggleBrowserPreview is added. Tool calls are listed below.
 *
 * @param {Object} step - Browser step record; reads step_id, role,
 *     content_preview, full_content, tool_calls and timestamp.
 * @param {number} parentIndex - Index of the enclosing main step; combined
 *     with step_id to form the element ids.
 * @returns {string} HTML markup for the browser step.
 */
function createBrowserStepHTML(step, parentIndex) {
    // Unique id for this browser step
    const browserId = `browser-${parentIndex}-${step.step_id}`;
    
    // The content counts as truncated when the preview is shorter than the full content
    const isContentTruncated = step.full_content && step.content_preview.length < step.full_content.length;
    
    // Render the content (only the preview is used in the markup below)
    const renderedPreview = renderContent(step.content_preview);
    const renderedFullContent = renderContent(step.full_content);
    
    return `
        <div class="browser-step ${step.role}" id="browser-step-${parentIndex}-${step.step_id}">
            <div class="d-flex justify-content-between align-items-center mb-2">
                <div>
                    <span class="badge badge-role badge-${step.role}">${step.role}</span>
                    ${step.tool_calls.length > 0 ? `<span class="badge bg-warning text-dark ms-2">${step.tool_calls.length} 工具</span>` : ''}
                </div>
                <span class="timestamp">${formatTimestamp(step.timestamp)}</span>
            </div>
            <div class="content-preview" id="browser-preview-${browserId}">
                <div class="preview-text">
                    ${renderedPreview}
                    ${isContentTruncated ? `
                        <span class="text-muted">...</span>
                        <button class="btn btn-link btn-sm p-0 ms-2 expand-preview-btn" onclick="toggleBrowserPreview('${browserId}', ${parentIndex}, ${step.step_id})" data-expanded="false">
                            <i class="fas fa-chevron-down"></i> 展开
                        </button>
                    ` : ''}
                </div>
            </div>
            ${step.tool_calls.length > 0 ? `
                <div class="mt-2">
                    <h7>工具调用:</h7>
                    ${step.tool_calls.map(tool => createToolCallHTML(tool)).join('')}
                </div>
            ` : ''}
        </div>
    `;
}

/**
 * Render the spans statistics panel: total span count, total duration and a
 * count/duration section for every agent in agent_stats.
 *
 * @param {Object} data - Spans summary; an empty or missing payload shows a
 *     "no data" message instead.
 */
function updateSpansStats(data) {
    const panel = elements.spansStats;

    if (!data || Object.keys(data).length === 0) {
        panel.innerHTML = '<p class="text-muted">无Spans数据</p>';
        return;
    }

    const totalSpans = data.total_spans;
    const totalDuration = (data.total_duration || 0).toFixed(2);

    // One section per agent with its span count and accumulated duration.
    const agentSections = Object.entries(data.agent_stats || {}).map(([agentName, agentStats]) => `
                <div class="mb-2">
                    <strong>${agentName}:</strong>
                    <div class="stat-item">
                        <span class="stat-label">数量:</span>
                        <span class="stat-value">${agentStats.count}</span>
                    </div>
                    <div class="stat-item">
                        <span class="stat-label">时长:</span>
                        <span class="stat-value">${(agentStats.total_duration || 0).toFixed(2)}s</span>
                    </div>
                </div>
            `).join('');

    panel.innerHTML = `
        <div class="stat-item">
            <span class="stat-label">总Spans数:</span>
            <span class="stat-value">${totalSpans}</span>
        </div>
        <div class="stat-item">
            <span class="stat-label">总时长:</span>
            <span class="stat-value">${totalDuration}s</span>
        </div>
        <div class="mt-3">
            <h6>Agent统计:</h6>
            ${agentSections}
        </div>
    `;
}

/**
 * Render the step-log statistics panel: total log count plus the status and
 * step-type distributions.
 *
 * @param {Object} data - Step logs summary; an empty or missing payload
 *     shows a "no data" message instead.
 */
function updateStepLogsStats(data) {
    const panel = elements.stepLogsStats;

    if (!data || Object.keys(data).length === 0) {
        panel.innerHTML = '<p class="text-muted">无步骤日志数据</p>';
        return;
    }

    // One label/value row per entry of a name -> count mapping.
    const distributionRows = (distribution) => Object.entries(distribution || {}).map(([label, count]) => `
                <div class="stat-item">
                    <span class="stat-label">${label}:</span>
                    <span class="stat-value">${count}</span>
                </div>
            `).join('');

    panel.innerHTML = `
        <div class="stat-item">
            <span class="stat-label">总日志数:</span>
            <span class="stat-value">${data.total_logs}</span>
        </div>
        <div class="mt-3">
            <h6>状态分布:</h6>
            ${distributionRows(data.status_distribution)}
        </div>
        <div class="mt-3">
            <h6>步骤类型分布:</h6>
            ${distributionRows(data.step_type_distribution)}
        </div>
    `;
}

// Event handling functions
/**
 * Attach a click handler to every step header that toggles the `show` class
 * on the element named by its data-target attribute and flips the chevron
 * icon accordingly.
 */
function bindStepEventListeners() {
    const headers = document.querySelectorAll('.step-header');

    for (const header of headers) {
        header.addEventListener('click', () => {
            const targetSelector = header.getAttribute('data-target');
            const body = document.querySelector(targetSelector);
            const chevron = header.querySelector('.step-toggle i');

            // toggle() returns true when the class is present afterwards.
            const nowExpanded = body.classList.toggle('show');
            chevron.className = nowExpanded ? 'fas fa-chevron-up' : 'fas fa-chevron-down';
        });
    }
}

/**
 * Expand every step body and every collapsed browser-step preview.
 */
function expandAllSteps() {
    // Open all main-agent step bodies and point their chevrons up.
    for (const body of document.querySelectorAll('.step-content')) {
        body.classList.add('show');
    }
    for (const chevron of document.querySelectorAll('.step-toggle i')) {
        chevron.className = 'fas fa-chevron-up';
    }

    // Expand browser-agent previews that are not expanded yet by clicking their buttons.
    for (const button of document.querySelectorAll('.expand-preview-btn')) {
        if (button.getAttribute('data-expanded') !== 'true') {
            button.click();
        }
    }
}

/**
 * Collapse every step body and every expanded browser-step preview.
 */
function collapseAllSteps() {
    // Close all main-agent step bodies and point their chevrons down.
    for (const body of document.querySelectorAll('.step-content')) {
        body.classList.remove('show');
    }
    for (const chevron of document.querySelectorAll('.step-toggle i')) {
        chevron.className = 'fas fa-chevron-down';
    }

    // Collapse browser-agent previews that are currently expanded by clicking their buttons.
    for (const button of document.querySelectorAll('.expand-preview-btn')) {
        if (button.getAttribute('data-expanded') === 'true') {
            button.click();
        }
    }
}

// Toggle a browser step's content preview between collapsed and expanded
/**
 * Switch a browser step's preview between the short preview and the full
 * content, re-rendering the text together with its toggle button.
 *
 * @param {string} browserId - Suffix of the preview element id
 *     (`browser-preview-<browserId>`).
 * @param {number} parentIndex - Index of the parent step in currentFlowData.
 * @param {*} browserStepId - step_id of the browser step inside the parent's
 *     browser_flow (compared with ===).
 */
function toggleBrowserPreview(browserId, parentIndex, browserStepId) {
    const previewElement = document.getElementById(`browser-preview-${browserId}`);
    const toggleButton = previewElement.querySelector('.expand-preview-btn');
    const wasExpanded = toggleButton.getAttribute('data-expanded') === 'true';

    if (!currentFlowData) return;

    const browserFlow = currentFlowData[parentIndex]?.browser_flow;
    if (!browserFlow) return;

    const browserStep = browserFlow.find(candidate => candidate.step_id === browserStepId);
    if (!browserStep) return;

    let markup;
    if (wasExpanded) {
        // Collapse: back to the preview followed by an "expand" button.
        const renderedPreview = renderContent(browserStep.content_preview);
        markup = `
            ${renderedPreview}
            <span class="text-muted">...</span>
            <button class="btn btn-link btn-sm p-0 ms-2 expand-preview-btn" onclick="toggleBrowserPreview('${browserId}', ${parentIndex}, ${browserStepId})" data-expanded="false">
                <i class="fas fa-chevron-down"></i> 展开
            </button>
        `;
    } else {
        // Expand: full content followed by a "collapse" button.
        const renderedFullContent = renderContent(browserStep.full_content);
        markup = `
            ${renderedFullContent}
            <button class="btn btn-link btn-sm p-0 ms-2 expand-preview-btn" onclick="toggleBrowserPreview('${browserId}', ${parentIndex}, ${browserStepId})" data-expanded="true">
                <i class="fas fa-chevron-up"></i> 收起
            </button>
        `;
    }

    previewElement.querySelector('.preview-text').innerHTML = markup;
}

/**
 * Open the message modal with the full details of one step: metadata, full
 * content, tool calls with their arguments and, when present, an accordion
 * of the browser session steps.
 *
 * Values taken from the trace (ids, agent, role, tool names and arguments)
 * are HTML-escaped before being written with innerHTML; tool arguments in
 * particular often contain "<", ">" or "&". Content produced by
 * renderContent is already HTML and is inserted as is.
 *
 * @param {*} stepId - step_id of the step to show (compared with ===).
 *     Nothing happens when no flow data is loaded or no step matches.
 */
function showFullMessage(stepId) {
    if (!currentFlowData) return;

    const step = currentFlowData.find(s => s.step_id === stepId);
    if (!step) return;

    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
    // Pretty-printed, escaped JSON for a tool's arguments.
    const formatArguments = (args) => escapeHtml(JSON.stringify(args, null, 2));
    const toolLabel = (tool) => `${escapeHtml(tool.server_name)}.${escapeHtml(tool.tool_name)}`;

    const renderedFullContent = renderContent(step.full_content);

    const modal = new bootstrap.Modal(elements.messageModal);
    elements.messageContent.innerHTML = `
        <div class="mb-3">
            <h6>步骤信息:</h6>
            <div class="row">
                <div class="col-md-4"><strong>Step ID:</strong> ${escapeHtml(step.step_id)}</div>
                <div class="col-md-4"><strong>Agent:</strong> ${escapeHtml(step.agent)}</div>
                <div class="col-md-4"><strong>Role:</strong> ${escapeHtml(step.role)}</div>
            </div>
            <div class="row mt-2">
                <div class="col-md-6"><strong>时间:</strong> ${formatTimestamp(step.timestamp)}</div>
                <div class="col-md-6"><strong>工具调用:</strong> ${step.tool_calls.length}</div>
            </div>
        </div>
        
        <div class="mb-3">
            <h6>完整内容:</h6>
            <div class="rendered-content">${renderedFullContent}</div>
        </div>
        
        ${step.tool_calls.length > 0 ? `
            <div class="mb-3">
                <h6>工具调用详情:</h6>
                ${step.tool_calls.map(tool => `
                    <div class="card mb-2">
                        <div class="card-body">
                            <h7 class="card-title">${toolLabel(tool)}</h7>
                            <div class="code-block">${formatArguments(tool.arguments)}</div>
                        </div>
                    </div>
                `).join('')}
            </div>
        ` : ''}
        
        ${step.browser_flow && step.browser_flow.length > 0 ? `
            <div class="mb-3">
                <h6>Browser会话详情:</h6>
                <div class="accordion" id="browserAccordion">
                    ${step.browser_flow.map((browserStep, index) => {
                        const renderedBrowserContent = renderContent(browserStep.full_content);
                        return `
                            <div class="accordion-item">
                                <h2 class="accordion-header">
                                    <button class="accordion-button collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#browserStep${index}">
                                        Browser Step ${index + 1}: ${escapeHtml(browserStep.role)}
                                        ${browserStep.tool_calls.length > 0 ? `(${browserStep.tool_calls.length} 工具调用)` : ''}
                                    </button>
                                </h2>
                                <div id="browserStep${index}" class="accordion-collapse collapse">
                                    <div class="accordion-body">
                                        <div class="rendered-content">${renderedBrowserContent}</div>
                                        ${browserStep.tool_calls.length > 0 ? `
                                            <div class="mt-2">
                                                <strong>工具调用:</strong>
                                                ${browserStep.tool_calls.map(tool => `
                                                    <div class="small text-muted">
                                                        ${toolLabel(tool)}
                                                    </div>
                                                `).join('')}
                                            </div>
                                        ` : ''}
                                    </div>
                                </div>
                            </div>
                        `;
                    }).join('')}
                </div>
            </div>
        ` : ''}
    `;

    modal.show();
}

// ==================== Navigation ====================

/**
 * Rebuild the side navigation list from the execution flow.
 *
 * Each step becomes a clickable entry (scrollToStep) showing its number,
 * role and a truncated content summary. Steps with a browser flow also get
 * a toggle (toggleBrowserNav) and a nested list of browser sub-steps
 * (scrollToBrowserStep).
 *
 * @param {Array<Object>} data - Steps of the execution flow; an empty or
 *     missing list shows a "no steps" message.
 */
function updateNavigationList(data) {
    if (!data || data.length === 0) {
        elements.navigationList.innerHTML = '<p class="text-muted p-3 mb-0">暂无步骤</p>';
        return;
    }
    
    const navigationHTML = data.map((step, index) => {
        // NOTE(review): the summary is interpolated into HTML as returned by
        // truncateText; whether that helper escapes markup is not visible here — confirm.
        const summary = truncateText(step.content_preview, 50);
        const toolsInfo = step.tool_calls.length > 0 ? ` (${step.tool_calls.length}工具)` : '';
        const browserInfo = step.browser_session ? ' [浏览器]' : '';
        
        let html = `
            <div class="nav-item" data-step-index="${index}" onclick="scrollToStep(${index})">
                <div class="d-flex align-items-center">
                    <span class="step-number">${index + 1}</span>
                    <span class="step-role ${step.role}">${step.role}</span>
                    ${step.browser_flow && step.browser_flow.length > 0 ? `
                        <span class="browser-toggle" onclick="toggleBrowserNav(${index}, event)">
                            <i class="fas fa-chevron-down"></i>
                        </span>
                    ` : ''}
                </div>
                <div class="step-summary">${summary}${toolsInfo}${browserInfo}</div>
            </div>
        `;
        
        // Append the browser sub-steps, numbered "<step>.<sub-step>"
        if (step.browser_flow && step.browser_flow.length > 0) {
            html += `
                <div class="browser-sub-steps" id="browser-nav-${index}">
                    ${step.browser_flow.map((browserStep, browserIndex) => {
                        const browserSummary = truncateText(browserStep.content_preview, 40);
                        const browserToolsInfo = browserStep.tool_calls.length > 0 ? ` (${browserStep.tool_calls.length}工具)` : '';
                        
                        return `
                            <div class="nav-item browser-sub-step" data-step-index="${index}" data-browser-step-id="${browserStep.step_id}" onclick="scrollToBrowserStep(${index}, ${browserStep.step_id})">
                                <div class="d-flex align-items-center">
                                    <span class="step-number">${index + 1}.${browserIndex + 1}</span>
                                    <span class="step-role ${browserStep.role}">${browserStep.role}</span>
                                </div>
                                <div class="step-summary">${browserSummary}${browserToolsInfo}</div>
                            </div>
                        `;
                    }).join('')}
                </div>
            `;
        }
        
        return html;
    }).join('');
    
    elements.navigationList.innerHTML = navigationHTML;
}

/**
 * Smooth-scroll to the element with id `step-<stepIndex>`, mark the matching
 * navigation entry as active, and expand the step if it is collapsed.
 * Does nothing when no such element exists.
 *
 * @param {number} stepIndex - Index embedded in the step element ids.
 */
function scrollToStep(stepIndex) {
    const target = document.getElementById(`step-${stepIndex}`);
    if (!target) {
        return;
    }

    target.scrollIntoView({ behavior: 'smooth', block: 'start' });

    // Keep the navigation highlight in sync with the step we jumped to.
    updateActiveNavItem(stepIndex);

    // A collapsed step is expanded automatically so its content is visible.
    const body = document.getElementById(`step-content-${stepIndex}`);
    const isCollapsed = body && !body.classList.contains('show');
    if (isCollapsed) {
        // toggle:false prevents the constructor itself from flipping the state.
        new bootstrap.Collapse(body, { toggle: false }).show();
    }
}

/**
 * Smooth-scroll to the element with id
 * `browser-step-<parentIndex>-<browserStepId>`, mark the matching navigation
 * sub-entry as active, and make sure the parent step is expanded.
 * Does nothing when no such element exists.
 *
 * @param {number} parentIndex - Index of the step that owns the browser flow.
 * @param {number} browserStepId - Id of the sub-step inside that flow.
 */
function scrollToBrowserStep(parentIndex, browserStepId) {
    const target = document.getElementById(`browser-step-${parentIndex}-${browserStepId}`);
    if (!target) {
        return;
    }

    target.scrollIntoView({ behavior: 'smooth', block: 'start' });

    // Keep the navigation highlight in sync with the sub-step we jumped to.
    updateActiveNavItem(parentIndex, browserStepId);

    // Make sure the parent step is expanded.
    const parentBody = document.getElementById(`step-content-${parentIndex}`);
    const isCollapsed = parentBody && !parentBody.classList.contains('show');
    if (isCollapsed) {
        // toggle:false prevents the constructor itself from flipping the state.
        new bootstrap.Collapse(parentBody, { toggle: false }).show();
    }
}

/**
 * Expand or collapse the browser sub-step list `browser-nav-<stepIndex>` and
 * flip the chevron icon of the toggle that was clicked.
 *
 * @param {number} stepIndex - Index embedded in the sub-step list id.
 * @param {Event} event - Click event raised on (or inside) a `.browser-toggle`.
 */
function toggleBrowserNav(stepIndex, event) {
    // Stop the click from bubbling up to enclosing handlers.
    event.stopPropagation();

    const subStepList = document.getElementById(`browser-nav-${stepIndex}`);
    const chevron = event.target.closest('.browser-toggle').querySelector('i');

    const wasExpanded = subStepList.classList.contains('expanded');
    subStepList.classList.toggle('expanded', !wasExpanded);
    // The chevron points down while collapsed and up while expanded.
    chevron.className = wasExpanded ? 'fas fa-chevron-down' : 'fas fa-chevron-up';
}

/**
 * Highlight exactly one entry of the navigation list.
 *
 * When `browserStepId` is given, the browser sub-step entry carrying both
 * `data-step-index` and `data-browser-step-id` is activated; otherwise the
 * main step entry (the one without `data-browser-step-id`) is activated.
 *
 * @param {number} activeIndex - Index of the (parent) step to highlight.
 * @param {?number} [browserStepId=null] - Id of the browser sub-step, or
 *     null/undefined to highlight the main step itself.
 */
function updateActiveNavItem(activeIndex, browserStepId = null) {
    // Clear every existing highlight first.
    const navItems = elements.navigationList.querySelectorAll('.nav-item');
    navItems.forEach(item => item.classList.remove('active'));

    // Sub-step ids start at 0, so test for "absent" explicitly rather than
    // relying on truthiness (which would treat id 0 as "no sub-step").
    const hasBrowserStep = browserStepId !== null && browserStepId !== undefined;

    const selector = hasBrowserStep
        ? `[data-step-index="${activeIndex}"][data-browser-step-id="${browserStepId}"]`
        : `[data-step-index="${activeIndex}"]:not([data-browser-step-id])`;

    const target = elements.navigationList.querySelector(selector);
    if (target) {
        target.classList.add('active');
    }
}

// Listen for scroll events and keep the navigation highlight up to date.
// Handle of the pending debounce timer; replaced on every scroll event.
let scrollTimeout;

/**
 * Debounced scroll handler (100 ms).
 *
 * Picks the `.browser-step` element whose top edge is closest to one third of
 * the viewport height (ignoring elements whose top is below 70% of the
 * viewport). If no browser sub-step qualifies, the same search is run over
 * the `.execution-step` elements. The winner is passed to
 * `updateActiveNavItem`. Nothing happens while no flow data is loaded.
 */
function handleScroll() {
    clearTimeout(scrollTimeout);
    scrollTimeout = setTimeout(() => {
        if (!currentFlowData) return;

        const steps = document.querySelectorAll('.execution-step');
        const browserSteps = document.querySelectorAll('.browser-step');
        const windowHeight = window.innerHeight;

        let activeIndex = 0;
        let activeBrowserStepId = null;
        let minDistance = Infinity;

        // First pass: browser sub-steps.
        browserSteps.forEach((browserStep) => {
            const rect = browserStep.getBoundingClientRect();
            const distance = Math.abs(rect.top - windowHeight / 3);

            if (distance < minDistance && rect.top < windowHeight * 0.7) {
                minDistance = distance;
                // Ids look like "browser-step-<parentIndex>-<stepId>".
                const matches = browserStep.id.match(/browser-step-(\d+)-(\d+)/);
                if (matches) {
                    activeIndex = parseInt(matches[1], 10);
                    activeBrowserStepId = parseInt(matches[2], 10);
                }
            }
        });

        // Second pass: main steps, only when no browser sub-step was chosen.
        // Compare against null explicitly: sub-step id 0 is a valid match and
        // must not be mistaken for "nothing found".
        if (activeBrowserStepId === null) {
            steps.forEach((step, index) => {
                const rect = step.getBoundingClientRect();
                const distance = Math.abs(rect.top - windowHeight / 3);

                if (distance < minDistance && rect.top < windowHeight * 0.7) {
                    minDistance = distance;
                    activeIndex = index;
                    activeBrowserStepId = null;
                }
            });
        }

        updateActiveNavItem(activeIndex, activeBrowserStepId);
    }, 100);
}

// Register the scroll listener.
window.addEventListener('scroll', handleScroll);

================================================
FILE: apps/visualize-trace/templates/index.html
================================================
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Trace Analysis Dashboard</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet">
    <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet">
    <link href="{{ url_for('static', filename='css/style.css') }}" rel="stylesheet">
</head>
<body>
    <div class="container-fluid">
        <!-- Header -->
        <div class="row">
            <div class="col-12">
                <nav class="navbar navbar-expand-lg navbar-dark bg-primary">
                    <div class="container-fluid">
                        <a class="navbar-brand" href="#">
                            <i class="fas fa-chart-line"></i> Trace Analysis Dashboard
                        </a>
                        <div class="navbar-nav ms-auto">
                            <div class="nav-item me-2">
                                <div class="input-group input-group-sm">
                                    <span class="input-group-text">Directory:</span>
                                    <input type="text" class="form-control" id="directoryInput" placeholder="Enter directory path..." style="width: 200px;">
                                    <button class="btn btn-outline-light" type="button" id="browseDirectoryBtn">
                                        <i class="fas fa-folder-open"></i>
                                    </button>
                                </div>
                            </div>
                            <div class="nav-item">
                                <div class="input-group input-group-sm file-navigation">
                                    <button class="btn btn-outline-light nav-btn" type="button" id="prevFileBtn" title="Previous file">
                                        <i class="fas fa-chevron-left"></i>
                                    </button>
                                    <select class="form-select form-select-sm" id="fileSelect" style="min-width: 250px;">
                                        <option value="">Select Trace file...</option>
                                    </select>
                                    <button class="btn btn-outline-light nav-btn" type="button" id="nextFileBtn" title="Next file">
                                        <i class="fas fa-chevron-right"></i>
                                    </button>
                                </div>
                            </div>
                            <button class="btn btn-outline-light btn-sm ms-2" id="loadBtn">
                                <i class="fas fa-upload"></i> Load
                            </button>
                            <button class="btn btn-outline-light btn-sm ms-2" id="refreshBtn">
                                <i class="fas fa-sync"></i> Refresh
                            </button>
                        </div>
                    </div>
                </nav>
            </div>
        </div>

        <!-- Top summary information -->
        <div class="row mt-3">
            <div class="col-12">
                <div class="card summary-panel">
                    <div class="card-body">
                        <div class="row">
                            <!-- Basic information -->
                            <div class="col-md-4">
                                <h6><i class="fas fa-info-circle"></i> Basic Information</h6>
                                <div id="basicInfo">
                                    <p class="text-muted">Please load a trace file first</p>
                                </div>
                            </div>
                            
                            <!-- Execution summary -->
                            <div class="col-md-4">
                                <h6><i class="fas fa-chart-pie"></i> Execution Summary</h6>
                                <div id="executionSummary">
                                    <p class="text-muted">Please load a trace file first</p>
                                </div>
                            </div>
                            
                            <!-- Performance summary -->
                            <div class="col-md-4">
                                <h6><i class="fas fa-clock"></i> Performance Summary</h6>
                                <div id="performanceSummary">
                                    <p class="text-muted">Please load a trace file first</p>
                                </div>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <!-- Main content -->
        <div class="row mt-3">
            <!-- Left panel - Step navigation -->
            <div class="col-md-2">
                <div class="card navigation-panel">
                    <div class="card-header">
                        <h6><i class="fas fa-list"></i> Step Navigation</h6>
                    </div>
                    <div class="card-body p-0">
                        <div class="navigation-list" id="navigationList">
                            <p class="text-muted p-3 mb-0">Please load a trace file first</p>
                        </div>
                    </div>
                </div>
            </div>

            <!-- Right panel - Execution flow -->
            <div class="col-md-10">
                <div class="card">
                    <div class="card-header d-flex justify-content-between align-items-center">
                        <h5><i class="fas fa-project-diagram"></i> Execution Flow</h5>
                        <div>
                            <button class="btn btn-outline-primary btn-sm" id="expandAllBtn">
                                <i class="fas fa-expand"></i> Expand All
                            </button>
                            <button class="btn btn-outline-primary btn-sm" id="collapseAllBtn">
                                <i class="fas fa-compress"></i> Collapse All
                            </button>
                        </div>
                    </div>
                    <div class="card-body" id="executionFlow">
                        <p class="text-muted">Please load a trace file first</p>
                    </div>
                </div>
            </div>
        </div>

        <!-- Bottom statistics -->
        <div class="row mt-3">
            <div class="col-md-6">
                <div class="card">
                    <div class="card-header">
                        <h5><i class="fas fa-layer-group"></i> Spans Statistics</h5>
                    </div>
                    <div class="card-body" id="spansStats">
                        <p class="text-muted">Please load a trace file first</p>
                    </div>
                </div>
            </div>
            <div class="col-md-6">
                <div class="card">
                    <div class="card-header">
                        <h5><i class="fas fa-list-ul"></i> Step Logs Statistics</h5>
                    </div>
                    <div class="card-body" id="stepLogsStats">
                        <p class="text-muted">Please load a trace file first</p>
                    </div>
                </div>
            </div>
        </div>
    </div>

    <!-- Keyboard shortcuts hint -->
    <div class="position-fixed bottom-0 start-0 p-3" style="z-index: 10;">
        <div class="card border-0 shadow-sm" style="background-color: rgba(0,0,0,0.8); color: white; font-size: 12px;">
            <div class="card-body p-2">
                <div class="text-center">
                    <strong>Shortcuts:</strong> 
                    <span class="badge bg-secondary mx-1">←→</span> Switch files 
                    <span class="badge bg-secondary mx-1">Enter</span> Load 
                    <span class="badge bg-secondary mx-1">Ctrl+R</span> Refresh
                </div>
            </div>
        </div>
    </div>

    <!-- Message details modal -->
    <div class="modal fade" id="messageModal" tabindex="-1">
        <div class="modal-dialog modal-lg">
            <div class="modal-content">
                <div class="modal-header">
                    <h5 class="modal-title">Message Details</h5>
                    <button type="button" class="btn-close" data-bs-dismiss="modal"></button>
                </div>
                <div class="modal-body">
                    <div id="messageContent"></div>
                </div>
            </div>
        </div>
    </div>

    <!-- Loading overlay -->
    <div class="loading-overlay d-none" id="loadingOverlay">
        <div class="spinner-border text-primary" role="status">
            <span class="visually-hidden">Loading...</span>
        </div>
    </div>

    <!-- Toast notifications -->
    <div class="toast-container position-fixed top-0 end-0 p-3">
        <div id="errorToast" class="toast" role="alert">
            <div class="toast-header bg-danger text-white">
                <strong class="me-auto">Error</strong>
                <button type="button" class="btn-close btn-close-white" data-bs-dismiss="toast"></button>
            </div>
            <div class="toast-body" id="errorMessage"></div>
        </div>
        <div id="successToast" class="toast" role="alert">
            <div class="toast-header bg-success text-white">
                <strong class="me-auto">Success</strong>
                <button type="button" class="btn-close btn-close-white" data-bs-dismiss="toast"></button>
            </div>
            <div class="toast-body" id="successMessage"></div>
        </div>
    </div>

    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js"></script>
    <script src="{{ url_for('static', filename='js/script.js') }}"></script>
</body>
</html> 

================================================
FILE: apps/visualize-trace/trace_analyzer.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import json
import re
from typing import Any, Dict, List, Optional


class TraceAnalyzer:
    """
    Class for analyzing trace JSON files, convenient for reading and accessing important information

    Supports two tool call formats:
    1. Old format (MCP): Tool calls using XML tag format in content
    2. New format: Tool calls using tool_calls field directly in message
    """

    def __init__(self, json_file_path: str):
        """
        Initialize the analyzer and load the trace file immediately.

        Args:
            json_file_path: Path to the JSON file

        Raises:
            Exception: Propagated from ``_load_json`` when the file cannot be
                opened or parsed.
        """
        self.json_file_path = json_file_path
        # Parsed once here; the accessor methods read from this in-memory copy.
        self.data = self._load_json()

    def _load_json(self) -> Dict[str, Any]:
        """Load JSON file"""
        try:
            with open(self.json_file_path, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception as e:
            raise Exception(f"Failed to load JSON file: {e}")

    def _parse_new_format_tool_name(self, tool_name: str) -> tuple[str, str]:
        """
        Parse new format tool name

        Args:
            tool_name: New format tool name, for example:
                      - "tool-server_name-tool_name" format
                      - "agent-browsing-search_and_browse" format (browser agent)

        Returns:
            tuple: (server_name, actual_tool_name)
        """
        # Handle agent-browsing-* format (browser agent calls)
        if tool_name.startswith("agent-browsing-"):
            server_name = "agent-browsing"
            actual_tool_name = tool_name[len("agent-browsing-") :]
            return server_name, actual_tool_name

        # Handle other agent-* formats
        elif tool_name.startswith("agent-"):
            # Find the last '-' to split server_name and tool_name
            last_dash = tool_name.rfind("-")
            if last_dash > 6:  # There's content after "agent-"
                server_name = tool_name[:last_dash]
                actual_tool_name = tool_name[last_dash + 1 :]
            else:
                server_name = tool_name
                actual_tool_name = ""
            return server_name, actual_tool_name

        # Handle tool-server_name-tool_name format
        elif tool_name.startswith("tool-"):
            parts = tool_name.split("-", 2)
            if len(parts) >= 3:
                server_name = parts[1]
                actual_tool_name = parts[2]
            else:
                server_name = "unknown"
                actual_tool_name = tool_name
            return server_name, actual_tool_name

        # Other formats
        else:
            server_name = "unknown"
            actual_tool_name = tool_name
            return server_name, actual_tool_name

    # ==================== Basic Information ====================

    def get_basic_info(self) -> Dict[str, Any]:
        """Get basic information of the task"""
        return {
            "status": self.data.get("status"),
            "task_id": self.data.get("task_id"),
            "start_time": self.data.get("start_time"),
            "end_time": self.data.get("end_time"),
            "final_boxed_answer": self.data.get("final_boxed_answer"),
            "ground_truth": self.data.get("ground_truth"),
            "final_judge_result": self.data.get("final_judge_result"),
            "judge_type": self.data.get("judge_type"),
            "error": self.data.get("error", ""),
        }

    def get_performance_summary(self) -> Dict[str, Any]:
        """Get performance summary information"""
        trace_data = self.data.get("trace_data", {})
        return trace_data.get("performance_summary", {})

    # ==================== Main Agent Message History ====================

    def get_main_agent_history(self) -> Dict[str, Any]:
        """Get main agent message history"""
        return self.data.get("main_agent_message_history", {})

    def get_main_agent_messages(self) -> List[Dict[str, Any]]:
        """Get main agent message list"""
        history = self.get_main_agent_history()
        return history.get("message_history", [])

    # ==================== Browser Agent Message History ====================

    def get_browser_agent_sessions(self) -> Dict[str, Any]:
        """Get all browser agent sessions"""
        # Try two possible key names
        browser_sessions = self.data.get("browser_agent_message_history_sessions", {})
        if not browser_sessions:
            browser_sessions = self.data.get("sub_agent_message_history_sessions", {})
        return browser_sessions

    def get_browser_agent_session_messages(
        self, session_id: str
    ) -> List[Dict[str, Any]]:
        """Get message list for specified session"""
        sessions = self.get_browser_agent_sessions()
        session = sessions.get(session_id, {})
        return session.get("message_history", [])

    # ==================== MCP Tool Call Parsing ====================

    def parse_mcp_tool_call(self, text: str) -> Optional[Dict[str, Any]]:
        """Parse MCP tool call"""
        pattern = r"<use_mcp_tool>\s*<server_name>(.*?)</server_name>\s*<tool_name>(.*?)</tool_name>\s*<arguments>\s*(.*?)\s*</arguments>\s*</use_mcp_tool>"

        match = re.search(pattern, text, re.DOTALL)
        if match:
            server_name = match.group(1).strip()
            tool_name = match.group(2).strip()
            arguments_str = match.group(3).strip()

            try:
                arguments = json.loads(arguments_str)
            except json.JSONDecodeError:
                arguments = arguments_str

            return {
                "server_name": server_name,
                "tool_name": tool_name,
                "arguments": arguments,
            }

        return None

    def extract_text_content(self, content) -> str:
        """Extract text from message content"""
        if isinstance(content, list):
            text_parts = []
            for item in content:
                if isinstance(item, dict) and item.get("type") == "text":
                    text_parts.append(item.get("text", ""))
            return "".join(text_parts)
        return str(content)

    def analyze_conversation_flow(self) -> List[Dict[str, Any]]:
        """Analyze conversation flow, including tool calls"""
        flow_steps = []
        main_messages = self.get_main_agent_messages()
        sub_agent_sessions = self.get_browser_agent_sessions()

        sub_agent_call_count = 0

        for i, message in enumerate(main_messages):
            role = message.get("role")
            content = message.get("content", [])

            text_content = self.extract_text_content(content)

            step = {
                "step_id": i,
                "agent": "main_agent",
                "role": role,
                "content_preview": text_content[:200] + "..."
                if len(text_content) > 200
                else text_content,
                "full_content": text_content,
                "tool_calls": [],
                "browser_session": None,
                "timestamp": message.get("timestamp", ""),
                "browser_flow": [],
            }

            # If it's an assistant message, check for tool calls
            if role == "assistant":
                # Check for new format tool_calls
                if "tool_calls" in message and message["tool_calls"]:
                    for tool_call in message["tool_calls"]:
                        # Convert new format to unified format
                        if "function" in tool_call:
                            function_info = tool_call["function"]
                            tool_name = function_info.get("name", "")
                            arguments = function_info.get("arguments", "")

                            # Parse arguments string as JSON (if it's a string)
                            if isinstance(arguments, str):
                                try:
                                    arguments = json.loads(arguments)
                                except json.JSONDecodeError:
                                    pass

                            # Extract server_name from tool_name (if available)
                            server_name, actual_tool_name = (
                                self._parse_new_format_tool_name(tool_name)
                            )

                            parsed_tool_call = {
                                "server_name": server_name,
                                "tool_name": actual_tool_name,
                                "arguments": arguments,
                                "id": tool_call.get("id", ""),
                                "type": tool_call.get("type", "function"),
                                "format": "new",
                            }
                            step["tool_calls"].append(parsed_tool_call)

                            # Handle browser agent calls - maintain complete consistency with MCP format logic
                            if server_name.startswith("agent-"):
                                sub_agent_call_count += 1
                                session_id = f"{server_name}_{sub_agent_call_count}"
                                step["browser_session"] = session_id

                                # Analyze browser session conversation flow
                                if session_id in sub_agent_sessions:
                                    browser_flow = self.analyze_browser_session_flow(
                                        session_id
                                    )
                                    step["browser_flow"] = browser_flow
                            elif server_name.startswith("browsing-agent"):
                                sub_agent_call_count += 1
                                session_id = f"browser_agent_{sub_agent_call_count}"
                                step["browser_session"] = session_id

                                # Analyze browser session conversation flow
                                if session_id in sub_agent_sessions:
                                    browser_flow = self.analyze_browser_session_flow(
                                        session_id
                                    )
                                    step["browser_flow"] = browser_flow

                # Check for old format MCP tool calls (maintain compatibility)
                mcp_tool_call = self.parse_mcp_tool_call(text_content)
                if mcp_tool_call:
                    mcp_tool_call["format"] = "mcp"  # Mark as old format
                    step["tool_calls"].append(mcp_tool_call)

                    # If browsing-agent is called, associate browser session
                    if mcp_tool_call["server_name"].startswith("agent-"):
                        sub_agent_call_count += 1
                        session_id = (
                            f"{mcp_tool_call['server_name']}_{sub_agent_call_count}"
                        )
                        step["browser_session"] = session_id

                        # Analyze browser session conversation flow
                        if session_id in sub_agent_sessions:
                            browser_flow = self.analyze_browser_session_flow(session_id)
                            step["browser_flow"] = browser_flow
                    elif mcp_tool_call["server_name"].startswith("browsing-agent"):
                        sub_agent_call_count += 1
                        session_id = f"browser_agent_{sub_agent_call_count}"
                        step["browser_session"] = session_id

                        # Analyze browser session conversation flow
                        if session_id in sub_agent_sessions:
                            browser_flow = self.analyze_browser_session_flow(session_id)
                            step["browser_flow"] = browser_flow
            flow_steps.append(step)

        return flow_steps

    def analyze_browser_session_flow(self, session_id: str) -> List[Dict[str, Any]]:
        """Analyze browser session conversation flow"""
        browser_messages = self.get_browser_agent_session_messages(session_id)
        browser_flow = []

        for i, message in enumerate(browser_messages):
            role = message.get("role")
            content = message.get("content", [])

            text_content = self.extract_text_content(content)

            step = {
                "step_id": i,
                "agent": session_id,
                "role": role,
                "content_preview": text_content[:200] + "..."
                if len(text_content) > 200
                else text_content,
                "full_content": text_content,
                "tool_calls": [],
                "timestamp": message.get("timestamp", ""),
            }

            # If it's an assistant message, check for tool calls
            if role == "assistant":
                # Check for new format tool_calls
                if "tool_calls" in message and message["tool_calls"]:
                    for tool_call in message["tool_calls"]:
                        # Convert new format to unified format
                        if "function" in tool_call:
                            function_info = tool_call["function"]
                            tool_name = function_info.get("name", "")
                            arguments = function_info.get("arguments", "")

                            # Parse arguments string as JSON (if it's a string)
                            if isinstance(arguments, str):
                                try:
                                    arguments = json.loads(arguments)
                                except json.JSONDecodeError:
                                    pass

                            # Extract server_name from tool_name (if available)
                            server_name, actual_tool_name = (
                                self._parse_new_format_tool_name(tool_name)
                            )

                            parsed_tool_call = {
                                "server_name": server_name,
                                "tool_name": actual_tool_name,
                                "arguments": arguments,
                                "id": tool_call.get("id", ""),
                                "type": tool_call.get("type", "function"),
                                "format": "new",
                            }
                            step["tool_calls"].append(parsed_tool_call)

                # Check for old format MCP tool calls (maintain compatibility)
                mcp_tool_call = self.parse_mcp_tool_call(text_content)
                if mcp_tool_call:
                    mcp_tool_call["format"] = "mcp"  # Mark as old format
                    step["tool_calls"].append(mcp_tool_call)

            browser_flow.append(step)

        return browser_flow

    def get_execution_summary(self) -> Dict[str, Any]:
        """Get execution summary information"""
        flow_steps = self.analyze_conversation_flow()

        total_steps = len(flow_steps)
        tool_calls = []
        browser_sessions = []

        for step in flow_steps:
            if step["tool_calls"]:
                tool_calls.extend(step["tool_calls"])
            if step.get("browser_session"):
                browser_sessions.append(step["browser_session"])

            # Collect tool calls from browser sessions
            if step.get("browser_flow"):
                for browser_step in step["browser_flow"]:
                    if browser_step.get("tool_calls"):
                        tool_calls.extend(browser_step["tool_calls"])

        # Tool usage statistics
        tool_usage = {}
        for tool in tool_calls:
            # Choose appropriate key name generation method based on format
            if tool.get("format") == "new":
                # New format: use server_name.tool_name, if server_name is unknown then use only tool_name
                if tool.get("server_name") != "unknown":
                    key = f"{tool['server_name']}.{tool['tool_name']}"
                else:
                    key = tool["tool_name"]
            else:
                # Old format (MCP): maintain original method
                key = f"{tool['server_name']}.{tool['tool_name']}"
            tool_usage[key] = tool_usage.get(key, 0) + 1

        return {
            "total_steps": total_steps,
            "total_tool_calls": len(tool_calls),
            "browser_sessions_count": len(browser_sessions),
            "tool_usage_distribution": tool_usage,
            "browser_sessions": browser_sessions,
        }

    def get_spans_summary(self) -> Dict[str, Any]:
        """Aggregate the spans under ``self.data["trace_data"]["spans"]`` per agent.

        Spans are grouped by their ``agent_context`` value (``"unknown"`` when
        the key is absent). For each group the span count, the summed
        ``duration_seconds`` and the distinct span ``name`` values are kept.

        Returns:
            A dict with ``total_spans``, ``total_duration`` (sum over all
            spans) and ``agent_stats`` (agent -> ``count``, ``total_duration``
            and ``span_types``).
        """
        spans = self.data.get("trace_data", {}).get("spans", [])

        agent_stats = {}
        for span in spans:
            stats = agent_stats.setdefault(
                span.get("agent_context", "unknown"),
                {"count": 0, "total_duration": 0, "span_types": set()},
            )
            stats["count"] += 1
            stats["total_duration"] += span.get("duration_seconds", 0)
            stats["span_types"].add(span.get("name", "unknown"))

        # Replace each set with a sorted list: plain list(set) ordering is
        # arbitrary and can differ between runs, which makes the summary
        # non-reproducible. key=str keeps mixed-type names sortable.
        for stats in agent_stats.values():
            stats["span_types"] = sorted(stats["span_types"], key=str)

        return {
            "total_spans": len(spans),
            "total_duration": sum(span.get("duration_seconds", 0) for span in spans),
            "agent_stats": agent_stats,
        }

    def get_step_logs_summary(self) -> Dict[str, Any]:
        """Count the entries of ``self.data["step_logs"]`` by status and step name.

        Entries missing ``status`` or ``step_name`` are counted under
        ``"unknown"`` for that field.

        Returns:
            A dict with ``total_logs``, ``status_distribution`` and
            ``step_type_distribution``.
        """
        entries = self.data.get("step_logs", [])

        by_status: Dict[Any, int] = {}
        by_step_name: Dict[Any, int] = {}
        tallies = ((by_status, "status"), (by_step_name, "step_name"))

        for entry in entries:
            for tally, field in tallies:
                value = entry.get(field, "unknown")
                tally[value] = tally.get(value, 0) + 1

        return {
            "total_logs": len(entries),
            "status_distribution": by_status,
            "step_type_distribution": by_step_name,
        }


================================================
FILE: assets/LOCAL-TOOL-DEPLOYMENT.md
================================================
# Local Tool Deployment Guide

This guide explains how to deploy open-source tools locally for use with MiroThinker. These tools are optional enhancements that can replace commercial alternatives in your agent configuration.

## Overview

MiroThinker supports several optional open-source tools that you can deploy locally:

- **Audio Transcription**: Whisper-Large-v3-Turbo for transcribing audio files
- **Visual Question Answering**: Qwen2.5-VL-72B-Instruct for answering questions about images
- **Reasoning Engine**: Qwen3-235B-A22B-Thinking-2507 for complex reasoning tasks

These tools are used when you configure your agent with `tool-transcribe-os`, `tool-vqa-os`, or `tool-reasoning-os` in your agent configuration file.

## Prerequisites

- **GPU**: NVIDIA GPU with sufficient VRAM
- **Python 3.10+**
- **CUDA**: Compatible CUDA toolkit installed
- **Model Storage**: Sufficient disk space to download model checkpoints

## Tool Deployment

### 1. Audio Transcription Tool (`tool-transcribe-os`)

**Model**: [Whisper-Large-v3-Turbo](https://huggingface.co/openai/whisper-large-v3-turbo)

**Description**: Transcribes audio files (MP3, WAV, M4A, AAC, OGG, FLAC, WMA) to text. Supports both local files and remote URLs.

**Deployment with vLLM**:

```bash
# Install vLLM with audio support
pip install vllm==0.10.0
pip install vllm[audio]

# Start the server
vllm serve openai/whisper-large-v3-turbo \
  --served-model-name whisper-large-v3-turbo \
  --task transcription \
  --host 0.0.0.0 \
  --port 8000
```

**Configuration in `.env`**:

```bash
WHISPER_MODEL_NAME="openai/whisper-large-v3-turbo"
WHISPER_API_KEY=your_api_key  # Optional, if your server requires authentication
WHISPER_BASE_URL="http://0.0.0.0:8000/v1"
```

### 2. Visual Question Answering Tool (`tool-vqa-os`)

**Model**: [Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct)

**Description**: Answers questions about images. Supports local image files and URLs. Automatically encodes local images to Base64 for API requests. Compatible with JPEG, PNG, GIF formats.

**Deployment with SGLang**:

```bash
# Install SGLang
pip install sglang[all]

# Start the server
python3 -m sglang.launch_server \
  --model-path Qwen/Qwen2.5-VL-72B-Instruct \
  --tp 8 \
  --host 0.0.0.0 \
  --port 8001 \
  --trust-remote-code \
  --enable-metrics
```

**Configuration in `.env`**:

```bash
VISION_MODEL_NAME="Qwen/Qwen2.5-VL-72B-Instruct"
VISION_API_KEY=your_api_key  # Optional, if your server requires authentication
VISION_BASE_URL="http://0.0.0.0:8001/v1/chat/completions"
```

### 3. Reasoning Engine Tool (`tool-reasoning-os`)

**Model**: [Qwen3-235B-A22B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507)

**Description**: A reasoning service for solving complex analytical problems, such as advanced mathematics, puzzles, and riddles. Supports long-context reasoning tasks (up to 131K tokens).

**Deployment with SGLang**:

```bash
# Install SGLang
pip install sglang[all]

# Start the server
python3 -m sglang.launch_server \
  --model-path Qwen/Qwen3-235B-A22B-Thinking-2507 \
  --tp 8 \
  --host 0.0.0.0 \
  --port 8002 \
  --trust-remote-code \
  --context-length 131072 \
  --enable-metrics
```

**Configuration in `.env`**:

```bash
REASONING_MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507"
REASONING_API_KEY=your_api_key  # Optional, if your server requires authentication
REASONING_BASE_URL="http://0.0.0.0:8002/v1/chat/completions"
```

## Using Deployed Tools

Once you have deployed the tools, configure your agent to use them:

1. **Edit your agent configuration** (e.g., `apps/miroflow-agent/conf/agent/my_custom_config.yaml`):

```yaml
main_agent:
  tools:
    - tool-python
    - search_and_scrape_webpage
    - jina_scrape_llm_summary
    - tool-transcribe-os    # Use local Whisper deployment
    - tool-vqa-os           # Use local Qwen2.5-VL deployment
    - tool-reasoning-os     # Use local Qwen3-235B deployment
  max_turns: 400
```

2. **Configure environment variables** in `apps/miroflow-agent/.env` as shown in each tool's deployment section above.

1. **Run your agent**:

```bash
cd apps/miroflow-agent
uv run main.py llm=qwen-3 agent=my_custom_config llm.base_url=https://your_base_url/v1
```

## Commercial Alternatives

If you prefer not to deploy these tools locally, you can use commercial alternatives:

- **`tool-transcribe`**: Uses OpenAI's GPT-4o mini Transcribe API
- **`tool-vqa`**: Uses Claude Sonnet 3.7 API
- **`tool-reasoning`**: Uses Claude Sonnet 3.7 API

Simply replace `-os` versions with commercial versions in your agent configuration and configure the corresponding API keys (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`).

## Additional Resources

- **SGLang Documentation**: [https://sglang.readthedocs.io/](https://sglang.readthedocs.io/)
- **vLLM Documentation**: [https://docs.vllm.ai/](https://docs.vllm.ai/)
- **Model Cards**: Check HuggingFace model pages for specific requirements and recommendations


================================================
FILE: assets/QA.md
================================================
# MiroFlow QA Documentation

## Q1: Can I extract GAIA-Text-103 results from existing GAIA-Validation evaluations?

**Answer:** Yes! If you have completed GAIA-Validation evaluations, you can extract and re-grade the GAIA-Text-103 subset using our specialized tools.

### Step-by-Step Process

1. **Extract GAIA-Text-103 Tasks**

   ```bash
   # Extract text-103 tasks to a separate directory
   uv run benchmarks/subset_extraction/gaia-to-text-103-mover.py ../../logs/gaia-validation/0806/qwen_MiroThinker-32B-SFT_evaluation
   ```

   This creates a new directory: `gaia-text-103-extraction/qwen_MiroThinker-32B-SFT_evaluation`

1. **Re-grade with GAIA-Text-103 Evaluator**

   ```bash
   # Apply GAIA-Text-103 specific grading
   uv run benchmarks/subset_extraction/gaia-text-103-grader.py ../../logs/gaia-validation/0806/gaia-text-103-extraction
   ```

1. **Verify Results**

   ```bash
   # Check accuracy and generate statistics
   uv run benchmarks/check_progress/check_progress_gaia-validation-text-103.py ../../logs/gaia-validation/0806/gaia-text-103-extraction
   ```

## Q2: Does the choice of judgment model affect evaluation performance?

**Answer:** Yes, there is a measurable difference in evaluation outcomes between the two judgment models.

We have standardized on GPT-4.1-2025-04-14 as our primary judgment model for several practical reasons:

- **Ease of deployment:** No need to host additional GPU-intensive models
- **Consistency:** Aligns with evaluation standards used in other benchmarks (SimpleQA, BrowseComp)
- **Reproducibility:** Provides a consistent baseline for cross-evaluation comparisons

## Code Quality Checks

Before submitting a pull request, ensure your code meets our quality standards:

```bash
# Fix linting issues automatically
uv tool run ruff@0.8.0 check --fix .

# Format code according to our style guidelines
uv tool run ruff@0.8.0 format .
```

## Known Issues

- The context management component that runs before the summary requires further refinement to improve accuracy and reliability. The likely cause is that the length estimation is inaccurate.


================================================
FILE: assets/qwen3_nonthinking.jinja
================================================
 {#- ChatML-style chat template (turns delimited by <|im_start|> / <|im_end|>). Every comment here uses whitespace-stripping delimiters so the rendered prompt is unchanged. -#}
 {#- System turn: with tools, emit the optional system message followed by the tool signatures and the <tool_call> calling convention; without tools, emit only the system message if one is present. -#}
 {%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{#- Walk the messages backwards to record the index of the last user message that is not purely a <tool_response> wrapper; ns.multi_step_tool is cleared on the first match so only that one is kept. -#}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
    {%- set index = (messages|length - 1) - loop.index0 %}
    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
        {%- set ns.multi_step_tool = false %}
        {%- set ns.last_query_index = index %}
    {%- endif %}
{%- endfor %}
{#- Main rendering loop. User messages and any system message other than the first are emitted verbatim. -#}
{%- for message in messages %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {#- Take reasoning from reasoning_content when it is set; otherwise split it out of content at the closing think tag. -#}
        {%- set content = message.content %}
        {%- set reasoning_content = '' %}
        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
            {%- set reasoning_content = message.reasoning_content %}
        {%- else %}
            {%- if '</think>' in message.content %}
                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
            {%- endif %}
        {%- endif %}
        {#- A think block is rendered only for assistant turns after the last user query, and there only for the final message or when reasoning is non-empty; all other turns render content alone. -#}
        {%- if loop.index0 > ns.last_query_index %}
            {%- if loop.last or (not loop.last and reasoning_content) %}
                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
            {%- else %}
                {{- '<|im_start|>' + message.role + '\n' + content }}
            {%- endif %}
        {%- else %}
            {{- '<|im_start|>' + message.role + '\n' + content }}
        {%- endif %}
        {#- Each tool call is written as a <tool_call> JSON object; arguments that are already a string are emitted as-is, anything else goes through tojson. -#}
        {%- if message.tool_calls %}
            {%- for tool_call in message.tool_calls %}
                {%- if (loop.first and content) or (not loop.first) %}
                    {{- '\n' }}
                {%- endif %}
                {%- if tool_call.function %}
                    {%- set tool_call = tool_call.function %}
                {%- endif %}
                {{- '<tool_call>\n{"name": "' }}
                {{- tool_call.name }}
                {{- '", "arguments": ' }}
                {%- if tool_call.arguments is string %}
                    {{- tool_call.arguments }}
                {%- else %}
                    {{- tool_call.arguments | tojson }}
                {%- endif %}
                {{- '}\n</tool_call>' }}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {#- Consecutive tool messages share one user turn: the turn is opened before the first and closed after the last, and each result is wrapped in <tool_response> tags. -#}
        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- message.content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{#- Generation prompt: the assistant turn is opened with an already-closed, empty think block (the "nonthinking" behaviour the file name refers to). -#}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n<think>\n\n</think>\n\n' }}
{%- endif %}

================================================
FILE: justfile
================================================
# list available recipes
default:
    just --list

# lint monorepo
[group('precommit')]
lint:
    uv tool run ruff@0.8.0 check --fix .

# sort imports
[group('precommit')]
sort-imports:
    uv tool run ruff@0.8.0 check --select I --fix .

# format monorepo
[group('precommit')]
format:
    uv tool run ruff@0.8.0 format .

# check license
[group('precommit')]
check-license:
    uv run reuse lint

# insert license for contributor
insert-license:
    # https://reuse.readthedocs.io/en/stable/scripts.html#add-headers-to-staged-files-based-on-git-settings
    git diff --name-only --cached | xargs -I {} reuse annotate -c "$(git config --get user.name) <$(git config --get user.email)>" "{}"

# format markdown files (NUL-delimited so paths containing spaces are handled)
[group('precommit')]
format-md:
    find . -name "*.md" -type f -print0 | xargs -0 uv tool run mdformat@0.7.17

# run precommit before PR
[group('precommit')]
precommit: lint sort-imports format-md format


================================================
FILE: libs/miroflow-tools/README.md
================================================
# 🛠️ MiroFlow Tools

> A comprehensive tool management system and MCP (Model Context Protocol) server collection for MiroFlow, providing a unified interface to various AI capabilities including code execution, vision processing, audio transcription, web searching, reasoning, and document reading.

## ✨ Features

- **🔧 Unified Tool Management**: Centralized `ToolManager` for managing multiple MCP servers
- **🌐 Multiple Transport Protocols**: Support for both stdio and SSE (HTTP) connections
- **📦 Rich Tool Ecosystem**: Pre-built MCP servers for common AI tasks
- **⚙️ Flexible Configuration**: Tool blacklisting, timeout management, and custom server configurations
- **🛡️ Error Handling**: Robust retry logic and fallback mechanisms

## 📦 Installation

This package is a local dependency that is automatically installed when you run `uv sync` in the `apps/miroflow-agent` directory. No separate installation is required.

For standalone usage or development:

```bash
cd libs/miroflow-tools
uv sync
```

## 📋 MCP Servers Overview

Quick reference tables of all available MCP servers and their tools. Click on "Details" to jump to the full documentation.

### 📊 Tools Used in MiroThinker v1.0 and v1.5

The following tools were used in the MiroThinker v1.0 and v1.5 evaluation:

| Category                   | Server Name                 | Tools                                                                                                                | Key Environment Variables                                                                 | Link                                     |
|----------------------------|-----------------------------|----------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|------------------------------------------|
| **Execution Environment**  | `tool-python`               | `create_sandbox`, `run_command`, `run_python_code`                                                                   | `E2B_API_KEY`, `LOGS_DIR`                                                                 | [Details](#tool-python)                  |
| **File Management**        | `tool-python`               | `upload_file_from_local_to_sandbox`, `download_file_from_sandbox_to_local`, `download_file_from_internet_to_sandbox` | `E2B_API_KEY`, `LOGS_DIR`                                                                 | [Details](#tool-python)                  |
| **Information Retrieval**  | `search_and_scrape_webpage` | `google_search`                                                                                                      | `SERPER_API_KEY`, `SERPER_BASE_URL`                                                        | [Details](#search_and_scrape_webpage)    |
| **Information Retrieval**  | `jina_scrape_llm_summary`   | `scrape_and_extract_info`                                                                                            | `JINA_API_KEY`, `JINA_BASE_URL`, `SUMMARY_LLM_BASE_URL`, `SUMMARY_LLM_MODEL_NAME`, `SUMMARY_LLM_API_KEY` | [Details](#jina_scrape_llm_summary)      |

### 🔧 Additional Available Tools

The following tools are implemented but were not used in the MiroThinker v1.0/v1.5 evaluation:

| Category                    | Server Name          | Tools                                             | Key Environment Variables                                           | Link                           |
|-----------------------------|----------------------|---------------------------------------------------|---------------------------------------------------------------------|--------------------------------|
| **Web Searching**           | `tool-google-search` | `google_search`, `scrape_website`                 | `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL` | [Details](#tool-google-search) |
| **Web Searching (Sogou)**  | `tool-sogou-search` | `sogou_search`, `scrape_website`                 | `TENCENTCLOUD_SECRET_ID`, `TENCENTCLOUD_SECRET_KEY`, `JINA_API_KEY`, `JINA_BASE_URL` | [Details](#tool-sogou-search) |
| **Vision Processing**       | `tool-vqa`           | `visual_question_answering`                       | `ANTHROPIC_API_KEY`, `ANTHROPIC_BASE_URL`                            | [Details](#tool-vqa)           |
| **Vision Processing**       | `tool-vqa-os`        | `visual_question_answering`                       | `VISION_API_KEY`, `VISION_BASE_URL`, `VISION_MODEL_NAME`            | [Details](#tool-vqa-os)        |
| **Audio Processing**        | `tool-transcribe`    | `audio_transcription`, `audio_question_answering` | `OPENAI_API_KEY`, `OPENAI_BASE_URL`                                  | [Details](#tool-transcribe)    |
| **Audio Processing**        | `tool-transcribe-os` | `audio_transcription`                             | `WHISPER_API_KEY`, `WHISPER_BASE_URL`, `WHISPER_MODEL_NAME`         | [Details](#tool-transcribe-os) |
| **Document Reading**        | `tool-reading`       | `convert_to_markdown`                             | None required                                                       | [Details](#tool-reading)       |
| **Reasoning Engine**        | `tool-reasoning`     | `reasoning`                                       | `ANTHROPIC_API_KEY`, `ANTHROPIC_BASE_URL`                            | [Details](#tool-reasoning)     |
| **Reasoning Engine**        | `tool-reasoning-os`  | `reasoning`                                       | `REASONING_API_KEY`, `REASONING_BASE_URL`, `REASONING_MODEL_NAME`   | [Details](#tool-reasoning-os)  |

## 🚀 Quick Start

<details>
<summary>Click to expand code example</summary>

```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters

async def main():
    # Initialize tool manager with server configurations
    server_configs = [
        {
            "name": "tool-python",
            "params": StdioServerParameters(
                command="python",
                args=["-m", "miroflow_tools.mcp_servers.python_mcp_server"],
                env={"E2B_API_KEY": "your_e2b_api_key"}  # Required for Python execution
            )
        },
        # Add more server configurations...
    ]

    tool_manager = ToolManager(server_configs)

    # Get all available tool definitions
    tool_definitions = await tool_manager.get_all_tool_definitions()

    # Create a sandbox first
    sandbox_result = await tool_manager.execute_tool_call(
        server_name="tool-python",
        tool_name="create_sandbox",
        arguments={"timeout": 600}
    )

    # Extract sandbox_id from result
    sandbox_id = sandbox_result['result'].split('sandbox_id:')[-1].strip()

    # Execute a tool call
    result = await tool_manager.execute_tool_call(
        server_name="tool-python",
        tool_name="run_python_code",
        arguments={"code_block": "print('Hello, World!')", "sandbox_id": sandbox_id}
    )
    print(result)

if __name__ == "__main__":
    asyncio.run(main())
```

</details>

## 🔧 ToolManager

The `ToolManager` class is the central component for managing and executing tools across multiple MCP servers.

### Key Features

- **🔌 Multi-Server Support**: Manage tools from multiple MCP servers simultaneously
- **🔗 Connection Management**: Automatic connection handling for stdio and SSE transports
- **🚫 Tool Blacklisting**: Filter out specific tools from specific servers
- **📝 Structured Logging**: Optional task logging integration
- **🔄 Error Recovery**: Automatic retry logic and fallback mechanisms

### Methods

- `get_all_tool_definitions()`: Retrieve tool schemas from all configured servers
- `execute_tool_call(server_name, tool_name, arguments)`: Execute a specific tool
- `set_task_log(task_log)`: Enable structured logging
- `get_server_params(server_name)`: Get configuration for a specific server

### Example Usage

<details>
<summary>Click to expand code example</summary>

```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters

async def main():
    # Configure servers
    server_configs = [
        {
            "name": "python-server",
            "params": StdioServerParameters(
                command="python",
                args=["-m", "miroflow_tools.mcp_servers.python_mcp_server"],
                env={"E2B_API_KEY": "your_key"}
            )
        }
    ]

    # Initialize with optional blacklist
    tool_blacklist = {("python-server", "some_tool")}
    manager = ToolManager(server_configs, tool_blacklist=tool_blacklist)

    # Enable logging
    # manager.set_task_log(your_task_logger)

    # Get tools
    tools = await manager.get_all_tool_definitions()

    # Create a sandbox first (required before running code)
    sandbox_result = await manager.execute_tool_call(
        server_name="python-server",
        tool_name="create_sandbox",
        arguments={"timeout": 600}
    )
    sandbox_id = sandbox_result['result'].split('sandbox_id:')[-1].strip()

    # Execute tool
    result = await manager.execute_tool_call(
        server_name="python-server",
        tool_name="run_python_code",
        arguments={"code_block": "1 + 1", "sandbox_id": sandbox_id}
    )
    print(result)

if __name__ == "__main__":
    asyncio.run(main())
```

</details>

## 🔌 MCP Servers

### Server: tool-python

Execute Python code in isolated E2B sandboxes with persistent sessions.

**Tools**:

- 🔨 `create_sandbox(timeout=600)`: Create a new Linux sandbox
- 🐍 `run_python_code(code_block, sandbox_id)`: Execute Python code
- 💻 `run_command(command, sandbox_id)`: Run shell commands
- ⬆️ `upload_file_from_local_to_sandbox(sandbox_id, local_file_path, sandbox_file_path)`: Upload a local file to the sandbox
- ⬇️ `download_file_from_internet_to_sandbox(sandbox_id, url, sandbox_file_path)`: Download a file from a URL into the sandbox
- 💾 `download_file_from_sandbox_to_local(sandbox_id, sandbox_file_path, local_filename)`: Download a file from the sandbox to the local machine

**Environment Variables**:

- 🔑 `E2B_API_KEY`: E2B API key (required)
- 📁 `LOGS_DIR`: Directory for temporary files (default: `../../logs`)

**Example**:

<details>
<summary>Click to expand code example</summary>

```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters

async def main():
    # Configure server with environment variables
    server_configs = [
        {
            "name": "tool-python",
            "params": StdioServerParameters(
                command="python",
                args=["-m", "miroflow_tools.mcp_servers.python_mcp_server"],
                env={"E2B_API_KEY": "your_e2b_api_key"}
            )
        }
    ]

    manager = ToolManager(server_configs)

    # Create sandbox
    result = await manager.execute_tool_call(
        server_name="tool-python",
        tool_name="create_sandbox",
        arguments={"timeout": 600}
    )

    # Extract sandbox_id from result
    sandbox_id = result['result'].split('sandbox_id:')[-1].strip()

    # Run code
    result = await manager.execute_tool_call(
        server_name="tool-python",
        tool_name="run_python_code",
        arguments={"code_block": "import numpy as np; print(np.array([1,2,3]))", "sandbox_id": sandbox_id}
    )
    print(result)

if __name__ == "__main__":
    asyncio.run(main())
```

</details>

### Server: tool-vqa

Analyze images and answer questions about visual content using Anthropic Claude.

**Tools**:

- 👁️ `visual_question_answering(image_path_or_url, question)`: Answer questions about images

**Environment Variables**:

- 🔑 `ANTHROPIC_API_KEY`: Anthropic API key (required)
- 🌐 `ANTHROPIC_BASE_URL`: API base URL (default: `https://api.anthropic.com`)

**Example**:

<details>
<summary>Click to expand code example</summary>

```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters

async def main():
    server_configs = [
        {
            "name": "tool-vqa",
            "params": StdioServerParameters(
                command="python",
                args=["-m", "miroflow_tools.mcp_servers.vision_mcp_server"],
                env={
                    "ANTHROPIC_API_KEY": "your_anthropic_api_key",
                    "ANTHROPIC_BASE_URL": "https://api.anthropic.com"
                }
            )
        }
    ]

    manager = ToolManager(server_configs)

    result = await manager.execute_tool_call(
        server_name="tool-vqa",
        tool_name="visual_question_answering",
        arguments={
            "image_path_or_url": "https://example.com/image.jpg",
            "question": "What is in this image?"
        }
    )
    print(result)

if __name__ == "__main__":
    asyncio.run(main())
```

</details>

### Server: tool-vqa-os

Analyze images and answer questions about visual content using open-source compatible models.

**Tools**:

- 👁️ `visual_question_answering(image_path_or_url, question)`: Answer questions about images

**Environment Variables**:

- 🔑 `VISION_API_KEY`: API key (required)
- 🌐 `VISION_BASE_URL`: API endpoint URL (required)
- 🤖 `VISION_MODEL_NAME`: Model name (required)

**Example**:

<details>
<summary>Click to expand code example</summary>

```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters

async def main():
    server_configs = [
        {
            "name": "tool-vqa-os",
            "params": StdioServerParameters(
                command="python",
                args=["-m", "miroflow_tools.mcp_servers.vision_mcp_server_os"],
                env={
                    "VISION_API_KEY": "your_vision_api_key",
                    "VISION_BASE_URL": "your_vision_base_url",
                    "VISION_MODEL_NAME": "your_vision_model_name"
                }
            )
        }
    ]

    manager = ToolManager(server_configs)

    result = await manager.execute_tool_call(
        server_name="tool-vqa-os",
        tool_name="visual_question_answering",
        arguments={
            "image_path_or_url": "https://example.com/image.jpg",
            "question": "What is in this image?"
        }
    )
    print(result)

if __name__ == "__main__":
    asyncio.run(main())
```

</details>

### Server: tool-transcribe

Transcribe audio files and answer questions about audio content using OpenAI Whisper.

**Tools**:

- 🎤 `audio_transcription(audio_path_or_url)`: Transcribe audio to text
- 🎧 `audio_question_answering(audio_path_or_url, question)`: Answer questions about audio

**Environment Variables**:

- 🔑 `OPENAI_API_KEY`: OpenAI API key (required)
- 🌐 `OPENAI_BASE_URL`: API base URL (default: `https://api.openai.com/v1`)

**Supported Formats**: 🎵 MP3, WAV, M4A, AAC, OGG, FLAC

**Example**:

<details>
<summary>Click to expand code example</summary>

```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters

async def main():
    server_configs = [
        {
            "name": "tool-transcribe",
            "params": StdioServerParameters(
                command="python",
                args=["-m", "miroflow_tools.mcp_servers.audio_mcp_server"],
                env={
                    "OPENAI_API_KEY": "your_openai_api_key",
                    "OPENAI_BASE_URL": "https://api.openai.com/v1"
                }
            )
        }
    ]

    manager = ToolManager(server_configs)

    # Transcribe audio
    result = await manager.execute_tool_call(
        server_name="tool-transcribe",
        tool_name="audio_transcription",
        arguments={"audio_path_or_url": "/path/to/audio.mp3"}
    )
    print(result)

    # Answer questions about audio
    result = await manager.execute_tool_call(
        server_name="tool-transcribe",
        tool_name="audio_question_answering",
        arguments={
            "audio_path_or_url": "/path/to/audio.mp3",
            "question": "What is the main topic discussed?"
        }
    )
    print(result)

if __name__ == "__main__":
    asyncio.run(main())
```

</details>

### Server: tool-transcribe-os

Transcribe audio files using open-source compatible models.

**Tools**:

- 🎤 `audio_transcription(audio_path_or_url)`: Transcribe audio to text

**Environment Variables**:

- 🔑 `WHISPER_API_KEY`: API key (required)
- 🌐 `WHISPER_BASE_URL`: API endpoint URL (required)
- 🤖 `WHISPER_MODEL_NAME`: Model name (required)

**Supported Formats**: 🎵 MP3, WAV, M4A, AAC, OGG, FLAC

**Example**:

<details>
<summary>Click to expand code example</summary>

```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters

async def main():
    server_configs = [
        {
            "name": "tool-transcribe-os",
            "params": StdioServerParameters(
                command="python",
                args=["-m", "miroflow_tools.mcp_servers.audio_mcp_server_os"],
                env={
                    "WHISPER_API_KEY": "your_whisper_api_key",
                    "WHISPER_BASE_URL": "your_whisper_base_url",
                    "WHISPER_MODEL_NAME": "your_whisper_model_name"
                }
            )
        }
    ]

    manager = ToolManager(server_configs)

    result = await manager.execute_tool_call(
        server_name="tool-transcribe-os",
        tool_name="audio_transcription",
        arguments={"audio_path_or_url": "/path/to/audio.mp3"}
    )
    print(result)

if __name__ == "__main__":
    asyncio.run(main())
```

</details>

### Server: tool-reading

Convert various document formats to Markdown using MarkItDown.

**Tools**:

- 📄 `convert_to_markdown(uri)`: Convert documents (PDF, DOC, PPT, Excel, CSV, ZIP, etc.) to Markdown. URI must start with `file:`, `data:`, `http:`, or `https:` scheme.

**Supported Formats**: 📄 PDF, DOC, DOCX, PPT, PPTX, XLS, XLSX, CSV, ZIP, and more

**Example**:

<details>
<summary>Click to expand code example</summary>

```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters

async def main():
    # Configure server (no additional environment variables required)
    server_configs = [
        {
            "name": "tool-reading",
            "params": StdioServerParameters(
                command="python",
                args=["-m", "miroflow_tools.mcp_servers.reading_mcp_server"]
            )
        }
    ]

    manager = ToolManager(server_configs)

    result = await manager.execute_tool_call(
        server_name="tool-reading",
        tool_name="convert_to_markdown",
        arguments={"uri": "file:///path/to/document.pdf"}
    )
    print(result)

if __name__ == "__main__":
    asyncio.run(main())
```

</details>

### Server: tool-reasoning

Solve complex reasoning problems requiring chain-of-thought using Anthropic Claude with thinking.

**Tools**:

- 🧠 `reasoning(question)`: Solve hard math problems, puzzles, riddles, and IQ test questions

**Environment Variables**:

- 🔑 `ANTHROPIC_API_KEY`: Anthropic API key (required)
- 🌐 `ANTHROPIC_BASE_URL`: API base URL (default: `https://api.anthropic.com`)

**Example**:

<details>
<summary>Click to expand code example</summary>

```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters

async def main():
    server_configs = [
        {
            "name": "tool-reasoning",
            "params": StdioServerParameters(
                command="python",
                args=["-m", "miroflow_tools.mcp_servers.reasoning_mcp_server"],
                env={
                    "ANTHROPIC_API_KEY": "your_anthropic_api_key",
                    "ANTHROPIC_BASE_URL": "https://api.anthropic.com"
                }
            )
        }
    ]

    manager = ToolManager(server_configs)

    result = await manager.execute_tool_call(
        server_name="tool-reasoning",
        tool_name="reasoning",
        arguments={"question": "Solve: If a train travels 60 mph for 2 hours, then 80 mph for 1 hour, what's the average speed?"}
    )
    print(result)

if __name__ == "__main__":
    asyncio.run(main())
```

</details>

### Server: tool-reasoning-os

Solve complex reasoning problems requiring chain-of-thought using open-source compatible models.

**Tools**:

- 🧠 `reasoning(question)`: Solve hard math problems, puzzles, riddles, and IQ test questions

**Environment Variables**:

- 🔑 `REASONING_API_KEY`: API key (required)
- 🌐 `REASONING_BASE_URL`: API endpoint URL (required)
- 🤖 `REASONING_MODEL_NAME`: Model name (required)

**Example**:

<details>
<summary>Click to expand code example</summary>

```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters

async def main():
    server_configs = [
        {
            "name": "tool-reasoning-os",
            "params": StdioServerParameters(
                command="python",
                args=["-m", "miroflow_tools.mcp_servers.reasoning_mcp_server_os"],
                env={
                    "REASONING_API_KEY": "your_reasoning_api_key",
                    "REASONING_BASE_URL": "your_reasoning_base_url",
                    "REASONING_MODEL_NAME": "your_reasoning_model_name"
                }
            )
        }
    ]

    manager = ToolManager(server_configs)

    result = await manager.execute_tool_call(
        server_name="tool-reasoning-os",
        tool_name="reasoning",
        arguments={"question": "Solve: If a train travels 60 mph for 2 hours, then 80 mph for 1 hour, what's the average speed?"}
    )
    print(result)

if __name__ == "__main__":
    asyncio.run(main())
```

</details>

### Server: search_and_scrape_webpage

Google search via Serper API. Used in MiroThinker v1.0/v1.5 evaluation.

**Tools**:

- 🔍 `google_search(q, gl="us", hl="en", location=None, num=None, tbs=None, page=None, autocorrect=None)`: Perform web searches via Serper API and retrieve rich results

**Environment Variables**:

- 🔑 `SERPER_API_KEY`: Serper API key (required)
- 🌐 `SERPER_BASE_URL`: Serper API base URL (default: `https://google.serper.dev`)

**Example**:

<details>
<summary>Click to expand code example</summary>

```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters

async def main():
    server_configs = [
        {
            "name": "search_and_scrape_webpage",
            "params": StdioServerParameters(
                command="python",
                args=["-m", "miroflow_tools.dev_mcp_servers.search_and_scrape_webpage"],
                env={
                    "SERPER_API_KEY": "your_serper_api_key",
                    "SERPER_BASE_URL": "https://google.serper.dev"
                }
            )
        }
    ]

    manager = ToolManager(server_configs)

    result = await manager.execute_tool_call(
        server_name="search_and_scrape_webpage",
        tool_name="google_search",
        arguments={
            "q": "Python async programming",
            "gl": "us",
            "hl": "en",
            "num": 10
        }
    )
    print(result)

if __name__ == "__main__":
    asyncio.run(main())
```

</details>

### Server: jina_scrape_llm_summary

Scrape content from URLs and extract meaningful information using an LLM. Used in MiroThinker v1.0/v1.5 evaluation.

**Tools**:

- 🔎 `scrape_and_extract_info(url, info_to_extract, custom_headers=None)`: Scrape content from a URL (web pages, PDFs, code files, etc.) and extract meaningful information using an LLM

**Environment Variables**:

- 🔑 `JINA_API_KEY`: Jina.ai API key (required)
- 🌐 `JINA_BASE_URL`: Jina.ai API base URL (default: `https://r.jina.ai`)
- 🔗 `SUMMARY_LLM_BASE_URL`: LLM API base URL for summarization (required)
- 🤖 `SUMMARY_LLM_MODEL_NAME`: LLM model name for summarization (required)
- 🔑 `SUMMARY_LLM_API_KEY`: LLM API key for summarization (optional, depends on LLM provider)

**Example**:

<details>
<summary>Click to expand code example</summary>

```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters

async def main():
    server_configs = [
        {
            "name": "jina_scrape_llm_summary",
            "params": StdioServerParameters(
                command="python",
                args=["-m", "miroflow_tools.dev_mcp_servers.jina_scrape_llm_summary"],
                env={
                    "JINA_API_KEY": "your_jina_api_key",
                    "JINA_BASE_URL": "https://r.jina.ai",
                    "SUMMARY_LLM_BASE_URL": "your_llm_base_url",
                    "SUMMARY_LLM_MODEL_NAME": "your_llm_model_name",
                    "SUMMARY_LLM_API_KEY": "your_llm_api_key"
                }
            )
        }
    ]

    manager = ToolManager(server_configs)

    result = await manager.execute_tool_call(
        server_name="jina_scrape_llm_summary",
        tool_name="scrape_and_extract_info",
        arguments={
            "url": "https://example.com/article",
            "info_to_extract": "What is the main topic of this article?"
        }
    )
    print(result)

if __name__ == "__main__":
    asyncio.run(main())
```

</details>

### Server: tool-google-search

Google search via Serper API with website scraping capabilities.

**Tools**:

- 🔍 `google_search(q, gl="us", hl="en", location=None, num=10, tbs=None, page=1)`: Google search
- 🌐 `scrape_website(url)`: Scrape website content using Jina.ai

**Environment Variables**:

- 🔑 `SERPER_API_KEY`: Serper API key (required for Google search)
- 🌐 `SERPER_BASE_URL`: Serper API base URL (default: `https://google.serper.dev`)
- 🔑 `JINA_API_KEY`: Jina.ai API key (required for scraping)
- 🌐 `JINA_BASE_URL`: Jina.ai API base URL (default: `https://r.jina.ai`)

**Filtering Options** (via environment variables):

- 🚫 `REMOVE_SNIPPETS`: Remove snippets from search results
- 🚫 `REMOVE_KNOWLEDGE_GRAPH`: Remove knowledge graph from results
- 🚫 `REMOVE_ANSWER_BOX`: Remove answer box from results

**Example**:

<details>
<summary>Click to expand code example</summary>

```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters

async def main():
    server_configs = [
        {
            "name": "tool-google-search",
            "params": StdioServerParameters(
                command="python",
                args=["-m", "miroflow_tools.mcp_servers.searching_google_mcp_server"],
                env={
                    "SERPER_API_KEY": "your_serper_api_key",
                    "SERPER_BASE_URL": "https://google.serper.dev",
                    "JINA_API_KEY": "your_jina_api_key",
                    "JINA_BASE_URL": "https://r.jina.ai"
                }
            )
        }
    ]

    manager = ToolManager(server_configs)

    # Google search
    result = await manager.execute_tool_call(
        server_name="tool-google-search",
        tool_name="google_search",
        arguments={
            "q": "Python async programming",
            "gl": "us",
            "hl": "en",
            "num": 10
        }
    )
    print(result)

    # Scrape website
    result = await manager.execute_tool_call(
        server_name="tool-google-search",
        tool_name="scrape_website",
        arguments={"url": "https://example.com/article"}
    )
    print(result)

if __name__ == "__main__":
    asyncio.run(main())
```

</details>

### Server: tool-sogou-search

Sogou search (optimized for Chinese) with website scraping capabilities. *Optional: Not used in the MiroThinker v1.0/v1.5 evaluation*

**Tools**:

- 🔍 `sogou_search(Query, Cnt=10)`: Sogou search (Chinese)
- 🌐 `scrape_website(url)`: Scrape website content using Jina.ai

**Environment Variables**:

- 🔑 `TENCENTCLOUD_SECRET_ID`: Tencent Cloud secret ID (required)
- 🔑 `TENCENTCLOUD_SECRET_KEY`: Tencent Cloud secret key (required)
- 🔑 `JINA_API_KEY`: Jina.ai API key (required for scraping)
- 🌐 `JINA_BASE_URL`: Jina.ai API base URL (default: `https://r.jina.ai`)

**Example**:

<details>
<summary>Click to expand code example</summary>

```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters

async def main():
    server_configs = [
        {
            "name": "tool-sogou-search",
            "params": StdioServerParameters(
                command="python",
                args=["-m", "miroflow_tools.mcp_servers.searching_sogou_mcp_server"],
                env={
                    "TENCENTCLOUD_SECRET_ID": "your_tencent_secret_id",
                    "TENCENTCLOUD_SECRET_KEY": "your_tencent_secret_key",
                    "JINA_API_KEY": "your_jina_api_key",
                    "JINA_BASE_URL": "https://r.jina.ai"
                }
            )
        }
    ]

    manager = ToolManager(server_configs)

    # Sogou search
    result = await manager.execute_tool_call(
        server_name="tool-sogou-search",
        tool_name="sogou_search",
        arguments={
            "Query": "Python 异步编程",
            "Cnt": 10
        }
    )
    print(result)

    # Scrape website
    result = await manager.execute_tool_call(
        server_name="tool-sogou-search",
        tool_name="scrape_website",
        arguments={"url": "https://example.com/article"}
    )
    print(result)

if __name__ == "__main__":
    asyncio.run(main())
```

</details>

## 🚀 Development

### Adding a New MCP Server

1. Create a new server file in `mcp_servers/`
1. Use `FastMCP` to define tools:
   ```python
   from fastmcp import FastMCP
   mcp = FastMCP("server-name")

   @mcp.tool()
   async def my_tool(arg: str) -> str:
       """Tool description."""
       return "result"

   if __name__ == "__main__":
       mcp.run(transport="stdio")
   ```
1. Add server configuration to your application
1. Update this README with server documentation


================================================
FILE: libs/miroflow-tools/pyproject.toml
================================================
[project]
name = "miroflow-tools"
version = "0.1.0"
description = "Tool management and MCP server utilities for MiroFlow"
readme = "README.md"
authors = [
    { name = "MiroMind Team", email = "service@miromind.ai" }
]
requires-python = ">=3.12"
dependencies = [
    "mcp>=1.0.0",
    "fastmcp>=0.1.0",
    "playwright>=1.40.0",
    "requests>=2.32.0",
    "e2b-code-interpreter==1.2.1",
    "wikipedia",
    "mutagen",
    "markitdown-mcp>=0.0.1a3",
    "google-genai",
    "aiohttp",
    "redis"
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/miroflow_tools"]

[dependency-groups]
dev = [
    "pytest>=8.4.1",
    "pytest-asyncio>=1.0.0",
    "pytest-cov>=6.2.1",
    "pytest-html>=4.1.1",
    "pytest-xdist>=3.7.0",
    "pytest-mock>=3.10.0",
    "pytest-timeout>=2.1.0",
    "inline-snapshot>=0.23.2",
]

[tool.pytest.ini_options]
minversion = "8.3.5"
testpaths = ["src/test"]
asyncio_default_fixture_loop_scope = "function"
addopts = [
    "-rA",
    "--show-capture=stderr",
    "-n=auto",
    "--html=report.html",
    "--self-contained-html",
    "--cov=miroflow_tools",
    "--cov-report=html",
    "--strict-markers",
    "-v",
]
markers = [
    "integration: marks tests as integration tests (may be slow)",
    "unit: marks tests as unit tests",
    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
    "requires_api_key: marks tests that require real API credentials",
] 

================================================
FILE: libs/miroflow-tools/src/__init__.py
================================================


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

from .manager import ToolManager

__all__ = ["ToolManager"]


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/jina_scrape_llm_summary.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import asyncio
import json
import logging
import os
from typing import Any, Dict

import httpx
from mcp.server.fastmcp import FastMCP

# Configure logging
logger = logging.getLogger("miroflow")

SUMMARY_LLM_BASE_URL = os.environ.get("SUMMARY_LLM_BASE_URL")
SUMMARY_LLM_MODEL_NAME = os.environ.get("SUMMARY_LLM_MODEL_NAME")
SUMMARY_LLM_API_KEY = os.environ.get("SUMMARY_LLM_API_KEY")

JINA_API_KEY = os.environ.get("JINA_API_KEY", "")
JINA_BASE_URL = os.environ.get("JINA_BASE_URL", "https://r.jina.ai")

# Initialize FastMCP server
mcp = FastMCP("jina_scrape_llm_summary")


@mcp.tool()
async def scrape_and_extract_info(
    url: str, info_to_extract: str, custom_headers: Dict[str, str] = None
):
    """
    Scrape content from a URL, including web pages, PDFs, code files, and other supported resources, and extract meaningful information using an LLM.
    If you need to extract information from a PDF, please use this tool.

    Args:
        url (str): The URL to scrape content from. Supports various types of URLs such as web pages, PDFs, raw text/code files (e.g., GitHub, Gist), and similar sources.
        info_to_extract (str): The specific types of information to extract (usually a question)
        custom_headers (Dict[str, str]): Additional headers to include in the scraping request

    Returns:
        Dict[str, Any]: A dictionary containing:
            - success (bool): Whether the operation was successful
            - url (str): The original URL
            - extracted_info (str): The extracted information
            - error (str): Error message if the operation failed
            - scrape_stats (Dict): Statistics about the scraped content
            - model_used (str): The model used for summarization
            - tokens_used (int): Number of tokens used (if available)
    """
    # NOTE(review): the docstring above is presumably published by FastMCP as
    # the tool description seen by the calling agent, so it is kept verbatim.
    # The value actually returned is the JSON *string* of the described dict.

    def failure_payload(message: str) -> str:
        # Serialized shape shared by every early-exit failure in this tool.
        return json.dumps(
            {
                "success": False,
                "url": url,
                "extracted_info": "",
                "error": message,
                "scrape_stats": {},
                "tokens_used": 0,
            },
            ensure_ascii=False,
        )

    # Refuse Hugging Face dataset/space pages outright.
    if _is_huggingface_dataset_or_space_url(url):
        return failure_payload(
            "You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose."
        )

    # Primary path: Jina reader. Fallback path: plain HTTP GET.
    page = await scrape_url_with_jina(url, custom_headers)
    if not page["success"]:
        logger.warning(
            f"Jina Scrape and Extract Info: Jina scraping failed: {page['error']}, trying direct Python scraping as fallback"
        )
        page = await scrape_url_with_python(url, custom_headers)

        if not page["success"]:
            logger.error(
                f"Jina Scrape and Extract Info: Both Jina and Python scraping failed: {page['error']}"
            )
            return failure_payload(
                f"Scraping failed (both Jina and Python): {page['error']}"
            )

        logger.info(
            f"Jina Scrape and Extract Info: Python fallback scraping succeeded for URL: {url}"
        )

    # Hand the scraped text to the summarization LLM.
    extraction = await extract_info_with_llm(
        url=url,
        content=page["content"],
        info_to_extract=info_to_extract,
        model=SUMMARY_LLM_MODEL_NAME,
        max_tokens=8192,
    )

    # Merge the LLM outcome with the scrape statistics into one JSON document.
    return json.dumps(
        {
            "success": extraction["success"],
            "url": url,
            "extracted_info": extraction["extracted_info"],
            "error": extraction["error"],
            "scrape_stats": {
                stat: page[stat]
                for stat in (
                    "line_count",
                    "char_count",
                    "last_char_line",
                    "all_content_displayed",
                )
            },
            "model_used": extraction["model_used"],
            "tokens_used": extraction["tokens_used"],
        },
        ensure_ascii=False,
    )


def _is_huggingface_dataset_or_space_url(url):
    """
    Check if the URL is a HuggingFace dataset or space URL.
    :param url: The URL to check
    :return: True if it's a HuggingFace dataset or space URL, False otherwise
    """
    if not url:
        return False
    return "huggingface.co/datasets" in url or "huggingface.co/spaces" in url


def _jina_scrape_failure(error: str) -> Dict[str, Any]:
    """Build the failure result of scrape_url_with_jina for the given error message."""
    return {
        "success": False,
        "filename": "",
        "content": "",
        "error": error,
        "line_count": 0,
        "char_count": 0,
        "last_char_line": 0,
        "all_content_displayed": False,
    }


async def scrape_url_with_jina(
    url: str, custom_headers: Dict[str, str] = None, max_chars: int = 102400 * 4
) -> Dict[str, Any]:
    """
    Scrape content from a URL through the Jina.ai reader endpoint.

    The request is sent to ``{JINA_BASE_URL}/{url}`` with a bearer token taken
    from ``JINA_API_KEY``. Connection errors, timeouts, other request errors
    and retryable HTTP statuses (5xx, 408, 409, 425, 429) are retried with
    delays of 1, 2 and 4 seconds (four attempts in total). This function does
    not raise for those failures: any exception is converted into a failure
    result.

    Args:
        url (str): The URL to scrape content from
        custom_headers (Dict[str, str]): Additional headers to include in the request;
            they are merged over the default headers
        max_chars (int): Maximum number of characters of the scraped content to return
            (default: 409600)

    Returns:
        Dict[str, Any]: A dictionary containing:
            - success (bool): Whether the operation was successful
            - filename (str): Always an empty string; nothing is written to disk.
              The key is present in every result so the shape is uniform.
            - content (str): The first ``max_chars`` characters of the scraped content
            - error (str): Error message if the operation failed, "" otherwise
            - line_count (int): Number of lines in the full scraped content
            - char_count (int): Number of characters in the full scraped content
            - last_char_line (int): Line number where the last returned character is located
            - all_content_displayed (bool): True if the content was not truncated
              (i.e. its length is <= ``max_chars``)
    """
    # Validate input
    if not url or not url.strip():
        return _jina_scrape_failure("URL cannot be empty")

    # The API key is read from the environment at import time
    if not JINA_API_KEY:
        return _jina_scrape_failure("JINA_API_KEY environment variable is not set")

    # Avoid duplicate Jina URL prefix
    if url.startswith("https://r.jina.ai/") and url.count("http") >= 2:
        url = url[len("https://r.jina.ai/") :]

    # Construct the Jina.ai API URL; tolerate a base URL configured with a
    # trailing slash so the request path does not start with "//".
    jina_url = f"{JINA_BASE_URL.rstrip('/')}/{url}"

    try:
        # Prepare headers
        headers = {
            "Authorization": f"Bearer {JINA_API_KEY}",
        }

        # Add custom headers if provided
        if custom_headers:
            headers.update(custom_headers)

        # Retry configuration: one attempt per entry; the delay is slept
        # before the following attempt (the last delay is never used).
        retry_delays = [1, 2, 4, 8]

        for attempt, delay in enumerate(retry_delays, 1):
            can_retry = attempt < len(retry_delays)
            try:
                # Make the request using httpx library
                async with httpx.AsyncClient() as client:
                    response = await client.get(
                        jina_url,
                        headers=headers,
                        timeout=httpx.Timeout(None, connect=20, read=60),
                        follow_redirects=True,  # Follow redirects (equivalent to curl -L)
                    )

                # Check if request was successful
                response.raise_for_status()
                break  # Success, exit retry loop

            except httpx.ConnectTimeout:
                if not can_retry:
                    logger.error(
                        f"Jina Scrape: Connection retry attempts exhausted, url: {url}"
                    )
                    raise
                logger.info(
                    f"Jina Scrape: Connection timeout, {delay}s before next attempt (attempt {attempt + 1})"
                )
                await asyncio.sleep(delay)

            except httpx.ConnectError as e:
                if not can_retry:
                    logger.error(
                        f"Jina Scrape: Connection retry attempts exhausted, url: {url}"
                    )
                    raise
                logger.info(
                    f"Jina Scrape: Connection error: {e}, {delay}s before next attempt"
                )
                await asyncio.sleep(delay)

            except httpx.ReadTimeout:
                if not can_retry:
                    logger.error(
                        f"Jina Scrape: Read timeout retry attempts exhausted, url: {url}"
                    )
                    raise
                logger.info(
                    f"Jina Scrape: Read timeout, {delay}s before next attempt (attempt {attempt + 1})"
                )
                await asyncio.sleep(delay)

            except httpx.HTTPStatusError as e:
                status_code = e.response.status_code

                # Retryable: 5xx (server errors) + specific 4xx (408, 409, 425, 429)
                should_retry = status_code >= 500 or status_code in [408, 409, 425, 429]

                if not should_retry:
                    logger.error(
                        f"Jina Scrape: HTTP {status_code} (non-retryable), url: {url}"
                    )
                    raise
                if not can_retry:
                    logger.error(
                        f"Jina Scrape: HTTP {status_code} retry exhausted, url: {url}"
                    )
                    raise
                logger.info(
                    f"Jina Scrape: HTTP {status_code} (retryable), retry in {delay}s, url: {url}"
                )
                await asyncio.sleep(delay)

            except httpx.RequestError as e:
                # Catch-all for request errors not matched by the clauses above
                if not can_retry:
                    logger.error(
                        f"Jina Scrape: Unknown request exception retry attempts exhausted, url: {url}"
                    )
                    raise
                logger.info(
                    f"Jina Scrape: Unknown request exception: {e}, url: {url}, {delay}s before next attempt (attempt {attempt + 1})"
                )
                await asyncio.sleep(delay)

    except Exception as e:
        # Boundary: every failure (including exhausted retries re-raised
        # above) is reported through the result dict instead of propagating.
        error_msg = f"Jina Scrape: Unexpected error occurred: {str(e)}"
        logger.error(error_msg)
        return _jina_scrape_failure(error_msg)

    # Get the scraped content
    content = response.text

    if not content:
        return _jina_scrape_failure("No content returned from Jina.ai API")

    # Handle insufficient balance error: a successful response whose body is
    # a JSON object named "InsufficientBalanceError" is treated as a failure.
    try:
        content_dict = json.loads(content)
    except json.JSONDecodeError:
        content_dict = None
    if (
        isinstance(content_dict, dict)
        and content_dict.get("name") == "InsufficientBalanceError"
    ):
        return _jina_scrape_failure("Insufficient balance")

    # Get content statistics (content is known to be non-empty here)
    total_char_count = len(content)
    total_line_count = content.count("\n") + 1

    # Extract first max_chars characters
    displayed_content = content[:max_chars]
    all_content_displayed = total_char_count <= max_chars

    # Line number of the last character displayed (0 when nothing is displayed)
    last_char_line = displayed_content.count("\n") + 1 if displayed_content else 0

    return {
        "success": True,
        "filename": "",
        "content": displayed_content,
        "error": "",
        "line_count": total_line_count,
        "char_count": total_char_count,
        "last_char_line": last_char_line,
        "all_content_displayed": all_content_displayed,
    }


async def scrape_url_with_python(
    url: str, custom_headers: Dict[str, str] = None, max_chars: int = 102400 * 4
) -> Dict[str, Any]:
    """
    Fetch a URL directly with httpx; used as the fallback when Jina scraping fails.

    Up to three attempts are made. Connection errors, timeouts, other request
    errors and retryable HTTP statuses (5xx, 408, 409, 425, 429) are retried
    after 1 and then 2 seconds. Any exception ends up as a failure result
    rather than propagating.

    Args:
        url (str): The URL to scrape content from
        custom_headers (Dict[str, str]): Additional headers to include in the request
        max_chars (int): Maximum number of characters to reserve for the scraped content

    Returns:
        Dict[str, Any]: A dictionary containing:
            - success (bool): Whether the operation was successful
            - content (str): The scraped content (at most ``max_chars`` characters)
            - error (str): Error message if the operation failed
            - line_count (int): Number of lines in the scraped content
            - char_count (int): Number of characters in the scraped content
            - last_char_line (int): Line number where the last displayed character is located
            - all_content_displayed (bool): Signal indicating if all content was displayed
    """

    def failure(message: str) -> Dict[str, Any]:
        # Uniform shape for every unsuccessful outcome of this function.
        return {
            "success": False,
            "content": "",
            "error": message,
            "line_count": 0,
            "char_count": 0,
            "last_char_line": 0,
            "all_content_displayed": False,
        }

    # Reject missing or whitespace-only URLs up front
    if not url or not url.strip():
        return failure("URL cannot be empty")

    try:
        # Browser-like User-Agent; caller-supplied headers take precedence
        request_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        if custom_headers:
            request_headers.update(custom_headers)

        # One attempt per entry; the delay is slept before the next attempt
        retry_delays = [1, 2, 4]
        max_attempts = len(retry_delays)

        for attempt, delay in enumerate(retry_delays, 1):
            is_last_attempt = attempt >= max_attempts
            try:
                async with httpx.AsyncClient() as client:
                    response = await client.get(
                        url,
                        headers=request_headers,
                        timeout=httpx.Timeout(None, connect=20, read=60),
                        follow_redirects=True,
                    )

                # Turn 4xx/5xx responses into HTTPStatusError
                response.raise_for_status()
                break  # Success, exit retry loop

            except httpx.ConnectTimeout:
                if is_last_attempt:
                    logger.error(
                        f"Python Scrape: Connection retry attempts exhausted, url: {url}"
                    )
                    raise
                logger.info(
                    f"Python Scrape: Connection timeout, {delay}s before next attempt (attempt {attempt + 1})"
                )
                await asyncio.sleep(delay)

            except httpx.ConnectError as e:
                if is_last_attempt:
                    logger.error(
                        f"Python Scrape: Connection retry attempts exhausted, url: {url}"
                    )
                    raise
                logger.info(
                    f"Python Scrape: Connection error: {e}, {delay}s before next attempt"
                )
                await asyncio.sleep(delay)

            except httpx.ReadTimeout:
                if is_last_attempt:
                    logger.error(
                        f"Python Scrape: Read timeout retry attempts exhausted, url: {url}"
                    )
                    raise
                logger.info(
                    f"Python Scrape: Read timeout, {delay}s before next attempt (attempt {attempt + 1})"
                )
                await asyncio.sleep(delay)

            except httpx.HTTPStatusError as e:
                status_code = e.response.status_code

                # Retryable: 5xx (server errors) + specific 4xx (408, 409, 425, 429)
                retryable = status_code >= 500 or status_code in [408, 409, 425, 429]

                if not retryable:
                    logger.error(
                        f"Python Scrape: HTTP {status_code} (non-retryable), url: {url}"
                    )
                    raise
                if is_last_attempt:
                    logger.error(
                        f"Python Scrape: HTTP {status_code} retry exhausted, url: {url}"
                    )
                    raise
                logger.info(
                    f"Python Scrape: HTTP {status_code} (retryable), retry in {delay}s, url: {url}"
                )
                await asyncio.sleep(delay)

            except httpx.RequestError as e:
                # Any other request error not matched by the clauses above
                if is_last_attempt:
                    logger.error(
                        f"Python Scrape: Unknown request exception retry attempts exhausted, url: {url}"
                    )
                    raise
                logger.info(
                    f"Python Scrape: Unknown request exception: {e}, url: {url}, {delay}s before next attempt (attempt {attempt + 1})"
                )
                await asyncio.sleep(delay)

    except Exception as e:
        # Everything, including exhausted retries, is reported via the result
        error_msg = f"Python Scrape: Unexpected error occurred: {str(e)}"
        logger.error(error_msg)
        return failure(error_msg)

    body_text = response.text
    if not body_text:
        return failure("No content returned from URL")

    # Statistics are computed on the full body; only a prefix is returned
    shown_text = body_text[:max_chars]
    if shown_text:
        last_shown_line = shown_text.count("\n") + 1
    else:
        last_shown_line = 0

    return {
        "success": True,
        "content": shown_text,
        "error": "",
        "line_count": body_text.count("\n") + 1,
        "char_count": len(body_text),
        "last_char_line": last_shown_line,
        "all_content_displayed": len(body_text) <= max_chars,
    }


# Prompt template for the extraction LLM. It has two positional ``{}``
# placeholders, filled in order by get_prompt_with_truncation: first the
# information to extract, then the content to analyze.
EXTRACT_INFO_PROMPT = """You are given a piece of content and the requirement of information to extract. Your task is to extract the information specifically requested. Be precise and focus exclusively on the requested information.

INFORMATION TO EXTRACT:
{}

INSTRUCTIONS:
1. Extract the information relevant to the focus above.
2. If the exact information is not found, extract the most closely related details.
3. Be specific and include exact details when available.
4. Clearly organize the extracted information for easy understanding.
5. Do not include general summaries or unrelated content.

CONTENT TO ANALYZE:
{}

EXTRACTED INFORMATION:"""


def get_prompt_with_truncation(
    info_to_extract: str, content: str, truncate_last_num_chars: int = -1
) -> str:
    """
    Render EXTRACT_INFO_PROMPT, optionally dropping the tail of the content.

    Args:
        info_to_extract (str): Text placed in the "INFORMATION TO EXTRACT" slot
        content (str): Text placed in the "CONTENT TO ANALYZE" slot
        truncate_last_num_chars (int): When positive, that many characters are
            removed from the end of ``content`` and "[...truncated]" is
            appended; zero or negative values leave ``content`` untouched

    Returns:
        str: The fully formatted prompt
    """
    if truncate_last_num_chars <= 0:
        prompt_content = content
    else:
        kept_part = content[:-truncate_last_num_chars]
        prompt_content = kept_part + "[...truncated]"

    return EXTRACT_INFO_PROMPT.format(info_to_extract, prompt_content)


async def extract_info_with_llm(
    url: str,
    content: str,
    info_to_extract: str,
    model: str = "LLM",
    max_tokens: int = 4096,
) -> Dict[str, Any]:
    """
    Ask an LLM chat-completions endpoint to extract the requested information from content.

    The prompt is built from EXTRACT_INFO_PROMPT and POSTed as a single user
    message to SUMMARY_LLM_BASE_URL. The request is attempted up to four times
    (one per entry in ``connect_retry_delays``); connection problems, read
    timeouts, retryable HTTP statuses, context-length errors and repetitive
    responses each move on to the next attempt. This function does not raise
    for request failures: they are reported through the returned dictionary.

    Args:
        url (str): Source URL of the content; only used in the read-timeout error log
        content (str): The content to extract information from
        info_to_extract (str): The specific types of information to extract (usually a question)
        model (str): The model name sent in the payload; names containing "gpt"
            get an OpenAI-style payload (``max_completion_tokens``)
        max_tokens (int): Maximum tokens for the response

    Returns:
        Dict[str, Any]: A dictionary containing:
            - success (bool): Whether the operation was successful
            - extracted_info (str): The extracted information
            - error (str): Error message if the operation failed
            - model_used (str): The model used for summarization
            - tokens_used (int): ``usage.total_tokens`` from the response, or 0 if absent
    """

    # Validate input
    if not content or not content.strip():
        return {
            "success": False,
            "extracted_info": "",
            "error": "Content cannot be empty",
            "model_used": model,
            "tokens_used": 0,
        }

    # First attempt always uses the full, untruncated content.
    prompt = get_prompt_with_truncation(info_to_extract, content)

    # Prepare the payload
    # Models whose name contains "gpt" take `max_completion_tokens` and no
    # explicit temperature; everything else gets `max_tokens` + temperature.
    if "gpt" in model:
        payload = {
            "model": model,
            "max_completion_tokens": max_tokens,
            "messages": [
                {"role": "user", "content": prompt},
            ],
        }
        # Add cost-saving parameters for GPT-5 models
        if "gpt-5" in model.lower() or "gpt5" in model.lower():
            payload["service_tier"] = "flex"
            payload["reasoning_effort"] = "minimal"
    else:
        payload = {
            "model": model,
            "max_tokens": max_tokens,
            "messages": [
                {"role": "user", "content": prompt},
            ],
            "temperature": 1.0,
            # "top_p": 0.8,
            # "top_k": 20,
        }

    # Validate LLM endpoint configuration early for clearer errors
    if not SUMMARY_LLM_BASE_URL or not SUMMARY_LLM_BASE_URL.strip():
        return {
            "success": False,
            "extracted_info": "",
            "error": "SUMMARY_LLM_BASE_URL environment variable is not set",
            "model_used": model,
            "tokens_used": 0,
        }

    # Prepare headers (add Authorization if API key is available)
    headers = {"Content-Type": "application/json"}
    if SUMMARY_LLM_API_KEY:
        headers["Authorization"] = f"Bearer {SUMMARY_LLM_API_KEY}"

    try:
        # Retry configuration: the list length is the attempt budget (4) and
        # each entry is the sleep, in seconds, used by the branches that back off.
        connect_retry_delays = [1, 2, 4, 8]

        for attempt, delay in enumerate(connect_retry_delays, 1):
            try:
                # Make the API request using httpx
                # (a fresh client per attempt; 30 s connect / 300 s read timeout,
                # no limit on the other timeout phases)
                async with httpx.AsyncClient() as client:
                    response = await client.post(
                        SUMMARY_LLM_BASE_URL,
                        headers=headers,
                        json=payload,
                        timeout=httpx.Timeout(None, connect=30, read=300),
                    )
                    # Degenerate-output guard: if the last 50 characters of the
                    # raw response body occur more than 5 times in that body,
                    # treat the generation as a repetition loop and re-request
                    # immediately (no sleep).
                    # NOTE(review): when this fires on the final attempt the loop
                    # simply ends and the repetitive response is parsed below as
                    # if it were a normal one - confirm that is intended.
                    if response.text and len(response.text) >= 50:
                        tail_50 = response.text[-50:]
                        repeat_count = response.text.count(tail_50)
                        if repeat_count > 5:
                            logger.info("Repeat detected in extract_info_with_llm")
                            continue

                # Context-length overflow is detected from the response body
                # text (before raise_for_status) and handled by shrinking the
                # prompt rather than by raising.
                if (
                    "Requested token count exceeds the model's maximum context length"
                    in response.text
                    or "longer than the model's context length" in response.text
                ):
                    prompt = get_prompt_with_truncation(
                        info_to_extract,
                        content,
                        truncate_last_num_chars=40960 * attempt,
                    )  # remove 40k * num_attempts chars from the end of the content
                    payload["messages"][0]["content"] = prompt
                    continue  # no need to raise error here, just try again

                # Any remaining 4xx/5xx becomes httpx.HTTPStatusError (handled below).
                response.raise_for_status()
                break  # Success, exit retry loop

            except httpx.ConnectTimeout as e:
                # connection timeout, retry
                if attempt < len(connect_retry_delays):
                    logger.info(
                        f"Jina Scrape and Extract Info: Connection timeout, {delay}s before next attempt (attempt {attempt + 1})"
                    )
                    await asyncio.sleep(delay)
                    continue
                else:
                    logger.error(
                        "Jina Scrape and Extract Info: Connection retry attempts exhausted"
                    )
                    raise e

            except httpx.ConnectError as e:
                # connection error, retry
                if attempt < len(connect_retry_delays):
                    logger.info(
                        f"Jina Scrape and Extract Info: Connection error: {e}, {delay}s before next attempt"
                    )
                    await asyncio.sleep(delay)
                    continue
                else:
                    logger.error(
                        "Jina Scrape and Extract Info: Connection retry attempts exhausted"
                    )
                    raise e

            except httpx.ReadTimeout as e:
                # read timeout: the LLM API was too slow; retry immediately
                # (no back-off sleep) until the attempt budget is used up
                if attempt < len(connect_retry_delays):
                    logger.info(
                        f"Jina Scrape and Extract Info: LLM API attempt {attempt} read timeout"
                    )
                    continue
                else:
                    logger.error(
                        f"Jina Scrape and Extract Info: LLM API read timeout retry attempts exhausted, please check the request complexity, information to extract: {info_to_extract}, length of content: {len(content)}, url: {url}"
                    )
                    raise e

            except httpx.HTTPStatusError as e:
                status_code = e.response.status_code

                # Special case: GPT-5 service_tier parameter compatibility issue
                # For GPT-5 models, the first HTTP status error of any code is
                # answered by dropping `service_tier` and retrying; once the key
                # is gone this branch no longer applies. On the final attempt the
                # key is still removed but control falls through to the generic
                # handling below.
                if (
                    "gpt-5" in model.lower() or "gpt5" in model.lower()
                ) and "service_tier" in payload:
                    logger.info(
                        "Extract Info: GPT-5 service_tier error, removing and retrying"
                    )
                    payload.pop("service_tier", None)
                    if attempt < len(connect_retry_delays):
                        await asyncio.sleep(delay)
                        continue

                # Retryable: 5xx (server errors) + specific 4xx (408, 409, 425, 429)
                should_retry = status_code >= 500 or status_code in [408, 409, 425, 429]

                if should_retry and attempt < len(connect_retry_delays):
                    logger.info(
                        f"Extract Info: HTTP {status_code} (retryable), retry in {delay}s"
                    )
                    await asyncio.sleep(delay)
                    continue
                elif should_retry:
                    logger.error(f"Extract Info: HTTP {status_code} retry exhausted")
                    raise e
                else:
                    logger.error(f"Extract Info: HTTP {status_code} (non-retryable)")
                    # Re-raise with the response body as the message so the
                    # outer handler's error string shows what the server said.
                    raise httpx.HTTPStatusError(
                        f"response.text: {response.text}",
                        request=e.request,
                        response=e.response,
                    ) from e

            except httpx.RequestError as e:
                # Any other transport-level failure is not retried.
                logger.error(
                    f"Jina Scrape and Extract Info: Unknown request exception: {e}"
                )
                raise e

    except Exception as e:
        # Everything re-raised from the retry loop lands here and is converted
        # into a failure result instead of propagating to the caller.
        error_msg = f"Jina Scrape and Extract Info: Unexpected error during LLM API call: {str(e)}"
        logger.error(error_msg)
        return {
            "success": False,
            "extracted_info": "",
            "error": error_msg,
            "model_used": model,
            "tokens_used": 0,
        }

    # Parse the response
    # NOTE(review): this point is also reached when the loop ran out of attempts
    # via `continue` (repeat detection or context-length truncation on the last
    # attempt); in that case `response` is the last, unsuccessful reply.
    try:
        response_data = response.json()

    except json.JSONDecodeError as e:
        error_msg = (
            f"Jina Scrape and Extract Info: Failed to parse LLM API response: {str(e)}"
        )
        logger.error(error_msg)
        logger.error(f"Raw response: {response.text}")
        return {
            "success": False,
            "extracted_info": "",
            "error": error_msg,
            "model_used": model,
            "tokens_used": 0,
        }

    # Extract summary from response
    if "choices" in response_data and len(response_data["choices"]) > 0:
        try:
            # Only the first choice is used.
            summary = response_data["choices"][0]["message"]["content"]
        except Exception as e:
            error_msg = f"Jina Scrape and Extract Info: Failed to get summary from LLM API response: {str(e)}"
            logger.error(error_msg)
            return {
                "success": False,
                "extracted_info": "",
                "error": error_msg,
                "model_used": model,
                "tokens_used": 0,
            }

        # Extract token usage if available
        tokens_used = 0
        if "usage" in response_data:
            tokens_used = response_data["usage"].get("total_tokens", 0)

        return {
            "success": True,
            "extracted_info": summary,
            "error": "",
            "model_used": model,
            "tokens_used": tokens_used,
        }
    elif "error" in response_data:
        # The endpoint answered with a JSON error object instead of choices.
        error_msg = (
            f"Jina Scrape and Extract Info: LLM API error: {response_data['error']}"
        )
        logger.error(error_msg)
        return {
            "success": False,
            "extracted_info": "",
            "error": error_msg,
            "model_used": model,
            "tokens_used": 0,
        }
    else:
        # Valid JSON, but neither a non-empty "choices" list nor an "error" key.
        error_msg = f"Jina Scrape and Extract Info: No valid response from LLM API, response data: {response_data}"
        logger.error(error_msg)
        return {
            "success": False,
            "extracted_info": "",
            "error": error_msg,
            "model_used": model,
            "tokens_used": 0,
        }


if __name__ == "__main__":
    # Script entry point: nothing else is executed here, the module just
    # starts its MCP server.

    # Run the MCP server over the stdio transport
    mcp.run(transport="stdio")


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/search_and_scrape_webpage.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import json
import logging
import os
from typing import Any, Dict

import httpx
from mcp.server.fastmcp import FastMCP
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)
from tencentcloud.common import credential
from tencentcloud.common.common_client import CommonClient
from tencentcloud.common.exception.tencent_cloud_sdk_exception import (
    TencentCloudSDKException,
)
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile

from ..mcp_servers.utils.url_unquote import decode_http_urls_in_dict

# Configure logging: this module logs through the "miroflow" logger
logger = logging.getLogger("miroflow")

# Serper (Google search) endpoint and key, read from the environment once at
# import time. An empty key makes google_search return an error result.
SERPER_BASE_URL = os.getenv("SERPER_BASE_URL", "https://google.serper.dev")
SERPER_API_KEY = os.getenv("SERPER_API_KEY", "")

# Tencent Cloud credentials used by sogou_search; both default to "" and
# sogou_search returns an error result when either one is empty.
TENCENTCLOUD_SECRET_ID = os.getenv("TENCENTCLOUD_SECRET_ID", "")
TENCENTCLOUD_SECRET_KEY = os.getenv("TENCENTCLOUD_SECRET_KEY", "")

# Initialize FastMCP server; the @mcp.tool() functions below register on it
mcp = FastMCP("search_and_scrape_webpage")


@retry(
    retry=retry_if_exception_type(
        (httpx.ConnectError, httpx.TimeoutException, httpx.HTTPStatusError)
    ),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    stop=stop_after_attempt(3),
)
async def make_serper_request(
    payload: Dict[str, Any], headers: Dict[str, str]
) -> httpx.Response:
    """
    POST a search payload to the Serper ``/search`` endpoint.

    The tenacity decorator retries the call (at most 3 attempts, exponential
    wait clamped to 4-10 seconds) when the connection fails, the request times
    out, or the server answers with an error status.

    Args:
        payload: JSON body of the search request
        headers: HTTP headers, including the API key

    Returns:
        The successful ``httpx.Response``; a 4xx/5xx status raises
        ``httpx.HTTPStatusError`` instead (which counts as a retryable failure).
    """
    search_url = f"{SERPER_BASE_URL}/search"
    async with httpx.AsyncClient() as http:
        reply = await http.post(search_url, json=payload, headers=headers)
        # Turn error statuses into exceptions so the retry policy can see them.
        reply.raise_for_status()
    return reply


def _is_banned_url(url: str) -> bool:
    """
    Check if the URL is a banned URL.
    :param url: The URL to check
    :return: True if it's a banned URL, False otherwise
    """
    banned_list = [
        "unifuncs",
        "huggingface.co/datasets",
        "huggingface.co/spaces",
    ]
    if not url:
        return False
    return any(banned in url for banned in banned_list)


@mcp.tool()
async def google_search(
    q: str,
    gl: str = "us",
    hl: str = "en",
    location: str = None,
    num: int = None,
    tbs: str = None,
    page: int = None,
    autocorrect: bool = None,
):
    """
    Tool to perform web searches via Serper API and retrieve rich results.

    It is able to retrieve organic search results, people also ask,
    related searches, and knowledge graph.

    Args:
        q: Search query string
        gl: Optional region code for search results in ISO 3166-1 alpha-2 format (e.g., 'us')
        hl: Optional language code for search results in ISO 639-1 format (e.g., 'en')
        location: Optional location for search results (e.g., 'SoHo, New York, United States', 'California, United States')
        num: Number of results to return (default: 10)
        tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week, 'qdr:m' for past month, 'qdr:y' for past year)
        page: Page number of results to return (default: 1)
        autocorrect: Whether to autocorrect spelling in query

    Returns:
        Dictionary containing search results and metadata.
    """
    # NOTE: the docstring above is kept verbatim on purpose - for a tool
    # registered with @mcp.tool() it presumably doubles as the description the
    # calling model sees. What is actually returned is a JSON *string*:
    # {"organic": [...], "searchParameters": {...}} on success, or
    # {"success": false, "error": ..., "results": []} on failure.

    def _failure(reason: str) -> str:
        # Uniform error envelope shared by every failure path.
        return json.dumps(
            {
                "success": False,
                "error": reason,
                "results": [],
            },
            ensure_ascii=False,
        )

    # Guard clauses: configuration first, then the required argument.
    if not SERPER_API_KEY:
        return _failure("SERPER_API_KEY environment variable not set")

    if not q or not q.strip():
        return _failure("Search query 'q' is required and cannot be empty")

    try:

        async def _search_once(query_text: str) -> tuple[list, dict]:
            """Run one Serper query; return (filtered organic hits, search parameters)."""
            body: dict[str, Any] = {
                "q": query_text.strip(),
                "gl": gl,
                "hl": hl,
            }

            # Optional knobs are only sent when the caller supplied them;
            # `num` is the exception and always goes out (10 when unset).
            if location:
                body["location"] = location
            body["num"] = 10 if num is None else num
            if tbs:
                body["tbs"] = tbs
            if page is not None:
                body["page"] = page
            if autocorrect is not None:
                body["autocorrect"] = autocorrect

            reply = await make_serper_request(
                body,
                {
                    "X-API-KEY": SERPER_API_KEY,
                    "Content-Type": "application/json",
                },
            )
            data = reply.json()

            # Keep only organic entries whose link is not on the ban list.
            raw_hits = data["organic"] if "organic" in data else []
            kept_hits = [
                entry
                for entry in raw_hits
                if not _is_banned_url(entry.get("link", ""))
            ]
            return kept_hits, data.get("searchParameters", {})

        query = q.strip()
        hits, params = await _search_once(query)

        # Exact-phrase searches that come back empty get one more try with
        # the double quotes stripped (as long as something is left to search).
        if not hits and '"' in query:
            unquoted = query.replace('"', "").strip()
            if unquoted:
                hits, params = await _search_once(unquoted)

        decoded = decode_http_urls_in_dict(
            {
                "organic": hits,
                "searchParameters": params,
            }
        )
        return json.dumps(decoded, ensure_ascii=False)

    except Exception as e:
        return _failure(f"Unexpected error: {str(e)}")


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type(TencentCloudSDKException),
    # Surface the original TencentCloudSDKException once retries are used up.
    # Without this tenacity raises its own RetryError, so a caller's
    # `except TencentCloudSDKException` handler could never run.
    reraise=True,
)
async def make_sogou_request(query: str, cnt: int) -> Dict[str, Any]:
    """
    Call the Tencent Cloud ``SearchPro`` action and return its ``Response`` object.

    The call is retried by tenacity (at most 3 attempts, exponential wait
    clamped to 4-10 seconds) whenever the SDK raises TencentCloudSDKException.

    Args:
        query: Search query text, sent verbatim as the ``Query`` parameter
        cnt: Number of results to request, sent as the ``Cnt`` parameter

    Returns:
        The ``Response`` member of the JSON returned by the API.

    Raises:
        TencentCloudSDKException: If the request still fails after the last attempt.
    """
    cred = credential.Credential(TENCENTCLOUD_SECRET_ID, TENCENTCLOUD_SECRET_KEY)
    http_profile = HttpProfile()
    http_profile.endpoint = "wsa.tencentcloudapi.com"
    client_profile = ClientProfile()
    client_profile.httpProfile = http_profile

    # Build the parameters as a dict instead of interpolating the query into a
    # JSON string and parsing it back: a query containing a double quote, a
    # backslash or a control character would otherwise produce invalid JSON
    # (or silently alter the request).
    params = {"Query": query, "Mode": 0, "Cnt": cnt}

    common_client = CommonClient("wsa", "2025-05-08", cred, "", profile=client_profile)
    # NOTE(review): call_json is invoked synchronously inside this coroutine,
    # so it likely blocks the event loop for the duration of the request.
    return common_client.call_json("SearchPro", params)["Response"]


@mcp.tool()
async def sogou_search(
    q: str,
    num: int = 10,
) -> str:
    """
    Tool to perform web searches via Tencent Cloud SearchPro API (Sogou search engine).

    Sogou search offers superior results for Chinese-language queries compared to Google.

    Args:
        q: Search query string (Required)
        num: Number of search results to return (Can only be 10/20/30/40/50, default: 10)

    Returns:
        JSON string containing search results with the following fields:
        - Query: The original search query
        - Pages: Array of search results, each containing title, url, passage, date, and site
    """
    # NOTE: the docstring above is kept verbatim on purpose - for a tool
    # registered with @mcp.tool() it presumably doubles as the description the
    # calling model sees.

    def _failure(reason: str) -> str:
        # Uniform error envelope shared by every failure path.
        return json.dumps(
            {
                "success": False,
                "error": reason,
                "results": [],
            },
            ensure_ascii=False,
        )

    # Guard clauses: credentials, then the query, then the result count.
    if not (TENCENTCLOUD_SECRET_ID and TENCENTCLOUD_SECRET_KEY):
        return _failure(
            "TENCENTCLOUD_SECRET_ID or TENCENTCLOUD_SECRET_KEY environment variable not set"
        )

    if not q or not q.strip():
        return _failure("Search query 'q' is required and cannot be empty")

    if num not in (10, 20, 30, 40, 50):
        return _failure(
            f"Invalid num value: {num}. Must be one of 10, 20, 30, 40, 50"
        )

    try:
        result = await make_sogou_request(q.strip(), num)

        # The request id is bookkeeping, not a search result.
        result.pop("RequestId", None)

        # Each entry of "Pages" is itself a JSON document in string form;
        # parse it and keep only the five fields promised in the docstring,
        # defaulting any missing field to "".
        if "Pages" in result:
            kept_fields = ("title", "url", "passage", "date", "site")
            result["Pages"] = [
                {field: parsed.get(field, "") for field in kept_fields}
                for parsed in map(json.loads, result["Pages"])
            ]

        decoded = decode_http_urls_in_dict(result)
        return json.dumps(decoded, ensure_ascii=False)

    except TencentCloudSDKException as e:
        return _failure(f"Tencent Cloud API error: {str(e)}")

    except Exception as e:
        return _failure(f"Unexpected error: {str(e)}")


if __name__ == "__main__":
    # Script entry point: start the MCP server. No transport is passed here,
    # so whatever FastMCP uses by default applies.
    mcp.run()


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/stateless_python_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import os

from e2b_code_interpreter import Sandbox
from mcp.server.fastmcp import FastMCP

# Initialize FastMCP server; the @mcp.tool() function below registers on it
mcp = FastMCP("stateless-python-server")

# API keys
# Read once at import time; None when the variable is unset. Passed to
# Sandbox.create() for every execution.
E2B_API_KEY = os.environ.get("E2B_API_KEY")

# DEFAULT CONFS
# Passed as the `timeout` argument when a sandbox is created.
DEFAULT_TIMEOUT = 300  # seconds


@mcp.tool()
async def python(code: str) -> str:
    """Use this tool to execute STATELESS Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).
    When you send a message containing python code to python, it will be executed in a stateless docker container, and the stdout of that process will be returned to you. You have to use print statements to access the output.
    IMPORTANT: Your python environment is not shared between calls. You will have to pass your entire code each time.

        Args:
            code: The python code to run.

        Returns:
            A string containing the execution result including stdout and stderr.
    """
    # NOTE: the docstring above is kept verbatim on purpose - for a tool
    # registered with @mcp.tool() it presumably doubles as the description the
    # calling model sees.

    # A fresh sandbox per call is what makes the tool stateless.
    sandbox = Sandbox.create(
        timeout=DEFAULT_TIMEOUT, api_key=E2B_API_KEY, template="1av7fdjfvcparqo8efq6"
    )

    max_attempts = 2
    try:
        for attempt in range(1, max_attempts + 1):
            try:
                # Run the code exactly once per successful call. The loop
                # already holds the result, so there must be no further
                # run_code() after it - a second run would repeat the code's
                # side effects and double the latency.
                execution = sandbox.run_code(code)
                break
            except Exception:
                # Retry once on failure; give up and propagate on the last attempt.
                if attempt == max_attempts:
                    raise
    finally:
        # Always release the sandbox, including when run_code() ultimately
        # fails, so a failed call does not leave a sandbox running.
        sandbox.kill()

    return str(execution)


if __name__ == "__main__":
    # Script entry point: start the MCP server over the stdio transport.
    mcp.run(transport="stdio")


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/task_planner.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import json
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List
from uuid import uuid4

from mcp.server.fastmcp import FastMCP

# Configure logging: this module logs through the "miroflow" logger
logger = logging.getLogger("miroflow")

# Initialize FastMCP server; the @mcp.tool() functions below register on it
mcp = FastMCP("task_planner")

# Configuration
# Directory holding the per-task plan files. The default is a relative path,
# so it is resolved against the working directory of the server process.
TODO_DATA_DIR = os.environ.get("TODO_DATA_DIR", "../../logs/todo_lists")

# TASK_ID is required for task isolation
# Without TASK_ID, task planner operations will fail
# (the check runs at import time, so the module cannot even be loaded
# without it)
TASK_ID = os.environ.get("TASK_ID")
if not TASK_ID:
    raise ValueError(
        "TASK_ID environment variable is required for task_planner tool. "
        "This tool must have a unique task identifier to prevent data conflicts in concurrent execution."
    )

# One JSON file per task id: <TODO_DATA_DIR>/todos_<TASK_ID>.json
TODO_DATA_FILE = os.path.join(TODO_DATA_DIR, f"todos_{TASK_ID}.json")

# Ensure data directory exists (created at import time, parents included)
Path(TODO_DATA_DIR).mkdir(parents=True, exist_ok=True)


def load_todos() -> List[Dict[str, Any]]:
    """
    Read the current task plan from TODO_DATA_FILE.

    Returns:
        The parsed JSON content of the file, or an empty list when the file
        does not exist or cannot be read/parsed (the failure is logged).
    """
    if os.path.exists(TODO_DATA_FILE):
        try:
            with open(TODO_DATA_FILE, "r", encoding="utf-8") as handle:
                stored_plan = json.load(handle)
        except Exception as e:
            # Best effort: an unreadable or corrupt file counts as "no plan".
            logger.error(f"Failed to load task plan: {str(e)}")
        else:
            return stored_plan

    return []


def save_todos(todos: List[Dict[str, Any]]) -> bool:
    """
    Write the task plan to TODO_DATA_FILE as indented UTF-8 JSON.

    Args:
        todos: The complete list of task items to persist (replaces the file content)

    Returns:
        True when the file was written, False when writing failed (the failure is logged).
    """
    try:
        with open(TODO_DATA_FILE, "w", encoding="utf-8") as handle:
            json.dump(todos, handle, ensure_ascii=False, indent=2)
    except Exception as e:
        logger.error(f"Failed to save task plan: {str(e)}")
        return False
    else:
        return True


def format_todos_as_markdown(todos: List[Dict[str, Any]], message: str = "") -> str:
    """
    Render the task plan as a markdown checklist with a summary line.

    Args:
        todos: List of task items; each needs "title" and "id", and may carry
               a "completed" flag (treated as False when absent)
        message: Optional message to display above the plan

    Returns:
        Markdown formatted string
    """
    total_count = len(todos)
    done_count = sum(bool(item.get("completed", False)) for item in todos)
    open_count = total_count - done_count

    # Header: optional message, title, statistics, then a spacer line.
    # The trailing "\n" on entries plus the final join yields blank lines.
    header = [f"{message}\n"] if message else []
    header += [
        "# Task Plan\n",
        f"Total: {total_count} | Pending: {open_count} | Completed: {done_count}\n",
        "",
    ]

    if todos:
        rows = []
        for item in todos:
            mark = "[x]" if item.get("completed", False) else "[ ]"
            # Only the first 8 characters of the id are shown.
            rows.append(f"- {mark} {item['title']} ({item['id'][:8]})")
    else:
        rows = ["No tasks planned yet."]

    return "\n".join(header + rows)


@mcp.tool()
async def add_todo(titles: List[str]) -> str:
    """
    Create a task plan by adding one or more task items.

    CRITICAL: Before starting to work on ANY task, you MUST first create a complete task plan.
    This is the foundation of effective task execution:
    - Break down the main goal into clear, actionable steps
    - Identify all necessary subtasks upfront
    - Create a roadmap that guides your work
    - Ensure nothing is overlooked or forgotten

    Good task planning prevents confusion and ensures systematic progress toward your goal.

    Args:
        titles: List of task item titles. For example:
                - Single task: ["Complete project report"]
                - Multiple tasks: ["Complete project report", "Fix bug #123", "Update documentation"]
                - Complex project: ["Research requirements", "Design architecture", "Implement core features", "Write tests", "Document API"]

    Returns:
        Markdown formatted string showing the success message and current task plan.
    """
    # NOTE: the docstring above is kept verbatim on purpose - for a tool
    # registered with @mcp.tool() it presumably doubles as the description the
    # calling model sees.
    if not titles:
        return "❌ Error: Task titles list cannot be empty."

    # Trim every title and drop the ones that are empty or whitespace-only.
    cleaned_titles = [raw.strip() for raw in titles if raw and raw.strip()]
    if not cleaned_titles:
        return "❌ Error: No valid task titles provided."

    plan = load_todos()

    # Append one new, not-yet-completed item per title, in the given order.
    plan.extend(
        {
            "id": str(uuid4()),
            "title": name,
            "completed": False,
            "created_at": datetime.now().isoformat(),
        }
        for name in cleaned_titles
    )

    if not save_todos(plan):
        return "❌ Error: Failed to save task plan."

    # Singular vs. plural confirmation banner.
    if len(cleaned_titles) == 1:
        banner = f'✅ Task added: "{cleaned_titles[0]}"'
    else:
        bullets = "\n".join(f"  - {name}" for name in cleaned_titles)
        banner = f"✅ Added {len(cleaned_titles)} tasks:\n" + bullets

    return format_todos_as_markdown(plan, banner)


@mcp.tool()
async def list_todos() -> str:
    """
    Display the complete task plan with all items and their status.

    Use this to review your overall progress, see what's done and what remains,
    and understand where you are in the execution of your plan.

    Returns:
        Markdown formatted string showing all tasks with their completion status.
    """
    # NOTE: the docstring above is kept verbatim on purpose - for a tool
    # registered with @mcp.tool() it presumably doubles as the description the
    # calling model sees.
    # Read-only: load whatever is stored and render it without a banner.
    return format_todos_as_markdown(load_todos())


@mcp.tool()
async def complete_todo(todo_ids: List[str]) -> str:
    """
    Mark one or more tasks as completed in your plan.

    Use this after finishing a task to track your progress and maintain an
    accurate view of what's done and what's remaining.

    Args:
        todo_ids: List of task IDs to mark as completed (full ID or first 8 characters).
                  For example: ["a7f3b2c1"] or ["a7f3b2c1", "b8e4c3d2"]

    Returns:
        Markdown formatted string showing the success message and updated task plan.
    """
    # NOTE: the docstring above is kept verbatim on purpose - for a tool
    # registered with @mcp.tool() it presumably doubles as the description the
    # calling model sees.
    if not todo_ids:
        return "❌ Error: Task IDs list cannot be empty."

    # Trim every id and drop the ones that are empty or whitespace-only.
    wanted_ids = [raw.strip() for raw in todo_ids if raw and raw.strip()]
    if not wanted_ids:
        return "❌ Error: No valid task IDs provided."

    plan = load_todos()
    newly_done = []
    missing = []

    for wanted in wanted_ids:
        # First task whose id equals the given value or starts with it
        # (any prefix matches, not only the 8-character short form).
        match = next(
            (
                item
                for item in plan
                if item["id"] == wanted or item["id"].startswith(wanted)
            ),
            None,
        )
        if match is None:
            missing.append(wanted)
        elif not match.get("completed", False):
            # Tasks that are already completed are matched but not reported again.
            match["completed"] = True
            newly_done.append(match["title"])

    # Nothing changed and at least one id was unknown: report that as an error.
    if missing and not newly_done:
        return f"❌ Error: Task IDs not found: {', '.join(missing)}"

    if not save_todos(plan):
        return "❌ Error: Failed to save changes."

    # Singular vs. plural confirmation banner.
    if len(newly_done) == 1:
        banner = f'✅ Completed: "{newly_done[0]}"'
    else:
        bullets = "\n".join(f"  - {name}" for name in newly_done)
        banner = f"✅ Completed {len(newly_done)} tasks:\n" + bullets

    if missing:
        banner += "\n⚠️  Not found: " + ", ".join(missing)

    return format_todos_as_markdown(plan, banner)


@mcp.tool()
async def delete_todo(todo_ids: List[str]) -> str:
    """
    Remove one or more tasks from your plan.

    Use this to adjust your plan when tasks become irrelevant, duplicated,
    or no longer needed. This helps keep your plan focused and accurate.

    Args:
        todo_ids: List of task IDs to remove (full ID or first 8 characters).
                  For example: ["a7f3b2c1"] or ["a7f3b2c1", "b8e4c3d2"]

    Returns:
        Markdown formatted string showing the success message and remaining task plan.
    """
    # NOTE(review): the docstring above is kept verbatim because it is presumably
    # what the tool framework exposes as the tool description — confirm before
    # rewording it.
    if not todo_ids:
        return "❌ Error: Task IDs list cannot be empty."

    # Drop blank entries and collapse repeated IDs (first occurrence wins).
    id_list = list(dict.fromkeys(tid.strip() for tid in todo_ids if tid and tid.strip()))

    if not id_list:
        return "❌ Error: No valid task IDs provided."

    todos = load_todos()
    deleted_todos = []
    not_found_ids = []
    ids_to_delete = set()

    # Find all tasks to delete
    for todo_id in id_list:
        # The first task whose ID equals, or starts with, the requested ID wins.
        match = next(
            (
                todo
                for todo in todos
                if todo["id"] == todo_id or todo["id"].startswith(todo_id)
            ),
            None,
        )
        if match is None:
            not_found_ids.append(todo_id)
        elif match["id"] not in ids_to_delete:
            # Two different prefixes may resolve to the same task; count and
            # report that task only once.
            ids_to_delete.add(match["id"])
            deleted_todos.append(match["title"])

    if not deleted_todos and not_found_ids:
        return f"❌ Error: Task IDs not found: {', '.join(not_found_ids)}"

    # Remove the tasks
    todos = [t for t in todos if t["id"] not in ids_to_delete]

    if not save_todos(todos):
        return "❌ Error: Failed to save changes."

    # Build success message
    if len(deleted_todos) == 1:
        message = f'🗑️ Deleted: "{deleted_todos[0]}"'
    else:
        message = f"🗑️ Deleted {len(deleted_todos)} tasks:\n" + "\n".join(
            f"  - {t}" for t in deleted_todos
        )

    if not_found_ids:
        message += f'\n⚠️  Not found: {", ".join(not_found_ids)}'

    return format_todos_as_markdown(todos, message)


# Script entry point: when this module is executed directly, run the `mcp`
# server object over the stdio transport.
if __name__ == "__main__":
    mcp.run(transport="stdio")


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/manager.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import asyncio
import functools
from typing import Any, Awaitable, Callable, Protocol, TypeVar

from mcp import ClientSession, StdioServerParameters  # (already imported in config.py)
from mcp.client.sse import sse_client
from mcp.client.stdio import stdio_client

from .mcp_servers.browser_session import PlaywrightSession

# logger = logging.getLogger("miroflow_agent")

# Result type of the coroutine function being wrapped by `with_timeout`.
R = TypeVar("R")


def with_timeout(timeout_s: float = 300.0):
    """
    Build a decorator that bounds how long an *async* function may run.

    Every call to the decorated function is awaited through
    asyncio.wait_for() with ``timeout_s`` seconds as the limit.

    Usage:
        @with_timeout(20)
        async def create_message_foo(...): ...

    :param timeout_s: Time limit, in seconds, handed to asyncio.wait_for().
    :return: A decorator for coroutine functions.
    """

    def _bind(
        func: Callable[..., Awaitable[R]],
    ) -> Callable[..., Awaitable[R]]:
        async def _bounded(*args, **kwargs) -> R:
            # Create the awaitable first, then let wait_for enforce the limit.
            pending = func(*args, **kwargs)
            return await asyncio.wait_for(pending, timeout=timeout_s)

        # Copy name/docstring/etc. from the wrapped function onto the wrapper.
        return functools.wraps(func)(_bounded)

    return _bind


class ToolManagerProtocol(Protocol):
    """Structural interface for tool managers, so other implementations can be used."""

    async def get_all_tool_definitions(self) -> Any:
        """Return the tool definitions this manager exposes."""

    async def execute_tool_call(
        self, *, server_name: str, tool_name: str, arguments: dict[str, Any]
    ) -> Any:
        """Run tool ``tool_name`` of server ``server_name`` with ``arguments``."""


class ToolManager(ToolManagerProtocol):
    """
    Lists and executes tools exposed by the configured MCP servers.

    A server's ``params`` is either a ``StdioServerParameters`` instance (the
    server is reached through ``stdio_client``) or an ``http://``/``https://``
    URL string (reached through ``sse_client``). The server named
    ``"playwright"`` is special-cased: it is driven through one
    ``PlaywrightSession`` that is created on first use and kept on the instance.
    """

    def __init__(self, server_configs, tool_blacklist=None):
        """
        Initialize ToolManager.
        :param server_configs: List returned by create_server_parameters(); each
            entry is indexed by "name" and "params"
        :param tool_blacklist: Optional collection of (server_name, tool_name)
            pairs that must not be offered by get_all_tool_definitions()
        """
        self.server_configs = server_configs
        self.server_dict = {
            config["name"]: config["params"] for config in server_configs
        }
        self.browser_session = None
        self.tool_blacklist = tool_blacklist if tool_blacklist else set()
        self.task_log = None

    def set_task_log(self, task_log):
        """Set the task logger for structured logging."""
        self.task_log = task_log

        self._log(
            "info",
            "ToolManager | Initialization",
            f"ToolManager initialized, loaded servers: {list(self.server_dict.keys())}",
        )

    def _log(self, level, step_name, message, metadata=None):
        """Helper method to log using task_log if available, otherwise skip logging."""
        if self.task_log:
            self.task_log.log_step(level, step_name, message, metadata)

    def _is_huggingface_dataset_or_space_url(self, url):
        """
        Check if the URL is a Hugging Face dataset or space URL.
        :param url: The URL to check
        :return: True if it's a HuggingFace dataset or space URL, False otherwise
        """
        if not url:
            return False
        return "huggingface.co/datasets" in url or "huggingface.co/spaces" in url

    def _should_block_hf_scraping(self, tool_name, arguments):
        """
        Check if we should block scraping of Hugging Face datasets/spaces.
        :param tool_name: The name of the tool being called
        :param arguments: The arguments passed to the tool (may be empty or None)
        :return: True if scraping should be blocked, False otherwise
        """
        if tool_name not in ("scrape", "scrape_website") or not arguments:
            return False
        # Always a real bool: the URL helper already maps a missing URL to False.
        return self._is_huggingface_dataset_or_space_url(arguments.get("url"))

    def get_server_params(self, server_name):
        """Get parameters for the specified server"""
        return self.server_dict.get(server_name)

    def _open_transport(self, server_name, server_params):
        """
        Select the MCP client transport matching the type of ``server_params``.
        :param server_name: Server name (used in log and error messages only)
        :param server_params: StdioServerParameters instance or http(s) URL string
        :return: The async context manager to enter with ``async with``; it
            yields the (read, write) pair handed to ClientSession
        :raises TypeError: If ``server_params`` is neither of the supported kinds
        """
        if isinstance(server_params, StdioServerParameters):
            return stdio_client(server_params)
        if isinstance(server_params, str) and server_params.startswith(
            ("http://", "https://")
        ):
            # SSE endpoint
            return sse_client(server_params)

        self._log(
            "error",
            "ToolManager | Unknown Parameter Type",
            f"Error: Unknown parameter type for server '{server_name}': {type(server_params)}",
        )
        raise TypeError(
            f"Unknown server params type for {server_name}: {type(server_params)}"
        )

    async def get_all_tool_definitions(self):
        """
        Connect to all configured servers and get their tool definitions.
        Returns a list suitable for passing to the Prompt generator.

        Each element is ``{"name": <server name>, "tools": [...]}``. A tool is
        described by "name", "description" and "schema". When a server cannot
        be queried, its "tools" list holds a single ``{"error": ...}`` entry.
        Blacklisted (server_name, tool_name) pairs are skipped for every
        transport.
        """
        all_servers_for_prompt = []
        for config in self.server_configs:
            server_name = config["name"]
            server_params = config["params"]
            one_server_for_prompt = {"name": server_name, "tools": []}
            self._log(
                "info",
                "ToolManager | Get Tool Definitions",
                f"Getting tool definitions for server '{server_name}'...",
            )

            try:
                async with self._open_transport(server_name, server_params) as (
                    read,
                    write,
                ):
                    async with ClientSession(
                        read, write, sampling_callback=None
                    ) as session:
                        await session.initialize()
                        tools_response = await session.list_tools()
                        for tool in tools_response.tools:
                            # The blacklist applies to stdio and SSE servers alike.
                            if (server_name, tool.name) in self.tool_blacklist:
                                self._log(
                                    "info",
                                    "ToolManager | Tool Blacklisted",
                                    f"Tool '{tool.name}' in server '{server_name}' is blacklisted, skipping.",
                                )
                                continue
                            one_server_for_prompt["tools"].append(
                                {
                                    "name": tool.name,
                                    "description": tool.description,
                                    "schema": tool.inputSchema,
                                }
                            )

                self._log(
                    "info",
                    "ToolManager | Tool Definitions Success",
                    f"Successfully obtained {len(one_server_for_prompt['tools'])} tool definitions from server '{server_name}'.",
                )
                all_servers_for_prompt.append(one_server_for_prompt)

            except Exception as e:
                self._log(
                    "error",
                    "ToolManager | Connection Error",
                    f"Error: Unable to connect or get tools from server '{server_name}': {e}",
                )
                # Still add the server entry, but replace its tool list with
                # the error information.
                one_server_for_prompt["tools"] = [
                    {"error": f"Unable to fetch tools: {e}"}
                ]
                all_servers_for_prompt.append(one_server_for_prompt)

        return all_servers_for_prompt

    def _markitdown_fallback(self, server_name, tool_name, url):
        """
        Try to fetch ``url`` with MarkItDown after a scrape tool failed.
        :param server_name: Server name echoed in the result
        :param tool_name: Tool name echoed in the result
        :param url: The URL the failed scrape call was given
        :return: A result dictionary on success, None if the fallback failed too
        """
        try:
            self._log(
                "info",
                "ToolManager | Fallback Attempt",
                "Attempting fallback using MarkItDown...",
            )
            # Imported lazily so a missing package only fails the fallback.
            from markitdown import MarkItDown

            # NOTE(review): the endpoint below looks like an unfilled
            # placeholder — confirm whether a real Document Intelligence
            # endpoint (or none at all) is intended.
            md = MarkItDown(docintel_endpoint="<document_intelligence_endpoint>")
            result = md.convert(url)
            self._log(
                "info",
                "ToolManager | Fallback Success",
                "MarkItDown fallback successful",
            )
            return {
                "server_name": server_name,
                "tool_name": tool_name,
                "result": result.text_content,  # Return extracted text content
            }
        except Exception as inner_e:
            self._log(
                "error",
                "ToolManager | Fallback Failed",
                f"Fallback also failed: {inner_e}",
            )
            return None

    @with_timeout(1200)
    async def execute_tool_call(self, server_name, tool_name, arguments) -> Any:
        """
        Execute a single tool call.
        :param server_name: Server name
        :param tool_name: Tool name
        :param arguments: Tool arguments dictionary
        :return: Dictionary containing result or error; it always carries
            "server_name" and "tool_name" plus either "result" or "error"
        """
        server_params = self.get_server_params(server_name)
        if not server_params:
            self._log(
                "error",
                "ToolManager | Server Not Found",
                f"Error: Attempting to call server '{server_name}' not found",
            )
            return {
                "server_name": server_name,
                "tool_name": tool_name,
                "error": f"Server '{server_name}' not found.",
            }

        self._log(
            "info",
            "ToolManager | Tool Call Start",
            f"Connecting to server '{server_name}' to call tool '{tool_name}'",
            metadata={"arguments": arguments},
        )

        if server_name == "playwright":
            # The browser server goes through one session that is created on
            # first use and then reused by later calls.
            try:
                if self.browser_session is None:
                    self.browser_session = PlaywrightSession(server_params)
                    await self.browser_session.connect()
                tool_result = await self.browser_session.call_tool(
                    tool_name, arguments=arguments
                )
                return {
                    "server_name": server_name,
                    "tool_name": tool_name,
                    "result": tool_result,
                }
            except Exception as e:
                self._log(
                    "error",
                    "ToolManager | Tool Call Failed",
                    f"Error: Failed to call tool '{tool_name}' (server: '{server_name}'): {e}",
                )
                return {
                    "server_name": server_name,
                    "tool_name": tool_name,
                    "error": f"Tool call failed: {str(e)}",
                }

        try:
            result_content = None
            async with self._open_transport(server_name, server_params) as (
                read,
                write,
            ):
                async with ClientSession(
                    read, write, sampling_callback=None
                ) as session:
                    await session.initialize()
                    try:
                        tool_result = await session.call_tool(
                            tool_name, arguments=arguments
                        )
                        # Only the text of the last content item is returned.
                        result_content = (
                            tool_result.content[-1].text
                            if tool_result.content
                            else ""
                        )
                        # Post hoc check: discard scraped content of Hugging
                        # Face datasets/spaces so answers cannot be read there.
                        if self._should_block_hf_scraping(tool_name, arguments):
                            result_content = "You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose."
                    except Exception as tool_error:
                        self._log(
                            "error",
                            "ToolManager | Tool Execution Error",
                            f"Tool execution error: {tool_error}",
                        )
                        return {
                            "server_name": server_name,
                            "tool_name": tool_name,
                            "error": f"Tool execution failed: {str(tool_error)}",
                        }

            self._log(
                "info",
                "ToolManager | Tool Call Success",
                f"Tool '{tool_name}' (server: '{server_name}') called successfully.",
            )

            return {
                "server_name": server_name,
                "tool_name": tool_name,
                "result": result_content,  # Return extracted text content
            }

        except Exception as outer_e:
            self._log(
                "error",
                "ToolManager | Tool Call Failed",
                f"Error: Failed to call tool '{tool_name}' (server: '{server_name}'): {outer_e}",
            )

            # Keep the original failure text; it is what gets reported even if
            # the fallback below fails as well.
            error_message = str(outer_e)

            # `arguments` may be None/empty here, so never index it blindly
            # inside this handler.
            fallback_url = arguments.get("url") if arguments else None
            if (
                tool_name in ["scrape", "scrape_website"]
                and "unhandled errors" in error_message
                and fallback_url is not None
            ):
                fallback_result = self._markitdown_fallback(
                    server_name, tool_name, fallback_url
                )
                if fallback_result is not None:
                    return fallback_result

            # Always use the outer exception for the final error response
            return {
                "server_name": server_name,
                "tool_name": tool_name,
                "error": f"Tool call failed: {error_message}",
            }


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/__init__.py
================================================


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/audio_mcp_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import asyncio
import base64
import contextlib
import mimetypes
import os
import tempfile
import wave
from urllib.parse import urlparse

import requests
from fastmcp import FastMCP
from mutagen import File as MutagenFile
from openai import OpenAI

# OpenAI credentials and endpoint, read from the environment once at import
# time. The key is None when unset; the base URL falls back to the value below.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")

# Initialize FastMCP server
mcp = FastMCP("audio-mcp-server")


def _get_audio_extension(url: str, content_type: str = None) -> str:
    """
    Determine the appropriate audio file extension from URL or content type.

    Args:
        url: The URL of the audio file
        content_type: The content type from HTTP headers

    Returns:
        File extension (with dot) to use for temporary file
    """
    # First try to get extension from URL
    parsed_url = urlparse(url)
    path = parsed_url.path.lower()

    # Common audio extensions
    audio_extensions = [".mp3", ".wav", ".m4a", ".aac", ".ogg", ".flac", ".wma"]
    for ext in audio_extensions:
        if path.endswith(ext):
            return ext

    # If no extension found in URL, try content type
    if content_type:
        content_type = content_type.lower()
        if "mp3" in content_type or "mpeg" in content_type:
            return ".mp3"
        elif "wav" in content_type:
            return ".wav"
        elif "m4a" in content_type:
            return ".m4a"
        elif "aac" in content_type:
            return ".aac"
        elif "ogg" in content_type:
            return ".ogg"
        elif "flac" in content_type:
            return ".flac"

    # Default fallback to mp3
    return ".mp3"


def _get_audio_duration(audio_path: str) -> float:
    """
    Get audio duration in seconds.

    Tries to use wave (for .wav), then falls back to mutagen (for mp3, etc).
    Returns 0.0 if duration cannot be determined.
    """
    # Try using wave for .wav files
    try:
        with contextlib.closing(wave.open(audio_path, "rb")) as f:
            frames = f.getnframes()
            rate = f.getframerate()
            duration = frames / float(rate)
            if duration > 0:
                return duration
    except Exception:
        pass  # Not a wav file or failed

    # Try using mutagen for other audio formats (mp3, etc)
    try:
        audio = MutagenFile(audio_path)
        if (
            audio is not None
            and hasattr(audio, "info")
            and hasattr(audio.info, "length")
        ):
            duration = float(audio.info.length)
            if duration > 0:
                return duration
    except Exception:
        pass  # Failed to get duration

    # Return 0.0 if all methods failed
    return 0.0


def _encode_audio_file(audio_path: str) -> tuple[str, str]:
    """Encode audio file to base64 and determine format."""
    with open(audio_path, "rb") as audio_file:
        audio_data = audio_file.read()
        encoded_string = base64.b64encode(audio_data).decode("utf-8")

    # Determine file format from file extension
    mime_type, _ = mimetypes.guess_type(audio_path)
    if mime_type and mime_type.startswith("audio/"):
        mime_format = mime_type.split("/")[-1]
        # Map MIME type formats to OpenAI supported formats
        format_mapping = {
            "mpeg": "mp3",  # audio/mpeg -> mp3
            "wav": "wav",  # audio/wav -> wav
            "wave": "wav",  # audio/wave -> wav
        }
        file_format = format_mapping.get(mime_format, "mp3")
    else:
        # Default to mp3 if we can't determine
        file_format = "mp3"

    return encoded_string, file_format


@mcp.tool()
async def audio_transcription(audio_path_or_url: str) -> str:
    """
    Transcribe audio file to text and return the transcription.
    Args:
        audio_path_or_url: The path of the audio file locally or its URL. Path from sandbox is not supported. YouTube URL is not supported.

    Returns:
        The transcription of the audio file.
    """
    # NOTE(review): the docstring above is kept verbatim because it is presumably
    # what FastMCP publishes as the tool description — confirm before rewording.
    #
    # Flow: a path that exists locally is sent to the transcription API as is;
    # anything else (except sandbox-looking paths) is downloaded to a temporary
    # file first. Any failure is retried up to `max_retries` attempts with an
    # increasing sleep in between; the last failure is returned as an
    # "[ERROR]: ..." string instead of being raised.
    max_retries = 3
    retry = 0
    transcription = None

    # Without a timeout requests can wait forever on a stalled server. A
    # timeout raises requests.Timeout, which takes the normal retry path below.
    download_timeout_s = 60
    # Same browser-like User-Agent that audio_question_answering sends.
    download_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }

    # Create client once outside the retry loop
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

    while retry < max_retries:
        try:
            if os.path.exists(audio_path_or_url):  # Check if the file exists locally
                with open(audio_path_or_url, "rb") as audio_file:
                    transcription = client.audio.transcriptions.create(
                        model="gpt-4o-transcribe", file=audio_file
                    )
            elif "home/user" in audio_path_or_url:
                return "[ERROR]: The audio_transcription tool cannot access to sandbox file, please use the local path provided by original instruction"
            else:
                # download the audio file from the URL
                response = requests.get(
                    audio_path_or_url,
                    headers=download_headers,
                    timeout=download_timeout_s,
                )
                response.raise_for_status()  # Raise an exception for bad status codes

                # Basic content validation - check if response has content
                if not response.content:
                    return (
                        "[ERROR]: Audio transcription failed: Downloaded file is empty"
                    )

                # Check content type if available
                content_type = response.headers.get("content-type", "").lower()

                # Get proper extension for the temporary file
                file_extension = _get_audio_extension(audio_path_or_url, content_type)

                # delete=False: the file is closed here, reopened for the
                # upload, and removed explicitly in the `finally` below.
                with tempfile.NamedTemporaryFile(
                    delete=False, suffix=file_extension
                ) as temp_file:
                    temp_file.write(response.content)
                    temp_audio_path = temp_file.name

                try:
                    with open(temp_audio_path, "rb") as audio_file:
                        transcription = client.audio.transcriptions.create(
                            model="gpt-4o-transcribe", file=audio_file
                        )
                finally:
                    # Clean up the temp file
                    if os.path.exists(temp_audio_path):
                        os.remove(temp_audio_path)
            break

        except requests.RequestException as e:
            retry += 1
            if retry >= max_retries:
                return f"[ERROR]: Audio transcription failed: Failed to download audio file - {e}.\nNote: Files from sandbox are not available. You should use local path given in the instruction. \nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported."
            # Back off before the next attempt: 10 s, then 20 s.
            await asyncio.sleep(5 * (2**retry))
        except Exception as e:
            retry += 1
            if retry >= max_retries:
                return f"[ERROR]: Audio transcription failed: {e}\nNote: Files from sandbox are not available. You should use local path given in the instruction. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported."
            await asyncio.sleep(5 * (2**retry))

    # The loop is only left through `break`, i.e. after a successful call.
    return transcription.text


@mcp.tool()
async def audio_question_answering(audio_path_or_url: str, question: str) -> str:
    """
    Answer the question based on the given audio information.

    Args:
        audio_path_or_url: The path of the audio file locally or its URL. Path from sandbox is not supported. YouTube URL is not supported.
        question: The question to answer.

    Returns:
        The answer to the question, and the duration of the audio file.
    """
    # NOTE(review): the docstring above is kept verbatim because it is presumably
    # what FastMCP publishes as the tool description — confirm before rewording.
    #
    # Flow: load the audio (local path, or download to a temporary file),
    # base64-encode it, and send it together with the question to the chat
    # completions API. Any failure is retried up to `max_retries` attempts with
    # an increasing sleep in between; the last failure is returned as an
    # "[ERROR]: ..." string instead of being raised.
    max_retries = 3
    retry = 0

    # Without a timeout requests can wait forever on a stalled server. A
    # timeout raises requests.Timeout, which takes the normal retry path below.
    download_timeout_s = 60

    # Create client once outside the retry loop
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

    # Initialize variables to avoid scope issues
    encoded_string = None
    file_format = None
    duration = 0.0
    completion = None

    # Built once: the prompt does not change between attempts. Written as
    # adjacent literals so no source indentation leaks into the prompt text.
    text_prompt = (
        "Answer the following question based on the given "
        f"audio information:\n\n{question}"
    )

    while retry < max_retries:
        try:
            if os.path.exists(audio_path_or_url):  # Check if the file exists locally
                encoded_string, file_format = _encode_audio_file(audio_path_or_url)
                duration = _get_audio_duration(audio_path_or_url)
            elif "home/user" in audio_path_or_url:
                return "[ERROR]: The audio_question_answering tool cannot access to sandbox file, please use the local path provided by original instruction"
            else:
                # download the audio file from the URL
                download = requests.get(
                    audio_path_or_url,
                    headers={
                        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                    },
                    timeout=download_timeout_s,
                )
                download.raise_for_status()  # Raise an exception for bad status codes

                # Basic content validation - check if response has content
                if not download.content:
                    return "[ERROR]: Audio question answering failed: Downloaded file is empty.\nNote: Files from sandbox are not available. You should use local path given in the instruction. \nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3.\nNote: YouTube video URL is not supported."

                # Check content type if available
                content_type = download.headers.get("content-type", "").lower()

                # Get proper extension for the temporary file
                file_extension = _get_audio_extension(audio_path_or_url, content_type)

                # delete=False: the file is closed here, read back by the
                # helpers, and removed explicitly in the `finally` below.
                with tempfile.NamedTemporaryFile(
                    delete=False, suffix=file_extension
                ) as temp_file:
                    temp_file.write(download.content)
                    temp_audio_path = temp_file.name

                try:
                    encoded_string, file_format = _encode_audio_file(temp_audio_path)
                    duration = _get_audio_duration(temp_audio_path)
                finally:
                    # Clean up the temp file
                    if os.path.exists(temp_audio_path):
                        os.remove(temp_audio_path)

            if encoded_string is None or file_format is None:
                return "[ERROR]: Audio question answering failed: Failed to encode audio file.\nNote: Files from sandbox are not available. You should use local path given in the instruction. \nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3.\nNote: YouTube video URL is not supported."

            completion = client.chat.completions.create(
                model="gpt-4o-audio-preview",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant specializing in audio analysis.",
                    },
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": text_prompt},
                            {
                                "type": "input_audio",
                                "input_audio": {
                                    "data": encoded_string,
                                    "format": file_format,
                                },
                            },
                        ],
                    },
                ],
            )

            # If we reach here, the API call was successful
            break

        except requests.RequestException as e:
            retry += 1
            if retry >= max_retries:
                return f"[ERROR]: Audio question answering failed: Failed to download audio file - {e}.\nNote: Files from sandbox are not available. You should use local path given in the instruction. \nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported."
            # Back off before the next attempt: 10 s, then 20 s.
            await asyncio.sleep(5 * (2**retry))
        except Exception as e:
            retry += 1
            if retry >= max_retries:
                return f"[ERROR]: Audio question answering failed when calling OpenAI API: {e}\nNote: Files from sandbox are not available. You should use local path given in the instruction. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported."
            await asyncio.sleep(5 * (2**retry))

    # The loop is only left through `break`, i.e. after a successful API call.
    # The message content may be None; treat that as an empty answer so the
    # duration can still be appended.
    response_text = completion.choices[0].message.content or ""
    response_text += f"\n\nAudio duration: {duration} seconds"

    return response_text


# Script entry point: when this module is executed directly, serve the FastMCP
# instance over the stdio transport.
if __name__ == "__main__":
    mcp.run(transport="stdio")


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/audio_mcp_server_os.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import asyncio
import base64
import contextlib
import mimetypes
import os
import tempfile
import wave
from urllib.parse import urlparse

import requests
from fastmcp import FastMCP
from mutagen import File as MutagenFile
from openai import OpenAI

# Settings for the transcription endpoint, read from the environment once at
# import time. Each value is None when its variable is unset. They are passed
# to the OpenAI client (base_url / api_key) and to the transcription request
# (model) in audio_transcription below.
WHISPER_API_KEY = os.environ.get("WHISPER_API_KEY")
WHISPER_BASE_URL = os.environ.get("WHISPER_BASE_URL")
WHISPER_MODEL_NAME = os.environ.get("WHISPER_MODEL_NAME")

# Initialize FastMCP server
mcp = FastMCP("audio-mcp-server-os")


def _get_audio_extension(url: str, content_type: str = None) -> str:
    """
    Determine the appropriate audio file extension from URL or content type.

    The URL path is checked first (case-insensitively, ignoring query string
    and fragment). Only when it does not end in a known audio extension is the
    content type consulted. Content types are matched by substring, so
    parameters such as ``; charset=...`` and vendor prefixes such as ``x-`` do
    not prevent a match.

    Args:
        url: The URL of the audio file
        content_type: The content type from HTTP headers (may be None or empty)

    Returns:
        File extension (with dot) to use for temporary file. Falls back to
        ".mp3" when neither the URL nor the content type is conclusive.
    """
    # First try to get extension from URL
    known_extensions = (".mp3", ".wav", ".m4a", ".aac", ".ogg", ".flac", ".wma")
    url_path = urlparse(url).path.lower()
    for ext in known_extensions:
        if url_path.endswith(ext):
            return ext

    # If no extension found in URL, try content type
    if content_type:
        lowered = content_type.lower()
        # Ordered (marker, extension) pairs; the first marker found wins.
        # "mp4" and "wma" come last so that every content type that already
        # matched an earlier marker keeps its previous result. They cover
        # "audio/mp4" (the MIME type used for .m4a files) and
        # "audio/x-ms-wma", which would otherwise be mislabelled as ".mp3".
        content_type_markers = (
            ("mp3", ".mp3"),
            ("mpeg", ".mp3"),
            ("wav", ".wav"),
            ("m4a", ".m4a"),
            ("aac", ".aac"),
            ("ogg", ".ogg"),
            ("flac", ".flac"),
            ("mp4", ".m4a"),
            ("wma", ".wma"),
        )
        for marker, ext in content_type_markers:
            if marker in lowered:
                return ext

    # Default fallback to mp3
    return ".mp3"


def _get_audio_duration(audio_path: str) -> float:
    """
    Return the duration of an audio file in seconds.

    The file is first read as WAV with the standard ``wave`` module. If that
    fails for any reason, or yields a non-positive duration, mutagen is asked
    for the stream length instead.

    Note on the return value: despite the ``float`` annotation, this function
    returns an ``"[ERROR]: ..."`` string when mutagen raises, and ``None`` when
    neither method produces a positive duration.
    """
    # Attempt 1: derive the duration from the WAV header (frames / frame rate).
    try:
        wav_reader = wave.open(audio_path, "rb")
        with contextlib.closing(wav_reader):
            seconds = wav_reader.getnframes() / float(wav_reader.getframerate())
            if seconds > 0:
                return seconds
    except Exception:
        pass  # Not a wav file or failed

    # Attempt 2: let mutagen inspect the file (mp3 and other formats).
    try:
        metadata = MutagenFile(audio_path)
        if metadata is None or not hasattr(metadata, "info"):
            return None
        if not hasattr(metadata.info, "length"):
            return None
        seconds = float(metadata.info.length)
        if seconds > 0:
            return seconds
        return None
    except Exception as e:
        return f"[ERROR]: Failed to get audio duration: {e}"


def _encode_audio_file(audio_path: str) -> tuple[str, str]:
    """Encode audio file to base64 and determine format.

    Args:
        audio_path: Path of the local audio file to read.

    Returns:
        A ``(encoded_string, file_format)`` tuple: the file content as a
        base64 ASCII string, and either ``"wav"`` or ``"mp3"``. ``"mp3"`` is
        the default for anything that is not recognised as WAV.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(audio_path, "rb") as audio_file:
        encoded_string = base64.b64encode(audio_file.read()).decode("utf-8")

    # Recognise WAV by extension first. The MIME type reported for ".wav"
    # depends on the MIME database in use and is commonly "audio/x-wav", which
    # a plain subtype lookup for "wav"/"wave" would miss and label as mp3.
    extension = os.path.splitext(audio_path)[1].lower()
    if extension in (".wav", ".wave"):
        return encoded_string, "wav"

    # Otherwise determine the format from the guessed MIME type
    file_format = "mp3"  # Default to mp3 if we can't determine
    mime_type, _ = mimetypes.guess_type(audio_path)
    if mime_type and mime_type.startswith("audio/"):
        mime_format = mime_type.split("/")[-1]
        # Map MIME type formats to OpenAI supported formats
        format_mapping = {
            "mpeg": "mp3",  # audio/mpeg -> mp3
            "wav": "wav",  # audio/wav -> wav
            "wave": "wav",  # audio/wave -> wav
            "x-wav": "wav",  # audio/x-wav -> wav
            "vnd.wave": "wav",  # audio/vnd.wave -> wav
            "x-pn-wav": "wav",  # audio/x-pn-wav -> wav
        }
        file_format = format_mapping.get(mime_format, "mp3")

    return encoded_string, file_format


@mcp.tool()
async def audio_transcription(audio_path_or_url: str) -> str:
    """
    Transcribe audio file to text and return the transcription.
    Args:
        audio_path_or_url: The path of the audio file locally or its URL. Path from sandbox is not supported. YouTube URL is not supported.

    Returns:
        The transcription of the audio file.
    """
    # Overall flow: up to `max_retries` attempts, each of which either
    # transcribes a local file directly or downloads the URL to a temporary
    # file and transcribes that. Validation failures return an "[ERROR]: ..."
    # string immediately; exceptions are retried with a growing delay.
    max_retries = 3
    retry = 0
    transcription = None
    # (connect, read) timeouts in seconds. Without a timeout requests waits
    # indefinitely on an unresponsive server, which would hang this tool. A
    # timeout raises a requests.RequestException subclass, so it goes through
    # the download retry path below.
    download_timeout = (10, 60)

    while retry < max_retries:
        try:
            client = OpenAI(base_url=WHISPER_BASE_URL, api_key=WHISPER_API_KEY)
            if os.path.exists(audio_path_or_url):  # Check if the file exists locally
                with open(audio_path_or_url, "rb") as audio_file:
                    transcription = client.audio.transcriptions.create(
                        model=WHISPER_MODEL_NAME, file=audio_file
                    )
            elif "home/user" in audio_path_or_url:
                # Not a local file, but the path mentions "home/user": reported
                # as a sandbox path that this tool cannot reach.
                return "[ERROR]: The audio_transcription tool cannot access to sandbox file, please use the local path provided by original instruction"
            else:
                # download the audio file from the URL
                response = requests.get(audio_path_or_url, timeout=download_timeout)
                response.raise_for_status()  # Raise an exception for bad status codes

                # Basic content validation - check if response has content
                if not response.content:
                    return (
                        "[ERROR]: Audio transcription failed: Downloaded file is empty"
                    )

                # Check content type if available
                content_type = response.headers.get("content-type", "").lower()
                if content_type and not any(
                    media_type in content_type
                    for media_type in ["audio", "video", "application/octet-stream"]
                ):
                    return f"[ERROR]: Audio transcription failed: Invalid content type '{content_type}'. Expected audio file."

                # Get proper extension for the temporary file
                file_extension = _get_audio_extension(audio_path_or_url, content_type)

                # Use proper temporary file handling with correct extension.
                # delete=False so the file can be reopened by name below; it is
                # removed explicitly in the finally clause.
                with tempfile.NamedTemporaryFile(
                    delete=False, suffix=file_extension
                ) as temp_file:
                    temp_file.write(response.content)
                    temp_audio_path = temp_file.name

                try:
                    with open(temp_audio_path, "rb") as audio_file:
                        transcription = client.audio.transcriptions.create(
                            model=WHISPER_MODEL_NAME, file=audio_file
                        )
                finally:
                    # Clean up the temp file
                    if os.path.exists(temp_audio_path):
                        os.remove(temp_audio_path)
            # Reaching this point means the transcription call succeeded.
            break

        except requests.RequestException as e:
            # Download problem (connection error, bad status, timeout).
            retry += 1
            if retry >= max_retries:
                return f"[ERROR]: Audio transcription failed: Failed to download audio file - {e}.\nNote: Files from sandbox are not available. You should use local path given in the instruction. \nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported."
            # Wait 10 s after the first failure, 20 s after the second.
            await asyncio.sleep(5 * (2**retry))
        except Exception as e:
            # Anything else, including client construction and the API call.
            retry += 1
            if retry >= max_retries:
                return f"[ERROR]: Audio transcription failed: {e}\nNote: Files from sandbox are not available. You should use local path given in the instruction. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported."
            await asyncio.sleep(5 * (2**retry))

    return transcription.text


# Script entry point: when this module is executed directly (not imported),
# serve the FastMCP instance `mcp` over the stdio transport.
if __name__ == "__main__":
    mcp.run(transport="stdio")


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/browser_session.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import asyncio
import json
import logging

from mcp import StdioServerParameters
from mcp.client.session import ClientSession
from mcp.client.sse import sse_client
from mcp.client.stdio import stdio_client

# Module logger; uses the logger named "miroflow".
logger = logging.getLogger("miroflow")


class PlaywrightSession:
    """Class to maintain a persistent Playwright MCP session.

    The transport (stdio or SSE, chosen from the type of ``server_params``) and
    the ``ClientSession`` layered on it are opened once by :meth:`connect` and
    stay open across :meth:`call_tool` calls until :meth:`close` is called.

    Attributes:
        server_params: ``StdioServerParameters`` selects the stdio transport;
            any other value is handed to ``sse_client`` unchanged.
        read: Read stream of the open transport, or ``None`` when disconnected.
        write: Write stream of the open transport, or ``None`` when
            disconnected.
        session: The initialized ``ClientSession``, or ``None`` when
            disconnected.
    """

    def __init__(self, server_params):
        self.server_params = server_params
        self.read = None
        self.write = None
        self.session = None
        self._client = None
        # AsyncExitStack that owns the entered transport and session contexts
        # while connected; None otherwise.
        self._exit_stack = None

    async def connect(self):
        """Connect to the MCP server and initialize the session.

        Does nothing if a session is already open. If any step fails, every
        context entered so far is exited again and the instance is left
        disconnected, so a later call can retry from a clean state. The
        original exception propagates to the caller.
        """
        if self.session is not None:
            return

        # Function-scope import: only this method needs it.
        from contextlib import AsyncExitStack

        if isinstance(self.server_params, StdioServerParameters):
            client = stdio_client(self.server_params)
        else:
            client = sse_client(self.server_params)

        async with AsyncExitStack() as stack:
            read, write = await stack.enter_async_context(client)
            session = await stack.enter_async_context(
                ClientSession(read, write, sampling_callback=None)
            )
            await session.initialize()
            # Everything succeeded: move the registered exit callbacks to a
            # stack we keep, so leaving this block does not close the
            # connection. On an exception above this line is never reached and
            # the stack unwinds whatever was entered.
            self._exit_stack = stack.pop_all()

        # Publish the state only after the whole handshake has succeeded.
        self._client = client
        self.read, self.write = read, write
        self.session = session
        logger.info("Connected to MCP server and initialized session")

    async def call_tool(self, tool_name, arguments=None):
        """Call a tool while maintaining the session.

        Connects first if no session is open.

        Args:
            tool_name: Name of the tool to invoke on the server.
            arguments: Arguments passed through to ``ClientSession.call_tool``.

        Returns:
            The ``text`` of the first content item of the tool result, or an
            empty string when the result has no content.
        """
        if self.session is None:
            await self.connect()

        logger.info(f"Calling tool '{tool_name}'")
        tool_result = await self.session.call_tool(tool_name, arguments=arguments)
        result_content = tool_result.content[0].text if tool_result.content else ""
        return result_content

    async def close(self):
        """Close the session and connection.

        Safe to call when not connected. The session is exited before the
        transport (reverse order of entry). Instance state is reset before the
        teardown runs, so the object is reusable even if teardown raises.
        """
        exit_stack = self._exit_stack
        was_connected = self._client is not None

        self._exit_stack = None
        self.session = None
        self._client = None
        self.read = None
        self.write = None

        if exit_stack is not None:
            await exit_stack.aclose()
        if was_connected:
            logger.info("Closed MCP session")


# Example usage:
async def test_persistent_session():
    """Example: make two tool calls over one persistent session.

    Connects to an MCP server at ``http://localhost:8931``, navigates to
    https://example.com, takes a page snapshot, logs its ``url`` and ``title``
    fields and writes the snapshot to ``snapshot.json`` in the current working
    directory. The session is always closed afterwards.
    """
    # Create a persistent session
    mcp_session = PlaywrightSession("http://localhost:8931")

    try:
        # First call: Navigate to a website
        await mcp_session.call_tool("browser_navigate", {"url": "https://example.com"})
        logger.info("Navigation complete")

        # Wait a moment for the page to load
        await asyncio.sleep(2)

        # Second call: Take a snapshot of the current page
        snapshot_result = await mcp_session.call_tool("browser_snapshot", {})

        # Process and save the snapshot
        snapshot_json = json.loads(snapshot_result)
        logger.info(f"Snapshot taken of page: {snapshot_json.get('url')}")
        logger.info(f"Page title: {snapshot_json.get('title')}")

        # ensure_ascii=False emits non-ASCII characters verbatim, so the file
        # must be opened with an explicit UTF-8 encoding. The platform default
        # encoding may be unable to represent them.
        with open("snapshot.json", "w", encoding="utf-8") as f:
            json.dump(snapshot_json, f, indent=2, ensure_ascii=False)

        logger.info("Snapshot saved to snapshot.json")

    finally:
        # Close the session when done with all tool calls
        await mcp_session.close()


# Script entry point: run the example coroutine above on a new event loop.
if __name__ == "__main__":
    asyncio.run(test_persistent_session())


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/python_mcp_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import asyncio
import os
import shlex
from urllib.parse import urlparse

from e2b_code_interpreter import Sandbox
from fastmcp import FastMCP

# Initialize FastMCP server
mcp = FastMCP("e2b-python-interpreter")

# API keys (None when the environment variable is unset)
E2B_API_KEY = os.environ.get("E2B_API_KEY")
LOGS_DIR = os.environ.get(
    "LOGS_DIR", "../../logs"
)  # Directory where benchmark logs are stored

# Sandbox template id passed as `template=` when a new sandbox is created
DEFAULT_TEMPLATE_ID = "1av7fdjfvcparqo8efq6"

# DEFAULT CONFS
# Sandbox timeout in seconds; also the upper bound for create_sandbox's
# `timeout` argument and the value re-applied around each tool call.
DEFAULT_TIMEOUT = 600  # seconds
# Maximum length, in characters (not tokens), of a result returned by the
# Python tool; enforced with len() and string slicing in truncate_result.
MAX_RESULT_LEN = 20_000
# Maximum length, in characters (not tokens), of the error details embedded in
# an error message; applied by slicing the stringified error.
MAX_ERROR_LEN = 4_000
# Invalid sandbox IDs that are not allowed to be used. The tools in this module
# compare a given sandbox_id against this set before connecting.
# NOTE(review): these look like placeholder ids a caller might invent instead
# of calling create_sandbox first — confirm the origin of the list.
INVALID_SANDBOX_IDS = {
    "default",
    "sandbox1",
    "sandbox",
    "some_id",
    "new_sandbox",
    "python",
    "create_sandbox",
    "sandbox123",
    "temp",
    "sandbox-0",
    "sandbox-1",
    "sandbox_0",
    "sandbox_1",
    "new",
    "0",
    "auto",
    "default_sandbox",
    "none",
    "sandbox_12345",
    "dummy",
    "sandbox_01",
}


def looks_like_dir(path: str) -> bool:
    """
    Decide whether ``path`` should be treated as a directory.

    A path counts as a directory when any of the following holds:
      - it exists on the local filesystem and is a directory;
      - it ends with the OS path separator;
      - its final component has no file extension (per ``os.path.splitext``).

    Only the first check touches the filesystem; the other two are purely
    textual, so they also apply to paths that do not exist.
    """
    # An existing directory settles the question.
    if os.path.isdir(path):
        return True

    # Otherwise judge by shape: trailing separator or missing extension.
    _, extension = os.path.splitext(path)
    return path.endswith(os.path.sep) or not extension


def truncate_result(result: str) -> str:
    """
    Cap a result string at MAX_RESULT_LEN characters.

    Args:
        result: The full result string to potentially truncate

    Returns:
        ``result`` unchanged when it is at most MAX_RESULT_LEN characters
        long. Otherwise its first MAX_RESULT_LEN characters followed by a
        truncation notice, so the returned string is longer than
        MAX_RESULT_LEN by the length of that notice.
    """
    if len(result) <= MAX_RESULT_LEN:
        return result

    return result[:MAX_RESULT_LEN] + " [Result truncated due to length limit]"


@mcp.tool()
async def create_sandbox(timeout: int = DEFAULT_TIMEOUT) -> str:
    """Create a linux sandbox.

    Args:
        timeout: Time in seconds before the sandbox is automatically shutdown. The default is 600 seconds.

    Returns:
        The sandbox_id of the newly created sandbox. You should use this sandbox_id to run other tools in the sandbox.
    """
    # Up to `max_retries` attempts. An attempt succeeds only if the sandbox is
    # created, its info is fetched and the local tmpfiles directory exists.
    # On the final failure an "[ERROR]: ..." string is returned, not raised.
    max_retries = 5
    # Requested timeouts above the default are capped at DEFAULT_TIMEOUT.
    timeout = min(timeout, DEFAULT_TIMEOUT)
    for attempt in range(1, max_retries + 1):
        sandbox = None
        try:
            sandbox = Sandbox(
                template=DEFAULT_TEMPLATE_ID,
                timeout=timeout,
                api_key=E2B_API_KEY,
            )
            info = sandbox.get_info()

            tmpfiles_dir = os.path.join(LOGS_DIR, "tmpfiles")
            os.makedirs(tmpfiles_dir, exist_ok=True)

            message = f"Sandbox created with sandbox_id: {info.sandbox_id}"

            # Set timeout before exit to prevent timeout after function exits
            try:
                sandbox.set_timeout(timeout)
            except Exception:
                pass  # Ignore timeout setting errors

            return message
        except Exception as e:
            # The sandbox may already exist if a later step of this attempt
            # failed. Its id is never reported to the caller, so kill it
            # (best effort) to avoid leaking one per failed attempt.
            if sandbox is not None:
                try:
                    sandbox.kill()
                except Exception:
                    pass  # Nothing more can be done about a failed cleanup

            if attempt == max_retries:
                error_details = str(e)[:MAX_ERROR_LEN]
                return f"[ERROR]: Failed to create sandbox after {max_retries} attempts: {error_details}, please retry later."
            # Quadratic backoff: 1, 4, 9, 16 seconds
            await asyncio.sleep(attempt**2)


@mcp.tool()
async def run_command(command: str, sandbox_id: str) -> str:
    """Execute a lightweight shell command in the linux sandbox (no long-running, blocking, or resource-heavy processes).

    Args:
        command: The command to execute.
        sandbox_id: The id of the sandbox to execute the command in. To create a new sandbox, use tool `create_sandbox`.

    Returns:
        A CommandResult object containing the result of the command execution, format like CommandResult(stderr=..., stdout=..., exit_code=..., error=...)
    """
    # Reject known placeholder ids before contacting the sandbox service.
    if sandbox_id in INVALID_SANDBOX_IDS:
        return f"[ERROR]: '{sandbox_id}' is not a valid sandbox_id. Please create a real sandbox first using the create_sandbox tool."

    try:
        sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY)
    except Exception:
        return f"[ERROR]: Failed to connect to sandbox {sandbox_id}. Make sure the sandbox is created and the sandbox_id is correct."

    # Run the command, retrying on any exception. Every failure is reported as
    # a returned "[ERROR]: ..." string rather than raised.
    max_retries = 3
    attempt = 0
    while attempt < max_retries:
        attempt += 1
        try:
            # Refresh the sandbox timeout for each command execution.
            sandbox.set_timeout(DEFAULT_TIMEOUT)
            outcome = sandbox.commands.run(command)
            return truncate_result(str(outcome))
        except Exception as exc:
            if attempt < max_retries:
                # Quadratic backoff: 1 s, then 4 s.
                await asyncio.sleep(attempt**2)
            else:
                # Last attempt: build the error message.
                error_details = str(exc)[:MAX_ERROR_LEN]
                return f"[ERROR]: Failed to run command after {max_retries} attempts.\n\nException type: {type(exc).__name__}\nDetails: {error_details}"
        finally:
            # Re-arm the timeout after every attempt so the sandbox does not
            # expire right after this call; failures here are ignored.
            try:
                sandbox.set_timeout(DEFAULT_TIMEOUT)
            except Exception:
                pass


@mcp.tool()
async def run_python_code(code_block: str, sandbox_id: str) -> str:
    """Run short, safe python code in a sandbox and return the execution result (avoid long loops or heavy tasks; must finish quickly).

    Args:
        code_block: The python code to run.
        sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. To create a new sandbox, use tool `create_sandbox`.

    Returns:
        A CommandResult object containing the result of the command execution, format like CommandResult(stderr=..., stdout=..., exit_code=..., error=...)
    """
    # Stateless mode: with an empty or placeholder sandbox_id the code runs in
    # a throwaway sandbox that is killed right after execution.
    use_throwaway_sandbox = not sandbox_id or sandbox_id in INVALID_SANDBOX_IDS
    if use_throwaway_sandbox:
        try:
            throwaway = Sandbox(
                template=DEFAULT_TEMPLATE_ID,
                timeout=DEFAULT_TIMEOUT,
                api_key=E2B_API_KEY,
            )
            try:
                return truncate_result(str(throwaway.run_code(code_block)))
            finally:
                throwaway.kill()
        except Exception as exc:
            error_details = str(exc)[:MAX_ERROR_LEN]
            return f"[ERROR]: Failed to run code in stateless mode. Exception type: {type(exc).__name__}, Details: {error_details}"

    # Stateful mode: attach to the existing sandbox.
    try:
        sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY)
    except Exception:
        return f"[ERROR]: Failed to connect to sandbox {sandbox_id}. Make sure the sandbox is created and the sandbox_id is correct."

    max_retries = 3
    attempt = 0
    while attempt < max_retries:
        attempt += 1
        try:
            # Refresh the sandbox timeout for each execution.
            sandbox.set_timeout(DEFAULT_TIMEOUT)
            outcome = sandbox.run_code(code_block)
            return truncate_result(str(outcome))
        except Exception as exc:
            if attempt < max_retries:
                # Quadratic backoff: 1 s, then 4 s.
                await asyncio.sleep(attempt**2)
            else:
                error_details = str(exc)[:MAX_ERROR_LEN]
                return f"[ERROR]: Failed to run code in sandbox {sandbox_id} after {max_retries} attempts. Exception type: {type(exc).__name__}, Details: {error_details}"
        finally:
            # Re-arm the timeout after every attempt; failures are ignored.
            try:
                sandbox.set_timeout(DEFAULT_TIMEOUT)
            except Exception:
                pass


@mcp.tool()
async def upload_file_from_local_to_sandbox(
    sandbox_id: str, local_file_path: str, sandbox_file_path: str = "/home/user"
) -> str:
    """Upload a local file to the `/home/user` dir of the remote python interpreter.

    Args:
        sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. To create a new sandbox, use tool `create_sandbox`.
        local_file_path: The path of the file on local machine to upload.
        sandbox_file_path: The path of directory to upload the file to in the sandbox. Default is `/home/user/`.

    Returns:
        The path of the uploaded file in the remote python interpreter if the upload is successful.
    """
    # Reject known placeholder ids before contacting the sandbox service.
    if sandbox_id in INVALID_SANDBOX_IDS:
        return f"[ERROR]: '{sandbox_id}' is not a valid sandbox_id. Please create a real sandbox first using the create_sandbox tool."

    try:
        sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY)
    except Exception:
        return f"[ERROR]: Failed to connect to sandbox {sandbox_id}. Make sure the sandbox is created and the sandbox_id is correct."

    try:
        # Refresh the sandbox timeout before doing any work.
        sandbox.set_timeout(DEFAULT_TIMEOUT)

        # The source must exist locally and be a regular file.
        if not os.path.exists(local_file_path):
            return f"[ERROR]: Local file does not exist: {local_file_path}"
        if not os.path.isfile(local_file_path):
            return f"[ERROR]: Path is not a file: {local_file_path}"

        # Destination: <sandbox_file_path>/<local file name>, normalized.
        file_name = os.path.basename(local_file_path)
        uploaded_file_path = os.path.normpath(
            os.path.join(sandbox_file_path, file_name)
        )

        # Create the destination directory in the sandbox unless it is the
        # root directory or empty.
        target_dir = os.path.dirname(uploaded_file_path)
        if target_dir not in ("", "/"):
            mkdir_outcome = sandbox.commands.run(f"mkdir -p {shlex.quote(target_dir)}")
            if mkdir_outcome.exit_code != 0:
                outcome_text = str(mkdir_outcome)[:MAX_ERROR_LEN]
                return f"[ERROR]: Failed to create directory {target_dir} in sandbox {sandbox_id}: {outcome_text}"

        # Pass the open local file to the sandbox file API.
        with open(local_file_path, "rb") as local_file:
            sandbox.files.write(uploaded_file_path, local_file)

        return f"File uploaded to {uploaded_file_path}"
    except Exception as exc:
        error_details = str(exc)[:MAX_ERROR_LEN]
        return f"[ERROR]: Failed to upload file {local_file_path} to sandbox {sandbox_id}: {error_details}"
    finally:
        # Re-arm the timeout on the way out so the sandbox does not expire
        # right after this call; failures here are ignored.
        try:
            sandbox.set_timeout(DEFAULT_TIMEOUT)
        except Exception:
            pass

@mcp.tool()
async def download_file_from_internet_to_sandbox(
    sandbox_id: str, url: str, sandbox_file_path: str = "/home/user"
) -> str:
    """Download a file from the internet to the `/home/user` dir of the sandbox (avoid large or slow URLs).

    Args:
        sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. To create a new sandbox, use tool `create_sandbox`.
        url: The URL of the file to download.
        sandbox_file_path: The path of directory to download the file to in the sandbox. Default is `/home/user/`.

    Returns:
        The path of the downloaded file in the sandbox if the download is successful.
    """
    # Reject known placeholder ids before contacting the sandbox service.
    if sandbox_id in INVALID_SANDBOX_IDS:
        return f"[ERROR]: '{sandbox_id}' is not a valid sandbox_id. Please create a real sandbox first using the create_sandbox tool."

    try:
        sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY)
    except Exception:
        return f"[ERROR]: Failed to connect to sandbox {sandbox_id}. Make sure the sandbox is created and the sandbox_id is correct."

    try:
        sandbox.set_timeout(
            DEFAULT_TIMEOUT
        )  # refresh the timeout for each command execution

        # File name = last component of the URL path. urlparse has already
        # split off the query string and fragment, so the path cannot contain
        # "?" or "#" and needs no further stripping.
        basename = os.path.basename(urlparse(url).path) or "downloaded_file"

        # Check whether sandbox_file_path looks like a directory
        if looks_like_dir(sandbox_file_path):
            # It's a directory — join with the filename
            downloaded_file_path = os.path.join(sandbox_file_path, basename)
        else:
            # It's a file path — use it directly
            downloaded_file_path = sandbox_file_path

        # Normalize the path
        downloaded_file_path = os.path.normpath(downloaded_file_path)

        # Ensure the parent directory exists in sandbox
        parent_dir = os.path.dirname(downloaded_file_path)
        if parent_dir and parent_dir != "/":
            mkdir_result = sandbox.commands.run(f"mkdir -p {shlex.quote(parent_dir)}")
            if mkdir_result.exit_code != 0:
                mkdir_result_str = str(mkdir_result)[:MAX_ERROR_LEN]
                return f"[ERROR]: Failed to create directory {parent_dir} in sandbox {sandbox_id}: {mkdir_result_str}"

        # The command is identical for every attempt, so build it once. Both
        # arguments are shell-quoted because they come from the caller.
        cmd = f"wget {shlex.quote(url)} -O {shlex.quote(downloaded_file_path)}"

        # Download the file with retry logic
        max_retries = 3
        for attempt in range(1, max_retries + 1):
            try:
                result = sandbox.commands.run(cmd)
                if result.exit_code == 0:
                    # Report the real path, not its shell-quoted form, which
                    # would carry quote characters for paths with spaces etc.
                    return f"File downloaded to {downloaded_file_path}"
                elif attempt < max_retries:
                    # Backoff: 4 s, then 16 s
                    await asyncio.sleep(4**attempt)
                    continue  # Retry
                else:
                    # Extract detailed error information
                    error_details = ""
                    if hasattr(result, "stderr") and result.stderr:
                        error_details = f"stderr: {result.stderr}"[:MAX_ERROR_LEN]
                    error_msg = (
                        f"[ERROR]: Failed to download file from {url} to {downloaded_file_path} after {max_retries} attempts.\n\n"
                        f"exit_code: {result.exit_code}\n\n"
                        f"Details: {error_details}"
                    )
                    return error_msg
            except Exception as e:
                if attempt == max_retries:
                    error_details = str(e)[:MAX_ERROR_LEN]
                    error_msg = f"[ERROR]: Failed to download file from {url} to {downloaded_file_path}. Exception: {error_details}"
                    return error_msg
                await asyncio.sleep(4**attempt)
    except Exception as e:
        error_details = str(e)[:MAX_ERROR_LEN]
        return f"[ERROR]: Failed to download file from {url}: {error_details}"
    finally:
        # Set timeout before exit to prevent timeout after function exits
        try:
            sandbox.set_timeout(DEFAULT_TIMEOUT)
        except Exception:
            pass  # Ignore timeout setting errors


@mcp.tool()
async def download_file_from_sandbox_to_local(
    sandbox_id: str, sandbox_file_path: str, local_filename: str = None
) -> str:
    """Download a file from the sandbox to local system. Files in sandbox cannot be processed by tools from other servers - only local files and internet URLs can be processed by them.

    Args:
        sandbox_id: The id of the sandbox to download the file from. To have a sandbox, use tool `create_sandbox`.
        sandbox_file_path: The path of the file to download on the sandbox.
        local_filename: Optional filename to save as. If not provided, uses the original filename from sandbox_file_path.

    Returns:
        The local path of the downloaded file if successful, otherwise error message.
    """
    # Reject known placeholder ids before contacting the sandbox service.
    if sandbox_id in INVALID_SANDBOX_IDS:
        return f"[ERROR]: '{sandbox_id}' is not a valid sandbox_id. Please create a real sandbox first using the create_sandbox tool."

    try:
        sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY)
    except Exception:
        return f"[ERROR]: Failed to connect to sandbox {sandbox_id}. Make sure the sandbox is created and the sandbox_id is correct."

    try:
        sandbox.set_timeout(
            DEFAULT_TIMEOUT
        )  # refresh the timeout for each command execution

        # Create tmpfiles directory if it doesn't exist
        if not LOGS_DIR:
            return "[ERROR]: LOGS_DIR environment variable is not set. Cannot determine where to save the file."

        tmpfiles_dir = os.path.join(LOGS_DIR, "tmpfiles")
        os.makedirs(tmpfiles_dir, exist_ok=True)

        # Check if the path is a directory (before attempting to read)
        check_result = sandbox.commands.run(
            f'test -d {shlex.quote(sandbox_file_path)} && echo "is_directory" || echo "not_directory"'
        )
        if check_result.stdout and "is_directory" in check_result.stdout:
            return f"[ERROR]: Cannot download '{sandbox_file_path}' from sandbox {sandbox_id}: path is a directory, not a file."

        # Check if the file exists
        check_file_result = sandbox.commands.run(
            f'test -f {shlex.quote(sandbox_file_path)} && echo "exists" || echo "not_exists"'
        )
        if check_file_result.stdout and "not_exists" in check_file_result.stdout:
            # Check if it exists at all (might be a symlink or other type)
            check_any_result = sandbox.commands.run(
                f'test -e {shlex.quote(sandbox_file_path)} && echo "exists" || echo "not_exists"'
            )
            if check_any_result.stdout and "not_exists" in check_any_result.stdout:
                error_msg = f"[ERROR]: Cannot download '{sandbox_file_path}' from sandbox {sandbox_id}: file does not exist."
                return error_msg

        # Determine local filename. Only the final path component is kept, so
        # a caller-supplied name with directory parts cannot point outside, or
        # into a missing subdirectory of, tmpfiles_dir.
        if local_filename is None or local_filename.strip() == "":
            local_filename = os.path.basename(sandbox_file_path)
        else:
            local_filename = os.path.basename(local_filename)
        # If basename is empty or just '/', use a default name
        if not local_filename or local_filename == "/":
            local_filename = "downloaded_file"

        local_file_path = os.path.join(
            tmpfiles_dir, f"sandbox_{sandbox_id}_{local_filename}"
        )

        # Read from the sandbox first, and only then create the local file.
        # A failed read therefore leaves no empty file behind, and errors in
        # this handler are guaranteed to come from the sandbox side.
        try:
            content = sandbox.files.read(sandbox_file_path, format="bytes")
        except Exception as read_error:
            if "directory" in str(read_error).lower():
                return f"[ERROR]: Cannot download '{sandbox_file_path}' from sandbox {sandbox_id}: path is a directory, not a file."
            read_error_details = str(read_error)[:MAX_ERROR_LEN]
            return f"[ERROR]: Failed to read file '{sandbox_file_path}' from sandbox {sandbox_id}: {read_error_details}"

        # Local write errors are reported by the outer handler below.
        with open(local_file_path, "wb") as f:
            f.write(content)

        return f"File downloaded successfully to: {local_file_path}"
    except Exception as e:
        error_details = str(e)[:MAX_ERROR_LEN]
        return f"[ERROR]: Failed to download file '{sandbox_file_path}' from sandbox {sandbox_id}: {error_details}"
    finally:
        # Set timeout before exit to prevent timeout after function exits
        try:
            sandbox.set_timeout(DEFAULT_TIMEOUT)
        except Exception:
            pass  # Ignore timeout setting errors


# Script entry point: when this module is executed directly (not imported),
# serve the FastMCP instance `mcp` over the stdio transport.
if __name__ == "__main__":
    mcp.run(transport="stdio")


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/reading_mcp_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import argparse
import logging
import sys

from fastmcp import FastMCP
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

# Module logger; uses the logger named "miroflow".
logger = logging.getLogger("miroflow")

# Initialize FastMCP server
mcp = FastMCP("reading-mcp-server")


@mcp.tool()
async def convert_to_markdown(uri: str) -> str:
    """Convert various types of resources (doc, ppt, pdf, excel, csv, zip file etc.)
    described by an file: or data: URI to markdown.

    Args:
        uri: Required. The URI of the resource to convert. Need to start with 'file:' or 'data:' schemes.

    Returns:
        str: The converted markdown content, or an error message if conversion fails.
    """
    # Reject missing or whitespace-only input up front.
    if not (uri and uri.strip()):
        return "Error: URI parameter is required and cannot be empty."

    # Validate URI scheme (case-insensitive prefix match).
    valid_schemes = ["http:", "https:", "file:", "data:"]
    if not uri.lower().startswith(tuple(valid_schemes)):
        return f"Error: Invalid URI scheme. Supported schemes are: {', '.join(valid_schemes)}"

    # The conversion is delegated to a `markitdown_mcp` server, started as a
    # child process of the current interpreter and spoken to over stdio.
    server_params = StdioServerParameters(
        command=sys.executable,
        args=["-m", "markitdown_mcp"],
    )
    tool_name = "convert_to_markdown"
    arguments = {"uri": uri}

    result_content = ""
    try:
        async with stdio_client(server_params) as (read, write):
            async with ClientSession(read, write, sampling_callback=None) as session:
                await session.initialize()
                try:
                    tool_result = await session.call_tool(
                        tool_name, arguments=arguments
                    )
                except Exception as tool_error:
                    logger.info(f"Tool execution error: {tool_error}")
                    return f"Error: Tool execution failed: {str(tool_error)}"
                # Use the text of the last content item; empty when the
                # result carries no content.
                contents = tool_result.content
                if contents:
                    result_content = contents[-1].text
                else:
                    result_content = ""
    except Exception as session_error:
        # Failures outside the tool call itself: starting the server, the
        # session handshake, reading the result, or tearing either down.
        logger.info(f"Session error: {session_error}")
        return (
            f"Error: Failed to connect to markitdown-mcp server: {str(session_error)}"
        )

    return result_content


if __name__ == "__main__":
    # Command-line interface: pick the transport and, for HTTP, where to listen.
    cli = argparse.ArgumentParser(description="Reading MCP Server")
    cli.add_argument(
        "--transport",
        default="stdio",
        choices=["stdio", "http"],
        help="Transport method: 'stdio' or 'http' (default: stdio)",
    )
    cli.add_argument(
        "--port",
        default=8080,
        type=int,
        help="Port to use when running with HTTP transport (default: 8080)",
    )
    cli.add_argument(
        "--path",
        default="/mcp",
        type=str,
        help="URL path to use when running with HTTP transport (default: /mcp)",
    )
    options = cli.parse_args()

    if options.transport != "stdio":
        # HTTP was requested: serve streamable HTTP on the chosen port and path.
        mcp.run(transport="streamable-http", port=options.port, path=options.path)
    else:
        mcp.run(transport="stdio")


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/reasoning_mcp_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import logging
import os

from anthropic import Anthropic
from fastmcp import FastMCP

# Module logger; obtained under the "miroflow" logger name.
logger = logging.getLogger("miroflow")

# Anthropic credentials and endpoint, read from the environment once at import
# time. The key defaults to an empty string and the base URL to
# https://api.anthropic.com when the variables are unset.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
ANTHROPIC_BASE_URL = os.environ.get("ANTHROPIC_BASE_URL", "https://api.anthropic.com")

# Initialize FastMCP server
mcp = FastMCP("reasoning-mcp-server")


@mcp.tool()
async def reasoning(question: str) -> str:
    """You can use this tool to solve hard math problem, puzzle, riddle and IQ test question that requires a lot of chain of thought efforts.
    DO NOT use this tool for simple and obvious question.

    Args:
        question: The hard question.

    Returns:
        The answer to the question.
    """
    # The question is sent as a single user turn with one text part.
    messages_for_llm = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": question,
                }
            ],
        }
    ]

    client = Anthropic(api_key=ANTHROPIC_API_KEY, base_url=ANTHROPIC_BASE_URL)
    # NOTE(review): this is the synchronous client, so the call blocks inside
    # an async function -- confirm that is acceptable for this server.
    response = client.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=21000,
        thinking={
            "type": "enabled",
            "budget_tokens": 19000,
        },
        messages=messages_for_llm,
        stream=False,
    )

    # Scan the content blocks from the end instead of blindly indexing the
    # last one: the previous `content[-1].text` / `content[-1].thinking` pair
    # raised when the list was empty or when the last block carried neither
    # attribute.
    blocks = response.content or []

    # Preferred result: the last block that exposes a `text` attribute.
    for block in reversed(blocks):
        text = getattr(block, "text", None)
        if isinstance(text, str):
            return text

    # No text block at all: fall back to the model's thinking output.
    logger.info("Reasoning Error: only thinking content is returned")
    for block in reversed(blocks):
        thinking = getattr(block, "thinking", None)
        if isinstance(thinking, str):
            return thinking

    return "[ERROR]: Reasoning model returned neither text nor thinking content."


if __name__ == "__main__":
    # Executed as a script: serve this module's MCP tools over stdio.
    mcp.run(transport="stdio")


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/reasoning_mcp_server_os.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import logging
import os
import random
import time

import requests
from fastmcp import FastMCP

# Module logger; obtained under the "miroflow" logger name.
logger = logging.getLogger("miroflow")

# Endpoint configuration, read from the environment once at import time.
# No defaults are supplied, so each value is None when its variable is unset.
REASONING_API_KEY = os.environ.get("REASONING_API_KEY")
REASONING_BASE_URL = os.environ.get("REASONING_BASE_URL")
REASONING_MODEL_NAME = os.environ.get("REASONING_MODEL_NAME")

# Initialize FastMCP server
mcp = FastMCP("reasoning-mcp-server-os")

# Retry configuration
# MAX_RETRIES is the total number of attempts made by post_with_retry.
MAX_RETRIES = 10
BACKOFF_BASE = 1.0  # initial backoff in seconds
BACKOFF_MAX = 30.0  # maximum backoff in seconds


def post_with_retry(url, json, headers):
    """POST `json` to `url`, retrying with jittered exponential backoff.

    Up to MAX_RETRIES attempts are made. An attempt succeeds only when the
    server answers with HTTP 200; any other status code and any
    `requests.exceptions.RequestException` count as a failure and are logged.

    Returns:
        The successful response object, or None once every attempt has failed.
    """
    attempt = 0
    while attempt < MAX_RETRIES:
        attempt += 1
        try:
            reply = requests.post(url, json=json, headers=headers, timeout=600)
            if reply.status_code == 200:
                return reply
            logger.warning(
                f"HTTP {reply.status_code} on attempt {attempt}: {reply.text[:200]}"
            )
        except requests.exceptions.RequestException as exc:
            logger.warning(f"Request failed on attempt {attempt}: {exc}")

        # No point in waiting after the final attempt.
        if attempt >= MAX_RETRIES:
            continue

        # Exponential backoff capped at BACKOFF_MAX ...
        delay = min(BACKOFF_BASE * (2 ** (attempt - 1)), BACKOFF_MAX)
        # ... scaled by a random factor in [0.8, 1.2) to avoid thundering herd.
        delay *= 0.8 + 0.4 * random.random()
        logger.info(f"Retrying in {delay:.1f}s...")
        time.sleep(delay)

    logger.warning(f"All {MAX_RETRIES} retries failed for {url}")
    return None


@mcp.tool()
async def reasoning(question: str) -> str:
    """You can use this tool to solve hard math problem, puzzle, riddle and IQ test question that requires a lot of chain of thought efforts.
    DO NOT use this tool for simple and obvious question.

    Args:
        question: The hard question.

    Returns:
        The answer to the question.
    """
    payload = {
        "model": REASONING_MODEL_NAME,
        "messages": [{"role": "user", "content": question}],
        "temperature": 0.6,
        "top_p": 0.95,
    }
    headers = {
        "Authorization": f"Bearer {REASONING_API_KEY}",
        "Content-Type": "application/json",
    }

    response = post_with_retry(REASONING_BASE_URL, json=payload, headers=headers)
    if response is None:
        return "Reasoning service unavailable. Please try again later."

    # A 200 reply is not guaranteed to carry valid JSON; report that as a tool
    # error instead of letting the decode exception escape.
    try:
        json_response = response.json()
    except ValueError as parse_error:
        logger.warning(f"Reasoning Error: response is not valid JSON: {parse_error}")
        return "[ERROR]: Reasoning service returned a response that is not valid JSON."

    # Locate the first choice's message; any structural surprise is reported
    # rather than raised.
    try:
        message = json_response["choices"][0]["message"]
    except (KeyError, IndexError, TypeError):
        message = None
    if not isinstance(message, dict):
        logger.warning("Reasoning Error: unexpected response format")
        return "[ERROR]: Reasoning service returned an unexpected response format."

    content = message.get("content")
    if isinstance(content, str):
        # Keep only what follows the first closing think tag, if there is one.
        if "</think>" in content:
            content = content.split("</think>", 1)[1].strip()
        return content

    # No usable `content`: fall back to the separate reasoning field, which may
    # be absent as well.
    logger.info("Reasoning Error: only thinking content is returned")
    reasoning_content = message.get("reasoning_content")
    if isinstance(reasoning_content, str):
        return reasoning_content
    return "[ERROR]: Reasoning service returned neither content nor reasoning content."


if __name__ == "__main__":
    # Executed as a script: serve this module's MCP tools over stdio.
    mcp.run(transport="stdio")


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/searching_google_mcp_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import asyncio
import calendar
import datetime
import json
import os
import sys

import requests
import wikipedia
from fastmcp import FastMCP
from mcp import ClientSession, StdioServerParameters  # (already imported in config.py)
from mcp.client.stdio import stdio_client

from .utils import strip_markdown_links

# Serper and Jina credentials/endpoints, read from the environment at import
# time. Keys default to an empty string; base URLs default to the values below.
SERPER_API_KEY = os.getenv("SERPER_API_KEY", "")
SERPER_BASE_URL = os.getenv("SERPER_BASE_URL", "https://google.serper.dev")
JINA_API_KEY = os.getenv("JINA_API_KEY", "")
JINA_BASE_URL = os.getenv("JINA_BASE_URL", "https://r.jina.ai")


def _env_flag(name: str) -> bool:
    """Return True when env var `name` is "true", "1" or "yes" (any letter case)."""
    return os.getenv(name, "").lower() in ("true", "1", "yes")


# Google search result filtering environment variables
REMOVE_SNIPPETS = _env_flag("REMOVE_SNIPPETS")
REMOVE_KNOWLEDGE_GRAPH = _env_flag("REMOVE_KNOWLEDGE_GRAPH")
REMOVE_ANSWER_BOX = _env_flag("REMOVE_ANSWER_BOX")

# Initialize FastMCP server
mcp = FastMCP("searching-google-mcp-server")


def filter_google_search_result(result_content: str) -> str:
    """Strip the sections disabled through the REMOVE_* module flags.

    The input is parsed as JSON, the "knowledgeGraph" and "answerBox" keys are
    dropped when their flags are set, and "snippet" fields are removed from
    every "organic" and "peopleAlsoAsk" entry when REMOVE_SNIPPETS is set.

    Args:
        result_content: The JSON string result from google search

    Returns:
        The re-serialised, filtered JSON string. If anything goes wrong
        (including unparseable input) the original string is returned as is.
    """
    try:
        payload = json.loads(result_content)

        # Whole top-level sections that may be switched off.
        for disabled, section in (
            (REMOVE_KNOWLEDGE_GRAPH, "knowledgeGraph"),
            (REMOVE_ANSWER_BOX, "answerBox"),
        ):
            if disabled and section in payload:
                del payload[section]

        # Per-entry snippets inside the list-valued sections.
        if REMOVE_SNIPPETS:
            for section in ("organic", "peopleAlsoAsk"):
                if section not in payload:
                    continue
                for entry in payload[section]:
                    if "snippet" in entry:
                        del entry["snippet"]

        return json.dumps(payload, ensure_ascii=False)

    except Exception:
        # Filtering is best-effort: fall back to the untouched input.
        return result_content


@mcp.tool()
async def google_search(
    q: str,
    gl: str = "us",
    hl: str = "en",
    location: str = None,
    num: int = 10,
    tbs: str = None,
    page: int = 1,
) -> str:
    """Perform google searches via Serper API and retrieve rich results.
    It is able to retrieve organic search results, people also ask, related searches, and knowledge graph.

    Args:
        q: Search query string.
        gl: Country context for search (e.g., 'us' for United States, 'cn' for China, 'uk' for United Kingdom). Influences regional results priority. Default is 'us'.
        hl: Google interface language (e.g., 'en' for English, 'zh' for Chinese, 'es' for Spanish). Affects snippet language preference. Default is 'en'.
        location: City-level location for search results (e.g., 'SoHo, New York, United States', 'California, United States').
        num: The number of results to return (default: 10).
        tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week, 'qdr:m' for past month, 'qdr:y' for past year).
        page: The page number of results to return (default: 1).

    Returns:
        The search results.
    """
    if SERPER_API_KEY == "":
        return (
            "[ERROR]: SERPER_API_KEY is not set, google_search tool is not available."
        )

    tool_name = "google_search"
    arguments = {
        "q": q,
        "gl": gl,
        "hl": hl,
        "num": num,
        "page": page,
        "autocorrect": False,
    }
    # Optional filters are forwarded only when the caller supplied them.
    if location:
        arguments["location"] = location
    if tbs:
        arguments["tbs"] = tbs

    # Each attempt starts the Serper MCP server as a child process using the
    # current interpreter; only the two Serper settings are passed as its env.
    server_params = StdioServerParameters(
        command=sys.executable,
        args=["-m", "miroflow_tools.mcp_servers.serper_mcp_server"],
        env={"SERPER_API_KEY": SERPER_API_KEY, "SERPER_BASE_URL": SERPER_BASE_URL},
    )

    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            async with stdio_client(server_params) as (read, write):
                async with ClientSession(
                    read, write, sampling_callback=None
                ) as session:
                    await session.initialize()
                    tool_result = await session.call_tool(
                        tool_name, arguments=arguments
                    )
                    result_content = (
                        tool_result.content[-1].text if tool_result.content else ""
                    )
                    # An empty answer counts as a failed attempt. This is an
                    # explicit raise rather than `assert`, because assertions
                    # are removed under `python -O` and the empty result would
                    # then be returned as if it were a success.
                    if result_content is None or result_content.strip() == "":
                        raise ValueError(
                            "Empty result from google_search tool, please try again."
                        )
                    # Apply filtering based on environment variables
                    return filter_google_search_result(result_content)
        except Exception as error:
            if attempt >= max_retries:
                return f"[ERROR]: google_search tool execution failed after {max_retries} attempts: {str(error)}"
            # Wait before retrying (2s, then 4s; capped at 60s).
            await asyncio.sleep(min(2**attempt, 60))

    # Not reachable with max_retries >= 1; kept as a defensive default.
    return "[ERROR]: Unknown error occurred in google_search tool, please try again."


# @mcp.tool()
async def wiki_get_page_content(entity: str, first_sentences: int = 10) -> str:
    """Get specific Wikipedia page content for the specific entity (people, places, concepts, events) and return structured information.

    This tool looks up the Wikipedia page whose title is exactly `entity` (no
    auto-suggestion) and returns either the first few sentences (which
    typically contain the summary/introduction) or the full page content.
    Disambiguation and missing pages are reported together with alternative
    titles where a search yields any.

    Args:
        entity: The entity to search for in Wikipedia.
        first_sentences: Number of first sentences to return from the page. Set to 0 to return full content. Defaults to 10.

    Returns:
        str: Formatted search results containing title, first sentences/full content, and URL.
             Returns error message if page not found or other issues occur.
    """
    try:
        # Try to get the Wikipedia page directly
        page = wikipedia.page(title=entity, auto_suggest=False)

        # Prepare the result
        result_parts = [f"Page Title: {page.title}"]

        if first_sentences > 0:
            # Get summary with specified number of sentences
            try:
                summary = wikipedia.summary(
                    entity, sentences=first_sentences, auto_suggest=False
                )
                result_parts.append(
                    f"First {first_sentences} sentences (introduction): {summary}"
                )
            except Exception:
                # Fallback: approximate sentences by splitting the page text
                # on ". " when the summary call fails.
                content_sentences = page.content.split(". ")[:first_sentences]
                summary = (
                    ". ".join(content_sentences) + "."
                    if content_sentences
                    else page.content[:5000] + "..."
                )
                result_parts.append(
                    f"First {first_sentences} sentences (introduction): {summary}"
                )
        else:
            # Return full content if first_sentences is 0
            # TODO: Context Engineering Needed
            result_parts.append(f"Content: {page.content}")

        result_parts.append(f"URL: {page.url}")

        return "\n\n".join(result_parts)

    except wikipedia.exceptions.DisambiguationError as e:
        options_list = "\n".join(
            [f"- {option}" for option in e.options[:10]]
        )  # Limit to first 10
        output = (
            f"Disambiguation Error: Multiple pages found for '{entity}'.\n\n"
            f"Available options:\n{options_list}\n\n"
            f"Please be more specific in your search query."
        )

        # Best effort: add search hits as extra suggestions. A failing search
        # must not hide the disambiguation message itself.
        try:
            search_results = wikipedia.search(entity, results=5)
        except Exception:
            search_results = None

        if search_results:
            # Separate the hint from the preceding sentence; it used to be
            # glued directly onto "...your search query.".
            output += f"\n\nTry to search {entity} in Wikipedia: {search_results}"

        return output

    except wikipedia.exceptions.PageError:
        # Try a search if direct page lookup fails
        try:
            search_results = wikipedia.search(entity, results=5)
            if search_results:
                suggestion_list = "\n".join(
                    [f"- {result}" for result in search_results[:5]]
                )
                return (
                    f"Page Not Found: No Wikipedia page found for '{entity}'.\n\n"
                    f"Similar pages found:\n{suggestion_list}\n\n"
                    f"Try searching for one of these suggestions instead."
                )
            else:
                return (
                    f"Page Not Found: No Wikipedia page found for '{entity}' "
                    f"and no similar pages were found. Please try a different search term."
                )
        except Exception as search_error:
            return (
                f"Page Not Found: No Wikipedia page found for '{entity}'. "
                f"Search for alternatives also failed: {str(search_error)}"
            )

    except wikipedia.exceptions.RedirectError:
        return f"Redirect Error: Failed to follow redirect for '{entity}'"

    except requests.exceptions.RequestException as e:
        return f"Network Error: Failed to connect to Wikipedia: {str(e)}"

    except wikipedia.exceptions.WikipediaException as e:
        return f"Wikipedia Error: An error occurred while searching Wikipedia: {str(e)}"

    except Exception as e:
        return f"Unexpected Error: An unexpected error occurred: {str(e)}"


# @mcp.tool()
async def search_wiki_revision(
    entity: str, year: int, month: int, max_revisions: int = 50
) -> str:
    """Search for an entity in Wikipedia and return the revision history for a specific month.

    Out-of-range dates are clamped rather than rejected (year to
    2000..current year, month to 1..12) and the adjustment is reported at the
    top of the result.

    Args:
        entity: The entity to search for in Wikipedia.
        year: The year of the revision (e.g. 2024).
        month: The month of the revision (1-12).
        max_revisions: Maximum number of revisions to return. Defaults to 50.
            Clamped to the range 1..500 before being sent to the API.

    Returns:
        str: Formatted revision history with timestamps, revision IDs, and URLs.
             Returns error message if page not found or other issues occur.
    """
    # Local import: this function is the only user of URL quoting here.
    import urllib.parse

    # Auto-adjust date values and track changes
    adjustments = []
    original_year, original_month = year, month
    current_year = datetime.datetime.now().year

    # Adjust year to valid range
    if year < 2000:
        year = 2000
        adjustments.append(
            f"Year adjusted from {original_year} to 2000 (minimum supported)"
        )
    elif year > current_year:
        year = current_year
        adjustments.append(
            f"Year adjusted from {original_year} to {current_year} (current year)"
        )

    # Adjust month to valid range
    if month < 1:
        month = 1
        adjustments.append(f"Month adjusted from {original_month} to 1")
    elif month > 12:
        month = 12
        adjustments.append(f"Month adjusted from {original_month} to 12")

    # Prepare adjustment message if any changes were made
    if adjustments:
        adjustment_msg = (
            "Date auto-adjusted: "
            + "; ".join(adjustments)
            + f". Using {year}-{month:02d} instead.\n\n"
        )
    else:
        adjustment_msg = ""

    base_url = "https://en.wikipedia.org/w/api.php"

    try:
        # Construct the time range: first second to last second of the month
        start_date = datetime.datetime(year, month, 1)
        last_day = calendar.monthrange(year, month)[1]
        end_date = datetime.datetime(year, month, last_day, 23, 59, 59)

        # Convert to ISO format (UTC time)
        start_iso = start_date.strftime("%Y-%m-%dT%H:%M:%SZ")
        end_iso = end_date.strftime("%Y-%m-%dT%H:%M:%SZ")

        # API parameters configuration
        params = {
            "action": "query",
            "format": "json",
            "titles": entity,
            "prop": "revisions",
            # Wikipedia API limit is 500; never ask for fewer than one.
            "rvlimit": max(1, min(max_revisions, 500)),
            "rvstart": start_iso,
            "rvend": end_iso,
            "rvdir": "newer",
            "rvprop": "timestamp|ids",
        }

        # A timeout is required for the Timeout handler below to ever fire;
        # without one an unresponsive server would block indefinitely.
        response = requests.get(base_url, params=params, timeout=60)
        response.raise_for_status()

        data = response.json()

        # Check for API errors
        if "error" in data:
            return f"[ERROR]: Wikipedia API Error: {data['error'].get('info', 'Unknown error')}"

        # Process the response
        pages = data.get("query", {}).get("pages", {})

        if not pages:
            return f"[ERROR]: No results found for entity '{entity}'"

        # Check if page exists (the API reports a missing page under id "-1")
        page_id = list(pages.keys())[0]
        if page_id == "-1":
            return f"[ERROR]: Page Not Found: No Wikipedia page found for '{entity}'"

        page_info = pages[page_id]
        page_title = page_info.get("title", entity)

        if "revisions" not in page_info or not page_info["revisions"]:
            return (
                adjustment_msg + f"Page Title: {page_title}\n\n"
                f"No revisions found for '{entity}' in {year}-{month:02d}.\n\n"
                f"The page may not have been edited during this time period."
            )

        # Format the results
        result_parts = [
            f"Page Title: {page_title}",
            f"Revision Period: {year}-{month:02d}",
            f"Total Revisions Found: {len(page_info['revisions'])}",
        ]

        # Percent-encode the title once so that spaces, '&', '#', etc. in the
        # entity cannot break the generated revision URLs.
        encoded_title = urllib.parse.quote(entity)

        # Add revision details
        revisions_details = []
        for i, rev in enumerate(page_info["revisions"], 1):
            revision_id = rev["revid"]
            timestamp = rev["timestamp"]

            # Format timestamp for better readability
            try:
                dt = datetime.datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
                formatted_time = dt.strftime("%Y-%m-%d %H:%M:%S UTC")
            except Exception:
                formatted_time = timestamp

            # Construct revision URL
            rev_url = f"https://en.wikipedia.org/w/index.php?title={encoded_title}&oldid={revision_id}"

            revisions_details.append(
                f"{i}. Revision ID: {revision_id}\n"
                f"   Timestamp: {formatted_time}\n"
                f"   URL: {rev_url}"
            )

        if revisions_details:
            result_parts.append("Revisions:\n" + "\n\n".join(revisions_details))

        return (
            adjustment_msg
            + "\n\n".join(result_parts)
            + "\n\nHint: You can use the `scrape_website` tool to get the webpage content of a URL."
        )

    except requests.exceptions.Timeout:
        return f"[ERROR]: Network Error: Request timed out while fetching revision history for '{entity}'"

    except requests.exceptions.RequestException as e:
        return f"[ERROR]: Network Error: Failed to connect to Wikipedia: {str(e)}"

    except ValueError as e:
        return f"[ERROR]: Date Error: Invalid date values - {str(e)}"

    except Exception as e:
        return f"[ERROR]: Unexpected Error: An unexpected error occurred: {str(e)}"


# @mcp.tool()
async def search_archived_webpage(url: str, year: int, month: int, day: int) -> str:
    """Search the Wayback Machine (archive.org) for archived versions of a webpage, optionally for a specific date.

    A dated lookup is attempted only when both `year` and `month` are
    positive; out-of-range date parts are clamped and the adjustment is
    reported. If the dated lookup finds nothing, the most recent snapshot is
    looked up instead.

    Args:
        url: The URL to search for in the Wayback Machine. "https://" is
            prepended when no http(s) scheme is present.
        year: The target year (e.g., 2023).
        month: The target month (1-12).
        day: The target day (1-31).

    Returns:
        str: Formatted archive information including archived URL, timestamp, and status.
             Returns error message if URL not found or other issues occur.
    """
    # Handle empty URL
    if not url:
        return f"[ERROR]: Invalid URL: '{url}'. URL cannot be empty."

    # Auto-add https:// if no protocol is specified
    protocol_hint = ""
    if not url.startswith(("http://", "https://")):
        original_url = url
        url = f"https://{url}"
        protocol_hint = f"[NOTE]: Automatically added 'https://' to URL '{original_url}' -> '{url}'\n\n"

    hint_message = ""
    if ".wikipedia.org" in url:
        hint_message = "Note: You are trying to search a Wikipedia page, you can also use the `search_wiki_revision` tool to get the revision content of a Wikipedia page.\n\n"

    # Check if specific date is requested
    date = ""
    adjustment_msg = ""
    if year > 0 and month > 0:
        # Auto-adjust date values and track changes
        adjustments = []
        original_year, original_month, original_day = year, month, day
        current_year = datetime.datetime.now().year

        # Adjust year to valid range
        if year < 1995:
            year = 1995
            adjustments.append(
                f"Year adjusted from {original_year} to 1995 (minimum supported)"
            )
        elif year > current_year:
            year = current_year
            adjustments.append(
                f"Year adjusted from {original_year} to {current_year} (current year)"
            )

        # Adjust month to valid range
        if month < 1:
            month = 1
            adjustments.append(f"Month adjusted from {original_month} to 1")
        elif month > 12:
            month = 12
            adjustments.append(f"Month adjusted from {original_month} to 12")

        # Adjust day to valid range for the given month/year
        max_day = calendar.monthrange(year, month)[1]
        if day < 1:
            day = 1
            adjustments.append(f"Day adjusted from {original_day} to 1")
        elif day > max_day:
            day = max_day
            adjustments.append(
                f"Day adjusted from {original_day} to {max_day} (max for {year}-{month:02d})"
            )

        # Update the date string with adjusted values
        date = f"{year:04d}{month:02d}{day:02d}"

        try:
            # Validate the final adjusted date
            datetime.datetime(year, month, day)
        except ValueError as e:
            return f"[ERROR]: Invalid date: {year}-{month:02d}-{day:02d}. {str(e)}"

        # Prepare adjustment message if any changes were made
        if adjustments:
            adjustment_msg = (
                "Date auto-adjusted: "
                + "; ".join(adjustments)
                + f". Using {date} instead.\n\n"
            )

    try:
        # Search with specific date if provided
        if date:
            closest = await _fetch_wayback_snapshot({"url": url, "timestamp": date})

            if closest is not None:
                archived_url = closest["url"]
                archived_timestamp = closest["timestamp"]

                if not closest.get("available", True):
                    # protocol_hint is included here like in every other
                    # result message (it used to be missing in this branch).
                    return (
                        protocol_hint
                        + hint_message
                        + adjustment_msg
                        + (
                            f"Archive Status: Snapshot exists but is not available\n\n"
                            f"Original URL: {url}\n"
                            f"Requested Date: {year:04d}-{month:02d}-{day:02d}\n"
                            f"Closest Snapshot: {archived_timestamp}\n\n"
                            f"Try a different date"
                        )
                    )

                formatted_time = _format_wayback_timestamp(archived_timestamp)

                return (
                    protocol_hint
                    + hint_message
                    + adjustment_msg
                    + (
                        f"Archive Found: Archived version located\n\n"
                        f"Original URL: {url}\n"
                        f"Requested Date: {year:04d}-{month:02d}-{day:02d}\n"
                        f"Archived URL: {archived_url}\n"
                        f"Archived Timestamp: {formatted_time}\n"
                    )
                    + "\n\nHint: You can also use the `scrape_website` tool to get the webpage content of a URL."
                )

        # Search without specific date (most recent). Also reached when the
        # dated lookup above produced no snapshot.
        closest = await _fetch_wayback_snapshot({"url": url})

        if closest is None:
            return (
                protocol_hint
                + hint_message
                + (
                    f"Archive Not Found: No archived versions available\n\n"
                    f"Original URL: {url}\n\n"
                    f"The URL '{url}' has not been archived by the Wayback Machine.\n"
                    f"You may want to:\n"
                    f"- Check if the URL is correct\n"
                    f"- Try a different URL and date\n"
                )
            )

        archived_url = closest["url"]
        archived_timestamp = closest["timestamp"]

        if not closest.get("available", True):
            return (
                protocol_hint
                + hint_message
                + (
                    f"Archive Status: Most recent snapshot exists but is not available\n\n"
                    f"Original URL: {url}\n"
                    f"Most Recent Snapshot: {archived_timestamp}\n\n"
                    f"The URL may have been archived but access is restricted"
                )
            )

        formatted_time = _format_wayback_timestamp(archived_timestamp)

        return (
            protocol_hint
            + hint_message
            + (
                f"Archive Found: Most recent archived version\n\n"
                f"Original URL: {url}\n"
                f"Archived URL: {archived_url}\n"
                f"Archived Timestamp: {formatted_time}\n"
            )
            + "\n\nHint: You can also use the `scrape_website` tool to get the webpage content of a URL."
        )

    except requests.exceptions.RequestException as e:
        return f"[ERROR]: Network Error: Failed to connect to Wayback Machine: {str(e)}"

    except ValueError as e:
        return f"[ERROR]: Data Error: Failed to parse response from Wayback Machine: {str(e)}"

    except Exception as e:
        return f"[ERROR]: Unexpected Error: An unexpected error occurred: {str(e)}"


async def _fetch_wayback_snapshot(query: dict, max_attempts: int = 5):
    """Ask the Wayback availability API for the closest snapshot, with retries.

    Args:
        query: Query parameters for the API ("url" and optionally "timestamp").
            They are passed via `params=` so the target URL is percent-encoded
            and its own '&', '?' or '#' cannot corrupt the request.
        max_attempts: How many requests to make before giving up.

    Returns:
        The "closest" snapshot dict, or None when no attempt reported one.

    Raises:
        requests.exceptions.RequestException: On network/HTTP errors.
    """
    for attempt in range(1, max_attempts + 1):
        response = requests.get(
            "https://archive.org/wayback/available", params=query, timeout=60
        )
        response.raise_for_status()
        data = response.json()

        snapshots = data.get("archived_snapshots") if isinstance(data, dict) else None
        closest = snapshots.get("closest") if isinstance(snapshots, dict) else None
        if closest is not None:
            return closest

        # Back off before the next attempt; nothing to wait for after the last.
        if attempt < max_attempts:
            await asyncio.sleep(min(2**attempt, 60))

    return None


def _format_wayback_timestamp(archived_timestamp: str) -> str:
    """Render a YYYYMMDDhhmmss timestamp readably; return it unchanged if unparseable."""
    try:
        dt = datetime.datetime.strptime(archived_timestamp, "%Y%m%d%H%M%S")
        return dt.strftime("%Y-%m-%d %H:%M:%S UTC")
    except Exception:
        return archived_timestamp


@mcp.tool()
async def scrape_website(url: str) -> str:
    """This tool is used to scrape a website for its content. Search engines are not supported by this tool. This tool can also be used to get YouTube video non-visual information (however, it may be incomplete), such as video subtitles, titles, descriptions, key moments, etc.

    Args:
        url: The URL of the website to scrape.
    Returns:
        The scraped website content.
    """
    # NOTE(review): the docstring above is deliberately left byte-identical; it is
    # attached to a registered tool and is presumably surfaced to the calling model.

    # Validate URL format
    if not url or not url.startswith(("http://", "https://")):
        return f"Invalid URL: '{url}'. URL must start with http:// or https://"

    # Avoid duplicate Jina URL prefix: the caller may already have wrapped the
    # target in the reader URL, in which case only the inner URL is kept.
    if url.startswith("https://r.jina.ai/") and url.count("http") >= 2:
        url = url[len("https://r.jina.ai/") :]

    # Check for restricted domains
    if "huggingface.co/datasets" in url or "huggingface.co/spaces" in url:
        return "You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose."

    if JINA_API_KEY == "":
        return "JINA_API_KEY is not set, scrape_website tool is not available."

    try:
        # Use Jina.ai reader API to convert URL to LLM-friendly text
        jina_url = f"{JINA_BASE_URL}/{url}"

        # Make request with proper headers
        headers = {"Authorization": f"Bearer {JINA_API_KEY}"}

        response = requests.get(jina_url, headers=headers, timeout=60)
        response.raise_for_status()

        # Get the content, flattened to text without link/image markup
        content = response.text.strip()
        content = strip_markdown_links(content)

        if not content:
            return f"No content retrieved from URL: {url}"

        return content

    except requests.exceptions.Timeout:
        return f"[ERROR]: Timeout Error: Request timed out while scraping '{url}'. The website may be slow or unresponsive."

    except requests.exceptions.ConnectionError:
        return f"[ERROR]: Connection Error: Failed to connect to '{url}'. Please check if the URL is correct and accessible."

    except requests.exceptions.HTTPError as e:
        # A requests.Response is falsy whenever its status is 4xx/5xx (its truth
        # value is `Response.ok`), so `if e.response` would always report
        # "unknown" here. Compare against None to read the real status code.
        status_code = e.response.status_code if e.response is not None else "unknown"
        if status_code == 404:
            return f"[ERROR]: Page Not Found (404): The page at '{url}' does not exist."
        elif status_code == 403:
            return f"[ERROR]: Access Forbidden (403): Access to '{url}' is forbidden."
        elif status_code == 500:
            return f"[ERROR]: Server Error (500): The server at '{url}' encountered an internal error."
        else:
            return f"[ERROR]: HTTP Error ({status_code}): Failed to scrape '{url}'. {str(e)}"

    except requests.exceptions.RequestException as e:
        return f"[ERROR]: Request Error: Failed to scrape '{url}'. {str(e)}"

    except Exception as e:
        return f"[ERROR]: Unexpected Error: An unexpected error occurred while scraping '{url}': {str(e)}"


# Script entry point: serve the registered tools over the stdio transport.
if __name__ == "__main__":
    mcp.run(transport="stdio")


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/searching_sogou_mcp_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import asyncio
import json
import os

import requests
from fastmcp import FastMCP
from tencentcloud.common import credential
from tencentcloud.common.common_client import CommonClient
from tencentcloud.common.exception.tencent_cloud_sdk_exception import (
    TencentCloudSDKException,
)
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile

from .utils import strip_markdown_links

# Credentials and endpoints are read from the environment once, at import time.
# An empty Tencent Cloud secret makes sogou_search return an error string, and an
# empty JINA_API_KEY makes scrape_website report that it is unavailable.
TENCENTCLOUD_SECRET_ID = os.environ.get("TENCENTCLOUD_SECRET_ID", "")
TENCENTCLOUD_SECRET_KEY = os.environ.get("TENCENTCLOUD_SECRET_KEY", "")
JINA_API_KEY = os.environ.get("JINA_API_KEY", "")
# Base URL of the Jina reader; the target URL is appended after a "/".
JINA_BASE_URL = os.environ.get("JINA_BASE_URL", "https://r.jina.ai")

# Initialize FastMCP server
mcp = FastMCP("searching-sogou-mcp-server")


@mcp.tool()
async def sogou_search(Query: str, Cnt: int = 10) -> str:
    """Performs web searches using the Tencent Cloud SearchPro API to retrieve comprehensive information, with Sogou search offering superior results for Chinese-language queries.

    Args:
        Query: The core search query string. Be specific to improve result relevance (e.g., "2024 World Cup final results"). (Required, no default value)
        Cnt: Number of search results to return (Can only be 10/20/30/40/50). Optional, default: 10)

    Returns:
        The search results in JSON format, including the following core fields:
        - Query: The original search query (consistent with the input Query, for request verification)
        - Pages: Array of JSON strings, each containing details of a single search result (e.g., title, url, passage, date, site, favicon)
    """
    # NOTE(review): the docstring above is deliberately left byte-identical; it is
    # attached to a registered tool and is presumably surfaced to the calling model.
    if TENCENTCLOUD_SECRET_ID == "" or TENCENTCLOUD_SECRET_KEY == "":
        return "[ERROR]: TENCENTCLOUD_SECRET_ID or TENCENTCLOUD_SECRET_KEY is not set, sogou_search tool is not available."

    # Fields copied from each result page; "content" and "favicon" are
    # intentionally left out of the tool output.
    page_fields = ("title", "url", "passage", "date", "site")

    retry_count = 0
    max_retries = 3

    while retry_count < max_retries:
        try:
            cred = credential.Credential(
                TENCENTCLOUD_SECRET_ID, TENCENTCLOUD_SECRET_KEY
            )
            httpProfile = HttpProfile()
            httpProfile.endpoint = "wsa.tencentcloudapi.com"
            clientProfile = ClientProfile()
            clientProfile.httpProfile = httpProfile

            # Build the request as a dict rather than interpolating Query into a
            # JSON string: a query containing quotes or backslashes would
            # otherwise produce invalid JSON (or inject extra fields) and raise
            # an error that the retry handler below does not catch.
            params = {"Query": Query, "Mode": 0, "Cnt": Cnt}
            common_client = CommonClient(
                "wsa", "2025-05-08", cred, "", profile=clientProfile
            )
            result = common_client.call_json("SearchPro", params)["Response"]
            # Drop the request id from the tool output; tolerate its absence.
            result.pop("RequestId", None)

            # Each entry of "Pages" is itself a JSON-encoded string.
            pages = []
            for page in result["Pages"]:
                page_json = json.loads(page)
                pages.append({field: page_json[field] for field in page_fields})
            result["Pages"] = pages
            return json.dumps(result, ensure_ascii=False)
        except TencentCloudSDKException:
            retry_count += 1
            if retry_count >= max_retries:
                return f"[ERROR]: sogou_search tool execution failed after {max_retries} attempts: Unexpected error occurred."
            # Wait before retrying (exponential backoff, capped at 60 seconds)
            await asyncio.sleep(min(2**retry_count, 60))

    return "[ERROR]: Unknown error occurred in sogou_search tool, please try again."


@mcp.tool()
async def scrape_website(url: str) -> str:
    """This tool is used to scrape a website for its content. Search engines are not supported by this tool. This tool can also be used to get YouTube video non-visual information (however, it may be incomplete), such as video subtitles, titles, descriptions, key moments, etc.

    Args:
        url: The URL of the website to scrape.
    Returns:
        The scraped website content.
    """
    # NOTE(review): the docstring above is deliberately left byte-identical; it is
    # attached to a registered tool and is presumably surfaced to the calling model.

    # Validate URL format
    if not url or not url.startswith(("http://", "https://")):
        return f"Invalid URL: '{url}'. URL must start with http:// or https://"

    # Avoid duplicate Jina URL prefix: the caller may already have wrapped the
    # target in the reader URL, in which case only the inner URL is kept.
    if url.startswith("https://r.jina.ai/") and url.count("http") >= 2:
        url = url[len("https://r.jina.ai/") :]

    # Check for restricted domains
    if "huggingface.co/datasets" in url or "huggingface.co/spaces" in url:
        return "You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose."

    if JINA_API_KEY == "":
        return "JINA_API_KEY is not set, scrape_website tool is not available."

    try:
        # Use Jina.ai reader API to convert URL to LLM-friendly text
        jina_url = f"{JINA_BASE_URL}/{url}"

        # Make request with proper headers
        headers = {"Authorization": f"Bearer {JINA_API_KEY}"}

        response = requests.get(jina_url, headers=headers, timeout=60)
        response.raise_for_status()

        # Get the content, flattened to text without link/image markup
        content = response.text.strip()
        content = strip_markdown_links(content)

        if not content:
            return f"No content retrieved from URL: {url}"

        return content

    except requests.exceptions.Timeout:
        return f"[ERROR]: Timeout Error: Request timed out while scraping '{url}'. The website may be slow or unresponsive."

    except requests.exceptions.ConnectionError:
        return f"[ERROR]: Connection Error: Failed to connect to '{url}'. Please check if the URL is correct and accessible."

    except requests.exceptions.HTTPError as e:
        # A requests.Response is falsy whenever its status is 4xx/5xx (its truth
        # value is `Response.ok`), so `if e.response` would always report
        # "unknown" here. Compare against None to read the real status code.
        status_code = e.response.status_code if e.response is not None else "unknown"
        if status_code == 404:
            return f"[ERROR]: Page Not Found (404): The page at '{url}' does not exist."
        elif status_code == 403:
            return f"[ERROR]: Access Forbidden (403): Access to '{url}' is forbidden."
        elif status_code == 500:
            return f"[ERROR]: Server Error (500): The server at '{url}' encountered an internal error."
        else:
            return f"[ERROR]: HTTP Error ({status_code}): Failed to scrape '{url}'. {str(e)}"

    except requests.exceptions.RequestException as e:
        return f"[ERROR]: Request Error: Failed to scrape '{url}'. {str(e)}"

    except Exception as e:
        return f"[ERROR]: Unexpected Error: An unexpected error occurred while scraping '{url}': {str(e)}"


# Script entry point: serve the registered tools over the stdio transport.
if __name__ == "__main__":
    mcp.run(transport="stdio")


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/serper_mcp_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

"""
adapted from
https://github.com/MiroMindAI/MiroRL/blob/5073693549ffe05a157a1886e87650ef3be6606e/mirorl/tools/serper_search.py#L1
"""

import json
import os
from typing import Any, Dict

import requests
from mcp.server.fastmcp import FastMCP
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

from .utils import decode_http_urls_in_dict

# Serper endpoint and API key, read from the environment at import time.
# google_search returns a JSON error payload when SERPER_API_KEY is empty.
SERPER_BASE_URL = os.getenv("SERPER_BASE_URL", "https://google.serper.dev")
SERPER_API_KEY = os.getenv("SERPER_API_KEY", "")

# Initialize FastMCP server
mcp = FastMCP("serper-mcp-server")


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type(
        (requests.ConnectionError, requests.Timeout, requests.HTTPError)
    ),
)
def make_serper_request(
    payload: Dict[str, Any], headers: Dict[str, str]
) -> requests.Response:
    """POST a search payload to the Serper ``/search`` endpoint.

    The call is attempted up to three times with exponential backoff when a
    connection error, a timeout, or an HTTP error status is raised.

    Args:
        payload: JSON-serialisable request body.
        headers: HTTP headers to send with the request.

    Returns:
        The successful ``requests.Response``.

    Raises:
        The underlying ``requests`` exception wrapped by tenacity once the
        retry budget is exhausted.
    """
    # Without an explicit timeout requests waits indefinitely, so a stalled
    # connection would hang the tool and the Timeout retry could never fire.
    response = requests.post(
        f"{SERPER_BASE_URL}/search", json=payload, headers=headers, timeout=30
    )
    response.raise_for_status()
    return response


def _is_huggingface_dataset_or_space_url(url):
    """
    Check if the URL is a HuggingFace dataset or space URL.
    :param url: The URL to check
    :return: True if it's a HuggingFace dataset or space URL, False otherwise
    """
    if not url:
        return False
    return "huggingface.co/datasets" in url or "huggingface.co/spaces" in url


@mcp.tool()
def google_search(
    q: str,
    gl: str = "us",
    hl: str = "en",
    location: str | None = None,
    num: int | None = None,
    tbs: str | None = None,
    page: int | None = None,
    autocorrect: bool | None = None,
):
    """
    Tool to perform web searches via Serper API and retrieve rich results.

    It is able to retrieve organic search results, people also ask,
    related searches, and knowledge graph.

    Args:
        q: Search query string
        gl: Optional region code for search results in ISO 3166-1 alpha-2 format (e.g., 'us')
        hl: Optional language code for search results in ISO 639-1 format (e.g., 'en')
        location: Optional location for search results (e.g., 'SoHo, New York, United States', 'California, United States')
        num: Number of results to return (default: 10)
        tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week,
            'qdr:m' for past month, 'qdr:y' for past year)
        page: Page number of results to return (default: 1)
        autocorrect: Whether to autocorrect spelling in query

    Returns:
        Dictionary containing search results and metadata.
    """

    def _failure(message: str) -> str:
        # Every failure path shares this JSON envelope.
        return json.dumps(
            {"success": False, "error": message, "results": []},
            ensure_ascii=False,
        )

    # Guard clauses: configuration first, then the mandatory query.
    if not SERPER_API_KEY:
        return _failure("SERPER_API_KEY environment variable not set")
    if not q or not q.strip():
        return _failure("Search query 'q' is required and cannot be empty")

    try:
        # Mandatory fields first, then only the optional ones that were supplied.
        payload: dict[str, Any] = {"q": q.strip(), "gl": gl, "hl": hl}
        if location:
            payload["location"] = location
        payload["num"] = 10 if num is None else num
        if tbs:
            payload["tbs"] = tbs
        if page is not None:
            payload["page"] = page
        if autocorrect is not None:
            payload["autocorrect"] = autocorrect

        request_headers = {
            "X-API-KEY": SERPER_API_KEY,
            "Content-Type": "application/json",
        }
        data = make_serper_request(payload, request_headers).json()

        # Drop organic hits that point at HuggingFace datasets or spaces.
        kept = []
        if "organic" in data:
            kept = [
                hit
                for hit in data["organic"]
                if not _is_huggingface_dataset_or_space_url(hit.get("link", ""))
            ]

        # Everything else from the API response is passed through untouched.
        response_data = dict(data)
        response_data["organic"] = kept
        return json.dumps(decode_http_urls_in_dict(response_data), ensure_ascii=False)

    except Exception as e:
        return _failure(f"Unexpected error: {str(e)}")


# Script entry point: start the MCP server with its default transport.
if __name__ == "__main__":
    mcp.run()


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/utils/__init__.py
================================================
from .url_unquote import decode_http_urls_in_dict, safe_unquote, strip_markdown_links

__all__ = [
    "safe_unquote",
    "decode_http_urls_in_dict",
    "strip_markdown_links",
]


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/utils/url_unquote.py
================================================
import re
from urllib.parse import unquote

from markdown_it import MarkdownIt

# RFC 3986 reserved characters percent-encoding (decoding these would alter URL semantics/structure)
# gen-delims: : / ? # [ ] @
# sub-delims: ! $ & ' ( ) * + , ; =
RESERVED_PERCENT_ENCODINGS = frozenset(
    {
        "%2f",
        "%2F",  # /  path separator
        "%3f",
        "%3F",  # ?  query string start
        "%23",  # #  fragment start
        "%26",  # &  query parameter separator
        "%3d",
        "%3D",  # =  key-value separator
        "%40",  # @
        "%3a",
        "%3A",  # :
        "%5b",
        "%5B",  # [
        "%5d",
        "%5D",  # ]
        "%21",  # !
        "%24",  # $
        "%27",  # '
        "%28",  # (
        "%29",  # )
        "%2a",
        "%2A",  # *
        "%2b",
        "%2B",  # +
        "%2c",
        "%2C",  # ,
        "%3b",
        "%3B",  # ;
        "%25",  # %  percent sign itself (prevents double-encoding issues)
        "%20",  # space (keep encoded to avoid URL semantic changes)
    }
)

# Characters accepted as the two hex digits of a %XX escape.
_HEX_DIGITS = frozenset("0123456789ABCDEFabcdef")


def _decode_escape_run(run: str) -> str:
    """Decode a run of consecutive ``%XX`` escapes as UTF-8 without losing data.

    Bytes that do not form valid UTF-8 are left in their original
    percent-encoded spelling instead of being replaced by U+FFFD.
    """
    pieces = []
    while run:
        try:
            pieces.append(unquote(run, errors="strict"))
            break
        except UnicodeDecodeError as exc:
            # exc.start/exc.end are byte offsets, and every byte of the run is
            # spelled as exactly three characters ("%XX").
            bad_from = exc.start * 3
            bad_to = max(exc.end, exc.start + 1) * 3
            if bad_from:
                pieces.append(unquote(run[:bad_from], errors="strict"))
            pieces.append(run[bad_from:bad_to])
            run = run[bad_to:]
    return "".join(pieces)


def safe_unquote(url: str) -> str:
    """
    Safely decode URL-encoded strings, only decoding escapes that won't alter URL semantics.

    Preserve the following encodings (because decoding would change URL structure/semantics):
    - %2F (/) - path separator, decoding would alter path hierarchy
    - %3F (?) - query string start marker
    - %23 (#) - fragment start marker (not sent to server)
    - %26 (&) - query parameter separator
    - %3D (=) - key-value separator
    - %25 (%) - percent sign itself (prevents double-encoding issues, e.g. %252F -> %2F -> /)
    - %20 ( ) - space (keep encoded to avoid URL semantic changes)
    - and other RFC 3986 reserved characters

    Every other well-formed %XX escape is decoded as UTF-8 (e.g. Chinese
    characters). Escapes whose bytes are not valid UTF-8 are kept encoded so the
    URL is never corrupted with replacement characters. Malformed escapes
    (missing or non-hex digits) are copied through unchanged.

    Args:
        url: The string to decode; falsy values are returned as-is.

    Returns:
        The partially decoded string.
    """
    if not url:
        return url

    length = len(url)

    def is_escape_at(pos: int) -> bool:
        # True when url[pos:pos + 3] is a complete, well-formed %XX escape.
        return (
            pos + 2 < length
            and url[pos] == "%"
            and url[pos + 1] in _HEX_DIGITS
            and url[pos + 2] in _HEX_DIGITS
        )

    result = []
    i = 0
    while i < length:
        if not is_escape_at(i):
            result.append(url[i])
            i += 1
            continue

        escape = url[i : i + 3]
        if escape in RESERVED_PERCENT_ENCODINGS:
            # Keep the encoding, don't decode
            result.append(escape)
            i += 3
            continue

        # Collect the following escapes too (a UTF-8 character may span several
        # bytes), stopping at the first reserved or malformed one.
        j = i + 3
        while is_escape_at(j) and url[j : j + 3] not in RESERVED_PERCENT_ENCODINGS:
            j += 3

        result.append(_decode_escape_run(url[i:j]))
        i = j

    return "".join(result)


def decode_http_urls_in_dict(data):
    """
    Recursively walk a JSON-like structure and decode percent-escapes in URL-bearing strings.

    - dict: every value is processed recursively (keys are left untouched)
    - list: every element is processed recursively
    - str: passed through safe_unquote when it contains both "%" and "http"
    - anything else (including tuples) is returned unchanged
    """
    if isinstance(data, dict):
        return {name: decode_http_urls_in_dict(entry) for name, entry in data.items()}
    if isinstance(data, list):
        return [decode_http_urls_in_dict(element) for element in data]
    if isinstance(data, str) and "%" in data and "http" in data:
        return safe_unquote(data)
    return data


# Shared CommonMark parser instance used by strip_markdown_links.
md = MarkdownIt("commonmark")


def strip_markdown_links(markdown: str) -> str:
    """Flatten Markdown to text, dropping link wrappers and images.

    Link text is kept while the link markup is removed, images are removed
    entirely, inline code keeps its backticks, and block boundaries are
    turned into newlines. Runs of three or more newlines are collapsed to two
    and the result is stripped of surrounding whitespace.
    """
    # Token types that emit nothing at all.
    dropped = ("link_open", "link_close", "image")
    # Token types that are rendered purely as line separators.
    separators = {
        "softbreak": "\n",
        "hardbreak": "\n",
        "paragraph_close": "\n\n",
        "heading_close": "\n\n",
        "blockquote_close": "\n\n",
        "list_item_close": "\n",
        "bullet_list_close": "\n",
        "ordered_list_close": "\n",
        "hr": "\n\n",
    }

    def flatten(stream):
        parts = []
        for token in stream:
            kind = token.type
            if kind in dropped:
                continue
            separator = separators.get(kind)
            if separator is not None:
                parts.append(separator)
            elif token.children:
                # Inline container: recurse into its child tokens.
                parts.append(flatten(token.children))
            elif kind == "code_inline":
                parts.append(f"`{token.content}`")
            else:
                parts.append(token.content or "")
        return "".join(parts)

    flattened = flatten(md.parse(markdown))

    # Cap consecutive newlines at two, then trim both ends.
    return re.sub(r"\n{3,}", "\n\n", flattened).strip()


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/vision_mcp_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import asyncio
import base64
import os

from fastmcp import FastMCP
from openai import OpenAI

# OpenAI credentials/endpoint, read from the environment at import time.
# OPENAI_API_KEY has no default and is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")

# Initialize FastMCP server
mcp = FastMCP("vision-mcp-server")

# Maximum file size for vision processing (20MB for images, 50MB for videos)
# Both limits are expressed in bytes and enforced by _validate_file_size.
MAX_IMAGE_SIZE = 20 * 1024 * 1024  # 20MB
MAX_VIDEO_SIZE = 50 * 1024 * 1024  # 50MB


def guess_mime_media_type_from_extension(file_path: str) -> tuple[str, str]:
    """
    Map a file's extension to its MIME type and media category.

    The lookup is case-insensitive. Extensions that are not recognised
    (including a missing extension) fall back to JPEG.

    Returns:
        Tuple of (mime_type, media_category) where media_category is 'image' or 'video'
    """
    by_extension = {
        # Image formats
        ".jpg": ("image/jpeg", "image"),
        ".jpeg": ("image/jpeg", "image"),
        ".png": ("image/png", "image"),
        ".gif": ("image/gif", "image"),
        ".webp": ("image/webp", "image"),
        ".bmp": ("image/bmp", "image"),
        ".tiff": ("image/tiff", "image"),
        ".tif": ("image/tiff", "image"),
        # Video formats
        ".mp4": ("video/mp4", "video"),
        ".mov": ("video/quicktime", "video"),
        ".avi": ("video/x-msvideo", "video"),
        ".mkv": ("video/x-matroska", "video"),
        ".webm": ("video/webm", "video"),
    }
    suffix = os.path.splitext(file_path)[1].lower()
    # Default to JPEG for unknown formats
    return by_extension.get(suffix, ("image/jpeg", "image"))


def _validate_file_size(file_path: str, media_category: str) -> tuple[bool, str]:
    """
    Validate file size based on media category.

    Returns:
        Tuple of (is_valid, error_message)
    """
    try:
        file_size = os.path.getsize(file_path)
        max_size = MAX_VIDEO_SIZE if media_category == "video" else MAX_IMAGE_SIZE
        max_size_mb = max_size / (1024 * 1024)

        if file_size > max_size:
            return (
                False,
                f"[ERROR]: File size ({file_size / (1024 * 1024):.2f}MB) exceeds maximum allowed size ({max_size_mb}MB) for {media_category}",
            )

        if file_size == 0:
            return False, "[ERROR]: File is empty"

        return True, ""
    except Exception as e:
        return False, f"[ERROR]: Failed to check file size: {e}"


@mcp.tool()
async def visual_question_answering(media_path_or_url: str, question: str) -> str:
    """Ask question about an image or a video and get the answer with GPT-4o vision model.

    Args:
        media_path_or_url: The path of the image/video file locally or its URL. Supports images (jpg, png, gif, webp, bmp, tiff) and videos (mp4, mov, avi, mkv, webm).
        question: The question to ask about the image or video.

    Returns:
        The answer to the media-related question.
    """
    # Overall flow: build the chat message (text + media), call the model, and
    # retry the whole attempt up to `max_retries` times on unexpected errors.
    max_retries = 3
    retry = 0

    # Create client once outside the retry loop
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

    # Initialize variables
    # `media_data` staying None is later used to tell whether a local file was
    # successfully read before a failure occurred.
    response = None
    media_data = None
    mime_type = None
    media_category = None

    while retry < max_retries:
        try:
            # Build message content
            content = [{"type": "text", "text": question}]

            if os.path.exists(media_path_or_url):  # Check if the file exists locally
                # Get media type and validate
                mime_type, media_category = guess_mime_media_type_from_extension(
                    media_path_or_url
                )

                # Validate file size
                # A failed validation is returned immediately and is not retried.
                is_valid, error_msg = _validate_file_size(
                    media_path_or_url, media_category
                )
                if not is_valid:
                    return error_msg

                # Read and encode file
                # Note: this runs inside the retry loop, so the file is re-read and
                # re-encoded on every attempt.
                with open(media_path_or_url, "rb") as media_file:
                    media_data = base64.b64encode(media_file.read()).decode("utf-8")

                # Add image_url content (works for both images and videos in OpenAI API)
                content.append(
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:{mime_type};base64,{media_data}"},
                    }
                )

            elif "home/user" in media_path_or_url:
                # A non-existent path containing "home/user" is treated as a sandbox
                # path that this tool cannot reach.
                return "[ERROR]: The visual_question_answering tool cannot access sandbox files, please use the local path provided by original instruction"

            else:  # Otherwise, assume it's a URL
                # Basic URL validation
                if not media_path_or_url.startswith(("http://", "https://")):
                    return "[ERROR]: Invalid URL format. URLs must start with http:// or https://"

                # Remote media is passed by URL; it is not downloaded here.
                content.append(
                    {"type": "image_url", "image_url": {"url": media_path_or_url}}
                )

            # Make API call
            # NOTE(review): `client` is the synchronous OpenAI client, so this call
            # appears to block the event loop while waiting - confirm this is acceptable.
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": content}],
                max_tokens=1024,
            )

            # If we reach here, the API call was successful
            break

        except FileNotFoundError:
            return f"[ERROR]: File not found: {media_path_or_url}"
        except PermissionError:
            return f"[ERROR]: Permission denied when reading file: {media_path_or_url}"
        except Exception as e:
            retry += 1
            if retry >= max_retries:
                # Classify the failure: if the file was already read (media_data set)
                # or the input is not a local file, the error is attributed to the
                # API call; otherwise to local file processing.
                error_type = (
                    "API call"
                    if media_data is not None or not os.path.exists(media_path_or_url)
                    else "file processing"
                )
                return f"[ERROR]: Visual question answering failed during {error_type}: {e}\nNote: Files from sandbox are not available. You should use local path given in the instruction.\nSupported image formats: jpg, png, gif, webp, bmp, tiff\nSupported video formats: mp4, mov, avi, mkv, webm\nURLs must be publicly accessible and start with http:// or https://"
            # Exponential backoff: 10 seconds after the first failure, 20 after the second.
            await asyncio.sleep(5 * (2**retry))

    # Extract and return response
    try:
        if response and response.choices and len(response.choices) > 0:
            return response.choices[0].message.content
        else:
            return "[ERROR]: Received empty response from API"
    except (AttributeError, IndexError) as e:
        return f"[ERROR]: Failed to parse API response: {e}"


# Script entry point: serve the registered tools over the stdio transport.
if __name__ == "__main__":
    mcp.run(transport="stdio")


================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/vision_mcp_server_os.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import base64
import os

import aiohttp
import requests
from fastmcp import FastMCP

# Vision backend settings, read from the environment at import time.
# None of them has a default, so each is None when its variable is unset.
# VISION_API_KEY is sent as the Bearer token of the request headers.
VISION_API_KEY = os.environ.get("VISION_API_KEY")
# NOTE(review): presumably the endpoint and model used for the vision request -
# confirm against the request code further down the module.
VISION_BASE_URL = os.environ.get("VISION_BASE_URL")
VISION_MODEL_NAME = os.environ.get("VISION_MODEL_NAME")

# Initialize FastMCP server
mcp = FastMCP("vision-mcp-server-os")


def guess_mime_media_type_from_extension(file_path: str) -> str:
    """Return the image MIME type implied by a file's extension.

    The extension is matched case-insensitively; anything other than
    JPEG, PNG or GIF falls back to "image/jpeg".
    """
    known_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
    }
    suffix = os.path.splitext(file_path)[1].lower()
    return known_types.get(suffix, "image/jpeg")  # Default to JPEG if unknown


@mcp.tool()
async def visual_question_answering(image_path_or_url: str, question: str) -> str:
    """Ask question about an image or a video and get the answer with a vision language model.

    Args:
        image_path_or_url: The path of the image file locally or its URL.
        question: The question to ask about the image.

    Returns:
        The answer to the image-related question.
    """
    # Implementation notes:
    # - An existing local path is read and inlined as a base64 data URL.
    # - An http(s) URL is downloaded here and inlined the same way, so the
    #   vision backend never has to fetch it itself.
    # - Any other string is forwarded unchanged as the image URL.
    # - Failures are reported as a returned string rather than raised.

    # Function-scope import: only needed to offload the blocking HTTP call.
    import asyncio

    headers = {
        "Authorization": f"Bearer {VISION_API_KEY}",
        "Content-Type": "application/json",
    }

    try:
        if os.path.exists(image_path_or_url):  # Check if the file exists locally
            with open(image_path_or_url, "rb") as image_file:
                image_data = base64.b64encode(image_file.read()).decode("utf-8")
            mime_type = guess_mime_media_type_from_extension(image_path_or_url)
            image_url = f"data:{mime_type};base64,{image_data}"
        elif image_path_or_url.startswith(("http://", "https://")):
            async with aiohttp.ClientSession() as session:
                async with session.get(image_path_or_url) as resp:
                    if resp.status != 200:
                        return f"Failed to fetch image from URL: {image_path_or_url}"
                    image_bytes = await resp.read()
                    mime_type = resp.headers.get(
                        "Content-Type", "image/png"
                    )  # fallback MIME type
            image_data = base64.b64encode(image_bytes).decode("utf-8")
            image_url = f"data:{mime_type};base64,{image_data}"
        else:
            image_url = image_path_or_url

        payload = {
            "model": VISION_MODEL_NAME,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": image_url}},
                        {"type": "text", "text": question},
                    ],
                }
            ],
        }

        # requests is synchronous; run it in the default executor so the
        # event loop is not blocked while the model generates its answer.
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: requests.post(VISION_BASE_URL, json=payload, headers=headers),
        )

    except Exception as e:
        return f"Error: {e}"

    try:
        return response.json()["choices"][0]["message"]["content"]
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # Error payloads usually lack "choices" (KeyError/TypeError) and some
        # failures are not JSON at all (ValueError). Return the raw body as
        # text so the declared `str` return type holds and the caller still
        # sees what the backend said.
        return (
            "Error: unexpected response from vision API "
            f"(HTTP {response.status_code}): {response.text}"
        )


# Script entry point: when this file is executed directly (not imported),
# serve the FastMCP instance defined above over the stdio transport.
if __name__ == "__main__":
    mcp.run(transport="stdio")