Repository: MiroMindAI/MiroThinker
Branch: main
Commit: 40a9faef2efd
Files: 169
Total size: 1.1 MB
Directory structure:
gitextract_qqy1lifh/
├── .github/
│ └── workflows/
│ └── run-ruff.yml
├── .gitignore
├── LICENSE
├── README.md
├── apps/
│ ├── collect-trace/
│ │ ├── README.md
│ │ ├── pyproject.toml
│ │ ├── scripts/
│ │ │ ├── collect_trace_claude37.sh
│ │ │ ├── collect_trace_gpt41.sh
│ │ │ ├── collect_trace_gpt5.sh
│ │ │ └── collect_trace_qwen3.sh
│ │ └── utils/
│ │ ├── converters/
│ │ │ ├── __init__.py
│ │ │ ├── convert_non_oai_to_chatml.py
│ │ │ ├── convert_oai_to_chatml.py
│ │ │ ├── convert_to_chatml_auto_batch.py
│ │ │ ├── example_usage.py
│ │ │ └── system_prompts.py
│ │ ├── merge_chatml_msgs_to_one_json.py
│ │ └── process_logs.py
│ ├── gradio-demo/
│ │ ├── README.md
│ │ ├── main.py
│ │ ├── prompt_patch.py
│ │ ├── pyproject.toml
│ │ └── utils.py
│ ├── lobehub-compatibility/
│ │ ├── MiroThinkerToolParser.py
│ │ ├── README.md
│ │ ├── chat_template.jinja
│ │ ├── requirements.txt
│ │ ├── test_tool_parser.py
│ │ └── unit_test.py
│ ├── miroflow-agent/
│ │ ├── README.md
│ │ ├── benchmarks/
│ │ │ ├── __init__.py
│ │ │ ├── check_progress/
│ │ │ │ ├── check_progress_aime2025.py
│ │ │ │ ├── check_progress_browsecomp.py
│ │ │ │ ├── check_progress_browsecomp_zh.py
│ │ │ │ ├── check_progress_deepsearchqa.py
│ │ │ │ ├── check_progress_frames.py
│ │ │ │ ├── check_progress_gaia-validation-text-103.py
│ │ │ │ ├── check_progress_gaia-validation.py
│ │ │ │ ├── check_progress_hle-text-2158.py
│ │ │ │ ├── check_progress_hle-text-500.py
│ │ │ │ ├── check_progress_hle.py
│ │ │ │ ├── check_progress_seal-0.py
│ │ │ │ ├── check_progress_webwalkerqa.py
│ │ │ │ ├── check_progress_xbench_deepsearch.py
│ │ │ │ └── common.py
│ │ │ ├── common_benchmark.py
│ │ │ ├── evaluators/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── calculate_average_score.py
│ │ │ │ ├── eval_utils.py
│ │ │ │ └── extract_futurex_results.py
│ │ │ └── subset_extraction/
│ │ │ ├── gaia-text-103-grader.py
│ │ │ └── gaia-to-text-103-mover.py
│ │ ├── conf/
│ │ │ ├── __init__.py
│ │ │ ├── agent/
│ │ │ │ ├── default.yaml
│ │ │ │ ├── demo.yaml
│ │ │ │ ├── mirothinker_1.7_keep5_max200.yaml
│ │ │ │ ├── mirothinker_1.7_keep5_max300.yaml
│ │ │ │ ├── mirothinker_v1.0.yaml
│ │ │ │ ├── mirothinker_v1.0_keep5.yaml
│ │ │ │ ├── mirothinker_v1.5.yaml
│ │ │ │ ├── mirothinker_v1.5_keep5_max200.yaml
│ │ │ │ ├── mirothinker_v1.5_keep5_max400.yaml
│ │ │ │ ├── multi_agent.yaml
│ │ │ │ ├── multi_agent_os.yaml
│ │ │ │ ├── single_agent.yaml
│ │ │ │ └── single_agent_keep5.yaml
│ │ │ ├── benchmark/
│ │ │ │ ├── aime2025.yaml
│ │ │ │ ├── browsecomp.yaml
│ │ │ │ ├── browsecomp_zh.yaml
│ │ │ │ ├── collect_trace.yaml
│ │ │ │ ├── debug.yaml
│ │ │ │ ├── deepsearchqa.yaml
│ │ │ │ ├── default.yaml
│ │ │ │ ├── frames.yaml
│ │ │ │ ├── futurex.yaml
│ │ │ │ ├── gaia-validation-text-103.yaml
│ │ │ │ ├── gaia-validation.yaml
│ │ │ │ ├── hle-text-2158.yaml
│ │ │ │ ├── hle-text-500.yaml
│ │ │ │ ├── hle.yaml
│ │ │ │ ├── seal-0.yaml
│ │ │ │ ├── webwalkerqa.yaml
│ │ │ │ └── xbench_deepsearch.yaml
│ │ │ ├── config.yaml
│ │ │ └── llm/
│ │ │ ├── claude-3-7.yaml
│ │ │ ├── default.yaml
│ │ │ ├── gpt-5.yaml
│ │ │ └── qwen-3.yaml
│ │ ├── main.py
│ │ ├── pyproject.toml
│ │ ├── scripts/
│ │ │ ├── run_evaluate_multiple_runs_aime2025.sh
│ │ │ ├── run_evaluate_multiple_runs_browsecomp.sh
│ │ │ ├── run_evaluate_multiple_runs_browsecomp_zh.sh
│ │ │ ├── run_evaluate_multiple_runs_debug.sh
│ │ │ ├── run_evaluate_multiple_runs_deepsearchqa.sh
│ │ │ ├── run_evaluate_multiple_runs_frames.sh
│ │ │ ├── run_evaluate_multiple_runs_futurex.sh
│ │ │ ├── run_evaluate_multiple_runs_gaia-validation-text-103.sh
│ │ │ ├── run_evaluate_multiple_runs_gaia-validation.sh
│ │ │ ├── run_evaluate_multiple_runs_hle-text-2158.sh
│ │ │ ├── run_evaluate_multiple_runs_hle-text-500.sh
│ │ │ ├── run_evaluate_multiple_runs_hle.sh
│ │ │ ├── run_evaluate_multiple_runs_seal-0.sh
│ │ │ ├── run_evaluate_multiple_runs_webwalkerqa.sh
│ │ │ └── run_evaluate_multiple_runs_xbench_deepsearch.sh
│ │ └── src/
│ │ ├── __init__.py
│ │ ├── config/
│ │ │ ├── __init__.py
│ │ │ └── settings.py
│ │ ├── core/
│ │ │ ├── __init__.py
│ │ │ ├── answer_generator.py
│ │ │ ├── orchestrator.py
│ │ │ ├── pipeline.py
│ │ │ ├── stream_handler.py
│ │ │ └── tool_executor.py
│ │ ├── io/
│ │ │ ├── __init__.py
│ │ │ ├── input_handler.py
│ │ │ └── output_formatter.py
│ │ ├── llm/
│ │ │ ├── __init__.py
│ │ │ ├── base_client.py
│ │ │ ├── factory.py
│ │ │ ├── providers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── anthropic_client.py
│ │ │ │ └── openai_client.py
│ │ │ └── util.py
│ │ ├── logging/
│ │ │ ├── __init__.py
│ │ │ ├── summary_time_cost.py
│ │ │ └── task_logger.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ ├── parsing_utils.py
│ │ ├── prompt_utils.py
│ │ └── wrapper_utils.py
│ └── visualize-trace/
│ ├── .python-version
│ ├── README.md
│ ├── app.py
│ ├── pyproject.toml
│ ├── requirements.txt
│ ├── run.py
│ ├── static/
│ │ ├── css/
│ │ │ └── style.css
│ │ └── js/
│ │ └── script.js
│ ├── templates/
│ │ └── index.html
│ └── trace_analyzer.py
├── assets/
│ ├── LOCAL-TOOL-DEPLOYMENT.md
│ ├── QA.md
│ └── qwen3_nonthinking.jinja
├── justfile
└── libs/
└── miroflow-tools/
├── README.md
├── pyproject.toml
└── src/
├── __init__.py
└── miroflow_tools/
├── __init__.py
├── dev_mcp_servers/
│ ├── jina_scrape_llm_summary.py
│ ├── search_and_scrape_webpage.py
│ ├── stateless_python_server.py
│ └── task_planner.py
├── manager.py
└── mcp_servers/
├── __init__.py
├── audio_mcp_server.py
├── audio_mcp_server_os.py
├── browser_session.py
├── python_mcp_server.py
├── reading_mcp_server.py
├── reasoning_mcp_server.py
├── reasoning_mcp_server_os.py
├── searching_google_mcp_server.py
├── searching_sogou_mcp_server.py
├── serper_mcp_server.py
├── utils/
│ ├── __init__.py
│ └── url_unquote.py
├── vision_mcp_server.py
└── vision_mcp_server_os.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/workflows/run-ruff.yml
================================================
name: lint
on:
pull_request:
branches: [ "main" ]
jobs:
lint:
if: github.repository_owner == 'MiroMindAI'
name: lint pull request
runs-on: ubuntu-latest
steps:
- name: checkout code
uses: actions/checkout@v4
- name: Install uv
uses: astral-sh/setup-uv@v5
- name: Check static error
run: |
uv tool run ruff@0.8.0 check --show-fixes --output-format=github
- name: Reformat code style
run: |
echo '## Reformat summary' >> $GITHUB_STEP_SUMMARY
if diff_output="$(uv tool run ruff@0.8.0 format --diff 2>&1)"; then
echo "$diff_output"
echo '✅ Format check passed.' >> "$GITHUB_STEP_SUMMARY"
else
echo "$diff_output"
echo '❌ Format issues detected.' >> "$GITHUB_STEP_SUMMARY"
{
echo '```diff'
echo "$diff_output"
echo '```'
} >> "$GITHUB_STEP_SUMMARY"
exit 1
fi
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
#poetry.toml
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
#pdm.lock
#pdm.toml
.pdm-python
.pdm-build/
# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
#pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/
# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# Cursor
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore
# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
# -- ADDED --
# Log files
logs/
# Data directory - exclude everything (no README exception is configured below)
data/
.idea/
.DS_Store
apps/collect-trace/scripts/*/*.sh
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
================================================
FILE: README.md
================================================
**MiroThinker**: A deep research agent optimized for research and prediction. It achieves an 88.2 on the challenging BrowseComp benchmark. See [Quick Start](#-quick-start).
## 📋 Table of Contents
- 📰 [News & Updates](#-news--updates)
- 📝 [Introduction](#-introduction)
- ✨ [Key Features](#-key-features)
- 📈 [Performance on Benchmarks](#-performance-on-benchmarks)
- 🚀 [Quick Start](#-quick-start)
- 📊 [Benchmark Evaluation](#-benchmark-evaluation)
- 🔬 [Trace Collection](#-trace-collection)
- ❓ [FAQ & Troubleshooting](#-faq--troubleshooting)
- 📄 [License](#-license)
- 🙏 [Acknowledgments](#-acknowledgments)
## 📰 News & Updates
- **[2026-03-11]** 🎉🎉🎉 Introducing [MiroThinker-1.7](https://huggingface.co/collections/miromind-ai/mirothinker-17), including [MiroThinker-1.7-mini](https://huggingface.co/miromind-ai/MiroThinker-1.7-mini) and [MiroThinker-1.7](https://huggingface.co/miromind-ai/MiroThinker-1.7). MiroThinker-1.7-mini achieves 72.3 on BrowseComp-ZH, setting a new SOTA among open-source models while using only 30B parameters. Our proprietary agent MiroThinker-H1 achieves leading performance on BrowseComp and BrowseComp-ZH among open-source and commercial models.
- **\[2026-01-23\]** 🎉 We have brought two important updates to [MiroThinker online](http://dr.miromind.ai): (a) Core Research Report Generation: Deep Research online reports now support generation, preview, and sharing. (b) Extended Document Upload Types: Now supports the upload of various file formats, such as `.pdf`, `.doc`, `.ppt`, `.xls`, `.jpg`. Welcome to try it out! MiroThinker will continue to be maintained and iteratively upgraded, with the goal of becoming the best Research Agent you'll ever use!
- **\[2026-01-05\]** 🎉🎉 We release [MiroThinker-v1.5](https://huggingface.co/collections/miromind-ai/mirothinker-v15), a series of open-source deep research agents optimized for financial prediction. [MiroThinker-v1.5-30B](https://huggingface.co/miromind-ai/MiroThinker-v1.5-30B) surpasses Kimi-K2-Thinking on BrowseComp-ZH at much lower cost, using only 1/30 of the parameters. [MiroThinker-v1.5-235B](https://huggingface.co/miromind-ai/MiroThinker-v1.5-235B) scores 39.2% on HLE-Text, 69.8% on BrowseComp, 71.5% on BrowseComp-ZH, and 80.8% on GAIA-Val-165, setting a new state-of-the-art among search agents.
📜 Click to expand older updates
- **\[2025-11-13\]** 🎉 [MiroThinker-v1.0](https://huggingface.co/collections/miromind-ai/mirothinker-v10) is now released! Introducing **interactive scaling** as a third dimension of performance improvement, MiroThinker v1.0 supports 256K context window and up to 600 tool calls per task. Available in 8B, 30B, and 72B parameter scales, achieving 37.7%, 47.1%, 55.6%, and 81.9% on HLE-Text, BrowseComp, BrowseComp-ZH, and GAIA-Text-103, respectively. See [Technical Report](https://arxiv.org/abs/2511.11793) for more details.
- **\[2025-09-11\]** MiroThinker-72B-Preview ranked 4th in this week's FutureX benchmark. See [FutureX](https://futurex-ai.github.io/).
- **\[2025-09-08\]** [MiroThinker-v0.2](https://huggingface.co/collections/miromind-ai/mirothinker-v02) is now released, achieving open-source SOTA performance across multiple benchmarks, including HLE (17.8%), HLE-Text-Only (19.1%), BrowseComp-EN (17.2%), BrowseComp-ZH (29.4%), XBench-DeepSearch (56.0%), and Frames (74.8%).
- **\[2025-09-07\]** We supported more benchmarks, including [BrowseComp-ZH](https://arxiv.org/abs/2504.19314), [XBench-DeepSearch](https://xbench.org/agi/aisearch), and [FutureX](https://futurex-ai.github.io/). We plan to add more benchmarks in the future.
- **\[2025-08-22\]** Introducing streamlined deployment options for MiroThinker with optimized resource usage and faster startup times. Experience the interactive demo: [🚀 Try Gradio Demo](apps/gradio-demo)
- **\[2025-08-08\]** [MiroThinker-v0.1](https://huggingface.co/collections/miromind-ai/mirothinker-v01-689301b6d0563321862d44a1) released.
## 📝 Introduction
### MiroThinker-1.7
Our new MiroThinker family represents a significant leap in building reliable agents for long-chain tasks. Engineered with an enhanced post-training pipeline, our MiroThinker-1.7 family achieves SOTA performance in deep research tasks among open-source models.
**Key Features**
- 🚀 MiroThinker-1.7 supports a 256K context window, long-horizon reasoning, and deep multi-step analysis.
- 🔧 Handles up to 300 tool interactions per task, now with more accurate stepwise reasoning and decision-making.
- 📦 Released in 30B and 235B parameter scales, accompanied by a comprehensive suite of tools and workflows to flexibly support diverse research settings and compute budgets.
- Our proprietary agent, MiroThinker-H1, provides promising evidence for long-chain verifiable reasoning — reasoning processes that are step-verifiable and globally verifiable, improving the performance of complex agentic workflows.
| Model Name | Parameters | Max Context | Max Tool Calls | HF Link |
|:---------------------:|:-----------------------------:|:-----------:|:--------------:|:------------------------------------------------------------------:|
| MiroThinker-1.7-mini | 30B | 256K | 300 | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-1.7-mini) |
| MiroThinker-1.7 | 235B | 256K | 300 | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-1.7) |
MiroThinker-1.7 demonstrates strong general-research performance across a broad range of benchmarks, achieving 74.0%, 75.3%, 82.7% and 42.9% on BrowseComp, BrowseComp-ZH, GAIA-Val-165 and HLE-Text, respectively. MiroThinker-1.7 achieves SOTA performance on BrowseComp-ZH.

### MiroThinker-v1.5
📦 Click to expand MiroThinker-v1.5 details
MiroThinker v1.5 is the world-leading open-source search agent that advances tool-augmented reasoning through **interactive scaling** — training the agent to handle deeper and more frequent agent-environment interactions as a third dimension of performance improvement, beyond model size and context length.

**Key Features**
- 🚀 MiroThinker v1.5 supports a 256K context window, long-horizon reasoning, and deep multi-step analysis.
- 🔧 Handles up to 400 tool calls per task — a substantial improvement over previous open-source research agents.
- 📦 Released in 30B and 235B parameter scales, accompanied by a comprehensive suite of tools and workflows to flexibly support diverse research settings and compute budgets.
| Agent Name | Base Agent | Max Context | Max Tool Calls | HF Link |
|:---------------------:|:-----------------------------:|:-----------:|:--------------:|:------------------------------------------------------------------:|
| MiroThinker-v1.5-30B | Qwen3-30B-A3B-Thinking-2507 | 256K | 400 | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-v1.5-30B) |
| MiroThinker-v1.5-235B | Qwen3-235B-A22B-Thinking-2507 | 256K | 400 | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-v1.5-235B) |
MiroThinker v1.5 demonstrates strong general-research performance across a broad range of benchmarks, achieving 39.2%, 69.8%, 71.5%, and 80.8% on HLE-Text, BrowseComp, BrowseComp-ZH, and GAIA-Val-165, respectively. These results surpass previous open-source agents and set a new world-leading BrowseComp score.

### MiroThinker-v1.0
📦 Click to expand MiroThinker-v1.0 details
Unlike previous agents that scale only model size or context length, MiroThinker v1.0 introduces **interactive scaling** at the agent level, systematically training the agent to handle deeper and more frequent agent–environment interactions as a third dimension of performance improvement. Interactive scaling leverages environment feedback and external information acquisition to correct errors and refine trajectories.

### ✨ Key Features
- 🚀 **256K Context Window**: Supports long-horizon reasoning and deep multi-step analysis
- 🔧 **600 Tool Calls**: Handles up to 600 tool calls per task — a substantial improvement over previous open-source research agents
- 📦 **Multiple Scales**: Released in 8B, 30B, and 72B parameter scales, accompanied by a comprehensive suite of tools and workflows to flexibly support diverse research settings and compute budgets
MiroThinker v1.0 demonstrates strong general-research performance across a broad range of benchmarks, achieving **37.7%**, **47.1%**, **55.6%**, and **81.9%** on HLE-Text, BrowseComp, BrowseComp-ZH, and GAIA-Text-103, respectively. These results surpass previous open-source agents and narrow the gap with commercial counterparts such as **GPT-5-high**.
### MiroThinker-v0.2
📦 Click to expand MiroThinker-v0.2 details
In this new version, we introduced three key improvements:
- 📚 **Richer training data** from both English and Chinese sources, yielding significant gains in benchmark performance and generalization
- 🎯 **Unified DPO training** with a single preference dataset across all agents
- 📏 **Extended context length** from 40k to 64k for more challenging multi-turn tool-use tasks
Compared to v0.1, MiroThinker v0.2 delivers consistent gains across benchmarks. For example, scores improved from **57.3 → 64.1** on **GAIA-Text-103** and from **17.0 → 29.4** on **BrowseComp-ZH**, reflecting substantial advancements in the model’s general research agent capabilities.
### MiroThinker-v0.1
📦 Click to expand MiroThinker-v0.1 details
Performance of Open-Source Agents on GAIA-Validation Benchmark.
We have released the **MiroThinker v0.1** series, including both SFT and DPO variants at parameter scales of **8B**, **14B**, and **32B**. Notably, MiroThinker v0.1 achieves **state-of-the-art performance** among open-source models on the [GAIA benchmark](https://huggingface.co/datasets/gaia-benchmark/GAIA), a rigorous evaluation suite for advanced agentic capabilities, demonstrating its strength in long-context, decision-intensive, and real-world task scenarios.
## ✨ Key Features
### 🤖 **MiroThinker-Optimized Framework**
- 🔓 **Fully Open-Source Agent Framework**: Complete transparency with open framework and open agents
- 🔗 **Tool Integration**: Seamless integration with external tools and APIs
- 📝 **Trace Collection**: Comprehensive logging and analysis of agent interactions with elapsed time and estimated completion time displayed in minutes. Ready for SFT and DPO
- 📊 **Benchmark Evaluation**: Extensive testing across multiple benchmark datasets
### 📊 **Comprehensive Benchmark Suite**
📋 Click to expand benchmark list
- **GAIA Validation**: A benchmark for General AI Assistants. ([paper](https://arxiv.org/abs/2311.12983))
- **GAIA-Text-103**: A subset of GAIA Validation for text-only tasks. ([paper](https://arxiv.org/abs/2505.22648))
- **HLE**: Humanity's Last Exam. ([paper](https://arxiv.org/abs/2501.14249))
- **HLE-Text-2158**: A subset of HLE for text-only tasks. ([paper](https://arxiv.org/abs/2501.14249))
- **HLE-Text-500**: A subset of HLE for text-only tasks, created by [WebThinker](https://arxiv.org/pdf/2504.21776). ([paper](https://arxiv.org/pdf/2504.21776))
- **BrowseComp-EN**: Web browsing and comprehension tasks. ([paper](https://arxiv.org/abs/2504.12516))
- **BrowseComp-ZH**: A Chinese version of BrowseComp. ([paper](https://arxiv.org/abs/2504.19314))
- **WebWalkerQA**: Web navigation and question answering. ([paper](https://arxiv.org/abs/2501.07572))
- **Frames**: Factuality, Retrieval, And reasoning MEasurement Set. ([paper](https://arxiv.org/abs/2409.12941))
- **XBench-DeepSearch**: A benchmark for deep research agents. ([website](https://xbench.org/agi/aisearch))
- **FutureX**: A live benchmark designed for predicting unknown future. ([website](https://futurex-ai.github.io/))
- **SEAL-0**: A benchmark for evaluating LLMs on conflicting-evidence web questions. ([paper](https://arxiv.org/abs/2506.01062))
- **AIME2025**: American Invitational Mathematics Examination 2025. ([website](https://artificialanalysis.ai/evaluations/aime-2025))
- **DeepSearchQA**: Google's Deep Search Question Answering benchmark. ([paper](https://arxiv.org/abs/2505.20827))
## 📈 Performance on Benchmarks
### MiroThinker-1.7
> To prevent potential information leakage (e.g., retrieving benchmark answers from HuggingFace), we blocked access to certain websites during evaluation.
### MiroThinker-v1.5
📦 Click to expand MiroThinker-v1.5 details
> To prevent potential information leakage (e.g., searching benchmark answers from HuggingFace), access to HuggingFace has been explicitly disabled in these tools.
> We further perform canary string testing on the tool outputs of all trajectories and disregard any trajectory found to be contaminated, treating it as an incorrect answer.
### MiroThinker-v1.0
📦 Click to expand MiroThinker-v1.0 details
### MiroThinker-v0.2
📦 Click to expand MiroThinker-v0.2 details
#### Comparison with SOTA Research Agents
1. Following the practices of WebThinker, WebAgents, and CognitiveKernel, we report the Best Pass@1, the highest score across three runs, which often reflects stronger performance, though it may exhibit some variability. To provide a more stable measure, we additionally report Pass@1 (Avg@8), which offers greater consistency at the cost of slightly lower scores.
1. For consistency with prior open-source works, we evaluate GAIA-Text-103 using the WebAgents LLM-as-a-Judge template, and report results on GAIA-Val-165 using the official GAIA scorer script.
1. By default, we use open-source tools wherever possible, except for the code tool [E2B](https://github.com/e2b-dev/E2B) and the Google search tool [Serper](https://serper.dev/). We use [Whisper](https://huggingface.co/openai/whisper-large-v3-turbo), [Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct), and [Qwen3-235B-A22B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507) in our implementation. The framework can be easily extended to other open-source tools of your choice.
1. Replacing these open-source tools with commercial alternatives can yield performance gains. Commercial tools were mainly used for multimodal capabilities and certain complex reasoning subtasks. The majority of tasks, including planning, browsing, refinement, navigation, and more, were handled by our agents.
#### More Benchmarks
1. MiroThinker’s performance was tested with this repository and open-source tools; other agents’ results are from their papers and official sites.
1. As [MiroVerse-v0.1](https://huggingface.co/datasets/miromind-ai/MiroVerse-v0.1) mainly contains English data, the agent’s Chinese capability is limited. We plan to add more Chinese data to improve performance in the next version.
## 🚀 Quick Start
For optimal usage, we recommend using MiroThinker with this tool-enabled agent framework and thinking mode enabled.
### Prerequisites
- 🐍 **Python 3.10+**
- 📦 **uv package manager** ([Installation guide](https://github.com/astral-sh/uv))
- 🔑 **Required API keys** (see configuration section below)
### Installation
```bash
# Clone the repository
git clone https://github.com/MiroMindAI/MiroThinker
cd MiroThinker
# Setup environment
cd apps/miroflow-agent
uv sync
# Configure API keys
cp .env.example .env
# Edit .env with your API keys (SERPER_API_KEY, JINA_API_KEY, E2B_API_KEY, etc.)
```
> **📝 Environment Variables**: See [Tool Configuration](#tool-configuration) section for required API keys.
### Tool Configuration
#### Minimal Configuration for MiroThinker-1.7.
| Server | Description | Tools Provided | Required Environment Variables |
|:-------|:------------|:---------------|:-------------------------------|
| **`tool-python`** | Execution environment and file management (E2B sandbox) | `create_sandbox`, `run_command`, `run_python_code`, `upload_file_from_local_to_sandbox`, `download_file_from_sandbox_to_local`, `download_file_from_internet_to_sandbox` | `E2B_API_KEY` |
| **`search_and_scrape_webpage`** | Google search via Serper API | `google_search` | `SERPER_API_KEY`, `SERPER_BASE_URL` |
| **`jina_scrape_llm_summary`** | Web scraping with LLM-based information extraction | `scrape_and_extract_info` | `JINA_API_KEY`, `JINA_BASE_URL`, `SUMMARY_LLM_BASE_URL`, `SUMMARY_LLM_MODEL_NAME`, `SUMMARY_LLM_API_KEY` |
**Minimal `.env` configuration example:**
```bash
# Required for MiroThinker-1.7 (minimal setup; the same keys apply to v1.5 and v1.0)
SERPER_API_KEY=your_serper_key
SERPER_BASE_URL="https://google.serper.dev"
JINA_API_KEY=your_jina_key
JINA_BASE_URL="https://r.jina.ai"
E2B_API_KEY=your_e2b_key
# Required for jina_scrape_llm_summary
# Note: Summary LLM can be a small model (e.g., Qwen3-14B or GPT-5-Nano)
# The choice has minimal impact on performance, use what's most convenient
SUMMARY_LLM_BASE_URL="https://your_summary_llm_base_url/v1/chat/completions"
SUMMARY_LLM_MODEL_NAME=your_llm_model_name # e.g., "Qwen/Qwen3-14B" or "gpt-5-nano"
SUMMARY_LLM_API_KEY=your_llm_api_key # Optional, depends on LLM provider
# Required for benchmark evaluation (LLM-as-a-Judge)
OPENAI_API_KEY=your_openai_key # Required for running benchmark evaluations
OPENAI_BASE_URL="https://api.openai.com/v1" # Optional, defaults to OpenAI's API
```
> **💡 Why this is minimal**: These 3 MCP servers cover the core capabilities needed for research tasks: web search, content extraction, and code execution. All other servers are optional enhancements.
>
> **🤖 Summary LLM**: The `SUMMARY_LLM` can be a small model like Qwen3-14B or GPT-5-Nano. The choice has minimal impact on overall performance, use whichever is most convenient for your setup.
>
> **📊 For Benchmark Evaluation**: If you plan to run benchmark evaluations, you also need `OPENAI_API_KEY` (and optionally `OPENAI_BASE_URL`) for LLM-as-a-Judge functionality used in evaluation scripts.
>
> **🖼️ For GAIA Multimodal Tasks**: GAIA-Val-165 includes tasks with image/audio/video files. Since MiroThinker is a text-only LLM, GPT-4o is used to pre-process these files into text descriptions. The same `OPENAI_API_KEY` is used for both this preprocessing and LLM-as-a-Judge.
>
> **📖 For more details**: See [MiroFlow Tools README](libs/miroflow-tools/README.md) for complete documentation of all available tools.
🔧 Click to expand additional available tools
The following optional tools are available but were not used in MiroThinker v1.0-1.7 evaluation:
| Server Name | Type | Description |
|:---------------------|:-------------|:--------------------------------------------|
| `tool-vqa` | Commercial | Vision processing using Claude |
| `tool-vqa-os` | Open-Source | Vision processing (open-source alternative) |
| `tool-transcribe` | Commercial | Audio transcription using OpenAI |
| `tool-transcribe-os` | Open-Source | Audio transcription using Whisper |
| `tool-reasoning` | Commercial | Reasoning engine using Claude |
| `tool-reasoning-os` | Open-Source | Reasoning engine (open-source alternative) |
| `tool-reading` | Open-Source | Document reading using MarkItDown |
| `tool-google-search` | Commercial | Web search using Google + scraping |
| `tool-sogou-search` | Commercial | Web search using Sogou (Chinese) |
> **📖 Local Deployment**: For instructions on deploying open-source tools (`tool-vqa-os`, `tool-transcribe-os`, `tool-reasoning-os`) locally, see [Local Tool Deployment Guide](assets/LOCAL-TOOL-DEPLOYMENT.md).
See the [MiroFlow Tools README](libs/miroflow-tools/README.md) for complete documentation of all available tools.
#### Pre-configured Agent Settings
The `apps/miroflow-agent/conf/agent/` directory contains several pre-configured agent settings. Each configuration uses different tools and requires corresponding environment variables in your `.env` file.
> **💡 Recommended**: For MiroThinker-1.7, use `mirothinker_1.7_keep5_max200` (with context management, recommended for most tasks) or `mirothinker_1.7_keep5_max300` (only used for BrowseComp and BrowseComp-ZH).
| Configuration | Description | Max Turns | Context Retention | Required Environment Variables | Recommended For |
|:---------------------------------------|:------------|:----------|:------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------|
| **`mirothinker_1.7_keep5_max200`** ⭐ | Single-agent with context management | 200 | Keep 5 most recent | `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL`, `E2B_API_KEY`, `SUMMARY_LLM_BASE_URL`, `SUMMARY_LLM_MODEL_NAME`, `SUMMARY_LLM_API_KEY` | **1.7 (recommended for most tasks)** |
| **`mirothinker_1.7_keep5_max300`** ⭐ | Single-agent with context management | 300 | Keep 5 most recent | Same as above | **1.7 (for BrowseComp & BrowseComp-ZH)** |
📦 Click to expand legacy configurations (v1.5/v1.0/v0.1/v0.2)
| Configuration | Description | Max Turns | Context Retention | Required Environment Variables | Recommended For |
|:-------------------------|:------------|:----------|:------------------|:-------------------------------|:----------------|
| **`mirothinker_v1.5_keep5_max200`** | Single-agent with context management | 200 | Keep 5 most recent | `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL`, `E2B_API_KEY`, `SUMMARY_LLM_BASE_URL`, `SUMMARY_LLM_MODEL_NAME`, `SUMMARY_LLM_API_KEY` | **v1.5 (recommended for most tasks)** |
| **`mirothinker_v1.5_keep5_max400`** | Single-agent with context management | 400 | Keep 5 most recent | Same as above | **v1.5 (for BrowseComp & BrowseComp-ZH)** |
| **`mirothinker_v1.5`** | Single-agent for MiroThinker v1.5 | 600 | Keep all results | Same as above | **v1.5** |
| **`mirothinker_v1.0_keep5`** | Single-agent with context management | 600 | Keep 5 most recent | Same as above | **v1.0** |
| **`mirothinker_v1.0`** | Single-agent for MiroThinker v1.0 | 600 | Keep all results | Same as above | **v1.0** |
| **`multi_agent`** | Multi-agent with commercial tools (v0.1/v0.2) | 50 | Keep all results | `E2B_API_KEY`, `ANTHROPIC_API_KEY`, `ANTHROPIC_BASE_URL`, `OPENAI_API_KEY`, `OPENAI_BASE_URL`, `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL` | v0.1/v0.2 |
| **`multi_agent_os`** | Multi-agent with open-source tools (v0.1/v0.2) | 50 | Keep all results | `E2B_API_KEY`, `VISION_API_KEY`, `VISION_BASE_URL`, `VISION_MODEL_NAME`, `WHISPER_API_KEY`, `WHISPER_BASE_URL`, `WHISPER_MODEL_NAME`, `REASONING_API_KEY`, `REASONING_BASE_URL`, `REASONING_MODEL_NAME`, `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL` | v0.1/v0.2 |
> **💡 Note**: All environment variables are listed in `apps/miroflow-agent/.env.example`. Copy it to `.env` and fill in the values for the tools you plan to use.
#### Creating Custom Tool Configurations
🔧 Click to expand custom tool configuration guide
You can create your own YAML configuration file to freely combine MCP servers. Here's how:
1. **Create a new YAML file** in `apps/miroflow-agent/conf/agent/`:
```yaml
# conf/agent/my_custom_config.yaml
defaults:
- default
- _self_
main_agent:
tools:
- tool-python # Execution environment
- search_and_scrape_webpage # Google search
- jina_scrape_llm_summary # Web scraping with LLM
- tool-vqa # Vision processing (optional)
- tool-transcribe # Audio processing (optional)
- tool-reasoning # Reasoning engine (optional)
- tool-reading # Document reading (optional)
max_turns: 300 # Maximum number of turns
sub_agents:
agent-browsing: # Optional sub-agent
tools:
- tool-google-search
- tool-vqa
- tool-reading
- tool-python
max_turns: 50
keep_tool_result: -1 # Context retention budget: -1 keeps all tool results, or specify K to keep only the K most recent tool responses
```
> **💡 Context Retention Strategy**: The `keep_tool_result` parameter implements a **recency-based context retention** strategy. In the standard ReAct paradigm, all tool outputs are retained in the message history, which can lead to inefficient context utilization. Empirically, we observe that the agent's subsequent actions depend primarily on recent observations rather than distant ones. This strategy retains only the most recent K tool responses (where K is the `keep_tool_result` value) while preserving the complete sequence of thoughts and actions.
>
> **Benefits:**
>
> - ✅ Preserves the reasoning and action trace
> - ✅ Focuses the agent's attention on the most contextually relevant observations
> - ✅ Frees additional context space for extended reasoning and deeper tool-use trajectories
> - ✅ Does not lead to performance degradation while allowing more context space for interactive scaling
>
> **Usage:** Set `keep_tool_result: -1` to keep all tool results, or specify a positive integer K (e.g., `keep_tool_result: 5`) to keep only the K most recent tool responses.
2. **Use your custom configuration** when running evaluations:
```bash
cd apps/miroflow-agent
uv run main.py llm=qwen-3 agent=my_custom_config llm.base_url=https://your_base_url/v1
```
3. **Configure environment variables** in `.env` based on the tools you use.
All available environment variables are listed in `apps/miroflow-agent/.env.example`. Copy it to `.env` and configure the variables according to your chosen configuration:
```bash
cd apps/miroflow-agent
cp .env.example .env
# Edit .env with your actual API keys
```
**For MiroThinker-1.7** (`mirothinker_1.7_keep5_max200.yaml` or `mirothinker_1.7_keep5_max300.yaml`), **v1.5** (`mirothinker_v1.5_keep5_max200.yaml`, `mirothinker_v1.5_keep5_max400.yaml`, or `mirothinker_v1.5.yaml`), and **v1.0** (`mirothinker_v1.0_keep5.yaml` or `mirothinker_v1.0.yaml`), see the [Minimal Configuration](#minimal-configuration-for-mirothinker-17) section above for the complete configuration example.
**For other configurations**, refer to the [Pre-configured Agent Settings](#pre-configured-agent-settings) table above to see which environment variables are required.
🔑 Click to expand optional API keys
```bash
# API for LLM-as-a-Judge (for benchmark testing, required for benchmark evaluation)
OPENAI_API_KEY=your_openai_key
OPENAI_BASE_URL="https://api.openai.com/v1" # Optional, defaults to OpenAI's API
# API for Open-Source Audio Transcription Tool (for benchmark testing, optional)
WHISPER_MODEL_NAME="openai/whisper-large-v3-turbo"
WHISPER_API_KEY=your_whisper_key
WHISPER_BASE_URL="https://your_whisper_base_url/v1"
# API for Open-Source VQA Tool (for benchmark testing, optional)
VISION_MODEL_NAME="Qwen/Qwen2.5-VL-72B-Instruct"
VISION_API_KEY=your_vision_key
VISION_BASE_URL="https://your_vision_base_url/v1/chat/completions"
# API for Open-Source Reasoning Tool (for benchmark testing, optional)
REASONING_MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507"
REASONING_API_KEY=your_reasoning_key
REASONING_BASE_URL="https://your_reasoning_base_url/v1/chat/completions"
# API for Claude Sonnet 3.7 as Commercial Tools (optional)
ANTHROPIC_API_KEY=your_anthropic_key
# API for Sogou Search (optional)
TENCENTCLOUD_SECRET_ID=your_tencent_cloud_secret_id
TENCENTCLOUD_SECRET_KEY=your_tencent_cloud_secret_key
# API for Summary LLM (can use small models like Qwen3-14B or GPT-5-Nano)
SUMMARY_LLM_BASE_URL="https://your_summary_llm_base_url/v1/chat/completions"
SUMMARY_LLM_MODEL_NAME=your_summary_llm_model_name # e.g., "Qwen/Qwen3-14B" or "gpt-5-nano"
SUMMARY_LLM_API_KEY=your_summary_llm_api_key
```
### Serve the MiroThinker Agent
#### Option 1 (Recommended): Serve with SGLang or vLLM
Use SGLang to serve MiroThinker models at port 61002:
```bash
NUM_GPUS=4
PORT=61002
# Downloading agent from HF
AGENT_PATH=miromind-ai/MiroThinker-1.7-mini
python3 -m sglang.launch_server \
--model-path $AGENT_PATH \
--tp $NUM_GPUS \
--dp 1 \
--host 0.0.0.0 \
--port $PORT \
--trust-remote-code
```
> **📍 Server URL**: This will start a server at `http://0.0.0.0:$PORT`. Use this as your server base URL (e.g., `http://0.0.0.0:61002/v1`).
#### Option 2: Quantized Light-Weight Options
We also provide comprehensive guidance for serving MiroThinker agents using CPU-optimized and GPU-accelerated quantization techniques, along with detailed analysis and guidelines for deployment with llama.cpp, Ollama, SGLang, and other inference frameworks.
> **📖 Complete Guide**: See [Deployment Documentation](apps/gradio-demo/) for detailed deployment instructions.
### Run Your First Task
After setting up the environment and starting your server, run `main.py` to test with a default question: *"What is the title of today's arxiv paper in computer science?"*
```bash
cd apps/miroflow-agent
# Using MiroThinker agents (requires your own server)
uv run python main.py llm=qwen-3 agent=mirothinker_1.7_keep5_max200 llm.base_url=http://localhost:61002/v1
# Or using Claude (requires ANTHROPIC_API_KEY in .env)
uv run python main.py llm=claude-3-7 agent=single_agent_keep5
# Or using GPT-5 (requires OPENAI_API_KEY in .env)
uv run python main.py llm=gpt-5 agent=single_agent_keep5
```
**To customize your question**, edit `main.py` line 32:
```python
task_description = "Your custom question here"
```
The agent will search the web, execute code if needed, and provide an answer with sources.
> **📖 More details**: See [apps/miroflow-agent/README.md](apps/miroflow-agent/README.md) for available configurations and troubleshooting.
## 📊 Benchmark Evaluation
> For researchers who want to reproduce our benchmark results or evaluate on standard benchmarks.
### Download Benchmark Data
```bash
cd MiroThinker # Back to project root
wget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/data_20251115_password_protected.zip
unzip data_20251115_password_protected.zip
# Password: pf4*
rm data_20251115_password_protected.zip
```
### Run Benchmark Evaluation
> **Note:** For MiroThinker-1.7, use `mirothinker_1.7_keep5_max200` (with context management, for most benchmarks) or `mirothinker_1.7_keep5_max300` (with context management, for BrowseComp and BrowseComp-ZH).
**Available Parameters:**
You can customize the evaluation by setting the following environment variables before running the script:
| Parameter | Default | Description |
|:----------|:--------|:------------|
| `LLM_MODEL` | `"MiroThinker-Agents"` | Agent name identifier |
| `BASE_URL` | `"https://your-api.com/v1"` | Base URL of your server |
| `NUM_RUNS` | Varies by benchmark | Number of evaluation runs (3 for most benchmarks, 8 for GAIA/XBench/FutureX/SEAL-0, 32 for AIME2025) |
| `LLM_PROVIDER` | `"qwen"` | LLM provider (e.g., `qwen`, `openai`, `anthropic`) |
| `AGENT_SET` | `"mirothinker_1.7_keep5_max200"` | Agent configuration (e.g., `mirothinker_1.7_keep5_max200`, `mirothinker_1.7_keep5_max300`) |
| `MAX_CONTEXT_LENGTH` | `262144` | Maximum context length (256K) |
| `MAX_CONCURRENT` | `10` | Maximum concurrent tasks |
| `PASS_AT_K` | `1` | Pass@K evaluation metric |
| `TEMPERATURE` | `1.0` | Sampling temperature |
| `API_KEY` | `"xxx"` | API key for the server |
**Example Usage:**
```bash
# Navigate to the miroflow-agent directory first
cd apps/miroflow-agent
# Basic usage with MiroThinker-1.7 (recommended)
NUM_RUNS=8 LLM_MODEL="MiroThinker-1.7-mini" BASE_URL="https://your-api.com/v1" bash scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh
# Or with v1.0
# NUM_RUNS=8 LLM_MODEL="MiroThinker-v1.0-30B" BASE_URL="https://your-api.com/v1" bash scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh
# Customize number of runs and agent configuration (MiroThinker-1.7 with context management)
LLM_MODEL="MiroThinker-1.7-mini" \
BASE_URL="https://your-api.com/v1" \
NUM_RUNS=8 \
AGENT_SET="mirothinker_1.7_keep5_max200" \
bash scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh
```
📋 Click to expand all benchmark commands
> **⚠️ Important for MiroThinker-1.7**: To reproduce our reported results, you must set the correct `AGENT_SET`:
>
> - **BrowseComp & BrowseComp-ZH**: Use `AGENT_SET="mirothinker_1.7_keep5_max300"`
> - **All other benchmarks**: Use `AGENT_SET="mirothinker_1.7_keep5_max200"`
```bash
# Navigate to the miroflow-agent directory first
cd apps/miroflow-agent
# HLE
NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_hle.sh
# HLE-Text-2158
NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_hle-text-2158.sh
# HLE-Text-500
NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_hle-text-500.sh
# GAIA-Text-103
NUM_RUNS=8 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh
# GAIA-Validation (GAIA-Val-165)
NUM_RUNS=8 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_gaia-validation.sh
# BrowseComp-EN (⚠️ use max300)
NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max300" bash scripts/run_evaluate_multiple_runs_browsecomp.sh
# BrowseComp-ZH (⚠️ use max300)
NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max300" bash scripts/run_evaluate_multiple_runs_browsecomp_zh.sh
# WebWalkerQA
NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_webwalkerqa.sh
# XBench-DeepSearch
NUM_RUNS=8 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_xbench_deepsearch.sh
# FRAMES
NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_frames.sh
# SEAL-0
NUM_RUNS=8 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_seal-0.sh
# FutureX
NUM_RUNS=8 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_futurex.sh
# AIME2025
NUM_RUNS=32 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_aime2025.sh
# DeepSearchQA
NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_deepsearchqa.sh
```
#### 3. **Monitor evaluation progress**
📊 Click to expand progress monitoring commands
```bash
# Navigate to the miroflow-agent directory first
cd apps/miroflow-agent
# For HLE
python benchmarks/check_progress/check_progress_hle.py /path/to/evaluation/logs
# For HLE-Text-2158
python benchmarks/check_progress/check_progress_hle-text-2158.py /path/to/evaluation/logs
# For HLE-Text-500
python benchmarks/check_progress/check_progress_hle-text-500.py /path/to/evaluation/logs
# For BrowseComp-EN
python benchmarks/check_progress/check_progress_browsecomp.py /path/to/evaluation/logs
# For BrowseComp-ZH
python benchmarks/check_progress/check_progress_browsecomp_zh.py /path/to/evaluation/logs
# For GAIA-Validation
python benchmarks/check_progress/check_progress_gaia-validation.py /path/to/evaluation/logs
# For GAIA-Text-103
python benchmarks/check_progress/check_progress_gaia-validation-text-103.py /path/to/evaluation/logs
# For WebWalkerQA
python benchmarks/check_progress/check_progress_webwalkerqa.py /path/to/evaluation/logs
# For Frames
python benchmarks/check_progress/check_progress_frames.py /path/to/evaluation/logs
# For XBench-DeepSearch
python benchmarks/check_progress/check_progress_xbench_deepsearch.py /path/to/evaluation/logs
# For SEAL-0
python benchmarks/check_progress/check_progress_seal-0.py /path/to/evaluation/logs
# For AIME2025
python benchmarks/check_progress/check_progress_aime2025.py /path/to/evaluation/logs
# For DeepSearchQA
python benchmarks/check_progress/check_progress_deepsearchqa.py /path/to/evaluation/logs
```
## 🔬 Trace Collection
📋 Click to expand trace collection commands
```bash
cd apps/collect-trace
# Collect Traces for SFT
bash scripts/collect_trace_claude37.sh
bash scripts/collect_trace_gpt5.sh
# Collect Traces for DPO
bash scripts/collect_trace_qwen3.sh
```
## ❓ FAQ & Troubleshooting
### Common Issues
🔧 Click to expand troubleshooting guide
#### **Q: Which version should I use?**
**A:** We recommend **MiroThinker-1.7** ⭐ with the minimal configuration:
- **v1.7** ⭐: Latest version with 256K context, world-leading performance. Use config (with context management):
- `mirothinker_1.7_keep5_max200` (up to 200 turns, recommended for most tasks)
- `mirothinker_1.7_keep5_max300` (up to 300 turns, only used for BrowseComp and BrowseComp-ZH)
#### **Q: How do I get API keys?**
**A:** You need these keys for minimal setup:
- **SERPER_API_KEY**: Get from [Serper.dev](https://serper.dev/) (Google search API)
- **JINA_API_KEY**: Get from [Jina.ai](https://jina.ai/) (Web scraping)
- **E2B_API_KEY**: Get from [E2B.dev](https://e2b.dev/) (Code execution sandbox)
- **SUMMARY_LLM_API_KEY**: Your LLM API credentials (for content summarization). Can be a small model like Qwen3-14B or GPT-5-Nano—the choice has minimal impact on performance.
- **OPENAI_API_KEY**: Get from [OpenAI](https://platform.openai.com/) (Required for benchmark evaluation, used for LLM-as-a-Judge)
- **OPENAI_BASE_URL**: Optional, defaults to `https://api.openai.com/v1`. Can be changed to use OpenAI-compatible APIs.
#### **Q: Agent server connection errors**
**A:** Common issues:
- **Check base URL format**: Should end with `/v1` (e.g., `https://your-api.com/v1`)
- **Verify API key**: Ensure `API_KEY` is set correctly in environment or script
- **Check server status**: Make sure your server is running and accessible
- **Network issues**: Verify firewall/network settings allow connections
#### **Q: Evaluation script fails to run**
**A:** Troubleshooting steps:
1. **Check working directory**: Make sure you're in `apps/miroflow-agent` directory
1. **Verify environment**: Run `uv sync` to ensure dependencies are installed
1. **Check .env file**: Ensure all required environment variables are set
1. **Review logs**: Check `logs/` directory for detailed error messages
1. **Verify data path**: Ensure benchmark data is downloaded and in correct location
#### **Q: Out of memory errors**
**A:** Solutions:
- **Reduce context length**: Set `MAX_CONTEXT_LENGTH` to a smaller value (e.g., 131072 for 128K)
- **Use context management with fewer turns**:
- For v1.7: Use `mirothinker_1.7_keep5_max200` or `mirothinker_1.7_keep5_max300` (with context management)
- **Reduce concurrent tasks**: Set `MAX_CONCURRENT` to a smaller number (e.g., 5)
- **Use smaller agents**:
- For v1.5: Try 30B instead of 235B
- For v1.0: Try 8B or 30B instead of 72B
#### **Q: Tool execution errors**
**A:** Common fixes:
- **E2B errors**: Verify `E2B_API_KEY` is valid and account has credits
- **Serper errors**: Check `SERPER_API_KEY` and rate limits
- **Jina errors**: Verify `JINA_API_KEY` and `JINA_BASE_URL` are correct
- **LLM summarization errors**: Check `SUMMARY_LLM_*` variables and that the summary model is available
#### **Q: How to monitor long-running evaluations?**
**A:** Use the progress monitoring scripts:
```bash
cd apps/miroflow-agent
python benchmarks/check_progress/check_progress_<benchmark_name>.py /path/to/logs
```
The scripts show completion status, elapsed time, and estimated remaining time.
### Getting Help
- 📖 **Documentation**: Check [MiroFlow Tools README](libs/miroflow-tools/README.md) for tool details
- 💬 **Discord**: Join our [Discord community](https://discord.com/invite/GPqEnkzQZd)
- 🐛 **Issues**: Report bugs on [GitHub Issues](https://github.com/MiroMindAI/MiroThinker/issues)
- 📧 **Contact**: Visit [our website](https://miromind.ai/) for more information
## 📄 License
This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.
## 🙏 Acknowledgments
We extend our sincere gratitude to:
- 🏆 **Benchmark Contributors** for the comprehensive evaluation datasets
- 🌍 **Open Source Community** for the tools and libraries that make this possible
- 👥 **All Contributors** who have helped make MiroThinker better
Join our community and help us build the future of AI agents!
### References
If you find this project useful in your research, please consider citing:
**MiroThinker** (Model & Method)
```bibtex
@article{miromind2026mirothinker,
title={MiroThinker-1.7 & H1: Towards Heavy-Duty Research Agents via Verification},
author={MiroMind Team and Bai, S. and Bing, L. and Lei, L. and Li, R. and Li, X. and Lin, X. and Min, E. and Su, L. and Wang, B. and Wang, L. and Wang, L. and Wang, S. and Wang, X. and Zhang, Y. and Zhang, Z. and others},
journal={arXiv preprint arXiv:2603.15726},
year={2026}
}
```
**MiroFlow** (Framework)
```bibtex
@article{miromind2026miroflow,
title={MiroFlow: Towards High-Performance and Robust Open-Source Agent Framework for General Deep Research Tasks},
author={Su, Shiqian and Xing, Sen and Dong, Xuan and Zhong, Muyan and Wang, Bin and Zhu, Xizhou and Chen, Yuntao and Wang, Wenhai and Deng, Yue and Zhu, Pengxiang and others},
journal={arXiv preprint arXiv:2602.22808},
year={2026}
}
```
[![Star History Chart](https://api.star-history.com/svg?repos=MiroMindAI/MiroThinker&type=Date)](https://star-history.com/#MiroMindAI/MiroThinker&Date)
================================================
FILE: apps/collect-trace/README.md
================================================
# Collect Trace
> TL;DR: Treat an RLVR-format dataset (Question + verifiable answer) as a benchmark. Run the evaluation pipeline; use LLM-as-a-Judge to verify correctness; then harvest the correct interaction traces as training data (for SFT / DPO).
## 📝 Overview
Collect Trace is a key component in the MiroThinker training pipeline. Instead of hand-curating training samples, it reuses RLVR datasets as test sets, and collects multi-turn interaction traces only from items judged correct.
Workflow:
1. Load each RLVR item’s question and verifiable answer.
1. Run the agent in the evaluation pipeline (with tool use / browsing as needed).
1. Verify the model's answer with an LLM-as-a-Judge against the RLVR reference answer.
1. Only for items judged correct, collect the full multi-turn trace and convert it into SFT / DPO-ready samples.
## 🚀 Quick Start
### Prerequisites
- Python 3.12+
- [uv](https://github.com/astral-sh/uv) package manager
- OpenAI API key (for LLM-based validation)
- RLVR dataset (JSONL; contains question and a verifiable answer)
### Installation
1. **Navigate to the collect-trace directory**:
```bash
cd apps/collect-trace
```
1. **Install dependencies**:
```bash
uv sync
```
1. **Set up environment variables**:
```bash
# Create .env if missing (safe; won't overwrite existing file)
[ -f ../miroflow-agent/.env ] || cp ../miroflow-agent/.env.example ../miroflow-agent/.env
# (Alternative on macOS/Linux) cp -n ../miroflow-agent/.env.example ../miroflow-agent/.env || true
# Edit .env and fill in your keys
# Required: OPENAI_API_KEY (for LLM-as-a-Judge)
# Optional: other keys for specific tools
```
### Basic Usage
Run a benchmark evaluation to collect traces:
```bash
# Using Claude-3.7 for trace collection
bash scripts/collect_trace_claude37.sh
# Using GPT-5 for trace collection
bash scripts/collect_trace_gpt5.sh
# Using Qwen-3 for trace collection
bash scripts/collect_trace_qwen3.sh
```
================================================
FILE: apps/collect-trace/pyproject.toml
================================================
[project]
name = "collect-trace"
version = "0.1.0"
description = "Executes a user-defined agent loop for capturing multi-turn interaction traces"
readme = "README.md"
requires-python = ">=3.12"
authors = [{ name = "MiroMind Team", email = "service@miromind.ai" }]
dependencies = [
"miroflow-tools>=0.1.0",
"dotenv>=0.9.9",
"openai>=1.90.0",
]
[tool.uv.sources]
miroflow-tools = { path = "../../libs/miroflow-tools", editable = true }
================================================
FILE: apps/collect-trace/scripts/collect_trace_claude37.sh
================================================
# Collect agent interaction traces with Claude 3.7 Sonnet through the
# miroflow-agent benchmark runner, then post-process the benchmark results.

# Check if ANTHROPIC_API_KEY is set
if [ -z "$ANTHROPIC_API_KEY" ]; then
    echo "Error: ANTHROPIC_API_KEY is not set."
    exit 1
else
    echo "ANTHROPIC_API_KEY detected."
fi

# Get the directory where the current script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "Current script directory: $SCRIPT_DIR"

# Enter the apps/miroflow-agent directory
TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent"
echo "Target directory: $TARGET_DIR"
# Quote the path so checkouts under directories containing spaces work, and
# abort if the directory is missing instead of running from the wrong place.
cd "$TARGET_DIR" || exit 1

mkdir -p ../../logs
LOG_DIR="../../logs/collect_trace_claude37"
echo "Log directory: $LOG_DIR"
mkdir -p "$LOG_DIR"

# Collect traces (stdout and stderr are mirrored into $LOG_DIR/output.log)
uv run python benchmarks/common_benchmark.py \
    benchmark=collect_trace \
    benchmark.data.data_dir="../../data/debug" \
    benchmark.data.metadata_file="standardized_data.jsonl" \
    llm=claude-3-7 \
    llm.provider=anthropic \
    llm.model_name=claude-3-7-sonnet-20250219 \
    llm.api_key="$ANTHROPIC_API_KEY" \
    llm.base_url=https://api.anthropic.com \
    llm.async_client=true \
    benchmark.execution.max_tasks=null \
    benchmark.execution.max_concurrent=10 \
    benchmark.execution.pass_at_k=1 \
    agent=single_agent \
    hydra.run.dir="$LOG_DIR" \
    2>&1 | tee "$LOG_DIR/output.log"

# Enter the apps/collect-trace directory
TARGET_DIR="$SCRIPT_DIR/../"
echo "Target directory: $TARGET_DIR"
cd "$TARGET_DIR" || exit 1

# Process traces
uv run python "$TARGET_DIR/utils/process_logs.py" "$LOG_DIR/benchmark_results.jsonl"
================================================
FILE: apps/collect-trace/scripts/collect_trace_gpt41.sh
================================================
# Collect agent interaction traces with GPT-4.1-mini through the
# miroflow-agent benchmark runner, then post-process the benchmark results.

# Check if OPENAI_API_KEY is set
if [ -z "$OPENAI_API_KEY" ]; then
    echo "Error: OPENAI_API_KEY is not set."
    exit 1
else
    echo "OPENAI_API_KEY detected."
fi

# Get the directory where the current script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "Current script directory: $SCRIPT_DIR"

# Enter the apps/miroflow-agent directory
TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent"
echo "Target directory: $TARGET_DIR"
# Quote the path so checkouts under directories containing spaces work, and
# abort if the directory is missing instead of running from the wrong place.
cd "$TARGET_DIR" || exit 1

mkdir -p ../../logs
LOG_DIR="../../logs/collect_trace_gpt41"
echo "Log directory: $LOG_DIR"
mkdir -p "$LOG_DIR"

# Collect traces (stdout and stderr are mirrored into $LOG_DIR/output.log)
# NOTE(review): this selects the gpt-5 llm config group while overriding
# model_name to gpt-4.1-mini - confirm that reuse is intended.
uv run python benchmarks/common_benchmark.py \
    benchmark=collect_trace \
    benchmark.data.data_dir="../../data/debug" \
    benchmark.data.metadata_file="standardized_data.jsonl" \
    llm=gpt-5 \
    llm.provider=openai \
    llm.model_name=gpt-4.1-mini \
    llm.api_key="$OPENAI_API_KEY" \
    llm.base_url=https://api.openai.com/v1 \
    llm.async_client=true \
    benchmark.execution.max_tasks=null \
    benchmark.execution.max_concurrent=10 \
    benchmark.execution.pass_at_k=1 \
    agent=single_agent \
    hydra.run.dir="$LOG_DIR" \
    2>&1 | tee "$LOG_DIR/output.log"

# Enter the apps/collect-trace directory
TARGET_DIR="$SCRIPT_DIR/../"
echo "Target directory: $TARGET_DIR"
cd "$TARGET_DIR" || exit 1

# Process traces
uv run python "$TARGET_DIR/utils/process_logs.py" "$LOG_DIR/benchmark_results.jsonl"
================================================
FILE: apps/collect-trace/scripts/collect_trace_gpt5.sh
================================================
# Collect agent interaction traces with GPT-5 through the miroflow-agent
# benchmark runner, then post-process the benchmark results.

# Check if OPENAI_API_KEY is set
if [ -z "$OPENAI_API_KEY" ]; then
    echo "Error: OPENAI_API_KEY is not set."
    exit 1
else
    echo "OPENAI_API_KEY detected."
fi

# Get the directory where the current script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "Current script directory: $SCRIPT_DIR"

# Enter the apps/miroflow-agent directory
TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent"
echo "Target directory: $TARGET_DIR"
# Quote the path so checkouts under directories containing spaces work, and
# abort if the directory is missing instead of running from the wrong place.
cd "$TARGET_DIR" || exit 1

mkdir -p ../../logs
LOG_DIR="../../logs/collect_trace_gpt5"
echo "Log directory: $LOG_DIR"
mkdir -p "$LOG_DIR"

# Collect traces (stdout and stderr are mirrored into $LOG_DIR/output.log)
uv run python benchmarks/common_benchmark.py \
    benchmark=collect_trace \
    benchmark.data.data_dir="../../data/debug" \
    benchmark.data.metadata_file="standardized_data.jsonl" \
    llm=gpt-5 \
    llm.provider=openai \
    llm.model_name=gpt-5-2025-08-07 \
    llm.api_key="$OPENAI_API_KEY" \
    llm.base_url=https://api.openai.com/v1 \
    llm.async_client=true \
    benchmark.execution.max_tasks=null \
    benchmark.execution.max_concurrent=10 \
    benchmark.execution.pass_at_k=1 \
    agent=single_agent \
    hydra.run.dir="$LOG_DIR" \
    2>&1 | tee "$LOG_DIR/output.log"

# Enter the apps/collect-trace directory
TARGET_DIR="$SCRIPT_DIR/../"
echo "Target directory: $TARGET_DIR"
cd "$TARGET_DIR" || exit 1

# Process traces
uv run python "$TARGET_DIR/utils/process_logs.py" "$LOG_DIR/benchmark_results.jsonl"
================================================
FILE: apps/collect-trace/scripts/collect_trace_qwen3.sh
================================================
# Collect agent interaction traces with a Qwen-3 endpoint through the
# miroflow-agent benchmark runner, then post-process the benchmark results.
# llm.api_key and llm.base_url below are placeholders to be filled in.

# Get the directory where the current script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "Current script directory: $SCRIPT_DIR"

# Enter the apps/miroflow-agent directory
TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent"
echo "Target directory: $TARGET_DIR"
# Quote the path so checkouts under directories containing spaces work, and
# abort if the directory is missing instead of running from the wrong place.
cd "$TARGET_DIR" || exit 1

mkdir -p ../../logs
LOG_DIR="../../logs/collect_trace_qwen3"
echo "Log directory: $LOG_DIR"
mkdir -p "$LOG_DIR"

# Collect traces (stdout and stderr are mirrored into $LOG_DIR/output.log)
uv run python benchmarks/common_benchmark.py \
    benchmark=collect_trace \
    benchmark.data.data_dir="../../data/debug" \
    benchmark.data.metadata_file="standardized_data.jsonl" \
    llm=qwen-3 \
    llm.provider=qwen \
    llm.model_name=qwen-3-32b \
    llm.api_key="" \
    llm.base_url=https://your-api.com/v1 \
    llm.async_client=true \
    llm.temperature=1.0 \
    llm.max_context_length=131072 \
    benchmark.execution.max_tasks=null \
    benchmark.execution.max_concurrent=10 \
    benchmark.execution.pass_at_k=1 \
    agent=single_agent \
    hydra.run.dir="$LOG_DIR" \
    2>&1 | tee "$LOG_DIR/output.log"

# Enter the apps/collect-trace directory
TARGET_DIR="$SCRIPT_DIR/../"
echo "Target directory: $TARGET_DIR"
cd "$TARGET_DIR" || exit 1

# Process traces
uv run python "$TARGET_DIR/utils/process_logs.py" "$LOG_DIR/benchmark_results.jsonl"
================================================
FILE: apps/collect-trace/utils/converters/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
from .convert_non_oai_to_chatml import (
convert_to_json_chatml,
extract_and_save_chat_history,
)
from .convert_oai_to_chatml import (
extract_message_history_from_log,
oai_tool_message_to_chat_message,
process_log_file,
save_chatml_to_files,
)
from .convert_to_chatml_auto_batch import (
batch_process_files,
determine_conversion_method,
get_llm_provider,
process_single_file,
)
__all__ = [
# OAI conversion functions
"oai_tool_message_to_chat_message",
"extract_message_history_from_log",
"save_chatml_to_files",
"process_log_file",
# Non-OAI conversion functions
"convert_to_json_chatml",
"extract_and_save_chat_history",
# Auto batch conversion functions
"get_llm_provider",
"determine_conversion_method",
"process_single_file",
"batch_process_files",
]
================================================
FILE: apps/collect-trace/utils/converters/convert_non_oai_to_chatml.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import json
import sys
from pathlib import Path
from typing import Any, Dict, List
def convert_to_json_chatml(messages: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """
    Flatten a raw message history into ChatML-style ``{"role", "content"}`` dicts.

    Messages whose role is ``"tool"`` or ``"system"`` are dropped. Content is
    normalised to a string: ``None`` becomes ``""``, a list keeps only the
    ``text`` of its ``{"type": "text"}`` dict items joined with single spaces,
    and any other non-string value is passed through ``str()``.

    Args:
        messages: Message dicts; a missing ``role`` or ``content`` defaults to ``""``.

    Returns:
        The converted messages, in their original order.
    """
    skipped_roles = ("tool", "system")
    converted = []
    for entry in messages:
        role = entry.get("role", "")
        if role in skipped_roles:
            continue

        body = entry.get("content", "")
        if body is None:
            body = ""

        if isinstance(body, list):
            # Only text parts survive; other part types are discarded.
            body = " ".join(
                part.get("text", "")
                for part in body
                if isinstance(part, dict) and part.get("type") == "text"
            )
        elif not isinstance(body, str):
            body = str(body)

        converted.append({"role": role, "content": body})
    return converted
def extract_and_save_chat_history(
    log_data: Dict[str, Any], output_dir: Path, input_filename: str
):
    """
    Write the main-agent and sub-agent conversations of a log as ChatML JSON files.

    Each non-empty history is converted with ``convert_to_json_chatml`` and
    prefixed with a system message taken from that history's ``system_prompt``
    (empty string when absent).

    Args:
        log_data: Parsed log; read keys are ``main_agent_message_history`` and
            ``sub_agent_message_history_sessions``.
        output_dir: Directory for the output files (created if needed).
        input_filename: Input filename without extension, used as the output prefix.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    def write_history(history, target_file):
        # Convert one history and save it with its system prompt as the first message.
        records = convert_to_json_chatml(history["message_history"])
        records.insert(
            0,
            {"role": "system", "content": history.get("system_prompt", "")},
        )
        with open(target_file, "w", encoding="utf-8") as handle:
            json.dump(records, handle, ensure_ascii=False, indent=2)

    # Main agent: a single history stored directly under its key.
    main_history = log_data.get("main_agent_message_history", {})
    if (
        main_history
        and "message_history" in main_history
        and main_history["message_history"]
    ):
        main_target = output_dir / f"{input_filename}_main_agent_chatml.json"
        write_history(main_history, main_target)
        print(f"✓ Saved main agent chat records: {main_target}")

    # Sub agents: one output file per named session.
    sessions = log_data.get("sub_agent_message_history_sessions", {})
    for session_name, session in (sessions or {}).items():
        if "message_history" not in session or not session["message_history"]:
            continue
        session_target = output_dir / f"{input_filename}_{session_name}_chatml.json"
        write_history(session, session_target)
        print(f"✓ Saved sub agent chat records: {session_target}")
def main():
    """
    Command-line entry point: convert one log file to ChatML JSON files.

    Reads the log path from ``sys.argv[1]`` and an optional output directory
    from ``sys.argv[2]`` (default ``extracted_chats``). Exits with status 1
    when the argument is missing, the file does not exist, the JSON cannot
    be parsed, or any other error occurs during extraction.
    """
    if len(sys.argv) < 2:
        # The usage line names the required positional argument explicitly.
        print(
            "Usage: python convert_non_oai_to_chatml.py <log_file_path> [output_dir]"
        )
        print(
            "Example: python convert_non_oai_to_chatml.py logs/debug_logs/task_1.json"
        )
        print(
            "Example: python convert_non_oai_to_chatml.py logs/debug_logs/task_1.json ./extracted_chats"
        )
        sys.exit(1)

    log_file_path = Path(sys.argv[1])
    output_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else Path("extracted_chats")

    # Check if input file exists
    if not log_file_path.exists():
        print(f"Error: Log file does not exist: {log_file_path}")
        sys.exit(1)

    try:
        # Read log file
        print(f"Reading log file: {log_file_path}")
        with open(log_file_path, "r", encoding="utf-8") as f:
            log_data = json.load(f)

        # Extract input filename (without extension)
        input_filename = log_file_path.stem

        # Extract and save chat history
        print(f"Extracting chat history to: {output_dir}")
        extract_and_save_chat_history(log_data, output_dir, input_filename)
        print("\n✓ Chat history extraction completed!")
        print(f"Output directory: {output_dir.absolute()}")
    except json.JSONDecodeError as e:
        print(f"Error: Cannot parse JSON file: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
================================================
FILE: apps/collect-trace/utils/converters/convert_oai_to_chatml.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import ast
import json
import os
import sys
from copy import deepcopy
from datetime import datetime
from pathlib import Path
from typing import Any, Dict
from system_prompts import (
main_system_prompt_foreword,
sub_agent_system_prompt_foreword,
system_prompt_tool_instrcutions,
)
# Module-level date string (YYYY-MM-DD) that oai_tool_message_to_chat_message
# embeds in rebuilt system prompts as "Today is: ...". It starts as the current
# date and is overwritten by process_log_file with the date derived from the
# log file's st_ctime.
creation_time_str = datetime.now().strftime("%Y-%m-%d")
def oai_tool_message_to_chat_message(oai_messages, agent_type, tool_definition):
    """
    Convert an OpenAI-style message history into plain role/content chat messages.

    The system/developer message is rebuilt from the module's prompt forewords,
    the current ``creation_time_str``, the tool instructions and a rendering of
    ``tool_definition``. Consecutive user/tool messages are merged into a single
    user message, and assistant tool calls are rendered as text appended to the
    assistant content.

    Args:
        oai_messages: List of message dicts with a ``role`` and ``content``;
            assistant messages may carry ``tool_calls``. The input is deep-copied.
        agent_type: ``"main"`` or ``"sub_agent"``; selects the prompt foreword.
        tool_definition: Iterable of server dicts (``name`` plus optional
            ``tools``), or a falsy value for no tool section.

    Returns:
        List of ``{"role": ..., "content": ...}`` dicts.

    Raises:
        ValueError: For any failure during conversion, including failed
            assertions, unknown roles/agent types and a trace that ends with
            unflushed user/tool messages.
    """

    def convert_oai_tool_call_to_mcp_tool_call_str(oai_tool_call):
        """Render one or more OpenAI tool calls (a list, or a JSON string of one call) as text."""
        if isinstance(oai_tool_call, list):
            assert len(oai_tool_call) >= 1
        if isinstance(oai_tool_call, str):
            oai_tool_call = [json.loads(oai_tool_call)]
        mcp_tool_call_templates = []
        for each_oai_tool_call in oai_tool_call:
            assert isinstance(
                each_oai_tool_call, dict
            ), f"oai_tool_call should be a dict, but got {type(each_oai_tool_call)}"
            # The function name is "<server>-<tool>"; split on the LAST dash so
            # server names may themselves contain dashes.
            server_name, tool_name = each_oai_tool_call["function"]["name"].rsplit(
                "-", maxsplit=1
            )
            # Arguments arrive as a JSON string and are re-serialised below.
            arguments = json.loads(each_oai_tool_call["function"]["arguments"])
            mcp_tool_call_template = f"\n{server_name}\n{tool_name}\n\n{json.dumps(arguments)}\n\n"
            mcp_tool_call_templates.append(mcp_tool_call_template)
        return "\n\n".join(mcp_tool_call_templates)

    def safe_get_text(content):
        """Safely extract text content, handling different content formats"""
        # NOTE: for list content only the FIRST element is used; later parts are ignored.
        if isinstance(content, list) and content:
            if isinstance(content[0], dict) and "text" in content[0]:
                return content[0]["text"]
            elif isinstance(content[0], str):
                return content[0]
            else:
                return str(content[0])
        elif isinstance(content, str):
            return content
        elif content is None:
            return ""
        else:
            # Also reached for an empty list, which yields "[]".
            return str(content)

    def generate_mcp_servers_str(tool_definition):
        """Render server and tool definitions as the markdown-style listing used in the system prompt."""
        mcp_servers_str = ""
        if tool_definition and len(tool_definition) > 0:
            for server in tool_definition:
                mcp_servers_str += f"## Server name: {server['name']}\n"
                if "tools" in server and len(server["tools"]) > 0:
                    for tool in server["tools"]:
                        # Skip tools that failed to load (they only have 'error' key)
                        if "error" in tool and "name" not in tool:
                            continue
                        mcp_servers_str += f"### Tool name: {tool['name']}\n"
                        mcp_servers_str += f"Description: {tool['description']}\n"
                        mcp_servers_str += f"Input JSON schema: {tool['schema']}\n"
        return mcp_servers_str

    # Work on a copy so the caller's message list is never mutated.
    oai_messages = deepcopy(oai_messages)
    chat_messages = []
    idx = 0
    # Text of user/tool messages seen since the last assistant message.
    pending_user_tool_contents = []

    # Merge pending_user_tool_contents into a single user message and add to chat_messages
    def flush_pending(pending_user_tool_contents, chat_messages):
        if pending_user_tool_contents:
            combined_content = "\n\n".join(pending_user_tool_contents)
            chat_messages.append(
                {
                    "role": "user",
                    "content": combined_content,
                }
            )
        return []  # Always return a new empty list

    try:
        for idx, msg in enumerate(oai_messages):
            if msg["role"] in ["developer", "system"]:
                assert idx == 0, "System messages should be the first message"
                time_str = f" Today is: {creation_time_str}\n"
                tool_definition_str = generate_mcp_servers_str(tool_definition)
                ori_system_prompt = msg["content"][0]["text"]
                # Keep only the tail of the original prompt starting at the marker.
                # NOTE: str.find returns -1 when the marker is absent, in which
                # case this slice keeps just the last character of the prompt.
                system_prompt_after_general_objective = ori_system_prompt[
                    ori_system_prompt.find("# General Objective") :
                ]
                if agent_type == "main":
                    system_prompt = (
                        main_system_prompt_foreword
                        + time_str
                        + system_prompt_tool_instrcutions
                        + tool_definition_str
                        + system_prompt_after_general_objective
                    )
                elif agent_type == "sub_agent":
                    system_prompt = (
                        sub_agent_system_prompt_foreword
                        + time_str
                        + system_prompt_tool_instrcutions
                        + tool_definition_str
                        + system_prompt_after_general_objective
                    )
                else:
                    raise ValueError(f"Unknown agent type: {agent_type}")
                chat_messages.append(
                    {
                        "role": "system",
                        "content": system_prompt,
                    }
                )
            elif msg["role"] in ["user", "tool"]:
                # Buffered rather than emitted: tool results and user turns are
                # merged into one user message at the next assistant message.
                content = safe_get_text(msg["content"])
                pending_user_tool_contents.append(content)
            elif msg["role"] == "assistant" and "tool_calls" in msg:
                # Flush pending user/tool messages
                pending_user_tool_contents = flush_pending(
                    pending_user_tool_contents, chat_messages
                )
                content = safe_get_text(msg.get("content", ""))
                if content != "":
                    content += "\n\n"  # Concatenate thinking text with tool call
                chat_messages.append(
                    {
                        "role": "assistant",
                        "content": content
                        + convert_oai_tool_call_to_mcp_tool_call_str(msg["tool_calls"]),
                    }
                )
            elif msg["role"] == "assistant" and "tool_calls" not in msg:
                # Flush pending user/tool messages
                pending_user_tool_contents = flush_pending(
                    pending_user_tool_contents, chat_messages
                )
                content = safe_get_text(msg["content"])
                chat_messages.append(
                    {
                        "role": "assistant",
                        "content": content,
                    }
                )
            else:
                raise ValueError(f"Unknown role: {msg['role']}")
        # A trace must end on an assistant turn; leftover user/tool text is an error.
        assert (
            len(pending_user_tool_contents) == 0
        ), "Error: Trace ends with user/tool round. Pending user/tool contents should be empty."
    except Exception as e:
        # Every failure above (including AssertionError) surfaces as ValueError.
        raise ValueError(f"Error processing messages: {e}")
    return chat_messages
def extract_message_history_from_log(
    log_data: Dict[str, Any],
):
    """
    Build converted chat messages for the main agent and every sub-agent session.

    Tool definitions are looked up in the log's step records: the step
    ``get_main_tool_definitions`` for the main agent, and
    ``get_sub_<type>_tool_definitions`` for sub agents, where ``<type>`` is the
    session name up to its first underscore.

    Args:
        log_data: Log data dictionary

    Returns:
        ``{"main_agent": [...], "sub_agents": {session_name: [...]}}``; entries
        stay empty when the corresponding history is missing or empty.
    """
    converted = {"main_agent": [], "sub_agents": {}}

    # Main agent history
    main_history = log_data.get("main_agent_message_history", {})
    main_messages = (
        main_history["message_history"]
        if main_history and "message_history" in main_history
        else None
    )
    if main_messages:
        converted["main_agent"] = oai_tool_message_to_chat_message(
            main_messages,
            "main",
            extract_step_message(log_data, "get_main_tool_definitions"),
        )

    # Sub agent sessions
    sessions = log_data.get("sub_agent_message_history_sessions", {})
    for session_name, session in (sessions or {}).items():
        if "message_history" not in session:
            continue
        session_messages = session["message_history"]
        if not session_messages:
            continue
        agent_kind = session_name.split("_")[0]
        tool_defs = extract_step_message(
            log_data, f"get_sub_{agent_kind}_tool_definitions"
        )
        converted["sub_agents"][session_name] = oai_tool_message_to_chat_message(
            session_messages, "sub_agent", tool_defs
        )

    return converted
def save_chatml_to_files(
    chatml_data: Dict[str, Any],
    output_dir: Path,
    input_filename: str,
):
    """
    Write converted chat messages to JSON files, one per agent conversation.

    The main agent goes to ``<input_filename>_main_agent_chatml.json`` (skipped
    when its message list is empty); each sub-agent session goes to
    ``<input_filename>_<session_name>_chatml.json``.

    Args:
        chatml_data: Dictionary with ``main_agent`` and ``sub_agents`` entries
        output_dir: Output directory (created if needed)
        input_filename: Input filename (without extension)
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    def write_json(target, payload):
        # UTF-8 output with non-ASCII characters kept readable.
        with open(target, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)

    main_messages = chatml_data["main_agent"]
    if main_messages:
        main_target = output_dir / f"{input_filename}_main_agent_chatml.json"
        write_json(main_target, main_messages)
        print(f"✓ Saved main agent ChatML: {main_target}")

    for session_name, session_messages in chatml_data["sub_agents"].items():
        session_target = output_dir / f"{input_filename}_{session_name}_chatml.json"
        write_json(session_target, session_messages)
        print(f"✓ Saved sub agent {session_name} ChatML: {session_target}")
def extract_step_message(data, target_step_name):
    """
    Return the parsed ``message`` of the first step log named ``target_step_name``.

    The message is parsed with ``ast.literal_eval``. ``None`` is returned (after
    printing a notice) when ``step_logs`` is absent, no step matches, or any
    error occurs while searching or parsing.
    """
    try:
        if "step_logs" not in data:
            print("step_logs field not found in log file")
            return None

        # First entry whose step_name matches, or None when nothing matches.
        matching_step = next(
            (
                entry
                for entry in data["step_logs"]
                if entry.get("step_name") == target_step_name
            ),
            None,
        )
        if matching_step is not None:
            return ast.literal_eval(matching_step.get("message"))

        print(f"No record found with step_name '{target_step_name}'")
        return None
    except Exception as err:
        print(f"Error processing file: {err}")
        return None
def process_log_file(log_file_path: str, output_dir: str = "extracted_chatml") -> bool:
    """
    Process a single log file, extract message history and convert to ChatML format

    As a side effect the module-level ``creation_time_str`` is set to the
    date derived from the log file's ``st_ctime`` so rebuilt system prompts
    carry that date.

    Args:
        log_file_path: Log file path
        output_dir: Output directory

    Returns:
        True when the log was converted and saved; False when the file is
        missing, is not valid JSON, or conversion/saving failed. Error
        messages are written to stderr.
    """
    log_path = Path(log_file_path)
    output_path = Path(output_dir)

    if not log_path.exists():
        print(f"Error: Log file does not exist: {log_file_path}", file=sys.stderr)
        return False

    # Get file creation time
    # NOTE: st_ctime is the creation time on Windows but the last metadata
    # change time on Unix, so this is only an approximation there.
    global creation_time_str
    try:
        stat_info = os.stat(log_path)
        creation_time = datetime.fromtimestamp(stat_info.st_ctime)
        creation_time_str = creation_time.strftime("%Y-%m-%d")
        print(f"File creation time: {creation_time_str}")
    except Exception as e:
        # Best effort: keep the previous date string if stat fails.
        print(f"Warning: Could not get file creation time: {e}")

    try:
        # Read log file
        print(f"Reading log file: {log_path}")
        with open(log_path, "r", encoding="utf-8") as f:
            log_data = json.load(f)

        # Extract input filename (without extension)
        input_filename = log_path.stem

        # Extract message history and convert to ChatML format
        print("Extracting message history...")
        chatml_data = extract_message_history_from_log(log_data)

        # Save to files
        print(f"Saving ChatML files to: {output_path}")
        save_chatml_to_files(chatml_data, output_path, input_filename)
        print("\n✓ Processing completed!")
        print(f"Output directory: {output_path.absolute()}")
        return True
    except json.JSONDecodeError as e:
        print(f"Error: Cannot parse JSON file: {e}", file=sys.stderr)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
    return False


def main():
    """
    Command-line entry point: convert one log file given in ``sys.argv[1]``.

    An optional second argument selects the output directory (default
    ``extracted_chatml``). Exits with status 1 when the argument is missing
    or the conversion fails, so callers that inspect the return code (such
    as a batch driver running this script in a subprocess) see the failure.
    """
    if len(sys.argv) < 2:
        print("Usage: python convert_oai_to_chatml.py <log_file_path> [output_dir]")
        print("Example: python convert_oai_to_chatml.py logs/debug_logs/task_1.json")
        print(
            "Example: python convert_oai_to_chatml.py logs/debug_logs/task_1.json ./extracted_chatml"
        )
        sys.exit(1)

    log_file_path = sys.argv[1]
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "extracted_chatml"

    if not process_log_file(log_file_path, output_dir):
        sys.exit(1)


if __name__ == "__main__":
    main()
================================================
FILE: apps/collect-trace/utils/converters/convert_to_chatml_auto_batch.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import json
import subprocess
import sys
from pathlib import Path
from typing import Dict, List
def get_llm_provider(json_file_path: str) -> str:
    """
    Read ``env_info.llm_provider`` from a JSON log file.

    Args:
        json_file_path: Path to JSON file

    Returns:
        The provider value; ``'unknown'`` when the field is missing or empty;
        ``'error'`` when the file cannot be read or parsed (the error is printed).
    """
    try:
        with open(json_file_path, "r", encoding="utf-8") as handle:
            env_info = json.load(handle).get("env_info", {})
            return env_info.get("llm_provider") or "unknown"
    except Exception as err:
        print(f"Error reading JSON file {json_file_path}: {err}")
        return "error"
def determine_conversion_method(provider: str) -> str:
    """
    Pick the conversion method for a provider name (case-insensitive).

    Args:
        provider: LLM provider name

    Returns:
        ``'oai'`` for ``openai``, ``claude_newapi`` and ``deepseek_newapi``;
        ``'non-oai'`` for every other provider.
    """
    oai_style_providers = ("openai", "claude_newapi", "deepseek_newapi")
    return "oai" if provider.lower() in oai_style_providers else "non-oai"
def get_script_paths() -> tuple:
    """
    Locate the two conversion scripts that live next to this module.

    Returns:
        Tuple of (oai_script_path, non_oai_script_path) as strings

    Raises:
        FileNotFoundError: If either script is missing (the OAI script is
            checked first).
    """
    module_dir = Path(__file__).parent
    oai_path = module_dir / "convert_oai_to_chatml.py"
    non_oai_path = module_dir / "convert_non_oai_to_chatml.py"

    oai_missing = not oai_path.exists()
    if oai_missing:
        raise FileNotFoundError(f"OAI conversion script not found: {oai_path}")

    non_oai_missing = not non_oai_path.exists()
    if non_oai_missing:
        raise FileNotFoundError(
            f"Non-OAI conversion script not found: {non_oai_path}"
        )

    return (str(oai_path), str(non_oai_path))
def process_single_file(json_file_path: str, output_dir: str) -> bool:
    """
    Convert one JSON log by running the matching conversion script in a subprocess.

    The script is chosen from the log's provider and executed with the current
    interpreter; success is judged by the subprocess return code.

    Args:
        json_file_path: Path to JSON file
        output_dir: Output directory

    Returns:
        True if successful, False otherwise
    """
    try:
        provider = get_llm_provider(json_file_path)
        if provider == "error":
            print(f"❌ Failed to read provider from: {json_file_path}")
            return False

        use_oai = determine_conversion_method(provider) == "oai"
        oai_script, non_oai_script = get_script_paths()

        if use_oai:
            chosen_script = oai_script
            print(f"🔧 Using OAI conversion for provider: {provider}")
        else:
            chosen_script = non_oai_script
            print(f"🔧 Using Non-OAI conversion for provider: {provider}")

        # Output is captured so only the summary lines below are shown.
        completed = subprocess.run(
            [sys.executable, chosen_script, json_file_path, output_dir],
            capture_output=True,
            text=True,
        )
        if completed.returncode != 0:
            print(f"❌ Failed to process {json_file_path}: {completed.stderr}")
            return False

        print(f"✅ Successfully processed: {json_file_path}")
        return True
    except Exception as err:
        print(f"❌ Error processing {json_file_path}: {err}")
        return False
def find_json_files(input_paths: List[str]) -> List[str]:
    """
    Expand files, directories and glob patterns into a list of JSON file paths.

    An existing file is kept when its suffix is ``.json`` (case-insensitive);
    a directory contributes its direct ``*.json`` children; anything else is
    treated as a glob pattern relative to the current directory, and a pattern
    that cannot be processed only produces a warning.

    Args:
        input_paths: List of file paths, directories, or patterns

    Returns:
        List of JSON file paths
    """

    def has_json_suffix(candidate: Path) -> bool:
        return candidate.suffix.lower() == ".json"

    found: List[str] = []
    for raw_path in input_paths:
        target = Path(raw_path)

        if target.is_file():
            if has_json_suffix(target):
                found.append(str(target))
            continue

        if target.is_dir():
            found.extend(str(match) for match in target.glob("*.json"))
            continue

        # Neither a file nor a directory: try it as a glob pattern.
        try:
            for match in Path(".").glob(raw_path):
                if has_json_suffix(match):
                    found.append(str(match))
        except Exception:
            print(f"Warning: Could not process pattern: {raw_path}")

    return found
def batch_process_files(input_paths: List[str], output_dir: str) -> Dict[str, int]:
    """
    Convert every JSON file found under the input paths and count the outcomes.

    Args:
        input_paths: List of input paths
        output_dir: Output directory (created if needed once files are found)

    Returns:
        Dictionary with ``total``, ``success`` and ``failed`` counts; all zero
        when no JSON files are found.
    """
    json_files = find_json_files(input_paths)
    if not json_files:
        print("❌ No JSON files found in the specified paths")
        return {"total": 0, "success": 0, "failed": 0}

    print(f"📁 Found {len(json_files)} JSON files to process")
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Files are processed one after another, in discovery order.
    outcomes = [process_single_file(json_file, output_dir) for json_file in json_files]
    succeeded = sum(1 for outcome in outcomes if outcome)

    return {
        "total": len(json_files),
        "success": succeeded,
        "failed": len(outcomes) - succeeded,
    }
def show_help():
    """
    Print the command-line help text.

    The usage lines name the required input argument, and the conversion
    logic section lists every provider routed to the OAI converter.
    """
    help_text = """
Auto ChatML Conversion Script
============================
Automatically determines conversion method based on llm_provider field in JSON files

Usage:
    python convert_to_chatml_auto_batch.py <json_file>... [output_dir]
    python convert_to_chatml_auto_batch.py <directory> [output_dir]
    python convert_to_chatml_auto_batch.py <pattern> [output_dir]

Parameters:
    input_paths: JSON files, directories, or patterns
    output_dir: Output directory (optional, default: extracted_chatml)

Examples:
    python convert_to_chatml_auto_batch.py logs/debug_logs/
    python convert_to_chatml_auto_batch.py logs/debug_logs/*.json
    python convert_to_chatml_auto_batch.py logs/debug_logs/ ./my_output
    python convert_to_chatml_auto_batch.py task_1.json task_2.json

Conversion Logic:
    - If llm_provider is 'openai', 'claude_newapi' or 'deepseek_newapi': Use convert_oai_to_chatml.py
    - If llm_provider = anything else: Use convert_non_oai_to_chatml.py

Features:
    1. Auto-detect conversion method per file
    2. Batch process log files
    3. Extract main_agent_message_history
    4. Extract sub_agent_message_history_sessions
    5. Convert to OpenAI ChatML format
    6. Save as separate files
    7. Generate processing summary
"""
    print(help_text)
def main():
    """
    Command-line entry point for batch conversion.

    With no arguments, ``-h`` or ``--help``, the help text is shown. When more
    than one argument is given, the last one is taken as the output directory
    if it does not start with ``-`` and either ends with ``/``, has no file
    suffix, equals ``extracted_chatml`` or starts with ``./``; otherwise every
    argument is an input path and the output goes to ``extracted_chatml``.
    Exits with status 1 if any file fails or an error occurs.
    """
    cli_args = sys.argv[1:]
    if not cli_args or cli_args[0] in ["-h", "--help"]:
        show_help()
        return

    # Defaults, overridden when the last argument looks like a directory.
    output_dir = "extracted_chatml"
    input_paths = cli_args
    if len(cli_args) > 1:
        candidate = cli_args[-1]
        looks_like_directory = not candidate.startswith("-") and (
            candidate.endswith("/")
            or not Path(candidate).suffix
            or candidate == "extracted_chatml"
            or candidate.startswith("./")
        )
        if looks_like_directory:
            output_dir = candidate
            input_paths = cli_args[:-1]

    print("🚀 Starting auto ChatML conversion")
    print(f"📂 Input paths: {input_paths}")
    print(f"📁 Output directory: {output_dir}")

    try:
        # Fail early if either conversion script is missing.
        get_script_paths()

        summary = batch_process_files(input_paths, output_dir)

        divider = "=" * 50
        print("\n" + divider)
        print("📊 Processing Summary")
        print(divider)
        print(f"Total files: {summary['total']}")
        print(f"Successfully processed: {summary['success']}")
        print(f"Failed: {summary['failed']}")
        print(f"Output directory: {Path(output_dir).absolute()}")

        if summary["failed"] > 0:
            print(f"\n⚠️ {summary['failed']} files failed to process")
            sys.exit(1)
        else:
            print("\n✅ All files processed successfully!")
    except FileNotFoundError as err:
        print(f"❌ {err}")
        sys.exit(1)
    except Exception as err:
        print(f"❌ Unexpected error: {err}")
        sys.exit(1)


if __name__ == "__main__":
    main()
================================================
FILE: apps/collect-trace/utils/converters/example_usage.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import json
import os
import sys
import tempfile
from pathlib import Path
# Add parent directory to Python path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
from utils.converters import (
extract_and_save_chat_history,
extract_message_history_from_log,
)
def example_1_basic_conversion():
    """Example 1: run one in-memory sample log through both converter APIs.

    The sample log holds a main-agent history and one browser-agent session.
    It is first passed to ``extract_message_history_from_log`` and the message
    counts are printed; it is then written out with
    ``extract_and_save_chat_history`` into a temporary directory, and the
    generated files (when present) are read back to print their lengths.
    """
    print("=== Example 1: Basic Conversion ===")

    def text_message(role, text):
        # Every sample message carries exactly one text part.
        return {"role": role, "content": [{"type": "text", "text": text}]}

    def agent_history(system_prompt, user_text, assistant_text):
        # The developer message repeats the system prompt verbatim.
        return {
            "system_prompt": system_prompt,
            "message_history": [
                text_message("developer", system_prompt),
                text_message("user", user_text),
                text_message("assistant", assistant_text),
            ],
        }

    # Sample log data
    log_data = {
        "main_agent_message_history": agent_history(
            "You are a helpful assistant.",
            "Hello, how are you?",
            "I'm doing well, thank you!",
        ),
        "browser_agent_message_history_sessions": {
            "browser_agent_1": agent_history(
                "You are a browsing agent.",
                "Search for something",
                "I found it.",
            )
        },
        "env_info": {"llm_provider": "openai"},
    }

    # Convert using OAI method
    chatml_data = extract_message_history_from_log(log_data)
    print(
        f"OAI conversion result: {len(chatml_data['main_agent'])} messages in main agent"
    )
    print(
        f"OAI conversion result: {len(chatml_data['browser_agents']['browser_agent_1'])} messages in browser agent"
    )

    # Convert using Non-OAI method
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
        extract_and_save_chat_history(log_data, temp_path, "example")

        # Check generated files
        main_file = temp_path / "example_main_agent_chatml.json"
        browser_file = temp_path / "example_browser_agent_1_chatml.json"
        if main_file.exists():
            with main_file.open("r") as handle:
                main_content = json.load(handle)
            print(
                f"Non-OAI conversion result: {len(main_content)} messages in main agent"
            )
        if browser_file.exists():
            with browser_file.open("r") as handle:
                browser_content = json.load(handle)
            print(
                f"Non-OAI conversion result: {len(browser_content)} messages in browser agent"
            )
if __name__ == "__main__":
    # Run the example between two banner lines when executed as a script.
    divider = "=" * 50
    print("ChatML Conversion Utilities - Usage Examples")
    print(divider)
    example_1_basic_conversion()
    print("\n" + divider)
    print("Examples completed successfully!")
    print("\nFor more information, see the README.md file.")
================================================
FILE: apps/collect-trace/utils/converters/system_prompts.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
# Foreword text for the main agent's system prompt: states that tools are
# available and that only one tool may be used per message.
main_system_prompt_foreword = """In this environment you have access to a set of tools you can use to answer the user's question. \n \nYou only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use."""
# Foreword text for sub-agent system prompts; currently the same wording as
# the main agent's foreword.
sub_agent_system_prompt_foreword = """In this environment you have access to a set of tools you can use to answer the user's question. \n \nYou only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use."""
# Tool-use formatting instructions for MCP tools (server_name, tool_name and
# JSON arguments). The text ends just before the JSONSchema function list,
# which is presumably appended by the caller - TODO confirm.
system_prompt_tool_instrcutions = """# Tool-Use Formatting Instructions \n\nTool-use is formatted using XML-style tags. The tool-use is enclosed in and each parameter is similarly enclosed within its own set of tags.\n\nThe Model Context Protocol (MCP) connects to servers that provide additional tools and resources to extend your capabilities. You can use the server's tools via the `use_mcp_tool`.\n\nDescription: \nRequest to use a tool provided by a MCP server. Each MCP server can provide multiple tools with different capabilities. Tools have defined input schemas that specify required and optional parameters.\n\nParameters:\n- server_name: (required) The name of the MCP server providing the tool\n- tool_name: (required) The name of the tool to execute\n- arguments: (required) A JSON object containing the tool's input parameters, following the tool's input schema, quotes within string must be properly escaped, ensure it's valid JSON\n\nUsage:\n\nserver name here\ntool name here\n\n{\n\"param1\": \"value1\",\n\"param2\": \"value2 \\\"escaped string\\\"\"\n}\n\n\n\nImportant Notes:\n- Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags.\n- Always adhere to this format for the tool use to ensure proper parsing and execution.\n\nString and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. The output is not expected to be valid XML and is parsed with regular expressions.\nHere are the functions available in JSONSchema format:\n\n"""
================================================
FILE: apps/collect-trace/utils/merge_chatml_msgs_to_one_json.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import argparse
import glob
import json
import os
def merge_json_files(input_dir, type="main"):
    """Merge every ``*{type}*.json`` file in ``input_dir`` into one JSON file.

    Each input file becomes one ``{"messages": <file content>}`` entry in the
    list written to ``{input_dir}/{type}_merged.json``. Files that cannot be
    read or parsed are reported and skipped.

    Args:
        input_dir: Directory that contains the per-conversation JSON files.
        type: Substring a file name must contain to be merged; also used as
            the prefix of the output file name.
    """
    output_file = os.path.join(input_dir, f"{type}_merged.json")
    output_abs = os.path.abspath(output_file)

    # The output name matches the input pattern, so a merged file left over
    # from an earlier run must be excluded or it would be merged into itself.
    # Sorting makes the order of the merged conversations reproducible.
    json_files = sorted(
        path
        for path in glob.glob(os.path.join(input_dir, f"*{type}*.json"))
        if os.path.abspath(path) != output_abs
    )

    # List to store all conversations
    all_conversations = []
    for json_file in json_files:
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both malformed JSON and undecodable bytes.
            print(f"Error processing {json_file}: {str(e)}")
            continue
        all_conversations.append({"messages": data})
        print(f"Successfully processed: {json_file}")

    # Write the merged data to a new JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_conversations, f, ensure_ascii=False, indent=2)
    print(
        f"\nMerging complete! All {type} JSON files have been merged into {output_file}"
    )
    print(f"Total number of files processed: {len(json_files)}")
    print(f"Total number of messages: {len(all_conversations)}")
def main():
    """Parse ``--input_dir`` and merge the main-agent and browsing-agent files.

    Two merged files are written into the input directory, one for files
    whose name contains ``main_agent`` and one for ``agent-browsing``.
    """
    parser = argparse.ArgumentParser(
        description="Merge multiple JSON files which contain chat messages into a single file"
    )
    parser.add_argument(
        "--input_dir",
        type=str,
        required=True,
        # The value is joined with a file-name pattern, so it must be a
        # directory rather than a wildcard pattern.
        help="Directory containing the ChatML JSON files to merge (e.g., 'data/successful_chatml_logs')",
    )
    args = parser.parse_args()
    merge_json_files(args.input_dir, type="main_agent")
    merge_json_files(args.input_dir, type="agent-browsing")


if __name__ == "__main__":
    main()
================================================
FILE: apps/collect-trace/utils/process_logs.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import argparse
import json
import os
import shutil
def get_successful_log_paths(jsonl_file_path: str) -> list:
    """
    Collects the paths of successful log files from a dataset.

    This function extracts log file paths of successful records based on
    the value of `final_judge_result`. If the dataset has been fully
    processed, it reads from a `benchmark_results.jsonl` file. Otherwise,
    if processing was interrupted, it falls back to scanning individual
    `.json` files in the given directory.

    Success is determined by:
    - `PASS_AT_K_SUCCESS` for records in JSONL files.
    - `CORRECT` for records in individual JSON files.

    Lines or files that are unreadable, are not valid JSON, or do not hold a
    JSON object are skipped.

    Args:
        jsonl_file_path (str): Path to a JSONL file or a directory of JSON files.

    Returns:
        list: A list of log file paths for successful records. For a JSONL
        input these are the records' `log_file_path` values; for a directory
        they are the paths of the matching files, in sorted file-name order.
    """
    log_paths = []

    if jsonl_file_path.endswith(".jsonl"):
        with open(jsonl_file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # A valid JSON line need not be an object (e.g. a list).
                if not isinstance(data, dict):
                    continue
                if data.get("final_judge_result") == "PASS_AT_K_SUCCESS":
                    log_path = data.get("log_file_path")
                    if log_path:
                        log_paths.append(log_path)
        return log_paths

    # Directory mode: inspect every individual *.json log file.
    for filename in sorted(os.listdir(jsonl_file_path)):
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(jsonl_file_path, filename)
        try:
            # The context manager closes the handle even when parsing fails.
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            continue
        if not isinstance(data, dict):
            continue
        try:
            final_judge_result = data["final_judge_result"]
        except KeyError:
            print(data.keys())
            continue
        if final_judge_result == "CORRECT":
            log_paths.append(filepath)
    return log_paths
# Usage example
if __name__ == "__main__":
    # Imported here because they are only needed when run as a script.
    import glob
    import subprocess

    parser = argparse.ArgumentParser(
        description="Extract successful log paths from JSONL file"
    )
    parser.add_argument(
        "file_path", help="Path to the JSONL file containing benchmark results"
    )
    args = parser.parse_args()
    result = get_successful_log_paths(args.file_path)

    # Get the parent directory of args.file_path
    parent_dir = os.path.abspath(os.path.dirname(args.file_path))

    # Create successful logs directory
    success_log_dir = os.path.join(parent_dir, "successful_logs")
    success_chatml_log_dir = os.path.join(parent_dir, "successful_chatml_logs")
    os.makedirs(success_log_dir, exist_ok=True)
    print(f"Successful logs directory: {success_log_dir}")

    for path in result:
        destination = os.path.join(success_log_dir, os.path.basename(path))
        print(f"Copying file: {path} to {destination}")
        shutil.copy(path, destination)

    # Expand the wildcard here and pass argument lists, so no shell is
    # involved and paths containing spaces or shell metacharacters are safe.
    json_logs = sorted(glob.glob(os.path.join(success_log_dir, "*.json")))
    if not json_logs:
        print("No successful logs found; skipping ChatML conversion and merge")
    else:
        commands = [
            # The converter accepts the output directory as the trailing
            # positional argument.
            [
                "uv",
                "run",
                "utils/converters/convert_to_chatml_auto_batch.py",
                *json_logs,
                success_chatml_log_dir,
            ],
            [
                "uv",
                "run",
                "utils/merge_chatml_msgs_to_one_json.py",
                "--input_dir",
                success_chatml_log_dir,
            ],
        ]
        for command in commands:
            # Best effort, as before: a failing step is reported but does not
            # prevent the following step from running.
            completed = subprocess.run(command, check=False)
            if completed.returncode != 0:
                print(
                    f"Command exited with status {completed.returncode}: {command[2]}"
                )
================================================
FILE: apps/gradio-demo/README.md
================================================
# Local Deep Research Demo with Gradio Web UI
Host your own Deep Research demo using our [MiroThinker v1.5](https://huggingface.co/miromind-ai/MiroThinker-v1.5-30B) models and lightweight Gradio-based web interface.
## 🖥️ Hardware Requirements
- **GPU**: NVIDIA RTX 40xx/50xx series or equivalent
- **VRAM**:
- **16GB minimum** (with Q4 quantization via llama.cpp)
- **48GB+ recommended** (for FP8 quantization or longer context)
- MiroThinker-v1.5-30B is a 30B MoE model with 3B active parameters
## ⚙️ LLM Server Deployment
### Download Model Checkpoints
Download the full checkpoint from Hugging Face:
```python
from huggingface_hub import snapshot_download
snapshot_download(repo_id="miromind-ai/MiroThinker-v1.5-30B", local_dir="model/MiroThinker-v1.5-30B")
```
### Option 1: SGLang Server (Recommended)
FP8 is a highly efficient 8-bit floating point format that significantly reduces memory usage while maintaining model quality. This approach provides excellent performance for inference workloads on modern GPUs.
Please install [SGLang](https://github.com/sgl-project/sglang) first. Then initialize fast inference with FP8 precision:
```bash
MODEL_PATH=model/MiroThinker-v1.5-30B
python3 -m sglang.launch_server \
--model-path $MODEL_PATH \
--mem-fraction-static 0.9 \
--quantization fp8 \
--tp 1 \
--dp 1 \
--host 0.0.0.0 \
--port 61005 \
--trust-remote-code
```
This starts an OpenAI-compatible server with BASE_URL=`http://0.0.0.0:61005/v1`.
### Option 2: llama.cpp (Quantized)
For memory-efficient inference, download the pre-quantized GGUF version from the community:
**Note**: Thanks to the community for providing quantized versions: [mradermacher](https://huggingface.co/mradermacher)
```bash
# Download Q4_K_M quantized model (recommended balance)
wget https://huggingface.co/mradermacher/MiroThinker-v1.5-30B-GGUF/resolve/main/MiroThinker-v1.5-30B.Q4_K_M.gguf
```
Follow the [official llama.cpp installation guide](https://github.com/ggml-org/llama.cpp) to set up the environment. After that:
```bash
# Set up model path
MODEL_PATH=model/MiroThinker-v1.5-30B.Q4_K_M.gguf
# Start the server
llama-server -m $MODEL_PATH \
--port 61005 \
-ngl 99 \
-v
```
This will start an OpenAI-compatible server at `http://0.0.0.0:61005/v1`.
### Other Options
You can also leverage other frameworks for model serving like Ollama, vLLM, and Text Generation Inference (TGI) for different deployment scenarios.
## 🚀 Quick Start Guide
### 1. **Environment Setup**
Get your API keys:
- [Serper](https://serper.dev/): 2,500 free search credits for new accounts (required for web search)
- [E2B](https://e2b.dev/): Free tier available (required for Python code execution)
- [Jina](https://jina.ai/): Free tier available (required for web scraping)
Edit the `apps/miroflow-agent/.env` file with your API keys:
```bash
# Required - Web Search
SERPER_API_KEY=your_serper_key
# Required - Python Code Execution (E2B Cloud Sandbox)
E2B_API_KEY=your_e2b_key
# Required - Web Scraping
JINA_API_KEY=your_jina_key
# Required - Summary LLM (for webpage summarization)
# Option 1: Use OpenAI GPT-5-Nano (recommended, cost-effective)
SUMMARY_LLM_BASE_URL=https://api.openai.com/v1
SUMMARY_LLM_MODEL_NAME=gpt-5-nano
SUMMARY_LLM_API_KEY=your_openai_key
# Option 2: Use MiroThinker itself (if you have enough VRAM)
# SUMMARY_LLM_BASE_URL=http://0.0.0.0:61005/v1
# SUMMARY_LLM_MODEL_NAME=MiroThinker
# SUMMARY_LLM_API_KEY=none
```
### 2. **Install Dependencies**
We use [uv](https://github.com/astral-sh/uv) to manage all dependencies.
```bash
cd apps/gradio-demo
uv sync
```
### 3. **Configure API Endpoint**
Set your LLM API endpoint and API key:
```bash
export BASE_URL=http://your-sglang-address:your-sglang-port/v1
export API_KEY=your_api_key # Optional, required if your endpoint needs authentication
```
### 4. **Launch the Application**
```bash
uv run main.py
```
### 5. **Access the Web Interface**
Open your browser and navigate to: `http://localhost:8080`
### 📝 Notes
- Ensure your LLM server is up and running before launching the demo
- The demo will use your local CPU/GPU for inference while leveraging external APIs for search and code execution
- Monitor your API usage through the respective provider dashboards
================================================
FILE: apps/gradio-demo/main.py
================================================
import asyncio
import json
import logging
import os
import threading
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import AsyncGenerator, List, Optional
import gradio as gr
from dotenv import load_dotenv
from hydra import compose, initialize_config_dir
from omegaconf import DictConfig
from prompt_patch import apply_prompt_patch
from src.config.settings import expose_sub_agents_as_tools
from src.core.pipeline import create_pipeline_components, execute_task_pipeline
from utils import replace_chinese_punctuation
# Apply custom system prompt patch (adds MiroThinker identity)
apply_prompt_patch()
# Create global cleanup thread pool for operations that won't be affected by
# asyncio task cancellation (two worker threads, names prefixed "cleanup")
cleanup_executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="cleanup")
# Module-level logger named after this module
logger = logging.getLogger(__name__)
# Set DEMO_MODE for simplified tool configuration
# (set before load_dotenv() runs, so it is already present in os.environ
# when the .env file is read)
os.environ["DEMO_MODE"] = "1"
# Load environment variables from .env file
load_dotenv()
# Global Hydra initialization flag; flipped to True by load_miroflow_config()
# after initialize_config_dir() succeeds
_hydra_initialized = False
def load_miroflow_config(config_overrides: Optional[dict] = None) -> DictConfig:
    """
    Load the full MiroFlow configuration using Hydra, similar to how benchmarks work.

    The configuration directory is ``../miroflow-agent/conf`` relative to this
    file's directory. Overrides are built from the environment variables
    ``DEFAULT_LLM_PROVIDER``, ``DEFAULT_MODEL_NAME``, ``DEFAULT_AGENT_SET``,
    ``BASE_URL`` and ``API_KEY``, followed by ``config_overrides``.

    Args:
        config_overrides: Optional extra overrides. A dict value is flattened
            one level into ``key.subkey=value``; any other value becomes
            ``key=value``.

    Returns:
        The composed Hydra configuration.

    Raises:
        FileNotFoundError: If the MiroFlow config directory does not exist.
        Exception: Whatever ``compose`` raises is logged and re-raised, so
            callers can report the failure instead of the process exiting.
    """
    global _hydra_initialized

    # Get the path to the miroflow agent config directory
    miroflow_config_dir = Path(__file__).parent.parent / "miroflow-agent" / "conf"
    miroflow_config_dir = miroflow_config_dir.resolve()
    logger.debug(f"Config dir: {miroflow_config_dir}")
    if not miroflow_config_dir.exists():
        raise FileNotFoundError(
            f"MiroFlow config directory not found: {miroflow_config_dir}"
        )

    # Initialize Hydra if not already done
    if not _hydra_initialized:
        try:
            initialize_config_dir(
                config_dir=str(miroflow_config_dir), version_base=None
            )
            _hydra_initialized = True
        except Exception as e:
            logger.warning(f"Hydra already initialized or error: {e}")

    # Compose configuration with environment variable overrides
    overrides = []

    # Add environment variable based overrides (refer to scripts/debug.sh)
    llm_provider = os.getenv(
        "DEFAULT_LLM_PROVIDER", "qwen"
    )  # debug.sh defaults to qwen
    model_name = os.getenv(
        "DEFAULT_MODEL_NAME", "MiroThinker"
    )  # debug.sh default model
    agent_set = os.getenv("DEFAULT_AGENT_SET", "demo")  # Use demo config
    base_url = os.getenv("BASE_URL", "http://localhost:11434")
    api_key = os.getenv("API_KEY", "")  # API key for LLM endpoint
    logger.debug(f"LLM base_url: {base_url}")

    # Map provider names to config files
    # Available configs: default.yaml, claude-3-7.yaml, gpt-5.yaml, qwen-3.yaml
    provider_config_map = {
        "anthropic": "claude-3-7",
        "openai": "gpt-5",
        "qwen": "qwen-3",
    }
    llm_config = provider_config_map.get(
        llm_provider, "qwen-3"
    )  # fallback to qwen-3 config
    overrides.extend(
        [
            f"llm={llm_config}",
            f"llm.provider={llm_provider}",
            f"llm.model_name={model_name}",
            f"llm.base_url={base_url}",
            f"llm.api_key={api_key}",
            f"agent={agent_set}",
            "agent.main_agent.max_turns=50",  # Limit max turns for gradio demo
            "benchmark=gaia-validation",  # refer to debug.sh
        ]
    )

    # Add config overrides from request
    if config_overrides:
        for key, value in config_overrides.items():
            if isinstance(value, dict):
                for subkey, subvalue in value.items():
                    overrides.append(f"{key}.{subkey}={subvalue}")
            else:
                overrides.append(f"{key}={value}")

    try:
        return compose(config_name="config", overrides=overrides)
    except Exception as e:
        logger.error(f"Failed to compose Hydra config: {e}")
        # Re-raise instead of calling exit(): exit() raises SystemExit with
        # status 0, which slips past ``except Exception`` handlers in the
        # callers and hides the failure from the user.
        raise
# Lazy loading for tool definitions to speed up page load
# Tools will be loaded on first request instead of blocking startup
# Every slot starts empty and is filled by _ensure_preloaded(); "loaded" is
# the flag that function checks before doing any work.
_preload_cache = {
    "cfg": None,
    "main_agent_tool_manager": None,
    "sub_agent_tool_managers": None,
    "output_formatter": None,
    "tool_definitions": None,
    "sub_agent_tool_definitions": None,
    "loaded": False,
}
# Held by _ensure_preloaded() while it populates _preload_cache
_preload_lock = threading.Lock()
def _ensure_preloaded():
    """Populate ``_preload_cache`` once, on the first call that needs it.

    The ``loaded`` flag is checked before and again after taking
    ``_preload_lock``; when it is already set the function returns without
    doing anything. Otherwise the configuration, pipeline components and tool
    definitions are created and stored, and the flag is set last.
    """
    if _preload_cache["loaded"]:
        return
    with _preload_lock:
        # Re-check under the lock: another thread may have finished loading.
        if _preload_cache["loaded"]:
            return

        logger.info("Loading pipeline components (first request)...")
        cfg = load_miroflow_config(None)
        components = create_pipeline_components(cfg)
        main_agent_tool_manager, sub_agent_tool_managers, output_formatter = components

        tool_definitions = asyncio.run(
            main_agent_tool_manager.get_all_tool_definitions()
        )
        if cfg.agent.sub_agents:
            tool_definitions += expose_sub_agents_as_tools(cfg.agent.sub_agents)

        sub_agent_tool_definitions = {}
        for name, sub_agent_tool_manager in sub_agent_tool_managers.items():
            sub_agent_tool_definitions[name] = asyncio.run(
                sub_agent_tool_manager.get_all_tool_definitions()
            )

        _preload_cache.update(
            {
                "cfg": cfg,
                "main_agent_tool_manager": main_agent_tool_manager,
                "sub_agent_tool_managers": sub_agent_tool_managers,
                "output_formatter": output_formatter,
                "tool_definitions": tool_definitions,
                "sub_agent_tool_definitions": sub_agent_tool_definitions,
            }
        )
        # Set the flag only after every other slot has been filled.
        _preload_cache["loaded"] = True
        logger.info("Pipeline components loaded successfully.")
class ThreadSafeAsyncQueue:
    """Wrapper around ``asyncio.Queue`` that can also be fed from other threads.

    Items put from another thread are handed to the event loop registered via
    ``set_loop``. Once ``close`` has been called, further puts are ignored.
    """

    def __init__(self):
        self._queue = asyncio.Queue()
        self._loop = None
        self._closed = False

    def set_loop(self, loop):
        """Register the event loop used for cross-thread puts."""
        self._loop = loop

    async def put(self, item):
        """Enqueue ``item`` from a coroutine; ignored after ``close``."""
        if not self._closed:
            await self._queue.put(item)

    def put_nowait_threadsafe(self, item):
        """Enqueue ``item`` from another thread.

        Does nothing when the queue is closed or no loop has been registered.
        """
        if self._closed:
            return
        if not self._loop:
            return
        # Schedule put_nowait directly on the loop rather than creating a
        # task, for lower latency.
        self._loop.call_soon_threadsafe(self._queue.put_nowait, item)

    async def get(self):
        """Wait for and return the next item."""
        return await self._queue.get()

    def close(self):
        """Mark the queue closed so later puts are dropped."""
        self._closed = True
def filter_google_search_organic(organic: List[dict]) -> List[dict]:
    """
    Reduce google search organic results to their title and link.

    Each returned entry has exactly the keys "title" and "link"; a key that is
    missing from the source entry is filled with an empty string.
    """
    return [
        {"title": entry.get("title", ""), "link": entry.get("link", "")}
        for entry in organic
    ]
def is_scrape_error(result: str) -> bool:
    """
    Tell whether a scrape result is an error.

    A result counts as an error when it cannot be parsed as JSON.
    """
    try:
        json.loads(result)
    except json.JSONDecodeError:
        return True
    else:
        return False
def filter_message(message: dict) -> dict:
    """
    Filter message to remove unnecessary information

    Only ``tool_call`` events whose ``tool_input`` is a dict with a
    ``"result"`` entry are touched:

    - ``google_search``: when the result is a JSON object with an
      ``"organic"`` list, the result is replaced by JSON that keeps only the
      filtered organic entries. A result that is not JSON (for example a
      plain-text error) or not a JSON object is left unchanged.
    - ``scrape`` / ``scrape_website``: the whole ``tool_input`` is replaced
      by ``{"error": <result>}`` when the result is an error, otherwise by an
      empty dict.

    The message is modified in place and returned.
    """
    if message["event"] != "tool_call":
        return message

    data = message["data"]
    tool_name = data.get("tool_name")
    tool_input = data.get("tool_input")
    if not (isinstance(tool_input, dict) and "result" in tool_input):
        return message

    raw_result = tool_input["result"]
    if tool_name == "google_search":
        try:
            result_dict = json.loads(raw_result)
        except (TypeError, ValueError):
            # Not JSON (e.g. an error string): pass it through untouched
            # rather than failing the whole event.
            return message
        if isinstance(result_dict, dict) and "organic" in result_dict:
            new_result = {
                "organic": filter_google_search_organic(result_dict["organic"])
            }
            tool_input["result"] = json.dumps(new_result, ensure_ascii=False)
    elif tool_name in ("scrape", "scrape_website"):
        # if error, it can not be json
        if is_scrape_error(raw_result):
            data["tool_input"] = {"error": raw_result}
        else:
            data["tool_input"] = {}
    return message
async def stream_events_optimized(
    task_id: str, query: str, _: Optional[dict] = None, disconnect_check=None
) -> AsyncGenerator[dict, None]:
    """Optimized event stream generator that directly outputs structured events, no longer wrapped as SSE strings.

    The task pipeline runs in a worker thread with its own event loop and
    pushes (filtered) events into a thread-safe queue; this generator drains
    that queue and yields each event.

    Args:
        task_id: Identifier reported as ``workflow_id`` in generated events
            and passed to the pipeline as its task id.
        query: Task description handed to the pipeline.
        _: Unused.
        disconnect_check: Optional awaitable callable; when it returns a
            truthy value the pipeline is cancelled and streaming stops.

    Yields:
        Event dicts from the pipeline, plus ``heartbeat`` events while the
        stream is idle and an ``error`` event if streaming itself fails.
    """
    workflow_id = task_id
    last_send_time = time.time()
    last_heartbeat_time = time.time()
    # Create thread-safe queue
    stream_queue = ThreadSafeAsyncQueue()
    stream_queue.set_loop(asyncio.get_event_loop())
    # Set by the consumer side to ask the worker thread to stop
    cancel_event = threading.Event()
    def run_pipeline_in_thread():
        # Worker-thread body: runs the pipeline on a private event loop
        try:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            class ThreadQueueWrapper:
                # Queue-like object handed to the pipeline; forwards filtered
                # items to the consumer's queue unless cancellation was requested
                def __init__(self, thread_queue, cancel_event):
                    self.thread_queue = thread_queue
                    self.cancel_event = cancel_event
                async def put(self, item):
                    if self.cancel_event.is_set():
                        logger.info("Pipeline cancelled, stopping execution")
                        return
                    self.thread_queue.put_nowait_threadsafe(filter_message(item))
            wrapper_queue = ThreadQueueWrapper(stream_queue, cancel_event)
            # Ensure pipeline components are loaded (lazy loading)
            _ensure_preloaded()
            async def pipeline_with_cancellation():
                pipeline_task = asyncio.create_task(
                    execute_task_pipeline(
                        cfg=_preload_cache["cfg"],
                        task_id=workflow_id,
                        task_description=query,
                        task_file_name=None,
                        main_agent_tool_manager=_preload_cache[
                            "main_agent_tool_manager"
                        ],
                        sub_agent_tool_managers=_preload_cache[
                            "sub_agent_tool_managers"
                        ],
                        output_formatter=_preload_cache["output_formatter"],
                        stream_queue=wrapper_queue,
                        log_dir=os.getenv("LOG_DIR", "logs/api-server"),
                        tool_definitions=_preload_cache["tool_definitions"],
                        sub_agent_tool_definitions=_preload_cache[
                            "sub_agent_tool_definitions"
                        ],
                    )
                )
                async def check_cancellation():
                    # Poll the cancel flag every 0.5 s, then cancel the pipeline
                    while not cancel_event.is_set():
                        await asyncio.sleep(0.5)
                    logger.info("Cancel event detected, cancelling pipeline")
                    pipeline_task.cancel()
                cancel_task = asyncio.create_task(check_cancellation())
                try:
                    # Wait until either the pipeline finishes or the watcher fires
                    done, pending = await asyncio.wait(
                        [pipeline_task, cancel_task],
                        return_when=asyncio.FIRST_COMPLETED,
                    )
                    for task in pending:
                        task.cancel()
                    for task in done:
                        if task == pipeline_task:
                            try:
                                # Re-await so an exception raised by the pipeline surfaces here
                                await task
                            except asyncio.CancelledError:
                                logger.info("Pipeline task was cancelled")
                except Exception as e:
                    logger.error(f"Pipeline execution error: {e}")
                    pipeline_task.cancel()
                    cancel_task.cancel()
            loop.run_until_complete(pipeline_with_cancellation())
        except Exception as e:
            # Errors are only reported when the run was not cancelled
            if not cancel_event.is_set():
                logger.error(f"Pipeline error: {e}", exc_info=True)
                stream_queue.put_nowait_threadsafe(
                    {
                        "event": "error",
                        "data": {"error": str(e), "workflow_id": workflow_id},
                    }
                )
        finally:
            # None is the end-of-stream sentinel for the consumer loop below
            stream_queue.put_nowait_threadsafe(None)
            # "loop" is unbound if creating the event loop itself failed
            if "loop" in locals():
                loop.close()
    executor = ThreadPoolExecutor(max_workers=1)
    future = executor.submit(run_pipeline_in_thread)
    try:
        while True:
            try:
                if disconnect_check and await disconnect_check():
                    logger.info("Client disconnected, stopping pipeline")
                    cancel_event.set()
                    break
                # Short timeout so the disconnect check above runs regularly
                message = await asyncio.wait_for(stream_queue.get(), timeout=0.1)
                if message is None:
                    logger.info("Pipeline completed")
                    break
                yield message
                last_send_time = time.time()
            except asyncio.TimeoutError:
                current_time = time.time()
                # Give up after 300 s without any pipeline message
                if current_time - last_send_time > 300:
                    logger.info("Stream timeout")
                    break
                if future.done():
                    # Worker finished: drain what is left without waiting
                    try:
                        message = stream_queue._queue.get_nowait()
                        if message is not None:
                            yield message
                            continue
                    except Exception:
                        break
                # Emit a heartbeat at most once every 15 s while idle
                if current_time - last_heartbeat_time >= 15:
                    yield {
                        "event": "heartbeat",
                        "data": {"timestamp": current_time, "workflow_id": workflow_id},
                    }
                    last_heartbeat_time = current_time
    except Exception as e:
        logger.error(f"Stream error: {e}", exc_info=True)
        yield {
            "event": "error",
            "data": {"workflow_id": workflow_id, "error": f"Stream error: {str(e)}"},
        }
    finally:
        # Always signal cancellation and stop accepting queue items on exit
        cancel_event.set()
        stream_queue.close()
        try:
            # Give the worker up to 1 s to finish; failures here are ignored
            future.result(timeout=1.0)
        except Exception:
            pass
        executor.shutdown(wait=False)
# ========================= Gradio Integration =========================
def _init_render_state():
return {
"agent_order": [],
"agents": {}, # agent_id -> {"agent_name": str, "tool_call_order": [], "tools": {tool_call_id: {...}}}
"current_agent_id": None,
"errors": [],
}
def _format_think_content(text: str) -> str:
"""Convert tags to readable markdown format."""
import re
# Replace tags with blockquote format (no label)
text = re.sub(r"\s*", "\n> ", text)
text = re.sub(r"\s*", "\n", text)
# Convert newlines within thinking to blockquote continuation
lines = text.split("\n")
result = []
in_thinking = False
for line in lines:
if line.strip().startswith(">") and not in_thinking:
in_thinking = True
result.append(line)
elif in_thinking and line.strip() and not line.startswith(">"):
result.append(f"> {line}")
else:
if line.strip() == "" and in_thinking:
in_thinking = False
result.append(line)
return "\n".join(result)
def _append_show_text(tool_entry: dict, delta: str):
existing = tool_entry.get("content", "")
# Skip "Final boxed answer" content (already shown in main response)
if "Final boxed answer" in delta:
return
# Format think tags for display
formatted_delta = _format_think_content(delta)
tool_entry["content"] = existing + formatted_delta
def _is_empty_payload(value) -> bool:
if value is None:
return True
if isinstance(value, str):
stripped = value.strip()
return stripped == "" or stripped in ("{}", "[]")
if isinstance(value, (dict, list, tuple, set)):
return len(value) == 0
return False
def _format_search_results(tool_input: dict, tool_output: dict) -> str:
    """Format google_search results in a beautiful card layout.

    Args:
        tool_input: Tool call input; the query is read from "q" or "query".
        tool_output: Tool call output; results are read from its "result"
            entry (a JSON string or a dict) or from a top-level "organic" key.

    Returns:
        The rendered card as one newline-joined string, or "" when there are
        neither results nor a query.
    """
    lines = []
    # Get search query from input
    query = ""
    if isinstance(tool_input, dict):
        query = tool_input.get("q", "") or tool_input.get("query", "")
    # Parse results from output - handle multiple formats
    results = []
    if isinstance(tool_output, dict):
        # Case 1: output has "result" field containing JSON string
        result_str = tool_output.get("result", "")
        if isinstance(result_str, str) and result_str.strip():
            try:
                result_data = json.loads(result_str)
                if isinstance(result_data, dict):
                    results = result_data.get("organic", [])
            except json.JSONDecodeError:
                # Unparseable result text: fall through with no results
                pass
        elif isinstance(result_str, dict):
            results = result_str.get("organic", [])
        # Case 2: output directly contains "organic" field
        if not results and "organic" in tool_output:
            results = tool_output.get("organic", [])
    if not results and not query:
        return ""
    # Build the card
    lines.append('
')
    for item in results[:10]: # Limit to 10 results
        title = item.get("title", "Untitled")
        link = item.get("link", "#")
        lines.append(f"""🌐{title}""")
    lines.append("
")
    lines.append("
")
    return "\n".join(lines)
def _format_sogou_search_results(tool_input: dict, tool_output: dict) -> str:
    """Format sogou_search results in a beautiful card layout.

    Args:
        tool_input: Tool call input; the query is read from "q" or "query".
        tool_output: Tool call output; results are read from its "result"
            entry (a JSON string or a dict) or from a top-level "Pages" key.

    Returns:
        The rendered card as one newline-joined string, or "" when there are
        neither results nor a query.
    """
    lines = []
    # Get search query from input
    query = ""
    if isinstance(tool_input, dict):
        query = tool_input.get("q", "") or tool_input.get("query", "")
    # Parse results from output - sogou uses "Pages" instead of "organic"
    results = []
    if isinstance(tool_output, dict):
        result_str = tool_output.get("result", "")
        if isinstance(result_str, str) and result_str.strip():
            try:
                result_data = json.loads(result_str)
                if isinstance(result_data, dict):
                    results = result_data.get("Pages", [])
            except json.JSONDecodeError:
                # Unparseable result text: fall through with no results
                pass
        elif isinstance(result_str, dict):
            results = result_str.get("Pages", [])
        # Fallback: output directly contains a "Pages" field
        if not results and "Pages" in tool_output:
            results = tool_output.get("Pages", [])
    if not results and not query:
        return ""
    # Build the card
    lines.append('
')
    for item in results[:10]: # Limit to 10 results
        title = item.get("title", "Untitled")
        # Prefer "url" and fall back to "link"
        link = item.get("url", item.get("link", "#"))
        lines.append(f"""🌐{title}""")
    lines.append("
")
    lines.append("
")
    return "\n".join(lines)
def _format_scrape_results(tool_input: dict, tool_output: dict) -> str:
    """Format scrape/webpage results in a card layout.

    Args:
        tool_input: Tool call input; the page address is read from "url",
            falling back to "link".
        tool_output: Tool call output; an "error" key marks a failed scrape.

    Returns:
        The rendered card as one newline-joined string.
    """
    lines = []
    # Get URL
    url = ""
    if isinstance(tool_input, dict):
        url = tool_input.get("url", tool_input.get("link", ""))
    # Check for error
    if isinstance(tool_output, dict) and "error" in tool_output:
        lines.append('
")
    return "\n".join(lines)
def _render_markdown(state: dict) -> str:
    """Render the accumulated render state as one markdown/HTML string.

    Errors are rendered first, then every agent's tool calls in recorded
    order. Text from the agent named "Final Summary" is collected separately
    and appended last under a "Research Summary" heading.

    Args:
        state: Render state with "errors", "agent_order" and "agents".

    Returns:
        The rendered text, or a waiting placeholder when nothing was rendered.
    """
    lines = []
    final_summary_lines = [] # Collect final summary content separately
    # Render errors first if any
    if state.get("errors"):
        for err in state["errors"]:
            lines.append(f'
❌ {err}
')
    # Render all agents' content
    for agent_id in state.get("agent_order", []):
        agent = state["agents"].get(agent_id, {})
        agent_name = agent.get("agent_name", "")
        is_final_summary = agent_name == "Final Summary"
        for call_id in agent.get("tool_call_order", []):
            call = agent["tools"].get(call_id, {})
            tool_name = call.get("tool_name", "unknown_tool")
            # Show text / message - display directly
            if tool_name in ("show_text", "message"):
                content = call.get("content", "")
                if content:
                    if is_final_summary:
                        final_summary_lines.append(content)
                    else:
                        lines.append(content)
                continue
            tool_input = call.get("input", {})
            tool_output = call.get("output", {})
            has_input = not _is_empty_payload(tool_input)
            has_output = not _is_empty_payload(tool_output)
            # Special formatting for google_search
            if tool_name == "google_search" and (has_input or has_output):
                formatted = _format_search_results(tool_input, tool_output)
                if formatted:
                    lines.append(formatted)
                continue
            # Special formatting for sogou_search
            if tool_name == "sogou_search" and (has_input or has_output):
                formatted = _format_sogou_search_results(tool_input, tool_output)
                if formatted:
                    lines.append(formatted)
                continue
            # Special formatting for scrape/webpage tools
            if tool_name in (
                "scrape",
                "scrape_website",
                "scrape_webpage",
                "scrape_and_extract_info",
            ) and (has_input or has_output):
                formatted = _format_scrape_results(tool_input, tool_output)
                if formatted:
                    lines.append(formatted)
                continue
            # Special formatting for code execution tools
            if tool_name in ("python", "run_python_code") and (has_input or has_output):
                # Use pure Markdown to avoid HTML wrapper blocking Markdown rendering
                lines.append("\n---\n")
                lines.append("#### 💻 Code Execution\n")
                # Show code input - try multiple possible keys
                code = ""
                if isinstance(tool_input, dict):
                    code = tool_input.get("code") or tool_input.get("code_block") or ""
                elif isinstance(tool_input, str):
                    code = tool_input
                if code:
                    lines.append(f"\n```python\n{code}\n```\n")
                # Show output if available
                if has_output:
                    output = ""
                    if isinstance(tool_output, dict):
                        output = (
                            tool_output.get("result")
                            or tool_output.get("output")
                            or tool_output.get("stdout")
                            or ""
                        )
                    elif isinstance(tool_output, str):
                        output = tool_output
                    if isinstance(output, str) and output.strip():
                        lines.append("\n**Output:**\n")
                        # Output is truncated to its first 1000 characters
                        lines.append(
                            f'\n```text\n{output[:1000]}{"..." if len(output) > 1000 else ""}\n```\n'
                        )
                    lines.append("\n✅ Executed\n")
                continue
            # Other tools - show as compact card
            if has_input or has_output:
                target_lines = final_summary_lines if is_final_summary else lines
                target_lines.append('
')
                target_lines.append(f'
🔧 {tool_name}
')
                if has_input:
                    # Show brief input summary
                    if isinstance(tool_input, dict):
                        # First two items only; values longer than 30 characters are cut
                        brief = ", ".join(
                            f"{k}: {str(v)[:30]}..."
                            if len(str(v)) > 30
                            else f"{k}: {v}"
                            for k, v in list(tool_input.items())[:2]
                        )
                        target_lines.append(f'
{brief}
')
                if has_output:
                    target_lines.append('
✓ Done
')
                target_lines.append("
")
    # Add final summary with Markdown-based styling (no HTML wrapper to preserve Markdown rendering)
    if final_summary_lines:
        lines.append("\n\n---\n\n") # Markdown horizontal rule as divider
        lines.append("## 📋 Research Summary\n\n")
        lines.extend(final_summary_lines)
    return "\n".join(lines) if lines else "*Waiting to start research...*"
def _update_state_with_event(state: dict, message: dict):
event = message.get("event")
data = message.get("data", {})
if event == "start_of_agent":
agent_id = data.get("agent_id")
agent_name = data.get("agent_name", "unknown")
if agent_id and agent_id not in state["agents"]:
state["agents"][agent_id] = {
"agent_name": agent_name,
"tool_call_order": [],
"tools": {},
}
state["agent_order"].append(agent_id)
state["current_agent_id"] = agent_id
elif event == "end_of_agent":
# End marker, no special handling needed, keep structure
state["current_agent_id"] = None
elif event == "tool_call":
tool_call_id = data.get("tool_call_id")
tool_name = data.get("tool_name", "unknown_tool")
agent_id = state.get("current_agent_id") or (
state["agent_order"][-1] if state["agent_order"] else None
)
if not agent_id:
return state
agent = state["agents"].setdefault(
agent_id, {"agent_name": "unknown", "tool_call_order": [], "tools": {}}
)
tools = agent["tools"]
if tool_call_id not in tools:
tools[tool_call_id] = {"tool_name": tool_name}
agent["tool_call_order"].append(tool_call_id)
entry = tools[tool_call_id]
if tool_name == "show_text" and "delta_input" in data:
delta = data.get("delta_input", {}).get("text", "")
_append_show_text(entry, delta)
elif tool_name == "show_text" and "tool_input" in data:
ti = data.get("tool_input")
text = ""
if isinstance(ti, dict):
text = ti.get("text", "") or (
(ti.get("result") or {}).get("text")
if isinstance(ti.get("result"), dict)
else ""
)
elif isinstance(ti, str):
text = ti
if text:
_append_show_text(entry, text)
else:
# Distinguish between input and output:
if "tool_input" in data:
# Could be input (first time) or output with result (second time)
ti = data["tool_input"]
# If contains result, assign to output; otherwise assign to input
if isinstance(ti, dict) and "result" in ti:
entry["output"] = ti
else:
# Only update input if we don't already have valid input data, or if the new data is not empty
if "input" not in entry or not _is_empty_payload(ti):
entry["input"] = ti
elif event == "message":
# Same incremental text display as show_text, aggregated by message_id
message_id = data.get("message_id")
agent_id = state.get("current_agent_id") or (
state["agent_order"][-1] if state["agent_order"] else None
)
if not agent_id:
return state
agent = state["agents"].setdefault(
agent_id, {"agent_name": "unknown", "tool_call_order": [], "tools": {}}
)
tools = agent["tools"]
if message_id not in tools:
tools[message_id] = {"tool_name": "message"}
agent["tool_call_order"].append(message_id)
entry = tools[message_id]
delta_content = (data.get("delta") or {}).get("content", "")
if isinstance(delta_content, str) and delta_content:
_append_show_text(entry, delta_content)
elif event == "error":
# Collect errors, display uniformly during rendering
err_text = data.get("error") if isinstance(data, dict) else None
if not err_text:
try:
err_text = json.dumps(data, ensure_ascii=False)
except Exception:
err_text = str(data)
state.setdefault("errors", []).append(err_text)
else:
# Ignore heartbeat or other events
pass
return state
# Per-task cancellation flags keyed by task id (True once a cancel was set).
_CANCEL_FLAGS = {}
# Lock held around every read and write of _CANCEL_FLAGS.
_CANCEL_LOCK = threading.Lock()
def _set_cancel_flag(task_id: str):
    """Record a cancellation request for ``task_id``."""
    with _CANCEL_LOCK:
        _CANCEL_FLAGS.update({task_id: True})
def _reset_cancel_flag(task_id: str):
    """Clear the cancellation request for ``task_id`` (the entry is kept as False)."""
    with _CANCEL_LOCK:
        _CANCEL_FLAGS.update({task_id: False})
async def _disconnect_check_for_task(task_id: str):
    """Return whether a cancel flag is set for ``task_id`` (False if unknown)."""
    with _CANCEL_LOCK:
        is_cancelled = _CANCEL_FLAGS.get(task_id, False)
    return is_cancelled
def _spinner_markup(running: bool) -> str:
if not running:
return ""
return (
'\n\n
Don't just chat. Predict, verify, and discover with science-based AI.
""")
# Input Section
with gr.Column(elem_id="input-section"):
inp = gr.Textbox(
lines=4,
placeholder="Enter your research question...",
show_label=False,
elem_id="question-input",
)
with gr.Row(elem_id="btn-row"):
stop_btn = gr.Button(
"⏹ Stop",
elem_id="stop-btn",
variant="stop",
interactive=False,
scale=1,
)
run_btn = gr.Button(
"Start Research ➤", elem_id="run-btn", variant="primary", scale=2
)
# Output Section
with gr.Column(elem_id="output-section"):
gr.HTML('
Research Progress
')
out_md = gr.Markdown("*Waiting to start research...*", elem_id="log-view")
# State
ui_state = gr.State({"task_id": None})
# Event handlers
run_btn.click(
fn=gradio_run,
inputs=[inp, ui_state],
outputs=[out_md, run_btn, stop_btn, ui_state],
)
stop_btn.click(fn=stop_current, inputs=[ui_state], outputs=[run_btn, stop_btn])
# Footer
gr.HTML("""
""")
return demo
# Script entry point: build the UI and serve it.
if __name__ == "__main__":
    demo = build_demo()
    # Bind address and port come from the HOST / PORT environment variables,
    # defaulting to all interfaces on port 8080.
    host = os.getenv("HOST", "0.0.0.0")
    port = int(os.getenv("PORT", "8080"))
    demo.queue().launch(server_name=host, server_port=port)
================================================
FILE: apps/gradio-demo/prompt_patch.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
Custom Prompt Override (Monkey Patching)
This module allows customizing prompts without modifying miroflow-agent code.
Patches applied:
1. `generate_mcp_system_prompt` - Prepends custom identity prompt
2. `process_input` - Removes the boxed format requirement suffix
3. `generate_agent_summarize_prompt` - Uses user-friendly summary prompt for demo
4. `format_final_summary_and_log` - Disables boxed format check to prevent retry
Usage:
from prompt_patch import apply_prompt_patch
apply_prompt_patch()
"""
import re
# ============================================================================
# Custom Identity Prompt
# ============================================================================
# Text prepended to the generated MCP system prompt by the patched generator
# installed in _patch_system_prompt().
CUSTOM_IDENTITY_PROMPT = """You are MiroThinker, a specialized deep research AI assistant developed by MiroMind.
IMPORTANT IDENTITY REMINDER:
- You are NOT ChatGPT, Claude, or any other AI assistant
"""
# ============================================================================
# Strings to Remove from Input Processing
# ============================================================================
# This string is appended to task descriptions in input_handler.py.
# Demo mode does not need the strict boxed format, so the patched
# process_input removes every occurrence of it from both returned strings.
BOXED_FORMAT_SUFFIX = "\nYou should follow the format instruction in the request strictly and wrap the final answer in \\boxed{}."
# ============================================================================
# Custom Summarize Prompt for Demo Mode
# ============================================================================
def get_demo_summarize_prompt(target_language: str, task_description: str) -> str:
    """
    Build the final-summary prompt used by the demo.

    The prompt asks for a Markdown answer written in ``target_language`` and
    repeats the user's question at the end for reference; it does not ask for
    a boxed answer.

    Args:
        target_language: Language name interpolated into the requirements
        task_description: The user's original question, appended verbatim

    Returns:
        The fully rendered prompt text
    """
    demo_prompt = f"""Please provide the final research summary based only on the information already gathered.
No further tool calls are allowed.
## Requirements
- **Language**: Write the entire response in **{target_language}**.
- **Focus**: Directly answer the original question above. Do not just summarize gathered information — provide a clear, actionable answer.
- **Response Length**: Match the complexity of your response to the question. For simple or short questions, provide a concise and direct answer without unnecessary elaboration. For complex questions, provide a detailed and structured report.
- Use clear and structured Markdown formatting when appropriate.
- Use appropriate Markdown headings (e.g., #, ##, ###) only when the content warrants structure.
- Present key findings in an organized, concise, and readable way.
- Use tables only when they genuinely improve clarity.
- **Currency Format**: Use `\\$` instead of `$` for currency amounts (e.g., `\\$100`, `\\$1,000`) to avoid conflicts with inline math syntax.
- **Citation Format**:
- **In-Text**: Use the format `[ID]`, where `ID` is a **numeric identifier only** (digits 0–9), e.g. `[1]`, `[2]`.
- **References Section(if has any sources)**: At the very end, add "References" (or equivalent in {target_language}). Format: [ID] TITLE/SECTION_TITLE. /.
- Do NOT mention tools, tool calls, or internal reasoning steps.
- Focus solely on delivering a professional, easy-to-read response that answers the user's original question.
## Original Question (for reference)
{task_description}"""
    return demo_prompt
def _detect_language(text: str) -> str:
"""
Simple language detection based on character analysis.
Returns a language description suitable for the summarize prompt.
"""
# Count characters by script
chinese_chars = sum(1 for c in text if "\u4e00" <= c <= "\u9fff")
japanese_chars = sum(
1 for c in text if "\u3040" <= c <= "\u30ff" or "\u31f0" <= c <= "\u31ff"
)
korean_chars = sum(1 for c in text if "\uac00" <= c <= "\ud7af")
total_chars = len(text.replace(" ", ""))
if total_chars == 0:
return "English"
# Determine primary language
if chinese_chars / total_chars > 0.1:
return "Chinese (Simplified)"
elif japanese_chars / total_chars > 0.1:
return "Japanese"
elif korean_chars / total_chars > 0.1:
return "Korean"
else:
return "the same language as the user's question"
# ============================================================================
# Monkey Patching
# ============================================================================
# Set to True by apply_prompt_patch() once all patches are installed, so that
# later calls return immediately.
_patched = False
def apply_prompt_patch():
    """
    Install the demo-mode monkey patches, once per process.

    Patches, in the order they are applied:
    1. `generate_mcp_system_prompt` - custom identity prompt is prepended
    2. `process_input` - boxed format requirement is stripped from the task text
    3. `generate_agent_summarize_prompt` - user-friendly summary prompt
    4. `format_final_summary_and_log` - boxed format check disabled (no retry)

    Repeated calls are no-ops after the first successful run.
    """
    global _patched
    if not _patched:
        installers = (
            _patch_system_prompt,
            _patch_input_handler,
            _patch_summarize_prompt,
            _patch_output_formatter,
        )
        for install in installers:
            install()
        _patched = True
def _patch_system_prompt():
    """Wrap system prompt generation so the custom identity text comes first."""
    from src.llm.providers import anthropic_client, openai_client
    from src.utils import prompt_utils

    # Keep a handle on the upstream generator before it is replaced.
    upstream_generate = prompt_utils.generate_mcp_system_prompt

    def patched_generate_mcp_system_prompt(date, mcp_servers):
        """Return the upstream system prompt with the identity prompt prepended."""
        return CUSTOM_IDENTITY_PROMPT + upstream_generate(date, mcp_servers)

    # Rebind the name in every module that holds its own reference to it.
    for target_module in (prompt_utils, openai_client, anthropic_client):
        target_module.generate_mcp_system_prompt = patched_generate_mcp_system_prompt
def _patch_input_handler():
    """Wrap ``process_input`` so the boxed-format suffix is stripped from its results."""
    from src.core import orchestrator
    from src.io import input_handler

    # Keep a handle on the upstream implementation before it is replaced.
    upstream_process_input = input_handler.process_input

    def patched_process_input(task_description: str, task_file_name: str):
        """Call the upstream function and drop the boxed-format suffix from both results."""
        first, second = upstream_process_input(task_description, task_file_name)
        return (
            first.replace(BOXED_FORMAT_SUFFIX, ""),
            second.replace(BOXED_FORMAT_SUFFIX, ""),
        )

    # Rebind in the defining module and in the orchestrator, which imports it.
    for target_module in (input_handler, orchestrator):
        target_module.process_input = patched_process_input
def _patch_summarize_prompt():
    """Replace summarize-prompt generation with a demo-oriented version."""
    from src.core import answer_generator, orchestrator
    from src.utils import prompt_utils

    browsing_agent_types = ("agent-browsing", "browsing-agent")

    def patched_generate_agent_summarize_prompt(
        task_description: str, agent_type: str = ""
    ) -> str:
        """
        Build the summarize prompt for the given agent type.

        The main agent gets the Markdown-friendly demo prompt, written in the
        language detected from the task description. Browsing sub-agents get
        a report-style prompt. Any other agent type raises ValueError.
        """
        if agent_type == "main":
            # Detect language from task description
            detected_language = _detect_language(task_description)
            return get_demo_summarize_prompt(detected_language, task_description)

        if agent_type not in browsing_agent_types:
            raise ValueError(f"Unknown agent type: {agent_type}")

        # Sub-agent prompt
        browsing_prompt = (
            "This is a direct instruction to you (the assistant), not the result of a tool call.\n\n"
            "We are now ending this session, and your conversation history will be deleted. "
            "You must NOT initiate any further tool use. This is your final opportunity to report "
            "*all* of the information gathered during the session.\n\n"
            "The original task is repeated here for reference:\n\n"
            f'"{task_description}"\n\n'
            "Summarize the above search and browsing history. Output the FINAL RESPONSE and detailed supporting information of the task given to you.\n\n"
            "If you found any useful facts, data, quotes, or answers directly relevant to the original task, include them clearly and completely.\n"
            "If you reached a conclusion or answer, include it as part of the response.\n"
            "If the task could not be fully answered, do NOT make up any content. Instead, return all partially relevant findings, "
            "Search results, quotes, and observations that might help a downstream agent solve the problem.\n"
            "If partial, conflicting, or inconclusive information was found, clearly indicate this in your response.\n\n"
            "Your final response should be a clear, complete, and structured report.\n"
            "Organize the content into logical sections with appropriate headings.\n"
            "Do NOT include any tool call instructions, speculative filler, or vague summaries.\n"
            "Focus on factual, specific, and well-organized information."
        )
        return browsing_prompt.strip()

    # Rebind the name in every module that holds its own reference to it.
    for target_module in (prompt_utils, orchestrator, answer_generator):
        target_module.generate_agent_summarize_prompt = (
            patched_generate_agent_summarize_prompt
        )
def _patch_output_formatter():
    r"""
    Patch output formatter to disable boxed format check.

    In demo mode, we don't require \boxed{} format, so we patch the
    format_final_summary_and_log method to always return a valid result
    instead of FORMAT_ERROR_MESSAGE, which would trigger retry logic.
    """
    from src.io import output_formatter

    # Get the OutputFormatter class
    OutputFormatter = output_formatter.OutputFormatter

    def patched_format_final_summary_and_log(self, final_answer_text: str, client=None):
        r"""
        Patched version that doesn't return FORMAT_ERROR_MESSAGE.

        Instead of checking for \boxed{} content, we use the entire answer
        (with thinking tags removed) as the result.

        Args:
            final_answer_text: Raw final answer produced by the model.
            client: Optional LLM client; when it exposes
                ``format_token_usage_summary`` its usage report is appended.

        Returns:
            Tuple of (full summary text, extracted result, token usage log
            string).
        """
        summary_lines = []
        summary_lines.append("\n" + "=" * 30 + " Final Answer " + "=" * 30)
        summary_lines.append(final_answer_text)

        # In demo mode, use the full answer text (minus thinking) as the result.
        # Remove <think>...</think> blocks for the extracted result; DOTALL
        # lets a block span several lines.
        boxed_result = re.sub(
            r"<think>.*?</think>", "", final_answer_text, flags=re.DOTALL
        ).strip()

        # If there's actual boxed content, extract it (for compatibility)
        actual_boxed = self._extract_boxed_content(final_answer_text)
        if actual_boxed:
            boxed_result = actual_boxed

        # Add extracted result section
        summary_lines.append("\n" + "-" * 20 + " Extracted Result " + "-" * 20)
        summary_lines.append(boxed_result if boxed_result else final_answer_text)

        # Token usage statistics and cost estimation
        if client and hasattr(client, "format_token_usage_summary"):
            token_summary_lines, log_string = client.format_token_usage_summary()
            summary_lines.extend(token_summary_lines)
        else:
            summary_lines.append("\n" + "-" * 20 + " Token Usage & Cost " + "-" * 20)
            summary_lines.append("Token usage information not available.")
            summary_lines.append("-" * (40 + len(" Token Usage & Cost ")))
            log_string = "Token usage information not available."

        # Return boxed_result (never FORMAT_ERROR_MESSAGE in demo mode)
        # This ensures no retry is triggered
        return (
            "\n".join(summary_lines),
            boxed_result or "Demo mode - no boxed format required",
            log_string,
        )

    # Apply patch
    OutputFormatter.format_final_summary_and_log = patched_format_final_summary_and_log
def get_custom_identity_prompt() -> str:
    """Expose the identity text that is prepended to the system prompt."""
    identity_prompt = CUSTOM_IDENTITY_PROMPT
    return identity_prompt
================================================
FILE: apps/gradio-demo/pyproject.toml
================================================
[project]
name = "gradio-demo"
version = "0.1.0"
description = "Gradio Demo"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"pydantic>=2.10.0",
"python-dotenv>=1.0.0",
"hydra-core>=1.3.0",
"miroflow-agent",
"aiohttp>=3.12.15",
"gradio>=5.42.0",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["./"]
[tool.uv.sources]
miroflow-agent = { path = "../miroflow-agent", editable = true }
[dependency-groups]
dev = [
"pytest>=8.4.1",
"pytest-asyncio>=1.0.0",
"httpx>=0.28.1",
]
================================================
FILE: apps/gradio-demo/utils.py
================================================
import re
def contains_chinese(text):
    """
    Report whether a string has at least one Chinese character or Chinese
    punctuation mark.

    Args:
        text (str): The string to inspect

    Returns:
        bool: True when any character falls in one of the ranges below
    """
    # Unicode ranges treated as Chinese:
    #   \u4e00-\u9fff  CJK Unified Ideographs
    #   \u3400-\u4dbf  CJK Extension A
    #   \uf900-\ufaff  CJK Compatibility Ideographs
    #   \u3000-\u303f  CJK Symbols and Punctuation
    #   \uff00-\uffef  Fullwidth ASCII, Fullwidth punctuation
    chinese_ranges = (
        r"[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff\u3000-\u303f\uff00-\uffef]"
    )
    first_hit = re.search(chinese_ranges, text)
    return first_hit is not None
def replace_chinese_punctuation(text):
    """
    Convert Chinese punctuation in ``text`` to ASCII equivalents.

    The two-character ellipsis "……" becomes "..." first; every remaining
    mark listed below is then mapped one-to-one.
    """
    # (Chinese mark, ASCII replacement) pairs for the per-character pass
    replacement_pairs = (
        ("，", ","),
        ("。", "."),
        ("！", "!"),
        ("？", "?"),
        ("；", ";"),
        ("：", ":"),
        ("“", '"'),
        ("”", '"'),
        ("‘", "'"),
        ("’", "'"),
        ("（", "("),
        ("）", ")"),
        ("【", "["),
        ("】", "]"),
        ("《", "<"),
        ("》", ">"),
        ("、", ","),
        ("—", "-"),
    )
    translation_table = {ord(mark): ascii_mark for mark, ascii_mark in replacement_pairs}
    # Multi-character punctuation has to go first; translate() works per character.
    without_ellipsis = text.replace("……", "...")
    return without_ellipsis.translate(translation_table)
================================================
FILE: apps/lobehub-compatibility/MiroThinkerToolParser.py
================================================
"""
Tool parser plugin for vLLM for MiroThinker MCP format to compatible with the tool calling interface of openai.
MCP format:
server nametool name
{...}
"""
import json
from collections.abc import Sequence
import json_repair
import regex as re
from vllm.entrypoints.chat_utils import make_tool_call_id
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
DeltaFunctionCall,
DeltaMessage,
DeltaToolCall,
ExtractedToolCallInformation,
FunctionCall,
ToolCall,
)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
ToolParserManager,
)
from vllm.logger import init_logger
logger = init_logger(__name__)
class MirothinkerToolParser(ToolParser):
    """Parse MiroThinker ``<use_mcp_tool>`` blocks into OpenAI-style tool calls.

    A tool call is written by the model as::

        <use_mcp_tool>
        <server_name>...</server_name>
        <tool_name>...</tool_name>
        <arguments>
        {...json...}
        </arguments>
        </use_mcp_tool>

    ``extract_tool_calls`` handles a complete response; the streaming variant
    runs a small text/tool state machine over the incoming deltas and only
    emits a tool call once its closing tag has arrived.
    """

    def __init__(self, tokenizer):
        """Initialise streaming state, the block delimiters and the regexes."""
        super().__init__(tokenizer)
        # State tracking for streaming
        self.current_tool_name_sent: bool = False
        self.prev_tool_call_arr: list[dict] = []
        self.current_tool_id: int = -1
        self.streamed_args_for_tool: list[str] = []
        self.buffer: str = ""  # Buffer for potential tool call tags
        self._resolved_tool_name_cache: dict[tuple[str, str], str] = {}
        # Correctness-first streaming state (incremental state machine)
        self._stream_mode: str = "text"  # "text" | "tool"
        self._text_token_prefix: str = ""  # possible prefix of <use_mcp_tool>
        self._tool_end_token_prefix: str = ""  # possible prefix of </use_mcp_tool>
        self._tool_block_buffer: str = (
            ""  # accumulates between <use_mcp_tool> and </use_mcp_tool>
        )
        self._stream_tool_call_ids: list[str] = []
        # Token definitions
        self.tool_call_start_token: str = "<use_mcp_tool>"
        self.tool_call_end_token: str = "</use_mcp_tool>"
        # Regex patterns
        self.tool_call_regex = re.compile(
            r"<use_mcp_tool>\s*"
            r"<server_name>(.*?)</server_name>\s*"
            r"<tool_name>(.*?)</tool_name>\s*"
            r"<arguments>\s*(.*?)\s*</arguments>\s*"
            r"</use_mcp_tool>",
            re.DOTALL,
        )
        # For streaming partial tool calls
        # IMPORTANT: Use GREEDY matching (.*) for arguments to capture all content
        # in streaming mode. We'll clean up the </arguments> tag in the code if present.
        # The outer (?:...)? makes the whole <arguments> section optional
        # The inner (.*) will match empty string if <arguments> exists but has no content yet
        self.partial_tool_regex = re.compile(
            r"<use_mcp_tool>\s*"
            r"(?:<server_name>(.*?)</server_name>\s*)?"
            r"(?:<tool_name>(.*?)</tool_name>\s*)?"
            r"(?:<arguments>(\s*.*))?",  # Move \s* inside capture group so empty match returns ""
            re.DOTALL,
        )
        # For correctness-first parsing on COMPLETE tool blocks only
        self._complete_tool_block_regex = re.compile(
            r"<use_mcp_tool>\s*"
            r"(?:<server_name>(.*?)</server_name>\s*)?"
            r"(?:<tool_name>(.*?)</tool_name>\s*)?"
            r"(?:<arguments>\s*(.*?)\s*(?:</arguments>\s*)?)?"
            r"</use_mcp_tool>",
            re.DOTALL,
        )

    def _resolve_tool_name(
        self, server_name: str, tool_name: str, request: ChatCompletionRequest
    ) -> str:
        """
        Map an MCP (server_name, tool_name) pair to a tool name from the request.

        ``tool_name`` is returned unchanged when ``server_name`` is empty or
        'default', or when the request carries no tools. Otherwise every
        request tool whose function name contains ``tool_name`` is a
        candidate: a single candidate wins outright, else the first candidate
        that also contains ``server_name`` is used, else ``tool_name`` is
        returned as-is. Successful resolutions are cached per pair.
        """
        if not server_name or server_name == "default":
            return tool_name
        if not request or not request.tools:
            return tool_name
        cache_key = (server_name, tool_name)
        cached = self._resolved_tool_name_cache.get(cache_key)
        if cached:
            return cached
        # Collect request tools whose function name contains tool_name
        candidates = []
        for tool in request.tools:
            if hasattr(tool, "function") and hasattr(tool.function, "name"):
                name = tool.function.name
                if tool_name in name:
                    candidates.append(name)
        if len(candidates) == 1:
            resolved = candidates[0]
            self._resolved_tool_name_cache[cache_key] = resolved
            return resolved
        # Several (or no) candidates: pick the first one that also contains server_name
        for candidate in candidates:
            if server_name in candidate:
                logger.debug(
                    "Resolved tool %s -> %s (server: %s)",
                    tool_name,
                    candidate,
                    server_name,
                )
                self._resolved_tool_name_cache[cache_key] = candidate
                return candidate
        return tool_name

    def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest:
        """Keep special tokens in the output whenever tools may be called."""
        request = super().adjust_request(request)
        if request.tools and request.tool_choice != "none":
            # Do not skip special tokens for proper tool parsing
            request.skip_special_tokens = False
        return request

    def _ensure_tool_id_valid(self, tool_id: int) -> bool:
        """Ensure the tool_id is valid and arrays have enough elements"""
        if tool_id < 0:
            return False
        # Ensure arrays are large enough
        while len(self.streamed_args_for_tool) <= tool_id:
            self.streamed_args_for_tool.append("")
        while len(self.prev_tool_call_arr) <= tool_id:
            self.prev_tool_call_arr.append({})
        return True

    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        """
        Extract every complete tool call from a finished model response.

        The full output is returned as plain content (no tool calls) when the
        start tag is absent, tools are disabled for the request, nothing
        matches the expected block format, or any matched block has arguments
        that cannot be parsed even after JSON repair. Otherwise the content is
        the text before the first tool block.
        """
        # Sanity check; avoid unnecessary processing
        if logger.isEnabledFor(10):  # DEBUG (numeric value of logging.DEBUG)
            logger.debug("model_output len=%s", len(model_output))
        if (
            self.tool_call_start_token not in model_output
            or request.tool_choice == "none"
            or not request.tools
        ):
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )
        try:
            tool_calls = []
            had_any_match = False
            had_parse_error = False
            # Find all complete tool calls
            for match in self.tool_call_regex.finditer(model_output):
                had_any_match = True
                server_name = match.group(1).strip()
                tool_name = match.group(2).strip()
                arguments_str = match.group(3).strip()
                # Resolve tool name
                tool_name = self._resolve_tool_name(server_name, tool_name, request)
                try:
                    # Parse arguments as JSON
                    arguments = json.loads(arguments_str)
                    tool_call = ToolCall(
                        type="function",
                        function=FunctionCall(
                            name=tool_name,
                            arguments=json.dumps(arguments, ensure_ascii=False),
                        ),
                    )
                    tool_calls.append(tool_call)
                except json.JSONDecodeError:
                    # Strict parsing failed: try once more on repaired JSON
                    try:
                        repaired = json_repair.repair_json(arguments_str)
                        if not repaired:
                            had_parse_error = True
                            logger.warning(
                                "Failed to repair tool arguments JSON: %s",
                                arguments_str,
                            )
                            continue
                        arguments = json.loads(repaired)
                        tool_call = ToolCall(
                            type="function",
                            function=FunctionCall(
                                name=tool_name,
                                arguments=json.dumps(arguments, ensure_ascii=False),
                            ),
                        )
                        tool_calls.append(tool_call)
                    except Exception:
                        had_parse_error = True
                        logger.warning(
                            "Failed to parse tool arguments after repair: %s",
                            arguments_str,
                        )
                        continue
            # If we couldn't successfully parse tool calls (or format didn't match), do not truncate.
            # Return the full model output as content to avoid losing text.
            if had_parse_error or not tool_calls or not had_any_match:
                return ExtractedToolCallInformation(
                    tools_called=False, tool_calls=[], content=model_output
                )
            # Extract content before first tool call
            content = model_output[: model_output.find(self.tool_call_start_token)]
            return ExtractedToolCallInformation(
                tools_called=len(tool_calls) > 0,
                tool_calls=tool_calls,
                content=content if content else None,
            )
        except Exception:
            logger.exception("Error in extracting tool call from response.")
            return ExtractedToolCallInformation(
                tools_called=False, tool_calls=[], content=model_output
            )

    def extract_tool_calls_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
        request: ChatCompletionRequest,
    ) -> DeltaMessage | None:
        """
        Consume one streamed delta and return the text/tool calls to emit.

        In "text" mode, text is passed through, except that a trailing
        fragment which could be the beginning of the start tag is held back
        until the next delta. In "tool" mode, everything is buffered until the
        end tag arrives; the complete block is then parsed and emitted as a
        single tool call, or re-emitted verbatim as text if it cannot be
        parsed. Only ``delta_text``, ``previous_text`` (to detect a new
        request) and ``request`` are read.

        Returns:
            A DeltaMessage with content and/or tool calls, or None when this
            delta produces no output.
        """
        # Reset state if this is the start of a new request
        if not previous_text:
            self.current_tool_name_sent = False
            self.prev_tool_call_arr = []
            self.current_tool_id = -1
            self.streamed_args_for_tool = []
            self.buffer = ""
            self._resolved_tool_name_cache = {}
            self._stream_mode = "text"
            self._text_token_prefix = ""
            self._tool_end_token_prefix = ""
            self._tool_block_buffer = ""
            self._stream_tool_call_ids = []
        # If tools are disabled for this request, do not suppress tags or parse tool calls.
        # Flush any internal buffers as plain text so we never drop output.
        if request.tool_choice == "none" or not request.tools:
            out = ""
            if self.buffer:
                out += self.buffer
                self.buffer = ""
            if self._text_token_prefix:
                out += self._text_token_prefix
                self._text_token_prefix = ""
            if self._tool_block_buffer:
                out += self.tool_call_start_token + self._tool_block_buffer
                self._tool_block_buffer = ""
            if self._tool_end_token_prefix:
                out += self._tool_end_token_prefix
                self._tool_end_token_prefix = ""
            out += delta_text
            return DeltaMessage(content=out) if out else None

        def _longest_token_prefix_at_end(s: str, token: str) -> str:
            # Longest proper prefix of ``token`` that ``s`` ends with ("" if none).
            max_len = min(len(token) - 1, len(s))
            for i in range(max_len, 0, -1):
                if token.startswith(s[-i:]):
                    return s[-i:]
            return ""

        emitted_text_parts: list[str] = []
        emitted_tool_calls: list[DeltaToolCall] = []
        chunk = delta_text
        while chunk:
            if self._stream_mode == "text":
                # Re-attach a start-tag fragment held back from the previous delta
                if self._text_token_prefix:
                    chunk = self._text_token_prefix + chunk
                    self._text_token_prefix = ""
                start_idx = chunk.find(self.tool_call_start_token)
                if start_idx < 0:
                    prefix = _longest_token_prefix_at_end(
                        chunk, self.tool_call_start_token
                    )
                    if prefix:
                        # Emit everything but the possible tag fragment
                        safe = chunk[: -len(prefix)]
                        if safe:
                            emitted_text_parts.append(safe)
                        self._text_token_prefix = prefix
                    else:
                        emitted_text_parts.append(chunk)
                    break
                before = chunk[:start_idx]
                if before:
                    emitted_text_parts.append(before)
                chunk = chunk[start_idx + len(self.tool_call_start_token) :]
                self._stream_mode = "tool"
                self._tool_block_buffer = ""
                self._tool_end_token_prefix = ""
                continue
            # tool mode
            if self._tool_end_token_prefix:
                chunk = self._tool_end_token_prefix + chunk
                self._tool_end_token_prefix = ""
            end_idx = chunk.find(self.tool_call_end_token)
            if end_idx < 0:
                prefix = _longest_token_prefix_at_end(chunk, self.tool_call_end_token)
                if prefix:
                    self._tool_block_buffer += chunk[: -len(prefix)]
                    self._tool_end_token_prefix = prefix
                else:
                    self._tool_block_buffer += chunk
                break
            # Complete tool block
            self._tool_block_buffer += chunk[:end_idx]
            tool_block = (
                self.tool_call_start_token
                + self._tool_block_buffer
                + self.tool_call_end_token
            )
            remainder = chunk[end_idx + len(self.tool_call_end_token) :]
            # Reset tool buffers before parsing
            self._stream_mode = "text"
            self._tool_block_buffer = ""
            self._tool_end_token_prefix = ""
            try:
                m = self._complete_tool_block_regex.search(tool_block)
                if not m:
                    emitted_text_parts.append(tool_block)
                    chunk = remainder
                    continue
                server_name = (m.group(1) or "").strip()
                tool_name = (m.group(2) or "").strip()
                arguments_str = (m.group(3) or "").strip()
                if not tool_name:
                    emitted_text_parts.append(tool_block)
                    chunk = remainder
                    continue
                resolved_name = (
                    self._resolve_tool_name(server_name, tool_name, request)
                    if server_name
                    else tool_name
                )
                # Finalize arguments strictly at end of the block
                if not arguments_str:
                    arguments_json_str = "{}"
                else:
                    try:
                        arguments_obj = json.loads(arguments_str)
                    except Exception:
                        repaired = json_repair.repair_json(arguments_str)
                        if not repaired:
                            emitted_text_parts.append(tool_block)
                            chunk = remainder
                            continue
                        arguments_obj = json.loads(repaired)
                    arguments_json_str = json.dumps(arguments_obj, ensure_ascii=False)
                # The index is the position of this call within the current stream
                tool_index = len(self._stream_tool_call_ids)
                tool_call_id = make_tool_call_id()
                self._stream_tool_call_ids.append(tool_call_id)
                emitted_tool_calls.append(
                    DeltaToolCall(
                        index=tool_index,
                        type="function",
                        id=tool_call_id,
                        function=DeltaFunctionCall(
                            name=resolved_name,
                            arguments=arguments_json_str,
                        ).model_dump(exclude_none=True),
                    )
                )
            except Exception:
                logger.exception(
                    "Error parsing complete tool block in streaming; falling back to plain text."
                )
                emitted_text_parts.append(tool_block)
            # Continue with whatever followed the end tag in this delta
            chunk = remainder
        emitted_text = "".join(emitted_text_parts) if emitted_text_parts else None
        if emitted_text is not None and emitted_text == "":
            emitted_text = None
        if emitted_text is None and not emitted_tool_calls:
            return None
        # vLLM's DeltaMessage.tool_calls is validated as a list; do not pass None explicitly.
        if emitted_tool_calls:
            return DeltaMessage(content=emitted_text, tool_calls=emitted_tool_calls)
        return DeltaMessage(content=emitted_text)
# Register the tool parser to ToolParserManager under the name "mirothinker"
# (the value passed to vLLM's --tool-call-parser option).
# NOTE(review): the second positional argument (True) presumably forces
# replacement of an existing registration — confirm against the vLLM API.
ToolParserManager.register_module("mirothinker", True, MirothinkerToolParser)
================================================
FILE: apps/lobehub-compatibility/README.md
================================================
# LobeChat Integration Guide
This guide describes how to integrate the MiroThinker model with [LobeChat](https://github.com/lobehub/lobe-chat), an open-source, modern LLM UI framework supporting tool usage (function calling).
## Before You Start
MiroThinker is a reasoning model. When generating responses, it first outputs its reasoning process inside `<think>...</think>` tags, then provides the final answer. For agentic tasks (multi-step tool use), the model performs better when it can see its previous reasoning in the conversation history.
However, LobeChat does not preserve reasoning content in conversation history. When sending messages back to the API, LobeChat strips the `<think>...</think>` content from previous assistant messages. This means the model cannot see its prior reasoning steps.
- For general chat: This works fine.
- For agentic workflows: Performance may be degraded since the model cannot reference its previous reasoning.
If you need full reasoning preservation for agentic use cases, consider modifying LobeChat's source code to return `reasoning_content` in conversation history.
## 1. Start the Inference Service
First, launch the MiroThinker model using vLLM with the OpenAI-compatible API adapter. We use vLLM because it supports loading custom tool parsers from external Python files, while SGLang does not. Ensure you include the tool parser plugin.
```bash
# Configuration
PORT=61002
MODEL_PATH=miromind-ai/MiroThinker-v1.5-30B
# Start vLLM server
vllm serve $MODEL_PATH \
--served-model-name mirothinker \
--port $PORT \
--trust-remote-code \
--chat-template chat_template.jinja \
--tool-parser-plugin MiroThinkerToolParser.py \
--tool-call-parser mirothinker \
--enable-auto-tool-choice
```
## 2. Configure LobeChat
You can use either the self-hosted version or the [web application](https://lobechat.com/chat).
### Step 1: Access Settings
Navigate to **Settings** -> **AI Service Provider** to add a custom AI service provider.

### Step 2: Add Custom AI Provider
Click the `+` button to add a new provider and configure it as follows:

| Field | Value | Description |
| :--- | :--- | :--- |
| **Provider ID** | `miromind` | Or any identifier you prefer. |
| **Request Format** | `OPENAI` | |
| **API Key** | `your-api-key` | Use any string if auth is disabled. |
| **API Proxy Address** | `http://localhost:61002/v1` | Replace with your actual service address. |
### Step 3: Configure the Model
After adding the provider, add the model you deployed to the provider's model list:
1. Add a new model with the ID `mirothinker` (must match `--served-model-name`).
1. **Crucial**: Enable the **Function Calling** capability toggle.
1. Click "Check" to verify connectivity.

## 3. Usage Demo
Once configured, you can use MiroThinker in LobeChat with full tool-calling capabilities.

================================================
FILE: apps/lobehub-compatibility/chat_template.jinja
================================================
{#- System turn. With tools: the caller's system prompt (when the first message is one), then the fixed tool-use instructions including today's date from strftime_now, then one section per tool. Without tools: only the caller's system prompt, if present. #}
{%- if tools %}
{{- '<|im_start|>system\n' }}
{%- if messages[0].role == 'system' %}
{{- messages[0].content + '\n\n' }}
{%- endif %}
{{- "In this environment you have access to a set of tools you can use to answer the user's question.\n\nYou only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use.\n\nToday is: " + strftime_now('%Y-%m-%d') + ". For time-dependent questions, answer based on the world as it would reasonably be today.\n\n# Tool-Use Formatting Instructions\n\nTool-use is formatted using XML-style tags. The tool-use is enclosed in and each parameter is similarly enclosed within its own set of tags.\n\nThe Model Context Protocol (MCP) connects to servers that provide additional tools and resources to extend your capabilities. You can use the server's tools via the `use_mcp_tool`.\n\nDescription:\nRequest to use a tool provided by a MCP server. Each MCP server can provide multiple tools with different capabilities. Tools have defined input schemas that specify required and optional parameters.\n\nParameters:\n- server_name: (required) The name of the MCP server providing the tool\n- tool_name: (required) The name of the tool to execute\n- arguments: (required) A JSON object containing the tool's input parameters, following the tool's input schema, quotes within string must be properly escaped, ensure it's valid JSON\n\nUsage:\n\nserver name here\ntool name here\n\n{\n \"param1\": \"value1\",\n \"param2\": \"value2 \\\"escaped string\\\"\"\n}\n\n\n\nImportant Notes:\n- Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags.\n- Always adhere to this format for the tool use to ensure proper parsing and execution.\n\nString and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. 
The output is not expected to be valid XML and is parsed with regular expressions.\nHere are the functions available in JSONSchema format:\n\n## Server name: default" }}
{#- One section per tool. A tool may carry its definition under a "function" key or be the definition itself. #}
{%- for tool in tools %}
{%- set func = tool.function if tool.function is defined else tool %}
{{- "\n### Tool name: " + func.name + "\n" }}
{{- "Description:\n" }}
{#- A missing or empty description is rendered as an empty string; the leading indent is added only when desc does not already start with it. #}
{%- set desc = func.description if func.description else '' %}
{%- if desc[:4] == ' ' %}
{{- desc }}
{%- else %}
{{- " " + desc }}
{%- endif %}
{#- When the description has no "Args:" section of its own, build one from parameters.properties: the property description if defined, else its type, else "any". #}
{%- if "Args:" not in desc and func.parameters is defined and func.parameters.properties is defined %}
{{- "\n\n Args:" }}
{%- for prop_name, prop_value in func.parameters.properties.items() %}
{%- if prop_value.description is defined %}
{{- "\n " + prop_name + ": " + prop_value.description }}
{%- else %}
{{- "\n " + prop_name + ": " + (prop_value.type if prop_value.type is defined else "any") }}
{%- endif %}
{%- endfor %}
{%- endif %}
{#- The parameter schema is always appended as JSON, whether or not an Args section was written. #}
{{- "\n\nInput JSON schema: " + (func.parameters | tojson) + "\n" }}
{%- endfor %}
{{- "\n# General Objective\n\nYou accomplish a given task iteratively, breaking it down into clear steps and working through them methodically.<|im_end|>\n" }}
{%- else %}
{%- if messages[0].role == 'system' %}
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{#- Conversation turns. A system message in first position is skipped here because the section above already rendered it. #}
{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{#- Assistant turn: a None content becomes an empty string. Reasoning comes from message.reasoning_content when that is set; otherwise it is split out of content. #}
{%- set content = message.content if message.content is not none else '' %}
{%- set reasoning_content = '' %}
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
{%- set reasoning_content = message.reasoning_content %}
{%- else %}
{%- if '' in content %}
{%- set reasoning_content = (content.split('')[0]).rstrip('\n') %}
{%- set reasoning_content = (reasoning_content.split('')[-1]).lstrip('\n') %}
{%- set content = (content.split('')[-1]).lstrip('\n') %}
{%- endif %}
{%- endif %}
{#- The reasoning section is written for every assistant turn, even when reasoning_content is empty. #}
{{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }}
{%- if message.tool_calls %}
{%- for tool_call in message.tool_calls %}
{#- Separator before a tool call: skipped only for the first call when there is no content before it. #}
{%- if (loop.first and content) or (not loop.first) %}
{{- '\n\n' }}
{%- endif %}
{#- Accept both the wrapped form with a "function" member and a bare call object. #}
{%- if tool_call.function %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{#- The server name is always written as "default". #}
{{- '\ndefault\n' }}
{{- tool_call.name }}
{{- '\n\n' }}
{#- String arguments are written verbatim; any other value is serialized with tojson. #}
{%- if tool_call.arguments is string %}
{{- tool_call.arguments }}
{%- else %}
{{- tool_call.arguments | tojson }}
{%- endif %}
{{- '\n\n' }}
{%- endfor %}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{#- Tool results are rendered as a user turn. A run of consecutive tool messages shares one turn: the header is opened on the first of the run and closed after the last. #}
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user\n' }}
{%- else %}
{{- '\n\n' }}
{%- endif %}
{{- message.content }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{#- Generation prompt: open an assistant turn. When enable_thinking is explicitly passed as false, an extra literal is appended after the header; when it is undefined or true, nothing more is written. #}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- if enable_thinking is defined and enable_thinking is false %}
{{- '\n\n\n\n' }}
{%- endif %}
{%- endif %}
================================================
FILE: apps/lobehub-compatibility/requirements.txt
================================================
vllm>=0.11.0
json-repair
regex
================================================
FILE: apps/lobehub-compatibility/test_tool_parser.py
================================================
#!/usr/bin/env python3
"""
Test MiroThinkerToolParser for correctness.
"""
import json
import sys
from types import SimpleNamespace
from unittest.mock import MagicMock
import regex as re
# Stand-ins for the vLLM modules so this file can be imported and run on a
# machine where vLLM is not installed. Each stub is registered in sys.modules
# under the dotted name it replaces.
mock_vllm = MagicMock()
mock_vllm.entrypoints = MagicMock()
mock_vllm.entrypoints.chat_utils = MagicMock()
# Tool-call ids are fixed to one constant value.
mock_vllm.entrypoints.chat_utils.make_tool_call_id = lambda: "call_test_123"

# Every protocol type is replaced by the MagicMock class itself.
mock_protocol = SimpleNamespace(
    **dict.fromkeys(
        (
            "ChatCompletionRequest",
            "DeltaFunctionCall",
            "DeltaMessage",
            "DeltaToolCall",
            "ExtractedToolCallInformation",
            "FunctionCall",
            "ToolCall",
        ),
        MagicMock,
    )
)

# ToolParser is plain `object`; the manager is a MagicMock instance.
mock_tool_parser = SimpleNamespace(
    ToolParser=object,
    ToolParserManager=MagicMock(),
)


def _stub_init_logger(x):
    """Return a logger stub whose isEnabledFor() always answers False."""
    return MagicMock(isEnabledFor=lambda _: False)


mock_logger = SimpleNamespace(init_logger=_stub_init_logger)

sys.modules.update(
    {
        "vllm": mock_vllm,
        "vllm.entrypoints": mock_vllm.entrypoints,
        "vllm.entrypoints.chat_utils": mock_vllm.entrypoints.chat_utils,
        "vllm.entrypoints.openai": MagicMock(),
        "vllm.entrypoints.openai.protocol": mock_protocol,
        "vllm.entrypoints.openai.tool_parsers": MagicMock(),
        "vllm.entrypoints.openai.tool_parsers.abstract_tool_parser": mock_tool_parser,
        "vllm.logger": mock_logger,
    }
)
def test_tool_call_regex():
    """Test the main tool call regex pattern.

    The pattern is compiled locally with ``re.DOTALL`` and applied to six
    sample payloads. Capture groups 1, 2 and 3 are compared against the
    expected server name, tool name and JSON arguments where a case checks
    them. A success line is printed after each case.

    Raises:
        AssertionError: If a sample does not match as the case expects.
    """
    # Local copy of the pattern under test (three capture groups).
    tool_call_regex = re.compile(
        r"\s*"
        r"(.*?)\s*"
        r"(.*?)\s*"
        r"\s*(.*?)\s*\s*"
        r"",
        re.DOTALL,
    )
    # Test 1: Basic tool call
    text1 = """my_mcp_serverweb_search
{"query": "AI news"}
"""
    match = tool_call_regex.search(text1)
    assert match is not None, "Should match basic tool call"
    assert match.group(1).strip() == "my_mcp_server"
    assert match.group(2).strip() == "web_search"
    # Group 3 must be parseable JSON, not just present.
    assert json.loads(match.group(3).strip()) == {"query": "AI news"}
    print("✅ Test 1: Basic tool call - PASSED")
    # Test 2: Tool call with content before
    text2 = """Let me search for that.
my_mcp_serversearch
{"q": "test"}
"""
    # search() rather than match(): the call does not start the string.
    match = tool_call_regex.search(text2)
    assert match is not None, "Should match tool call with content before"
    print("✅ Test 2: Tool call with content before - PASSED")
    # Test 3: Multiple tool calls
    text3 = """server1tool1{"a": 1}server2tool2{"b": 2}"""
    # finditer() must yield one match per call, in document order.
    matches = list(tool_call_regex.finditer(text3))
    assert len(matches) == 2, f"Should find 2 tool calls, found {len(matches)}"
    assert matches[0].group(2).strip() == "tool1"
    assert matches[1].group(2).strip() == "tool2"
    print("✅ Test 3: Multiple tool calls - PASSED")
    # Test 4: Complex JSON arguments
    text4 = """my_mcp_servercomplex_tool
{
"query": "test with quotes and apostrophes",
"options": {"nested": true},
"list": [1, 2, 3]
}
"""
    match = tool_call_regex.search(text4)
    assert match is not None, "Should match complex JSON"
    args = json.loads(match.group(3).strip())
    assert args["query"] == "test with quotes and apostrophes"
    assert args["options"]["nested"] is True
    print("✅ Test 4: Complex JSON arguments - PASSED")
    # Test 5: Empty arguments
    text5 = """my_mcp_serverno_args_tool
{}
"""
    match = tool_call_regex.search(text5)
    assert match is not None, "Should match empty arguments"
    assert json.loads(match.group(3).strip()) == {}
    print("✅ Test 5: Empty arguments - PASSED")
    # Test 6: Minimal whitespace
    text6 = "st{}"
    match = tool_call_regex.search(text6)
    assert match is not None, "Should match minimal whitespace"
    print("✅ Test 6: Minimal whitespace - PASSED")
def test_partial_tool_regex():
    """Test the partial tool regex for streaming.

    Every section of the pattern after the opening part is optional, so the
    cases below feed progressively longer prefixes of a tool call and check
    which capture groups are populated at each stage.

    Raises:
        AssertionError: If a prefix does not match as the case expects.
    """
    # Local copy of the pattern under test; groups 1-3 are each optional.
    partial_tool_regex = re.compile(
        r"\s*"
        r"(?:(.*?)\s*)?"
        r"(?:(.*?)\s*)?"
        r"(?:(\s*.*))?",
        re.DOTALL,
    )
    # Test partial: only opening tag
    text1 = "\n"
    match = partial_tool_regex.search(text1)
    assert match is not None
    print("✅ Partial test 1: Only opening tag - PASSED")
    # Test partial: server_name only
    text2 = "\nmy_server\n"
    match = partial_tool_regex.search(text2)
    assert match is not None
    assert match.group(1).strip() == "my_server"
    # The tool name has not arrived yet, so its group must be unset.
    assert match.group(2) is None
    print("✅ Partial test 2: Server name only - PASSED")
    # Test partial: incomplete arguments
    text3 = """my_servermy_tool
{"query": "incomp"""
    match = partial_tool_regex.search(text3)
    assert match is not None
    assert match.group(1).strip() == "my_server"
    assert match.group(2).strip() == "my_tool"
    # Group 3 carries the raw, still-unterminated argument text.
    assert '{"query": "incomp' in match.group(3)
    print("✅ Partial test 3: Incomplete arguments - PASSED")
def test_complete_tool_block_regex():
    """Test the complete tool block regex used in streaming.

    Checks a block that carries all three captured parts, and a block with no
    arguments section, for which group 3 is expected to stay unset.

    Raises:
        AssertionError: If a block does not match as the case expects.
    """
    # Local copy of the pattern under test; each captured section is optional.
    complete_regex = re.compile(
        r"\s*"
        r"(?:(.*?)\s*)?"
        r"(?:(.*?)\s*)?"
        r"(?:\s*(.*?)\s*(?:\s*)?)?"
        r"",
        re.DOTALL,
    )
    # Test: Complete block
    text1 = """my_mcp_serversearch
{"q": "test"}
"""
    match = complete_regex.search(text1)
    assert match is not None
    assert match.group(1).strip() == "my_mcp_server"
    assert match.group(2).strip() == "search"
    assert json.loads(match.group(3).strip()) == {"q": "test"}
    print("✅ Complete block test 1: Full block - PASSED")
    # Test: Without arguments tag
    text2 = """my_mcp_serversimple_tool"""
    match = complete_regex.search(text2)
    assert match is not None
    assert match.group(2).strip() == "simple_tool"
    # No arguments section in the input, so the optional group is None.
    assert match.group(3) is None
    print("✅ Complete block test 2: Without arguments - PASSED")
def test_edge_cases():
    """Test edge cases and potential bugs.

    Re-compiles the main tool call pattern and checks three argument payloads:
    non-ASCII text, JSON-escaped newlines, and markup inside a string value.
    In every case group 3 must still parse as JSON.

    Raises:
        AssertionError: If a payload does not match or parse as expected.
    """
    # Same pattern as in test_tool_call_regex, compiled again so this test
    # does not depend on that one.
    tool_call_regex = re.compile(
        r"\s*"
        r"(.*?)\s*"
        r"(.*?)\s*"
        r"\s*(.*?)\s*\s*"
        r"",
        re.DOTALL,
    )
    # Edge case 1: Unicode in arguments
    text1 = """my_mcp_serversearch
{"query": "你好世界 🎉"}
"""
    match = tool_call_regex.search(text1)
    assert match is not None
    args = json.loads(match.group(3).strip())
    assert args["query"] == "你好世界 🎉"
    print("✅ Edge case 1: Unicode in arguments - PASSED")
    # Edge case 2: Newlines in JSON
    # The doubled backslashes put a literal "\n" escape into the JSON text,
    # which json.loads then decodes to a real newline.
    text2 = """my_mcp_serversearch
{
"query": "line1\\nline2\\nline3"
}
"""
    match = tool_call_regex.search(text2)
    assert match is not None
    args = json.loads(match.group(3).strip())
    assert "line1\nline2" in args["query"]
    print("✅ Edge case 2: Newlines in JSON - PASSED")
    # Edge case 3: Tags in content (should not match nested)
    text3 = """my_mcp_serversearch
{"query": "test"}
"""
    match = tool_call_regex.search(text3)
    assert match is not None
    args = json.loads(match.group(3).strip())
    assert "" in args["query"]
    print("✅ Edge case 3: HTML tags in arguments - PASSED")
def check_unused_code():
    """Print a hand-maintained list of cleanup notes for the parser.

    Nothing is inspected at runtime: the findings and recommendations below
    are static text and must be edited by hand when the parser changes.

    The earlier note about a server-name mismatch with chat_template.jinja has
    been dropped, because the template writes ``default`` as the server name,
    which is the value that note said `_resolve_tool_name` checks for.

    Returns:
        None. All output goes to stdout.
    """
    print("\n" + "=" * 60)
    print("CODE ANALYSIS - Potential Issues")
    print("=" * 60)
    unused_vars = [
        "self.current_tool_name_sent",
        "self.prev_tool_call_arr",
        "self.current_tool_id",
        "self.streamed_args_for_tool",
        "self.buffer",
    ]
    issues = [
        # Issue 1: Unused variables
        f"⚠️ Unused instance variables (defined but never used in main logic):\n {', '.join(unused_vars)}",
        # Issue 2: Unused method
        "⚠️ `_ensure_tool_id_valid` method is defined but never called",
        # Issue 3: Unused regex
        "⚠️ `partial_tool_regex` is defined but never used",
    ]
    for issue in issues:
        print(f"\n{issue}")
    print("\n" + "=" * 60)
    print("RECOMMENDATIONS")
    print("=" * 60)
    print("""
1. Remove unused variables and methods to clean up the code
2. Either use `partial_tool_regex` or remove it
3. The streaming implementation looks correct with the state machine approach
4. The main `extract_tool_calls` and `extract_tool_calls_streaming` logic appears sound
""")
def main():
    """Run each regex test group in order, then print the static code notes.

    Any failing assertion in a test group propagates and stops the run before
    the final success banner is printed.
    """
    rule = "=" * 60
    print(rule)
    print("MiroThinkerToolParser Test Suite")
    print(rule)

    # (heading, test function) pairs, executed top to bottom.
    sections = (
        ("\n--- Testing Main Tool Call Regex ---", test_tool_call_regex),
        ("\n--- Testing Partial Tool Regex ---", test_partial_tool_regex),
        ("\n--- Testing Complete Tool Block Regex ---", test_complete_tool_block_regex),
        ("\n--- Testing Edge Cases ---", test_edge_cases),
    )
    for heading, run_section in sections:
        print(heading)
        run_section()

    check_unused_code()

    print("\n" + rule)
    print("ALL REGEX TESTS PASSED ✅")
    print(rule)


if __name__ == "__main__":
    main()
================================================
FILE: apps/lobehub-compatibility/unit_test.py
================================================
#!/usr/bin/env python3
"""
Unit tests for MiroThinker chat template.
Run with: pytest unit_test.py -v
"""
from datetime import datetime
from pathlib import Path
import pytest
from jinja2 import BaseLoader, Environment
# ============================================================================
# Fixtures
# ============================================================================
def strftime_now(format_str: str) -> str:
    """Format the current local time with *format_str*.

    Stands in for the ``strftime_now`` helper that vLLM makes available to
    chat templates, so the template can be rendered in these tests.
    """
    current_time = datetime.now()
    return current_time.strftime(format_str)
@pytest.fixture
def template():
    """Load and compile ``chat_template.jinja`` from this file's directory.

    Returns:
        A compiled Jinja2 template whose environment exposes ``strftime_now``
        as a global, which the template calls to print today's date.
    """
    template_path = Path(__file__).parent / "chat_template.jinja"
    # Decode explicitly as UTF-8 so loading does not depend on the platform's
    # default locale encoding.
    template_str = template_path.read_text(encoding="utf-8")
    env = Environment(loader=BaseLoader())
    env.globals["strftime_now"] = strftime_now
    return env.from_string(template_str)
@pytest.fixture
def today_date():
    """Return the current local date formatted as YYYY-MM-DD."""
    now = datetime.now()
    return f"{now:%Y-%m-%d}"
# ============================================================================
# Test: Basic Message Formatting
# ============================================================================
class TestBasicMessageFormatting:
    """Rendering of plain conversations when no tools are supplied."""

    def test_user_message_format(self, template):
        """A lone user turn is framed by <|im_start|>user and <|im_end|>."""
        rendered = template.render(
            messages=[{"role": "user", "content": "Hello!"}],
            add_generation_prompt=False,
        )
        assert "<|im_start|>user\nHello!<|im_end|>" in rendered

    def test_system_message_format(self, template):
        """A leading system turn is framed by <|im_start|>system and <|im_end|>."""
        conversation = [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "Hi"},
        ]
        rendered = template.render(
            messages=conversation, add_generation_prompt=False
        )
        assert "<|im_start|>system\nYou are helpful.<|im_end|>" in rendered

    def test_assistant_message_format(self, template):
        """An assistant turn is rendered with its reasoning section, even when empty."""
        conversation = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"},
        ]
        rendered = template.render(
            messages=conversation, add_generation_prompt=False
        )
        expected = "<|im_start|>assistant\n\n\n\n\nHi there!<|im_end|>"
        assert expected in rendered

    def test_add_generation_prompt(self, template):
        """With add_generation_prompt the output ends on an open assistant header."""
        rendered = template.render(
            messages=[{"role": "user", "content": "Hello"}],
            add_generation_prompt=True,
        )
        assert rendered.endswith("<|im_start|>assistant\n")

    def test_multi_turn_conversation(self, template):
        """Turns appear in the output in the order they were given."""
        conversation = [
            {"role": "system", "content": "System prompt"},
            {"role": "user", "content": "User 1"},
            {"role": "assistant", "content": "Assistant 1"},
            {"role": "user", "content": "User 2"},
        ]
        rendered = template.render(
            messages=conversation, add_generation_prompt=True
        )
        # Offsets of each turn's text must be strictly increasing.
        sys_pos, user1_pos, asst1_pos, user2_pos = (
            rendered.find(marker)
            for marker in ("System prompt", "User 1", "Assistant 1", "User 2")
        )
        assert sys_pos < user1_pos < asst1_pos < user2_pos
# ============================================================================
# Test: Thinking/Reasoning Content
# ============================================================================
class TestThinkingContent:
    """Rendering of assistant reasoning ("think") content."""

    def test_reasoning_content_field(self, template):
        """A separate reasoning_content field is rendered next to the answer."""
        conversation = [
            {"role": "user", "content": "What is 2+2?"},
            {
                "role": "assistant",
                "content": "The answer is 4.",
                "reasoning_content": "2+2=4 by basic arithmetic.",
            },
        ]
        rendered = template.render(
            messages=conversation, add_generation_prompt=False
        )
        assert "\n2+2=4 by basic arithmetic.\n" in rendered
        assert "The answer is 4." in rendered

    def test_think_tags_in_content(self, template):
        """Reasoning embedded in content is extracted and re-emitted with the answer."""
        assistant_turn = {
            "role": "assistant",
            "content": "\nMy reasoning here.\n\n\nMy answer here.",
        }
        rendered = template.render(
            messages=[{"role": "user", "content": "Question"}, assistant_turn],
            add_generation_prompt=False,
        )
        assert "\nMy reasoning here.\n" in rendered
        assert "My answer here." in rendered

    def test_think_preserved_in_history(self, template):
        """Reasoning of earlier assistant turns is kept, not dropped."""
        conversation = [
            {"role": "user", "content": "First question"},
            {
                "role": "assistant",
                "content": "First answer",
                "reasoning_content": "First reasoning",
            },
            {"role": "user", "content": "Second question"},
        ]
        rendered = template.render(
            messages=conversation, add_generation_prompt=True
        )
        # The reasoning of the historical turn must still be in the prompt.
        assert "\nFirst reasoning\n" in rendered

    def test_enable_thinking_false(self, template):
        """enable_thinking=False appends the empty reasoning section to the prompt."""
        rendered = template.render(
            messages=[{"role": "user", "content": "Hello"}],
            add_generation_prompt=True,
            enable_thinking=False,
        )
        assert rendered.endswith("<|im_start|>assistant\n\n\n\n\n")

    def test_enable_thinking_true(self, template):
        """enable_thinking=True leaves the assistant header open with nothing appended."""
        rendered = template.render(
            messages=[{"role": "user", "content": "Hello"}],
            add_generation_prompt=True,
            enable_thinking=True,
        )
        assert rendered.endswith("<|im_start|>assistant\n")
        assert "\n\n" not in rendered
# ============================================================================
# Test: Tool Definitions in System Prompt
# ============================================================================
class TestToolDefinitions:
    """Rendering of tool definitions into the system prompt."""

    @staticmethod
    def _function_tool(name, description, properties=None):
        """Build one tool in the wrapped ``type``/``function`` form.

        *properties* defaults to an empty mapping.
        """
        return {
            "type": "function",
            "function": {
                "name": name,
                "description": description,
                "parameters": {
                    "type": "object",
                    "properties": {} if properties is None else properties,
                },
            },
        }

    @staticmethod
    def _render_with_tools(template, tools, messages=None):
        """Render with *tools* and a generation prompt.

        *messages* defaults to a single user turn with content "Test".
        """
        if messages is None:
            messages = [{"role": "user", "content": "Test"}]
        return template.render(
            messages=messages, tools=tools, add_generation_prompt=True
        )

    def test_tools_trigger_system_prompt(self, template, today_date):
        """Passing tools produces the tool-use system prompt with today's date."""
        rendered = self._render_with_tools(
            template,
            [self._function_tool("web_search", "Search the web")],
            messages=[{"role": "user", "content": "Search something"}],
        )
        assert "In this environment you have access to a set of tools" in rendered
        assert f"Today is: {today_date}" in rendered
        assert "# Tool-Use Formatting Instructions" in rendered

    def test_tool_name_format(self, template):
        """Each tool gets a '### Tool name:' header."""
        rendered = self._render_with_tools(
            template, [self._function_tool("my_tool", "My description")]
        )
        assert "### Tool name: my_tool" in rendered

    def test_tool_server_name(self, template):
        """Tools are listed under the server name 'default'."""
        rendered = self._render_with_tools(
            template, [self._function_tool("test_tool", "Test")]
        )
        assert "## Server name: default" in rendered

    def test_tool_description_indentation(self, template):
        """The description is written indented under 'Description:'."""
        rendered = self._render_with_tools(
            template, [self._function_tool("test_tool", "My tool description")]
        )
        assert "Description:\n My tool description" in rendered

    def test_tool_args_auto_generated(self, template):
        """An Args section is built from parameters.properties."""
        properties = {
            "query": {"type": "string", "description": "Search query"},
            "limit": {"type": "integer", "description": "Max results"},
        }
        rendered = self._render_with_tools(
            template,
            [self._function_tool("search", "Search function", properties)],
        )
        assert "Args:" in rendered
        assert "query: Search query" in rendered
        assert "limit: Max results" in rendered

    def test_tool_args_not_duplicated(self, template):
        """A description that already contains Args: does not get a second one."""
        properties = {"query": {"type": "string", "description": "Search query"}}
        rendered = self._render_with_tools(
            template,
            [
                self._function_tool(
                    "search",
                    "Search function\n\nArgs:\n query: The query",
                    properties,
                )
            ],
        )
        # Exactly one Args: section may appear in the whole prompt.
        assert rendered.count("Args:") == 1

    def test_tool_json_schema_included(self, template):
        """The parameter schema is included as JSON."""
        rendered = self._render_with_tools(
            template,
            [self._function_tool("test", "Test", {"x": {"type": "string"}})],
        )
        assert "Input JSON schema:" in rendered
        assert '"type": "object"' in rendered or '"type":"object"' in rendered

    def test_tool_without_function_wrapper(self, template):
        """A tool given without the 'function' wrapper is still rendered."""
        bare_tool = {
            "name": "direct_tool",
            "description": "Direct tool format",
            "parameters": {"type": "object", "properties": {}},
        }
        rendered = self._render_with_tools(template, [bare_tool])
        assert "### Tool name: direct_tool" in rendered

    def test_tool_none_description(self, template):
        """A description of None renders without raising."""
        # Rendering itself must not raise.
        rendered = self._render_with_tools(
            template, [self._function_tool("test", None)]
        )
        assert "### Tool name: test" in rendered

    def test_tool_empty_description(self, template):
        """An empty-string description renders without raising."""
        rendered = self._render_with_tools(
            template, [self._function_tool("test", "")]
        )
        assert "### Tool name: test" in rendered

    def test_system_message_prepended_with_tools(self, template):
        """The caller's system message comes before the tool instructions."""
        conversation = [
            {"role": "system", "content": "You are MiroThinker."},
            {"role": "user", "content": "Hi"},
        ]
        rendered = self._render_with_tools(
            template,
            [self._function_tool("test", "Test")],
            messages=conversation,
        )
        # Custom system text first, then the generated tool instructions.
        sys_idx = rendered.find("You are MiroThinker.")
        tools_idx = rendered.find("In this environment you have access")
        assert sys_idx < tools_idx
# ============================================================================
# Test: Tool Calls in Assistant Messages
# ============================================================================
class TestToolCalls:
    """Rendering of tool calls carried by assistant messages."""

    @staticmethod
    def _search_tools(name="search"):
        """Return a one-element tool list for a function tool called *name*."""
        return [
            {
                "type": "function",
                "function": {
                    "name": name,
                    "description": "Search",
                    "parameters": {"type": "object", "properties": {}},
                },
            }
        ]

    def test_tool_call_format(self, template):
        """A tool call is written with its server name, tool name and arguments."""
        assistant_turn = {
            "role": "assistant",
            "content": "Let me search.",
            "tool_calls": [
                {
                    "id": "call_1",
                    "type": "function",
                    "function": {
                        "name": "web_search",
                        "arguments": '{"query": "AI news"}',
                    },
                }
            ],
        }
        rendered = template.render(
            messages=[{"role": "user", "content": "Search for AI"}, assistant_turn],
            tools=self._search_tools("web_search"),
            add_generation_prompt=False,
        )
        expected_fragments = (
            "",
            "default",
            "web_search",
            "",
            '{"query": "AI news"}',
            "",
            "",
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_tool_call_no_content(self, template):
        """A tool call on an assistant turn whose content is None still renders."""
        assistant_turn = {
            "role": "assistant",
            "content": None,
            "tool_calls": [
                {
                    "id": "call_1",
                    "function": {
                        "name": "search",
                        "arguments": '{"q": "test"}',
                    },
                }
            ],
        }
        rendered = template.render(
            messages=[{"role": "user", "content": "Search"}, assistant_turn],
            tools=self._search_tools(),
            add_generation_prompt=False,
        )
        # Empty reasoning section, then the tool call with no text before it.
        assert "<|im_start|>assistant\n\n\n\n\n" in rendered

    def test_multiple_tool_calls(self, template):
        """Several tool calls on one assistant turn are all rendered."""
        calls = [
            {
                "id": call_id,
                "function": {"name": "search", "arguments": arguments},
            }
            for call_id, arguments in (
                ("call_1", '{"q": "Tokyo"}'),
                ("call_2", '{"q": "Osaka"}'),
            )
        ]
        conversation = [
            {"role": "user", "content": "Compare Tokyo and Osaka"},
            {
                "role": "assistant",
                "content": "I'll search both.",
                "tool_calls": calls,
            },
        ]
        rendered = template.render(
            messages=conversation,
            tools=self._search_tools(),
            add_generation_prompt=False,
        )
        # Only look at the text from the last assistant header onwards.
        assistant_part = rendered[rendered.rfind("<|im_start|>assistant"):]
        # Both the opening and the closing marker must occur twice.
        assert assistant_part.count("") == 2
        assert assistant_part.count("") == 2

    def test_tool_call_arguments_dict(self, template):
        """Arguments given as a dict are serialized to JSON."""
        assistant_turn = {
            "role": "assistant",
            "content": "",
            "tool_calls": [
                {
                    "id": "call_1",
                    "function": {
                        "name": "search",
                        # A dict here, not a pre-serialized string.
                        "arguments": {"q": "test", "limit": 5},
                    },
                }
            ],
        }
        rendered = template.render(
            messages=[{"role": "user", "content": "Search"}, assistant_turn],
            tools=self._search_tools(),
            add_generation_prompt=False,
        )
        assert "" in rendered
        assert '"q"' in rendered or "'q'" in rendered
# ============================================================================
# Test: Tool Responses
# ============================================================================
class TestToolResponses:
    """Rendering of tool-role messages that answer assistant tool calls."""

    @staticmethod
    def _search_call(call_id, arguments):
        """Build one tool call to the ``search`` tool with raw JSON *arguments*."""
        return {
            "id": call_id,
            "function": {"name": "search", "arguments": arguments},
        }

    @staticmethod
    def _render(template, messages):
        """Render *messages* with the ``search`` tool and a generation prompt."""
        search_tool = {
            "type": "function",
            "function": {
                "name": "search",
                "description": "Search",
                "parameters": {"type": "object", "properties": {}},
            },
        }
        return template.render(
            messages=messages, tools=[search_tool], add_generation_prompt=True
        )

    def test_tool_response_in_user_message(self, template):
        """A tool result is rendered as a user turn."""
        conversation = [
            {"role": "user", "content": "Search"},
            {
                "role": "assistant",
                "content": "Searching...",
                "tool_calls": [self._search_call("call_1", '{"q": "test"}')],
            },
            {
                "role": "tool",
                "tool_call_id": "call_1",
                "content": "Search results here",
            },
        ]
        rendered = self._render(template, conversation)
        assert "<|im_start|>user\nSearch results here<|im_end|>" in rendered

    def test_multiple_tool_responses_merged(self, template):
        """Consecutive tool results share a single user turn."""
        conversation = [
            {"role": "user", "content": "Compare"},
            {
                "role": "assistant",
                "content": "Searching...",
                "tool_calls": [
                    self._search_call("call_1", '{"q": "A"}'),
                    self._search_call("call_2", '{"q": "B"}'),
                ],
            },
            {"role": "tool", "tool_call_id": "call_1", "content": "Result A"},
            {"role": "tool", "tool_call_id": "call_2", "content": "Result B"},
        ]
        rendered = self._render(template, conversation)
        # Both results sit in one turn, separated by a blank line.
        assert "Result A\n\nResult B" in rendered
        # Two user headers in total: the original question and the merged results.
        user_count = rendered.count("<|im_start|>user")
        assert user_count == 2

    def test_tool_response_no_wrapper_tags(self, template):
        """Tool results are written without wrapper tags around them."""
        conversation = [
            {"role": "user", "content": "Search"},
            {
                "role": "assistant",
                "content": "",
                "tool_calls": [self._search_call("call_1", '{"q": "test"}')],
            },
            {"role": "tool", "tool_call_id": "call_1", "content": "Results"},
        ]
        rendered = self._render(template, conversation)
        assert "" not in rendered
        assert "" not in rendered
# ============================================================================
# Test: Edge Cases
# ============================================================================
class TestEdgeCases:
    """Rendering checks for unusual or minimal conversations."""

    def test_only_system_message(self, template):
        """A conversation holding just a system turn renders that turn."""
        rendered = template.render(
            messages=[{"role": "system", "content": "You are helpful."}],
            add_generation_prompt=False,
        )
        assert "<|im_start|>system\nYou are helpful.<|im_end|>" in rendered

    def test_assistant_empty_content(self, template):
        """An assistant turn whose content is the empty string still renders."""
        conversation = [
            {"role": "user", "content": "Hi"},
            {"role": "assistant", "content": ""},
        ]
        rendered = template.render(
            messages=conversation, add_generation_prompt=False
        )
        # NOTE(review): the original comment said the assistant "always
        # outputs tags", yet the expected literal below holds only newlines —
        # confirm whether markup was lost from this string.
        assert "<|im_start|>assistant\n\n\n\n\n<|im_end|>" in rendered

    def test_unicode_content(self, template):
        """Non-ASCII text passes through the template unchanged."""
        user_text = "你好!🎉"
        assistant_text = "こんにちは!"
        rendered = template.render(
            messages=[
                {"role": "user", "content": user_text},
                {"role": "assistant", "content": assistant_text},
            ],
            add_generation_prompt=False,
        )
        assert user_text in rendered
        assert assistant_text in rendered

    def test_special_characters_in_content(self, template):
        """Ampersands and quote characters are emitted verbatim."""
        conversation = [
            {"role": "user", "content": "Test & \"quotes\" 'apostrophe'"},
        ]
        rendered = template.render(
            messages=conversation, add_generation_prompt=False
        )
        assert ' & "quotes"' in rendered

    def test_newlines_preserved(self, template):
        """Single and doubled newlines inside content survive rendering."""
        multiline = "Line 1\nLine 2\n\nLine 4"
        rendered = template.render(
            messages=[{"role": "user", "content": multiline}],
            add_generation_prompt=False,
        )
        assert multiline in rendered
# ============================================================================
# Test: Complete Flow
# ============================================================================
class TestCompleteFlow:
    """End-to-end rendering checks covering whole conversations."""

    def test_full_tool_use_flow(self, template, today_date):
        """Render a system / user / tool call / tool result / answer / user exchange."""
        weather_call = {
            "id": "call_1",
            "function": {
                "name": "weather",
                "arguments": '{"city": "Tokyo"}',
            },
        }
        conversation = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What's the weather?"},
            {
                "role": "assistant",
                "content": "Let me check.",
                "tool_calls": [weather_call],
            },
            {"role": "tool", "tool_call_id": "call_1", "content": "Sunny, 25°C"},
            {
                "role": "assistant",
                "content": "It's sunny and 25°C in Tokyo!",
            },
            {"role": "user", "content": "Thanks!"},
        ]
        weather_tool = {
            "type": "function",
            "function": {
                "name": "weather",
                "description": "Get weather info",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "city": {"type": "string", "description": "City name"}
                    },
                },
            },
        }

        rendered = template.render(
            messages=conversation, tools=[weather_tool], add_generation_prompt=True
        )

        # System section: prompt text, current date and the tool listing.
        assert "<|im_start|>system" in rendered
        assert "You are a helpful assistant." in rendered
        assert f"Today is: {today_date}" in rendered
        assert "### Tool name: weather" in rendered
        # NOTE(review): the needle is empty, so this check always passes; it
        # presumably named a markup tag originally — confirm and restore.
        assert "" in rendered
        assert "default" in rendered
        # Tool result and the assistant's final answer are both present.
        assert "Sunny, 25°C" in rendered
        assert "It's sunny and 25°C in Tokyo!" in rendered
        # add_generation_prompt=True leaves an open assistant turn at the end.
        assert rendered.endswith("<|im_start|>assistant\n")

    def test_reasoning_with_tool_use(self, template):
        """Reasoning content and a tool call can coexist in one assistant turn."""
        assistant_turn = {
            "role": "assistant",
            "content": "I'll search for Python tutorials.",
            "reasoning_content": "User wants Python tutorials. I should use web search.",
            "tool_calls": [
                {
                    "id": "call_1",
                    "function": {
                        "name": "search",
                        "arguments": '{"q": "Python tutorials"}',
                    },
                }
            ],
        }
        conversation = [
            {"role": "user", "content": "Search for Python tutorials"},
            assistant_turn,
        ]
        search_tool = {
            "type": "function",
            "function": {
                "name": "search",
                "description": "Search",
                "parameters": {"type": "object", "properties": {}},
            },
        }

        rendered = template.render(
            messages=conversation, tools=[search_tool], add_generation_prompt=False
        )

        # NOTE(review): three of the four needles below are empty strings and
        # therefore always match; they presumably named the thinking and
        # tool-call tags originally — confirm and restore.
        assert "" in rendered
        assert "User wants Python tutorials" in rendered
        assert "" in rendered
        assert "" in rendered
# ============================================================================
# Run tests
# ============================================================================
if __name__ == "__main__":
    # Run this file's tests verbosely and propagate pytest's exit code, so a
    # failing run does not report success (exit status 0) to the shell or CI.
    raise SystemExit(pytest.main([__file__, "-v"]))
================================================
FILE: apps/miroflow-agent/README.md
================================================
# MiroFlow Agent
> For comprehensive documentation, installation guide, and tool configuration, see the [main README](../../README.md).
## Prerequisites
Before running the agent, ensure you have:
1. **Installed dependencies**: Run `uv sync` in this directory
1. **Configured environment variables**: Copy `.env.example` to `.env` and fill in your API keys
```bash
cp .env.example .env
# Edit .env with your actual API keys (SERPER_API_KEY, JINA_API_KEY, E2B_API_KEY, etc.)
```
1. **Started your model server** (for MiroThinker models): See the [Serve the MiroThinker Model](../../README.md#serve-the-mirothinker-model) section
## Quick Start
### Run a Single Task
The simplest way to test the agent is running `main.py` directly. It will execute a default task: *"What is the title of today's arxiv paper in computer science?"*
```bash
# Using MiroThinker models (requires your own model server)
uv run python main.py llm=qwen-3 agent=mirothinker_v1.5_keep5_max200 llm.base_url=http://localhost:61002/v1
# Using Claude (requires ANTHROPIC_API_KEY in .env)
uv run python main.py llm=claude-3-7 agent=single_agent_keep5
# Using GPT-5 (requires OPENAI_API_KEY in .env)
uv run python main.py llm=gpt-5 agent=single_agent_keep5
```
### Customize Your Task
To ask a different question, edit `main.py` line 32:
```python
task_description = "Your custom question here"
```
Then run the agent again. It will search the web, execute code, and provide an answer.
### Run Benchmark Evaluation
For systematic evaluation on standard benchmarks, add the `benchmark=` parameter:
```bash
# Run on debug benchmark (quick test)
uv run python main.py llm=qwen-3 agent=mirothinker_v1.5_keep5_max200 benchmark=debug llm.base_url=http://localhost:61002/v1
# Run on specific benchmarks
uv run python main.py llm=qwen-3 agent=mirothinker_v1.5_keep5_max200 benchmark=gaia-validation-text-103 llm.base_url=http://localhost:61002/v1
```
## Available Configurations
### LLM Models
| Model | Config Name | Requirements |
|-------|-------------|--------------|
| MiroThinker (self-hosted) | `qwen-3` | Model server + `llm.base_url` |
| Claude 3.7 Sonnet | `claude-3-7` | `ANTHROPIC_API_KEY` in .env |
| GPT-5 | `gpt-5` | `OPENAI_API_KEY` in .env |
### Agent Configurations
**MiroThinker v1.5:**
- `mirothinker_v1.5_keep5_max200` ⭐ (recommended) - context management, up to 200 turns
- `mirothinker_v1.5_keep5_max400` - context management, up to 400 turns (for BrowseComp)
- `mirothinker_v1.5` - no context management, up to 600 turns
**MiroThinker v1.0:**
- `mirothinker_v1.0_keep5` (recommended) - context management, up to 600 turns
- `mirothinker_v1.0` - no context management, up to 600 turns
**General (for closed-source models like Claude, GPT-5):**
- `single_agent_keep5` (recommended) - single agent with context management
- `single_agent` - single agent without context management
**Multi-Agent (Legacy for v0.1/v0.2):**
- `multi_agent` - multi-agent with commercial tools
- `multi_agent_os` - multi-agent with open-source tools
### Benchmark Configs
`debug`, `browsecomp`, `browsecomp_zh`, `hle`, `hle-text-2158`, `hle-text-500`, `gaia-validation-text-103`, `gaia-validation`, `frames`, `xbench_deepsearch`, `futurex`, `seal-0`, `aime2025`, `deepsearchqa`, `webwalkerqa`
## Output
The agent will:
1. Execute the task using available tools (search, code execution, etc.)
1. Generate a final summary and boxed answer
1. Save detailed logs to the `../../logs/` directory
1. Display the results in the terminal
## Troubleshooting
| Problem | Solution |
|---------|----------|
| API key errors | Check `.env` file has correct keys |
| Model connection failed | Verify `llm.base_url` is accessible |
| Tool execution errors | Check E2B/Serper/Jina API keys and quotas |
| Out of memory | Use `mirothinker_v1.5_keep5_max200` config |
For detailed logs, check the `../../logs/` directory (the same location listed under Output).
================================================
FILE: apps/miroflow-agent/benchmarks/__init__.py
================================================
================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_aime2025.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import argparse
import os
from common import ProgressChecker
# Benchmark configuration
# Base name of this script file; not referenced elsewhere in this script.
FILENAME = os.path.basename(__file__)
# Dataset identifier; only used to build DATA_PATH below.
BENCHMARK_NAME = "aime2025"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "AIME2025"
# Passed to ProgressChecker as task_per_run
# (presumably the expected number of tasks in one run — confirm).
TASKS_PER_RUN = 30
# Relative path passed to ProgressChecker as data_path.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis(); captures the run of lowercase hex
# characters that follows "task_".
TASK_ID_PATTERN = r"task_([a-f0-9]+)"
def parse_args():
    """Parse the command line and return the resulting namespace.

    The only argument is the positional ``path`` of the benchmark directory.
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()
if __name__ == "__main__":
    # Entry point: analyse the benchmark directory named on the command line.
    args = parse_args()
    try:
        # Build the checker for this benchmark and run its analysis in one go.
        summary = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        ).run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Point out the two "nothing to report" situations. This only prints
        # a message; the process exit status is left unchanged.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")
    except (FileNotFoundError, PermissionError, ValueError) as err:
        # These error types all share the same message format.
        print(f"Error: {err}")
    except Exception as err:
        # Top-level boundary: anything else is reported instead of raised.
        print(f"Unexpected error: {err}")
================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_browsecomp.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import argparse
import os
from common import ProgressChecker
# Benchmark configuration
# Base name of this script file; not referenced elsewhere in this script.
FILENAME = os.path.basename(__file__)
# Dataset identifier; only used to build DATA_PATH below.
BENCHMARK_NAME = "browsecomp"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "BrowseComp-EN"
# Passed to ProgressChecker as task_per_run
# (presumably the expected number of tasks in one run — confirm).
TASKS_PER_RUN = 1266
# Relative path passed to ProgressChecker as data_path.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis(); captures the run of lowercase hex
# characters that follows "task_".
TASK_ID_PATTERN = r"task_([a-f0-9]+)"
def parse_args():
    """Parse the command line and return the resulting namespace.

    The only argument is the positional ``path`` of the benchmark directory.
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()
if __name__ == "__main__":
    # Entry point: analyse the benchmark directory named on the command line.
    args = parse_args()
    try:
        # Build the checker for this benchmark and run its analysis in one go.
        summary = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        ).run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Point out the two "nothing to report" situations. This only prints
        # a message; the process exit status is left unchanged.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")
    except (FileNotFoundError, PermissionError, ValueError) as err:
        # These error types all share the same message format.
        print(f"Error: {err}")
    except Exception as err:
        # Top-level boundary: anything else is reported instead of raised.
        print(f"Unexpected error: {err}")
================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_browsecomp_zh.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import argparse
import os
from common import ProgressChecker
# Benchmark configuration
# Base name of this script file; not referenced elsewhere in this script.
FILENAME = os.path.basename(__file__)
# Dataset identifier; only used to build DATA_PATH below.
BENCHMARK_NAME = "browsecomp_zh"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "BrowseComp-ZH"
# Passed to ProgressChecker as task_per_run
# (presumably the expected number of tasks in one run — confirm).
TASKS_PER_RUN = 289
# Relative path passed to ProgressChecker as data_path.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis(); captures the run of lowercase hex
# characters that follows "task_".
TASK_ID_PATTERN = r"task_([a-f0-9]+)"
def parse_args():
    """Parse the command line and return the resulting namespace.

    The only argument is the positional ``path`` of the benchmark directory.
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()
if __name__ == "__main__":
    # Entry point: analyse the benchmark directory named on the command line.
    args = parse_args()
    try:
        # Build the checker for this benchmark and run its analysis in one go.
        summary = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        ).run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Point out the two "nothing to report" situations. This only prints
        # a message; the process exit status is left unchanged.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")
    except (FileNotFoundError, PermissionError, ValueError) as err:
        # These error types all share the same message format.
        print(f"Error: {err}")
    except Exception as err:
        # Top-level boundary: anything else is reported instead of raised.
        print(f"Unexpected error: {err}")
================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_deepsearchqa.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import argparse
import glob
import json
import os
from pathlib import Path
from common import ProgressChecker
# Benchmark configuration
# Base name of this script file; not referenced elsewhere in this script.
FILENAME = os.path.basename(__file__)
# Dataset identifier; only used to build DATA_PATH below.
BENCHMARK_NAME = "deepsearchqa"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "DeepSearchQA"
# Passed to ProgressChecker as task_per_run
# (presumably the expected number of tasks in one run — confirm).
TASKS_PER_RUN = 900
# Relative path passed to ProgressChecker as data_path.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis(); captures the run of lowercase hex
# characters that follows "task_".
TASK_ID_PATTERN = r"task_([a-f0-9]+)"
def _parse_judge_line(text: str) -> dict:
    """Parse the first ``DeepSearchQA Judge - Correct: X/Y, Excessive: Z`` line.

    Returns:
        Dict with num_correct, num_expected, num_excessive, or an empty dict
        when no line of *text* contains the judge marker.

    Raises:
        ValueError: If the first marker line does not have the expected shape.
    """
    for line in text.split("\n"):
        if "DeepSearchQA Judge - Correct:" not in line:
            continue
        # Shape of the remainder: "X/Y, Excessive: Z"
        counts = line.split("Correct:")[1].strip()
        correct_part, excessive_part = counts.split(", Excessive:")
        num_correct, num_expected = map(int, correct_part.split("/"))
        return {
            "num_correct": num_correct,
            "num_expected": num_expected,
            "num_excessive": int(excessive_part.strip()),
        }
    return {}


def extract_eval_details_from_log(log_file: str) -> dict:
    """
    Extract evaluation details from a completed task log file.

    The file is read as UTF-8 text. Three layouts are tried in order:
      1. JSON with an ``eval_details`` object holding all three counters.
      2. JSON whose ``llm_response`` string contains a judge output line.
      3. Plain (non-JSON) text containing a judge output line.

    Args:
        log_file: Path of the task log to inspect.

    Returns:
        Dict with num_correct, num_expected, num_excessive, or empty dict if
        not found (also when the file is unreadable or malformed).
    """
    required = ("num_correct", "num_expected", "num_excessive")
    try:
        # Explicit encoding so the result does not depend on the platform's
        # default locale encoding.
        with open(log_file, "r", encoding="utf-8") as f:
            content = f.read()

        try:
            log_data = json.loads(content)
        except json.JSONDecodeError:
            # Not JSON, try as plain text (legacy format)
            return _parse_judge_line(content)

        # Method 1: eval_details field (new format - saved directly)
        if "eval_details" in log_data and log_data["eval_details"]:
            eval_details = log_data["eval_details"]
            if all(key in eval_details for key in required):
                return {key: eval_details[key] for key in required}

        # Method 2: judge output embedded in llm_response (legacy format)
        if "llm_response" in log_data and log_data["llm_response"]:
            llm_response = log_data["llm_response"]
            if "DeepSearchQA Judge - Correct:" in llm_response:
                return _parse_judge_line(llm_response)
    except Exception:
        # Deliberate best-effort: any unreadable or malformed log is treated
        # as "no details available" so one bad file cannot abort a scan.
        return {}
    return {}
def calculate_deepsearchqa_metrics_from_logs(base_path: str) -> dict:
    """
    Aggregate metrics from per-task log files (for in-progress runs).

    Every file matching ``run_*/task_*.json`` under *base_path* is examined;
    files without evaluation details are ignored.

    Returns:
        Dict with counts, percentages and the average F1 score, or None when
        no file yields details or any error occurs.
    """
    try:
        task_logs = glob.glob(os.path.join(base_path, "run_*/task_*.json"))
        if not task_logs:
            return None

        f1_scores = []
        fully_correct = 0
        fully_incorrect = 0
        with_extraneous = 0

        for task_log in task_logs:
            details = extract_eval_details_from_log(task_log)
            if not details:
                continue

            hits = details["num_correct"]
            expected = details["num_expected"]
            extras = details["num_excessive"]
            misses = expected - hits

            # Precision, recall and F1 each default to 0.0 when the
            # denominator is not positive.
            precision = hits / (hits + extras) if (hits + extras) > 0 else 0.0
            recall = hits / (hits + misses) if (hits + misses) > 0 else 0.0
            if (precision + recall) > 0:
                f1_scores.append(2 * (precision * recall) / (precision + recall))
            else:
                f1_scores.append(0.0)

            # Category checks are order-sensitive: "zero correct" is tested
            # before "all correct with extras".
            complete = hits == expected
            has_extras = extras > 0
            if complete and not has_extras:
                fully_correct += 1
            elif hits == 0:
                fully_incorrect += 1
            elif complete and has_extras:
                with_extraneous += 1

        num_valid = len(f1_scores)
        if not num_valid > 0:
            return None

        return {
            "num_valid": num_valid,
            "fully_correct": fully_correct,
            "fully_incorrect": fully_incorrect,
            "correct_with_extraneous": with_extraneous,
            "pct_fully_correct": fully_correct / num_valid,
            "pct_fully_incorrect": fully_incorrect / num_valid,
            "pct_correct_with_extraneous": with_extraneous / num_valid,
            "avg_f1": sum(f1_scores) / len(f1_scores),
        }
    except Exception:
        return None
def calculate_deepsearchqa_metrics(results_file: str) -> dict:
    """
    Compute the DeepSearchQA metrics from a JSON-lines results file.

    Follows the official Google DeepSearchQA evaluation metrics:
    1. Fully Correct: all expected answers correct and nothing extraneous
    2. Fully Incorrect: no correct answers
    3. Correct with Extraneous Answers: all expected correct, plus extras
    4. F1 Score: harmonic mean of precision and recall

    Only records whose status is "success" are considered, and for each of
    them only the first attempt carrying eval_details is used.

    Returns:
        Dict with the 4 core metrics, or {"num_valid": 0} when nothing could
        be scored or the file could not be processed.
    """
    try:
        with open(results_file, "r") as f:
            records = [json.loads(line) for line in f if line.strip()]

        f1_scores = []
        fully_correct = 0
        fully_incorrect = 0
        with_extraneous = 0

        for record in records:
            if record.get("status") != "success":
                continue
            if not ("attempts" in record and record["attempts"]):
                continue

            for attempt in record["attempts"]:
                if not ("eval_details" in attempt and attempt["eval_details"]):
                    continue

                details = attempt["eval_details"]
                hits = details.get("num_correct", 0)
                expected = details.get("num_expected", 0)
                extras = details.get("num_excessive", 0)
                misses = expected - hits

                # Precision, recall and F1 each default to 0.0 when the
                # denominator is not positive.
                precision = hits / (hits + extras) if (hits + extras) > 0 else 0.0
                recall = hits / (hits + misses) if (hits + misses) > 0 else 0.0
                if (precision + recall) > 0:
                    f1_scores.append(
                        2 * (precision * recall) / (precision + recall)
                    )
                else:
                    f1_scores.append(0.0)

                # Category checks are order-sensitive: "zero correct" is
                # tested before "all correct with extras".
                complete = hits == expected
                has_extras = extras > 0
                if complete and not has_extras:
                    fully_correct += 1
                elif hits == 0:
                    fully_incorrect += 1
                elif complete and has_extras:
                    with_extraneous += 1

                break  # Only use first attempt with details

        num_valid = len(f1_scores)
        if not num_valid > 0:
            return {"num_valid": 0}

        return {
            "num_valid": num_valid,
            "fully_correct": fully_correct,
            "fully_incorrect": fully_incorrect,
            "correct_with_extraneous": with_extraneous,
            "pct_fully_correct": fully_correct / num_valid,
            "pct_fully_incorrect": fully_incorrect / num_valid,
            "pct_correct_with_extraneous": with_extraneous / num_valid,
            "avg_f1": sum(f1_scores) / len(f1_scores),
        }
    except Exception as e:
        print(f"Warning: Could not calculate DeepSearchQA metrics: {e}")
        return {"num_valid": 0}
def show_deepsearchqa_metrics(base_path: str):
    """
    Print the DeepSearchQA metrics of every run, followed by their average.

    Runs are discovered through ``run_*/benchmark_results.jsonl`` files under
    *base_path*. The reported figures follow the official Google metrics:
    Fully Correct, Fully Incorrect, Correct with Extraneous Answers, F1 Score.
    Runs without any scored item are left out of the report and the average.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("DeepSearchQA Metrics (Official Google Metrics)")
    print(rule)

    found = glob.glob(os.path.join(base_path, "run_*/benchmark_results.jsonl"))
    if not found:
        print("(Metrics will be available after tasks complete)")
        return

    # One (fully correct, fully incorrect, with extraneous, f1) tuple per run.
    per_run = []
    for results_path in sorted(found):
        run_name = Path(results_path).parent.name
        metrics = calculate_deepsearchqa_metrics(results_path)
        if not (metrics["num_valid"] > 0):
            continue

        pct_correct = metrics["pct_fully_correct"]
        pct_incorrect = metrics["pct_fully_incorrect"]
        pct_extraneous = metrics["pct_correct_with_extraneous"]
        run_f1 = metrics["avg_f1"]
        per_run.append((pct_correct, pct_incorrect, pct_extraneous, run_f1))

        print(f"\n{run_name} ({metrics['num_valid']} items):")
        print(
            f" Fully Correct: {pct_correct:6.2%} ({metrics['fully_correct']} items)"
        )
        print(
            f" Fully Incorrect: {pct_incorrect:6.2%} ({metrics['fully_incorrect']} items)"
        )
        print(
            f" Correct w/ Extraneous: {pct_extraneous:6.2%} ({metrics['correct_with_extraneous']} items)"
        )
        print(f" F1 Score: {run_f1:6.2%}")

    if per_run:
        print("\n" + rule)
        print(f"Average across {len(per_run)} runs:")
        print(rule)

        # Average each metric column over the runs that were reported.
        run_count = len(per_run)
        avg_correct, avg_incorrect, avg_extraneous, avg_f1 = (
            sum(column) / run_count for column in zip(*per_run)
        )
        print(f" Fully Correct: {avg_correct:6.2%}")
        print(f" Fully Incorrect: {avg_incorrect:6.2%}")
        print(f" Correct w/ Extraneous: {avg_extraneous:6.2%}")
        print(f" F1 Score: {avg_f1:6.2%}")
        print(rule)
def parse_args():
    """Parse the command line and return the resulting namespace.

    The only argument is the positional ``path`` of the benchmark directory.
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()
if __name__ == "__main__":
    # Entry point: analyse the benchmark directory named on the command line,
    # then print the DeepSearchQA-specific metrics.
    args = parse_args()
    try:
        # Build the checker for this benchmark and run its analysis in one go.
        summary = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        ).run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # A run is treated as complete once its benchmark_results.jsonl exists.
        has_complete_run = any(
            os.path.exists(os.path.join(run_dir, "benchmark_results.jsonl"))
            for run_dir in glob.glob(os.path.join(args.path, "run_*"))
        )

        if has_complete_run:
            show_deepsearchqa_metrics(args.path)
        elif summary.total_completed > 0:
            # No finished run yet: fall back to the per-task log files.
            interim_metrics = calculate_deepsearchqa_metrics_from_logs(args.path)
            rule = "=" * 80
            print("\n" + rule)
            print("DeepSearchQA Metrics (Official Google Metrics)")
            print(rule)

            if interim_metrics and interim_metrics.get("num_valid", 0) > 0:
                num_with_details = interim_metrics["num_valid"]
                print(
                    f"⚠️ INTERIM RESULTS (based on {num_with_details}/{summary.total_completed} tasks with eval_details)"
                )
                if num_with_details < summary.total_completed:
                    print(
                        f" Note: {summary.total_completed - num_with_details} completed tasks don't have eval_details (likely ran before the update)"
                    )
                print("-" * 80)

                print(
                    f" Fully Correct: {interim_metrics['pct_fully_correct']:6.2%} ({interim_metrics['fully_correct']} items)"
                )
                print(
                    f" Fully Incorrect: {interim_metrics['pct_fully_incorrect']:6.2%} ({interim_metrics['fully_incorrect']} items)"
                )
                print(
                    f" Correct w/ Extraneous: {interim_metrics['pct_correct_with_extraneous']:6.2%} ({interim_metrics['correct_with_extraneous']} items)"
                )
                print(f" F1 Score: {interim_metrics['avg_f1']:6.2%}")
                print()
                print(
                    f"Note: Based on {interim_metrics['num_valid']} completed tasks. Final metrics may differ."
                )
            else:
                print(f"Tasks in progress... ({summary.total_completed} completed)")
                print("Detailed metrics will be available when runs complete.")
            print(rule)

        # Point out the two "nothing to report" situations. This only prints
        # a message; the process exit status is left unchanged.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")
    except (FileNotFoundError, PermissionError, ValueError) as err:
        # These error types all share the same message format.
        print(f"Error: {err}")
    except Exception as err:
        # Top-level boundary: anything else is reported instead of raised.
        print(f"Unexpected error: {err}")
================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_frames.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import argparse
import os
from common import ProgressChecker
# Benchmark configuration
# Base name of this script file; not referenced elsewhere in this script.
FILENAME = os.path.basename(__file__)
# Dataset identifier; only used to build DATA_PATH below.
BENCHMARK_NAME = "frames"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "Frames"
# Passed to ProgressChecker as task_per_run
# (presumably the expected number of tasks in one run — confirm).
TASKS_PER_RUN = 824
# Relative path passed to ProgressChecker as data_path.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis(); captures the run of lowercase hex
# characters that follows "task_".
TASK_ID_PATTERN = r"task_([a-f0-9]+)"
def parse_args():
    """Parse the command line and return the resulting namespace.

    The only argument is the positional ``path`` of the benchmark directory.
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()
if __name__ == "__main__":
    # Entry point: analyse the benchmark directory named on the command line.
    args = parse_args()
    try:
        # Build the checker for this benchmark and run its analysis in one go.
        summary = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        ).run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Point out the two "nothing to report" situations. This only prints
        # a message; the process exit status is left unchanged.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")
    except (FileNotFoundError, PermissionError, ValueError) as err:
        # These error types all share the same message format.
        print(f"Error: {err}")
    except Exception as err:
        # Top-level boundary: anything else is reported instead of raised.
        print(f"Unexpected error: {err}")
================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_gaia-validation-text-103.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import argparse
import os
from common import GAIAProgressChecker as ProgressChecker
# Benchmark configuration
# Base name of this script file; not referenced elsewhere in this script.
FILENAME = os.path.basename(__file__)
# Dataset identifier; only used to build DATA_PATH below.
BENCHMARK_NAME = "gaia-2023-validation-text-103"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "GAIA-Text-103"
# Passed to ProgressChecker as task_per_run
# (presumably the expected number of tasks in one run — confirm).
TASKS_PER_RUN = 103
# Relative path passed to ProgressChecker as data_path.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis(); captures the text after "task_" up to the
# next underscore (hyphens are allowed inside the captured id).
TASK_ID_PATTERN = r"task_([^_]+(?:-[^_]+)*)"
def parse_args():
    """Parse the command line and return the resulting namespace.

    The only argument is the positional ``path`` of the benchmark directory.
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()
if __name__ == "__main__":
    # Entry point: analyse the benchmark directory named on the command line.
    args = parse_args()
    try:
        # Build the checker for this benchmark and run its analysis in one go.
        summary = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        ).run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Point out the two "nothing to report" situations. This only prints
        # a message; the process exit status is left unchanged.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")
    except (FileNotFoundError, PermissionError, ValueError) as err:
        # These error types all share the same message format.
        print(f"Error: {err}")
    except Exception as err:
        # Top-level boundary: anything else is reported instead of raised.
        print(f"Unexpected error: {err}")
================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_gaia-validation.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import argparse
import os
from common import GAIAProgressChecker as ProgressChecker
# Benchmark configuration
# Base name of this script file; not referenced elsewhere in this script.
FILENAME = os.path.basename(__file__)
# Dataset identifier; only used to build DATA_PATH below.
BENCHMARK_NAME = "gaia-2023-validation"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "GAIA-Val-165"
# Passed to ProgressChecker as task_per_run
# (presumably the expected number of tasks in one run — confirm).
TASKS_PER_RUN = 165
# Relative path passed to ProgressChecker as data_path.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis(); captures the text after "task_" up to the
# next underscore (hyphens are allowed inside the captured id).
TASK_ID_PATTERN = r"task_([^_]+(?:-[^_]+)*)"
def parse_args():
    """Parse the command line and return the resulting namespace.

    The only argument is the positional ``path`` of the benchmark directory.
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()
if __name__ == "__main__":
    # Entry point: analyse the benchmark directory named on the command line.
    args = parse_args()
    try:
        # Build the checker for this benchmark and run its analysis in one go.
        summary = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        ).run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Point out the two "nothing to report" situations. This only prints
        # a message; the process exit status is left unchanged.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")
    except (FileNotFoundError, PermissionError, ValueError) as err:
        # These error types all share the same message format.
        print(f"Error: {err}")
    except Exception as err:
        # Top-level boundary: anything else is reported instead of raised.
        print(f"Unexpected error: {err}")
================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_hle-text-2158.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import argparse
import os
from common import ProgressChecker
# Benchmark configuration
# Base name of this script file; not referenced elsewhere in this script.
FILENAME = os.path.basename(__file__)
# Dataset identifier; only used to build DATA_PATH below.
BENCHMARK_NAME = "hle-text-2158"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "HLE-Text-2158"
# Passed to ProgressChecker as task_per_run
# (presumably the expected number of tasks in one run — confirm).
TASKS_PER_RUN = 2158
# Relative path passed to ProgressChecker as data_path.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis(); captures the run of lowercase hex
# characters that follows "task_".
TASK_ID_PATTERN = r"task_([a-f0-9]+)"
def parse_args():
    """Parse the command line and return the resulting namespace.

    The only argument is the positional ``path`` of the benchmark directory.
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()
if __name__ == "__main__":
    # Entry point: analyse the benchmark directory named on the command line.
    args = parse_args()
    try:
        # Build the checker for this benchmark and run its analysis in one go.
        summary = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        ).run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # Point out the two "nothing to report" situations. This only prints
        # a message; the process exit status is left unchanged.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")
    except (FileNotFoundError, PermissionError, ValueError) as err:
        # These error types all share the same message format.
        print(f"Error: {err}")
    except Exception as err:
        # Top-level boundary: anything else is reported instead of raised.
        print(f"Unexpected error: {err}")
================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_hle-text-500.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import argparse
import os
from common import ProgressChecker
# Benchmark configuration
# Base name of this script file; not referenced elsewhere in this script.
FILENAME = os.path.basename(__file__)
# Dataset identifier; only used to build DATA_PATH below.
BENCHMARK_NAME = "hle-text-500"
# Display name used in the CLI help text and passed to run_analysis().
BENCHMARK_NAME_STD = "HLE-Text-500"
# Passed to ProgressChecker as task_per_run
# (presumably the expected number of tasks in one run — confirm).
TASKS_PER_RUN = 500
# Relative path passed to ProgressChecker as data_path.
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
# Regex passed to run_analysis(); captures the run of lowercase hex
# characters that follows "task_".
TASK_ID_PATTERN = r"task_([a-f0-9]+)"
def parse_args():
    """Parse the command line of this script.

    Returns:
        The argparse namespace; its only attribute is ``path``, the
        positional benchmark directory argument.
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()
if __name__ == "__main__":
    # Local import: only the script entry point needs sys.
    import sys

    args = parse_args()
    exit_code = 0
    try:
        # Create progress checker and run analysis
        checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )
        # Informational notes only; an empty or not-yet-started run is not an error.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")
    except (FileNotFoundError, PermissionError, ValueError) as e:
        # Expected failures: bad path, unreadable directory, no run_* directories.
        print(f"Error: {e}")
        exit_code = 1
    except Exception as e:
        # Top-level boundary: report anything else and still signal failure.
        print(f"Unexpected error: {e}")
        exit_code = 1
    # Previously the script always exited 0; report failures to the caller.
    sys.exit(exit_code)
================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_hle.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import argparse
import os
from common import ProgressChecker
# Static settings for the HLE-2500 progress checker.
FILENAME = os.path.split(__file__)[1]  # file name of this script
BENCHMARK_NAME = "hle"  # identifier used inside the data path
BENCHMARK_NAME_STD = "HLE-2500"  # display name used in help text and reports
TASKS_PER_RUN = 2500  # handed to ProgressChecker as task_per_run
# Benchmark task list; a relative path, so it is resolved against the working directory.
DATA_PATH = "../../data/" + BENCHMARK_NAME + "/standardized_data.jsonl"
# Regex handed to ProgressChecker; its first group captures the task id from a file name.
TASK_ID_PATTERN = r"task_([a-f0-9]+)"
def parse_args():
    """Parse the command line of this script.

    Returns:
        The argparse namespace; its only attribute is ``path``, the
        positional benchmark directory argument.
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()
if __name__ == "__main__":
    # Local import: only the script entry point needs sys.
    import sys

    args = parse_args()
    exit_code = 0
    try:
        # Create progress checker and run analysis
        checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )
        # Informational notes only; an empty or not-yet-started run is not an error.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")
    except (FileNotFoundError, PermissionError, ValueError) as e:
        # Expected failures: bad path, unreadable directory, no run_* directories.
        print(f"Error: {e}")
        exit_code = 1
    except Exception as e:
        # Top-level boundary: report anything else and still signal failure.
        print(f"Unexpected error: {e}")
        exit_code = 1
    # Previously the script always exited 0; report failures to the caller.
    sys.exit(exit_code)
================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_seal-0.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import argparse
import os
from common import ProgressChecker
# Static settings for the SEAL-0 progress checker.
FILENAME = os.path.split(__file__)[1]  # file name of this script
BENCHMARK_NAME = "seal-0"  # identifier used inside the data path
BENCHMARK_NAME_STD = "SEAL-0"  # display name used in help text and reports
TASKS_PER_RUN = 111  # handed to ProgressChecker as task_per_run
# Benchmark task list; a relative path, so it is resolved against the working directory.
DATA_PATH = "../../data/" + BENCHMARK_NAME + "/standardized_data.jsonl"
# Regex handed to ProgressChecker; its first group captures the task id from a file name.
TASK_ID_PATTERN = r"task_([a-f0-9]+)"
def parse_args():
    """Parse the command line of this script.

    Returns:
        The argparse namespace; its only attribute is ``path``, the
        positional benchmark directory argument.
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()
if __name__ == "__main__":
    # Local import: only the script entry point needs sys.
    import sys

    args = parse_args()
    exit_code = 0
    try:
        # Create progress checker and run analysis
        checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )
        # Informational notes only; an empty or not-yet-started run is not an error.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")
    except (FileNotFoundError, PermissionError, ValueError) as e:
        # Expected failures: bad path, unreadable directory, no run_* directories.
        print(f"Error: {e}")
        exit_code = 1
    except Exception as e:
        # Top-level boundary: report anything else and still signal failure.
        print(f"Unexpected error: {e}")
        exit_code = 1
    # Previously the script always exited 0; report failures to the caller.
    sys.exit(exit_code)
================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_webwalkerqa.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import argparse
import os
from common import ProgressChecker
# Static settings for the WebWalkerQA progress checker.
FILENAME = os.path.split(__file__)[1]  # file name of this script
BENCHMARK_NAME = "webwalkerqa"  # identifier used inside the data path
BENCHMARK_NAME_STD = "WebWalkerQA"  # display name used in help text and reports
TASKS_PER_RUN = 680  # handed to ProgressChecker as task_per_run
# Benchmark task list; a relative path, so it is resolved against the working directory.
DATA_PATH = "../../data/" + BENCHMARK_NAME + "/standardized_data.jsonl"
# Regex handed to ProgressChecker; its first group captures the numeric task id from a file name.
TASK_ID_PATTERN = r"task_task_id_(\d+)"
def parse_args():
    """Parse the command line of this script.

    Returns:
        The argparse namespace; its only attribute is ``path``, the
        positional benchmark directory argument.
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()
if __name__ == "__main__":
    # Local import: only the script entry point needs sys.
    import sys

    args = parse_args()
    exit_code = 0
    try:
        # Create progress checker and run analysis
        checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )
        # Informational notes only; an empty or not-yet-started run is not an error.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")
    except (FileNotFoundError, PermissionError, ValueError) as e:
        # Expected failures: bad path, unreadable directory, no run_* directories.
        print(f"Error: {e}")
        exit_code = 1
    except Exception as e:
        # Top-level boundary: report anything else and still signal failure.
        print(f"Unexpected error: {e}")
        exit_code = 1
    # Previously the script always exited 0; report failures to the caller.
    sys.exit(exit_code)
================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_xbench_deepsearch.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import argparse
import os
from common import ProgressChecker
# Static settings for the XBench-DeepSearch progress checker.
FILENAME = os.path.split(__file__)[1]  # file name of this script
BENCHMARK_NAME = "xbench_deepsearch"  # identifier used inside the data path
BENCHMARK_NAME_STD = "XBench-DeepSearch"  # display name used in help text and reports
TASKS_PER_RUN = 100  # handed to ProgressChecker as task_per_run
# Benchmark task list; a relative path, so it is resolved against the working directory.
DATA_PATH = "../../data/" + BENCHMARK_NAME + "/standardized_data.jsonl"
# Regex handed to ProgressChecker; its first group captures the task id from a file name.
TASK_ID_PATTERN = r"task_([a-f0-9]+)"
def parse_args():
    """Parse the command line of this script.

    Returns:
        The argparse namespace; its only attribute is ``path``, the
        positional benchmark directory argument.
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    path_help = f"Path to {BENCHMARK_NAME_STD} benchmark directory"
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=path_help)
    return cli.parse_args()
if __name__ == "__main__":
    # Local import: only the script entry point needs sys.
    import sys

    args = parse_args()
    exit_code = 0
    try:
        # Create progress checker and run analysis
        checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )
        # Informational notes only; an empty or not-yet-started run is not an error.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")
    except (FileNotFoundError, PermissionError, ValueError) as e:
        # Expected failures: bad path, unreadable directory, no run_* directories.
        print(f"Error: {e}")
        exit_code = 1
    except Exception as e:
        # Top-level boundary: report anything else and still signal failure.
        print(f"Unexpected error: {e}")
        exit_code = 1
    # Previously the script always exited 0; report failures to the caller.
    sys.exit(exit_code)
================================================
FILE: apps/miroflow-agent/benchmarks/check_progress/common.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import glob
import json
import math
import os
import re
from dataclasses import dataclass
from datetime import datetime
from io import StringIO
from typing import Dict, List, Optional, Tuple
# Time estimation: per-task fallback (minutes) used when no usable start time
# is found, plus calendar conversion factors.
DEFAULT_TASK_TIME_MINUTES = 3.5
MINUTES_PER_HOUR = 60
HOURS_PER_DAY = 24
MINUTES_PER_DAY = HOURS_PER_DAY * MINUTES_PER_HOUR

# Progress bar rendering: bar width in cells and the percentage cut-offs that
# select the green / yellow / orange colours (anything lower is red).
PROGRESS_BAR_WIDTH = 20
GREEN_THRESHOLD, YELLOW_THRESHOLD, ORANGE_THRESHOLD = 80, 60, 40

# Judge verdict strings counted as correct: exact matches, then substrings.
CORRECT_RESULTS = ["CORRECT", "SUCCESS"]
SUCCESS_PATTERNS = ["PASS_AT_K_SUCCESS"]

# Naming of the analysis log written into the target directory.
LOG_FILE_PREFIX = "progress_analysis_"
LOG_FILE_TIMESTAMP_FORMAT = "%Y%m%d_%H%M%S"
def create_progress_bar(percentage: float, width: int = PROGRESS_BAR_WIDTH) -> str:
    """Create a coloured text progress bar for a percentage.

    Args:
        percentage: Value to visualise. The numeric label shows it unchanged,
            even when it lies outside 0-100.
        width: Number of cells in the bar.

    Returns:
        An ANSI-coloured string such as ``[████░░░░] 50.0%`` followed by the
        colour reset sequence.
    """
    # Clamp the filled cell count: without this, a percentage above 100 (or
    # below 0) produced a bar longer than ``width`` cells.
    filled = max(0, min(width, int(width * percentage / 100)))
    bar = "█" * filled + "░" * (width - filled)
    # Add color based on percentage
    if percentage >= GREEN_THRESHOLD:
        color = "\033[92m"  # Green
    elif percentage >= YELLOW_THRESHOLD:
        color = "\033[93m"  # Yellow
    elif percentage >= ORANGE_THRESHOLD:
        color = "\033[33m"  # Orange
    else:
        color = "\033[91m"  # Red
    reset = "\033[0m"
    return f"{color}[{bar}] {percentage:.1f}%{reset}"
def find_earliest_start_time(completed_files: List[str]) -> Optional[datetime]:
    """Find the earliest ``start_time`` recorded in the given task files.

    Args:
        completed_files: Paths of JSON task files to inspect.

    Returns:
        The smallest parsed ``start_time`` as a naive datetime (any timezone
        info is dropped without conversion), or ``None`` when no file yields
        a usable value.
    """
    earliest_time: Optional[datetime] = None
    for file_path in completed_files:
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            start_time_str = data["start_time"]
            # Rewrite a trailing "Z" as an explicit offset before parsing
            if start_time_str.endswith("Z"):
                start_time_str = start_time_str[:-1] + "+00:00"
            # Drop tzinfo so naive and aware timestamps can be compared
            start_time = datetime.fromisoformat(start_time_str).replace(tzinfo=None)
        except (
            json.JSONDecodeError,
            KeyError,
            ValueError,
            OSError,
            TypeError,
            AttributeError,
        ):
            # Skip files with invalid timing data. TypeError/AttributeError
            # cover non-object JSON and non-string start_time values, which
            # previously escaped and aborted the whole analysis.
            continue
        if earliest_time is None or start_time < earliest_time:
            earliest_time = start_time
    return earliest_time
def find_latest_end_time(completed_files: List[str]) -> Optional[datetime]:
    """Find the latest ``end_time`` recorded in the given task files.

    Args:
        completed_files: Paths of JSON task files to inspect.

    Returns:
        The largest parsed ``end_time`` as a naive datetime (any timezone info
        is dropped without conversion). When no file yields a usable value the
        current time from ``datetime.now()`` is returned, so the result is
        never ``None``.
    """
    latest_time: Optional[datetime] = None
    for file_path in completed_files:
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            end_time_str = data["end_time"]
            # Rewrite a trailing "Z" as an explicit offset before parsing
            if end_time_str.endswith("Z"):
                end_time_str = end_time_str[:-1] + "+00:00"
            # Drop tzinfo so naive and aware timestamps can be compared
            end_time = datetime.fromisoformat(end_time_str).replace(tzinfo=None)
        except (
            json.JSONDecodeError,
            KeyError,
            ValueError,
            OSError,
            TypeError,
            AttributeError,
        ):
            # Skip files with invalid timing data. TypeError/AttributeError
            # cover non-object JSON and non-string end_time values, which
            # previously escaped and aborted the whole analysis.
            continue
        if latest_time is None or end_time > latest_time:
            latest_time = end_time
    # Fallback when no valid end_time was found. datetime.now() is naive LOCAL
    # time (the earlier comment said UTC).
    # NOTE(review): confirm which timezone the task files record their times in.
    return latest_time or datetime.now()
def calculate_mean_and_std(values: List[float]) -> Tuple[float, float]:
    """Return the mean and the sample standard deviation of ``values``.

    The deviation uses the ``n - 1`` denominator. An empty input gives
    ``(0.0, 0.0)``; a single value gives ``(value, 0.0)``.
    """
    if not values:
        return 0.0, 0.0

    count = len(values)
    average = sum(values) / count
    if count == 1:
        return average, 0.0

    squared_deviations = sum((value - average) ** 2 for value in values)
    return average, math.sqrt(squared_deviations / (count - 1))
def estimate_completion_time(
    total_tasks: int, completed_tasks: int, completed_files: List[str]
) -> str:
    """Describe how long the remaining tasks are expected to take.

    The average time per task is the span between the earliest start and the
    latest end found in ``completed_files`` divided by ``completed_tasks``.
    When no start time can be read, ``DEFAULT_TASK_TIME_MINUTES`` per task is
    used instead.

    Returns:
        ``"~N minutes"``, or an explanatory message when nothing has completed,
        everything has completed, or the measured span is not positive.
    """
    if completed_tasks == 0:
        return "Cannot estimate (no completed tasks)"
    if completed_tasks >= total_tasks:
        return "All tasks completed"

    tasks_left = total_tasks - completed_tasks
    first_start = find_earliest_start_time(completed_files)
    last_end = find_latest_end_time(completed_files)

    if first_start is None:
        # No usable timing data: fall back to the fixed per-task default
        return f"~{int(tasks_left * DEFAULT_TASK_TIME_MINUTES)} minutes"

    span_minutes = (last_end - first_start).total_seconds() / 60
    if span_minutes <= 0:
        return "Cannot estimate (time interval too short)"

    minutes_per_task = span_minutes / completed_tasks
    if minutes_per_task <= 0:
        return "Cannot estimate (invalid time per task)"

    return f"~{int(tasks_left * minutes_per_task)} minutes"
@dataclass
class TaskStats:
    """Counters gathered while analysing one run directory."""

    # Task file counts by outcome
    completed: int = 0
    running: int = 0
    failed: int = 0
    # Completed tasks whose judge result counted as correct
    judge_correct: int = 0
    # Expected number of tasks in the run
    total: int = 0
    # Paths of completed task files, kept for timing analysis.
    # A None default is swapped for a fresh list in __post_init__.
    completed_files: List[str] = None
    # Sum of turns, and the number of completed tasks that reported any turns
    total_turns: int = 0
    completed_tasks_with_turns: int = 0
    # Completed tasks whose boxed answer says no \boxed{} content was found
    no_boxed_found: int = 0

    def __post_init__(self):
        # Give every instance its own list rather than a shared default
        if self.completed_files is None:
            self.completed_files = []

    @property
    def judge_accuracy(self) -> float:
        """Percentage of completed tasks judged correct (0.0 if none completed)."""
        if self.completed > 0:
            return self.judge_correct / self.completed * 100
        return 0.0

    @property
    def completion_rate(self) -> float:
        """Percentage of the expected tasks that completed (0.0 if total is 0)."""
        if self.total > 0:
            return self.completed / self.total * 100
        return 0.0

    @property
    def average_turns(self) -> float:
        """Mean turns over completed tasks that reported turns (0.0 if none)."""
        if self.completed_tasks_with_turns > 0:
            return self.total_turns / self.completed_tasks_with_turns
        return 0.0
@dataclass
class GAIATaskStats(TaskStats):
    """TaskStats extended with completed/correct counters per GAIA difficulty level."""

    # Completed and judged-correct task counts for levels 1, 2 and 3
    level1_completed: int = 0
    level1_correct: int = 0
    level2_completed: int = 0
    level2_correct: int = 0
    level3_completed: int = 0
    level3_correct: int = 0

    @property
    def level1_accuracy(self) -> float:
        """Level 1 accuracy in percent (0.0 when no level 1 task completed)."""
        if self.level1_completed > 0:
            return self.level1_correct / self.level1_completed * 100
        return 0.0

    @property
    def level2_accuracy(self) -> float:
        """Level 2 accuracy in percent (0.0 when no level 2 task completed)."""
        if self.level2_completed > 0:
            return self.level2_correct / self.level2_completed * 100
        return 0.0

    @property
    def level3_accuracy(self) -> float:
        """Level 3 accuracy in percent (0.0 when no level 3 task completed)."""
        if self.level3_completed > 0:
            return self.level3_correct / self.level3_completed * 100
        return 0.0
@dataclass
class SummaryStats:
    """Totals accumulated over every analysed run."""

    total_tasks: int = 0
    total_completed: int = 0
    total_running: int = 0
    total_failed: int = 0
    total_judge_correct: int = 0
    total_no_boxed_found: int = 0

    @property
    def total_judge_accuracy(self) -> float:
        """Judged-correct share of all completed tasks, in percent (0.0 if none)."""
        if self.total_completed > 0:
            return self.total_judge_correct / self.total_completed * 100
        return 0.0

    def average_run_accuracy(
        self, run_stats_list: List[Tuple[str, TaskStats]]
    ) -> Tuple[float, float]:
        """Return the overall accuracy and the spread of per-run accuracies.

        The first value is ``total_judge_accuracy`` (pooled over all runs, not
        the mean of the per-run figures). The second is the standard deviation
        of ``judge_accuracy`` over runs with at least one completed task.
        An empty ``run_stats_list`` gives ``(0.0, 0.0)``.
        """
        if not run_stats_list:
            return 0.0, 0.0

        overall = self.total_judge_accuracy
        per_run = [
            run_stats.judge_accuracy
            for _, run_stats in run_stats_list
            if run_stats.completed > 0
        ]
        if not per_run:
            return overall, 0.0

        spread = calculate_mean_and_std(per_run)[1]
        return overall, spread

    @property
    def total_completion_rate(self) -> float:
        """Completed share of all expected tasks, in percent (0.0 if none expected)."""
        if self.total_tasks > 0:
            return self.total_completed / self.total_tasks * 100
        return 0.0
@dataclass
class GAIASummaryStats(SummaryStats):
    """SummaryStats extended with completed/correct totals per GAIA difficulty level."""

    # Completed and judged-correct totals for levels 1, 2 and 3
    level1_completed: int = 0
    level1_correct: int = 0
    level2_completed: int = 0
    level2_correct: int = 0
    level3_completed: int = 0
    level3_correct: int = 0

    @property
    def level1_accuracy(self) -> float:
        """Overall Level 1 accuracy in percent (0.0 when none completed)."""
        if self.level1_completed > 0:
            return self.level1_correct / self.level1_completed * 100
        return 0.0

    @property
    def level2_accuracy(self) -> float:
        """Overall Level 2 accuracy in percent (0.0 when none completed)."""
        if self.level2_completed > 0:
            return self.level2_correct / self.level2_completed * 100
        return 0.0

    @property
    def level3_accuracy(self) -> float:
        """Overall Level 3 accuracy in percent (0.0 when none completed)."""
        if self.level3_completed > 0:
            return self.level3_correct / self.level3_completed * 100
        return 0.0
class ProgressChecker:
"""Main class for checking benchmark progress"""
def __init__(self, target_path: str, task_per_run: int, data_path: str):
self.target_path = target_path
self.run_dirs: List[str] = []
self.total_tasks_per_run = task_per_run
# Load benchmark data
self._load_benchmark_data(data_path)
def _load_benchmark_data(self, data_path) -> None:
"""Load benchmark data and configuration"""
try:
# Load benchmark data if available
if os.path.exists(data_path):
with open(data_path) as f:
benchmark_data = [json.loads(line) for line in f.readlines()]
print(f"Loaded {len(benchmark_data)} tasks from {data_path}")
except Exception as e:
print(f"Warning: Could not load data: {e}")
def find_run_directories(self) -> List[str]:
"""Find all run directories in the target path"""
run_dirs = []
if not os.path.exists(self.target_path):
raise FileNotFoundError(f"Path '{self.target_path}' does not exist")
# Check if target_path itself is a run directory
if os.path.basename(self.target_path).startswith("run_"):
run_dirs.append(self.target_path)
else:
# Find run_* directories under target_path
try:
for item in os.listdir(self.target_path):
item_path = os.path.join(self.target_path, item)
if os.path.isdir(item_path) and item.startswith("run_"):
run_dirs.append(item_path)
except PermissionError:
raise PermissionError(
f"No permission to access directory '{self.target_path}'"
)
# Sort by run number
run_dirs.sort(key=lambda x: self._extract_run_number(x))
if not run_dirs:
raise ValueError(f"No run directories found in '{self.target_path}'")
return run_dirs
def _extract_run_number(self, path: str) -> int:
"""Extract run number from directory path for sorting"""
basename = os.path.basename(path)
parts = basename.split("_")
if len(parts) > 1 and parts[1].isdigit():
return int(parts[1])
return 0
def _extract_task_id(self, filename: str, task_id_pattern: str) -> Optional[str]:
"""Extract task ID from filename"""
match = re.match(task_id_pattern, filename)
return match.group(1) if match else None
def _get_latest_task_files(self, run_dir: str, task_id_pattern: str) -> List[str]:
"""Get the latest task file for each task ID in a run directory"""
json_files = glob.glob(os.path.join(run_dir, "task_*.json"))
if not json_files:
return []
# Group by task ID, keep only the latest file for each task
task_groups: Dict[str, Dict] = {}
for json_file in json_files:
filename = os.path.basename(json_file)
task_id = self._extract_task_id(filename, task_id_pattern)
if task_id:
try:
# Read the JSON file to get the start_time
with open(json_file, "r", encoding="utf-8") as f:
data = json.load(f)
start_time_str = data.get("start_time", "")
if start_time_str:
# Parse the ISO format timestamp
from datetime import datetime
start_time = datetime.fromisoformat(
start_time_str.replace("Z", "+00:00")
)
start_timestamp = start_time.timestamp()
else:
# Fallback to file modification time if start_time is not available
start_timestamp = os.path.getmtime(json_file)
if (
task_id not in task_groups
or start_timestamp > task_groups[task_id]["timestamp"]
):
task_groups[task_id] = {
"file": json_file,
"timestamp": start_timestamp,
}
except (json.JSONDecodeError, ValueError, OSError) as e:
# Fallback to file modification time if JSON parsing fails
print(f"Warning: Could not parse {json_file}: {e}")
file_mtime = os.path.getmtime(json_file)
if (
task_id not in task_groups
or file_mtime > task_groups[task_id]["timestamp"]
):
task_groups[task_id] = {
"file": json_file,
"timestamp": file_mtime,
}
return [info["file"] for info in task_groups.values()]
def _is_task_completed(self, data: Dict) -> bool:
"""Check if a task is completed based on its data"""
end_time = data.get("end_time", "")
error = data.get("error", "")
status = data.get("status", "")
final_answer = data.get("final_boxed_answer", "")
return (
(end_time != "" and error == "")
or (status == "completed")
or (final_answer != "" and error == "")
)
def _is_judge_correct(self, judge_result) -> bool:
"""Determine if LLM judge result indicates correct answer"""
if isinstance(judge_result, bool):
return judge_result
elif isinstance(judge_result, str):
result_str = judge_result.upper()
return (
result_str in CORRECT_RESULTS
or any(pattern in result_str for pattern in SUCCESS_PATTERNS)
or result_str.lower() in ["true", "1", "yes", "pass"]
)
elif isinstance(judge_result, (int, float)):
return judge_result > 0
elif isinstance(judge_result, dict):
return judge_result.get("correct", False) or judge_result.get(
"is_correct", False
)
return False
def _calculate_turns(self, data: Dict) -> int:
"""Calculate number of turns from task data (excluding system prompt)"""
try:
main_agent_history = data.get("main_agent_message_history", {})
message_history = main_agent_history.get("message_history", [])
if not message_history:
return 0
# Filter out system messages and count total messages, then divide by 2
# Turn count = (total messages excluding system) / 2
non_system_messages = [
msg for msg in message_history if msg.get("role") != "system"
]
# Each turn consists of user + assistant, so divide by 2
turn_count = len(non_system_messages) // 2
return turn_count
except (KeyError, TypeError, IndexError):
return 0
def analyze_run_directory(
    self, run_dir: str, task_id_pattern: str
) -> Tuple[TaskStats, Dict[str, bool]]:
    """Classify the latest file of every task in one run directory.

    Each file counts as running, completed or failed. Completed files also
    feed the judge-correct count, the "no boxed content" count, the turn
    statistics and the per-task result mapping.

    Returns:
        Tuple[TaskStats, Dict[str, bool]]: the run's statistics and a
        mapping of task_id -> is_correct for completed tasks.
    """
    stats = TaskStats(total=self.total_tasks_per_run)
    finished_files = []  # completed files, kept for timing analysis
    outcome_by_task = {}  # task_id -> is_correct

    for task_file in self._get_latest_task_files(run_dir, task_id_pattern):
        try:
            with open(task_file, "r", encoding="utf-8") as handle:
                record = json.load(handle)

            if record.get("status", "") == "running":
                stats.running += 1
                continue
            if not self._is_task_completed(record):
                stats.failed += 1
                continue

            # NOTE(review): an exception raised below this point leaves the
            # task counted as completed and, via the handlers, also as failed.
            stats.completed += 1
            finished_files.append(task_file)

            # A missing judge result counts as incorrect
            verdict = record.get("final_judge_result", None)
            is_correct = verdict is not None and self._is_judge_correct(verdict)
            if is_correct:
                stats.judge_correct += 1

            task_id = self._extract_task_id(
                os.path.basename(task_file), task_id_pattern
            )
            if task_id:
                outcome_by_task[task_id] = is_correct

            boxed_answer = record.get("final_boxed_answer", "")
            if (
                isinstance(boxed_answer, str)
                and "No \\boxed{} content found" in boxed_answer
            ):
                stats.no_boxed_found += 1

            turn_count = self._calculate_turns(record)
            if turn_count > 0:
                stats.total_turns += turn_count
                stats.completed_tasks_with_turns += 1
        except (json.JSONDecodeError, IOError) as e:
            # Files still being written parse as empty; skip them silently
            message = str(e)
            if "Expecting value" in message or "line 1 column 1" in message:
                continue
            print(f"Warning: Could not parse {task_file}: {e}")
            stats.failed += 1
        except Exception as e:
            print(f"Warning: Unexpected error processing {task_file}: {e}")
            stats.failed += 1

    stats.completed_files = finished_files
    return stats, outcome_by_task
def run_analysis(
    self, benchmark_name_std: str, task_id_pattern: str
) -> SummaryStats:
    """Analyse every run directory, print a report and return the totals.

    Args:
        benchmark_name_std: Display name of the benchmark, used in the saved log.
        task_id_pattern: Regex whose first group captures a task ID from a file name.

    Returns:
        The totals accumulated over all runs.

    Raises:
        Whatever find_run_directories() raises for an unusable target path.
    """
    self.run_dirs = self.find_run_directories()

    summary = SummaryStats()
    per_run_stats = []  # (run name, stats) pairs
    timing_files = []  # completed files from every run
    results_by_task = {}  # task_id -> is_correct values across runs

    banner = "=" * 80
    print()
    print(banner)
    print(f"Analyzing benchmark progress for: {self.target_path}")
    print(f"Generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(banner)

    for run_dir in self.run_dirs:
        run_name = os.path.basename(run_dir)
        stats, task_results = self.analyze_run_directory(run_dir, task_id_pattern)

        if stats.total == 0:
            print(f"{run_name}: No task files found")
            print()
            continue

        # One status line per run; optional parts are appended when available
        parts = [
            f"[{run_name}] Completed: {stats.completed} | Running: {stats.running} | Failed: {stats.failed}"
        ]
        if stats.completed > 0:
            parts.append(
                f" | Accuracy: {stats.judge_correct}/{stats.completed} ({stats.judge_accuracy:.1f}%)"
            )
        if stats.completed_tasks_with_turns > 0:
            parts.append(f" | Avg Turns: {stats.average_turns:.1f}")
        print("".join(parts))
        print()

        per_run_stats.append((run_name, stats))
        timing_files.extend(stats.completed_files)

        # Gather per-task outcomes for the Pass@n figure
        for task_id, is_correct in task_results.items():
            results_by_task.setdefault(task_id, []).append(is_correct)

        summary.total_tasks += stats.total
        summary.total_completed += stats.completed
        summary.total_running += stats.running
        summary.total_failed += stats.failed
        summary.total_judge_correct += stats.judge_correct
        summary.total_no_boxed_found += stats.no_boxed_found

    self._display_summary(
        summary,
        per_run_stats,
        timing_files,
        benchmark_name_std,
        results_by_task,
    )
    return summary
def _calculate_pass_at_n(
self, all_task_results: Dict[str, List[bool]], total_tasks: int
) -> Tuple[int, float]:
"""Calculate Pass@n: number of tasks with at least one correct answer across all runs
Returns:
Tuple[int, float]: (pass_at_n_count, pass_at_n_percentage)
"""
if not all_task_results or total_tasks == 0:
return 0, 0.0
pass_at_n_count = 0
for task_id, results in all_task_results.items():
# If at least one run got it correct, this task passes
if any(results):
pass_at_n_count += 1
pass_at_n_percentage = (
(pass_at_n_count / total_tasks * 100) if total_tasks > 0 else 0.0
)
return pass_at_n_count, pass_at_n_percentage
def _display_summary(
    self,
    summary: SummaryStats,
    run_stats_list: List[Tuple[str, TaskStats]],
    completed_files: List[str],
    benchmark_name_std: str,
    all_task_results: Dict[str, List[bool]] = None,
):
    """Print the summary report to stdout, then save it as a log file.

    Args:
        summary: Totals accumulated over all runs.
        run_stats_list: (run name, stats) pairs, one per analysed run.
        completed_files: Completed task files from all runs, used for timing.
        benchmark_name_std: Display name of the benchmark; only forwarded to
            the log writer.
        all_task_results: task_id -> correctness per run, used for Pass@n.
    """
    print("=" * 80)
    print("SUMMARY STATISTICS")
    print("=" * 80)
    print(
        f"Total Tasks: {summary.total_tasks} ({summary.total_completed} completed, {summary.total_running} running)"
    )
    # Estimate completion time using overall progress rate
    if summary.total_tasks > 0 and summary.total_completed > 0:
        remaining_tasks = summary.total_tasks - summary.total_completed
        # NOTE(review): estimate_completion_time() reads the same files again
        # to compute these two values itself.
        earliest_start = find_earliest_start_time(completed_files)
        latest_end = find_latest_end_time(completed_files)
        completion_estimate = estimate_completion_time(
            summary.total_tasks, summary.total_completed, completed_files
        )
        print(f"Remaining Tasks: {remaining_tasks}")
        if earliest_start:
            elapsed_time = latest_end - earliest_start
            elapsed_minutes = elapsed_time.total_seconds() / 60
            # Guard against a zero or negative span
            tasks_per_minute = (
                summary.total_completed / elapsed_minutes
                if elapsed_minutes > 0
                else 0
            )
            print(f"Elapsed Time: {elapsed_minutes:.1f} minutes")
            print(f"Completion Rate: {tasks_per_minute:.1f} tasks/minute")
        print(f"Estimated Time to Complete: {completion_estimate}")
    if summary.total_completed > 0:
        accuracy_bar = create_progress_bar(summary.total_judge_accuracy)
        print(
            f"Judge Accuracy: {summary.total_judge_correct}/{summary.total_completed} {accuracy_bar}"
        )
    # Calculate and display overall average turns
    total_turns = sum(stats.total_turns for _, stats in run_stats_list)
    total_tasks_with_turns = sum(
        stats.completed_tasks_with_turns for _, stats in run_stats_list
    )
    if total_tasks_with_turns > 0:
        overall_avg_turns = total_turns / total_tasks_with_turns
        print(f"Overall Average Turns: {overall_avg_turns:.1f}")
    # Display each run's correct percentage
    if run_stats_list:
        print()
        print("INDIVIDUAL RUN ACCURACIES:")
        for run_name, stats in run_stats_list:
            if stats.completed > 0:
                accuracy_bar = create_progress_bar(stats.judge_accuracy)
                print(
                    f" {run_name}: {stats.judge_correct}/{stats.completed} {accuracy_bar}"
                )
            else:
                print(
                    f" {run_name}: {stats.judge_correct}/{stats.completed} (N/A)"
                )
        # Display mean accuracy and standard deviation (Pass@1 Acc (Avg@n))
        num_runs = len(run_stats_list)
        # mean_acc is the pooled accuracy; std_acc is the spread over runs
        mean_acc, std_acc = summary.average_run_accuracy(run_stats_list)
        if mean_acc > 0:
            print()
            if num_runs > 1:
                print(
                    f"Pass@1 Acc (Avg@{num_runs}): {mean_acc:.1f}% ± {std_acc:.1f}%"
                )
            else:
                print(f"MEAN ACCURACY: {mean_acc:.1f}% ± {std_acc:.1f}%")
        # Display Pass@n if multiple runs
        if num_runs > 1 and all_task_results:
            # Calculate total unique tasks (use the first run's total as reference)
            first_run_total = (
                run_stats_list[0][1].total
                if run_stats_list
                else summary.total_tasks
            )
            pass_at_n_count, pass_at_n_percentage = self._calculate_pass_at_n(
                all_task_results, first_run_total
            )
            pass_at_n_bar = create_progress_bar(pass_at_n_percentage)
            print(
                f"Pass@{num_runs}: {pass_at_n_count}/{first_run_total} {pass_at_n_bar}"
            )
    # Display no boxed content found statistics
    if summary.total_completed > 0:
        print(
            f"No \\boxed{{}} content found: {summary.total_no_boxed_found}/{summary.total_completed} ({summary.total_no_boxed_found / summary.total_completed * 100:.1f}%)"
        )
    print("=" * 80)
    print()
    # Save analysis results to log file
    self._save_analysis_log(
        summary,
        run_stats_list,
        completed_files,
        benchmark_name_std,
        all_task_results,
    )
def _save_analysis_log(
    self,
    summary: SummaryStats,
    run_stats_list: List[Tuple[str, TaskStats]],
    completed_files: List[str],
    benchmark_name_std: str,
    all_task_results: Dict[str, List[bool]] = None,
) -> None:
    """Save the analysis results to a timestamped log file in the target directory.

    The file is named ``<LOG_FILE_PREFIX><timestamp>.log`` and holds a plain
    text version of the console report (percentages instead of coloured
    bars). Any failure is reported as a warning and never raised.

    Args:
        summary: Totals accumulated over all runs.
        run_stats_list: (run name, stats) pairs, one per analysed run.
        completed_files: Completed task files from all runs, used for timing.
        benchmark_name_std: Display name of the benchmark for the header.
        all_task_results: task_id -> correctness per run, used for Pass@n.
    """
    try:
        # Create log filename with timestamp
        generated_at = datetime.now()
        timestamp = generated_at.strftime(LOG_FILE_TIMESTAMP_FORMAT)
        log_filename = f"{LOG_FILE_PREFIX}{timestamp}.log"
        log_path = os.path.join(self.target_path, log_filename)

        # The context manager closes the buffer even if a write below fails
        with StringIO() as output_buffer:
            write = output_buffer.write

            # Write header
            write("=" * 80 + "\n")
            write(f"{benchmark_name_std} Progress Analysis\n")
            write(f"Generated at: {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n")
            write(f"Target Path: {self.target_path}\n")
            write("=" * 80 + "\n\n")

            # Write run statistics
            for run_name, stats in run_stats_list:
                write(
                    f"{run_name}: Status: {stats.completed} completed, {stats.running} running, {stats.failed} failed\n"
                )
                if stats.completed > 0:
                    accuracy = stats.judge_correct / stats.completed * 100
                    write(
                        f" Overall Accuracy: {stats.judge_correct}/{stats.completed} ({accuracy:.1f}%)\n"
                    )
                else:
                    write(
                        f" Overall Accuracy: {stats.judge_correct}/{stats.completed} (N/A)\n"
                    )
                write("\n")

            # Write summary statistics
            write("=" * 80 + "\n")
            write("SUMMARY STATISTICS\n")
            write("=" * 80 + "\n")
            write(
                f"Total Tasks: {summary.total_tasks} ({summary.total_completed} completed, {summary.total_running} running)\n"
            )

            # Write timing information
            if summary.total_tasks > 0 and summary.total_completed > 0:
                remaining_tasks = summary.total_tasks - summary.total_completed
                earliest_start = find_earliest_start_time(completed_files)
                latest_end = find_latest_end_time(completed_files)
                completion_estimate = estimate_completion_time(
                    summary.total_tasks, summary.total_completed, completed_files
                )
                write(f"Remaining Tasks: {remaining_tasks}\n")
                if earliest_start:
                    elapsed_time = latest_end - earliest_start
                    elapsed_minutes = elapsed_time.total_seconds() / 60
                    # Guard against a zero or negative span
                    tasks_per_minute = (
                        summary.total_completed / elapsed_minutes
                        if elapsed_minutes > 0
                        else 0
                    )
                    write(f"Elapsed Time: {elapsed_minutes:.1f} minutes\n")
                    write(f"Completion Rate: {tasks_per_minute:.1f} tasks/minute\n")
                write(f"Estimated Time to Complete: {completion_estimate}\n")

            if summary.total_completed > 0:
                accuracy = summary.total_judge_correct / summary.total_completed * 100
                write(
                    f"Judge Accuracy: {summary.total_judge_correct}/{summary.total_completed} ({accuracy:.1f}%)\n"
                )
                # The "No \boxed{}" line used to be written here as well as
                # at the end; it is now written once, where the console
                # summary prints it.

            # Write individual run accuracies
            if run_stats_list:
                write("\nINDIVIDUAL RUN ACCURACIES:\n")
                for run_name, stats in run_stats_list:
                    if stats.completed > 0:
                        accuracy = stats.judge_correct / stats.completed * 100
                        write(
                            f" {run_name}: {stats.judge_correct}/{stats.completed} ({accuracy:.1f}%)\n"
                        )
                    else:
                        write(
                            f" {run_name}: {stats.judge_correct}/{stats.completed} (N/A)\n"
                        )

                # Write mean accuracy and standard deviation (Pass@1 Acc (Avg@n))
                num_runs = len(run_stats_list)
                mean_acc, std_acc = summary.average_run_accuracy(run_stats_list)
                if mean_acc > 0:
                    if num_runs > 1:
                        write(
                            f"\nPass@1 Acc (Avg@{num_runs}): {mean_acc:.1f}% ± {std_acc:.1f}%\n"
                        )
                    else:
                        write(f"\nMEAN ACCURACY: {mean_acc:.1f}% ± {std_acc:.1f}%\n")

                # Write Pass@n if multiple runs
                if num_runs > 1 and all_task_results:
                    # The first run's total serves as the number of unique tasks
                    first_run_total = run_stats_list[0][1].total
                    pass_at_n_count, pass_at_n_percentage = self._calculate_pass_at_n(
                        all_task_results, first_run_total
                    )
                    write(
                        f"Pass@{num_runs}: {pass_at_n_count}/{first_run_total} ({pass_at_n_percentage:.1f}%)\n"
                    )

            if summary.total_completed > 0:
                no_boxed_percentage = (
                    summary.total_no_boxed_found / summary.total_completed * 100
                )
                write(
                    f"No \\boxed{{}} content found: {summary.total_no_boxed_found}/{summary.total_completed} ({no_boxed_percentage:.1f}%)\n"
                )
            write("=" * 80 + "\n")

            # Write to file
            with open(log_path, "w", encoding="utf-8") as f:
                f.write(output_buffer.getvalue())

        print(f"Analysis results saved to: {log_path}")
    except Exception as e:
        # Best effort: a failed log write must not abort the analysis
        print(f"Warning: Could not save analysis log: {e}")
class GAIAProgressChecker(ProgressChecker):
    """Progress checker for GAIA benchmark runs with per-difficulty statistics.

    Extends ``ProgressChecker`` with a ``task_id -> difficulty level`` map that
    is loaded from the benchmark JSONL file, so accuracy can be reported for
    GAIA levels 1, 2 and 3 in addition to the overall numbers.
    """

    DIFFICULTY_LEVELS = [1, 2, 3]

    def __init__(self, target_path: str, task_per_run: int, data_path: str):
        """Initialize the checker and load the difficulty map.

        Args:
            target_path: Forwarded to the parent constructor.
            task_per_run: Expected number of tasks in one run; used to compute
                how many tasks remain in ``_display_summary``.
            data_path: Path to the benchmark JSONL file that provides
                ``task_id`` and difficulty metadata for every task.
        """
        # Call the parent constructor with neutral task_per_run/data_path; this
        # class stores the per-run task count and loads the data itself below.
        super().__init__(target_path, task_per_run=0, data_path="")
        # Difficulty level mapping: task_id -> level (one of DIFFICULTY_LEVELS)
        self.task_difficulty_map: Dict[str, int] = {}
        self.total_tasks_per_run = task_per_run
        # Load GAIA data if this is a GAIA validation directory
        self._load_benchmark_data(data_path)

    def _count_tasks_per_level(self) -> Dict[int, int]:
        """Return ``{level: number of known tasks}`` for every difficulty level."""
        counts = {level: 0 for level in self.DIFFICULTY_LEVELS}
        for level in self.task_difficulty_map.values():
            if level in counts:
                counts[level] += 1
        return counts

    def _load_benchmark_data(self, data_path) -> None:
        """Load GAIA-specific data and fill ``self.task_difficulty_map``.

        Every non-blank line of ``data_path`` must be a JSON object with a
        ``task_id`` key; the level is read from ``metadata["Level"]`` or
        ``metadata["level"]``. Tasks whose level is not in
        ``DIFFICULTY_LEVELS`` are left out of the map. Any error is reported
        as a warning instead of being raised, so the checker keeps working
        without difficulty statistics.
        """
        try:
            if os.path.exists(data_path):
                with open(data_path, encoding="utf-8") as f:
                    # Skip blank lines (e.g. a trailing newline) instead of
                    # letting one of them abort the whole load.
                    benchmark_data = [json.loads(line) for line in f if line.strip()]
                print(f"Loaded {len(benchmark_data)} tasks from {data_path}")
                for record in benchmark_data:
                    task_id = record["task_id"]
                    # "metadata" may be missing or explicitly null.
                    metadata = record.get("metadata") or {}
                    difficulty_level = (
                        metadata.get("Level") or metadata.get("level") or 0
                    )
                    # Accept levels serialized as strings such as "1".
                    if isinstance(difficulty_level, str):
                        text = difficulty_level.strip()
                        difficulty_level = int(text) if text.isdecimal() else 0
                    if difficulty_level in self.DIFFICULTY_LEVELS:
                        self.task_difficulty_map[task_id] = difficulty_level
                level_counts = self._count_tasks_per_level()
                print(f"Difficulty level distribution: {level_counts}")
        except Exception as e:
            print(f"Warning: Could not load GAIA data: {e}")

    def _update_difficulty_stats(
        self, stats: GAIATaskStats, task_id: str, is_correct: bool
    ) -> None:
        """Update difficulty level statistics for a completed task.

        Tasks that are absent from ``self.task_difficulty_map`` are ignored.
        """
        if task_id not in self.task_difficulty_map:
            return
        difficulty_level = self.task_difficulty_map[task_id]
        if difficulty_level == 1:
            stats.level1_completed += 1
            if is_correct:
                stats.level1_correct += 1
        elif difficulty_level == 2:
            stats.level2_completed += 1
            if is_correct:
                stats.level2_correct += 1
        elif difficulty_level == 3:
            stats.level3_completed += 1
            if is_correct:
                stats.level3_correct += 1

    def analyze_run_directory(
        self, run_dir: str, task_id_pattern: str
    ) -> Tuple[GAIATaskStats, Dict[str, bool]]:
        """Analyze a single run directory and return statistics (GAIA-specific)

        Args:
            run_dir: Directory that holds the task log files of one run.
            task_id_pattern: Pattern passed to ``_get_latest_task_files`` and
                ``_extract_task_id`` to recognise task ids in file names.

        Returns:
            Tuple[GAIATaskStats, Dict[str, bool]]: Statistics and a mapping of
            task_id -> is_correct for every completed task whose id could be
            extracted.
        """
        # Use the parent class implementation directly
        latest_files = self._get_latest_task_files(run_dir, task_id_pattern)
        stats = GAIATaskStats(total=len(latest_files))
        completed_files = []
        task_results = {}  # Track task_id -> is_correct mapping
        for json_file in latest_files:
            try:
                with open(json_file, "r", encoding="utf-8") as f:
                    data = json.load(f)
                status = data.get("status", "")
                if status == "running":
                    stats.running += 1
                elif self._is_task_completed(data):
                    stats.completed += 1
                    completed_files.append(json_file)
                    judge_result = data.get("final_judge_result", None)
                    is_correct = judge_result is not None and self._is_judge_correct(
                        judge_result
                    )
                    if is_correct:
                        stats.judge_correct += 1
                    # Check if final_boxed_answer contains "No \\boxed{} content found"
                    final_boxed_answer = data.get("final_boxed_answer", "")
                    if (
                        isinstance(final_boxed_answer, str)
                        and "No \\boxed{} content found" in final_boxed_answer
                    ):
                        stats.no_boxed_found += 1
                    task_id = self._extract_task_id(
                        os.path.basename(json_file), task_id_pattern
                    )
                    if task_id:
                        self._update_difficulty_stats(stats, task_id, is_correct)
                        task_results[task_id] = is_correct
                    # Calculate turns for completed tasks
                    turns = self._calculate_turns(data)
                    if turns > 0:
                        stats.total_turns += turns
                        stats.completed_tasks_with_turns += 1
                else:
                    stats.failed += 1
            except Exception as e:
                # An unreadable or malformed log counts as a failed task.
                print(f"Warning: Could not process {json_file}: {e}")
                stats.failed += 1
        stats.completed_files = completed_files
        return stats, task_results

    def run_analysis(
        self, benchmark_name_std: str, task_id_pattern: str
    ) -> GAIASummaryStats:
        """Run the complete analysis and return summary statistics

        Prints one status line per run directory, then the aggregated summary.
        """
        self.run_dirs = self.find_run_directories()
        summary = GAIASummaryStats()
        run_stats_list = []  # Store statistics for each run
        all_completed_files = []  # Collect all completed files for timing analysis
        all_task_results = {}  # Collect task_id -> list of is_correct across all runs
        print()
        print("=" * 80)
        print(f"Analyzing benchmark progress for: {self.target_path}")
        print(f"Generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print("=" * 80)
        # Analyze each run directory
        for run_dir in self.run_dirs:
            run_name = os.path.basename(run_dir)
            stats, task_results = self.analyze_run_directory(run_dir, task_id_pattern)
            if stats.total == 0:
                print(f"{run_name}: No task files found")
                print()
                continue
            # Display run statistics in a single line
            run_info = f"[{run_name}] Completed: {stats.completed} | Running: {stats.running} | Failed: {stats.failed}"
            # Add accuracy information
            if stats.completed > 0:
                run_info += f" | Accuracy: {stats.judge_correct}/{stats.completed} ({stats.judge_accuracy:.1f}%)"
            # Add average turns information (show even if some tasks are still running)
            if stats.completed_tasks_with_turns > 0:
                run_info += f" | Avg Turns: {stats.average_turns:.1f}"
            print(run_info)
            print()
            # Store run statistics for later display
            run_stats_list.append((run_name, stats))
            # Collect completed files for timing analysis
            all_completed_files.extend(stats.completed_files)
            # Collect task results for Pass@n calculation
            for task_id, is_correct in task_results.items():
                all_task_results.setdefault(task_id, []).append(is_correct)
            # Update summary statistics
            self._update_summary_stats(summary, stats)
        # Display summary after all runs are processed
        self._display_summary(
            summary,
            run_stats_list,
            all_completed_files,
            benchmark_name_std,
            all_task_results,
        )
        return summary

    def _update_summary_stats(
        self, summary: GAIASummaryStats, stats: GAIATaskStats
    ) -> None:
        """Update summary statistics with data from a single run"""
        summary.total_tasks += stats.total
        summary.total_completed += stats.completed
        summary.total_running += stats.running
        summary.total_failed += stats.failed
        summary.total_judge_correct += stats.judge_correct
        summary.total_no_boxed_found += stats.no_boxed_found
        # Update difficulty level summary stats
        summary.level1_completed += stats.level1_completed
        summary.level1_correct += stats.level1_correct
        summary.level2_completed += stats.level2_completed
        summary.level2_correct += stats.level2_correct
        summary.level3_completed += stats.level3_completed
        summary.level3_correct += stats.level3_correct

    def _display_summary(
        self,
        summary: GAIASummaryStats,
        run_stats_list: List[Tuple[str, GAIATaskStats]],
        completed_files: List[str],
        benchmark_name_std: str,
        all_task_results: Dict[str, List[bool]] = None,
    ):
        """Display summary statistics

        Args:
            summary: Totals aggregated over all runs.
            run_stats_list: ``(run_name, stats)`` pairs, one per analyzed run.
            completed_files: Log files of all completed tasks; used for the
                elapsed-time and completion estimates.
            benchmark_name_std: Benchmark name; not used by this method.
            all_task_results: Optional ``task_id -> [is_correct per run]`` map
                used for the Pass@n line.
        """
        print("=" * 80)
        print("SUMMARY STATISTICS")
        print("=" * 80)
        # The expected number of tasks per level does not depend on the run,
        # so count it once instead of once per run.
        level_totals = self._count_tasks_per_level()
        total_level1 = level_totals.get(1, 0)
        total_level2 = level_totals.get(2, 0)
        total_level3 = level_totals.get(3, 0)
        # Estimate completion time using overall progress rate
        if summary.total_completed > 0:
            num_runs = len(run_stats_list) if run_stats_list else 1
            expected_total_tasks = self.total_tasks_per_run * num_runs
            remaining_tasks = expected_total_tasks - summary.total_completed
            earliest_start = find_earliest_start_time(completed_files)
            last_end = find_latest_end_time(completed_files)
            completion_estimate = estimate_completion_time(
                expected_total_tasks, summary.total_completed, completed_files
            )
            print(
                f"Current Tasks: {summary.total_tasks} ({summary.total_completed} completed, {summary.total_running} running)"
            )
            print(f"Remaining Tasks to Complete: {remaining_tasks}")
            if earliest_start:
                elapsed_time = last_end - earliest_start
                elapsed_minutes = elapsed_time.total_seconds() / 60
                overall_rate = (
                    summary.total_completed / elapsed_minutes
                    if elapsed_minutes > 0
                    else 0
                )
                print(f"Elapsed Time: {elapsed_minutes:.1f} minutes")
                print(f"Completion Rate: {overall_rate:.2f} tasks/minute")
            print(f"Estimated Time to Complete: {completion_estimate}")
        # Display each run's correct percentage
        if run_stats_list:
            print()
            print("INDIVIDUAL RUN ACCURACIES:")
            for run_name, stats in run_stats_list:
                if stats.completed > 0:
                    accuracy_bar = create_progress_bar(stats.judge_accuracy)
                    print(
                        f" {run_name}: {stats.judge_correct}/{stats.completed} {accuracy_bar}"
                    )
                    # Add difficulty level information for each run
                    if (
                        stats.level1_completed > 0
                        or stats.level2_completed > 0
                        or stats.level3_completed > 0
                    ):
                        # Format per level: correct/completed/expected (accuracy)
                        difficulty_info = (
                            f" L1: {stats.level1_correct}/{stats.level1_completed}/{total_level1} ({stats.level1_accuracy:.1f}%) | "
                            f"L2: {stats.level2_correct}/{stats.level2_completed}/{total_level2} ({stats.level2_accuracy:.1f}%) | "
                            f"L3: {stats.level3_correct}/{stats.level3_completed}/{total_level3} ({stats.level3_accuracy:.1f}%)"
                        )
                        print(f" {difficulty_info}")
                        print()
                else:
                    print(
                        f" {run_name}: {stats.judge_correct}/{stats.completed} (N/A)"
                    )
            # Display mean accuracy and standard deviation (Pass@1 Acc (Avg@n))
            num_runs = len(run_stats_list)
            mean_acc, std_acc = summary.average_run_accuracy(run_stats_list)
            if mean_acc > 0:
                print()
                if num_runs > 1:
                    print(
                        f"Pass@1 Acc (Avg@{num_runs}): {mean_acc:.1f}% ± {std_acc:.1f}%"
                    )
                else:
                    print(f"MEAN ACCURACY: {mean_acc:.1f}% ± {std_acc:.1f}%")
            # Display Pass@n if multiple runs
            if num_runs > 1 and all_task_results:
                # Use the first run's total as reference
                first_run_total = (
                    run_stats_list[0][1].total
                    if run_stats_list
                    else summary.total_tasks
                )
                pass_at_n_count, pass_at_n_percentage = self._calculate_pass_at_n(
                    all_task_results, first_run_total
                )
                pass_at_n_bar = create_progress_bar(pass_at_n_percentage)
                print(
                    f"Pass@{num_runs}: {pass_at_n_count}/{first_run_total} {pass_at_n_bar}"
                )
        # Display no boxed content found statistics
        if summary.total_completed > 0:
            print(
                f"No \\boxed{{}} content found: {summary.total_no_boxed_found}/{summary.total_completed} ({summary.total_no_boxed_found / summary.total_completed * 100:.1f}%)"
            )
        # Display overall judge accuracy after individual runs
        if summary.total_completed > 0:
            print()
            accuracy_bar = create_progress_bar(summary.total_judge_accuracy)
            print(
                f"OVERALL JUDGE ACCURACY: {summary.total_judge_correct}/{summary.total_completed} {accuracy_bar}"
            )
            # Calculate and display overall average turns
            total_turns = sum(stats.total_turns for _, stats in run_stats_list)
            total_tasks_with_turns = sum(
                stats.completed_tasks_with_turns for _, stats in run_stats_list
            )
            if total_tasks_with_turns > 0:
                overall_avg_turns = total_turns / total_tasks_with_turns
                print(f"OVERALL AVERAGE TURNS: {overall_avg_turns:.1f}")
        # Display difficulty level summary if available
        if (
            summary.level1_completed > 0
            or summary.level2_completed > 0
            or summary.level3_completed > 0
        ):
            print()
            print("DIFFICULTY LEVEL SUMMARY:")
            print(
                f" L1: {summary.level1_correct}/{summary.level1_completed}/{total_level1} ({summary.level1_accuracy:.1f}%) | L2: {summary.level2_correct}/{summary.level2_completed}/{total_level2} ({summary.level2_accuracy:.1f}%) | L3: {summary.level3_correct}/{summary.level3_completed}/{total_level3} ({summary.level3_accuracy:.1f}%)"
            )
        print("=" * 80)
        print()
================================================
FILE: apps/miroflow-agent/benchmarks/common_benchmark.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import asyncio
import gc
import json
import os
import random
import re
from abc import ABC
from concurrent.futures import ProcessPoolExecutor
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import hydra
# Import from the new modular structure
from evaluators.eval_utils import verify_answer_for_datasets
from omegaconf import DictConfig, OmegaConf
from src.core.pipeline import (
create_pipeline_components,
execute_task_pipeline,
)
from src.logging.summary_time_cost import generate_summary
from src.utils.prompt_utils import (
FAILURE_EXPERIENCE_FOOTER,
FAILURE_EXPERIENCE_HEADER,
FAILURE_EXPERIENCE_ITEM,
FORMAT_ERROR_MESSAGE,
)
def _task_worker(task_dict, cfg_dict, evaluator_kwargs):
    """
    Execute one benchmark task inside a worker process.

    ProcessPoolExecutor can only dispatch module-level callables with
    serializable arguments, so the task, the config and the evaluator options
    arrive as plain containers and are rebuilt here.

    Args:
        task_dict: Task fields (``task_id``, ``task_question``, ``ground_truth``
            and optionally ``file_path`` / ``metadata``).
        cfg_dict: Configuration as a plain container, turned back into an
            OmegaConf object.
        evaluator_kwargs: Constructor options for ``GenericEvaluator``;
            ``data_dir`` and ``benchmark_name`` are required.

    Returns:
        The task result converted to a plain dict with ``asdict``.
    """
    import asyncio

    from omegaconf import OmegaConf

    def _silence_loop_errors(loop, context):
        # Swallow asyncio's internal reports (such as "Task exception was
        # never retrieved") to keep the worker output clean.
        pass

    # Bring the configuration back to life in this process.
    worker_cfg = OmegaConf.create(cfg_dict)

    # Bring the task back to life.
    benchmark_task = BenchmarkTask(
        task_id=task_dict["task_id"],
        task_question=task_dict["task_question"],
        ground_truth=task_dict["ground_truth"],
        file_path=task_dict.get("file_path"),
        metadata=task_dict.get("metadata", {}),
    )

    # Optional evaluator settings and the value used when one is missing.
    field_defaults = {
        "metadata_file": "metadata.jsonl",
        "task_id_field": "task_id",
        "question_field": "task_question",
        "ground_truth_field": "ground_truth",
    }
    worker_evaluator = GenericEvaluator(
        data_dir=evaluator_kwargs["data_dir"],
        benchmark_name=evaluator_kwargs["benchmark_name"],
        cfg=worker_cfg,
        file_name_field=evaluator_kwargs.get("file_name_field"),
        **{
            option: evaluator_kwargs.get(option, fallback)
            for option, fallback in field_defaults.items()
        },
    )

    # Drive the coroutine on a loop owned by this call.
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    event_loop.set_exception_handler(_silence_loop_errors)
    try:
        outcome = event_loop.run_until_complete(
            worker_evaluator.run_single_task(benchmark_task)
        )
        # A dict crosses the process boundary more easily than the dataclass.
        return asdict(outcome)
    finally:
        event_loop.close()
@dataclass
class BenchmarkTask:
    """Generic benchmark task data structure"""

    # Identifier of the task; it is embedded in log file names.
    task_id: str
    # Question text given to the agent.
    task_question: str
    # Reference answer. May be None: run_single_task only verifies an answer
    # when ``ground_truth is not None``.
    ground_truth: Optional[str]
    # Optional file attached to the task.
    file_path: Optional[str] = None
    # Free-form extra information; it is passed to the answer verifier.
    metadata: Dict[str, Any] = field(default_factory=dict)
    model_boxed_answer: str = ""
    status: str = "pending"  # pending, success, failed
@dataclass
class BenchmarkResult:
    """Generic benchmark evaluation result structure"""

    task_id: str
    task_question: str
    # Reference answer copied from the task. May be None, in which case
    # run_single_task reports "TEST_SET_MODE" instead of a pass/fail verdict.
    ground_truth: Optional[str]
    file_path: Optional[str]
    status: str
    model_boxed_answer: str = ""
    metadata: Dict[str, Any] = field(default_factory=dict)
    error_message: str = ""
    final_judge_result: Optional[str] = None
    judge_type: Optional[str] = None
    log_file_path: Optional[str] = None
    # Pass@K support fields
    attempts: List[Dict[str, Any]] = field(default_factory=list)  # Store all attempts
    pass_at_k_success: bool = False  # Whether task passed using pass@k evaluation
    k_value: int = 1  # The k value used for this evaluation
class BenchmarkEvaluator(ABC):
"""Abstract base class for benchmark evaluators"""
def __init__(self, data_dir: str, benchmark_name: str, cfg: DictConfig):
    """
    Set up the evaluator state and build the shared pipeline components.

    Args:
        data_dir: Path to benchmark data directory
        benchmark_name: Name of the benchmark
        cfg: The Hydra configuration object
    """
    self.data_dir = Path(data_dir)
    self.benchmark_name = benchmark_name
    self.cfg = cfg

    # Number of attempts per task; a single attempt unless configured.
    execution_cfg = cfg.benchmark.execution
    self.pass_at_k = execution_cfg.get("pass_at_k", 1)

    self.tasks: List[BenchmarkTask] = []
    self.results: List[BenchmarkResult] = []

    # The format-error retry budget is stored in the agent section because it
    # belongs to context management.
    agent_cfg = cfg.agent
    self.context_compress_limit = agent_cfg.get("context_compress_limit", 0)

    # LLM provider and model are taken straight from the config object.
    llm_cfg = cfg.llm
    self.llm_provider = llm_cfg.provider
    self.llm_model = llm_cfg.model_name

    # Build the tool managers and the output formatter once per evaluator.
    print("Initializing pipeline components...")
    main_manager, sub_managers, formatter = create_pipeline_components(cfg)
    self.main_agent_tool_manager = main_manager
    self.sub_agent_tool_managers = sub_managers
    self.output_formatter = formatter
    print(
        f"Pipeline components initialized successfully! Using pass@{self.pass_at_k}"
    )
def get_log_dir(self) -> Path:
    """Return the run directory of the active Hydra job as a ``Path``."""
    hydra_runtime = hydra.core.hydra_config.HydraConfig.get()
    return Path(hydra_runtime.run.dir)
async def run_single_task(self, task: BenchmarkTask) -> BenchmarkResult:
    """
    Run inference for a single benchmark task with pass@k support

    Up to ``self.pass_at_k`` attempts are made and the loop stops at the
    first attempt judged "CORRECT". Each attempt first looks for an existing
    log file in the current log directory and reuses its answer and judge
    result; the pipeline is only executed when no answer is stored or the
    stored answer equals ``FORMAT_ERROR_MESSAGE``. Format errors are retried
    up to ``self.context_compress_limit`` times, feeding the failure
    summaries of earlier retries back into the task description.

    Exceptions are not propagated: they are recorded in
    ``result.error_message`` and the status is set to "failed".

    Args:
        task: BenchmarkTask object

    Returns:
        BenchmarkResult object
    """
    print(f"Processing task {task.task_id} with pass@{self.pass_at_k}")
    result = BenchmarkResult(
        task_id=task.task_id,
        task_question=task.task_question,
        ground_truth=task.ground_truth,
        file_path=task.file_path,
        model_boxed_answer="",
        status="pending",
        metadata=task.metadata.copy(),
        k_value=self.pass_at_k,
    )
    logs_dir = self.get_log_dir()
    found_correct_answer = False
    # Print debug info about log directory
    print(f" Current log directory: {logs_dir}")
    try:
        # Prepare task
        task_description, task_file_path = self.prepare_task_description(task)
        # Run up to k attempts (with early stopping when correct answer found)
        for attempt in range(1, self.pass_at_k + 1):
            print(f" Attempt {attempt}/{self.pass_at_k} for task {task.task_id}")
            # Index of the format retry to run next; advanced below when an
            # existing log shows that a retry already finished.
            format_retry_count = 0
            # Check if log file exists for this specific attempt in current directory
            log_pattern = f"task_{task.task_id}_attempt-{attempt}_*.json"
            matching_logs = []
            # Search only in current log directory
            if logs_dir.exists():
                dir_logs = sorted(list(logs_dir.glob(log_pattern)))
                if dir_logs:
                    matching_logs.extend(dir_logs)
            if matching_logs:
                # Sort by timestamp in filename to get the most recent
                def extract_timestamp(file_path):
                    filename = file_path.name
                    # Extract timestamp from filename like: task_xxx_attempt-1_format-retry-0_2025-08-13-10-13-20.json
                    # The timestamp is the last part before .json
                    if "_" in filename and filename.endswith(".json"):
                        timestamp_part = filename.split("_")[-1].replace(
                            ".json", ""
                        )
                        # Convert timestamp to datetime for proper sorting
                        from datetime import datetime
                        return datetime.strptime(
                            timestamp_part, "%Y-%m-%d-%H-%M-%S"
                        )
                    # NOTE(review): this fallback returns a str while the
                    # branch above returns a datetime; a mix of both key types
                    # cannot be compared by sorted(). An unparsable timestamp
                    # raises ValueError from strptime. Either error ends up in
                    # the outer except and marks the task as failed.
                    return filename
                matching_logs = sorted(matching_logs, key=extract_timestamp)
            # Per-attempt record; appended to result.attempts further down.
            attempt_result = {
                "attempt_number": attempt,
                "model_boxed_answer": "",
                "status": "pending",
                "log_file_path": None,
                "final_judge_result": None,
                "judge_type": None,
                "is_correct": False,
            }
            # Try to load existing result for this attempt
            if matching_logs:
                # Latest log of this attempt according to the sort above.
                log_file = matching_logs[-1]
                attempt_result["log_file_path"] = str(log_file)
                print(
                    f" Found existing log for attempt {attempt}: {log_file.name}"
                )
                # The retry index is encoded in the file name ("...retry-<n>...").
                match = re.search(r"retry-(\d+)", os.path.basename(str(log_file)))
                if match:
                    format_retry_count = int(match.group(1))
                else:
                    raise ValueError(
                        f"Failed to extract retry number from log file: {log_file}"
                    )
                try:
                    with open(log_file) as f:
                        log_data = json.loads(f.read())
                        # A log with status "success" means that retry ran to
                        # completion, so a re-run must use the next index.
                        if log_data.get("status") == "success":
                            format_retry_count += 1
                        if log_data.get("final_boxed_answer"):
                            attempt_result["model_boxed_answer"] = log_data[
                                "final_boxed_answer"
                            ]
                            attempt_result["status"] = log_data.get("status")
                            # Check if we already have judge result in log
                            if log_data.get("final_judge_result"):
                                attempt_result["final_judge_result"] = log_data[
                                    "final_judge_result"
                                ]
                                attempt_result["judge_type"] = log_data.get(
                                    "judge_type", ""
                                )
                                attempt_result["is_correct"] = (
                                    log_data["final_judge_result"] == "CORRECT"
                                )
                                # Load evaluation details if available
                                if log_data.get("eval_details"):
                                    attempt_result["eval_details"] = log_data[
                                        "eval_details"
                                    ]
                            print(
                                f" Loaded existing result: {attempt_result['model_boxed_answer']}"
                            )
                except Exception as e:
                    # An unreadable log is not fatal: the attempt simply has
                    # no stored answer and is executed below.
                    print(f" Error loading log file {log_file}: {e}")
            # Run inference if no existing result or if we have a format error
            if (
                not attempt_result["model_boxed_answer"]
                or attempt_result["model_boxed_answer"] == FORMAT_ERROR_MESSAGE
            ):
                # Try to get a valid response with format retry
                print(f"TASK ID: {task.task_id}, ATTEMPT: {attempt}")
                # The retry budget is taken from the context_compress_limit setting.
                max_format_retries = self.context_compress_limit
                # Track accumulated failure experiences for this attempt
                # Start with the original task description
                current_task_description = task_description
                failure_experiences = []
                # Resume: Recover failure experiences from previous retry logs
                if format_retry_count > 0 and logs_dir.exists():
                    print(
                        f" Resuming from retry {format_retry_count}, recovering previous failure experiences..."
                    )
                    for prev_retry in range(format_retry_count):
                        prev_log_pattern = f"task_{task.task_id}_attempt-{attempt}_format-retry-{prev_retry}_*.json"
                        prev_logs = sorted(list(logs_dir.glob(prev_log_pattern)))
                        if prev_logs:
                            prev_log_file = prev_logs[-1]  # Get the latest one
                            try:
                                with open(
                                    prev_log_file, "r", encoding="utf-8"
                                ) as f:
                                    prev_log_data = json.load(f)
                                # Extract failure experience from trace_data
                                trace_data = prev_log_data.get("trace_data", {})
                                prev_failure_exp = trace_data.get(
                                    "failure_experience_summary"
                                )
                                if prev_failure_exp:
                                    failure_experiences.append(prev_failure_exp)
                                    print(
                                        f" Recovered failure experience from retry {prev_retry}"
                                    )
                            except Exception as e:
                                print(
                                    f" Warning: Failed to load previous log {prev_log_file}: {e}"
                                )
                    # Rebuild enhanced task description with recovered failure experiences
                    if failure_experiences:
                        current_task_description += FAILURE_EXPERIENCE_HEADER
                        for idx, exp in enumerate(failure_experiences, 1):
                            current_task_description += (
                                FAILURE_EXPERIENCE_ITEM.format(
                                    attempt_number=idx,
                                    failure_summary=exp,
                                )
                            )
                        current_task_description += FAILURE_EXPERIENCE_FOOTER
                        print(
                            f" Recovered {len(failure_experiences)} failure experience(s) from previous retries"
                        )
                # Each pass through this loop runs the pipeline once; it is left
                # on a valid answer, on an exhausted retry budget or on an error.
                while format_retry_count <= max_format_retries:
                    try:
                        # Check if this is the final retry (no more chances after this)
                        is_final_retry = format_retry_count == max_format_retries
                        (
                            response,
                            final_boxed_answer,
                            log_file_path,
                            failure_experience_summary,
                        ) = await execute_task_pipeline(
                            cfg=self.cfg,
                            task_id=f"{task.task_id}_attempt-{attempt}_format-retry-{format_retry_count}",
                            task_file_name=task_file_path,
                            task_description=current_task_description,
                            main_agent_tool_manager=self.main_agent_tool_manager,
                            sub_agent_tool_managers=self.sub_agent_tool_managers,
                            output_formatter=self.output_formatter,
                            ground_truth=task.ground_truth,
                            log_dir=str(self.get_log_dir()),
                            is_final_retry=is_final_retry,
                        )
                        attempt_result["model_boxed_answer"] = (
                            final_boxed_answer if final_boxed_answer else ""
                        )
                        attempt_result["log_file_path"] = log_file_path
                        # Check for format error
                        if (
                            attempt_result["model_boxed_answer"]
                            == FORMAT_ERROR_MESSAGE
                        ):
                            format_retry_count += 1
                            if format_retry_count <= max_format_retries:
                                # Use the model-generated failure experience summary
                                print(
                                    f" Format error detected, using model-generated failure summary for retry {format_retry_count}..."
                                )
                                if failure_experience_summary:
                                    failure_experiences.append(
                                        failure_experience_summary
                                    )
                                    # Build enhanced task description with accumulated failure experiences
                                    # Start fresh from original task_description each time
                                    current_task_description = task_description
                                    current_task_description += (
                                        FAILURE_EXPERIENCE_HEADER
                                    )
                                    for idx, exp in enumerate(
                                        failure_experiences, 1
                                    ):
                                        current_task_description += (
                                            FAILURE_EXPERIENCE_ITEM.format(
                                                attempt_number=idx,
                                                failure_summary=exp,
                                            )
                                        )
                                    current_task_description += (
                                        FAILURE_EXPERIENCE_FOOTER
                                    )
                                    print(
                                        f" Enhanced task description with {len(failure_experiences)} failure experience(s)"
                                    )
                                else:
                                    # The description built so far is kept as is.
                                    print(
                                        " No failure experience summary generated, retrying without enhancement..."
                                    )
                                continue
                            else:
                                # Exceeded format retry limit
                                # The attempt still counts as "success"; the
                                # answer text records the exhausted retries.
                                attempt_result["status"] = "success"
                                attempt_result["model_boxed_answer"] = (
                                    f"{FORMAT_ERROR_MESSAGE} (after {max_format_retries} retries)"
                                )
                                attempt_result["error_message"] = (
                                    f"Exceeded format error retry limit ({max_format_retries})"
                                )
                                break
                        else:
                            # Got valid response, success
                            attempt_result["status"] = "success"
                            break
                    except Exception as e:
                        # A pipeline error ends this attempt without another retry.
                        attempt_result["status"] = "failed"
                        attempt_result["error_message"] = str(e)
                        print(
                            f" Error in attempt {attempt}, format retry {format_retry_count}: {e}"
                        )
                        break
            # Perform LLM verification if we have an answer and haven't verified yet
            if (
                attempt_result["model_boxed_answer"]
                and attempt_result["final_judge_result"] is None
                and task.ground_truth is not None
            ):
                print(f" Verifying answer for attempt {attempt}...")
                try:
                    (
                        evaluation_result,
                        judge_type,
                        eval_details,
                    ) = await verify_answer_for_datasets(
                        benchmark_name=self.benchmark_name,
                        question=task.task_question,
                        target=task.ground_truth,
                        predicted_answer=attempt_result["model_boxed_answer"],
                        metadata=task.metadata,
                    )
                    attempt_result["final_judge_result"] = evaluation_result
                    attempt_result["judge_type"] = judge_type
                    attempt_result["is_correct"] = evaluation_result == "CORRECT"
                    # Store evaluation details (e.g., for DeepSearchQA metrics)
                    if eval_details:
                        attempt_result["eval_details"] = eval_details
                    # Update the log file with verification result
                    if attempt_result["log_file_path"]:
                        self._update_log_file_with_evaluation(
                            attempt_result["model_boxed_answer"],
                            attempt_result["log_file_path"],
                            evaluation_result,
                            judge_type,
                            eval_details,  # Pass eval_details to save in log file
                        )
                    if attempt_result["is_correct"]:
                        print(f" ✅ Attempt {attempt}: CORRECT!")
                        found_correct_answer = True
                    else:
                        print(
                            f" ❌ Attempt {attempt}: INCORRECT ({evaluation_result})"
                        )
                except Exception as e:
                    # A failing judge marks the attempt as "ERROR", not correct.
                    print(f" Error verifying attempt {attempt}: {e}")
                    attempt_result["final_judge_result"] = "ERROR"
                    attempt_result["judge_type"] = "error"
                    attempt_result["is_correct"] = False
            elif attempt_result["is_correct"]:
                # The judge result was loaded from an existing log.
                print(f" ✅ Attempt {attempt}: CORRECT (cached)")
                found_correct_answer = True
            elif attempt_result["final_judge_result"]:
                print(
                    f" ❌ Attempt {attempt}: INCORRECT (cached: {attempt_result['final_judge_result']})"
                )
            else:
                print(f" ⚠️ Attempt {attempt}: No valid answer to verify")
            result.attempts.append(attempt_result)
            # Update main result with the first successful attempt or best attempt so far
            if attempt == 1 or (
                attempt_result["status"] == "success"
                and not result.model_boxed_answer
            ):
                result.model_boxed_answer = attempt_result["model_boxed_answer"]
                result.log_file_path = attempt_result["log_file_path"]
                result.status = attempt_result["status"]
                if "error_message" in attempt_result:
                    result.error_message = attempt_result["error_message"]
            # Early stopping: if we found a correct answer, we can stop
            if found_correct_answer:
                print(
                    f" 🎯 Found correct answer! Stopping early after {attempt} attempts."
                )
                break
    except Exception as e:
        # Anything not handled per attempt (task preparation, log name
        # parsing, ...) fails the whole task.
        result.error_message = str(e)
        result.status = "failed"
        print(f"Error processing task {task.task_id}: {e}")
    finally:
        result.pass_at_k_success = found_correct_answer
        # Set main result judge result based on pass@k outcome
        if found_correct_answer:
            result.final_judge_result = "PASS_AT_K_SUCCESS"
            result.judge_type = "pass_at_k"
        else:
            # Without a ground truth nothing can be judged.
            if result.ground_truth is None:
                result.final_judge_result = "TEST_SET_MODE"
            else:
                result.final_judge_result = "PASS_AT_K_FAILED"
            result.judge_type = "pass_at_k"
        print(f"Task {task.task_id} completed with {len(result.attempts)} attempts")
        if result.ground_truth is not None:
            print(
                f" Pass@{self.pass_at_k} result: {'✅ SUCCESS' if found_correct_answer else '❌ FAILED'}"
            )
        gc.collect()
    return result
def _run_single_task_sync(self, task: BenchmarkTask) -> BenchmarkResult:
"""Sync wrapper for run_single_task to be used in threads"""
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
# Set exception handler to suppress "Task exception was never retrieved" warnings
def exception_handler(loop, context):
# Suppress all asyncio internal warnings for cleaner output
pass
loop.set_exception_handler(exception_handler)
try:
# Direct await is simpler and cleaner than gather for single task
return loop.run_until_complete(self.run_single_task(task))
finally:
loop.close()
def run_parallel_inference(
    self, tasks: List[BenchmarkTask], max_concurrent: int = 3
) -> List[BenchmarkResult]:
    """Run inference on multiple tasks in parallel using multiprocessing

    Every task is executed by ``_task_worker`` in a worker process. A task
    whose worker raises is reported as a "failed" result instead of aborting
    the batch. On Ctrl-C the pending futures are cancelled, the worker
    processes are terminated and the interrupt is re-raised.

    Args:
        tasks: Tasks to run.
        max_concurrent: Maximum number of worker processes.

    Returns:
        One result per task, in the order of ``tasks``; the same list is
        stored in ``self.results``.
    """
    print(
        f"Running inference on {len(tasks)} tasks with max_concurrent={max_concurrent} (multiprocessing)"
    )
    # Serialize config
    cfg_dict = OmegaConf.to_container(self.cfg, resolve=True)
    # Shuffle tasks to avoid order bias and improve balancing
    shuffled_tasks = tasks.copy()
    random.shuffle(shuffled_tasks)
    # Prepare evaluator kwargs for worker processes
    evaluator_kwargs = {
        "data_dir": str(self.data_dir),
        "benchmark_name": self.benchmark_name,
    }
    # Add GenericEvaluator specific kwargs if available
    if hasattr(self, "metadata_file"):
        evaluator_kwargs["metadata_file"] = str(self.metadata_file.name)
    for option in (
        "task_id_field",
        "question_field",
        "ground_truth_field",
        "file_name_field",
    ):
        if hasattr(self, option):
            evaluator_kwargs[option] = getattr(self, option)
    # Prepare serializable arguments for worker processes
    worker_args = []
    for task in shuffled_tasks:
        task_dict = {
            "task_id": task.task_id,
            "task_question": task.task_question,
            "ground_truth": task.ground_truth,
            "file_path": task.file_path,
            "metadata": task.metadata,
        }
        worker_args.append((task_dict, cfg_dict, evaluator_kwargs))
    # Original task objects by id, used to build error results.
    tasks_by_id = {task.task_id: task for task in shuffled_tasks}
    results_dict = {}  # Store results by task_id to maintain order
    executor = None
    # Bound before the try block so the interrupt handler can always iterate
    # it, even when Ctrl-C arrives before the first task is submitted.
    future_to_task_id = {}
    # Use ProcessPoolExecutor for true parallelism (bypasses GIL)
    try:
        executor = ProcessPoolExecutor(max_workers=max_concurrent)
        # Submit all tasks
        for args in worker_args:
            task_dict = args[0]  # First element is task_dict
            future = executor.submit(_task_worker, *args)
            future_to_task_id[future] = task_dict["task_id"]
        # Collect results as they complete
        from concurrent.futures import as_completed

        for future in as_completed(future_to_task_id):
            task_id = future_to_task_id[future]
            try:
                result_dict = future.result()
                # Reconstruct BenchmarkResult from dict
                result = BenchmarkResult(**result_dict)
                results_dict[task_id] = result
                completed = len(results_dict)
                print(
                    f"Progress: {completed}/{len(shuffled_tasks)} tasks completed"
                )
            except Exception as e:
                print(f"Exception in task {task_id}: {e}")
                # Get original task for error result
                original_task = tasks_by_id[task_id]
                error_result = BenchmarkResult(
                    task_id=original_task.task_id,
                    task_question=original_task.task_question,
                    ground_truth=original_task.ground_truth,
                    file_path=original_task.file_path,
                    model_boxed_answer="",
                    status="failed",
                    metadata=original_task.metadata.copy(),
                    error_message=str(e),
                )
                results_dict[task_id] = error_result
    except KeyboardInterrupt:
        print("\n⚠️ Received interrupt signal, shutting down gracefully...")
        if executor:
            print(" Cancelling pending tasks and terminating worker processes...")
            # Cancel all pending futures
            for future in future_to_task_id:
                future.cancel()
            # Forcefully terminate worker processes
            # HACK: the executor has no public way to stop running workers, so
            # its internal process table is used.
            if hasattr(executor, "_processes") and executor._processes:
                # Snapshot the table: the executor may change it while the
                # workers are going down.
                workers = list(executor._processes.items())
                for pid, process in workers:
                    try:
                        if process.is_alive():
                            print(f" Terminating worker process {pid}...")
                            process.terminate()
                    except Exception as e:
                        print(
                            f" Warning: Failed to terminate process {pid}: {e}"
                        )
                # Give processes a short time to terminate gracefully
                import time

                time.sleep(0.5)
                # Force kill any remaining processes
                for pid, process in workers:
                    try:
                        if process.is_alive():
                            print(f" Force killing worker process {pid}...")
                            process.kill()
                    except Exception as e:
                        print(f" Warning: Failed to kill process {pid}: {e}")
            # Shutdown executor without waiting for pending tasks
            executor.shutdown(wait=False, cancel_futures=True)
            print(" Shutdown complete.")
        raise
    finally:
        # Ensure executor is properly cleaned up
        if executor:
            try:
                executor.shutdown(wait=True)
            except Exception as e:
                # Cleanup is best effort, but the failure is reported.
                print(f" Warning: Executor cleanup failed: {e}")
    # Collect one result per submitted task
    processed_results = [results_dict[task.task_id] for task in shuffled_tasks]
    # Sort results to maintain original task order
    task_id_to_index = {task.task_id: i for i, task in enumerate(tasks)}
    processed_results.sort(
        key=lambda r: task_id_to_index.get(r.task_id, len(tasks))
    )
    self.results = processed_results
    return processed_results
def save_results(self, output_file: str) -> str:
    """
    Dump ``self.results`` to ``output_file`` in JSON Lines format.

    Missing parent directories are created first. Each result is converted
    with ``asdict`` and written as one JSON object per line, keeping
    non-ASCII characters unescaped.

    Args:
        output_file: Destination path of the JSONL file.

    Returns:
        The destination path as a string.
    """
    destination = Path(output_file)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Generator keeps the write lazy: one record is serialized at a time.
    serialized_rows = (
        json.dumps(asdict(entry), ensure_ascii=False) + "\n"
        for entry in self.results
    )
    with destination.open("w", encoding="utf-8") as handle:
        handle.writelines(serialized_rows)

    print(f"Results saved to {destination}")
    return str(destination)
def evaluate_accuracy(self) -> float:
    """
    Report per-task outcomes and return the pass@k accuracy.

    No grading happens here: the method only reads the
    ``pass_at_k_success`` flag already stored on each result.

    Returns:
        Fraction of results whose ``pass_at_k_success`` is truthy, or
        ``0.0`` when there are no results.
    """
    if not self.results:
        print("No results to evaluate")
        return 0.0

    print(
        f"Calculating pass@{self.pass_at_k} accuracy for {len(self.results)} results..."
    )

    separator = " " + "=" * 50
    passed = 0
    seen = 0
    for record in self.results:
        seen += 1

        # Per-task report
        print(f"\nTask {record.task_id}:")
        print(f" Attempts: {len(record.attempts)}")
        if record.ground_truth is not None:
            print(
                f" Pass@{self.pass_at_k}: {'✅ SUCCESS' if record.pass_at_k_success else '❌ FAILED'}"
            )
        print(separator)
        print(f" Reference: {record.ground_truth}")
        print(separator)

        if record.pass_at_k_success:
            passed += 1

    pass_at_k_accuracy = passed / seen if seen > 0 else 0.0

    print(f"\nPass@{self.pass_at_k} Final Results:")
    print(f"Tasks passed: {passed}/{seen}")
    print(f"Pass@{self.pass_at_k} Accuracy: {pass_at_k_accuracy:.2%}")
    return pass_at_k_accuracy
def _update_log_file_with_evaluation(
self,
model_boxed_answer: str,
log_file_path: str,
evaluation_result: str,
judge_type: str,
eval_details: Optional[Dict[str, Any]] = None,
):
"""Helper method to update log file with evaluation result"""
try:
log_file = Path(log_file_path)
# Read existing data
with open(log_file, "r", encoding="utf-8") as f:
log_data = json.load(f)
# Update with evaluation result
log_data["final_boxed_answer"] = model_boxed_answer
log_data["final_judge_result"] = evaluation_result
log_data["judge_type"] = judge_type
# Store evaluation details (e.g., for DeepSearchQA metrics)
if eval_details:
log_data["eval_details"] = eval_details
# Write to a temporary file and then atomically replace
temp_log_file = log_file.with_suffix(f"{log_file.suffix}.tmp")
with open(temp_log_file, "w", encoding="utf-8") as f:
json.dump(log_data, f, indent=2, ensure_ascii=False)
os.replace(temp_log_file, log_file)
print(f" Updated log file {log_file.name} with evaluation result.")
except Exception as e:
print(f" Error updating log file {log_file_path}: {e}")
class GenericEvaluator(BenchmarkEvaluator):
    """Evaluator for benchmarks whose tasks are stored as one JSON object per line."""

    def __init__(
        self,
        data_dir: str,
        benchmark_name: str,
        cfg: DictConfig,
        metadata_file: str = "metadata.jsonl",
        task_id_field: str = "task_id",
        question_field: str = "task_question",
        ground_truth_field: str = "ground_truth",
        file_name_field: Optional[str] = "file_name_field",
    ):
        """
        Set up the evaluator and remember which record keys hold each task field.

        Args:
            data_dir: Path to benchmark data directory
            benchmark_name: Name of the benchmark
            cfg: The Hydra configuration object
            metadata_file: Metadata file name, resolved against the data directory
            task_id_field: Record key holding the task ID
            question_field: Record key holding the task question
            ground_truth_field: Record key holding the ground truth answer
            file_name_field: Record key holding an attached file name, or
                None to ignore attachments.
                NOTE(review): the default is the literal key
                "file_name_field" - confirm that this is intended.
        """
        super().__init__(data_dir=data_dir, benchmark_name=benchmark_name, cfg=cfg)
        self.metadata_file = self.data_dir / metadata_file

        # Names of the record keys that map onto BenchmarkTask fields
        self.task_id_field = task_id_field
        self.question_field = question_field
        self.ground_truth_field = ground_truth_field
        self.file_name_field = file_name_field

        # Filled by load_tasks() and by an inference run respectively
        self.tasks: List[BenchmarkTask] = []
        self.results: List[BenchmarkResult] = []

    def load_tasks(self, limit: Optional[int] = None) -> List[BenchmarkTask]:
        """
        Read the metadata file and build one BenchmarkTask per parsable line.

        Lines that fail to parse or lack a required key are reported with a
        warning and skipped. ``limit`` counts lines read, not tasks built.

        Args:
            limit: Maximum number of lines to read (None or 0 for all)

        Returns:
            List of BenchmarkTask objects (also stored on ``self.tasks``)

        Raises:
            FileNotFoundError: If the metadata file does not exist
        """
        print(f"Loading tasks from {self.metadata_file}")
        if not self.metadata_file.exists():
            raise FileNotFoundError(f"Metadata file not found: {self.metadata_file}")

        # Keys consumed by dedicated BenchmarkTask fields; the rest is metadata
        reserved_keys = (
            self.task_id_field,
            self.question_field,
            self.ground_truth_field,
            self.file_name_field,
        )

        loaded = []
        with open(self.metadata_file, "r", encoding="utf-8") as handle:
            for line_index, raw_line in enumerate(handle):
                if limit and line_index >= limit:
                    break
                try:
                    record = json.loads(raw_line.strip())

                    # Attachment path, only when a key is configured and present
                    attachment = None
                    if self.file_name_field and self.file_name_field in record:
                        attachment = record[self.file_name_field]

                    extras = {
                        key: value
                        for key, value in record.items()
                        if key not in reserved_keys
                    }
                    loaded.append(
                        BenchmarkTask(
                            task_id=record[self.task_id_field],
                            task_question=record[self.question_field],
                            ground_truth=record[self.ground_truth_field],
                            file_path=attachment,
                            metadata=extras,
                        )
                    )
                except Exception as e:
                    print(f"Warning: Failed to parse line {line_index + 1}: {e}")

        gc.collect()
        self.tasks = loaded
        print(f"Loaded {len(loaded)} tasks")
        return loaded

    def prepare_task_description(
        self, task: BenchmarkTask
    ) -> Tuple[str, Optional[str]]:
        """
        Return the question text and the resolved attachment path for a task.

        Args:
            task: BenchmarkTask object

        Returns:
            Tuple of (task_description, task_file_path); the path is None
            when the task has no attachment.
        """
        if not task.file_path:
            return task.task_question, None

        # Attachment paths are relative to the data directory; resolve() makes
        # the result absolute and follows symbolic links.
        resolved = (self.data_dir / task.file_path).resolve()
        return task.task_question, str(resolved)
class CommonBenchmark:
    """Run one benchmark end to end: load tasks, run inference, score and save."""

    def __init__(self, cfg: DictConfig):
        """
        Build the evaluator described by ``cfg.benchmark``.

        Args:
            cfg: Hydra configuration object. Reads ``cfg.benchmark.name``,
                ``cfg.benchmark.data`` and the optional
                ``cfg.benchmark.evaluator_kwargs``.
        """
        self.cfg = cfg
        self.benchmark_name = cfg.benchmark.name
        evaluator_kwargs = cfg.benchmark.get("evaluator_kwargs", OmegaConf.create({}))

        # Support for legacy config structure: settings found under
        # cfg.benchmark.data override the matching evaluator keyword arguments.
        if "metadata_file" in cfg.benchmark.data:
            evaluator_kwargs["metadata_file"] = cfg.benchmark.data.metadata_file
        if "field_mapping" in cfg.benchmark.data:
            mapping = cfg.benchmark.data.field_mapping
            if "task_id_field" in mapping:
                evaluator_kwargs["task_id_field"] = mapping.task_id_field
            if "task_question_field" in mapping:
                # The legacy key is named differently from the keyword argument.
                evaluator_kwargs["question_field"] = mapping.task_question_field
            if "ground_truth_field" in mapping:
                evaluator_kwargs["ground_truth_field"] = mapping.ground_truth_field
            if "file_name_field" in mapping:
                evaluator_kwargs["file_name_field"] = mapping.file_name_field

        self.evaluator = GenericEvaluator(
            data_dir=cfg.benchmark.data.data_dir,
            benchmark_name=self.benchmark_name,
            cfg=cfg,
            **evaluator_kwargs,
        )

    def run_evaluation(self) -> float:
        """
        Run the full benchmark evaluation process.

        Loads up to ``cfg.benchmark.execution.max_tasks`` tasks, runs
        inference with ``cfg.benchmark.execution.max_concurrent`` workers,
        computes pass@k accuracy, then writes ``benchmark_results.jsonl``,
        the accuracy text file and the summary into the evaluator's log
        directory.

        Returns:
            The pass@k accuracy, or ``0.0`` when no tasks were loaded.
        """
        print(f"Starting evaluation for benchmark: {self.benchmark_name}")
        print(f"LLM Provider: {self.evaluator.llm_provider}")
        print(f"LLM Model: {self.evaluator.llm_model}")

        # Load tasks
        self.evaluator.load_tasks(limit=self.cfg.benchmark.execution.max_tasks)
        if not self.evaluator.tasks:
            print("No tasks loaded. Exiting.")
            return 0.0

        # Run inference
        print(
            f"\nStarting parallel inference with {self.cfg.benchmark.execution.max_concurrent} concurrent tasks..."
        )
        print(f"Using pass@{self.evaluator.pass_at_k} evaluation...")
        self.evaluator.run_parallel_inference(
            self.evaluator.tasks,
            max_concurrent=self.cfg.benchmark.execution.max_concurrent,
        )

        # Evaluate accuracy
        print("Evaluating accuracy...")
        accuracy = self.evaluator.evaluate_accuracy()
        print(f"\nOverall pass@{self.evaluator.pass_at_k} accuracy: {accuracy:.2%}")

        # Save results in the evaluator's log directory
        log_dir = self.evaluator.get_log_dir()
        results_path = log_dir / "benchmark_results.jsonl"
        self.evaluator.save_results(str(results_path))
        print(f"\nEvaluation completed! Results saved to {results_path}")

        # Save accuracy next to the results. The name is built directly rather
        # than by replacing ".jsonl" in the full path string, which would also
        # rewrite any directory component that happens to contain ".jsonl".
        accuracy_file = (
            log_dir
            / f"benchmark_results_pass_at_{self.evaluator.pass_at_k}_accuracy.txt"
        )
        with open(accuracy_file, "w", encoding="utf-8") as f:
            f.write(f"{accuracy:.2%}")

        # Generate and save summary
        generate_summary(log_dir)
        return accuracy
@hydra.main(config_path="../conf", config_name="config", version_base=None)
def run_benchmark(cfg: DictConfig) -> None:
    """
    Hydra entry point: echo the benchmark config, then run the evaluation.

    Args:
        cfg: Configuration composed by Hydra from ``../conf/config``.
    """
    print("Benchmark configuration:\n", OmegaConf.to_yaml(cfg.benchmark))
    CommonBenchmark(cfg).run_evaluation()


if __name__ == "__main__":
    run_benchmark()
================================================
FILE: apps/miroflow-agent/benchmarks/evaluators/__init__.py
================================================
================================================
FILE: apps/miroflow-agent/benchmarks/evaluators/calculate_average_score.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import glob
import os
import re
import statistics
import sys
def detect_pass_at_k(results_dir: str) -> tuple:
    """
    Detect the pass_at_k value used in the results directory.

    Looks for ``run_*/benchmark_results_pass_at_<k>_accuracy.txt`` below
    ``results_dir``. Candidates are examined in sorted order, so the detected
    value does not depend on the order in which the filesystem lists them,
    and a file whose ``<k>`` part is not a number is skipped instead of
    aborting detection.

    Args:
        results_dir: Directory that contains the ``run_*`` sub-directories.

    Returns:
        ``(k, accuracy_files)`` where ``accuracy_files`` is the sorted list of
        accuracy files written for that ``k``, or ``(None, [])`` when no
        usable accuracy file exists.
    """
    # Find all possible pass_at_k files
    pattern = os.path.join(
        results_dir, "run_*", "benchmark_results_pass_at_*_accuracy.txt"
    )
    all_files = sorted(glob.glob(pattern))
    if not all_files:
        print(f"No accuracy files found in {results_dir}")
        print(f"Expected pattern: {pattern}")
        return None, []

    # Map every well-formed file to its k; malformed names are left out.
    name_re = re.compile(r"benchmark_results_pass_at_(\d+)_accuracy\.txt")
    k_by_file = {}
    for path in all_files:
        match = name_re.fullmatch(os.path.basename(path))
        if match:
            k_by_file[path] = int(match.group(1))

    if not k_by_file:
        filename = os.path.basename(all_files[0])
        print(f"Cannot extract pass_at_k from filename: {filename}")
        return None, []

    # The first well-formed file (in sorted order) decides which k is reported.
    k = next(iter(k_by_file.values()))
    accuracy_files = [path for path, value in k_by_file.items() if value == k]
    return k, accuracy_files
def calculate_average_scores(results_dir: str) -> dict:
    """
    Aggregate the per-run accuracy files found under ``results_dir``.

    The pass_at_k value and the matching files come from
    ``detect_pass_at_k``. Each file is expected to hold one percentage such
    as ``"12.34%"``; files that cannot be read or parsed are reported and
    skipped.

    Returns:
        A dict with ``pass_at_k``, ``num_runs``, ``individual_scores``,
        ``average_score``, ``std_dev``, ``min_score`` and ``max_score``
        (scores are percentages), or ``None`` when nothing usable was found.
    """
    pass_at_k, accuracy_files = detect_pass_at_k(results_dir)
    if pass_at_k is None:
        return None

    print(f"Detected pass_at_{pass_at_k} files")
    print(f"Found {len(accuracy_files)} accuracy files")

    scores = []
    # Run numbers follow the sorted file order, starting at 1
    for run_number, file_path in enumerate(sorted(accuracy_files), 1):
        try:
            with open(file_path, "r") as handle:
                raw_text = handle.read().strip()
            # Drop the percent sign before converting
            score = float(raw_text.replace("%", ""))
            scores.append(score)
            print(f"Run {run_number}: {score:.2f}%")
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

    if not scores:
        print("No valid scores found")
        return None

    # stdev needs at least two samples; a single run reports 0
    spread = statistics.stdev(scores) if len(scores) > 1 else 0
    return {
        "pass_at_k": pass_at_k,
        "num_runs": len(scores),
        "individual_scores": scores,
        "average_score": statistics.mean(scores),
        "std_dev": spread,
        "min_score": min(scores),
        "max_score": max(scores),
    }
def print_results(stats: dict):
    """
    Print the aggregated statistics as a framed report on stdout.

    Args:
        stats: Statistics dict with the keys ``pass_at_k``, ``num_runs``,
            ``individual_scores``, ``std_dev``, ``min_score``, ``max_score``
            and ``average_score``.
    """
    rule = "=" * 50
    # Leading empty item produces the blank line before the banner
    print("", rule, "EVALUATION RESULTS", rule, sep="\n")
    print(f"Pass@{stats['pass_at_k']} Results:")
    print(f"Number of runs: {stats['num_runs']}")
    formatted_scores = [f"{value:.2f}%" for value in stats["individual_scores"]]
    print(f"Individual scores: {formatted_scores}")
    print()
    summary_rows = (
        ("Standard deviation", "std_dev"),
        ("Min score", "min_score"),
        ("Max score", "max_score"),
        ("Average score", "average_score"),
    )
    for label, key in summary_rows:
        print(f"{label}: {stats[key]:.2f}%")
    print(rule)
def main():
    """
    Command-line entry point: average the accuracy of several benchmark runs.

    Expects the results directory as the first command-line argument. On
    success the report is printed and also written to
    ``average_scores_pass_at_<k>.txt`` inside that directory.

    Raises:
        SystemExit: With status 1 when the argument is missing, the directory
            does not exist, or no statistics could be computed.
    """
    if len(sys.argv) < 2:
        print("Usage: python calculate_average_score.py <results_dir>")
        print("Example: python calculate_average_score.py logs/gaia-validation/mytest")
        sys.exit(1)

    results_dir = sys.argv[1]
    if not os.path.exists(results_dir):
        print(f"Results directory does not exist: {results_dir}")
        sys.exit(1)

    print(f"Analyzing results from: {results_dir}")
    stats = calculate_average_scores(results_dir)
    if not stats:
        print("Failed to calculate statistics")
        sys.exit(1)

    print_results(stats)

    # Save simple statistics results
    output_file = os.path.join(
        results_dir, f"average_scores_pass_at_{stats['pass_at_k']}.txt"
    )
    rule = "=" * 50
    report_lines = [
        "EVALUATION RESULTS",
        rule,
        f"Pass@{stats['pass_at_k']} Results:",
        f"Number of runs: {stats['num_runs']}",
        f"Individual scores: {[f'{s:.2f}%' for s in stats['individual_scores']]}",
        f"Standard deviation: {stats['std_dev']:.2f}%",
        f"Min score: {stats['min_score']:.2f}%",
        f"Max score: {stats['max_score']:.2f}%",
        f"Average score: {stats['average_score']:.2f}%",
        rule,
    ]
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in report_lines)
    print(f"\nResults saved to: {output_file}")


if __name__ == "__main__":
    main()
================================================
FILE: apps/miroflow-agent/benchmarks/evaluators/eval_utils.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import asyncio
import json
import os
import re
import string
import warnings
from typing import Any, Dict, Literal, Optional
from dotenv import load_dotenv
from openai import AsyncOpenAI, OpenAI
from pydantic import BaseModel
# Runs at import time, before the environment variables below are read
# (presumably so that values from a local .env file are visible - confirm
# against the python-dotenv documentation).
load_dotenv()
# Judge credentials and endpoint; each is None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL")
# Async client awaited by the LLM-based `verify_answer_*` coroutines below.
evaluation_llm_client = AsyncOpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
# Synchronous client built from the same settings.
model_as_a_judge_client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
# ================================================
# verify_answer_simpleqa
# ================================================
EVALUATION_PROMPT_SIMPLEQA = """
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
First, I will give examples of each grade, and then you will grade a new example.
The following are examples of CORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia Obama and Sasha Obama
Predicted answer 1: sasha and malia obama
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
```
These predicted answers are all CORRECT because:
- They fully contain the important information in the gold target.
- They do not contain any information that contradicts the gold target.
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
The following are examples of INCORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: Malia.
Predicted answer 2: Malia, Sasha, and Susan.
Predicted answer 3: Barack Obama does not have any children.
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
```
These predicted answers are all INCORRECT because:
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
The following are examples of NOT_ATTEMPTED predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: I don't know.
Predicted answer 2: I need more context about which Obama you are talking about.
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
```
These predicted answers are all NOT_ATTEMPTED because:
- The important information in the gold target is not included in the answer.
- No statements in the answer contradict the gold target.
Also note the following things:
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
- Predicted answers "120k", "124k", and 115k" are all CORRECT.
- Predicted answers "100k" and "113k" are INCORRECT.
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
- Do not punish for typos in people's name if it's clearly the same name.
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
```
Question: {}
Gold target: {}
Predicted answer: {}
```
Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
C: NOT_ATTEMPTED
Just return the letters "A", "B", or "C", with no text around it.
""".strip()
async def verify_answer_simpleqa(
    question: str, target: str, predicted_answer: str
) -> str:
    """
    Grade an answer with the SimpleQA judge prompt.

    The judge is asked for a single letter: A maps to "CORRECT", B to
    "INCORRECT" and C to "NOT_ATTEMPTED".

    Args:
        question: The question being answered
        target: The gold target answer
        predicted_answer: The model's predicted answer

    Returns:
        The mapped grade, or "NOT_ATTEMPTED" when the judge call fails or its
        reply contains none of the three letters.
    """
    grade_by_letter = {"A": "CORRECT", "B": "INCORRECT", "C": "NOT_ATTEMPTED"}
    judge_prompt = EVALUATION_PROMPT_SIMPLEQA.format(
        question, target, predicted_answer
    )
    messages = [{"role": "user", "content": judge_prompt}]

    try:
        # Two completion tokens are enough for the single-letter verdict
        llm_response = await evaluation_llm_client.chat.completions.create(
            model="gpt-4.1-2025-04-14", messages=messages, max_completion_tokens=2
        )
        reply = llm_response.choices[0].message.content
        letter = re.search(r"(A|B|C)", reply)
        if letter:
            return grade_by_letter[letter.group(0)]
    except Exception as e:
        print(f"LLM evaluation failed: {e}")

    # Reached after a failure or when no verdict letter was found
    return "NOT_ATTEMPTED"
# ================================================
# verify_answer_hle
# ================================================
# Judge prompt used by `verify_answer_hle`; filled via str.format with the
# `question`, `response` and `correct_answer` fields.
# The backslashes in `|\\%|` are escaped so the literal keeps the exact text
# `|\%|` without relying on Python tolerating the invalid `\%` escape sequence.
HLE_JUDGE_PROMPT = """Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.
[question]: {question}
[response]: {response}
Your judgement must be in the format and criteria specified below:
extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.
[correct_answer]: {correct_answer}
reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.
correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.
confidence: The extracted confidence score between 0|\\%| and 100|\\%| from [response]. Put 100 if there is no confidence score available."""
# Structured reply schema that `verify_answer_hle` passes as `response_format`
# when calling the judge. The field names mirror the items requested in
# HLE_JUDGE_PROMPT.
class HLEExtractedAnswer(BaseModel):
    # Final answer the judge extracted from the response; the prompt asks for
    # 'None' when there is no exact final answer.
    extracted_final_answer: str
    # Judge's explanation of why the extracted answer does or does not match.
    reasoning: str
    # Verdict; `verify_answer_hle` maps "yes" to "CORRECT" and "no" to "INCORRECT".
    correct: Literal["yes", "no"]
    # Confidence score taken from the response; the prompt asks for a value
    # between 0 and 100, and for 100 when none is available.
    confidence: int
    # Field fixed to True.
    # NOTE(review): presumably a marker for strict schema adherence - confirm.
    strict: Literal[True] = True  # 100% reliability
async def verify_answer_hle(question: str, target: str, predicted_answer: str) -> str:
    """
    Use HLE-style LLM judge to verify if the predicted answer is correct.

    Args:
        question: The question being answered
        target: The correct/target answer
        predicted_answer: The model's predicted answer

    Returns:
        "CORRECT" when the judge answers "yes", "INCORRECT" when it answers
        "no", and "NOT_ATTEMPTED" when the judge call or the handling of its
        reply fails.

    Raises:
        SystemExit: With status 1 when the failure message reports an invalid
            API key.
    """
    prompt = HLE_JUDGE_PROMPT.format(
        question=question, correct_answer=target, response=predicted_answer
    )

    try:
        response = await evaluation_llm_client.beta.chat.completions.parse(
            model="o3-mini-2025-01-31",
            max_completion_tokens=4096,
            messages=[{"role": "user", "content": prompt}],
            response_format=HLEExtractedAnswer,
        )
        content = response.choices[0].message.parsed

        # Print HLE reasoning
        print(f"LLM as Judge Reasoning: {content.reasoning}")
        print(f"LLM as Judge Result: {content.correct}")
        print(f"LLM as Judge Confidence: {content.confidence}%")

        # Convert HLE format to eval_utils format
        if content.correct == "yes":
            return "CORRECT"
        return "INCORRECT"
    except Exception as e:
        print(f"LLM evaluation failed: {e}")
        if "Incorrect API key provided" in str(e):
            # Stop the process on a bad key. Raising SystemExit directly avoids
            # the interactive `exit()` helper and reports a non-zero status
            # instead of the success status that a bare `exit()` produces.
            raise SystemExit(1) from e
        return "NOT_ATTEMPTED"
# ================================================
# verify_answer_gaia
# ================================================
async def verify_answer_gaia(question: str, target: str, predicted_answer: str) -> str:
"""
Use GAIA-style judge to verify if the predicted answer is correct.
"""
def normalize_number_str(number_str: str) -> float | None:
# we replace these common units and commas to allow
# conversion to float
for char in ["$", "%", ","]:
number_str = number_str.replace(char, "")
try:
return float(number_str)
except ValueError:
print(f"String {number_str} cannot be normalized to number str.")
return None # Return None instead of inf to handle gracefully
def split_string(
s: str,
char_list: list[str] = [",", ";"],
) -> list[str]:
pattern = f"[{''.join(char_list)}]"
return re.split(pattern, s)
def normalize_str(input_str, remove_punct=True) -> str:
"""
Normalize a string by:
- Removing all white spaces
- Optionally removing punctuation (if remove_punct is True)
- Converting to lowercase
Parameters:
- input_str: str, the string to normalize
- remove_punct: bool, whether to remove punctuation (default: True)
Returns:
- str, the normalized string
"""
# Remove all white spaces. Required e.g for seagull vs. sea gull
no_spaces = re.sub(r"\s", "", input_str)
# Remove punctuation, if specified.
if remove_punct:
translator = str.maketrans("", "", string.punctuation)
return no_spaces.lower().translate(translator)
else:
return no_spaces.lower()
def question_scorer(
model_answer: str,
ground_truth: str,
) -> bool:
def is_float(element: any) -> bool:
try:
float(element)
return True
except ValueError:
return False
if model_answer is None:
model_answer = "None"
# if gt is a number
if is_float(ground_truth):
print(f"Evaluating {model_answer} as a number.")
normalized_answer = normalize_number_str(model_answer)
# If normalization failed, the answer is incorrect
if normalized_answer is None:
return False
return normalized_answer == float(ground_truth)
# if gt is a list
elif any(char in ground_truth for char in [",", ";"]):
print(f"Evaluating {model_answer} as a comma separated list.")
# question with the fish: normalization removes punct
gt_elems = split_string(ground_truth)
ma_elems = split_string(model_answer)
# check length is the same
if len(gt_elems) != len(ma_elems):
warnings.warn(
"Answer lists have different lengths, returning False.", UserWarning
)
return False
# compare each element as float or str
comparisons = []
for ma_elem, gt_elem in zip(ma_elems, gt_elems):
if is_float(gt_elem):
normalized_ma_elem = normalize_number_str(ma_elem)
# If normalization failed, this element is incorrect
if normalized_ma_elem is None:
comparisons.append(False)
else:
comparisons.append(normalized_ma_elem == float(gt_elem))
else:
# we do not remove punct since comparisons can include punct
comparisons.append(
normalize_str(ma_elem, remove_punct=False)
== normalize_str(gt_elem, remove_punct=False)
)
return all(comparisons)
# if gt is a str
else:
print(f"Evaluating {model_answer} as a string.")
return normalize_str(model_answer) == normalize_str(ground_truth)
# Use the question_scorer to evaluate the answer
try:
is_correct = question_scorer(predicted_answer, target)
return "CORRECT" if is_correct else "INCORRECT"
except Exception as e:
print(f"GAIA evaluation failed: {e}")
raise e
# use raise error instead, later we could judge it as NOT_ATTEMPTED.
# return "NOT_ATTEMPTED"
# ================================================
# verify_answer_gaia_validation_text_103
# Prompt from WebAgent
# https://github.com/Alibaba-NLP/WebAgent/blob/f25dae54daf0ce2874ffd5ed5ffb20feca7c4c4e/WebSailor/src/prompt.py#L98
# ================================================
# Judge prompt used by `verify_answer_gaia_validation_text_103`; filled via
# str.format with the `question`, `correct_answer` and `response` fields.
# The judge is asked to reply with exactly "Correct" or "Incorrect".
GAIA_VALIDATION_TEXT_103_SCORER_PROMPT = """You are an evaluation assistant. Please determine if the predicted answer is equivalent to the labeled answer.
Question: {question}
Labeled Answer: {correct_answer}
Predicted Answer: {response}
Did the model give an answer **equivalent** to the labeled answer? Please respond with "Correct" if they are equivalent, or "Incorrect" if they are not equivalent. Do not include any other text.
"""
async def verify_answer_gaia_validation_text_103(
    question: str, target: str, predicted_answer: str
) -> str:
    """
    Grade an answer with the GAIA-validation-text-103 LLM judge.

    The judge call is attempted up to 10 times; the first call that returns
    a response is used.

    Args:
        question: The question being answered
        target: The labeled answer
        predicted_answer: The model's predicted answer

    Returns:
        "CORRECT" or "INCORRECT" according to the judge's reply, or
        "NOT_ATTEMPTED" when the reply is missing or is neither "Correct"
        nor "Incorrect" (case-insensitive, surrounding whitespace and
        trailing periods ignored).

    Raises:
        Exception: The error of the final attempt when all attempts fail.
    """
    prompt = GAIA_VALIDATION_TEXT_103_SCORER_PROMPT.format(
        question=question, correct_answer=target, response=predicted_answer
    )

    max_tries = 10
    content = None
    for attempt in range(max_tries):
        try:
            response = await evaluation_llm_client.chat.completions.create(
                model="gpt-4.1-2025-04-14",
                messages=[{"role": "user", "content": prompt}],
            )
            content = response.choices[0].message.content
            print("LLM Judge Response: ", content)
            if response:
                break
        except Exception:
            # Earlier failures are retried; only the last one is propagated
            if attempt == (max_tries - 1):
                raise

    # The message content can be None; treat that like any other reply that
    # cannot be parsed instead of failing on None.strip().
    # Use case-insensitive matching and strip whitespace/punctuation
    content_normalized = (content or "").strip().rstrip(".").lower()

    if content_normalized == "correct":
        return "CORRECT"
    if content_normalized == "incorrect":
        return "INCORRECT"

    # If we can't parse the response, default to NOT_ATTEMPTED to trigger retry
    print(f"Warning: Could not parse judge response: {content}")
    return "NOT_ATTEMPTED"
# ================================================
# verify_answer_browsecomp
# Prompt from Tongyi DeepResearch
# https://github.com/Alibaba-NLP/DeepResearch/blob/main/WebAgent/WebWatcher/infer/evaluation/prompt.py#L110
# ================================================
JUDGE_PROMPT_BC_zh = """
请根据给定问题、标准答案和模型预测的答案来评估模型的回答是否正确。您的任务是将结果评定为:【正确】、【错误】。
首先,我们将列出每个评定类别的示例,然后请您对新问题的预测答案进行评定。
以下是【正确】的答复示例:
```
问题:贝拉克·奥巴马的孩子叫什么名字?
标准答案:玛丽亚·奥巴马和萨莎·奥巴马
模型预测1:Malia Obama and Sasha Obama
模型预测2:玛丽亚和萨沙
模型预测3:大多数人会说是玛丽亚和萨莎,但我不确定,需要再确认
模型预测4:巴拉克·奥巴马有两个女儿,她们分别是玛丽亚·安和娜塔莎·玛丽安,但通常称作玛丽亚·奥巴马和萨莎·奥巴马。
```
这些答复均为【正确】,因为:
- 完整地包含了标准答案中的重要信息。
- 不包含任何与标准答案矛盾的信息。
- 只关注语义内容,中英文,大小写、标点、语法和顺序不重要。
- 答复中出现模糊语句或猜测是可以接受的,前提是包含了标准答案且不含有不正确信息或矛盾。
以下是【错误】的答复示例:
```
问题:巴拉克·奥巴马的孩子叫什么名字?
标准答案:玛丽亚·奥巴马和萨莎·奥巴马
模型预测1:玛丽亚
模型预测2:玛丽亚、萨莎和苏珊和萨莎·奥巴马或玛丽亚·奥巴马,或娜塔莎·玛丽安,或爱因斯坦
模型预测3:虽然我不知道他们的确切名字,但能说出巴拉克·奥巴马有两个孩子。
模型预测4:你可能是想说贝茜和奥利维亚。不过您应通过最新的参考资料确认详细信息。那是正确的答案吗?
模型预测5:巴拉克·奥巴马的孩子
```
这些答复均为【错误】,因为:
- 答复中包含与标准答案矛盾的事实陈述。
- 答案为空、重复表述问题。
- 答案枚举了多个答案,重复表述答案。
需要格外注意的是:
- 标准答案中包含对于问题中多个方面的回答,并且在同一个方面的答案中可能会有多种不同的描述,这些描述均是正确的,并且在同一个括号中给出,通过逗号连接。例如,考虑问题"抖音自己的人工智能大模型叫什么名字?",标准答案为"【【豆包,云雀】】":
- 预测答案"豆包"、"豆包、云雀"、"云雀"等均为【正确】。
- 对于标准答案中包含的不同方面的回答,模型需要同时给出所有方面的回答才可以算是正确,否则直接判断为【错误】,不存在【部分正确】这种输出方式,这些答案会在不同的括号中给出。例如,考虑问题"TFBOYS组合中的成员有哪些?",标准答案为"【【王俊凯】【王源】【易洋千玺】】":
- 预测答案"王俊凯、王源、易洋千玺"等同时包含所有答案,才可以算为【正确】。
- 预测答案为"王俊凯、易洋千玺"等没有同时包含所有答案,会被算为【错误】。
另外注意以下几点:
- 对于标准答案为数字的问题,预测答案应和标准答案一致。例如,考虑问题"金山铁路黄浦江特大桥的全长是多少米?",标准答案为"3518.17":
- 预测答案"3518"、"3518.1"、"3518.17"均为【正确】。
- 预测答案"3520"和"3600"均为【错误】。
- 如果模型预测并没有直接回答问题,模型试图绕过或未能直接给出标准答案视为【错误】答案。
- 例如:问题"林宥嘉的老婆是谁",标准答案为"丁文琪"。模型预测"林宥嘉的老婆"、"林宥嘉的老婆应该很优秀"、"林宥嘉的老婆可能是某个公众人物"均为【错误】。
- 如果标准答案包含比问题更多的信息,预测答案只需包含问题中提到的信息。
- 例如,考虑问题"菱镁矿的主要化学成分是什么?"标准答案为"碳酸镁(MgCO3)"。"碳酸镁"或"MgCO3"均视为【正确】答案。
- 如果从问题中明显可以推断出预测答案省略的信息,那么算作正确。
- 例如,问题"巴鲁米尼的努拉吉遗迹在1997年被联合国教科文组织列为世界文化遗产,那么这遗址在哪个地区?"标准答案为"意大利撒丁岛",预测答案"撒丁岛"被视为【正确】。
- 如果能明显看出名字翻译版本不同但是是同一个人也认为正确。
- 例如,如果标准答案是"Robinson",那么回答鲁滨逊或者鲁滨孙均正确。
- 你应该更关注标准答案和模型预测的匹配度,而不是关心标准答案是否是正确的。
下面是一个新的问题示例。请只回复【正确】、【错误】之一,不要道歉或纠正自己的错误,只需要评估该回答。
```
问题: {question}
标准答案: {correct_answer}
预测答案: {response}
```
将此新问题的预测答案评定为以下之一:
A.【正确】
B.【错误】
只返回【正确】、【错误】所代表的选项即可,即仅返回A或B即可,无须添加任何其他的文本。
""".strip()
JUDGE_PROMPT_BC_en = """
Based on the given question, standard answer, and model-predicted answer, evaluate whether the model's response is correct. Your task is to classify the result as: [CORRECT] or [INCORRECT].
First, we'll list examples for each category, then you'll evaluate a new question's predicted answer.
Here are examples of [CORRECT] responses:
```
Question: What are the names of Barack Obama's children?
Standard Answer: Malia Obama and Sasha Obama
Model Prediction 1: Malia Obama and Sasha Obama
Model Prediction 2: Malia and Sasha
Model Prediction 3: Most would say Malia and Sasha, but I'm not sure, I should verify
Model Prediction 4: Barack Obama has two daughters, Malia Ann and Natasha Marian, commonly known as Malia Obama and Sasha Obama.
```
These responses are all [CORRECT] because they:
- Fully include the important information from the standard answer.
- Don't contain any information that contradicts the standard answer.
- Focus only on semantic content; language, capitalization, punctuation, grammar, and order aren't important.
- Vague statements or guesses are acceptable as long as they include the standard answer and don't contain incorrect information or contradictions.
Here are examples of [INCORRECT] responses:
```
Question: What are the names of Barack Obama's children?
Standard Answer: Malia Obama and Sasha Obama
Model Prediction 1: Malia
Model Prediction 2: Malia, Sasha and Susan or Sasha Obama or Malia Obama, or Natasha Marian, or Einstein
Model Prediction 3: While I don't know their exact names, I can tell you Barack Obama has two children.
Model Prediction 4: You might be thinking of Betsy and Olivia. But you should verify the details with the latest references. Is that the correct answer?
Model Prediction 5: Barack Obama's children
```
These responses are all [INCORRECT] because they:
- Contain factual statements that contradict the standard answer.
- Are empty or merely repeat the question.
- Enumerate multiple answers or repeat the answer.
Pay special attention to the following:
- The standard answer may contain responses to multiple aspects of the question, and within the same aspect, there might be different descriptions, all of which are correct and are given in the same bracket, connected by commas. For example, for the question "What is the name of ByteDance's AI model?", the standard answer is "[[Doubao, Skylark]]":
- Predicted answers "Doubao", "Doubao, Skylark", "Skylark", etc. are all [CORRECT].
- For standard answers containing responses to different aspects, the model needs to provide answers to all aspects to be considered correct; otherwise, it's directly judged as [INCORRECT]. There is no [PARTIALLY CORRECT] output option. These answers will be given in different brackets. For example, for the question "Who are the members of TFBOYS?", the standard answer is "[[Wang Junkai][Wang Yuan][Yi Yangqianxi]]":
- Predicted answers like "Wang Junkai, Wang Yuan, Yi Yangqianxi" that include all answers are [CORRECT].
- Predicted answers like "Wang Junkai, Yi Yangqianxi" that don't include all answers are [INCORRECT].
Also note the following points:
- For questions with numerical standard answers, the predicted answer should match the standard answer. For example, for the question "What is the total length in meters of the Huangpu River Bridge on the Jinshan Railway?", the standard answer is "3518.17":
- Predicted answers "3518", "3518.1", "3518.17" are all [CORRECT].
- Predicted answers "3520" and "3600" are [INCORRECT].
- If the model prediction doesn't directly answer the question, attempts to circumvent or fails to directly provide the standard answer, it's considered an [INCORRECT] answer.
- For example, for the question "Who is JJ Lin's wife?", with the standard answer "Ding Wenqi", model predictions like "JJ Lin's wife", "JJ Lin's wife should be excellent", "JJ Lin's wife might be a public figure" are all [INCORRECT].
- If the standard answer contains more information than the question asks for, the predicted answer only needs to include the information mentioned in the question.
- For example, for the question "What is the main chemical component of magnesite?", with the standard answer "Magnesium carbonate (MgCO3)", "Magnesium carbonate" or "MgCO3" are both considered [CORRECT] answers.
- If information omitted in the predicted answer can be clearly inferred from the question, it's considered correct.
- For example, for the question "The Nuragic ruins of Barumini were listed as a World Cultural Heritage by UNESCO in 1997, so where is this site located?", with the standard answer "Sardinia, Italy", the predicted answer "Sardinia" is considered [CORRECT].
- If it's clear that different translations of a name refer to the same person, it's considered correct.
- For example, if the standard answer is "Robinson", answers like "Lubinson" or "Lubinsun" are both correct.
- You should focus more on the match between the standard answer and the model prediction, rather than whether the standard answer itself is correct.
Below is a new question example. Please reply with only [CORRECT] or [INCORRECT], without apologies or corrections to your own errors, just evaluate the answer.
```
Question: {question}
Standard Answer: {correct_answer}
Predicted Answer: {response}
```
Evaluate this new question's predicted answer as one of the following:
A. [CORRECT]
B. [INCORRECT]
Return only the option representing [CORRECT] or [INCORRECT], i.e., just return A or B, without adding any other text.
""".strip()
async def verify_answer_browsecomp(
    question: str, target: str, predicted_answer: str
) -> str:
    """
    Use BrowseComp judge (English version) to verify if the predicted answer is correct.

    The judge prompt asks the LLM to reply with a single option letter:
    A (correct) or B (incorrect).

    Args:
        question: The question being answered
        target: The correct/target answer
        predicted_answer: The model's predicted answer

    Returns:
        "CORRECT" for option A, "INCORRECT" for option B, or "NOT_ATTEMPTED"
        when the reply is empty or contains no standalone option letter.

    Raises:
        Exception: Any error raised while querying the judge is logged and
            re-raised unchanged.
    """
    prompt = JUDGE_PROMPT_BC_en.format(
        question=question, correct_answer=target, response=predicted_answer
    )

    try:
        response = await evaluation_llm_client.chat.completions.create(
            model="gpt-4.1-2025-04-14",
            messages=[{"role": "user", "content": prompt}],
            max_completion_tokens=2,
        )

        content = response.choices[0].message.content
        print(f"BrowseComp Judge Response: {content}")

        # An empty/None reply cannot be parsed; treat it like any other
        # unparseable reply instead of failing inside re.search.
        if content:
            # Only accept A/B as a standalone letter. A bare "[AB]" search
            # would also hit the first letter of words such as "Answer".
            match = re.search(r"(?<![A-Za-z])[AB](?![A-Za-z])", content)
            if match:
                return "CORRECT" if match.group(0) == "A" else "INCORRECT"

        # If no clear A or B is found, return NOT_ATTEMPTED to trigger retry
        print(f"Warning: Could not parse BrowseComp judge response: {content}")
        return "NOT_ATTEMPTED"

    except Exception as e:
        print(f"BrowseComp evaluation failed: {e}")
        raise
async def verify_answer_browsecomp_zh(
    question: str, target: str, predicted_answer: str
) -> str:
    """
    Use BrowseComp judge (Chinese version) to verify if the predicted answer is correct.

    The judge prompt asks the LLM to reply with a single option letter:
    A (correct) or B (incorrect).

    Args:
        question: The question being answered
        target: The correct/target answer
        predicted_answer: The model's predicted answer

    Returns:
        "CORRECT" for option A, "INCORRECT" for option B, or "NOT_ATTEMPTED"
        when the reply is empty or contains no standalone option letter.

    Raises:
        Exception: Any error raised while querying the judge is logged and
            re-raised unchanged.
    """
    prompt = JUDGE_PROMPT_BC_zh.format(
        question=question, correct_answer=target, response=predicted_answer
    )

    try:
        response = await evaluation_llm_client.chat.completions.create(
            model="gpt-4.1-2025-04-14",
            messages=[{"role": "user", "content": prompt}],
            max_completion_tokens=2,
        )

        content = response.choices[0].message.content
        print(f"BrowseComp-ZH Judge Response: {content}")

        # An empty/None reply cannot be parsed; treat it like any other
        # unparseable reply instead of failing inside re.search.
        if content:
            # Only accept A/B when not glued to other ASCII letters. The
            # lookarounds are ASCII-only on purpose so that a letter directly
            # following CJK text (e.g. "答案A") is still recognised.
            match = re.search(r"(?<![A-Za-z])[AB](?![A-Za-z])", content)
            if match:
                return "CORRECT" if match.group(0) == "A" else "INCORRECT"

        # If no clear A or B is found, return NOT_ATTEMPTED to trigger retry
        print(f"Warning: Could not parse BrowseComp-ZH judge response: {content}")
        return "NOT_ATTEMPTED"

    except Exception as e:
        print(f"BrowseComp-ZH evaluation failed: {e}")
        raise
# ================================================
# verify_answer_xbench_deepsearch
# Prompt from XBench-Evals
# https://github.com/xbench-ai/xbench-evals/blob/main/eval_grader.py#L25
# ================================================
# Judge prompt template (written in Chinese) for XBench-DeepSearch grading.
# Format placeholders: {question}, {correct_answer}, {response}.
# The judge is asked to answer with three labelled fields: "最终答案" (the final
# answer extracted from the response, or '无' if there is none), "解释" (an
# explanation) and "结论" (the conclusion: '正确' = correct, '错误' = incorrect).
JUDGE_PROMPT_XBENCH = """
你是一个通用人工智能助手。根据下面给出的[正确答案], 判断以下对[原问题]的[回答]的回答是否正确。
[原问题]: {question}
[正确答案]: {correct_answer}
[回答]:{response}
你的判断必须按照以下格式和标准进行:
最终答案: 从[回答]中提取出的最终准确答案。如果[回答]中没有明确的最终答案, 则填写'无'。
解释: 根据[正确答案]解释为什么[最终答案]是正确的或错误的。只关注[最终答案]与[正确答案]之间是否存在实质性差异, 不要评论题目的背景, 不要尝试重新解题, 不要为任何不同于[正确答案]的答案辩护, 只专注于判断答案是否一致。
结论: 如果[最终答案]与上方给出的[正确答案]一致, 或者在数值题目中处于可接受的微小误差范围内, 则填写'正确'; 否则(即存在任何不一致、歧义、不等价或提取出的答案错误的情况)填写'错误'。
""".strip()
async def verify_answer_xbench_deepsearch(
    question: str, target: str, predicted_answer: str
) -> str:
    """
    Use XBench-DeepSearch judge to verify if the predicted answer is correct.

    Args:
        question: The question being answered
        target: The correct/target answer
        predicted_answer: The model's predicted answer

    Returns:
        "INCORRECT" if predicted_answer is None (the judge is not called);
        "CORRECT" / "INCORRECT" when the judge's "结论" field reads
        '正确' / '错误'; "NOT_ATTEMPTED" when the judge call fails, returns no
        content, or the conclusion cannot be parsed.
    """
    if predicted_answer is None:
        return "INCORRECT"

    judge_prompt = JUDGE_PROMPT_XBENCH.format(
        question=question,
        correct_answer=target,
        response=predicted_answer,
    )

    try:
        response = await evaluation_llm_client.chat.completions.create(
            model="gpt-4.1-2025-04-14",
            messages=[{"role": "user", "content": judge_prompt}],
        )
        judge_response = response.choices[0].message.content
    except Exception as e:
        # Failures are reported as NOT_ATTEMPTED rather than raised, but log
        # the cause so repeated failures can be diagnosed.
        print(f"XBench judge failed: {e}")
        judge_response = None

    if judge_response is None:
        return "NOT_ATTEMPTED"

    def extract_field(label: str, value_pattern: str) -> Optional[str]:
        # Read the value through a capture group instead of splitting the
        # match on ":"; splitting truncated values that contain a colon
        # (e.g. "10:30") and failed when the judge used a full-width colon
        # or no colon at all.
        found = re.search(rf"{label}[::]*\s*({value_pattern})", judge_response)
        return found.group(1).strip() if found else None

    # Extract grader conclusions
    extract_match = extract_field("最终答案", r".*")
    correct_match = extract_field("结论", r"正确|错误")

    # Print debug info
    print(f"XBench Judge - Extract: {extract_match}, Correct: {correct_match}")

    if correct_match == "正确":
        return "CORRECT"
    elif correct_match == "错误":
        return "INCORRECT"
    else:
        # If we can't parse the result, return NOT_ATTEMPTED to trigger retry
        print(
            f"Warning: Could not parse XBench judge response, correct_match={correct_match}"
        )
        return "NOT_ATTEMPTED"
# ================================================
# verify_answer_deepsearchqa
#
# Official prompt from DeepSearchQA benchmark
# https://www.kaggle.com/code/andrewmingwang/deepsearchqa-starter-code
# ================================================
# Judge prompt template for DeepSearchQA grading.
# Format placeholders: {prompt}, {prompt_type}, {answer}, {response}.
# Literal braces of the embedded JSON example are doubled ("{{" / "}}") so that
# str.format leaves them intact.
# The prompt text refers to the user prompt, the correct answer and the AI
# response as being wrapped in tags, so each of them is enclosed in the
# matching <prompt>/<answer>/<response> tag pair below.
JUDGE_PROMPT_DEEPSEARCHQA = """Your task is to evaluate whether a given "AI Response" for a specific "User Prompt" arrived at the correct answer.
**Answer Correctness Task**
* **Purpose:** Assess whether the AI response provides the correct answer(s) based on the provided "Correct Answer" and "Prompt Type".
* **Process:**
* Identify the "Prompt Type": "<prompt_type>".
* Refer to the "Correct Answer": "<answer>".
* Based on the "Prompt Type", determine if the "AI Response" contains the expected answer(s).
* **'Single Answer'**: Check if the response provides the answer that addresses the user's question. It does not have to match the exact wording of the provided answer.
* **'Set Answer'**: Check if the response includes *each* item from the provided ground truth answers. The order might not matter unless specified otherwise. The response might include more answers than the list. Determine the correctness *only* based on the list first and then check if the response includes answers not in the list.
* **Explanation:** Provide a brief explanation justifying your assessment of answer correctness, referencing specific parts of the AI response and the correct answer.
* **Correctness Details:** Provide a dictionary, one key for each expected answer part, and value is a boolean indicating whether each expected answer part was found.
* For 'Set Answer', this will be a list of attributes, one for each item/part in the "Correct Answer". Each key will be a string indicating the expected answer part, and the value will be a boolean indicating whether that part was found in the response.
* **Excessive Answers:** Provide a list of strings, each indicating an excessive answer part. If the response provides answers that are **not** in the "Correct Answer" list, add these answers as excessive answers. Return an empty list when there's no excessive answers in the response.
**Output Format:**
Your evaluation *must* be structured as a nested JSON dictionary with the following top-level keys: `"Answer Correctness"`. Please return NULL if any of "Prompt", "AI Response" or "Correct Answer" is empty.
The value for `"Answer Correctness"` should be a dictionary containing `"Explanation"` (a string), `"Correctness Details"` (a dictionary where each key is the expected correct answer, and the value is a boolean indicating whether the response contains the correct answer), and `"Excessive Answers"` (a list of strings indicating the excessive answers).
Make sure you return a valid JSON string. Pay special attention to quotes, commas and special characters in the JSON string. Make sure to escape all special characters and quotes in the JSON string.
**Example (Partial):**
"```json
{{
"Answer Correctness": {{
"Explanation": "The response correctly identified Belgium and France but also includes an excessive answer, Italy.",
"Correctness Details": {{
"Belgium": true,
"France": true,
}},
"Excessive Answers": [ "Italy" ]
}}
}}
```"
**Now, proceed with the evaluation using the provided User Prompt, AI Response, and Correct Answer.**
User Prompt (Wrapped in <prompt> and </prompt>):
<prompt>
{prompt}
</prompt>
--------------------
** Correct Answer (Wrapped in <answer> and </answer>):
Prompt Type: {prompt_type}
<answer>
{answer}
</answer>
--------------------
AI assistant response (Wrapped in <response> and </response>):
<response>
{response}
</response>
--------------------
Rating:"""
async def verify_answer_deepsearchqa(
    question: str,
    target: str,
    predicted_answer: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> tuple[str, str, Optional[Dict[str, Any]]]:
    """
    Use DeepSearchQA-specific judge to verify if the predicted answer is correct.

    Uses the official DeepSearchQA evaluation prompt with JSON output format.

    Args:
        question: The question being answered
        target: The correct/target answer
        predicted_answer: The model's predicted answer
        metadata: Optional metadata dict. Only the "answer_type" key is read:
            the value "Set Answer" selects the 'Set Answer' prompt type, any
            other value (or a missing key) falls back to 'Single Answer'.

    Returns:
        Tuple of (result, judge_type, details_dict):
            - result: "CORRECT", "INCORRECT", or "NOT_ATTEMPTED"
            - judge_type: "deepsearchqa_judge"
            - details_dict: None unless the judge produced correctness
              details; otherwise a dict with keys:
                - correctness_details: Dict[str, bool] mapping answer parts to correctness
                - excessive_answers: List[str] of extra answers not in ground truth
                - explanation: str explaining the judgment
                - num_correct: int number of correct answer parts
                - num_expected: int total number of expected answer parts
                - num_excessive: int number of excessive answers
    """
    judge_type = "deepsearchqa_judge"

    if predicted_answer is None:
        return "INCORRECT", judge_type, None

    # Determine prompt_type from metadata ("Single Answer" is the default)
    prompt_type = "Single Answer"
    if metadata and metadata.get("answer_type") == "Set Answer":
        prompt_type = "Set Answer"

    judge_prompt = JUDGE_PROMPT_DEEPSEARCHQA.format(
        prompt_type=prompt_type,
        prompt=question,
        answer=target,
        response=predicted_answer,
    )

    try:
        response = await evaluation_llm_client.chat.completions.create(
            model="gpt-4.1-2025-04-14",
            messages=[{"role": "user", "content": judge_prompt}],
        )
        judge_response = response.choices[0].message.content
    except Exception as e:
        print(f"DeepSearchQA judge failed: {e}")
        return "NOT_ATTEMPTED", judge_type, None

    if judge_response is None:
        return "NOT_ATTEMPTED", judge_type, None

    # Parse JSON response
    try:
        # Prefer a ```json fenced block; otherwise take the outermost {...}.
        fenced = re.search(r"```json\s*(\{.*?\})\s*```", judge_response, re.DOTALL)
        if fenced:
            json_str = fenced.group(1)
        else:
            bare = re.search(r"\{.*\}", judge_response, re.DOTALL)
            if not bare:
                print("Warning: Could not find JSON in DeepSearchQA judge response")
                return "NOT_ATTEMPTED", judge_type, None
            json_str = bare.group(0)

        result = json.loads(json_str)

        # ``dict.get(key, default)`` only covers a *missing* key. The judge may
        # also emit an explicit JSON null, so normalise None to the empty
        # container; otherwise e.g. len(None) would discard a usable verdict.
        answer_correctness = result.get("Answer Correctness") or {}
        explanation = answer_correctness.get("Explanation") or ""
        correctness_details = answer_correctness.get("Correctness Details") or {}
        excessive_answers = answer_correctness.get("Excessive Answers") or []

        # Calculate statistics
        num_expected = len(correctness_details)
        num_correct = sum(1 for found in correctness_details.values() if found)
        num_excessive = len(excessive_answers)

        details = {
            "correctness_details": correctness_details,
            "excessive_answers": excessive_answers,
            "explanation": explanation,
            "num_correct": num_correct,
            "num_expected": num_expected,
            "num_excessive": num_excessive,
        }

        # Print debug info
        print(
            f"DeepSearchQA Judge - Correct: {num_correct}/{num_expected}, Excessive: {num_excessive}"
        )
        print(f"DeepSearchQA Judge - Explanation: {explanation}")

        if not correctness_details:
            # No correctness details, can't determine
            return "NOT_ATTEMPTED", judge_type, None

        # Following official logic: all expected parts must be found, and no
        # excessive answers.
        if all(correctness_details.values()) and not excessive_answers:
            return "CORRECT", judge_type, details
        # Either missing some expected answers or has excessive answers
        return "INCORRECT", judge_type, details

    except json.JSONDecodeError as e:
        print(f"Warning: Failed to parse JSON from DeepSearchQA judge: {e}")
        print(f"Response: {judge_response[:200]}...")
        return "NOT_ATTEMPTED", judge_type, None
    except Exception as e:
        print(f"Warning: Error processing DeepSearchQA judge response: {e}")
        return "NOT_ATTEMPTED", judge_type, None
# ================================================
# verify_answer_for_datasets
# ================================================
async def _verify_answer_for_datasets_core(
    benchmark_name: str,
    question: str,
    target: str,
    predicted_answer: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> tuple[str, str, Optional[Dict[str, Any]]]:
    """
    Route one (question, target, prediction) triple to the judge for its benchmark.

    Args:
        benchmark_name: Name of the benchmark dataset
        question: The question being answered
        target: The correct/target answer
        predicted_answer: The model's predicted answer
        metadata: Optional metadata dict; only forwarded to the DeepSearchQA judge

    Returns:
        A tuple of (result, judge_type, details_dict).
        details_dict is whatever the DeepSearchQA judge returns for
        "deepsearchqa" and None for every other benchmark.
    """
    # DeepSearchQA needs the detailed per-item evaluation, so it never takes
    # the exact-match shortcut below.
    if benchmark_name == "deepsearchqa":
        verdict, judge_type, details = await verify_answer_deepsearchqa(
            question, target, predicted_answer, metadata
        )
        return verdict, judge_type, details

    if predicted_answer == target:
        return "CORRECT", "exact_match", None

    # browsecomp (English) and browsecomp-zh (Chinese) use different judges
    if benchmark_name == "browsecomp":
        verdict = await verify_answer_browsecomp(question, target, predicted_answer)
        return verdict, "browsecomp_judge", None

    if benchmark_name == "browsecomp_zh":
        verdict = await verify_answer_browsecomp_zh(question, target, predicted_answer)
        return verdict, "browsecomp_zh_judge", None

    # hle, hle-text-500, hle-text-2158, ... (any name containing "hle")
    if "hle" in benchmark_name:
        verdict = await verify_answer_hle(question, target, predicted_answer)
        return verdict, "hle_judge", None

    if benchmark_name in ("simpleqa", "collect_trace"):
        verdict = await verify_answer_simpleqa(question, target, predicted_answer)
        return verdict, "simpleqa_judge", None

    if benchmark_name == "xbench_deepsearch":
        verdict = await verify_answer_xbench_deepsearch(
            question, target, predicted_answer
        )
        return verdict, "xbench_deepsearch_judge", None

    # Everything else -- gaia-validation, gaia-validation-text-103,
    # webwalkerqa, frames, seal-0 and any unlisted benchmark -- is graded by
    # the GAIA text-103 LLM judge.
    # For gaia-validation this is deliberate: we found that gaia_scorer tends
    # to label many correct answers as incorrect, so we believe that using an
    # LLM-as-judge approach can more accurately reflect the model's performance.
    verdict = await verify_answer_gaia_validation_text_103(
        question, target, predicted_answer
    )
    return verdict, "gaia_validation_text_103_judge", None
async def verify_answer_for_datasets(
    benchmark_name: str,
    question: str,
    target: str,
    predicted_answer: str,
    metadata: Optional[Dict[str, Any]] = None,
    max_retries: int = 10,
    retry_interval: int = 5,
) -> tuple[str, str, Optional[Dict[str, Any]]]:
    """
    Grade an answer, re-running the judge while it reports NOT_ATTEMPTED.

    Args:
        benchmark_name: Name of the benchmark dataset
        question: The question being answered
        target: The correct/target answer
        predicted_answer: The model's predicted answer
        metadata: Optional metadata dict with additional context
        max_retries: Maximum number of grading attempts
        retry_interval: Seconds to wait between attempts

    Returns:
        A tuple of (result, judge_type, details_dict) from the first attempt
        whose result is not "NOT_ATTEMPTED". If every attempt is
        NOT_ATTEMPTED, returns ("NOT_ATTEMPTED", "retry_wrapper", None).
    """
    attempt = 0
    while attempt < max_retries:
        attempt += 1
        result, judge_type, details = await _verify_answer_for_datasets_core(
            benchmark_name, question, target, predicted_answer, metadata
        )

        if result != "NOT_ATTEMPTED":
            return result, judge_type, details

        # No point in sleeping after the final attempt.
        if attempt == max_retries:
            break

        print(
            f"[Retry {attempt}/{max_retries}] Got NOT_ATTEMPTED, retrying in {retry_interval}s..."
        )
        await asyncio.sleep(retry_interval)

    # still NOT_ATTEMPTED after retries
    print(f"All {max_retries} attempts resulted in NOT_ATTEMPTED.")
    return "NOT_ATTEMPTED", "retry_wrapper", None
================================================
FILE: apps/miroflow-agent/benchmarks/evaluators/extract_futurex_results.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import argparse
import json
import os
from collections import Counter, defaultdict
from typing import Dict, List, Tuple
def majority_vote(
    preds: List[str], first_seen_order: Dict[str, int]
) -> Tuple[str, Dict[str, int]]:
    """
    Pick the most frequent prediction, breaking ties deterministically.

    Tie-breaking among the candidates sharing the highest count:
      1) the candidate with the smallest index in ``first_seen_order`` wins
         (candidates missing from the mapping rank last);
      2) remaining ties are resolved by lexicographic order.

    Returns:
        (chosen_prediction, counts_dict) where counts_dict maps every distinct
        prediction to its number of votes.
    """
    tally = Counter(preds)
    top_votes = max(tally.values())
    leaders = [cand for cand, votes in tally.items() if votes == top_votes]

    if len(leaders) == 1:
        (winner,) = leaders
    else:
        winner = min(
            leaders,
            key=lambda cand: (first_seen_order.get(cand, float("inf")), cand),
        )

    # The raw counts are returned as well for optional debugging/inspection.
    return winner, dict(tally)
def discover_runs(results_dir: str) -> List[str]:
    """
    List the immediate subdirectories of results_dir that contain a
    'benchmark_results.jsonl' file.

    The subdirectory name is not required to start with 'run_'. Entries are
    visited in sorted name order so the returned list is deterministic.
    """
    candidates = (os.path.join(results_dir, entry) for entry in sorted(os.listdir(results_dir)))
    return [
        run_path
        for run_path in candidates
        if os.path.isdir(run_path)
        and os.path.isfile(os.path.join(run_path, "benchmark_results.jsonl"))
    ]
def parse_args() -> argparse.Namespace:
    """
    Parse the command-line arguments of the FutureX aggregation script.

    Returns:
        Namespace with:
            - results_dir: positional path to the directory holding the runs
            - output: value of -o/--output, or None when not given
    """
    parser = argparse.ArgumentParser(
        description="Aggregate multiple run_*/benchmark_results.jsonl files and produce a FutureX submission with majority voting."
    )
    parser.add_argument(
        "results_dir",
        help="Path to results dir containing run_*/benchmark_results.jsonl",
    )
    parser.add_argument(
        "-o",
        "--output",
        default=None,
        # The fallback location is inside the results directory, not the
        # filesystem root, so spell that out in the help text.
        help="Output JSONL file path (default: <results_dir>/futurex_submission.jsonl)",
    )
    return parser.parse_args()
def main() -> None:
    """
    Aggregate predictions from every run and write a majority-vote submission.

    Reads '<run>/benchmark_results.jsonl' for each run directory found under
    the results directory, collects the "model_boxed_answer" of every record
    per "task_id", and writes one JSON line {"id", "prediction"} per task to
    the output file.

    Raises:
        FileNotFoundError: If the results directory does not exist or holds
            no run directory with a 'benchmark_results.jsonl'.
    """
    args = parse_args()

    results_dir = os.path.abspath(args.results_dir)
    if not os.path.isdir(results_dir):
        raise FileNotFoundError(f"Results dir not found: {results_dir}")

    output_file = (
        os.path.abspath(args.output)
        if args.output
        else os.path.join(results_dir, "futurex_submission.jsonl")
    )

    # Maps task_id -> list of predictions collected across runs
    preds_by_task: Dict[str, List[str]] = defaultdict(list)

    # Track first-seen order index for each distinct prediction string across all runs.
    # This enables deterministic tie-breaking.
    first_seen_order: Dict[str, int] = {}
    next_order_idx = 0

    runs = discover_runs(results_dir)
    if not runs:
        raise FileNotFoundError(
            f"No run directories with 'benchmark_results.jsonl' found under: {results_dir}"
        )

    total_lines = 0
    used_lines = 0

    # Read and aggregate predictions
    for run_dir in runs:
        fpath = os.path.join(run_dir, "benchmark_results.jsonl")
        print(f"Reading: {fpath}")
        with open(fpath, "r", encoding="utf-8") as fin:
            for line in fin:
                total_lines += 1  # counts every physical line, blank ones included
                line = line.strip()
                if not line:
                    continue
                try:
                    rec = json.loads(line)
                except json.JSONDecodeError:
                    # Skip malformed JSON lines, but keep going
                    continue
                if not isinstance(rec, dict):
                    # Valid JSON that is not an object (list, number, ...)
                    # has no fields to read; skip it like a malformed line
                    # instead of crashing on rec.get().
                    continue

                task_id = rec.get("task_id")
                pred = rec.get("model_boxed_answer")

                # Only accept non-empty strings; coerce to str for safety
                if task_id and pred is not None and str(pred).strip():
                    pred_str = str(pred).strip()
                    preds_by_task[task_id].append(pred_str)
                    if pred_str not in first_seen_order:
                        first_seen_order[pred_str] = next_order_idx
                        next_order_idx += 1
                    used_lines += 1

    # Write submission JSONL
    # We sort task_ids to keep output reproducible.
    num_tasks = 0
    with open(output_file, "w", encoding="utf-8") as out:
        for task_id in sorted(preds_by_task):
            voted_pred, _counts = majority_vote(
                preds_by_task[task_id], first_seen_order
            )
            out.write(
                json.dumps(
                    {"id": task_id, "prediction": voted_pred}, ensure_ascii=False
                )
                + "\n"
            )
            num_tasks += 1

    # Optional: small summary to stdout
    print(f"Collected from {len(runs)} run(s).")
    print(f"Read {total_lines} line(s), accepted {used_lines} record(s).")
    print(f"Aggregated {num_tasks} unique task_id(s).")
    print(f"✅ Submission saved to {output_file}")
# Run the aggregation only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()
================================================
FILE: apps/miroflow-agent/benchmarks/subset_extraction/gaia-text-103-grader.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
GAIA-Text-103 Task Grader
This script:
1. Loads extracted GAIA-Text-103 tasks from the extraction directory
2. Grades each task using the GAIA-Text-103 evaluator (LLM judgement)
3. Updates the original task files with grading results
Usage:
uv run benchmarks/subset_extraction/gaia-text-103-grader.py /path/to/extraction/directory
"""
import argparse
import asyncio
import json
import os
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional
# Add the benchmarks directory to the path to import evaluators
sys.path.append(str(Path(__file__).parent.parent))
from evaluators.eval_utils import verify_answer_gaia_validation_text_103
@dataclass
class GradingResult:
    """Result of grading a single task"""

    # Identification of the graded task file.
    task_id: str
    run_name: str
    file_path: str  # path of the task JSON file the result is written back to

    # Inputs handed to the judge.
    question: str
    ground_truth: str
    predicted_answer: str

    # Verdict string returned by the evaluator; the grader stores "ERROR" here
    # when the evaluation raised.
    judge_result: str
    judge_type: str = "gaia_validation_text_103_scorer"

    grading_time: float = 0.0  # seconds spent grading this task
    error_message: str = ""  # str() of the exception when grading failed, else empty
class GAIAText103Grader:
    """Grader for GAIA-Text-103 tasks using LLM judgement"""

    # Value stored in a task file's "judge_type" by this grader; also used to
    # recognise files that a previous run already graded.
    _JUDGE_TYPE = "gaia_validation_text_103_scorer"
    # Value stored in "final_judge_result" when the evaluator raised.
    _ERROR_RESULT = "ERROR"

    def __init__(self, extraction_dir: str):
        """
        Initialize the grader

        Args:
            extraction_dir: Directory containing extracted GAIA-Text-103 tasks
        """
        self.extraction_dir = Path(extraction_dir)
        self.results: List["GradingResult"] = []
        self.stats = {
            "total_tasks": 0,
            "graded_tasks": 0,
            "errors": 0,
            "total_grading_time": 0.0,
        }

    def find_task_files(self) -> List[Path]:
        """Return, sorted, every 'task_*.json' file found recursively under the extraction directory"""
        task_files = [
            Path(root) / name
            for root, _dirs, names in os.walk(self.extraction_dir)
            for name in names
            if name.startswith("task_") and name.endswith(".json")
        ]
        return sorted(task_files)

    def extract_task_info(self, task_file: Path) -> Optional[Dict]:
        """
        Extract the fields needed for grading from a task file.

        Returns:
            A dict with task_id, run_name, file_path, question, ground_truth
            and predicted_answer, or None when the file cannot be read, was
            already graded by this grader, or lacks a question, ground truth
            or predicted answer.
        """
        try:
            with open(task_file, "r", encoding="utf-8") as f:
                task_data = json.load(f)

            # Skip tasks already graded with our specific scorer. A stored
            # "ERROR" verdict means the earlier attempt failed, so such files
            # stay eligible; otherwise a transient judge failure would mark
            # the task as graded forever.
            graded_by_us = task_data.get("judge_type") == self._JUDGE_TYPE
            if (
                graded_by_us
                and task_data.get("final_judge_result") != self._ERROR_RESULT
            ):
                print(f"Skipping already graded task: {task_file.name}")
                return None

            # Extract basic information
            task_info = {
                "task_id": task_data.get("task_id", ""),
                "run_name": task_data.get("run_name", ""),
                "file_path": str(task_file),
                "question": task_data.get("input", {}).get("task_description", ""),
                "ground_truth": task_data.get("ground_truth", ""),
                "predicted_answer": task_data.get("final_boxed_answer", ""),
            }

            # Validate required fields
            required = ("question", "ground_truth", "predicted_answer")
            if not all(task_info[key] for key in required):
                print(f"Warning: Missing required fields in {task_file}")
                print(f" question: {task_info['question']}")
                print(f" ground_truth: {task_info['ground_truth']}")
                print(f" predicted_answer: {task_info['predicted_answer']}")
                return None

            return task_info

        except Exception as e:
            print(f"Error reading task file {task_file}: {e}")
            return None

    async def grade_single_task(self, task_info: Dict) -> "GradingResult":
        """
        Grade a single task using GAIA-Text-103 evaluator.

        Never raises: if the evaluator fails, the returned result carries
        judge_result "ERROR" plus the error message, and the "errors" counter
        is incremented.
        """
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments.
        start_time = time.perf_counter()

        result = GradingResult(
            task_id=task_info["task_id"],
            run_name=task_info["run_name"],
            file_path=task_info["file_path"],
            question=task_info["question"],
            ground_truth=task_info["ground_truth"],
            predicted_answer=task_info["predicted_answer"],
            judge_result="",
            judge_type=self._JUDGE_TYPE,
        )

        try:
            # Use the GAIA-Text-103 evaluator
            judge_result = await verify_answer_gaia_validation_text_103(
                question=task_info["question"],
                target=task_info["ground_truth"],
                predicted_answer=task_info["predicted_answer"],
            )
            result.judge_result = judge_result
            result.grading_time = time.perf_counter() - start_time
            print(
                f"Task {task_info['task_id']} ({task_info['run_name']}): {judge_result}"
            )
        except Exception as e:
            result.error_message = str(e)
            result.judge_result = self._ERROR_RESULT
            result.grading_time = time.perf_counter() - start_time
            self.stats["errors"] += 1
            print(f"Error grading task {task_info['task_id']}: {e}")

        return result

    async def grade_all_tasks(self, max_concurrent: int = 5) -> List["GradingResult"]:
        """
        Grade all tasks with concurrent processing.

        Args:
            max_concurrent: Maximum number of tasks graded at the same time

        Returns:
            The grading results (also stored in self.results), including those
            whose judge_result is "ERROR"; an empty list when no valid task
            was found.
        """
        task_files = self.find_task_files()
        print(f"Found {len(task_files)} task files to grade")

        # Extract task information, dropping files that are skipped/invalid
        task_infos = []
        for task_file in task_files:
            task_info = self.extract_task_info(task_file)
            if task_info:
                task_infos.append(task_info)

        self.stats["total_tasks"] = len(task_infos)
        print(f"Extracted {len(task_infos)} valid tasks for grading")

        if not task_infos:
            print("No valid tasks found for grading")
            return []

        # Grade tasks with concurrency control
        semaphore = asyncio.Semaphore(max_concurrent)

        async def grade_with_semaphore(task_info):
            async with semaphore:
                return await self.grade_single_task(task_info)

        results = await asyncio.gather(
            *(grade_with_semaphore(task_info) for task_info in task_infos),
            return_exceptions=True,
        )

        # Filter out exceptions and collect valid results
        valid_results = []
        for i, result in enumerate(results):
            if isinstance(result, Exception):
                print(f"Exception in task {i}: {result}")
                self.stats["errors"] += 1
                continue
            valid_results.append(result)
            self.stats["total_grading_time"] += result.grading_time
            # Failed evaluations are already counted under "errors"; only
            # real verdicts count as successfully graded.
            if result.judge_result != self._ERROR_RESULT:
                self.stats["graded_tasks"] += 1

        self.results = valid_results
        return valid_results

    def update_original_files(self):
        """Write the grading outcome of every result back into its task file"""
        updated_count = 0

        for result in self.results:
            try:
                # Read original file
                with open(result.file_path, "r", encoding="utf-8") as f:
                    task_data = json.load(f)

                # Add grading information
                task_data["final_judge_result"] = result.judge_result
                task_data["judge_type"] = result.judge_type
                task_data["grading_time"] = result.grading_time
                if result.error_message:
                    task_data["grading_error"] = result.error_message
                else:
                    # Drop an error left over from an earlier failed attempt.
                    task_data.pop("grading_error", None)

                # Write back to file
                with open(result.file_path, "w", encoding="utf-8") as f:
                    json.dump(task_data, f, indent=2, ensure_ascii=False)

                updated_count += 1

            except Exception as e:
                print(f"Error updating file {result.file_path}: {e}")

        print(f"Updated {updated_count} original task files with grading results")

    def print_summary(self):
        """Print grading summary"""
        print("\n" + "=" * 60)
        print("GAIA-Text-103 Grading Summary")
        print("=" * 60)
        print(f"Total tasks found: {self.stats['total_tasks']}")
        print(f"Successfully graded: {self.stats['graded_tasks']}")
        print(f"Errors: {self.stats['errors']}")
        print("=" * 60)
async def main():
    """
    Command-line entry point of the GAIA-Text-103 grader.

    Returns:
        0 when at least one task was graded and the task files were updated;
        1 when the extraction directory is missing, nothing was graded, the
        run was interrupted, or grading raised.
    """
    cli = argparse.ArgumentParser(
        description="Grade GAIA-Text-103 tasks using LLM judgement"
    )
    cli.add_argument(
        "extraction_dir", help="Directory containing extracted GAIA-Text-103 tasks"
    )
    cli.add_argument(
        "--max-concurrent",
        type=int,
        default=5,
        help="Maximum number of concurrent grading tasks (default: 5)",
    )
    options = cli.parse_args()
    extraction_dir = options.extraction_dir
    concurrency = options.max_concurrent

    # Validate input directory
    if not os.path.exists(extraction_dir):
        print(f"Error: Extraction directory not found: {extraction_dir}")
        return 1

    print(f"Extraction directory: {extraction_dir}")
    print(f"Max concurrent tasks: {concurrency}")
    print()

    grader = GAIAText103Grader(extraction_dir)

    try:
        print("Starting grading process...")
        graded = await grader.grade_all_tasks(max_concurrent=concurrency)

        if not graded:
            print("❌ No tasks were graded successfully")
            return 1

        # Results are written back into the original task files only; no
        # separate report file is produced.
        grader.update_original_files()
        grader.print_summary()

        print("\n✅ Grading completed successfully!")
        print("📝 Original task files updated with grading results")
    except KeyboardInterrupt:
        print("\nGrading interrupted by user")
        return 1
    except Exception as e:
        print(f"Error during grading: {e}")
        return 1

    return 0
# Script entry point: run the async main() to completion and use its return
# value (0 or 1) as the process exit status.
if __name__ == "__main__":
    exit_code = asyncio.run(main())
    sys.exit(exit_code)
================================================
FILE: apps/miroflow-agent/benchmarks/subset_extraction/gaia-to-text-103-mover.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
GAIA to Text-103 Task Copier
This script:
1. Loads GAIA validation logs from a specified directory
2. Identifies tasks that belong to GAIA-Text-103 dataset
3. Copies those tasks to a new directory structure maintaining the original layout
"""
import argparse
import json
import os
import shutil
from pathlib import Path
from typing import Set
class GAIAtoText103Copier:
    """Copy GAIA-Text-103 tasks from GAIA validation logs"""

    def __init__(self, gaia_text_103_data_path: str, output_dir: str):
        """
        Initialize the copier and load the GAIA-Text-103 task IDs.

        Args:
            gaia_text_103_data_path: Path to GAIA-Text-103 standardized data file
            output_dir: Directory to save copied tasks

        Raises:
            FileNotFoundError: If the data file does not exist.
        """
        self.gaia_text_103_data_path = gaia_text_103_data_path
        self.output_dir = Path(output_dir)
        self.gaia_text_103_task_ids: Set[str] = set()
        self.copied_count = 0

        # Load GAIA-Text-103 task IDs
        self._load_gaia_text_103_tasks()

    def _load_gaia_text_103_tasks(self):
        """
        Load task IDs from the GAIA-Text-103 dataset file.

        The file is read as JSON Lines: every non-blank line is parsed as a
        JSON object and its "task_id" (when truthy) is added to the ID set.
        """
        print(f"Loading GAIA-Text-103 task IDs from {self.gaia_text_103_data_path}")

        if not os.path.exists(self.gaia_text_103_data_path):
            raise FileNotFoundError(
                f"GAIA-Text-103 data file not found: {self.gaia_text_103_data_path}"
            )

        with open(self.gaia_text_103_data_path, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    continue
                task_id = json.loads(line).get("task_id")
                if task_id:
                    self.gaia_text_103_task_ids.add(task_id)

        print(f"Loaded {len(self.gaia_text_103_task_ids)} GAIA-Text-103 task IDs")

    def copy_gaia_text_103_tasks(self, gaia_logs_dir: str) -> int:
        """
        Copy GAIA-Text-103 tasks from GAIA validation logs.

        Each matching file is copied to
        ``output_dir / <logs dir name> / <path below the logs dir>``.

        Args:
            gaia_logs_dir: Directory containing GAIA validation logs

        Returns:
            Number of copied tasks
        """
        print(f"Copying GAIA-Text-103 tasks from {gaia_logs_dir}")

        # Find all task JSON files in the logs directory (including in run subdirectories)
        task_files = [
            os.path.join(root, name)
            for root, _dirs, names in os.walk(gaia_logs_dir)
            for name in names
            if name.startswith("task_") and name.endswith(".json")
        ]

        print(f"Found {len(task_files)} task files to process")

        # Paths are made relative to the *parent* of the logs directory so the
        # logs directory name itself is kept in the output layout. Normalise
        # first: os.path.dirname("logs/gaia/") is "logs/gaia", which would
        # silently drop that level whenever a trailing separator is passed.
        layout_root = os.path.dirname(os.path.normpath(gaia_logs_dir))

        copied_count = 0

        for task_file in task_files:
            try:
                filename = os.path.basename(task_file)
                # Extract task ID from filename like: task_5188369a-3bbe-43d8-8b94-11558f909a08_attempt_1_format_retry_0_2025-08-06T21-14-23-770872Z.json
                # (every collected name starts with "task_", so index 1 exists)
                task_id = filename.split("_")[1]

                if not task_id or task_id not in self.gaia_text_103_task_ids:
                    continue

                # This is a GAIA-Text-103 task: recreate its directory
                # structure below the output directory and copy it.
                relative_path = os.path.relpath(task_file, layout_root)
                output_file = self.output_dir / relative_path
                output_file.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(task_file, output_file)

                # Count only after the copy succeeded so the reported total
                # does not include failed copies.
                copied_count += 1
                if copied_count % 50 == 0:
                    print(f"Copied {copied_count} tasks...")

            except Exception as e:
                print(f"Error processing {task_file}: {e}")
                continue

        print(f"Successfully copied {copied_count} GAIA-Text-103 tasks")
        self.copied_count = copied_count
        return copied_count

    def print_summary(self):
        """Print copying summary to console"""
        print("\n" + "=" * 60)
        print("GAIA-Text-103 Task Copying Summary")
        print("=" * 60)
        print(f"Total Tasks Copied: {self.copied_count}")
        print(f"Output Directory: {self.output_dir}")
        print("=" * 60)
def main(argv=None) -> int:
    """Command-line entry point for the GAIA-Text-103 copier.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). ``None`` makes argparse read ``sys.argv[1:]``, which is
            what the script entry point relies on.

    Returns:
        Process exit code: 0 on success (including "nothing to copy"),
        1 when an input path is missing or copying raised an error.
    """
    parser = argparse.ArgumentParser(
        description="Copy GAIA-Text-103 tasks from GAIA validation logs"
    )
    parser.add_argument(
        "gaia_logs_dir", help="Directory containing GAIA validation logs"
    )
    parser.add_argument(
        "--gaia_text_103_data",
        default="../../data/gaia-2023-validation-text-103/standardized_data.jsonl",
        help="Path to GAIA-Text-103 standardized data file",
    )
    parser.add_argument(
        "--output-dir",
        help="Output directory for copied tasks (default: side by side with gaia-validation)",
    )
    args = parser.parse_args(argv)

    # Default output directory: "gaia-text-103-extraction" next to the input
    # directory. This is the same location whether or not the input is named
    # "gaia-validation", so no special case is needed.
    if not args.output_dir:
        args.output_dir = str(
            Path(args.gaia_logs_dir).parent / "gaia-text-103-extraction"
        )

    # Validate inputs
    if not os.path.exists(args.gaia_logs_dir):
        print(f"Error: GAIA logs directory not found: {args.gaia_logs_dir}")
        return 1
    if not os.path.exists(args.gaia_text_103_data):
        print(f"Error: GAIA-Text-103 data file not found: {args.gaia_text_103_data}")
        return 1

    print(f"Input GAIA logs directory: {args.gaia_logs_dir}")
    print(f"Output directory: {args.output_dir}")
    print(f"GAIA-Text-103 data file: {args.gaia_text_103_data}")
    print()

    try:
        # Initialize copier
        copier = GAIAtoText103Copier(args.gaia_text_103_data, args.output_dir)

        # Copy tasks
        copied_count = copier.copy_gaia_text_103_tasks(args.gaia_logs_dir)
        if copied_count == 0:
            print("No GAIA-Text-103 tasks found in the logs directory")
            return 0

        # Print summary
        copier.print_summary()
        return 0
    except Exception as e:
        # Top-level boundary: report the failure and turn it into an exit code.
        print(f"Error: {e}")
        return 1
if __name__ == "__main__":
    # Raise SystemExit directly instead of calling the builtin `exit`: that
    # helper is injected by the `site` module for interactive sessions and is
    # missing when the interpreter starts without it (e.g. `python -S`).
    raise SystemExit(main())
================================================
FILE: apps/miroflow-agent/conf/__init__.py
================================================
# This file makes the conf directory a Python package
================================================
FILE: apps/miroflow-agent/conf/agent/default.yaml
================================================
# conf/agent/default.yaml
# Tool and sub-agent names are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
main_agent:
tools:
- tool-python
- tool-vqa
- tool-transcribe
- tool-reasoning
- tool-reader
max_turns: 20 # Maximum number of turns for main agent execution
sub_agents:
agent-browsing:
tools:
- tool-google-search
- tool-vqa
- tool-reader
- tool-python
max_turns: 20
# Settings for context management
keep_tool_result: -1
context_compress_limit: 0 # Enable context compression (>0 = enabled, 0 = disabled).
================================================
FILE: apps/miroflow-agent/conf/agent/demo.yaml
================================================
# conf/agent/demo.yaml
# Tool and sub-agent names are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
- default
- _self_
main_agent:
tools:
- search_and_scrape_webpage
- jina_scrape_llm_summary
- tool-python
tool_blacklist:
- [ "search_and_scrape_webpage", "sogou_search" ]
- [ "tool-python", "download_file_from_sandbox_to_local" ]
max_turns: 20 # Maximum number of turns for main agent execution
sub_agents:
# Settings for context management
keep_tool_result: -1
context_compress_limit: 0 # Enable context compression (>0 = enabled, 0 = disabled).
================================================
FILE: apps/miroflow-agent/conf/agent/mirothinker_1.7_keep5_max200.yaml
================================================
# conf/agent/mirothinker_1.7_keep5_max200.yaml
# Tool and sub-agent names are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
- default
- _self_
main_agent:
tools:
- search_and_scrape_webpage
- jina_scrape_llm_summary
- tool-python
tool_blacklist:
- [ "search_and_scrape_webpage", "sogou_search" ]
- [ "tool-python", "download_file_from_sandbox_to_local" ]
max_turns: 200 # Maximum number of turns for main agent execution
sub_agents:
# Settings for context management
keep_tool_result: 5
context_compress_limit: 5 # Enable context compression (>0 = enabled, 0 = disabled).
retry_with_summary: False # default is true
================================================
FILE: apps/miroflow-agent/conf/agent/mirothinker_1.7_keep5_max300.yaml
================================================
# conf/agent/mirothinker_1.7_keep5_max300.yaml
# Tool and sub-agent names are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
- default
- _self_
main_agent:
tools:
- search_and_scrape_webpage
- jina_scrape_llm_summary
- tool-python
tool_blacklist:
- [ "search_and_scrape_webpage", "sogou_search" ]
- [ "tool-python", "download_file_from_sandbox_to_local" ]
max_turns: 300 # Maximum number of turns for main agent execution
sub_agents:
# Settings for context management
keep_tool_result: 5
context_compress_limit: 5 # Enable context compression (>0 = enabled, 0 = disabled).
retry_with_summary: False # default is true
================================================
FILE: apps/miroflow-agent/conf/agent/mirothinker_v1.0.yaml
================================================
# conf/agent/mirothinker_v1.0.yaml
# Tool and sub-agent names are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
- default
- _self_
main_agent:
tools:
- search_and_scrape_webpage
- jina_scrape_llm_summary
- tool-python
tool_blacklist:
- [ "search_and_scrape_webpage", "sogou_search" ]
- [ "tool-python", "download_file_from_sandbox_to_local" ]
max_turns: 600 # Maximum number of turns for main agent execution
sub_agents:
# Settings for context management
keep_tool_result: -1
context_compress_limit: 0 # Enable context compression (>0 = enabled, 0 = disabled).
================================================
FILE: apps/miroflow-agent/conf/agent/mirothinker_v1.0_keep5.yaml
================================================
# conf/agent/mirothinker_v1.0_keep5.yaml
# Tool and sub-agent names are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
- default
- _self_
main_agent:
tools:
- search_and_scrape_webpage
- jina_scrape_llm_summary
- tool-python
tool_blacklist:
- [ "search_and_scrape_webpage", "sogou_search" ]
- [ "tool-python", "download_file_from_sandbox_to_local" ]
max_turns: 600 # Maximum number of turns for main agent execution
sub_agents:
# Settings for context management
keep_tool_result: 5
context_compress_limit: 5 # Enable context compression (>0 = enabled, 0 = disabled).
================================================
FILE: apps/miroflow-agent/conf/agent/mirothinker_v1.5.yaml
================================================
# conf/agent/mirothinker_v1.5.yaml
# Tool and sub-agent names are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
- default
- _self_
main_agent:
tools:
- search_and_scrape_webpage
- jina_scrape_llm_summary
- tool-python
tool_blacklist:
- [ "search_and_scrape_webpage", "sogou_search" ]
- [ "tool-python", "download_file_from_sandbox_to_local" ]
max_turns: 600 # Maximum number of turns for main agent execution
sub_agents:
# Settings for context management
keep_tool_result: -1
context_compress_limit: 0 # Enable context compression (>0 = enabled, 0 = disabled).
================================================
FILE: apps/miroflow-agent/conf/agent/mirothinker_v1.5_keep5_max200.yaml
================================================
# conf/agent/mirothinker_v1.5_keep5_max200.yaml
# Tool and sub-agent names are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
- default
- _self_
main_agent:
tools:
- search_and_scrape_webpage
- jina_scrape_llm_summary
- tool-python
tool_blacklist:
- [ "search_and_scrape_webpage", "sogou_search" ]
- [ "tool-python", "download_file_from_sandbox_to_local" ]
max_turns: 200 # Maximum number of turns for main agent execution
sub_agents:
# Settings for context management
keep_tool_result: 5
context_compress_limit: 5 # Enable context compression (>0 = enabled, 0 = disabled).
================================================
FILE: apps/miroflow-agent/conf/agent/mirothinker_v1.5_keep5_max400.yaml
================================================
# conf/agent/mirothinker_v1.5_keep5_max400.yaml
# Tool and sub-agent names are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
- default
- _self_
main_agent:
tools:
- search_and_scrape_webpage
- jina_scrape_llm_summary
- tool-python
tool_blacklist:
- [ "search_and_scrape_webpage", "sogou_search" ]
- [ "tool-python", "download_file_from_sandbox_to_local" ]
max_turns: 400 # Maximum number of turns for main agent execution
sub_agents:
# Settings for context management
keep_tool_result: 5
context_compress_limit: 5 # Enable context compression (>0 = enabled, 0 = disabled).
================================================
FILE: apps/miroflow-agent/conf/agent/multi_agent.yaml
================================================
# conf/agent/multi_agent.yaml
# Tool and sub-agent names are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
- default
- _self_
main_agent:
tools:
- tool-python
- tool-vqa
- tool-transcribe
- tool-reasoning
- tool-reader
max_turns: 50 # Maximum number of turns for main agent execution
sub_agents:
agent-browsing:
tools:
- tool-google-search
- tool-vqa
- tool-reader
- tool-python
max_turns: 50
# Settings for context management
keep_tool_result: -1
context_compress_limit: 0 # Enable context compression (>0 = enabled, 0 = disabled).
================================================
FILE: apps/miroflow-agent/conf/agent/multi_agent_os.yaml
================================================
# conf/agent/multi_agent_os.yaml
# Tool and sub-agent names are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
- default
- _self_
main_agent:
tools:
- tool-python
- tool-vqa-os
- tool-transcribe-os
- tool-reasoning-os
- tool-reader
max_turns: 50 # Maximum number of turns for main agent execution
sub_agents:
agent-browsing:
tools:
- tool-google-search
- tool-vqa-os
- tool-reader
- tool-python
max_turns: 50
# Settings for context management
keep_tool_result: -1
context_compress_limit: 0 # Enable context compression (>0 = enabled, 0 = disabled).
================================================
FILE: apps/miroflow-agent/conf/agent/single_agent.yaml
================================================
# conf/agent/single_agent.yaml
# Tool and sub-agent names are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
- default
- _self_
main_agent:
tools:
- search_and_scrape_webpage
- jina_scrape_llm_summary
- tool-python
tool_blacklist:
- [ "search_and_scrape_webpage", "sogou_search" ]
- [ "tool-python", "download_file_from_sandbox_to_local" ]
max_turns: 600 # Maximum number of turns for main agent execution
sub_agents:
# Settings for context management
keep_tool_result: -1
context_compress_limit: 0 # Enable context compression (>0 = enabled, 0 = disabled).
================================================
FILE: apps/miroflow-agent/conf/agent/single_agent_keep5.yaml
================================================
# conf/agent/single_agent_keep5.yaml
# Tool and sub-agent names are defined in: apps/miroflow-agent/src/config/settings.py
# Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py
defaults:
- default
- _self_
main_agent:
tools:
- search_and_scrape_webpage
- jina_scrape_llm_summary
- tool-python
tool_blacklist:
- [ "search_and_scrape_webpage", "sogou_search" ]
- [ "tool-python", "download_file_from_sandbox_to_local" ]
max_turns: 600 # Maximum number of turns for main agent execution
sub_agents:
# Settings for context management
keep_tool_result: 5
context_compress_limit: 5 # Enable context compression (>0 = enabled, 0 = disabled).
================================================
FILE: apps/miroflow-agent/conf/benchmark/aime2025.yaml
================================================
# conf/benchmark/aime2025.yaml
defaults:
- default
- _self_
name: "aime2025"
data:
data_dir: "../../data/aime2025"
execution:
max_tasks: null # null means no limit
max_concurrent: 5
pass_at_k: 1
================================================
FILE: apps/miroflow-agent/conf/benchmark/browsecomp.yaml
================================================
# conf/benchmark/browsecomp.yaml
defaults:
- default
- _self_
name: "browsecomp"
data:
data_dir: "../../data/browsecomp"
execution:
max_tasks: null # null means no limit
max_concurrent: 5
pass_at_k: 1
================================================
FILE: apps/miroflow-agent/conf/benchmark/browsecomp_zh.yaml
================================================
# conf/benchmark/browsecomp_zh.yaml
defaults:
- default
- _self_
name: "browsecomp_zh"
data:
data_dir: "../../data/browsecomp_zh"
execution:
max_tasks: null # null means no limit
max_concurrent: 5
pass_at_k: 1
================================================
FILE: apps/miroflow-agent/conf/benchmark/collect_trace.yaml
================================================
# conf/benchmark/collect_trace.yaml
defaults:
- default
- _self_
name: "collect_trace"
data:
data_dir: "../../data/debug"
execution:
max_tasks: null # null means no limit
max_concurrent: 5
pass_at_k: 1
================================================
FILE: apps/miroflow-agent/conf/benchmark/debug.yaml
================================================
# conf/benchmark/debug.yaml
defaults:
- default
- _self_
name: "debug"
data:
data_dir: "../../data/debug"
execution:
max_tasks: null # null means no limit
max_concurrent: 5
pass_at_k: 1
================================================
FILE: apps/miroflow-agent/conf/benchmark/deepsearchqa.yaml
================================================
# conf/benchmark/deepsearchqa.yaml
defaults:
- default
- _self_
name: "deepsearchqa"
data:
data_dir: "../../data/deepsearchqa"
execution:
max_tasks: null # null means no limit
max_concurrent: 5
pass_at_k: 1
================================================
FILE: apps/miroflow-agent/conf/benchmark/default.yaml
================================================
# conf/benchmark/default.yaml - Default benchmark configuration
# This is a base configuration for benchmarks. Specific benchmarks can override these defaults.
name: "default"
data:
metadata_file: "standardized_data.jsonl"
field_mapping:
task_id_field: "task_id"
task_question_field: "task_question"
ground_truth_field: "ground_truth"
file_name_field: "file_name"
execution:
max_tasks: null # null means no limit
max_concurrent: 5
pass_at_k: 1
================================================
FILE: apps/miroflow-agent/conf/benchmark/frames.yaml
================================================
# conf/benchmark/frames.yaml
defaults:
- default
- _self_
name: "frames"
data:
data_dir: "../../data/frames"
execution:
max_tasks: null # null means no limit
max_concurrent: 5
pass_at_k: 1
================================================
FILE: apps/miroflow-agent/conf/benchmark/futurex.yaml
================================================
# conf/benchmark/futurex.yaml
defaults:
- default
- _self_
name: "futurex"
data:
data_dir: "../../data/futurex"
execution:
max_tasks: null # null means no limit
max_concurrent: 5
pass_at_k: 1
================================================
FILE: apps/miroflow-agent/conf/benchmark/gaia-validation-text-103.yaml
================================================
# conf/benchmark/gaia-validation-text-103.yaml
defaults:
- default
- _self_
name: "gaia-validation-text-103"
data:
data_dir: "../../data/gaia-2023-validation-text-103"
execution:
max_tasks: null # null means no limit
max_concurrent: 5
pass_at_k: 1
================================================
FILE: apps/miroflow-agent/conf/benchmark/gaia-validation.yaml
================================================
# conf/benchmark/gaia-validation.yaml
defaults:
- default
- _self_
name: "gaia-validation"
data:
data_dir: "../../data/gaia-2023-validation"
execution:
max_tasks: null # null means no limit
max_concurrent: 5
pass_at_k: 1
================================================
FILE: apps/miroflow-agent/conf/benchmark/hle-text-2158.yaml
================================================
# conf/benchmark/hle-text-2158.yaml
defaults:
- default
- _self_
name: "hle-text-2158"
data:
data_dir: "../../data/hle-text-2158"
execution:
max_tasks: null # null means no limit
max_concurrent: 5
pass_at_k: 1
================================================
FILE: apps/miroflow-agent/conf/benchmark/hle-text-500.yaml
================================================
# conf/benchmark/hle-text-500.yaml
defaults:
- default
- _self_
name: "hle-text-500"
data:
data_dir: "../../data/hle-text-500"
execution:
max_tasks: null # null means no limit
max_concurrent: 5
pass_at_k: 1
================================================
FILE: apps/miroflow-agent/conf/benchmark/hle.yaml
================================================
# conf/benchmark/hle.yaml
defaults:
- default
- _self_
name: "hle"
data:
data_dir: "../../data/hle"
execution:
max_tasks: null # null means no limit
max_concurrent: 5
pass_at_k: 1
================================================
FILE: apps/miroflow-agent/conf/benchmark/seal-0.yaml
================================================
# conf/benchmark/seal-0.yaml
defaults:
- default
- _self_
name: "seal-0"
data:
data_dir: "../../data/seal-0"
execution:
max_tasks: null # null means no limit
max_concurrent: 5
pass_at_k: 1
================================================
FILE: apps/miroflow-agent/conf/benchmark/webwalkerqa.yaml
================================================
# conf/benchmark/webwalkerqa.yaml
defaults:
- default
- _self_
name: "webwalkerqa"
data:
data_dir: "../../data/webwalkerqa"
execution:
max_tasks: null # null means no limit
max_concurrent: 5
pass_at_k: 1
================================================
FILE: apps/miroflow-agent/conf/benchmark/xbench_deepsearch.yaml
================================================
# conf/benchmark/xbench_deepsearch.yaml
defaults:
- default
- _self_
name: "xbench_deepsearch"
data:
data_dir: "../../data/xbench_deepsearch"
execution:
max_tasks: null # null means no limit
max_concurrent: 5
pass_at_k: 1
================================================
FILE: apps/miroflow-agent/conf/config.yaml
================================================
# conf/config.yaml
defaults:
- llm: default
- agent: default
- benchmark: default
- _self_ # Allows variables to be defined at the top of this file
hydra:
run:
dir: ../../logs/debug
# You can define some top-level or default parameters here
project_name: "miroflow-agent"
debug_dir: "../../logs/debug"
================================================
FILE: apps/miroflow-agent/conf/llm/claude-3-7.yaml
================================================
# conf/llm/claude-3-7.yaml
defaults:
- default
- _self_
provider: "anthropic"
model_name: "claude-3-7-sonnet-20250219"
base_url: https://api.anthropic.com
max_context_length: 65536
================================================
FILE: apps/miroflow-agent/conf/llm/default.yaml
================================================
# conf/llm/default.yaml - Default LLM configuration
provider: "anthropic" # openai, anthropic, qwen
model_name: "claude-3-7-sonnet-20250219"
async_client: false
temperature: 0.3
top_p: 1.0
min_p: 0.0
top_k: -1
max_tokens: 4096
api_key: ""
base_url: https://api.anthropic.com
repetition_penalty: 1.0
================================================
FILE: apps/miroflow-agent/conf/llm/gpt-5.yaml
================================================
# conf/llm/gpt-5.yaml
defaults:
- default
- _self_
provider: "openai"
model_name: "gpt-5-2025-08-07"
base_url: https://api.openai.com/v1
max_context_length: 65536
================================================
FILE: apps/miroflow-agent/conf/llm/qwen-3.yaml
================================================
# conf/llm/qwen-3.yaml
defaults:
- default
- _self_
provider: "qwen"
model_name: "qwen-3"
base_url: "https://your-api.com/v1"
max_context_length: 262144
max_tokens: 16384
top_p: 0.95
repetition_penalty: 1.05
temperature: 1.0
================================================
FILE: apps/miroflow-agent/main.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import asyncio
import hydra
from omegaconf import DictConfig, OmegaConf
# Import from the new modular structure
from src.core.pipeline import (
create_pipeline_components,
execute_task_pipeline,
)
from src.logging.task_logger import bootstrap_logger
# Configure logger and get the configured instance
logger = bootstrap_logger()
async def amain(cfg: DictConfig) -> None:
    """Run one hard-coded example task through the agent pipeline.

    The resolved configuration is logged as YAML, the pipeline components are
    built from it, and a single demo question is executed with logs written
    to ``cfg.debug_dir``. The values returned by the pipeline are not used.
    """
    logger.info(OmegaConf.to_yaml(cfg))

    # The factory yields the main-agent tool manager, the sub-agent tool
    # managers and the output formatter, in that order.
    components = create_pipeline_components(cfg)
    main_agent_tool_manager, sub_agent_tool_managers, output_formatter = components

    # Execute the example task; the four-way unpack mirrors the pipeline's
    # return shape even though none of the results are consumed here.
    _summary, _boxed_answer, _log_path, _ = await execute_task_pipeline(
        cfg=cfg,
        task_id="task_example",
        task_file_name="",
        task_description="What is the title of today's arxiv paper in computer science?",
        main_agent_tool_manager=main_agent_tool_manager,
        sub_agent_tool_managers=sub_agent_tool_managers,
        output_formatter=output_formatter,
        log_dir=cfg.debug_dir,
    )
@hydra.main(config_path="conf", config_name="config", version_base=None)
def main(cfg: DictConfig) -> None:
    """Hydra entry point: run ``amain`` to completion with the composed config.

    The configuration is loaded by the ``hydra.main`` decorator from the
    ``conf`` directory using ``config`` as the root config name.
    """
    asyncio.run(amain(cfg))


# Script entry point; Hydra supplies ``cfg`` when ``main`` is called bare.
if __name__ == "__main__":
    main()
================================================
FILE: apps/miroflow-agent/pyproject.toml
================================================
[project]
name = "miroflow-agent"
version = "0.1.0"
description = "An agent framework for complex task solving with LLM and MCP tools"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"miroflow-tools>=0.1.0",
"huggingface-hub>=0.28.0",
"requests>=2.32.3",
"rich>=13.9.4",
"jinja2>=3.1.4",
"pillow>=11.0.0",
"markdownify>=0.14.1",
"duckduckgo-search>=6.3.7",
"python-dotenv",
"pdfminer-six",
"python-pptx",
"puremagic",
"pydub",
"SpeechRecognition",
"youtube_transcript_api",
"mcp",
"fastmcp",
"anthropic",
"e2b-code-interpreter==1.2.1",
"jsonlines>=4.0.0",
"mammoth>=1.9.0",
"numpy>=2.2.5",
"ipdb>=0.13.13",
"datasets>=3.5.0",
"openpyxl>=3.1.5",
"markitdown-mcp>=0.0.1a3",
"markitdown>=0.1.1",
"regex>=2024.11.6",
"openai>=1.78.1",
"tenacity>=9.1.2",
"transformers>=4.51.3",
"omegaconf>=2.3.0",
"wikipedia",
"mutagen",
"hydra-core",
"google-genai",
"tiktoken>=0.9.0",
"aiohttp",
"colorama>=0.4.6",
"json-repair>=0.49.0",
"tencentcloud-sdk-python>=3.0.1451"
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["src"]
[tool.uv.sources]
miroflow-tools = { path = "../../libs/miroflow-tools", editable = true }
[dependency-groups]
dev = [
"inline-snapshot>=0.23.2",
"pyright>=1.1.403",
"pytest>=8.4.1",
"pytest-asyncio>=1.0.0",
"pytest-cov>=6.2.1",
"pytest-html>=4.1.1",
"pytest-xdist>=3.7.0",
"ty>=0.0.1a14",
]
[tool.pytest.ini_options]
# see https://docs.pytest.org/en/stable/reference/customize.html#pyproject-toml
minversion = "8.3.5"
testpaths = ["tests"]
# make warning go away
# https://github.com/pytest-dev/pytest-asyncio/issues/924#issuecomment-2321921915
asyncio_default_fixture_loop_scope = "function"
addopts = [
# summary for failed AND passed tests
"-rA",
# only show captured stderr for failed tests; stdout and log output can contain sensitive information
"--show-capture=stderr",
# use `pytest-xdist` to run tests in parallel
"-n=auto",
# use `pytest-html` to generate test report in html format
"--html=report.html",
"--self-contained-html",
# use `pytest-testmon` to run tests on changed files only
# "--testmon",
# use `pytest-cov` to generate test coverage report
"--cov=miroflow_agent",
"--cov-report=html",
]
================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_aime2025.sh
================================================
#!/bin/bash
# Launch NUM_RUNS evaluation runs of the aime2025 benchmark in parallel, wait
# for all of them, then average their scores. Settings written as
# ${VAR:-default} below can be overridden through the environment.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-32}
BENCHMARK_NAME="aime2025"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/aime2025 \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # After a pipeline, $? is the status of the last command (tee), which
        # practically always succeeds. PIPESTATUS[0] is the benchmark's own
        # exit status and must be read immediately after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_browsecomp.sh
================================================
#!/bin/bash
# Launch NUM_RUNS evaluation runs of the browsecomp benchmark in parallel,
# wait for all of them, then average their scores. Settings written as
# ${VAR:-default} below can be overridden through the environment.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-3}
BENCHMARK_NAME="browsecomp"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/browsecomp \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # After a pipeline, $? is the status of the last command (tee), which
        # practically always succeeds. PIPESTATUS[0] is the benchmark's own
        # exit status and must be read immediately after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_browsecomp_zh.sh
================================================
#!/bin/bash
# Launch NUM_RUNS parallel evaluations of the "browsecomp_zh" benchmark, wait
# for all of them, then compute the average score across runs.
#
# Every setting below except BENCHMARK_NAME can be overridden through an
# environment variable of the same name; the value after ":-" is the default.
# NOTE(review): all paths are relative, so this presumably has to be started
# from apps/miroflow-agent -- confirm.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-3}
BENCHMARK_NAME="browsecomp_zh"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory (one sub-directory per provider/model/agent combination)
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment in a background subshell so the runs proceed in parallel
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/browsecomp_zh \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? after a pipeline is the status of tee (the last command), which
        # hides benchmark failures; PIPESTATUS[0] is the benchmark's own code.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_debug.sh
================================================
#!/bin/bash
# Launch NUM_RUNS parallel evaluations of the "debug" benchmark, wait for all
# of them, then compute the average score across runs. Defaults here are a
# single run with a concurrency of 1.
#
# Every setting below except BENCHMARK_NAME can be overridden through an
# environment variable of the same name; the value after ":-" is the default.
# NOTE(review): all paths are relative, so this presumably has to be started
# from apps/miroflow-agent -- confirm.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-1}
BENCHMARK_NAME="debug"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-1}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory (one sub-directory per provider/model/agent combination)
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment in a background subshell so the runs proceed in parallel
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/debug \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? after a pipeline is the status of tee (the last command), which
        # hides benchmark failures; PIPESTATUS[0] is the benchmark's own code.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_deepsearchqa.sh
================================================
#!/bin/bash
# Launch NUM_RUNS parallel evaluations of the "deepsearchqa" benchmark, wait
# for all of them, then compute the average score across runs.
#
# Every setting below except BENCHMARK_NAME can be overridden through an
# environment variable of the same name; the value after ":-" is the default.
# NOTE(review): all paths are relative, so this presumably has to be started
# from apps/miroflow-agent -- confirm.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-3}
BENCHMARK_NAME="deepsearchqa"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory (one sub-directory per provider/model/agent combination)
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment in a background subshell so the runs proceed in parallel
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/deepsearchqa \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? after a pipeline is the status of tee (the last command), which
        # hides benchmark failures; PIPESTATUS[0] is the benchmark's own code.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_frames.sh
================================================
#!/bin/bash
# Launch NUM_RUNS parallel evaluations of the "frames" benchmark, wait for all
# of them, then compute the average score across runs.
#
# Every setting below except BENCHMARK_NAME can be overridden through an
# environment variable of the same name; the value after ":-" is the default.
# NOTE(review): all paths are relative, so this presumably has to be started
# from apps/miroflow-agent -- confirm.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-3}
BENCHMARK_NAME="frames"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory (one sub-directory per provider/model/agent combination)
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment in a background subshell so the runs proceed in parallel
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/frames \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? after a pipeline is the status of tee (the last command), which
        # hides benchmark failures; PIPESTATUS[0] is the benchmark's own code.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_futurex.sh
================================================
#!/bin/bash
# Launch NUM_RUNS parallel evaluations of the "futurex" benchmark, wait for all
# of them, then extract the predictions into a FutureX submission file.
#
# Every setting below except BENCHMARK_NAME can be overridden through an
# environment variable of the same name; the value after ":-" is the default.
# NOTE(review): all paths are relative, so this presumably has to be started
# from apps/miroflow-agent -- confirm.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-8}
BENCHMARK_NAME="futurex"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory (one sub-directory per provider/model/agent combination)
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment in a background subshell so the runs proceed in parallel
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data_250924_250930.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/futurex \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? after a pipeline is the status of tee (the last command), which
        # hides benchmark failures; PIPESTATUS[0] is the benchmark's own code.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Average-score calculation is disabled for this benchmark; the predictions
# are exported for submission instead.
# echo "Calculating average scores..."
# uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "Extracting predictions and formatting for FutureX submission..."

# Branch directly on the extractor's exit status and report the outcome
if uv run python benchmarks/evaluators/extract_futurex_results.py "$RESULTS_DIR"; then
    echo "✅ Submission file generated: $RESULTS_DIR/futurex_submission.jsonl"
    echo "You can now upload this file to the FutureX test server."
else
    echo "❌ Failed to generate submission file. Please check the logs for details."
fi

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh
================================================
#!/bin/bash
# Launch NUM_RUNS parallel evaluations of the "gaia-validation-text-103"
# benchmark, wait for all of them, then compute the average score across runs.
#
# Every setting below except BENCHMARK_NAME can be overridden through an
# environment variable of the same name; the value after ":-" is the default.
# NOTE(review): all paths are relative, so this presumably has to be started
# from apps/miroflow-agent -- confirm.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-8}
BENCHMARK_NAME="gaia-validation-text-103"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory (one sub-directory per provider/model/agent combination)
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment in a background subshell so the runs proceed in parallel
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/gaia-2023-validation-text-103 \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? after a pipeline is the status of tee (the last command), which
        # hides benchmark failures; PIPESTATUS[0] is the benchmark's own code.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_gaia-validation.sh
================================================
#!/bin/bash
# Launch NUM_RUNS parallel evaluations of the "gaia-validation" benchmark, wait
# for all of them, then compute the average score across runs.
#
# Every setting below except BENCHMARK_NAME can be overridden through an
# environment variable of the same name; the value after ":-" is the default.
# NOTE(review): all paths are relative, so this presumably has to be started
# from apps/miroflow-agent -- confirm.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-8}
BENCHMARK_NAME="gaia-validation"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory (one sub-directory per provider/model/agent combination)
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment in a background subshell so the runs proceed in parallel
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/gaia-2023-validation \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? after a pipeline is the status of tee (the last command), which
        # hides benchmark failures; PIPESTATUS[0] is the benchmark's own code.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle-text-2158.sh
================================================
#!/bin/bash
# Launch NUM_RUNS parallel evaluations of the "hle-text-2158" benchmark, wait
# for all of them, then compute the average score across runs.
#
# Every setting below except BENCHMARK_NAME can be overridden through an
# environment variable of the same name; the value after ":-" is the default.
# NOTE(review): all paths are relative, so this presumably has to be started
# from apps/miroflow-agent -- confirm.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-3}
BENCHMARK_NAME="hle-text-2158"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory (one sub-directory per provider/model/agent combination)
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment in a background subshell so the runs proceed in parallel
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data_original.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/hle-text-2158 \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? after a pipeline is the status of tee (the last command), which
        # hides benchmark failures; PIPESTATUS[0] is the benchmark's own code.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle-text-500.sh
================================================
#!/bin/bash
# Launch NUM_RUNS parallel evaluations of the "hle-text-500" benchmark, wait
# for all of them, then compute the average score across runs.
#
# Every setting below except BENCHMARK_NAME can be overridden through an
# environment variable of the same name; the value after ":-" is the default.
# NOTE(review): all paths are relative, so this presumably has to be started
# from apps/miroflow-agent -- confirm.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-3}
BENCHMARK_NAME="hle-text-500"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory (one sub-directory per provider/model/agent combination)
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment in a background subshell so the runs proceed in parallel
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data_original.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/hle-text-500 \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? after a pipeline is the status of tee (the last command), which
        # hides benchmark failures; PIPESTATUS[0] is the benchmark's own code.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle.sh
================================================
#!/bin/bash
# Launch NUM_RUNS parallel evaluations of the "hle" benchmark, wait for all of
# them, then compute the average score across runs.
#
# Every setting below except BENCHMARK_NAME can be overridden through an
# environment variable of the same name; the value after ":-" is the default.
# NOTE(review): all paths are relative, so this presumably has to be started
# from apps/miroflow-agent -- confirm.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-3}
BENCHMARK_NAME="hle"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory (one sub-directory per provider/model/agent combination)
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment in a background subshell so the runs proceed in parallel
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/hle \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/${RUN_ID}" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? after a pipeline is the status of tee (the last command), which
        # hides benchmark failures; PIPESTATUS[0] is the benchmark's own code.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/${RUN_ID}" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_seal-0.sh
================================================
#!/bin/bash
# Launch NUM_RUNS evaluations of the seal-0 benchmark in parallel, wait for
# all of them, then average the scores found under RESULTS_DIR.
#
# Every setting except BENCHMARK_NAME can be overridden through an environment
# variable of the same name. All paths are relative, so the script has to be
# started from the directory that contains benchmarks/.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-8}
BENCHMARK_NAME="seal-0"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        uv run python benchmarks/common_benchmark.py \
            "benchmark=$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            "llm.provider=$LLM_PROVIDER" \
            "llm.model_name=$LLM_MODEL" \
            "llm.base_url=$BASE_URL" \
            llm.async_client=true \
            "llm.temperature=$TEMPERATURE" \
            "llm.max_context_length=$MAX_CONTEXT_LENGTH" \
            "llm.api_key=$API_KEY" \
            benchmark.execution.max_tasks=null \
            "benchmark.execution.max_concurrent=$MAX_CONCURRENT" \
            "benchmark.execution.pass_at_k=$PASS_AT_K" \
            benchmark.data.data_dir=../../data/seal-0 \
            "agent=$AGENT_SET" \
            "hydra.run.dir=${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? after a pipeline is the status of tee, which practically always
        # succeeds; PIPESTATUS[0] is the status of the benchmark itself.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_webwalkerqa.sh
================================================
#!/bin/bash
# Launch NUM_RUNS evaluations of the webwalkerqa benchmark in parallel, wait
# for all of them, then average the scores found under RESULTS_DIR.
#
# Every setting except BENCHMARK_NAME can be overridden through an environment
# variable of the same name. All paths are relative, so the script has to be
# started from the directory that contains benchmarks/.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-3}
BENCHMARK_NAME="webwalkerqa"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        uv run python benchmarks/common_benchmark.py \
            "benchmark=$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            "llm.provider=$LLM_PROVIDER" \
            "llm.model_name=$LLM_MODEL" \
            "llm.base_url=$BASE_URL" \
            llm.async_client=true \
            "llm.temperature=$TEMPERATURE" \
            "llm.max_context_length=$MAX_CONTEXT_LENGTH" \
            "llm.api_key=$API_KEY" \
            benchmark.execution.max_tasks=null \
            "benchmark.execution.max_concurrent=$MAX_CONCURRENT" \
            "benchmark.execution.pass_at_k=$PASS_AT_K" \
            benchmark.data.data_dir=../../data/webwalkerqa \
            "agent=$AGENT_SET" \
            "hydra.run.dir=${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? after a pipeline is the status of tee, which practically always
        # succeeds; PIPESTATUS[0] is the status of the benchmark itself.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
================================================
FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_xbench_deepsearch.sh
================================================
#!/bin/bash
# Launch NUM_RUNS evaluations of the xbench_deepsearch benchmark in parallel,
# wait for all of them, then average the scores found under RESULTS_DIR.
#
# Every setting except BENCHMARK_NAME can be overridden through an environment
# variable of the same name. All paths are relative, so the script has to be
# started from the directory that contains benchmarks/.

# Parse environment variables, use defaults if not set
LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"}
BASE_URL=${BASE_URL:-"https://your-api.com/v1"}

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-8}
BENCHMARK_NAME="xbench_deepsearch"
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"}
AGENT_SET=${AGENT_SET:-"single_agent_keep5"}
MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144}
MAX_CONCURRENT=${MAX_CONCURRENT:-10}
PASS_AT_K=${PASS_AT_K:-1}
TEMPERATURE=${TEMPERATURE:-1.0}
API_KEY=${API_KEY:-"xxx"}

# Set results directory
RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}"

echo "Starting $NUM_RUNS runs of the evaluation..."
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        uv run python benchmarks/common_benchmark.py \
            "benchmark=$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            "llm.provider=$LLM_PROVIDER" \
            "llm.model_name=$LLM_MODEL" \
            "llm.base_url=$BASE_URL" \
            llm.async_client=true \
            "llm.temperature=$TEMPERATURE" \
            "llm.max_context_length=$MAX_CONTEXT_LENGTH" \
            "llm.api_key=$API_KEY" \
            benchmark.execution.max_tasks=null \
            "benchmark.execution.max_concurrent=$MAX_CONCURRENT" \
            "benchmark.execution.pass_at_k=$PASS_AT_K" \
            benchmark.data.data_dir=../../data/xbench_deepsearch \
            "agent=$AGENT_SET" \
            "hydra.run.dir=${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # $? after a pipeline is the status of tee, which practically always
        # succeeds; PIPESTATUS[0] is the status of the benchmark itself.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"

echo "=========================================="
echo "Multiple runs evaluation completed!"
echo "Check results in: $RESULTS_DIR"
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
================================================
FILE: apps/miroflow-agent/src/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""MiroFlow Agent - A modular agent framework for task execution."""
from .core.orchestrator import Orchestrator
from .core.pipeline import create_pipeline_components, execute_task_pipeline
from .io.output_formatter import OutputFormatter
from .llm.factory import ClientFactory
from .logging.task_logger import TaskLog, bootstrap_logger
# Names re-exported at package level; each one is imported just above from
# the core, io, llm or logging subpackage.
__all__ = [
"Orchestrator",
"create_pipeline_components",
"execute_task_pipeline",
"OutputFormatter",
"ClientFactory",
"TaskLog",
"bootstrap_logger",
]
================================================
FILE: apps/miroflow-agent/src/config/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""Configuration module for MiroFlow Agent."""
from .settings import (
create_mcp_server_parameters,
expose_sub_agents_as_tools,
get_env_info,
)
# Public helpers of the config package, all defined in .settings.
__all__ = [
"create_mcp_server_parameters",
"expose_sub_agents_as_tools",
"get_env_info",
]
================================================
FILE: apps/miroflow-agent/src/config/settings.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
Configuration settings and MCP server parameter management.
This module handles:
- Loading environment variables for API keys and service URLs
- Creating MCP server configurations for different tools
- Exposing sub-agents as callable tools
- Collecting environment information for logging
"""
import os
import sys
from dotenv import load_dotenv
from mcp import StdioServerParameters
from omegaconf import DictConfig
# Load environment variables from .env file
load_dotenv()
# Everything below is read once, when this module is imported. A variable
# without an explicit default is None when it is unset.
# API for Google Search (Serper); base URL defaults to the public endpoint
SERPER_API_KEY = os.environ.get("SERPER_API_KEY")
SERPER_BASE_URL = os.environ.get("SERPER_BASE_URL", "https://google.serper.dev")
# API for Web Scraping (Jina); base URL defaults to the public endpoint
JINA_API_KEY = os.environ.get("JINA_API_KEY")
JINA_BASE_URL = os.environ.get("JINA_BASE_URL", "https://r.jina.ai")
# API for Linux Sandbox (E2B), passed to the python tool servers
E2B_API_KEY = os.environ.get("E2B_API_KEY")
# API for Open-Source Audio Transcription Tool (tool-transcribe-os)
WHISPER_BASE_URL = os.environ.get("WHISPER_BASE_URL")
WHISPER_API_KEY = os.environ.get("WHISPER_API_KEY")
WHISPER_MODEL_NAME = os.environ.get("WHISPER_MODEL_NAME")
# API for Open-Source VQA Tool (tool-vqa-os)
VISION_API_KEY = os.environ.get("VISION_API_KEY")
VISION_BASE_URL = os.environ.get("VISION_BASE_URL")
VISION_MODEL_NAME = os.environ.get("VISION_MODEL_NAME")
# API for Open-Source Reasoning Tool (tool-reasoning-os)
REASONING_API_KEY = os.environ.get("REASONING_API_KEY")
REASONING_BASE_URL = os.environ.get("REASONING_BASE_URL")
REASONING_MODEL_NAME = os.environ.get("REASONING_MODEL_NAME")
# API for Claude Sonnet 3.7 as Commercial Tools (passed to tool-reasoning)
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
ANTHROPIC_BASE_URL = os.environ.get("ANTHROPIC_BASE_URL", "https://api.anthropic.com")
# API Keys for Commercial Tools (OpenAI; passed to tool-vqa and tool-transcribe)
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
# API for Sogou Search (Tencent Cloud credentials)
TENCENTCLOUD_SECRET_ID = os.environ.get("TENCENTCLOUD_SECRET_ID")
TENCENTCLOUD_SECRET_KEY = os.environ.get("TENCENTCLOUD_SECRET_KEY")
# API for Summary LLM (passed to jina_scrape_llm_summary)
SUMMARY_LLM_API_KEY = os.environ.get("SUMMARY_LLM_API_KEY")
SUMMARY_LLM_BASE_URL = os.environ.get("SUMMARY_LLM_BASE_URL")
SUMMARY_LLM_MODEL_NAME = os.environ.get("SUMMARY_LLM_MODEL_NAME")
# MCP server configuration generation function
def _stdio_server_config(name, module, env=None):
    """Build one MCP server entry that runs ``module`` with the current interpreter.

    ``env`` is only forwarded when given, so servers that need no extra
    environment keep the StdioServerParameters default.
    """
    params_kwargs = {"command": sys.executable, "args": ["-m", module]}
    if env is not None:
        params_kwargs["env"] = env
    return {"name": name, "params": StdioServerParameters(**params_kwargs)}


def create_mcp_server_parameters(cfg: DictConfig, agent_cfg: DictConfig):
    """
    Create MCP server configurations based on agent configuration.

    Generates one StdioServerParameters entry for every known tool that is
    listed in ``agent_cfg["tools"]``. Each server is started as
    ``<current python> -m <module>`` and receives only the environment
    variables relevant to it. Entries are emitted in a fixed order that does
    not depend on the order of the ``tools`` list.

    Args:
        cfg: Global Hydra configuration object (not read by this function)
        agent_cfg: Agent-specific configuration containing 'tools' and 'tool_blacklist'

    Returns:
        Tuple of (configs, blacklist) where:
        - configs: List of dicts with 'name' and 'params' (StdioServerParameters)
        - blacklist: Set of (server_name, tool_name) tuples to exclude

    Raises:
        ValueError: If 'tool-google-search' is requested but SERPER_API_KEY is unset.
    """
    tools = agent_cfg.get("tools", None)

    def is_enabled(tool_name):
        # A missing or null 'tools' entry enables nothing
        return tools is not None and tool_name in tools

    configs = []

    if is_enabled("tool-google-search"):
        if not SERPER_API_KEY:
            raise ValueError(
                "SERPER_API_KEY not set, tool-google-search will be unavailable."
            )
        configs.append(
            _stdio_server_config(
                "tool-google-search",
                "miroflow_tools.mcp_servers.searching_google_mcp_server",
                {
                    "SERPER_API_KEY": SERPER_API_KEY,
                    "SERPER_BASE_URL": SERPER_BASE_URL,
                    "JINA_API_KEY": JINA_API_KEY,
                    "JINA_BASE_URL": JINA_BASE_URL,
                },
            )
        )

    # (tool name, module to run, environment) in the order entries are emitted
    static_servers = [
        (
            "tool-sogou-search",
            "miroflow_tools.mcp_servers.searching_sogou_mcp_server",
            {
                "TENCENTCLOUD_SECRET_ID": TENCENTCLOUD_SECRET_ID,
                "TENCENTCLOUD_SECRET_KEY": TENCENTCLOUD_SECRET_KEY,
                "JINA_API_KEY": JINA_API_KEY,
                "JINA_BASE_URL": JINA_BASE_URL,
            },
        ),
        (
            "tool-python",
            "miroflow_tools.mcp_servers.python_mcp_server",
            {"E2B_API_KEY": E2B_API_KEY},
        ),
        (
            "tool-vqa",
            "miroflow_tools.mcp_servers.vision_mcp_server",
            {
                "OPENAI_API_KEY": OPENAI_API_KEY,
                "OPENAI_BASE_URL": OPENAI_BASE_URL,
            },
        ),
        (
            "tool-vqa-os",
            "miroflow_tools.mcp_servers.vision_mcp_server_os",
            {
                "VISION_API_KEY": VISION_API_KEY,
                "VISION_BASE_URL": VISION_BASE_URL,
                "VISION_MODEL_NAME": VISION_MODEL_NAME,
            },
        ),
        (
            "tool-transcribe",
            "miroflow_tools.mcp_servers.audio_mcp_server",
            {
                "OPENAI_API_KEY": OPENAI_API_KEY,
                "OPENAI_BASE_URL": OPENAI_BASE_URL,
            },
        ),
        (
            "tool-transcribe-os",
            "miroflow_tools.mcp_servers.audio_mcp_server_os",
            {
                "WHISPER_BASE_URL": WHISPER_BASE_URL,
                "WHISPER_API_KEY": WHISPER_API_KEY,
                "WHISPER_MODEL_NAME": WHISPER_MODEL_NAME,
            },
        ),
        (
            "tool-reasoning",
            "miroflow_tools.mcp_servers.reasoning_mcp_server",
            {
                "ANTHROPIC_API_KEY": ANTHROPIC_API_KEY,
                "ANTHROPIC_BASE_URL": ANTHROPIC_BASE_URL,
            },
        ),
        (
            "tool-reasoning-os",
            "miroflow_tools.mcp_servers.reasoning_mcp_server_os",
            {
                "REASONING_API_KEY": REASONING_API_KEY,
                "REASONING_BASE_URL": REASONING_BASE_URL,
                "REASONING_MODEL_NAME": REASONING_MODEL_NAME,
            },
        ),
        # reader
        ("tool-reader", "markitdown_mcp", None),
        ("tool-reading", "miroflow_tools.mcp_servers.reading_mcp_server", None),
        (
            "search_and_scrape_webpage",
            "miroflow_tools.dev_mcp_servers.search_and_scrape_webpage",
            {
                "SERPER_API_KEY": SERPER_API_KEY,
                "SERPER_BASE_URL": SERPER_BASE_URL,
                "TENCENTCLOUD_SECRET_ID": TENCENTCLOUD_SECRET_ID,
                "TENCENTCLOUD_SECRET_KEY": TENCENTCLOUD_SECRET_KEY,
            },
        ),
        (
            "jina_scrape_llm_summary",
            "miroflow_tools.dev_mcp_servers.jina_scrape_llm_summary",
            {
                "JINA_API_KEY": JINA_API_KEY,
                "JINA_BASE_URL": JINA_BASE_URL,
                "SUMMARY_LLM_BASE_URL": SUMMARY_LLM_BASE_URL,
                "SUMMARY_LLM_MODEL_NAME": SUMMARY_LLM_MODEL_NAME,
                "SUMMARY_LLM_API_KEY": SUMMARY_LLM_API_KEY,
            },
        ),
        (
            "stateless_python",
            "miroflow_tools.dev_mcp_servers.stateless_python_server",
            {"E2B_API_KEY": E2B_API_KEY},
        ),
    ]
    configs.extend(
        _stdio_server_config(name, module, env)
        for name, module, env in static_servers
        if is_enabled(name)
    )

    if is_enabled("task_planner"):
        # Generate a random UUID for each MCP server instance to ensure isolation
        # Each time create_mcp_server_parameters is called, a new UUID is generated
        # This automatically isolates todo lists for concurrent tasks
        import uuid

        configs.append(
            _stdio_server_config(
                "task_planner",
                "miroflow_tools.dev_mcp_servers.task_planner",
                {"TASK_ID": str(uuid.uuid4())},
            )
        )

    # `or []` covers a 'tool_blacklist' key that is present but null, which
    # would otherwise fail with "NoneType is not iterable"
    blacklist = {
        (black_list_item[0], black_list_item[1])
        for black_list_item in agent_cfg.get("tool_blacklist", []) or []
    }
    return configs, blacklist
def expose_sub_agents_as_tools(sub_agents_cfg: DictConfig):
    """
    Convert sub-agent configurations into tool definitions for the main agent.

    Every key of ``sub_agents_cfg`` whose name contains "agent-browsing"
    contributes one "agent-browsing" server entry exposing a single
    ``search_and_browse`` tool; all other keys are ignored.

    Args:
        sub_agents_cfg: Mapping keyed by sub-agent name, or None when no
            sub-agents are configured

    Returns:
        List of server parameter dicts, each with 'name' and 'tools' keys.
        Each tool includes 'name', 'description', and 'schema' for the sub-agent.
        Empty when ``sub_agents_cfg`` is None or has no browsing agent.
    """
    # A configuration without sub-agents may pass None; treat it as "no tools"
    # instead of failing on the attribute access.
    if sub_agents_cfg is None:
        return []

    sub_agents_server_params = []
    for sub_agent in sub_agents_cfg:
        if "agent-browsing" not in sub_agent:
            continue
        browse_tool = {
            "name": "search_and_browse",
            "description": "This tool is an agent that performs the subtask of searching and browsing the web for specific missing information and generating the desired answer. The subtask should be clearly defined, include relevant background, and focus on factual gaps. It does not perform vague or speculative subtasks. \nArgs: \n\tsubtask: the subtask to be performed. \nReturns: \n\tthe result of the subtask. ",
            "schema": {
                "type": "object",
                "properties": {"subtask": {"title": "Subtask", "type": "string"}},
                "required": ["subtask"],
                "title": "search_and_browseArguments",
            },
        }
        sub_agents_server_params.append(
            {"name": "agent-browsing", "tools": [browse_tool]}
        )
    return sub_agents_server_params
def get_env_info(cfg: DictConfig) -> dict:
    """
    Snapshot the effective configuration and environment for logging.

    Secrets are never copied into the result: each API key is reported only
    as a boolean saying whether it is set.

    Args:
        cfg: Hydra configuration object

    Returns:
        Dictionary with, in this order:
        - LLM settings read from ``cfg.llm`` plus ``keep_tool_result``
        - ``main_agent_max_turns`` and one ``sub_<name>_max_turns`` entry per
          configured sub-agent (none when ``cfg.agent.sub_agents`` is None)
        - ``has_*`` availability flags for the API keys
        - Service base URLs
    """
    llm_cfg = cfg.llm
    env_info = {
        # LLM Configuration
        "llm_provider": llm_cfg.provider,
        "llm_base_url": llm_cfg.base_url,
        "llm_model_name": llm_cfg.model_name,
        "llm_temperature": llm_cfg.temperature,
        "llm_top_p": llm_cfg.top_p,
        "llm_min_p": llm_cfg.min_p,
        "llm_top_k": llm_cfg.top_k,
        "llm_max_tokens": llm_cfg.max_tokens,
        "llm_repetition_penalty": llm_cfg.repetition_penalty,
        "llm_async_client": llm_cfg.async_client,
        "keep_tool_result": cfg.agent.keep_tool_result,
        # Agent Configuration
        "main_agent_max_turns": cfg.agent.main_agent.max_turns,
    }

    sub_agents_cfg = cfg.agent.sub_agents
    if sub_agents_cfg is not None:
        for sub_agent in sub_agents_cfg:
            env_info[f"sub_{sub_agent}_max_turns"] = sub_agents_cfg[
                sub_agent
            ].max_turns

    # API Keys (masked for security): only presence is recorded
    key_flags = {
        "has_serper_api_key": SERPER_API_KEY,
        "has_jina_api_key": JINA_API_KEY,
        "has_anthropic_api_key": ANTHROPIC_API_KEY,
        "has_openai_api_key": OPENAI_API_KEY,
        "has_e2b_api_key": E2B_API_KEY,
        "has_tencent_secret_id": TENCENTCLOUD_SECRET_ID,
        "has_tencent_secret_key": TENCENTCLOUD_SECRET_KEY,
        "has_summary_llm_api_key": SUMMARY_LLM_API_KEY,
    }
    for flag_name, secret in key_flags.items():
        env_info[flag_name] = bool(secret)

    # Base URLs
    env_info["openai_base_url"] = OPENAI_BASE_URL
    env_info["anthropic_base_url"] = ANTHROPIC_BASE_URL
    env_info["jina_base_url"] = JINA_BASE_URL
    env_info["serper_base_url"] = SERPER_BASE_URL
    env_info["whisper_base_url"] = WHISPER_BASE_URL
    env_info["vision_base_url"] = VISION_BASE_URL
    env_info["reasoning_base_url"] = REASONING_BASE_URL
    env_info["summary_llm_base_url"] = SUMMARY_LLM_BASE_URL
    return env_info
================================================
FILE: apps/miroflow-agent/src/core/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""Core module containing orchestrator and pipeline components."""
from .answer_generator import AnswerGenerator
from .orchestrator import Orchestrator
from .pipeline import create_pipeline_components, execute_task_pipeline
from .stream_handler import StreamHandler
from .tool_executor import ToolExecutor
# Public names of the core package, each imported above from its own module.
__all__ = [
"AnswerGenerator",
"Orchestrator",
"StreamHandler",
"ToolExecutor",
"create_pipeline_components",
"execute_task_pipeline",
]
================================================
FILE: apps/miroflow-agent/src/core/answer_generator.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
Answer generator module for final answer generation and context management.
This module provides the AnswerGenerator class that handles:
- LLM call processing
- Failure summary generation for context compression
- Final answer generation with retries
- Context management fallback strategies
"""
import logging
from typing import Any, Dict, List, Optional, Tuple
from omegaconf import DictConfig
from ..io.output_formatter import OutputFormatter
from ..llm.base_client import BaseClient
from ..logging.task_logger import TaskLog
from ..utils.parsing_utils import extract_failure_experience_summary
from ..utils.prompt_utils import (
FAILURE_SUMMARY_ASSISTANT_PREFIX,
FAILURE_SUMMARY_PROMPT,
FORMAT_ERROR_MESSAGE,
generate_agent_summarize_prompt,
)
from ..utils.wrapper_utils import ErrorBox, ResponseBox
from .stream_handler import StreamHandler
logger = logging.getLogger(__name__)
# Safety limits for retry loops
# Number of final-answer attempts AnswerGenerator allows when
# cfg.agent.keep_tool_result == -1; any other setting gets a single attempt.
DEFAULT_MAX_FINAL_ANSWER_RETRIES = 3
class AnswerGenerator:
"""
Generator for final answers with context management support.
Handles the generation of final answers, failure summaries for retry,
and various fallback strategies based on context management settings.
"""
def __init__(
    self,
    llm_client: BaseClient,
    output_formatter: OutputFormatter,
    task_log: TaskLog,
    stream_handler: StreamHandler,
    cfg: DictConfig,
    intermediate_boxed_answers: List[str],
):
    """
    Store the collaborators and derive the context management settings.

    Args:
        llm_client: The LLM client for API calls
        output_formatter: Formatter for output processing
        task_log: Logger for task execution
        stream_handler: Handler for streaming events (kept as ``self.stream``)
        cfg: Configuration object; only ``cfg.agent`` is read here
        intermediate_boxed_answers: List to track intermediate answers; the
            list object itself is stored, not a copy
    """
    agent_settings = cfg.agent

    # Collaborators
    self.llm_client = llm_client
    self.output_formatter = output_formatter
    self.task_log = task_log
    self.stream = stream_handler
    self.cfg = cfg
    self.intermediate_boxed_answers = intermediate_boxed_answers

    # Context management settings
    self.context_compress_limit = agent_settings.get("context_compress_limit", 0)
    # Several final-answer attempts are only allowed when every tool result
    # is kept (keep_tool_result == -1); otherwise there is exactly one.
    if agent_settings.keep_tool_result == -1:
        self.max_final_answer_retries = DEFAULT_MAX_FINAL_ANSWER_RETRIES
    else:
        self.max_final_answer_retries = 1
    self.retry_with_summary = agent_settings.get("retry_with_summary", True)
async def handle_llm_call(
    self,
    system_prompt: str,
    message_history: List[Dict[str, Any]],
    tool_definitions: List[Dict],
    step_id: int,
    purpose: str = "",
    agent_type: str = "main",
) -> Tuple[Optional[str], bool, Optional[Any], List[Dict[str, Any]]]:
    """
    Run one LLM call, unwrap its result and log the outcome.

    Failures never propagate: an error box, a missing response or any
    exception is logged and reported as an empty response together with the
    history object that was passed in.

    Args:
        system_prompt: System prompt for the LLM
        message_history: Conversation history
        tool_definitions: Available tool definitions
        step_id: Current step ID for logging
        purpose: Description of the call purpose, used as log prefix
        agent_type: Type of agent making the call

    Returns:
        Tuple of (response_text, should_break, tool_calls_info, message_history).
        On failure this is ("", False, None, <history as passed in>).
    """
    # Kept so that every failure path can hand back the caller's history
    history_as_given = message_history
    failure_result = ("", False, None, history_as_given)

    try:
        llm_output, message_history = await self.llm_client.create_message(
            system_prompt=system_prompt,
            message_history=message_history,
            tool_definitions=tool_definitions,
            keep_tool_result=self.cfg.agent.keep_tool_result,
            step_id=step_id,
            task_log=self.task_log,
            agent_type=agent_type,
        )

        # An error box is surfaced to the stream and then treated as "no response"
        if ErrorBox.is_error_box(llm_output):
            await self.stream.show_error(str(llm_output))
            llm_output = None

        # A response box may carry a warning next to the actual payload
        if ResponseBox.is_response_box(llm_output):
            if llm_output.has_extra_info():
                extra_info = llm_output.get_extra_info()
                if extra_info.get("warning_msg"):
                    await self.stream.show_error(
                        extra_info.get("warning_msg", "Empty warning message")
                    )
            llm_output = llm_output.get_response()

        # Check if response is None (indicating an error occurred)
        if llm_output is None:
            self.task_log.log_step(
                "error",
                f"{purpose} | LLM Call Failed",
                f"{purpose} failed - no response received",
            )
            return failure_result

        # Use client's response processing method
        processed = self.llm_client.process_llm_response(
            llm_output, message_history, agent_type
        )
        assistant_response_text, should_break, message_history = processed

        # Use client's tool call information extraction method
        tool_calls_info = self.llm_client.extract_tool_calls_info(
            llm_output, assistant_response_text
        )

        self.task_log.log_step(
            "info",
            f"{purpose} | LLM Call",
            "completed successfully",
        )
        return (
            assistant_response_text,
            should_break,
            tool_calls_info,
            message_history,
        )
    except Exception as e:
        self.task_log.log_step(
            "error",
            f"{purpose} | LLM Call ERROR",
            f"{purpose} error: {e!s}",
        )
        # Return empty response with should_break=False, need to retry
        return failure_result
async def generate_failure_summary(
    self,
    system_prompt: str,
    message_history: List[Dict[str, Any]],
    tool_definitions: List[Dict],
    turn_count: int,
) -> Optional[str]:
    """
    Compress a failed attempt into a failure experience summary.

    This is the core of the context management mechanism. When a task attempt fails
    (i.e., the task is not completed within the given turns and context window),
    the conversation is sent back to the LLM with FAILURE_SUMMARY_PROMPT and a
    pre-filled assistant prefix, asking for a structured summary containing:
    - Failure type: incomplete / blocked / misdirected / format_missed
    - What happened: the approach taken and why a final answer was not reached
    - Useful findings: facts, intermediate results, or conclusions to be reused

    The caller's ``message_history`` list is not modified; the prompt is
    appended to a shallow copy.

    Args:
        system_prompt: The system prompt used in the conversation
        message_history: The full conversation history to be compressed
        tool_definitions: Available tool definitions
        turn_count: Current turn count, used to derive the step ID

    Returns:
        The compressed failure experience summary, or None if generation failed
    """
    self.task_log.log_step(
        "info",
        "Main Agent | Failure Summary",
        "Generating failure experience summary for potential retry...",
    )

    # Build failure summary history: drop a trailing user message, then add
    # the summary request followed by the assistant prefix for structured output
    summary_history = message_history.copy()
    if summary_history and summary_history[-1]["role"] == "user":
        summary_history.pop()
    summary_history.extend(
        [
            {"role": "user", "content": FAILURE_SUMMARY_PROMPT},
            {"role": "assistant", "content": FAILURE_SUMMARY_ASSISTANT_PREFIX},
        ]
    )

    # Call LLM to generate failure summary; only the text is of interest
    generated_text, *_ = await self.handle_llm_call(
        system_prompt,
        summary_history,
        tool_definitions,
        turn_count + 10,  # Use a different step id
        "Main Agent | Failure Experience Summary",
        agent_type="main",
    )

    if not generated_text:
        self.task_log.log_step(
            "warning",
            "Main Agent | Failure Summary",
            "Failed to generate failure experience summary",
        )
        return None

    # Prepend the assistant prefix to the response for complete output
    complete_text = FAILURE_SUMMARY_ASSISTANT_PREFIX + generated_text
    failure_experience_summary = extract_failure_experience_summary(complete_text)

    # Truncate for logging, but only add "..." if actually truncated
    ellipsis = "..." if len(failure_experience_summary) > 500 else ""
    log_preview = failure_experience_summary[:500] + ellipsis
    self.task_log.log_step(
        "info",
        "Main Agent | Failure Summary",
        f"Generated failure experience summary:\n{log_preview}",
    )
    return failure_experience_summary
async def generate_final_answer_with_retries(
    self,
    system_prompt: str,
    message_history: List[Dict[str, Any]],
    tool_definitions: List[Dict],
    turn_count: int,
    task_description: str,
) -> Tuple[Optional[str], str, Optional[str], str, List[Dict[str, Any]]]:
    """
    Generate final answer with retry mechanism.

    Replaces a trailing user message of ``message_history`` with the
    summarize prompt (the list is modified in place) and asks the LLM for a
    final answer up to ``self.max_final_answer_retries`` times, stopping at
    the first response that yields a boxed answer.

    Args:
        system_prompt: System prompt for the LLM
        message_history: Conversation history; may be empty
        tool_definitions: Available tool definitions
        turn_count: Current turn count
        task_description: Original task description

    Returns:
        Tuple of (final_answer_text, final_summary, final_boxed_answer, usage_log, message_history).
        ``final_boxed_answer`` is FORMAT_ERROR_MESSAGE when no attempt produced one.
    """
    # Generate summary prompt
    summary_prompt = generate_agent_summarize_prompt(
        task_description,
        agent_type="main",
    )

    # The emptiness check avoids an IndexError on an empty history
    if message_history and message_history[-1]["role"] == "user":
        message_history.pop()
    message_history.append({"role": "user", "content": summary_prompt})

    final_answer_text = None
    final_boxed_answer = None
    final_summary = ""
    usage_log = ""
    max_attempts = self.max_final_answer_retries

    for retry_idx in range(max_attempts):
        attempt = retry_idx + 1
        final_answer_text, _, _, message_history = await self.handle_llm_call(
            system_prompt,
            message_history,
            tool_definitions,
            turn_count + 1 + retry_idx,
            f"Main agent | Final Summary (attempt {attempt}/{max_attempts})",
            agent_type="main",
        )

        if final_answer_text:
            final_summary, final_boxed_answer, usage_log = (
                self.output_formatter.format_final_summary_and_log(
                    final_answer_text, self.llm_client
                )
            )
            if final_boxed_answer != FORMAT_ERROR_MESSAGE:
                self.task_log.log_step(
                    "info",
                    "Main Agent | Final Answer",
                    f"Boxed answer found on attempt {attempt}",
                )
                break
            self.task_log.log_step(
                "warning",
                "Main Agent | Final Answer",
                f"No boxed answer on attempt {attempt}, retrying...",
            )
        else:
            self.task_log.log_step(
                "warning",
                "Main Agent | Final Answer",
                f"Failed to generate answer on attempt {attempt}",
            )

        # Before another attempt, drop the rejected assistant reply so the
        # history ends with the summarize prompt again. After the last
        # attempt the reply is kept.
        if (
            attempt < max_attempts
            and message_history
            and message_history[-1]["role"] == "assistant"
        ):
            message_history.pop()

    # Ensure final_boxed_answer is never None
    if final_boxed_answer is None:
        final_boxed_answer = FORMAT_ERROR_MESSAGE

    return (
        final_answer_text,
        final_summary,
        final_boxed_answer,
        usage_log,
        message_history,
    )
def handle_no_context_management_fallback(
    self,
    final_answer_text: Optional[str],
    final_summary: str,
    final_boxed_answer: Optional[str],
) -> Tuple[str, str, str]:
    """
    Finalize an answer in the mode where intermediate answers may be reused.

    Intended for context_compress_limit == 0 (no context management), where
    the model gets a single chance to answer, so the most recent intermediate
    boxed answer is preferred over returning a format error.

    Args:
        final_answer_text: The generated final answer text (may be empty/None)
        final_summary: The final summary
        final_boxed_answer: The extracted boxed answer (may be None)

    Returns:
        Tuple of (final_answer_text, final_summary, final_boxed_answer);
        final_boxed_answer is never None.
    """
    if final_answer_text:
        self.task_log.log_step(
            "info",
            "Main Agent | Final Answer",
            f"Final answer content:\n\n{final_answer_text}",
        )
    else:
        # Nothing was generated: substitute placeholders and flag the error
        final_answer_text = "No final answer generated."
        final_summary = final_answer_text
        final_boxed_answer = FORMAT_ERROR_MESSAGE
        self.task_log.log_step(
            "error",
            "Main Agent | Final Answer",
            "Unable to generate final answer after all retries",
        )

    # Prefer the latest intermediate boxed answer over a missing/invalid one
    boxed_is_missing = (
        final_boxed_answer is None or final_boxed_answer == FORMAT_ERROR_MESSAGE
    )
    if boxed_is_missing and self.intermediate_boxed_answers:
        final_boxed_answer = self.intermediate_boxed_answers[-1]
        self.task_log.log_step(
            "info",
            "Main Agent | Final Answer (No Context Management)",
            f"Using intermediate boxed answer as fallback: {final_boxed_answer}",
        )

    # Callers always receive a string for the boxed answer
    if final_boxed_answer is None:
        final_boxed_answer = FORMAT_ERROR_MESSAGE

    return final_answer_text, final_summary, final_boxed_answer
def handle_context_management_no_fallback(
    self,
    final_answer_text: Optional[str],
    final_summary: str,
    final_boxed_answer: Optional[str],
) -> Tuple[str, str, str]:
    """
    Finalize an answer in the mode where intermediate answers are NOT reused.

    Intended for context_compress_limit > 0 (context management enabled).
    The task can be retried with a failure summary in that mode, so a missing
    boxed answer is reported as a format error instead of being replaced by a
    guess taken from intermediate answers.

    Args:
        final_answer_text: The generated final answer text (may be empty/None)
        final_summary: The final summary
        final_boxed_answer: The extracted boxed answer (may be None)

    Returns:
        Tuple of (final_answer_text, final_summary, final_boxed_answer);
        final_boxed_answer is never None.
    """
    if final_answer_text:
        self.task_log.log_step(
            "info",
            "Main Agent | Final Answer",
            f"Final answer content:\n\n{final_answer_text}",
        )
    else:
        # Nothing was generated: substitute placeholders and flag the error
        final_answer_text = "No final answer generated."
        final_summary = final_answer_text
        final_boxed_answer = FORMAT_ERROR_MESSAGE
        self.task_log.log_step(
            "error",
            "Main Agent | Final Answer",
            "Unable to generate final answer after all retries",
        )

    # Callers always receive a string for the boxed answer
    if final_boxed_answer is None:
        final_boxed_answer = FORMAT_ERROR_MESSAGE

    # Deliberately no intermediate-answer fallback here; only record the miss
    if final_boxed_answer == FORMAT_ERROR_MESSAGE:
        self.task_log.log_step(
            "info",
            "Main Agent | Final Answer (Context Management Mode)",
            "No valid boxed answer found. Not using intermediate fallback - will generate failure summary for retry.",
        )

    return final_answer_text, final_summary, final_boxed_answer
async def generate_and_finalize_answer(
    self,
    system_prompt: str,
    message_history: List[Dict[str, Any]],
    tool_definitions: List[Dict],
    turn_count: int,
    task_description: str,
    reached_max_turns: bool = False,
    is_final_retry: bool = False,
    save_callback=None,
) -> Tuple[str, str, Optional[str], str, List[Dict[str, Any]]]:
    """
    Produce the final answer and apply the fallback policy for the current mode.

    Context management (context_compress_limit > 0) enables multi-attempt
    problem solving, which changes how a missing answer is handled:

    | Context Management | Reached Max Turns | Behavior                                    |
    |--------------------|-------------------|---------------------------------------------|
    | OFF (limit=0)      | No                | Generate answer → fallback to intermediate  |
    | OFF (limit=0)      | Yes               | Generate answer → fallback to intermediate  |
    | ON  (limit>0)      | No                | Generate answer → no fallback, fail summary |
    | ON  (limit>0)      | Yes               | SKIP generation → fail summary directly     |

    When is_final_retry is True the run is treated like the "OFF" rows: an
    answer is always generated and the intermediate fallback is allowed.

    Args:
        system_prompt: System prompt for the LLM
        message_history: Conversation history
        tool_definitions: Available tool definitions
        turn_count: Current turn count
        task_description: Original task description
        reached_max_turns: Whether the main loop ended due to reaching max turns
        is_final_retry: Whether this is the last attempt (no further retry follows)
        save_callback: Optional callable invoked as
            save_callback(system_prompt, message_history) to persist history

    Returns:
        Tuple of (final_summary, final_boxed_answer, failure_experience_summary,
        usage_log, message_history)
    """
    context_management_enabled = self.context_compress_limit > 0
    failure_experience_summary = None
    usage_log = ""

    # Context management ON + max turns reached + more retries remain:
    # skip generation entirely, since any answer would be a blind guess.
    skip_generation = (
        context_management_enabled and reached_max_turns and not is_final_retry
    )
    if skip_generation:
        self.task_log.log_step(
            "info",
            "Main Agent | Final Answer (Context Management Mode)",
            "Reached max turns. Skipping answer generation to avoid blind guessing.",
        )
        if save_callback:
            save_callback(system_prompt, message_history)
        if self.retry_with_summary:
            failure_experience_summary = await self.generate_failure_summary(
                system_prompt, message_history, tool_definitions, turn_count
            )
        return (
            "Task incomplete - reached maximum turns. Will retry with failure experience.",
            FORMAT_ERROR_MESSAGE,
            failure_experience_summary,
            usage_log,
            message_history,
        )

    # Every other case (including the final retry) generates an answer first
    generated = await self.generate_final_answer_with_retries(
        system_prompt=system_prompt,
        message_history=message_history,
        tool_definitions=tool_definitions,
        turn_count=turn_count,
        task_description=task_description,
    )
    (
        final_answer_text,
        final_summary,
        final_boxed_answer,
        usage_log,
        message_history,
    ) = generated

    if save_callback:
        save_callback(system_prompt, message_history)

    # Context management OFF, or no retry left: intermediate answers may be used
    if is_final_retry or not context_management_enabled:
        (
            final_answer_text,
            final_summary,
            final_boxed_answer,
        ) = self.handle_no_context_management_fallback(
            final_answer_text, final_summary, final_boxed_answer
        )
        if is_final_retry:
            self.task_log.log_step(
                "info",
                "Main Agent | Final Answer (Final Retry)",
                "This is the final retry. Using intermediate fallback if available.",
            )
        return (
            final_summary,
            final_boxed_answer,
            None,
            usage_log,
            message_history,
        )

    # Context management ON with a normal finish: never guess from intermediates
    (
        final_answer_text,
        final_summary,
        final_boxed_answer,
    ) = self.handle_context_management_no_fallback(
        final_answer_text, final_summary, final_boxed_answer
    )
    needs_failure_summary = (
        final_boxed_answer == FORMAT_ERROR_MESSAGE and self.retry_with_summary
    )
    if needs_failure_summary:
        failure_experience_summary = await self.generate_failure_summary(
            system_prompt, message_history, tool_definitions, turn_count
        )

    return (
        final_summary,
        final_boxed_answer,
        failure_experience_summary,
        usage_log,
        message_history,
    )
================================================
FILE: apps/miroflow-agent/src/core/orchestrator.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
Orchestrator module for coordinating agent task execution.
This module contains the main Orchestrator class that manages the execution of tasks
by coordinating between the main agent, sub-agents, and various tools.
"""
import asyncio
import gc
import logging
import time
import uuid
from collections import defaultdict
from datetime import date
from typing import Any, Dict, List, Optional
from miroflow_tools.manager import ToolManager
from omegaconf import DictConfig
from ..config.settings import expose_sub_agents_as_tools
from ..io.input_handler import process_input
from ..io.output_formatter import OutputFormatter
from ..llm.base_client import BaseClient
from ..logging.task_logger import TaskLog, get_utc_plus_8_time
from ..utils.parsing_utils import extract_llm_response_text
from ..utils.prompt_utils import (
generate_agent_specific_system_prompt,
generate_agent_summarize_prompt,
mcp_tags,
refusal_keywords,
)
from .answer_generator import AnswerGenerator
from .stream_handler import StreamHandler
from .tool_executor import ToolExecutor
logger = logging.getLogger(__name__)
# =============================================================================
# Constants
# =============================================================================
# Default timeout for LLM calls in seconds
DEFAULT_LLM_TIMEOUT = 600
# Safety limit for retry loops: the orchestrator copies this into
# MAX_CONSECUTIVE_ROLLBACKS and also passes it to the ToolExecutor.
DEFAULT_MAX_CONSECUTIVE_ROLLBACKS = 5
# Additional attempts beyond max_turns for total loop protection
# (agent loops compute max_attempts = max_turns + EXTRA_ATTEMPTS_BUFFER).
EXTRA_ATTEMPTS_BUFFER = 200
def _list_tools(sub_agent_tool_managers: Dict[str, ToolManager]):
    """
    Build a memoizing async getter for sub-agent tool definitions.

    The returned coroutine function queries every tool manager the first time
    it is awaited and stores the resulting mapping in a closure variable;
    later awaits return that stored mapping without querying again.

    Args:
        sub_agent_tool_managers: Dictionary mapping sub-agent names to their
            ToolManager instances.

    Returns:
        An async function that returns a dictionary mapping each sub-agent name
        to the value of its manager's get_all_tool_definitions().
    """
    cached_definitions = None

    async def wrapped():
        nonlocal cached_definitions
        if cached_definitions is not None:
            return cached_definitions

        # First call: query the managers one after another, in dict order
        fetched = {}
        for agent_name, manager in sub_agent_tool_managers.items():
            fetched[agent_name] = await manager.get_all_tool_definitions()

        # Publish only after every manager answered, so a failure leaves the
        # cache empty and the next call retries from scratch
        cached_definitions = fetched
        return cached_definitions

    return wrapped
class Orchestrator:
"""
Main orchestrator for coordinating agent task execution.
Manages the execution loop for main and sub-agents, coordinating
LLM calls, tool execution, streaming events, and context management.
"""
def __init__(
    self,
    main_agent_tool_manager: ToolManager,
    sub_agent_tool_managers: Dict[str, ToolManager],
    llm_client: BaseClient,
    output_formatter: OutputFormatter,
    cfg: DictConfig,
    task_log: Optional["TaskLog"] = None,
    stream_queue: Optional[Any] = None,
    tool_definitions: Optional[List[Dict[str, Any]]] = None,
    sub_agent_tool_definitions: Optional[Dict[str, List[Dict[str, Any]]]] = None,
):
    """
    Set up the orchestrator and its helper components.

    Args:
        main_agent_tool_manager: Tool manager for main agent
        sub_agent_tool_managers: Dictionary of tool managers for sub-agents
        llm_client: The LLM client for API calls
        output_formatter: Formatter for output processing
        cfg: Configuration object
        task_log: Logger for task execution
        stream_queue: Optional async queue for streaming events
        tool_definitions: Pre-fetched tool definitions (optional)
        sub_agent_tool_definitions: Pre-fetched sub-agent tool definitions (optional)
    """
    # Collaborators and configuration handed in by the caller
    self.main_agent_tool_manager = main_agent_tool_manager
    self.sub_agent_tool_managers = sub_agent_tool_managers
    self.llm_client = llm_client
    self.output_formatter = output_formatter
    self.cfg = cfg
    self.task_log = task_log
    self.stream_queue = stream_queue
    self.tool_definitions = tool_definitions
    self.sub_agent_tool_definitions = sub_agent_tool_definitions

    # Lazy, cached lookup of sub-agent tools; stays None without sub-agents
    self._list_sub_agent_tools = (
        _list_tools(sub_agent_tool_managers) if sub_agent_tool_managers else None
    )

    # Share the task log with the LLM client
    if self.llm_client and task_log:
        self.llm_client.task_log = task_log

    # Boxed answers extracted during main loop turns
    self.intermediate_boxed_answers: List[str] = []

    # Per-cache counters of already used subtask / q / Query strings
    self.used_queries: Dict[str, Dict[str, int]] = {}

    # Retry loop protection limit
    self.MAX_CONSECUTIVE_ROLLBACKS = DEFAULT_MAX_CONSECUTIVE_ROLLBACKS

    # Context management setting (0 when not configured)
    self.context_compress_limit = cfg.agent.get("context_compress_limit", 0)

    # Helper components; the stream handler is created first because both
    # of the other helpers receive it
    self.stream = StreamHandler(stream_queue)
    self.tool_executor = ToolExecutor(
        main_agent_tool_manager=main_agent_tool_manager,
        sub_agent_tool_managers=sub_agent_tool_managers,
        output_formatter=output_formatter,
        task_log=task_log,
        stream_handler=self.stream,
        max_consecutive_rollbacks=DEFAULT_MAX_CONSECUTIVE_ROLLBACKS,
    )
    # The answer generator gets the same list object that this instance keeps
    # in self.intermediate_boxed_answers
    self.answer_generator = AnswerGenerator(
        llm_client=llm_client,
        output_formatter=output_formatter,
        task_log=task_log,
        stream_handler=self.stream,
        cfg=cfg,
        intermediate_boxed_answers=self.intermediate_boxed_answers,
    )
def _save_message_history(
self, system_prompt: str, message_history: List[Dict[str, Any]]
):
"""Save message history to task log."""
self.task_log.main_agent_message_history = {
"system_prompt": system_prompt,
"message_history": message_history,
}
self.task_log.save()
async def _handle_response_format_issues(
    self,
    assistant_response_text: str,
    message_history: List[Dict[str, Any]],
    turn_count: int,
    consecutive_rollbacks: int,
    total_attempts: int,
    max_attempts: int,
    agent_name: str,
) -> tuple:
    """
    Decide what to do with a response that contained no tool calls.

    A response containing MCP tags (a malformed tool call) or refusal
    keywords is rolled back - the turn is undone and a trailing assistant
    message is dropped - while the consecutive rollback budget lasts.
    Once the budget is used up, or when neither problem is present, the
    caller is told to end its loop.

    Args:
        assistant_response_text: The LLM response text
        message_history: Current message history
        turn_count: Current turn count
        consecutive_rollbacks: Current consecutive rollback count
        total_attempts: Total attempts made (only used in log messages)
        max_attempts: Maximum allowed attempts (only used in log messages)
        agent_name: Name of the agent for logging

    Returns:
        Tuple of (should_continue, should_break, turn_count,
        consecutive_rollbacks, message_history)
    """
    # Format error: MCP tags leaked into the plain-text response
    has_mcp_tags = any(tag in assistant_response_text for tag in mcp_tags)
    if has_mcp_tags:
        if consecutive_rollbacks >= self.MAX_CONSECUTIVE_ROLLBACKS - 1:
            self.task_log.log_step(
                "warning",
                f"{agent_name} | Turn: {turn_count} | End After Max Rollbacks",
                f"Ending agent loop after {consecutive_rollbacks} consecutive MCP format errors",
            )
            return False, True, turn_count, consecutive_rollbacks, message_history

        # Undo the turn before logging so the log shows the restored count
        turn_count -= 1
        consecutive_rollbacks += 1
        if message_history[-1]["role"] == "assistant":
            message_history.pop()
        self.task_log.log_step(
            "warning",
            f"{agent_name} | Turn: {turn_count} | Rollback",
            f"Tool call format incorrect - found MCP tags in response. "
            f"Consecutive rollbacks: {consecutive_rollbacks}/{self.MAX_CONSECUTIVE_ROLLBACKS}, "
            f"Total attempts: {total_attempts}/{max_attempts}",
        )
        return True, False, turn_count, consecutive_rollbacks, message_history

    # Refusal: at least one refusal keyword appears in the response
    matched_keywords = [
        kw for kw in refusal_keywords if kw in assistant_response_text
    ]
    if matched_keywords:
        if consecutive_rollbacks >= self.MAX_CONSECUTIVE_ROLLBACKS - 1:
            self.task_log.log_step(
                "warning",
                f"{agent_name} | Turn: {turn_count} | End After Max Rollbacks",
                f"Ending agent loop after {consecutive_rollbacks} consecutive refusals with keywords: {matched_keywords}",
            )
            return False, True, turn_count, consecutive_rollbacks, message_history

        turn_count -= 1
        consecutive_rollbacks += 1
        if message_history[-1]["role"] == "assistant":
            message_history.pop()
        self.task_log.log_step(
            "warning",
            f"{agent_name} | Turn: {turn_count} | Rollback",
            f"LLM refused to answer - found refusal keywords: {matched_keywords}. "
            f"Consecutive rollbacks: {consecutive_rollbacks}/{self.MAX_CONSECUTIVE_ROLLBACKS}, "
            f"Total attempts: {total_attempts}/{max_attempts}",
        )
        return True, False, turn_count, consecutive_rollbacks, message_history

    # No format issues - normal end without tool calls
    return False, True, turn_count, consecutive_rollbacks, message_history
async def _check_duplicate_query(
self,
tool_name: str,
arguments: dict,
cache_name: str,
consecutive_rollbacks: int,
turn_count: int,
total_attempts: int,
max_attempts: int,
message_history: List[Dict[str, Any]],
agent_name: str,
) -> tuple:
"""
Check for duplicate queries and handle rollback if needed.
Args:
tool_name: Name of the tool being called
arguments: Tool arguments
cache_name: Name of the query cache to use
consecutive_rollbacks: Current consecutive rollback count
turn_count: Current turn count
total_attempts: Total attempts made
max_attempts: Maximum allowed attempts
message_history: Current message history
agent_name: Name of the agent for logging
Returns:
Tuple of (is_duplicate, should_rollback, turn_count, consecutive_rollbacks, message_history)
"""
query_str = self.tool_executor.get_query_str_from_tool_call(
tool_name, arguments
)
if not query_str:
return False, False, turn_count, consecutive_rollbacks, message_history
self.used_queries.setdefault(cache_name, defaultdict(int))
count = self.used_queries[cache_name][query_str]
if count > 0:
if consecutive_rollbacks < self.MAX_CONSECUTIVE_ROLLBACKS - 1:
message_history.pop()
turn_count -= 1
consecutive_rollbacks += 1
self.task_log.log_step(
"warning",
f"{agent_name} | Turn: {turn_count} | Rollback",
f"Duplicate query detected - tool: {tool_name}, query: '{query_str}', "
f"previous count: {count}. Consecutive rollbacks: {consecutive_rollbacks}/"
f"{self.MAX_CONSECUTIVE_ROLLBACKS}, Total attempts: {total_attempts}/{max_attempts}",
)
return True, True, turn_count, consecutive_rollbacks, message_history
else:
self.task_log.log_step(
"warning",
f"{agent_name} | Turn: {turn_count} | Allow Duplicate",
f"Allowing duplicate query after {consecutive_rollbacks} rollbacks - "
f"tool: {tool_name}, query: '{query_str}', previous count: {count}",
)
return False, False, turn_count, consecutive_rollbacks, message_history
async def _record_query(self, cache_name: str, tool_name: str, arguments: dict):
"""Record a successful query execution."""
query_str = self.tool_executor.get_query_str_from_tool_call(
tool_name, arguments
)
if query_str:
self.used_queries.setdefault(cache_name, defaultdict(int))
self.used_queries[cache_name][query_str] += 1
async def run_sub_agent(
    self,
    sub_agent_name: str,
    task_description: str,
):
    """
    Run a sub-agent to handle a subtask.

    The sub-agent loops over LLM calls and tool executions until it stops
    requesting tools, reaches its configured max_turns, exhausts the
    rollback/attempt limits, or the context no longer has room for the
    summary prompt. It is then asked for a final summary, which is returned
    with any thinking section removed.

    Args:
        sub_agent_name: Name of the sub-agent to run
        task_description: Description of the subtask

    Returns:
        The final answer text from the sub-agent (a placeholder message if
        no answer could be generated)
    """
    task_description += "\n\nPlease provide the answer and detailed supporting information of the subtask given to you."
    self.task_log.log_step(
        "info",
        f"{sub_agent_name} | Task Description",
        f"Subtask: {task_description}",
    )

    # Stream sub-agent start
    display_name = sub_agent_name.replace("agent-", "")
    sub_agent_id = await self.stream.start_agent(display_name)
    await self.stream.start_llm(display_name)

    # Start new sub-agent session
    self.task_log.start_sub_agent_session(sub_agent_name, task_description)

    # Initialize message history
    message_history = [{"role": "user", "content": task_description}]

    # Get sub-agent tool definitions (pre-fetched ones take precedence)
    if not self.sub_agent_tool_definitions:
        tool_definitions = await self._list_sub_agent_tools()
        tool_definitions = tool_definitions.get(sub_agent_name, {})
    else:
        tool_definitions = self.sub_agent_tool_definitions[sub_agent_name]

    if not tool_definitions:
        self.task_log.log_step(
            "warning",
            f"{sub_agent_name} | No Tools",
            "No tool definitions available.",
        )

    # Generate sub-agent system prompt
    system_prompt = self.llm_client.generate_agent_system_prompt(
        date=date.today(),
        mcp_servers=tool_definitions,
    ) + generate_agent_specific_system_prompt(agent_type=sub_agent_name)

    # Limit sub-agent turns
    if self.cfg.agent.sub_agents:
        max_turns = self.cfg.agent.sub_agents[sub_agent_name].max_turns
    else:
        max_turns = 0

    turn_count = 0
    total_attempts = 0
    # total_attempts also counts rolled-back turns, so it bounds the loop
    # even when turn_count keeps being decremented
    max_attempts = max_turns + EXTRA_ATTEMPTS_BUFFER
    consecutive_rollbacks = 0

    while turn_count < max_turns and total_attempts < max_attempts:
        turn_count += 1
        total_attempts += 1

        if consecutive_rollbacks >= self.MAX_CONSECUTIVE_ROLLBACKS:
            self.task_log.log_step(
                "error",
                f"{sub_agent_name} | Too Many Rollbacks",
                f"Reached {consecutive_rollbacks} consecutive rollbacks, breaking loop.",
            )
            break

        self.task_log.save()

        # Reset 'last_call_tokens'
        self.llm_client.last_call_tokens = {
            "prompt_tokens": 0,
            "completion_tokens": 0,
        }

        # LLM call using answer generator
        (
            assistant_response_text,
            should_break,
            tool_calls,
            message_history,
        ) = await self.answer_generator.handle_llm_call(
            system_prompt,
            message_history,
            tool_definitions,
            turn_count,
            f"{sub_agent_name} | Turn: {turn_count}",
            agent_type=sub_agent_name,
        )

        if should_break:
            self.task_log.log_step(
                "info",
                f"{sub_agent_name} | Turn: {turn_count} | LLM Call",
                "should break is True, breaking the loop",
            )
            break

        if assistant_response_text:
            text_response = extract_llm_response_text(assistant_response_text)
            if text_response:
                await self.stream.tool_call("show_text", {"text": text_response})
        else:
            # Empty response: wait briefly, then try again (the turn stays counted)
            self.task_log.log_step(
                "info",
                f"{sub_agent_name} | Turn: {turn_count} | LLM Call",
                "LLM call failed",
            )
            await asyncio.sleep(5)
            continue

        # Handle no tool calls case
        if not tool_calls:
            (
                should_continue,
                should_break_loop,
                turn_count,
                consecutive_rollbacks,
                message_history,
            ) = await self._handle_response_format_issues(
                assistant_response_text,
                message_history,
                turn_count,
                consecutive_rollbacks,
                total_attempts,
                max_attempts,
                sub_agent_name,
            )
            if should_continue:
                continue
            if should_break_loop:
                # Only log the "normal end" message when the response was clean
                if not any(
                    mcp_tag in assistant_response_text for mcp_tag in mcp_tags
                ) and not any(
                    keyword in assistant_response_text
                    for keyword in refusal_keywords
                ):
                    self.task_log.log_step(
                        "info",
                        f"{sub_agent_name} | Turn: {turn_count} | LLM Call",
                        f"No tool calls found in {sub_agent_name}, ending on turn {turn_count}",
                    )
                break

        # Execute tool calls
        tool_calls_data = []
        all_tool_results_content_with_id = []
        should_rollback_turn = False

        for call in tool_calls:
            server_name = call["server_name"]
            tool_name = call["tool_name"]
            arguments = call["arguments"]
            call_id = call["id"]

            # Fix common parameter name mistakes
            arguments = self.tool_executor.fix_tool_call_arguments(
                tool_name, arguments
            )

            self.task_log.log_step(
                "info",
                f"{sub_agent_name} | Turn: {turn_count} | Tool Call",
                f"Executing {tool_name} on {server_name}",
            )

            call_start_time = time.time()
            try:
                # Check for duplicate query
                cache_name = sub_agent_id + "_" + tool_name
                (
                    is_duplicate,
                    should_rollback,
                    turn_count,
                    consecutive_rollbacks,
                    message_history,
                ) = await self._check_duplicate_query(
                    tool_name,
                    arguments,
                    cache_name,
                    consecutive_rollbacks,
                    turn_count,
                    total_attempts,
                    max_attempts,
                    message_history,
                    sub_agent_name,
                )
                if should_rollback:
                    should_rollback_turn = True
                    break

                # Send stream event
                tool_call_id = await self.stream.tool_call(tool_name, arguments)

                # Execute tool call
                tool_result = await self.sub_agent_tool_managers[
                    sub_agent_name
                ].execute_tool_call(server_name, tool_name, arguments)

                # Update query count if successful
                if "error" not in tool_result:
                    await self._record_query(cache_name, tool_name, arguments)

                # Post-process result
                tool_result = self.tool_executor.post_process_tool_call_result(
                    tool_name, tool_result
                )
                result = (
                    tool_result.get("result")
                    if tool_result.get("result")
                    else tool_result.get("error")
                )

                # Check for errors that should trigger rollback
                if self.tool_executor.should_rollback_result(
                    tool_name, result, tool_result
                ):
                    if consecutive_rollbacks < self.MAX_CONSECUTIVE_ROLLBACKS - 1:
                        message_history.pop()
                        turn_count -= 1
                        consecutive_rollbacks += 1
                        should_rollback_turn = True
                        self.task_log.log_step(
                            "warning",
                            f"{sub_agent_name} | Turn: {turn_count} | Rollback",
                            f"Tool result error - tool: {tool_name}, result: '{str(result)[:200]}'",
                        )
                        break

                await self.stream.tool_call(
                    tool_name, {"result": result}, tool_call_id=tool_call_id
                )
                call_end_time = time.time()
                call_duration_ms = int((call_end_time - call_start_time) * 1000)

                self.task_log.log_step(
                    "info",
                    f"{sub_agent_name} | Turn: {turn_count} | Tool Call",
                    f"Tool {tool_name} completed in {call_duration_ms}ms",
                )

                tool_calls_data.append(
                    {
                        "server_name": server_name,
                        "tool_name": tool_name,
                        "arguments": arguments,
                        "result": tool_result,
                        "duration_ms": call_duration_ms,
                        "call_time": get_utc_plus_8_time(),
                    }
                )

            except Exception as e:
                # A failing tool must not abort the agent: record the failure
                # and report it back to the LLM as the tool result
                call_end_time = time.time()
                call_duration_ms = int((call_end_time - call_start_time) * 1000)

                tool_calls_data.append(
                    {
                        "server_name": server_name,
                        "tool_name": tool_name,
                        "arguments": arguments,
                        "error": str(e),
                        "duration_ms": call_duration_ms,
                        "call_time": get_utc_plus_8_time(),
                    }
                )
                tool_result = {
                    "error": f"Tool call failed: {str(e)}",
                    "server_name": server_name,
                    "tool_name": tool_name,
                }
                self.task_log.log_step(
                    "error",
                    f"{sub_agent_name} | Turn: {turn_count} | Tool Call",
                    f"Tool {tool_name} failed to execute: {str(e)}",
                )

            tool_result_for_llm = self.output_formatter.format_tool_result_for_user(
                tool_result
            )
            all_tool_results_content_with_id.append((call_id, tool_result_for_llm))

        if should_rollback_turn:
            continue

        # Reset consecutive rollbacks on successful execution
        if consecutive_rollbacks > 0:
            self.task_log.log_step(
                "info",
                f"{sub_agent_name} | Turn: {turn_count} | Recovery",
                f"Successfully recovered after {consecutive_rollbacks} consecutive rollbacks",
            )
            consecutive_rollbacks = 0

        # Update message history
        message_history = self.llm_client.update_message_history(
            message_history, all_tool_results_content_with_id
        )

        # Check context length
        temp_summary_prompt = generate_agent_summarize_prompt(
            task_description,
            agent_type=sub_agent_name,
        )
        pass_length_check, message_history = self.llm_client.ensure_summary_context(
            message_history, temp_summary_prompt
        )
        if not pass_length_check:
            # Mark the loop as exhausted so the end-of-loop log reports it
            turn_count = max_turns
            self.task_log.log_step(
                "info",
                f"{sub_agent_name} | Turn: {turn_count} | Context Limit Reached",
                "Context limit reached, triggering summary",
            )
            break

    # Log loop end
    if turn_count >= max_turns:
        self.task_log.log_step(
            "info",
            f"{sub_agent_name} | Max Turns Reached / Context Limit Reached",
            f"Reached maximum turns ({max_turns}) or context limit reached",
        )
    else:
        self.task_log.log_step(
            "info",
            f"{sub_agent_name} | Main Loop Completed",
            f"Main loop completed after {turn_count} turns",
        )

    # Generate final summary
    self.task_log.log_step(
        "info",
        f"{sub_agent_name} | Final Summary",
        f"Generating {sub_agent_name} final summary",
    )

    summary_prompt = generate_agent_summarize_prompt(
        task_description,
        agent_type=sub_agent_name,
    )

    # The summary request replaces a trailing user message instead of
    # following it
    if message_history[-1]["role"] == "user":
        message_history.pop()
    message_history.append({"role": "user", "content": summary_prompt})

    await self.stream.tool_call(
        "Partial Summary", {}, tool_call_id=str(uuid.uuid4())
    )

    # Generate final answer
    (
        final_answer_text,
        should_break,
        tool_calls_info,
        message_history,
    ) = await self.answer_generator.handle_llm_call(
        system_prompt,
        message_history,
        tool_definitions,
        turn_count + 1,
        f"{sub_agent_name} | Final summary",
        agent_type=sub_agent_name,
    )

    if final_answer_text:
        self.task_log.log_step(
            "info",
            f"{sub_agent_name} | Final Answer",
            "Final answer generated successfully",
        )
    else:
        final_answer_text = (
            f"No final answer generated by sub agent {sub_agent_name}."
        )
        self.task_log.log_step(
            "error",
            f"{sub_agent_name} | Final Answer",
            "Unable to generate final answer",
        )

    # Save session history
    self.task_log.sub_agent_message_history_sessions[
        self.task_log.current_sub_agent_session_id
    ] = {"system_prompt": system_prompt, "message_history": message_history}
    self.task_log.save()

    self.task_log.end_sub_agent_session(sub_agent_name)

    # Remove thinking content: keep only the text after the last think tag.
    # The separators must be non-empty - str.split("") raises ValueError.
    final_answer_text = final_answer_text.split("<think>")[-1].strip()
    final_answer_text = final_answer_text.split("</think>")[-1].strip()

    # Stream sub-agent end
    await self.stream.end_llm(display_name)
    await self.stream.end_agent(display_name, sub_agent_id)

    return final_answer_text
async def run_main_agent(
    self,
    task_description,
    task_file_name=None,
    task_id="default_task",
    is_final_retry=False,
):
    """
    Execute the main end-to-end task.

    Runs the main agent loop: each turn calls the LLM through
    ``self.answer_generator.handle_llm_call``, executes the requested tool
    calls (delegating to ``run_sub_agent`` when the server name starts with
    "agent-" and sub-agents are configured) and appends the formatted results
    to the message history. After the loop ends the final answer is produced
    by ``self.answer_generator.generate_and_finalize_answer``.

    Args:
        task_description: Description of the task to execute
        task_file_name: Optional file associated with the task
        task_id: Unique identifier for the task
        is_final_retry: Forwarded unchanged to ``generate_and_finalize_answer``

    Returns:
        Tuple of (final_summary, final_boxed_answer, failure_experience_summary)
    """
    workflow_id = await self.stream.start_workflow(task_description)
    self.task_log.log_step("info", "Main Agent", f"Start task with id: {task_id}")
    self.task_log.log_step(
        "info", "Main Agent", f"Task description: {task_description}"
    )
    if task_file_name:
        self.task_log.log_step(
            "info", "Main Agent", f"Associated file: {task_file_name}"
        )
    # Process input
    initial_user_content, processed_task_desc = process_input(
        task_description, task_file_name
    )
    message_history = [{"role": "user", "content": initial_user_content}]
    # Record initial user input
    # NOTE(review): user_input is not referenced again within this method.
    user_input = processed_task_desc
    if task_file_name:
        user_input += f"\n[Attached file: {task_file_name}]"
    # Get tool definitions
    # Pre-supplied definitions on the instance take precedence; otherwise they
    # are fetched from the main tool manager and sub-agents are appended as tools.
    if not self.tool_definitions:
        tool_definitions = (
            await self.main_agent_tool_manager.get_all_tool_definitions()
        )
        if self.cfg.agent.sub_agents is not None:
            # NOTE(review): "+=" extends the list returned by the tool manager
            # in place — confirm the manager does not hand out a shared list.
            tool_definitions += expose_sub_agents_as_tools(
                self.cfg.agent.sub_agents
            )
    else:
        tool_definitions = self.tool_definitions
    if not tool_definitions:
        self.task_log.log_step(
            "warning",
            "Main Agent | Tool Definitions",
            "Warning: No tool definitions found. LLM cannot use any tools.",
        )
    # Generate system prompt
    system_prompt = self.llm_client.generate_agent_system_prompt(
        date=date.today(),
        mcp_servers=tool_definitions,
    ) + generate_agent_specific_system_prompt(agent_type="main")
    system_prompt = system_prompt.strip()
    # Main loop configuration
    # turn_count can be decremented on retries/rollbacks, so total_attempts
    # (which only ever increases) provides the hard upper bound on iterations.
    max_turns = self.cfg.agent.main_agent.max_turns
    turn_count = 0
    total_attempts = 0
    max_attempts = max_turns + EXTRA_ATTEMPTS_BUFFER
    consecutive_rollbacks = 0
    self.current_agent_id = await self.stream.start_agent("main")
    await self.stream.start_llm("main")
    while turn_count < max_turns and total_attempts < max_attempts:
        turn_count += 1
        total_attempts += 1
        if consecutive_rollbacks >= self.MAX_CONSECUTIVE_ROLLBACKS:
            self.task_log.log_step(
                "error",
                "Main Agent | Too Many Rollbacks",
                f"Reached {consecutive_rollbacks} consecutive rollbacks, breaking loop.",
            )
            break
        self.task_log.save()
        # LLM call
        (
            assistant_response_text,
            should_break,
            tool_calls,
            message_history,
        ) = await self.answer_generator.handle_llm_call(
            system_prompt,
            message_history,
            tool_definitions,
            turn_count,
            f"Main agent | Turn: {turn_count}",
            agent_type="main",
        )
        # Process LLM response
        if assistant_response_text:
            text_response = extract_llm_response_text(assistant_response_text)
            if text_response:
                await self.stream.tool_call("show_text", {"text": text_response})
            # Extract boxed content
            boxed_content = self.output_formatter._extract_boxed_content(
                assistant_response_text
            )
            if boxed_content:
                self.intermediate_boxed_answers.append(boxed_content)
            if should_break:
                self.task_log.log_step(
                    "info",
                    f"Main Agent | Turn: {turn_count} | LLM Call",
                    "should break is True, breaking the loop",
                )
                break
        else:
            # An empty response does not consume a turn; wait briefly and retry.
            turn_count -= 1
            self.task_log.log_step(
                "warning",
                f"Main Agent | Turn: {turn_count} | LLM Call",
                "No valid response from LLM, retrying",
            )
            await asyncio.sleep(5)
            continue
        # Handle no tool calls case
        if not tool_calls:
            (
                should_continue,
                should_break_loop,
                turn_count,
                consecutive_rollbacks,
                message_history,
            ) = await self._handle_response_format_issues(
                assistant_response_text,
                message_history,
                turn_count,
                consecutive_rollbacks,
                total_attempts,
                max_attempts,
                "Main Agent",
            )
            if should_continue:
                continue
            if should_break_loop:
                # Only log the "no tool usage" message when the response contains
                # neither an MCP tag nor a refusal keyword.
                if not any(
                    mcp_tag in assistant_response_text for mcp_tag in mcp_tags
                ) and not any(
                    keyword in assistant_response_text
                    for keyword in refusal_keywords
                ):
                    self.task_log.log_step(
                        "info",
                        f"Main Agent | Turn: {turn_count} | LLM Call",
                        "LLM did not request tool usage, ending process.",
                    )
                break
        # Execute tool calls
        tool_calls_data = []
        all_tool_results_content_with_id = []
        should_rollback_turn = False
        # Snapshot the token counters of the main agent's last LLM call; they are
        # written back after the tool calls finish (presumably because sub-agent
        # runs overwrite them on the shared client — TODO confirm).
        main_agent_last_call_tokens = self.llm_client.last_call_tokens
        for call in tool_calls:
            server_name = call["server_name"]
            tool_name = call["tool_name"]
            arguments = call["arguments"]
            call_id = call["id"]
            # Fix common parameter name mistakes
            arguments = self.tool_executor.fix_tool_call_arguments(
                tool_name, arguments
            )
            call_start_time = time.time()
            try:
                if server_name.startswith("agent-") and self.cfg.agent.sub_agents:
                    # Sub-agent execution
                    cache_name = "main_" + tool_name
                    (
                        is_duplicate,
                        should_rollback,
                        turn_count,
                        consecutive_rollbacks,
                        message_history,
                    ) = await self._check_duplicate_query(
                        tool_name,
                        arguments,
                        cache_name,
                        consecutive_rollbacks,
                        turn_count,
                        total_attempts,
                        max_attempts,
                        message_history,
                        "Main Agent",
                    )
                    if should_rollback:
                        should_rollback_turn = True
                        break
                    # Stream events
                    # Close the main agent's stream segment while the sub-agent runs.
                    await self.stream.end_llm("main")
                    await self.stream.end_agent("main", self.current_agent_id)
                    # Execute sub-agent
                    sub_agent_result = await self.run_sub_agent(
                        server_name,
                        arguments["subtask"],
                    )
                    # Update query count
                    await self._record_query(cache_name, tool_name, arguments)
                    tool_result = {
                        "server_name": server_name,
                        "tool_name": tool_name,
                        "result": sub_agent_result,
                    }
                    # Re-open the main agent's stream segment under a new agent id.
                    self.current_agent_id = await self.stream.start_agent(
                        "main", display_name="Summarizing"
                    )
                    await self.stream.start_llm("main", display_name="Summarizing")
                else:
                    # Regular tool execution
                    cache_name = "main_" + tool_name
                    (
                        is_duplicate,
                        should_rollback,
                        turn_count,
                        consecutive_rollbacks,
                        message_history,
                    ) = await self._check_duplicate_query(
                        tool_name,
                        arguments,
                        cache_name,
                        consecutive_rollbacks,
                        turn_count,
                        total_attempts,
                        max_attempts,
                        message_history,
                        "Main Agent",
                    )
                    if should_rollback:
                        should_rollback_turn = True
                        break
                    # Send stream event
                    tool_call_id = await self.stream.tool_call(tool_name, arguments)
                    # Execute tool call
                    tool_result = (
                        await self.main_agent_tool_manager.execute_tool_call(
                            server_name=server_name,
                            tool_name=tool_name,
                            arguments=arguments,
                        )
                    )
                    # Update query count if successful
                    if "error" not in tool_result:
                        await self._record_query(cache_name, tool_name, arguments)
                    # Post-process result
                    tool_result = self.tool_executor.post_process_tool_call_result(
                        tool_name, tool_result
                    )
                    # Prefer the "result" value; fall back to "error" when it is
                    # missing or falsy.
                    result = (
                        tool_result.get("result")
                        if tool_result.get("result")
                        else tool_result.get("error")
                    )
                    # Check for errors that should trigger rollback
                    if self.tool_executor.should_rollback_result(
                        tool_name, result, tool_result
                    ):
                        # Once the rollback budget is nearly used up the bad result
                        # is kept and processing continues instead of rolling back.
                        if (
                            consecutive_rollbacks
                            < self.MAX_CONSECUTIVE_ROLLBACKS - 1
                        ):
                            # Drop the last history entry and give the turn back
                            # so the LLM can try again.
                            message_history.pop()
                            turn_count -= 1
                            consecutive_rollbacks += 1
                            should_rollback_turn = True
                            self.task_log.log_step(
                                "warning",
                                f"Main Agent | Turn: {turn_count} | Rollback",
                                f"Tool result error - tool: {tool_name}, result: '{str(result)[:200]}'",
                            )
                            break
                    # Second event with the same tool_call_id carries the result.
                    await self.stream.tool_call(
                        tool_name, {"result": result}, tool_call_id=tool_call_id
                    )
                call_end_time = time.time()
                call_duration_ms = int((call_end_time - call_start_time) * 1000)
                tool_calls_data.append(
                    {
                        "server_name": server_name,
                        "tool_name": tool_name,
                        "arguments": arguments,
                        "result": tool_result,
                        "duration_ms": call_duration_ms,
                        "call_time": get_utc_plus_8_time(),
                    }
                )
                self.task_log.log_step(
                    "info",
                    f"Main Agent | Turn: {turn_count} | Tool Call",
                    f"Tool {tool_name} completed in {call_duration_ms}ms",
                )
            except Exception as e:
                # Any failure is converted into an error result that is still fed
                # back to the LLM below, rather than aborting the task.
                call_end_time = time.time()
                call_duration_ms = int((call_end_time - call_start_time) * 1000)
                tool_calls_data.append(
                    {
                        "server_name": server_name,
                        "tool_name": tool_name,
                        "arguments": arguments,
                        "error": str(e),
                        "duration_ms": call_duration_ms,
                        "call_time": get_utc_plus_8_time(),
                    }
                )
                tool_result = {
                    "server_name": server_name,
                    "tool_name": tool_name,
                    "error": str(e),
                }
                self.task_log.log_step(
                    "error",
                    f"Main Agent | Turn: {turn_count} | Tool Call",
                    f"Tool {tool_name} failed to execute: {str(e)}",
                )
            # Format results for LLM
            tool_result_for_llm = self.output_formatter.format_tool_result_for_user(
                tool_result
            )
            all_tool_results_content_with_id.append((call_id, tool_result_for_llm))
        # A rollback discards this turn's results and restarts the loop.
        if should_rollback_turn:
            continue
        # Reset consecutive rollbacks on successful execution
        if consecutive_rollbacks > 0:
            self.task_log.log_step(
                "info",
                f"Main Agent | Turn: {turn_count} | Recovery",
                f"Successfully recovered after {consecutive_rollbacks} consecutive rollbacks",
            )
            consecutive_rollbacks = 0
        # Update 'last_call_tokens'
        self.llm_client.last_call_tokens = main_agent_last_call_tokens
        # Update message history
        message_history = self.llm_client.update_message_history(
            message_history, all_tool_results_content_with_id
        )
        self.task_log.main_agent_message_history = {
            "system_prompt": system_prompt,
            "message_history": message_history,
        }
        self.task_log.save()
        # Check context length
        temp_summary_prompt = generate_agent_summarize_prompt(
            task_description,
            agent_type="main",
        )
        pass_length_check, message_history = self.llm_client.ensure_summary_context(
            message_history, temp_summary_prompt
        )
        if not pass_length_check:
            # Forcing turn_count to max_turns makes reached_max_turns True below.
            turn_count = max_turns
            self.task_log.log_step(
                "warning",
                f"Main Agent | Turn: {turn_count} | Context Limit Reached",
                "Context limit reached, triggering summary",
            )
            break
    await self.stream.end_llm("main")
    await self.stream.end_agent("main", self.current_agent_id)
    # Determine if max turns was reached
    reached_max_turns = turn_count >= max_turns
    if reached_max_turns:
        self.task_log.log_step(
            "warning",
            "Main Agent | Max Turns Reached / Context Limit Reached",
            f"Reached maximum turns ({max_turns}) or context limit reached",
        )
    else:
        self.task_log.log_step(
            "info",
            "Main Agent | Main Loop Completed",
            f"Main loop completed after {turn_count} turns",
        )
    # Final summary
    self.task_log.log_step(
        "info", "Main Agent | Final Summary", "Generating final summary"
    )
    self.current_agent_id = await self.stream.start_agent("Final Summary")
    await self.stream.start_llm("Final Summary")
    # Generate final answer using answer generator
    (
        final_summary,
        final_boxed_answer,
        failure_experience_summary,
        usage_log,
        message_history,
    ) = await self.answer_generator.generate_and_finalize_answer(
        system_prompt=system_prompt,
        message_history=message_history,
        tool_definitions=tool_definitions,
        turn_count=turn_count,
        task_description=task_description,
        reached_max_turns=reached_max_turns,
        is_final_retry=is_final_retry,
        save_callback=self._save_message_history,
    )
    await self.stream.tool_call("show_text", {"text": final_boxed_answer})
    await self.stream.end_llm("Final Summary")
    await self.stream.end_agent("Final Summary", self.current_agent_id)
    await self.stream.end_workflow(workflow_id)
    self.task_log.log_step(
        "info", "Main Agent | Usage Calculation", f"Usage log: {usage_log}"
    )
    self.task_log.log_step(
        "info",
        "Main Agent | Final boxed answer",
        f"Final boxed answer:\n\n{final_boxed_answer}",
    )
    self.task_log.log_step(
        "info",
        "Main Agent | Task Completed",
        f"Main agent task {task_id} completed successfully",
    )
    # Explicit collection pass once the task is finished.
    gc.collect()
    return final_summary, final_boxed_answer, failure_experience_summary
================================================
FILE: apps/miroflow-agent/src/core/pipeline.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
Task execution pipeline module.
This module provides:
- execute_task_pipeline: Main function to run a complete task from start to finish
- create_pipeline_components: Factory function to initialize all pipeline components
The pipeline orchestrates the interaction between LLM clients, tool managers,
and the orchestrator to execute complex multi-turn agent tasks.
"""
import traceback
import uuid
from typing import Any, Dict, List, Optional
from miroflow_tools.manager import ToolManager
from omegaconf import DictConfig
from ..config.settings import (
create_mcp_server_parameters,
get_env_info,
)
from ..io.output_formatter import OutputFormatter
from ..llm.factory import ClientFactory
from ..logging.task_logger import (
TaskLog,
get_utc_plus_8_time,
)
from .orchestrator import Orchestrator
async def execute_task_pipeline(
    cfg: DictConfig,
    task_id: str,
    task_description: str,
    task_file_name: str,
    main_agent_tool_manager: ToolManager,
    sub_agent_tool_managers: Dict[str, ToolManager],
    output_formatter: OutputFormatter,
    ground_truth: Optional[Any] = None,
    log_dir: str = "logs",
    stream_queue: Optional[Any] = None,
    tool_definitions: Optional[List[Dict[str, Any]]] = None,
    sub_agent_tool_definitions: Optional[Dict[str, List[Dict[str, Any]]]] = None,
    is_final_retry: bool = False,
):
    """
    Executes the full pipeline for a single task.

    Any exception raised while running the task is caught, recorded in the task
    log and reported through the returned summary; it is not re-raised.

    Args:
        cfg: The Hydra configuration object.
        task_id: A unique identifier for this task run (used for logging).
        task_description: The description of the task for the LLM.
        task_file_name: The path to an associated file (empty string if none).
        main_agent_tool_manager: An initialized main agent ToolManager instance.
        sub_agent_tool_managers: Dictionary mapping sub-agent names to their ToolManager instances.
        output_formatter: An initialized OutputFormatter instance.
        ground_truth: The ground truth for the task (optional).
        log_dir: The directory to save the task log (default: "logs").
        stream_queue: A queue for streaming the task execution (optional).
        tool_definitions: The definitions of the tools for the main agent (optional).
        sub_agent_tool_definitions: The definitions of the tools for the sub-agents (optional).
        is_final_retry: Forwarded unchanged to ``Orchestrator.run_main_agent``.

    Returns:
        A tuple of (final_summary, final_boxed_answer, log_file_path, failure_experience_summary):
        - final_summary: A string with the final execution summary, or an error message.
        - final_boxed_answer: The extracted boxed answer from the LLM response
          (empty string on failure).
        - log_file_path: The path to the saved task log file.
        - failure_experience_summary: Summary of failure experience for retry
          (None on failure or when the orchestrator returns none).
    """
    # Create task log
    task_log = TaskLog(
        log_dir=log_dir,
        task_id=task_id,
        start_time=get_utc_plus_8_time(),
        input={"task_description": task_description, "task_file_name": task_file_name},
        env_info=get_env_info(cfg),
        ground_truth=ground_truth,
    )

    # Log task start
    task_log.log_step(
        "info", "Main | Task Start", f"--- Starting Task Execution: {task_id} ---"
    )

    # Set task_log for all ToolManager instances
    main_agent_tool_manager.set_task_log(task_log)
    if sub_agent_tool_managers:
        for sub_agent_tool_manager in sub_agent_tool_managers.values():
            sub_agent_tool_manager.set_task_log(task_log)

    # Holds the client only while it still needs closing, so the error path can
    # release it without ever calling close() twice.
    llm_client = None
    try:
        # Initialize LLM client
        random_uuid = str(uuid.uuid4())
        unique_id = f"{task_id}-{random_uuid}"
        llm_client = ClientFactory(task_id=unique_id, cfg=cfg, task_log=task_log)

        # Initialize orchestrator
        orchestrator = Orchestrator(
            main_agent_tool_manager=main_agent_tool_manager,
            sub_agent_tool_managers=sub_agent_tool_managers,
            llm_client=llm_client,
            output_formatter=output_formatter,
            cfg=cfg,
            task_log=task_log,
            stream_queue=stream_queue,
            tool_definitions=tool_definitions,
            sub_agent_tool_definitions=sub_agent_tool_definitions,
        )

        (
            final_summary,
            final_boxed_answer,
            failure_experience_summary,
        ) = await orchestrator.run_main_agent(
            task_description=task_description,
            task_file_name=task_file_name,
            task_id=task_id,
            is_final_retry=is_final_retry,
        )

        # Clear the tracking variable first: if close() itself raises, the
        # handler below must not attempt a second close.
        finished_client, llm_client = llm_client, None
        finished_client.close()

        task_log.final_boxed_answer = final_boxed_answer
        task_log.status = "success"

        # Store failure experience summary in task log if available
        if failure_experience_summary:
            task_log.trace_data["failure_experience_summary"] = (
                failure_experience_summary
            )

        log_file_path = task_log.save()
        return (
            final_summary,
            final_boxed_answer,
            log_file_path,
            failure_experience_summary,
        )

    except Exception as e:
        # Capture the traceback of the original failure before any cleanup runs.
        error_details = traceback.format_exc()
        task_log.log_step(
            "warning",
            "task_error_notification",
            f"An error occurred during task {task_id}",
        )
        task_log.log_step("error", "task_error_details", error_details)

        # Release the client on the failure path as well; a cleanup error must
        # not mask the original one, so it is only logged.
        if llm_client is not None:
            try:
                llm_client.close()
            except Exception as close_error:
                task_log.log_step(
                    "warning",
                    "llm_client_close_error",
                    f"Failed to close LLM client for task {task_id}: {close_error}",
                )

        error_message = (
            f"Error executing task {task_id}:\n"
            f"Description: {task_description}\n"
            f"File: {task_file_name}\n"
            f"Error Type: {type(e).__name__}\n"
            f"Error Details:\n{error_details}"
        )

        task_log.status = "failed"
        task_log.error = error_details

        log_file_path = task_log.save()
        return error_message, "", log_file_path, None

    finally:
        # Runs after either return above: stamp the end time and save once more.
        task_log.end_time = get_utc_plus_8_time()

        # Record task summary to structured log
        task_log.log_step(
            "info",
            "task_execution_finished",
            f"Task {task_id} execution completed with status: {task_log.status}",
        )
        task_log.save()
def create_pipeline_components(cfg: DictConfig):
    """
    Build the tool managers and output formatter used by the agent pipeline.

    One ToolManager is created for the main agent and, when sub-agents are
    configured, one per sub-agent keyed by its name in ``cfg.agent.sub_agents``.

    Args:
        cfg: The Hydra configuration object.

    Returns:
        Tuple of (main_agent_tool_manager, sub_agent_tool_managers, output_formatter)
    """

    def _tool_manager_for(agent_cfg):
        # Resolve the MCP server parameters and blacklist for one agent config.
        server_configs, blacklist = create_mcp_server_parameters(cfg, agent_cfg)
        return ToolManager(server_configs, tool_blacklist=blacklist)

    main_agent_tool_manager = _tool_manager_for(cfg.agent.main_agent)
    output_formatter = OutputFormatter()

    # Single agent mode: no sub-agent managers to build.
    if not cfg.agent.sub_agents:
        return main_agent_tool_manager, {}, output_formatter

    sub_agent_tool_managers = {
        sub_agent: _tool_manager_for(cfg.agent.sub_agents[sub_agent])
        for sub_agent in cfg.agent.sub_agents
    }
    return main_agent_tool_manager, sub_agent_tool_managers, output_formatter
================================================
FILE: apps/miroflow-agent/src/core/stream_handler.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
Stream handler module for SSE (Server-Sent Events) protocol.
This module provides the StreamHandler class that manages all streaming events
for real-time communication with clients during agent task execution.
"""
import logging
import uuid
from typing import Any, Optional
logger = logging.getLogger(__name__)


class StreamHandler:
    """
    Handler for streaming events in SSE protocol format.

    Manages the sending of various event types including workflow lifecycle,
    agent lifecycle, LLM interactions, and tool calls. Every event is put on
    the queue as a ``{"event": <type>, "data": <payload>}`` dictionary. When no
    queue was supplied, all methods still return their generated IDs but send
    nothing.
    """

    def __init__(self, stream_queue: Optional[Any] = None):
        """
        Initialize the stream handler.

        Args:
            stream_queue: Optional async queue for sending stream messages
                (must expose an awaitable ``put``). If None, streaming is disabled.
        """
        self.stream_queue = stream_queue

    async def update(self, event_type: str, data: dict) -> None:
        """
        Send a streaming update in SSE protocol format.

        Failures while queueing are logged as warnings and swallowed so that
        streaming problems never interrupt task execution.

        Args:
            event_type: The type of event (e.g., 'start_of_workflow', 'tool_call')
            data: The event payload data
        """
        # Compare against None explicitly: an empty queue-like object that
        # defines __len__/__bool__ is falsy but must still receive events.
        if self.stream_queue is None:
            return
        try:
            stream_message = {
                "event": event_type,
                "data": data,
            }
            await self.stream_queue.put(stream_message)
        except Exception as e:
            logger.warning("Failed to send stream update: %s", e)

    async def start_workflow(self, user_input: str) -> str:
        """
        Send start_of_workflow event.

        Args:
            user_input: The initial user input for the workflow

        Returns:
            The generated workflow ID
        """
        workflow_id = str(uuid.uuid4())
        await self.update(
            "start_of_workflow",
            {
                "workflow_id": workflow_id,
                "input": [
                    {
                        "role": "user",
                        "content": user_input,
                    }
                ],
            },
        )
        return workflow_id

    async def end_workflow(self, workflow_id: str) -> None:
        """
        Send end_of_workflow event.

        Args:
            workflow_id: The workflow ID to end
        """
        await self.update(
            "end_of_workflow",
            {
                "workflow_id": workflow_id,
            },
        )

    async def show_error(self, error: str) -> None:
        """
        Send show_error event and signal stream end.

        The error is sent as a ``show_error`` tool call, followed by a ``None``
        sentinel placed on the queue.

        Args:
            error: The error message to display
        """
        await self.tool_call("show_error", {"error": error})
        if self.stream_queue is None:
            return
        try:
            await self.stream_queue.put(None)
        except Exception as e:
            logger.warning("Failed to send show_error: %s", e)

    async def start_agent(
        self, agent_name: str, display_name: Optional[str] = None
    ) -> str:
        """
        Send start_of_agent event.

        Args:
            agent_name: Internal name of the agent
            display_name: Optional display name for UI

        Returns:
            The generated agent ID
        """
        agent_id = str(uuid.uuid4())
        await self.update(
            "start_of_agent",
            {
                "agent_name": agent_name,
                "display_name": display_name,
                "agent_id": agent_id,
            },
        )
        return agent_id

    async def end_agent(self, agent_name: str, agent_id: str) -> None:
        """
        Send end_of_agent event.

        Args:
            agent_name: Internal name of the agent
            agent_id: The agent ID to end
        """
        await self.update(
            "end_of_agent",
            {
                "agent_name": agent_name,
                "agent_id": agent_id,
            },
        )

    async def start_llm(
        self, agent_name: str, display_name: Optional[str] = None
    ) -> None:
        """
        Send start_of_llm event.

        Args:
            agent_name: Name of the agent making the LLM call
            display_name: Optional display name for UI
        """
        await self.update(
            "start_of_llm",
            {
                "agent_name": agent_name,
                "display_name": display_name,
            },
        )

    async def end_llm(self, agent_name: str) -> None:
        """
        Send end_of_llm event.

        Args:
            agent_name: Name of the agent that finished LLM call
        """
        await self.update(
            "end_of_llm",
            {
                "agent_name": agent_name,
            },
        )

    async def message(self, message_id: str, delta_content: str) -> None:
        """
        Send message event with streaming content.

        Args:
            message_id: Unique identifier for the message
            delta_content: The content delta to send
        """
        await self.update(
            "message",
            {
                "message_id": message_id,
                "delta": {
                    "content": delta_content,
                },
            },
        )

    async def tool_call(
        self,
        tool_name: str,
        payload: dict,
        streaming: bool = False,
        tool_call_id: Optional[str] = None,
    ) -> str:
        """
        Send tool_call event.

        Args:
            tool_name: Name of the tool being called
            payload: Tool call arguments or results
            streaming: If True, send one event per payload key as ``delta_input``;
                otherwise send the whole payload once as ``tool_input``
            tool_call_id: Optional existing tool call ID; reuse it to attach a
                result to an earlier call

        Returns:
            The tool call ID (generated if not provided)
        """
        if not tool_call_id:
            tool_call_id = str(uuid.uuid4())

        if streaming:
            for key, value in payload.items():
                await self.update(
                    "tool_call",
                    {
                        "tool_call_id": tool_call_id,
                        "tool_name": tool_name,
                        "delta_input": {key: value},
                    },
                )
        else:
            # Send complete tool call
            await self.update(
                "tool_call",
                {
                    "tool_call_id": tool_call_id,
                    "tool_name": tool_name,
                    "tool_input": payload,
                },
            )

        return tool_call_id
================================================
FILE: apps/miroflow-agent/src/core/tool_executor.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
Tool executor module for handling tool call execution.
This module provides the ToolExecutor class that manages tool call execution,
including argument fixing, duplicate detection, result processing, and error handling.
"""
import json
import logging
import os
import time
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple
from miroflow_tools.manager import ToolManager
from ..io.output_formatter import OutputFormatter
from ..logging.task_logger import TaskLog, get_utc_plus_8_time
from .stream_handler import StreamHandler
logger = logging.getLogger(__name__)

# Maximum length for scrape results in demo mode (to support more conversation turns)
DEMO_SCRAPE_MAX_LENGTH = 20_000


class ToolExecutor:
    """
    Executor for tool calls with support for duplicate detection and result processing.

    Handles the execution of tool calls, including parameter fixing, duplicate query
    detection, result truncation in demo mode, and error handling.
    """

    # Argument keys that identify a query for each supported tool, in the order
    # in which their values are joined into the duplicate-detection string.
    _QUERY_ARGUMENT_KEYS: Dict[str, Tuple[str, ...]] = {
        "search_and_browse": ("subtask",),
        "google_search": ("q",),
        "sogou_search": ("Query",),
        "scrape_website": ("url",),
        "scrape_and_extract_info": ("url", "info_to_extract"),
    }

    # Result prefixes that mark a tool call as failed badly enough to roll back.
    _ROLLBACK_ERROR_PREFIXES: Tuple[str, ...] = (
        "Unknown tool:",
        "Error executing tool",
    )

    def __init__(
        self,
        main_agent_tool_manager: ToolManager,
        sub_agent_tool_managers: Dict[str, ToolManager],
        output_formatter: OutputFormatter,
        task_log: TaskLog,
        stream_handler: StreamHandler,
        max_consecutive_rollbacks: int = 5,
    ):
        """
        Initialize the tool executor.

        Args:
            main_agent_tool_manager: Tool manager for main agent
            sub_agent_tool_managers: Dictionary of tool managers for sub-agents
            output_formatter: Formatter for tool results
            task_log: Logger for task execution
            stream_handler: Handler for streaming events
            max_consecutive_rollbacks: Maximum allowed consecutive rollbacks
        """
        self.main_agent_tool_manager = main_agent_tool_manager
        self.sub_agent_tool_managers = sub_agent_tool_managers
        self.output_formatter = output_formatter
        self.task_log = task_log
        self.stream = stream_handler
        self.max_consecutive_rollbacks = max_consecutive_rollbacks

        # Track used queries to detect duplicates: cache name -> query -> count
        self.used_queries: Dict[str, Dict[str, int]] = {}

    def fix_tool_call_arguments(self, tool_name: str, arguments: dict) -> dict:
        """
        Fix common parameter name mistakes made by LLM.

        Args:
            tool_name: Name of the tool being called
            arguments: Original arguments dictionary (None is treated as empty)

        Returns:
            Fixed arguments dictionary (always a new object; the input is not modified)
        """
        # Work on a copy so the caller's dictionary is left untouched; tolerate a
        # missing arguments object instead of raising AttributeError.
        fixed_args = arguments.copy() if arguments else {}

        # Fix scrape_and_extract_info parameter names
        if tool_name == "scrape_and_extract_info":
            # Map common mistakes to the correct parameter name
            mistake_names = ["description", "introduction"]
            if "info_to_extract" not in fixed_args:
                for mistake_name in mistake_names:
                    if mistake_name in fixed_args:
                        fixed_args["info_to_extract"] = fixed_args.pop(mistake_name)
                        break

        # Fix run_python_code parameter names: 'code' -> 'code_block'
        # Also add default sandbox_id if missing (will trigger stateless fallback)
        if tool_name == "run_python_code":
            if "code_block" not in fixed_args and "code" in fixed_args:
                fixed_args["code_block"] = fixed_args.pop("code")
            if "sandbox_id" not in fixed_args:
                fixed_args["sandbox_id"] = "default"

        return fixed_args

    def get_query_str_from_tool_call(
        self, tool_name: str, arguments: dict
    ) -> Optional[str]:
        """
        Extract the query string from tool call arguments based on tool_name.

        Supports search_and_browse, google_search, sogou_search, scrape_website,
        and scrape_and_extract_info. The result is the tool name followed by the
        relevant argument values, joined with underscores.

        Args:
            tool_name: Name of the tool
            arguments: Tool arguments dictionary

        Returns:
            Query string for duplicate detection, or None if not applicable
        """
        argument_keys = self._QUERY_ARGUMENT_KEYS.get(tool_name)
        if argument_keys is None:
            return None

        parts = [tool_name]
        for key in argument_keys:
            value = arguments.get(key, "")
            # Arguments come from the LLM and may be None or non-string; coerce
            # them rather than failing on string concatenation.
            parts.append("" if value is None else str(value))
        return "_".join(parts)

    def is_duplicate_query(self, cache_name: str, query_str: str) -> Tuple[bool, int]:
        """
        Check if a query has been executed before.

        Args:
            cache_name: Name of the cache (e.g., "main_google_search")
            query_str: The query string to check

        Returns:
            Tuple of (is_duplicate, previous_count)
        """
        cache = self.used_queries.setdefault(cache_name, defaultdict(int))
        # Use .get so that merely checking a query does not insert a zero-count
        # entry into the defaultdict.
        count = cache.get(query_str, 0)
        return count > 0, count

    def record_query(self, cache_name: str, query_str: str):
        """
        Record that a query has been executed.

        Args:
            cache_name: Name of the cache
            query_str: The query string to record
        """
        cache = self.used_queries.setdefault(cache_name, defaultdict(int))
        cache[query_str] += 1

    def is_google_search_empty_result(self, tool_name: str, tool_result: dict) -> bool:
        """
        Check if google_search result has empty organic results.

        This indicates a poor search query that should be retried.

        Args:
            tool_name: Name of the tool
            tool_result: The tool execution result

        Returns:
            True if the result is empty and should trigger retry; False for other
            tools, missing results, or results that cannot be interpreted
        """
        if tool_name != "google_search":
            return False

        result = tool_result.get("result")
        if not result:
            return False

        try:
            # The result may arrive either as a JSON string or already parsed.
            if isinstance(result, str):
                result_dict = json.loads(result)
            else:
                result_dict = result
            organic = result_dict.get("organic", [])
            return len(organic) == 0
        except (json.JSONDecodeError, TypeError, AttributeError):
            return False

    def get_scrape_result(self, result: str) -> str:
        """
        Process scrape result and truncate if too long.

        A JSON object is reduced to ``{"text": ...}`` with the text truncated;
        anything else that is a string (plain text, or JSON that is not an
        object) is truncated as raw text.

        Args:
            result: Raw scrape result string (JSON or plain text)

        Returns:
            Processed result, truncated to DEMO_SCRAPE_MAX_LENGTH if necessary.
            Non-string input is returned unchanged.
        """
        if not isinstance(result, str):
            # Nothing to parse or slice safely; hand the value back as-is.
            return result

        try:
            parsed = json.loads(result)
        except json.JSONDecodeError:
            parsed = None

        if isinstance(parsed, dict):
            text = parsed.get("text")
            if isinstance(text, str) and len(text) > DEMO_SCRAPE_MAX_LENGTH:
                text = text[:DEMO_SCRAPE_MAX_LENGTH]
            # NOTE(review): keys other than "text" are dropped here, as before.
            return json.dumps({"text": text}, ensure_ascii=False)

        # Plain text, or valid JSON that is not an object (list, number, string).
        return result[:DEMO_SCRAPE_MAX_LENGTH]

    def post_process_tool_call_result(
        self, tool_name: str, tool_call_result: dict
    ) -> dict:
        """
        Process tool call results.

        Only in demo mode (environment variable DEMO_MODE == "1"): truncate scrape
        results to DEMO_SCRAPE_MAX_LENGTH characters to support more conversation turns.

        Args:
            tool_name: Name of the tool
            tool_call_result: The tool execution result (updated in place)

        Returns:
            Processed tool result (the same dictionary object)
        """
        if os.environ.get("DEMO_MODE") == "1":
            if "result" in tool_call_result and tool_name in [
                "scrape",
                "scrape_website",
            ]:
                tool_call_result["result"] = self.get_scrape_result(
                    tool_call_result["result"]
                )
        return tool_call_result

    def should_rollback_result(
        self, tool_name: str, result: Any, tool_result: dict
    ) -> bool:
        """
        Check if a tool result should trigger a rollback.

        Args:
            tool_name: Name of the tool
            result: The result value
            tool_result: Full tool result dictionary

        Returns:
            True if the result indicates an error that should trigger rollback
        """
        return str(result).startswith(
            self._ROLLBACK_ERROR_PREFIXES
        ) or self.is_google_search_empty_result(tool_name, tool_result)

    async def execute_single_tool_call(
        self,
        tool_manager: ToolManager,
        server_name: str,
        tool_name: str,
        arguments: dict,
        agent_name: str,
        turn_count: int,
    ) -> Tuple[dict, int, List[dict]]:
        """
        Execute a single tool call.

        Exceptions raised by the tool manager are caught and converted into an
        error result, so this method does not raise for tool failures.

        Args:
            tool_manager: The tool manager to use
            server_name: Name of the MCP server
            tool_name: Name of the tool
            arguments: Tool arguments
            agent_name: Name of the agent making the call
            turn_count: Current turn count

        Returns:
            Tuple of (tool_result, duration_ms, tool_calls_data)
        """
        call_start_time = time.time()
        tool_calls_data = []

        try:
            # Execute tool call
            tool_result = await tool_manager.execute_tool_call(
                server_name, tool_name, arguments
            )

            # Post-process result
            tool_result = self.post_process_tool_call_result(tool_name, tool_result)

            call_end_time = time.time()
            call_duration_ms = int((call_end_time - call_start_time) * 1000)

            self.task_log.log_step(
                "info",
                f"{agent_name} | Turn: {turn_count} | Tool Call",
                f"Tool {tool_name} completed in {call_duration_ms}ms",
            )

            tool_calls_data.append(
                {
                    "server_name": server_name,
                    "tool_name": tool_name,
                    "arguments": arguments,
                    "result": tool_result,
                    "duration_ms": call_duration_ms,
                    "call_time": get_utc_plus_8_time(),
                }
            )

            return tool_result, call_duration_ms, tool_calls_data

        except Exception as e:
            call_end_time = time.time()
            call_duration_ms = int((call_end_time - call_start_time) * 1000)

            tool_calls_data.append(
                {
                    "server_name": server_name,
                    "tool_name": tool_name,
                    "arguments": arguments,
                    "error": str(e),
                    "duration_ms": call_duration_ms,
                    "call_time": get_utc_plus_8_time(),
                }
            )

            tool_result = {
                "error": f"Tool call failed: {str(e)}",
                "server_name": server_name,
                "tool_name": tool_name,
            }

            self.task_log.log_step(
                "error",
                f"{agent_name} | Turn: {turn_count} | Tool Call",
                f"Tool {tool_name} failed to execute: {str(e)}",
            )

            return tool_result, call_duration_ms, tool_calls_data

    def format_tool_result_for_llm(self, tool_result: dict) -> dict:
        """
        Format tool result for feeding back to LLM.

        Delegates to ``self.output_formatter.format_tool_result_for_user``.

        Args:
            tool_result: The tool execution result

        Returns:
            Formatted result suitable for LLM message
        """
        return self.output_formatter.format_tool_result_for_user(tool_result)
================================================
FILE: apps/miroflow-agent/src/io/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""Input/Output module for processing task inputs and formatting outputs."""
from .input_handler import process_input
from .output_formatter import OutputFormatter
# Public names of this package: the two objects imported above from the
# input_handler and output_formatter submodules.
__all__ = [
    "process_input",
    "OutputFormatter",
]
================================================
FILE: apps/miroflow-agent/src/io/input_handler.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
Input handler module for processing various file types.
This module provides functions for:
- Processing task inputs with associated files
- Converting documents (PDF, DOCX, PPTX, XLSX) to markdown
- Generating captions for images, audio, and video files
- Extracting task-relevant information from media files
Supported file formats:
- Documents: PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, HTML
- Images: JPG, JPEG, PNG, GIF, WEBP
- Audio: WAV, MP3, M4A
- Video: MP4, MOV, AVI, MKV, WEBM
- Data: JSON, JSONLD, CSV, YAML, TOML
- Code: PY, SH, MD, TXT
- Archives: ZIP
"""
import base64
import html
import json
import os
import re
import shutil
import tempfile
import traceback
from typing import Any, Tuple, Union
from urllib.parse import quote, unquote, urlparse, urlunparse
import mammoth
import markdownify
import openpyxl
import pdfminer
import pdfminer.high_level
import pptx
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from markitdown import MarkItDown
from openai import OpenAI
from openpyxl.utils import get_column_letter
# Ensure .env file is loaded
load_dotenv()
# Lower-case file extensions (without the leading dot), grouped by media type
IMAGE_EXTENSIONS = {"jpg", "jpeg", "png", "gif", "webp"}
AUDIO_EXTENSIONS = {"wav", "mp3", "m4a"}
VIDEO_EXTENSIONS = {"mp4", "mov", "avi", "mkv", "webm"}
# Every media extension, regardless of type
MEDIA_EXTENSIONS = set().union(IMAGE_EXTENSIONS, AUDIO_EXTENSIONS, VIDEO_EXTENSIONS)
# Extensions that are never handed to the MarkItDown fallback
SKIP_MARKITDOWN_EXTENSIONS = {*MEDIA_EXTENSIONS, "pdb"}
def _generate_image_caption(image_path: str) -> str:
    """
    Describe an image by sending it to the "gpt-4o" chat-completions model.

    The API key and base URL come from the OPENAI_API_KEY and OPENAI_BASE_URL
    environment variables. The image is embedded in the request as a base64
    data URL.

    Args:
        image_path: Path to the image file

    Returns:
        The model's description, or a bracketed placeholder message when the
        API key is missing, the response is empty, or any step raises
    """
    try:
        api_key = os.environ.get("OPENAI_API_KEY")
        base_url = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
        if not api_key:
            return "[Caption unavailable: OPENAI_API_KEY not set]"

        client = OpenAI(api_key=api_key, base_url=base_url)

        # Load the raw bytes and base64-encode them for the data URL
        with open(image_path, "rb") as image_file:
            raw_bytes = image_file.read()
        image_data = base64.b64encode(raw_bytes).decode("utf-8")

        # Derive the MIME type from the extension; unknown extensions are sent as JPEG
        suffix = os.path.splitext(image_path)[1].lower()
        mime_by_suffix = {
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".gif": "image/gif",
            ".webp": "image/webp",
        }
        mime_type = mime_by_suffix.get(suffix, "image/jpeg")

        instruction_part = {
            "type": "text",
            "text": "Please provide a detailed description of this image. Include key objects, people, text, colors, and any other relevant details.",
        }
        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:{mime_type};base64,{image_data}"},
        }

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": [instruction_part, image_part]}],
            max_tokens=2048,
            temperature=0,
        )
        content = response.choices[0].message.content
        return content or "[Caption unavailable: Empty response]"
    except Exception as e:
        return f"[Caption generation failed: {str(e)}]"
def _generate_audio_caption(audio_path: str) -> str:
    """
    Transcribe an audio file with the "gpt-4o-transcribe" model.

    The API key and base URL come from the OPENAI_API_KEY and OPENAI_BASE_URL
    environment variables.

    Args:
        audio_path: Path to the audio file

    Returns:
        The transcription text, or a bracketed placeholder message when the
        API key is missing, the transcription is empty, or any step raises
    """
    try:
        api_key = os.environ.get("OPENAI_API_KEY")
        base_url = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
        if not api_key:
            return "[Caption unavailable: OPENAI_API_KEY not set]"

        client = OpenAI(api_key=api_key, base_url=base_url)

        # The open file handle is passed straight to the transcription endpoint
        with open(audio_path, "rb") as audio_file:
            transcription = client.audio.transcriptions.create(
                model="gpt-4o-transcribe", file=audio_file
            )

        transcript = transcription.text
        return transcript or "[Transcription unavailable: Empty response]"
    except Exception as e:
        return f"[Caption generation failed: {str(e)}]"
def _generate_video_caption(video_path: str) -> str:
    """
    Describe a video by sending it to the "gpt-4o" chat-completions model.

    The whole file is base64-encoded and passed as a data URL in an
    ``image_url`` content part.
    NOTE(review): whether the endpoint accepts video payloads in an
    ``image_url`` part cannot be confirmed from this file - verify.

    Args:
        video_path: Path to the video file

    Returns:
        The model's description, or a bracketed placeholder message when the
        API key is missing, the response is empty, or any step raises
    """
    try:
        api_key = os.environ.get("OPENAI_API_KEY")
        base_url = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
        if not api_key:
            return "[Caption unavailable: OPENAI_API_KEY not set]"

        client = OpenAI(api_key=api_key, base_url=base_url)

        # Load the raw bytes and base64-encode them for the data URL
        with open(video_path, "rb") as video_file:
            raw_bytes = video_file.read()
        video_data = base64.b64encode(raw_bytes).decode("utf-8")

        # Derive the MIME type from the extension; unknown extensions are sent as MP4
        suffix = os.path.splitext(video_path)[1].lower()
        mime_by_suffix = {
            ".mp4": "video/mp4",
            ".mov": "video/quicktime",
            ".avi": "video/x-msvideo",
            ".mkv": "video/x-matroska",
            ".webm": "video/webm",
        }
        mime_type = mime_by_suffix.get(suffix, "video/mp4")

        instruction_part = {
            "type": "text",
            "text": "Please provide a detailed description of this video. Include key events, people, objects, actions, audio information, and any text visible in the video.",
        }
        video_part = {
            "type": "image_url",
            "image_url": {"url": f"data:{mime_type};base64,{video_data}"},
        }

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": [instruction_part, video_part]}],
            max_tokens=2048,
            temperature=0,
        )
        content = response.choices[0].message.content
        return content or "[Caption unavailable: Empty response]"
    except Exception as e:
        return f"[Caption generation failed: {str(e)}]"
def _extract_task_relevant_info_from_image(
    image_path: str, task_description: str
) -> str:
    """
    Extract task-relevant information directly from an image based on the task description.

    The image is base64-encoded and sent, together with a task-specific
    prompt, to the "gpt-4o" chat-completions model. Credentials are read from
    the OPENAI_API_KEY / OPENAI_BASE_URL environment variables.

    Args:
        image_path: Path to the image file
        task_description: The user's task description

    Returns:
        Extracted relevant information (whitespace-stripped), or an empty
        string when the API key is missing, the model returns no content,
        or any step raises (a warning is printed in the latter case)
    """
    try:
        OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
        if not OPENAI_API_KEY:
            return ""
        client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

        # Read and encode image
        with open(image_path, "rb") as image_file:
            image_data = base64.b64encode(image_file.read()).decode("utf-8")

        # Guess MIME type from the extension; unknown extensions are sent as JPEG
        _, ext = os.path.splitext(image_path)
        mime_type = {
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".gif": "image/gif",
            ".webp": "image/webp",
        }.get(ext.lower(), "image/jpeg")

        text_prompt = f"""Based on the following task, analyze this image and extract only the information that is directly relevant to completing the task.
Task: {task_description}
Please provide a concise summary of the relevant information from the image that would help in completing this task. Focus only on what's pertinent to the task. If nothing is particularly relevant, state "No specific task-relevant details identified in the image." Keep the response brief and focused."""

        # Call OpenAI API with task-specific prompt
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": text_prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image_data}"
                            },
                        },
                    ],
                }
            ],
            max_tokens=1024,
            temperature=0,
        )
        content = response.choices[0].message.content
        # The content may be None/empty; treat that as "nothing extracted"
        # instead of letting .strip() raise and be reported as a failure.
        return content.strip() if content else ""
    except Exception as e:
        print(f"Warning: Failed to extract task-relevant info from image: {str(e)}")
        return ""
def _extract_task_relevant_info_from_audio(
    audio_path: str, task_description: str
) -> str:
    """
    Extract task-relevant information directly from an audio file based on the task description.

    The audio is base64-encoded and sent as an ``input_audio`` content part,
    together with a task-specific prompt, to the "gpt-4o-audio-preview"
    chat-completions model. Credentials are read from the OPENAI_API_KEY /
    OPENAI_BASE_URL environment variables.

    Args:
        audio_path: Path to the audio file
        task_description: The user's task description

    Returns:
        Extracted relevant information (whitespace-stripped), or an empty
        string when the API key is missing, the model returns no content,
        or any step raises (a warning is printed in the latter case)
    """
    try:
        OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
        if not OPENAI_API_KEY:
            return ""
        client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

        # Read and encode audio file
        with open(audio_path, "rb") as audio_file:
            audio_data = base64.b64encode(audio_file.read()).decode("utf-8")

        # Detect audio format from the extension; unknown extensions are sent as mp3.
        # NOTE(review): confirm the endpoint accepts "m4a" as an input_audio format.
        _, ext = os.path.splitext(audio_path)
        audio_format = {
            ".mp3": "mp3",
            ".wav": "wav",
            ".m4a": "m4a",
        }.get(ext.lower(), "mp3")

        # Use gpt-4o-audio-preview for direct audio question answering
        text_prompt = f"""Based on the following task, analyze this audio and extract only the information that is directly relevant to completing the task.
Task: {task_description}
Please provide a concise summary of the relevant information from the audio that would help in completing this task. Focus only on what's pertinent to the task. If nothing is particularly relevant, state "No specific task-relevant details identified in the audio." Keep the response brief and focused."""

        response = client.chat.completions.create(
            model="gpt-4o-audio-preview",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant specializing in audio analysis.",
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": text_prompt},
                        {
                            "type": "input_audio",
                            "input_audio": {
                                "data": audio_data,
                                "format": audio_format,
                            },
                        },
                    ],
                },
            ],
            max_tokens=1024,
            temperature=0,
        )
        content = response.choices[0].message.content
        # The content may be None/empty; treat that as "nothing extracted"
        # instead of letting .strip() raise and be reported as a failure.
        return content.strip() if content else ""
    except Exception as e:
        print(f"Warning: Failed to extract task-relevant info from audio: {str(e)}")
        return ""
def _extract_task_relevant_info_from_video(
    video_path: str, task_description: str
) -> str:
    """
    Extract task-relevant information directly from a video based on the task description.

    The video is base64-encoded and sent as a data URL in an ``image_url``
    content part, together with a task-specific prompt, to the "gpt-4o"
    chat-completions model. Credentials are read from the OPENAI_API_KEY /
    OPENAI_BASE_URL environment variables.
    NOTE(review): whether the endpoint accepts video payloads in an
    ``image_url`` part cannot be confirmed from this file - verify.

    Args:
        video_path: Path to the video file
        task_description: The user's task description

    Returns:
        Extracted relevant information (whitespace-stripped), or an empty
        string when the API key is missing, the model returns no content,
        or any step raises (a warning is printed in the latter case)
    """
    try:
        OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
        if not OPENAI_API_KEY:
            return ""
        client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

        # Read and encode video
        with open(video_path, "rb") as video_file:
            video_data = base64.b64encode(video_file.read()).decode("utf-8")

        # Guess MIME type from the extension; unknown extensions are sent as MP4
        _, ext = os.path.splitext(video_path)
        mime_type = {
            ".mp4": "video/mp4",
            ".mov": "video/quicktime",
            ".avi": "video/x-msvideo",
            ".mkv": "video/x-matroska",
            ".webm": "video/webm",
        }.get(ext.lower(), "video/mp4")

        text_prompt = f"""Based on the following task, analyze this video and extract only the information that is directly relevant to completing the task.
Task: {task_description}
Please provide a concise summary of the relevant information from the video that would help in completing this task. Focus only on what's pertinent to the task. If nothing is particularly relevant, state "No specific task-relevant details identified in the video." Keep the response brief and focused."""

        # Call OpenAI API with task-specific prompt
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": text_prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{video_data}"
                            },
                        },
                    ],
                }
            ],
            max_tokens=1024,
            temperature=0,
        )
        content = response.choices[0].message.content
        # The content may be None/empty; treat that as "nothing extracted"
        # instead of letting .strip() raise and be reported as a failure.
        return content.strip() if content else ""
    except Exception as e:
        print(f"Warning: Failed to extract task-relevant info from video: {str(e)}")
        return ""
def process_input(task_description: str, task_file_name: str) -> Tuple[str, str]:
    """
    Process user input and associated files.

    Extracts content from the task file (if provided) and appends it to the
    task description in a format suitable for the LLM. The file type is
    chosen from the file name's extension; types without a dedicated branch
    are handed to MarkItDown as a fallback (media and PDB files are not).
    Extracted text longer than 200,000 characters is truncated, whether or
    not the parsed document has a title.

    A fixed answer-format instruction is always appended to the task
    description, before any file content.

    Args:
        task_description: The original task description
        task_file_name: Path to an associated file, or empty string if none

    Returns:
        Tuple of (updated_task_description, updated_task_description)
        Both values are the same - the task description with file content appended

    File errors are not raised: a warning line is appended to the returned
    description instead.
    """
    max_len = 200_000  # Limit the length of results returned to LLM
    updated_task_description = task_description
    file_content_section = ""  # Collect file content to append at the end

    if task_file_name:
        try:
            file_extension = task_file_name.rsplit(".", maxsplit=1)[-1].lower()
            parsing_result = None

            if file_extension in IMAGE_EXTENSIONS:
                # Generate unconditional image caption
                caption = _generate_image_caption(task_file_name)
                # Extract task-relevant information directly from the image
                relevant_info = _extract_task_relevant_info_from_image(
                    task_file_name, task_description
                )
                # Format as Markdown
                file_content_section += f"\n\nNote: An image file '{task_file_name}' is associated with this task. The content has been extracted as a detailed caption below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## Image Content\nFile: {task_file_name}\n\n"
                file_content_section += f"> {caption}\n\n"
                if relevant_info:
                    file_content_section += "Task-Relevant Information:\n\n"
                    file_content_section += f"{relevant_info}\n\n"
            elif file_extension == "py":
                # Python files - read directly
                with open(task_file_name, "r", encoding="utf-8") as f:
                    parsing_result = DocumentConverterResult(
                        title=None, text_content=f.read()
                    )
                file_content_section += f"\n\nNote: A Python file '{task_file_name}' is associated with this task. The content has been extracted as text below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## Python File\nFile: {task_file_name}\n\n"
            elif file_extension in ["txt", "md", "sh", "yaml", "yml", "toml", "csv"]:
                # Text-based files - read directly
                with open(task_file_name, "r", encoding="utf-8") as f:
                    parsing_result = DocumentConverterResult(
                        title=None, text_content=f.read()
                    )
                file_type_name = {
                    "txt": "Text",
                    "md": "Markdown",
                    "sh": "Shell Script",
                    "yaml": "YAML",
                    "yml": "YAML",
                    "toml": "TOML",
                    "csv": "CSV",
                }.get(file_extension, "Text")
                file_content_section += f"\n\nNote: A {file_type_name.lower()} file '{task_file_name}' is associated with this task. The content has been extracted as text below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += (
                    f"## {file_type_name} File\nFile: {task_file_name}\n\n"
                )
            elif file_extension in ["jsonld", "json"]:
                # Re-serialise so the LLM sees consistently indented JSON
                with open(task_file_name, "r", encoding="utf-8") as f:
                    parsing_result = DocumentConverterResult(
                        title=None,
                        text_content=json.dumps(
                            json.load(f), ensure_ascii=False, indent=2
                        ),
                    )
                file_content_section += f"\n\nNote: A JSON file '{task_file_name}' is associated with this task. The content has been extracted as JSON format below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## JSON File\nFile: {task_file_name}\n\n"
            elif file_extension in ["xlsx", "xls"]:
                parsing_result = XlsxConverter(local_path=task_file_name)
                file_content_section += f"\n\nNote: An Excel file '{task_file_name}' is associated with this task. The content has been extracted as a markdown table below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## Excel File\nFile: {task_file_name}\n\n"
            elif file_extension == "pdf":
                parsing_result = DocumentConverterResult(
                    title=None,
                    text_content=pdfminer.high_level.extract_text(task_file_name),
                )
                file_content_section += f"\n\nNote: A PDF file '{task_file_name}' is associated with this task. The content has been extracted as text below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## PDF File\nFile: {task_file_name}\n\n"
            elif file_extension in ["docx", "doc"]:
                parsing_result = DocxConverter(local_path=task_file_name)
                file_content_section += f"\n\nNote: A Word document '{task_file_name}' is associated with this task. The content has been extracted as markdown below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## Word Document\nFile: {task_file_name}\n\n"
            elif file_extension in ["html", "htm"]:
                parsing_result = HtmlConverter(local_path=task_file_name)
                file_content_section += f"\n\nNote: An HTML file '{task_file_name}' is associated with this task. The content has been extracted as markdown below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## HTML File\nFile: {task_file_name}\n\n"
            elif file_extension in ["pptx", "ppt"]:
                parsing_result = PptxConverter(local_path=task_file_name)
                file_content_section += f"\n\nNote: A PowerPoint presentation '{task_file_name}' is associated with this task. The content has been extracted as markdown below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += (
                    f"## PowerPoint Presentation\nFile: {task_file_name}\n\n"
                )
            elif file_extension in AUDIO_EXTENSIONS:
                # Generate unconditional audio transcription
                caption = _generate_audio_caption(task_file_name)
                # Extract task-relevant information directly from the audio
                relevant_info = _extract_task_relevant_info_from_audio(
                    task_file_name, task_description
                )
                # Format as Markdown
                file_content_section += f"\n\nNote: An audio file '{task_file_name}' is associated with this task. The content has been extracted as a transcription below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## Audio Content\nFile: {task_file_name}\n\n"
                file_content_section += f"> {caption}\n\n"
                if relevant_info:
                    file_content_section += "Task-Relevant Information:\n\n"
                    file_content_section += f"{relevant_info}\n\n"
            elif file_extension in VIDEO_EXTENSIONS:
                # Generate unconditional video caption
                caption = _generate_video_caption(task_file_name)
                # Extract task-relevant information directly from the video
                relevant_info = _extract_task_relevant_info_from_video(
                    task_file_name, task_description
                )
                # Format as Markdown
                file_content_section += f"\n\nNote: A video file '{task_file_name}' is associated with this task. The content has been extracted as a detailed caption below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## Video Content\nFile: {task_file_name}\n\n"
                file_content_section += f"> {caption}\n\n"
                if relevant_info:
                    file_content_section += "Task-Relevant Information:\n\n"
                    file_content_section += f"{relevant_info}\n\n"
            elif file_extension in ["zip"]:
                parsing_result = ZipConverter(local_path=task_file_name)
                file_content_section += f"\n\nNote: A ZIP archive '{task_file_name}' is associated with this task. The content has been extracted as file list and contents below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                file_content_section += f"## ZIP Archive\nFile: {task_file_name}\n\n"
            elif file_extension == "pdb":
                # PDB files (protein database) - only add note
                file_content_section += f"\n\nNote: A PDB file '{task_file_name}' is associated with this task. You may use available tools to read its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
            # Any other file type falls through to the MarkItDown fallback below

            #### markitdown process - ONLY if no specialized converter handled it ####
            if (
                parsing_result is None
                and file_extension not in SKIP_MARKITDOWN_EXTENSIONS
            ):
                try:
                    md = MarkItDown(enable_plugins=True)
                    parsing_result = md.convert(task_file_name)
                    print(
                        f"Info: Used MarkItDown as fallback to process file {task_file_name}"
                    )
                    # Add prompt for files processed by MarkItDown
                    file_content_section += f"\n\nNote: A file '{task_file_name}' is associated with this task. The content has been extracted as markdown below. You may use available tools to process its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
                    file_content_section += (
                        f"## File Content\nFile: {task_file_name}\n\n"
                    )
                except Exception as e:
                    # Best effort: a failed fallback leaves the file without content
                    print(
                        f"Warning: MarkItDown failed to process {task_file_name}: {e}"
                    )
            ############################

            # Collect the content and title (if has) to append later.
            # Truncation is applied before either layout is chosen so that
            # titled documents obey the same size limit as untitled ones.
            title = getattr(parsing_result, "title", None)
            content = getattr(parsing_result, "text_content", None)
            if content and len(content) > max_len:
                content = content[:max_len] + "\n... [File truncated]"

            if title:
                file_content_section += "Title:\n\n{}\n\n".format(title)
                file_content_section += "Content:\n\n```\n{}\n```\n".format(content)
            elif content:
                file_content_section += "```\n{}\n```\n".format(content)
            # Otherwise nothing to add: image, audio and video files already
            # have their content formatted above.
        except FileNotFoundError:
            print(f"Error: File not found {task_file_name}")
            file_content_section += (
                f"\nWarning: The specified file '{task_file_name}' was not found."
            )
        except Exception as e:
            print(f"Error: Error processing file {task_file_name}: {e}")
            traceback.print_exc()
            file_content_section += f"\nWarning: There was an error processing the file '{task_file_name}': {str(e)}"

    # output format requirement
    updated_task_description += "\nYou should follow the format instruction in the request strictly and wrap the final answer in \\boxed{}."
    # Append file content at the end
    updated_task_description += file_content_section
    updated_task_description = updated_task_description.strip()
    return updated_task_description, updated_task_description
class _CustomMarkdownify(markdownify.MarkdownConverter):
    """
    A custom version of markdownify's MarkdownConverter. Changes include:
    - Altering the default heading style to use '#', '##', etc.
    - Removing javascript hyperlinks.
    - Truncating images with large data:uri sources.
    - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
    """

    def __init__(self, **options: Any):
        # Default to ATX ('#') headings unless the caller picked a style
        options["heading_style"] = options.get("heading_style", markdownify.ATX)
        super().__init__(**options)

    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
        """Same as usual, but be sure to start with a new line"""
        if not convert_as_inline:
            if not re.search(r"^\n", text):
                return "\n" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore
        return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
        """Same as usual converter, but removes Javascript links and escapes URIs."""
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
            return ""
        href = el.get("href")
        title = el.get("title")

        # Escape URIs and skip non-http or file schemes
        if href:
            try:
                parsed_url = urlparse(href)  # type: ignore
                if parsed_url.scheme and parsed_url.scheme.lower() not in [
                    "http",
                    "https",
                    "file",
                ]:  # type: ignore
                    # Other schemes (e.g. javascript:) keep the link text only
                    return "%s%s%s" % (prefix, text, suffix)
                # Normalise the path: decode first so it is not double-escaped
                href = urlunparse(
                    parsed_url._replace(path=quote(unquote(parsed_url.path)))
                )  # type: ignore
            except ValueError:  # It's not clear if this ever gets thrown
                return "%s%s%s" % (prefix, text, suffix)

        # For the replacement see #29: text nodes underscores are escaped
        if (
            self.options["autolinks"]
            and text.replace(r"\_", "_") == href
            and not title
            and not self.options["default_title"]
        ):
            # Shortcut syntax
            return "<%s>" % href
        if self.options["default_title"] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        return (
            "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
            if href
            else text
        )

    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
        """Same as usual converter, but removes data URIs"""
        alt = el.attrs.get("alt", None) or ""
        src = el.attrs.get("src", None) or ""
        title = el.attrs.get("title", None) or ""
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        if (
            convert_as_inline
            and el.parent.name not in self.options["keep_inline_images_in"]
        ):
            # Inline context that does not keep images: fall back to the alt text
            return alt

        # Remove dataURIs: keep only the part before the first comma
        if src.startswith("data:"):
            src = src.split(",")[0] + "..."

        # Markdown image syntax. The format string must consume all three
        # arguments; an empty format string raises TypeError.
        return "![%s](%s%s)" % (alt, src, title_part)

    def convert_soup(self, soup: Any) -> str:
        """Convert a parsed soup (or sub-tree) to Markdown via the base class."""
        return super().convert_soup(soup)  # type: ignore
class DocumentConverterResult:
    """Holds the outcome of a document-to-text conversion: an optional title plus the text."""

    def __init__(self, title: Union[str, None] = None, text_content: str = ""):
        # Converted body text; defaults to the empty string
        self.text_content: str = text_content
        # Document title, or None when no title is given
        self.title: Union[str, None] = title
def convert_html_to_md(html_content):
    """
    Convert an HTML string to Markdown.

    ``<script>`` and ``<style>`` elements are removed first. When the document
    has a ``<body>`` only that element is converted; otherwise the whole
    parsed document is.

    Args:
        html_content: HTML markup to convert.

    Returns:
        DocumentConverterResult whose text_content is the Markdown and whose
        title is the string of the document's ``<title>`` element (None when
        there is no such element).
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Strip elements that carry no readable content
    for unwanted in soup(["script", "style"]):
        unwanted.extract()

    # Convert only the main content when a body element exists
    body_elm = soup.find("body")
    root = body_elm if body_elm else soup
    webpage_text = _CustomMarkdownify().convert_soup(root)
    assert isinstance(webpage_text, str)

    page_title = soup.title.string if soup.title is not None else None
    return DocumentConverterResult(title=page_title, text_content=webpage_text)
def HtmlConverter(local_path: str):
    """
    Read an HTML file from disk and convert it to Markdown.

    The file is decoded as UTF-8 text and passed to ``convert_html_to_md``.

    Args:
        local_path: Path to the HTML file to convert.

    Returns:
        DocumentConverterResult containing the converted Markdown text.
    """
    with open(local_path, "rt", encoding="utf-8") as html_file:
        markup = html_file.read()
    result = convert_html_to_md(markup)
    return result
def DocxConverter(local_path: str):
    """
    Convert a DOCX file to Markdown.

    The document is first rendered to HTML with mammoth, and that HTML is
    then passed to ``convert_html_to_md``.

    Args:
        local_path: Path to the DOCX file to convert.

    Returns:
        DocumentConverterResult containing the converted Markdown text.
    """
    with open(local_path, "rb") as docx_file:
        html_content = mammoth.convert_to_html(docx_file).value
    return convert_html_to_md(html_content)
def XlsxConverter(local_path: str):
    """
    Converts Excel files to Markdown using openpyxl.

    Each sheet becomes a "## <sheet name>" section containing a table that
    spans row/column 1 up to the last row/column holding a non-empty cell.
    Cell styling (background colour, font colour, bold, italic, underline) is
    preserved by wrapping the cell text in inline HTML tags. Failures on a
    sheet or a cell are reported in the output text and printed; they do not
    abort the conversion of the remaining sheets.

    Args:
        local_path: Path to the Excel file

    Returns:
        DocumentConverterResult with the Markdown representation of the Excel file
    """
    # Load the workbook (data_only=True is passed so openpyxl returns values, not formulas)
    wb = openpyxl.load_workbook(local_path, data_only=True)
    parts = []  # Output fragments, joined once at the end

    # Helper function to convert RGB color to hex
    def rgb_to_hex(rgb_value):
        if not rgb_value:
            return None
        # Convert RGB value to string for processing
        rgb_string = str(rgb_value)
        candidate = None
        # Handle RGB format like 'RGB(255, 255, 255)'
        rgb_match = None
        if isinstance(rgb_value, str) and rgb_string.startswith("RGB"):
            rgb_match = re.match(r"RGB\((\d+), (\d+), (\d+)\)", rgb_string)
        if rgb_match:
            r, g, b = map(int, rgb_match.groups())
            candidate = f"#{r:02x}{g:02x}{b:02x}"
        # Special handling for FFFFFFFF (white) and 00000000 (transparent/none)
        elif rgb_string in ["FFFFFFFF", "00000000", "none", "auto"]:
            return None
        # Handle ARGB format (common in openpyxl)
        elif len(rgb_string) == 8:  # ARGB format like 'FF5733FF'
            candidate = f"#{rgb_string[2:]}"  # Strip alpha channel
        # Handle direct hex values like 'FF5733'
        elif isinstance(rgb_value, str):
            candidate = (
                rgb_string if rgb_string.startswith("#") else f"#{rgb_string}"
            )
        # Only emit a well-formed #RRGGBB colour; anything else (for example
        # a non-colour string stored in the attribute) is unrecognized.
        if candidate and re.fullmatch(r"#[0-9A-Fa-f]{6}", candidate):
            return candidate
        return None  # Return None for unrecognized formats

    # Resolve a colour object to hex, reading .rgb first and .value second
    def color_to_hex(color):
        rgb_value = getattr(color, "rgb", None) or getattr(color, "value", None)
        return rgb_to_hex(rgb_value) if rgb_value else None

    # Helper function to detect and format cell styling
    def get_cell_format_info(cell):
        info = {}
        # Get background color if it exists
        if cell.fill and hasattr(cell.fill, "fgColor") and cell.fill.fgColor:
            bg_color = color_to_hex(cell.fill.fgColor)
            if bg_color:  # Skip transparent or white (handled in rgb_to_hex)
                info["bg_color"] = bg_color
        # Get font color if it exists
        if cell.font and hasattr(cell.font, "color") and cell.font.color:
            font_color = color_to_hex(cell.font.color)
            if font_color:  # Skip transparent (handled in rgb_to_hex)
                info["font_color"] = font_color
        # Get font weight (bold)
        if cell.font and cell.font.bold:
            info["bold"] = True
        # Get font style (italic)
        if cell.font and cell.font.italic:
            info["italic"] = True
        # Get font underline
        if cell.font and cell.font.underline and cell.font.underline != "none":
            info["underline"] = True
        return info

    # Process each sheet in the workbook
    for sheet_name in wb.sheetnames:
        # Emit the header once, before anything can fail, so an error while
        # scanning the sheet does not duplicate it.
        parts.append(f"## {sheet_name}\n\n")
        try:
            sheet = wb[sheet_name]
            # Get the dimensions of the used part of the sheet
            populated = [
                cell for cell in sheet._cells.values() if cell.value is not None
            ]
            min_row, min_col = 1, 1
            max_row = max((cell.row for cell in populated), default=0)
            max_col = max((cell.column for cell in populated), default=0)
        except Exception as e:
            error_msg = f"Error processing sheet '{sheet_name}': {str(e)}"
            print(error_msg)
            parts.append(f"Error processing this sheet: {str(e)}\n\n")
            continue

        if max_row == 0 or max_col == 0:
            parts.append("This sheet is empty.\n\n")
            continue

        # Set while building the table, so no second pass over the cells
        # (outside any error handling) is needed for the legend.
        has_formatting = False
        try:
            # First, determine column widths
            col_widths = {}
            for col_idx in range(min_col, max_col + 1):
                max_length = 0
                # NOTE(review): result unused - presumably kept so the
                # file-level import stays referenced; confirm before removing.
                _ = get_column_letter(col_idx)
                for row_idx in range(min_row, max_row + 1):
                    try:
                        cell = sheet.cell(row=row_idx, column=col_idx)
                        cell_value = str(cell.value) if cell.value is not None else ""
                        max_length = max(max_length, len(cell_value))
                    except Exception as e:
                        print(
                            f"Warning: Error processing cell at row {row_idx}, column {col_idx}: {str(e)}"
                        )
                        max_length = max(max_length, 10)  # Use reasonable default
                col_widths[col_idx] = max(max_length + 2, 5)  # Min width of 5

            # Start building the table
            # Header row with column separators (blank header cells)
            parts.append("|")
            for col_idx in range(min_col, max_col + 1):
                parts.append(" " + " " * col_widths[col_idx] + " |")
            parts.append("\n")

            # Separator row
            parts.append("|")
            for col_idx in range(min_col, max_col + 1):
                parts.append(":" + "-" * col_widths[col_idx] + ":|")
            parts.append("\n")

            # Data rows
            for row_idx in range(min_row, max_row + 1):
                parts.append("|")
                for col_idx in range(min_col, max_col + 1):
                    try:
                        cell = sheet.cell(row=row_idx, column=col_idx)
                        cell_value = str(cell.value) if cell.value is not None else ""

                        # Get formatting info
                        try:
                            format_info = get_cell_format_info(cell)
                        except Exception as e:
                            print(
                                f"Warning: Error getting formatting for cell at row {row_idx}, column {col_idx}: {str(e)}"
                            )
                            format_info = {}

                        formatted_value = cell_value
                        # Add HTML-style formatting if needed
                        if format_info:
                            has_formatting = True
                            style_parts = []
                            if "bg_color" in format_info:
                                style_parts.append(
                                    f"background-color:{format_info['bg_color']}"
                                )
                            if "font_color" in format_info:
                                style_parts.append(f"color:{format_info['font_color']}")
                            span_attributes = []
                            if style_parts:
                                span_attributes.append(
                                    f'style="{"; ".join(style_parts)}"'
                                )

                            # Format with bold/italic/underline if needed
                            inner_value = cell_value
                            if "bold" in format_info:
                                inner_value = f"<strong>{inner_value}</strong>"
                            if "italic" in format_info:
                                inner_value = f"<em>{inner_value}</em>"
                            if "underline" in format_info:
                                inner_value = f"<u>{inner_value}</u>"

                            # Only add a span if we have style attributes
                            if span_attributes:
                                formatted_value = f"<span {' '.join(span_attributes)}>{inner_value}</span>"
                            else:
                                formatted_value = inner_value

                        # Pad to column width (measured on the raw text, not
                        # the markup) and add to markdown
                        padding = col_widths[col_idx] - len(cell_value)
                        padded_value = " " + formatted_value + " " * (padding + 1)
                        parts.append(padded_value + "|")
                    except Exception as e:
                        print(
                            f"Error processing cell at row {row_idx}, column {col_idx}: {str(e)}"
                        )
                        # Add a placeholder for the failed cell
                        padded_value = " [Error] " + " " * (col_widths[col_idx] - 7)
                        parts.append(padded_value + " |")
                parts.append("\n")
        except Exception as e:
            error_msg = f"Error generating table for sheet '{sheet_name}': {str(e)}\n{traceback.format_exc()}"
            print(error_msg)
            parts.append(f"Error generating table: {str(e)}\n\n")

        # Add formatting legend
        if has_formatting:
            parts.append("\n### Formatting Information\n")
            parts.append(
                "The table above includes HTML formatting to represent colors and styles from the original Excel file.\n"
            )
            parts.append("This formatting may not display in all Markdown viewers.\n")

        parts.append("\n\n")  # Extra newlines between sheets

    return DocumentConverterResult(
        title=None,
        text_content="".join(parts).strip(),
    )
def PptxConverter(local_path: str) -> DocumentConverterResult:
    """
    Convert a PPTX file to Markdown.

    Each slide is introduced by an HTML comment carrying its 1-based number.
    Pictures become Markdown image links (alt text plus a placeholder file
    name), tables are rendered as HTML and passed through
    ``convert_html_to_md``, the title shape becomes a level-1 heading, other
    text frames are emitted verbatim, and speaker notes are appended under a
    "Notes" heading.

    Args:
        local_path: Path to the PPTX file. The ``.pptx`` suffix is matched
            case-insensitively.

    Returns:
        DocumentConverterResult containing the converted Markdown text, or an
        error message as ``text_content`` when the path is not a ``.pptx``.
    """

    def is_picture(shape):
        """Check if a shape is a picture (or a placeholder holding an image)."""
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
            return True
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
            if hasattr(shape, "image"):
                return True
        return False

    def is_table(shape):
        """Check if a shape is a table."""
        return shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE

    if not local_path.lower().endswith(".pptx"):
        return DocumentConverterResult(
            title=None,
            text_content=f"Error: Expected .pptx file, got: {local_path}",
        )

    md_content = ""
    presentation = pptx.Presentation(local_path)
    for slide_num, slide in enumerate(presentation.slides, start=1):
        md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"

        title = slide.shapes.title
        for shape in slide.shapes:
            # Pictures
            if is_picture(shape):
                # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
                alt_text = ""
                try:
                    alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
                except Exception:
                    # Alt text is optional; fall back to the shape name below.
                    pass

                # A placeholder name (the image itself is not extracted)
                filename = re.sub(r"\W", "", shape.name) + ".jpg"
                md_content += (
                    "\n!["
                    + (alt_text if alt_text else shape.name)
                    + "]("
                    + filename
                    + ")\n"
                )

            # Tables
            if is_table(shape):
                html_table = "<html><body><table>"
                first_row = True
                for row in shape.table.rows:
                    html_table += "<tr>"
                    # The first row is treated as the header row.
                    cell_tag = "th" if first_row else "td"
                    for cell in row.cells:
                        html_table += (
                            f"<{cell_tag}>" + html.escape(cell.text) + f"</{cell_tag}>"
                        )
                    html_table += "</tr>"
                    first_row = False
                html_table += "</table></body></html>"
                # Relies on a convert_html_to_md helper defined elsewhere in this module.
                md_content += (
                    "\n" + convert_html_to_md(html_table).text_content.strip() + "\n"
                )

            # Text areas
            elif shape.has_text_frame:
                if shape == title:
                    md_content += "# " + shape.text.lstrip() + "\n"
                else:
                    md_content += shape.text + "\n"

        md_content = md_content.strip()

        if slide.has_notes_slide:
            md_content += "\n\n### Notes:\n"
            notes_frame = slide.notes_slide.notes_text_frame
            if notes_frame is not None:
                md_content += notes_frame.text
            md_content = md_content.strip()

    return DocumentConverterResult(
        title=None,
        text_content=md_content.strip(),
    )
def ZipConverter(local_path: str, **kwargs):
    """
    Extract a ZIP archive to a temporary directory and convert every file in it.

    Each extracted file is dispatched on its (lower-cased) file-name extension
    to the matching converter; unknown types fall back to MarkItDown. The
    per-file output is truncated to 50,000 characters and wrapped in a fenced
    block. Failures on individual files are reported inline and do not abort
    the remaining files. The temporary directory is always removed.

    Args:
        local_path: Path to the ZIP archive.
        **kwargs: Accepted for interface compatibility; not used.

    Returns:
        DocumentConverterResult titled "ZIP Archive Contents" whose text is the
        combined Markdown for all processed files.

    Raises:
        Whatever ``zipfile.ZipFile`` / ``extractall`` raise for an unreadable
        or invalid archive (not caught here).
    """
    import zipfile

    # Plain-text formats whose content is included verbatim (read as UTF-8).
    plain_text_extensions = {"py", "txt", "md", "sh", "yaml", "yml", "toml", "csv"}
    # Per-file cap on included content, in characters.
    max_len = 50_000

    temp_dir = tempfile.mkdtemp(prefix="zip_extract_")
    md_content = f"# Extracted from ZIP: {os.path.basename(local_path)}\n\n"

    try:
        with zipfile.ZipFile(local_path, "r") as zip_ref:
            zip_ref.extractall(temp_dir)

        # Get all extracted files as (absolute path, path relative to archive root)
        extracted_files = []
        for root, _dirs, files in os.walk(temp_dir):
            for file in files:
                file_path = os.path.join(root, file)
                rel_path = os.path.relpath(file_path, temp_dir)
                extracted_files.append((file_path, rel_path))

        if not extracted_files:
            md_content += "The ZIP file is empty or contains no files.\n"
        else:
            md_content += f"Total files extracted: {len(extracted_files)}\n\n"

            for file_path, rel_path in extracted_files:
                md_content += f"## File: {rel_path}\n\n"

                # Derive the extension from the file name only, so that a dot
                # in a parent directory name is never mistaken for one.
                file_name = os.path.basename(file_path)
                file_extension = (
                    file_name.rsplit(".", maxsplit=1)[-1].lower()
                    if "." in file_name
                    else ""
                )

                file_result = None

                try:
                    # Use the same processing logic as process_input
                    if file_extension in plain_text_extensions:
                        with open(file_path, "r", encoding="utf-8") as f:
                            file_result = DocumentConverterResult(
                                title=None, text_content=f.read()
                            )
                    elif file_extension in ["jsonld", "json"]:
                        with open(file_path, "r", encoding="utf-8") as f:
                            file_result = DocumentConverterResult(
                                title=None,
                                text_content=json.dumps(
                                    json.load(f), ensure_ascii=False, indent=2
                                ),
                            )
                    elif file_extension in ["xlsx", "xls"]:
                        file_result = XlsxConverter(local_path=file_path)
                    elif file_extension == "pdf":
                        file_result = DocumentConverterResult(
                            title=None,
                            text_content=pdfminer.high_level.extract_text(file_path),
                        )
                    elif file_extension in ["docx", "doc"]:
                        file_result = DocxConverter(local_path=file_path)
                    elif file_extension in ["html", "htm"]:
                        file_result = HtmlConverter(local_path=file_path)
                    elif file_extension in ["pptx", "ppt"]:
                        file_result = PptxConverter(local_path=file_path)
                    elif file_extension in IMAGE_EXTENSIONS:
                        # Generate image caption for files in ZIP
                        caption = _generate_image_caption(file_path)
                        md_content += "[Image file]\n\n"
                        md_content += f"> {caption}\n\n"
                        continue
                    elif file_extension in AUDIO_EXTENSIONS:
                        # Generate audio caption for files in ZIP
                        caption = _generate_audio_caption(file_path)
                        md_content += "[Audio file]\n\n"
                        md_content += f"> {caption}\n\n"
                        continue
                    elif file_extension in VIDEO_EXTENSIONS:
                        # Generate video caption for files in ZIP
                        caption = _generate_video_caption(file_path)
                        md_content += "[Video file]\n\n"
                        md_content += f"> {caption}\n\n"
                        continue
                    elif file_extension == "pdb":
                        md_content += "[PDB file - specialized format]\n\n"
                        continue
                    else:
                        # Try MarkItDown as fallback
                        try:
                            md_tool = MarkItDown(enable_plugins=True)
                            file_result = md_tool.convert(file_path)
                        except Exception:
                            md_content += (
                                f"[Unsupported file type: {file_extension}]\n\n"
                            )
                            continue

                    # Add the processed content (files yielding no text add nothing)
                    if file_result and getattr(file_result, "text_content", None):
                        content = file_result.text_content
                        # Limit length for each file
                        if len(content) > max_len:
                            content = content[:max_len] + "\n... [Content truncated]"
                        md_content += f"```\n{content}\n```\n\n"

                except Exception as e:
                    # Best effort: report the failure inline and move on.
                    md_content += f"[Error processing file: {str(e)}]\n\n"
                    print(f"Warning: Error processing {rel_path} from ZIP: {e}")

    finally:
        # Clean up temporary directory
        try:
            shutil.rmtree(temp_dir)
        except Exception as e:
            print(f"Warning: Could not remove temporary directory {temp_dir}: {e}")

    return DocumentConverterResult(
        title="ZIP Archive Contents", text_content=md_content.strip()
    )
================================================
FILE: apps/miroflow-agent/src/io/output_formatter.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""Output formatting utilities for agent responses."""
import re
from typing import Tuple
from ..utils.prompt_utils import FORMAT_ERROR_MESSAGE
# Maximum length for tool results before truncation (100k chars ≈ 25k tokens)
TOOL_RESULT_MAX_LENGTH = 100_000
class OutputFormatter:
    """Formatter for processing and formatting agent outputs."""

    # Matches the literal token ``\boxed`` (followed by a word boundary).
    _BOXED_RE = re.compile(r"\\boxed\b")

    # Boxed contents that are treated as "no answer".
    _PLACEHOLDER_ANSWERS = frozenset(
        {"?", "??", "???", "……", "…", "...", "unknown"}
    )

    def _extract_boxed_content(self, text: str) -> str:
        r"""
        Extract the content of the last \boxed{...} occurrence in the given text.

        Supports:
        - Arbitrary levels of nested braces
        - Escaped braces (\{ and \})
        - Whitespace between \boxed and the opening brace
        - Empty content inside braces
        - Incomplete boxed expressions (extracts to end of string as fallback)

        The extracted content is stripped of surrounding whitespace *before*
        it is compared against the placeholder list, so ``\boxed{ ? }`` is
        rejected just like ``\boxed{?}``.

        Args:
            text: Input text that may contain \boxed{...} expressions

        Returns:
            The extracted boxed content, or empty string if no match is found
            or the content is only a placeholder such as "?" or "unknown".
        """
        if not text:
            return ""

        last_result = None  # Last boxed content seen (complete or incomplete)
        pos = 0
        n = len(text)

        while True:
            # Find the next \boxed occurrence
            m = self._BOXED_RE.search(text, pos)
            if not m:
                break

            # Skip any whitespace after \boxed
            j = m.end()
            while j < n and text[j].isspace():
                j += 1

            # Require that the next character is '{'
            if j >= n or text[j] != "{":
                pos = j
                continue

            # Parse the brace content manually to handle nesting and escapes
            depth = 0
            k = j
            escaped = False
            closed = False
            while k < n:
                ch = text[k]
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == "{":
                    depth += 1
                elif ch == "}":
                    depth -= 1
                    # When depth returns to zero, the boxed content ends
                    if depth == 0:
                        closed = True
                        break
                k += 1

            # For an unclosed expression k == n, so the slice runs to the end
            # of the text; a later \boxed (if any) overwrites this result.
            last_result = text[j + 1 : k]
            pos = k + 1 if closed else k

        if last_result is None:
            return ""

        answer = last_result.strip()
        return "" if answer in self._PLACEHOLDER_ANSWERS else answer

    def format_tool_result_for_user(self, tool_call_execution_result: dict) -> dict:
        """
        Format tool execution results to be fed back to LLM as user messages.

        Only includes necessary information (results or errors). Long results
        are truncated to TOOL_RESULT_MAX_LENGTH to prevent context overflow.

        Args:
            tool_call_execution_result: Dict containing server_name, tool_name,
                and either 'result' or 'error'. 'error' takes precedence when
                both are present.

        Returns:
            Dict with 'type' and 'text' keys suitable for LLM message content.
        """
        server_name = tool_call_execution_result["server_name"]
        tool_name = tool_call_execution_result["tool_name"]

        if "error" in tool_call_execution_result:
            # Provide concise error information to LLM
            content = f"Tool call to {tool_name} on {server_name} failed. Error: {tool_call_execution_result['error']}"
        elif "result" in tool_call_execution_result:
            # Provide the original output result of the tool
            content = tool_call_execution_result["result"]
            # Truncate overly long results to prevent context overflow
            if len(content) > TOOL_RESULT_MAX_LENGTH:
                content = content[:TOOL_RESULT_MAX_LENGTH] + "\n... [Result truncated]"
        else:
            content = f"Tool call to {tool_name} on {server_name} completed, but produced no specific output or result."

        return {"type": "text", "text": content}

    def format_final_summary_and_log(
        self, final_answer_text: str, client=None
    ) -> Tuple[str, str, str]:
        """
        Format final summary information, including answers and token statistics.

        Args:
            final_answer_text: The final answer text from the agent
            client: Optional LLM client; when it exposes
                ``format_token_usage_summary`` that method supplies the token
                usage section and the usage log string.

        Returns:
            Tuple of (summary_text, boxed_result, usage_log). ``boxed_result``
            is FORMAT_ERROR_MESSAGE when the answer text is non-empty but has
            no usable \\boxed{} content, and "" when the answer text is empty.
        """
        summary_lines = []
        summary_lines.append("\n" + "=" * 30 + " Final Answer " + "=" * 30)
        summary_lines.append(final_answer_text)

        # Extract boxed result (content of the last \boxed{...})
        boxed_result = self._extract_boxed_content(final_answer_text)

        # Add extracted result section
        summary_lines.append("\n" + "-" * 20 + " Extracted Result " + "-" * 20)

        if boxed_result:
            summary_lines.append(boxed_result)
        elif final_answer_text:
            summary_lines.append("No \\boxed{} content found.")
            boxed_result = FORMAT_ERROR_MESSAGE

        # Token usage statistics and cost estimation - use client method
        if client and hasattr(client, "format_token_usage_summary"):
            token_summary_lines, log_string = client.format_token_usage_summary()
            summary_lines.extend(token_summary_lines)
        else:
            # If no client or client doesn't support it, use default format
            summary_lines.append("\n" + "-" * 20 + " Token Usage & Cost " + "-" * 20)
            summary_lines.append("Token usage information not available.")
            summary_lines.append("-" * (40 + len(" Token Usage & Cost ")))
            log_string = "Token usage information not available."

        return "\n".join(summary_lines), boxed_result, log_string
================================================
FILE: apps/miroflow-agent/src/llm/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
from .base_client import BaseClient
from .factory import ClientFactory
from .providers import (
AnthropicClient,
OpenAIClient,
)
__all__ = [
"BaseClient",
"ClientFactory",
"AnthropicClient",
"OpenAIClient",
]
================================================
FILE: apps/miroflow-agent/src/llm/base_client.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
Base client module for LLM providers.
This module defines the abstract base class and common utilities for LLM clients,
supporting both OpenAI and Anthropic API formats.
"""
import asyncio
import dataclasses
from abc import ABC
from typing import (
Any,
Dict,
List,
Optional,
Tuple,
TypedDict,
)
from omegaconf import DictConfig
from ..logging.task_logger import TaskLog
from .util import with_timeout
# Default timeout for LLM API calls (10 minutes)
DEFAULT_LLM_TIMEOUT_SECONDS = 600
class TokenUsage(TypedDict, total=True):
    """
    Unified token usage tracking across different LLM providers.

    We unify OpenAI and Anthropic formats. There are four usage types:
    - input/output tokens: Standard input and output token counts
    - cache write/read tokens: Tokens involved in caching operations

    All four keys are required (``total=True``) and hold running totals.

    Provider-specific notes:
    - OpenAI: Cache write is free, cache read is cheaper
    - Anthropic: Cache write has a small cost, cache read is cheaper
    """

    # Running total of input tokens.
    total_input_tokens: int
    # Running total of output tokens.
    total_output_tokens: int
    # Running total of input tokens served from the provider's cache.
    total_cache_read_input_tokens: int
    # Running total of input tokens written to the provider's cache.
    total_cache_write_input_tokens: int
@dataclasses.dataclass
class BaseClient(ABC):
    """
    Abstract base class for LLM provider clients.

    This class provides the common interface and utilities for interacting with
    different LLM providers (OpenAI, Anthropic, etc.). Concrete implementations
    should override _create_client() and provider-specific methods such as
    _create_message().

    Attributes:
        task_id: Unique identifier for the current task (used for tracking)
        cfg: Hydra configuration containing LLM settings
        task_log: Optional logger for recording task execution details. When
            it is None, the logging done by this base class is skipped.
    """

    # Required arguments (no default value)
    task_id: str
    cfg: DictConfig

    # Optional arguments (with default value)
    task_log: Optional["TaskLog"] = None

    # Initialized in __post_init__
    client: Any = dataclasses.field(init=False)
    token_usage: TokenUsage = dataclasses.field(init=False)
    last_call_tokens: Dict[str, int] = dataclasses.field(init=False)

    def __post_init__(self):
        """Copy settings from ``cfg`` onto the instance and create the provider client."""
        # Initialize last_call_tokens before other operations
        self.last_call_tokens: Dict[str, int] = {
            "prompt_tokens": 0,
            "completion_tokens": 0,
        }

        # Explicitly assign from cfg object
        self.provider: str = self.cfg.llm.provider
        self.model_name: str = self.cfg.llm.model_name
        self.temperature: float = self.cfg.llm.temperature
        self.top_p: float = self.cfg.llm.top_p
        self.min_p: float = self.cfg.llm.min_p
        self.top_k: int = self.cfg.llm.top_k
        self.max_context_length: int = self.cfg.llm.max_context_length
        self.max_tokens: int = self.cfg.llm.max_tokens
        self.async_client: bool = self.cfg.llm.async_client
        self.keep_tool_result: int = self.cfg.agent.keep_tool_result

        # Optional settings (None / default when absent from the config)
        self.api_key: Optional[str] = self.cfg.llm.get("api_key")
        self.base_url: Optional[str] = self.cfg.llm.get("base_url")
        self.use_tool_calls: Optional[bool] = self.cfg.llm.get("use_tool_calls")
        self.repetition_penalty: float = self.cfg.llm.get("repetition_penalty", 1.0)

        self.token_usage = self._reset_token_usage()
        self.client = self._create_client()

        self._log_step(
            "info",
            "LLM | Initialization",
            f"LLMClient {self.provider} {self.model_name} initialization completed.",
        )

    def _log_step(self, level: str, title: str, message: str) -> None:
        """Forward a log entry to ``task_log`` if one is attached; otherwise do nothing."""
        if self.task_log is not None:
            self.task_log.log_step(level, title, message)

    def _reset_token_usage(self) -> TokenUsage:
        """
        Reset token usage counter to zero.

        Returns:
            A new TokenUsage dict with all counters set to zero.
        """
        return TokenUsage(
            total_input_tokens=0,
            total_output_tokens=0,
            total_cache_write_input_tokens=0,
            total_cache_read_input_tokens=0,
        )

    def _remove_tool_result_from_messages(
        self, messages, keep_tool_result
    ) -> List[Dict]:
        """Replace the content of older tool results with a short placeholder.

        Every message with role "user" or "tool" after the first one is
        treated as a tool result; the first such message is the initial task
        and is always kept. The input list and its message dicts are not
        modified: shallow copies are returned.

        Args:
            messages: List of message dictionaries
            keep_tool_result: Number of most recent tool results to keep.
                -1 means keep all; 0 keeps only the initial task.

        Returns:
            List of messages with tool results filtered according to keep_tool_result
        """
        messages_copy = [m.copy() for m in messages]

        if keep_tool_result == -1:
            # No processing needed, keep all messages
            return messages_copy

        # Find indices of all user/tool messages (these are tool results)
        user_indices = [
            i
            for i, msg in enumerate(messages_copy)
            if msg.get("role") in ("user", "tool")
        ]

        if not user_indices:
            # No user/tool messages found
            self._log_step(
                "info",
                "LLM | Message Retention",
                "No user/tool messages found in the history.",
            )
            return messages_copy

        # The first user message is the initial task, not a tool result
        # Tool results start from the second user message onwards
        if len(user_indices) == 1:
            # Only one user message (the initial task), no tool results to filter
            self._log_step(
                "info",
                "LLM | Message Retention",
                "Only 1 user message found (initial task). Keeping it as is.",
            )
            return messages_copy

        # Tool result indices (excluding the first user message which is the initial task)
        first_user_idx = user_indices[0]
        tool_result_indices = user_indices[1:]

        # How many tool results to keep from the end (never more than exist)
        num_tool_results_to_keep = min(keep_tool_result, len(tool_result_indices))
        tool_result_indices_to_keep = (
            tool_result_indices[-num_tool_results_to_keep:]
            if num_tool_results_to_keep > 0
            else []
        )

        # Combine first message (initial task) and tool results to keep
        indices_to_keep = [first_user_idx] + tool_result_indices_to_keep

        self._log_step(
            "info",
            "LLM | Message Retention",
            f"Message retention summary: Total user/tool messages: {len(user_indices)}, "
            f"Initial task at index: {first_user_idx}, "
            f"Keeping last {num_tool_results_to_keep} tool results at indices: {tool_result_indices_to_keep}, "
            f"Total messages to keep: {len(indices_to_keep)}",
        )

        # Replace content of tool results that should be omitted
        kept = set(indices_to_keep)
        for i in user_indices:
            if i in kept:
                continue
            msg = messages_copy[i]
            # Preserve the message structure but replace content
            if isinstance(msg.get("content"), list):
                # For Anthropic format
                msg["content"] = [
                    {
                        "type": "text",
                        "text": "Tool result is omitted to save tokens.",
                    }
                ]
            else:
                # For OpenAI format
                msg["content"] = "Tool result is omitted to save tokens."

        return messages_copy

    @with_timeout(DEFAULT_LLM_TIMEOUT_SECONDS)
    async def create_message(
        self,
        system_prompt: str,
        message_history: List[Dict],
        tool_definitions: List[Dict],
        keep_tool_result: int = -1,
        step_id: int = 1,
        task_log: Optional["TaskLog"] = None,
        agent_type: str = "main",
    ) -> Tuple[Any, List[Dict]]:
        """
        Call LLM to generate a response with optional tool call support.

        This is the main entry point for LLM interactions. It delegates to the
        provider-specific ``_create_message`` and converts any exception into
        a logged error plus a ``None`` response.

        Args:
            system_prompt: System prompt to guide the LLM's behavior
            message_history: List of previous messages in the conversation
            tool_definitions: List of available tool definitions
            keep_tool_result: Number of recent tool results to keep (-1 = keep all)
            step_id: Current step identifier (accepted but not used here)
            task_log: Per-call logger (accepted but not used here; the
                instance's ``task_log`` is used for logging)
            agent_type: Type of agent making the call ("main" or sub-agent name)

        Returns:
            Tuple of (response, updated_message_history). ``response`` is None
            when the underlying call raised.
        """
        # Unified LLM call processing
        try:
            response, message_history = await self._create_message(
                system_prompt,
                message_history,
                tool_definitions,
                keep_tool_result=keep_tool_result,
            )
        except Exception as e:
            self._log_step(
                "error",
                f"FATAL ERROR | {agent_type} | LLM Call ERROR",
                f"{agent_type} failed: {str(e)}",
            )
            response = None

        return response, message_history

    @staticmethod
    async def convert_tool_definition_to_tool_call(tools_definitions):
        """
        Convert MCP tool definitions to OpenAI function call format.

        Transforms the internal tool definition format used by MCP servers into
        the format expected by OpenAI's function calling API.

        Args:
            tools_definitions: List of server definitions, each containing a 'name'
                and 'tools' list with tool specifications. Servers with a
                missing, empty or None 'tools' entry contribute nothing.

        Returns:
            List of tool definitions in OpenAI function call format, where each
            tool name is prefixed with its server name (e.g., "server-name-tool-name").
        """
        tool_list = []
        for server in tools_definitions:
            for tool in server.get("tools") or ():
                tool_list.append(
                    dict(
                        type="function",
                        function=dict(
                            name=f"{server['name']}-{tool['name']}",
                            description=tool["description"],
                            parameters=tool["schema"],
                        ),
                    )
                )
        return tool_list

    def close(self):
        """Close client connection.

        Note: For async clients (AsyncOpenAI, AsyncAnthropic), the connection
        will be closed when the client object is garbage collected.
        For proper async cleanup, use `await client.aclose()` in an async context.
        """
        if hasattr(self.client, "close"):
            if asyncio.iscoroutinefunction(self.client.close):
                # For async clients, we cannot call close() synchronously.
                # The async HTTP client will be closed when garbage collected.
                # For explicit async cleanup, call aclose() from an async context.
                if hasattr(self.client, "_client"):
                    # Try to close the underlying httpx client if available
                    try:
                        self.client._client.close()
                    except Exception:
                        pass  # Best effort: ignore errors during cleanup
            else:
                self.client.close()
        elif hasattr(self.client, "_client") and hasattr(self.client._client, "close"):
            # Some clients may have internal _client attribute
            self.client._client.close()

    @staticmethod
    def _truncate(text, limit: int):
        """Return ``text`` cut to ``limit`` characters plus "..." when longer; falsy values pass through."""
        if text and len(text) > limit:
            return text[:limit] + "..."
        return text

    def _format_response_for_log(self, response) -> Dict:
        """Build a compact, truncated dict describing ``response`` for logging.

        Handles both Anthropic-style responses (a ``content`` list of text /
        tool_use blocks) and OpenAI-style responses (a ``choices`` list).

        Returns:
            An empty dict for a falsy response; otherwise a dict with
            ``response_type`` plus ``content`` and/or ``choices`` summaries.
        """
        if not response:
            return {}

        # Basic response information
        formatted = {
            "response_type": type(response).__name__,
        }

        # Anthropic response
        if hasattr(response, "content"):
            formatted["content"] = []
            for block in response.content:
                if not hasattr(block, "type"):
                    continue
                if block.type == "text":
                    formatted["content"].append(
                        {
                            "type": "text",
                            "text": self._truncate(block.text, 500),
                        }
                    )
                elif block.type == "tool_use":
                    formatted["content"].append(
                        {
                            "type": "tool_use",
                            "id": block.id,
                            "name": block.name,
                            "input": self._truncate(str(block.input), 200),
                        }
                    )

        # OpenAI response
        if hasattr(response, "choices"):
            formatted["choices"] = []
            for choice in response.choices:
                choice_data = {"finish_reason": choice.finish_reason}
                if hasattr(choice, "message"):
                    message = choice.message
                    choice_data["message"] = {
                        "role": message.role,
                        "content": self._truncate(message.content, 500),
                    }
                    if hasattr(message, "tool_calls") and message.tool_calls:
                        choice_data["message"]["tool_calls_count"] = len(
                            message.tool_calls
                        )
                formatted["choices"].append(choice_data)

        return formatted
================================================
FILE: apps/miroflow-agent/src/llm/factory.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
LLM Client Factory module.
This module provides a factory function for creating LLM clients based on
configuration. It supports multiple providers including OpenAI, Anthropic,
and Qwen (via OpenAI-compatible API).
"""
from typing import Optional, Union
from omegaconf import DictConfig, OmegaConf
from ..logging.task_logger import TaskLog
from .providers.anthropic_client import AnthropicClient
from .providers.openai_client import OpenAIClient
# Supported LLM providers
SUPPORTED_PROVIDERS = {"anthropic", "openai", "qwen"}
def ClientFactory(
    task_id: str, cfg: DictConfig, task_log: Optional[TaskLog] = None, **kwargs
) -> Union[OpenAIClient, AnthropicClient]:
    """
    Create an LLM client based on the provider specified in configuration.

    This factory function selects and instantiates the appropriate client
    class based on the `llm.provider` field of the *merged* configuration, so
    a provider overridden through ``kwargs`` picks the matching client class.

    Args:
        task_id: Unique identifier for the current task (used for tracking)
        cfg: Hydra configuration object containing LLM settings
        task_log: Optional logger for recording task execution details
        **kwargs: Additional keyword arguments to merge into configuration

    Returns:
        An instance of the appropriate LLM client (OpenAIClient or AnthropicClient)

    Raises:
        ValueError: If the provider is not one of the supported providers.

    Example:
        >>> client = ClientFactory(
        ...     task_id="task_001",
        ...     cfg=cfg,
        ...     task_log=task_log
        ... )
    """
    config = OmegaConf.merge(cfg, kwargs)
    provider = config.llm.provider

    # "qwen" is served through the OpenAI-compatible client.
    client_classes = {
        "anthropic": AnthropicClient,
        "qwen": OpenAIClient,
        "openai": OpenAIClient,
    }

    client_cls = client_classes.get(provider)
    if client_cls is None:
        raise ValueError(
            f"Unsupported provider: '{provider}'. "
            f"Supported providers are: {', '.join(sorted(SUPPORTED_PROVIDERS))}"
        )

    return client_cls(task_id=task_id, task_log=task_log, cfg=config)
================================================
FILE: apps/miroflow-agent/src/llm/providers/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
from .anthropic_client import AnthropicClient
from .openai_client import OpenAIClient
__all__ = [
"AnthropicClient",
"OpenAIClient",
]
================================================
FILE: apps/miroflow-agent/src/llm/providers/anthropic_client.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
Anthropic Claude LLM client implementation.
This module provides the AnthropicClient class for interacting with Anthropic's
Claude API, with support for prompt caching and extended thinking.
Features:
- Async and sync API support
- Prompt caching with ephemeral cache control
- Token usage tracking including cache statistics
- MCP tool call parsing and response processing
"""
import asyncio
import dataclasses
import logging
from typing import Any, Dict, List, Tuple, Union
import tiktoken
from anthropic import (
NOT_GIVEN,
Anthropic,
AsyncAnthropic,
DefaultAsyncHttpxClient,
DefaultHttpxClient,
)
from tenacity import retry, stop_after_attempt, wait_fixed
from ...utils.prompt_utils import generate_mcp_system_prompt
from ..base_client import BaseClient
logger = logging.getLogger("miroflow_agent")
@dataclasses.dataclass
class AnthropicClient(BaseClient):
def __post_init__(self):
    """Run the base-class initialization, then zero the Anthropic-specific counters."""
    super().__post_init__()

    # Per-client Anthropic token counters all start at zero.
    self.input_tokens = 0
    self.output_tokens = 0
    self.cache_creation_tokens = self.cache_read_tokens = 0
def _create_client(self) -> Union[AsyncAnthropic, Anthropic]:
    """Build the Anthropic SDK client (async or sync, per ``self.async_client``).

    The underlying HTTP client sends the task id in the
    ``x-upstream-session-id`` header on every request.
    """
    session_headers = {"x-upstream-session-id": self.task_id}

    if self.async_client:
        sdk_cls, transport_cls = AsyncAnthropic, DefaultAsyncHttpxClient
    else:
        sdk_cls, transport_cls = Anthropic, DefaultHttpxClient

    return sdk_cls(
        api_key=self.api_key,
        base_url=self.base_url,
        http_client=transport_cls(headers=session_headers),
    )
def _update_token_usage(self, usage_data: Any) -> None:
"""Update cumulative token usage"""
if usage_data:
# Update based on actual field names returned by Anthropic API
self.token_usage["total_cache_write_input_tokens"] += (
getattr(usage_data, "cache_creation_input_tokens", 0) or 0
)
self.token_usage["total_cache_read_input_tokens"] += (
getattr(usage_data, "cache_read_input_tokens", 0) or 0
)
self.token_usage["total_input_tokens"] += (
getattr(usage_data, "input_tokens", 0) or 0
)
self.token_usage["total_output_tokens"] += (
getattr(usage_data, "output_tokens", 0) or 0
)
self.task_log.log_step(
"info",
"LLM | Token Usage",
f"Input: {getattr(usage_data, 'input_tokens', 0)}, "
f"Cache: {getattr(usage_data, 'cache_creation_input_tokens', 0)}+{getattr(usage_data, 'cache_read_input_tokens', 0)}, "
f"Output: {getattr(usage_data, 'output_tokens', 0)}",
)
self.last_call_tokens = {
"input_tokens": getattr(usage_data, "input_tokens", 0)
+ getattr(usage_data, "cache_creation_input_tokens", 0)
+ getattr(usage_data, "cache_read_input_tokens", 0),
"output_tokens": getattr(usage_data, "output_tokens", 0),
}
else:
self.task_log.log_step(
"warning", "LLM | Token Usage", "Warning: No valid usage_data received."
)
@retry(wait=wait_fixed(10), stop=stop_after_attempt(5))
async def _create_message(
    self,
    system_prompt: str,
    messages_history: List[Dict[str, Any]],
    tools_definitions,
    keep_tool_result: int = -1,
):
    """
    Send the conversation to the Anthropic Messages API.

    Older tool results are filtered (per ``keep_tool_result``) and cache
    control is applied only on the copy sent to the API; the caller's
    ``messages_history`` is returned untouched so logs stay complete.

    Args:
        system_prompt: System prompt string.
        messages_history: Message history list.
        tools_definitions: Tool definitions (not passed to the API here).
        keep_tool_result: Number of recent tool results to keep (-1 = all).

    Returns:
        Tuple of (Anthropic API response object, original messages_history).

    Raises:
        asyncio.CancelledError: Re-raised after logging when the call is cancelled.
        Exception: Any API failure is logged and re-raised.
    """
    call_mode = "async" if self.async_client else "sync"
    self.task_log.log_step(
        "info",
        "LLM | Call Start",
        f"Calling LLM ({call_mode})",
    )

    # Work on a filtered copy to save tokens, then apply cache control to it.
    filtered_messages = self._remove_tool_result_from_messages(
        messages_history, keep_tool_result
    )
    processed_messages = self._apply_cache_control(filtered_messages)

    try:
        # Note: Anthropic API does not support repetition_penalty parameter
        request_kwargs = dict(
            model=self.model_name,
            temperature=self.temperature,
            top_p=self.top_p if self.top_p != 1.0 else NOT_GIVEN,
            top_k=self.top_k if self.top_k != -1 else NOT_GIVEN,
            max_tokens=self.max_tokens,
            system=[
                {
                    "type": "text",
                    "text": system_prompt,
                    "cache_control": {"type": "ephemeral"},
                }
            ],
            messages=processed_messages,
            stream=False,
        )

        response = self.client.messages.create(**request_kwargs)
        if self.async_client:
            # The async SDK client returns an awaitable.
            response = await response

        self._update_token_usage(getattr(response, "usage", None))
        self.task_log.log_step(
            "info",
            "LLM | Call Status",
            f"LLM call status: {getattr(response, 'stop_reason', 'N/A')}",
        )

        # Hand back the unfiltered history, not the copy sent to the API.
        return response, messages_history

    except asyncio.CancelledError:
        self.task_log.log_step(
            "warning",
            "LLM | Call Cancelled",
            "⚠️ LLM API call was cancelled during execution",
        )
        raise  # Re-raise to allow decorator to log it
    except Exception as e:
        self.task_log.log_step(
            "error", "LLM | Call Failed", f"Anthropic LLM call failed: {str(e)}"
        )
        raise
def process_llm_response(
    self, llm_response: Any, message_history: List[Dict], agent_type: str = "main"
) -> tuple[str, bool, List[Dict]]:
    """Turn an Anthropic response into text and append it to the history.

    Args:
        llm_response: Response object from the API, or a falsy value when the call failed.
        message_history: Conversation history; the assistant turn is appended in place.
        agent_type: Name of the calling agent (not used in this method).

    Returns:
        Tuple of (assistant text, should-stop flag, message history). The flag
        is True when the response is missing or has no content.
    """
    if not llm_response:
        self.task_log.log_step(
            "error",
            "LLM | Response Processing",
            "❌ LLM call failed, skipping this response.",
        )
        return "", True, message_history

    if not getattr(llm_response, "content", None):
        self.task_log.log_step(
            "error",
            "LLM | Response Processing",
            "❌ LLM response is empty or contains no content.",
        )
        return "", True, message_history

    from ...utils.parsing_utils import fix_server_name_in_text

    # Collect the plain text and the structured content blocks.
    text_chunks = []
    content_blocks = []
    for block in llm_response.content:
        kind = block.type
        if kind == "text":
            text_chunks.append(block.text + "\n")
            content_blocks.append({"type": "text", "text": block.text})
        elif kind == "tool_use":
            content_blocks.append(
                {
                    "type": "tool_use",
                    "id": block.id,
                    "name": block.name,
                    "input": block.input,
                }
            )

    # Fix server_name in the combined text, then in each text block
    assistant_response_text = fix_server_name_in_text("".join(text_chunks))
    for entry in content_blocks:
        if entry.get("type") != "text":
            continue
        entry["text"] = fix_server_name_in_text(entry["text"])

    # Record the assistant turn
    message_history.append({"role": "assistant", "content": content_blocks})

    self.task_log.log_step(
        "info", "LLM | Response", f"LLM Response: {assistant_response_text}"
    )

    return assistant_response_text, False, message_history
def extract_tool_calls_info(
    self, llm_response: Any, assistant_response_text: str
) -> List[Dict]:
    """Parse tool calls out of the assistant's text.

    Only ``assistant_response_text`` is inspected; ``llm_response`` is
    accepted but not used here.
    """
    from ...utils.parsing_utils import parse_llm_response_for_tool_calls

    tool_calls = parse_llm_response_for_tool_calls(assistant_response_text)
    return tool_calls
def update_message_history(
    self, message_history: List[Dict], all_tool_results_content_with_id: List[Tuple]
) -> List[Dict]:
    """
    Append the tool results to the history as a single user turn.

    :param message_history: Conversation history, extended in place.
    :param all_tool_results_content_with_id: Sequence of tuples whose second
        element is a content dict with ``"type"`` and, for text, ``"text"``.
    :return: The same ``message_history`` list.
    """
    text_parts = []
    for entry in all_tool_results_content_with_id:
        payload = entry[1]
        # Only text results are forwarded; other content types are dropped.
        if payload["type"] == "text":
            text_parts.append(payload["text"])

    user_turn = {
        "role": "user",
        "content": [{"type": "text", "text": "\n".join(text_parts)}],
    }
    message_history.append(user_turn)
    return message_history
def generate_agent_system_prompt(self, date: Any, mcp_servers: List[Dict]) -> str:
    """
    Build the MCP system prompt and hand it to the tool/server mapping helper.

    :param date: Passed through to ``generate_mcp_system_prompt``.
    :param mcp_servers: Passed through to ``generate_mcp_system_prompt``.
    :return: The generated system prompt.
    """
    from ...utils.parsing_utils import set_tool_server_mapping

    system_prompt = generate_mcp_system_prompt(date, mcp_servers)
    # NOTE(review): presumably records which server owns each tool so later
    # parsing can repair server names -- confirm in parsing_utils.
    set_tool_server_mapping(system_prompt)
    return system_prompt
def _estimate_tokens(self, text: str) -> int:
"""Use tiktoken to estimate the number of tokens in text"""
if not hasattr(self, "encoding"):
# Initialize tiktoken encoder
try:
self.encoding = tiktoken.get_encoding("o200k_base")
except Exception:
# If o200k_base is not available, use cl100k_base as fallback
self.encoding = tiktoken.get_encoding("cl100k_base")
try:
return len(self.encoding.encode(text))
except Exception as e:
# If encoding fails, use simple estimation: approximately 1 token per 4 characters
self.task_log.log_step(
"error",
"LLM | Token Estimation Error",
f"Error: {str(e)}",
)
return len(text) // 4
def ensure_summary_context(
    self, message_history: list, summary_prompt: str
) -> tuple[bool, list]:
    """
    Check whether the history plus the summary prompt still fits the context.

    The estimate adds the token counts of the last LLM call, the trailing user
    message and the summary prompt (both scaled by a 1.5 safety factor), the
    reserved response size ``self.max_tokens`` and a fixed 1000-token buffer.

    If the estimate reaches ``self.max_context_length``, the trailing user
    message and then the trailing assistant message are removed from
    ``message_history`` in place.

    :param message_history: Conversation history; may be empty.
    :param summary_prompt: Prompt that will be appended to request the summary.
    :return: ``(True, history)`` when the context fits, ``(False, history)``
        when messages were rolled back.
    """
    # Get token usage from the last LLM call
    last_input_tokens = self.last_call_tokens.get("input_tokens", 0)
    last_output_tokens = self.last_call_tokens.get("output_tokens", 0)
    buffer_factor = 1.5

    summary_tokens = int(self._estimate_tokens(str(summary_prompt)) * buffer_factor)

    # The trailing user message was not part of the last call, so count it separately.
    last_user_tokens = 0
    if message_history and message_history[-1]["role"] == "user":
        content = message_history[-1]["content"]
        last_user_tokens = int(self._estimate_tokens(str(content)) * buffer_factor)

    estimated_total = (
        last_input_tokens
        + last_output_tokens
        + last_user_tokens
        + summary_tokens
        + self.max_tokens
        + 1000  # Add 1000 tokens as buffer
    )

    if estimated_total < self.max_context_length:
        self.task_log.log_step(
            "info",
            "LLM | Context Limit Not Reached",
            f"{estimated_total}/{self.max_context_length}",
        )
        return True, message_history

    self.task_log.log_step(
        "info",
        "LLM | Context Limit Reached",
        "Context limit reached, proceeding to step back and summarize the conversation",
    )
    # Remove the last user message (tool call results)
    if message_history and message_history[-1]["role"] == "user":
        message_history.pop()
    # Remove the assistant message that requested those tool calls. The
    # emptiness check matters when the popped user message was the only entry.
    if message_history and message_history[-1]["role"] == "assistant":
        message_history.pop()
    self.task_log.log_step(
        "info",
        "LLM | Context Limit Reached",
        f"Removed the last assistant-user pair, current message_history length: {len(message_history)}",
    )
    return False, message_history
def format_token_usage_summary(self) -> tuple[List[str], str]:
    """
    Render cumulative token usage for the final report and for the log.

    :return: ``(summary_lines, log_string)`` where ``summary_lines`` is the
        multi-line block for the final summary and ``log_string`` is a compact
        single-line form prefixed with the model name.
    """
    usage = self.get_token_usage()
    total_input = usage.get("total_input_tokens", 0)
    total_output = usage.get("total_output_tokens", 0)
    total_cache_creation = usage.get("total_cache_write_input_tokens", 0)
    total_cache_read = usage.get("total_cache_read_input_tokens", 0)

    # Rule as wide as the header line (20 dashes + title + 20 dashes).
    rule = "-" * (40 + len(" Token Usage "))
    summary_lines = [
        "\n" + "-" * 20 + " Token Usage " + "-" * 20,
        f"Total Input Tokens (non-cache): {total_input}",
        f"Total Cache Creation Input Tokens: {total_cache_creation}",
        f"Total Cache Read Input Tokens: {total_cache_read}",
        f"Total Output Tokens: {total_output}",
        rule,
        "Pricing is disabled - no cost information available",
        rule,
    ]

    log_string = (
        f"[{self.model_name}] Total Input: {total_input}, "
        f"Cache Creation: {total_cache_creation}, "
        f"Cache Read: {total_cache_read}, "
        f"Output: {total_output}"
    )
    return summary_lines, log_string
def get_token_usage(self):
    """Return a shallow copy of the cumulative token counters."""
    usage_snapshot = self.token_usage.copy()
    return usage_snapshot
def _apply_cache_control(self, messages: List[Dict]) -> List[Dict]:
"""Apply cache control to the last user message and system message (if applicable)"""
cached_messages = []
user_turns_processed = 0
for turn in reversed(messages):
if turn["role"] == "user" and user_turns_processed < 1:
# Add ephemeral cache control to the text part of the last user message
new_content = []
processed_text = False
# Check if content is a list
if isinstance(turn["content"], str):
turn["content"] = [{"type": "text", "text": turn["content"]}]
if isinstance(turn.get("content"), list):
# see example here
# https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching
for item in turn["content"]:
if (
item.get("type") == "text"
and len(item.get("text")) > 0
and not processed_text
):
# Copy and add cache control
text_item = item.copy()
text_item["cache_control"] = {"type": "ephemeral"}
new_content.append(text_item)
processed_text = True
else:
# Other types of content (like image) copied directly
new_content.append(item.copy())
cached_messages.append({"role": "user", "content": new_content})
else:
# If content is not a list (e.g., plain text), add as is without cache control
# Or adjust logic as needed
self.task_log.log_step(
"warning",
"LLM | Cache Control",
"Warning: User message content is not in expected list format, cache control not applied.",
)
cached_messages.append(turn)
user_turns_processed += 1
else:
# Add other messages directly
cached_messages.append(turn)
return list(reversed(cached_messages))
================================================
FILE: apps/miroflow-agent/src/llm/providers/openai_client.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
OpenAI-compatible LLM client implementation.
This module provides the OpenAIClient class for interacting with OpenAI's API
and OpenAI-compatible endpoints (such as vLLM, Qwen, DeepSeek, etc.).
Features:
- Async and sync API support
- Automatic retry with a fixed wait between attempts
- Token usage tracking and context length management
- MCP tool call parsing and response processing
"""
import asyncio
import dataclasses
import logging
from typing import Any, Dict, List, Tuple, Union
import tiktoken
from openai import AsyncOpenAI, DefaultAsyncHttpxClient, DefaultHttpxClient, OpenAI
from ...utils.prompt_utils import generate_mcp_system_prompt
from ..base_client import BaseClient
logger = logging.getLogger("miroflow_agent")
@dataclasses.dataclass
class OpenAIClient(BaseClient):
def _create_client(self) -> Union[AsyncOpenAI, OpenAI]:
    """
    Build the OpenAI SDK client for this task.

    The async or sync client is chosen by ``self.async_client``. Both variants
    send the task id in an ``x-upstream-session-id`` header on every request.

    :return: A configured ``AsyncOpenAI`` or ``OpenAI`` instance.
    """
    session_headers = {"x-upstream-session-id": self.task_id}
    if self.async_client:
        client_cls, http_client_cls = AsyncOpenAI, DefaultAsyncHttpxClient
    else:
        client_cls, http_client_cls = OpenAI, DefaultHttpxClient

    return client_cls(
        api_key=self.api_key,
        base_url=self.base_url,
        http_client=http_client_cls(headers=session_headers),
    )
def _update_token_usage(self, usage_data: Any) -> None:
"""Update cumulative token usage"""
if usage_data:
input_tokens = getattr(usage_data, "prompt_tokens", 0)
output_tokens = getattr(usage_data, "completion_tokens", 0)
prompt_tokens_details = getattr(usage_data, "prompt_tokens_details", None)
if prompt_tokens_details:
cached_tokens = (
getattr(prompt_tokens_details, "cached_tokens", None) or 0
)
else:
cached_tokens = 0
# Record token usage for the most recent call
self.last_call_tokens = {
"prompt_tokens": input_tokens,
"completion_tokens": output_tokens,
}
# OpenAI does not provide cache_creation_input_tokens
self.token_usage["total_input_tokens"] += input_tokens
self.token_usage["total_output_tokens"] += output_tokens
self.token_usage["total_cache_read_input_tokens"] += cached_tokens
self.task_log.log_step(
"info",
"LLM | Token Usage",
f"Input: {self.token_usage['total_input_tokens']}, "
f"Output: {self.token_usage['total_output_tokens']}",
)
async def _create_message(
self,
system_prompt: str,
messages_history: List[Dict[str, Any]],
tools_definitions,
keep_tool_result: int = -1,
):
"""
Send the conversation to the chat-completions endpoint, retrying on failure.

Up to 10 attempts are made, with a fixed 30-second wait between attempts.
A response cut off by the length limit is retried with max_tokens raised by
10%; a response whose last 50 characters occur more than 5 times is retried
as a runaway repetition. On the final attempt such a response is returned
as-is instead of raising.

:param system_prompt: System prompt string; sent as the first message.
:param messages_history: Message history list. It is copied before sending and
    returned unchanged.
:param tools_definitions: Not referenced in this method.
:param keep_tool_result: Passed to ``_remove_tool_result_from_messages`` to
    decide which tool results stay in the request.
:return: Tuple ``(response, messages_history)`` with the API response object and
    the original, unfiltered history.
:raises asyncio.TimeoutError: If the last attempt times out.
:raises asyncio.CancelledError: Re-raised immediately when the call is cancelled.
:raises Exception: Context-length errors (HTTP 400) are re-raised without retry;
    any other error is re-raised after the last attempt.
"""
# Create a copy for sending to LLM (to avoid modifying the original)
messages_for_llm = [m.copy() for m in messages_history]
# Send the system prompt as the first message; the request built below has no
# separate system-prompt field.
if system_prompt:
# Check if there's already a system or developer message
if messages_for_llm and messages_for_llm[0]["role"] in [
"system",
"developer",
]:
messages_for_llm[0] = {
"role": "system",
"content": system_prompt,
}
else:
messages_for_llm.insert(
0,
{
"role": "system",
"content": system_prompt,
},
)
# Filter tool results to save tokens (only affects messages sent to LLM)
messages_for_llm = self._remove_tool_result_from_messages(
messages_for_llm, keep_tool_result
)
# Retry loop with dynamic max_tokens adjustment
max_retries = 10
# Seconds slept before every retry (the wait does not grow between attempts)
base_wait_time = 30
current_max_tokens = self.max_tokens
for attempt in range(max_retries):
params = {
"model": self.model_name,
"temperature": self.temperature,
"messages": messages_for_llm,
"stream": False,
"top_p": self.top_p,
"extra_body": {},
}
# Check if the model is GPT-5, and adjust the parameter accordingly
if "gpt-5" in self.model_name:
# Use 'max_completion_tokens' for GPT-5
params["max_completion_tokens"] = current_max_tokens
else:
# Use 'max_tokens' for GPT-4 and other models
params["max_tokens"] = current_max_tokens
# Add repetition_penalty if it's not the default value
if self.repetition_penalty != 1.0:
params["extra_body"]["repetition_penalty"] = self.repetition_penalty
# Model-specific extra field: enable "thinking" for deepseek-v3-1 models
if "deepseek-v3-1" in self.model_name:
params["extra_body"]["thinking"] = {"type": "enabled"}
# auto-detect if we need to continue from the last assistant message
if messages_for_llm and messages_for_llm[-1].get("role") == "assistant":
params["extra_body"]["continue_final_message"] = True
params["extra_body"]["add_generation_prompt"] = False
try:
# The sync client is called directly, without await
if self.async_client:
response = await self.client.chat.completions.create(**params)
else:
response = self.client.chat.completions.create(**params)
# Update token count
self._update_token_usage(getattr(response, "usage", None))
self.task_log.log_step(
"info",
"LLM | Response Status",
f"{getattr(response.choices[0], 'finish_reason', 'N/A')}",
)
# Check if response was truncated due to length limit
finish_reason = getattr(response.choices[0], "finish_reason", None)
if finish_reason == "length":
# If this is not the last retry, increase max_tokens and retry
if attempt < max_retries - 1:
# Increase max_tokens by 10%
current_max_tokens = int(current_max_tokens * 1.1)
self.task_log.log_step(
"warning",
"LLM | Length Limit Reached",
f"Response was truncated due to length limit (attempt {attempt + 1}/{max_retries}). Increasing max_tokens to {current_max_tokens} and retrying...",
)
await asyncio.sleep(base_wait_time)
continue
else:
# Last retry, return the truncated response instead of raising exception
self.task_log.log_step(
"warning",
"LLM | Length Limit Reached - Returning Truncated Response",
f"Response was truncated after {max_retries} attempts. Returning truncated response to allow ReAct loop to continue.",
)
# Return the truncated response and let the orchestrator handle it
return response, messages_history
# Check if the last 50 characters of the response appear more than 5 times in the response content.
# If so, treat it as a severe repeat and trigger a retry.
# Read the text from message.content when present, otherwise from the choice's text attribute.
if hasattr(response.choices[0], "message") and hasattr(
response.choices[0].message, "content"
):
resp_content = response.choices[0].message.content or ""
else:
resp_content = getattr(response.choices[0], "text", "")
if resp_content and len(resp_content) >= 50:
tail_50 = resp_content[-50:]
repeat_count = resp_content.count(tail_50)
if repeat_count > 5:
# If this is not the last retry, retry
if attempt < max_retries - 1:
self.task_log.log_step(
"warning",
"LLM | Repeat Detected",
f"Severe repeat: the last 50 chars appeared over 5 times (attempt {attempt + 1}/{max_retries}), retrying...",
)
await asyncio.sleep(base_wait_time)
continue
else:
# Last retry, return anyway
self.task_log.log_step(
"warning",
"LLM | Repeat Detected - Returning Anyway",
f"Severe repeat detected after {max_retries} attempts. Returning response anyway.",
)
# Success - return the original messages_history (not the filtered copy)
# This ensures that the complete conversation history is preserved in logs
return response, messages_history
except asyncio.TimeoutError as e:
# Timeouts are retried until the last attempt, then re-raised
if attempt < max_retries - 1:
self.task_log.log_step(
"warning",
"LLM | Timeout Error",
f"Timeout error (attempt {attempt + 1}/{max_retries}): {str(e)}, retrying...",
)
await asyncio.sleep(base_wait_time)
continue
else:
self.task_log.log_step(
"error",
"LLM | Timeout Error",
f"Timeout error after {max_retries} attempts: {str(e)}",
)
raise e
except asyncio.CancelledError as e:
# Cancellation is never retried
self.task_log.log_step(
"error",
"LLM | Request Cancelled",
f"Request was cancelled: {str(e)}",
)
raise e
except Exception as e:
# A 400 "longer than the model" error is matched by its message text and re-raised at once
if "Error code: 400" in str(e) and "longer than the model" in str(e):
self.task_log.log_step(
"error",
"LLM | Context Length Error",
f"Error: {str(e)}",
)
raise e
else:
if attempt < max_retries - 1:
self.task_log.log_step(
"warning",
"LLM | API Error",
f"Error (attempt {attempt + 1}/{max_retries}): {str(e)}, retrying...",
)
await asyncio.sleep(base_wait_time)
continue
else:
self.task_log.log_step(
"error",
"LLM | API Error",
f"Error after {max_retries} attempts: {str(e)}",
)
raise e
# Should never reach here, but just in case
raise Exception("Unexpected error: retry loop completed without returning")
def process_llm_response(
    self, llm_response: Any, message_history: List[Dict], agent_type: str = "main"
) -> tuple[str, bool, List[Dict]]:
    """
    Extract the assistant text from a chat-completions response and record it.

    Only the first choice is read. Its text is passed through
    ``fix_server_name_in_text`` and appended to ``message_history`` as an
    assistant turn.

    :param llm_response: Response object exposing ``choices``.
    :param message_history: Conversation history, extended in place.
    :param agent_type: Not used by this implementation.
    :return: ``(assistant_text, should_exit, message_history)``. ``should_exit``
        is True when the response is invalid or a truncated response contains
        "Context length exceeded".
    :raises ValueError: If the finish reason is neither ``"stop"`` nor ``"length"``.
    """
    if not (llm_response and llm_response.choices):
        error_msg = "LLM did not return a valid response."
        self.task_log.log_step(
            "error", "LLM | Response Error", f"Error: {error_msg}"
        )
        return "", True, message_history  # Exit loop, return message_history

    from ...utils.parsing_utils import fix_server_name_in_text

    first_choice = llm_response.choices[0]
    finish_reason = first_choice.finish_reason
    if finish_reason not in ("stop", "length"):
        raise ValueError(f"Unsupported finish reason: {finish_reason}")

    assistant_response_text = fix_server_name_in_text(
        first_choice.message.content or ""
    )

    must_exit = False
    if finish_reason == "length":
        if assistant_response_text == "":
            # Keep a placeholder in the history for an empty truncated response.
            assistant_response_text = "LLM response is empty."
        elif "Context length exceeded" in assistant_response_text:
            # Context overflow: the caller is told to leave its loop.
            self.task_log.log_step(
                "warning",
                "LLM | Context Length",
                "Detected context length exceeded, returning error status",
            )
            must_exit = True

    message_history.append({"role": "assistant", "content": assistant_response_text})
    return assistant_response_text, must_exit, message_history
def extract_tool_calls_info(
    self, llm_response: Any, assistant_response_text: str
) -> List[Dict]:
    """
    Parse tool-call requests out of the assistant's text.

    :param llm_response: Not used; only the text is parsed.
    :param assistant_response_text: Assistant text to scan for tool calls.
    :return: Whatever ``parse_llm_response_for_tool_calls`` produces for the text.
    """
    from ...utils.parsing_utils import parse_llm_response_for_tool_calls

    parsed_calls = parse_llm_response_for_tool_calls(assistant_response_text)
    return parsed_calls
def update_message_history(
    self, message_history: List[Dict], all_tool_results_content_with_id: List[Tuple]
) -> List[Dict]:
    """
    Append the tool results to the history as one plain-text user message.

    :param message_history: Conversation history, extended in place.
    :param all_tool_results_content_with_id: Sequence of tuples whose second
        element is a content dict with ``"type"`` and, for text, ``"text"``.
    :return: The same ``message_history`` list.
    """
    text_parts = []
    for entry in all_tool_results_content_with_id:
        payload = entry[1]
        # Only text results are forwarded; other content types are dropped.
        if payload["type"] == "text":
            text_parts.append(payload["text"])

    user_turn = {"role": "user", "content": "\n".join(text_parts)}
    message_history.append(user_turn)
    return message_history
def generate_agent_system_prompt(self, date: Any, mcp_servers: List[Dict]) -> str:
    """
    Build the MCP system prompt and hand it to the tool/server mapping helper.

    :param date: Passed through to ``generate_mcp_system_prompt``.
    :param mcp_servers: Passed through to ``generate_mcp_system_prompt``.
    :return: The generated system prompt.
    """
    from ...utils.parsing_utils import set_tool_server_mapping

    system_prompt = generate_mcp_system_prompt(date, mcp_servers)
    # NOTE(review): presumably records which server owns each tool so later
    # parsing can repair server names -- confirm in parsing_utils.
    set_tool_server_mapping(system_prompt)
    return system_prompt
def _estimate_tokens(self, text: str) -> int:
"""Use tiktoken to estimate the number of tokens in text"""
if not hasattr(self, "encoding"):
# Initialize tiktoken encoder
try:
self.encoding = tiktoken.get_encoding("o200k_base")
except Exception:
# If o200k_base is not available, use cl100k_base as fallback
self.encoding = tiktoken.get_encoding("cl100k_base")
try:
return len(self.encoding.encode(text))
except Exception as e:
# If encoding fails, use simple estimation: approximately 1 token per 4 characters
self.task_log.log_step(
"error",
"LLM | Token Estimation Error",
f"Error: {str(e)}",
)
return len(text) // 4
def ensure_summary_context(
    self, message_history: list, summary_prompt: str
) -> tuple[bool, list]:
    """
    Check whether the history plus the summary prompt still fits the context.

    The estimate adds the token counts of the last LLM call, the trailing user
    message and the summary prompt (both scaled by a 1.5 safety factor), the
    reserved response size ``self.max_tokens`` and a fixed 1000-token buffer.

    If the estimate reaches ``self.max_context_length``, the trailing user
    message and then the trailing assistant message are removed from
    ``message_history`` in place.

    :param message_history: Conversation history; may be empty.
    :param summary_prompt: Prompt that will be appended to request the summary.
    :return: ``(True, history)`` when the context fits, ``(False, history)``
        when messages were rolled back.
    """
    # Get token usage from the last LLM call
    last_prompt_tokens = self.last_call_tokens.get("prompt_tokens", 0)
    last_completion_tokens = self.last_call_tokens.get("completion_tokens", 0)
    buffer_factor = 1.5

    # str() keeps the estimator working if the prompt is not a plain string.
    summary_tokens = int(self._estimate_tokens(str(summary_prompt)) * buffer_factor)

    # The trailing user message was not part of the last call, so count it separately.
    last_user_tokens = 0
    if message_history and message_history[-1]["role"] == "user":
        content = message_history[-1]["content"]
        last_user_tokens = int(self._estimate_tokens(str(content)) * buffer_factor)

    estimated_total = (
        last_prompt_tokens
        + last_completion_tokens
        + last_user_tokens
        + summary_tokens
        + self.max_tokens
        + 1000  # Add 1000 tokens as buffer
    )

    if estimated_total < self.max_context_length:
        self.task_log.log_step(
            "info",
            "LLM | Context Limit Not Reached",
            f"{estimated_total}/{self.max_context_length}",
        )
        return True, message_history

    self.task_log.log_step(
        "info",
        "LLM | Context Limit Reached",
        "Context limit reached, proceeding to step back and summarize the conversation",
    )
    # Remove the last user message (tool call results)
    if message_history and message_history[-1]["role"] == "user":
        message_history.pop()
    # Remove the assistant message that requested those tool calls. The
    # emptiness check matters when the popped user message was the only entry.
    if message_history and message_history[-1]["role"] == "assistant":
        message_history.pop()
    self.task_log.log_step(
        "info",
        "LLM | Context Limit Reached",
        f"Removed the last assistant-user pair, current message_history length: {len(message_history)}",
    )
    return False, message_history
def format_token_usage_summary(self) -> tuple[List[str], str]:
    """
    Render cumulative token usage for the final report and for the log.

    The cache figure is read from ``total_cache_read_input_tokens``, the
    counter that ``_update_token_usage`` maintains; the older
    ``total_cache_input_tokens`` key is used only if that counter is absent.

    :return: ``(summary_lines, log_string)`` where ``summary_lines`` is the
        multi-line block for the final summary and ``log_string`` is a compact
        single-line form prefixed with the model name.
    """
    token_usage = self.get_token_usage()

    total_input = token_usage.get("total_input_tokens", 0)
    total_output = token_usage.get("total_output_tokens", 0)
    cache_input = token_usage.get(
        "total_cache_read_input_tokens",
        token_usage.get("total_cache_input_tokens", 0),
    )

    # Rule as wide as the header line (20 dashes + title + 20 dashes).
    rule = "-" * (40 + len(" Token Usage "))
    summary_lines = [
        "\n" + "-" * 20 + " Token Usage " + "-" * 20,
        f"Total Input Tokens: {total_input}",
        f"Total Cache Input Tokens: {cache_input}",
        f"Total Output Tokens: {total_output}",
        rule,
        "Pricing is disabled - no cost information available",
        rule,
    ]

    # Generate log string
    log_string = (
        f"[{self.model_name}] Total Input: {total_input}, "
        f"Cache Input: {cache_input}, "
        f"Output: {total_output}"
    )
    return summary_lines, log_string
def get_token_usage(self):
    """Return a shallow copy of the cumulative token counters."""
    usage_snapshot = self.token_usage.copy()
    return usage_snapshot
================================================
FILE: apps/miroflow-agent/src/llm/util.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
Utility decorators and helpers for LLM client operations.
This module provides:
- Timeout decorator for async LLM API calls
- Other common utilities shared across LLM providers
"""
import asyncio
import functools
from typing import Awaitable, Callable, TypeVar
T = TypeVar("T")


def with_timeout(
    timeout_s: float = 300.0,
) -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:
    """
    Build a decorator that bounds an *async* function with asyncio.wait_for().

    :param timeout_s: Timeout in seconds handed to ``asyncio.wait_for``.
    :return: Decorator producing a coroutine function with the same metadata
        as the wrapped one.

    Usage:
        @with_timeout(20)
        async def create_message_foo(...): ...
    """

    def decorator(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
        @functools.wraps(func)
        async def wrapper(*args, **kwargs) -> T:
            pending_call = func(*args, **kwargs)
            return await asyncio.wait_for(pending_call, timeout=timeout_s)

        return wrapper

    return decorator
================================================
FILE: apps/miroflow-agent/src/logging/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""Logging module for task execution tracking."""
from .task_logger import (
LLMCallLog,
StepLog,
TaskLog,
ToolCallLog,
bootstrap_logger,
get_utc_plus_8_time,
)
__all__ = [
"TaskLog",
"StepLog",
"LLMCallLog",
"ToolCallLog",
"bootstrap_logger",
"get_utc_plus_8_time",
]
================================================
FILE: apps/miroflow-agent/src/logging/summary_time_cost.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import json
from collections import defaultdict
from pathlib import Path
from .task_logger import logger
def _get_summary_template():
"""Returns a template for the summary data structure."""
return {
"total_tasks": 0,
"total_wall_time": 0.0,
"primary_breakdown": {
"main_agent": defaultdict(float),
"browsing_agent": defaultdict(float),
},
"cross_cutting_breakdown": defaultdict(float),
"tool_workload_breakdown": defaultdict(float),
}
def _update_summary_data(summary_block, perf_summary, tool_workload):
"""Updates a summary block with data from a single result."""
summary_block["total_tasks"] += 1
summary_block["total_wall_time"] += perf_summary.get("total_wall_time", 0.0)
# Update primary breakdown
primary_breakdown = perf_summary.get("primary_breakdown", {})
for agent, data in primary_breakdown.items():
if agent in summary_block["primary_breakdown"]:
for key, value in data.items():
summary_block["primary_breakdown"][agent][key] += value
# Update cross-cutting breakdown
cross_cutting_breakdown = perf_summary.get("cross_cutting_breakdown", {})
for key, value in cross_cutting_breakdown.items():
summary_block["cross_cutting_breakdown"][key] += value
# Update tool workload breakdown
for key, value in tool_workload.items():
summary_block["tool_workload_breakdown"][key] += value
def _calculate_averages(summary_block):
"""Calculates and adds average values to a summary block."""
num_tasks = summary_block["total_tasks"]
if num_tasks == 0:
return
summary_block["average_wall_time"] = summary_block["total_wall_time"] / num_tasks
# Calculate averages for primary breakdown
for agent, data in summary_block["primary_breakdown"].items():
summary_block["primary_breakdown"][agent] = dict(data) # Convert back to dict
avg_data = {f"avg_{k}": v / num_tasks for k, v in data.items()}
summary_block["primary_breakdown"][agent].update(avg_data)
# Calculate averages for cross-cutting breakdown
summary_block["cross_cutting_breakdown"] = dict(
summary_block["cross_cutting_breakdown"]
)
avg_cross_cutting = {
f"avg_{k}": v / num_tasks
for k, v in summary_block["cross_cutting_breakdown"].items()
}
summary_block["cross_cutting_breakdown"].update(avg_cross_cutting)
# Calculate averages for tool workload breakdown
summary_block["tool_workload_breakdown"] = dict(
summary_block["tool_workload_breakdown"]
)
avg_tool_workload = {
f"avg_{k}": v / num_tasks
for k, v in summary_block["tool_workload_breakdown"].items()
}
summary_block["tool_workload_breakdown"].update(avg_tool_workload)
def generate_summary(log_dir: Path):
    """
    Generates a summary of benchmark results by reading log files from a directory,
    calculating total and average trace data, both overall and grouped by
    final_judge_result. The result is written to ``summary_time_cost.json``
    inside ``log_dir``.

    Files that cannot be read, are not valid JSON, or do not hold a JSON object
    are skipped with a warning. Results without ``trace_data`` or without a
    ``performance_summary`` are ignored.

    Args:
        log_dir: The directory where the individual result log files are and where
            the summary file will be saved.
    """
    import logging

    # Resolve the logger at call time: a module-level
    # `from .task_logger import logger` captures None when this module is
    # imported before bootstrap_logger() has run.
    log = logging.getLogger("miroflow_agent")

    output_name = "summary_time_cost.json"
    # Never treat a summary file (including our own previous output) as a result.
    skipped_names = {"summary.json", output_name}

    results = []
    for log_file in log_dir.glob("*.json"):
        if log_file.name in skipped_names:
            continue
        try:
            with open(log_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except json.JSONDecodeError:
            log.warning(f"Warning: Could not decode JSON from {log_file}. Skipping.")
            continue
        except Exception as e:
            log.warning(f"Warning: Could not read file {log_file}: {e}. Skipping.")
            continue
        if not isinstance(loaded, dict):
            log.warning(
                f"Warning: {log_file} does not contain a JSON object. Skipping."
            )
            continue
        results.append(loaded)

    overall_summary = _get_summary_template()
    summary_by_judge = defaultdict(_get_summary_template)

    for result in results:
        trace_data = result.get("trace_data")
        if not trace_data or "performance_summary" not in trace_data:
            continue

        perf_summary = trace_data["performance_summary"]
        tool_workload = trace_data.get("tool_workload_breakdown", {})

        # Update overall summary
        _update_summary_data(overall_summary, perf_summary, tool_workload)

        # Update summary by judge result
        judge_result = result.get("final_judge_result", "unknown")
        _update_summary_data(
            summary_by_judge[judge_result], perf_summary, tool_workload
        )

    # Calculate averages for all summary blocks
    _calculate_averages(overall_summary)
    for judge_result in summary_by_judge:
        _calculate_averages(summary_by_judge[judge_result])

    summary_data = {
        "overall_summary": overall_summary,
        "summary_by_final_judge_result": dict(summary_by_judge),
    }

    summary_file = log_dir / output_name
    with open(summary_file, "w", encoding="utf-8") as f:
        json.dump(summary_data, f, indent=4, ensure_ascii=False)
================================================
FILE: apps/miroflow-agent/src/logging/task_logger.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
Task logging and structured output module.
This module provides:
- TaskLog: Main dataclass for tracking task execution state and history
- StepLog: Individual step logging with timestamps and metadata
- ColoredFormatter: Console output formatting with color-coded log levels
- Utility functions for time handling and logger configuration
All logs are persisted to JSON files for later analysis and debugging.
"""
import json
import logging
import os
from dataclasses import asdict, dataclass, field
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional
# Import colorama for cross-platform colored output
from colorama import Fore, Style, init
# Initialize colorama
init(autoreset=True, strip=False)
# This will be set to the configured logger instance
logger = None
def get_color_for_level(level: str) -> str:
    """Return the bright colour prefix for a log level name; unknown levels are white."""
    level_colors = {
        "ERROR": Fore.RED,
        "WARNING": Fore.YELLOW,
        "INFO": Fore.GREEN,
        "DEBUG": Fore.CYAN,
    }
    base_color = level_colors.get(level, Fore.WHITE)
    return f"{base_color}{Style.BRIGHT}"
class ColoredFormatter(logging.Formatter):
    """Custom formatter that adds colors for better developer visualization"""

    def format(self, record):
        """
        Render ``record`` as ``[time][logger][LEVEL] - message`` with colours.

        The logger name is shown in bright blue and the level name in a colour
        chosen by severity. As in ``logging.Formatter.format``, exception and
        stack information attached to the record is appended after the
        message, so tracebacks are not lost.
        """
        # Get timestamp and format it
        timestamp = self.formatTime(record, self.datefmt)

        # Color the level name based on severity
        level_color = get_color_for_level(record.levelname)
        level_reset = Style.RESET_ALL

        # Color the logger name (miroflow_agent)
        name_color = f"{Fore.BLUE}{Style.BRIGHT}"
        name_reset = Style.RESET_ALL

        # Get the message as is (icons are already added in log_step)
        message = record.getMessage()

        # Format with selective coloring
        formatted = f"[{timestamp}][{name_color}{record.name}{name_reset}][{level_color}{record.levelname}{level_reset}] - {message}"

        # Same traceback handling as the base class, including caching the
        # rendered exception text on the record.
        if record.exc_info and not record.exc_text:
            record.exc_text = self.formatException(record.exc_info)
        if record.exc_text:
            if formatted[-1:] != "\n":
                formatted += "\n"
            formatted += record.exc_text
        if record.stack_info:
            if formatted[-1:] != "\n":
                formatted += "\n"
            formatted += self.formatStack(record.stack_info)

        return formatted
def bootstrap_logger() -> logging.Logger:
    """
    Configure the ``miroflow_agent`` logger and publish it as the module's ``logger``.

    The first call attaches one coloured stream handler, sets the level to
    DEBUG and disables propagation. Later calls find the handler already
    present and only refresh the module-level reference.

    :return: The ``miroflow_agent`` logger.
    """
    global logger

    agent_logger = logging.getLogger("miroflow_agent")

    # Existing handlers mean the logger is already configured.
    if not agent_logger.handlers:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(
            ColoredFormatter(
                "%(asctime)s,%(msecs)03d",
                datefmt="%Y-%m-%d %H:%M:%S",
            )
        )
        agent_logger.addHandler(console_handler)
        agent_logger.setLevel(logging.DEBUG)
        # Disable propagation to prevent duplicate logging from root logger
        agent_logger.propagate = False

    logger = agent_logger
    return agent_logger
def get_utc_plus_8_time() -> str:
    """Return the current time at UTC+8, formatted as ``YYYY-MM-DD HH:MM:SS``."""
    plus_eight = timezone(timedelta(hours=8))
    current_time = datetime.now(plus_eight)
    return current_time.strftime("%Y-%m-%d %H:%M:%S")
@dataclass
class LLMCallLog:
    """Technical details of a single LLM call."""

    # Identification of the call; both are required.
    provider: str
    model: str

    # Token counters, all defaulting to zero.
    input_tokens: int = 0
    output_tokens: int = 0
    cache_creation_tokens: int = 0
    cache_read_tokens: int = 0

    # Error text, None by default.
    error: Optional[str] = None
@dataclass
class ToolCallLog:
    """Detailed record of a single tool call."""

    # Which tool was called, and on which server; both are required.
    server_name: str
    tool_name: str

    # Call arguments; every instance gets its own dict.
    arguments: Dict[str, Any] = field(default_factory=dict)

    # Outcome of the call.
    result: Any = None
    error: Optional[str] = None

    # Time of the call as a string, None by default.
    call_time: Optional[str] = None
@dataclass
class StepLog:
    """Detailed record of one task execution step."""

    step_name: str
    message: str
    timestamp: str
    info_level: Literal["info", "warning", "error", "debug"] = "info"
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Reject any ``info_level`` outside the four supported values."""
        valid_levels = {"info", "warning", "error", "debug"}
        if self.info_level in valid_levels:
            return
        raise ValueError(
            f"info_level must be one of {valid_levels}, got '{self.info_level}'"
        )
@dataclass
class TaskLog:
    """Mutable record of a single task run.

    Holds the task status and answers, the message histories of the main
    agent and of each sub-agent session, and the list of step logs.  The
    record can be serialised with :meth:`to_json` and written to disk with
    :meth:`save`.
    """

    status: str = "running"
    start_time: str = ""
    end_time: str = ""
    task_id: str = ""
    input: Any = None
    ground_truth: str = ""
    final_boxed_answer: str = ""
    final_judge_result: str = ""
    judge_type: str = ""
    eval_details: Optional[Dict[str, Any]] = None  # For DeepSearchQA metrics
    error: str = ""

    # Main records: main agent conversation turns
    current_main_turn_id: int = 0
    current_sub_agent_turn_id: int = 0
    sub_agent_counter: int = 0
    current_sub_agent_session_id: Optional[str] = None

    env_info: Optional[dict] = field(default_factory=dict)
    log_dir: str = "logs"

    main_agent_message_history: List[Dict[str, Any]] = field(default_factory=list)
    sub_agent_message_history_sessions: Dict[str, List[Dict[str, Any]]] = field(
        default_factory=dict
    )

    # Forward reference: only the name is needed for the annotation.
    step_logs: List["StepLog"] = field(default_factory=list)
    trace_data: Dict[str, Any] = field(default_factory=dict)

    def start_sub_agent_session(
        self, sub_agent_name: str, subtask_description: str
    ) -> str:
        """Start a new sub-agent session and make it the current one.

        Args:
            sub_agent_name: Name used as the prefix of the session id.
            subtask_description: Task text; logged truncated to 100 characters
                in the message and in full in the step metadata.

        Returns:
            The new session id, ``"<sub_agent_name>_<counter>"``.
        """
        self.sub_agent_counter += 1
        session_id = f"{sub_agent_name}_{self.sub_agent_counter}"
        self.current_sub_agent_session_id = session_id

        # Record sub-agent session start
        self.log_step(
            "info",
            f"{sub_agent_name} | Session Start",
            f"Starting {session_id} for subtask: {subtask_description[:100]}{'...' if len(subtask_description) > 100 else ''}",
            metadata={"session_id": session_id, "subtask": subtask_description},
        )
        return session_id

    def end_sub_agent_session(self, sub_agent_name: str) -> Optional[str]:
        """Log the end of the current sub-agent session and clear it.

        Returns:
            Always ``None``.
        """
        self.log_step(
            "info",
            f"{sub_agent_name} | Session End",
            f"Ending {self.current_sub_agent_session_id}",
            metadata={"session_id": self.current_sub_agent_session_id},
        )
        self.current_sub_agent_session_id = None
        return None

    def log_step(
        self,
        info_level: Literal["info", "warning", "error", "debug"],
        step_name: str,
        message: str,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """Record an execution step and echo it to the module logger.

        An icon is prefixed to ``step_name`` according to the first matching
        rule below, the step is appended to ``step_logs`` and the line
        ``"<step name>: <message>"`` is emitted at the matching log level.

        Args:
            info_level: Severity of the step.
            step_name: Short label of the step.
            message: Human-readable description.
            metadata: Optional extra data; an empty dict is stored when omitted.
        """
        # Add icons to step_name based on content; the order of the checks
        # decides which icon wins when several substrings are present.
        icon = ""
        if "Tool Call Start" in step_name:
            icon = "▶️ "
        elif "Tool Call Success" in step_name:
            icon = "✅ "
        elif "Tool Call Error" in step_name or (
            "error" in info_level and "tool" in step_name.lower()
        ):
            icon = "❌ "
        elif "agent-" in step_name:
            icon = "🤖 "
        elif "Main Agent" in step_name:
            icon = "👑 "
        elif "LLM" in step_name:
            icon = "🧠 "
        elif "ToolManager" in step_name or "Tool Call" in step_name:
            icon = "🔧 "
        elif "tool-python" in step_name.lower():
            icon = "🐍 "
        elif "tool-google-search" in step_name.lower():
            icon = "🔍 "
        elif "tool-browser" in step_name.lower() or "playwright" in step_name.lower():
            icon = "🌐 "

        # Add icon to step_name
        step_name_with_icon = f"{icon}{step_name}"

        step_log = StepLog(
            step_name=step_name_with_icon,
            message=message,
            timestamp=get_utc_plus_8_time(),
            info_level=info_level,
            metadata=metadata or {},
        )
        self.step_logs.append(step_log)

        # Print the structured log to console using the configured logger
        log_message = f"{step_name_with_icon}: {message}"

        # Ensure logger is configured
        global logger
        if logger is None:
            logger = bootstrap_logger()

        if info_level == "error":
            logger.error(log_message)
        elif info_level == "warning":
            logger.warning(log_message)
        elif info_level == "debug":
            logger.debug(log_message)
        else:  # info
            logger.info(log_message)

    def serialize_for_json(self, obj):
        """Recursively convert ``obj`` into JSON-friendly values.

        ``Path`` objects become strings, dicts and lists are converted
        element by element, objects exposing ``__dict__`` are replaced by
        their converted attribute dict, and anything else is returned as is.
        """
        if isinstance(obj, Path):
            return str(obj)
        elif isinstance(obj, dict):
            return {k: self.serialize_for_json(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [self.serialize_for_json(item) for item in obj]
        elif hasattr(obj, "__dict__"):
            return self.serialize_for_json(obj.__dict__)
        else:
            return obj

    def to_json(self) -> str:
        """
        Serialize the TaskLog to a JSON string.

        Converts the dataclass to a dictionary, handles non-JSON-serializable
        objects (like Path), and returns a formatted JSON string.

        Returns:
            A JSON string representation of the task log with 2-space
            indentation.  The result can always be encoded as UTF-8.

        Note:
            Non-ASCII text is kept verbatim.  If the text cannot be encoded
            as UTF-8 (for example because it contains a lone surrogate), the
            log is re-serialised with ``ensure_ascii=True`` so that every
            such character is written as a ``\\uXXXX`` escape.
        """
        # Convert to dict first, then replace non-JSON-serializable objects
        serialized_dict = self.serialize_for_json(asdict(self))

        text = json.dumps(serialized_dict, ensure_ascii=False, indent=2)
        try:
            # json.dumps never encodes, so encodability is probed explicitly.
            text.encode("utf-8")
        except UnicodeEncodeError as e:
            # Fallback: try with ASCII encoding if Unicode fails
            print(f"Warning: Unicode encoding failed, falling back to ASCII: {e}")
            text = json.dumps(serialized_dict, ensure_ascii=True, indent=2)
        return text

    def save(self):
        """Save as a single JSON file.

        The file is ``<log_dir>/task_<task_id>_<start_time>.json`` where the
        ``:``, ``.`` and space characters of ``start_time`` are replaced by
        ``-``.  ``log_dir`` is created when missing.

        Returns:
            The path of the written file.
        """
        os.makedirs(self.log_dir, exist_ok=True)
        timestamp = (
            self.start_time.replace(":", "-").replace(".", "-").replace(" ", "-")
        )
        filename = f"{self.log_dir}/task_{self.task_id}_{timestamp}.json"

        # Serialise before opening: a serialisation error must not leave a
        # truncated file behind.  to_json() guarantees UTF-8-encodable text.
        payload = self.to_json()
        with open(filename, "w", encoding="utf-8") as f:
            f.write(payload)
        return filename

    @classmethod
    def from_dict(cls, d: dict) -> "TaskLog":
        """
        Create a TaskLog instance from a dictionary.

        Args:
            d: Dictionary containing TaskLog field values.

        Returns:
            A new TaskLog instance initialized with the dictionary values.

        Note:
            The dictionary keys should match the TaskLog field names; the
            values are passed to the constructor unchanged.
        """
        return cls(**d)
================================================
FILE: apps/miroflow-agent/src/utils/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""Utility functions for parsing, prompts, and wrappers."""
from .parsing_utils import (
extract_failure_experience_summary,
extract_llm_response_text,
fix_server_name_in_text,
parse_llm_response_for_tool_calls,
safe_json_loads,
set_tool_server_mapping,
)
from .prompt_utils import (
FORMAT_ERROR_MESSAGE,
generate_agent_specific_system_prompt,
generate_agent_summarize_prompt,
generate_mcp_system_prompt,
)
from .wrapper_utils import ErrorBox, ResponseBox
# Names exported by ``from <package> import *``; each entry matches a name
# imported above and the entries are grouped by the submodule they come from.
__all__ = [
    # parsing_utils
    "parse_llm_response_for_tool_calls",
    "extract_llm_response_text",
    "extract_failure_experience_summary",
    "fix_server_name_in_text",
    "set_tool_server_mapping",
    "safe_json_loads",
    # prompt_utils
    "FORMAT_ERROR_MESSAGE",
    "generate_mcp_system_prompt",
    "generate_agent_specific_system_prompt",
    "generate_agent_summarize_prompt",
    # wrapper_utils
    "ErrorBox",
    "ResponseBox",
]
================================================
FILE: apps/miroflow-agent/src/utils/parsing_utils.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
Parsing utilities for LLM responses and tool calls.
This module provides functions for:
- Parsing tool calls from LLM responses (both OpenAI and MCP formats)
- Extracting text content from responses
- Safe JSON parsing with automatic repair
- Failure experience summary extraction
"""
import json
import logging
import re
from typing import Any, Dict, List, Union
from json_repair import repair_json
logger = logging.getLogger("miroflow_agent")
def parse_tool_server_mapping(system_prompt: str) -> dict:
    """
    Build a tool_name → server_name lookup from a system prompt.

    The prompt is scanned line by line for headers of the form:
        ## Server name: tool-python
        ### Tool name: run_python_code
    Each tool header is attributed to the most recent server header.

    Only three tools are recorded, the ones models commonly attribute to
    the wrong server: run_python_code, google_search, scrape_and_extract_info.

    Args:
        system_prompt: The system prompt containing MCP tool definitions

    Returns:
        Dict mapping tool_name to its server_name, e.g.
        {"run_python_code": "tool-python", "google_search": "search_and_scrape_webpage", ...}
    """
    wanted_tools = ("run_python_code", "google_search", "scrape_and_extract_info")
    server_header = re.compile(r"## Server name:\s*(.+)")
    tool_header = re.compile(r"### Tool name:\s*(.+)")

    found = {}
    active_server = None
    for raw_line in system_prompt.split("\n"):
        hit = server_header.match(raw_line)
        if hit is not None:
            active_server = hit.group(1).strip()
            continue

        hit = tool_header.match(raw_line)
        # A tool header counts only once a non-empty server name is known.
        if hit is None or not active_server:
            continue

        name = hit.group(1).strip()
        if name in wanted_tools:
            found[name] = active_server
    return found
# Module-level cache for tool_server_mapping
# Starts empty and is replaced wholesale by set_tool_server_mapping().
_tool_server_mapping: dict = {}


def set_tool_server_mapping(system_prompt: str) -> None:
    """
    Parse system prompt and cache the tool_name → server_name mapping.

    Should be called once when system prompt is available.  Each call
    rebinds the module-level cache to the freshly parsed mapping, so a
    later call discards whatever an earlier call stored.

    Args:
        system_prompt: The system prompt containing MCP tool definitions
    """
    global _tool_server_mapping
    _tool_server_mapping = parse_tool_server_mapping(system_prompt)
def fix_server_name_in_text(text: str) -> str:
    """
    Fix incorrect server_name and tool_name in MCP XML tool calls.

    Uses the cached tool_server_mapping (parsed from system prompt) to determine
    the correct server_name for each tool. Only fixes the 3 target tools:
    run_python_code, google_search, scrape_and_extract_info.

    Also handles the special case where model outputs tool_name=python or
    tool_name=python_code (should be run_python_code).

    Only text inside ``<tool_name>...</tool_name>`` and
    ``<server_name>...</server_name>`` tags is rewritten; the same words in
    ordinary prose are left untouched.

    Args:
        text: The LLM response text containing MCP tool calls

    Returns:
        Text with corrected server_name and tool_name if needed.  Non-string
        input, and any input while the cache is empty, is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    mapping = _tool_server_mapping
    if not mapping:
        return text

    # Special case: tool_name=python or python_code → rename to run_python_code
    # Only apply if system prompt defines run_python_code (not python)
    if "run_python_code" in mapping:
        for wrong_name in ("python", "python_code"):
            text = text.replace(
                f"<tool_name>{wrong_name}</tool_name>",
                "<tool_name>run_python_code</tool_name>",
            )

    # Fix server_name for each target tool using the mapping from system prompt
    for tool_name, correct_server in mapping.items():
        tool_tag = f"<tool_name>{tool_name}</tool_name>"
        if tool_tag not in text:
            continue

        correct_server_tag = f"<server_name>{correct_server}</server_name>"
        if correct_server_tag in text:
            # The correct server already appears somewhere; leave text alone.
            continue

        # Replace the server tag that directly precedes this tool's tag.  A
        # callable replacement keeps the server name literal even if it
        # contains backslashes or group references.
        pattern = r"<server_name>[^<]+</server_name>(\s*" + re.escape(tool_tag) + r")"
        text = re.sub(
            pattern,
            lambda m, tag=correct_server_tag: tag + m.group(1),
            text,
        )
    return text
def filter_none_values(arguments: Union[Dict, Any]) -> Union[Dict, Any]:
    """
    Drop every entry whose value is None from an arguments dictionary.

    Args:
        arguments: A dictionary to filter, or any other value

    Returns:
        A new dictionary without the None-valued keys, or the original
        value untouched when it is not a dict
    """
    if isinstance(arguments, dict):
        kept = {}
        for key, value in arguments.items():
            if value is not None:
                kept[key] = value
        return kept
    return arguments
def _fix_backslash_escapes(json_str: str) -> str:
    r"""
    Fix common backslash escape issues in JSON strings.

    This handles cases where backslashes in string values are not properly escaped.
    Common issues:
    - Unescaped backslashes before non-escape characters

    Note: This is a conservative fix that preserves valid escape sequences
    (\\, \", \/, \b, \f, \n, \r, \t) and only fixes clearly problematic cases.

    Args:
        json_str: JSON text that may contain stray backslashes

    Returns:
        The text with each lone backslash that precedes an uppercase ASCII
        letter doubled; everything else is unchanged
    """
    # Valid JSON escape sequences: \\, \", \/, \b, \f, \n, \r, \t, \uXXXX
    # None of them uses an uppercase letter, so a backslash directly before
    # an uppercase letter (Windows paths like C:\Users) can never be valid.
    # The lookbehind skips backslashes that are themselves preceded by a
    # backslash, so already-escaped pairs (\\) are left alone.
    # NOTE(review): the original pattern was truncated in this file; this
    # expression is reconstructed from the comments above — confirm.
    return re.sub(r"(?<!\\)\\([A-Z])", r"\\\\\1", json_str)


def safe_json_loads(arguments_str: str) -> Dict[str, Any]:
    """
    Safely parse a JSON string with multiple fallbacks.

    Parsing strategy:
    1. Try standard json.loads()
    2. If it fails, try json_repair to fix common issues
    3. If all attempts fail, return an error object

    Args:
        arguments_str: JSON string to parse

    Returns:
        Parsed dictionary, or error dict with 'error' and 'raw' keys
    """
    # Step 1: Try standard JSON parsing.  TypeError covers a non-string
    # argument such as None, which otherwise escaped every fallback.
    try:
        return json.loads(arguments_str)
    except (json.JSONDecodeError, TypeError):
        pass

    # Step 2: Try json_repair to fix common issues
    try:
        repaired = repair_json(arguments_str, ensure_ascii=False)
        return json.loads(repaired)
    except Exception:
        logger.warning(f"Unable to parse JSON: {arguments_str}")

    # Step 3: Give up and return error information
    return {
        "error": "Failed to parse arguments",
        "raw": arguments_str,
    }
def extract_failure_experience_summary(text: str) -> str:
    """
    Extract failure experience summary from LLM response text.

    The text may contain:
    - <think>...</think> block (thinking content)
    - Main content after </think> and before <use_mcp_tool>
    - <use_mcp_tool>... block (tool call, ignored)

    Examples:
        "<think>\\n{xxx}\\n</think>\\n\\n{content}\\n\\n<use_mcp_tool>..."
        "<think>\\n{xxx}\\n</think>\\n\\n{content}"
        "{content}" (no think block)

    Args:
        text: Raw LLM response text; may be empty

    Returns:
        - If content is empty after strip, return think_content
        - If both think_content and content are non-empty, return content
        - The tool-call block is never used
        - An empty string for empty input
    """
    if not text:
        return ""

    think_content = ""

    # Extract think content (first <think> block only)
    think_match = re.search(r"<think>([\s\S]*?)</think>", text)
    if think_match:
        think_content = think_match.group(1).strip()
        # Get content after </think>
        after_think = text[think_match.end() :]
    else:
        # No think block, entire text is potential content
        after_think = text

    # Drop everything from the first <use_mcp_tool> tag onwards
    content = after_think.partition("<use_mcp_tool>")[0].strip()

    # Prefer the visible content; fall back to the thinking text
    return content if content else think_content
def extract_llm_response_text(llm_response: Union[str, Dict]) -> str:
    """
    Extract text from LLM response, excluding <use_mcp_tool> tags.

    Stops immediately when the <use_mcp_tool> tag is encountered, returning
    only the content before it.

    Args:
        llm_response: Either a string or a dict with 'content' key

    Returns:
        Extracted text content with leading and trailing whitespace removed.
        A dict whose 'content' is missing or None yields an empty string.
    """
    if isinstance(llm_response, dict):
        # If it's a dictionary type, extract the content field
        content = llm_response.get("content")
        if content is None:
            content = ""
    else:
        # Otherwise use the value directly
        content = llm_response

    if not isinstance(content, str):
        content = str(content)

    # Keep only what precedes the first <use_mcp_tool> tag; when the tag is
    # absent partition() returns the complete content as the first element.
    before_tool_call = content.partition("<use_mcp_tool>")[0]
    return before_tool_call.strip()
def _split_server_and_tool_name(name: str):
    """Split ``"<server>-<tool>"`` on its last hyphen; server is "unknown" without one."""
    if "-" in name:
        server_name, tool_name = name.rsplit("-", maxsplit=1)
        return server_name, tool_name
    return "unknown", name


def parse_llm_response_for_tool_calls(
    llm_response_content_text: Union[str, Dict, List],
) -> List[Dict[str, Any]]:
    """
    Parse tool calls from LLM response content.

    Supports multiple formats:
    - OpenAI Response API format (dict with 'output' containing function_call items)
    - OpenAI Completion API format (list of tool_call objects)
    - MCP format (<use_mcp_tool> XML tags in text)

    Args:
        llm_response_content_text: Response content in any supported format

    Returns:
        List of tool call dicts with keys: server_name, tool_name, arguments, id.
        ``id`` is None for MCP-format calls.  Input that is neither a dict, a
        list nor a string yields an empty list.
    """
    # tool_calls or MCP response are handled differently
    # for openai response api, the tool_calls are in the response text
    if isinstance(llm_response_content_text, dict):
        tool_calls = []
        for item in llm_response_content_text.get("output") or []:
            if item.get("type") == "function_call":
                server_name, tool_name = _split_server_and_tool_name(
                    item.get("name", "")
                )
                arguments_str = item.get("arguments")
                arguments = safe_json_loads(arguments_str)
                arguments = filter_none_values(arguments)
                tool_calls.append(
                    dict(
                        server_name=server_name,
                        tool_name=tool_name,
                        arguments=arguments,
                        id=item.get("call_id"),
                    )
                )
        return tool_calls

    # for openai completion api, the tool_calls are in the response text
    if isinstance(llm_response_content_text, list):
        tool_calls = []
        for tool_call in llm_response_content_text:
            server_name, tool_name = _split_server_and_tool_name(
                tool_call.function.name
            )
            arguments_str = tool_call.function.arguments

            # Parse JSON string to dictionary
            try:
                # Try to handle possible newlines and escape characters
                arguments = json.loads(arguments_str)
            except json.JSONDecodeError:
                logger.info(
                    f"Warning: Unable to parse tool arguments JSON: {arguments_str}"
                )
                # Try more lenient parsing or log error
                try:
                    # Try to replace some common error formats, such as Python dict strings
                    arguments_str_fixed = (
                        arguments_str.replace("'", '"')
                        .replace("None", "null")
                        .replace("True", "true")
                        .replace("False", "false")
                    )
                    arguments = json.loads(arguments_str_fixed)
                    logger.info(
                        "Info: Successfully parsed arguments after attempting to fix."
                    )
                except json.JSONDecodeError:
                    logger.info(
                        f"Error: Still unable to parse tool arguments JSON after fixing: {arguments_str}"
                    )
                    arguments = {
                        "error": "Failed to parse arguments",
                        "raw": arguments_str,
                    }

            arguments = filter_none_values(arguments)
            tool_calls.append(
                dict(
                    server_name=server_name,
                    tool_name=tool_name,
                    arguments=arguments,
                    id=tool_call.id,
                )
            )
        return tool_calls

    # for other clients, such as qwen and anthropic, we use MCP instead of tool calls
    tool_calls = []
    if not isinstance(llm_response_content_text, str):
        # Nothing to scan (e.g. None content)
        return tool_calls

    # Find all <use_mcp_tool> blocks; the groups are server name, tool name
    # and the raw arguments text.
    tool_call_patterns = re.findall(
        r"<use_mcp_tool>\s*<server_name>(.*?)</server_name>\s*<tool_name>(.*?)</tool_name>\s*<arguments>\s*([\s\S]*?)\s*</arguments>\s*</use_mcp_tool>",
        llm_response_content_text,
        re.DOTALL,
    )

    for match in tool_call_patterns:
        server_name = match[0].strip()
        tool_name = match[1].strip()
        arguments_str = match[2].strip()

        # Parse JSON string to dictionary
        arguments = safe_json_loads(arguments_str)
        arguments = filter_none_values(arguments)

        tool_calls.append(
            {
                "server_name": server_name,
                "tool_name": tool_name,
                "arguments": arguments,
                "id": None,
            }
        )
    return tool_calls
================================================
FILE: apps/miroflow-agent/src/utils/prompt_utils.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
Prompt templates and utilities for agent system prompts.
This module provides:
- System prompt generation for MCP tool usage
- Agent-specific prompt generation (main agent, browsing agent)
- Summary prompt templates for final answer generation
- Failure experience templates for retry mechanisms
"""
# ============================================================================
# Format Error Messages
# ============================================================================
# Fixed message text for a final answer that contains no \boxed{} content.
FORMAT_ERROR_MESSAGE = "No \\boxed{} content found in the final answer."
# ============================================================================
# Failure Experience Templates (for format error retry)
# ============================================================================
# The three templates below are meant to be concatenated as
# header + one item per attempt + footer.
# Header that appears once before all failure experiences
FAILURE_EXPERIENCE_HEADER = """
=== Previous Attempts Analysis ===
The following summarizes what was tried before and why it didn't work. Use this to guide a NEW approach.
"""
# Template for each individual failure experience (used multiple times)
# Placeholders: {attempt_number} and {failure_summary}
# (presumably filled with str.format — verify against the caller).
FAILURE_EXPERIENCE_ITEM = """[Attempt {attempt_number}]
{failure_summary}
"""
# Footer that appears once after all failure experiences
FAILURE_EXPERIENCE_FOOTER = """=== End of Analysis ===
Based on the above, you should try a different strategy this time.
"""
# Prompt asking the model for a structured post-mortem of a failed attempt.
# It contains a literal "\boxed{}", so it must not be passed through str.format.
FAILURE_SUMMARY_PROMPT = """The task was not completed successfully. Do NOT call any tools. Provide a summary:
Failure type: [incomplete / blocked / misdirected / format_missed]
- incomplete: ran out of turns before finishing
- blocked: got stuck due to tool failure or missing information
- misdirected: went down the wrong path
- format_missed: found the answer but forgot to use \\boxed{}
What happened: [describe the approach taken and why a final answer was not reached]
Useful findings: [list any facts, intermediate results, or conclusions discovered that should be reused]"""
# Assistant prefix for failure summary generation (guides model to follow structured format)
FAILURE_SUMMARY_THINK_CONTENT = """We need to write a structured post-mortem style summary **without calling any tools**, explaining why the task was not completed, using these required sections:
* **Failure type**: pick one from **incomplete / blocked / misdirected / format_missed**
* **What happened**: describe the approach taken and why it didn't reach a final answer
* **Useful findings**: list any facts, intermediate results, or conclusions that can be reused"""
# The think content wrapped in a complete <think>...</think> block, followed
# by a blank line so the model continues with the visible summary.
FAILURE_SUMMARY_ASSISTANT_PREFIX = (
    f"<think>\n{FAILURE_SUMMARY_THINK_CONTENT}\n</think>\n\n"
)
# ============================================================================
# MCP Tags for Parsing
# ============================================================================
# Opening and closing tags of the MCP tool-call markup.  Empty strings here
# would match at every position of any text, so each entry must be a real tag.
# NOTE(review): the tag names were lost from this list; they are restored
# from the tool-use format in the system prompt — confirm the exact set.
mcp_tags = [
    "<use_mcp_tool>",
    "</use_mcp_tool>",
    "<server_name>",
    "</server_name>",
    "<arguments>",
    "</arguments>",
]
# Phrases presumably used to recognise refusal-style answers — verify against
# the code that consumes this list.  The second entry uses typographic
# apostrophes (’), the third a plain ASCII apostrophe.
refusal_keywords = [
    "time constraint",
    "I’m sorry, but I can’t",
    "I'm sorry, I cannot solve",
]
def generate_mcp_system_prompt(date, mcp_servers):
    """
    Generate the MCP (Model Context Protocol) system prompt for LLM.

    Creates a structured prompt that instructs the LLM on how to use available
    MCP tools. Includes tool definitions, XML formatting instructions, and
    general task-solving guidelines.

    Args:
        date: Current date object for timestamp inclusion
        mcp_servers: List of server definitions, each containing 'name' and
            'tools'; may be None or empty.  Tool entries that carry only an
            'error' key (no 'name') are skipped.

    Returns:
        Complete system prompt string with tool definitions and usage instructions
    """
    formatted_date = date.strftime("%Y-%m-%d")

    # Start building the template, now follows https://docs.anthropic.com/en/docs/build-with-claude/tool-use/overview#tool-use-system-prompt
    template = f"""In this environment you have access to a set of tools you can use to answer the user's question.
You only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use. Today is: {formatted_date}
# Tool-Use Formatting Instructions
Tool-use is formatted using XML-style tags. The tool-use is enclosed in <use_mcp_tool></use_mcp_tool> and each parameter is similarly enclosed within its own set of tags.
The Model Context Protocol (MCP) connects to servers that provide additional tools and resources to extend your capabilities. You can use the server's tools via the `use_mcp_tool`.
Description:
Request to use a tool provided by a MCP server. Each MCP server can provide multiple tools with different capabilities. Tools have defined input schemas that specify required and optional parameters.
Parameters:
- server_name: (required) The name of the MCP server providing the tool
- tool_name: (required) The name of the tool to execute
- arguments: (required) A JSON object containing the tool's input parameters, following the tool's input schema, quotes within string must be properly escaped, ensure it's valid JSON
Usage:
<use_mcp_tool>
<server_name>server name here</server_name>
<tool_name>tool name here</tool_name>
<arguments>
{{
"param1": "value1",
"param2": "value2 \\"escaped string\\""
}}
</arguments>
</use_mcp_tool>
Important Notes:
- Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags.
- Always adhere to this format for the tool use to ensure proper parsing and execution.
String and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. The output is not expected to be valid XML and is parsed with regular expressions.
Here are the functions available in JSONSchema format:
"""

    # Add MCP servers section.  The "## Server name:" / "### Tool name:"
    # header lines are the format parse_tool_server_mapping() reads back.
    pieces = [template]
    for server in mcp_servers or []:
        pieces.append(f"\n## Server name: {server['name']}\n")
        for tool in server.get("tools") or []:
            # Skip tools that failed to load (they only have 'error' key)
            if "error" in tool and "name" not in tool:
                continue
            pieces.append(f"### Tool name: {tool['name']}\n")
            pieces.append(f"Description: {tool['description']}\n")
            pieces.append(f"Input JSON schema: {tool['schema']}\n")

    # Add the full objective system prompt
    pieces.append(
        """
# General Objective
You accomplish a given task iteratively, breaking it down into clear steps and working through them methodically.
"""
    )
    return "".join(pieces)
def generate_no_mcp_system_prompt(date):
    """
    Build the short system prompt that carries no MCP tool definitions.

    Intended for runs where no tools are available (tool-less mode).

    Args:
        date: Current date object; rendered as YYYY-MM-DD in the prompt

    Returns:
        System prompt string: introduction with the date, the formatting
        notes, and the general objective
    """
    today = date.strftime("%Y-%m-%d")

    # Start building the template, now follows https://docs.anthropic.com/en/docs/build-with-claude/tool-use/overview#tool-use-system-prompt
    introduction = "In this environment you have access to a set of tools you can use to answer the user's question. "
    date_line = f" Today is: {today}\n"
    formatting_notes = """
Important Notes:
- Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags.
- Always adhere to this format for the tool use to ensure proper parsing and execution.
String and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. The output is not expected to be valid XML and is parsed with regular expressions.
"""
    # The full objective system prompt comes last
    general_objective = """
# General Objective
You accomplish a given task iteratively, breaking it down into clear steps and working through them methodically.
"""
    return "".join((introduction, date_line, formatting_notes, general_objective))
def generate_agent_specific_system_prompt(agent_type=""):
    """
    Return the objective section of the system prompt for one agent type.

    Supported agent types:
    - main: Task-solving agent that uses tools to answer questions
    - agent-browsing / browsing-agent: Web search and browsing agent for
      information retrieval

    Args:
        agent_type: Type of agent ("main", "agent-browsing", or "browsing-agent")

    Returns:
        Agent-specific objective prompt string, without surrounding whitespace

    Raises:
        ValueError: If agent_type is none of the supported values
    """
    if agent_type == "main":
        main_objective = """\n
# Agent Specific Objective
You are a task-solving agent that uses tools step-by-step to answer the user's question. Your goal is to provide complete, accurate and well-reasoned answers using additional tools.
"""
        return main_objective.strip()

    if agent_type in ("agent-browsing", "browsing-agent"):
        browsing_objective = """# Agent Specific Objective
You are an agent that performs the task of searching and browsing the web for specific information and generating the desired answer. Your task is to retrieve reliable, factual, and verifiable information that fills in knowledge gaps.
Do not infer, speculate, summarize broadly, or attempt to fill in missing parts yourself. Only return factual content.
"""
        return browsing_objective.strip()

    raise ValueError(f"Unknown agent type: {agent_type}")
def generate_agent_summarize_prompt(task_description, agent_type=""):
    """
    Generate the final summarization prompt for an agent.

    Creates prompts that instruct agents to summarize their work and provide
    final answers. Different agent types have different summarization formats:
    - main: Must wrap answer in \\boxed{} with strict formatting rules
    - agent-browsing / browsing-agent: Provides structured report of findings

    "browsing-agent" is accepted as an alias of "agent-browsing", matching
    generate_agent_specific_system_prompt.

    Args:
        task_description: The original task/question to reference in the summary
        agent_type: Type of agent ("main", "agent-browsing", or "browsing-agent")

    Returns:
        Summarization prompt string with formatting instructions

    Raises:
        ValueError: If agent_type is none of the supported values
    """
    if agent_type == "main":
        summarize_prompt = (
            "Summarize the above conversation, and output the FINAL ANSWER to the original question.\n\n"
            "If a clear answer has already been provided earlier in the conversation, do not rethink or recalculate it — "
            "simply extract that answer and reformat it to match the required format below.\n"
            "If a definitive answer could not be determined, make a well-informed educated guess based on the conversation.\n\n"
            "The original question is repeated here for reference:\n\n"
            f'"{task_description}"\n\n'
            "Wrap your final answer in \\boxed{}.\n"
            "Your final answer should be:\n"
            "- a number, OR\n"
            "- as few words as possible, OR\n"
            "- a comma-separated list of numbers and/or strings.\n\n"
            "ADDITIONALLY, your final answer MUST strictly follow any formatting instructions in the original question — "
            "such as alphabetization, sequencing, units, rounding, decimal places, etc.\n"
            "If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and DO NOT INCLUDE UNITS such as $ or USD or percent signs unless specified otherwise.\n"
            "If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.\n"
            "If you are asked for a comma-separated list, apply the above rules depending on whether the elements are numbers or strings.\n"
            "Do NOT include any punctuation such as '.', '!', or '?' at the end of the answer.\n"
            "Do NOT include any invisible or non-printable characters in the answer output.\n\n"
            "You must absolutely not perform any MCP tool call, tool invocation, search, scrape, code execution, or similar actions.\n"
            "You can only answer the original question based on the information already retrieved and your own internal knowledge.\n"
            "If you attempt to call any tool, it will be considered a mistake."
        )
    elif agent_type in ("agent-browsing", "browsing-agent"):
        summarize_prompt = (
            "This is a direct instruction to you (the assistant), not the result of a tool call.\n\n"
            "We are now ending this session, and your conversation history will be deleted. "
            "You must NOT initiate any further tool use. This is your final opportunity to report "
            "*all* of the information gathered during the session.\n\n"
            "The original task is repeated here for reference:\n\n"
            f'"{task_description}"\n\n'
            "Summarize the above search and browsing history. Output the FINAL RESPONSE and detailed supporting information of the task given to you.\n\n"
            "If you found any useful facts, data, quotes, or answers directly relevant to the original task, include them clearly and completely.\n"
            "If you reached a conclusion or answer, include it as part of the response.\n"
            "If the task could not be fully answered, do NOT make up any content. Instead, return all partially relevant findings, "
            "Search results, quotes, and observations that might help a downstream agent solve the problem.\n"
            "If partial, conflicting, or inconclusive information was found, clearly indicate this in your response.\n\n"
            "Your final response should be a clear, complete, and structured report.\n"
            "Organize the content into logical sections with appropriate headings.\n"
            "Do NOT include any tool call instructions, speculative filler, or vague summaries.\n"
            "Focus on factual, specific, and well-organized information."
        )
    else:
        raise ValueError(f"Unknown agent type: {agent_type}")

    return summarize_prompt.strip()
================================================
FILE: apps/miroflow-agent/src/utils/wrapper_utils.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""Wrapper utilities for handling responses and errors in a type-safe manner."""
from typing import Any, Dict, Optional
class ErrorBox:
    """
    A wrapper class for error messages.

    Use this to wrap error messages that should be distinguishable from normal responses.

    Example:
        >>> error = ErrorBox("Connection failed")
        >>> if ErrorBox.is_error_box(error):
        ...     print(f"Error: {error}")
        Error: Connection failed
    """

    def __init__(self, error_msg: str) -> None:
        # Stored unchanged; conversion to text happens in __str__.
        self.error_msg = error_msg

    def __str__(self) -> str:
        # str() keeps printing safe when the payload is not a string (for
        # example an exception instance); __str__ must always return a str.
        return str(self.error_msg)

    def __repr__(self) -> str:
        return f"ErrorBox({self.error_msg!r})"

    @staticmethod
    def is_error_box(something: Any) -> bool:
        """Check if the given object is an ErrorBox instance."""
        return isinstance(something, ErrorBox)
class ResponseBox:
    """
    Container pairing a response with optional extra information.

    Wrap a response in this class when additional metadata may need to
    travel with it.

    Example:
        >>> response = ResponseBox({"data": "value"}, {"warning_msg": "Rate limited"})
        >>> if response.has_extra_info():
        ...     print(response.get_extra_info())
        {'warning_msg': 'Rate limited'}
    """

    def __init__(
        self, response: Any, extra_info: Optional[Dict[str, Any]] = None
    ) -> None:
        self.extra_info = extra_info
        self.response = response

    def __repr__(self) -> str:
        return "ResponseBox({!r}, extra_info={!r})".format(
            self.response, self.extra_info
        )

    def __str__(self) -> str:
        payload = self.response
        return str(payload)

    @staticmethod
    def is_response_box(something: Any) -> bool:
        """Tell whether ``something`` is a ResponseBox instance."""
        return isinstance(something, ResponseBox)

    def get_response(self) -> Any:
        """Return the wrapped response object."""
        return self.response

    def get_extra_info(self) -> Optional[Dict[str, Any]]:
        """Return the attached extra information (None when absent)."""
        return self.extra_info

    def has_extra_info(self) -> bool:
        """Tell whether extra information is attached (anything but None)."""
        return not (self.extra_info is None)
================================================
FILE: apps/visualize-trace/.python-version
================================================
3.11
================================================
FILE: apps/visualize-trace/README.md
================================================
# Trace Analysis Web Demo
An interactive web interface for analyzing and visualizing trace JSON files.
## Installation and Running
### Method 1: Using Python (Recommended)
```bash
pip install -r requirements.txt
python run.py
```
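The startup script checks whether Flask is available, installs the dependencies if it is not, and then starts the web application. Once it is running, visit `http://127.0.0.1:5000` in your browser (pass `-p`/`--port` to `run.py` to use a different port).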
### Method 2: Using uv
```bash
uv run run.py
```
## Usage
1. **Start the application**: After running, visit `http://127.0.0.1:5000` in your browser
1. **Load files**:
- Select the trace JSON file to analyze from the dropdown menu in the top navigation bar
- Click the "Load" button to load the file
1. **View analysis results**:
- **Left panel**: Shows basic information, execution summary, and performance statistics
- **Right panel**: Displays detailed execution flow
- **Bottom panel**: Shows spans statistics and step logs statistics
1. **Interactive operations**:
- Click on execution steps to expand/collapse detailed information
- Use "Expand All"/"Collapse All" buttons to control all steps
- Click "View Details" button to see complete message content
================================================
FILE: apps/visualize-trace/app.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import os
from flask import Flask, jsonify, render_template, request
from trace_analyzer import TraceAnalyzer
# Flask application object; the route handlers below register themselves on it.
app = Flask(__name__)
# Global variable to store analyzer instance.
# It stays None until /api/load_trace succeeds; the summary, flow and debug
# endpoints below read it, so one loaded trace is shared by every request
# served by this process.
analyzer = None
@app.route("/")
def index():
    """Serve the main page by rendering the ``index.html`` template."""
    page = render_template("index.html")
    return page
@app.route("/api/list_files", methods=["GET"])
def list_files():
    """List the JSON files found directly inside a directory.

    Query parameters:
        directory: Directory to scan. ``~`` is expanded and relative paths are
            resolved against the current working directory. When missing or
            empty, the parent of the current working directory is used.

    Returns:
        200 with ``{"files": [...], "directory": ..., "message": ...}``, where
        every entry has ``name``, ``path``, ``size`` and ``modified`` and the
        list is sorted by name; 404 if the directory does not exist; 400 if
        the path is not a directory; 403 if it cannot be read; 500 on any
        other failure.
    """
    try:
        directory = request.args.get("directory", "")
        if not directory:
            # Default behavior: check parent directory
            directory = os.path.abspath("..")

        # Expand "~" and normalise to an absolute path.
        directory = os.path.abspath(os.path.expanduser(directory))

        if not os.path.exists(directory):
            return jsonify({"error": f"Directory does not exist: {directory}"}), 404

        if not os.path.isdir(directory):
            return jsonify({"error": f"Path is not a directory: {directory}"}), 400

        try:
            json_files = []
            for name in os.listdir(directory):
                if not name.endswith(".json"):
                    continue
                file_path = os.path.join(directory, name)
                # A sub-directory may also be called "something.json"; it can
                # never be loaded as a trace, so do not offer it.
                if os.path.isdir(file_path):
                    continue
                try:
                    # Get file size and modification time
                    stat = os.stat(file_path)
                    size, modified = stat.st_size, stat.st_mtime
                except OSError:
                    # Best effort: keep listing the entry (it may have vanished
                    # or be unreadable) with placeholder metadata.
                    size, modified = 0, 0
                json_files.append(
                    {
                        "name": name,
                        "path": file_path,
                        "size": size,
                        "modified": modified,
                    }
                )

            # Sort by filename
            json_files.sort(key=lambda entry: entry["name"])

            return jsonify(
                {
                    "files": json_files,
                    "directory": directory,
                    "message": f'Found {len(json_files)} JSON files in directory "{directory}"',
                }
            )
        except PermissionError:
            return jsonify(
                {"error": f"No permission to access directory: {directory}"}
            ), 403
        except Exception as e:
            return jsonify({"error": f"Failed to read directory: {str(e)}"}), 500

    except Exception as e:
        return jsonify({"error": str(e)}), 500
@app.route("/api/load_trace", methods=["POST"])
def load_trace():
    """Load a trace file and make it the analyzer used by the other endpoints.

    Request body:
        JSON object with a ``file_path`` string. Relative paths are resolved
        against the current working directory.

    Returns:
        200 with ``message``, ``file_path`` and ``file_name`` on success;
        400 when no usable ``file_path`` was supplied; 404 when the path does
        not exist; 500 when ``TraceAnalyzer`` fails to load the file.
    """
    global analyzer

    # silent=True makes a missing or malformed JSON body come back as None
    # instead of aborting the request with a non-JSON error page; a body that
    # is valid JSON but not an object (list, string, null) is treated the same.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}

    file_path = data.get("file_path")
    # os.path.* below require a string, so reject any other JSON type here.
    if not file_path or not isinstance(file_path, str):
        return jsonify({"error": "Please provide file path"}), 400

    # If it's a relative path, convert to absolute path
    if not os.path.isabs(file_path):
        file_path = os.path.abspath(file_path)

    if not os.path.exists(file_path):
        return jsonify({"error": f"File does not exist: {file_path}"}), 404

    try:
        analyzer = TraceAnalyzer(file_path)
        return jsonify(
            {
                "message": "File loaded successfully",
                "file_path": file_path,
                "file_name": os.path.basename(file_path),
            }
        )
    except Exception as e:
        return jsonify({"error": f"Failed to load file: {str(e)}"}), 500
@app.route("/api/basic_info")
def get_basic_info():
    """Return the loaded trace's basic information as JSON.

    Responds with 400 when no trace has been loaded and with 500 (carrying
    the exception text) when the analyzer raises.
    """
    if analyzer:
        try:
            payload = analyzer.get_basic_info()
            return jsonify(payload)
        except Exception as exc:
            return jsonify({"error": str(exc)}), 500
    return jsonify({"error": "Please load trace file first"}), 400
@app.route("/api/performance_summary")
def get_performance_summary():
    """Return the loaded trace's performance summary as JSON.

    Responds with 400 when no trace has been loaded and with 500 (carrying
    the exception text) when the analyzer raises.
    """
    if analyzer:
        try:
            payload = analyzer.get_performance_summary()
            return jsonify(payload)
        except Exception as exc:
            return jsonify({"error": str(exc)}), 500
    return jsonify({"error": "Please load trace file first"}), 400
@app.route("/api/execution_flow")
def get_execution_flow():
    """Return the analyzer's conversation-flow analysis as JSON.

    Responds with 400 when no trace has been loaded and with 500 (carrying
    the exception text) when the analyzer raises.
    """
    if analyzer:
        try:
            payload = analyzer.analyze_conversation_flow()
            return jsonify(payload)
        except Exception as exc:
            return jsonify({"error": str(exc)}), 500
    return jsonify({"error": "Please load trace file first"}), 400
@app.route("/api/execution_summary")
def get_execution_summary():
    """Return the loaded trace's execution summary as JSON.

    Responds with 400 when no trace has been loaded and with 500 (carrying
    the exception text) when the analyzer raises.
    """
    if analyzer:
        try:
            payload = analyzer.get_execution_summary()
            return jsonify(payload)
        except Exception as exc:
            return jsonify({"error": str(exc)}), 500
    return jsonify({"error": "Please load trace file first"}), 400
@app.route("/api/spans_summary")
def get_spans_summary():
    """Return the loaded trace's spans summary as JSON.

    Responds with 400 when no trace has been loaded and with 500 (carrying
    the exception text) when the analyzer raises.
    """
    if analyzer:
        try:
            payload = analyzer.get_spans_summary()
            return jsonify(payload)
        except Exception as exc:
            return jsonify({"error": str(exc)}), 500
    return jsonify({"error": "Please load trace file first"}), 400
@app.route("/api/step_logs_summary")
def get_step_logs_summary():
    """Return the loaded trace's step-logs summary as JSON.

    Responds with 400 when no trace has been loaded and with 500 (carrying
    the exception text) when the analyzer raises.
    """
    if analyzer:
        try:
            payload = analyzer.get_step_logs_summary()
            return jsonify(payload)
        except Exception as exc:
            return jsonify({"error": str(exc)}), 500
    return jsonify({"error": "Please load trace file first"}), 400
@app.route("/api/debug/raw_messages")
def get_raw_messages():
    """Return raw message data for debugging.

    The payload contains a per-message overview of the main agent messages
    (index, role, content length, timestamp presence, 100-character preview),
    the browser session keys, the raw main agent history, and the raw content
    of the first two browser sessions.

    Responds with 400 when no trace has been loaded and with 500 (carrying
    the exception text) when anything raises.
    """
    if not analyzer:
        return jsonify({"error": "Please load trace file first"}), 400

    try:
        main_history = analyzer.get_main_agent_history()
        browser_sessions = analyzer.get_browser_agent_sessions()

        # Get message structure overview
        main_messages = analyzer.get_main_agent_messages()
        message_structure = []
        for position, message in enumerate(main_messages):
            content_text = str(message.get("content", ""))
            if len(content_text) > 100:
                preview = content_text[:100] + "..."
            else:
                preview = content_text
            message_structure.append(
                {
                    "index": position,
                    "role": message.get("role"),
                    "content_length": len(content_text),
                    "has_timestamp": "timestamp" in message,
                    "content_preview": preview,
                }
            )

        # Only show first two sessions
        first_sessions = dict(list(browser_sessions.items())[:2])

        return jsonify(
            {
                "main_agent_history_structure": {
                    "total_messages": len(main_messages),
                    "messages": message_structure,
                },
                "browser_sessions": list(browser_sessions.keys()),
                "raw_main_history": main_history,
                "raw_browser_sessions": first_sessions,
            }
        )
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500
if __name__ == "__main__":
    # Flask's debug mode enables the Werkzeug interactive debugger, which lets
    # anyone who can reach the port execute arbitrary Python. Bind to loopback
    # by default and only keep debug mode on for loopback addresses; set
    # TRACE_VIEWER_HOST (e.g. to "0.0.0.0") to serve other machines.
    host = os.environ.get("TRACE_VIEWER_HOST", "127.0.0.1")
    debug = host in ("127.0.0.1", "localhost", "::1")
    app.run(debug=debug, host=host, port=5000)
================================================
FILE: apps/visualize-trace/pyproject.toml
================================================
[project]
name = "trace-dashboard"
version = "1.0.0"
description = "A web dashboard for analyzing trace JSON files"
# trace_analyzer.py annotates a return type as `tuple[str, str]`; that
# annotation is evaluated when the module is imported and needs the built-in
# generic types introduced in Python 3.9, so 3.8 cannot run this project.
requires-python = ">=3.9"
dependencies = [
    "flask>=2.3.3",
    "werkzeug>=2.3.7",
]
[tool.uv]
dev-dependencies = []
================================================
FILE: apps/visualize-trace/requirements.txt
================================================
flask==2.3.3
werkzeug==2.3.7
================================================
FILE: apps/visualize-trace/run.py
================================================
#!/usr/bin/env python3
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import os
import subprocess
import sys
def check_dependencies():
    """Report whether Flask can be found by the import system.

    Prints a status line, plus install instructions when Flask is missing.

    Returns:
        True if a module spec for ``flask`` is found, False otherwise.
    """
    try:
        import importlib.util

        flask_found = importlib.util.find_spec("flask") is not None
    except ImportError:
        flask_found = False

    if flask_found:
        print("✓ Flask is installed")
        return True

    print("✗ Flask is not installed")
    print("Please use the following commands to install dependencies:")
    print("  uv sync")
    print("or:")
    print("  uv pip install -r requirements.txt")
    return False
def install_dependencies():
    """Install dependencies (recommended to use uv).

    Tries ``uv sync`` first and falls back to ``pip install -r
    requirements.txt`` with the current interpreter when uv is missing or
    fails. Both commands run inside the directory that contains this script.

    Returns:
        True if one of the installers succeeded, False if the pip fallback
        failed as well.
    """
    print("Installing dependencies...")

    # pyproject.toml and requirements.txt live next to this script; running the
    # installers from there makes the launcher work from any working directory.
    project_dir = os.path.dirname(os.path.abspath(__file__))

    try:
        # Try using uv first
        try:
            subprocess.check_call(["uv", "sync"], cwd=project_dir)
            print("✓ Dependencies installed successfully using uv")
            return True
        except (subprocess.CalledProcessError, FileNotFoundError):
            # Fallback to pip
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"],
                cwd=project_dir,
            )
            print("✓ Dependencies installed successfully using pip")
            return True
    except subprocess.CalledProcessError:
        print("✗ Failed to install dependencies")
        print("Please manually run: uv sync or pip install -r requirements.txt")
        return False
def main():
    """Parse the command line, make sure Flask is available and start the app.

    Command-line options:
        -p/--port: TCP port to listen on (default: 5000).
        --host: Interface to bind to (default: 127.0.0.1).

    Returns early, without starting the server, when the dependencies are
    missing and cannot be installed.
    """
    import argparse

    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Trace Analysis Web Demo")
    parser.add_argument(
        "-p",
        "--port",
        type=int,
        default=5000,
        help="Specify port number (default: 5000)",
    )
    parser.add_argument(
        "--host",
        default="127.0.0.1",
        help="Interface to bind to (default: 127.0.0.1). "
        "Debug mode is only enabled for loopback addresses.",
    )
    args = parser.parse_args()

    print("=" * 50)
    print("Trace Analysis Web Demo")
    print("=" * 50)

    # Check dependencies
    if not check_dependencies():
        print("\nInstalling dependencies...")
        if not install_dependencies():
            print(
                "Please manually install dependencies: pip install -r requirements.txt"
            )
            return

    # Check JSON files in the directory above the one containing this script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    json_files = [
        f for f in os.listdir(os.path.join(script_dir, "..")) if f.endswith(".json")
    ]

    if not json_files:
        print("\nWarning: No JSON files found in parent directory")
        print("Please ensure trace JSON files are in the trace_analyze/ directory")
    else:
        print(f"\nFound {len(json_files)} JSON files:")
        for file in json_files[:5]:  # Only show first 5
            print(f"  - {file}")
        if len(json_files) > 5:
            print(f"  ... and {len(json_files) - 5} other files")

    # Flask's debug mode enables the Werkzeug interactive debugger, which lets
    # anyone who can reach the port execute arbitrary Python, so it is only
    # switched on when the server is bound to a loopback address.
    is_loopback = args.host in ("127.0.0.1", "localhost", "::1")
    display_host = "localhost" if is_loopback else args.host

    # Start application
    print("\nStarting web application...")
    print(f"Application will run at http://{display_host}:{args.port}")
    print("Press Ctrl+C to stop the application")
    print("=" * 50)

    try:
        # Imported late so that a just-completed dependency install is visible.
        from app import app

        app.run(debug=is_loopback, host=args.host, port=args.port)
    except KeyboardInterrupt:
        print("\nApplication stopped")
    except Exception as e:
        print(f"\nFailed to start application: {e}")
# Start the launcher only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
================================================
FILE: apps/visualize-trace/static/css/style.css
================================================
/* Global styles */
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
background-color: #f8f9fa;
}
/* Set special font for non-tool call content */
.rendered-content, .preview-text, .browser-agent-content {
font-family: 'Courier New', 'Monaco', 'Menlo', monospace;
font-size: 14px;
line-height: 1.6;
}
/* Keep MCP tool calls using original font */
.mcp-tool-call {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
/* Ensure MCP tool call content uses original font */
.mcp-tool-call * {
font-family: inherit;
}
/* Navigation button styles */
.nav-btn {
transition: all 0.3s ease;
}
.nav-btn:hover:not(:disabled) {
background-color: rgba(255, 255, 255, 0.2);
}
.nav-btn:disabled {
opacity: 0.5;
cursor: not-allowed;
}
/* File selection input group styles */
.file-navigation {
display: flex;
align-items: center;
gap: 0;
}
.file-navigation .form-select {
border-radius: 0;
border-left: 0;
border-right: 0;
}
.file-navigation .btn:first-child {
border-top-right-radius: 0;
border-bottom-right-radius: 0;
}
.file-navigation .btn:last-child {
border-top-left-radius: 0;
border-bottom-left-radius: 0;
}
/* Loading overlay */
.loading-overlay {
position: fixed;
top: 0;
left: 0;
width: 100%;
height: 100%;
background-color: rgba(0, 0, 0, 0.5);
display: flex;
justify-content: center;
align-items: center;
z-index: 9999;
}
/* Card styles */
.card {
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
border: none;
border-radius: 8px;
}
.card-header {
background-color: #f8f9fa;
border-bottom: 1px solid #dee2e6;
font-weight: 500;
}
/* Top summary panel styles */
.summary-panel {
background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
border: none;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
.summary-panel h6 {
color: #495057;
font-weight: 600;
margin-bottom: 15px;
padding-bottom: 8px;
border-bottom: 2px solid #dee2e6;
}
.summary-panel .answer-box {
background: #fff;
border: 1px solid #dee2e6;
border-radius: 6px;
padding: 8px 12px;
margin-bottom: 10px;
display: flex;
align-items: center;
gap: 10px;
}
.summary-panel .answer-label {
font-weight: 600;
color: #6c757d;
font-size: 12px;
margin-bottom: 0;
white-space: nowrap;
}
.summary-panel .answer-content {
font-size: 14px;
line-height: 1.4;
flex: 1;
}
.summary-panel .final-answer {
border-left: 4px solid #007bff;
}
.summary-panel .ground-truth {
border-left: 4px solid #28a745;
}
.summary-panel .stat-item {
background: #fff;
border: 1px solid #dee2e6;
border-radius: 6px;
padding: 8px 12px;
margin-bottom: 8px;
display: flex;
justify-content: space-between;
align-items: center;
}
.summary-panel .stat-label {
font-size: 12px;
color: #6c757d;
font-weight: 500;
}
.summary-panel .stat-value {
font-size: 14px;
font-weight: 600;
color: #495057;
}
/* Navigation panel styles */
.navigation-panel {
position: sticky;
top: 20px;
max-height: calc(100vh - 40px);
overflow-y: auto;
}
.navigation-list {
max-height: calc(100vh - 120px);
overflow-y: auto;
}
.nav-item {
padding: 8px 12px;
border-bottom: 1px solid #f1f1f1;
cursor: pointer;
transition: all 0.2s ease;
font-size: 13px;
}
.nav-item:hover {
/* Remove background color change, can add other subtle visual feedback */
}
.nav-item.active {
background-color: #007bff;
color: white;
}
.nav-item .step-number {
font-weight: bold;
color: #6c757d;
}
.nav-item.active .step-number {
color: white;
}
.nav-item .step-role {
font-size: 11px;
padding: 2px 6px;
border-radius: 3px;
margin-left: 8px;
}
.nav-item .step-role.user {
background-color: #28a745;
color: white;
}
.nav-item .step-role.assistant {
background-color: #007bff;
color: white;
}
.nav-item .step-role.tool {
background-color: #fd7e14;
color: white;
}
.nav-item .step-role.system {
background-color: #6c757d;
color: white;
}
.nav-item .step-summary {
color: #6c757d;
font-size: 12px;
margin-top: 4px;
display: -webkit-box;
-webkit-line-clamp: 2;
-webkit-box-orient: vertical;
overflow: hidden;
}
.nav-item.active .step-summary {
color: #e9ecef;
}
/* Browser sub-step navigation styles */
.nav-item.browser-sub-step {
padding-left: 24px;
font-size: 12px;
border-left: 2px solid #dee2e6;
margin-left: 8px;
}
.nav-item.browser-sub-step .step-number {
font-size: 11px;
color: #6c757d;
}
.nav-item.browser-sub-step .step-role {
font-size: 10px;
padding: 1px 4px;
}
.nav-item.browser-sub-step .step-summary {
font-size: 11px;
-webkit-line-clamp: 1;
}
.nav-item.browser-sub-step.active {
border-left-color: #007bff;
}
.nav-item .browser-toggle {
margin-left: auto;
cursor: pointer;
font-size: 12px;
color: #6c757d;
padding: 2px 4px;
border-radius: 2px;
transition: all 0.2s ease;
}
.nav-item .browser-toggle:hover {
background-color: #e9ecef;
}
.nav-item.active .browser-toggle {
color: #fff;
}
.nav-item.active .browser-toggle:hover {
background-color: rgba(255, 255, 255, 0.2);
}
.browser-sub-steps {
display: none;
}
.browser-sub-steps.expanded {
display: block;
}
/* Execution flow styles */
.execution-steps-container {
display: flex;
flex-direction: column;
gap: 16px;
}
.execution-step {
border: 1px solid #dee2e6;
border-radius: 6px;
margin-bottom: 0; /* Remove bottom margin, use gap instead */
background-color: white;
transition: all 0.3s ease;
position: relative;
}
.execution-step:hover {
box-shadow: 0 4px 8px rgba(0,0,0,0.1);
}
/* Ensure main agent steps have clear visual separation */
.execution-step[data-agent*="main_agent"] {
border-left: 4px solid #007bff;
z-index: 2;
}
/* Browser session should be indented inside main agent steps */
.browser-session {
    position: relative;
    margin-left: 20px;
    /* margin-top is intentionally not set here: the later `.browser-session`
       rule in this stylesheet sets it to 10px, and being later in the cascade
       it always overrode the 12px that used to be declared in this rule. */
}
.step-header {
padding: 12px 16px;
cursor: pointer;
position: relative;
border-radius: 6px 6px 0 0;
}
.step-header:hover {
background-color: #f8f9fa;
}
.step-header.user-message {
background-color: #e3f2fd;
border-left: 4px solid #2196f3;
}
.step-header.assistant-message {
background-color: #f3e5f5;
border-left: 4px solid #9c27b0;
}
.step-header.user-message.browser-agent {
background-color: #e8f5e8;
border-left: 4px solid #4caf50;
}
.step-header.assistant-message.browser-agent {
background-color: #fff3e0;
border-left: 4px solid #ff9800;
}
.step-header.tool-message {
background-color: #fff3e0;
border-left: 4px solid #fd7e14;
}
.step-header.system-message {
background-color: #f8f9fa;
border-left: 4px solid #6c757d;
}
.step-content {
padding: 16px;
border-top: 1px solid #dee2e6;
background-color: #f8f9fa;
}
.step-toggle {
position: absolute;
right: 16px;
top: 50%;
transform: translateY(-50%);
font-size: 14px;
color: #6c757d;
}
/* Tool call styles */
.tool-call {
background-color: #fff3cd;
border: 1px solid #ffeaa7;
border-radius: 4px;
padding: 10px;
margin: 8px 0;
}
.tool-call-header {
font-weight: 500;
color: #856404;
margin-bottom: 5px;
}
.tool-call.browser-agent {
background-color: #d4edda;
border-color: #c3e6cb;
}
.tool-call.browser-agent .tool-call-header {
color: #155724;
}
/* Browser session styles */
.browser-session {
background-color: #f8f9fa;
border: 1px solid #dee2e6;
border-radius: 4px;
margin-top: 10px;
padding: 12px;
}
.browser-session-header {
font-weight: 500;
color: #495057;
margin-bottom: 10px;
padding-bottom: 8px;
border-bottom: 1px solid #dee2e6;
}
.browser-step {
background-color: white;
border: 1px solid #e9ecef;
border-radius: 4px;
margin-bottom: 8px;
padding: 8px 12px;
}
.browser-step.user {
background-color: #f0f8ff;
}
.browser-step.assistant {
background-color: #fdf6e3;
}
.browser-step.tool {
background-color: #fff3e0;
border-left: 3px solid #fd7e14;
}
.browser-step.system {
background-color: #f8f9fa;
border-left: 3px solid #6c757d;
}
/* Statistics styles */
.stat-item {
display: flex;
justify-content: space-between;
align-items: center;
padding: 8px 0;
border-bottom: 1px solid #f0f0f0;
}
.stat-item:last-child {
border-bottom: none;
}
.stat-label {
font-weight: 500;
color: #495057;
}
.stat-value {
font-weight: 600;
color: #007bff;
}
/* Badge styles */
.badge-role {
font-size: 11px;
padding: 4px 8px;
border-radius: 12px;
font-weight: 500;
text-transform: uppercase;
}
.badge-user {
background-color: #007bff;
color: white;
}
.badge-assistant {
background-color: #6f42c1;
color: white;
}
.badge-tool {
background-color: #fd7e14;
color: white;
}
.badge-system {
background-color: #6c757d;
color: white;
}
.badge-browser {
background-color: #28a745;
color: white;
}
/* Timestamp styles */
.timestamp {
font-size: 11px;
color: #6c757d;
font-family: monospace;
}
/* Content preview styles */
.content-preview {
background-color: white;
border-radius: 4px;
padding: 8px;
margin: 8px 0;
}
.content-preview .preview-text {
line-height: 1.5;
}
.expand-preview-btn {
color: #007bff !important;
font-size: 12px;
text-decoration: none;
}
.expand-preview-btn:hover {
text-decoration: underline !important;
}
/* Step content area style adjustments */
.step-content {
padding: 16px;
border-top: 1px solid #dee2e6;
background-color: #f8f9fa;
}
.step-content h6 {
color: #495057;
font-weight: 600;
margin-bottom: 8px;
font-size: 14px;
}
/* Button styles */
.btn-sm {
font-size: 12px;
padding: 4px 12px;
}
/* Responsive styles */
@media (max-width: 768px) {
.container-fluid {
padding: 0 10px;
}
.col-md-3 {
order: 2;
}
.col-md-9 {
order: 1;
}
.step-header {
padding: 10px 12px;
}
.step-content {
padding: 12px;
}
}
/* Animation effects */
.collapse {
transition: height 0.3s ease;
}
.fade-in {
animation: fadeIn 0.3s ease-in;
}
@keyframes fadeIn {
from {
opacity: 0;
transform: translateY(10px);
}
to {
opacity: 1;
transform: translateY(0);
}
}
/* Tooltip styles */
.tooltip {
font-size: 12px;
}
/* Code styles */
.code-block {
background-color: #f8f9fa;
border: 1px solid #e9ecef;
border-radius: 6px;
padding: 12px;
font-family: 'Courier New', monospace;
font-size: 13px;
white-space: pre-wrap;
margin: 8px 0;
overflow-x: auto;
line-height: 1.4;
}
.code-block pre {
margin: 0;
padding: 0;
background: none;
border: none;
font-family: inherit;
font-size: inherit;
white-space: pre-wrap;
}
.code-block code {
background: none;
border: none;
padding: 0;
font-family: inherit;
font-size: inherit;
color: inherit;
}
/* Error styles */
.error-message {
color: #dc3545;
font-size: 14px;
margin-top: 8px;
}
.success-message {
color: #28a745;
font-size: 14px;
margin-top: 8px;
}
/* Scrollbar styles */
::-webkit-scrollbar {
width: 8px;
}
::-webkit-scrollbar-track {
background: #f1f1f1;
}
::-webkit-scrollbar-thumb {
background: #c1c1c1;
border-radius: 4px;
}
::-webkit-scrollbar-thumb:hover {
background: #a8a8a8;
}
/* MCP tool call styles */
.mcp-tool-call {
background-color: #ffffff;
border: 2px solid #007bff;
border-radius: 8px;
padding: 16px;
margin: 16px 0;
box-shadow: 0 2px 8px rgba(0,123,255,0.1);
overflow: hidden;
}
.mcp-tool-call.browser-agent {
border-color: #28a745;
background-color: #ffffff;
box-shadow: 0 2px 8px rgba(40,167,69,0.1);
}
.mcp-tool-header {
display: flex;
align-items: center;
font-weight: 600;
color: #007bff;
margin-bottom: 12px;
font-size: 14px;
padding-bottom: 8px;
border-bottom: 1px solid #e9ecef;
}
.mcp-tool-call.browser-agent .mcp-tool-header {
color: #28a745;
}
.mcp-tool-header i {
margin-right: 8px;
font-size: 16px;
}
.mcp-tool-name {
font-family: 'Courier New', monospace;
background-color: rgba(0,123,255,0.1);
padding: 4px 8px;
border-radius: 4px;
margin-left: 4px;
font-size: 13px;
}
.mcp-tool-call.browser-agent .mcp-tool-name {
background-color: rgba(40,167,69,0.1);
}
.mcp-tool-content {
margin-top: 8px;
}
.mcp-xml-structure {
font-family: 'Courier New', monospace;
background-color: #f8f9fa;
border: 1px solid #e9ecef;
border-radius: 4px;
padding: 16px;
line-height: 1.6;
font-size: 13px;
}
.xml-tag {
color: #0066cc;
font-weight: 500;
margin: 2px 0;
}
.xml-content {
    /* One shorthand covers all four sides; the separate `margin-left: 20px`
       that preceded it was overridden by this declaration anyway. */
    margin: 8px 0 8px 20px;
}
.xml-arguments {
background-color: #ffffff;
border: 1px solid #dee2e6;
border-radius: 4px;
padding: 12px;
margin: 8px 0 8px 20px;
white-space: pre-wrap;
color: #2c3e50;
font-family: 'Courier New', monospace;
font-size: 12px;
line-height: 1.5;
overflow-x: auto;
}
.mcp-tool-args {
margin-top: 8px;
}
.mcp-args-label {
font-weight: 500;
color: #495057;
margin-bottom: 6px;
font-size: 13px;
}
/* Format badge styles */
.badge-format {
    /* Size and shape */
    font-size: 10px;
    padding: 2px 6px;
    border-radius: 3px;
    font-weight: normal;
    /* Default colors (previously a second, adjacent rule with the same selector) */
    background-color: #6c757d;
    color: white;
}
/* Format badge default styles, can be extended as needed */
/* Tool ID styles */
.tool-id {
margin-top: 8px;
padding-top: 8px;
border-top: 1px solid #e9ecef;
}
/* Rendered content styles - white background */
.rendered-content {
background-color: white;
padding: 12px;
border-radius: 4px;
border: 1px solid #e9ecef;
margin: 8px 0;
line-height: 1.6;
}
.rendered-content h1 {
color: #2c3e50;
border-bottom: 2px solid #3498db;
padding-bottom: 8px;
margin-bottom: 16px;
font-size: 1.5em;
}
.rendered-content h2 {
color: #34495e;
border-bottom: 1px solid #bdc3c7;
padding-bottom: 6px;
margin-bottom: 12px;
font-size: 1.3em;
}
.rendered-content h3 {
color: #2c3e50;
margin-bottom: 10px;
font-size: 1.1em;
}
.rendered-content strong {
color: #2c3e50;
font-weight: 600;
}
.rendered-content em {
color: #7f8c8d;
font-style: italic;
}
.rendered-content ul, .rendered-content ol {
margin: 10px 0;
padding-left: 20px;
}
.rendered-content li {
margin: 4px 0;
}
.rendered-content a {
color: #3498db;
text-decoration: none;
}
.rendered-content a:hover {
text-decoration: underline;
}
.rendered-content .inline-code {
background-color: #f8f9fa;
color: #e83e8c;
padding: 2px 4px;
border-radius: 3px;
font-family: 'Courier New', monospace;
font-size: 0.9em;
}
.rendered-content .code-block {
background-color: #f8f9fa;
border: 1px solid #e9ecef;
border-radius: 4px;
margin: 8px 0;
overflow-x: auto;
}
.rendered-content .code-block pre {
margin: 0;
padding: 12px;
background: none;
border: none;
font-family: 'Courier New', monospace;
font-size: 0.9em;
line-height: 1.4;
color: #2c3e50;
}
.rendered-content .code-block code {
background: none;
padding: 0;
color: inherit;
font-family: inherit;
}
/* Improve browser agent content styles */
.browser-agent-content {
background-color: #f8fff8;
border: 1px solid #d4edda;
border-radius: 4px;
padding: 12px;
margin: 8px 0;
}
/* Improve content display in modal */
.modal-body .rendered-content {
max-height: 400px;
overflow-y: auto;
}
================================================
FILE: apps/visualize-trace/static/js/script.js
================================================
// Global variables
let currentFlowData = null;
let currentBasicInfo = null;
let currentFileList = [];
let currentFileIndex = -1;
// DOM elements
const elements = {
directoryInput: document.getElementById('directoryInput'),
browseDirectoryBtn: document.getElementById('browseDirectoryBtn'),
fileSelect: document.getElementById('fileSelect'),
prevFileBtn: document.getElementById('prevFileBtn'),
nextFileBtn: document.getElementById('nextFileBtn'),
loadBtn: document.getElementById('loadBtn'),
refreshBtn: document.getElementById('refreshBtn'),
expandAllBtn: document.getElementById('expandAllBtn'),
collapseAllBtn: document.getElementById('collapseAllBtn'),
basicInfo: document.getElementById('basicInfo'),
executionSummary: document.getElementById('executionSummary'),
performanceSummary: document.getElementById('performanceSummary'),
executionFlow: document.getElementById('executionFlow'),
spansStats: document.getElementById('spansStats'),
stepLogsStats: document.getElementById('stepLogsStats'),
loadingOverlay: document.getElementById('loadingOverlay'),
errorToast: document.getElementById('errorToast'),
successToast: document.getElementById('successToast'),
errorMessage: document.getElementById('errorMessage'),
successMessage: document.getElementById('successMessage'),
messageModal: document.getElementById('messageModal'),
messageContent: document.getElementById('messageContent'),
navigationList: document.getElementById('navigationList')
};
// Initialize
document.addEventListener('DOMContentLoaded', function() {
initializeApp();
});
/**
 * Wire up the page: attach every UI event handler, fill in the default
 * directory, bring the navigation buttons into their initial state and
 * register the global keyboard shortcuts.
 */
function initializeApp() {
    // Directory browsing: button click or Enter inside the input
    elements.browseDirectoryBtn.addEventListener('click', browseDirectory);
    elements.directoryInput.addEventListener('keypress', (event) => {
        if (event.key !== 'Enter') {
            return;
        }
        browseDirectory();
    });

    elements.fileSelect.addEventListener('change', onFileSelect);

    // Plain click handlers, bound in the same order as they are listed
    const clickBindings = [
        ['prevFileBtn', gotoPrevFile],
        ['nextFileBtn', gotoNextFile],
        ['loadBtn', loadTraceFile],
        ['refreshBtn', refreshFileList],
        ['expandAllBtn', expandAllSteps],
        ['collapseAllBtn', collapseAllSteps]
    ];
    for (const [elementKey, handler] of clickBindings) {
        elements[elementKey].addEventListener('click', handler);
    }

    // Set default directory path
    setDefaultDirectory();

    // Initialize button states
    updateNavigationButtons();

    // Add keyboard shortcut support
    document.addEventListener('keydown', handleKeyboardShortcuts);
}
// Utility functions
/** Reveal the loading overlay by dropping its `d-none` class. */
function showLoading() {
    const { loadingOverlay } = elements;
    loadingOverlay.classList.remove('d-none');
}
/** Hide the loading overlay by adding the `d-none` class. */
function hideLoading() {
    const { loadingOverlay } = elements;
    loadingOverlay.classList.add('d-none');
}
/**
 * Put `message` into the error toast and display it.
 * @param {string} message - Text to show to the user.
 */
function showError(message) {
    elements.errorMessage.textContent = message;
    new bootstrap.Toast(elements.errorToast).show();
}
/**
 * Put `message` into the success toast and display it.
 * @param {string} message - Text to show to the user.
 */
function showSuccess(message) {
    elements.successMessage.textContent = message;
    new bootstrap.Toast(elements.successToast).show();
}
/**
 * Format a timestamp for display using the zh-CN locale.
 *
 * @param {string|number|Date} timestamp - Anything accepted by `new Date()`.
 * @returns {string} The localized date/time, '' for a falsy input, or the
 *     original value when it cannot be interpreted as a date.
 */
function formatTimestamp(timestamp) {
    if (!timestamp) return '';
    try {
        const date = new Date(timestamp);
        // `new Date()` does not throw on unparseable input; it yields an
        // "Invalid Date" whose time value is NaN, so that case has to be
        // detected explicitly to fall back to the raw value.
        if (Number.isNaN(date.getTime())) {
            return timestamp;
        }
        return date.toLocaleString('zh-CN');
    } catch (e) {
        return timestamp;
    }
}
/**
 * Shorten `text` to at most `maxLength` characters, appending '...' when
 * anything was cut off.
 *
 * @param {string} text - Text to shorten; falsy values yield ''.
 * @param {number} [maxLength=100] - Maximum number of characters to keep.
 * @returns {string} The original text if it fits, otherwise the truncated text.
 */
function truncateText(text, maxLength = 100) {
    if (!text) {
        return '';
    }
    const fits = text.length <= maxLength;
    return fits ? text : text.substring(0, maxLength) + '...';
}
/**
 * Format a byte count as a human-readable size using 1024-based units.
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} E.g. '0 B', '1.5 KB', '5 MB'. Values that are not
 *     positive finite numbers are reported as '0 B'; values beyond the
 *     largest unit stay expressed in that unit.
 */
function formatFileSize(bytes) {
    const k = 1024;
    const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];

    let value = Number(bytes);
    // Covers 0 as well as negative, NaN and infinite input, all of which
    // used to produce "NaN undefined"-style output.
    if (!Number.isFinite(value) || value <= 0) return '0 B';

    // Step up one unit at a time, never past the last entry of `sizes`, so
    // the unit lookup can no longer fall outside the table.
    let unitIndex = 0;
    while (value >= k && unitIndex < sizes.length - 1) {
        value /= k;
        unitIndex++;
    }
    return parseFloat(value.toFixed(2)) + ' ' + sizes[unitIndex];
}
// Handle MCP tool call display
/**
 * Replace every MCP tool call found in `text` with a `[MCP_PLACEHOLDER_n]`
 * marker and store the markup generated for it in `placeholders` under the
 * key `MCP_PLACEHOLDER_n`.
 *
 * @param {string} text - Message content; non-string or empty values are
 *     returned unchanged.
 * @param {Map} placeholders - Receives one entry per replaced tool call
 *     (only `.set()` is called on it here).
 * @returns {string} `text` with each matched tool call replaced by its marker.
 */
function formatMcpToolCallWithPlaceholders(text, placeholders) {
if (!text || typeof text !== 'string') return text;
// MCP tool call regex - more lenient matching, including newlines
// NOTE(review): as it appears here the pattern only contains the closing
// tags; the opening tags may have been lost - confirm against the original file.
const mcpPattern = /\s*(.*?)<\/server_name>\s*(.*?)<\/tool_name>\s*\s*(.*?)\s*<\/arguments>\s*<\/use_mcp_tool>/gs;
// NOTE(review): the counter restarts at 0 on every call, so reusing one
// `placeholders` map for several texts would overwrite earlier entries - verify callers.
let placeholderCounter = 0;
return text.replace(mcpPattern, (match, serverName, toolName, args) => {
// Clean and format arguments
let formattedArgs = args.trim();
// First convert escaped newlines to actual newlines
formattedArgs = formattedArgs.replace(/\\n/g, '\n');
try {
// Try to format JSON arguments
const parsed = JSON.parse(formattedArgs);
formattedArgs = JSON.stringify(parsed, null, 2);
} catch (e) {
// If not JSON, keep as is but ensure newlines are correct
// NOTE(review): as written this replaces a newline with a newline, i.e. it changes nothing.
formattedArgs = formattedArgs.replace(/\n/g, '\n');
}
const isBrowserAgent = serverName.trim() === 'browsing-agent';
// NOTE(review): toolClass and iconClass are not referenced by the template
// as it appears below - presumably the surrounding markup used them; confirm.
const toolClass = isBrowserAgent ? 'browser-agent' : '';
const iconClass = isBrowserAgent ? 'globe' : 'cog';
// Create complete MCP tool call HTML structure
const mcpHtml = `
${serverName.trim()}.${toolName.trim()}
<use_mcp_tool>
<server_name>${serverName.trim()}</server_name>
<tool_name>${toolName.trim()}</tool_name>
<arguments>
${formattedArgs}
</arguments>
</use_mcp_tool>
`;
// Use simple placeholder ID to avoid complex JSON strings
const placeholderId = `MCP_PLACEHOLDER_${placeholderCounter++}`;
placeholders.set(placeholderId, mcpHtml);
return `[${placeholderId}]`;
});
}
// Create new format tool call HTML
function createNewFormatToolCallHTML(tool) {
const isBeowserAgent = tool.server_name.includes('browsing') || tool.server_name.includes('agent');
const toolClass = isBeowserAgent ? 'browser-agent' : '';
const iconClass = isBeowserAgent ? 'globe' : 'cog';
// Format arguments
let formattedArgs;
try {
if (typeof tool.arguments === 'string') {
formattedArgs = tool.arguments;
} else {
formattedArgs = JSON.stringify(tool.arguments, null, 2);
}
} catch (e) {
formattedArgs = String(tool.arguments);
}
return `
Shortcuts:←→ Switch files
Enter Load
Ctrl+R Refresh
Message Details
Loading...
Error
Success
================================================
FILE: apps/visualize-trace/trace_analyzer.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import json
import re
from typing import Any, Dict, List, Optional
class TraceAnalyzer:
    """
    Class for analyzing trace JSON files, convenient for reading and accessing important information

    Supports two tool call formats:
    1. Old format (MCP): Tool calls using XML tag format in content
    2. New format: Tool calls using tool_calls field directly in message

    The parsed JSON document is kept in ``self.data``; every accessor reads
    from it with ``dict.get`` so missing sections yield empty defaults.
    """

    # Old-format (MCP) tool call embedded in the message text. The tags must be
    # matched literally: without them every group is optional and the pattern
    # would "match" any text with three empty captures.
    _MCP_TOOL_CALL_RE = re.compile(
        r"<use_mcp_tool>\s*<server_name>(.*?)</server_name>\s*"
        r"<tool_name>(.*?)</tool_name>\s*"
        r"<arguments>\s*(.*?)\s*</arguments>\s*</use_mcp_tool>",
        re.DOTALL,
    )

    # Maximum number of characters kept in a step's "content_preview".
    _PREVIEW_CHARS = 200

    def __init__(self, json_file_path: str):
        """
        Initialize analyzer

        Args:
            json_file_path: Path to the JSON file
        """
        self.json_file_path = json_file_path
        self.data = self._load_json()

    def _load_json(self) -> Dict[str, Any]:
        """
        Load JSON file

        Raises:
            Exception: If the file cannot be opened or parsed; the original
                error is attached as ``__cause__``.
        """
        try:
            with open(self.json_file_path, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception as e:
            raise Exception(f"Failed to load JSON file: {e}") from e

    def _parse_new_format_tool_name(self, tool_name: str) -> tuple[str, str]:
        """
        Parse new format tool name

        Args:
            tool_name: New format tool name, for example:
                - "tool-server_name-tool_name" format
                - "agent-browsing-search_and_browse" format (browser agent)

        Returns:
            tuple: (server_name, actual_tool_name)
        """
        # Handle agent-browsing-* format (browser agent calls)
        if tool_name.startswith("agent-browsing-"):
            return "agent-browsing", tool_name[len("agent-browsing-") :]

        # Handle other agent-* formats: split on the last '-'
        if tool_name.startswith("agent-"):
            last_dash = tool_name.rfind("-")
            # The split is only meaningful when at least one character sits
            # between the "agent-" prefix and the last dash.
            if last_dash > len("agent-"):
                return tool_name[:last_dash], tool_name[last_dash + 1 :]
            return tool_name, ""

        # Handle tool-server_name-tool_name format
        if tool_name.startswith("tool-"):
            parts = tool_name.split("-", 2)
            if len(parts) >= 3:
                return parts[1], parts[2]

        # Other formats (including "tool-" names without a tool part)
        return "unknown", tool_name

    # ==================== Basic Information ====================

    def get_basic_info(self) -> Dict[str, Any]:
        """Get basic information of the task"""
        return {
            "status": self.data.get("status"),
            "task_id": self.data.get("task_id"),
            "start_time": self.data.get("start_time"),
            "end_time": self.data.get("end_time"),
            "final_boxed_answer": self.data.get("final_boxed_answer"),
            "ground_truth": self.data.get("ground_truth"),
            "final_judge_result": self.data.get("final_judge_result"),
            "judge_type": self.data.get("judge_type"),
            "error": self.data.get("error", ""),
        }

    def get_performance_summary(self) -> Dict[str, Any]:
        """Get performance summary information"""
        trace_data = self.data.get("trace_data", {})
        return trace_data.get("performance_summary", {})

    # ==================== Main Agent Message History ====================

    def get_main_agent_history(self) -> Dict[str, Any]:
        """Get main agent message history"""
        return self.data.get("main_agent_message_history", {})

    def get_main_agent_messages(self) -> List[Dict[str, Any]]:
        """Get main agent message list"""
        return self.get_main_agent_history().get("message_history", [])

    # ==================== Browser Agent Message History ====================

    def get_browser_agent_sessions(self) -> Dict[str, Any]:
        """Get all browser agent sessions"""
        # Try two possible key names; fall back to the second when the first
        # is missing or empty.
        return self.data.get(
            "browser_agent_message_history_sessions"
        ) or self.data.get("sub_agent_message_history_sessions", {})

    def get_browser_agent_session_messages(
        self, session_id: str
    ) -> List[Dict[str, Any]]:
        """Get message list for specified session"""
        session = self.get_browser_agent_sessions().get(session_id, {})
        return session.get("message_history", [])

    # ==================== MCP Tool Call Parsing ====================

    def parse_mcp_tool_call(self, text: str) -> Optional[Dict[str, Any]]:
        """
        Parse MCP tool call

        Args:
            text: Message text that may contain a ``<use_mcp_tool>`` block

        Returns:
            Dict with "server_name", "tool_name" and "arguments" for the first
            block found, or None when the text contains no such block.
            "arguments" is the decoded JSON value, or the raw string when the
            payload is not valid JSON.
        """
        match = self._MCP_TOOL_CALL_RE.search(text)
        if match is None:
            return None

        arguments_str = match.group(3).strip()
        try:
            arguments = json.loads(arguments_str)
        except json.JSONDecodeError:
            arguments = arguments_str

        return {
            "server_name": match.group(1).strip(),
            "tool_name": match.group(2).strip(),
            "arguments": arguments,
        }

    def extract_text_content(self, content: Any) -> str:
        """
        Extract text from message content

        Args:
            content: Either a list of content parts (only dict parts with
                type "text" contribute), None, or any other value

        Returns:
            Concatenated text of the parts; "" for None (tool-call-only
            messages carry no content); ``str(content)`` otherwise.
        """
        if content is None:
            return ""
        if isinstance(content, list):
            return "".join(
                item.get("text") or ""
                for item in content
                if isinstance(item, dict) and item.get("type") == "text"
            )
        return str(content)

    def _preview(self, text: str) -> str:
        """Truncate text to the preview length, marking truncation with '...'"""
        if len(text) > self._PREVIEW_CHARS:
            return text[: self._PREVIEW_CHARS] + "..."
        return text

    def _extract_tool_calls(
        self, message: Dict[str, Any], text_content: str
    ) -> List[Dict[str, Any]]:
        """
        Collect the tool calls of one assistant message in unified format

        New-format calls (``tool_calls`` field) come first, in message order,
        followed by at most one old-format MCP call parsed from the text.
        """
        calls: List[Dict[str, Any]] = []

        for tool_call in message.get("tool_calls") or []:
            if "function" not in tool_call:
                continue
            function_info = tool_call["function"]
            arguments = function_info.get("arguments", "")
            # Arguments usually arrive as a JSON string; keep the raw string
            # when it does not decode.
            if isinstance(arguments, str):
                try:
                    arguments = json.loads(arguments)
                except json.JSONDecodeError:
                    pass
            server_name, actual_tool_name = self._parse_new_format_tool_name(
                function_info.get("name", "")
            )
            calls.append(
                {
                    "server_name": server_name,
                    "tool_name": actual_tool_name,
                    "arguments": arguments,
                    "id": tool_call.get("id", ""),
                    "type": tool_call.get("type", "function"),
                    "format": "new",
                }
            )

        # Check for old format MCP tool calls (maintain compatibility)
        mcp_tool_call = self.parse_mcp_tool_call(text_content)
        if mcp_tool_call is not None:
            mcp_tool_call["format"] = "mcp"  # Mark as old format
            calls.append(mcp_tool_call)

        return calls

    @staticmethod
    def _sub_agent_session_id(server_name: str, call_number: int) -> Optional[str]:
        """Return the session key for a sub-agent call, or None for plain tools"""
        if server_name.startswith("agent-"):
            return f"{server_name}_{call_number}"
        if server_name.startswith("browsing-agent"):
            return f"browser_agent_{call_number}"
        return None

    def analyze_conversation_flow(self) -> List[Dict[str, Any]]:
        """
        Analyze conversation flow, including tool calls

        Returns:
            One step dict per main-agent message. Assistant steps carry their
            tool calls; a call to a sub agent additionally sets
            "browser_session" (numbered by the running count of sub-agent
            calls) and, when that session exists in the trace, "browser_flow".
        """
        flow_steps = []
        sub_agent_sessions = self.get_browser_agent_sessions()
        sub_agent_call_count = 0

        for i, message in enumerate(self.get_main_agent_messages()):
            role = message.get("role")
            text_content = self.extract_text_content(message.get("content", []))

            step = {
                "step_id": i,
                "agent": "main_agent",
                "role": role,
                "content_preview": self._preview(text_content),
                "full_content": text_content,
                "tool_calls": [],
                "browser_session": None,
                "timestamp": message.get("timestamp", ""),
                "browser_flow": [],
            }

            # Only assistant messages can contain tool calls
            if role == "assistant":
                step["tool_calls"] = self._extract_tool_calls(message, text_content)
                for call in step["tool_calls"]:
                    session_id = self._sub_agent_session_id(
                        call["server_name"], sub_agent_call_count + 1
                    )
                    if session_id is None:
                        continue
                    sub_agent_call_count += 1
                    step["browser_session"] = session_id
                    # Analyze browser session conversation flow
                    if session_id in sub_agent_sessions:
                        step["browser_flow"] = self.analyze_browser_session_flow(
                            session_id
                        )

            flow_steps.append(step)

        return flow_steps

    def analyze_browser_session_flow(self, session_id: str) -> List[Dict[str, Any]]:
        """
        Analyze browser session conversation flow

        Args:
            session_id: Key of the session in the browser/sub-agent sessions

        Returns:
            One step dict per message of the session (empty list when the
            session is unknown); assistant steps carry their tool calls.
        """
        browser_flow = []

        for i, message in enumerate(
            self.get_browser_agent_session_messages(session_id)
        ):
            role = message.get("role")
            text_content = self.extract_text_content(message.get("content", []))

            step = {
                "step_id": i,
                "agent": session_id,
                "role": role,
                "content_preview": self._preview(text_content),
                "full_content": text_content,
                "tool_calls": [],
                "timestamp": message.get("timestamp", ""),
            }

            # Only assistant messages can contain tool calls
            if role == "assistant":
                step["tool_calls"] = self._extract_tool_calls(message, text_content)

            browser_flow.append(step)

        return browser_flow

    def get_execution_summary(self) -> Dict[str, Any]:
        """
        Get execution summary information

        Returns:
            Step count, total tool calls (main agent plus browser sessions),
            the list and count of browser sessions, and a per-tool usage count.
        """
        flow_steps = self.analyze_conversation_flow()

        tool_calls = []
        browser_sessions = []
        for step in flow_steps:
            tool_calls.extend(step["tool_calls"])
            if step.get("browser_session"):
                browser_sessions.append(step["browser_session"])
            # Collect tool calls from browser sessions
            for browser_step in step.get("browser_flow") or []:
                tool_calls.extend(browser_step.get("tool_calls") or [])

        # Tool usage statistics
        tool_usage: Dict[str, int] = {}
        for tool in tool_calls:
            # New-format calls without a recognizable server are keyed by tool
            # name only; everything else uses server_name.tool_name.
            if tool.get("format") == "new" and tool.get("server_name") == "unknown":
                key = tool["tool_name"]
            else:
                key = f"{tool['server_name']}.{tool['tool_name']}"
            tool_usage[key] = tool_usage.get(key, 0) + 1

        return {
            "total_steps": len(flow_steps),
            "total_tool_calls": len(tool_calls),
            "browser_sessions_count": len(browser_sessions),
            "tool_usage_distribution": tool_usage,
            "browser_sessions": browser_sessions,
        }

    def get_spans_summary(self) -> Dict[str, Any]:
        """
        Get spans statistical summary

        Returns:
            Total span count and duration, plus per-agent count, duration and
            the distinct span names in first-seen order.
        """
        trace_data = self.data.get("trace_data", {})
        spans = trace_data.get("spans", [])

        agent_stats: Dict[Any, Dict[str, Any]] = {}
        for span in spans:
            agent = span.get("agent_context", "unknown")
            stats = agent_stats.setdefault(
                agent,
                # A dict is used as an insertion-ordered set of span names so
                # the resulting list is deterministic.
                {"count": 0, "total_duration": 0, "span_types": {}},
            )
            stats["count"] += 1
            stats["total_duration"] += span.get("duration_seconds", 0)
            stats["span_types"][span.get("name", "unknown")] = None

        # Convert the ordered-set dicts to plain lists
        for stats in agent_stats.values():
            stats["span_types"] = list(stats["span_types"])

        return {
            "total_spans": len(spans),
            "total_duration": sum(span.get("duration_seconds", 0) for span in spans),
            "agent_stats": agent_stats,
        }

    def get_step_logs_summary(self) -> Dict[str, Any]:
        """Get step logs summary statistics"""
        logs = self.data.get("step_logs", [])

        status_count: Dict[Any, int] = {}
        step_type_count: Dict[Any, int] = {}
        for log in logs:
            status = log.get("status", "unknown")
            step_name = log.get("step_name", "unknown")
            status_count[status] = status_count.get(status, 0) + 1
            step_type_count[step_name] = step_type_count.get(step_name, 0) + 1

        return {
            "total_logs": len(logs),
            "status_distribution": status_count,
            "step_type_distribution": step_type_count,
        }
================================================
FILE: assets/LOCAL-TOOL-DEPLOYMENT.md
================================================
# Local Tool Deployment Guide
This guide explains how to deploy open-source tools locally for use with MiroThinker. These tools are optional enhancements that can replace commercial alternatives in your agent configuration.
## Overview
MiroThinker supports several optional open-source tools that you can deploy locally:
- **Audio Transcription**: Whisper-Large-v3-Turbo for transcribing audio files
- **Visual Question Answering**: Qwen2.5-VL-72B-Instruct for answering questions about images
- **Reasoning Engine**: Qwen3-235B-A22B-Thinking-2507 for complex reasoning tasks
These tools are used when you configure your agent with `tool-transcribe-os`, `tool-vqa-os`, or `tool-reasoning-os` in your agent configuration file.
## Prerequisites
- **GPU**: NVIDIA GPU with sufficient VRAM
- **Python 3.10+**
- **CUDA**: Compatible CUDA toolkit installed
- **Model Storage**: Sufficient disk space to download model checkpoints
## Tool Deployment
### 1. Audio Transcription Tool (`tool-transcribe-os`)
**Model**: [Whisper-Large-v3-Turbo](https://huggingface.co/openai/whisper-large-v3-turbo)
**Description**: Transcribes audio files (MP3, WAV, M4A, AAC, OGG, FLAC, WMA) to text. Supports both local files and remote URLs.
**Deployment with vLLM**:
```bash
# Install vLLM with audio support
pip install vllm==0.10.0
pip install vllm[audio]
# Start the server
vllm serve openai/whisper-large-v3-turbo \
--served-model-name whisper-large-v3-turbo \
--task transcription \
--host 0.0.0.0 \
--port 8000
```
**Configuration in `.env`**:
```bash
WHISPER_MODEL_NAME="openai/whisper-large-v3-turbo"
WHISPER_API_KEY=your_api_key # Optional, if your server requires authentication
WHISPER_BASE_URL="http://0.0.0.0:8000/v1"
```
### 2. Visual Question Answering Tool (`tool-vqa-os`)
**Model**: [Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct)
**Description**: Answers questions about images. Supports local image files and URLs. Automatically encodes local images to Base64 for API requests. Compatible with JPEG, PNG, GIF formats.
**Deployment with SGLang**:
```bash
# Install SGLang
pip install sglang[all]
# Start the server
python3 -m sglang.launch_server \
--model-path Qwen/Qwen2.5-VL-72B-Instruct \
--tp 8 \
--host 0.0.0.0 \
--port 8001 \
--trust-remote-code \
--enable-metrics
```
**Configuration in `.env`**:
```bash
VISION_MODEL_NAME="Qwen/Qwen2.5-VL-72B-Instruct"
VISION_API_KEY=your_api_key # Optional, if your server requires authentication
VISION_BASE_URL="http://0.0.0.0:8001/v1/chat/completions"
```
### 3. Reasoning Engine Tool (`tool-reasoning-os`)
**Model**: [Qwen3-235B-A22B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507)
**Description**: A reasoning service for solving complex analytical problems, such as advanced mathematics, puzzles, and riddles. Supports long-context reasoning tasks (up to 131K tokens).
**Deployment with SGLang**:
```bash
# Install SGLang
pip install sglang[all]
# Start the server
python3 -m sglang.launch_server \
--model-path Qwen/Qwen3-235B-A22B-Thinking-2507 \
--tp 8 \
--host 0.0.0.0 \
--port 8002 \
--trust-remote-code \
--context-length 131072 \
--enable-metrics
```
**Configuration in `.env`**:
```bash
REASONING_MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507"
REASONING_API_KEY=your_api_key # Optional, if your server requires authentication
REASONING_BASE_URL="http://0.0.0.0:8002/v1/chat/completions"
```
## Using Deployed Tools
Once you have deployed the tools, configure your agent to use them:
1. **Edit your agent configuration** (e.g., `apps/miroflow-agent/conf/agent/my_custom_config.yaml`):
```yaml
main_agent:
tools:
- tool-python
- search_and_scrape_webpage
- jina_scrape_llm_summary
- tool-transcribe-os # Use local Whisper deployment
- tool-vqa-os # Use local Qwen2.5-VL deployment
- tool-reasoning-os # Use local Qwen3-235B deployment
max_turns: 400
```
2. **Configure environment variables** in `apps/miroflow-agent/.env` as shown in each tool's deployment section above.
1. **Run your agent**:
```bash
cd apps/miroflow-agent
uv run main.py llm=qwen-3 agent=my_custom_config llm.base_url=https://your_base_url/v1
```
## Commercial Alternatives
If you prefer not to deploy these tools locally, you can use commercial alternatives:
- **`tool-transcribe`**: Uses OpenAI's GPT-4o mini Transcribe API
- **`tool-vqa`**: Uses Claude Sonnet 3.7 API
- **`tool-reasoning`**: Uses Claude Sonnet 3.7 API
Simply replace `-os` versions with commercial versions in your agent configuration and configure the corresponding API keys (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`).
## Additional Resources
- **SGLang Documentation**: [https://sglang.readthedocs.io/](https://sglang.readthedocs.io/)
- **vLLM Documentation**: [https://docs.vllm.ai/](https://docs.vllm.ai/)
- **Model Cards**: Check HuggingFace model pages for specific requirements and recommendations
================================================
FILE: assets/QA.md
================================================
# MiroFlow QA Documentation
## Q1: Can I extract GAIA-Text-103 results from existing GAIA-Validation evaluations?
**Answer:** Yes! If you have completed GAIA-Validation evaluations, you can extract and re-grade the GAIA-Text-103 subset using our specialized tools.
### Step-by-Step Process
1. **Extract GAIA-Text-103 Tasks**
```bash
# Extract text-103 tasks to a separate directory
uv run benchmarks/subset_extraction/gaia-to-text-103-mover.py ../../logs/gaia-validation/0806/qwen_MiroThinker-32B-SFT_evaluation
```
This creates a new directory: `gaia-text-103-extraction/qwen_MiroThinker-32B-SFT_evaluation`
1. **Re-grade with GAIA-Text-103 Evaluator**
```bash
# Apply GAIA-Text-103 specific grading
uv run benchmarks/subset_extraction/gaia-text-103-grader.py ../../logs/gaia-validation/0806/gaia-text-103-extraction
```
1. **Verify Results**
```bash
# Check accuracy and generate statistics
uv run benchmarks/check_progress/check_progress_gaia-validation-text-103.py ../../logs/gaia-validation/0806/gaia-text-103-extraction
```
## Q2: Does the choice of judgment model affect evaluation performance?
**Answer:** Yes, there is a measurable difference in evaluation outcomes between the two judgment models.
We have standardized on GPT-4.1-2025-04-14 as our primary judgment model for several practical reasons:
- **Ease of deployment:** No need to host additional GPU-intensive models
- **Consistency:** Aligns with evaluation standards used in other benchmarks (SimpleQA, BrowseComp)
- **Reproducibility:** Provides a consistent baseline for cross-evaluation comparisons
## Code Quality Checks
Before submitting a pull request, ensure your code meets our quality standards:
```bash
# Fix linting issues automatically
uv tool run ruff@0.8.0 check --fix .
# Format code according to our style guidelines
uv tool run ruff@0.8.0 format .
```
## Known Issues
- The context management component that runs before the summary step requires further refinement to improve accuracy and reliability. The likely cause is that the context-length estimation is not accurate.
================================================
FILE: assets/qwen3_nonthinking.jinja
================================================
{#- Qwen3 chat template, non-thinking variant: the generation prompt always opens with an empty think block. #}
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{#- Find the last genuine user query; user turns that only wrap a tool response do not count. #}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
    {%- set index = (messages|length - 1) - loop.index0 %}
    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
        {%- set ns.multi_step_tool = false %}
        {%- set ns.last_query_index = index %}
    {%- endif %}
{%- endfor %}
{%- for message in messages %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {%- set content = message.content %}
        {%- set reasoning_content = '' %}
        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
            {%- set reasoning_content = message.reasoning_content %}
        {%- else %}
            {#- Split inline reasoning from the visible answer at the closing think tag. #}
            {%- if '</think>' in message.content %}
                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
            {%- endif %}
        {%- endif %}
        {#- Reasoning is only re-emitted for assistant turns after the last user query. #}
        {%- if loop.index0 > ns.last_query_index %}
            {%- if loop.last or (not loop.last and reasoning_content) %}
                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
            {%- else %}
                {{- '<|im_start|>' + message.role + '\n' + content }}
            {%- endif %}
        {%- else %}
            {{- '<|im_start|>' + message.role + '\n' + content }}
        {%- endif %}
        {%- if message.tool_calls %}
            {%- for tool_call in message.tool_calls %}
                {%- if (loop.first and content) or (not loop.first) %}
                    {{- '\n' }}
                {%- endif %}
                {%- if tool_call.function %}
                    {%- set tool_call = tool_call.function %}
                {%- endif %}
                {{- '<tool_call>\n{"name": "' }}
                {{- tool_call.name }}
                {{- '", "arguments": ' }}
                {%- if tool_call.arguments is string %}
                    {{- tool_call.arguments }}
                {%- else %}
                    {{- tool_call.arguments | tojson }}
                {%- endif %}
                {{- '}\n</tool_call>' }}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {#- Consecutive tool messages are grouped into a single user turn. #}
        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- message.content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n<think>\n\n</think>\n\n' }}
{%- endif %}
================================================
FILE: justfile
================================================
default:
    just --list

# lint monorepo
[group('precommit')]
lint:
    uv tool run ruff@0.8.0 check --fix .

# sort imports
[group('precommit')]
sort-imports:
    uv tool run ruff@0.8.0 check --select I --fix .

# format monorepo
[group('precommit')]
format:
    uv tool run ruff@0.8.0 format .

# check license
[group('precommit')]
check-license:
    uv run reuse lint

# insert license for contributor
insert-license:
    # https://reuse.readthedocs.io/en/stable/scripts.html#add-headers-to-staged-files-based-on-git-settings
    git diff --name-only --cached | xargs -I {} reuse annotate -c "$(git config --get user.name) <$(git config --get user.email)>" "{}"

# NOTE: file names are passed NUL-delimited (-print0 / -0) so Markdown paths
# containing spaces or quotes are not split into several arguments.

# format markdown files
[group('precommit')]
format-md:
    find . -name "*.md" -type f -print0 | xargs -0 uv tool run mdformat@0.7.17

# run precommit before PR
[group('precommit')]
precommit: lint sort-imports format-md format
================================================
FILE: libs/miroflow-tools/README.md
================================================
# 🛠️ MiroFlow Tools
> A comprehensive tool management system and MCP (Model Context Protocol) server collection for MiroFlow, providing a unified interface to various AI capabilities including code execution, vision processing, audio transcription, web searching, reasoning, and document reading.
## ✨ Features
- **🔧 Unified Tool Management**: Centralized `ToolManager` for managing multiple MCP servers
- **🌐 Multiple Transport Protocols**: Support for both stdio and SSE (HTTP) connections
- **📦 Rich Tool Ecosystem**: Pre-built MCP servers for common AI tasks
- **⚙️ Flexible Configuration**: Tool blacklisting, timeout management, and custom server configurations
- **🛡️ Error Handling**: Robust retry logic and fallback mechanisms
## 📦 Installation
This package is a local dependency that is automatically installed when you run `uv sync` in the `apps/miroflow-agent` directory. No separate installation is required.
For standalone usage or development:
```bash
cd libs/miroflow-tools
uv sync
```
## 📋 MCP Servers Overview
Quick reference tables of all available MCP servers and their tools. Click on "Details" to jump to the full documentation.
### 📊 Tools Used in MiroThinker v1.0 and v1.5
The following tools were used in the MiroThinker v1.0 and v1.5 evaluation:
| Category | Server Name | Tools | Key Environment Variables | Link |
|----------------------------|-----------------------------|----------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|------------------------------------------|
| **Execution Environment** | `tool-python` | `create_sandbox`, `run_command`, `run_python_code` | `E2B_API_KEY`, `LOGS_DIR` | [Details](#tool-python) |
| **File Management** | `tool-python` | `upload_file_from_local_to_sandbox`, `download_file_from_sandbox_to_local`, `download_file_from_internet_to_sandbox` | `E2B_API_KEY`, `LOGS_DIR` | [Details](#tool-python) |
| **Information Retrieval** | `search_and_scrape_webpage` | `google_search` | `SERPER_API_KEY`, `SERPER_BASE_URL` | [Details](#search_and_scrape_webpage) |
| **Information Retrieval** | `jina_scrape_llm_summary` | `scrape_and_extract_info` | `JINA_API_KEY`, `JINA_BASE_URL`, `SUMMARY_LLM_BASE_URL`, `SUMMARY_LLM_MODEL_NAME`, `SUMMARY_LLM_API_KEY` | [Details](#jina_scrape_llm_summary) |
### 🔧 Additional Available Tools
The following tools are implemented but were not used in the MiroThinker v1.0/v1.5 evaluation:
| Category | Server Name | Tools | Key Environment Variables | Link |
|-----------------------------|----------------------|---------------------------------------------------|---------------------------------------------------------------------|--------------------------------|
| **Web Searching** | `tool-google-search` | `google_search`, `scrape_website` | `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL` | [Details](#tool-google-search) |
| **Web Searching (Sogou)** | `tool-sogou-search` | `sogou_search`, `scrape_website` | `TENCENTCLOUD_SECRET_ID`, `TENCENTCLOUD_SECRET_KEY`, `JINA_API_KEY`, `JINA_BASE_URL` | [Details](#tool-sogou-search) |
| **Vision Processing** | `tool-vqa` | `visual_question_answering` | `ANTHROPIC_API_KEY`, `ANTHROPIC_BASE_URL` | [Details](#tool-vqa) |
| **Vision Processing** | `tool-vqa-os` | `visual_question_answering` | `VISION_API_KEY`, `VISION_BASE_URL`, `VISION_MODEL_NAME` | [Details](#tool-vqa-os) |
| **Audio Processing** | `tool-transcribe` | `audio_transcription`, `audio_question_answering` | `OPENAI_API_KEY`, `OPENAI_BASE_URL` | [Details](#tool-transcribe) |
| **Audio Processing** | `tool-transcribe-os` | `audio_transcription` | `WHISPER_API_KEY`, `WHISPER_BASE_URL`, `WHISPER_MODEL_NAME` | [Details](#tool-transcribe-os) |
| **Document Reading** | `tool-reading` | `convert_to_markdown` | None required | [Details](#tool-reading) |
| **Reasoning Engine** | `tool-reasoning` | `reasoning` | `ANTHROPIC_API_KEY`, `ANTHROPIC_BASE_URL` | [Details](#tool-reasoning) |
| **Reasoning Engine** | `tool-reasoning-os` | `reasoning` | `REASONING_API_KEY`, `REASONING_BASE_URL`, `REASONING_MODEL_NAME` | [Details](#tool-reasoning-os) |
## 🚀 Quick Start
Click to expand code example
```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters
async def main():
# Initialize tool manager with server configurations
server_configs = [
{
"name": "tool-python",
"params": StdioServerParameters(
command="python",
args=["-m", "miroflow_tools.mcp_servers.python_mcp_server"],
env={"E2B_API_KEY": "your_e2b_api_key"} # Required for Python execution
)
},
# Add more server configurations...
]
tool_manager = ToolManager(server_configs)
# Get all available tool definitions
tool_definitions = await tool_manager.get_all_tool_definitions()
# Create a sandbox first
sandbox_result = await tool_manager.execute_tool_call(
server_name="tool-python",
tool_name="create_sandbox",
arguments={"timeout": 600}
)
# Extract sandbox_id from result
sandbox_id = sandbox_result['result'].split('sandbox_id:')[-1].strip()
# Execute a tool call
result = await tool_manager.execute_tool_call(
server_name="tool-python",
tool_name="run_python_code",
arguments={"code_block": "print('Hello, World!')", "sandbox_id": sandbox_id}
)
print(result)
if __name__ == "__main__":
asyncio.run(main())
```
## 🔧 ToolManager
The `ToolManager` class is the central component for managing and executing tools across multiple MCP servers.
### Key Features
- **🔌 Multi-Server Support**: Manage tools from multiple MCP servers simultaneously
- **🔗 Connection Management**: Automatic connection handling for stdio and SSE transports
- **🚫 Tool Blacklisting**: Filter out specific tools from specific servers
- **📝 Structured Logging**: Optional task logging integration
- **🔄 Error Recovery**: Automatic retry logic and fallback mechanisms
### Methods
- `get_all_tool_definitions()`: Retrieve tool schemas from all configured servers
- `execute_tool_call(server_name, tool_name, arguments)`: Execute a specific tool
- `set_task_log(task_log)`: Enable structured logging
- `get_server_params(server_name)`: Get configuration for a specific server
### Example Usage
Click to expand code example
```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters
async def main():
# Configure servers
server_configs = [
{
"name": "python-server",
"params": StdioServerParameters(
command="python",
args=["-m", "miroflow_tools.mcp_servers.python_mcp_server"],
env={"E2B_API_KEY": "your_key"}
)
}
]
# Initialize with optional blacklist
tool_blacklist = {("python-server", "some_tool")}
manager = ToolManager(server_configs, tool_blacklist=tool_blacklist)
# Enable logging
# manager.set_task_log(your_task_logger)
# Get tools
tools = await manager.get_all_tool_definitions()
# Create a sandbox first (required before running code)
sandbox_result = await manager.execute_tool_call(
server_name="python-server",
tool_name="create_sandbox",
arguments={"timeout": 600}
)
sandbox_id = sandbox_result['result'].split('sandbox_id:')[-1].strip()
# Execute tool
result = await manager.execute_tool_call(
server_name="python-server",
tool_name="run_python_code",
arguments={"code_block": "1 + 1", "sandbox_id": sandbox_id}
)
print(result)
if __name__ == "__main__":
asyncio.run(main())
```
## 🔌 MCP Servers
### Server: tool-python
Execute Python code in isolated E2B sandboxes with persistent sessions.
**Tools**:
- 🔨 `create_sandbox(timeout=600)`: Create a new Linux sandbox
- 🐍 `run_python_code(code_block, sandbox_id)`: Execute Python code
- 💻 `run_command(command, sandbox_id)`: Run shell commands
- ⬆️ `upload_file_from_local_to_sandbox(sandbox_id, local_file_path, sandbox_file_path)`: Upload files
- ⬇️ `download_file_from_internet_to_sandbox(sandbox_id, url, sandbox_file_path)`: Download a file from a URL into the sandbox
- 💾 `download_file_from_sandbox_to_local(sandbox_id, sandbox_file_path, local_filename)`: Download a file from the sandbox to the local machine
**Environment Variables**:
- 🔑 `E2B_API_KEY`: E2B API key (required)
- 📁 `LOGS_DIR`: Directory for temporary files (default: `../../logs`)
**Example**:
Click to expand code example
```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters
async def main():
# Configure server with environment variables
server_configs = [
{
"name": "tool-python",
"params": StdioServerParameters(
command="python",
args=["-m", "miroflow_tools.mcp_servers.python_mcp_server"],
env={"E2B_API_KEY": "your_e2b_api_key"}
)
}
]
manager = ToolManager(server_configs)
# Create sandbox
result = await manager.execute_tool_call(
server_name="tool-python",
tool_name="create_sandbox",
arguments={"timeout": 600}
)
# Extract sandbox_id from result
sandbox_id = result['result'].split('sandbox_id:')[-1].strip()
# Run code
result = await manager.execute_tool_call(
server_name="tool-python",
tool_name="run_python_code",
arguments={"code_block": "import numpy as np; print(np.array([1,2,3]))", "sandbox_id": sandbox_id}
)
print(result)
if __name__ == "__main__":
asyncio.run(main())
```
### Server: tool-vqa
Analyze images and answer questions about visual content using Anthropic Claude.
**Tools**:
- 👁️ `visual_question_answering(image_path_or_url, question)`: Answer questions about images
**Environment Variables**:
- 🔑 `ANTHROPIC_API_KEY`: Anthropic API key (required)
- 🌐 `ANTHROPIC_BASE_URL`: API base URL (default: `https://api.anthropic.com`)
**Example**:
Click to expand code example
```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters
async def main():
server_configs = [
{
"name": "tool-vqa",
"params": StdioServerParameters(
command="python",
args=["-m", "miroflow_tools.mcp_servers.vision_mcp_server"],
env={
"ANTHROPIC_API_KEY": "your_anthropic_api_key",
"ANTHROPIC_BASE_URL": "https://api.anthropic.com"
}
)
}
]
manager = ToolManager(server_configs)
result = await manager.execute_tool_call(
server_name="tool-vqa",
tool_name="visual_question_answering",
arguments={
"image_path_or_url": "https://example.com/image.jpg",
"question": "What is in this image?"
}
)
print(result)
if __name__ == "__main__":
asyncio.run(main())
```
### Server: tool-vqa-os
Analyze images and answer questions about visual content using open-source compatible models.
**Tools**:
- 👁️ `visual_question_answering(image_path_or_url, question)`: Answer questions about images
**Environment Variables**:
- 🔑 `VISION_API_KEY`: API key (required)
- 🌐 `VISION_BASE_URL`: API endpoint URL (required)
- 🤖 `VISION_MODEL_NAME`: Model name (required)
**Example**:
Click to expand code example
```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters
async def main():
server_configs = [
{
"name": "tool-vqa-os",
"params": StdioServerParameters(
command="python",
args=["-m", "miroflow_tools.mcp_servers.vision_mcp_server_os"],
env={
"VISION_API_KEY": "your_vision_api_key",
"VISION_BASE_URL": "your_vision_base_url",
"VISION_MODEL_NAME": "your_vision_model_name"
}
)
}
]
manager = ToolManager(server_configs)
result = await manager.execute_tool_call(
server_name="tool-vqa-os",
tool_name="visual_question_answering",
arguments={
"image_path_or_url": "https://example.com/image.jpg",
"question": "What is in this image?"
}
)
print(result)
if __name__ == "__main__":
asyncio.run(main())
```
### Server: tool-transcribe
Transcribe audio files and answer questions about audio content using OpenAI Whisper.
**Tools**:
- 🎤 `audio_transcription(audio_path_or_url)`: Transcribe audio to text
- 🎧 `audio_question_answering(audio_path_or_url, question)`: Answer questions about audio
**Environment Variables**:
- 🔑 `OPENAI_API_KEY`: OpenAI API key (required)
- 🌐 `OPENAI_BASE_URL`: API base URL (default: `https://api.openai.com/v1`)
**Supported Formats**: 🎵 MP3, WAV, M4A, AAC, OGG, FLAC
**Example**:
Click to expand code example
```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters
async def main():
server_configs = [
{
"name": "tool-transcribe",
"params": StdioServerParameters(
command="python",
args=["-m", "miroflow_tools.mcp_servers.audio_mcp_server"],
env={
"OPENAI_API_KEY": "your_openai_api_key",
"OPENAI_BASE_URL": "https://api.openai.com/v1"
}
)
}
]
manager = ToolManager(server_configs)
# Transcribe audio
result = await manager.execute_tool_call(
server_name="tool-transcribe",
tool_name="audio_transcription",
arguments={"audio_path_or_url": "/path/to/audio.mp3"}
)
print(result)
# Answer questions about audio
result = await manager.execute_tool_call(
server_name="tool-transcribe",
tool_name="audio_question_answering",
arguments={
"audio_path_or_url": "/path/to/audio.mp3",
"question": "What is the main topic discussed?"
}
)
print(result)
if __name__ == "__main__":
asyncio.run(main())
```
### Server: tool-transcribe-os
Transcribe audio files using open-source compatible models.
**Tools**:
- 🎤 `audio_transcription(audio_path_or_url)`: Transcribe audio to text
**Environment Variables**:
- 🔑 `WHISPER_API_KEY`: API key (required)
- 🌐 `WHISPER_BASE_URL`: API endpoint URL (required)
- 🤖 `WHISPER_MODEL_NAME`: Model name (required)
**Supported Formats**: 🎵 MP3, WAV, M4A, AAC, OGG, FLAC
**Example**:
Click to expand code example
```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters
async def main():
server_configs = [
{
"name": "tool-transcribe-os",
"params": StdioServerParameters(
command="python",
args=["-m", "miroflow_tools.mcp_servers.audio_mcp_server_os"],
env={
"WHISPER_API_KEY": "your_whisper_api_key",
"WHISPER_BASE_URL": "your_whisper_base_url",
"WHISPER_MODEL_NAME": "your_whisper_model_name"
}
)
}
]
manager = ToolManager(server_configs)
result = await manager.execute_tool_call(
server_name="tool-transcribe-os",
tool_name="audio_transcription",
arguments={"audio_path_or_url": "/path/to/audio.mp3"}
)
print(result)
if __name__ == "__main__":
asyncio.run(main())
```
### Server: tool-reading
Convert various document formats to Markdown using MarkItDown.
**Tools**:
- 📄 `convert_to_markdown(uri)`: Convert documents (PDF, DOC, PPT, Excel, CSV, ZIP, etc.) to Markdown. URI must start with `file:`, `data:`, `http:`, or `https:` scheme.
**Supported Formats**: 📄 PDF, DOC, DOCX, PPT, PPTX, XLS, XLSX, CSV, ZIP, and more
**Example**:
Click to expand code example
```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters
async def main():
# Configure server (no additional environment variables required)
server_configs = [
{
"name": "tool-reading",
"params": StdioServerParameters(
command="python",
args=["-m", "miroflow_tools.mcp_servers.reading_mcp_server"]
)
}
]
manager = ToolManager(server_configs)
result = await manager.execute_tool_call(
server_name="tool-reading",
tool_name="convert_to_markdown",
arguments={"uri": "file:///path/to/document.pdf"}
)
print(result)
if __name__ == "__main__":
asyncio.run(main())
```
### Server: tool-reasoning
Solve complex reasoning problems requiring chain-of-thought using Anthropic Claude with thinking.
**Tools**:
- 🧠 `reasoning(question)`: Solve hard math problems, puzzles, riddles, and IQ test questions
**Environment Variables**:
- 🔑 `ANTHROPIC_API_KEY`: Anthropic API key (required)
- 🌐 `ANTHROPIC_BASE_URL`: API base URL (default: `https://api.anthropic.com`)
**Example**:
Click to expand code example
```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters
async def main():
server_configs = [
{
"name": "tool-reasoning",
"params": StdioServerParameters(
command="python",
args=["-m", "miroflow_tools.mcp_servers.reasoning_mcp_server"],
env={
"ANTHROPIC_API_KEY": "your_anthropic_api_key",
"ANTHROPIC_BASE_URL": "https://api.anthropic.com"
}
)
}
]
manager = ToolManager(server_configs)
result = await manager.execute_tool_call(
server_name="tool-reasoning",
tool_name="reasoning",
arguments={"question": "Solve: If a train travels 60 mph for 2 hours, then 80 mph for 1 hour, what's the average speed?"}
)
print(result)
if __name__ == "__main__":
asyncio.run(main())
```
### Server: tool-reasoning-os
Solve complex reasoning problems requiring chain-of-thought using open-source compatible models.
**Tools**:
- 🧠 `reasoning(question)`: Solve hard math problems, puzzles, riddles, and IQ test questions
**Environment Variables**:
- 🔑 `REASONING_API_KEY`: API key (required)
- 🌐 `REASONING_BASE_URL`: API endpoint URL (required)
- 🤖 `REASONING_MODEL_NAME`: Model name (required)
**Example**:
Click to expand code example
```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters
async def main():
server_configs = [
{
"name": "tool-reasoning-os",
"params": StdioServerParameters(
command="python",
args=["-m", "miroflow_tools.mcp_servers.reasoning_mcp_server_os"],
env={
"REASONING_API_KEY": "your_reasoning_api_key",
"REASONING_BASE_URL": "your_reasoning_base_url",
"REASONING_MODEL_NAME": "your_reasoning_model_name"
}
)
}
]
manager = ToolManager(server_configs)
result = await manager.execute_tool_call(
server_name="tool-reasoning-os",
tool_name="reasoning",
arguments={"question": "Solve: If a train travels 60 mph for 2 hours, then 80 mph for 1 hour, what's the average speed?"}
)
print(result)
if __name__ == "__main__":
asyncio.run(main())
```
### Server: search_and_scrape_webpage
Google search via Serper API. Used in MiroThinker v1.0/v1.5 evaluation.
**Tools**:
- 🔍 `google_search(q, gl="us", hl="en", location=None, num=None, tbs=None, page=None, autocorrect=None)`: Perform web searches via Serper API and retrieve rich results
**Environment Variables**:
- 🔑 `SERPER_API_KEY`: Serper API key (required)
- 🌐 `SERPER_BASE_URL`: Serper API base URL (default: `https://google.serper.dev`)
**Example**:
Click to expand code example
```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters
async def main():
server_configs = [
{
"name": "search_and_scrape_webpage",
"params": StdioServerParameters(
command="python",
args=["-m", "miroflow_tools.dev_mcp_servers.search_and_scrape_webpage"],
env={
"SERPER_API_KEY": "your_serper_api_key",
"SERPER_BASE_URL": "https://google.serper.dev"
}
)
}
]
manager = ToolManager(server_configs)
result = await manager.execute_tool_call(
server_name="search_and_scrape_webpage",
tool_name="google_search",
arguments={
"q": "Python async programming",
"gl": "us",
"hl": "en",
"num": 10
}
)
print(result)
if __name__ == "__main__":
asyncio.run(main())
```
### Server: jina_scrape_llm_summary
Scrape content from URLs and extract meaningful information using an LLM. Used in MiroThinker v1.0/v1.5 evaluation.
**Tools**:
- 🔎 `scrape_and_extract_info(url, info_to_extract, custom_headers=None)`: Scrape content from a URL (web pages, PDFs, code files, etc.) and extract meaningful information using an LLM
**Environment Variables**:
- 🔑 `JINA_API_KEY`: Jina.ai API key (required)
- 🌐 `JINA_BASE_URL`: Jina.ai API base URL (default: `https://r.jina.ai`)
- 🔗 `SUMMARY_LLM_BASE_URL`: LLM API base URL for summarization (required)
- 🤖 `SUMMARY_LLM_MODEL_NAME`: LLM model name for summarization (required)
- 🔑 `SUMMARY_LLM_API_KEY`: LLM API key for summarization (optional, depends on LLM provider)
**Example**:
Click to expand code example
```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters
async def main():
server_configs = [
{
"name": "jina_scrape_llm_summary",
"params": StdioServerParameters(
command="python",
args=["-m", "miroflow_tools.dev_mcp_servers.jina_scrape_llm_summary"],
env={
"JINA_API_KEY": "your_jina_api_key",
"JINA_BASE_URL": "https://r.jina.ai",
"SUMMARY_LLM_BASE_URL": "your_llm_base_url",
"SUMMARY_LLM_MODEL_NAME": "your_llm_model_name",
"SUMMARY_LLM_API_KEY": "your_llm_api_key"
}
)
}
]
manager = ToolManager(server_configs)
result = await manager.execute_tool_call(
server_name="jina_scrape_llm_summary",
tool_name="scrape_and_extract_info",
arguments={
"url": "https://example.com/article",
"info_to_extract": "What is the main topic of this article?"
}
)
print(result)
if __name__ == "__main__":
asyncio.run(main())
```
### Server: tool-google-search
Google search via Serper API with website scraping capabilities.
**Tools**:
- 🔍 `google_search(q, gl="us", hl="en", location=None, num=10, tbs=None, page=1)`: Google search
- 🌐 `scrape_website(url)`: Scrape website content using Jina.ai
**Environment Variables**:
- 🔑 `SERPER_API_KEY`: Serper API key (required for Google search)
- 🌐 `SERPER_BASE_URL`: Serper API base URL (default: `https://google.serper.dev`)
- 🔑 `JINA_API_KEY`: Jina.ai API key (required for scraping)
- 🌐 `JINA_BASE_URL`: Jina.ai API base URL (default: `https://r.jina.ai`)
**Filtering Options** (via environment variables):
- 🚫 `REMOVE_SNIPPETS`: Remove snippets from search results
- 🚫 `REMOVE_KNOWLEDGE_GRAPH`: Remove knowledge graph from results
- 🚫 `REMOVE_ANSWER_BOX`: Remove answer box from results
**Example**:
Click to expand code example
```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters
async def main():
server_configs = [
{
"name": "tool-google-search",
"params": StdioServerParameters(
command="python",
args=["-m", "miroflow_tools.mcp_servers.searching_google_mcp_server"],
env={
"SERPER_API_KEY": "your_serper_api_key",
"SERPER_BASE_URL": "https://google.serper.dev",
"JINA_API_KEY": "your_jina_api_key",
"JINA_BASE_URL": "https://r.jina.ai"
}
)
}
]
manager = ToolManager(server_configs)
# Google search
result = await manager.execute_tool_call(
server_name="tool-google-search",
tool_name="google_search",
arguments={
"q": "Python async programming",
"gl": "us",
"hl": "en",
"num": 10
}
)
print(result)
# Scrape website
result = await manager.execute_tool_call(
server_name="tool-google-search",
tool_name="scrape_website",
arguments={"url": "https://example.com/article"}
)
print(result)
if __name__ == "__main__":
asyncio.run(main())
```
### Server: tool-sogou-search
Sogou search (optimized for Chinese) with website scraping capabilities. *Optional: Not used in the MiroThinker v1.0/v1.5 evaluation*
**Tools**:
- 🔍 `sogou_search(Query, Cnt=10)`: Sogou search (Chinese)
- 🌐 `scrape_website(url)`: Scrape website content using Jina.ai
**Environment Variables**:
- 🔑 `TENCENTCLOUD_SECRET_ID`: Tencent Cloud secret ID (required)
- 🔑 `TENCENTCLOUD_SECRET_KEY`: Tencent Cloud secret key (required)
- 🔑 `JINA_API_KEY`: Jina.ai API key (required for scraping)
- 🌐 `JINA_BASE_URL`: Jina.ai API base URL (default: `https://r.jina.ai`)
**Example**:
Click to expand code example
```python
import asyncio
from miroflow_tools import ToolManager
from mcp import StdioServerParameters
async def main():
server_configs = [
{
"name": "tool-sogou-search",
"params": StdioServerParameters(
command="python",
args=["-m", "miroflow_tools.mcp_servers.searching_sogou_mcp_server"],
env={
"TENCENTCLOUD_SECRET_ID": "your_tencent_secret_id",
"TENCENTCLOUD_SECRET_KEY": "your_tencent_secret_key",
"JINA_API_KEY": "your_jina_api_key",
"JINA_BASE_URL": "https://r.jina.ai"
}
)
}
]
manager = ToolManager(server_configs)
# Sogou search
result = await manager.execute_tool_call(
server_name="tool-sogou-search",
tool_name="sogou_search",
arguments={
"Query": "Python 异步编程",
"Cnt": 10
}
)
print(result)
# Scrape website
result = await manager.execute_tool_call(
server_name="tool-sogou-search",
tool_name="scrape_website",
arguments={"url": "https://example.com/article"}
)
print(result)
if __name__ == "__main__":
asyncio.run(main())
```
## 🚀 Development
### Adding a New MCP Server
1. Create a new server file in `mcp_servers/`
1. Use `FastMCP` to define tools:
```python
from fastmcp import FastMCP
mcp = FastMCP("server-name")
@mcp.tool()
async def my_tool(arg: str) -> str:
"""Tool description."""
return "result"
if __name__ == "__main__":
mcp.run(transport="stdio")
```
1. Add server configuration to your application
1. Update this README with server documentation
================================================
FILE: libs/miroflow-tools/pyproject.toml
================================================
[project]
name = "miroflow-tools"
version = "0.1.0"
description = "Tool management and MCP server utilities for MiroFlow"
readme = "README.md"
authors = [
{ name = "MiroMind Team", email = "service@miromind.ai" }
]
requires-python = ">=3.12"
dependencies = [
"mcp>=1.0.0",
"fastmcp>=0.1.0",
"playwright>=1.40.0",
"requests>=2.32.0",
"e2b-code-interpreter==1.2.1",
"wikipedia",
"mutagen",
"markitdown-mcp>=0.0.1a3",
"google-genai",
"aiohttp",
"redis"
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["src/miroflow_tools"]
[dependency-groups]
dev = [
"pytest>=8.4.1",
"pytest-asyncio>=1.0.0",
"pytest-cov>=6.2.1",
"pytest-html>=4.1.1",
"pytest-xdist>=3.7.0",
"pytest-mock>=3.10.0",
"pytest-timeout>=2.1.0",
"inline-snapshot>=0.23.2",
]
[tool.pytest.ini_options]
minversion = "8.3.5"
testpaths = ["src/test"]
asyncio_default_fixture_loop_scope = "function"
addopts = [
"-rA",
"--show-capture=stderr",
"-n=auto",
"--html=report.html",
"--self-contained-html",
"--cov=miroflow_tools",
"--cov-report=html",
"--strict-markers",
"-v",
]
markers = [
"integration: marks tests as integration tests (may be slow)",
"unit: marks tests as unit tests",
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
"requires_api_key: marks tests that require real API credentials",
]
================================================
FILE: libs/miroflow-tools/src/__init__.py
================================================
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/__init__.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
from .manager import ToolManager
__all__ = ["ToolManager"]
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/jina_scrape_llm_summary.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import asyncio
import json
import logging
import os
from typing import Any, Dict
import httpx
from mcp.server.fastmcp import FastMCP
# Configure logging
# This module logs through the logger named "miroflow".
logger = logging.getLogger("miroflow")
# Summarization LLM settings, read once at import time (None when unset).
# SUMMARY_LLM_BASE_URL is validated inside extract_info_with_llm;
# SUMMARY_LLM_MODEL_NAME is passed as the `model` argument by
# scrape_and_extract_info.
SUMMARY_LLM_BASE_URL = os.environ.get("SUMMARY_LLM_BASE_URL")
SUMMARY_LLM_MODEL_NAME = os.environ.get("SUMMARY_LLM_MODEL_NAME")
SUMMARY_LLM_API_KEY = os.environ.get("SUMMARY_LLM_API_KEY")
# Jina settings. An empty JINA_API_KEY makes scrape_url_with_jina return a
# failure result; JINA_BASE_URL is the prefix the target URL is appended to.
JINA_API_KEY = os.environ.get("JINA_API_KEY", "")
JINA_BASE_URL = os.environ.get("JINA_BASE_URL", "https://r.jina.ai")
# Initialize FastMCP server
mcp = FastMCP("jina_scrape_llm_summary")
@mcp.tool()
async def scrape_and_extract_info(
    url: str, info_to_extract: str, custom_headers: Dict[str, str] = None
):
    """
    Scrape content from a URL, including web pages, PDFs, code files, and other supported resources, and extract meaningful information using an LLM.
    If you need to extract information from a PDF, please use this tool.
    Args:
        url (str): The URL to scrape content from. Supports various types of URLs such as web pages, PDFs, raw text/code files (e.g., GitHub, Gist), and similar sources.
        info_to_extract (str): The specific types of information to extract (usually a question)
        custom_headers (Dict[str, str]): Additional headers to include in the scraping request
    Returns:
        Dict[str, Any]: A dictionary containing:
            - success (bool): Whether the operation was successful
            - url (str): The original URL
            - extracted_info (str): The extracted information
            - error (str): Error message if the operation failed
            - scrape_stats (Dict): Statistics about the scraped content
            - model_used (str): The model used for summarization
            - tokens_used (int): Number of tokens used (if available)
    """
    # NOTE: the docstring wording above is left untouched on purpose: FastMCP
    # uses the function docstring as the tool description shown to MCP
    # clients, so it is runtime text. The value actually returned is the
    # JSON-encoded string of the described dictionary (see json.dumps below).

    def _failure(message: str) -> str:
        # Both early-exit paths share this payload shape (no "model_used" key).
        return json.dumps(
            {
                "success": False,
                "url": url,
                "extracted_info": "",
                "error": message,
                "scrape_stats": {},
                "tokens_used": 0,
            },
            ensure_ascii=False,
        )

    # Refuse to read benchmark answers straight from Hugging Face.
    if _is_huggingface_dataset_or_space_url(url):
        return _failure(
            "You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose."
        )

    # Primary path: Jina reader. Fallback: plain HTTP GET from this process.
    scrape_result = await scrape_url_with_jina(url, custom_headers)
    if not scrape_result["success"]:
        logger.warning(
            f"Jina Scrape and Extract Info: Jina scraping failed: {scrape_result['error']}, trying direct Python scraping as fallback"
        )
        scrape_result = await scrape_url_with_python(url, custom_headers)
        if not scrape_result["success"]:
            logger.error(
                f"Jina Scrape and Extract Info: Both Jina and Python scraping failed: {scrape_result['error']}"
            )
            return _failure(
                f"Scraping failed (both Jina and Python): {scrape_result['error']}"
            )
        logger.info(
            f"Jina Scrape and Extract Info: Python fallback scraping succeeded for URL: {url}"
        )

    # Hand the scraped text to the summarization LLM.
    summary = await extract_info_with_llm(
        url=url,
        content=scrape_result["content"],
        info_to_extract=info_to_extract,
        model=SUMMARY_LLM_MODEL_NAME,
        max_tokens=8192,
    )

    # Merge the LLM outcome with the scrape statistics into one JSON document.
    stat_keys = (
        "line_count",
        "char_count",
        "last_char_line",
        "all_content_displayed",
    )
    payload = {
        "success": summary["success"],
        "url": url,
        "extracted_info": summary["extracted_info"],
        "error": summary["error"],
        "scrape_stats": {key: scrape_result[key] for key in stat_keys},
        "model_used": summary["model_used"],
        "tokens_used": summary["tokens_used"],
    }
    return json.dumps(payload, ensure_ascii=False)
def _is_huggingface_dataset_or_space_url(url):
    """
    Tell whether a URL points at a Hugging Face dataset or space.

    The check is a plain substring match on the URL text.

    :param url: The URL to check (may be empty or None)
    :return: True if the URL contains a Hugging Face dataset or space path,
        False otherwise (including for empty/None input)
    """
    if not url:
        return False
    blocked_markers = ("huggingface.co/datasets", "huggingface.co/spaces")
    return any(marker in url for marker in blocked_markers)
async def scrape_url_with_jina(
    url: str, custom_headers: Dict[str, str] = None, max_chars: int = 102400 * 4
) -> Dict[str, Any]:
    """
    Fetch a URL through the Jina.ai reader endpoint and return its text.

    The content is kept in memory and returned directly; no file is written.
    The request is attempted up to four times, sleeping 1s, 2s and 4s between
    attempts, for transport errors, HTTP 5xx and HTTP 408/409/425/429. Any
    other HTTP error status aborts immediately. Failures never raise; they are
    reported through the returned dictionary.

    Args:
        url (str): The URL to scrape content from. Surrounding whitespace is
            ignored. A URL that already carries the "https://r.jina.ai/"
            prefix in front of another URL has that prefix removed first.
        custom_headers (Dict[str, str]): Additional headers merged over the
            Authorization header of the request
        max_chars (int): Maximum number of characters of scraped content to
            return (default 409,600)

    Returns:
        Dict[str, Any]: A dictionary containing:
            - success (bool): Whether the operation was successful
            - filename (str): Always an empty string; kept so that success
              and failure results share the same keys
            - content (str): The first `max_chars` characters of the content
            - error (str): Error message if the operation failed, else ""
            - line_count (int): Number of lines in the full scraped content
            - char_count (int): Number of characters in the full content
            - last_char_line (int): Line number of the last returned character
            - all_content_displayed (bool): True if nothing was cut off
    """

    def _failure(error: str) -> Dict[str, Any]:
        # Uniform failure payload for every error path of this function.
        return {
            "success": False,
            "filename": "",
            "content": "",
            "error": error,
            "line_count": 0,
            "char_count": 0,
            "last_char_line": 0,
            "all_content_displayed": False,
        }

    # Validate input
    if not url or not url.strip():
        return _failure("URL cannot be empty")
    # The validation above ignores whitespace, so the request must too.
    url = url.strip()

    if not JINA_API_KEY:
        return _failure("JINA_API_KEY environment variable is not set")

    # Avoid duplicate Jina URL prefix
    jina_prefix = "https://r.jina.ai/"
    if url.startswith(jina_prefix) and url.count("http") >= 2:
        url = url[len(jina_prefix):]

    # A trailing slash on the configured base would otherwise produce "//".
    jina_url = f"{JINA_BASE_URL.rstrip('/')}/{url}"

    retry_delays = [1, 2, 4, 8]
    try:
        headers = {"Authorization": f"Bearer {JINA_API_KEY}"}
        if custom_headers:
            headers.update(custom_headers)

        for attempt, delay in enumerate(retry_delays, 1):
            retries_left = attempt < len(retry_delays)
            try:
                async with httpx.AsyncClient() as client:
                    response = await client.get(
                        jina_url,
                        headers=headers,
                        timeout=httpx.Timeout(None, connect=20, read=60),
                        follow_redirects=True,  # equivalent to curl -L
                    )
                response.raise_for_status()
                break  # Success, exit retry loop
            except httpx.HTTPStatusError as e:
                status_code = e.response.status_code
                # Retryable: 5xx (server errors) + specific 4xx (408, 409, 425, 429)
                should_retry = status_code >= 500 or status_code in (
                    408,
                    409,
                    425,
                    429,
                )
                if not should_retry:
                    logger.error(
                        f"Jina Scrape: HTTP {status_code} (non-retryable), url: {url}"
                    )
                    raise
                if not retries_left:
                    logger.error(
                        f"Jina Scrape: HTTP {status_code} retry exhausted, url: {url}"
                    )
                    raise
                logger.info(
                    f"Jina Scrape: HTTP {status_code} (retryable), retry in {delay}s, url: {url}"
                )
                await asyncio.sleep(delay)
            except httpx.RequestError as e:
                # Every transport-level failure is retried the same way; only
                # the log wording depends on the concrete exception type.
                if isinstance(e, httpx.ConnectTimeout):
                    retry_note = f"Connection timeout, {delay}s before next attempt (attempt {attempt + 1})"
                    exhausted_note = (
                        f"Connection retry attempts exhausted, url: {url}"
                    )
                elif isinstance(e, httpx.ConnectError):
                    retry_note = (
                        f"Connection error: {e}, {delay}s before next attempt"
                    )
                    exhausted_note = (
                        f"Connection retry attempts exhausted, url: {url}"
                    )
                elif isinstance(e, httpx.ReadTimeout):
                    retry_note = f"Read timeout, {delay}s before next attempt (attempt {attempt + 1})"
                    exhausted_note = (
                        f"Read timeout retry attempts exhausted, url: {url}"
                    )
                else:
                    retry_note = f"Unknown request exception: {e}, url: {url}, {delay}s before next attempt (attempt {attempt + 1})"
                    exhausted_note = f"Unknown request exception retry attempts exhausted, url: {url}"
                if not retries_left:
                    logger.error(f"Jina Scrape: {exhausted_note}")
                    raise
                logger.info(f"Jina Scrape: {retry_note}")
                await asyncio.sleep(delay)
    except Exception as e:
        # Boundary handler: callers expect a result dict, never an exception.
        error_msg = f"Jina Scrape: Unexpected error occurred: {str(e)}"
        logger.error(error_msg)
        return _failure(error_msg)

    # Get the scraped content
    content = response.text
    if not content:
        return _failure("No content returned from Jina.ai API")

    # Jina reports an exhausted balance as a JSON body rather than an HTTP
    # error status, so it has to be detected from the payload.
    try:
        content_dict = json.loads(content)
    except json.JSONDecodeError:
        content_dict = None
    if (
        isinstance(content_dict, dict)
        and content_dict.get("name") == "InsufficientBalanceError"
    ):
        return _failure("Insufficient balance")

    # Statistics describe the full content; only a prefix is returned.
    total_char_count = len(content)
    total_line_count = content.count("\n") + 1
    displayed_content = content[:max_chars]
    last_char_line = displayed_content.count("\n") + 1 if displayed_content else 0

    return {
        "success": True,
        "filename": "",
        "content": displayed_content,
        "error": "",
        "line_count": total_line_count,
        "char_count": total_char_count,
        "last_char_line": last_char_line,
        "all_content_displayed": total_char_count <= max_chars,
    }
async def scrape_url_with_python(
    url: str, custom_headers: Dict[str, str] = None, max_chars: int = 102400 * 4
) -> Dict[str, Any]:
    """
    Fallback scraping method that fetches the URL directly with httpx.

    The request is attempted up to three times, sleeping 1s and 2s between
    attempts, for transport errors, HTTP 5xx and HTTP 408/409/425/429. Any
    other HTTP error status aborts immediately. Failures never raise; they are
    reported through the returned dictionary.

    Args:
        url (str): The URL to scrape content from. Surrounding whitespace is
            ignored.
        custom_headers (Dict[str, str]): Additional headers merged over the
            default browser-like User-Agent header
        max_chars (int): Maximum number of characters of scraped content to
            return (default 409,600)

    Returns:
        Dict[str, Any]: A dictionary containing:
            - success (bool): Whether the operation was successful
            - content (str): The first `max_chars` characters of the content
            - error (str): Error message if the operation failed, else ""
            - line_count (int): Number of lines in the full scraped content
            - char_count (int): Number of characters in the full content
            - last_char_line (int): Line number of the last returned character
            - all_content_displayed (bool): True if nothing was cut off
    """

    def _failure(error: str) -> Dict[str, Any]:
        # Uniform failure payload for every error path of this function.
        return {
            "success": False,
            "content": "",
            "error": error,
            "line_count": 0,
            "char_count": 0,
            "last_char_line": 0,
            "all_content_displayed": False,
        }

    # Validate input
    if not url or not url.strip():
        return _failure("URL cannot be empty")
    # The validation above ignores whitespace, so the request must too.
    url = url.strip()

    retry_delays = [1, 2, 4]
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        if custom_headers:
            headers.update(custom_headers)

        for attempt, delay in enumerate(retry_delays, 1):
            retries_left = attempt < len(retry_delays)
            try:
                async with httpx.AsyncClient() as client:
                    response = await client.get(
                        url,
                        headers=headers,
                        timeout=httpx.Timeout(None, connect=20, read=60),
                        follow_redirects=True,
                    )
                response.raise_for_status()
                break  # Success, exit retry loop
            except httpx.HTTPStatusError as e:
                status_code = e.response.status_code
                # Retryable: 5xx (server errors) + specific 4xx (408, 409, 425, 429)
                should_retry = status_code >= 500 or status_code in (
                    408,
                    409,
                    425,
                    429,
                )
                if not should_retry:
                    logger.error(
                        f"Python Scrape: HTTP {status_code} (non-retryable), url: {url}"
                    )
                    raise
                if not retries_left:
                    logger.error(
                        f"Python Scrape: HTTP {status_code} retry exhausted, url: {url}"
                    )
                    raise
                logger.info(
                    f"Python Scrape: HTTP {status_code} (retryable), retry in {delay}s, url: {url}"
                )
                await asyncio.sleep(delay)
            except httpx.RequestError as e:
                # Every transport-level failure is retried the same way; only
                # the log wording depends on the concrete exception type.
                if isinstance(e, httpx.ConnectTimeout):
                    retry_note = f"Connection timeout, {delay}s before next attempt (attempt {attempt + 1})"
                    exhausted_note = (
                        f"Connection retry attempts exhausted, url: {url}"
                    )
                elif isinstance(e, httpx.ConnectError):
                    retry_note = (
                        f"Connection error: {e}, {delay}s before next attempt"
                    )
                    exhausted_note = (
                        f"Connection retry attempts exhausted, url: {url}"
                    )
                elif isinstance(e, httpx.ReadTimeout):
                    retry_note = f"Read timeout, {delay}s before next attempt (attempt {attempt + 1})"
                    exhausted_note = (
                        f"Read timeout retry attempts exhausted, url: {url}"
                    )
                else:
                    retry_note = f"Unknown request exception: {e}, url: {url}, {delay}s before next attempt (attempt {attempt + 1})"
                    exhausted_note = f"Unknown request exception retry attempts exhausted, url: {url}"
                if not retries_left:
                    logger.error(f"Python Scrape: {exhausted_note}")
                    raise
                logger.info(f"Python Scrape: {retry_note}")
                await asyncio.sleep(delay)
    except Exception as e:
        # Boundary handler: callers expect a result dict, never an exception.
        error_msg = f"Python Scrape: Unexpected error occurred: {str(e)}"
        logger.error(error_msg)
        return _failure(error_msg)

    # Get the scraped content
    content = response.text
    if not content:
        return _failure("No content returned from URL")

    # Statistics describe the full content; only a prefix is returned.
    total_char_count = len(content)
    total_line_count = content.count("\n") + 1
    displayed_content = content[:max_chars]
    last_char_line = displayed_content.count("\n") + 1 if displayed_content else 0

    return {
        "success": True,
        "content": displayed_content,
        "error": "",
        "line_count": total_line_count,
        "char_count": total_char_count,
        "last_char_line": last_char_line,
        "all_content_displayed": total_char_count <= max_chars,
    }
# Prompt template sent to the summarization LLM. It has two positional "{}"
# placeholders, filled by get_prompt_with_truncation via str.format: first the
# information to extract, then the scraped content.
EXTRACT_INFO_PROMPT = """You are given a piece of content and the requirement of information to extract. Your task is to extract the information specifically requested. Be precise and focus exclusively on the requested information.
INFORMATION TO EXTRACT:
{}
INSTRUCTIONS:
1. Extract the information relevant to the focus above.
2. If the exact information is not found, extract the most closely related details.
3. Be specific and include exact details when available.
4. Clearly organize the extracted information for easy understanding.
5. Do not include general summaries or unrelated content.
CONTENT TO ANALYZE:
{}
EXTRACTED INFORMATION:"""
def get_prompt_with_truncation(
    info_to_extract: str, content: str, truncate_last_num_chars: int = -1
) -> str:
    """
    Build the extraction prompt, optionally cutting characters off the end of the content.

    Args:
        info_to_extract (str): Description of the information the LLM should extract
        content (str): The content to embed in the prompt
        truncate_last_num_chars (int): Number of trailing characters to drop from
            ``content``; values <= 0 leave the content untouched

    Returns:
        str: EXTRACT_INFO_PROMPT filled with the request and the (possibly truncated) content
    """
    if truncate_last_num_chars <= 0:
        return EXTRACT_INFO_PROMPT.format(info_to_extract, content)

    # Mark the cut explicitly so the model knows the content is incomplete
    shortened = f"{content[:-truncate_last_num_chars]}[...truncated]"
    return EXTRACT_INFO_PROMPT.format(info_to_extract, shortened)
async def extract_info_with_llm(
    url: str,
    content: str,
    info_to_extract: str,
    model: str = "LLM",
    max_tokens: int = 4096,
) -> Dict[str, Any]:
    """
    Extract the requested information from content using an LLM API.

    The request is POSTed to SUMMARY_LLM_BASE_URL with an OpenAI-style chat
    payload. Up to four attempts are made; connection problems and retryable
    HTTP statuses back off with delays of 1, 2, 4 and 8 seconds. All failures
    are reported through the returned dictionary rather than raised.

    Args:
        url (str): The URL the content came from (only used in log messages)
        content (str): The content to extract information from
        info_to_extract (str): The specific types of information to extract (usually a question)
        model (str): The model to use for extraction
        max_tokens (int): Maximum tokens for the response

    Returns:
        Dict[str, Any]: A dictionary containing:
            - success (bool): Whether the operation was successful
            - extracted_info (str): The extracted information
            - error (str): Error message if the operation failed
            - model_used (str): The model used for extraction
            - tokens_used (int): Number of tokens used (if available)
    """
    # Validate input
    if not content or not content.strip():
        return {
            "success": False,
            "extracted_info": "",
            "error": "Content cannot be empty",
            "model_used": model,
            "tokens_used": 0,
        }
    prompt = get_prompt_with_truncation(info_to_extract, content)
    # Prepare the payload. Models whose name contains "gpt" get
    # `max_completion_tokens`; every other model gets `max_tokens` plus an
    # explicit temperature.
    # NOTE(review): this check is case-sensitive while the GPT-5 check below
    # lower-cases the model name — confirm that is intended.
    if "gpt" in model:
        payload = {
            "model": model,
            "max_completion_tokens": max_tokens,
            "messages": [
                {"role": "user", "content": prompt},
            ],
        }
        # Add cost-saving parameters for GPT-5 models
        if "gpt-5" in model.lower() or "gpt5" in model.lower():
            payload["service_tier"] = "flex"
            payload["reasoning_effort"] = "minimal"
    else:
        payload = {
            "model": model,
            "max_tokens": max_tokens,
            "messages": [
                {"role": "user", "content": prompt},
            ],
            "temperature": 1.0,
            # "top_p": 0.8,
            # "top_k": 20,
        }
    # Validate LLM endpoint configuration early for clearer errors
    if not SUMMARY_LLM_BASE_URL or not SUMMARY_LLM_BASE_URL.strip():
        return {
            "success": False,
            "extracted_info": "",
            "error": "SUMMARY_LLM_BASE_URL environment variable is not set",
            "model_used": model,
            "tokens_used": 0,
        }
    # Prepare headers (add Authorization if API key is available)
    headers = {"Content-Type": "application/json"}
    if SUMMARY_LLM_API_KEY:
        headers["Authorization"] = f"Bearer {SUMMARY_LLM_API_KEY}"
    try:
        # Retry configuration: one attempt per delay; `attempt` is 1-based
        connect_retry_delays = [1, 2, 4, 8]
        for attempt, delay in enumerate(connect_retry_delays, 1):
            try:
                # Make the API request using httpx (a fresh client per attempt)
                async with httpx.AsyncClient() as client:
                    response = await client.post(
                        SUMMARY_LLM_BASE_URL,
                        headers=headers,
                        json=payload,
                        timeout=httpx.Timeout(None, connect=30, read=300),
                    )
                # Degenerate-output guard: if the last 50 characters of the raw
                # response body occur more than 5 times in it, treat the
                # response as repetitive and retry immediately (no sleep).
                if response.text and len(response.text) >= 50:
                    tail_50 = response.text[-50:]
                    repeat_count = response.text.count(tail_50)
                    if repeat_count > 5:
                        logger.info("Repeat detected in extract_info_with_llm")
                        continue
                # Context-length overflow reported by the server: shrink the
                # content and retry with a rebuilt prompt.
                if (
                    "Requested token count exceeds the model's maximum context length"
                    in response.text
                    or "longer than the model's context length" in response.text
                ):
                    prompt = get_prompt_with_truncation(
                        info_to_extract,
                        content,
                        truncate_last_num_chars=40960 * attempt,
                    )  # remove 40k * num_attempts chars from the end of the content
                    payload["messages"][0]["content"] = prompt
                    continue  # no need to raise error here, just try again
                # Raise httpx.HTTPStatusError for 4xx/5xx responses
                response.raise_for_status()
                break  # Success, exit retry loop
            except httpx.ConnectTimeout as e:
                # connection timeout, retry with backoff
                if attempt < len(connect_retry_delays):
                    # NOTE(review): `attempt` is already 1-based, so the logged
                    # "attempt + 1" is the number of the upcoming attempt.
                    logger.info(
                        f"Jina Scrape and Extract Info: Connection timeout, {delay}s before next attempt (attempt {attempt + 1})"
                    )
                    await asyncio.sleep(delay)
                    continue
                else:
                    logger.error(
                        "Jina Scrape and Extract Info: Connection retry attempts exhausted"
                    )
                    raise e
            except httpx.ConnectError as e:
                # connection error, retry with backoff
                if attempt < len(connect_retry_delays):
                    logger.info(
                        f"Jina Scrape and Extract Info: Connection error: {e}, {delay}s before next attempt"
                    )
                    await asyncio.sleep(delay)
                    continue
                else:
                    logger.error(
                        "Jina Scrape and Extract Info: Connection retry attempts exhausted"
                    )
                    raise e
            except httpx.ReadTimeout as e:
                # read timeout (LLM API too slow): retried immediately, without
                # a backoff sleep, until the attempts are exhausted
                if attempt < len(connect_retry_delays):
                    logger.info(
                        f"Jina Scrape and Extract Info: LLM API attempt {attempt} read timeout"
                    )
                    continue
                else:
                    logger.error(
                        f"Jina Scrape and Extract Info: LLM API read timeout retry attempts exhausted, please check the request complexity, information to extract: {info_to_extract}, length of content: {len(content)}, url: {url}"
                    )
                    raise e
            except httpx.HTTPStatusError as e:
                status_code = e.response.status_code
                # Special case: GPT-5 service_tier parameter compatibility issue.
                # Any HTTP error while `service_tier` is still in the payload
                # drops that parameter; it is retried if attempts remain,
                # otherwise control falls through to the generic handling below.
                if (
                    "gpt-5" in model.lower() or "gpt5" in model.lower()
                ) and "service_tier" in payload:
                    logger.info(
                        "Extract Info: GPT-5 service_tier error, removing and retrying"
                    )
                    payload.pop("service_tier", None)
                    if attempt < len(connect_retry_delays):
                        await asyncio.sleep(delay)
                        continue
                # Retryable: 5xx (server errors) + specific 4xx (408, 409, 425, 429)
                should_retry = status_code >= 500 or status_code in [408, 409, 425, 429]
                if should_retry and attempt < len(connect_retry_delays):
                    logger.info(
                        f"Extract Info: HTTP {status_code} (retryable), retry in {delay}s"
                    )
                    await asyncio.sleep(delay)
                    continue
                elif should_retry:
                    logger.error(f"Extract Info: HTTP {status_code} retry exhausted")
                    raise e
                else:
                    logger.error(f"Extract Info: HTTP {status_code} (non-retryable)")
                    # Re-raise with the response body as the message so the
                    # server's explanation ends up in the returned error text
                    raise httpx.HTTPStatusError(
                        f"response.text: {response.text}",
                        request=e.request,
                        response=e.response,
                    ) from e
            except httpx.RequestError as e:
                # Any other transport-level failure is not retried
                logger.error(
                    f"Jina Scrape and Extract Info: Unknown request exception: {e}"
                )
                raise e
    except Exception as e:
        # Everything raised by the retry loop above is converted into a
        # structured failure result here
        error_msg = f"Jina Scrape and Extract Info: Unexpected error during LLM API call: {str(e)}"
        logger.error(error_msg)
        return {
            "success": False,
            "extracted_info": "",
            "error": error_msg,
            "model_used": model,
            "tokens_used": 0,
        }
    # Parse the response. If the final attempt ended in one of the `continue`
    # branches above, `response` is simply the last response received.
    try:
        response_data = response.json()
    except json.JSONDecodeError as e:
        error_msg = (
            f"Jina Scrape and Extract Info: Failed to parse LLM API response: {str(e)}"
        )
        logger.error(error_msg)
        logger.error(f"Raw response: {response.text}")
        return {
            "success": False,
            "extracted_info": "",
            "error": error_msg,
            "model_used": model,
            "tokens_used": 0,
        }
    # Extract summary from response (first choice of a chat-completion style body)
    if "choices" in response_data and len(response_data["choices"]) > 0:
        try:
            summary = response_data["choices"][0]["message"]["content"]
        except Exception as e:
            error_msg = f"Jina Scrape and Extract Info: Failed to get summary from LLM API response: {str(e)}"
            logger.error(error_msg)
            return {
                "success": False,
                "extracted_info": "",
                "error": error_msg,
                "model_used": model,
                "tokens_used": 0,
            }
        # Extract token usage if available
        tokens_used = 0
        if "usage" in response_data:
            tokens_used = response_data["usage"].get("total_tokens", 0)
        return {
            "success": True,
            "extracted_info": summary,
            "error": "",
            "model_used": model,
            "tokens_used": tokens_used,
        }
    elif "error" in response_data:
        # The API answered with an explicit error object
        error_msg = (
            f"Jina Scrape and Extract Info: LLM API error: {response_data['error']}"
        )
        logger.error(error_msg)
        return {
            "success": False,
            "extracted_info": "",
            "error": error_msg,
            "model_used": model,
            "tokens_used": 0,
        }
    else:
        # Neither choices nor an error: report the raw payload for debugging
        error_msg = f"Jina Scrape and Extract Info: No valid response from LLM API, response data: {response_data}"
        logger.error(error_msg)
        return {
            "success": False,
            "extracted_info": "",
            "error": error_msg,
            "model_used": model,
            "tokens_used": 0,
        }
if __name__ == "__main__":
    # Script entry point: run the MCP server over the stdio transport
    mcp.run(transport="stdio")
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/search_and_scrape_webpage.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import json
import logging
import os
from typing import Any, Dict
import httpx
from mcp.server.fastmcp import FastMCP
from tenacity import (
retry,
retry_if_exception_type,
stop_after_attempt,
wait_exponential,
)
from tencentcloud.common import credential
from tencentcloud.common.common_client import CommonClient
from tencentcloud.common.exception.tencent_cloud_sdk_exception import (
TencentCloudSDKException,
)
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from ..mcp_servers.utils.url_unquote import decode_http_urls_in_dict
# Configure logging
logger = logging.getLogger("miroflow")
# Serper (Google search) endpoint and credentials, read once at import time
SERPER_BASE_URL = os.getenv("SERPER_BASE_URL", "https://google.serper.dev")
SERPER_API_KEY = os.getenv("SERPER_API_KEY", "")
# Tencent Cloud credentials used by the Sogou search tool; empty when unset
TENCENTCLOUD_SECRET_ID = os.getenv("TENCENTCLOUD_SECRET_ID", "")
TENCENTCLOUD_SECRET_KEY = os.getenv("TENCENTCLOUD_SECRET_KEY", "")
# Initialize FastMCP server
mcp = FastMCP("search_and_scrape_webpage")
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type(
        (httpx.ConnectError, httpx.TimeoutException, httpx.HTTPStatusError)
    ),
    # Once the attempts are exhausted, re-raise the underlying httpx error
    # instead of tenacity's opaque RetryError so callers report the real cause.
    reraise=True,
)
async def make_serper_request(
    payload: Dict[str, Any], headers: Dict[str, str]
) -> httpx.Response:
    """
    Make HTTP request to Serper API with retry logic.

    Args:
        payload: JSON body of the search request
        headers: HTTP headers (including the API key) to send

    Returns:
        The successful httpx.Response.

    Raises:
        httpx.ConnectError, httpx.TimeoutException, httpx.HTTPStatusError:
            the last error observed after three failed attempts.
    """
    async with httpx.AsyncClient() as client:
        response = await client.post(
            f"{SERPER_BASE_URL}/search",
            json=payload,
            headers=headers,
        )
        # Turn 4xx/5xx responses into HTTPStatusError so the retry policy sees them
        response.raise_for_status()
        return response
def _is_banned_url(url: str) -> bool:
"""
Check if the URL is a banned URL.
:param url: The URL to check
:return: True if it's a banned URL, False otherwise
"""
banned_list = [
"unifuncs",
"huggingface.co/datasets",
"huggingface.co/spaces",
]
if not url:
return False
return any(banned in url for banned in banned_list)
@mcp.tool()
async def google_search(
    q: str,
    gl: str = "us",
    hl: str = "en",
    location: str = None,
    num: int = None,
    tbs: str = None,
    page: int = None,
    autocorrect: bool = None,
):
    """
    Tool to perform web searches via Serper API and retrieve rich results.
    It is able to retrieve organic search results, people also ask,
    related searches, and knowledge graph.
    Args:
    q: Search query string
    gl: Optional region code for search results in ISO 3166-1 alpha-2 format (e.g., 'us')
    hl: Optional language code for search results in ISO 639-1 format (e.g., 'en')
    location: Optional location for search results (e.g., 'SoHo, New York, United States', 'California, United States')
    num: Number of results to return (default: 10)
    tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week, 'qdr:m' for past month, 'qdr:y' for past year)
    page: Page number of results to return (default: 1)
    autocorrect: Whether to autocorrect spelling in query
    Returns:
    Dictionary containing search results and metadata.
    """

    def _failure(reason: str) -> str:
        # Uniform JSON error envelope shared by every failure path
        return json.dumps(
            {"success": False, "error": reason, "results": []},
            ensure_ascii=False,
        )

    # Guard clauses: credentials first, then the mandatory query
    if not SERPER_API_KEY:
        return _failure("SERPER_API_KEY environment variable not set")
    if not q or not q.strip():
        return _failure("Search query 'q' is required and cannot be empty")

    async def perform_search(search_query: str) -> tuple[list, dict]:
        """Run one Serper search; return the filtered organic hits and the echoed search parameters."""
        payload: dict[str, Any] = {"q": search_query.strip(), "gl": gl, "hl": hl}
        if location:
            payload["location"] = location
        payload["num"] = 10 if num is None else num
        if tbs:
            payload["tbs"] = tbs
        if page is not None:
            payload["page"] = page
        if autocorrect is not None:
            payload["autocorrect"] = autocorrect

        request_headers = {
            "X-API-KEY": SERPER_API_KEY,
            "Content-Type": "application/json",
        }
        response = await make_serper_request(payload, request_headers)
        data = response.json()

        # Drop hits whose link points at a banned source
        kept = []
        if "organic" in data:
            kept = [
                hit for hit in data["organic"] if not _is_banned_url(hit.get("link", ""))
            ]
        return kept, data.get("searchParameters", {})

    try:
        original_query = q.strip()
        organic_results, search_params = await perform_search(original_query)

        # An over-constrained quoted query that found nothing gets one more
        # try with the double quotes stripped
        if not organic_results and '"' in original_query:
            query_without_quotes = original_query.replace('"', "").strip()
            if query_without_quotes:
                organic_results, search_params = await perform_search(
                    query_without_quotes
                )

        response_data = decode_http_urls_in_dict(
            {"organic": organic_results, "searchParameters": search_params}
        )
        return json.dumps(response_data, ensure_ascii=False)
    except Exception as e:
        return _failure(f"Unexpected error: {str(e)}")
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type(TencentCloudSDKException),
    # Re-raise the SDK exception itself after the last attempt so callers can
    # catch TencentCloudSDKException rather than tenacity's RetryError.
    reraise=True,
)
async def make_sogou_request(query: str, cnt: int) -> Dict[str, Any]:
    """
    Make request to Tencent Cloud SearchPro API with retry logic.

    Args:
        query: Search query string
        cnt: Number of results to request

    Returns:
        The "Response" object of the SearchPro call.

    Raises:
        TencentCloudSDKException: the last SDK error after three failed attempts.
    """
    cred = credential.Credential(TENCENTCLOUD_SECRET_ID, TENCENTCLOUD_SECRET_KEY)
    http_profile = HttpProfile()
    http_profile.endpoint = "wsa.tencentcloudapi.com"
    client_profile = ClientProfile()
    client_profile.httpProfile = http_profile
    # Build the parameters as a dict. Interpolating the query into a JSON
    # string breaks (or injects fields) when it contains quotes or backslashes.
    params = {"Query": query, "Mode": 0, "Cnt": cnt}
    common_client = CommonClient("wsa", "2025-05-08", cred, "", profile=client_profile)
    return common_client.call_json("SearchPro", params)["Response"]
@mcp.tool()
async def sogou_search(
    q: str,
    num: int = 10,
) -> str:
    """
    Tool to perform web searches via Tencent Cloud SearchPro API (Sogou search engine).
    Sogou search offers superior results for Chinese-language queries compared to Google.
    Args:
    q: Search query string (Required)
    num: Number of search results to return (Can only be 10/20/30/40/50, default: 10)
    Returns:
    JSON string containing search results with the following fields:
    - Query: The original search query
    - Pages: Array of search results, each containing title, url, passage, date, and site
    """

    def _failure(reason: str) -> str:
        # Uniform JSON error envelope shared by every failure path
        return json.dumps(
            {"success": False, "error": reason, "results": []},
            ensure_ascii=False,
        )

    # Guard clauses: credentials, query, then the allowed result counts
    if not (TENCENTCLOUD_SECRET_ID and TENCENTCLOUD_SECRET_KEY):
        return _failure(
            "TENCENTCLOUD_SECRET_ID or TENCENTCLOUD_SECRET_KEY environment variable not set"
        )
    if not q or not q.strip():
        return _failure("Search query 'q' is required and cannot be empty")
    if num not in [10, 20, 30, 40, 50]:
        return _failure(f"Invalid num value: {num}. Must be one of 10, 20, 30, 40, 50")

    try:
        result = await make_sogou_request(q.strip(), num)
        # The request id is of no use to the caller
        result.pop("RequestId", None)

        # Each page arrives as a JSON string; keep only the fields of interest
        wanted_fields = ("title", "url", "passage", "date", "site")
        raw_pages = result["Pages"] if "Pages" in result else []
        simplified = []
        for raw_page in raw_pages:
            page_json = json.loads(raw_page)
            simplified.append({field: page_json.get(field, "") for field in wanted_fields})
        result["Pages"] = simplified

        # Decode URLs in the response
        return json.dumps(decode_http_urls_in_dict(result), ensure_ascii=False)
    except TencentCloudSDKException as e:
        return _failure(f"Tencent Cloud API error: {str(e)}")
    except Exception as e:
        return _failure(f"Unexpected error: {str(e)}")
if __name__ == "__main__":
    # Script entry point: run the MCP server with FastMCP's default transport
    mcp.run()
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/stateless_python_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import os
from e2b_code_interpreter import Sandbox
from mcp.server.fastmcp import FastMCP
# Initialize FastMCP server
mcp = FastMCP("stateless-python-server")
# API keys (None when the environment variable is not set)
E2B_API_KEY = os.environ.get("E2B_API_KEY")
# DEFAULT CONFS
DEFAULT_TIMEOUT = 300  # seconds; passed as the sandbox `timeout` on creation
@mcp.tool()
async def python(code: str) -> str:
    """Use this tool to execute STATELESS Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).
    When you send a message containing python code to python, it will be executed in a stateless docker container, and the stdout of that process will be returned to you. You have to use print statements to access the output.
    IMPORTANT: Your python environment is not shared between calls. You will have to pass your entire code each time.
    Args:
    code: The python code to run.
    Returns:
    A string containing the execution result including stdout and stderr.
    """
    # A fresh sandbox per call is what makes the tool stateless
    sandbox = Sandbox.create(
        timeout=DEFAULT_TIMEOUT, api_key=E2B_API_KEY, template="1av7fdjfvcparqo8efq6"
    )
    try:
        max_attempts = 2
        for attempt in range(1, max_attempts + 1):
            try:
                # Run the code exactly once per attempt. The result of the
                # first successful run is returned; it is not re-executed in
                # the already-used sandbox.
                execution = sandbox.run_code(code)
                break
            except Exception:
                if attempt == max_attempts:
                    raise
        return str(execution)
    finally:
        # Always release the sandbox, including when every attempt failed
        sandbox.kill()
if __name__ == "__main__":
    # Script entry point: run the MCP server over the stdio transport
    mcp.run(transport="stdio")
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/task_planner.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import json
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List
from uuid import uuid4
from mcp.server.fastmcp import FastMCP
# Configure logging
logger = logging.getLogger("miroflow")
# Initialize FastMCP server
mcp = FastMCP("task_planner")
# Configuration: directory holding the per-task todo files
TODO_DATA_DIR = os.environ.get("TODO_DATA_DIR", "../../logs/todo_lists")
# TASK_ID is required for task isolation
# Without TASK_ID, importing this module fails with the ValueError below
TASK_ID = os.environ.get("TASK_ID")
if not TASK_ID:
    raise ValueError(
        "TASK_ID environment variable is required for task_planner tool. "
        "This tool must have a unique task identifier to prevent data conflicts in concurrent execution."
    )
# One JSON file per task id
TODO_DATA_FILE = os.path.join(TODO_DATA_DIR, f"todos_{TASK_ID}.json")
# Ensure data directory exists (created at import time)
Path(TODO_DATA_DIR).mkdir(parents=True, exist_ok=True)
def load_todos() -> List[Dict[str, Any]]:
    """
    Read the task plan from TODO_DATA_FILE.

    Returns:
        The parsed JSON content, or an empty list when the file does not exist
        or cannot be read/parsed (the failure is logged, not raised).
    """
    if os.path.exists(TODO_DATA_FILE):
        try:
            with open(TODO_DATA_FILE, "r", encoding="utf-8") as todo_file:
                return json.load(todo_file)
        except Exception as e:
            # Best effort: an unreadable plan is treated as an empty plan
            logger.error(f"Failed to load task plan: {str(e)}")
    return []
def save_todos(todos: List[Dict[str, Any]]) -> bool:
    """
    Write the task plan to TODO_DATA_FILE as indented UTF-8 JSON.

    Args:
        todos: List of task items to persist

    Returns:
        True on success, False when writing failed (the failure is logged).
    """
    try:
        with open(TODO_DATA_FILE, "w", encoding="utf-8") as todo_file:
            json.dump(todos, todo_file, ensure_ascii=False, indent=2)
    except Exception as e:
        logger.error(f"Failed to save task plan: {str(e)}")
        return False
    else:
        return True
def format_todos_as_markdown(todos: List[Dict[str, Any]], message: str = "") -> str:
    """
    Render the task plan as a markdown checklist.

    Args:
        todos: List of task items (each needs "title" and "id"; "completed" is optional)
        message: Optional message to display above the plan

    Returns:
        Markdown formatted string with a statistics line followed by one
        checkbox line per task.
    """
    total = len(todos)
    completed = len([item for item in todos if item.get("completed", False)])

    # Header: optional message, title, statistics, then a blank spacer line
    parts = [f"{message}\n"] if message else []
    parts += [
        "# Task Plan\n",
        f"Total: {total} | Pending: {total - completed} | Completed: {completed}\n",
        "",
    ]

    if todos:
        for item in todos:
            mark = "[x]" if item.get("completed", False) else "[ ]"
            # Only the first 8 characters of the id are shown
            parts.append(f"- {mark} {item['title']} ({item['id'][:8]})")
    else:
        parts.append("No tasks planned yet.")
    return "\n".join(parts)
@mcp.tool()
async def add_todo(titles: List[str]) -> str:
    """
    Create a task plan by adding one or more task items.
    CRITICAL: Before starting to work on ANY task, you MUST first create a complete task plan.
    This is the foundation of effective task execution:
    - Break down the main goal into clear, actionable steps
    - Identify all necessary subtasks upfront
    - Create a roadmap that guides your work
    - Ensure nothing is overlooked or forgotten
    Good task planning prevents confusion and ensures systematic progress toward your goal.
    Args:
    titles: List of task item titles. For example:
    - Single task: ["Complete project report"]
    - Multiple tasks: ["Complete project report", "Fix bug #123", "Update documentation"]
    - Complex project: ["Research requirements", "Design architecture", "Implement core features", "Write tests", "Document API"]
    Returns:
    Markdown formatted string showing the success message and current task plan.
    """
    if not titles:
        return "❌ Error: Task titles list cannot be empty."

    # Keep only non-blank titles, trimmed
    cleaned_titles = [raw.strip() for raw in titles if raw and raw.strip()]
    if not cleaned_titles:
        return "❌ Error: No valid task titles provided."

    todos = load_todos()
    # Every new task gets a fresh UUID and starts out pending
    for task_title in cleaned_titles:
        todos.append(
            {
                "id": str(uuid4()),
                "title": task_title,
                "completed": False,
                "created_at": datetime.now().isoformat(),
            }
        )

    if not save_todos(todos):
        return "❌ Error: Failed to save task plan."

    # Build success message (singular vs. bulleted plural form)
    if len(cleaned_titles) == 1:
        message = f'✅ Task added: "{cleaned_titles[0]}"'
    else:
        bullet_lines = "\n".join(f"  - {t}" for t in cleaned_titles)
        message = f"✅ Added {len(cleaned_titles)} tasks:\n" + bullet_lines
    return format_todos_as_markdown(todos, message)
@mcp.tool()
async def list_todos() -> str:
    """
    Display the complete task plan with all items and their status.
    Use this to review your overall progress, see what's done and what remains,
    and understand where you are in the execution of your plan.
    Returns:
    Markdown formatted string showing all tasks with their completion status.
    """
    # Render whatever is currently persisted, without a status message
    return format_todos_as_markdown(load_todos())
@mcp.tool()
async def complete_todo(todo_ids: List[str]) -> str:
    """
    Mark one or more tasks as completed in your plan.
    Use this after finishing a task to track your progress and maintain an
    accurate view of what's done and what's remaining.
    Args:
    todo_ids: List of task IDs to mark as completed (full ID or first 8 characters).
    For example: ["a7f3b2c1"] or ["a7f3b2c1", "b8e4c3d2"]
    Returns:
    Markdown formatted string showing the success message and updated task plan.
    """
    if not todo_ids:
        return "❌ Error: Task IDs list cannot be empty."
    # Filter out empty IDs
    id_list = [tid.strip() for tid in todo_ids if tid and tid.strip()]
    if not id_list:
        return "❌ Error: No valid task IDs provided."

    todos = load_todos()
    completed_todos = []
    not_found_ids = []
    for todo_id in id_list:
        # First task whose id starts with the given value (this also covers
        # an exact full-id match)
        match = next((t for t in todos if t["id"].startswith(todo_id)), None)
        if match is None:
            not_found_ids.append(todo_id)
        elif not match.get("completed", False):
            # Only report tasks that actually changed state
            match["completed"] = True
            completed_todos.append(match["title"])

    if not completed_todos and not_found_ids:
        return f"❌ Error: Task IDs not found: {', '.join(not_found_ids)}"

    if not completed_todos:
        # Every id matched a task that was already completed: nothing changed,
        # so there is nothing to persist and "Completed 0 tasks" would mislead.
        return format_todos_as_markdown(
            todos, "ℹ️ No changes: the specified tasks were already completed."
        )

    if not save_todos(todos):
        return "❌ Error: Failed to save changes."

    # Build success message
    if len(completed_todos) == 1:
        message = f'✅ Completed: "{completed_todos[0]}"'
    else:
        message = f"✅ Completed {len(completed_todos)} tasks:\n" + "\n".join(
            f"  - {t}" for t in completed_todos
        )
    if not_found_ids:
        message += f'\n⚠️ Not found: {", ".join(not_found_ids)}'
    return format_todos_as_markdown(todos, message)
@mcp.tool()
async def delete_todo(todo_ids: List[str]) -> str:
    """
    Remove one or more tasks from your plan.
    Use this to adjust your plan when tasks become irrelevant, duplicated,
    or no longer needed. This helps keep your plan focused and accurate.
    Args:
    todo_ids: List of task IDs to remove (full ID or first 8 characters).
    For example: ["a7f3b2c1"] or ["a7f3b2c1", "b8e4c3d2"]
    Returns:
    Markdown formatted string showing the success message and remaining task plan.
    """
    if not todo_ids:
        return "❌ Error: Task IDs list cannot be empty."
    # Filter out empty IDs
    id_list = [tid.strip() for tid in todo_ids if tid and tid.strip()]
    if not id_list:
        return "❌ Error: No valid task IDs provided."

    todos = load_todos()
    deleted_todos = []
    not_found_ids = []
    ids_to_delete = set()
    for todo_id in id_list:
        # First task whose id starts with the given value (this also covers
        # an exact full-id match)
        match = next((t for t in todos if t["id"].startswith(todo_id)), None)
        if match is None:
            not_found_ids.append(todo_id)
        elif match["id"] not in ids_to_delete:
            # A task named twice (repeated id, or prefix plus full id) is
            # deleted and reported only once
            ids_to_delete.add(match["id"])
            deleted_todos.append(match["title"])

    if not deleted_todos and not_found_ids:
        return f"❌ Error: Task IDs not found: {', '.join(not_found_ids)}"

    # Remove the tasks
    todos = [t for t in todos if t["id"] not in ids_to_delete]
    if not save_todos(todos):
        return "❌ Error: Failed to save changes."

    # Build success message
    if len(deleted_todos) == 1:
        message = f'🗑️ Deleted: "{deleted_todos[0]}"'
    else:
        message = f"🗑️ Deleted {len(deleted_todos)} tasks:\n" + "\n".join(
            f"  - {t}" for t in deleted_todos
        )
    if not_found_ids:
        message += f'\n⚠️ Not found: {", ".join(not_found_ids)}'
    return format_todos_as_markdown(todos, message)
if __name__ == "__main__":
    # Script entry point: run the MCP server over the stdio transport
    mcp.run(transport="stdio")
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/manager.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import asyncio
import functools
from typing import Any, Awaitable, Callable, Protocol, TypeVar
from mcp import ClientSession, StdioServerParameters # (already imported in config.py)
from mcp.client.sse import sse_client
from mcp.client.stdio import stdio_client
from .mcp_servers.browser_session import PlaywrightSession
# logger = logging.getLogger("miroflow_agent")
# Result type of the coroutine function being wrapped by with_timeout
R = TypeVar("R")


def with_timeout(timeout_s: float = 300.0):
    """
    Decorator factory that bounds the runtime of an *async* function.

    Each call of the decorated function is awaited through
    asyncio.wait_for() with ``timeout_s`` seconds as the limit.

    Usage:
        @with_timeout(20)
        async def create_message_foo(...): ...
    """

    def decorator(
        func: Callable[..., Awaitable[R]],
    ) -> Callable[..., Awaitable[R]]:
        async def wrapper(*args, **kwargs) -> R:
            pending = func(*args, **kwargs)
            return await asyncio.wait_for(pending, timeout=timeout_s)

        # Carry over the wrapped function's name, docstring and other metadata
        return functools.wraps(func)(wrapper)

    return decorator
class ToolManagerProtocol(Protocol):
    """Structural interface for tool managers, so alternative implementations can be used."""

    async def get_all_tool_definitions(self) -> Any:
        """Return the tool definitions of the managed servers."""

    async def execute_tool_call(
        self, *, server_name: str, tool_name: str, arguments: dict[str, Any]
    ) -> Any:
        """Execute one tool call on the named server with the given arguments."""
class ToolManager(ToolManagerProtocol):
def __init__(self, server_configs, tool_blacklist=None):
"""
Initialize ToolManager.
:param server_configs: List returned by create_server_parameters()
"""
self.server_configs = server_configs
self.server_dict = {
config["name"]: config["params"] for config in server_configs
}
self.browser_session = None
self.tool_blacklist = tool_blacklist if tool_blacklist else set()
self.task_log = None
def set_task_log(self, task_log):
"""Set the task logger for structured logging."""
self.task_log = task_log
self._log(
"info",
"ToolManager | Initialization",
f"ToolManager initialized, loaded servers: {list(self.server_dict.keys())}",
)
def _log(self, level, step_name, message, metadata=None):
"""Helper method to log using task_log if available, otherwise skip logging."""
if self.task_log:
self.task_log.log_step(level, step_name, message, metadata)
def _is_huggingface_dataset_or_space_url(self, url):
"""
Check if the URL is a Hugging Face dataset or space URL.
:param url: The URL to check
:return: True if it's a HuggingFace dataset or space URL, False otherwise
"""
if not url:
return False
return "huggingface.co/datasets" in url or "huggingface.co/spaces" in url
def _should_block_hf_scraping(self, tool_name, arguments):
"""
Check if we should block scraping of Hugging Face datasets/spaces.
:param tool_name: The name of the tool being called
:param arguments: The arguments passed to the tool
:return: True if scraping should be blocked, False otherwise
"""
return (
tool_name in ["scrape", "scrape_website"]
and arguments.get("url")
and self._is_huggingface_dataset_or_space_url(arguments["url"])
)
def get_server_params(self, server_name):
"""Get parameters for the specified server"""
return self.server_dict.get(server_name)
async def get_all_tool_definitions(self):
    """
    Connect to all configured servers and get their tool definitions.

    Stdio servers (StdioServerParameters) and SSE servers (http/https URL
    strings) are both supported. Tools listed in ``self.tool_blacklist`` as
    (server_name, tool_name) pairs are skipped for both transports.

    Returns a list suitable for passing to the Prompt generator: one
    ``{"name": ..., "tools": [...]}`` entry per server. When a server cannot be
    reached (or its parameters have an unknown type) its tool list holds a
    single ``{"error": ...}`` entry instead.
    """

    async def _list_tools(transport):
        # Open the transport, perform the MCP handshake and list the tools
        async with transport as (read, write):
            async with ClientSession(read, write, sampling_callback=None) as session:
                await session.initialize()
                tools_response = await session.list_tools()
                return tools_response.tools

    all_servers_for_prompt = []
    # Process remote server tools
    for config in self.server_configs:
        server_name = config["name"]
        server_params = config["params"]
        one_server_for_prompt = {"name": server_name, "tools": []}
        self._log(
            "info",
            "ToolManager | Get Tool Definitions",
            f"Getting tool definitions for server '{server_name}'...",
        )
        try:
            if isinstance(server_params, StdioServerParameters):
                transport = stdio_client(server_params)
            elif isinstance(server_params, str) and server_params.startswith(
                ("http://", "https://")
            ):
                # SSE endpoint
                transport = sse_client(server_params)
            else:
                self._log(
                    "error",
                    "ToolManager | Unknown Parameter Type",
                    f"Error: Unknown parameter type for server '{server_name}': {type(server_params)}",
                )
                # Caught by the handler below and reported as this server's error entry
                raise TypeError(
                    f"Unknown server params type for {server_name}: {type(server_params)}"
                )

            for tool in await _list_tools(transport):
                # The blacklist applies regardless of the transport in use
                if (server_name, tool.name) in self.tool_blacklist:
                    self._log(
                        "info",
                        "ToolManager | Tool Blacklisted",
                        f"Tool '{tool.name}' in server '{server_name}' is blacklisted, skipping.",
                    )
                    continue
                one_server_for_prompt["tools"].append(
                    {
                        "name": tool.name,
                        "description": tool.description,
                        "schema": tool.inputSchema,
                    }
                )

            self._log(
                "info",
                "ToolManager | Tool Definitions Success",
                f"Successfully obtained {len(one_server_for_prompt['tools'])} tool definitions from server '{server_name}'.",
            )
            all_servers_for_prompt.append(one_server_for_prompt)
        except Exception as e:
            self._log(
                "error",
                "ToolManager | Connection Error",
                f"Error: Unable to connect or get tools from server '{server_name}': {e}",
            )
            # Still add the server entry, with the error in place of its tools
            one_server_for_prompt["tools"] = [{"error": f"Unable to fetch tools: {e}"}]
            all_servers_for_prompt.append(one_server_for_prompt)
    return all_servers_for_prompt
@with_timeout(1200)
async def execute_tool_call(self, server_name, tool_name, arguments) -> Any:
    """
    Execute a single tool call.

    The "playwright" server is served through a persistent
    ``PlaywrightSession`` stored on ``self.browser_session``; every other
    server gets a fresh connection (stdio or SSE) for each call.

    :param server_name: Server name
    :param tool_name: Tool name
    :param arguments: Tool arguments dictionary
    :return: Dictionary with ``server_name``, ``tool_name`` and either a
        ``result`` or an ``error`` entry. Failures are reported through the
        ``error`` entry instead of being raised.
    """
    server_params = self.get_server_params(server_name)
    if not server_params:
        self._log(
            "error",
            "ToolManager | Server Not Found",
            f"Error: Attempting to call server '{server_name}' not found",
        )
        return {
            "server_name": server_name,
            "tool_name": tool_name,
            "error": f"Server '{server_name}' not found.",
        }

    self._log(
        "info",
        "ToolManager | Tool Call Start",
        f"Connecting to server '{server_name}' to call tool '{tool_name}'",
        metadata={"arguments": arguments},
    )

    if server_name == "playwright":
        try:
            if self.browser_session is None:
                # Publish the session only after connect() succeeded, so a
                # failed connection attempt is retried on the next call
                # instead of leaving a half-initialized session behind.
                browser_session = PlaywrightSession(server_params)
                await browser_session.connect()
                self.browser_session = browser_session
            tool_result = await self.browser_session.call_tool(
                tool_name, arguments=arguments
            )
            return {
                "server_name": server_name,
                "tool_name": tool_name,
                "result": tool_result,
            }
        except Exception as e:
            self._log(
                "error",
                "ToolManager | Tool Call Failed",
                f"Error: Failed to call tool '{tool_name}' (server: '{server_name}'): {e}",
            )
            return {
                "server_name": server_name,
                "tool_name": tool_name,
                "error": f"Tool call failed: {str(e)}",
            }

    try:
        # Pick the transport once; the session handling below is identical
        # for stdio and SSE servers.
        if isinstance(server_params, StdioServerParameters):
            transport = stdio_client(server_params)
        elif isinstance(server_params, str) and server_params.startswith(
            ("http://", "https://")
        ):
            transport = sse_client(server_params)
        else:
            raise TypeError(
                f"Unknown server params type for {server_name}: {type(server_params)}"
            )

        result_content = None
        async with transport as (read, write):
            async with ClientSession(
                read, write, sampling_callback=None
            ) as session:
                await session.initialize()
                try:
                    tool_result = await session.call_tool(
                        tool_name, arguments=arguments
                    )
                    # Only the text of the last content item is returned.
                    result_content = (
                        tool_result.content[-1].text
                        if tool_result.content
                        else ""
                    )
                    # post hoc check for browsing agent reading answers from hf datasets
                    if self._should_block_hf_scraping(tool_name, arguments):
                        result_content = "You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose."
                except Exception as tool_error:
                    self._log(
                        "error",
                        "ToolManager | Tool Execution Error",
                        f"Tool execution error: {tool_error}",
                    )
                    return {
                        "server_name": server_name,
                        "tool_name": tool_name,
                        "error": f"Tool execution failed: {str(tool_error)}",
                    }

        self._log(
            "info",
            "ToolManager | Tool Call Success",
            f"Tool '{tool_name}' (server: '{server_name}') called successfully.",
        )
        return {
            "server_name": server_name,
            "tool_name": tool_name,
            "result": result_content,  # Return extracted text content
        }
    except Exception as outer_e:
        self._log(
            "error",
            "ToolManager | Tool Call Failed",
            f"Error: Failed to call tool '{tool_name}' (server: '{server_name}'): {outer_e}",
        )
        # Keep the original message: it is what the caller gets back even if
        # the fallback below fails too.
        error_message = str(outer_e)

        # Guard against non-dict arguments so that the fallback check itself
        # cannot raise and hide the original failure.
        fallback_url = arguments.get("url") if isinstance(arguments, dict) else None
        if (
            tool_name in ("scrape", "scrape_website")
            and "unhandled errors" in error_message
            and fallback_url is not None
        ):
            try:
                self._log(
                    "info",
                    "ToolManager | Fallback Attempt",
                    "Attempting fallback using MarkItDown...",
                )
                import asyncio

                from markitdown import MarkItDown

                md = MarkItDown(docintel_endpoint="")
                # convert() is synchronous; run it in a worker thread so the
                # event loop is not blocked while the page is fetched.
                result = await asyncio.to_thread(md.convert, fallback_url)
                self._log(
                    "info",
                    "ToolManager | Fallback Success",
                    "MarkItDown fallback successful",
                )
                return {
                    "server_name": server_name,
                    "tool_name": tool_name,
                    "result": result.text_content,  # Return extracted text content
                }
            except Exception as inner_e:
                self._log(
                    "error",
                    "ToolManager | Fallback Failed",
                    f"Fallback also failed: {inner_e}",
                )

        # Always report the outer exception in the final error response.
        return {
            "server_name": server_name,
            "tool_name": tool_name,
            "error": f"Tool call failed: {error_message}",
        }
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/__init__.py
================================================
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/audio_mcp_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import asyncio
import base64
import contextlib
import mimetypes
import os
import tempfile
import wave
from urllib.parse import urlparse
import requests
from fastmcp import FastMCP
from mutagen import File as MutagenFile
from openai import OpenAI
# OpenAI credentials and endpoint, read from the environment at import time.
# The base URL falls back to the public OpenAI endpoint when unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")

# FastMCP server instance that the @mcp.tool() functions below register on
mcp = FastMCP("audio-mcp-server")
def _get_audio_extension(url: str, content_type: str = None) -> str:
"""
Determine the appropriate audio file extension from URL or content type.
Args:
url: The URL of the audio file
content_type: The content type from HTTP headers
Returns:
File extension (with dot) to use for temporary file
"""
# First try to get extension from URL
parsed_url = urlparse(url)
path = parsed_url.path.lower()
# Common audio extensions
audio_extensions = [".mp3", ".wav", ".m4a", ".aac", ".ogg", ".flac", ".wma"]
for ext in audio_extensions:
if path.endswith(ext):
return ext
# If no extension found in URL, try content type
if content_type:
content_type = content_type.lower()
if "mp3" in content_type or "mpeg" in content_type:
return ".mp3"
elif "wav" in content_type:
return ".wav"
elif "m4a" in content_type:
return ".m4a"
elif "aac" in content_type:
return ".aac"
elif "ogg" in content_type:
return ".ogg"
elif "flac" in content_type:
return ".flac"
# Default fallback to mp3
return ".mp3"
def _get_audio_duration(audio_path: str) -> float:
    """
    Return the length of an audio file in seconds.

    The standard-library ``wave`` reader is tried first (WAV files); if that
    fails or yields a non-positive value, mutagen is tried next. Any error in
    either probe is swallowed and 0.0 is returned when no positive duration
    could be determined.
    """
    # Probe 1: plain WAV via the standard library
    try:
        with contextlib.closing(wave.open(audio_path, "rb")) as wav_reader:
            frame_count = wav_reader.getnframes()
            frame_rate = wav_reader.getframerate()
            seconds = frame_count / float(frame_rate)
            if seconds > 0:
                return seconds
    except Exception:
        pass  # not a WAV file, or unreadable

    # Probe 2: mutagen for the remaining formats (mp3, etc)
    try:
        metadata = MutagenFile(audio_path)
        has_length = (
            metadata is not None
            and hasattr(metadata, "info")
            and hasattr(metadata.info, "length")
        )
        if has_length:
            seconds = float(metadata.info.length)
            if seconds > 0:
                return seconds
    except Exception:
        pass  # duration unavailable

    # Neither probe produced a usable value
    return 0.0
def _encode_audio_file(audio_path: str) -> tuple[str, str]:
    """
    Encode audio file to base64 and determine format.

    Args:
        audio_path: Path of the audio file to read.

    Returns:
        A ``(base64_string, file_format)`` tuple where ``file_format`` is
        either "wav" or "mp3"; "mp3" is the fallback for everything that is
        not recognised as WAV.
    """
    with open(audio_path, "rb") as audio_file:
        encoded_string = base64.b64encode(audio_file.read()).decode("utf-8")

    # Trust an explicit .wav extension first: the MIME database commonly
    # reports WAV as "audio/x-wav" (the exact value depends on the Python
    # version and system MIME tables), which used to fall through to "mp3".
    if os.path.splitext(audio_path)[1].lower() == ".wav":
        return encoded_string, "wav"

    # Otherwise determine the format from the guessed MIME type
    mime_type, _ = mimetypes.guess_type(audio_path)
    if mime_type and mime_type.startswith("audio/"):
        mime_format = mime_type.split("/")[-1]
        # Map MIME subtypes to the formats passed on to the API
        format_mapping = {
            "mpeg": "mp3",  # audio/mpeg -> mp3
            "wav": "wav",  # audio/wav -> wav
            "wave": "wav",  # audio/wave -> wav
            "x-wav": "wav",  # audio/x-wav -> wav
            "vnd.wave": "wav",  # audio/vnd.wave -> wav
            "x-pn-wav": "wav",  # audio/x-pn-wav -> wav
        }
        return encoded_string, format_mapping.get(mime_format, "mp3")

    # Default to mp3 if we can't determine
    return encoded_string, "mp3"
@mcp.tool()
async def audio_transcription(audio_path_or_url: str) -> str:
    """
    Transcribe audio file to text and return the transcription.

    Args:
        audio_path_or_url: The path of the audio file locally or its URL. Path from sandbox is not supported. YouTube URL is not supported.

    Returns:
        The transcription of the audio file.
    """
    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably published as the tool description; confirm before editing it.
    attempt_limit = 3
    transcript = None
    # One client for all attempts
    api_client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

    def _transcribe(path: str):
        # Send one file on disk to the transcription endpoint
        with open(path, "rb") as audio_stream:
            return api_client.audio.transcriptions.create(
                model="gpt-4o-transcribe", file=audio_stream
            )

    for attempt in range(1, attempt_limit + 1):
        try:
            if os.path.exists(audio_path_or_url):
                # Local file: transcribe in place
                transcript = _transcribe(audio_path_or_url)
            elif "home/user" in audio_path_or_url:
                # Sandbox paths are not reachable from this server
                return "[ERROR]: The audio_transcription tool cannot access to sandbox file, please use the local path provided by original instruction"
            else:
                # Anything else is treated as a URL and downloaded first
                download = requests.get(audio_path_or_url)
                download.raise_for_status()
                if not download.content:
                    return "[ERROR]: Audio transcription failed: Downloaded file is empty"

                mime = download.headers.get("content-type", "").lower()
                suffix = _get_audio_extension(audio_path_or_url, mime)

                # delete=False: the file is reopened by path below and
                # removed explicitly afterwards
                with tempfile.NamedTemporaryFile(
                    delete=False, suffix=suffix
                ) as scratch:
                    scratch.write(download.content)
                    scratch_path = scratch.name

                try:
                    transcript = _transcribe(scratch_path)
                finally:
                    if os.path.exists(scratch_path):
                        os.remove(scratch_path)
            break
        except requests.RequestException as download_error:
            if attempt >= attempt_limit:
                return f"[ERROR]: Audio transcription failed: Failed to download audio file - {download_error}.\nNote: Files from sandbox are not available. You should use local path given in the instruction. \nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported."
            # Back off 10s, then 20s, before the next attempt
            await asyncio.sleep(5 * (2**attempt))
        except Exception as other_error:
            if attempt >= attempt_limit:
                return f"[ERROR]: Audio transcription failed: {other_error}\nNote: Files from sandbox are not available. You should use local path given in the instruction. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported."
            await asyncio.sleep(5 * (2**attempt))

    return transcript.text
@mcp.tool()
async def audio_question_answering(audio_path_or_url: str, question: str) -> str:
    """
    Answer the question based on the given audio information.

    Args:
        audio_path_or_url: The path of the audio file locally or its URL. Path from sandbox is not supported. YouTube URL is not supported.
        question: The question to answer.

    Returns:
        The answer to the question, and the duration of the audio file.
    """
    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably published as the tool description; confirm before editing it.
    max_retries = 3
    retry = 0

    # Create client once outside the retry loop
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

    # The prompt does not change between attempts, so build it once
    text_prompt = f"Answer the following question based on the given audio information:\n\n{question}"

    # Initialize variables to avoid scope issues
    encoded_string = None
    file_format = None
    duration = 0.0
    completion = None

    while retry < max_retries:
        try:
            if os.path.exists(audio_path_or_url):  # Check if the file exists locally
                encoded_string, file_format = _encode_audio_file(audio_path_or_url)
                duration = _get_audio_duration(audio_path_or_url)
            elif "home/user" in audio_path_or_url:
                return "[ERROR]: The audio_question_answering tool cannot access to sandbox file, please use the local path provided by original instruction"
            else:
                # download the audio file from the URL
                download = requests.get(
                    audio_path_or_url,
                    headers={
                        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                    },
                )
                download.raise_for_status()  # Raise an exception for bad status codes

                # Basic content validation - check if response has content
                if not download.content:
                    return "[ERROR]: Audio question answering failed: Downloaded file is empty.\nNote: Files from sandbox are not available. You should use local path given in the instruction. \nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3.\nNote: YouTube video URL is not supported."

                content_type = download.headers.get("content-type", "").lower()
                # Get proper extension for the temporary file
                file_extension = _get_audio_extension(audio_path_or_url, content_type)

                # delete=False: the file is reopened by path below and
                # removed explicitly afterwards
                with tempfile.NamedTemporaryFile(
                    delete=False, suffix=file_extension
                ) as temp_file:
                    temp_file.write(download.content)
                    temp_audio_path = temp_file.name

                try:
                    encoded_string, file_format = _encode_audio_file(temp_audio_path)
                    duration = _get_audio_duration(temp_audio_path)
                finally:
                    # Clean up the temp file
                    if os.path.exists(temp_audio_path):
                        os.remove(temp_audio_path)

            if encoded_string is None or file_format is None:
                return "[ERROR]: Audio question answering failed: Failed to encode audio file.\nNote: Files from sandbox are not available. You should use local path given in the instruction. \nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3.\nNote: YouTube video URL is not supported."

            completion = client.chat.completions.create(
                model="gpt-4o-audio-preview",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant specializing in audio analysis.",
                    },
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": text_prompt},
                            {
                                "type": "input_audio",
                                "input_audio": {
                                    "data": encoded_string,
                                    "format": file_format,
                                },
                            },
                        ],
                    },
                ],
            )
            # If we reach here, the API call was successful
            break
        except requests.RequestException as e:
            retry += 1
            if retry >= max_retries:
                return f"[ERROR]: Audio question answering failed: Failed to download audio file - {e}.\nNote: Files from sandbox are not available. You should use local path given in the instruction. \nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported."
            await asyncio.sleep(5 * (2**retry))
        except Exception as e:
            retry += 1
            if retry >= max_retries:
                return f"[ERROR]: Audio question answering failed when calling OpenAI API: {e}\nNote: Files from sandbox are not available. You should use local path given in the instruction. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported."
            await asyncio.sleep(5 * (2**retry))

    # The message content may be missing; fall back to an empty answer so the
    # duration line can still be appended instead of raising outside the
    # retry handling.
    choices = completion.choices
    answer = choices[0].message.content if choices else None
    response_text = answer or ""
    response_text += f"\n\nAudio duration: {duration} seconds"

    return response_text
# Start the MCP server on the stdio transport when run as a script.
if __name__ == "__main__":
    mcp.run(transport="stdio")
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/audio_mcp_server_os.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import asyncio
import base64
import contextlib
import mimetypes
import os
import tempfile
import wave
from urllib.parse import urlparse
import requests
from fastmcp import FastMCP
from mutagen import File as MutagenFile
from openai import OpenAI
# Connection settings for the transcription endpoint, read from the
# environment at import time. Each one is None when its variable is unset.
WHISPER_API_KEY = os.getenv("WHISPER_API_KEY")
WHISPER_BASE_URL = os.getenv("WHISPER_BASE_URL")
WHISPER_MODEL_NAME = os.getenv("WHISPER_MODEL_NAME")

# FastMCP server instance that the @mcp.tool() functions below register on
mcp = FastMCP("audio-mcp-server-os")
def _get_audio_extension(url: str, content_type: str = None) -> str:
"""
Determine the appropriate audio file extension from URL or content type.
Args:
url: The URL of the audio file
content_type: The content type from HTTP headers
Returns:
File extension (with dot) to use for temporary file
"""
# First try to get extension from URL
parsed_url = urlparse(url)
path = parsed_url.path.lower()
# Common audio extensions
audio_extensions = [".mp3", ".wav", ".m4a", ".aac", ".ogg", ".flac", ".wma"]
for ext in audio_extensions:
if path.endswith(ext):
return ext
# If no extension found in URL, try content type
if content_type:
content_type = content_type.lower()
if "mp3" in content_type or "mpeg" in content_type:
return ".mp3"
elif "wav" in content_type:
return ".wav"
elif "m4a" in content_type:
return ".m4a"
elif "aac" in content_type:
return ".aac"
elif "ogg" in content_type:
return ".ogg"
elif "flac" in content_type:
return ".flac"
# Default fallback to mp3
return ".mp3"
def _get_audio_duration(audio_path: str) -> float:
    """
    Get audio duration in seconds.

    Tries to use wave (for .wav), then falls back to mutagen (for mp3, etc).
    Always returns a float: 0.0 when the duration cannot be determined,
    including when either probe raises.
    """
    # Try using wave for .wav files
    try:
        with contextlib.closing(wave.open(audio_path, "rb")) as f:
            frames = f.getnframes()
            rate = f.getframerate()
            duration = frames / float(rate)
            if duration > 0:
                return duration
    except Exception:
        pass  # Not a wav file or failed

    # Try using mutagen for other audio formats (mp3, etc)
    try:
        audio = MutagenFile(audio_path)
        if (
            audio is not None
            and hasattr(audio, "info")
            and hasattr(audio.info, "length")
        ):
            duration = float(audio.info.length)
            if duration > 0:
                return duration
    except Exception:
        # Previously an error string was returned here, which broke the
        # declared float return type; report "unknown" as 0.0 instead.
        pass

    # Return 0.0 if all methods failed (previously fell through to None)
    return 0.0
def _encode_audio_file(audio_path: str) -> tuple[str, str]:
    """
    Encode audio file to base64 and determine format.

    Args:
        audio_path: Path of the audio file to read.

    Returns:
        A ``(base64_string, file_format)`` tuple where ``file_format`` is
        either "wav" or "mp3"; "mp3" is the fallback for everything that is
        not recognised as WAV.
    """
    with open(audio_path, "rb") as audio_file:
        encoded_string = base64.b64encode(audio_file.read()).decode("utf-8")

    # Trust an explicit .wav extension first: the MIME database commonly
    # reports WAV as "audio/x-wav" (the exact value depends on the Python
    # version and system MIME tables), which used to fall through to "mp3".
    if os.path.splitext(audio_path)[1].lower() == ".wav":
        return encoded_string, "wav"

    # Otherwise determine the format from the guessed MIME type
    mime_type, _ = mimetypes.guess_type(audio_path)
    if mime_type and mime_type.startswith("audio/"):
        mime_format = mime_type.split("/")[-1]
        # Map MIME subtypes to the formats passed on to the API
        format_mapping = {
            "mpeg": "mp3",  # audio/mpeg -> mp3
            "wav": "wav",  # audio/wav -> wav
            "wave": "wav",  # audio/wave -> wav
            "x-wav": "wav",  # audio/x-wav -> wav
            "vnd.wave": "wav",  # audio/vnd.wave -> wav
            "x-pn-wav": "wav",  # audio/x-pn-wav -> wav
        }
        return encoded_string, format_mapping.get(mime_format, "mp3")

    # Default to mp3 if we can't determine
    return encoded_string, "mp3"
@mcp.tool()
async def audio_transcription(audio_path_or_url: str) -> str:
    """
    Transcribe audio file to text and return the transcription.

    Args:
        audio_path_or_url: The path of the audio file locally or its URL. Path from sandbox is not supported. YouTube URL is not supported.

    Returns:
        The transcription of the audio file.
    """
    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably published as the tool description; confirm before editing it.
    attempt_limit = 3
    transcript = None
    accepted_media = ["audio", "video", "application/octet-stream"]

    for attempt in range(1, attempt_limit + 1):
        try:
            # The client is built inside the try block, so a failure here is
            # retried like any other error.
            api_client = OpenAI(base_url=WHISPER_BASE_URL, api_key=WHISPER_API_KEY)

            if os.path.exists(audio_path_or_url):
                # Local file: transcribe in place
                with open(audio_path_or_url, "rb") as audio_stream:
                    transcript = api_client.audio.transcriptions.create(
                        model=WHISPER_MODEL_NAME, file=audio_stream
                    )
            elif "home/user" in audio_path_or_url:
                # Sandbox paths are not reachable from this server
                return "[ERROR]: The audio_transcription tool cannot access to sandbox file, please use the local path provided by original instruction"
            else:
                # Anything else is treated as a URL and downloaded first
                download = requests.get(audio_path_or_url)
                download.raise_for_status()
                if not download.content:
                    return "[ERROR]: Audio transcription failed: Downloaded file is empty"

                # Reject responses that declare a non-media content type;
                # a missing header is accepted.
                content_type = download.headers.get("content-type", "").lower()
                if content_type:
                    looks_like_media = any(
                        media_type in content_type for media_type in accepted_media
                    )
                    if not looks_like_media:
                        return f"[ERROR]: Audio transcription failed: Invalid content type '{content_type}'. Expected audio file."

                suffix = _get_audio_extension(audio_path_or_url, content_type)

                # delete=False: the file is reopened by path below and
                # removed explicitly afterwards
                with tempfile.NamedTemporaryFile(
                    delete=False, suffix=suffix
                ) as scratch:
                    scratch.write(download.content)
                    scratch_path = scratch.name

                try:
                    with open(scratch_path, "rb") as audio_stream:
                        transcript = api_client.audio.transcriptions.create(
                            model=WHISPER_MODEL_NAME, file=audio_stream
                        )
                finally:
                    if os.path.exists(scratch_path):
                        os.remove(scratch_path)
            break
        except requests.RequestException as download_error:
            if attempt >= attempt_limit:
                return f"[ERROR]: Audio transcription failed: Failed to download audio file - {download_error}.\nNote: Files from sandbox are not available. You should use local path given in the instruction. \nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported."
            # Back off 10s, then 20s, before the next attempt
            await asyncio.sleep(5 * (2**attempt))
        except Exception as other_error:
            if attempt >= attempt_limit:
                return f"[ERROR]: Audio transcription failed: {other_error}\nNote: Files from sandbox are not available. You should use local path given in the instruction. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported."
            await asyncio.sleep(5 * (2**attempt))

    return transcript.text
# Start the MCP server on the stdio transport when run as a script.
if __name__ == "__main__":
    mcp.run(transport="stdio")
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/browser_session.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import asyncio
import json
import logging
from mcp import StdioServerParameters
from mcp.client.session import ClientSession
from mcp.client.sse import sse_client
from mcp.client.stdio import stdio_client
# Module-level logger obtained under the "miroflow" logger name.
logger = logging.getLogger("miroflow")
class PlaywrightSession:
    """Class to maintain a persistent Playwright MCP session.

    The transport (stdio or SSE) and the ``ClientSession`` are entered once
    in :meth:`connect` and kept open across :meth:`call_tool` invocations
    until :meth:`close` is called.
    """

    def __init__(self, server_params):
        # StdioServerParameters selects the stdio transport; any other value
        # is handed to sse_client.
        self.server_params = server_params
        self.read = None
        self.write = None
        self.session = None
        self._client = None

    @staticmethod
    async def _shutdown(session, client):
        """Exit the session first, then the transport, even if the first step raises."""
        try:
            if session is not None:
                await session.__aexit__(None, None, None)
        finally:
            if client is not None:
                await client.__aexit__(None, None, None)

    async def connect(self):
        """Connect to the MCP server and initialize the session.

        Does nothing when a session is already open. If any step fails, the
        partially opened transport/session is torn down again and the
        original exception is re-raised, so a later call starts from scratch.
        """
        if self.session is not None:
            return

        if isinstance(self.server_params, StdioServerParameters):
            client = stdio_client(self.server_params)
        else:
            client = sse_client(self.server_params)

        read, write = await client.__aenter__()
        session = ClientSession(read, write, sampling_callback=None)
        session_entered = False
        try:
            await session.__aenter__()
            session_entered = True
            await session.initialize()
        except BaseException:
            # Roll back whatever was opened; never let a cleanup problem
            # hide the error that caused the failure.
            try:
                await self._shutdown(session if session_entered else None, client)
            except Exception as cleanup_error:
                logger.warning(
                    f"Failed to clean up after unsuccessful MCP connect: {cleanup_error}"
                )
            raise

        # Publish the state only once the session is fully initialized.
        self._client = client
        self.read, self.write = read, write
        self.session = session
        logger.info("Connected to MCP server and initialized session")

    async def call_tool(self, tool_name, arguments=None):
        """Call a tool while maintaining the session.

        Connects first if needed and returns the text of the first content
        item of the result, or "" when the result has no content.
        """
        if self.session is None:
            await self.connect()

        logger.info(f"Calling tool '{tool_name}'")
        tool_result = await self.session.call_tool(tool_name, arguments=arguments)
        result_content = tool_result.content[0].text if tool_result.content else ""
        return result_content

    async def close(self):
        """Close the session and connection."""
        session, client = self.session, self._client
        # Reset the state first so the object can reconnect even if the
        # teardown below raises.
        self.session = None
        self._client = None
        self.read = None
        self.write = None

        await self._shutdown(session, client)
        if client is not None:
            logger.info("Closed MCP session")
# Example usage:
async def test_persistent_session():
    """Demonstrate two tool calls over one persistent session.

    Navigates to https://example.com, takes a page snapshot, parses it as
    JSON and writes it to ``snapshot.json`` in the current directory. The
    session is closed even if one of the steps fails.
    """
    # Create a persistent session
    mcp_session = PlaywrightSession("http://localhost:8931")

    try:
        # First call: Navigate to a website
        await mcp_session.call_tool("browser_navigate", {"url": "https://example.com"})
        logger.info("Navigation complete")

        # Wait a moment for the page to load
        await asyncio.sleep(2)

        # Second call: Take a snapshot of the current page
        snapshot_result = await mcp_session.call_tool("browser_snapshot", {})

        # Process and save the snapshot
        snapshot_json = json.loads(snapshot_result)
        logger.info(f"Snapshot taken of page: {snapshot_json.get('url')}")
        logger.info(f"Page title: {snapshot_json.get('title')}")

        # ensure_ascii=False writes non-ASCII characters as-is, so the file
        # must be opened as UTF-8 rather than with the locale default.
        with open("snapshot.json", "w", encoding="utf-8") as f:
            json.dump(snapshot_json, f, indent=2, ensure_ascii=False)
        logger.info("Snapshot saved to snapshot.json")
    finally:
        # Close the session when done with all tool calls
        await mcp_session.close()
# Run the example coroutine when this module is executed as a script.
if __name__ == "__main__":
    asyncio.run(test_persistent_session())
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/python_mcp_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import asyncio
import os
import shlex
from urllib.parse import urlparse
from e2b_code_interpreter import Sandbox
from fastmcp import FastMCP
# Initialize FastMCP server
# The @mcp.tool() functions below register on this instance.
mcp = FastMCP("e2b-python-interpreter")
# E2B credentials, read from the environment at import time (None if unset)
E2B_API_KEY = os.getenv("E2B_API_KEY")

# Directory where benchmark logs are stored
LOGS_DIR = os.getenv("LOGS_DIR", "../../logs")

# Sandbox template used for every sandbox created by this server
DEFAULT_TEMPLATE_ID = "1av7fdjfvcparqo8efq6"

# Sandbox lifetime in seconds; also the upper bound for requested timeouts
DEFAULT_TIMEOUT = 600

# Longest tool result, in characters, returned before truncation
MAX_RESULT_LEN = 20_000

# Longest error detail, in characters, included in an error message
MAX_ERROR_LEN = 4_000

# Placeholder sandbox IDs that are rejected instead of being connected to
INVALID_SANDBOX_IDS = {
    "default",
    "sandbox1",
    "sandbox",
    "some_id",
    "new_sandbox",
    "python",
    "create_sandbox",
    "sandbox123",
    "temp",
    "sandbox-0",
    "sandbox-1",
    "sandbox_0",
    "sandbox_1",
    "new",
    "0",
    "auto",
    "default_sandbox",
    "none",
    "sandbox_12345",
    "dummy",
    "sandbox_01",
}
def looks_like_dir(path: str) -> bool:
    """
    Tell whether ``path`` should be treated as a directory.

    True when the path is an existing directory, when it ends with the OS
    path separator, or when its last component has no file extension.
    """
    _, extension = os.path.splitext(path)
    # An existing directory wins; otherwise judge by the shape of the path
    return os.path.isdir(path) or path.endswith(os.path.sep) or not extension
def truncate_result(result: str) -> str:
    """
    Cap a result string at MAX_RESULT_LEN characters.

    Args:
        result: The full result string to potentially truncate

    Returns:
        ``result`` unchanged when it fits; otherwise its first MAX_RESULT_LEN
        characters followed by a truncation notice.
    """
    if len(result) <= MAX_RESULT_LEN:
        return result
    return result[:MAX_RESULT_LEN] + " [Result truncated due to length limit]"
@mcp.tool()
async def create_sandbox(timeout: int = DEFAULT_TIMEOUT) -> str:
    """Create a linux sandbox.

    Args:
        timeout: Time in seconds before the sandbox is automatically shutdown. The default is 600 seconds.

    Returns:
        The sandbox_id of the newly created sandbox. You should use this sandbox_id to run other tools in the sandbox.
    """
    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably published as the tool description; confirm before editing it.
    max_retries = 5
    # Requests above the default lifetime are capped
    timeout = min(timeout, DEFAULT_TIMEOUT)

    for attempt in range(1, max_retries + 1):
        sandbox = None
        try:
            sandbox = Sandbox(
                template=DEFAULT_TEMPLATE_ID,
                timeout=timeout,
                api_key=E2B_API_KEY,
            )
            info = sandbox.get_info()

            tmpfiles_dir = os.path.join(LOGS_DIR, "tmpfiles")
            os.makedirs(tmpfiles_dir, exist_ok=True)

            message = f"Sandbox created with sandbox_id: {info.sandbox_id}"
        except Exception as e:
            # The sandbox id was never handed out, so a sandbox created
            # before the failure would be unreachable; kill it rather than
            # leaking one per retry.
            if sandbox is not None:
                try:
                    sandbox.kill()
                except Exception:
                    pass  # Best effort; the sandbox still expires on its own
            if attempt == max_retries:
                error_details = str(e)[:MAX_ERROR_LEN]
                return f"[ERROR]: Failed to create sandbox after {max_retries} attempts: {error_details}, please retry later."
            await asyncio.sleep(attempt**2)  # Quadratic backoff: 1s, 4s, 9s, 16s
            continue

        # Refresh the timeout right before returning so the sandbox's
        # lifetime is counted from now.
        try:
            sandbox.set_timeout(timeout)
        except Exception:
            pass  # Ignore timeout setting errors
        return message
@mcp.tool()
async def run_command(command: str, sandbox_id: str) -> str:
    """Execute a lightweight shell command in the linux sandbox (no long-running, blocking, or resource-heavy processes).

    Args:
        command: The command to execute.
        sandbox_id: The id of the sandbox to execute the command in. To create a new sandbox, use tool `create_sandbox`.

    Returns:
        A CommandResult object containing the result of the command execution, format like CommandResult(stderr=..., stdout=..., exit_code=..., error=...)
    """
    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably published as the tool description; confirm before editing it.
    # Reject well-known placeholder ids before talking to the API
    if sandbox_id in INVALID_SANDBOX_IDS:
        return f"[ERROR]: '{sandbox_id}' is not a valid sandbox_id. Please create a real sandbox first using the create_sandbox tool."

    try:
        sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY)
    except Exception:
        return f"[ERROR]: Failed to connect to sandbox {sandbox_id}. Make sure the sandbox is created and the sandbox_id is correct."

    def _extend_lifetime() -> None:
        # Best-effort timeout refresh; failures are deliberately ignored
        try:
            sandbox.set_timeout(DEFAULT_TIMEOUT)
        except Exception:
            pass

    attempts_allowed = 3
    for attempt_no in range(1, attempts_allowed + 1):
        try:
            # Refresh the timeout for each command execution
            sandbox.set_timeout(DEFAULT_TIMEOUT)
            outcome = sandbox.commands.run(command)
            return truncate_result(str(outcome))
        except Exception as exc:
            if attempt_no == attempts_allowed:
                detail = str(exc)[:MAX_ERROR_LEN]
                return f"[ERROR]: Failed to run command after {attempts_allowed} attempts.\n\nException type: {type(exc).__name__}\nDetails: {detail}"
            await asyncio.sleep(attempt_no**2)  # Wait 1s, then 4s
        finally:
            # Runs on every path out of the attempt, including the returns
            _extend_lifetime()
@mcp.tool()
async def run_python_code(code_block: str, sandbox_id: str) -> str:
    """Run short, safe python code in a sandbox and return the execution result (avoid long loops or heavy tasks; must finish quickly).

    Args:
        code_block: The python code to run.
        sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. To create a new sandbox, use tool `create_sandbox`.

    Returns:
        A CommandResult object containing the result of the command execution, format like CommandResult(stderr=..., stdout=..., exit_code=..., error=...)
    """
    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably published as the tool description; confirm before editing it.
    # Missing or placeholder id: run once in a throwaway sandbox that is
    # killed as soon as the code has finished.
    stateless = (not sandbox_id) or (sandbox_id in INVALID_SANDBOX_IDS)
    if stateless:
        try:
            throwaway = Sandbox(
                template=DEFAULT_TEMPLATE_ID,
                timeout=DEFAULT_TIMEOUT,
                api_key=E2B_API_KEY,
            )
            try:
                return truncate_result(str(throwaway.run_code(code_block)))
            finally:
                throwaway.kill()
        except Exception as exc:
            detail = str(exc)[:MAX_ERROR_LEN]
            return f"[ERROR]: Failed to run code in stateless mode. Exception type: {type(exc).__name__}, Details: {detail}"

    try:
        sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY)
    except Exception:
        return f"[ERROR]: Failed to connect to sandbox {sandbox_id}. Make sure the sandbox is created and the sandbox_id is correct."

    def _extend_lifetime() -> None:
        # Best-effort timeout refresh; failures are deliberately ignored
        try:
            sandbox.set_timeout(DEFAULT_TIMEOUT)
        except Exception:
            pass

    attempts_allowed = 3
    for attempt_no in range(1, attempts_allowed + 1):
        try:
            # Refresh the timeout for each execution
            sandbox.set_timeout(DEFAULT_TIMEOUT)
            outcome = sandbox.run_code(code_block)
            return truncate_result(str(outcome))
        except Exception as exc:
            if attempt_no == attempts_allowed:
                detail = str(exc)[:MAX_ERROR_LEN]
                return f"[ERROR]: Failed to run code in sandbox {sandbox_id} after {attempts_allowed} attempts. Exception type: {type(exc).__name__}, Details: {detail}"
            await asyncio.sleep(attempt_no**2)  # Wait 1s, then 4s
        finally:
            # Runs on every path out of the attempt, including the returns
            _extend_lifetime()
@mcp.tool()
async def upload_file_from_local_to_sandbox(
    sandbox_id: str, local_file_path: str, sandbox_file_path: str = "/home/user"
) -> str:
    """Upload a local file to the `/home/user` dir of the remote python interpreter.
    Args:
        sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. To create a new sandbox, use tool `create_sandbox`.
        local_file_path: The path of the file on local machine to upload.
        sandbox_file_path: The path of directory to upload the file to in the sandbox. Default is `/home/user/`.
    Returns:
        The path of the uploaded file in the remote python interpreter if the upload is successful.
    """
    # Remote paths are handed to `mkdir -p` inside the sandbox and default to
    # `/home/user`, so they are built with posixpath instead of the host's
    # os.path (which would produce backslashes on a Windows host).
    import posixpath

    # Reject empty ids as well as known placeholder ids before any network call.
    if not sandbox_id or sandbox_id in INVALID_SANDBOX_IDS:
        return f"[ERROR]: '{sandbox_id}' is not a valid sandbox_id. Please create a real sandbox first using the create_sandbox tool."

    try:
        sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY)
    except Exception:
        return f"[ERROR]: Failed to connect to sandbox {sandbox_id}. Make sure the sandbox is created and the sandbox_id is correct."

    try:
        # Refresh the timeout for each command execution.
        sandbox.set_timeout(DEFAULT_TIMEOUT)

        # The local path is a host path, so host os.path semantics apply here.
        if not os.path.exists(local_file_path):
            return f"[ERROR]: Local file does not exist: {local_file_path}"
        if not os.path.isfile(local_file_path):
            return f"[ERROR]: Path is not a file: {local_file_path}"

        # Destination = <sandbox dir>/<local file name>, normalized.
        uploaded_file_path = posixpath.normpath(
            posixpath.join(sandbox_file_path, os.path.basename(local_file_path))
        )

        # Ensure the parent directory exists in the sandbox.
        parent_dir = posixpath.dirname(uploaded_file_path)
        if parent_dir and parent_dir != "/":
            mkdir_result = sandbox.commands.run(f"mkdir -p {shlex.quote(parent_dir)}")
            if mkdir_result.exit_code != 0:
                mkdir_result_str = str(mkdir_result)[:MAX_ERROR_LEN]
                return f"[ERROR]: Failed to create directory {parent_dir} in sandbox {sandbox_id}: {mkdir_result_str}"

        # Upload the file.
        with open(local_file_path, "rb") as f:
            sandbox.files.write(uploaded_file_path, f)
        return f"File uploaded to {uploaded_file_path}"
    except Exception as e:
        error_details = str(e)[:MAX_ERROR_LEN]
        return f"[ERROR]: Failed to upload file {local_file_path} to sandbox {sandbox_id}: {error_details}"
    finally:
        # Set timeout before exit to prevent timeout after function exits.
        try:
            sandbox.set_timeout(DEFAULT_TIMEOUT)
        except Exception:
            pass  # Best effort: ignore timeout setting errors
@mcp.tool()
async def download_file_from_internet_to_sandbox(
    sandbox_id: str, url: str, sandbox_file_path: str = "/home/user"
) -> str:
    """Download a file from the internet to the `/home/user` dir of the sandbox (avoid large or slow URLs).
    Args:
        sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. To create a new sandbox, use tool `create_sandbox`.
        url: The URL of the file to download.
        sandbox_file_path: The path of directory to download the file to in the sandbox. Default is `/home/user/`.
    Returns:
        The path of the downloaded file in the sandbox if the download is successful.
    """
    # Remote paths are handed to `mkdir -p`/`wget` inside the sandbox and
    # default to `/home/user`, so they are built with posixpath, not os.path.
    import posixpath

    # Reject empty ids as well as known placeholder ids before any network call.
    if not sandbox_id or sandbox_id in INVALID_SANDBOX_IDS:
        return f"[ERROR]: '{sandbox_id}' is not a valid sandbox_id. Please create a real sandbox first using the create_sandbox tool."

    try:
        sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY)
    except Exception:
        return f"[ERROR]: Failed to connect to sandbox {sandbox_id}. Make sure the sandbox is created and the sandbox_id is correct."

    try:
        # Refresh the timeout for each command execution.
        sandbox.set_timeout(DEFAULT_TIMEOUT)

        # urlparse already separates the query string and fragment from the
        # path, so the path's basename needs no further "?" / "#" stripping.
        basename = posixpath.basename(urlparse(url).path) or "downloaded_file"

        if looks_like_dir(sandbox_file_path):
            # It's a directory - join with the filename.
            downloaded_file_path = posixpath.join(sandbox_file_path, basename)
        else:
            # It's a file path - use it directly.
            downloaded_file_path = sandbox_file_path
        downloaded_file_path = posixpath.normpath(downloaded_file_path)

        # Ensure the parent directory exists in the sandbox.
        parent_dir = posixpath.dirname(downloaded_file_path)
        if parent_dir and parent_dir != "/":
            mkdir_result = sandbox.commands.run(f"mkdir -p {shlex.quote(parent_dir)}")
            if mkdir_result.exit_code != 0:
                mkdir_result_str = str(mkdir_result)[:MAX_ERROR_LEN]
                return f"[ERROR]: Failed to create directory {parent_dir} in sandbox {sandbox_id}: {mkdir_result_str}"

        # The command does not change between attempts, so build it once.
        safe_url = shlex.quote(url)
        safe_path = shlex.quote(downloaded_file_path)
        cmd = f"wget {safe_url} -O {safe_path}"

        # Download the file with retry logic (waits 4s, then 16s).
        max_retries = 3
        for attempt in range(1, max_retries + 1):
            try:
                result = sandbox.commands.run(cmd)
                if result.exit_code == 0:
                    # Report the real path, not its shell-quoted form.
                    return f"File downloaded to {downloaded_file_path}"
                elif attempt < max_retries:
                    await asyncio.sleep(4**attempt)
                    continue  # Retry
                else:
                    # Extract detailed error information.
                    error_details = ""
                    if hasattr(result, "stderr") and result.stderr:
                        error_details = f"stderr: {result.stderr}"[:MAX_ERROR_LEN]
                    error_msg = (
                        f"[ERROR]: Failed to download file from {url} to {downloaded_file_path} after {max_retries} attempts.\n\n"
                        f"exit_code: {result.exit_code}\n\n"
                        f"Details: {error_details}"
                    )
                    return error_msg
            except Exception as e:
                if attempt == max_retries:
                    error_details = str(e)[:MAX_ERROR_LEN]
                    error_msg = f"[ERROR]: Failed to download file from {url} to {downloaded_file_path}. Exception: {error_details}"
                    return error_msg
                await asyncio.sleep(4**attempt)
    except Exception as e:
        error_details = str(e)[:MAX_ERROR_LEN]
        return f"[ERROR]: Failed to download file from {url}: {error_details}"
    finally:
        # Set timeout before exit to prevent timeout after function exits.
        try:
            sandbox.set_timeout(DEFAULT_TIMEOUT)
        except Exception:
            pass  # Best effort: ignore timeout setting errors
@mcp.tool()
async def download_file_from_sandbox_to_local(
    sandbox_id: str, sandbox_file_path: str, local_filename: str = None
) -> str:
    """Download a file from the sandbox to local system. Files in sandbox cannot be processed by tools from other servers - only local files and internet URLs can be processed by them.
    Args:
        sandbox_id: The id of the sandbox to download the file from. To have a sandbox, use tool `create_sandbox`.
        sandbox_file_path: The path of the file to download on the sandbox.
        local_filename: Optional filename to save as. If not provided, uses the original filename from sandbox_file_path.
    Returns:
        The local path of the downloaded file if successful, otherwise error message.
    """
    # Reject empty ids as well as known placeholder ids before any network call.
    if not sandbox_id or sandbox_id in INVALID_SANDBOX_IDS:
        return f"[ERROR]: '{sandbox_id}' is not a valid sandbox_id. Please create a real sandbox first using the create_sandbox tool."

    try:
        sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY)
    except Exception:
        return f"[ERROR]: Failed to connect to sandbox {sandbox_id}. Make sure the sandbox is created and the sandbox_id is correct."

    try:
        # Refresh the timeout for each command execution.
        sandbox.set_timeout(DEFAULT_TIMEOUT)

        # Downloads land in <LOGS_DIR>/tmpfiles; create it if it doesn't exist.
        if not LOGS_DIR:
            return "[ERROR]: LOGS_DIR environment variable is not set. Cannot determine where to save the file."
        tmpfiles_dir = os.path.join(LOGS_DIR, "tmpfiles")
        os.makedirs(tmpfiles_dir, exist_ok=True)

        quoted_remote = shlex.quote(sandbox_file_path)

        # Check if the path is a directory (before attempting to read).
        check_result = sandbox.commands.run(
            f'test -d {quoted_remote} && echo "is_directory" || echo "not_directory"'
        )
        if check_result.stdout and "is_directory" in check_result.stdout:
            return f"[ERROR]: Cannot download '{sandbox_file_path}' from sandbox {sandbox_id}: path is a directory, not a file."

        # Check if the file exists.
        check_file_result = sandbox.commands.run(
            f'test -f {quoted_remote} && echo "exists" || echo "not_exists"'
        )
        if check_file_result.stdout and "not_exists" in check_file_result.stdout:
            # Not a regular file: only give up if nothing exists at the path
            # at all (it might be a symlink or other type).
            check_any_result = sandbox.commands.run(
                f'test -e {quoted_remote} && echo "exists" || echo "not_exists"'
            )
            if check_any_result.stdout and "not_exists" in check_any_result.stdout:
                return f"[ERROR]: Cannot download '{sandbox_file_path}' from sandbox {sandbox_id}: file does not exist."

        # Determine the local filename; fall back to the sandbox file's name.
        if local_filename is None or local_filename.strip() == "":
            local_filename = os.path.basename(sandbox_file_path)
        # Keep only the final path component so the download always lands
        # directly inside tmpfiles_dir, and default when nothing is left.
        local_filename = os.path.basename(local_filename) or "downloaded_file"
        local_file_path = os.path.join(
            tmpfiles_dir, f"sandbox_{sandbox_id}_{local_filename}"
        )

        # Read from the sandbox BEFORE opening the local file, so a failed
        # read does not leave an empty file behind on disk.
        try:
            content = sandbox.files.read(sandbox_file_path, format="bytes")
        except Exception as read_error:
            if "directory" in str(read_error).lower():
                return f"[ERROR]: Cannot download '{sandbox_file_path}' from sandbox {sandbox_id}: path is a directory, not a file."
            read_error_details = str(read_error)[:MAX_ERROR_LEN]
            return f"[ERROR]: Failed to read file '{sandbox_file_path}' from sandbox {sandbox_id}: {read_error_details}"

        with open(local_file_path, "wb") as f:
            f.write(content)
        return f"File downloaded successfully to: {local_file_path}"
    except Exception as e:
        error_details = str(e)[:MAX_ERROR_LEN]
        return f"[ERROR]: Failed to download file '{sandbox_file_path}' from sandbox {sandbox_id}: {error_details}"
    finally:
        # Set timeout before exit to prevent timeout after function exits.
        try:
            sandbox.set_timeout(DEFAULT_TIMEOUT)
        except Exception:
            pass  # Best effort: ignore timeout setting errors
# Script entry point: when executed directly, serve the registered tools over stdio.
if __name__ == "__main__":
    mcp.run(transport="stdio")
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/reading_mcp_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import argparse
import logging
import sys
from fastmcp import FastMCP
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
# Logger named "miroflow"; used below to record tool and session errors.
logger = logging.getLogger("miroflow")
# Initialize FastMCP server
# Tools are registered on this instance through the @mcp.tool() decorator.
mcp = FastMCP("reading-mcp-server")
@mcp.tool()
async def convert_to_markdown(uri: str) -> str:
    """Convert various types of resources (doc, ppt, pdf, excel, csv, zip file etc.)
    described by an file: or data: URI to markdown.
    Args:
        uri: Required. The URI of the resource to convert. Need to start with 'file:' or 'data:' schemes.
    Returns:
        str: The converted markdown content, or an error message if conversion fails.
    """
    if not uri or not uri.strip():
        return "Error: URI parameter is required and cannot be empty."
    # Normalize once: a URI with surrounding whitespace passes the emptiness
    # check above but would otherwise be rejected by the scheme check below.
    uri = uri.strip()

    # Validate URI scheme.
    # NOTE(review): the docstring only advertises file:/data:, while http(s):
    # is accepted here as well - confirm which of the two is intended.
    valid_schemes = ["http:", "https:", "file:", "data:"]
    if not uri.lower().startswith(tuple(valid_schemes)):
        return f"Error: Invalid URI scheme. Supported schemes are: {', '.join(valid_schemes)}"

    tool_name = "convert_to_markdown"
    arguments = {"uri": uri}
    # The conversion is delegated to a `markitdown_mcp` subprocess started
    # with the current interpreter and reached over stdio.
    server_params = StdioServerParameters(
        command=sys.executable,
        args=["-m", "markitdown_mcp"],
    )
    result_content = ""
    try:
        async with stdio_client(server_params) as (read, write):
            async with ClientSession(read, write, sampling_callback=None) as session:
                await session.initialize()
                try:
                    tool_result = await session.call_tool(
                        tool_name, arguments=arguments
                    )
                    # Use the text of the last content item; empty when the
                    # tool returned no content at all.
                    result_content = (
                        tool_result.content[-1].text if tool_result.content else ""
                    )
                except Exception as tool_error:
                    logger.info(f"Tool execution error: {tool_error}")
                    return f"Error: Tool execution failed: {str(tool_error)}"
    except Exception as session_error:
        logger.info(f"Session error: {session_error}")
        return (
            f"Error: Failed to connect to markitdown-mcp server: {str(session_error)}"
        )
    return result_content
if __name__ == "__main__":
    # Command-line interface: pick the transport and, for HTTP, the port and
    # URL path the server is exposed on.
    parser = argparse.ArgumentParser(description="Reading MCP Server")
    parser.add_argument(
        "--transport",
        default="stdio",
        choices=["stdio", "http"],
        help="Transport method: 'stdio' or 'http' (default: stdio)",
    )
    parser.add_argument(
        "--port",
        default=8080,
        type=int,
        help="Port to use when running with HTTP transport (default: 8080)",
    )
    parser.add_argument(
        "--path",
        default="/mcp",
        type=str,
        help="URL path to use when running with HTTP transport (default: /mcp)",
    )
    args = parser.parse_args()

    # `choices` restricts --transport to "stdio" or "http", so anything that
    # is not "http" is the stdio transport.
    if args.transport == "http":
        # HTTP transport also needs the port and path options.
        mcp.run(transport="streamable-http", port=args.port, path=args.path)
    else:
        mcp.run(transport="stdio")
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/reasoning_mcp_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import logging
import os
from anthropic import Anthropic
from fastmcp import FastMCP
# Logger named "miroflow"; used below to report reasoning fallbacks.
logger = logging.getLogger("miroflow")
# Anthropic credentials and endpoint come from the environment; the key
# defaults to an empty string and the base URL to the public Anthropic API.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
ANTHROPIC_BASE_URL = os.environ.get("ANTHROPIC_BASE_URL", "https://api.anthropic.com")
# Initialize FastMCP server
# Tools are registered on this instance through the @mcp.tool() decorator.
mcp = FastMCP("reasoning-mcp-server")
@mcp.tool()
async def reasoning(question: str) -> str:
    """You can use this tool to solve hard math problem, puzzle, riddle and IQ test question that requires a lot of chain of thought efforts.
    DO NOT use this tool for simple and obvious question.
    Args:
        question: The hard question.
    Returns:
        The answer to the question.
    """
    messages_for_llm = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": question,
                }
            ],
        }
    ]
    # NOTE(review): this is the synchronous Anthropic client, so the call
    # below blocks the event loop until the model answers.
    client = Anthropic(api_key=ANTHROPIC_API_KEY, base_url=ANTHROPIC_BASE_URL)
    response = client.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=21000,
        thinking={
            "type": "enabled",
            "budget_tokens": 19000,
        },
        messages=messages_for_llm,
        stream=False,
    )

    # The answer is taken from the LAST content block. Guard the empty case
    # explicitly instead of letting an IndexError escape.
    blocks = response.content
    if not blocks:
        logger.info("Reasoning Error: empty content is returned")
        return "[ERROR]: Reasoning model returned an empty response, please try again."

    final_block = blocks[-1]
    answer = getattr(final_block, "text", None)
    if answer is not None:
        return answer

    # No text on the last block: fall back to its thinking content.
    logger.info("Reasoning Error: only thinking content is returned")
    thinking = getattr(final_block, "thinking", None)
    if thinking is not None:
        return thinking
    # The block carries neither text nor thinking; report that rather than
    # raising AttributeError out of the tool.
    return "[ERROR]: Reasoning model returned neither an answer nor thinking content, please try again."
# Script entry point: when executed directly, serve the reasoning tool over stdio.
if __name__ == "__main__":
    mcp.run(transport="stdio")
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/reasoning_mcp_server_os.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import logging
import os
import random
import time
import requests
from fastmcp import FastMCP
# Logger named "miroflow"; used below for retry and fallback diagnostics.
logger = logging.getLogger("miroflow")
# Reasoning endpoint configuration, read from the environment. No defaults
# are given, so each value is None when its variable is unset.
REASONING_API_KEY = os.environ.get("REASONING_API_KEY")
REASONING_BASE_URL = os.environ.get("REASONING_BASE_URL")
REASONING_MODEL_NAME = os.environ.get("REASONING_MODEL_NAME")
# Initialize FastMCP server
mcp = FastMCP("reasoning-mcp-server-os")
# Retry configuration
# (consumed by post_with_retry below)
MAX_RETRIES = 10
BACKOFF_BASE = 1.0 # initial backoff in seconds
BACKOFF_MAX = 30.0 # maximum backoff in seconds
def post_with_retry(url, json, headers):
    """Send a POST request, retrying transient failures with exponential backoff.

    Network errors and retryable HTTP statuses are retried up to
    ``MAX_RETRIES`` times. Client errors (4xx other than 408/425/429) are
    treated as permanent: repeating the identical request cannot succeed, so
    the function gives up immediately instead of sleeping through the whole
    retry budget.

    Args:
        url: Endpoint to POST to.
        json: JSON-serializable request body (passed to ``requests.post``).
        headers: HTTP headers for the request.

    Returns:
        The ``requests.Response`` on HTTP 200, otherwise ``None``.
    """
    # 4xx statuses that can still succeed on a later attempt.
    retryable_client_errors = (408, 425, 429)

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = requests.post(url, json=json, headers=headers, timeout=600)
            if resp.status_code == 200:
                return resp
            logger.warning(
                f"HTTP {resp.status_code} on attempt {attempt}: {resp.text[:200]}"
            )
            if (
                400 <= resp.status_code < 500
                and resp.status_code not in retryable_client_errors
            ):
                logger.warning(
                    f"HTTP {resp.status_code} is not retryable, giving up on {url}"
                )
                return None
        except requests.exceptions.RequestException as e:
            logger.warning(f"Request failed on attempt {attempt}: {e}")

        # Backoff before next retry: doubles each attempt, capped at BACKOFF_MAX.
        if attempt < MAX_RETRIES:
            sleep_time = min(BACKOFF_BASE * (2 ** (attempt - 1)), BACKOFF_MAX)
            # Add +/-20% jitter to avoid thundering herd.
            sleep_time *= 0.8 + 0.4 * random.random()
            logger.info(f"Retrying in {sleep_time:.1f}s...")
            time.sleep(sleep_time)

    logger.warning(f"All {MAX_RETRIES} retries failed for {url}")
    return None
@mcp.tool()
async def reasoning(question: str) -> str:
    """You can use this tool to solve hard math problem, puzzle, riddle and IQ test question that requires a lot of chain of thought efforts.
    DO NOT use this tool for simple and obvious question.
    Args:
        question: The hard question.
    Returns:
        The answer to the question.
    """
    # Without an endpoint every request fails the same way; report it up
    # front instead of sleeping through the whole retry budget.
    if not REASONING_BASE_URL:
        return "[ERROR]: REASONING_BASE_URL is not set, reasoning tool is not available."

    payload = {
        "model": REASONING_MODEL_NAME,
        "messages": [{"role": "user", "content": question}],
        "temperature": 0.6,
        "top_p": 0.95,
    }
    headers = {
        "Authorization": f"Bearer {REASONING_API_KEY}",
        "Content-Type": "application/json",
    }
    # NOTE(review): post_with_retry is synchronous (requests + time.sleep),
    # so this call blocks the event loop while it runs.
    response = post_with_retry(REASONING_BASE_URL, json=payload, headers=headers)
    if response is None:
        return "Reasoning service unavailable. Please try again later."

    # Expected shape: {"choices": [{"message": {"content": ...}}]}.
    try:
        message = response.json()["choices"][0]["message"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        logger.info(f"Reasoning Error: unexpected response format: {e}")
        return "[ERROR]: Reasoning service returned an unexpected response format."
    if not isinstance(message, dict):
        return "[ERROR]: Reasoning service returned an unexpected response format."

    content = message.get("content")
    if isinstance(content, str):
        # Drop an inlined chain of thought: keep only what follows the
        # closing think tag.
        if "</think>" in content:
            content = content.split("</think>", 1)[1].strip()
        return content

    # No usable content: fall back to the separate reasoning field.
    logger.info("Reasoning Error: only thinking content is returned")
    reasoning_content = message.get("reasoning_content")
    if isinstance(reasoning_content, str):
        return reasoning_content
    return "[ERROR]: Reasoning service returned neither an answer nor reasoning content."
# Script entry point: when executed directly, serve the reasoning tool over stdio.
if __name__ == "__main__":
    mcp.run(transport="stdio")
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/searching_google_mcp_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import asyncio
import calendar
import datetime
import json
import os
import sys
import requests
import wikipedia
from fastmcp import FastMCP
from mcp import ClientSession, StdioServerParameters # (already imported in config.py)
from mcp.client.stdio import stdio_client
from .utils import strip_markdown_links
# Serper (Google search) credentials and endpoint, read from the environment.
# google_search refuses to run while SERPER_API_KEY is empty.
SERPER_API_KEY = os.environ.get("SERPER_API_KEY", "")
SERPER_BASE_URL = os.environ.get("SERPER_BASE_URL", "https://google.serper.dev")
# Jina credentials and endpoint, read from the environment.
JINA_API_KEY = os.environ.get("JINA_API_KEY", "")
JINA_BASE_URL = os.environ.get("JINA_BASE_URL", "https://r.jina.ai")
# Google search result filtering environment variables
# Each flag is True only when its variable is "true", "1" or "yes"
# (case-insensitive); unset or any other value means False.
REMOVE_SNIPPETS = os.environ.get("REMOVE_SNIPPETS", "").lower() in ("true", "1", "yes")
REMOVE_KNOWLEDGE_GRAPH = os.environ.get("REMOVE_KNOWLEDGE_GRAPH", "").lower() in (
"true",
"1",
"yes",
)
REMOVE_ANSWER_BOX = os.environ.get("REMOVE_ANSWER_BOX", "").lower() in (
"true",
"1",
"yes",
)
# Initialize FastMCP server
mcp = FastMCP("searching-google-mcp-server")
def filter_google_search_result(result_content: str) -> str:
    """Strip optional sections from a google search result.

    Which sections are removed is controlled by the module-level flags
    REMOVE_KNOWLEDGE_GRAPH, REMOVE_ANSWER_BOX and REMOVE_SNIPPETS.

    Args:
        result_content: The JSON string result from google search.

    Returns:
        The filtered result re-serialized as JSON, or ``result_content``
        unchanged when it cannot be parsed or filtered.
    """
    try:
        payload = json.loads(result_content)

        # Whole top-level sections that can be dropped on request.
        if REMOVE_KNOWLEDGE_GRAPH and "knowledgeGraph" in payload:
            del payload["knowledgeGraph"]
        if REMOVE_ANSWER_BOX and "answerBox" in payload:
            del payload["answerBox"]

        # Snippets live on the individual entries of these two lists.
        if REMOVE_SNIPPETS:
            for section in ("organic", "peopleAlsoAsk"):
                if section in payload:
                    for entry in payload[section]:
                        if "snippet" in entry:
                            del entry["snippet"]

        return json.dumps(payload, ensure_ascii=False)
    except Exception:
        # Any failure (invalid JSON, unexpected structure) falls back to the
        # untouched input.
        return result_content
@mcp.tool()
async def google_search(
    q: str,
    gl: str = "us",
    hl: str = "en",
    location: str = None,
    num: int = 10,
    tbs: str = None,
    page: int = 1,
) -> str:
    """Perform google searches via Serper API and retrieve rich results.
    It is able to retrieve organic search results, people also ask, related searches, and knowledge graph.
    Args:
        q: Search query string.
        gl: Country context for search (e.g., 'us' for United States, 'cn' for China, 'uk' for United Kingdom). Influences regional results priority. Default is 'us'.
        hl: Google interface language (e.g., 'en' for English, 'zh' for Chinese, 'es' for Spanish). Affects snippet language preference. Default is 'en'.
        location: City-level location for search results (e.g., 'SoHo, New York, United States', 'California, United States').
        num: The number of results to return (default: 10).
        tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week, 'qdr:m' for past month, 'qdr:y' for past year).
        page: The page number of results to return (default: 1).
    Returns:
        The search results.
    """
    if SERPER_API_KEY == "":
        return (
            "[ERROR]: SERPER_API_KEY is not set, google_search tool is not available."
        )

    tool_name = "google_search"
    arguments = {
        "q": q,
        "gl": gl,
        "hl": hl,
        "num": num,
        "page": page,
        "autocorrect": False,
    }
    # Optional filters are only sent when the caller provided them.
    if location:
        arguments["location"] = location
    if tbs:
        arguments["tbs"] = tbs

    # The search itself is delegated to the serper MCP server, started as a
    # subprocess with the current interpreter and reached over stdio.
    server_params = StdioServerParameters(
        command=sys.executable,
        args=["-m", "miroflow_tools.mcp_servers.serper_mcp_server"],
        env={"SERPER_API_KEY": SERPER_API_KEY, "SERPER_BASE_URL": SERPER_BASE_URL},
    )

    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            async with stdio_client(server_params) as (read, write):
                async with ClientSession(
                    read, write, sampling_callback=None
                ) as session:
                    await session.initialize()
                    tool_result = await session.call_tool(
                        tool_name, arguments=arguments
                    )
                    result_content = (
                        tool_result.content[-1].text if tool_result.content else ""
                    )
                    # Explicit check instead of `assert`, which is stripped
                    # under `python -O`; an empty result counts as a failed
                    # attempt and goes through the retry path below.
                    if result_content is None or result_content.strip() == "":
                        raise ValueError(
                            "Empty result from google_search tool, please try again."
                        )
                    # Apply filtering based on environment variables.
                    return filter_google_search_result(result_content)
        except Exception as error:
            if attempt >= max_retries:
                return f"[ERROR]: google_search tool execution failed after {max_retries} attempts: {str(error)}"
            # Wait 2**attempt seconds (capped at 60) before retrying.
            await asyncio.sleep(min(2**attempt, 60))

    # Defensive fallback: every loop iteration above returns or retries.
    return "[ERROR]: Unknown error occurred in google_search tool, please try again."
# @mcp.tool()
async def wiki_get_page_content(entity: str, first_sentences: int = 10) -> str:
    """Get specific Wikipedia page content for the specific entity (people, places, concepts, events) and return structured information.

    This tool searches Wikipedia for the given entity and returns either the first few sentences
    (which typically contain the summary/introduction) or full page content based on parameters.
    It handles disambiguation pages and provides clean, structured output.

    Args:
        entity: The entity to search for in Wikipedia.
        first_sentences: Number of first sentences to return from the page. Set to 0 to return full content. Defaults to 10.

    Returns:
        str: Formatted search results containing title, first sentences/full content, and URL.
        Returns error message if page not found or other issues occur.
    """
    try:
        # Try to get the Wikipedia page directly (no title auto-suggestion).
        page = wikipedia.page(title=entity, auto_suggest=False)

        result_parts = [f"Page Title: {page.title}"]
        if first_sentences > 0:
            try:
                summary = wikipedia.summary(
                    entity, sentences=first_sentences, auto_suggest=False
                )
            except Exception:
                # Fallback: approximate sentences by splitting the page text
                # on ". ". `split` always yields at least one element, so the
                # joined result is never built from an empty list.
                content_sentences = page.content.split(". ")[:first_sentences]
                summary = ". ".join(content_sentences) + "."
            result_parts.append(
                f"First {first_sentences} sentences (introduction): {summary}"
            )
        else:
            # Return full content if first_sentences is 0 (or negative).
            # TODO: Context Engineering Needed
            result_parts.append(f"Content: {page.content}")

        result_parts.append(f"URL: {page.url}")
        return "\n\n".join(result_parts)
    except wikipedia.exceptions.DisambiguationError as e:
        # Limit to first 10 options.
        options_list = "\n".join(f"- {option}" for option in e.options[:10])
        output = (
            f"Disambiguation Error: Multiple pages found for '{entity}'.\n\n"
            f"Available options:\n{options_list}\n\n"
            f"Please be more specific in your search query."
        )
        try:
            search_results = wikipedia.search(entity, results=5)
            if search_results:
                # Separate the hint from the preceding sentence.
                output += (
                    f"\n\nTry to search {entity} in Wikipedia: {search_results}"
                )
        except Exception:
            # The search hint is optional; the disambiguation message alone
            # is still a useful answer.
            pass
        return output
    except wikipedia.exceptions.PageError:
        # Try a search if direct page lookup fails.
        try:
            search_results = wikipedia.search(entity, results=5)
        except Exception as search_error:
            return (
                f"Page Not Found: No Wikipedia page found for '{entity}'. "
                f"Search for alternatives also failed: {str(search_error)}"
            )
        if search_results:
            suggestion_list = "\n".join(
                f"- {result}" for result in search_results[:5]
            )
            return (
                f"Page Not Found: No Wikipedia page found for '{entity}'.\n\n"
                f"Similar pages found:\n{suggestion_list}\n\n"
                f"Try searching for one of these suggestions instead."
            )
        return (
            f"Page Not Found: No Wikipedia page found for '{entity}' "
            f"and no similar pages were found. Please try a different search term."
        )
    except wikipedia.exceptions.RedirectError:
        return f"Redirect Error: Failed to follow redirect for '{entity}'"
    except requests.exceptions.RequestException as e:
        return f"Network Error: Failed to connect to Wikipedia: {str(e)}"
    except wikipedia.exceptions.WikipediaException as e:
        return f"Wikipedia Error: An error occurred while searching Wikipedia: {str(e)}"
    except Exception as e:
        return f"Unexpected Error: An unexpected error occurred: {str(e)}"
# @mcp.tool()
async def search_wiki_revision(
    entity: str, year: int, month: int, max_revisions: int = 50
) -> str:
    """Search for an entity in Wikipedia and return the revision history for a specific month.

    Args:
        entity: The entity to search for in Wikipedia.
        year: The year of the revision (e.g. 2024).
        month: The month of the revision (1-12).
        max_revisions: Maximum number of revisions to return. Defaults to 50.

    Returns:
        str: Formatted revision history with timestamps, revision IDs, and URLs.
        Returns error message if page not found or other issues occur.
    """
    from urllib.parse import quote

    # Seconds to wait for the Wikipedia API before giving up; without a
    # timeout the request could hang indefinitely and the Timeout handler
    # below could never fire.
    request_timeout = 30

    # Auto-adjust date values and track changes.
    adjustments = []
    original_year, original_month = year, month
    current_year = datetime.datetime.now().year

    # Clamp year to [2000, current year].
    if year < 2000:
        year = 2000
        adjustments.append(
            f"Year adjusted from {original_year} to 2000 (minimum supported)"
        )
    elif year > current_year:
        year = current_year
        adjustments.append(
            f"Year adjusted from {original_year} to {current_year} (current year)"
        )

    # Clamp month to [1, 12].
    if month < 1:
        month = 1
        adjustments.append(f"Month adjusted from {original_month} to 1")
    elif month > 12:
        month = 12
        adjustments.append(f"Month adjusted from {original_month} to 12")

    # Prepare adjustment message if any changes were made.
    if adjustments:
        adjustment_msg = (
            "Date auto-adjusted: "
            + "; ".join(adjustments)
            + f". Using {year}-{month:02d} instead.\n\n"
        )
    else:
        adjustment_msg = ""

    base_url = "https://en.wikipedia.org/w/api.php"
    try:
        # Time range: first second to last second of the requested month.
        start_date = datetime.datetime(year, month, 1)
        last_day = calendar.monthrange(year, month)[1]
        end_date = datetime.datetime(year, month, last_day, 23, 59, 59)

        # Convert to ISO format (UTC time).
        start_iso = start_date.strftime("%Y-%m-%dT%H:%M:%SZ")
        end_iso = end_date.strftime("%Y-%m-%dT%H:%M:%SZ")

        # API parameters configuration.
        params = {
            "action": "query",
            "format": "json",
            "titles": entity,
            "prop": "revisions",
            # Keep the limit within [1, 500]: 500 is the Wikipedia API
            # limit, and zero/negative values are not a valid request.
            "rvlimit": max(1, min(max_revisions, 500)),
            "rvstart": start_iso,
            "rvend": end_iso,
            "rvdir": "newer",
            "rvprop": "timestamp|ids",
        }
        response = requests.get(base_url, params=params, timeout=request_timeout)
        response.raise_for_status()
        data = response.json()

        # Check for API errors.
        if "error" in data:
            return f"[ERROR]: Wikipedia API Error: {data['error'].get('info', 'Unknown error')}"

        # Process the response.
        pages = data.get("query", {}).get("pages", {})
        if not pages:
            return f"[ERROR]: No results found for entity '{entity}'"

        # A page id of "-1" is treated as "page does not exist".
        page_id = list(pages.keys())[0]
        if page_id == "-1":
            return f"[ERROR]: Page Not Found: No Wikipedia page found for '{entity}'"

        page_info = pages[page_id]
        page_title = page_info.get("title", entity)
        if "revisions" not in page_info or not page_info["revisions"]:
            return (
                adjustment_msg + f"Page Title: {page_title}\n\n"
                f"No revisions found for '{entity}' in {year}-{month:02d}.\n\n"
                f"The page may not have been edited during this time period."
            )

        # Format the results.
        result_parts = [
            f"Page Title: {page_title}",
            f"Revision Period: {year}-{month:02d}",
            f"Total Revisions Found: {len(page_info['revisions'])}",
        ]

        # Percent-encode the title once so entities containing spaces, "&",
        # "?" or "#" still produce a valid revision URL.
        encoded_title = quote(entity.replace(" ", "_"), safe="/:")

        # Add revision details.
        revisions_details = []
        for i, rev in enumerate(page_info["revisions"], 1):
            revision_id = rev["revid"]
            timestamp = rev["timestamp"]
            # Format timestamp for better readability; fall back to the raw
            # value when it cannot be parsed.
            try:
                dt = datetime.datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
                formatted_time = dt.strftime("%Y-%m-%d %H:%M:%S UTC")
            except Exception:
                formatted_time = timestamp
            # Construct revision URL.
            rev_url = f"https://en.wikipedia.org/w/index.php?title={encoded_title}&oldid={revision_id}"
            revisions_details.append(
                f"{i}. Revision ID: {revision_id}\n"
                f" Timestamp: {formatted_time}\n"
                f" URL: {rev_url}"
            )

        if revisions_details:
            result_parts.append("Revisions:\n" + "\n\n".join(revisions_details))
        return (
            adjustment_msg
            + "\n\n".join(result_parts)
            + "\n\nHint: You can use the `scrape_website` tool to get the webpage content of a URL."
        )
    except requests.exceptions.Timeout:
        return f"[ERROR]: Network Error: Request timed out while fetching revision history for '{entity}'"
    except requests.exceptions.RequestException as e:
        return f"[ERROR]: Network Error: Failed to connect to Wikipedia: {str(e)}"
    except ValueError as e:
        return f"[ERROR]: Date Error: Invalid date values - {str(e)}"
    except Exception as e:
        return f"[ERROR]: Unexpected Error: An unexpected error occurred: {str(e)}"
# @mcp.tool()
async def search_archived_webpage(url: str, year: int, month: int, day: int) -> str:
    """Search the Wayback Machine (archive.org) for archived versions of a webpage, optionally for a specific date.

    Args:
        url: The URL to search for in the Wayback Machine.
        year: The target year (e.g., 2023).
        month: The target month (1-12).
        day: The target day (1-31).

    Returns:
        str: Formatted archive information including archived URL, timestamp, and status.
        Returns error message if URL not found or other issues occur.
    """
    # A dated lookup is attempted only when both `year` and `month` are
    # positive; otherwise the most recent snapshot is requested. If the dated
    # lookup yields no snapshot, the function falls back to the undated lookup.

    # Handle empty URL
    if not url:
        return f"[ERROR]: Invalid URL: '{url}'. URL cannot be empty."

    # Auto-add https:// if no protocol is specified
    protocol_hint = ""
    if not url.startswith(("http://", "https://")):
        original_url = url
        url = f"https://{url}"
        protocol_hint = f"[NOTE]: Automatically added 'https://' to URL '{original_url}' -> '{url}'\n\n"

    hint_message = ""
    if ".wikipedia.org" in url:
        hint_message = "Note: You are trying to search a Wikipedia page, you can also use the `search_wiki_revision` tool to get the revision content of a Wikipedia page.\n\n"

    # Check if specific date is requested
    date = ""
    adjustment_msg = ""
    if year > 0 and month > 0:
        # Clamp the requested date into a valid range and record each change
        adjustments = []
        original_year, original_month, original_day = year, month, day
        current_year = datetime.datetime.now().year

        if year < 1995:
            year = 1995
            adjustments.append(
                f"Year adjusted from {original_year} to 1995 (minimum supported)"
            )
        elif year > current_year:
            year = current_year
            adjustments.append(
                f"Year adjusted from {original_year} to {current_year} (current year)"
            )

        # `month < 1` cannot happen here because of the `month > 0` guard above
        if month > 12:
            month = 12
            adjustments.append(f"Month adjusted from {original_month} to 12")

        # Clamp the day to the length of the (already adjusted) month
        max_day = calendar.monthrange(year, month)[1]
        if day < 1:
            day = 1
            adjustments.append(f"Day adjusted from {original_day} to 1")
        elif day > max_day:
            day = max_day
            adjustments.append(
                f"Day adjusted from {original_day} to {max_day} (max for {year}-{month:02d})"
            )

        date = f"{year:04d}{month:02d}{day:02d}"
        try:
            # Validate the final adjusted date
            datetime.datetime(year, month, day)
        except ValueError as e:
            return f"[ERROR]: Invalid date: {year}-{month:02d}-{day:02d}. {str(e)}"

        if adjustments:
            adjustment_msg = (
                "Date auto-adjusted: "
                + "; ".join(adjustments)
                + f". Using {date} instead.\n\n"
            )

    try:
        # Search with specific date if provided
        if date:
            closest = await _wayback_closest_snapshot(url, date)
            if closest is not None:
                archived_url = closest["url"]
                archived_timestamp = closest["timestamp"]
                if not closest.get("available", True):
                    return (
                        protocol_hint
                        + hint_message
                        + adjustment_msg
                        + (
                            f"Archive Status: Snapshot exists but is not available\n\n"
                            f"Original URL: {url}\n"
                            f"Requested Date: {year:04d}-{month:02d}-{day:02d}\n"
                            f"Closest Snapshot: {archived_timestamp}\n\n"
                            f"Try a different date"
                        )
                    )
                formatted_time = _format_wayback_timestamp(archived_timestamp)
                return (
                    protocol_hint
                    + hint_message
                    + adjustment_msg
                    + (
                        f"Archive Found: Archived version located\n\n"
                        f"Original URL: {url}\n"
                        f"Requested Date: {year:04d}-{month:02d}-{day:02d}\n"
                        f"Archived URL: {archived_url}\n"
                        f"Archived Timestamp: {formatted_time}\n"
                    )
                    + "\n\nHint: You can also use the `scrape_website` tool to get the webpage content of a URL."
                )

        # Search without specific date (most recent)
        closest = await _wayback_closest_snapshot(url)
        if closest is None:
            return (
                protocol_hint
                + hint_message
                + (
                    f"Archive Not Found: No archived versions available\n\n"
                    f"Original URL: {url}\n\n"
                    f"The URL '{url}' has not been archived by the Wayback Machine.\n"
                    f"You may want to:\n"
                    f"- Check if the URL is correct\n"
                    f"- Try a different URL and date\n"
                )
            )

        archived_url = closest["url"]
        archived_timestamp = closest["timestamp"]
        if not closest.get("available", True):
            return (
                protocol_hint
                + hint_message
                + (
                    f"Archive Status: Most recent snapshot exists but is not available\n\n"
                    f"Original URL: {url}\n"
                    f"Most Recent Snapshot: {archived_timestamp}\n\n"
                    f"The URL may have been archived but access is restricted"
                )
            )
        formatted_time = _format_wayback_timestamp(archived_timestamp)
        return (
            protocol_hint
            + hint_message
            + (
                f"Archive Found: Most recent archived version\n\n"
                f"Original URL: {url}\n"
                f"Archived URL: {archived_url}\n"
                f"Archived Timestamp: {formatted_time}\n"
            )
            + "\n\nHint: You can also use the `scrape_website` tool to get the webpage content of a URL."
        )
    except requests.exceptions.RequestException as e:
        return f"[ERROR]: Network Error: Failed to connect to Wayback Machine: {str(e)}"
    except ValueError as e:
        return f"[ERROR]: Data Error: Failed to parse response from Wayback Machine: {str(e)}"
    except Exception as e:
        return f"[ERROR]: Unexpected Error: An unexpected error occurred: {str(e)}"


async def _wayback_closest_snapshot(url: str, timestamp: str = "", max_attempts: int = 5):
    """Query the Wayback availability endpoint and return its "closest" snapshot.

    Args:
        url: Page URL to look up.
        timestamp: Optional YYYYMMDD target date; omitted from the request when empty.
        max_attempts: How many times to query before giving up.

    Returns:
        The "closest" snapshot object from the response, or None when no
        attempt returned one. HTTP and JSON errors propagate to the caller.
    """
    # Passing `params` lets requests build and escape the query string, so the
    # target URL's own "&"/"?" characters cannot corrupt the request.
    params = {"url": url}
    if timestamp:
        params["timestamp"] = timestamp

    for attempt in range(1, max_attempts + 1):
        response = requests.get(
            "https://archive.org/wayback/available", params=params, timeout=30
        )
        response.raise_for_status()
        data = response.json()
        if "archived_snapshots" in data and "closest" in data["archived_snapshots"]:
            return data["archived_snapshots"]["closest"]
        # Back off before the next attempt; nothing to wait for after the last one
        if attempt < max_attempts:
            await asyncio.sleep(min(2**attempt, 60))
    return None


def _format_wayback_timestamp(archived_timestamp: str) -> str:
    """Render a YYYYMMDDhhmmss snapshot timestamp readably; return it unchanged if unparsable."""
    try:
        dt = datetime.datetime.strptime(archived_timestamp, "%Y%m%d%H%M%S")
    except (TypeError, ValueError):
        return archived_timestamp
    return dt.strftime("%Y-%m-%d %H:%M:%S UTC")
@mcp.tool()
async def scrape_website(url: str) -> str:
    """This tool is used to scrape a website for its content. Search engines are not supported by this tool. This tool can also be used to get YouTube video non-visual information (however, it may be incomplete), such as video subtitles, titles, descriptions, key moments, etc.

    Args:
        url: The URL of the website to scrape.

    Returns:
        The scraped website content.
    """
    # Validate URL format
    if not url or not url.startswith(("http://", "https://")):
        return f"Invalid URL: '{url}'. URL must start with http:// or https://"

    # Avoid duplicate Jina URL prefix
    if url.startswith("https://r.jina.ai/") and url.count("http") >= 2:
        url = url[len("https://r.jina.ai/") :]

    # Check for restricted domains
    if "huggingface.co/datasets" in url or "huggingface.co/spaces" in url:
        return "You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose."

    if JINA_API_KEY == "":
        return "JINA_API_KEY is not set, scrape_website tool is not available."

    try:
        # Use Jina.ai reader API to convert URL to LLM-friendly text
        jina_url = f"{JINA_BASE_URL}/{url}"
        headers = {"Authorization": f"Bearer {JINA_API_KEY}"}
        response = requests.get(jina_url, headers=headers, timeout=60)
        response.raise_for_status()

        # Get the content and flatten markdown links/images to plain text
        content = response.text.strip()
        content = strip_markdown_links(content)
        if not content:
            return f"No content retrieved from URL: {url}"
        return content
    except requests.exceptions.Timeout:
        return f"[ERROR]: Timeout Error: Request timed out while scraping '{url}'. The website may be slow or unresponsive."
    except requests.exceptions.ConnectionError:
        return f"[ERROR]: Connection Error: Failed to connect to '{url}'. Please check if the URL is correct and accessible."
    except requests.exceptions.HTTPError as e:
        # A requests.Response is falsy for 4xx/5xx statuses, so a plain
        # truthiness test would always yield "unknown" here; compare to None.
        status_code = e.response.status_code if e.response is not None else "unknown"
        if status_code == 404:
            return f"[ERROR]: Page Not Found (404): The page at '{url}' does not exist."
        elif status_code == 403:
            return f"[ERROR]: Access Forbidden (403): Access to '{url}' is forbidden."
        elif status_code == 500:
            return f"[ERROR]: Server Error (500): The server at '{url}' encountered an internal error."
        else:
            return f"[ERROR]: HTTP Error ({status_code}): Failed to scrape '{url}'. {str(e)}"
    except requests.exceptions.RequestException as e:
        return f"[ERROR]: Request Error: Failed to scrape '{url}'. {str(e)}"
    except Exception as e:
        return f"[ERROR]: Unexpected Error: An unexpected error occurred while scraping '{url}': {str(e)}"
# Script entry point: when this module is executed directly, start the MCP
# server using the stdio transport.
if __name__ == "__main__":
mcp.run(transport="stdio")
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/searching_sogou_mcp_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import asyncio
import json
import os
import requests
from fastmcp import FastMCP
from tencentcloud.common import credential
from tencentcloud.common.common_client import CommonClient
from tencentcloud.common.exception.tencent_cloud_sdk_exception import (
TencentCloudSDKException,
)
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from .utils import strip_markdown_links
# Credentials and endpoints are read from the environment once, at import time.
# The Tencent Cloud and Jina keys default to "" so the tools can detect a
# missing configuration and return an error string instead of calling out.
TENCENTCLOUD_SECRET_ID = os.environ.get("TENCENTCLOUD_SECRET_ID", "")
TENCENTCLOUD_SECRET_KEY = os.environ.get("TENCENTCLOUD_SECRET_KEY", "")
JINA_API_KEY = os.environ.get("JINA_API_KEY", "")
# Base URL of the Jina reader endpoint; the target URL is appended after a "/".
JINA_BASE_URL = os.environ.get("JINA_BASE_URL", "https://r.jina.ai")
# Initialize FastMCP server
mcp = FastMCP("searching-sogou-mcp-server")
@mcp.tool()
async def sogou_search(Query: str, Cnt: int = 10) -> str:
    """Performs web searches using the Tencent Cloud SearchPro API to retrieve comprehensive information, with Sogou search offering superior results for Chinese-language queries.

    Args:
        Query: The core search query string. Be specific to improve result relevance (e.g., "2024 World Cup final results"). (Required, no default value)
        Cnt: Number of search results to return (Can only be 10/20/30/40/50). Optional, default: 10)

    Returns:
        The search results in JSON format, including the following core fields:
        - Query: The original search query (consistent with the input Query, for request verification)
        - Pages: Array of JSON strings, each containing details of a single search result (e.g., title, url, passage, date, site, favicon)
    """
    # Note: the returned "Pages" entries are re-serialised as objects holding
    # only title/url/passage/date/site; content and favicon are dropped.
    if TENCENTCLOUD_SECRET_ID == "" or TENCENTCLOUD_SECRET_KEY == "":
        return "[ERROR]: TENCENTCLOUD_SECRET_ID or TENCENTCLOUD_SECRET_KEY is not set, sogou_search tool is not available."

    # Fields copied from each raw page into the trimmed result
    kept_fields = ("title", "url", "passage", "date", "site")

    retry_count = 0
    max_retries = 3
    while retry_count < max_retries:
        try:
            cred = credential.Credential(
                TENCENTCLOUD_SECRET_ID, TENCENTCLOUD_SECRET_KEY
            )
            httpProfile = HttpProfile()
            httpProfile.endpoint = "wsa.tencentcloudapi.com"
            clientProfile = ClientProfile()
            clientProfile.httpProfile = httpProfile

            # Build the request as a dict rather than formatting a JSON string:
            # quotes or backslashes in Query would otherwise make the payload
            # unparsable (or let the query inject extra JSON fields).
            params = {"Query": Query, "Mode": 0, "Cnt": Cnt}

            common_client = CommonClient(
                "wsa", "2025-05-08", cred, "", profile=clientProfile
            )
            result = common_client.call_json("SearchPro", params)["Response"]
            # Drop the request id if present; it carries no search information
            result.pop("RequestId", None)

            pages = []
            for page in result["Pages"]:
                page_json = json.loads(page)
                pages.append({field: page_json[field] for field in kept_fields})
            result["Pages"] = pages
            return json.dumps(result, ensure_ascii=False)
        except TencentCloudSDKException:
            retry_count += 1
            if retry_count >= max_retries:
                return f"[ERROR]: sogou_search tool execution failed after {max_retries} attempts: Unexpected error occurred."
            # Wait before retrying
            await asyncio.sleep(min(2**retry_count, 60))
    return "[ERROR]: Unknown error occurred in sogou_search tool, please try again."
@mcp.tool()
async def scrape_website(url: str) -> str:
    """This tool is used to scrape a website for its content. Search engines are not supported by this tool. This tool can also be used to get YouTube video non-visual information (however, it may be incomplete), such as video subtitles, titles, descriptions, key moments, etc.

    Args:
        url: The URL of the website to scrape.

    Returns:
        The scraped website content.
    """
    # Validate URL format
    if not url or not url.startswith(("http://", "https://")):
        return f"Invalid URL: '{url}'. URL must start with http:// or https://"

    # Avoid duplicate Jina URL prefix
    if url.startswith("https://r.jina.ai/") and url.count("http") >= 2:
        url = url[len("https://r.jina.ai/") :]

    # Check for restricted domains
    if "huggingface.co/datasets" in url or "huggingface.co/spaces" in url:
        return "You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose."

    if JINA_API_KEY == "":
        return "JINA_API_KEY is not set, scrape_website tool is not available."

    try:
        # Use Jina.ai reader API to convert URL to LLM-friendly text
        jina_url = f"{JINA_BASE_URL}/{url}"
        headers = {"Authorization": f"Bearer {JINA_API_KEY}"}
        response = requests.get(jina_url, headers=headers, timeout=60)
        response.raise_for_status()

        # Get the content and flatten markdown links/images to plain text
        content = response.text.strip()
        content = strip_markdown_links(content)
        if not content:
            return f"No content retrieved from URL: {url}"
        return content
    except requests.exceptions.Timeout:
        return f"[ERROR]: Timeout Error: Request timed out while scraping '{url}'. The website may be slow or unresponsive."
    except requests.exceptions.ConnectionError:
        return f"[ERROR]: Connection Error: Failed to connect to '{url}'. Please check if the URL is correct and accessible."
    except requests.exceptions.HTTPError as e:
        # A requests.Response is falsy for 4xx/5xx statuses, so a plain
        # truthiness test would always yield "unknown" here; compare to None.
        status_code = e.response.status_code if e.response is not None else "unknown"
        if status_code == 404:
            return f"[ERROR]: Page Not Found (404): The page at '{url}' does not exist."
        elif status_code == 403:
            return f"[ERROR]: Access Forbidden (403): Access to '{url}' is forbidden."
        elif status_code == 500:
            return f"[ERROR]: Server Error (500): The server at '{url}' encountered an internal error."
        else:
            return f"[ERROR]: HTTP Error ({status_code}): Failed to scrape '{url}'. {str(e)}"
    except requests.exceptions.RequestException as e:
        return f"[ERROR]: Request Error: Failed to scrape '{url}'. {str(e)}"
    except Exception as e:
        return f"[ERROR]: Unexpected Error: An unexpected error occurred while scraping '{url}': {str(e)}"
# Script entry point: when this module is executed directly, start the MCP
# server using the stdio transport.
if __name__ == "__main__":
mcp.run(transport="stdio")
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/serper_mcp_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
"""
adapted from
https://github.com/MiroMindAI/MiroRL/blob/5073693549ffe05a157a1886e87650ef3be6606e/mirorl/tools/serper_search.py#L1
"""
import json
import os
from typing import Any, Dict
import requests
from mcp.server.fastmcp import FastMCP
from tenacity import (
retry,
retry_if_exception_type,
stop_after_attempt,
wait_exponential,
)
from .utils import decode_http_urls_in_dict
# Serper endpoint and API key, read from the environment once at import time.
# The key defaults to "" so google_search can report a missing key as a JSON
# error instead of sending an unauthenticated request.
SERPER_BASE_URL = os.getenv("SERPER_BASE_URL", "https://google.serper.dev")
SERPER_API_KEY = os.getenv("SERPER_API_KEY", "")
# Initialize FastMCP server
mcp = FastMCP("serper-mcp-server")
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type(
        (requests.ConnectionError, requests.Timeout, requests.HTTPError)
    ),
)
def make_serper_request(
    payload: Dict[str, Any], headers: Dict[str, str]
) -> requests.Response:
    """POST a search payload to the Serper `/search` endpoint.

    The tenacity decorator re-invokes this function on connection errors,
    timeouts and HTTP error statuses, stopping after three attempts and
    waiting between attempts per `wait_exponential(multiplier=1, min=4, max=10)`.

    Args:
        payload: JSON-serialisable request body.
        headers: HTTP headers to send with the request.

    Returns:
        The successful `requests.Response`.

    Raises:
        requests.HTTPError: Raised by `raise_for_status()` for 4xx/5xx
            responses (subject to the retry policy above).
    """
    # An explicit timeout keeps a stalled connection from blocking the tool
    # forever and lets the `requests.Timeout` retry rule actually fire.
    response = requests.post(
        f"{SERPER_BASE_URL}/search", json=payload, headers=headers, timeout=30
    )
    response.raise_for_status()
    return response
def _is_huggingface_dataset_or_space_url(url):
"""
Check if the URL is a HuggingFace dataset or space URL.
:param url: The URL to check
:return: True if it's a HuggingFace dataset or space URL, False otherwise
"""
if not url:
return False
return "huggingface.co/datasets" in url or "huggingface.co/spaces" in url
@mcp.tool()
def google_search(
    q: str,
    gl: str = "us",
    hl: str = "en",
    location: str | None = None,
    num: int | None = None,
    tbs: str | None = None,
    page: int | None = None,
    autocorrect: bool | None = None,
):
    """
    Tool to perform web searches via Serper API and retrieve rich results.
    It is able to retrieve organic search results, people also ask,
    related searches, and knowledge graph.

    Args:
        q: Search query string
        gl: Optional region code for search results in ISO 3166-1 alpha-2 format (e.g., 'us')
        hl: Optional language code for search results in ISO 639-1 format (e.g., 'en')
        location: Optional location for search results (e.g., 'SoHo, New York, United States', 'California, United States')
        num: Number of results to return (default: 10)
        tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week,
            'qdr:m' for past month, 'qdr:y' for past year)
        page: Page number of results to return (default: 1)
        autocorrect: Whether to autocorrect spelling in query

    Returns:
        Dictionary containing search results and metadata.
    """
    # Implementation note: every path returns a JSON *string*. Failures are
    # encoded as {"success": false, "error": ..., "results": []}.

    def failure(message):
        # Serialise the error envelope shared by all failure paths
        return json.dumps(
            {"success": False, "error": message, "results": []},
            ensure_ascii=False,
        )

    # Guard clauses: missing API key, then missing/blank query
    if not SERPER_API_KEY:
        return failure("SERPER_API_KEY environment variable not set")
    if not q or not q.strip():
        return failure("Search query 'q' is required and cannot be empty")

    try:
        # Mandatory fields first, then optional ones only when supplied
        payload: dict[str, Any] = {"q": q.strip(), "gl": gl, "hl": hl}
        if location:
            payload["location"] = location
        payload["num"] = 10 if num is None else num
        if tbs:
            payload["tbs"] = tbs
        if page is not None:
            payload["page"] = page
        if autocorrect is not None:
            payload["autocorrect"] = autocorrect

        headers = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}
        data = make_serper_request(payload, headers).json()

        # Drop organic hits that link to HuggingFace datasets or spaces
        raw_organic = data["organic"] if "organic" in data else []
        organic_results = [
            item
            for item in raw_organic
            if not _is_huggingface_dataset_or_space_url(item.get("link", ""))
        ]

        # Keep every other field of the response, replacing only "organic"
        response_data = dict(data)
        response_data["organic"] = organic_results
        return json.dumps(decode_http_urls_in_dict(response_data), ensure_ascii=False)
    except Exception as e:
        return failure(f"Unexpected error: {str(e)}")
# Script entry point: when this module is executed directly, start the MCP
# server with its default run settings (no transport argument is passed).
if __name__ == "__main__":
mcp.run()
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/utils/__init__.py
================================================
from .url_unquote import decode_http_urls_in_dict, safe_unquote, strip_markdown_links
# Public API of the utils package: the three helpers re-exported from
# url_unquote above.
__all__ = [
"safe_unquote",
"decode_http_urls_in_dict",
"strip_markdown_links",
]
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/utils/url_unquote.py
================================================
import re
from urllib.parse import unquote
from markdown_it import MarkdownIt
# RFC 3986 reserved characters percent-encoding (decoding these would alter URL semantics/structure)
# gen-delims: : / ? # [ ] @
# sub-delims: ! $ & ' ( ) * + , ; =
# Both hex letter cases are listed because membership is tested on the raw text.
RESERVED_PERCENT_ENCODINGS = frozenset(
    {
        "%2f",
        "%2F",  # / path separator
        "%3f",
        "%3F",  # ? query string start
        "%23",  # # fragment start
        "%26",  # & query parameter separator
        "%3d",
        "%3D",  # = key-value separator
        "%40",  # @
        "%3a",
        "%3A",  # :
        "%5b",
        "%5B",  # [
        "%5d",
        "%5D",  # ]
        "%21",  # !
        "%24",  # $
        "%27",  # '
        "%28",  # (
        "%29",  # )
        "%2a",
        "%2A",  # *
        "%2b",
        "%2B",  # +
        "%2c",
        "%2C",  # ,
        "%3b",
        "%3B",  # ;
        "%25",  # % percent sign itself (prevents double-encoding issues)
        "%20",  # space (keep encoded to avoid URL semantic changes)
    }
)


def _is_percent_triplet(text: str, pos: int) -> bool:
    """Return True when text[pos:pos + 3] is a '%' followed by two hex digits."""
    return (
        pos + 2 < len(text)
        and text[pos] == "%"
        and all(c in "0123456789ABCDEFabcdef" for c in text[pos + 1 : pos + 3])
    )


def safe_unquote(url: str) -> str:
    """
    Safely decode URL-encoded strings, only decoding characters that won't alter URL semantics.

    Encodings listed in RESERVED_PERCENT_ENCODINGS are preserved, e.g.:
    - %2F (/) - path separator, decoding would alter path hierarchy
    - %3F (?) - query string start marker
    - %23 (#) - fragment start marker (not sent to server)
    - %26 (&) - query parameter separator
    - %3D (=) - key-value separator
    - %25 (%) - percent sign itself (prevents double-encoding issues, e.g. %252F -> %2F -> /)
    - %20 ( ) - space (keep encoded to avoid URL semantic changes)

    Every other %XX run is decoded as UTF-8 (e.g. Chinese characters). A run
    that is not valid UTF-8 is left percent-encoded instead of being replaced
    with U+FFFD, so the returned URL never loses information.

    Args:
        url: The string to decode; falsy values are returned unchanged.

    Returns:
        The string with non-reserved, validly encoded sequences decoded.
    """
    if not url:
        return url

    result = []
    i = 0
    n = len(url)
    while i < n:
        if not _is_percent_triplet(url, i):
            # Ordinary character, or a stray '%' that is not a valid escape
            result.append(url[i])
            i += 1
            continue

        percent_encoded = url[i : i + 3]
        if percent_encoded in RESERVED_PERCENT_ENCODINGS:
            # Keep the encoding, don't decode
            result.append(percent_encoded)
            i += 3
            continue

        # Collect the run of consecutive non-reserved escapes so that
        # multi-byte UTF-8 characters are decoded as a unit.
        j = i + 3
        while (
            _is_percent_triplet(url, j)
            and url[j : j + 3] not in RESERVED_PERCENT_ENCODINGS
        ):
            j += 3

        try:
            # errors="strict": unquote's default ("replace") would silently
            # turn invalid byte sequences into U+FFFD and never raise.
            result.append(unquote(url[i:j], errors="strict"))
            i = j
        except UnicodeDecodeError:
            # Decoding failed: keep this escape verbatim and retry from the next one
            result.append(percent_encoded)
            i += 3
    return "".join(result)
def decode_http_urls_in_dict(data):
    """
    Recursively apply safe_unquote to URL-like strings inside a nested structure.

    - dict: a new dict is built with every value processed (keys are untouched)
    - list: a new list is built with every element processed
    - str: decoded with safe_unquote only when it contains both "%" and "http"
    - anything else (numbers, None, tuples, ...) is returned as-is
    """
    if isinstance(data, dict):
        return {name: decode_http_urls_in_dict(item) for name, item in data.items()}
    if isinstance(data, list):
        return [decode_http_urls_in_dict(element) for element in data]
    if isinstance(data, str) and "%" in data and "http" in data:
        return safe_unquote(data)
    return data
# Shared CommonMark parser instance used by strip_markdown_links
md = MarkdownIt("commonmark")


def strip_markdown_links(markdown: str) -> str:
    """Flatten Markdown to text, unwrapping links and dropping images.

    The input is parsed with the module-level CommonMark parser and the token
    stream is re-rendered: link wrappers are removed while their inner text is
    kept, image tokens are skipped entirely, inline code keeps its backticks,
    and block/line boundaries become newlines. Runs of three or more newlines
    are collapsed to two and surrounding whitespace is stripped.

    Args:
        markdown: Markdown source text.

    Returns:
        The flattened text.
    """
    # Token types that contribute nothing (images are skipped with their children)
    dropped = {"link_open", "link_close", "image"}
    # Token types that are replaced by a fixed separator
    separators = {
        "softbreak": "\n",  # inline single line break
        "hardbreak": "\n",  # explicit line break (two spaces + newline in Markdown)
        "paragraph_close": "\n\n",
        "heading_close": "\n\n",
        "blockquote_close": "\n\n",
        "list_item_close": "\n",
        "bullet_list_close": "\n",
        "ordered_list_close": "\n",
        "hr": "\n\n",
    }

    def flatten(token_stream):
        pieces = []
        for token in token_stream:
            kind = token.type
            if kind in dropped:
                continue
            if kind in separators:
                pieces.append(separators[kind])
            elif token.children:
                # Inline or nested tokens: render the children instead
                pieces.append(flatten(token.children))
            elif kind == "code_inline":
                # Preserve inline code style
                pieces.append(f"`{token.content}`")
            else:
                pieces.append(token.content or "")
        return "".join(pieces)

    flattened = flatten(md.parse(markdown))
    # Normalize excessive blank lines (never more than 2 consecutive newlines)
    return re.sub(r"\n{3,}", "\n\n", flattened).strip()
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/vision_mcp_server.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import asyncio
import base64
import os
from fastmcp import FastMCP
from openai import OpenAI
# OpenAI credentials and endpoint, read from the environment once at import
# time. OPENAI_API_KEY has no default, so it is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
# Initialize FastMCP server
mcp = FastMCP("vision-mcp-server")
# Maximum file size for vision processing (20MB for images, 50MB for videos)
# Both limits are expressed in bytes and enforced by _validate_file_size.
MAX_IMAGE_SIZE = 20 * 1024 * 1024 # 20MB
MAX_VIDEO_SIZE = 50 * 1024 * 1024 # 50MB
def guess_mime_media_type_from_extension(file_path: str) -> tuple[str, str]:
    """
    Map a file's extension to its MIME type and media category.

    The extension is matched case-insensitively. Unrecognised (or missing)
    extensions fall back to JPEG.

    Returns:
        Tuple of (mime_type, media_category) where media_category is 'image' or 'video'
    """
    known_types = {
        # Image formats
        ".jpg": ("image/jpeg", "image"),
        ".jpeg": ("image/jpeg", "image"),
        ".png": ("image/png", "image"),
        ".gif": ("image/gif", "image"),
        ".webp": ("image/webp", "image"),
        ".bmp": ("image/bmp", "image"),
        ".tiff": ("image/tiff", "image"),
        ".tif": ("image/tiff", "image"),
        # Video formats
        ".mp4": ("video/mp4", "video"),
        ".mov": ("video/quicktime", "video"),
        ".avi": ("video/x-msvideo", "video"),
        ".mkv": ("video/x-matroska", "video"),
        ".webm": ("video/webm", "video"),
    }
    extension = os.path.splitext(file_path)[1].lower()
    # Default to JPEG for unknown formats
    return known_types.get(extension, ("image/jpeg", "image"))
def _validate_file_size(file_path: str, media_category: str) -> tuple[bool, str]:
"""
Validate file size based on media category.
Returns:
Tuple of (is_valid, error_message)
"""
try:
file_size = os.path.getsize(file_path)
max_size = MAX_VIDEO_SIZE if media_category == "video" else MAX_IMAGE_SIZE
max_size_mb = max_size / (1024 * 1024)
if file_size > max_size:
return (
False,
f"[ERROR]: File size ({file_size / (1024 * 1024):.2f}MB) exceeds maximum allowed size ({max_size_mb}MB) for {media_category}",
)
if file_size == 0:
return False, "[ERROR]: File is empty"
return True, ""
except Exception as e:
return False, f"[ERROR]: Failed to check file size: {e}"
@mcp.tool()
async def visual_question_answering(media_path_or_url: str, question: str) -> str:
    """Ask question about an image or a video and get the answer with GPT-4o vision model.

    Args:
        media_path_or_url: The path of the image/video file locally or its URL. Supports images (jpg, png, gif, webp, bmp, tiff) and videos (mp4, mov, avi, mkv, webm).
        question: The question to ask about the image or video.

    Returns:
        The answer to the media-related question.
    """
    # Flow: build the message (local file -> base64 data URL, otherwise a
    # plain http(s) URL), call the chat completions API, and retry failures
    # with exponential backoff. Validation problems return immediately.
    max_retries = 3

    # Create client once outside the retry loop
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

    response = None
    media_data = None

    for attempt in range(1, max_retries + 1):
        try:
            content = [{"type": "text", "text": question}]

            if os.path.exists(media_path_or_url):
                # Local file: determine its type, check the size, then inline it
                mime_type, media_category = guess_mime_media_type_from_extension(
                    media_path_or_url
                )
                is_valid, error_msg = _validate_file_size(
                    media_path_or_url, media_category
                )
                if not is_valid:
                    return error_msg
                with open(media_path_or_url, "rb") as media_file:
                    media_data = base64.b64encode(media_file.read()).decode("utf-8")
                media_reference = f"data:{mime_type};base64,{media_data}"
            elif "home/user" in media_path_or_url:
                return "[ERROR]: The visual_question_answering tool cannot access sandbox files, please use the local path provided by original instruction"
            elif not media_path_or_url.startswith(("http://", "https://")):
                # Not a local file and not a well-formed URL
                return "[ERROR]: Invalid URL format. URLs must start with http:// or https://"
            else:
                media_reference = media_path_or_url

            # The image_url content type is used for both images and videos
            content.append({"type": "image_url", "image_url": {"url": media_reference}})

            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": content}],
                max_tokens=1024,
            )
            # Success: stop retrying
            break
        except FileNotFoundError:
            return f"[ERROR]: File not found: {media_path_or_url}"
        except PermissionError:
            return f"[ERROR]: Permission denied when reading file: {media_path_or_url}"
        except Exception as e:
            if attempt >= max_retries:
                # Failures after the file was read (or for URLs) are attributed to the API call
                if media_data is not None or not os.path.exists(media_path_or_url):
                    error_type = "API call"
                else:
                    error_type = "file processing"
                return f"[ERROR]: Visual question answering failed during {error_type}: {e}\nNote: Files from sandbox are not available. You should use local path given in the instruction.\nSupported image formats: jpg, png, gif, webp, bmp, tiff\nSupported video formats: mp4, mov, avi, mkv, webm\nURLs must be publicly accessible and start with http:// or https://"
            # Exponential backoff: 10s after the first failure, 20s after the second
            await asyncio.sleep(5 * (2**attempt))

    # Extract and return response
    try:
        if response and response.choices and len(response.choices) > 0:
            return response.choices[0].message.content
        return "[ERROR]: Received empty response from API"
    except (AttributeError, IndexError) as e:
        return f"[ERROR]: Failed to parse API response: {e}"
# Script entry point: when this module is executed directly, start the MCP
# server using the stdio transport.
if __name__ == "__main__":
mcp.run(transport="stdio")
================================================
FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/vision_mcp_server_os.py
================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
import base64
import os
import aiohttp
import requests
from fastmcp import FastMCP
# Vision model configuration, read from the environment once at import time.
# None of these has a default, so each is None when its variable is unset.
VISION_API_KEY = os.environ.get("VISION_API_KEY")
# Passed directly as the URL of the POST request in visual_question_answering.
VISION_BASE_URL = os.environ.get("VISION_BASE_URL")
VISION_MODEL_NAME = os.environ.get("VISION_MODEL_NAME")
# Initialize FastMCP server
mcp = FastMCP("vision-mcp-server-os")
def guess_mime_media_type_from_extension(file_path: str) -> str:
    """Return the image MIME type implied by a path's extension (case-insensitive).

    Only PNG and GIF are distinguished; JPEG extensions and anything
    unrecognised resolve to "image/jpeg".
    """
    suffix = os.path.splitext(file_path)[1].lower()
    non_jpeg_types = {".png": "image/png", ".gif": "image/gif"}
    # Default to JPEG if unknown
    return non_jpeg_types.get(suffix, "image/jpeg")
@mcp.tool()
async def visual_question_answering(image_path_or_url: str, question: str) -> str:
    """Ask question about an image or a video and get the answer with a vision language model.

    Args:
        image_path_or_url: The path of the image file locally or its URL.
        question: The question to ask about the image.

    Returns:
        The answer to the image-related question.
    """
    # Local files and http(s) URLs are both inlined as base64 data URLs; any
    # other value is forwarded to the model unchanged. Every failure is
    # reported as a returned string rather than a raised exception.
    messages_for_llm = [
        {
            "role": "user",
            "content": [
                # Placeholder; the URL is filled in below
                {"type": "image_url", "image_url": {"url": None}},
                {
                    "type": "text",
                    "text": question,
                },
            ],
        }
    ]
    headers = {
        "Authorization": f"Bearer {VISION_API_KEY}",
        "Content-Type": "application/json",
    }

    try:
        if os.path.exists(image_path_or_url):  # Check if the file exists locally
            with open(image_path_or_url, "rb") as image_file:
                image_data = base64.b64encode(image_file.read()).decode("utf-8")
            mime_type = guess_mime_media_type_from_extension(image_path_or_url)
            messages_for_llm[0]["content"][0]["image_url"]["url"] = (
                f"data:{mime_type};base64,{image_data}"
            )
        elif image_path_or_url.startswith(("http://", "https://")):
            # Download the image here and inline it in the request
            async with aiohttp.ClientSession() as session:
                async with session.get(image_path_or_url) as resp:
                    if resp.status == 200:
                        image_bytes = await resp.read()
                        mime_type = resp.headers.get(
                            "Content-Type", "image/png"
                        )  # fallback MIME type
                        image_data = base64.b64encode(image_bytes).decode("utf-8")
                        messages_for_llm[0]["content"][0]["image_url"]["url"] = (
                            f"data:{mime_type};base64,{image_data}"
                        )
                    else:
                        return f"Failed to fetch image from URL: {image_path_or_url}"
        else:
            messages_for_llm[0]["content"][0]["image_url"]["url"] = image_path_or_url

        payload = {"model": VISION_MODEL_NAME, "messages": messages_for_llm}
        response = requests.post(VISION_BASE_URL, json=payload, headers=headers)
    except Exception as e:
        return f"Error: {e}"

    try:
        return response.json()["choices"][0]["message"]["content"]
    except (AttributeError, KeyError, IndexError, TypeError, ValueError):
        # Error payloads typically lack "choices" (KeyError/TypeError) and a
        # non-JSON body makes .json() raise ValueError. Return the raw body
        # as text so the tool still honours its `str` return type.
        return f"Error: Unexpected response from vision API: {response.text}"
# Script entry point: when this module is executed directly, start the MCP
# server using the stdio transport.
if __name__ == "__main__":
mcp.run(transport="stdio")