Repository: MiroMindAI/MiroThinker Branch: main Commit: 40a9faef2efd Files: 169 Total size: 1.1 MB Directory structure: gitextract_qqy1lifh/ ├── .github/ │ └── workflows/ │ └── run-ruff.yml ├── .gitignore ├── LICENSE ├── README.md ├── apps/ │ ├── collect-trace/ │ │ ├── README.md │ │ ├── pyproject.toml │ │ ├── scripts/ │ │ │ ├── collect_trace_claude37.sh │ │ │ ├── collect_trace_gpt41.sh │ │ │ ├── collect_trace_gpt5.sh │ │ │ └── collect_trace_qwen3.sh │ │ └── utils/ │ │ ├── converters/ │ │ │ ├── __init__.py │ │ │ ├── convert_non_oai_to_chatml.py │ │ │ ├── convert_oai_to_chatml.py │ │ │ ├── convert_to_chatml_auto_batch.py │ │ │ ├── example_usage.py │ │ │ └── system_prompts.py │ │ ├── merge_chatml_msgs_to_one_json.py │ │ └── process_logs.py │ ├── gradio-demo/ │ │ ├── README.md │ │ ├── main.py │ │ ├── prompt_patch.py │ │ ├── pyproject.toml │ │ └── utils.py │ ├── lobehub-compatibility/ │ │ ├── MiroThinkerToolParser.py │ │ ├── README.md │ │ ├── chat_template.jinja │ │ ├── requirements.txt │ │ ├── test_tool_parser.py │ │ └── unit_test.py │ ├── miroflow-agent/ │ │ ├── README.md │ │ ├── benchmarks/ │ │ │ ├── __init__.py │ │ │ ├── check_progress/ │ │ │ │ ├── check_progress_aime2025.py │ │ │ │ ├── check_progress_browsecomp.py │ │ │ │ ├── check_progress_browsecomp_zh.py │ │ │ │ ├── check_progress_deepsearchqa.py │ │ │ │ ├── check_progress_frames.py │ │ │ │ ├── check_progress_gaia-validation-text-103.py │ │ │ │ ├── check_progress_gaia-validation.py │ │ │ │ ├── check_progress_hle-text-2158.py │ │ │ │ ├── check_progress_hle-text-500.py │ │ │ │ ├── check_progress_hle.py │ │ │ │ ├── check_progress_seal-0.py │ │ │ │ ├── check_progress_webwalkerqa.py │ │ │ │ ├── check_progress_xbench_deepsearch.py │ │ │ │ └── common.py │ │ │ ├── common_benchmark.py │ │ │ ├── evaluators/ │ │ │ │ ├── __init__.py │ │ │ │ ├── calculate_average_score.py │ │ │ │ ├── eval_utils.py │ │ │ │ └── extract_futurex_results.py │ │ │ └── subset_extraction/ │ │ │ ├── gaia-text-103-grader.py │ │ │ └── 
gaia-to-text-103-mover.py │ │ ├── conf/ │ │ │ ├── __init__.py │ │ │ ├── agent/ │ │ │ │ ├── default.yaml │ │ │ │ ├── demo.yaml │ │ │ │ ├── mirothinker_1.7_keep5_max200.yaml │ │ │ │ ├── mirothinker_1.7_keep5_max300.yaml │ │ │ │ ├── mirothinker_v1.0.yaml │ │ │ │ ├── mirothinker_v1.0_keep5.yaml │ │ │ │ ├── mirothinker_v1.5.yaml │ │ │ │ ├── mirothinker_v1.5_keep5_max200.yaml │ │ │ │ ├── mirothinker_v1.5_keep5_max400.yaml │ │ │ │ ├── multi_agent.yaml │ │ │ │ ├── multi_agent_os.yaml │ │ │ │ ├── single_agent.yaml │ │ │ │ └── single_agent_keep5.yaml │ │ │ ├── benchmark/ │ │ │ │ ├── aime2025.yaml │ │ │ │ ├── browsecomp.yaml │ │ │ │ ├── browsecomp_zh.yaml │ │ │ │ ├── collect_trace.yaml │ │ │ │ ├── debug.yaml │ │ │ │ ├── deepsearchqa.yaml │ │ │ │ ├── default.yaml │ │ │ │ ├── frames.yaml │ │ │ │ ├── futurex.yaml │ │ │ │ ├── gaia-validation-text-103.yaml │ │ │ │ ├── gaia-validation.yaml │ │ │ │ ├── hle-text-2158.yaml │ │ │ │ ├── hle-text-500.yaml │ │ │ │ ├── hle.yaml │ │ │ │ ├── seal-0.yaml │ │ │ │ ├── webwalkerqa.yaml │ │ │ │ └── xbench_deepsearch.yaml │ │ │ ├── config.yaml │ │ │ └── llm/ │ │ │ ├── claude-3-7.yaml │ │ │ ├── default.yaml │ │ │ ├── gpt-5.yaml │ │ │ └── qwen-3.yaml │ │ ├── main.py │ │ ├── pyproject.toml │ │ ├── scripts/ │ │ │ ├── run_evaluate_multiple_runs_aime2025.sh │ │ │ ├── run_evaluate_multiple_runs_browsecomp.sh │ │ │ ├── run_evaluate_multiple_runs_browsecomp_zh.sh │ │ │ ├── run_evaluate_multiple_runs_debug.sh │ │ │ ├── run_evaluate_multiple_runs_deepsearchqa.sh │ │ │ ├── run_evaluate_multiple_runs_frames.sh │ │ │ ├── run_evaluate_multiple_runs_futurex.sh │ │ │ ├── run_evaluate_multiple_runs_gaia-validation-text-103.sh │ │ │ ├── run_evaluate_multiple_runs_gaia-validation.sh │ │ │ ├── run_evaluate_multiple_runs_hle-text-2158.sh │ │ │ ├── run_evaluate_multiple_runs_hle-text-500.sh │ │ │ ├── run_evaluate_multiple_runs_hle.sh │ │ │ ├── run_evaluate_multiple_runs_seal-0.sh │ │ │ ├── run_evaluate_multiple_runs_webwalkerqa.sh │ │ │ └── 
run_evaluate_multiple_runs_xbench_deepsearch.sh │ │ └── src/ │ │ ├── __init__.py │ │ ├── config/ │ │ │ ├── __init__.py │ │ │ └── settings.py │ │ ├── core/ │ │ │ ├── __init__.py │ │ │ ├── answer_generator.py │ │ │ ├── orchestrator.py │ │ │ ├── pipeline.py │ │ │ ├── stream_handler.py │ │ │ └── tool_executor.py │ │ ├── io/ │ │ │ ├── __init__.py │ │ │ ├── input_handler.py │ │ │ └── output_formatter.py │ │ ├── llm/ │ │ │ ├── __init__.py │ │ │ ├── base_client.py │ │ │ ├── factory.py │ │ │ ├── providers/ │ │ │ │ ├── __init__.py │ │ │ │ ├── anthropic_client.py │ │ │ │ └── openai_client.py │ │ │ └── util.py │ │ ├── logging/ │ │ │ ├── __init__.py │ │ │ ├── summary_time_cost.py │ │ │ └── task_logger.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── parsing_utils.py │ │ ├── prompt_utils.py │ │ └── wrapper_utils.py │ └── visualize-trace/ │ ├── .python-version │ ├── README.md │ ├── app.py │ ├── pyproject.toml │ ├── requirements.txt │ ├── run.py │ ├── static/ │ │ ├── css/ │ │ │ └── style.css │ │ └── js/ │ │ └── script.js │ ├── templates/ │ │ └── index.html │ └── trace_analyzer.py ├── assets/ │ ├── LOCAL-TOOL-DEPLOYMENT.md │ ├── QA.md │ └── qwen3_nonthinking.jinja ├── justfile └── libs/ └── miroflow-tools/ ├── README.md ├── pyproject.toml └── src/ ├── __init__.py └── miroflow_tools/ ├── __init__.py ├── dev_mcp_servers/ │ ├── jina_scrape_llm_summary.py │ ├── search_and_scrape_webpage.py │ ├── stateless_python_server.py │ └── task_planner.py ├── manager.py └── mcp_servers/ ├── __init__.py ├── audio_mcp_server.py ├── audio_mcp_server_os.py ├── browser_session.py ├── python_mcp_server.py ├── reading_mcp_server.py ├── reasoning_mcp_server.py ├── reasoning_mcp_server_os.py ├── searching_google_mcp_server.py ├── searching_sogou_mcp_server.py ├── serper_mcp_server.py ├── utils/ │ ├── __init__.py │ └── url_unquote.py ├── vision_mcp_server.py └── vision_mcp_server_os.py ================================================ FILE CONTENTS ================================================ 
================================================ FILE: .github/workflows/run-ruff.yml ================================================ name: lint on: pull_request: branches: [ "main" ] jobs: lint: if: github.repository_owner == 'MiroMindAI' name: lint pull request runs-on: ubuntu-latest steps: - name: checkout code uses: actions/checkout@v4 - name: Install uv uses: astral-sh/setup-uv@v5 - name: Check static error run: | uv tool run ruff@0.8.0 check --show-fixes --output-format=github - name: Reformat code style run: | echo '## Reformat summary' >> $GITHUB_STEP_SUMMARY if diff_output="$(uv tool run ruff@0.8.0 format --diff 2>&1)"; then echo "$diff_output" echo '✅ Format check passed.' >> "$GITHUB_STEP_SUMMARY" else echo "$diff_output" echo '❌ Format issues detected.' >> "$GITHUB_STEP_SUMMARY" { echo '```diff' echo "$diff_output" echo '```' } >> "$GITHUB_STEP_SUMMARY" exit 1 fi ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[codz] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py.cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # UV # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. #uv.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock #poetry.toml # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. 
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control #pdm.lock #pdm.toml .pdm-python .pdm-build/ # pixi # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. #pixi.lock # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one # in the .venv directory. It is recommended not to include this directory in version control. .pixi # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .envrc .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ # Abstra # Abstra is an AI-powered process automation framework. # Ignore directories containing user credentials, local state, and settings. # Learn more at https://abstra.io/docs .abstra/ # Visual Studio Code # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore # and can be added to the global gitignore or merged into this file. 
However, if you prefer, # you could uncomment the following to ignore the entire vscode folder # .vscode/ # Ruff stuff: .ruff_cache/ # PyPI configuration file .pypirc # Cursor # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data # refer to https://docs.cursor.com/context/ignore-files .cursorignore .cursorindexingignore # Marimo marimo/_static/ marimo/_lsp/ __marimo__/ # -- ADDED -- # Log files logs/ # Data directory - exclude everything except README data/ .idea/ .DS_Store apps/collect-trace/scripts/*/*.sh ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. 
Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS ================================================ FILE: README.md ================================================
MiroThinker

[![MODEL](https://img.shields.io/badge/Model-FFD21E?style=for-the-badge&logo=huggingface&logoColor=white)](https://huggingface.co/collections/miromind-ai/mirothinker-17) [![Blog](https://img.shields.io/badge/Blog-4285F4?style=for-the-badge&logo=google-chrome&logoColor=white)](https://miromind.ai/#blog) [![DATA](https://img.shields.io/badge/Data-0040A1?style=for-the-badge&logo=huggingface&logoColor=ffffff&labelColor)](https://huggingface.co/datasets/miromind-ai/MiroVerse-v0.1) [![GITHUB](https://img.shields.io/badge/Github-24292F?style=for-the-badge&logo=github&logoColor=white)](https://github.com/MiroMindAI) [![WEBSITE](https://img.shields.io/badge/Website-4285F4?style=for-the-badge&logo=google-chrome&logoColor=white)](https://miromind.ai/) [![DISCORD](https://img.shields.io/badge/Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white)](https://discord.com/invite/GPqEnkzQZd)
### 🚀 [Try MiroThinker!](https://dr.miromind.ai/)
**MiroThinker**: A deep research agent optimized for research and prediction. It achieves a score of 88.2 on the challenging BrowseComp benchmark. See [Quick Start](#-quick-start). ## 📋 Table of Contents - 📰 [News & Updates](#-news--updates) - 📝 [Introduction](#-introduction) - ✨ [Key Features](#-key-features) - 📈 [Performance on Benchmarks](#-performance-on-benchmarks) - 🚀 [Quick Start](#-quick-start) - 📊 [Benchmark Evaluation](#-benchmark-evaluation) - 🔬 [Trace Collection](#-trace-collection) - ❓ [FAQ & Troubleshooting](#-faq--troubleshooting) - 📄 [License](#-license) - 🙏 [Acknowledgments](#-acknowledgments) ## 📰 News & Updates - **[2026-03-11]** 🎉🎉🎉 Introducing [MiroThinker-1.7](https://huggingface.co/collections/miromind-ai/mirothinker-17), including [MiroThinker-1.7-mini](https://huggingface.co/miromind-ai/MiroThinker-1.7-mini) and [MiroThinker-1.7](https://huggingface.co/miromind-ai/MiroThinker-1.7). MiroThinker-1.7-mini achieves 72.3 on BrowseComp-ZH, setting a new SOTA among open-source models while using only 30B parameters. Our proprietary agent MiroThinker-H1 achieves leading performance on BrowseComp and BrowseComp-ZH among open-source and commercial models. - **\[2026-01-23\]** 🎉 We have brought two important updates to [MiroThinker online](http://dr.miromind.ai): (a) Core Research Report Generation: Deep Research online reports now support generation, preview, and sharing. (b) Extended Document Upload Types: Now supports the upload of various file formats, such as `.pdf`, `.doc`, `.ppt`, `.xls`, `.jpg`. Welcome to try it out! MiroThinker will continue to be maintained and iteratively upgraded, with the goal of becoming the best Research Agent you'll ever use! - **\[2026-01-05\]** 🎉🎉 We release [MiroThinker-v1.5](https://huggingface.co/collections/miromind-ai/mirothinker-v15), a series of open-source deep research agents optimized for financial prediction. 
[MiroThinker-v1.5-30B](https://huggingface.co/miromind-ai/MiroThinker-v1.5-30B) surpasses Kimi-K2-Thinking on BrowseComp-ZH at much lower cost, using only 1/30 of the parameters. [MiroThinker-v1.5-235B](https://huggingface.co/miromind-ai/MiroThinker-v1.5-235B) scores 39.2% on HLE-Text, 69.8% on BrowseComp, 71.5% on BrowseComp-ZH, and 80.8% on GAIA-Val-165, setting a new state-of-the-art among search agents.
📜 Click to expand older updates - **\[2025-11-13\]** 🎉 [MiroThinker-v1.0](https://huggingface.co/collections/miromind-ai/mirothinker-v10) is now released! Introducing **interactive scaling** as a third dimension of performance improvement, MiroThinker v1.0 supports 256K context window and up to 600 tool calls per task. Available in 8B, 30B, and 72B parameter scales, achieving 37.7%, 47.1%, 55.6%, and 81.9% on HLE-Text, BrowseComp, BrowseComp-ZH, and GAIA-Text-103, respectively. See [Technical Report](https://arxiv.org/abs/2511.11793) for more details. - **\[2025-09-11\]** MiroThinker-72B-Preview ranked 4th in this week's FutureX benchmark. See [FutureX](https://futurex-ai.github.io/). - **\[2025-09-08\]** [MiroThinker-v0.2](https://huggingface.co/collections/miromind-ai/mirothinker-v02) is now released, achieving open-source SOTA performance across multiple benchmarks, including HLE (17.8%), HLE-Text-Only (19.1%), BrowseComp-EN (17.2%), BrowseComp-ZH (29.4%), XBench-DeepSearch (56.0%), and Frames (74.8%). - **\[2025-09-07\]** We supported more benchmarks, including [BrowseComp-ZH](https://arxiv.org/abs/2504.19314), [XBench-DeepSearch](https://xbench.org/agi/aisearch), and [FutureX](https://futurex-ai.github.io/). We plan to add more benchmarks in the future. - **\[2025-08-22\]** Introducing streamlined deployment options for MiroThinker with optimized resource usage and faster startup times. Experience the interactive demo: [🚀 Try Gradio Demo](apps/gradio-demo) - **\[2025-08-08\]** [MiroThinker-v0.1](https://huggingface.co/collections/miromind-ai/mirothinker-v01-689301b6d0563321862d44a1) released.
## 📝 Introduction ### MiroThinker-1.7 Our new MiroThinker family represents a significant leap in building reliable agents for long-chain tasks. Engineered with an enhanced post-training pipeline, our MiroThinker-1.7 family achieves SOTA performance in deep research tasks among open-source models. **Key Features** - 🚀 MiroThinker-1.7 supports a 256K context window, long-horizon reasoning, and deep multi-step analysis. - 🔧 Handles up to 300 tool interactions per task, now with more accurate stepwise reasoning and decision-making. - 📦 Released in 30B and 235B parameter scales, accompanied by a comprehensive suite of tools and workflows to flexibly support diverse research settings and compute budgets. - Our proprietary agent, MiroThinker-H1, provides promising evidence for long-chain verifiable reasoning — reasoning processes that are step-verifiable and globally verifiable, improving the performance of complex agentic workflows.
| Model Name | Parameters | Max Context | Max Tool Calls | HF Link | |:---------------------:|:-----------------------------:|:-----------:|:--------------:|:------------------------------------------------------------------:| | MiroThinker-1.7-mini | 30B | 256K | 300 | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-1.7-mini) | | MiroThinker-1.7 | 235B | 256K | 300 | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-1.7) |
MiroThinker-1.7 demonstrates strong general-research performance across a broad range of benchmarks, achieving 74.0%, 75.3%, 82.7% and 42.9% on BrowseComp, BrowseComp-ZH, GAIA-Val-165 and HLE-Text, respectively. MiroThinker-1.7 achieves SOTA performance on BrowseComp-ZH. ![image](/assets/1.7_main_results.png) ### MiroThinker-v1.5
📦 Click to expand MiroThinker-v1.5 details MiroThinker v1.5 is the world-leading open-source search agent that advances tool-augmented reasoning through **interactive scaling** — training the agent to handle deeper and more frequent agent-environment interactions as a third dimension of performance improvement, beyond model size and context length. ![image](https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/mirothinker_v1.5_framework.png) **Key Features** - 🚀 MiroThinker v1.5 supports a 256K context window, long-horizon reasoning, and deep multi-step analysis. - 🔧 Handles up to 400 tool calls per task — a substantial improvement over previous open-source research agents. - 📦 Released in 30B and 235B parameter scales, accompanied by a comprehensive suite of tools and workflows to flexibly support diverse research settings and compute budgets.
| Agent Name | Base Agent | Max Context | Max Tool Calls | HF Link | |:---------------------:|:-----------------------------:|:-----------:|:--------------:|:------------------------------------------------------------------:| | MiroThinker-v1.5-30B | Qwen3-30B-A3B-Thinking-2507 | 256K | 400 | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-v1.5-30B) | | MiroThinker-v1.5-235B | Qwen3-235B-A22B-Thinking-2507 | 256K | 400 | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-v1.5-235B) |
MiroThinker v1.5 demonstrates strong general-research performance across a broad range of benchmarks, achieving 39.2%, 69.8%, 71.5%, and 80.8% on HLE-Text, BrowseComp, BrowseComp-ZH, and GAIA-Val-165, respectively. These results surpass previous open-source agents and set a new world-leading result on BrowseComp. ![image](https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/mirothinker_v1.5_browsecomp.png)
### MiroThinker-v1.0
📦 Click to expand MiroThinker-v1.0 details Unlike previous agents that scale only model size or context length, MiroThinker v1.0 introduces **interactive scaling** at the agent level, systematically training the agent to handle deeper and more frequent agent–environment interactions as a third dimension of performance improvement. Interactive scaling leverages environment feedback and external information acquisition to correct errors and refine trajectories. ![image](https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/assets/MiroThinker_v1.0_Overall.png) ### ✨ Key Features - 🚀 **256K Context Window**: Supports long-horizon reasoning and deep multi-step analysis - 🔧 **600 Tool Calls**: Handles up to 600 tool calls per task — a substantial improvement over previous open-source research agents - 📦 **Multiple Scales**: Released in 8B, 30B, and 72B parameter scales, accompanied by a comprehensive suite of tools and workflows to flexibly support diverse research settings and compute budgets
| Agent Name | Base Agent | Max Context | Max Tool Calls | HF Link | |:--------------------:|:---------------------------:|:-----------:|:--------------:|:------------------------------------------------------------------:| | MiroThinker-v1.0-8B | Qwen3-8B | 256K | 600 | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-v1.0-8B) | | MiroThinker-v1.0-30B | Qwen3-30B-A3B-Thinking-2507 | 256K | 600 | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-v1.0-30B) | | MiroThinker-v1.0-72B | Qwen2.5-72B-Instruct | 256K | 600 | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-v1.0-72B) |
MiroThinker v1.0 demonstrates strong general-research performance across a broad range of benchmarks, achieving **37.7%**, **47.1%**, **55.6%**, and **81.9%** on HLE-Text, BrowseComp, BrowseComp-ZH, and GAIA-Text-103, respectively. These results surpass previous open-source agents and narrow the gap with commercial counterparts such as **GPT-5-high**.
MiroThinker
### MiroThinker-v0.2
📦 Click to expand MiroThinker-v0.2 details In this new version, we introduced three key improvements: - 📚 **Richer training data** from both English and Chinese sources, yielding significant gains in benchmark performance and generalization - 🎯 **Unified DPO training** with a single preference dataset across all agents - 📏 **Extended context length** from 40k to 64k for more challenging multi-turn tool-use tasks Compared to v0.1, MiroThinker v0.2 delivers consistent gains across benchmarks. For example, scores improved from **57.3 → 64.1** on **GAIA-Text-103** and from **17.0 → 29.4** on **BrowseComp-ZH**, reflecting substantial advancements in the model’s general research agent capabilities.
| Agent Name | Base Agent | Max Context | HF Link | |:------------------------:|:---------------------:|:-----------:|:----------------------------------------------------------------------:| | MiroThinker-4B-SFT-v0.2 | Qwen3-4B | 64K | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-4B-SFT-v0.2) | | MiroThinker-4B-DPO-v0.2 | Qwen3-4B | 64K | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-4B-DPO-v0.2) | | MiroThinker-8B-SFT-v0.2 | Qwen3-8B | 64K | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-8B-SFT-v0.2) | | MiroThinker-8B-DPO-v0.2 | Qwen3-8B | 64K | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-8B-DPO-v0.2) | | MiroThinker-14B-SFT-v0.2 | Qwen3-14B | 64K | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-14B-SFT-v0.2) | | MiroThinker-14B-DPO-v0.2 | Qwen3-14B | 64K | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-14B-DPO-v0.2) | | MiroThinker-32B-SFT-v0.2 | Qwen3-32B | 64K | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-32B-SFT-v0.2) | | MiroThinker-32B-DPO-v0.2 | Qwen3-32B | 64K | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-32B-DPO-v0.2) |
### MiroThinker-v0.1
📦 Click to expand MiroThinker-v0.1 details
MiroFlow Performance on GAIA-Validation

Performance of Open-Source Agents on GAIA-Validation Benchmark.

We have released the **MiroThinker v0.1** series, including both SFT and DPO variants at parameter scales of **8B**, **14B**, and **32B**. Notably, MiroThinker v0.1 achieves **state-of-the-art performance** among open-source models on the [GAIA benchmark](https://huggingface.co/datasets/gaia-benchmark/GAIA), a rigorous evaluation suite for advanced agentic capabilities, demonstrating its strength in long-context, decision-intensive, and real-world task scenarios.
| Agent Name | Base Agent | Max Context | HF Link | | :-----------------------: |:----------:|:-----------:| :--------------------------------------------------------------------:| | MiroThinker-8B-SFT-v0.1 | Qwen3-8B | 40K | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-8B-SFT-v0.1) | | MiroThinker-8B-DPO-v0.1 | Qwen3-8B | 40K | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-8B-DPO-v0.1) | | MiroThinker-14B-SFT-v0.1 | Qwen3-14B | 40K | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-14B-SFT-v0.1) | | MiroThinker-14B-DPO-v0.1 | Qwen3-14B | 40K | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-14B-DPO-v0.1) | | MiroThinker-32B-SFT-v0.1 | Qwen3-32B | 40K | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-32B-SFT-v0.1) | | MiroThinker-32B-DPO-v0.1 | Qwen3-32B | 40K | [🤗 link](https://huggingface.co/miromind-ai/MiroThinker-32B-DPO-v0.1) |
## ✨ Key Features ### 🤖 **MiroThinker-Optimized Framework** - 🔓 **Fully Open-Source Agent Framework**: Complete transparency with open framework and open agents - 🔗 **Tool Integration**: Seamless integration with external tools and APIs - 📝 **Trace Collection**: Comprehensive logging and analysis of agent interactions with elapsed time and estimated completion time displayed in minutes. Ready for SFT and DPO - 📊 **Benchmark Evaluation**: Extensive testing across multiple benchmark datasets ### 📊 **Comprehensive Benchmark Suite**
📋 Click to expand benchmark list - **GAIA Validation**: A benchmark for General AI Assistants. ([paper](https://arxiv.org/abs/2311.12983)) - **GAIA-Text-103**: A subset of GAIA Validation for text-only tasks. ([paper](https://arxiv.org/abs/2505.22648)) - **HLE**: Humanity's Last Exam. ([paper](https://arxiv.org/abs/2501.14249)) - **HLE-Text-2158**: A subset of HLE for text-only tasks. ([paper](https://arxiv.org/abs/2501.14249)) - **HLE-Text-500**: A subset of HLE for text-only tasks, created by [WebThinker](https://arxiv.org/pdf/2504.21776). ([paper](https://arxiv.org/pdf/2504.21776)) - **BrowseComp-EN**: Web browsing and comprehension tasks. ([paper](https://arxiv.org/abs/2504.12516)) - **BrowseComp-ZH**: A Chinese version of BrowseComp. ([paper](https://arxiv.org/abs/2504.19314)) - **WebWalkerQA**: Web navigation and question answering. ([paper](https://arxiv.org/abs/2501.07572)) - **Frames**: Factuality, Retrieval, And reasoning MEasurement Set. ([paper](https://arxiv.org/abs/2409.12941)) - **XBench-DeepSearch**: A benchmark for deep research agents. ([website](https://xbench.org/agi/aisearch)) - **FutureX**: A live benchmark designed for predicting unknown future. ([website](https://futurex-ai.github.io/)) - **SEAL-0**: A benchmark for evaluating LLMs on conflicting-evidence web questions. ([paper](https://arxiv.org/abs/2506.01062)) - **AIME2025**: American Invitational Mathematics Examination 2025. ([website](https://artificialanalysis.ai/evaluations/aime-2025)) - **DeepSearchQA**: Google's Deep Search Question Answering benchmark. ([paper](https://arxiv.org/abs/2505.20827))
## 📈 Performance on Benchmarks ### MiroThinker-1.7 > To prevent potential information leakage (e.g., retrieving benchmark answers from HuggingFace), we blocked access to certain websites during evaluation.
MiroThinker
### MiroThinker-v1.5
📦 Click to expand MiroThinker-v1.5 details > To prevent potential information leakage (e.g., searching benchmark answers from HuggingFace), access to HuggingFace has been explicitly disabled in these tools. > We further perform canary string testing on the tool outputs of all trajectories and disregard any trajectory found to be contaminated, treating it as an incorrect answer.
MiroThinker
### MiroThinker-v1.0
📦 Click to expand MiroThinker-v1.0 details
MiroThinker
### MiroThinker-v0.2
📦 Click to expand MiroThinker-v0.2 details #### Comparison with SOTA Research Agents
MiroThinker
#### GAIA Benchmark
MiroThinker
### MiroThinker-v0.1
📦 Click to expand MiroThinker-v0.1 details #### GAIA Benchmark
| **Method** | Text-103<br>Best Pass@1 | Text-103<br>Pass@1 (Avg@8) | Val-165<br>Best Pass@1 | Val-165<br>Pass@1 (Avg@8) | |------------------------------|:-----------------------:|:--------------------------:|:----------------------:|:-------------------------:| | **🔹—— 7B/8B Agents ——** | | | | | | Search-o1-7B | 17.5 | - | - | - | | R1-Searcher-7B | 20.4 | - | - | - | | WebDancer-7B | 31.0 | - | - | - | | WebSailor-7B | 37.9 | - | - | - | | CK-Pro-8B | 40.3 | - | 32.7 | - | | **MiroThinker-8B-SFT-v0.1** | 44.7 | 40.1 | 34.6 | 31.8 | | + Commercial Tools | 46.6 | 42.1 | 37.6 | 33.9 | | **MiroThinker-8B-DPO-v0.1** | 46.6 | 44.8 | 37.0 | 35.4 | | + Commercial Tools | **50.5** | **46.7** | **38.2** | **35.9** | | **🔹—— 14B Agents ——** | | | | | | **MiroThinker-14B-SFT-v0.1** | 47.6 | 44.4 | 37.0 | 34.4 | | + Commercial Tools | 49.5 | 47.5 | 41.8 | 39.8 | | **MiroThinker-14B-DPO-v0.1** | 48.5 | 46.6 | 42.4 | 39.2 | | + Commercial Tools | **52.4** | **48.5** | **45.5** | **42.0** | | **🔹—— 32B Agents ——** | | | | | | Qwen3-32B | 31.1 | 26.7 | 29.7 | 26.4 | | Search-o1-32B | 28.2 | - | - | - | | WebThinker-32B-RL | 48.5 | - | - | - | | WebDancer-QwQ-32B | 51.5 | - | - | - | | WebSailor-32B | 53.2 | - | - | - | | WebShaper-QwQ-32B | 53.3 | - | - | - | | **MiroThinker-32B-SFT-v0.1** | 55.3 | 51.3 | 44.9 | 42.7 | | + Commercial Tools | 58.3 | 54.2 | 48.5 | 45.8 | | **MiroThinker-32B-DPO-v0.1** | 57.3 | 54.1 | 48.5 | 45.9 | | + Commercial Tools | **60.2** | **57.9** | **50.9** | **48.9** |
1. Following the practices of WebThinker, WebAgents, and CognitiveKernel, we report the Best Pass@1, the highest score across three runs, which often reflects stronger performance, though it may exhibit some variability. To provide a more stable measure, we additionally report Pass@1 (Avg@8), which offers greater consistency at the cost of slightly lower scores. 1. For consistency with prior open-source works, we evaluate GAIA-Text-103 using the WebAgents LLM-as-a-Judge template, and report results on GAIA-Val-165 using the official GAIA scorer script. 1. By default, we use open-source tools wherever possible, except for the code tool [E2B](https://github.com/e2b-dev/E2B) and the Google search tool [Serper](https://serper.dev/). We use [Whisper](https://huggingface.co/openai/whisper-large-v3-turbo), [Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct), and [Qwen3-235B-A22B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507) in our implementation. The framework can be easily extended to other open-source tools of your choice. 1. Replacing these open-source tools with commercial alternatives can yield performance gains. Commercial tools were mainly used for multimodal capabilities and certain complex reasoning subtasks. The majority of tasks, including planning, browsing, refinement, navigation, and more, were handled by our agents. #### More Benchmarks
| Method | HLE<br>Pass@1 | Frames<br>Pass@1 | BrowseComp<br>Pass@1 | BrowseComp-ZH<br>Pass@1 | WebWalkerQA<br>Pass@1 | |------------------------------|:-------------:|:----------------:|:--------------------:|:-----------------------:|:---------------------:| | OpenAI Deep Research | 26.6 | - | 51.5 | 42.9 | - | | Gemini Deep Research | 26.9 | - | - | - | - | | Kimi-Researcher | 26.9 | 78.8 | - | - | - | | | | | | | | | WebDancer-7B | - | - | - | - | 36.0 | | WebSailor-7B | - | - | 6.7 | 14.2 | - | | **MiroThinker-8B-SFT-v0.1** | - | 58.0 | 5.5 | 9.3 | 41.3 | | **MiroThinker-8B-DPO-v0.1** | - | 64.4 | 8.7 | 13.6 | 45.7 | | | | | | | | | WebThinker-32B-RL | - | - | - | - | 46.5 | | WebDancer-QwQ-32B | - | - | 3.8 | 18.0 | 47.9 | | WebSailor-32B | - | - | 10.5 | 25.5 | - | | WebShaper-32B | - | - | - | - | 51.4 | | **MiroThinker-32B-SFT-v0.1** | 10.2 | 70.4 | 10.6 | 13.8 | 45.7 | | **MiroThinker-32B-DPO-v0.1** | 11.8 | 71.7 | 13.0 | 17.0 | 49.3 |
1. MiroThinker’s performance was tested with this repository and open-source tools; other agents’ results are from their papers and official sites. 1. As [MiroVerse-v0.1](https://huggingface.co/datasets/miromind-ai/MiroVerse-v0.1) mainly contains English data, the agent’s Chinese capability is limited. We plan to add more Chinese data to improve performance in the next version.
## 🚀 Quick Start For optimal usage, we recommend using MiroThinker with this tool-enabled agent framework and thinking mode enabled. ### Prerequisites - 🐍 **Python 3.10+** - 📦 **uv package manager** ([Installation guide](https://github.com/astral-sh/uv)) - 🔑 **Required API keys** (see configuration section below) ### Installation ```bash # Clone the repository git clone https://github.com/MiroMindAI/MiroThinker cd MiroThinker # Setup environment cd apps/miroflow-agent uv sync # Configure API keys cp .env.example .env # Edit .env with your API keys (SERPER_API_KEY, JINA_API_KEY, E2B_API_KEY, etc.) ``` > **📝 Environment Variables**: See [Tool Configuration](#tool-configuration) section for required API keys. ### Tool Configuration #### Minimal Configuration for MiroThinker-1.7 | Server | Description | Tools Provided | Required Environment Variables | |:-------|:------------|:---------------|:-------------------------------| | **`tool-python`** | Execution environment and file management (E2B sandbox) | `create_sandbox`, `run_command`, `run_python_code`, `upload_file_from_local_to_sandbox`, `download_file_from_sandbox_to_local`, `download_file_from_internet_to_sandbox` | `E2B_API_KEY` | | **`search_and_scrape_webpage`** | Google search via Serper API | `google_search` | `SERPER_API_KEY`, `SERPER_BASE_URL` | | **`jina_scrape_llm_summary`** | Web scraping with LLM-based information extraction | `scrape_and_extract_info` | `JINA_API_KEY`, `JINA_BASE_URL`, `SUMMARY_LLM_BASE_URL`, `SUMMARY_LLM_MODEL_NAME`, `SUMMARY_LLM_API_KEY` | **Minimal `.env` configuration example:** ```bash # Required for MiroThinker 1.7, v1.5, and v1.0 (minimal setup) SERPER_API_KEY=your_serper_key SERPER_BASE_URL="https://google.serper.dev" JINA_API_KEY=your_jina_key JINA_BASE_URL="https://r.jina.ai" E2B_API_KEY=your_e2b_key # Required for jina_scrape_llm_summary # Note: Summary LLM can be a small model (e.g., Qwen3-14B or GPT-5-Nano) # The choice has minimal impact on performance, use what's most 
convenient SUMMARY_LLM_BASE_URL="https://your_summary_llm_base_url/v1/chat/completions" SUMMARY_LLM_MODEL_NAME=your_llm_model_name # e.g., "Qwen/Qwen3-14B" or "gpt-5-nano" SUMMARY_LLM_API_KEY=your_llm_api_key # Optional, depends on LLM provider # Required for benchmark evaluation (LLM-as-a-Judge) OPENAI_API_KEY=your_openai_key # Required for running benchmark evaluations OPENAI_BASE_URL="https://api.openai.com/v1" # Optional, defaults to OpenAI's API ``` > **💡 Why this is minimal**: These 3 MCP servers cover the core capabilities needed for research tasks: web search, content extraction, and code execution. All other servers are optional enhancements. > > **🤖 Summary LLM**: The `SUMMARY_LLM` can be a small model like Qwen3-14B or GPT-5-Nano. The choice has minimal impact on overall performance, use whichever is most convenient for your setup. > > **📊 For Benchmark Evaluation**: If you plan to run benchmark evaluations, you also need `OPENAI_API_KEY` (and optionally `OPENAI_BASE_URL`) for LLM-as-a-Judge functionality used in evaluation scripts. > > **🖼️ For GAIA Multimodal Tasks**: GAIA-Val-165 includes tasks with image/audio/video files. Since MiroThinker is a text-only LLM, GPT-4o is used to pre-process these files into text descriptions. The same `OPENAI_API_KEY` is used for both this preprocessing and LLM-as-a-Judge. > > **📖 For more details**: See [MiroFlow Tools README](libs/miroflow-tools/README.md) for complete documentation of all available tools.
🔧 Click to expand additional available tools The following optional tools are available but were not used in MiroThinker v1.0-1.7 evaluation: | Server Name | Type | Description | |:---------------------|:-------------|:--------------------------------------------| | `tool-vqa` | Commercial | Vision processing using Claude | | `tool-vqa-os` | Open-Source | Vision processing (open-source alternative) | | `tool-transcribe` | Commercial | Audio transcription using OpenAI | | `tool-transcribe-os` | Open-Source | Audio transcription using Whisper | | `tool-reasoning` | Commercial | Reasoning engine using Claude | | `tool-reasoning-os` | Open-Source | Reasoning engine (open-source alternative) | | `tool-reading` | Open-Source | Document reading using MarkItDown | | `tool-google-search` | Commercial | Web search using Google + scraping | | `tool-sogou-search` | Commercial | Web search using Sogou (Chinese) | > **📖 Local Deployment**: For instructions on deploying open-source tools (`tool-vqa-os`, `tool-transcribe-os`, `tool-reasoning-os`) locally, see [Local Tool Deployment Guide](assets/LOCAL-TOOL-DEPLOYMENT.md). See the [MiroFlow Tools README](libs/miroflow-tools/README.md) for complete documentation of all available tools.
#### Pre-configured Agent Settings The `apps/miroflow-agent/conf/agent/` directory contains several pre-configured agent settings. Each configuration uses different tools and requires corresponding environment variables in your `.env` file. > **💡 Recommended**: For MiroThinker-1.7, use `mirothinker_1.7_keep5_max200` (with context management, recommended for most tasks) or `mirothinker_1.7_keep5_max300` (only used for BrowseComp and BrowseComp-ZH). | Configuration | Description | Max Turns | Context Retention | Required Environment Variables | Recommended For | |:---------------------------------------|:------------|:----------|:------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------| | **`mirothinker_1.7_keep5_max200`** ⭐ | Single-agent with context management | 200 | Keep 5 most recent | `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL`, `E2B_API_KEY`, `SUMMARY_LLM_BASE_URL`, `SUMMARY_LLM_MODEL_NAME`, `SUMMARY_LLM_API_KEY` | **1.7 (recommended for most tasks)** | | **`mirothinker_1.7_keep5_max300`** ⭐ | Single-agent with context management | 300 | Keep 5 most recent | Same as above | **1.7 (for BrowseComp & BrowseComp-ZH)** |
📦 Click to expand legacy configurations (v0.1–v1.5) | Configuration | Description | Max Turns | Context Retention | Required Environment Variables | Recommended For | |:-------------------------|:------------|:----------|:------------------|:-------------------------------|:----------------| | **`mirothinker_v1.5_keep5_max200`** | Single-agent with context management | 200 | Keep 5 most recent | `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL`, `E2B_API_KEY`, `SUMMARY_LLM_BASE_URL`, `SUMMARY_LLM_MODEL_NAME`, `SUMMARY_LLM_API_KEY` | **v1.5 (recommended for most tasks)** | | **`mirothinker_v1.5_keep5_max400`** | Single-agent with context management | 400 | Keep 5 most recent | Same as above | **v1.5 (for BrowseComp & BrowseComp-ZH)** | | **`mirothinker_v1.5`** | Single-agent for MiroThinker v1.5 | 600 | Keep all results | Same as above | **v1.5** | | **`mirothinker_v1.0_keep5`** | Single-agent with context management | 600 | Keep 5 most recent | Same as above | **v1.0** | | **`mirothinker_v1.0`** | Single-agent for MiroThinker v1.0 | 600 | Keep all results | Same as above | **v1.0** | | **`multi_agent`** | Multi-agent with commercial tools (v0.1/v0.2) | 50 | Keep all results | `E2B_API_KEY`, `ANTHROPIC_API_KEY`, `ANTHROPIC_BASE_URL`, `OPENAI_API_KEY`, `OPENAI_BASE_URL`, `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL` | v0.1/v0.2 | | **`multi_agent_os`** | Multi-agent with open-source tools (v0.1/v0.2) | 50 | Keep all results | `E2B_API_KEY`, `VISION_API_KEY`, `VISION_BASE_URL`, `VISION_MODEL_NAME`, `WHISPER_API_KEY`, `WHISPER_BASE_URL`, `WHISPER_MODEL_NAME`, `REASONING_API_KEY`, `REASONING_BASE_URL`, `REASONING_MODEL_NAME`, `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL` | v0.1/v0.2 |
> **💡 Note**: All environment variables are listed in `apps/miroflow-agent/.env.example`. Copy it to `.env` and fill in the values for the tools you plan to use. #### Creating Custom Tool Configurations
🔧 Click to expand custom tool configuration guide You can create your own YAML configuration file to freely combine MCP servers. Here's how: 1. **Create a new YAML file** in `apps/miroflow-agent/conf/agent/`: ```yaml # conf/agent/my_custom_config.yaml defaults: - default - _self_ main_agent: tools: - tool-python # Execution environment - search_and_scrape_webpage # Google search - jina_scrape_llm_summary # Web scraping with LLM - tool-vqa # Vision processing (optional) - tool-transcribe # Audio processing (optional) - tool-reasoning # Reasoning engine (optional) - tool-reading # Document reading (optional) max_turns: 300 # Maximum number of turns sub_agents: agent-browsing: # Optional sub-agent tools: - tool-google-search - tool-vqa - tool-reading - tool-python max_turns: 50 keep_tool_result: -1 # Context retention budget: -1 keeps all tool results, or specify K to keep only the K most recent tool responses ``` > **💡 Context Retention Strategy**: The `keep_tool_result` parameter implements a **recency-based context retention** strategy. In the standard ReAct paradigm, all tool outputs are retained in the message history, which can lead to inefficient context utilization. Empirically, we observe that the agent's subsequent actions depend primarily on recent observations rather than distant ones. This strategy retains only the most recent K tool responses (where K is the `keep_tool_result` value) while preserving the complete sequence of thoughts and actions. 
> > **Benefits:** > > - ✅ Preserves the reasoning and action trace > - ✅ Focuses the agent's attention on the most contextually relevant observations > - ✅ Frees additional context space for extended reasoning and deeper tool-use trajectories > - ✅ Does not lead to performance degradation while allowing more context space for interactive scaling > > **Usage:** Set `keep_tool_result: -1` to keep all tool results, or specify a positive integer K (e.g., `keep_tool_result: 5`) to keep only the K most recent tool responses. 2. **Use your custom configuration** when running evaluations: ```bash cd apps/miroflow-agent uv run main.py llm=qwen-3 agent=my_custom_config llm.base_url=https://your_base_url/v1 ``` 3. **Configure environment variables** in `.env` based on the tools you use. All available environment variables are listed in `apps/miroflow-agent/.env.example`. Copy it to `.env` and configure the variables according to your chosen configuration: ```bash cd apps/miroflow-agent cp .env.example .env # Edit .env with your actual API keys ``` **For MiroThinker v1.5** (`mirothinker_v1.5_keep5_max200.yaml`, `mirothinker_v1.5_keep5_max400.yaml`, or `mirothinker_v1.5.yaml`) and **v1.0** (`mirothinker_v1.0_keep5.yaml` or `mirothinker_v1.0.yaml`), see the [Minimal Configuration](#minimal-configuration-for-mirothinker-17) section above for the complete configuration example. **For other configurations**, refer to the [Pre-configured Agent Settings](#pre-configured-agent-settings) table above to see which environment variables are required.
🔑 Click to expand optional API keys ```bash # API for LLM-as-a-Judge (for benchmark testing, required for benchmark evaluation) OPENAI_API_KEY=your_openai_key OPENAI_BASE_URL="https://api.openai.com/v1" # Optional, defaults to OpenAI's API # API for Open-Source Audio Transcription Tool (for benchmark testing, optional) WHISPER_MODEL_NAME="openai/whisper-large-v3-turbo" WHISPER_API_KEY=your_whisper_key WHISPER_BASE_URL="https://your_whisper_base_url/v1" # API for Open-Source VQA Tool (for benchmark testing, optional) VISION_MODEL_NAME="Qwen/Qwen2.5-VL-72B-Instruct" VISION_API_KEY=your_vision_key VISION_BASE_URL="https://your_vision_base_url/v1/chat/completions" # API for Open-Source Reasoning Tool (for benchmark testing, optional) REASONING_MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507" REASONING_API_KEY=your_reasoning_key REASONING_BASE_URL="https://your_reasoning_base_url/v1/chat/completions" # API for Claude Sonnet 3.7 as Commercial Tools (optional) ANTHROPIC_API_KEY=your_anthropic_key # API for Sogou Search (optional) TENCENTCLOUD_SECRET_ID=your_tencent_cloud_secret_id TENCENTCLOUD_SECRET_KEY=your_tencent_cloud_secret_key # API for Summary LLM (can use small models like Qwen3-14B or GPT-5-Nano) SUMMARY_LLM_BASE_URL="https://your_summary_llm_base_url/v1/chat/completions" SUMMARY_LLM_MODEL_NAME=your_summary_llm_model_name # e.g., "Qwen/Qwen3-14B" or "gpt-5-nano" SUMMARY_LLM_API_KEY=your_summary_llm_api_key ```
### Serve the MiroThinker Agent #### Option 1 (Recommended): Serve with SGLang or vLLM Use SGLang to serve MiroThinker models at port 61002: ```bash NUM_GPUS=4 PORT=61002 # Downloading agent from HF AGENT_PATH=miromind-ai/MiroThinker-1.7-mini python3 -m sglang.launch_server \ --model-path $AGENT_PATH \ --tp $NUM_GPUS \ --dp 1 \ --host 0.0.0.0 \ --port $PORT \ --trust-remote-code ``` > **📍 Server URL**: This will start a server at `http://0.0.0.0:$PORT`. Use this as your server base URL (e.g., `http://0.0.0.0:61002/v1`). #### Option 2: Quantized Light-Weight Options We also provide comprehensive guidance for serving MiroThinker agents using CPU-optimized and GPU-accelerated quantization techniques, along with detailed analysis and guidelines for deployment with llama.cpp, Ollama, SGLang, and other inference frameworks. > **📖 Complete Guide**: See [Deployment Documentation](apps/gradio-demo/) for detailed deployment instructions. ### Run Your First Task After setting up the environment and starting your server, run `main.py` to test with a default question: *"What is the title of today's arxiv paper in computer science?"* ```bash cd apps/miroflow-agent # Using MiroThinker agents (requires your own server) uv run python main.py llm=qwen-3 agent=mirothinker_1.7_keep5_max200 llm.base_url=http://localhost:61002/v1 # Or using Claude (requires ANTHROPIC_API_KEY in .env) uv run python main.py llm=claude-3-7 agent=single_agent_keep5 # Or using GPT-5 (requires OPENAI_API_KEY in .env) uv run python main.py llm=gpt-5 agent=single_agent_keep5 ``` **To customize your question**, edit `main.py` line 32: ```python task_description = "Your custom question here" ``` The agent will search the web, execute code if needed, and provide an answer with sources. > **📖 More details**: See [apps/miroflow-agent/README.md](apps/miroflow-agent/README.md) for available configurations and troubleshooting. 
## 📊 Benchmark Evaluation > For researchers who want to reproduce our benchmark results or evaluate on standard benchmarks. ### Download Benchmark Data ```bash cd MiroThinker # Back to project root wget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/data_20251115_password_protected.zip unzip data_20251115_password_protected.zip # Password: pf4* rm data_20251115_password_protected.zip ``` ### Run Benchmark Evaluation > **Note:** For MiroThinker-1.7, use `mirothinker_1.7_keep5_max200` or `mirothinker_1.7_keep5_max300` (both with context management). **Available Parameters:** You can customize the evaluation by setting the following environment variables before running the script: | Parameter | Default | Description | |:----------|:--------|:------------| | `LLM_MODEL` | `"MiroThinker-Agents"` | Agent name identifier | | `BASE_URL` | `"https://your-api.com/v1"` | Base URL of your server | | `NUM_RUNS` | Varies by benchmark | Number of evaluation runs (3 for most benchmarks, 8 for GAIA/XBench/FutureX/SEAL-0, 32 for AIME2025) | | `LLM_PROVIDER` | `"qwen"` | LLM provider (e.g., `qwen`, `openai`, `anthropic`) | | `AGENT_SET` | `"mirothinker_1.7_keep5_max200"` | Agent configuration (e.g., `mirothinker_1.7_keep5_max200`, `mirothinker_1.7_keep5_max300`) 
| | `MAX_CONTEXT_LENGTH` | `262144` | Maximum context length (256K) | | `MAX_CONCURRENT` | `10` | Maximum concurrent tasks | | `PASS_AT_K` | `1` | Pass@K evaluation metric | | `TEMPERATURE` | `1.0` | Sampling temperature | | `API_KEY` | `"xxx"` | API key for the server | **Example Usage:** ```bash # Navigate to the miroflow-agent directory first cd apps/miroflow-agent # Basic usage with 1.7 (recommended) NUM_RUNS=8 LLM_MODEL="MiroThinker-1.7-mini" BASE_URL="https://your-api.com/v1" bash scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh # Or with v1.0 # NUM_RUNS=8 LLM_MODEL="MiroThinker-v1.0-30B" BASE_URL="https://your-api.com/v1" bash scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh # Customize number of runs and agent configuration (1.7 with context management) LLM_MODEL="MiroThinker-1.7-mini" \ BASE_URL="https://your-api.com/v1" \ NUM_RUNS=8 \ AGENT_SET="mirothinker_1.7_keep5_max200" \ bash scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh ```
📋 Click to expand all benchmark commands > **⚠️ Important for MiroThinker-1.7**: To reproduce our reported results, you must set the correct `AGENT_SET`: > > - **BrowseComp & BrowseComp-ZH**: Use `AGENT_SET="mirothinker_1.7_keep5_max300"` > - **All other benchmarks**: Use `AGENT_SET="mirothinker_1.7_keep5_max200"` ```bash # Navigate to the miroflow-agent directory first cd apps/miroflow-agent # HLE NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_hle.sh # HLE-Text-2158 NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_hle-text-2158.sh # HLE-Text-500 NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_hle-text-500.sh # GAIA-Text-103 NUM_RUNS=8 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh # GAIA-Validation (GAIA-Val-165) NUM_RUNS=8 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_gaia-validation.sh # BrowseComp-EN (⚠️ use max300) NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max300" bash scripts/run_evaluate_multiple_runs_browsecomp.sh # BrowseComp-ZH (⚠️ use max300) NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max300" bash scripts/run_evaluate_multiple_runs_browsecomp_zh.sh # WebWalkerQA NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_webwalkerqa.sh # XBench-DeepSearch NUM_RUNS=8 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_xbench_deepsearch.sh # FRAMES NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_frames.sh # SEAL-0 
NUM_RUNS=8 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_seal-0.sh # FutureX NUM_RUNS=8 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_futurex.sh # AIME2025 NUM_RUNS=32 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_aime2025.sh # DeepSearchQA NUM_RUNS=3 LLM_MODEL="xxx" BASE_URL="xxx" AGENT_SET="mirothinker_1.7_keep5_max200" bash scripts/run_evaluate_multiple_runs_deepsearchqa.sh ```
### Monitor Evaluation Progress
📊 Click to expand progress monitoring commands ```bash # Navigate to the miroflow-agent directory first cd apps/miroflow-agent # For HLE python benchmarks/check_progress/check_progress_hle.py /path/to/evaluation/logs # For HLE-Text-2158 python benchmarks/check_progress/check_progress_hle-text-2158.py /path/to/evaluation/logs # For HLE-Text-500 python benchmarks/check_progress/check_progress_hle-text-500.py /path/to/evaluation/logs # For BrowseComp-EN python benchmarks/check_progress/check_progress_browsecomp.py /path/to/evaluation/logs # For BrowseComp-ZH python benchmarks/check_progress/check_progress_browsecomp_zh.py /path/to/evaluation/logs # For GAIA-Validation python benchmarks/check_progress/check_progress_gaia-validation.py /path/to/evaluation/logs # For GAIA-Text-103 python benchmarks/check_progress/check_progress_gaia-validation-text-103.py /path/to/evaluation/logs # For WebWalkerQA python benchmarks/check_progress/check_progress_webwalkerqa.py /path/to/evaluation/logs # For Frames python benchmarks/check_progress/check_progress_frames.py /path/to/evaluation/logs # For XBench-DeepSearch python benchmarks/check_progress/check_progress_xbench_deepsearch.py /path/to/evaluation/logs # For SEAL-0 python benchmarks/check_progress/check_progress_seal-0.py /path/to/evaluation/logs # For AIME2025 python benchmarks/check_progress/check_progress_aime2025.py /path/to/evaluation/logs # For DeepSearchQA python benchmarks/check_progress/check_progress_deepsearchqa.py /path/to/evaluation/logs ```
## 🔬 Trace Collection
📋 Click to expand trace collection commands ```bash cd apps/collect-trace # Collect Traces for SFT bash scripts/collect_trace_claude37.sh bash scripts/collect_trace_gpt5.sh # Collect Traces for DPO bash scripts/collect_trace_qwen3.sh ```
## ❓ FAQ & Troubleshooting ### Common Issues
🔧 Click to expand troubleshooting guide #### **Q: Which version should I use?** **A:** We recommend **MiroThinker-1.7** ⭐ with the minimal configuration: - **v1.7** ⭐: Latest version with 256K context, world-leading performance. Use config (with context management): - `mirothinker_1.7_keep5_max200` (up to 200 turns, recommended for most tasks) - `mirothinker_1.7_keep5_max300` (up to 300 turns, only used for BrowseComp and BrowseComp-ZH) #### **Q: How do I get API keys?** **A:** You need these keys for minimal setup: - **SERPER_API_KEY**: Get from [Serper.dev](https://serper.dev/) (Google search API) - **JINA_API_KEY**: Get from [Jina.ai](https://jina.ai/) (Web scraping) - **E2B_API_KEY**: Get from [E2B.dev](https://e2b.dev/) (Code execution sandbox) - **SUMMARY_LLM_API_KEY**: Your LLM API credentials (for content summarization). Can be a small model like Qwen3-14B or GPT-5-Nano—the choice has minimal impact on performance. - **OPENAI_API_KEY**: Get from [OpenAI](https://platform.openai.com/) (Required for benchmark evaluation, used for LLM-as-a-Judge) - **OPENAI_BASE_URL**: Optional, defaults to `https://api.openai.com/v1`. Can be changed to use OpenAI-compatible APIs. #### **Q: Agent server connection errors** **A:** Common issues: - **Check base URL format**: Should end with `/v1` (e.g., `https://your-api.com/v1`) - **Verify API key**: Ensure `API_KEY` is set correctly in environment or script - **Check server status**: Make sure your server is running and accessible - **Network issues**: Verify firewall/network settings allow connections #### **Q: Evaluation script fails to run** **A:** Troubleshooting steps: 1. **Check working directory**: Make sure you're in `apps/miroflow-agent` directory 1. **Verify environment**: Run `uv sync` to ensure dependencies are installed 1. **Check .env file**: Ensure all required environment variables are set 1. **Review logs**: Check `logs/` directory for detailed error messages 1. 
**Verify data path**: Ensure benchmark data is downloaded and in the correct location #### **Q: Out of memory errors** **A:** Solutions: - **Reduce context length**: Set `MAX_CONTEXT_LENGTH` to a smaller value (e.g., 131072 for 128K) - **Use context management with fewer turns**: - For v1.7: Use `mirothinker_1.7_keep5_max200` or `mirothinker_1.7_keep5_max300` (with context management) - **Reduce concurrent tasks**: Set `MAX_CONCURRENT` to a smaller number (e.g., 5) - **Use smaller agents**: - For v1.5: Try 30B instead of 235B - For v1.0: Try 8B or 30B instead of 72B #### **Q: Tool execution errors** **A:** Common fixes: - **E2B errors**: Verify `E2B_API_KEY` is valid and the account has credits - **Serper errors**: Check `SERPER_API_KEY` and rate limits - **Jina errors**: Verify `JINA_API_KEY` and `JINA_BASE_URL` are correct - **LLM summarization errors**: Check `SUMMARY_LLM_*` variables and agent availability #### **Q: How to monitor long-running evaluations?** **A:** Use the progress monitoring scripts: ```bash cd apps/miroflow-agent python benchmarks/check_progress/check_progress_<benchmark_name>.py /path/to/logs ``` The scripts show completion status, elapsed time, and estimated remaining time.
### Getting Help - 📖 **Documentation**: Check [MiroFlow Tools README](libs/miroflow-tools/README.md) for tool details - 💬 **Discord**: Join our [Discord community](https://discord.com/invite/GPqEnkzQZd) - 🐛 **Issues**: Report bugs on [GitHub Issues](https://github.com/MiroMindAI/MiroThinker/issues) - 📧 **Contact**: Visit [our website](https://miromind.ai/) for more information ## 📄 License This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details. ## 🙏 Acknowledgments We extend our sincere gratitude to: - 🏆 **Benchmark Contributors** for the comprehensive evaluation datasets - 🌍 **Open Source Community** for the tools and libraries that make this possible - 👥 **All Contributors** who have helped make MiroThinker better
Join our community and help us build the future of AI agents! ### References If you find this project useful in your research, please consider citing: **MiroThinker** (Model & Method) ``` @article{miromind2026mirothinker, title={MiroThinker-1.7 & H1: Towards Heavy-Duty Research Agents via Verification}, author={MiroMind Team and Bai, S. and Bing, L. and Lei, L. and Li, R. and Li, X. and Lin, X. and Min, E. and Su, L. and Wang, B. and Wang, L. and Wang, L. and Wang, S. and Wang, X. and Zhang, Y. and Zhang, Z. and others}, journal={arXiv preprint arXiv:2603.15726}, year={2026} } ``` **MiroFlow** (Framework) ```bibtex @article{miromind2026miroflow, title={MiroFlow: Towards High-Performance and Robust Open-Source Agent Framework for General Deep Research Tasks}, author={Su, Shiqian and Xing, Sen and Dong, Xuan and Zhong, Muyan and Wang, Bin and Zhu, Xizhou and Chen, Yuntao and Wang, Wenhai and Deng, Yue and Zhu, Pengxiang and others}, journal={arXiv preprint arXiv:2602.22808}, year={2026} } ``` [![Star History Chart](https://api.star-history.com/svg?repos=MiroMindAI/MiroThinker&type=Date)](https://star-history.com/#MiroMindAI/MiroThinker&Date) ================================================ FILE: apps/collect-trace/README.md ================================================ # Collect Trace > TL;DR: Treat an RLVR-format dataset (Question + verifiable answer) as a benchmark. Run the evaluation pipeline; use LLM-as-a-Judge to verify correctness; then harvest the correct interaction traces as training data (for SFT / DPO). ## 📝 Overview Collect Trace is a key component in the MiroThinker training pipeline. Instead of hand-curating training samples, it reuses RLVR datasets as test sets, and collects multi-turn interaction traces only from items judged correct. Workflow: 1. Load each RLVR item’s question and verifiable answer. 1. Run the agent in the evaluation pipeline (with tool use / browsing as needed). 1. 
Verify the model's answer with an LLM-as-a-Judge against the RLVR reference answer. 1. Only for items judged correct, collect the full multi-turn trace and convert it into SFT / DPO-ready samples. ## 🚀 Quick Start ### Prerequisites - Python 3.12+ - [uv](https://github.com/astral-sh/uv) package manager - OpenAI API key (for LLM-based validation) - RLVR dataset (JSONL; contains a question and a verifiable answer) ### Installation 1. **Navigate to the collect-trace directory**: ```bash cd apps/collect-trace ``` 1. **Install dependencies**: ```bash uv sync ``` 1. **Set up environment variables**: ```bash # Create .env if missing (safe; won't overwrite existing file) [ -f ../miroflow-agent/.env ] || cp ../miroflow-agent/.env.example ../miroflow-agent/.env # (Alternative on macOS/Linux) cp -n ../miroflow-agent/.env.example ../miroflow-agent/.env || true # Edit .env and fill in your keys # Required: OPENAI_API_KEY (for LLM-as-a-Judge) # Optional: other keys for specific tools ``` ### Basic Usage Run a benchmark evaluation to collect traces: ```bash # Using Claude-3.7 for trace collection bash scripts/collect_trace_claude37.sh # Using GPT-5 for trace collection bash scripts/collect_trace_gpt5.sh # Using Qwen-3 for trace collection bash scripts/collect_trace_qwen3.sh ``` ================================================ FILE: apps/collect-trace/pyproject.toml ================================================ [project] name = "collect-trace" version = "0.1.0" description = "Executes a user-defined agent loop for capturing multi-turn interaction traces" readme = "README.md" requires-python = ">=3.12" authors = [{ name = "MiroMind Team", email = "service@miromind.ai" }] dependencies = [ "miroflow-tools>=0.1.0", "dotenv>=0.9.9", "openai>=1.90.0", ] [tool.uv.sources] miroflow-tools = { path = "../../libs/miroflow-tools", editable = true } ================================================ FILE: apps/collect-trace/scripts/collect_trace_claude37.sh
================================================ # Check if ANTHROPIC_API_KEY is set if [ -z "$ANTHROPIC_API_KEY" ]; then echo "Error: ANTHROPIC_API_KEY is not set." exit 1 else echo "ANTHROPIC_API_KEY detected." fi # Get the directory where the current script is located SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" echo "Current script directory: $SCRIPT_DIR" # Enter the apps/miroflow-agent directory TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent" echo "Target directory: $TARGET_DIR" cd $TARGET_DIR mkdir -p ../../logs LOG_DIR="../../logs/collect_trace_claude37" echo "Log directory: $LOG_DIR" mkdir -p $LOG_DIR # Collect traces uv run python benchmarks/common_benchmark.py \ benchmark=collect_trace \ benchmark.data.data_dir="../../data/debug" \ benchmark.data.metadata_file="standardized_data.jsonl" \ llm=claude-3-7 \ llm.provider=anthropic \ llm.model_name=claude-3-7-sonnet-20250219 \ llm.api_key="$ANTHROPIC_API_KEY" \ llm.base_url=https://api.anthropic.com \ llm.async_client=true \ benchmark.execution.max_tasks=null \ benchmark.execution.max_concurrent=10 \ benchmark.execution.pass_at_k=1 \ agent=single_agent \ hydra.run.dir=$LOG_DIR \ 2>&1 | tee "$LOG_DIR/output.log" # Enter the apps/collect-trace directory TARGET_DIR="$SCRIPT_DIR/../" echo "Target directory: $TARGET_DIR" cd $TARGET_DIR # Process traces uv run python $TARGET_DIR/utils/process_logs.py $LOG_DIR/benchmark_results.jsonl ================================================ FILE: apps/collect-trace/scripts/collect_trace_gpt41.sh ================================================ # Check if OPENAI_API_KEY is set if [ -z "$OPENAI_API_KEY" ]; then echo "Error: OPENAI_API_KEY is not set." exit 1 else echo "OPENAI_API_KEY detected." 
fi # Get the directory where the current script is located SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" echo "Current script directory: $SCRIPT_DIR" # Enter the apps/miroflow-agent directory TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent" echo "Target directory: $TARGET_DIR" cd $TARGET_DIR mkdir -p ../../logs LOG_DIR="../../logs/collect_trace_gpt41" echo "Log directory: $LOG_DIR" mkdir -p $LOG_DIR # Collect traces uv run python benchmarks/common_benchmark.py \ benchmark=collect_trace \ benchmark.data.data_dir="../../data/debug" \ benchmark.data.metadata_file="standardized_data.jsonl" \ llm=gpt-5 \ llm.provider=openai \ llm.model_name=gpt-4.1-mini \ llm.api_key="$OPENAI_API_KEY" \ llm.base_url=https://api.openai.com/v1 \ llm.async_client=true \ benchmark.execution.max_tasks=null \ benchmark.execution.max_concurrent=10 \ benchmark.execution.pass_at_k=1 \ agent=single_agent \ hydra.run.dir=$LOG_DIR \ 2>&1 | tee "$LOG_DIR/output.log" # Enter the apps/collect-trace directory TARGET_DIR="$SCRIPT_DIR/../" echo "Target directory: $TARGET_DIR" cd $TARGET_DIR # Process traces uv run python $TARGET_DIR/utils/process_logs.py $LOG_DIR/benchmark_results.jsonl ================================================ FILE: apps/collect-trace/scripts/collect_trace_gpt5.sh ================================================ # Check if OPENAI_API_KEY is set if [ -z "$OPENAI_API_KEY" ]; then echo "Error: OPENAI_API_KEY is not set." exit 1 else echo "OPENAI_API_KEY detected." 
fi # Get the directory where the current script is located SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" echo "Current script directory: $SCRIPT_DIR" # Enter the apps/miroflow-agent directory TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent" echo "Target directory: $TARGET_DIR" cd $TARGET_DIR mkdir -p ../../logs LOG_DIR="../../logs/collect_trace_gpt5" echo "Log directory: $LOG_DIR" mkdir -p $LOG_DIR # Collect traces uv run python benchmarks/common_benchmark.py \ benchmark=collect_trace \ benchmark.data.data_dir="../../data/debug" \ benchmark.data.metadata_file="standardized_data.jsonl" \ llm=gpt-5 \ llm.provider=openai \ llm.model_name=gpt-5-2025-08-07 \ llm.api_key="$OPENAI_API_KEY" \ llm.base_url=https://api.openai.com/v1 \ llm.async_client=true \ benchmark.execution.max_tasks=null \ benchmark.execution.max_concurrent=10 \ benchmark.execution.pass_at_k=1 \ agent=single_agent \ hydra.run.dir=$LOG_DIR \ 2>&1 | tee "$LOG_DIR/output.log" # Enter the apps/collect-trace directory TARGET_DIR="$SCRIPT_DIR/../" echo "Target directory: $TARGET_DIR" cd $TARGET_DIR # Process traces uv run python $TARGET_DIR/utils/process_logs.py $LOG_DIR/benchmark_results.jsonl ================================================ FILE: apps/collect-trace/scripts/collect_trace_qwen3.sh ================================================ # Get the directory where the current script is located SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" echo "Current script directory: $SCRIPT_DIR" # Enter the apps/miroflow-agent directory TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent" echo "Target directory: $TARGET_DIR" cd $TARGET_DIR mkdir -p ../../logs LOG_DIR="../../logs/collect_trace_qwen3" echo "Log directory: $LOG_DIR" mkdir -p $LOG_DIR # Collect traces uv run python benchmarks/common_benchmark.py \ benchmark=collect_trace \ benchmark.data.data_dir="../../data/debug" \ benchmark.data.metadata_file="standardized_data.jsonl" \ llm=qwen-3 \ llm.provider=qwen \ llm.model_name=qwen-3-32b \ 
llm.api_key="" \ llm.base_url=https://your-api.com/v1 \ llm.async_client=true \ llm.temperature=1.0 \ llm.max_context_length=131072 \ benchmark.execution.max_tasks=null \ benchmark.execution.max_concurrent=10 \ benchmark.execution.pass_at_k=1 \ agent=single_agent \ hydra.run.dir=$LOG_DIR \ 2>&1 | tee "$LOG_DIR/output.log" # Enter the apps/collect-trace directory TARGET_DIR="$SCRIPT_DIR/../" echo "Target directory: $TARGET_DIR" cd $TARGET_DIR # Process traces uv run python $TARGET_DIR/utils/process_logs.py $LOG_DIR/benchmark_results.jsonl ================================================ FILE: apps/collect-trace/utils/converters/__init__.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. from .convert_non_oai_to_chatml import ( convert_to_json_chatml, extract_and_save_chat_history, ) from .convert_oai_to_chatml import ( extract_message_history_from_log, oai_tool_message_to_chat_message, process_log_file, save_chatml_to_files, ) from .convert_to_chatml_auto_batch import ( batch_process_files, determine_conversion_method, get_llm_provider, process_single_file, ) __all__ = [ # OAI conversion functions "oai_tool_message_to_chat_message", "extract_message_history_from_log", "save_chatml_to_files", "process_log_file", # Non-OAI conversion functions "convert_to_json_chatml", "extract_and_save_chat_history", # Auto batch conversion functions "get_llm_provider", "determine_conversion_method", "process_single_file", "batch_process_files", ] ================================================ FILE: apps/collect-trace/utils/converters/convert_non_oai_to_chatml.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. 
import json
import sys
from pathlib import Path
from typing import Any, Dict, List


def _content_to_text(content: Any) -> str:
    """Flatten one message ``content`` value into a plain string.

    ``None`` becomes ``""``, strings pass through, a list keeps only its
    ``{"type": "text"}`` parts (joined with a single space) and any other
    value is rendered with ``str()``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        text_parts = []
        for item in content:
            if isinstance(item, dict) and item.get("type") == "text":
                text = item.get("text")
                # A part may carry ``"text": None`` (or a non-string value);
                # ``str.join`` would raise on it, so normalise first.
                text_parts.append("" if text is None else str(text))
        return " ".join(text_parts)
    return str(content)


def convert_to_json_chatml(messages: List[Dict[str, Any]]) -> List[Dict[str, str]]:
    """
    Convert message list to OpenAI JSON format ChatML

    Messages with role 'tool' or 'system' are skipped; the content of every
    other message is flattened to a string (None becomes an empty string).

    Args:
        messages: Raw message dictionaries with "role" and "content" keys

    Returns:
        List of {"role": ..., "content": ...} dictionaries with string content
    """
    chatml_list = []
    for message in messages:
        role = message.get("role", "")
        if role in ("tool", "system"):
            continue
        chatml_list.append(
            {"role": role, "content": _content_to_text(message.get("content", ""))}
        )
    return chatml_list


def _save_history(history: Any, output_file: Path) -> bool:
    """Write one agent history as ChatML JSON; return True if a file was written.

    Nothing is written when *history* is not a dict or has no non-empty
    "message_history". The stored "system_prompt" is re-inserted as the first
    message because ``convert_to_json_chatml`` drops system messages.
    """
    if not isinstance(history, dict):
        return False
    messages = history.get("message_history")
    if not messages:
        return False

    chatml_list = [{"role": "system", "content": history.get("system_prompt") or ""}]
    chatml_list.extend(convert_to_json_chatml(messages))
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(chatml_list, f, ensure_ascii=False, indent=2)
    return True


def extract_and_save_chat_history(
    log_data: Dict[str, Any], output_dir: Path, input_filename: str
):
    """
    Extract message history from log data and save as ChatML format

    Args:
        log_data: Log data dictionary
        output_dir: Output directory
        input_filename: Input filename (without extension)
    """
    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1. Main agent history
    main_output_file = output_dir / f"{input_filename}_main_agent_chatml.json"
    if _save_history(log_data.get("main_agent_message_history", {}), main_output_file):
        print(f"✓ Saved main agent chat records: {main_output_file}")

    # 2. One file per sub-agent session
    sub_agent_sessions = log_data.get("sub_agent_message_history_sessions", {})
    if isinstance(sub_agent_sessions, dict):
        for session_name, session_data in sub_agent_sessions.items():
            sub_agent_output_file = (
                output_dir / f"{input_filename}_{session_name}_chatml.json"
            )
            if _save_history(session_data, sub_agent_output_file):
                print(f"✓ Saved sub agent chat records: {sub_agent_output_file}")


def main():
    """Main function"""
    if len(sys.argv) < 2:
        print(
            "Usage: python convert_non_oai_to_chatml.py <log_file_path> [output_dir]"
        )
        print(
            "Example: python convert_non_oai_to_chatml.py logs/debug_logs/task_1.json"
        )
        print(
            "Example: python convert_non_oai_to_chatml.py logs/debug_logs/task_1.json ./extracted_chats"
        )
        sys.exit(1)

    log_file_path = Path(sys.argv[1])
    output_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else Path("extracted_chats")

    # Errors go to stderr so a parent process that captures the streams
    # separately can tell diagnostics from progress output.
    if not log_file_path.exists():
        print(f"Error: Log file does not exist: {log_file_path}", file=sys.stderr)
        sys.exit(1)

    try:
        print(f"Reading log file: {log_file_path}")
        with open(log_file_path, "r", encoding="utf-8") as f:
            log_data = json.load(f)

        # Input filename without extension is used as the output prefix
        input_filename = log_file_path.stem

        print(f"Extracting chat history to: {output_dir}")
        extract_and_save_chat_history(log_data, output_dir, input_filename)

        print("\n✓ Chat history extraction completed!")
        print(f"Output directory: {output_dir.absolute()}")
    except json.JSONDecodeError as e:
        print(f"Error: Cannot parse JSON file: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level CLI boundary: report and exit non-zero.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
# ================================================
# FILE: apps/collect-trace/utils/converters/convert_oai_to_chatml.py
# ================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import ast
import json
import os
import sys
from copy import deepcopy
from datetime import datetime
from pathlib import Path
from typing import Any, Dict

try:
    # Imported as part of the ``converters`` package (see its ``__init__``).
    from .system_prompts import (
        main_system_prompt_foreword,
        sub_agent_system_prompt_foreword,
        system_prompt_tool_instrcutions,
    )
except ImportError:
    # Executed directly as a script: the script directory is on ``sys.path``
    # and there is no parent package for a relative import.
    from system_prompts import (
        main_system_prompt_foreword,
        sub_agent_system_prompt_foreword,
        system_prompt_tool_instrcutions,
    )

# Date string embedded in rebuilt system prompts. Defaults to today and is
# overwritten by process_log_file() with a timestamp taken from the log file.
creation_time_str = datetime.now().strftime("%Y-%m-%d")


def _tool_calls_to_mcp_str(oai_tool_call):
    """Render OpenAI-style tool calls as MCP tool-call text blocks.

    Accepts a list of tool-call dicts, a single dict, or a JSON string
    encoding either. Each call's function name is split on its last "-" into
    server name and tool name. Blocks are joined with a blank line.
    """
    if isinstance(oai_tool_call, str):
        oai_tool_call = json.loads(oai_tool_call)
    if isinstance(oai_tool_call, dict):
        oai_tool_call = [oai_tool_call]
    if not oai_tool_call:
        raise ValueError("tool_calls must contain at least one tool call")

    mcp_tool_call_templates = []
    for each_oai_tool_call in oai_tool_call:
        if not isinstance(each_oai_tool_call, dict):
            raise TypeError(
                f"oai_tool_call should be a dict, but got {type(each_oai_tool_call)}"
            )
        function = each_oai_tool_call["function"]
        server_name, separator, tool_name = function["name"].rpartition("-")
        if not separator:
            raise ValueError(
                f"Tool name is not in '<server>-<tool>' form: {function['name']}"
            )
        raw_arguments = function["arguments"]
        if isinstance(raw_arguments, str):
            # A call without parameters may carry an empty arguments string.
            arguments = json.loads(raw_arguments) if raw_arguments.strip() else {}
        else:
            arguments = raw_arguments
        # NOTE(review): this template contains only newlines around the
        # values; any XML-style wrapper tags appear to be missing - confirm
        # against the tool-use format described in system_prompts.py.
        mcp_tool_call_templates.append(
            f"\n{server_name}\n{tool_name}\n\n{json.dumps(arguments)}\n\n"
        )
    return "\n\n".join(mcp_tool_call_templates)


def _safe_get_text(content):
    """Safely extract text content, handling different content formats.

    For a list only the FIRST element is used (its "text" value, the element
    itself if it is a string, otherwise its ``str()``). ``None`` and an empty
    list yield an empty string.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        if not content:
            return ""
        first = content[0]
        if isinstance(first, dict) and "text" in first:
            return "" if first["text"] is None else first["text"]
        if isinstance(first, str):
            return first
        return str(first)
    return str(content)


def _generate_mcp_servers_str(tool_definition):
    """Render server/tool definitions as the markdown-style listing used in prompts."""
    if not tool_definition:
        return ""
    lines = []
    for server in tool_definition:
        lines.append(f"## Server name: {server['name']}\n")
        for tool in server.get("tools") or []:
            # Skip tools that failed to load (they only have an 'error' key)
            if "error" in tool and "name" not in tool:
                continue
            lines.append(f"### Tool name: {tool['name']}\n")
            lines.append(f"Description: {tool['description']}\n")
            lines.append(f"Input JSON schema: {tool['schema']}\n")
    return "".join(lines)


def _build_system_prompt(original_prompt, agent_type, tool_definition):
    """Assemble foreword + date + tool instructions + tool list + objective text."""
    if agent_type == "main":
        foreword = main_system_prompt_foreword
    elif agent_type == "sub_agent":
        foreword = sub_agent_system_prompt_foreword
    else:
        raise ValueError(f"Unknown agent type: {agent_type}")

    marker_pos = original_prompt.find("# General Objective")
    # find() returns -1 when the marker is absent; slicing from -1 would keep
    # only the final character, so fall back to the whole original prompt.
    if marker_pos == -1:
        objective_section = original_prompt
    else:
        objective_section = original_prompt[marker_pos:]

    return (
        foreword
        + f" Today is: {creation_time_str}\n"
        + system_prompt_tool_instrcutions
        + _generate_mcp_servers_str(tool_definition)
        + objective_section
    )


def oai_tool_message_to_chat_message(oai_messages, agent_type, tool_definition):
    """
    Convert an OpenAI-style message history into plain ChatML messages

    A leading system/developer message is rebuilt into a tool-use system
    prompt. Consecutive user/tool messages are merged into one user message.
    Assistant tool calls are appended to the assistant text as MCP blocks.

    Args:
        oai_messages: List of OpenAI-style message dictionaries
        agent_type: "main" or "sub_agent" (selects the prompt foreword)
        tool_definition: Server/tool definitions listed in the system prompt

    Returns:
        List of {"role", "content"} dictionaries

    Raises:
        ValueError: On any malformed input, including a trace that ends with
            an unanswered user/tool round
    """
    oai_messages = deepcopy(oai_messages)
    chat_messages = []
    pending_user_tool_contents = []

    def flush_pending():
        # Merge buffered user/tool texts into a single user message.
        if pending_user_tool_contents:
            chat_messages.append(
                {
                    "role": "user",
                    "content": "\n\n".join(pending_user_tool_contents),
                }
            )
            pending_user_tool_contents.clear()

    try:
        for idx, msg in enumerate(oai_messages):
            role = msg["role"]
            if role in ("developer", "system"):
                if idx != 0:
                    raise ValueError("System messages should be the first message")
                chat_messages.append(
                    {
                        "role": "system",
                        "content": _build_system_prompt(
                            _safe_get_text(msg.get("content")),
                            agent_type,
                            tool_definition,
                        ),
                    }
                )
            elif role in ("user", "tool"):
                pending_user_tool_contents.append(_safe_get_text(msg.get("content")))
            elif role == "assistant":
                flush_pending()
                content = _safe_get_text(msg.get("content"))
                # ``tool_calls`` may be present but null/empty; treat that as
                # a plain text reply.
                tool_calls = msg.get("tool_calls")
                if tool_calls:
                    if content != "":
                        content += "\n\n"  # Separate thinking text from the call
                    content += _tool_calls_to_mcp_str(tool_calls)
                chat_messages.append({"role": "assistant", "content": content})
            else:
                raise ValueError(f"Unknown role: {role}")

        if pending_user_tool_contents:
            raise ValueError(
                "Error: Trace ends with user/tool round. Pending user/tool contents should be empty."
            )
    except Exception as e:
        raise ValueError(f"Error processing messages: {e}") from e

    return chat_messages


def extract_message_history_from_log(
    log_data: Dict[str, Any],
):
    """
    Extract message history from log data and convert to OpenAI ChatML format

    Args:
        log_data: Log data dictionary

    Returns:
        Dictionary containing main_agent and sub_agents message history
    """
    result = {"main_agent": [], "sub_agents": {}}

    # Main agent
    main_agent_history = log_data.get("main_agent_message_history") or {}
    main_messages = main_agent_history.get("message_history")
    if main_messages:
        tool_main_agent_definition = extract_step_message(
            log_data, "get_main_tool_definitions"
        )
        result["main_agent"] = oai_tool_message_to_chat_message(
            main_messages,
            "main",
            tool_main_agent_definition,
        )

    # Sub-agent sessions
    sub_agent_sessions = log_data.get("sub_agent_message_history_sessions") or {}
    for session_name, session_data in sub_agent_sessions.items():
        if not isinstance(session_data, dict):
            continue
        sub_agent_messages = session_data.get("message_history")
        if not sub_agent_messages:
            continue
        # The text before the first "_" of the session name selects the
        # matching tool-definition step.
        sub_agent_type = session_name.split("_")[0]
        tool_sub_agent_definition = extract_step_message(
            log_data, f"get_sub_{sub_agent_type}_tool_definitions"
        )
        result["sub_agents"][session_name] = oai_tool_message_to_chat_message(
            sub_agent_messages, "sub_agent", tool_sub_agent_definition
        )

    return result


def save_chatml_to_files(
    chatml_data: Dict[str, Any],
    output_dir: Path,
    input_filename: str,
):
    """
    Save ChatML format messages to files

    Args:
        chatml_data: Dictionary containing message history
        output_dir: Output directory
        input_filename: Input filename (without extension)
    """
    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save main agent messages
    if chatml_data["main_agent"]:
        main_output_file = output_dir / f"{input_filename}_main_agent_chatml.json"
        with open(main_output_file, "w", encoding="utf-8") as f:
            json.dump(chatml_data["main_agent"], f, ensure_ascii=False, indent=2)
        print(f"✓ Saved main agent ChatML: {main_output_file}")

    # Save sub agent messages, one file per session
    for session_name, messages in chatml_data["sub_agents"].items():
        sub_agent_output_file = (
            output_dir / f"{input_filename}_{session_name}_chatml.json"
        )
        with open(sub_agent_output_file, "w", encoding="utf-8") as f:
            json.dump(messages, f, ensure_ascii=False, indent=2)
        print(f"✓ Saved sub agent {session_name} ChatML: {sub_agent_output_file}")


def extract_step_message(data, target_step_name):
    """Return the parsed "message" of the first step log named *target_step_name*.

    The message is parsed with ``ast.literal_eval``. Returns None when
    "step_logs" is missing, no step matches, or the message cannot be parsed.
    """
    try:
        if "step_logs" not in data:
            print("step_logs field not found in log file")
            return None

        for step in data["step_logs"]:
            if step.get("step_name") == target_step_name:
                return ast.literal_eval(step.get("message"))

        print(f"No record found with step_name '{target_step_name}'")
        return None
    except Exception as e:
        # Best effort: a missing/unparsable definition must not abort conversion.
        print(f"Error processing file: {e}")
        return None


def process_log_file(log_file_path: str, output_dir: str = "extracted_chatml"):
    """
    Process a single log file, extract message history and convert to ChatML format

    Args:
        log_file_path: Log file path
        output_dir: Output directory

    Returns:
        True if the file was converted and saved, False otherwise
    """
    log_path = Path(log_file_path)
    output_path = Path(output_dir)

    if not log_path.exists():
        print(f"Error: Log file does not exist: {log_file_path}", file=sys.stderr)
        return False

    # The date written into rebuilt system prompts comes from the log file.
    # NOTE(review): st_ctime is the metadata-change time on Unix, not the
    # creation time - confirm this is the intended timestamp.
    global creation_time_str
    try:
        stat_info = os.stat(log_path)
        creation_time = datetime.fromtimestamp(stat_info.st_ctime)
        creation_time_str = creation_time.strftime("%Y-%m-%d")
        print(f"File creation time: {creation_time_str}")
    except Exception as e:
        print(f"Warning: Could not get file creation time: {e}")

    try:
        print(f"Reading log file: {log_path}")
        with open(log_path, "r", encoding="utf-8") as f:
            log_data = json.load(f)

        # Input filename without extension is used as the output prefix
        input_filename = log_path.stem

        print("Extracting message history...")
        chatml_data = extract_message_history_from_log(log_data)

        print(f"Saving ChatML files to: {output_path}")
        save_chatml_to_files(chatml_data, output_path, input_filename)

        print("\n✓ Processing completed!")
        print(f"Output directory: {output_path.absolute()}")
        return True
    except json.JSONDecodeError as e:
        print(f"Error: Cannot parse JSON file: {e}", file=sys.stderr)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
    return False


def main():
    """Main function"""
    if len(sys.argv) < 2:
        print("Usage: python convert_oai_to_chatml.py <log_file_path> [output_dir]")
        print("Example: python convert_oai_to_chatml.py logs/debug_logs/task_1.json")
        print(
            "Example: python convert_oai_to_chatml.py logs/debug_logs/task_1.json ./extracted_chatml"
        )
        sys.exit(1)

    log_file_path = sys.argv[1]
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "extracted_chatml"

    # Propagate failure through the exit status so callers that launch this
    # script as a subprocess can detect it.
    if not process_log_file(log_file_path, output_dir):
        sys.exit(1)


if __name__ == "__main__":
    main()

# ================================================
# FILE:
# apps/collect-trace/utils/converters/convert_to_chatml_auto_batch.py
# ================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.

import json
import subprocess
import sys
from pathlib import Path
from typing import Dict, List

# Providers whose logs are converted with convert_oai_to_chatml.py.
_OAI_PROVIDERS = frozenset({"openai", "claude_newapi", "deepseek_newapi"})

_DEFAULT_OUTPUT_DIR = "extracted_chatml"


def get_llm_provider(json_file_path: str) -> str:
    """
    Extract llm_provider from JSON file

    Args:
        json_file_path: Path to JSON file

    Returns:
        llm_provider value, 'unknown' if the field is absent or empty, or
        'error' if the file cannot be read or parsed
    """
    try:
        with open(json_file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        # llm_provider lives under env_info
        provider = data.get("env_info", {}).get("llm_provider")
    except (OSError, ValueError, AttributeError) as e:
        # ValueError covers JSON/encoding errors; AttributeError covers a
        # top-level or env_info value that is not a JSON object.
        print(f"Error reading JSON file {json_file_path}: {e}")
        return "error"
    return provider if provider else "unknown"


def determine_conversion_method(provider: str) -> str:
    """
    Determine conversion method based on provider

    Args:
        provider: LLM provider name (case-insensitive)

    Returns:
        'oai' for openai, claude_newapi and deepseek_newapi; 'non-oai' for others
    """
    return "oai" if str(provider).lower() in _OAI_PROVIDERS else "non-oai"


def get_script_paths() -> tuple:
    """
    Get paths to conversion scripts

    Returns:
        Tuple of (oai_script_path, non_oai_script_path)

    Raises:
        FileNotFoundError: If either script is missing next to this file
    """
    current_dir = Path(__file__).parent
    oai_script = current_dir / "convert_oai_to_chatml.py"
    non_oai_script = current_dir / "convert_non_oai_to_chatml.py"

    if not oai_script.exists():
        raise FileNotFoundError(f"OAI conversion script not found: {oai_script}")
    if not non_oai_script.exists():
        raise FileNotFoundError(
            f"Non-OAI conversion script not found: {non_oai_script}"
        )

    return str(oai_script), str(non_oai_script)


def process_single_file(json_file_path: str, output_dir: str) -> bool:
    """
    Process a single JSON file

    Args:
        json_file_path: Path to JSON file
        output_dir: Output directory

    Returns:
        True if successful, False otherwise
    """
    try:
        provider = get_llm_provider(json_file_path)
        if provider == "error":
            print(f"❌ Failed to read provider from: {json_file_path}")
            return False

        oai_script, non_oai_script = get_script_paths()
        if determine_conversion_method(provider) == "oai":
            script_path = oai_script
            print(f"🔧 Using OAI conversion for provider: {provider}")
        else:
            script_path = non_oai_script
            print(f"🔧 Using Non-OAI conversion for provider: {provider}")

        # Run the converter with the same interpreter as this process
        result = subprocess.run(
            [sys.executable, script_path, json_file_path, output_dir],
            capture_output=True,
            text=True,
        )

        if result.returncode == 0:
            print(f"✅ Successfully processed: {json_file_path}")
            return True

        # A converter may report its error on stdout rather than stderr;
        # show whichever stream has content.
        details = (result.stderr or result.stdout).strip()
        print(f"❌ Failed to process {json_file_path}: {details}")
        return False
    except Exception as e:
        # Per-file boundary: one bad file must not abort the whole batch.
        print(f"❌ Error processing {json_file_path}: {e}")
        return False


def _glob_pattern(pattern: str) -> List[Path]:
    """Expand a glob pattern, supporting absolute patterns as well as relative ones.

    ``Path.glob`` rejects non-relative patterns, so an absolute pattern is
    split into its anchor and the remainder relative to that anchor.
    """
    pattern_path = Path(pattern)
    if pattern_path.is_absolute():
        anchor = Path(pattern_path.anchor)
        return sorted(anchor.glob(str(pattern_path.relative_to(anchor))))
    return sorted(Path(".").glob(pattern))


def find_json_files(input_paths: List[str]) -> List[str]:
    """
    Find JSON files from input paths

    Args:
        input_paths: List of file paths, directories, or patterns

    Returns:
        List of JSON file paths, without duplicates, in input order
        (directory and pattern matches are sorted)
    """
    json_files: List[str] = []
    seen = set()

    def add(candidate: Path) -> None:
        text = str(candidate)
        if text not in seen:
            seen.add(text)
            json_files.append(text)

    for path in input_paths:
        path_obj = Path(path)

        if path_obj.is_file():
            # Single file
            if path_obj.suffix.lower() == ".json":
                add(path_obj)
        elif path_obj.is_dir():
            # Directory - all JSON files directly inside it
            for json_file in sorted(path_obj.glob("*.json")):
                add(json_file)
        else:
            # Pattern matching
            try:
                matches = _glob_pattern(path)
            except (ValueError, NotImplementedError, OSError):
                print(f"Warning: Could not process pattern: {path}")
                continue
            for json_file in matches:
                if json_file.suffix.lower() == ".json":
                    add(json_file)

    return json_files


def batch_process_files(input_paths: List[str], output_dir: str) -> Dict[str, int]:
    """
    Batch process multiple files

    Args:
        input_paths: List of input paths
        output_dir: Output directory

    Returns:
        Dictionary with processing statistics ("total", "success", "failed")
    """
    json_files = find_json_files(input_paths)

    if not json_files:
        print("❌ No JSON files found in the specified paths")
        return {"total": 0, "success": 0, "failed": 0}

    print(f"📁 Found {len(json_files)} JSON files to process")

    # Create output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    success_count = sum(
        1 for json_file in json_files if process_single_file(json_file, output_dir)
    )
    return {
        "total": len(json_files),
        "success": success_count,
        "failed": len(json_files) - success_count,
    }


def show_help():
    """Show help information"""
    help_text = """
Auto ChatML Conversion Script
============================

Automatically determines conversion method based on llm_provider field in JSON files

Usage:
    python convert_to_chatml_auto_batch.py <json_file> [output_dir]
    python convert_to_chatml_auto_batch.py <directory> [output_dir]
    python convert_to_chatml_auto_batch.py <pattern> [output_dir]

Parameters:
    input_paths: JSON files, directories, or patterns
    output_dir: Output directory (optional, default: extracted_chatml)

Examples:
    python convert_to_chatml_auto_batch.py logs/debug_logs/
    python convert_to_chatml_auto_batch.py logs/debug_logs/*.json
    python convert_to_chatml_auto_batch.py logs/debug_logs/ ./my_output
    python convert_to_chatml_auto_batch.py task_1.json task_2.json

Conversion Logic:
    - If llm_provider = 'openai', 'claude_newapi' or 'deepseek_newapi': Use convert_oai_to_chatml.py
    - If llm_provider = anything else: Use convert_non_oai_to_chatml.py

Features:
    1. Auto-detect conversion method per file
    2. Batch process log files
    3. Extract main_agent_message_history
    4. Extract sub_agent_message_history_sessions
    5. Convert to OpenAI ChatML format
    6. Save as separate files
    7. Generate processing summary
"""
    print(help_text)


def _split_cli_args(args: List[str]) -> tuple:
    """Split CLI arguments into (input_paths, output_dir).

    With more than one argument, the last one is taken as the output
    directory when it looks like a directory: it ends with "/", has no file
    suffix, equals the default name, or starts with "./". An argument with a
    ".json" suffix or glob wildcards is always kept as an input.
    """
    if len(args) > 1 and not args[-1].startswith("-"):
        last_arg = args[-1]
        is_input_like = (
            Path(last_arg).suffix.lower() == ".json" and not last_arg.endswith("/")
        ) or any(char in last_arg for char in "*?[")
        looks_like_dir = (
            last_arg.endswith("/")
            or not Path(last_arg).suffix
            or last_arg == _DEFAULT_OUTPUT_DIR
            or last_arg.startswith("./")
        )
        if looks_like_dir and not is_input_like:
            return list(args[:-1]), last_arg
    return list(args), _DEFAULT_OUTPUT_DIR


def main():
    """Main function"""
    # Check for help
    if len(sys.argv) < 2 or sys.argv[1] in ["-h", "--help"]:
        show_help()
        return

    input_paths, output_dir = _split_cli_args(sys.argv[1:])

    print("🚀 Starting auto ChatML conversion")
    print(f"📂 Input paths: {input_paths}")
    print(f"📁 Output directory: {output_dir}")

    try:
        # Fail early if the conversion scripts are missing
        get_script_paths()

        stats = batch_process_files(input_paths, output_dir)

        print("\n" + "=" * 50)
        print("📊 Processing Summary")
        print("=" * 50)
        print(f"Total files: {stats['total']}")
        print(f"Successfully processed: {stats['success']}")
        print(f"Failed: {stats['failed']}")
        print(f"Output directory: {Path(output_dir).absolute()}")

        if stats["failed"] > 0:
            print(f"\n⚠️ {stats['failed']} files failed to process")
            sys.exit(1)
        else:
            print("\n✅ All files processed successfully!")

    except FileNotFoundError as e:
        print(f"❌ {e}")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()

# ================================================
# FILE: apps/collect-trace/utils/converters/example_usage.py
# ================================================
# Copyright (c) 2025 MiroMind
# This source code is licensed under the Apache 2.0 License.
def example_1_basic_conversion():
    """Example 1: Basic conversion using Python API"""
    print("=== Example 1: Basic Conversion ===")

    def text_message(role, text):
        # One chat turn in the log format: a role plus a single text part.
        return {"role": role, "content": [{"type": "text", "text": text}]}

    main_prompt = "You are a helpful assistant."
    browser_prompt = "You are a browsing agent."

    # Sample log data: one main agent history and one browser agent session
    log_data = {
        "main_agent_message_history": {
            "system_prompt": main_prompt,
            "message_history": [
                text_message("developer", main_prompt),
                text_message("user", "Hello, how are you?"),
                text_message("assistant", "I'm doing well, thank you!"),
            ],
        },
        "browser_agent_message_history_sessions": {
            "browser_agent_1": {
                "system_prompt": browser_prompt,
                "message_history": [
                    text_message("developer", browser_prompt),
                    text_message("user", "Search for something"),
                    text_message("assistant", "I found it."),
                ],
            }
        },
        "env_info": {"llm_provider": "openai"},
    }

    # Convert using OAI method
    chatml_data = extract_message_history_from_log(log_data)
    main_count = len(chatml_data["main_agent"])
    browser_count = len(chatml_data["browser_agents"]["browser_agent_1"])
    print(f"OAI conversion result: {main_count} messages in main agent")
    print(f"OAI conversion result: {browser_count} messages in browser agent")

    # Convert using Non-OAI method, then report on whichever files it wrote
    with tempfile.TemporaryDirectory() as scratch:
        out_dir = Path(scratch)
        extract_and_save_chat_history(log_data, out_dir, "example")

        generated = (
            (out_dir / "example_main_agent_chatml.json", "main agent"),
            (out_dir / "example_browser_agent_1_chatml.json", "browser agent"),
        )
        for chatml_file, label in generated:
            if not chatml_file.exists():
                continue
            with open(chatml_file, "r") as handle:
                messages = json.load(handle)
            print(f"Non-OAI conversion result: {len(messages)} messages in {label}")
You can use the server's tools via the `use_mcp_tool`.\n\nDescription: \nRequest to use a tool provided by a MCP server. Each MCP server can provide multiple tools with different capabilities. Tools have defined input schemas that specify required and optional parameters.\n\nParameters:\n- server_name: (required) The name of the MCP server providing the tool\n- tool_name: (required) The name of the tool to execute\n- arguments: (required) A JSON object containing the tool's input parameters, following the tool's input schema, quotes within string must be properly escaped, ensure it's valid JSON\n\nUsage:\n\nserver name here\ntool name here\n\n{\n\"param1\": \"value1\",\n\"param2\": \"value2 \\\"escaped string\\\"\"\n}\n\n\n\nImportant Notes:\n- Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags.\n- Always adhere to this format for the tool use to ensure proper parsing and execution.\n\nString and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. The output is not expected to be valid XML and is parsed with regular expressions.\nHere are the functions available in JSONSchema format:\n\n""" ================================================ FILE: apps/collect-trace/utils/merge_chatml_msgs_to_one_json.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. 
def merge_json_files(input_dir, type="main"):
    """Merge every ``*{type}*.json`` file in ``input_dir`` into one JSON file.

    Each input file's parsed content is wrapped as ``{"messages": <content>}``
    and the resulting list is written to ``<input_dir>/<type>_merged.json``.
    Files that cannot be read or parsed are reported and skipped.

    Args:
        input_dir: Directory containing the per-conversation JSON files.
        type: Substring that selects the files to merge; also the prefix of
            the output file name.
    """
    output_file = os.path.join(input_dir, f"{type}_merged.json")
    output_abspath = os.path.abspath(output_file)

    # The output name itself matches the "*{type}*.json" pattern, so a merged
    # file left over from an earlier run must not be read back in as input.
    # Sorting keeps the merged order stable, since glob order is arbitrary.
    json_files = sorted(
        path
        for path in glob.glob(os.path.join(input_dir, f"*{type}*.json"))
        if os.path.abspath(path) != output_abspath
    )

    # List to store all conversations
    all_conversations = []

    # Read each JSON file and merge its content
    for json_file in json_files:
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and undecodable bytes.
            print(f"Error processing {json_file}: {str(e)}")
            continue
        all_conversations.append({"messages": data})
        print(f"Successfully processed: {json_file}")

    # Write the merged data to a new JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_conversations, f, ensure_ascii=False, indent=2)

    print(
        f"\nMerging complete! All {type} JSON files have been merged into {output_file}"
    )
    print(f"Total number of files processed: {len(json_files)}")
    print(f"Total number of messages: {len(all_conversations)}")
def get_successful_log_paths(jsonl_file_path: str) -> list:
    """
    Collects the paths of successful log files from a dataset.

    This function extracts log file paths of successful records based on the
    value of `final_judge_result`. If the dataset has been fully processed, it
    reads from a `benchmark_results.jsonl` file. Otherwise, if processing was
    interrupted, it falls back to scanning individual `.json` files in the
    given directory.

    Success is determined by:
    - `PASS_AT_K_SUCCESS` for records in JSONL files.
    - `CORRECT` for records in individual JSON files.

    Lines or files that are unreadable, are not valid JSON, or do not contain
    a JSON object are skipped.

    Args:
        jsonl_file_path (str): Path to a JSONL file or a directory of JSON files.

    Returns:
        list: A list of log file paths for successful records. For a JSONL
        input these are the `log_file_path` values stored in the records; for
        a directory they are the paths of the matching `.json` files, in
        sorted file-name order.
    """
    log_paths = []

    if jsonl_file_path.endswith(".jsonl"):
        with open(jsonl_file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # A valid JSON line may still be a list or scalar.
                if not isinstance(data, dict):
                    continue
                if data.get("final_judge_result") == "PASS_AT_K_SUCCESS":
                    log_path = data.get("log_file_path")
                    if log_path:
                        log_paths.append(log_path)
        return log_paths

    for filename in sorted(os.listdir(jsonl_file_path)):
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(jsonl_file_path, filename)
        try:
            # Context manager so the handle is closed for every scanned file.
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            continue

        if not isinstance(data, dict):
            continue
        if "final_judge_result" not in data:
            # Show which fields the record has to help diagnose the log.
            print(data.keys())
            continue

        if data["final_judge_result"] == "CORRECT":
            log_paths.append(filepath)

    return log_paths
exist_ok=True) print(f"Successful logs directory: {success_log_dir}") for i, path in enumerate(result, 1): basename = os.path.basename(path) print(f"Copying file: {path} to {success_log_dir}/{basename}") shutil.copy(path, f"{success_log_dir}/{basename}") os.system( f"uv run utils/converters/convert_to_chatml_auto_batch.py {success_log_dir}/*.json -o {success_chatml_log_dir}" ) os.system( f"uv run utils/merge_chatml_msgs_to_one_json.py --input_dir {success_chatml_log_dir}" ) ================================================ FILE: apps/gradio-demo/README.md ================================================ # Local Deep Research Demo with Gradio Web UI Host your own Deep Research demo using our [MiroThinker v1.5](https://huggingface.co/miromind-ai/MiroThinker-v1.5-30B) models and lightweight Gradio-based web interface. ## 🖥️ Hardware Requirements - **GPU**: NVIDIA RTX 40xx/50xx series or equivalent - **VRAM**: - **16GB minimum** (with Q4 quantization via llama.cpp) - **48GB+ recommended** (for FP8 quantization or longer context) - MiroThinker-v1.5-30B is a 30B MoE model with 3B active parameters ## ⚙️ LLM Server Deployment ### Download Model Checkpoints Download the full checkpoint from Hugging Face: ```python from huggingface_hub import snapshot_download snapshot_download(repo_id="miromind-ai/MiroThinker-v1.5-30B", local_dir="model/MiroThinker-v1.5-30B") ``` ### Option 1: SGLang Server (Recommended) FP8 is a highly efficient 8-bit floating point format that significantly reduces memory usage while maintaining model quality. This approach provides excellent performance for inference workloads on modern GPUs. Please install [SGLang](https://github.com/sgl-project/sglang) first. 
Then initialize fast inference with FP8 precision: ```bash MODEL_PATH=model/MiroThinker-v1.5-30B python3 -m sglang.launch_server \ --model-path $MODEL_PATH \ --mem-fraction-static 0.9 \ --quantization fp8 \ --tp 1 \ --dp 1 \ --host 0.0.0.0 \ --port 61005 \ --trust-remote-code ``` It will start an openai compatible server with BASE_URL=`http://0.0.0.0:61005/v1`. ### Option 2: llama.cpp (Quantized) For memory-efficient inference, download the pre-quantized GGUF version from the community: **Note**: Thanks to the community for providing quantized versions: [mradermacher](https://huggingface.co/mradermacher) ```bash # Download Q4_K_M quantized model (recommended balance) wget https://huggingface.co/mradermacher/MiroThinker-v1.5-30B-GGUF/resolve/main/MiroThinker-v1.5-30B.Q4_K_M.gguf ``` Follow the [official llama.cpp installation guide](https://github.com/ggml-org/llama.cpp) to set up the environment. After that: ```bash # Set up model path MODEL_PATH=model/MiroThinker-v1.5-30B.Q4_K_M.gguf # Start the server llama-server -m $MODEL_PATH \ --port 61005 \ -ngl 99 \ -v ``` This will start an OpenAI-compatible server at `http://0.0.0.0:61005/v1`. ### Other Options You can also leverage other frameworks for model serving like Ollama, vLLM, and Text Generation Inference (TGI) for different deployment scenarios. ## 🚀 Quick Start Guide ### 1. 
**Environment Setup** Get your API keys: - [Serper](https://serper.dev/): 2,500 free search credits for new accounts (required for web search) - [E2B](https://e2b.dev/): Free tier available (required for Python code execution) - [Jina](https://jina.ai/): Free tier available (required for web scraping) Edit the `apps/miroflow-agent/.env` file with your API keys: ```bash # Required - Web Search SERPER_API_KEY=your_serper_key # Required - Python Code Execution (E2B Cloud Sandbox) E2B_API_KEY=your_e2b_key # Required - Web Scraping JINA_API_KEY=your_jina_key # Required - Summary LLM (for webpage summarization) # Option 1: Use OpenAI GPT-5-Nano (recommended, cost-effective) SUMMARY_LLM_BASE_URL=https://api.openai.com/v1 SUMMARY_LLM_MODEL_NAME=gpt-5-nano SUMMARY_LLM_API_KEY=your_openai_key # Option 2: Use MiroThinker itself (if you have enough VRAM) # SUMMARY_LLM_BASE_URL=http://0.0.0.0:61005/v1 # SUMMARY_LLM_MODEL_NAME=MiroThinker # SUMMARY_LLM_API_KEY=none ``` ### 2. **Install Dependencies** We use [uv](https://github.com/astral-sh/uv) to manage all dependencies. ```bash cd apps/gradio-demo uv sync ``` ### 3. **Configure API Endpoint** Set your LLM API endpoint and API key: ```bash export BASE_URL=http://your-sglang-address:your-sglang-port/v1 export API_KEY=your_api_key # Optional, required if your endpoint needs authentication ``` ### 4. **Launch the Application** ```bash uv run main.py ``` ### 5. 
def load_miroflow_config(config_overrides: Optional[dict] = None) -> DictConfig:
    """
    Load the full MiroFlow configuration using Hydra, similar to how benchmarks work.

    The config directory is ``../miroflow-agent/conf`` relative to this file.
    Base overrides come from the environment variables DEFAULT_LLM_PROVIDER,
    DEFAULT_MODEL_NAME, DEFAULT_AGENT_SET, BASE_URL and API_KEY.
    ``config_overrides`` is appended afterwards.

    Args:
        config_overrides: Optional extra overrides. A nested dict value
            ``{key: {subkey: value}}`` becomes ``key.subkey=value``; any other
            value becomes ``key=value``.

    Returns:
        The composed Hydra configuration.

    Raises:
        FileNotFoundError: If the MiroFlow config directory does not exist.
        Exception: Whatever ``compose`` raises when the configuration cannot
            be composed; the error is logged and re-raised.
    """
    global _hydra_initialized

    # Get the path to the miroflow agent config directory
    miroflow_config_dir = Path(__file__).parent.parent / "miroflow-agent" / "conf"
    miroflow_config_dir = miroflow_config_dir.resolve()
    logger.debug(f"Config dir: {miroflow_config_dir}")

    if not miroflow_config_dir.exists():
        raise FileNotFoundError(
            f"MiroFlow config directory not found: {miroflow_config_dir}"
        )

    # Initialize Hydra if not already done
    if not _hydra_initialized:
        try:
            initialize_config_dir(
                config_dir=str(miroflow_config_dir), version_base=None
            )
            _hydra_initialized = True
        except Exception as e:
            logger.warning(f"Hydra already initialized or error: {e}")

    # Compose configuration with environment variable overrides
    overrides = []

    # Add environment variable based overrides (refer to scripts/debug.sh)
    llm_provider = os.getenv(
        "DEFAULT_LLM_PROVIDER", "qwen"
    )  # debug.sh defaults to qwen
    model_name = os.getenv(
        "DEFAULT_MODEL_NAME", "MiroThinker"
    )  # debug.sh default model
    agent_set = os.getenv("DEFAULT_AGENT_SET", "demo")  # Use demo config
    base_url = os.getenv("BASE_URL", "http://localhost:11434")
    api_key = os.getenv("API_KEY", "")  # API key for LLM endpoint
    logger.debug(f"LLM base_url: {base_url}")

    # Map provider names to config files
    # Available configs: default.yaml, claude-3-7.yaml, gpt-5.yaml, qwen-3.yaml
    provider_config_map = {
        "anthropic": "claude-3-7",
        "openai": "gpt-5",
        "qwen": "qwen-3",
    }

    llm_config = provider_config_map.get(
        llm_provider, "qwen-3"
    )  # fallback to qwen-3 config
    overrides.extend(
        [
            f"llm={llm_config}",
            f"llm.provider={llm_provider}",
            f"llm.model_name={model_name}",
            f"llm.base_url={base_url}",
            f"llm.api_key={api_key}",
            f"agent={agent_set}",
            "agent.main_agent.max_turns=50",  # Limit max turns for gradio demo
            "benchmark=gaia-validation",  # refer to debug.sh
        ]
    )

    # Add config overrides from request
    if config_overrides:
        for key, value in config_overrides.items():
            if isinstance(value, dict):
                for subkey, subvalue in value.items():
                    overrides.append(f"{key}.{subkey}={subvalue}")
            else:
                overrides.append(f"{key}={value}")

    try:
        return compose(config_name="config", overrides=overrides)
    except Exception as e:
        logger.error(f"Failed to compose Hydra config: {e}")
        # Re-raise instead of calling the interactive ``exit()`` helper: that
        # raises SystemExit with status 0, which handlers written as
        # ``except Exception`` do not catch, so the failure would go unreported.
        raise
def filter_google_search_organic(organic: List[dict]) -> List[dict]:
    """
    Filter google search organic results to remove unnecessary information

    Only the ``title`` and ``link`` of each entry are kept (missing values
    become ""). Entries that are not dicts are dropped.
    """
    return [
        {
            "title": item.get("title", ""),
            "link": item.get("link", ""),
        }
        for item in organic
        if isinstance(item, dict)
    ]


def is_scrape_error(result: str) -> bool:
    """
    Check if the scrape result is an error

    A result counts as an error when it is text that does not parse as JSON.
    Non-text values are not reported as errors.
    """
    if not isinstance(result, (str, bytes, bytearray)):
        # json.loads would raise TypeError on these; they are not error text.
        return False
    try:
        json.loads(result)
    except ValueError:  # includes json.JSONDecodeError and undecodable bytes
        return True
    return False


def filter_message(message: dict) -> dict:
    """
    Filter message to remove unnecessary information

    Only ``tool_call`` events are touched, and the message is modified in place:
    - ``google_search``: a JSON ``result`` containing an ``organic`` list is
      reduced to titles and links. A result that is not a JSON object (for
      example an error string) is left unchanged.
    - ``scrape`` / ``scrape_website``: ``tool_input`` is replaced by
      ``{"error": <result>}`` when the result is an error, otherwise by ``{}``.

    Args:
        message: Event dict with an ``event`` name and a ``data`` payload.

    Returns:
        The same message object.
    """
    if message.get("event") != "tool_call":
        return message

    data = message.get("data")
    if not isinstance(data, dict):
        return message

    tool_name = data.get("tool_name")
    tool_input = data.get("tool_input")
    if not isinstance(tool_input, dict) or "result" not in tool_input:
        return message

    if tool_name == "google_search":
        try:
            result_dict = json.loads(tool_input["result"])
        except (ValueError, TypeError):
            # Not JSON (e.g. an error message): pass it through untouched
            # rather than raising into the caller.
            result_dict = None
        if isinstance(result_dict, dict) and isinstance(
            result_dict.get("organic"), list
        ):
            new_result = {
                "organic": filter_google_search_organic(result_dict["organic"])
            }
            tool_input["result"] = json.dumps(new_result, ensure_ascii=False)

    if tool_name in ["scrape", "scrape_website"]:
        # if error, it can not be json
        if is_scrape_error(tool_input["result"]):
            data["tool_input"] = {"error": tool_input["result"]}
        else:
            data["tool_input"] = {}

    return message
directly outputs structured events, no longer wrapped as SSE strings.""" workflow_id = task_id last_send_time = time.time() last_heartbeat_time = time.time() # Create thread-safe queue stream_queue = ThreadSafeAsyncQueue() stream_queue.set_loop(asyncio.get_event_loop()) cancel_event = threading.Event() def run_pipeline_in_thread(): try: loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) class ThreadQueueWrapper: def __init__(self, thread_queue, cancel_event): self.thread_queue = thread_queue self.cancel_event = cancel_event async def put(self, item): if self.cancel_event.is_set(): logger.info("Pipeline cancelled, stopping execution") return self.thread_queue.put_nowait_threadsafe(filter_message(item)) wrapper_queue = ThreadQueueWrapper(stream_queue, cancel_event) # Ensure pipeline components are loaded (lazy loading) _ensure_preloaded() async def pipeline_with_cancellation(): pipeline_task = asyncio.create_task( execute_task_pipeline( cfg=_preload_cache["cfg"], task_id=workflow_id, task_description=query, task_file_name=None, main_agent_tool_manager=_preload_cache[ "main_agent_tool_manager" ], sub_agent_tool_managers=_preload_cache[ "sub_agent_tool_managers" ], output_formatter=_preload_cache["output_formatter"], stream_queue=wrapper_queue, log_dir=os.getenv("LOG_DIR", "logs/api-server"), tool_definitions=_preload_cache["tool_definitions"], sub_agent_tool_definitions=_preload_cache[ "sub_agent_tool_definitions" ], ) ) async def check_cancellation(): while not cancel_event.is_set(): await asyncio.sleep(0.5) logger.info("Cancel event detected, cancelling pipeline") pipeline_task.cancel() cancel_task = asyncio.create_task(check_cancellation()) try: done, pending = await asyncio.wait( [pipeline_task, cancel_task], return_when=asyncio.FIRST_COMPLETED, ) for task in pending: task.cancel() for task in done: if task == pipeline_task: try: await task except asyncio.CancelledError: logger.info("Pipeline task was cancelled") except Exception as e: 
def _init_render_state():
    """Return a fresh, empty render state for one research run."""
    state = {}
    state["agent_order"] = []
    # agent_id -> {"agent_name": str, "tool_call_order": [], "tools": {tool_call_id: {...}}}
    state["agents"] = {}
    state["current_agent_id"] = None
    state["errors"] = []
    return state
def _is_empty_payload(value) -> bool:
    """Tell whether a tool input/output carries no content.

    Empty means: ``None``; a string that is blank or, once stripped, exactly
    ``{}`` or ``[]``; or a dict, list, tuple or set with no items. Any other
    value is treated as non-empty.
    """
    if isinstance(value, str):
        return value.strip() in ("", "{}", "[]")
    if isinstance(value, (dict, list, tuple, set)):
        return not len(value)
    return value is None
results = tool_output.get("organic", []) if not results and not query: return "" # Build the card lines.append('
') # Header with query if query: lines.append('
') lines.append('🔍') lines.append(f'Search: "{query}"') lines.append("
") # Results count if results: lines.append(f'
≡ Found {len(results)} results
') # Results list lines.append('
') for item in results[:10]: # Limit to 10 results title = item.get("title", "Untitled") link = item.get("link", "#") lines.append(f""" 🌐 {title} """) lines.append("
") lines.append("
") return "\n".join(lines) def _format_sogou_search_results(tool_input: dict, tool_output: dict) -> str: """Format sogou_search results in a beautiful card layout.""" lines = [] # Get search query from input query = "" if isinstance(tool_input, dict): query = tool_input.get("q", "") or tool_input.get("query", "") # Parse results from output - sogou uses "Pages" instead of "organic" results = [] if isinstance(tool_output, dict): result_str = tool_output.get("result", "") if isinstance(result_str, str) and result_str.strip(): try: result_data = json.loads(result_str) if isinstance(result_data, dict): results = result_data.get("Pages", []) except json.JSONDecodeError: pass elif isinstance(result_str, dict): results = result_str.get("Pages", []) if not results and "Pages" in tool_output: results = tool_output.get("Pages", []) if not results and not query: return "" # Build the card lines.append('
') # Header with query if query: lines.append('
') lines.append('🔍') lines.append(f'Search: "{query}"') lines.append("
") # Results count if results: lines.append(f'
≡ Found {len(results)} results
') # Results list lines.append('
') for item in results[:10]: # Limit to 10 results title = item.get("title", "Untitled") link = item.get("url", item.get("link", "#")) lines.append(f""" 🌐 {title} """) lines.append("
") lines.append("
") return "\n".join(lines) def _format_scrape_results(tool_input: dict, tool_output: dict) -> str: """Format scrape/webpage results in a card layout.""" lines = [] # Get URL url = "" if isinstance(tool_input, dict): url = tool_input.get("url", tool_input.get("link", "")) # Check for error if isinstance(tool_output, dict) and "error" in tool_output: lines.append('
') lines.append('
') lines.append('🌐') lines.append( f'{url[:60]}{"..." if len(url) > 60 else ""}' ) lines.append("
") lines.append('
❌ Failed
') lines.append("
") return "\n".join(lines) # Success case lines.append('
') if url: lines.append('
') lines.append('🌐') lines.append( f'{url[:60]}{"..." if len(url) > 60 else ""}' ) lines.append("
") lines.append('
✓ Done
') lines.append("
") return "\n".join(lines) def _render_markdown(state: dict) -> str: lines = [] final_summary_lines = [] # Collect final summary content separately # Render errors first if any if state.get("errors"): for err in state["errors"]: lines.append(f'
❌ {err}
') # Render all agents' content for agent_id in state.get("agent_order", []): agent = state["agents"].get(agent_id, {}) agent_name = agent.get("agent_name", "") is_final_summary = agent_name == "Final Summary" for call_id in agent.get("tool_call_order", []): call = agent["tools"].get(call_id, {}) tool_name = call.get("tool_name", "unknown_tool") # Show text / message - display directly if tool_name in ("show_text", "message"): content = call.get("content", "") if content: if is_final_summary: final_summary_lines.append(content) else: lines.append(content) continue tool_input = call.get("input", {}) tool_output = call.get("output", {}) has_input = not _is_empty_payload(tool_input) has_output = not _is_empty_payload(tool_output) # Special formatting for google_search if tool_name == "google_search" and (has_input or has_output): formatted = _format_search_results(tool_input, tool_output) if formatted: lines.append(formatted) continue # Special formatting for sogou_search if tool_name == "sogou_search" and (has_input or has_output): formatted = _format_sogou_search_results(tool_input, tool_output) if formatted: lines.append(formatted) continue # Special formatting for scrape/webpage tools if tool_name in ( "scrape", "scrape_website", "scrape_webpage", "scrape_and_extract_info", ) and (has_input or has_output): formatted = _format_scrape_results(tool_input, tool_output) if formatted: lines.append(formatted) continue # Special formatting for code execution tools if tool_name in ("python", "run_python_code") and (has_input or has_output): # Use pure Markdown to avoid HTML wrapper blocking Markdown rendering lines.append("\n---\n") lines.append("#### 💻 Code Execution\n") # Show code input - try multiple possible keys code = "" if isinstance(tool_input, dict): code = tool_input.get("code") or tool_input.get("code_block") or "" elif isinstance(tool_input, str): code = tool_input if code: lines.append(f"\n```python\n{code}\n```\n") # Show output if available if has_output: 
output = "" if isinstance(tool_output, dict): output = ( tool_output.get("result") or tool_output.get("output") or tool_output.get("stdout") or "" ) elif isinstance(tool_output, str): output = tool_output if isinstance(output, str) and output.strip(): lines.append("\n**Output:**\n") lines.append( f'\n```text\n{output[:1000]}{"..." if len(output) > 1000 else ""}\n```\n' ) lines.append("\n✅ Executed\n") continue # Other tools - show as compact card if has_input or has_output: target_lines = final_summary_lines if is_final_summary else lines target_lines.append('
') target_lines.append(f'
🔧 {tool_name}
') if has_input: # Show brief input summary if isinstance(tool_input, dict): brief = ", ".join( f"{k}: {str(v)[:30]}..." if len(str(v)) > 30 else f"{k}: {v}" for k, v in list(tool_input.items())[:2] ) target_lines.append(f'
{brief}
') if has_output: target_lines.append('
✓ Done
') target_lines.append("
") # Add final summary with Markdown-based styling (no HTML wrapper to preserve Markdown rendering) if final_summary_lines: lines.append("\n\n---\n\n") # Markdown horizontal rule as divider lines.append("## 📋 Research Summary\n\n") lines.extend(final_summary_lines) return "\n".join(lines) if lines else "*Waiting to start research...*" def _update_state_with_event(state: dict, message: dict): event = message.get("event") data = message.get("data", {}) if event == "start_of_agent": agent_id = data.get("agent_id") agent_name = data.get("agent_name", "unknown") if agent_id and agent_id not in state["agents"]: state["agents"][agent_id] = { "agent_name": agent_name, "tool_call_order": [], "tools": {}, } state["agent_order"].append(agent_id) state["current_agent_id"] = agent_id elif event == "end_of_agent": # End marker, no special handling needed, keep structure state["current_agent_id"] = None elif event == "tool_call": tool_call_id = data.get("tool_call_id") tool_name = data.get("tool_name", "unknown_tool") agent_id = state.get("current_agent_id") or ( state["agent_order"][-1] if state["agent_order"] else None ) if not agent_id: return state agent = state["agents"].setdefault( agent_id, {"agent_name": "unknown", "tool_call_order": [], "tools": {}} ) tools = agent["tools"] if tool_call_id not in tools: tools[tool_call_id] = {"tool_name": tool_name} agent["tool_call_order"].append(tool_call_id) entry = tools[tool_call_id] if tool_name == "show_text" and "delta_input" in data: delta = data.get("delta_input", {}).get("text", "") _append_show_text(entry, delta) elif tool_name == "show_text" and "tool_input" in data: ti = data.get("tool_input") text = "" if isinstance(ti, dict): text = ti.get("text", "") or ( (ti.get("result") or {}).get("text") if isinstance(ti.get("result"), dict) else "" ) elif isinstance(ti, str): text = ti if text: _append_show_text(entry, text) else: # Distinguish between input and output: if "tool_input" in data: # Could be input (first time) or 
output with result (second time) ti = data["tool_input"] # If contains result, assign to output; otherwise assign to input if isinstance(ti, dict) and "result" in ti: entry["output"] = ti else: # Only update input if we don't already have valid input data, or if the new data is not empty if "input" not in entry or not _is_empty_payload(ti): entry["input"] = ti elif event == "message": # Same incremental text display as show_text, aggregated by message_id message_id = data.get("message_id") agent_id = state.get("current_agent_id") or ( state["agent_order"][-1] if state["agent_order"] else None ) if not agent_id: return state agent = state["agents"].setdefault( agent_id, {"agent_name": "unknown", "tool_call_order": [], "tools": {}} ) tools = agent["tools"] if message_id not in tools: tools[message_id] = {"tool_name": "message"} agent["tool_call_order"].append(message_id) entry = tools[message_id] delta_content = (data.get("delta") or {}).get("content", "") if isinstance(delta_content, str) and delta_content: _append_show_text(entry, delta_content) elif event == "error": # Collect errors, display uniformly during rendering err_text = data.get("error") if isinstance(data, dict) else None if not err_text: try: err_text = json.dumps(data, ensure_ascii=False) except Exception: err_text = str(data) state.setdefault("errors", []).append(err_text) else: # Ignore heartbeat or other events pass return state _CANCEL_FLAGS = {} _CANCEL_LOCK = threading.Lock() def _set_cancel_flag(task_id: str): with _CANCEL_LOCK: _CANCEL_FLAGS[task_id] = True def _reset_cancel_flag(task_id: str): with _CANCEL_LOCK: _CANCEL_FLAGS[task_id] = False async def _disconnect_check_for_task(task_id: str): with _CANCEL_LOCK: return _CANCEL_FLAGS.get(task_id, False) def _spinner_markup(running: bool) -> str: if not running: return "" return ( '\n\n
' '
' "Generating..." "
async def gradio_run(query: str, ui_state: Optional[dict]):
    """Drive one research run and stream UI updates to Gradio.

    Async generator. Every item is a 4-tuple
    ``(markdown, run_button_update, stop_button_update, ui_state)``:
    while the run is in progress the markdown carries the spinner, Run is
    disabled and Stop is enabled; the last item drops the spinner and flips
    the two buttons back.

    Args:
        query: The user's question; ``None``/empty is treated as ``""``.
        ui_state: Previous per-session state (may be ``None``); a copy with a
            fresh ``task_id`` is emitted with every update.
    """
    normalized_query = replace_chinese_punctuation(query or "")

    # Each run gets its own id so that Stop only cancels this run.
    task_id = str(uuid.uuid4())
    _reset_cancel_flag(task_id)
    ui_state = {**(ui_state or {}), "task_id": task_id}

    state = _init_render_state()

    def in_progress_frame():
        # Run disabled, Stop enabled, spinner appended below the log.
        return (
            _render_markdown(state) + _spinner_markup(True),
            gr.update(interactive=False),
            gr.update(interactive=True),
            ui_state,
        )

    def cancel_requested():
        return _disconnect_check_for_task(task_id)

    yield in_progress_frame()

    async for message in stream_events_optimized(
        task_id, normalized_query, None, cancel_requested
    ):
        # Heartbeats carry no content, so they do not trigger a re-render.
        if message.get("event", "unknown") == "heartbeat":
            continue

        state = _update_state_with_event(state, message)
        yield in_progress_frame()
        # Small pause so Gradio can flush the update before the next one.
        await asyncio.sleep(0.01)

    # Finished: Run enabled, Stop disabled, spinner removed.
    yield (
        _render_markdown(state),
        gr.update(interactive=True),
        gr.update(interactive=False),
        ui_state,
    )
flex; align-items: center; justify-content: space-between; padding: 16px 32px; border-bottom: 1px solid #f0f0f0; background: #ffffff; } .nav-left { display: flex; align-items: center; gap: 20px; } .nav-brand { display: flex; align-items: center; gap: 10px; font-weight: 600; font-size: 1.1em; color: #18181b; } .brand-logo { width: 32px; height: 32px; border-radius: 6px; } .nav-links { display: flex; align-items: center; gap: 12px; } .nav-links a { color: #71717a; font-size: 1.1em; text-decoration: none; transition: color 0.2s; } .nav-links a:hover { color: #18181b; } .nav-right { display: flex; align-items: center; gap: 16px; } .nav-right a { color: #52525b; text-decoration: none; font-size: 0.9em; } /* ===== Hero Section ===== */ .hero-section { text-align: center; padding: 60px 24px 40px; max-width: 900px; margin: 0 auto; } .hero-title { font-size: 3em; font-weight: 700; background: linear-gradient(135deg, #10b981 0%, #14b8a6 50%, #06b6d4 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; margin: 0 0 16px 0; letter-spacing: -0.02em; } .hero-subtitle { display: flex; align-items: center; justify-content: center; gap: 16px; color: #71717a; font-size: 1em; } .hero-line { width: 40px; height: 1px; background: #d4d4d8; } /* ===== Input Section ===== */ #input-section { max-width: 720px !important; margin: 0 auto 40px !important; background: #ffffff; border: 1px solid #e0e0e0; border-radius: 16px; box-shadow: 0 2px 8px rgba(0,0,0,0.04); } #question-input { padding: 20px 24px !important; background: #ffffff !important; border: none !important; } #question-input textarea { background: #ffffff !important; border: none !important; font-size: 1.05em !important; line-height: 1.7 !important; color: #18181b !important; box-shadow: none !important; } #question-input textarea:focus { outline: none !important; box-shadow: none !important; } #question-input textarea::placeholder { color: #9ca3af !important; } #btn-row { padding: 16px 
24px !important; border-top: 1px solid #f0f0f0; gap: 12px !important; } #run-btn { background: linear-gradient(135deg, #10b981 0%, #14b8a6 100%) !important; color: #ffffff !important; border: none !important; border-radius: 10px !important; padding: 12px 24px !important; font-size: 0.95em !important; font-weight: 500 !important; cursor: pointer !important; transition: opacity 0.2s, transform 0.2s !important; } #run-btn:hover { opacity: 0.9 !important; transform: translateY(-1px) !important; } #stop-btn { background: #ffffff !important; color: #71717a !important; border: 1px solid #e5e5e5 !important; border-radius: 10px !important; padding: 12px 20px !important; font-size: 0.95em !important; font-weight: 500 !important; cursor: pointer !important; transition: all 0.2s !important; } #stop-btn:hover { color: #ef4444 !important; border-color: #fecaca !important; background: #fef2f2 !important; } /* ===== Output Section ===== */ #output-section { max-width: 900px !important; margin: 0 auto !important; padding: 0 24px 60px !important; } .output-label { font-size: 0.85em; font-weight: 500; color: #71717a; text-transform: uppercase; letter-spacing: 0.05em; margin-bottom: 12px; padding: 0 4px; } #log-view { padding: 24px !important; min-height: 400px; max-height: 70vh; overflow-y: auto; background: #ffffff !important; border: 1px solid #e5e5e5 !important; border-radius: 16px !important; } #log-view h3 { font-size: 0.95em; font-weight: 600; color: #18181b; margin: 24px 0 16px 0; padding-bottom: 8px; border-bottom: 1px solid #f4f4f5; } #log-view h3:first-child { margin-top: 0; } /* Error block */ .error-block { background: #fef2f2; border: 1px solid #fecaca; border-radius: 10px; padding: 12px 16px; margin: 12px 0; color: #dc2626; font-size: 0.9em; } /* Tool card */ .tool-card { background: #fafafa; border: 1px solid #e5e5e5; border-radius: 10px; padding: 12px 16px; margin: 12px 0; } .tool-header { font-size: 0.9em; font-weight: 500; color: #3f3f46; margin-bottom: 4px; } 
.tool-brief { font-size: 0.8em; color: #71717a; margin-top: 4px; } .tool-status { font-size: 0.8em; color: #10b981; margin-top: 6px; } #log-view blockquote { background: linear-gradient(135deg, #f0fdf4 0%, #ecfeff 100%); border: none; border-left: 3px solid #10b981; padding: 16px 20px; margin: 16px 0; border-radius: 0 12px 12px 0; font-style: normal; color: #065f46; font-size: 0.9em; line-height: 1.7; } #log-view pre { background: #f8f9fa !important; color: #1e293b !important; border-radius: 8px !important; padding: 16px !important; font-size: 0.85em !important; line-height: 1.6 !important; overflow-x: auto; margin: 12px 0; border: 1px solid #e2e8f0; } #log-view pre code { background: transparent !important; color: #1e293b !important; font-family: 'SF Mono', 'Fira Code', 'JetBrains Mono', Consolas, monospace !important; font-size: inherit !important; padding: 0 !important; white-space: pre-wrap; word-break: break-word; } #log-view code { font-family: 'SF Mono', 'Fira Code', 'JetBrains Mono', Consolas, monospace !important; background: #f1f5f9 !important; color: #1e293b !important; padding: 2px 6px !important; border-radius: 4px !important; font-size: 0.9em !important; } #log-view p { line-height: 1.7; color: #3f3f46; } #log-view::-webkit-scrollbar { width: 6px; } #log-view::-webkit-scrollbar-track { background: transparent; } #log-view::-webkit-scrollbar-thumb { background: #e5e5e5; border-radius: 3px; } #log-view::-webkit-scrollbar-thumb:hover { background: #d4d4d8; } /* ===== Footer ===== */ .app-footer { text-align: center; padding: 24px; color: #a1a1aa; font-size: 0.85em; border-top: 1px solid #f0f0f0; } /* ===== Loading Spinner ===== */ @keyframes spin { to { transform: rotate(360deg); } } .loading-indicator { display: inline-flex; align-items: center; gap: 10px; color: #10b981; font-size: 0.9em; padding: 12px 0; } .loading-indicator::before { content: ''; width: 16px; height: 16px; border: 2px solid #d1fae5; border-top-color: #10b981; border-radius: 50%; 
animation: spin 0.8s linear infinite; } /* ===== Search Results Card ===== */ .search-card { background: #ffffff; border: 1px solid #e5e5e5; border-radius: 12px; margin: 16px 0; overflow: hidden; } .search-header { display: flex; align-items: center; gap: 10px; padding: 14px 18px; background: #fafafa; border-bottom: 1px solid #f0f0f0; } .search-icon { font-size: 1em; color: #10b981; } .search-query { font-size: 0.9em; color: #3f3f46; font-weight: 500; } .search-count { padding: 10px 18px; font-size: 0.8em; color: #71717a; background: #fafafa; border-bottom: 1px solid #f0f0f0; } .search-results { padding: 8px 0; } .search-result-item { display: flex; align-items: center; gap: 12px; padding: 12px 18px; text-decoration: none; color: #3f3f46; font-size: 0.9em; transition: background 0.15s; border-left: 3px solid transparent; } .search-result-item:hover { background: #f9fafb; border-left-color: #10b981; } .result-icon { font-size: 1em; flex-shrink: 0; opacity: 0.6; } .result-title { flex: 1; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; } /* ===== Scrape Card ===== */ .scrape-card { background: #ffffff; border: 1px solid #e5e5e5; border-radius: 10px; margin: 12px 0; padding: 12px 16px; display: flex; align-items: center; justify-content: space-between; gap: 12px; } .scrape-card.scrape-error { border-color: #fecaca; background: #fef2f2; } .scrape-header { display: flex; align-items: center; gap: 10px; flex: 1; min-width: 0; } .scrape-icon { font-size: 1em; opacity: 0.6; } .scrape-url { font-size: 0.85em; color: #52525b; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; } .scrape-status { font-size: 0.8em; padding: 4px 10px; border-radius: 6px; flex-shrink: 0; } .scrape-status.success { background: #ecfdf5; color: #059669; } .scrape-status.error { background: #fef2f2; color: #dc2626; } /* ===== Final Summary Section ===== */ .final-summary-divider { height: 1px; background: linear-gradient(to right, transparent, #e5e5e5, transparent); 
margin: 32px 0; } .final-summary-section { background: linear-gradient(135deg, #f8fafc 0%, #f1f5f9 100%); border: 1px solid #e2e8f0; border-radius: 16px; padding: 24px; margin-top: 16px; } .final-summary-header { font-size: 1.1em; font-weight: 600; color: #1e293b; margin-bottom: 16px; padding-bottom: 12px; border-bottom: 2px solid #3b82f6; display: inline-block; } .final-summary-content { color: #334155; line-height: 1.8; } .final-summary-content h1, .final-summary-content h2, .final-summary-content h3 { color: #1e293b; margin-top: 1.5em; margin-bottom: 0.5em; } .final-summary-content h1 { font-size: 1.4em; } .final-summary-content h2 { font-size: 1.2em; } .final-summary-content h3 { font-size: 1.1em; } .final-summary-content p { margin: 0.8em 0; } .final-summary-content ul, .final-summary-content ol { margin: 0.8em 0; padding-left: 1.5em; } .final-summary-content li { margin: 0.4em 0; } .final-summary-content a { color: #3b82f6; text-decoration: none; } .final-summary-content a:hover { text-decoration: underline; } .final-summary-content code { background: #e2e8f0; padding: 2px 6px; border-radius: 4px; font-family: 'SF Mono', 'Fira Code', monospace; font-size: 0.9em; } .final-summary-content pre { background: #1e293b; color: #e2e8f0; padding: 16px; border-radius: 8px; overflow-x: auto; } .final-summary-content pre code { background: transparent; padding: 0; color: inherit; } .final-summary-content table { width: 100%; border-collapse: collapse; margin: 1em 0; } .final-summary-content th, .final-summary-content td { padding: 10px 12px; border: 1px solid #e2e8f0; text-align: left; } .final-summary-content th { background: #f1f5f9; font-weight: 600; } .final-summary-content blockquote { border-left: 4px solid #3b82f6; margin: 1em 0; padding: 0.5em 1em; background: #f8fafc; color: #475569; } /* ===== Code Execution Card ===== */ .code-card { background: #1e1e2e; border: 1px solid #313244; border-radius: 12px; margin: 12px 0; padding: 16px; overflow: hidden; } 
.code-header { font-size: 0.9em; font-weight: 600; color: #cdd6f4; margin-bottom: 12px; display: flex; align-items: center; gap: 8px; } .code-card pre { background: #11111b !important; border-radius: 8px; padding: 12px 16px; margin: 8px 0; overflow-x: auto; font-family: 'SF Mono', 'Fira Code', 'JetBrains Mono', Consolas, monospace !important; font-size: 0.85em; line-height: 1.5; } .code-card code { background: transparent !important; color: #cdd6f4 !important; font-family: 'SF Mono', 'Fira Code', 'JetBrains Mono', Consolas, monospace !important; } .code-output-label { font-size: 0.8em; color: #a6adc8; margin-top: 12px; margin-bottom: 4px; } .code-status { font-size: 0.8em; color: #a6e3a1; margin-top: 8px; text-align: right; } /* ===== Responsive ===== */ @media (max-width: 768px) { .hero-title { font-size: 2em; } .hero-section { padding: 40px 16px 24px; } .input-wrapper, .output-wrapper { padding: 0 16px; } #log-view { max-height: 50vh; } } """ # Favicon head content favicon_head = '' with gr.Blocks( css=custom_css, title="MiroThinker - Deep Research", theme=gr.themes.Base(), head=favicon_head, ) as demo: # Top Navigation gr.HTML(""" """) # Hero Section gr.HTML("""

Research Deep. Uncover the Future

Don't just chat. Predict, verify, and discover with science-based AI.
""") # Input Section with gr.Column(elem_id="input-section"): inp = gr.Textbox( lines=4, placeholder="Enter your research question...", show_label=False, elem_id="question-input", ) with gr.Row(elem_id="btn-row"): stop_btn = gr.Button( "⏹ Stop", elem_id="stop-btn", variant="stop", interactive=False, scale=1, ) run_btn = gr.Button( "Start Research ➤", elem_id="run-btn", variant="primary", scale=2 ) # Output Section with gr.Column(elem_id="output-section"): gr.HTML('
Research Progress
') out_md = gr.Markdown("*Waiting to start research...*", elem_id="log-view") # State ui_state = gr.State({"task_id": None}) # Event handlers run_btn.click( fn=gradio_run, inputs=[inp, ui_state], outputs=[out_md, run_btn, stop_btn, ui_state], ) stop_btn.click(fn=stop_current, inputs=[ui_state], outputs=[run_btn, stop_btn]) # Footer gr.HTML(""" """) return demo if __name__ == "__main__": demo = build_demo() host = os.getenv("HOST", "0.0.0.0") port = int(os.getenv("PORT", "8080")) demo.queue().launch(server_name=host, server_port=port) ================================================ FILE: apps/gradio-demo/prompt_patch.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. """ Custom Prompt Override (Monkey Patching) This module allows customizing prompts without modifying miroflow-agent code. Patches applied: 1. `generate_mcp_system_prompt` - Prepends custom identity prompt 2. `process_input` - Removes the boxed format requirement suffix 3. `generate_agent_summarize_prompt` - Uses user-friendly summary prompt for demo 4. `format_final_summary_and_log` - Disables boxed format check to prevent retry Usage: from prompt_patch import apply_prompt_patch apply_prompt_patch() """ import re # ============================================================================ # Custom Identity Prompt # ============================================================================ CUSTOM_IDENTITY_PROMPT = """You are MiroThinker, a specialized deep research AI assistant developed by MiroMind. 
IMPORTANT IDENTITY REMINDER: - You are NOT ChatGPT, Claude, or any other AI assistant """ # ============================================================================ # Strings to Remove from Input Processing # ============================================================================ # This string is appended to task descriptions in input_handler.py # We remove it for demo mode since we don't need strict boxed format BOXED_FORMAT_SUFFIX = "\nYou should follow the format instruction in the request strictly and wrap the final answer in \\boxed{}." # ============================================================================ # Custom Summarize Prompt for Demo Mode # ============================================================================ def get_demo_summarize_prompt(target_language: str, task_description: str) -> str: """ Generate a user-friendly summarize prompt for demo mode. This prompt is designed for better user experience, producing well-formatted Markdown responses instead of strict boxed answers. Args: target_language: The language to write the response in task_description: The original user question Returns: The summarize prompt string """ return f"""Please provide the final research summary based only on the information already gathered. No further tool calls are allowed. ## Requirements - **Language**: Write the entire response in **{target_language}**. - **Focus**: Directly answer the original question above. Do not just summarize gathered information — provide a clear, actionable answer. - **Response Length**: Match the complexity of your response to the question. For simple or short questions, provide a concise and direct answer without unnecessary elaboration. For complex questions, provide a detailed and structured report. - Use clear and structured Markdown formatting when appropriate. - Use appropriate Markdown headings (e.g., #, ##, ###) only when the content warrants structure. 
- Present key findings in an organized, concise, and readable way. - Use tables only when they genuinely improve clarity. - **Currency Format**: Use `\\$` instead of `$` for currency amounts (e.g., `\\$100`, `\\$1,000`) to avoid conflicts with inline math syntax. - **Citation Format**: - **In-Text**: Use the format `[ID]`, where `ID` is a **numeric identifier only** (digits 0–9), e.g. `[1]`, `[2]`. - **References Section(if has any sources)**: At the very end, add "References" (or equivalent in {target_language}). Format: [ID] TITLE/SECTION_TITLE. /. - Do NOT mention tools, tool calls, or internal reasoning steps. - Focus solely on delivering a professional, easy-to-read response that answers the user's original question. ## Original Question (for reference) {task_description}""" def _detect_language(text: str) -> str: """ Simple language detection based on character analysis. Returns a language description suitable for the summarize prompt. """ # Count characters by script chinese_chars = sum(1 for c in text if "\u4e00" <= c <= "\u9fff") japanese_chars = sum( 1 for c in text if "\u3040" <= c <= "\u30ff" or "\u31f0" <= c <= "\u31ff" ) korean_chars = sum(1 for c in text if "\uac00" <= c <= "\ud7af") total_chars = len(text.replace(" ", "")) if total_chars == 0: return "English" # Determine primary language if chinese_chars / total_chars > 0.1: return "Chinese (Simplified)" elif japanese_chars / total_chars > 0.1: return "Japanese" elif korean_chars / total_chars > 0.1: return "Korean" else: return "the same language as the user's question" # ============================================================================ # Monkey Patching # ============================================================================ _patched = False def apply_prompt_patch(): """ Apply monkey patches to customize prompts for demo mode. Patches applied: 1. `generate_mcp_system_prompt` - Prepends custom identity prompt to system prompt 2. 
def _patch_system_prompt():
    """Make every system prompt start with ``CUSTOM_IDENTITY_PROMPT``.

    Wraps ``prompt_utils.generate_mcp_system_prompt`` and installs the wrapper
    on ``prompt_utils`` as well as on the OpenAI and Anthropic client modules,
    which hold their own reference to the function.
    """
    from src.llm.providers import anthropic_client, openai_client
    from src.utils import prompt_utils

    # Keep a handle on the unpatched implementation for delegation.
    upstream_builder = prompt_utils.generate_mcp_system_prompt

    def patched_generate_mcp_system_prompt(date, mcp_servers):
        """Build the upstream prompt and prepend the custom identity text."""
        base_prompt = upstream_builder(date, mcp_servers)
        return CUSTOM_IDENTITY_PROMPT + base_prompt

    # Rebind the name in each module that looks it up at call time.
    for target_module in (prompt_utils, openai_client, anthropic_client):
        target_module.generate_mcp_system_prompt = patched_generate_mcp_system_prompt
def _patch_summarize_prompt():
    """Install the demo-friendly summarize prompt builder.

    The replacement is bound to ``generate_agent_summarize_prompt`` on
    ``prompt_utils``, ``orchestrator`` and ``answer_generator``.
    """
    from src.core import answer_generator, orchestrator
    from src.utils import prompt_utils

    browsing_agent_types = ("agent-browsing", "browsing-agent")

    def patched_generate_agent_summarize_prompt(
        task_description: str, agent_type: str = ""
    ) -> str:
        """
        Return the summarize prompt for the given agent type.

        ``"main"`` gets the Markdown-oriented demo prompt written in the
        language detected from the task; the browsing sub-agent keeps the
        report-style prompt. Any other agent type raises ``ValueError``.
        """
        if agent_type == "main":
            language = _detect_language(task_description)
            return get_demo_summarize_prompt(language, task_description)

        if agent_type not in browsing_agent_types:
            raise ValueError(f"Unknown agent type: {agent_type}")

        # Sub-agents: same instruction text as the unpatched implementation.
        prompt_parts = [
            "This is a direct instruction to you (the assistant), not the result of a tool call.\n\n",
            "We are now ending this session, and your conversation history will be deleted. ",
            "You must NOT initiate any further tool use. This is your final opportunity to report ",
            "*all* of the information gathered during the session.\n\n",
            "The original task is repeated here for reference:\n\n",
            f'"{task_description}"\n\n',
            "Summarize the above search and browsing history. Output the FINAL RESPONSE and detailed supporting information of the task given to you.\n\n",
            "If you found any useful facts, data, quotes, or answers directly relevant to the original task, include them clearly and completely.\n",
            "If you reached a conclusion or answer, include it as part of the response.\n",
            "If the task could not be fully answered, do NOT make up any content. Instead, return all partially relevant findings, ",
            "Search results, quotes, and observations that might help a downstream agent solve the problem.\n",
            "If partial, conflicting, or inconclusive information was found, clearly indicate this in your response.\n\n",
            "Your final response should be a clear, complete, and structured report.\n",
            "Organize the content into logical sections with appropriate headings.\n",
            "Do NOT include any tool call instructions, speculative filler, or vague summaries.\n",
            "Focus on factual, specific, and well-organized information.",
        ]
        return "".join(prompt_parts).strip()

    # Rebind the name in every module that holds a reference to it.
    for target_module in (prompt_utils, orchestrator, answer_generator):
        target_module.generate_agent_summarize_prompt = (
            patched_generate_agent_summarize_prompt
        )
def get_custom_identity_prompt() -> str:
    """Return the custom identity prompt string.

    This is the module-level ``CUSTOM_IDENTITY_PROMPT`` constant, returned
    unchanged; the same text is what ``_patch_system_prompt`` prepends to the
    generated system prompt.
    """
    return CUSTOM_IDENTITY_PROMPT
"pytest>=8.4.1", "pytest-asyncio>=1.0.0", "httpx>=0.28.1", ] ================================================ FILE: apps/gradio-demo/utils.py ================================================ import re def contains_chinese(text): """ Detect if a string contains Chinese characters or Chinese punctuation Args: text (str): The string to detect Returns: bool: True if contains Chinese characters or punctuation, False otherwise """ # Chinese character Unicode ranges: # \u4e00-\u9fff: CJK Unified Ideographs # \u3400-\u4dbf: CJK Extension A # \uf900-\ufaff: CJK Compatibility Ideographs # \u3000-\u303f: CJK Symbols and Punctuation # \uff00-\uffef: Fullwidth ASCII, Fullwidth punctuation chinese_pattern = re.compile( r"[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff\u3000-\u303f\uff00-\uffef]" ) return bool(chinese_pattern.search(text)) def replace_chinese_punctuation(text): # Handle single-character replacements with translate punctuation_map = str.maketrans( { ",": ",", "。": ".", "!": "!", "?": "?", ";": ";", ":": ":", "“": '"', "”": '"', "‘": "'", "’": "'", "(": "(", ")": ")", "【": "[", "】": "]", "《": "<", "》": ">", "、": ",", "—": "-", } ) # First, replace multi-character punctuation text = text.replace("……", "...") # Then apply single-character replacements return text.translate(punctuation_map) ================================================ FILE: apps/lobehub-compatibility/MiroThinkerToolParser.py ================================================ """ Tool parser plugin for vLLM for MiroThinker MCP format to compatible with the tool calling interface of openai. 
MCP format: server name tool name {...} """ import json from collections.abc import Sequence import json_repair import regex as re from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.protocol import ( ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, FunctionCall, ToolCall, ) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager, ) from vllm.logger import init_logger logger = init_logger(__name__) class MirothinkerToolParser(ToolParser): def __init__(self, tokenizer): super().__init__(tokenizer) # State tracking for streaming self.current_tool_name_sent: bool = False self.prev_tool_call_arr: list[dict] = [] self.current_tool_id: int = -1 self.streamed_args_for_tool: list[str] = [] self.buffer: str = "" # Buffer for potential tool call tags self._resolved_tool_name_cache: dict[tuple[str, str], str] = {} # Correctness-first streaming state (incremental state machine) self._stream_mode: str = "text" # "text" | "tool" self._text_token_prefix: str = "" # possible prefix of self._tool_end_token_prefix: str = "" # possible prefix of self._tool_block_buffer: str = ( "" # accumulates between and ) self._stream_tool_call_ids: list[str] = [] # Token definitions self.tool_call_start_token: str = "" self.tool_call_end_token: str = "" # Regex patterns self.tool_call_regex = re.compile( r"\s*" r"(.*?)\s*" r"(.*?)\s*" r"\s*(.*?)\s*\s*" r"", re.DOTALL, ) # For streaming partial tool calls # IMPORTANT: Use GREEDY matching (.*) for arguments to capture all content # in streaming mode. We'll clean up tag in the code if present. # The outer ()? makes the whole section optional # The inner (.*) will match empty string if exists but has no content yet self.partial_tool_regex = re.compile( r"\s*" r"(?:(.*?)\s*)?" r"(?:(.*?)\s*)?" 
r"(?:(\s*.*))?", # Move \s* inside capture group so empty match returns "" re.DOTALL, ) # For correctness-first parsing on COMPLETE tool blocks only self._complete_tool_block_regex = re.compile( r"\s*" r"(?:(.*?)\s*)?" r"(?:(.*?)\s*)?" r"(?:\s*(.*?)\s*(?:\s*)?)?" r"", re.DOTALL, ) def _resolve_tool_name( self, server_name: str, tool_name: str, request: ChatCompletionRequest ) -> str: """ Resolve the actual tool name by combining server_name and tool_name if server_name is not 'default'. """ if not server_name or server_name == "default": return tool_name if not request or not request.tools: return tool_name cache_key = (server_name, tool_name) cached = self._resolved_tool_name_cache.get(cache_key) if cached: return cached # Filter tools that contain server_name candidates = [] for tool in request.tools: if hasattr(tool, "function") and hasattr(tool.function, "name"): name = tool.function.name if tool_name in name: candidates.append(name) if len(candidates) == 1: resolved = candidates[0] self._resolved_tool_name_cache[cache_key] = resolved return resolved # Find match containing tool_name for candidate in candidates: if server_name in candidate: logger.debug( "Resolved tool %s -> %s (server: %s)", tool_name, candidate, server_name, ) self._resolved_tool_name_cache[cache_key] = candidate return candidate return tool_name def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: request = super().adjust_request(request) if request.tools and request.tool_choice != "none": # Do not skip special tokens for proper tool parsing request.skip_special_tokens = False return request def _ensure_tool_id_valid(self, tool_id: int) -> bool: """Ensure the tool_id is valid and arrays have enough elements""" if tool_id < 0: return False # Ensure arrays are large enough while len(self.streamed_args_for_tool) <= tool_id: self.streamed_args_for_tool.append("") while len(self.prev_tool_call_arr) <= tool_id: self.prev_tool_call_arr.append({}) return True def 
extract_tool_calls( self, model_output: str, request: ChatCompletionRequest, ) -> ExtractedToolCallInformation: # Sanity check; avoid unnecessary processing if logger.isEnabledFor(10): # DEBUG logger.debug("model_output len=%s", len(model_output)) if ( self.tool_call_start_token not in model_output or request.tool_choice == "none" or not request.tools ): return ExtractedToolCallInformation( tools_called=False, tool_calls=[], content=model_output ) try: tool_calls = [] had_any_match = False had_parse_error = False # Find all complete tool calls for match in self.tool_call_regex.finditer(model_output): had_any_match = True server_name = match.group(1).strip() tool_name = match.group(2).strip() arguments_str = match.group(3).strip() # Resolve tool name tool_name = self._resolve_tool_name(server_name, tool_name, request) try: # Parse arguments as JSON arguments = json.loads(arguments_str) tool_call = ToolCall( type="function", function=FunctionCall( name=tool_name, arguments=json.dumps(arguments, ensure_ascii=False), ), ) tool_calls.append(tool_call) except json.JSONDecodeError: try: repaired = json_repair.repair_json(arguments_str) if not repaired: had_parse_error = True logger.warning( "Failed to repair tool arguments JSON: %s", arguments_str, ) continue arguments = json.loads(repaired) tool_call = ToolCall( type="function", function=FunctionCall( name=tool_name, arguments=json.dumps(arguments, ensure_ascii=False), ), ) tool_calls.append(tool_call) except Exception: had_parse_error = True logger.warning( "Failed to parse tool arguments after repair: %s", arguments_str, ) continue # If we couldn't successfully parse tool calls (or format didn't match), do not truncate. # Return the full model output as content to avoid losing text. 
if had_parse_error or not tool_calls or not had_any_match: return ExtractedToolCallInformation( tools_called=False, tool_calls=[], content=model_output ) # Extract content before first tool call content = model_output[: model_output.find(self.tool_call_start_token)] return ExtractedToolCallInformation( tools_called=len(tool_calls) > 0, tool_calls=tool_calls, content=content if content else None, ) except Exception: logger.exception("Error in extracting tool call from response.") return ExtractedToolCallInformation( tools_called=False, tool_calls=[], content=model_output ) def extract_tool_calls_streaming( self, previous_text: str, current_text: str, delta_text: str, previous_token_ids: Sequence[int], current_token_ids: Sequence[int], delta_token_ids: Sequence[int], request: ChatCompletionRequest, ) -> DeltaMessage | None: # Reset state if this is the start of a new request if not previous_text: self.current_tool_name_sent = False self.prev_tool_call_arr = [] self.current_tool_id = -1 self.streamed_args_for_tool = [] self.buffer = "" self._resolved_tool_name_cache = {} self._stream_mode = "text" self._text_token_prefix = "" self._tool_end_token_prefix = "" self._tool_block_buffer = "" self._stream_tool_call_ids = [] # If tools are disabled for this request, do not suppress tags or parse tool calls. # Flush any internal buffers as plain text so we never drop output. 
if request.tool_choice == "none" or not request.tools: out = "" if self.buffer: out += self.buffer self.buffer = "" if self._text_token_prefix: out += self._text_token_prefix self._text_token_prefix = "" if self._tool_block_buffer: out += self.tool_call_start_token + self._tool_block_buffer self._tool_block_buffer = "" if self._tool_end_token_prefix: out += self._tool_end_token_prefix self._tool_end_token_prefix = "" out += delta_text return DeltaMessage(content=out) if out else None def _longest_token_prefix_at_end(s: str, token: str) -> str: max_len = min(len(token) - 1, len(s)) for i in range(max_len, 0, -1): if token.startswith(s[-i:]): return s[-i:] return "" emitted_text_parts: list[str] = [] emitted_tool_calls: list[DeltaToolCall] = [] chunk = delta_text while chunk: if self._stream_mode == "text": if self._text_token_prefix: chunk = self._text_token_prefix + chunk self._text_token_prefix = "" start_idx = chunk.find(self.tool_call_start_token) if start_idx < 0: prefix = _longest_token_prefix_at_end( chunk, self.tool_call_start_token ) if prefix: safe = chunk[: -len(prefix)] if safe: emitted_text_parts.append(safe) self._text_token_prefix = prefix else: emitted_text_parts.append(chunk) break before = chunk[:start_idx] if before: emitted_text_parts.append(before) chunk = chunk[start_idx + len(self.tool_call_start_token) :] self._stream_mode = "tool" self._tool_block_buffer = "" self._tool_end_token_prefix = "" continue # tool mode if self._tool_end_token_prefix: chunk = self._tool_end_token_prefix + chunk self._tool_end_token_prefix = "" end_idx = chunk.find(self.tool_call_end_token) if end_idx < 0: prefix = _longest_token_prefix_at_end(chunk, self.tool_call_end_token) if prefix: self._tool_block_buffer += chunk[: -len(prefix)] self._tool_end_token_prefix = prefix else: self._tool_block_buffer += chunk break # Complete tool block self._tool_block_buffer += chunk[:end_idx] tool_block = ( self.tool_call_start_token + self._tool_block_buffer + 
self.tool_call_end_token ) remainder = chunk[end_idx + len(self.tool_call_end_token) :] # Reset tool buffers before parsing self._stream_mode = "text" self._tool_block_buffer = "" self._tool_end_token_prefix = "" try: m = self._complete_tool_block_regex.search(tool_block) if not m: emitted_text_parts.append(tool_block) chunk = remainder continue server_name = (m.group(1) or "").strip() tool_name = (m.group(2) or "").strip() arguments_str = (m.group(3) or "").strip() if not tool_name: emitted_text_parts.append(tool_block) chunk = remainder continue resolved_name = ( self._resolve_tool_name(server_name, tool_name, request) if server_name else tool_name ) # Finalize arguments strictly at end of the block if not arguments_str: arguments_json_str = "{}" else: try: arguments_obj = json.loads(arguments_str) except Exception: repaired = json_repair.repair_json(arguments_str) if not repaired: emitted_text_parts.append(tool_block) chunk = remainder continue arguments_obj = json.loads(repaired) arguments_json_str = json.dumps(arguments_obj, ensure_ascii=False) tool_index = len(self._stream_tool_call_ids) tool_call_id = make_tool_call_id() self._stream_tool_call_ids.append(tool_call_id) emitted_tool_calls.append( DeltaToolCall( index=tool_index, type="function", id=tool_call_id, function=DeltaFunctionCall( name=resolved_name, arguments=arguments_json_str, ).model_dump(exclude_none=True), ) ) except Exception: logger.exception( "Error parsing complete tool block in streaming; falling back to plain text." ) emitted_text_parts.append(tool_block) chunk = remainder emitted_text = "".join(emitted_text_parts) if emitted_text_parts else None if emitted_text is not None and emitted_text == "": emitted_text = None if emitted_text is None and not emitted_tool_calls: return None # vLLM's DeltaMessage.tool_calls is validated as a list; do not pass None explicitly. 
if emitted_tool_calls: return DeltaMessage(content=emitted_text, tool_calls=emitted_tool_calls) return DeltaMessage(content=emitted_text) # Register the tool parser to ToolParserManager ToolParserManager.register_module("mirothinker", True, MirothinkerToolParser) ================================================ FILE: apps/lobehub-compatibility/README.md ================================================ # LobeChat Integration Guide This guide describes how to integrate the MiroThinker model with [LobeChat](https://github.com/lobehub/lobe-chat), an open-source, modern LLM UI framework supporting tool usage (function calling). ## Before You Start MiroThinker is a reasoning model. When generating responses, it first outputs its reasoning process inside `...` tags, then provides the final answer. For agentic tasks (multi-step tool use), the model performs better when it can see its previous reasoning in the conversation history. However, LobeChat does not preserve reasoning content in conversation history. When sending messages back to the API, LobeChat strips the `...` content from previous assistant messages. This means the model cannot see its prior reasoning steps. - For general chat: This works fine. - For agentic workflows: Performance may be degraded since the model cannot reference its previous reasoning. If you need full reasoning preservation for agentic use cases, consider modifying LobeChat's source code to return `reasoning_content` in conversation history. ## 1. Start the Inference Service First, launch the MiroThinker model using vLLM with the OpenAI-compatible API adapter. We use vLLM because it supports loading custom tool parsers from external Python files, while SGLang does not. Ensure you include the tool parser plugin. 
```bash # Configuration PORT=61002 MODEL_PATH=miromind-ai/MiroThinker-v1.5-30B # Start vLLM server vllm serve $MODEL_PATH \ --served-model-name mirothinker \ --port $PORT \ --trust-remote-code \ --chat-template chat_template.jinja \ --tool-parser-plugin MiroThinkerToolParser.py \ --tool-call-parser mirothinker \ --enable-auto-tool-choice ``` ## 2. Configure LobeChat You can use either the self-hosted version or the [web application](https://lobechat.com/chat). ### Step 1: Access Settings Navigate to **Settings** -> **AI Service Provider** to add a custom AI service provider. ![Settings Navigation](img/settings.png) ### Step 2: Add Custom AI Provider Click the `+` button to add a new provider and configure it as follows: ![Add AI Provider](img/AI-provider.png) | Field | Value | Description | | :--- | :--- | :--- | | **Provider ID** | `miromind` | Or any identifier you prefer. | | **Request Format** | `OPENAI` | | | **API Key** | `your-api-key` | Use any string if auth is disabled. | | **API Proxy Address** | `http://localhost:61002/v1` | Replace with your actual service address. | ### Step 3: Configure the Model After adding the provider, add the models you deploy to the service provider's model list: 1. Add a new model with the ID `mirothinker` (must match `--served-model-name`). 1. **Crucial**: Enable the **Function Calling** capability toggle. 1. Click "Check" to verify connectivity. ![Model Configuration](img/model.png) ## 3. Usage Demo Once configured, you can use MiroThinker in LobeChat with full tool-calling capabilities. 
![Presentation Demo](img/presentation.gif) ================================================ FILE: apps/lobehub-compatibility/chat_template.jinja ================================================ {%- if tools %} {{- '<|im_start|>system\n' }} {%- if messages[0].role == 'system' %} {{- messages[0].content + '\n\n' }} {%- endif %} {{- "In this environment you have access to a set of tools you can use to answer the user's question.\n\nYou only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use.\n\nToday is: " + strftime_now('%Y-%m-%d') + ". For time-dependent questions, answer based on the world as it would reasonably be today.\n\n# Tool-Use Formatting Instructions\n\nTool-use is formatted using XML-style tags. The tool-use is enclosed in and each parameter is similarly enclosed within its own set of tags.\n\nThe Model Context Protocol (MCP) connects to servers that provide additional tools and resources to extend your capabilities. You can use the server's tools via the `use_mcp_tool`.\n\nDescription:\nRequest to use a tool provided by a MCP server. Each MCP server can provide multiple tools with different capabilities. 
Tools have defined input schemas that specify required and optional parameters.\n\nParameters:\n- server_name: (required) The name of the MCP server providing the tool\n- tool_name: (required) The name of the tool to execute\n- arguments: (required) A JSON object containing the tool's input parameters, following the tool's input schema, quotes within string must be properly escaped, ensure it's valid JSON\n\nUsage:\n\nserver name here\ntool name here\n\n{\n \"param1\": \"value1\",\n \"param2\": \"value2 \\\"escaped string\\\"\"\n}\n\n\n\nImportant Notes:\n- Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags.\n- Always adhere to this format for the tool use to ensure proper parsing and execution.\n\nString and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. The output is not expected to be valid XML and is parsed with regular expressions.\nHere are the functions available in JSONSchema format:\n\n## Server name: default" }} {%- for tool in tools %} {%- set func = tool.function if tool.function is defined else tool %} {{- "\n### Tool name: " + func.name + "\n" }} {{- "Description:\n" }} {%- set desc = func.description if func.description else '' %} {%- if desc[:4] == ' ' %} {{- desc }} {%- else %} {{- " " + desc }} {%- endif %} {%- if "Args:" not in desc and func.parameters is defined and func.parameters.properties is defined %} {{- "\n\n Args:" }} {%- for prop_name, prop_value in func.parameters.properties.items() %} {%- if prop_value.description is defined %} {{- "\n " + prop_name + ": " + prop_value.description }} {%- else %} {{- "\n " + prop_name + ": " + (prop_value.type if prop_value.type is defined else "any") }} {%- endif %} {%- endfor %} {%- endif %} {{- "\n\nInput JSON schema: " + (func.parameters | tojson) + "\n" }} {%- endfor %} {{- "\n# General Objective\n\nYou accomplish a given task iteratively, 
breaking it down into clear steps and working through them methodically.<|im_end|>\n" }} {%- else %} {%- if messages[0].role == 'system' %} {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- for message in messages %} {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} {%- elif message.role == "assistant" %} {%- set content = message.content if message.content is not none else '' %} {%- set reasoning_content = '' %} {%- if message.reasoning_content is defined and message.reasoning_content is not none %} {%- set reasoning_content = message.reasoning_content %} {%- else %} {%- if '
' in content %} {%- set reasoning_content = (content.split('
')[0]).rstrip('\n') %} {%- set reasoning_content = (reasoning_content.split('')[-1]).lstrip('\n') %} {%- set content = (content.split('')[-1]).lstrip('\n') %} {%- endif %} {%- endif %} {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} {%- if message.tool_calls %} {%- for tool_call in message.tool_calls %} {%- if (loop.first and content) or (not loop.first) %} {{- '\n\n' }} {%- endif %} {%- if tool_call.function %} {%- set tool_call = tool_call.function %} {%- endif %} {{- '\ndefault\n' }} {{- tool_call.name }} {{- '\n\n' }} {%- if tool_call.arguments is string %} {{- tool_call.arguments }} {%- else %} {{- tool_call.arguments | tojson }} {%- endif %} {{- '\n\n' }} {%- endfor %} {%- endif %} {{- '<|im_end|>\n' }} {%- elif message.role == "tool" %} {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} {{- '<|im_start|>user\n' }} {%- else %} {{- '\n\n' }} {%- endif %} {{- message.content }} {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} {{- '<|im_end|>\n' }} {%- endif %} {%- endif %} {%- endfor %} {%- if add_generation_prompt %} {{- '<|im_start|>assistant\n' }} {%- if enable_thinking is defined and enable_thinking is false %} {{- '\n\n\n\n' }} {%- endif %} {%- endif %} ================================================ FILE: apps/lobehub-compatibility/requirements.txt ================================================ vllm>=0.11.0 json-repair regex ================================================ FILE: apps/lobehub-compatibility/test_tool_parser.py ================================================ #!/usr/bin/env python3 """ Test MiroThinkerToolParser for correctness. 
""" import json import sys from types import SimpleNamespace from unittest.mock import MagicMock import regex as re # Mock vLLM imports for testing without vLLM installed # Create mock modules mock_vllm = MagicMock() mock_vllm.entrypoints = MagicMock() mock_vllm.entrypoints.chat_utils = MagicMock() mock_vllm.entrypoints.chat_utils.make_tool_call_id = lambda: "call_test_123" mock_protocol = SimpleNamespace( ChatCompletionRequest=MagicMock, DeltaFunctionCall=MagicMock, DeltaMessage=MagicMock, DeltaToolCall=MagicMock, ExtractedToolCallInformation=MagicMock, FunctionCall=MagicMock, ToolCall=MagicMock, ) mock_tool_parser = SimpleNamespace( ToolParser=object, ToolParserManager=MagicMock(), ) mock_logger = SimpleNamespace( init_logger=lambda x: MagicMock(isEnabledFor=lambda _: False), ) sys.modules["vllm"] = mock_vllm sys.modules["vllm.entrypoints"] = mock_vllm.entrypoints sys.modules["vllm.entrypoints.chat_utils"] = mock_vllm.entrypoints.chat_utils sys.modules["vllm.entrypoints.openai"] = MagicMock() sys.modules["vllm.entrypoints.openai.protocol"] = mock_protocol sys.modules["vllm.entrypoints.openai.tool_parsers"] = MagicMock() sys.modules["vllm.entrypoints.openai.tool_parsers.abstract_tool_parser"] = ( mock_tool_parser ) sys.modules["vllm.logger"] = mock_logger def test_tool_call_regex(): """Test the main tool call regex pattern.""" tool_call_regex = re.compile( r"\s*" r"(.*?)\s*" r"(.*?)\s*" r"\s*(.*?)\s*\s*" r"", re.DOTALL, ) # Test 1: Basic tool call text1 = """ my_mcp_server web_search {"query": "AI news"} """ match = tool_call_regex.search(text1) assert match is not None, "Should match basic tool call" assert match.group(1).strip() == "my_mcp_server" assert match.group(2).strip() == "web_search" assert json.loads(match.group(3).strip()) == {"query": "AI news"} print("✅ Test 1: Basic tool call - PASSED") # Test 2: Tool call with content before text2 = """Let me search for that. 
my_mcp_server search {"q": "test"} """ match = tool_call_regex.search(text2) assert match is not None, "Should match tool call with content before" print("✅ Test 2: Tool call with content before - PASSED") # Test 3: Multiple tool calls text3 = """ server1 tool1 {"a": 1} server2 tool2 {"b": 2} """ matches = list(tool_call_regex.finditer(text3)) assert len(matches) == 2, f"Should find 2 tool calls, found {len(matches)}" assert matches[0].group(2).strip() == "tool1" assert matches[1].group(2).strip() == "tool2" print("✅ Test 3: Multiple tool calls - PASSED") # Test 4: Complex JSON arguments text4 = """ my_mcp_server complex_tool { "query": "test with quotes and apostrophes", "options": {"nested": true}, "list": [1, 2, 3] } """ match = tool_call_regex.search(text4) assert match is not None, "Should match complex JSON" args = json.loads(match.group(3).strip()) assert args["query"] == "test with quotes and apostrophes" assert args["options"]["nested"] is True print("✅ Test 4: Complex JSON arguments - PASSED") # Test 5: Empty arguments text5 = """ my_mcp_server no_args_tool {} """ match = tool_call_regex.search(text5) assert match is not None, "Should match empty arguments" assert json.loads(match.group(3).strip()) == {} print("✅ Test 5: Empty arguments - PASSED") # Test 6: Minimal whitespace text6 = "st{}" match = tool_call_regex.search(text6) assert match is not None, "Should match minimal whitespace" print("✅ Test 6: Minimal whitespace - PASSED") def test_partial_tool_regex(): """Test the partial tool regex for streaming.""" partial_tool_regex = re.compile( r"\s*" r"(?:(.*?)\s*)?" r"(?:(.*?)\s*)?" 
r"(?:(\s*.*))?", re.DOTALL, ) # Test partial: only opening tag text1 = "\n" match = partial_tool_regex.search(text1) assert match is not None print("✅ Partial test 1: Only opening tag - PASSED") # Test partial: server_name only text2 = "\nmy_server\n" match = partial_tool_regex.search(text2) assert match is not None assert match.group(1).strip() == "my_server" assert match.group(2) is None print("✅ Partial test 2: Server name only - PASSED") # Test partial: incomplete arguments text3 = """ my_server my_tool {"query": "incomp""" match = partial_tool_regex.search(text3) assert match is not None assert match.group(1).strip() == "my_server" assert match.group(2).strip() == "my_tool" assert '{"query": "incomp' in match.group(3) print("✅ Partial test 3: Incomplete arguments - PASSED") def test_complete_tool_block_regex(): """Test the complete tool block regex used in streaming.""" complete_regex = re.compile( r"\s*" r"(?:(.*?)\s*)?" r"(?:(.*?)\s*)?" r"(?:\s*(.*?)\s*(?:\s*)?)?" r"", re.DOTALL, ) # Test: Complete block text1 = """ my_mcp_server search {"q": "test"} """ match = complete_regex.search(text1) assert match is not None assert match.group(1).strip() == "my_mcp_server" assert match.group(2).strip() == "search" assert json.loads(match.group(3).strip()) == {"q": "test"} print("✅ Complete block test 1: Full block - PASSED") # Test: Without arguments tag text2 = """ my_mcp_server simple_tool """ match = complete_regex.search(text2) assert match is not None assert match.group(2).strip() == "simple_tool" assert match.group(3) is None print("✅ Complete block test 2: Without arguments - PASSED") def test_edge_cases(): """Test edge cases and potential bugs.""" tool_call_regex = re.compile( r"\s*" r"(.*?)\s*" r"(.*?)\s*" r"\s*(.*?)\s*\s*" r"", re.DOTALL, ) # Edge case 1: Unicode in arguments text1 = """ my_mcp_server search {"query": "你好世界 🎉"} """ match = tool_call_regex.search(text1) assert match is not None args = json.loads(match.group(3).strip()) assert args["query"] == 
"你好世界 🎉" print("✅ Edge case 1: Unicode in arguments - PASSED") # Edge case 2: Newlines in JSON text2 = """ my_mcp_server search { "query": "line1\\nline2\\nline3" } """ match = tool_call_regex.search(text2) assert match is not None args = json.loads(match.group(3).strip()) assert "line1\nline2" in args["query"] print("✅ Edge case 2: Newlines in JSON - PASSED") # Edge case 3: Tags in content (should not match nested) text3 = """ my_mcp_server search {"query": "test"} """ match = tool_call_regex.search(text3) assert match is not None args = json.loads(match.group(3).strip()) assert "" in args["query"] print("✅ Edge case 3: HTML tags in arguments - PASSED") def check_unused_code(): """Check for unused code in the parser.""" print("\n" + "=" * 60) print("CODE ANALYSIS - Potential Issues") print("=" * 60) issues = [] # Issue 1: Unused variables unused_vars = [ "self.current_tool_name_sent", "self.prev_tool_call_arr", "self.current_tool_id", "self.streamed_args_for_tool", "self.buffer", ] issues.append( f"⚠️ Unused instance variables (defined but never used in main logic):\n {', '.join(unused_vars)}" ) # Issue 2: Unused method issues.append("⚠️ `_ensure_tool_id_valid` method is defined but never called") # Issue 3: Unused regex issues.append("⚠️ `partial_tool_regex` is defined but never used") # Issue 4: server_name handling issues.append( "⚠️ `_resolve_tool_name` checks for 'default' server_name,\n but chat_template.jinja uses 'my_mcp_server'" ) for issue in issues: print(f"\n{issue}") print("\n" + "=" * 60) print("RECOMMENDATIONS") print("=" * 60) print(""" 1. Remove unused variables and methods to clean up the code 2. Either use `partial_tool_regex` or remove it 3. Update `_resolve_tool_name` to handle 'my_mcp_server' correctly 4. The streaming implementation looks correct with the state machine approach 5. 
The main `extract_tool_calls` and `extract_tool_calls_streaming` logic appears sound """) def main(): print("=" * 60) print("MiroThinkerToolParser Test Suite") print("=" * 60) print("\n--- Testing Main Tool Call Regex ---") test_tool_call_regex() print("\n--- Testing Partial Tool Regex ---") test_partial_tool_regex() print("\n--- Testing Complete Tool Block Regex ---") test_complete_tool_block_regex() print("\n--- Testing Edge Cases ---") test_edge_cases() check_unused_code() print("\n" + "=" * 60) print("ALL REGEX TESTS PASSED ✅") print("=" * 60) if __name__ == "__main__": main() ================================================ FILE: apps/lobehub-compatibility/unit_test.py ================================================ #!/usr/bin/env python3 """ Unit tests for MiroThinker chat template. Run with: pytest unit_test.py -v """ from datetime import datetime from pathlib import Path import pytest from jinja2 import BaseLoader, Environment # ============================================================================ # Fixtures # ============================================================================ def strftime_now(format_str: str) -> str: """Simulate vLLM's strftime_now function.""" return datetime.now().strftime(format_str) @pytest.fixture def template(): """Load the chat template.""" template_path = Path(__file__).parent / "chat_template.jinja" with open(template_path, "r") as f: template_str = f.read() env = Environment(loader=BaseLoader()) env.globals["strftime_now"] = strftime_now return env.from_string(template_str) @pytest.fixture def today_date(): """Get today's date in YYYY-MM-DD format.""" return datetime.now().strftime("%Y-%m-%d") # ============================================================================ # Test: Basic Message Formatting # ============================================================================ class TestBasicMessageFormatting: """Tests for basic message formatting without tools.""" def test_user_message_format(self, 
template): """User message should be wrapped in <|im_start|>user ... <|im_end|>.""" messages = [{"role": "user", "content": "Hello!"}] result = template.render(messages=messages, add_generation_prompt=False) assert "<|im_start|>user\nHello!<|im_end|>" in result def test_system_message_format(self, template): """System message should be wrapped correctly.""" messages = [ {"role": "system", "content": "You are helpful."}, {"role": "user", "content": "Hi"}, ] result = template.render(messages=messages, add_generation_prompt=False) assert "<|im_start|>system\nYou are helpful.<|im_end|>" in result def test_assistant_message_format(self, template): """Assistant message should be wrapped correctly with tags.""" messages = [ {"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there!"}, ] result = template.render(messages=messages, add_generation_prompt=False) # Assistant always outputs tags (even if empty) assert ( "<|im_start|>assistant\n\n\n\n\nHi there!<|im_end|>" in result ) def test_add_generation_prompt(self, template): """add_generation_prompt should add <|im_start|>assistant at the end.""" messages = [{"role": "user", "content": "Hello"}] result = template.render(messages=messages, add_generation_prompt=True) assert result.endswith("<|im_start|>assistant\n") def test_multi_turn_conversation(self, template): """Multi-turn conversation should maintain correct order.""" messages = [ {"role": "system", "content": "System prompt"}, {"role": "user", "content": "User 1"}, {"role": "assistant", "content": "Assistant 1"}, {"role": "user", "content": "User 2"}, ] result = template.render(messages=messages, add_generation_prompt=True) # Check order sys_pos = result.find("System prompt") user1_pos = result.find("User 1") asst1_pos = result.find("Assistant 1") user2_pos = result.find("User 2") assert sys_pos < user1_pos < asst1_pos < user2_pos # ============================================================================ # Test: Thinking/Reasoning Content 
# ============================================================================ class TestThinkingContent: """Tests for tag handling.""" def test_reasoning_content_field(self, template): """reasoning_content field should be wrapped in tags.""" messages = [ {"role": "user", "content": "What is 2+2?"}, { "role": "assistant", "content": "The answer is 4.", "reasoning_content": "2+2=4 by basic arithmetic.", }, ] result = template.render(messages=messages, add_generation_prompt=False) assert "\n2+2=4 by basic arithmetic.\n" in result assert "The answer is 4." in result def test_think_tags_in_content(self, template): """ tags in content should be extracted and reformatted.""" messages = [ {"role": "user", "content": "Question"}, { "role": "assistant", "content": "\nMy reasoning here.\n\n\nMy answer here.", }, ] result = template.render(messages=messages, add_generation_prompt=False) assert "\nMy reasoning here.\n" in result assert "My answer here." in result def test_think_preserved_in_history(self, template): """Think tags should be preserved in historical messages, not removed.""" messages = [ {"role": "user", "content": "First question"}, { "role": "assistant", "content": "First answer", "reasoning_content": "First reasoning", }, {"role": "user", "content": "Second question"}, ] result = template.render(messages=messages, add_generation_prompt=True) # Historical thinking should be present assert "\nFirst reasoning\n" in result def test_enable_thinking_false(self, template): """enable_thinking=false should output empty think tags.""" messages = [{"role": "user", "content": "Hello"}] result = template.render( messages=messages, add_generation_prompt=True, enable_thinking=False ) assert result.endswith("<|im_start|>assistant\n\n\n\n\n") def test_enable_thinking_true(self, template): """enable_thinking=true should not output empty think tags.""" messages = [{"role": "user", "content": "Hello"}] result = template.render( messages=messages, add_generation_prompt=True, 
enable_thinking=True ) assert result.endswith("<|im_start|>assistant\n") assert "\n\n" not in result # ============================================================================ # Test: Tool Definitions in System Prompt # ============================================================================ class TestToolDefinitions: """Tests for tool definition formatting in system prompt.""" def test_tools_trigger_system_prompt(self, template, today_date): """When tools are provided, a special system prompt should be generated.""" messages = [{"role": "user", "content": "Search something"}] tools = [ { "type": "function", "function": { "name": "web_search", "description": "Search the web", "parameters": {"type": "object", "properties": {}}, }, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=True ) assert "In this environment you have access to a set of tools" in result assert f"Today is: {today_date}" in result assert "# Tool-Use Formatting Instructions" in result def test_tool_name_format(self, template): """Tool should be formatted with ### Tool name: header.""" messages = [{"role": "user", "content": "Test"}] tools = [ { "type": "function", "function": { "name": "my_tool", "description": "My description", "parameters": {"type": "object", "properties": {}}, }, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=True ) assert "### Tool name: my_tool" in result def test_tool_server_name(self, template): """Tool server should be my_mcp_server.""" messages = [{"role": "user", "content": "Test"}] tools = [ { "type": "function", "function": { "name": "test_tool", "description": "Test", "parameters": {"type": "object", "properties": {}}, }, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=True ) assert "## Server name: default" in result def test_tool_description_indentation(self, template): """Tool description should be indented with 4 spaces.""" messages = [{"role": 
"user", "content": "Test"}] tools = [ { "type": "function", "function": { "name": "test_tool", "description": "My tool description", "parameters": {"type": "object", "properties": {}}, }, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=True ) assert "Description:\n My tool description" in result def test_tool_args_auto_generated(self, template): """Args section should be auto-generated from parameters.properties.""" messages = [{"role": "user", "content": "Test"}] tools = [ { "type": "function", "function": { "name": "search", "description": "Search function", "parameters": { "type": "object", "properties": { "query": {"type": "string", "description": "Search query"}, "limit": {"type": "integer", "description": "Max results"}, }, }, }, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=True ) assert "Args:" in result assert "query: Search query" in result assert "limit: Max results" in result def test_tool_args_not_duplicated(self, template): """If description already has Args:, don't add another.""" messages = [{"role": "user", "content": "Test"}] tools = [ { "type": "function", "function": { "name": "search", "description": "Search function\n\nArgs:\n query: The query", "parameters": { "type": "object", "properties": { "query": {"type": "string", "description": "Search query"} }, }, }, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=True ) # Should only have one Args: section assert result.count("Args:") == 1 def test_tool_json_schema_included(self, template): """Input JSON schema should be included.""" messages = [{"role": "user", "content": "Test"}] tools = [ { "type": "function", "function": { "name": "test", "description": "Test", "parameters": { "type": "object", "properties": {"x": {"type": "string"}}, }, }, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=True ) assert "Input JSON schema:" in result assert '"type": 
"object"' in result or '"type":"object"' in result def test_tool_without_function_wrapper(self, template): """Tools can be passed without the function wrapper.""" messages = [{"role": "user", "content": "Test"}] tools = [ { "name": "direct_tool", "description": "Direct tool format", "parameters": {"type": "object", "properties": {}}, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=True ) assert "### Tool name: direct_tool" in result def test_tool_none_description(self, template): """Tool with None description should not crash.""" messages = [{"role": "user", "content": "Test"}] tools = [ { "type": "function", "function": { "name": "test", "description": None, "parameters": {"type": "object", "properties": {}}, }, } ] # Should not raise an exception result = template.render( messages=messages, tools=tools, add_generation_prompt=True ) assert "### Tool name: test" in result def test_tool_empty_description(self, template): """Tool with empty description should not crash.""" messages = [{"role": "user", "content": "Test"}] tools = [ { "type": "function", "function": { "name": "test", "description": "", "parameters": {"type": "object", "properties": {}}, }, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=True ) assert "### Tool name: test" in result def test_system_message_prepended_with_tools(self, template): """Custom system message should be prepended when tools are present.""" messages = [ {"role": "system", "content": "You are MiroThinker."}, {"role": "user", "content": "Hi"}, ] tools = [ { "type": "function", "function": { "name": "test", "description": "Test", "parameters": {"type": "object", "properties": {}}, }, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=True ) # System message should come first, then tool instructions sys_idx = result.find("You are MiroThinker.") tools_idx = result.find("In this environment you have access") assert sys_idx < 
tools_idx # ============================================================================ # Test: Tool Calls in Assistant Messages # ============================================================================ class TestToolCalls: """Tests for tool call formatting in assistant messages.""" def test_tool_call_format(self, template): """Tool calls should be formatted with tags.""" messages = [ {"role": "user", "content": "Search for AI"}, { "role": "assistant", "content": "Let me search.", "tool_calls": [ { "id": "call_1", "type": "function", "function": { "name": "web_search", "arguments": '{"query": "AI news"}', }, } ], }, ] tools = [ { "type": "function", "function": { "name": "web_search", "description": "Search", "parameters": {"type": "object", "properties": {}}, }, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=False ) assert "" in result assert "default" in result assert "web_search" in result assert "" in result assert '{"query": "AI news"}' in result assert "" in result assert "" in result def test_tool_call_no_content(self, template): """Tool call with None content should work.""" messages = [ {"role": "user", "content": "Search"}, { "role": "assistant", "content": None, "tool_calls": [ { "id": "call_1", "function": { "name": "search", "arguments": '{"q": "test"}', }, } ], }, ] tools = [ { "type": "function", "function": { "name": "search", "description": "Search", "parameters": {"type": "object", "properties": {}}, }, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=False ) # Should have tool call with empty think tags (no content before tool call) assert "<|im_start|>assistant\n\n\n\n\n" in result def test_multiple_tool_calls(self, template): """Multiple tool calls should be separated by newlines.""" messages = [ {"role": "user", "content": "Compare Tokyo and Osaka"}, { "role": "assistant", "content": "I'll search both.", "tool_calls": [ { "id": "call_1", "function": { "name": 
"search", "arguments": '{"q": "Tokyo"}', }, }, { "id": "call_2", "function": { "name": "search", "arguments": '{"q": "Osaka"}', }, }, ], }, ] tools = [ { "type": "function", "function": { "name": "search", "description": "Search", "parameters": {"type": "object", "properties": {}}, }, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=False ) # Extract assistant message part (after the last <|im_start|>assistant) assistant_start = result.rfind("<|im_start|>assistant") assistant_part = result[assistant_start:] # Should have two tool calls in assistant message assert assistant_part.count("") == 2 assert assistant_part.count("") == 2 def test_tool_call_arguments_dict(self, template): """Tool call with dict arguments should be JSON serialized.""" messages = [ {"role": "user", "content": "Search"}, { "role": "assistant", "content": "", "tool_calls": [ { "id": "call_1", "function": { "name": "search", "arguments": {"q": "test", "limit": 5}, # dict, not string }, } ], }, ] tools = [ { "type": "function", "function": { "name": "search", "description": "Search", "parameters": {"type": "object", "properties": {}}, }, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=False ) # Arguments should be JSON serialized assert "" in result assert '"q"' in result or "'q'" in result # ============================================================================ # Test: Tool Responses # ============================================================================ class TestToolResponses: """Tests for tool response handling.""" def test_tool_response_in_user_message(self, template): """Tool response should be embedded in a user message.""" messages = [ {"role": "user", "content": "Search"}, { "role": "assistant", "content": "Searching...", "tool_calls": [ { "id": "call_1", "function": {"name": "search", "arguments": '{"q": "test"}'}, } ], }, { "role": "tool", "tool_call_id": "call_1", "content": "Search results here", }, ] 
tools = [ { "type": "function", "function": { "name": "search", "description": "Search", "parameters": {"type": "object", "properties": {}}, }, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=True ) # Tool response should be in a user message assert "<|im_start|>user\nSearch results here<|im_end|>" in result def test_multiple_tool_responses_merged(self, template): """Multiple consecutive tool responses should be merged into one user message.""" messages = [ {"role": "user", "content": "Compare"}, { "role": "assistant", "content": "Searching...", "tool_calls": [ { "id": "call_1", "function": {"name": "search", "arguments": '{"q": "A"}'}, }, { "id": "call_2", "function": {"name": "search", "arguments": '{"q": "B"}'}, }, ], }, {"role": "tool", "tool_call_id": "call_1", "content": "Result A"}, {"role": "tool", "tool_call_id": "call_2", "content": "Result B"}, ] tools = [ { "type": "function", "function": { "name": "search", "description": "Search", "parameters": {"type": "object", "properties": {}}, }, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=True ) # Should have only one user message containing both results # Results should be separated by \n\n assert "Result A\n\nResult B" in result # Count im_start|>user - should have 2 (original user + tool results) user_count = result.count("<|im_start|>user") assert user_count == 2 def test_tool_response_no_wrapper_tags(self, template): """Tool responses should NOT be wrapped in tags.""" messages = [ {"role": "user", "content": "Search"}, { "role": "assistant", "content": "", "tool_calls": [ { "id": "call_1", "function": {"name": "search", "arguments": '{"q": "test"}'}, } ], }, {"role": "tool", "tool_call_id": "call_1", "content": "Results"}, ] tools = [ { "type": "function", "function": { "name": "search", "description": "Search", "parameters": {"type": "object", "properties": {}}, }, } ] result = template.render( messages=messages, tools=tools, 
add_generation_prompt=True ) assert "" not in result assert "" not in result # ============================================================================ # Test: Edge Cases # ============================================================================ class TestEdgeCases: """Tests for edge cases and error handling.""" def test_only_system_message(self, template): """Only system message should work.""" messages = [{"role": "system", "content": "You are helpful."}] result = template.render(messages=messages, add_generation_prompt=False) assert "<|im_start|>system\nYou are helpful.<|im_end|>" in result def test_assistant_empty_content(self, template): """Assistant with empty string content should work.""" messages = [ {"role": "user", "content": "Hi"}, {"role": "assistant", "content": ""}, ] result = template.render(messages=messages, add_generation_prompt=False) # Assistant always outputs tags (even with empty content) assert "<|im_start|>assistant\n\n\n\n\n<|im_end|>" in result def test_unicode_content(self, template): """Unicode content should be preserved.""" messages = [ {"role": "user", "content": "你好!🎉"}, {"role": "assistant", "content": "こんにちは!"}, ] result = template.render(messages=messages, add_generation_prompt=False) assert "你好!🎉" in result assert "こんにちは!" 
in result def test_special_characters_in_content(self, template): """Special characters should be preserved.""" messages = [ {"role": "user", "content": "Test & \"quotes\" 'apostrophe'"}, ] result = template.render(messages=messages, add_generation_prompt=False) assert ' & "quotes"' in result def test_newlines_preserved(self, template): """Newlines in content should be preserved.""" messages = [ {"role": "user", "content": "Line 1\nLine 2\n\nLine 4"}, ] result = template.render(messages=messages, add_generation_prompt=False) assert "Line 1\nLine 2\n\nLine 4" in result # ============================================================================ # Test: Complete Flow # ============================================================================ class TestCompleteFlow: """Integration tests for complete conversation flows.""" def test_full_tool_use_flow(self, template, today_date): """Test a complete tool use flow.""" messages = [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What's the weather?"}, { "role": "assistant", "content": "Let me check.", "tool_calls": [ { "id": "call_1", "function": { "name": "weather", "arguments": '{"city": "Tokyo"}', }, } ], }, {"role": "tool", "tool_call_id": "call_1", "content": "Sunny, 25°C"}, { "role": "assistant", "content": "It's sunny and 25°C in Tokyo!", }, {"role": "user", "content": "Thanks!"}, ] tools = [ { "type": "function", "function": { "name": "weather", "description": "Get weather info", "parameters": { "type": "object", "properties": { "city": {"type": "string", "description": "City name"} }, }, }, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=True ) # Check structure assert "<|im_start|>system" in result assert "You are a helpful assistant." 
in result assert f"Today is: {today_date}" in result assert "### Tool name: weather" in result assert "" in result assert "default" in result assert "Sunny, 25°C" in result assert "It's sunny and 25°C in Tokyo!" in result assert result.endswith("<|im_start|>assistant\n") def test_reasoning_with_tool_use(self, template): """Test reasoning content combined with tool use.""" messages = [ {"role": "user", "content": "Search for Python tutorials"}, { "role": "assistant", "content": "I'll search for Python tutorials.", "reasoning_content": "User wants Python tutorials. I should use web search.", "tool_calls": [ { "id": "call_1", "function": { "name": "search", "arguments": '{"q": "Python tutorials"}', }, } ], }, ] tools = [ { "type": "function", "function": { "name": "search", "description": "Search", "parameters": {"type": "object", "properties": {}}, }, } ] result = template.render( messages=messages, tools=tools, add_generation_prompt=False ) # Should have both thinking and tool call assert "" in result assert "User wants Python tutorials" in result assert "" in result assert "" in result # ============================================================================ # Run tests # ============================================================================ if __name__ == "__main__": pytest.main([__file__, "-v"]) ================================================ FILE: apps/miroflow-agent/README.md ================================================ # MiroFlow Agent > For comprehensive documentation, installation guide, and tool configuration, see the [main README](../../README.md). ## Prerequisites Before running the agent, ensure you have: 1. **Installed dependencies**: Run `uv sync` in this directory 1. **Configured environment variables**: Copy `.env.example` to `.env` and fill in your API keys ```bash cp .env.example .env # Edit .env with your actual API keys (SERPER_API_KEY, JINA_API_KEY, E2B_API_KEY, etc.) ``` 1. 
**Started your model server** (for MiroThinker models): See the [Serve the MiroThinker Model](../../README.md#serve-the-mirothinker-model) section ## Quick Start ### Run a Single Task The simplest way to test the agent is running `main.py` directly. It will execute a default task: *"What is the title of today's arxiv paper in computer science?"* ```bash # Using MiroThinker models (requires your own model server) uv run python main.py llm=qwen-3 agent=mirothinker_v1.5_keep5_max200 llm.base_url=http://localhost:61002/v1 # Using Claude (requires ANTHROPIC_API_KEY in .env) uv run python main.py llm=claude-3-7 agent=single_agent_keep5 # Using GPT-5 (requires OPENAI_API_KEY in .env) uv run python main.py llm=gpt-5 agent=single_agent_keep5 ``` ### Customize Your Task To ask a different question, edit `main.py` line 32: ```python task_description = "Your custom question here" ``` Then run the agent again. It will search the web, execute code, and provide an answer. ### Run Benchmark Evaluation For systematic evaluation on standard benchmarks, add the `benchmark=` parameter: ```bash # Run on debug benchmark (quick test) uv run python main.py llm=qwen-3 agent=mirothinker_v1.5_keep5_max200 benchmark=debug llm.base_url=http://localhost:61002/v1 # Run on specific benchmarks uv run python main.py llm=qwen-3 agent=mirothinker_v1.5_keep5_max200 benchmark=gaia-validation-text-103 llm.base_url=http://localhost:61002/v1 ``` ## Available Configurations ### LLM Models | Model | Config Name | Requirements | |-------|-------------|--------------| | MiroThinker (self-hosted) | `qwen-3` | Model server + `llm.base_url` | | Claude 3.7 Sonnet | `claude-3-7` | `ANTHROPIC_API_KEY` in .env | | GPT-5 | `gpt-5` | `OPENAI_API_KEY` in .env | ### Agent Configurations **MiroThinker v1.5:** - `mirothinker_v1.5_keep5_max200` ⭐ (recommended) - context management, up to 200 turns - `mirothinker_v1.5_keep5_max400` - context management, up to 400 turns (for BrowseComp) - `mirothinker_v1.5` - no context 
management, up to 600 turns **MiroThinker v1.0:** - `mirothinker_v1.0_keep5` (recommended) - context management, up to 600 turns - `mirothinker_v1.0` - no context management, up to 600 turns **General (for closed-source models like Claude, GPT-5):** - `single_agent_keep5` (recommended) - single agent with context management - `single_agent` - single agent without context management **Multi-Agent (Legacy for v0.1/v0.2):** - `multi_agent` - multi-agent with commercial tools - `multi_agent_os` - multi-agent with open-source tools ### Benchmark Configs `debug`, `browsecomp`, `browsecomp_zh`, `hle`, `hle-text-2158`, `hle-text-500`, `gaia-validation-text-103`, `gaia-validation`, `frames`, `xbench_deepsearch`, `futurex`, `seal-0`, `aime2025`, `deepsearchqa`, `webwalkerqa` ## Output The agent will: 1. Execute the task using available tools (search, code execution, etc.) 1. Generate a final summary and boxed answer 1. Save detailed logs to `../../logs/` directory 1. Display the results in the terminal ## Troubleshooting | Problem | Solution | |---------|----------| | API key errors | Check `.env` file has correct keys | | Model connection failed | Verify `llm.base_url` is accessible | | Tool execution errors | Check E2B/Serper/Jina API keys and quotas | | Out of memory | Use `mirothinker_v1.5_keep5_max200` config | For detailed logs, check the `logs/` directory. ================================================ FILE: apps/miroflow-agent/benchmarks/__init__.py ================================================ ================================================ FILE: apps/miroflow-agent/benchmarks/check_progress/check_progress_aime2025.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. 
# Benchmark configuration
FILENAME = os.path.basename(__file__)
BENCHMARK_NAME = "aime2025"
BENCHMARK_NAME_STD = "AIME2025"
TASKS_PER_RUN = 30
DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl"
TASK_ID_PATTERN = r"task_([a-f0-9]+)"


def parse_args():
    """Parse the command line, which takes one positional ``path`` argument.

    Returns:
        The ``argparse.Namespace`` produced by the parser; ``path`` holds the
        benchmark directory given by the user.
    """
    description = f"Check progress of {BENCHMARK_NAME_STD} benchmark runs."
    cli = argparse.ArgumentParser(description=description)
    cli.add_argument("path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory")
    return cli.parse_args()


if __name__ == "__main__":
    args = parse_args()

    try:
        # Build the checker for this benchmark and run its analysis.
        checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )
    except (FileNotFoundError, PermissionError, ValueError) as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
    else:
        # Only a message is printed here; the exit status is left untouched.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")
if __name__ == "__main__":
    args = parse_args()

    try:
        # Build the checker for this benchmark and run its analysis.
        checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )
    except (FileNotFoundError, PermissionError, ValueError) as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
    else:
        # Only a message is printed here; the exit status is left untouched.
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")
def extract_eval_details_from_log(log_file: str) -> dict:
    """
    Extract evaluation details from a completed task log file.

    The file is read as UTF-8. Three layouts are recognised, tried in order:

    1. JSON with an ``eval_details`` object holding all three counters
       (new format).
    2. JSON whose ``llm_response`` string contains a judge line of the form
       ``DeepSearchQA Judge - Correct: X/Y, Excessive: Z`` (legacy format).
    3. Plain, non-JSON text containing such a judge line (legacy format).

    This is a best-effort reader: an unreadable file or a log without usable
    details yields an empty dict instead of an exception.

    Args:
        log_file: Path of the task log to inspect.

    Returns:
        Dict with num_correct, num_expected, num_excessive, or empty dict if not found
    """
    marker = "DeepSearchQA Judge - Correct:"
    required_keys = ("num_correct", "num_expected", "num_excessive")

    def _parse_judge_output(text: str) -> dict:
        """Return the counters from the first well-formed judge line in *text*."""
        for line in text.split("\n"):
            if marker not in line:
                continue
            try:
                # Parse "Correct: X/Y, Excessive: Z"
                counts = line.split("Correct:")[1].strip()
                correct_part, excessive_part = counts.split(", Excessive:")
                num_correct, num_expected = map(int, correct_part.split("/"))
                num_excessive = int(excessive_part.strip())
            except (IndexError, ValueError):
                # A truncated or garbled judge line must not hide a valid
                # one further down, so keep scanning.
                continue
            return {
                "num_correct": num_correct,
                "num_expected": num_expected,
                "num_excessive": num_excessive,
            }
        return {}

    try:
        # Explicit encoding: the locale default may fail on non-ASCII logs.
        with open(log_file, "r", encoding="utf-8") as f:
            content = f.read()
    except (OSError, ValueError):
        # Missing/unreadable file, or bytes that are not valid UTF-8
        # (UnicodeDecodeError is a ValueError).
        return {}

    # Try to parse as JSON first (task log files are JSON)
    try:
        log_data = json.loads(content)
    except (json.JSONDecodeError, RecursionError):
        # Not JSON, try as plain text (legacy format)
        return _parse_judge_output(content)

    if not isinstance(log_data, dict):
        return {}

    # Method 1: eval_details saved directly (new format)
    eval_details = log_data.get("eval_details")
    if isinstance(eval_details, dict) and all(k in eval_details for k in required_keys):
        return {k: eval_details[k] for k in required_keys}

    # Method 2: judge output embedded in llm_response (legacy format)
    llm_response = log_data.get("llm_response")
    if isinstance(llm_response, str):
        return _parse_judge_output(llm_response)

    return {}
def calculate_deepsearchqa_metrics(results_file: str) -> dict:
    """
    Compute DeepSearchQA summary metrics from a JSONL results file.

    Each non-blank line is parsed as one JSON result. Only results whose
    ``status`` is ``"success"`` are considered, and for each of them the first
    attempt carrying a non-empty ``eval_details`` is scored:

    1. Fully Correct: every expected answer found and nothing extraneous
    2. Fully Incorrect: no correct answer at all
    3. Correct with Extraneous Answers: every expected answer found, plus extras
    4. F1 Score: harmonic mean of precision and recall, averaged over items

    Any error (unreadable file, malformed line, unexpected structure) prints a
    warning and is reported as zero valid items.

    Returns:
        Dict with ``num_valid``, the three category counts, their fractions of
        ``num_valid`` and ``avg_f1``; just ``{"num_valid": 0}`` when nothing
        could be scored.
    """

    def _f1(tp, fp, fn):
        """Per-item F1 from true positives, false positives, false negatives."""
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        if (precision + recall) > 0:
            return 2 * (precision * recall) / (precision + recall)
        return 0.0

    try:
        with open(results_file, "r") as handle:
            records = [json.loads(raw) for raw in handle if raw.strip()]

        tally = {"fully_correct": 0, "fully_incorrect": 0, "correct_with_extraneous": 0}
        f1_scores = []

        for record in records:
            if record.get("status") != "success":
                continue
            if not ("attempts" in record and record["attempts"]):
                continue

            # Only the first attempt that carries details is used.
            details = next(
                (
                    attempt["eval_details"]
                    for attempt in record["attempts"]
                    if "eval_details" in attempt and attempt["eval_details"]
                ),
                None,
            )
            if details is None:
                continue

            correct = details.get("num_correct", 0)
            expected = details.get("num_expected", 0)
            excessive = details.get("num_excessive", 0)

            f1_scores.append(_f1(correct, excessive, expected - correct))

            found_everything = correct == expected
            has_extras = excessive > 0
            if found_everything and not has_extras:
                tally["fully_correct"] += 1
            elif correct == 0:
                tally["fully_incorrect"] += 1
            elif found_everything:
                # Extras are necessarily present on this branch.
                tally["correct_with_extraneous"] += 1

        # Exactly one F1 value is recorded per scored item.
        num_valid = len(f1_scores)
        if not num_valid > 0:
            return {"num_valid": 0}

        return {
            "num_valid": num_valid,
            "fully_correct": tally["fully_correct"],
            "fully_incorrect": tally["fully_incorrect"],
            "correct_with_extraneous": tally["correct_with_extraneous"],
            "pct_fully_correct": tally["fully_correct"] / num_valid,
            "pct_fully_incorrect": tally["fully_incorrect"] / num_valid,
            "pct_correct_with_extraneous": tally["correct_with_extraneous"] / num_valid,
            "avg_f1": sum(f1_scores) / len(f1_scores),
        }
    except Exception as e:
        print(f"Warning: Could not calculate DeepSearchQA metrics: {e}")
        return {"num_valid": 0}
if __name__ == "__main__":
    args = parse_args()

    try:
        # Build the checker and analyse the run directories under args.path.
        checker = ProgressChecker(
            args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN
        )

        # A run is treated as complete once its benchmark_results.jsonl exists.
        finished_run_exists = any(
            os.path.exists(os.path.join(run_dir, "benchmark_results.jsonl"))
            for run_dir in glob.glob(os.path.join(args.path, "run_*"))
        )

        if finished_run_exists:
            show_deepsearchqa_metrics(args.path)
        elif summary.total_completed > 0:
            # No results file yet: fall back to the individual task logs.
            interim_metrics = calculate_deepsearchqa_metrics_from_logs(args.path)

            print("\n" + "=" * 80)
            print("DeepSearchQA Metrics (Official Google Metrics)")
            print("=" * 80)

            if not (interim_metrics and interim_metrics.get("num_valid", 0) > 0):
                print(f"Tasks in progress... ({summary.total_completed} completed)")
                print("Detailed metrics will be available when runs complete.")
            else:
                num_with_details = interim_metrics["num_valid"]
                print(
                    f"⚠️ INTERIM RESULTS (based on {num_with_details}/{summary.total_completed} tasks with eval_details)"
                )
                if num_with_details < summary.total_completed:
                    print(
                        f" Note: {summary.total_completed - num_with_details} completed tasks don't have eval_details (likely ran before the update)"
                    )
                print("-" * 80)

                fully_correct_pct = interim_metrics["pct_fully_correct"]
                fully_incorrect_pct = interim_metrics["pct_fully_incorrect"]
                correct_with_extraneous_pct = interim_metrics[
                    "pct_correct_with_extraneous"
                ]
                f1 = interim_metrics["avg_f1"]

                print(
                    f" Fully Correct: {fully_correct_pct:6.2%} ({interim_metrics['fully_correct']} items)"
                )
                print(
                    f" Fully Incorrect: {fully_incorrect_pct:6.2%} ({interim_metrics['fully_incorrect']} items)"
                )
                print(
                    f" Correct w/ Extraneous: {correct_with_extraneous_pct:6.2%} ({interim_metrics['correct_with_extraneous']} items)"
                )
                print(f" F1 Score: {f1:6.2%}")
                print()
                print(
                    f"Note: Based on {interim_metrics['num_valid']} completed tasks. Final metrics may differ."
                )
            print("=" * 80)

        # Spell out the two "nothing to report" situations (no exit code is set).
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three share one message format.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
if __name__ == "__main__":
    args = parse_args()

    try:
        # Build the checker for this benchmark and analyse the given path.
        checker = ProgressChecker(
            args.path,
            task_per_run=TASKS_PER_RUN,
            data_path=DATA_PATH,
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD,
            task_id_pattern=TASK_ID_PATTERN,
        )

        # Spell out the two "nothing to report" situations (no exit code is set).
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three share one message format.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
if __name__ == "__main__":
    args = parse_args()

    try:
        # Build the checker for this benchmark and analyse the given path.
        checker = ProgressChecker(
            args.path,
            task_per_run=TASKS_PER_RUN,
            data_path=DATA_PATH,
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD,
            task_id_pattern=TASK_ID_PATTERN,
        )

        # Spell out the two "nothing to report" situations (no exit code is set).
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three share one message format.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
if __name__ == "__main__":
    args = parse_args()

    try:
        # Build the checker for this benchmark and analyse the given path.
        checker = ProgressChecker(
            args.path,
            task_per_run=TASKS_PER_RUN,
            data_path=DATA_PATH,
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD,
            task_id_pattern=TASK_ID_PATTERN,
        )

        # Spell out the two "nothing to report" situations (no exit code is set).
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three share one message format.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
if __name__ == "__main__":
    args = parse_args()

    try:
        # Build the checker for this benchmark and analyse the given path.
        checker = ProgressChecker(
            args.path,
            task_per_run=TASKS_PER_RUN,
            data_path=DATA_PATH,
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD,
            task_id_pattern=TASK_ID_PATTERN,
        )

        # Spell out the two "nothing to report" situations (no exit code is set).
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three share one message format.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
if __name__ == "__main__":
    args = parse_args()

    try:
        # Build the checker for this benchmark and analyse the given path.
        checker = ProgressChecker(
            args.path,
            task_per_run=TASKS_PER_RUN,
            data_path=DATA_PATH,
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD,
            task_id_pattern=TASK_ID_PATTERN,
        )

        # Spell out the two "nothing to report" situations (no exit code is set).
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three share one message format.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
if __name__ == "__main__":
    args = parse_args()

    try:
        # Build the checker for this benchmark and analyse the given path.
        checker = ProgressChecker(
            args.path,
            task_per_run=TASKS_PER_RUN,
            data_path=DATA_PATH,
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD,
            task_id_pattern=TASK_ID_PATTERN,
        )

        # Spell out the two "nothing to report" situations (no exit code is set).
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three share one message format.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
if __name__ == "__main__":
    args = parse_args()

    try:
        # Build the checker for this benchmark and analyse the given path.
        checker = ProgressChecker(
            args.path,
            task_per_run=TASKS_PER_RUN,
            data_path=DATA_PATH,
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD,
            task_id_pattern=TASK_ID_PATTERN,
        )

        # Spell out the two "nothing to report" situations (no exit code is set).
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three share one message format.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
if __name__ == "__main__":
    args = parse_args()

    try:
        # Build the checker for this benchmark and analyse the given path.
        checker = ProgressChecker(
            args.path,
            task_per_run=TASKS_PER_RUN,
            data_path=DATA_PATH,
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD,
            task_id_pattern=TASK_ID_PATTERN,
        )

        # Spell out the two "nothing to report" situations (no exit code is set).
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three share one message format.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
if __name__ == "__main__":
    args = parse_args()

    try:
        # Build the checker for this benchmark and analyse the given path.
        checker = ProgressChecker(
            args.path,
            task_per_run=TASKS_PER_RUN,
            data_path=DATA_PATH,
        )
        summary = checker.run_analysis(
            benchmark_name_std=BENCHMARK_NAME_STD,
            task_id_pattern=TASK_ID_PATTERN,
        )

        # Spell out the two "nothing to report" situations (no exit code is set).
        if summary.total_tasks == 0:
            print("No task files found in any run directories")
        elif summary.total_completed == 0:
            print("No tasks completed yet")

    except (FileNotFoundError, PermissionError, ValueError) as e:
        # These three share one message format.
        print(f"Error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
def _read_utc_naive_timestamp(file_path: str, key: str) -> Optional[datetime]:
    """Read ``data[key]`` from a JSON file as a naive datetime expressed in UTC.

    A trailing ``Z`` is rewritten to ``+00:00`` before parsing. Timestamps that
    carry a UTC offset are converted to UTC before the offset is dropped;
    timestamps without an offset are returned unchanged (presumably they are
    already UTC -- confirm against the code that writes the task logs).

    Returns:
        The parsed datetime, or None when the file cannot be read, is not
        valid JSON, lacks ``key``, or holds a value that is not an ISO string.
    """
    # Local import: the module itself only imports ``datetime`` from datetime.
    from datetime import timezone

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        raw = data[key]
        if raw.endswith("Z"):
            raw = raw[:-1] + "+00:00"
        parsed = datetime.fromisoformat(raw)
    except (
        json.JSONDecodeError,
        KeyError,
        ValueError,
        OSError,
        TypeError,  # payload is not a mapping (e.g. a JSON list)
        AttributeError,  # value is not a string (e.g. null or a number)
    ):
        return None

    if parsed.tzinfo is not None:
        # Normalise to UTC first; merely stripping tzinfo would keep the
        # local wall-clock value and make files with different offsets
        # incomparable.
        parsed = parsed.astimezone(timezone.utc)
    return parsed.replace(tzinfo=None)


def find_earliest_start_time(completed_files: List[str]) -> Optional[datetime]:
    """Find the earliest start time from all completed files

    Args:
        completed_files: Paths of JSON task files that may hold "start_time".

    Returns:
        The earliest "start_time" as a naive UTC datetime, or None when no
        file provides a usable value. Unreadable files are skipped.
    """
    candidates = (
        _read_utc_naive_timestamp(file_path, "start_time")
        for file_path in completed_files
    )
    return min((moment for moment in candidates if moment is not None), default=None)


def find_latest_end_time(completed_files: List[str]) -> Optional[datetime]:
    """Find the latest end time from all completed files

    Args:
        completed_files: Paths of JSON task files that may hold "end_time".

    Returns:
        The latest "end_time" as a naive UTC datetime. When no file provides a
        usable value, the current time in UTC (naive) is returned, so the
        result stays comparable with ``find_earliest_start_time``.
    """
    from datetime import timezone

    candidates = (
        _read_utc_naive_timestamp(file_path, "end_time")
        for file_path in completed_files
    )
    latest_time = max(
        (moment for moment in candidates if moment is not None), default=None
    )
    if latest_time is None:
        # datetime.now() would be local time; the stored timestamps are UTC.
        return datetime.now(timezone.utc).replace(tzinfo=None)
    return latest_time
@dataclass
class TaskStats:
    """Counters for a group of tasks, plus derived percentages and averages."""

    completed: int = 0
    running: int = 0
    failed: int = 0
    judge_correct: int = 0
    total: int = 0

    # Completed files, kept for timing analysis
    completed_files: List[str] = None

    # Turn bookkeeping
    total_turns: int = 0
    completed_tasks_with_turns: int = 0

    # How often no boxed content was found
    no_boxed_found: int = 0

    def __post_init__(self):
        # None is only a placeholder default; every instance gets its own list.
        self.completed_files = (
            [] if self.completed_files is None else self.completed_files
        )

    @property
    def judge_accuracy(self) -> float:
        """Judge-correct tasks as a percentage of completed tasks (0.0 if none)."""
        if self.completed > 0:
            return self.judge_correct / self.completed * 100
        return 0.0

    @property
    def completion_rate(self) -> float:
        """Completed tasks as a percentage of all tasks (0.0 if there are none)."""
        if self.total > 0:
            return self.completed / self.total * 100
        return 0.0

    @property
    def average_turns(self) -> float:
        """Mean number of turns over the completed tasks that recorded turns."""
        if self.completed_tasks_with_turns > 0:
            return self.total_turns / self.completed_tasks_with_turns
        return 0.0
@dataclass
class SummaryStats:
    """Totals accumulated over every analyzed run."""

    total_tasks: int = 0
    total_completed: int = 0
    total_running: int = 0
    total_failed: int = 0
    total_judge_correct: int = 0
    total_no_boxed_found: int = 0

    @property
    def total_judge_accuracy(self) -> float:
        """Percentage of all completed tasks that were judged correct."""
        if self.total_completed > 0:
            return self.total_judge_correct / self.total_completed * 100
        return 0.0

    def average_run_accuracy(
        self, run_stats_list: List[Tuple[str, TaskStats]]
    ) -> Tuple[float, float]:
        """Return ``(mean, std)`` of the judge accuracy across runs.

        The mean is the pooled accuracy over all runs
        (``total_judge_accuracy``), i.e. runs are weighted by their number
        of completed tasks. The standard deviation is taken over the
        per-run accuracies of runs that completed at least one task.

        Args:
            run_stats_list: ``(run_name, stats)`` pairs, one per run.

        Returns:
            ``(0.0, 0.0)`` for an empty list; ``(mean, 0.0)`` when no run
            has a completed task; otherwise ``(mean, std)``.
        """
        if not run_stats_list:
            return 0.0, 0.0

        pooled_mean = self.total_judge_accuracy
        per_run = [
            run_stats.judge_accuracy
            for _name, run_stats in run_stats_list
            if run_stats.completed > 0
        ]
        if not per_run:
            return pooled_mean, 0.0

        # Only the spread of the helper's result is used; the reported mean
        # stays the pooled value computed above.
        _unweighted_mean, spread = calculate_mean_and_std(per_run)
        return pooled_mean, spread

    @property
    def total_completion_rate(self) -> float:
        """Percentage of all tasks that are completed."""
        if self.total_tasks > 0:
            return self.total_completed / self.total_tasks * 100
        return 0.0
def find_run_directories(self) -> List[str]:
    """Return the ``run_*`` directories to analyze, ordered by run number.

    ``self.target_path`` is either a single run directory (its last path
    component starts with ``run_``) or a directory whose direct children
    include ``run_*`` directories.

    Returns:
        Run directory paths sorted by ``self._extract_run_number`` and then
        by path, so directories with equal run numbers come out in a
        stable order instead of ``os.listdir`` order.

    Raises:
        FileNotFoundError: ``self.target_path`` does not exist.
        PermissionError: ``self.target_path`` cannot be listed.
        ValueError: no ``run_*`` directory was found.
    """
    if not os.path.exists(self.target_path):
        raise FileNotFoundError(f"Path '{self.target_path}' does not exist")

    # os.path.basename("results/run_1/") is "", which would make a run
    # directory given with a trailing separator look like a parent
    # directory. Strip trailing separators before looking at the name.
    separators = os.sep + (os.altsep or "")
    target = os.fspath(self.target_path)
    target = target.rstrip(separators) or target

    if os.path.basename(target).startswith("run_"):
        run_dirs = [target]
    else:
        try:
            entries = os.listdir(self.target_path)
        except PermissionError as err:
            raise PermissionError(
                f"No permission to access directory '{self.target_path}'"
            ) from err
        candidates = (
            (name, os.path.join(self.target_path, name)) for name in entries
        )
        run_dirs = [
            path
            for name, path in candidates
            if name.startswith("run_") and os.path.isdir(path)
        ]

    run_dirs.sort(key=lambda path: (self._extract_run_number(path), path))

    if not run_dirs:
        raise ValueError(f"No run directories found in '{self.target_path}'")
    return run_dirs
file_mtime, } return [info["file"] for info in task_groups.values()] def _is_task_completed(self, data: Dict) -> bool: """Check if a task is completed based on its data""" end_time = data.get("end_time", "") error = data.get("error", "") status = data.get("status", "") final_answer = data.get("final_boxed_answer", "") return ( (end_time != "" and error == "") or (status == "completed") or (final_answer != "" and error == "") ) def _is_judge_correct(self, judge_result) -> bool: """Determine if LLM judge result indicates correct answer""" if isinstance(judge_result, bool): return judge_result elif isinstance(judge_result, str): result_str = judge_result.upper() return ( result_str in CORRECT_RESULTS or any(pattern in result_str for pattern in SUCCESS_PATTERNS) or result_str.lower() in ["true", "1", "yes", "pass"] ) elif isinstance(judge_result, (int, float)): return judge_result > 0 elif isinstance(judge_result, dict): return judge_result.get("correct", False) or judge_result.get( "is_correct", False ) return False def _calculate_turns(self, data: Dict) -> int: """Calculate number of turns from task data (excluding system prompt)""" try: main_agent_history = data.get("main_agent_message_history", {}) message_history = main_agent_history.get("message_history", []) if not message_history: return 0 # Filter out system messages and count total messages, then divide by 2 # Turn count = (total messages excluding system) / 2 non_system_messages = [ msg for msg in message_history if msg.get("role") != "system" ] # Each turn consists of user + assistant, so divide by 2 turn_count = len(non_system_messages) // 2 return turn_count except (KeyError, TypeError, IndexError): return 0 def analyze_run_directory( self, run_dir: str, task_id_pattern: str ) -> Tuple[TaskStats, Dict[str, bool]]: """Analyze a single run directory and return statistics and task results Returns: Tuple[TaskStats, Dict[str, bool]]: Statistics and a mapping of task_id -> is_correct """ latest_files = 
self._get_latest_task_files(run_dir, task_id_pattern) # Use the correct total tasks stats = TaskStats(total=self.total_tasks_per_run) completed_files = [] # Track completed files for timing analysis task_results = {} # Track task_id -> is_correct mapping for json_file in latest_files: try: with open(json_file, "r", encoding="utf-8") as f: data = json.load(f) status = data.get("status", "") if status == "running": stats.running += 1 elif self._is_task_completed(data): stats.completed += 1 completed_files.append(json_file) # Track for timing analysis # Check judge result for completed tasks judge_result = data.get("final_judge_result", None) is_correct = judge_result is not None and self._is_judge_correct( judge_result ) if is_correct: stats.judge_correct += 1 # Extract task ID and store result filename = os.path.basename(json_file) task_id = self._extract_task_id(filename, task_id_pattern) if task_id: task_results[task_id] = is_correct # Check if final_boxed_answer contains "No \\boxed{} content found" final_boxed_answer = data.get("final_boxed_answer", "") if ( isinstance(final_boxed_answer, str) and "No \\boxed{} content found" in final_boxed_answer ): stats.no_boxed_found += 1 # Calculate turns for completed tasks turns = self._calculate_turns(data) if turns > 0: stats.total_turns += turns stats.completed_tasks_with_turns += 1 else: stats.failed += 1 except (json.JSONDecodeError, IOError) as e: # Skip files that are being written or corrupted if "Expecting value" in str(e) or "line 1 column 1" in str(e): continue # Skip corrupted/empty files print(f"Warning: Could not parse {json_file}: {e}") stats.failed += 1 except Exception as e: print(f"Warning: Unexpected error processing {json_file}: {e}") stats.failed += 1 # Store completed files in stats for timing analysis stats.completed_files = completed_files return stats, task_results def run_analysis( self, benchmark_name_std: str, task_id_pattern: str ) -> SummaryStats: """Run the complete analysis and return 
summary statistics""" self.run_dirs = self.find_run_directories() summary = SummaryStats() run_stats_list = [] # Store statistics for each run all_completed_files = [] # Collect all completed files for timing analysis all_task_results = {} # Collect task_id -> list of is_correct across all runs print() print("=" * 80) print(f"Analyzing benchmark progress for: {self.target_path}") print(f"Generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") print("=" * 80) # Analyze each run directory for run_dir in self.run_dirs: run_name = os.path.basename(run_dir) stats, task_results = self.analyze_run_directory(run_dir, task_id_pattern) if stats.total == 0: print(f"{run_name}: No task files found") print() continue # Display run statistics in a single line run_info = f"[{run_name}] Completed: {stats.completed} | Running: {stats.running} | Failed: {stats.failed}" # Add accuracy information if stats.completed > 0: run_info += f" | Accuracy: {stats.judge_correct}/{stats.completed} ({stats.judge_accuracy:.1f}%)" # Add average turns information (show even if some tasks are still running) if stats.completed_tasks_with_turns > 0: run_info += f" | Avg Turns: {stats.average_turns:.1f}" print(run_info) print() # Store run statistics for later display run_stats_list.append((run_name, stats)) # Collect completed files for timing analysis all_completed_files.extend(stats.completed_files) # Collect task results for Pass@n calculation for task_id, is_correct in task_results.items(): if task_id not in all_task_results: all_task_results[task_id] = [] all_task_results[task_id].append(is_correct) # Update summary statistics summary.total_tasks += stats.total summary.total_completed += stats.completed summary.total_running += stats.running summary.total_failed += stats.failed summary.total_judge_correct += stats.judge_correct summary.total_no_boxed_found += stats.no_boxed_found # Display summary after all runs are processed self._display_summary( summary, run_stats_list, 
all_completed_files, benchmark_name_std, all_task_results, ) return summary def _calculate_pass_at_n( self, all_task_results: Dict[str, List[bool]], total_tasks: int ) -> Tuple[int, float]: """Calculate Pass@n: number of tasks with at least one correct answer across all runs Returns: Tuple[int, float]: (pass_at_n_count, pass_at_n_percentage) """ if not all_task_results or total_tasks == 0: return 0, 0.0 pass_at_n_count = 0 for task_id, results in all_task_results.items(): # If at least one run got it correct, this task passes if any(results): pass_at_n_count += 1 pass_at_n_percentage = ( (pass_at_n_count / total_tasks * 100) if total_tasks > 0 else 0.0 ) return pass_at_n_count, pass_at_n_percentage def _display_summary( self, summary: SummaryStats, run_stats_list: List[Tuple[str, TaskStats]], completed_files: List[str], benchmark_name_std: str, all_task_results: Dict[str, List[bool]] = None, ): """Display summary statistics""" print("=" * 80) print("SUMMARY STATISTICS") print("=" * 80) print( f"Total Tasks: {summary.total_tasks} ({summary.total_completed} completed, {summary.total_running} running)" ) # Estimate completion time using overall progress rate if summary.total_tasks > 0 and summary.total_completed > 0: remaining_tasks = summary.total_tasks - summary.total_completed earliest_start = find_earliest_start_time(completed_files) latest_end = find_latest_end_time(completed_files) completion_estimate = estimate_completion_time( summary.total_tasks, summary.total_completed, completed_files ) print(f"Remaining Tasks: {remaining_tasks}") if earliest_start: elapsed_time = latest_end - earliest_start elapsed_minutes = elapsed_time.total_seconds() / 60 tasks_per_minute = ( summary.total_completed / elapsed_minutes if elapsed_minutes > 0 else 0 ) print(f"Elapsed Time: {elapsed_minutes:.1f} minutes") print(f"Completion Rate: {tasks_per_minute:.1f} tasks/minute") print(f"Estimated Time to Complete: {completion_estimate}") if summary.total_completed > 0: accuracy_bar = 
create_progress_bar(summary.total_judge_accuracy) print( f"Judge Accuracy: {summary.total_judge_correct}/{summary.total_completed} {accuracy_bar}" ) # Calculate and display overall average turns total_turns = sum(stats.total_turns for _, stats in run_stats_list) total_tasks_with_turns = sum( stats.completed_tasks_with_turns for _, stats in run_stats_list ) if total_tasks_with_turns > 0: overall_avg_turns = total_turns / total_tasks_with_turns print(f"Overall Average Turns: {overall_avg_turns:.1f}") # Display each run's correct percentage if run_stats_list: print() print("INDIVIDUAL RUN ACCURACIES:") for run_name, stats in run_stats_list: if stats.completed > 0: accuracy_bar = create_progress_bar(stats.judge_accuracy) print( f" {run_name}: {stats.judge_correct}/{stats.completed} {accuracy_bar}" ) else: print( f" {run_name}: {stats.judge_correct}/{stats.completed} (N/A)" ) # Display mean accuracy and standard deviation (Pass@1 Acc (Avg@n)) num_runs = len(run_stats_list) mean_acc, std_acc = summary.average_run_accuracy(run_stats_list) if mean_acc > 0: print() if num_runs > 1: print( f"Pass@1 Acc (Avg@{num_runs}): {mean_acc:.1f}% ± {std_acc:.1f}%" ) else: print(f"MEAN ACCURACY: {mean_acc:.1f}% ± {std_acc:.1f}%") # Display Pass@n if multiple runs if num_runs > 1 and all_task_results: # Calculate total unique tasks (use the first run's total as reference) first_run_total = ( run_stats_list[0][1].total if run_stats_list else summary.total_tasks ) pass_at_n_count, pass_at_n_percentage = self._calculate_pass_at_n( all_task_results, first_run_total ) pass_at_n_bar = create_progress_bar(pass_at_n_percentage) print( f"Pass@{num_runs}: {pass_at_n_count}/{first_run_total} {pass_at_n_bar}" ) # Display no boxed content found statistics if summary.total_completed > 0: print( f"No \\boxed{{}} content found: {summary.total_no_boxed_found}/{summary.total_completed} ({summary.total_no_boxed_found / summary.total_completed * 100:.1f}%)" ) print("=" * 80) print() # Save analysis results 
def _save_analysis_log(
    self,
    summary: SummaryStats,
    run_stats_list: List[Tuple[str, TaskStats]],
    completed_files: List[str],
    benchmark_name_std: str,
    all_task_results: Optional[Dict[str, List[bool]]] = None,
) -> None:
    """Write a plain-text copy of the progress report into the target directory.

    The file is named ``{LOG_FILE_PREFIX}{timestamp}.log`` and created
    inside ``self.target_path``. Saving is best effort: any failure is
    reported as a warning on stdout and never propagates to the caller.

    Args:
        summary: Totals over all runs.
        run_stats_list: ``(run_name, stats)`` pairs, one per analyzed run.
        completed_files: Completed task files, used for timing estimates.
        benchmark_name_std: Benchmark name shown in the report header.
        all_task_results: Optional mapping of task id to per-run
            correctness flags; needed for the Pass@n line.
    """
    try:
        # One timestamp for both the file name and the header
        now = datetime.now()
        log_filename = f"{LOG_FILE_PREFIX}{now.strftime(LOG_FILE_TIMESTAMP_FORMAT)}.log"
        log_path = os.path.join(self.target_path, log_filename)
        rule = "=" * 80 + "\n"

        output_buffer = StringIO()
        write = output_buffer.write

        # Header
        write(rule)
        write(f"{benchmark_name_std} Progress Analysis\n")
        write(f"Generated at: {now.strftime('%Y-%m-%d %H:%M:%S')}\n")
        write(f"Target Path: {self.target_path}\n")
        write(rule + "\n")

        # Per-run status
        for run_name, stats in run_stats_list:
            write(
                f"{run_name}: Status: {stats.completed} completed, {stats.running} running, {stats.failed} failed\n"
            )
            if stats.completed > 0:
                accuracy = stats.judge_correct / stats.completed * 100
                write(
                    f" Overall Accuracy: {stats.judge_correct}/{stats.completed} ({accuracy:.1f}%)\n"
                )
            else:
                write(
                    f" Overall Accuracy: {stats.judge_correct}/{stats.completed} (N/A)\n"
                )
            write("\n")

        # Summary
        write(rule)
        write("SUMMARY STATISTICS\n")
        write(rule)
        write(
            f"Total Tasks: {summary.total_tasks} ({summary.total_completed} completed, {summary.total_running} running)\n"
        )

        # Timing information
        if summary.total_tasks > 0 and summary.total_completed > 0:
            remaining_tasks = summary.total_tasks - summary.total_completed
            earliest_start = find_earliest_start_time(completed_files)
            latest_end = find_latest_end_time(completed_files)
            completion_estimate = estimate_completion_time(
                summary.total_tasks, summary.total_completed, completed_files
            )
            write(f"Remaining Tasks: {remaining_tasks}\n")
            # Report elapsed time only when both ends are known; subtracting
            # a missing end time would raise and abort the whole log.
            if earliest_start and latest_end:
                elapsed_minutes = (latest_end - earliest_start).total_seconds() / 60
                tasks_per_minute = (
                    summary.total_completed / elapsed_minutes
                    if elapsed_minutes > 0
                    else 0
                )
                write(f"Elapsed Time: {elapsed_minutes:.1f} minutes\n")
                write(f"Completion Rate: {tasks_per_minute:.1f} tasks/minute\n")
            write(f"Estimated Time to Complete: {completion_estimate}\n")

        if summary.total_completed > 0:
            accuracy = summary.total_judge_correct / summary.total_completed * 100
            write(
                f"Judge Accuracy: {summary.total_judge_correct}/{summary.total_completed} ({accuracy:.1f}%)\n"
            )

        # Individual run accuracies, Pass@1 (Avg@n) and Pass@n
        if run_stats_list:
            write("\nINDIVIDUAL RUN ACCURACIES:\n")
            for run_name, stats in run_stats_list:
                if stats.completed > 0:
                    accuracy = stats.judge_correct / stats.completed * 100
                    write(
                        f" {run_name}: {stats.judge_correct}/{stats.completed} ({accuracy:.1f}%)\n"
                    )
                else:
                    write(
                        f" {run_name}: {stats.judge_correct}/{stats.completed} (N/A)\n"
                    )

            num_runs = len(run_stats_list)
            mean_acc, std_acc = summary.average_run_accuracy(run_stats_list)
            if mean_acc > 0:
                if num_runs > 1:
                    write(
                        f"\nPass@1 Acc (Avg@{num_runs}): {mean_acc:.1f}% ± {std_acc:.1f}%\n"
                    )
                else:
                    write(f"\nMEAN ACCURACY: {mean_acc:.1f}% ± {std_acc:.1f}%\n")

            if num_runs > 1 and all_task_results:
                # The first run's total serves as the number of unique tasks
                first_run_total = run_stats_list[0][1].total
                pass_at_n_count, pass_at_n_percentage = self._calculate_pass_at_n(
                    all_task_results, first_run_total
                )
                write(
                    f"Pass@{num_runs}: {pass_at_n_count}/{first_run_total} ({pass_at_n_percentage:.1f}%)\n"
                )

        # Written once, at the end of the summary. The earlier duplicate of
        # this line (right after "Judge Accuracy") has been removed.
        if summary.total_completed > 0:
            no_boxed_percentage = (
                summary.total_no_boxed_found / summary.total_completed * 100
            )
            write(
                f"No \\boxed{{}} content found: {summary.total_no_boxed_found}/{summary.total_completed} ({no_boxed_percentage:.1f}%)\n"
            )

        write(rule)

        with open(log_path, "w", encoding="utf-8") as f:
            f.write(output_buffer.getvalue())
        output_buffer.close()

        print(f"Analysis results saved to: {log_path}")
    except Exception as e:
        # Deliberate best effort: a failed log write must not break the
        # analysis, whose results were already printed to stdout.
        print(f"Warning: Could not save analysis log: {e}")
def _update_difficulty_stats(
    self, stats: GAIATaskStats, task_id: str, is_correct: bool
) -> None:
    """Record one completed task in the counters of its difficulty level.

    Tasks without an entry in ``self.task_difficulty_map`` are ignored.
    For known tasks the ``level<N>_completed`` counter is incremented, and
    ``level<N>_correct`` as well when ``is_correct`` is truthy.
    """
    counter_prefix = {1: "level1", 2: "level2", 3: "level3"}.get(
        self.task_difficulty_map.get(task_id)
    )
    if counter_prefix is None:
        # Unknown task, or a level outside 1-3: nothing to update
        return

    completed_attr = f"{counter_prefix}_completed"
    setattr(stats, completed_attr, getattr(stats, completed_attr) + 1)

    if is_correct:
        correct_attr = f"{counter_prefix}_correct"
        setattr(stats, correct_attr, getattr(stats, correct_attr) + 1)
is_correct) task_results[task_id] = is_correct # Calculate turns for completed tasks turns = self._calculate_turns(data) if turns > 0: stats.total_turns += turns stats.completed_tasks_with_turns += 1 else: stats.failed += 1 except Exception as e: print(f"Warning: Could not process {json_file}: {e}") stats.failed += 1 stats.completed_files = completed_files return stats, task_results def run_analysis( self, benchmark_name_std: str, task_id_pattern: str ) -> GAIASummaryStats: """Run the complete analysis and return summary statistics""" self.run_dirs = self.find_run_directories() summary = GAIASummaryStats() run_stats_list = [] # Store statistics for each run all_completed_files = [] # Collect all completed files for timing analysis all_task_results = {} # Collect task_id -> list of is_correct across all runs print() print("=" * 80) print(f"Analyzing benchmark progress for: {self.target_path}") print(f"Generated at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") print("=" * 80) # Analyze each run directory for run_dir in self.run_dirs: run_name = os.path.basename(run_dir) stats, task_results = self.analyze_run_directory(run_dir, task_id_pattern) if stats.total == 0: print(f"{run_name}: No task files found") print() continue # Display run statistics in a single line run_info = f"[{run_name}] Completed: {stats.completed} | Running: {stats.running} | Failed: {stats.failed}" # Add accuracy information if stats.completed > 0: run_info += f" | Accuracy: {stats.judge_correct}/{stats.completed} ({stats.judge_accuracy:.1f}%)" # Add average turns information (show even if some tasks are still running) if stats.completed_tasks_with_turns > 0: run_info += f" | Avg Turns: {stats.average_turns:.1f}" print(run_info) print() # Store run statistics for later display run_stats_list.append((run_name, stats)) # Collect completed files for timing analysis all_completed_files.extend(stats.completed_files) # Collect task results for Pass@n calculation for task_id, is_correct in 
def _update_summary_stats(
    self, summary: GAIASummaryStats, stats: GAIATaskStats
) -> None:
    """Add the counters of a single run onto the cross-run summary, in place."""
    # (summary attribute, per-run attribute) for the general counters
    counter_pairs = [
        ("total_tasks", "total"),
        ("total_completed", "completed"),
        ("total_running", "running"),
        ("total_failed", "failed"),
        ("total_judge_correct", "judge_correct"),
        ("total_no_boxed_found", "no_boxed_found"),
    ]

    # Difficulty level counters carry the same name on both objects
    for level in (1, 2, 3):
        for suffix in ("completed", "correct"):
            shared_name = f"level{level}_{suffix}"
            counter_pairs.append((shared_name, shared_name))

    for summary_attr, run_attr in counter_pairs:
        setattr(
            summary,
            summary_attr,
            getattr(summary, summary_attr) + getattr(stats, run_attr),
        )
{summary.total_running} running)" ) print(f"Remaining Tasks to Complete: {remaining_tasks}") if earliest_start: elapsed_time = last_end - earliest_start elapsed_minutes = elapsed_time.total_seconds() / 60 overall_rate = ( summary.total_completed / elapsed_minutes if elapsed_minutes > 0 else 0 ) print(f"Elapsed Time: {elapsed_minutes:.1f} minutes") print(f"Completion Rate: {overall_rate:.2f} tasks/minute") print(f"Estimated Time to Complete: {completion_estimate}") # Display each run's correct percentage if run_stats_list: print() print("INDIVIDUAL RUN ACCURACIES:") for run_name, stats in run_stats_list: if stats.completed > 0: accuracy_bar = create_progress_bar(stats.judge_accuracy) print( f" {run_name}: {stats.judge_correct}/{stats.completed} {accuracy_bar}" ) # Add difficulty level information for each run if ( stats.level1_completed > 0 or stats.level2_completed > 0 or stats.level3_completed > 0 ): # Calculate total expected tasks for each difficulty level total_level1 = sum( 1 for level in self.task_difficulty_map.values() if level == 1 ) total_level2 = sum( 1 for level in self.task_difficulty_map.values() if level == 2 ) total_level3 = sum( 1 for level in self.task_difficulty_map.values() if level == 3 ) difficulty_info = ( f" L1: {stats.level1_correct}/{stats.level1_completed}/{total_level1} ({stats.level1_accuracy:.1f}%) | " f"L2: {stats.level2_correct}/{stats.level2_completed}/{total_level2} ({stats.level2_accuracy:.1f}%) | " f"L3: {stats.level3_correct}/{stats.level3_completed}/{total_level3} ({stats.level3_accuracy:.1f}%)" ) print(f" {difficulty_info}") print() else: print( f" {run_name}: {stats.judge_correct}/{stats.completed} (N/A)" ) # Display mean accuracy and standard deviation (Pass@1 Acc (Avg@n)) num_runs = len(run_stats_list) mean_acc, std_acc = summary.average_run_accuracy(run_stats_list) if mean_acc > 0: print() if num_runs > 1: print( f"Pass@1 Acc (Avg@{num_runs}): {mean_acc:.1f}% ± {std_acc:.1f}%" ) else: print(f"MEAN ACCURACY: {mean_acc:.1f}% 
± {std_acc:.1f}%") # Display Pass@n if multiple runs if num_runs > 1 and all_task_results: # Use the first run's total as reference first_run_total = ( run_stats_list[0][1].total if run_stats_list else summary.total_tasks ) pass_at_n_count, pass_at_n_percentage = self._calculate_pass_at_n( all_task_results, first_run_total ) pass_at_n_bar = create_progress_bar(pass_at_n_percentage) print( f"Pass@{num_runs}: {pass_at_n_count}/{first_run_total} {pass_at_n_bar}" ) # Display no boxed content found statistics if summary.total_completed > 0: print( f"No \\boxed{{}} content found: {summary.total_no_boxed_found}/{summary.total_completed} ({summary.total_no_boxed_found / summary.total_completed * 100:.1f}%)" ) # Display overall judge accuracy after individual runs if summary.total_completed > 0: print() accuracy_bar = create_progress_bar(summary.total_judge_accuracy) print( f"OVERALL JUDGE ACCURACY: {summary.total_judge_correct}/{summary.total_completed} {accuracy_bar}" ) # Calculate and display overall average turns total_turns = sum(stats.total_turns for _, stats in run_stats_list) total_tasks_with_turns = sum( stats.completed_tasks_with_turns for _, stats in run_stats_list ) if total_tasks_with_turns > 0: overall_avg_turns = total_turns / total_tasks_with_turns print(f"OVERALL AVERAGE TURNS: {overall_avg_turns:.1f}") # Display difficulty level summary if available if ( summary.level1_completed > 0 or summary.level2_completed > 0 or summary.level3_completed > 0 ): print() print("DIFFICULTY LEVEL SUMMARY:") # Calculate total expected tasks for each difficulty level total_level1 = sum( 1 for level in self.task_difficulty_map.values() if level == 1 ) total_level2 = sum( 1 for level in self.task_difficulty_map.values() if level == 2 ) total_level3 = sum( 1 for level in self.task_difficulty_map.values() if level == 3 ) print( f" L1: {summary.level1_correct}/{summary.level1_completed}/{total_level1} ({summary.level1_accuracy:.1f}%) | L2: 
def _task_worker(task_dict, cfg_dict, evaluator_kwargs):
    """
    Run one benchmark task inside a worker process and return its result.

    The function is submitted to a ProcessPoolExecutor, so it has to live at
    module level and exchanges only plain containers with the parent: the
    task, config and evaluator settings arrive as dicts, and the result is
    returned as a dict built with ``asdict``.
    """
    import asyncio

    from omegaconf import OmegaConf

    # Rebuild the config object from its plain-container form
    cfg = OmegaConf.create(cfg_dict)

    # Rebuild the task; only file_path and metadata are optional
    task = BenchmarkTask(
        task_id=task_dict["task_id"],
        task_question=task_dict["task_question"],
        ground_truth=task_dict["ground_truth"],
        file_path=task_dict.get("file_path"),
        metadata=task_dict.get("metadata", {}),
    )

    # Evaluator settings: two required entries, the rest fall back to defaults
    evaluator_settings = {
        "data_dir": evaluator_kwargs["data_dir"],
        "benchmark_name": evaluator_kwargs["benchmark_name"],
        "cfg": cfg,
        "metadata_file": evaluator_kwargs.get("metadata_file", "metadata.jsonl"),
        "task_id_field": evaluator_kwargs.get("task_id_field", "task_id"),
        "question_field": evaluator_kwargs.get("question_field", "task_question"),
        "ground_truth_field": evaluator_kwargs.get(
            "ground_truth_field", "ground_truth"
        ),
        "file_name_field": evaluator_kwargs.get("file_name_field"),
    }
    evaluator = GenericEvaluator(**evaluator_settings)

    # Each call drives the task on its own, freshly created event loop
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)

    # A no-op handler silences asyncio's internal reports (for example
    # "Task exception was never retrieved") to keep worker output clean.
    event_loop.set_exception_handler(lambda _loop, _context: None)

    try:
        outcome = event_loop.run_until_complete(evaluator.run_single_task(task))
        # Dataclass -> dict so the result can be sent back to the parent
        serialized = asdict(outcome)
    finally:
        event_loop.close()
    return serialized
log_file_path: Optional[str] = None # Pass@K support fields attempts: List[Dict[str, Any]] = field(default_factory=list) # Store all attempts pass_at_k_success: bool = False # Whether task passed using pass@k evaluation k_value: int = 1 # The k value used for this evaluation class BenchmarkEvaluator(ABC): """Abstract base class for benchmark evaluators""" def __init__(self, data_dir: str, benchmark_name: str, cfg: DictConfig): """ Initialize benchmark evaluator Args: data_dir: Path to benchmark data directory benchmark_name: Name of the benchmark cfg: The Hydra configuration object """ self.data_dir = Path(data_dir) self.benchmark_name = benchmark_name self.cfg = cfg self.pass_at_k = cfg.benchmark.execution.get("pass_at_k", 1) self.tasks: List[BenchmarkTask] = [] self.results: List[BenchmarkResult] = [] # Format error tracking and retry configuration # Read from agent config as it's part of context management self.context_compress_limit = cfg.agent.get("context_compress_limit", 0) # Get LLM provider and model from the config object self.llm_provider = cfg.llm.provider self.llm_model = cfg.llm.model_name # Initialize pipeline components print("Initializing pipeline components...") ( self.main_agent_tool_manager, self.sub_agent_tool_managers, self.output_formatter, ) = create_pipeline_components(cfg) print( f"Pipeline components initialized successfully! 
async def run_single_task(self, task: BenchmarkTask) -> BenchmarkResult:
    """
    Run inference for a single benchmark task with pass@k support

    For each of up to ``self.pass_at_k`` attempts the method:
      1. looks in the current log directory for a log of that attempt and, if
         one exists, reuses its boxed answer / judge result (resume support);
      2. otherwise (or when the stored answer is FORMAT_ERROR_MESSAGE) runs the
         pipeline, retrying on format errors while the retry counter does not
         exceed ``self.context_compress_limit`` and feeding the accumulated
         failure summaries back into the task description;
      3. judges the answer with verify_answer_for_datasets when a ground truth
         is available and no judge result was loaded from the log;
      4. stops early as soon as one attempt is judged "CORRECT".

    Exceptions raised while processing are caught and recorded on the result
    (status "failed") instead of being propagated.

    Args:
        task: BenchmarkTask object

    Returns:
        BenchmarkResult object; ``attempts`` holds one dict per attempt and
        ``final_judge_result`` is "PASS_AT_K_SUCCESS", "PASS_AT_K_FAILED" or
        "TEST_SET_MODE" (ground truth is None)
    """
    print(f"Processing task {task.task_id} with pass@{self.pass_at_k}")

    result = BenchmarkResult(
        task_id=task.task_id,
        task_question=task.task_question,
        ground_truth=task.ground_truth,
        file_path=task.file_path,
        model_boxed_answer="",
        status="pending",
        metadata=task.metadata.copy(),
        k_value=self.pass_at_k,
    )

    logs_dir = self.get_log_dir()
    found_correct_answer = False

    # Print debug info about log directory
    print(f" Current log directory: {logs_dir}")

    try:
        # Prepare task
        task_description, task_file_path = self.prepare_task_description(task)

        # Run up to k attempts (with early stopping when correct answer found)
        for attempt in range(1, self.pass_at_k + 1):
            print(f" Attempt {attempt}/{self.pass_at_k} for task {task.task_id}")
            # Index of the next format retry to run; advanced below when an
            # existing log shows earlier retries already happened
            format_retry_count = 0

            # Check if log file exists for this specific attempt in current directory
            log_pattern = f"task_{task.task_id}_attempt-{attempt}_*.json"
            matching_logs = []

            # Search only in current log directory
            if logs_dir.exists():
                dir_logs = sorted(list(logs_dir.glob(log_pattern)))
                if dir_logs:
                    matching_logs.extend(dir_logs)

            if matching_logs:
                # Sort by timestamp in filename to get the most recent
                def extract_timestamp(file_path):
                    """Return the sort key for a log file (timestamp from its name)."""
                    filename = file_path.name
                    # Extract timestamp from filename like: task_xxx_attempt-1_format-retry-0_2025-08-13-10-13-20.json
                    # The timestamp is the last part before .json
                    if "_" in filename and filename.endswith(".json"):
                        timestamp_part = filename.split("_")[-1].replace(
                            ".json", ""
                        )
                        # Convert timestamp to datetime for proper sorting
                        from datetime import datetime

                        return datetime.strptime(
                            timestamp_part, "%Y-%m-%d-%H-%M-%S"
                        )
                    # NOTE(review): this fallback key is a str while the branch
                    # above returns a datetime; a mix of both cannot be ordered
                    return filename

                matching_logs = sorted(matching_logs, key=extract_timestamp)

            # Per-attempt record; appended to result.attempts at the end of the loop body
            attempt_result = {
                "attempt_number": attempt,
                "model_boxed_answer": "",
                "status": "pending",
                "log_file_path": None,
                "final_judge_result": None,
                "judge_type": None,
                "is_correct": False,
            }

            # Try to load existing result for this attempt
            if matching_logs:
                # Most recent log according to the sort above
                log_file = matching_logs[-1]
                attempt_result["log_file_path"] = str(log_file)
                print(
                    f" Found existing log for attempt {attempt}: {log_file.name}"
                )
                # The retry index is encoded in the file name ("...format-retry-N_...")
                match = re.search(r"retry-(\d+)", os.path.basename(str(log_file)))
                if match:
                    format_retry_count = int(match.group(1))
                else:
                    # Raised inside the outer try, so it marks the whole task as failed
                    raise ValueError(
                        f"Failed to extract retry number from log file: {log_file}"
                    )

                try:
                    with open(log_file) as f:
                        log_data = json.loads(f.read())
                        # A log with status "success" means that retry index is
                        # used up, so a re-run has to start at the next index
                        if log_data.get("status") == "success":
                            format_retry_count += 1
                        if log_data.get("final_boxed_answer"):
                            attempt_result["model_boxed_answer"] = log_data[
                                "final_boxed_answer"
                            ]
                            attempt_result["status"] = log_data.get("status")
                            # Check if we already have judge result in log
                            if log_data.get("final_judge_result"):
                                attempt_result["final_judge_result"] = log_data[
                                    "final_judge_result"
                                ]
                                attempt_result["judge_type"] = log_data.get(
                                    "judge_type", ""
                                )
                                attempt_result["is_correct"] = (
                                    log_data["final_judge_result"] == "CORRECT"
                                )
                            # Load evaluation details if available
                            if log_data.get("eval_details"):
                                attempt_result["eval_details"] = log_data[
                                    "eval_details"
                                ]
                            print(
                                f" Loaded existing result: {attempt_result['model_boxed_answer']}"
                            )
                except Exception as e:
                    # Unreadable log: fall through and run inference again
                    print(f" Error loading log file {log_file}: {e}")

            # Run inference if no existing result or if we have a format error
            if (
                not attempt_result["model_boxed_answer"]
                or attempt_result["model_boxed_answer"] == FORMAT_ERROR_MESSAGE
            ):
                # Try to get a valid response with format retry
                print(f"TASK ID: {task.task_id}, ATTEMPT: {attempt}")
                max_format_retries = self.context_compress_limit

                # Track accumulated failure experiences for this attempt
                # Start with the original task description
                current_task_description = task_description
                failure_experiences = []

                # Resume: Recover failure experiences from previous retry logs
                if format_retry_count > 0 and logs_dir.exists():
                    print(
                        f" Resuming from retry {format_retry_count}, recovering previous failure experiences..."
                    )
                    for prev_retry in range(format_retry_count):
                        prev_log_pattern = f"task_{task.task_id}_attempt-{attempt}_format-retry-{prev_retry}_*.json"
                        prev_logs = sorted(list(logs_dir.glob(prev_log_pattern)))
                        if prev_logs:
                            prev_log_file = prev_logs[-1]  # Get the latest one
                            try:
                                with open(
                                    prev_log_file, "r", encoding="utf-8"
                                ) as f:
                                    prev_log_data = json.load(f)
                                # Extract failure experience from trace_data
                                trace_data = prev_log_data.get("trace_data", {})
                                prev_failure_exp = trace_data.get(
                                    "failure_experience_summary"
                                )
                                if prev_failure_exp:
                                    failure_experiences.append(prev_failure_exp)
                                    print(
                                        f" Recovered failure experience from retry {prev_retry}"
                                    )
                            except Exception as e:
                                print(
                                    f" Warning: Failed to load previous log {prev_log_file}: {e}"
                                )

                    # Rebuild enhanced task description with recovered failure experiences
                    if failure_experiences:
                        current_task_description += FAILURE_EXPERIENCE_HEADER
                        for idx, exp in enumerate(failure_experiences, 1):
                            current_task_description += (
                                FAILURE_EXPERIENCE_ITEM.format(
                                    attempt_number=idx,
                                    failure_summary=exp,
                                )
                            )
                        current_task_description += FAILURE_EXPERIENCE_FOOTER
                        print(
                            f" Recovered {len(failure_experiences)} failure experience(s) from previous retries"
                        )

                # If resuming pushed the counter past the limit the loop body
                # never runs and the loaded answer is kept as is
                while format_retry_count <= max_format_retries:
                    try:
                        # Check if this is the final retry (no more chances after this)
                        is_final_retry = format_retry_count == max_format_retries

                        (
                            response,
                            final_boxed_answer,
                            log_file_path,
                            failure_experience_summary,
                        ) = await execute_task_pipeline(
                            cfg=self.cfg,
                            task_id=f"{task.task_id}_attempt-{attempt}_format-retry-{format_retry_count}",
                            task_file_name=task_file_path,
                            task_description=current_task_description,
                            main_agent_tool_manager=self.main_agent_tool_manager,
                            sub_agent_tool_managers=self.sub_agent_tool_managers,
                            output_formatter=self.output_formatter,
                            ground_truth=task.ground_truth,
                            log_dir=str(self.get_log_dir()),
                            is_final_retry=is_final_retry,
                        )

                        attempt_result["model_boxed_answer"] = (
                            final_boxed_answer if final_boxed_answer else ""
                        )
                        attempt_result["log_file_path"] = log_file_path

                        # Check for format error
                        if (
                            attempt_result["model_boxed_answer"]
                            == FORMAT_ERROR_MESSAGE
                        ):
                            format_retry_count += 1
                            if format_retry_count <= max_format_retries:
                                # Use the model-generated failure experience summary
                                print(
                                    f" Format error detected, using model-generated failure summary for retry {format_retry_count}..."
                                )
                                if failure_experience_summary:
                                    failure_experiences.append(
                                        failure_experience_summary
                                    )
                                    # Build enhanced task description with accumulated failure experiences
                                    # Start fresh from original task_description each time
                                    current_task_description = task_description
                                    current_task_description += (
                                        FAILURE_EXPERIENCE_HEADER
                                    )
                                    for idx, exp in enumerate(
                                        failure_experiences, 1
                                    ):
                                        current_task_description += (
                                            FAILURE_EXPERIENCE_ITEM.format(
                                                attempt_number=idx,
                                                failure_summary=exp,
                                            )
                                        )
                                    current_task_description += (
                                        FAILURE_EXPERIENCE_FOOTER
                                    )
                                    print(
                                        f" Enhanced task description with {len(failure_experiences)} failure experience(s)"
                                    )
                                else:
                                    print(
                                        " No failure experience summary generated, retrying without enhancement..."
                                    )
                                continue
                            else:
                                # Exceeded format retry limit
                                # The answer is rewritten with a suffix, so it no
                                # longer equals FORMAT_ERROR_MESSAGE exactly
                                attempt_result["status"] = "success"
                                attempt_result["model_boxed_answer"] = (
                                    f"{FORMAT_ERROR_MESSAGE} (after {max_format_retries} retries)"
                                )
                                attempt_result["error_message"] = (
                                    f"Exceeded format error retry limit ({max_format_retries})"
                                )
                                break
                        else:
                            # Got valid response, success
                            attempt_result["status"] = "success"
                            break

                    except Exception as e:
                        # A pipeline failure ends this attempt; no further format retries
                        attempt_result["status"] = "failed"
                        attempt_result["error_message"] = str(e)
                        print(
                            f" Error in attempt {attempt}, format retry {format_retry_count}: {e}"
                        )
                        break

            # Perform LLM verification if we have an answer and haven't verified yet
            if (
                attempt_result["model_boxed_answer"]
                and attempt_result["final_judge_result"] is None
                and task.ground_truth is not None
            ):
                print(f" Verifying answer for attempt {attempt}...")
                try:
                    (
                        evaluation_result,
                        judge_type,
                        eval_details,
                    ) = await verify_answer_for_datasets(
                        benchmark_name=self.benchmark_name,
                        question=task.task_question,
                        target=task.ground_truth,
                        predicted_answer=attempt_result["model_boxed_answer"],
                        metadata=task.metadata,
                    )
                    attempt_result["final_judge_result"] = evaluation_result
                    attempt_result["judge_type"] = judge_type
                    attempt_result["is_correct"] = evaluation_result == "CORRECT"

                    # Store evaluation details (e.g., for DeepSearchQA metrics)
                    if eval_details:
                        attempt_result["eval_details"] = eval_details

                    # Update the log file with verification result
                    if attempt_result["log_file_path"]:
                        self._update_log_file_with_evaluation(
                            attempt_result["model_boxed_answer"],
                            attempt_result["log_file_path"],
                            evaluation_result,
                            judge_type,
                            eval_details,  # Pass eval_details to save in log file
                        )

                    if attempt_result["is_correct"]:
                        print(f" ✅ Attempt {attempt}: CORRECT!")
                        found_correct_answer = True
                    else:
                        print(
                            f" ❌ Attempt {attempt}: INCORRECT ({evaluation_result})"
                        )
                except Exception as e:
                    # A judge failure is recorded on the attempt, which then counts as not correct
                    print(f" Error verifying attempt {attempt}: {e}")
                    attempt_result["final_judge_result"] = "ERROR"
                    attempt_result["judge_type"] = "error"
                    attempt_result["is_correct"] = False
            elif attempt_result["is_correct"]:
                # Judge result came from the existing log
                print(f" ✅ Attempt {attempt}: CORRECT (cached)")
                found_correct_answer = True
            elif attempt_result["final_judge_result"]:
                print(
                    f" ❌ Attempt {attempt}: INCORRECT (cached: {attempt_result['final_judge_result']})"
                )
            else:
                print(f" ⚠️ Attempt {attempt}: No valid answer to verify")

            result.attempts.append(attempt_result)

            # Update main result with the first successful attempt or best attempt so far
            if attempt == 1 or (
                attempt_result["status"] == "success"
                and not result.model_boxed_answer
            ):
                result.model_boxed_answer = attempt_result["model_boxed_answer"]
                result.log_file_path = attempt_result["log_file_path"]
                result.status = attempt_result["status"]
                if "error_message" in attempt_result:
                    result.error_message = attempt_result["error_message"]

            # Early stopping: if we found a correct answer, we can stop
            if found_correct_answer:
                print(
                    f" 🎯 Found correct answer! Stopping early after {attempt} attempts."
                )
                break

    except Exception as e:
        # Anything not handled per attempt marks the whole task as failed
        result.error_message = str(e)
        result.status = "failed"
        print(f"Error processing task {task.task_id}: {e}")

    finally:
        result.pass_at_k_success = found_correct_answer

        # Set main result judge result based on pass@k outcome
        if found_correct_answer:
            result.final_judge_result = "PASS_AT_K_SUCCESS"
            result.judge_type = "pass_at_k"
        else:
            if result.ground_truth is None:
                result.final_judge_result = "TEST_SET_MODE"
            else:
                result.final_judge_result = "PASS_AT_K_FAILED"
            result.judge_type = "pass_at_k"

    print(f"Task {task.task_id} completed with {len(result.attempts)} attempts")
    if result.ground_truth is not None:
        print(
            f" Pass@{self.pass_at_k} result: {'✅ SUCCESS' if found_correct_answer else '❌ FAILED'}"
        )

    gc.collect()
    return result
def run_parallel_inference(
    self, tasks: List[BenchmarkTask], max_concurrent: int = 3
) -> List[BenchmarkResult]:
    """
    Run inference on multiple tasks in parallel using multiprocessing

    Every task is executed by ``_task_worker`` in a ProcessPoolExecutor. A task
    whose worker raises is reported as a BenchmarkResult with status "failed".
    On KeyboardInterrupt pending futures are cancelled, worker processes are
    terminated (then killed) and the interrupt is re-raised.

    Args:
        tasks: Tasks to run
        max_concurrent: Number of worker processes

    Returns:
        Results in the order of ``tasks``; also stored in ``self.results``
    """
    print(
        f"Running inference on {len(tasks)} tasks with max_concurrent={max_concurrent} (multiprocessing)"
    )

    # Serialize config
    cfg_dict = OmegaConf.to_container(self.cfg, resolve=True)

    # Shuffle tasks to avoid order bias and improve balancing
    shuffled_tasks = tasks.copy()
    random.shuffle(shuffled_tasks)

    # Prepare evaluator kwargs for worker processes
    evaluator_kwargs = {
        "data_dir": str(self.data_dir),
        "benchmark_name": self.benchmark_name,
    }

    # Add GenericEvaluator specific kwargs if available
    if hasattr(self, "metadata_file"):
        evaluator_kwargs["metadata_file"] = str(self.metadata_file.name)
    for field_name in (
        "task_id_field",
        "question_field",
        "ground_truth_field",
        "file_name_field",
    ):
        if hasattr(self, field_name):
            evaluator_kwargs[field_name] = getattr(self, field_name)

    # Prepare serializable arguments for worker processes
    worker_args = [
        (
            {
                "task_id": task.task_id,
                "task_question": task.task_question,
                "ground_truth": task.ground_truth,
                "file_path": task.file_path,
                "metadata": task.metadata,
            },
            cfg_dict,
            evaluator_kwargs,
        )
        for task in shuffled_tasks
    ]

    # Original task objects by id, used to build error results
    task_by_id = {task.task_id: task for task in shuffled_tasks}
    results_dict = {}  # Store results by task_id to maintain order

    executor = None
    # Bound before the try so the interrupt handler can always iterate it
    future_to_task_id = {}
    try:
        # Use ProcessPoolExecutor for true parallelism (bypasses GIL)
        executor = ProcessPoolExecutor(max_workers=max_concurrent)

        # Submit all tasks
        for args in worker_args:
            future = executor.submit(_task_worker, *args)
            future_to_task_id[future] = args[0]["task_id"]

        # Collect results as they complete
        from concurrent.futures import as_completed

        for future in as_completed(future_to_task_id):
            task_id = future_to_task_id[future]
            try:
                # Reconstruct BenchmarkResult from dict
                results_dict[task_id] = BenchmarkResult(**future.result())
                completed = len(results_dict)
                print(
                    f"Progress: {completed}/{len(shuffled_tasks)} tasks completed"
                )
            except Exception as e:
                print(f"Exception in task {task_id}: {e}")
                # Get original task for error result
                original_task = task_by_id[task_id]
                results_dict[task_id] = BenchmarkResult(
                    task_id=original_task.task_id,
                    task_question=original_task.task_question,
                    ground_truth=original_task.ground_truth,
                    file_path=original_task.file_path,
                    model_boxed_answer="",
                    status="failed",
                    metadata=original_task.metadata.copy(),
                    error_message=str(e),
                )

    except KeyboardInterrupt:
        print("\n⚠️ Received interrupt signal, shutting down gracefully...")
        if executor:
            print(" Cancelling pending tasks and terminating worker processes...")
            # Cancel all pending futures
            for future in future_to_task_id:
                future.cancel()

            # Forcefully terminate worker processes.
            # `_processes` is a private attribute that the executor's own
            # management thread mutates (and may reset to None), so work on a
            # snapshot instead of iterating the live mapping.
            workers = list((getattr(executor, "_processes", None) or {}).items())
            if workers:
                for pid, process in workers:
                    try:
                        if process.is_alive():
                            print(f" Terminating worker process {pid}...")
                            process.terminate()
                    except Exception as e:
                        print(
                            f" Warning: Failed to terminate process {pid}: {e}"
                        )

                # Give processes a short time to terminate gracefully
                import time

                time.sleep(0.5)

                # Force kill any remaining processes
                for pid, process in workers:
                    try:
                        if process.is_alive():
                            print(f" Force killing worker process {pid}...")
                            process.kill()
                    except Exception as e:
                        print(f" Warning: Failed to kill process {pid}: {e}")

            # Shutdown executor without waiting for pending tasks
            executor.shutdown(wait=False, cancel_futures=True)
            print(" Shutdown complete.")
        raise
    finally:
        # Ensure executor is properly cleaned up
        if executor:
            try:
                executor.shutdown(wait=True)
            except Exception as e:
                # Best-effort cleanup: report instead of masking the real outcome
                print(f" Warning: Executor cleanup failed: {e}")

    # Reconstruct results, then sort them back into the original task order
    processed_results = [results_dict[task.task_id] for task in shuffled_tasks]
    task_id_to_index = {task.task_id: i for i, task in enumerate(tasks)}
    processed_results.sort(
        key=lambda r: task_id_to_index.get(r.task_id, len(tasks))
    )

    self.results = processed_results
    return processed_results
) correct_count = 0 total_count = 0 for result in self.results: total_count += 1 # Display task results print(f"\nTask {result.task_id}:") print(f" Attempts: {len(result.attempts)}") if result.ground_truth is not None: print( f" Pass@{self.pass_at_k}: {'✅ SUCCESS' if result.pass_at_k_success else '❌ FAILED'}" ) print(" " + "=" * 50) print(f" Reference: {result.ground_truth}") print(" " + "=" * 50) if result.pass_at_k_success: correct_count += 1 pass_at_k_accuracy = correct_count / total_count if total_count > 0 else 0.0 print(f"\nPass@{self.pass_at_k} Final Results:") print(f"Tasks passed: {correct_count}/{total_count}") print(f"Pass@{self.pass_at_k} Accuracy: {pass_at_k_accuracy:.2%}") return pass_at_k_accuracy def _update_log_file_with_evaluation( self, model_boxed_answer: str, log_file_path: str, evaluation_result: str, judge_type: str, eval_details: Optional[Dict[str, Any]] = None, ): """Helper method to update log file with evaluation result""" try: log_file = Path(log_file_path) # Read existing data with open(log_file, "r", encoding="utf-8") as f: log_data = json.load(f) # Update with evaluation result log_data["final_boxed_answer"] = model_boxed_answer log_data["final_judge_result"] = evaluation_result log_data["judge_type"] = judge_type # Store evaluation details (e.g., for DeepSearchQA metrics) if eval_details: log_data["eval_details"] = eval_details # Write to a temporary file and then atomically replace temp_log_file = log_file.with_suffix(f"{log_file.suffix}.tmp") with open(temp_log_file, "w", encoding="utf-8") as f: json.dump(log_data, f, indent=2, ensure_ascii=False) os.replace(temp_log_file, log_file) print(f" Updated log file {log_file.name} with evaluation result.") except Exception as e: print(f" Error updating log file {log_file_path}: {e}") class GenericEvaluator(BenchmarkEvaluator): """Generic benchmark evaluator for JSONL format""" def __init__( self, data_dir: str, benchmark_name: str, cfg: DictConfig, metadata_file: str = "metadata.jsonl", 
def load_tasks(self, limit: Optional[int] = None) -> List[BenchmarkTask]:
    """
    Read the metadata JSONL file and build the task list.

    Each line is parsed as one JSON object. Lines that cannot be turned into a
    task are reported with a warning and skipped. The result is also stored in
    ``self.tasks``.

    Args:
        limit: Number of leading lines to read (None for all)

    Returns:
        List of BenchmarkTask objects

    Raises:
        FileNotFoundError: If the metadata file does not exist
    """
    print(f"Loading tasks from {self.metadata_file}")

    if not self.metadata_file.exists():
        raise FileNotFoundError(f"Metadata file not found: {self.metadata_file}")

    # Fields mapped onto dedicated task attributes; everything else is metadata
    reserved_fields = [
        self.task_id_field,
        self.question_field,
        self.ground_truth_field,
        self.file_name_field,
    ]

    loaded: List[BenchmarkTask] = []
    with open(self.metadata_file, "r", encoding="utf-8") as handle:
        for index, raw_line in enumerate(handle):
            if limit and index >= limit:
                break

            try:
                record = json.loads(raw_line.strip())

                # The attached file is optional
                has_file = self.file_name_field and self.file_name_field in record
                attachment = record[self.file_name_field] if has_file else None

                extras = {}
                for key, value in record.items():
                    if key not in reserved_fields:
                        extras[key] = value

                loaded.append(
                    BenchmarkTask(
                        task_id=record[self.task_id_field],
                        task_question=record[self.question_field],
                        ground_truth=record[self.ground_truth_field],
                        file_path=attachment,
                        metadata=extras,
                    )
                )
            except Exception as e:
                print(f"Warning: Failed to parse line {index + 1}: {e}")
                continue

    gc.collect()
    self.tasks = loaded
    print(f"Loaded {len(loaded)} tasks")
    return loaded
run_evaluation(self) -> float: """ Run the full benchmark evaluation process """ print(f"Starting evaluation for benchmark: {self.benchmark_name}") print(f"LLM Provider: {self.evaluator.llm_provider}") print(f"LLM Model: {self.evaluator.llm_model}") # Load tasks self.evaluator.load_tasks(limit=self.cfg.benchmark.execution.max_tasks) if not self.evaluator.tasks: print("No tasks loaded. Exiting.") return 0.0 # Run inference print( f"\nStarting parallel inference with {self.cfg.benchmark.execution.max_concurrent} concurrent tasks..." ) print(f"Using pass@{self.evaluator.pass_at_k} evaluation...") self.evaluator.run_parallel_inference( self.evaluator.tasks, max_concurrent=self.cfg.benchmark.execution.max_concurrent, ) # Evaluate accuracy print("Evaluating accuracy...") accuracy = self.evaluator.evaluate_accuracy() print(f"\nOverall pass@{self.evaluator.pass_at_k} accuracy: {accuracy:.2%}") # Save results # Construct the full path in the correct log directory log_dir = self.evaluator.get_log_dir() results_path = log_dir / "benchmark_results.jsonl" self.evaluator.save_results(str(results_path)) print(f"\nEvaluation completed! Results saved to {results_path}") # save accuracy to a file accuracy_file = str(results_path).replace( ".jsonl", f"_pass_at_{self.evaluator.pass_at_k}_accuracy.txt" ) with open(accuracy_file, "w") as f: f.write(f"{accuracy:.2%}") # Generate and save summary generate_summary(log_dir) return accuracy @hydra.main(config_path="../conf", config_name="config", version_base=None) def run_benchmark(cfg: DictConfig) -> None: """ Main entry point for running benchmarks with Hydra. 
def detect_pass_at_k(results_dir: str) -> tuple:
    """
    Detect the pass_at_k value used in the results directory

    Looks for ``run_*/benchmark_results_pass_at_<k>_accuracy.txt`` below
    ``results_dir``. The k value is taken from the first match in sorted path
    order, so the outcome does not depend on the order glob happens to return.

    Args:
        results_dir: Directory containing the ``run_*`` sub-directories

    Returns:
        Tuple ``(k, accuracy_files)`` with the sorted accuracy files for that
        k, or ``(None, [])`` when no file is found or k cannot be parsed
    """
    # Escape the directory part so characters such as "[" or "*" in its name
    # are matched literally instead of being interpreted as glob syntax
    base_dir = glob.escape(results_dir)

    # Find all possible pass_at_k files
    pattern = os.path.join(
        base_dir, "run_*", "benchmark_results_pass_at_*_accuracy.txt"
    )
    all_files = sorted(glob.glob(pattern))

    if not all_files:
        print(f"No accuracy files found in {results_dir}")
        print(f"Expected pattern: {pattern}")
        return None, []

    # Extract pass_at_k value from the first file
    filename = os.path.basename(all_files[0])
    match = re.search(r"pass_at_(\d+)_accuracy\.txt", filename)

    if not match:
        print(f"Cannot extract pass_at_k from filename: {filename}")
        return None, []

    k = int(match.group(1))

    # Get all files with this k value
    accuracy_files = sorted(
        glob.glob(
            os.path.join(
                base_dir, "run_*", f"benchmark_results_pass_at_{k}_accuracy.txt"
            )
        )
    )

    return k, accuracy_files
def main():
    """
    Command-line entry point: average the accuracy files of several runs.

    Expects the results directory as first command-line argument, prints the
    statistics and writes them to
    ``<results_dir>/average_scores_pass_at_<k>.txt``. Exits with status 1 when
    the argument is missing, the directory does not exist or no statistics
    could be calculated.
    """
    if len(sys.argv) < 2:
        # Name the expected argument in the usage line
        print("Usage: python calculate_average_score.py <results_dir>")
        print("Example: python calculate_average_score.py logs/gaia-validation/mytest")
        sys.exit(1)

    results_dir = sys.argv[1]

    if not os.path.exists(results_dir):
        print(f"Results directory does not exist: {results_dir}")
        sys.exit(1)

    print(f"Analyzing results from: {results_dir}")

    stats = calculate_average_scores(results_dir)

    if not stats:
        print("Failed to calculate statistics")
        sys.exit(1)

    print_results(stats)

    # Save simple statistics results
    output_file = os.path.join(
        results_dir, f"average_scores_pass_at_{stats['pass_at_k']}.txt"
    )
    separator = "=" * 50
    report_lines = [
        "EVALUATION RESULTS",
        separator,
        f"Pass@{stats['pass_at_k']} Results:",
        f"Number of runs: {stats['num_runs']}",
        f"Individual scores: {[f'{s:.2f}%' for s in stats['individual_scores']]}",
        f"Standard deviation: {stats['std_dev']:.2f}%",
        f"Min score: {stats['min_score']:.2f}%",
        f"Max score: {stats['max_score']:.2f}%",
        f"Average score: {stats['average_score']:.2f}%",
        separator,
    ]
    # Explicit encoding keeps the report independent of the platform default
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in report_lines)

    print(f"\nResults saved to: {output_file}")
Malia was born on July 4, 1998, and Sasha was born on June 10, 2001. ``` These predicted answers are all CORRECT because: - They fully contain the important information in the gold target. - They do not contain any information that contradicts the gold target. - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter. - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions. The following are examples of INCORRECT predicted answers. ``` Question: What are the names of Barack Obama's children? Gold target: Malia and Sasha Predicted answer 1: Malia. Predicted answer 2: Malia, Sasha, and Susan. Predicted answer 3: Barack Obama does not have any children. Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia. Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children. Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer? Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information. ``` These predicted answers are all INCORRECT because: - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect. The following are examples of NOT_ATTEMPTED predicted answers. ``` Question: What are the names of Barack Obama's children? Gold target: Malia and Sasha Predicted answer 1: I don't know. Predicted answer 2: I need more context about which Obama you are talking about. 
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children. Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one. ``` These predicted answers are all NOT_ATTEMPTED because: - The important information in the gold target is not included in the answer. - No statements in the answer contradict the gold target. Also note the following things: - For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k". - Predicted answers "120k", "124k", and 115k" are all CORRECT. - Predicted answers "100k" and "113k" are INCORRECT. - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target. - The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question. - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer. - Do not punish predicted answers if they omit information that would be clearly inferred from the question. - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California". - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". 
async def verify_answer_simpleqa(
    question: str, target: str, predicted_answer: str
) -> str:
    """
    Grade a prediction with the SimpleQA LLM judge.

    The judge is asked to answer with a single letter, which is mapped to a
    grade: A -> "CORRECT", B -> "INCORRECT", C -> "NOT_ATTEMPTED".

    Args:
        question: The question being answered
        target: The gold target answer
        predicted_answer: The model's predicted answer

    Returns:
        "CORRECT", "INCORRECT" or "NOT_ATTEMPTED". "NOT_ATTEMPTED" is also
        returned when the judge call fails or its reply contains no letter.
    """
    grade_by_letter = {"A": "CORRECT", "B": "INCORRECT", "C": "NOT_ATTEMPTED"}

    # The prompt uses positional "{}" placeholders: question, target, answer.
    filled_prompt = EVALUATION_PROMPT_SIMPLEQA.format(
        question, target, predicted_answer
    )

    try:
        completion = await evaluation_llm_client.chat.completions.create(
            model="gpt-4.1-2025-04-14",
            messages=[{"role": "user", "content": filled_prompt}],
            max_completion_tokens=2,
        )
        reply = completion.choices[0].message.content
        letter = re.search(r"(A|B|C)", reply)
        if letter is not None:
            return grade_by_letter[letter.group(0)]
    except Exception as e:
        # Best effort: a failed judge call is reported and graded below.
        print(f"LLM evaluation failed: {e}")

    return "NOT_ATTEMPTED"
class HLEExtractedAnswer(BaseModel):
    # Structured-output schema for the HLE judge; verify_answer_hle passes it
    # as ``response_format`` and reads ``reasoning``, ``correct`` and
    # ``confidence`` from the parsed result.
    extracted_final_answer: str
    reasoning: str
    correct: Literal["yes", "no"]
    confidence: int
    strict: Literal[True] = True  # 100% reliability


async def verify_answer_hle(question: str, target: str, predicted_answer: str) -> str:
    """
    Use HLE-style LLM judge to verify if the predicted answer is correct.

    Args:
        question: The question being answered
        target: The correct/target answer
        predicted_answer: The model's predicted answer

    Returns:
        "CORRECT" when the judge answers "yes", "INCORRECT" when it answers
        "no", and "NOT_ATTEMPTED" when the judge call fails or yields no
        parsed result.

    Raises:
        SystemExit: with status 1 when the API rejects the configured key,
            since no later evaluation can succeed either.
    """
    prompt = HLE_JUDGE_PROMPT.format(
        question=question, correct_answer=target, response=predicted_answer
    )

    try:
        response = await evaluation_llm_client.beta.chat.completions.parse(
            model="o3-mini-2025-01-31",
            max_completion_tokens=4096,
            messages=[{"role": "user", "content": prompt}],
            response_format=HLEExtractedAnswer,
        )

        content = response.choices[0].message.parsed
        if content is None:
            # No structured output came back, so there is no verdict to read.
            print("LLM evaluation failed: judge returned no parsed result")
            return "NOT_ATTEMPTED"

        # Print HLE reasoning
        print(f"LLM as Judge Reasoning: {content.reasoning}")
        print(f"LLM as Judge Result: {content.correct}")
        print(f"LLM as Judge Confidence: {content.confidence}%")

        # Convert HLE format to eval_utils format
        return "CORRECT" if content.correct == "yes" else "INCORRECT"

    except Exception as e:
        print(f"LLM evaluation failed: {e}")
        if "Incorrect API key provided" in str(e):
            # Fatal misconfiguration: stop with a failing exit status. The
            # site-provided ``exit()`` helper is meant for interactive use
            # and would report success (status 0).
            raise SystemExit(1) from e
        return "NOT_ATTEMPTED"
""" def normalize_number_str(number_str: str) -> float | None: # we replace these common units and commas to allow # conversion to float for char in ["$", "%", ","]: number_str = number_str.replace(char, "") try: return float(number_str) except ValueError: print(f"String {number_str} cannot be normalized to number str.") return None # Return None instead of inf to handle gracefully def split_string( s: str, char_list: list[str] = [",", ";"], ) -> list[str]: pattern = f"[{''.join(char_list)}]" return re.split(pattern, s) def normalize_str(input_str, remove_punct=True) -> str: """ Normalize a string by: - Removing all white spaces - Optionally removing punctuation (if remove_punct is True) - Converting to lowercase Parameters: - input_str: str, the string to normalize - remove_punct: bool, whether to remove punctuation (default: True) Returns: - str, the normalized string """ # Remove all white spaces. Required e.g for seagull vs. sea gull no_spaces = re.sub(r"\s", "", input_str) # Remove punctuation, if specified. 
if remove_punct: translator = str.maketrans("", "", string.punctuation) return no_spaces.lower().translate(translator) else: return no_spaces.lower() def question_scorer( model_answer: str, ground_truth: str, ) -> bool: def is_float(element: any) -> bool: try: float(element) return True except ValueError: return False if model_answer is None: model_answer = "None" # if gt is a number if is_float(ground_truth): print(f"Evaluating {model_answer} as a number.") normalized_answer = normalize_number_str(model_answer) # If normalization failed, the answer is incorrect if normalized_answer is None: return False return normalized_answer == float(ground_truth) # if gt is a list elif any(char in ground_truth for char in [",", ";"]): print(f"Evaluating {model_answer} as a comma separated list.") # question with the fish: normalization removes punct gt_elems = split_string(ground_truth) ma_elems = split_string(model_answer) # check length is the same if len(gt_elems) != len(ma_elems): warnings.warn( "Answer lists have different lengths, returning False.", UserWarning ) return False # compare each element as float or str comparisons = [] for ma_elem, gt_elem in zip(ma_elems, gt_elems): if is_float(gt_elem): normalized_ma_elem = normalize_number_str(ma_elem) # If normalization failed, this element is incorrect if normalized_ma_elem is None: comparisons.append(False) else: comparisons.append(normalized_ma_elem == float(gt_elem)) else: # we do not remove punct since comparisons can include punct comparisons.append( normalize_str(ma_elem, remove_punct=False) == normalize_str(gt_elem, remove_punct=False) ) return all(comparisons) # if gt is a str else: print(f"Evaluating {model_answer} as a string.") return normalize_str(model_answer) == normalize_str(ground_truth) # Use the question_scorer to evaluate the answer try: is_correct = question_scorer(predicted_answer, target) return "CORRECT" if is_correct else "INCORRECT" except Exception as e: print(f"GAIA evaluation failed: {e}") 
async def verify_answer_gaia_validation_text_103(
    question: str, target: str, predicted_answer: str
) -> str:
    """
    Use the GAIA text-103 LLM judge to verify if the predicted answer is correct.

    The judge is asked to reply with "Correct" or "Incorrect". The API call is
    attempted up to 10 times before giving up.

    Args:
        question: The question being answered
        target: The labeled/target answer
        predicted_answer: The model's predicted answer

    Returns:
        "CORRECT", "INCORRECT", or "NOT_ATTEMPTED" when the judge reply is
        empty or is neither of the two expected words.

    Raises:
        Exception: the error of the final attempt when every attempt fails.
    """
    prompt = GAIA_VALIDATION_TEXT_103_SCORER_PROMPT.format(
        question=question, correct_answer=target, response=predicted_answer
    )

    max_tries = 10
    content = None
    for attempt in range(max_tries):
        try:
            response = await evaluation_llm_client.chat.completions.create(
                model="gpt-4.1-2025-04-14",
                messages=[{"role": "user", "content": prompt}],
            )
            content = response.choices[0].message.content
        except Exception as e:
            if attempt == max_tries - 1:
                raise
            # Report the failed attempt instead of retrying silently.
            print(f"LLM Judge call failed (attempt {attempt + 1}/{max_tries}): {e}")
            continue
        print("LLM Judge Response: ", content)
        break

    if content is None:
        # An empty reply cannot be parsed; grade it like any other
        # unparseable response instead of failing on ``None.strip()``.
        print(f"Warning: Could not parse judge response: {content}")
        return "NOT_ATTEMPTED"

    # Case-insensitive match, ignoring surrounding whitespace and a final period
    content_normalized = content.strip().rstrip(".").lower()

    if content_normalized == "correct":
        return "CORRECT"
    if content_normalized == "incorrect":
        return "INCORRECT"

    # If we can't parse the response, default to NOT_ATTEMPTED to trigger retry
    print(f"Warning: Could not parse judge response: {content}")
    return "NOT_ATTEMPTED"
https://github.com/Alibaba-NLP/DeepResearch/blob/main/WebAgent/WebWatcher/infer/evaluation/prompt.py#L110 # ================================================ JUDGE_PROMPT_BC_zh = """ 请根据给定问题、标准答案和模型预测的答案来评估模型的回答是否正确。您的任务是将结果评定为:【正确】、【错误】。 首先,我们将列出每个评定类别的示例,然后请您对新问题的预测答案进行评定。 以下是【正确】的答复示例: ``` 问题:贝拉克·奥巴马的孩子叫什么名字? 标准答案:玛丽亚·奥巴马和萨莎·奥巴马 模型预测1:Malia Obama and Sasha Obama 模型预测2:玛丽亚和萨沙 模型预测3:大多数人会说是玛丽亚和萨莎,但我不确定,需要再确认 模型预测4:巴拉克·奥巴马有两个女儿,她们分别是玛丽亚·安和娜塔莎·玛丽安,但通常称作玛丽亚·奥巴马和萨莎·奥巴马。 ``` 这些答复均为【正确】,因为: - 完整地包含了标准答案中的重要信息。 - 不包含任何与标准答案矛盾的信息。 - 只关注语义内容,中英文,大小写、标点、语法和顺序不重要。 - 答复中出现模糊语句或猜测是可以接受的,前提是包含了标准答案且不含有不正确信息或矛盾。 以下是【错误】的答复示例: ``` 问题:巴拉克·奥巴马的孩子叫什么名字? 标准答案:玛丽亚·奥巴马和萨莎·奥巴马 模型预测1:玛丽亚 模型预测2:玛丽亚、萨莎和苏珊和萨莎·奥巴马或玛丽亚·奥巴马,或娜塔莎·玛丽安,或爱因斯坦 模型预测3:虽然我不知道他们的确切名字,但能说出巴拉克·奥巴马有两个孩子。 模型预测4:你可能是想说贝茜和奥利维亚。不过您应通过最新的参考资料确认详细信息。那是正确的答案吗? 模型预测5:巴拉克·奥巴马的孩子 ``` 这些答复均为【错误】,因为: - 答复中包含与标准答案矛盾的事实陈述。 - 答案为空、重复表述问题。 - 答案枚举了多个答案,重复表述答案。 需要格外注意的是: - 标准答案中包含对于问题中多个方面的回答,并且在同一个方面的答案中可能会有多种不同的描述,这些描述均是正确的,并且在同一个括号中给出,通过逗号连接。例如,考虑问题"抖音自己的人工智能大模型叫什么名字?",标准答案为"【【豆包,云雀】】": - 预测答案"豆包"、"豆包、云雀"、"云雀"等均为【正确】。 - 对于标准答案中包含的不同方面的回答,模型需要同时给出所有方面的回答才可以算是正确,否则直接判断为【错误】,不存在【部分正确】这种输出方式,这些答案会在不同的括号中给出。例如,考虑问题"TFBOYS组合中的成员有哪些?",标准答案为"【【王俊凯】【王源】【易洋千玺】】": - 预测答案"王俊凯、王源、易洋千玺"等同时包含所有答案,才可以算为【正确】。 - 预测答案为"王俊凯、易洋千玺"等没有同时包含所有答案,会被算为【错误】。 另外注意以下几点: - 对于标准答案为数字的问题,预测答案应和标准答案一致。例如,考虑问题"金山铁路黄浦江特大桥的全长是多少米?",标准答案为"3518.17": - 预测答案"3518"、"3518.1"、"3518.17"均为【正确】。 - 预测答案"3520"和"3600"均为【错误】。 - 如果模型预测并没有直接回答问题,模型试图绕过或未能直接给出标准答案视为【错误】答案。 - 例如:问题"林宥嘉的老婆是谁",标准答案为"丁文琪"。模型预测"林宥嘉的老婆"、"林宥嘉的老婆应该很优秀"、"林宥嘉的老婆可能是某个公众人物"均为【错误】。 - 如果标准答案包含比问题更多的信息,预测答案只需包含问题中提到的信息。 - 例如,考虑问题"菱镁矿的主要化学成分是什么?"标准答案为"碳酸镁(MgCO3)"。"碳酸镁"或"MgCO3"均视为【正确】答案。 - 如果从问题中明显可以推断出预测答案省略的信息,那么算作正确。 - 例如,问题"巴鲁米尼的努拉吉遗迹在1997年被联合国教科文组织列为世界文化遗产,那么这遗址在哪个地区?"标准答案为"意大利撒丁岛",预测答案"撒丁岛"被视为【正确】。 - 如果能明显看出名字翻译版本不同但是是同一个人也认为正确。 - 例如,如果标准答案是"Robinson",那么回答鲁滨逊或者鲁滨孙均正确。 - 你应该更关注标准答案和模型预测的匹配度,而不是关心标准答案是否是正确的。 下面是一个新的问题示例。请只回复【正确】、【错误】之一,不要道歉或纠正自己的错误,只需要评估该回答。 ``` 问题: {question} 标准答案: {correct_answer} 预测答案: {response} ``` 
将此新问题的预测答案评定为以下之一: A.【正确】 B.【错误】 只返回【正确】、【错误】所代表的选项即可,即仅返回A或B即可,无须添加任何其他的文本。 """.strip() JUDGE_PROMPT_BC_en = """ Based on the given question, standard answer, and model-predicted answer, evaluate whether the model's response is correct. Your task is to classify the result as: [CORRECT] or [INCORRECT]. First, we'll list examples for each category, then you'll evaluate a new question's predicted answer. Here are examples of [CORRECT] responses: ``` Question: What are the names of Barack Obama's children? Standard Answer: Malia Obama and Sasha Obama Model Prediction 1: Malia Obama and Sasha Obama Model Prediction 2: Malia and Sasha Model Prediction 3: Most would say Malia and Sasha, but I'm not sure, I should verify Model Prediction 4: Barack Obama has two daughters, Malia Ann and Natasha Marian, commonly known as Malia Obama and Sasha Obama. ``` These responses are all [CORRECT] because they: - Fully include the important information from the standard answer. - Don't contain any information that contradicts the standard answer. - Focus only on semantic content; language, capitalization, punctuation, grammar, and order aren't important. - Vague statements or guesses are acceptable as long as they include the standard answer and don't contain incorrect information or contradictions. Here are examples of [INCORRECT] responses: ``` Question: What are the names of Barack Obama's children? Standard Answer: Malia Obama and Sasha Obama Model Prediction 1: Malia Model Prediction 2: Malia, Sasha and Susan or Sasha Obama or Malia Obama, or Natasha Marian, or Einstein Model Prediction 3: While I don't know their exact names, I can tell you Barack Obama has two children. Model Prediction 4: You might be thinking of Betsy and Olivia. But you should verify the details with the latest references. Is that the correct answer? 
Model Prediction 5: Barack Obama's children ``` These responses are all [INCORRECT] because they: - Contain factual statements that contradict the standard answer. - Are empty or merely repeat the question. - Enumerate multiple answers or repeat the answer. Pay special attention to the following: - The standard answer may contain responses to multiple aspects of the question, and within the same aspect, there might be different descriptions, all of which are correct and are given in the same bracket, connected by commas. For example, for the question "What is the name of ByteDance's AI model?", the standard answer is "[[Doubao, Skylark]]": - Predicted answers "Doubao", "Doubao, Skylark", "Skylark", etc. are all [CORRECT]. - For standard answers containing responses to different aspects, the model needs to provide answers to all aspects to be considered correct; otherwise, it's directly judged as [INCORRECT]. There is no [PARTIALLY CORRECT] output option. These answers will be given in different brackets. For example, for the question "Who are the members of TFBOYS?", the standard answer is "[[Wang Junkai][Wang Yuan][Yi Yangqianxi]]": - Predicted answers like "Wang Junkai, Wang Yuan, Yi Yangqianxi" that include all answers are [CORRECT]. - Predicted answers like "Wang Junkai, Yi Yangqianxi" that don't include all answers are [INCORRECT]. Also note the following points: - For questions with numerical standard answers, the predicted answer should match the standard answer. For example, for the question "What is the total length in meters of the Huangpu River Bridge on the Jinshan Railway?", the standard answer is "3518.17": - Predicted answers "3518", "3518.1", "3518.17" are all [CORRECT]. - Predicted answers "3520" and "3600" are [INCORRECT]. - If the model prediction doesn't directly answer the question, attempts to circumvent or fails to directly provide the standard answer, it's considered an [INCORRECT] answer. 
- For example, for the question "Who is JJ Lin's wife?", with the standard answer "Ding Wenqi", model predictions like "JJ Lin's wife", "JJ Lin's wife should be excellent", "JJ Lin's wife might be a public figure" are all [INCORRECT]. - If the standard answer contains more information than the question asks for, the predicted answer only needs to include the information mentioned in the question. - For example, for the question "What is the main chemical component of magnesite?", with the standard answer "Magnesium carbonate (MgCO3)", "Magnesium carbonate" or "MgCO3" are both considered [CORRECT] answers. - If information omitted in the predicted answer can be clearly inferred from the question, it's considered correct. - For example, for the question "The Nuragic ruins of Barumini were listed as a World Cultural Heritage by UNESCO in 1997, so where is this site located?", with the standard answer "Sardinia, Italy", the predicted answer "Sardinia" is considered [CORRECT]. - If it's clear that different translations of a name refer to the same person, it's considered correct. - For example, if the standard answer is "Robinson", answers like "Lubinson" or "Lubinsun" are both correct. - You should focus more on the match between the standard answer and the model prediction, rather than whether the standard answer itself is correct. Below is a new question example. Please reply with only [CORRECT] or [INCORRECT], without apologies or corrections to your own errors, just evaluate the answer. ``` Question: {question} Standard Answer: {correct_answer} Predicted Answer: {response} ``` Evaluate this new question's predicted answer as one of the following: A. [CORRECT] B. [INCORRECT] Return only the option representing [CORRECT] or [INCORRECT], i.e., just return A or B, without adding any other text. 
""".strip() async def verify_answer_browsecomp( question: str, target: str, predicted_answer: str ) -> str: """ Use BrowseComp judge (English version) to verify if the predicted answer is correct. Expects the LLM to return A (correct) or B (incorrect). """ prompt = JUDGE_PROMPT_BC_en.format( question=question, correct_answer=target, response=predicted_answer ) try: response = await evaluation_llm_client.chat.completions.create( model="gpt-4.1-2025-04-14", messages=[{"role": "user", "content": prompt}], max_completion_tokens=2, ) content = response.choices[0].message.content print(f"BrowseComp Judge Response: {content}") # Extract A or B from the response match = re.search(r"[AB]", content) if match: choice = match.group(0) if choice == "A": return "CORRECT" elif choice == "B": return "INCORRECT" # If no clear A or B is found, return NOT_ATTEMPTED to trigger retry print(f"Warning: Could not parse BrowseComp judge response: {content}") return "NOT_ATTEMPTED" except Exception as e: print(f"BrowseComp evaluation failed: {e}") raise e async def verify_answer_browsecomp_zh( question: str, target: str, predicted_answer: str ) -> str: """ Use BrowseComp judge (Chinese version) to verify if the predicted answer is correct. Expects the LLM to return A (correct) or B (incorrect). 
""" prompt = JUDGE_PROMPT_BC_zh.format( question=question, correct_answer=target, response=predicted_answer ) try: response = await evaluation_llm_client.chat.completions.create( model="gpt-4.1-2025-04-14", messages=[{"role": "user", "content": prompt}], max_completion_tokens=2, ) content = response.choices[0].message.content print(f"BrowseComp-ZH Judge Response: {content}") # Extract A or B from the response match = re.search(r"[AB]", content) if match: choice = match.group(0) if choice == "A": return "CORRECT" elif choice == "B": return "INCORRECT" # If no clear A or B is found, return NOT_ATTEMPTED to trigger retry print(f"Warning: Could not parse BrowseComp-ZH judge response: {content}") return "NOT_ATTEMPTED" except Exception as e: print(f"BrowseComp-ZH evaluation failed: {e}") raise e # ================================================ # verify_answer_xbench_deepsearch # Prompt from XBench-Evals # https://github.com/xbench-ai/xbench-evals/blob/main/eval_grader.py#L25 # ================================================ JUDGE_PROMPT_XBENCH = """ 你是一个通用人工智能助手。根据下面给出的[正确答案], 判断以下对[原问题]的[回答]的回答是否正确。 [原问题]: {question} [正确答案]: {correct_answer} [回答]:{response} 你的判断必须按照以下格式和标准进行: 最终答案: 从[回答]中提取出的最终准确答案。如果[回答]中没有明确的最终答案, 则填写'无'。 解释: 根据[正确答案]解释为什么[最终答案]是正确的或错误的。只关注[最终答案]与[正确答案]之间是否存在实质性差异, 不要评论题目的背景, 不要尝试重新解题, 不要为任何不同于[正确答案]的答案辩护, 只专注于判断答案是否一致。 结论: 如果[最终答案]与上方给出的[正确答案]一致, 或者在数值题目中处于可接受的微小误差范围内, 则填写'正确'; 否则(即存在任何不一致、歧义、不等价或提取出的答案错误的情况)填写'错误'。 """.strip() async def verify_answer_xbench_deepsearch( question: str, target: str, predicted_answer: str ) -> str: """ Use XBench-DeepSearch judge to verify if the predicted answer is correct. 
""" def parse_match_result(match): if match is None: return match match = match.group(0) try: target = match.split(":")[1].strip() return target except Exception: return match # return naive result in case of failed if predicted_answer is None: return "INCORRECT" judge_prompt = JUDGE_PROMPT_XBENCH.format( question=question, correct_answer=target, response=predicted_answer, ) try: response = await evaluation_llm_client.chat.completions.create( model="gpt-4.1-2025-04-14", messages=[{"role": "user", "content": judge_prompt}], ) judge_response = response.choices[0].message.content except Exception: judge_response = None if judge_response is None: return "NOT_ATTEMPTED" # Extract grader conclusions extract_match = re.search(r"最终答案:*(.*)", judge_response) extract_match = parse_match_result(extract_match) # Fixed regex: make the dot optional with \s* (zero or more whitespace) correct_match = re.search(r"结论:*\s*(正确|错误)", judge_response) correct_match = parse_match_result(correct_match) explain_match = re.search(r"解释:*(.*)", judge_response) explain_match = parse_match_result(explain_match) # Print debug info print(f"XBench Judge - Extract: {extract_match}, Correct: {correct_match}") if correct_match == "正确": return "CORRECT" elif correct_match == "错误": return "INCORRECT" else: # If we can't parse the result, return NOT_ATTEMPTED to trigger retry print( f"Warning: Could not parse XBench judge response, correct_match={correct_match}" ) return "NOT_ATTEMPTED" # ================================================ # verify_answer_deepsearchqa # # Official prompt from DeepSearchQA benchmark # https://www.kaggle.com/code/andrewmingwang/deepsearchqa-starter-code # ================================================ JUDGE_PROMPT_DEEPSEARCHQA = """Your task is to evaluate whether a given "AI Response" for a specific "User Prompt" arrived at the correct answer. 
**Answer Correctness Task** * **Purpose:** Assess whether the AI response provides the correct answer(s) based on the provided "Correct Answer" and "Prompt Type". * **Process:** * Identify the "Prompt Type": "". * Refer to the "Correct Answer": "". * Based on the "Prompt Type", determine if the "AI Response" contains the expected answer(s). * **'Single Answer'**: Check if the response provides the answer that addresses the user's question. It does not have to match the exact wording of the provided answer. * **'Set Answer'**: Check if the response includes *each* item from the provided ground truth answers. The order might not matter unless specified otherwise. The response might include more answers than the list. Determine the correctness *only* based on the list first and then check if the response includes answers not in the list. * **Explanation:** Provide a brief explanation justifying your assessment of answer correctness, referencing specific parts of the AI response and the correct answer. * **Correctness Details:** Provide a dictionary, one key for each expected answer part, and value is a boolean indicating whether each expected answer part was found. * For 'Set Answer', this will be a list of attributes, one for each item/part in the "Correct Answer". Each key will be a string indicating the expected answer part, and the value will be a boolean indicating whether that part was found in the response. * **Excessive Answers:** Provide a list of strings, each indicating an excessive answer part. If the response provides answers that are **not** in the "Correct Answer" list, add these answers as excessive answers. Return an empty list when there's no excessive answers in the response. **Output Format:** Your evaluation *must* be structured as a nested JSON dictionary with the following top-level keys: `"Answer Correctness"`. Please return NULL if any of "Prompt", "AI Response" or "Correct Answer" is empty. 
def _loads_judge_json(text: str) -> Any:
    """Parse the judge's JSON, retrying once with trailing commas removed."""
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # The example in the judge prompt itself ends an object with a
        # trailing comma, so the judge may copy that; drop commas that
        # directly precede a closing brace/bracket and parse again.
        return json.loads(re.sub(r",(\s*[}\]])", r"\1", text))


async def verify_answer_deepsearchqa(
    question: str,
    target: str,
    predicted_answer: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> tuple[str, str, Optional[Dict[str, Any]]]:
    """
    Use DeepSearchQA-specific judge to verify if the predicted answer is correct.
    Uses the official DeepSearchQA evaluation prompt with JSON output format.

    Args:
        question: The question being answered
        target: The correct/target answer
        predicted_answer: The model's predicted answer
        metadata: Optional metadata dict; ``metadata["answer_type"] == "Set Answer"``
            selects the "Set Answer" prompt type, anything else uses "Single Answer"

    Returns:
        Tuple of (result, judge_type, details_dict):
        - result: "CORRECT", "INCORRECT", or "NOT_ATTEMPTED"
        - judge_type: "deepsearchqa_judge"
        - details_dict: None unless the judge output was parsed and graded,
          otherwise a dict with keys:
            - correctness_details: Dict[str, bool] mapping answer parts to correctness
            - excessive_answers: List[str] of extra answers not in ground truth
            - explanation: str explaining the judgment
            - num_correct: int number of correct answer parts
            - num_expected: int total number of expected answer parts
            - num_excessive: int number of excessive answers
    """
    judge_type = "deepsearchqa_judge"

    if predicted_answer is None:
        return "INCORRECT", judge_type, None

    # Determine prompt_type from metadata ("Single Answer" is the default)
    prompt_type = "Single Answer"
    if metadata and metadata.get("answer_type") == "Set Answer":
        prompt_type = "Set Answer"

    judge_prompt = JUDGE_PROMPT_DEEPSEARCHQA.format(
        prompt_type=prompt_type,
        prompt=question,
        answer=target,
        response=predicted_answer,
    )

    try:
        response = await evaluation_llm_client.chat.completions.create(
            model="gpt-4.1-2025-04-14",
            messages=[{"role": "user", "content": judge_prompt}],
        )
        judge_response = response.choices[0].message.content
    except Exception as e:
        print(f"DeepSearchQA judge failed: {e}")
        return "NOT_ATTEMPTED", judge_type, None

    if judge_response is None:
        return "NOT_ATTEMPTED", judge_type, None

    # Parse JSON response
    try:
        # Extract JSON from the response (might be wrapped in markdown code blocks)
        json_match = re.search(r"```json\s*(\{.*?\})\s*```", judge_response, re.DOTALL)
        if json_match:
            json_str = json_match.group(1)
        else:
            # Try to find JSON without code blocks
            json_match = re.search(r"\{.*\}", judge_response, re.DOTALL)
            if not json_match:
                print("Warning: Could not find JSON in DeepSearchQA judge response")
                return "NOT_ATTEMPTED", judge_type, None
            json_str = json_match.group(0)

        result = _loads_judge_json(json_str)

        # ``or {}`` also covers an explicit JSON null for the top-level key.
        answer_correctness = result.get("Answer Correctness") or {}
        explanation = answer_correctness.get("Explanation", "")
        correctness_details = answer_correctness.get("Correctness Details", {})
        excessive_answers = answer_correctness.get("Excessive Answers", [])

        # Calculate statistics
        num_expected = len(correctness_details)
        num_correct = sum(1 for v in correctness_details.values() if v)
        num_excessive = len(excessive_answers)

        # Print debug info
        print(
            f"DeepSearchQA Judge - Correct: {num_correct}/{num_expected}, Excessive: {num_excessive}"
        )
        print(f"DeepSearchQA Judge - Explanation: {explanation}")

        if not correctness_details:
            # No correctness details, can't determine
            return "NOT_ATTEMPTED", judge_type, None

        details = {
            "correctness_details": correctness_details,
            "excessive_answers": excessive_answers,
            "explanation": explanation,
            "num_correct": num_correct,
            "num_expected": num_expected,
            "num_excessive": num_excessive,
        }

        # Following official logic: all expected parts must be found, and no excessive answers
        if all(correctness_details.values()) and not excessive_answers:
            return "CORRECT", judge_type, details
        # Either missing some expected answers or has excessive answers
        return "INCORRECT", judge_type, details

    except json.JSONDecodeError as e:
        print(f"Warning: Failed to parse JSON from DeepSearchQA judge: {e}")
        print(f"Response: {judge_response[:200]}...")
        return "NOT_ATTEMPTED", judge_type, None
    except Exception as e:
        print(f"Warning: Error processing DeepSearchQA judge response: {e}")
        return "NOT_ATTEMPTED", judge_type, None
_verify_answer_for_datasets_core( benchmark_name: str, question: str, target: str, predicted_answer: str, metadata: Optional[Dict[str, Any]] = None, ) -> tuple[str, str, Optional[Dict[str, Any]]]: """ Verify the answer for a given dataset. Args: benchmark_name: Name of the benchmark dataset question: The question being answered target: The correct/target answer predicted_answer: The model's predicted answer metadata: Optional metadata dict with additional context Returns: A tuple of (result, judge_type, details_dict). details_dict is None for most benchmarks, but contains evaluation details for DeepSearchQA. """ # For benchmarks that need detailed evaluation, don't use exact_match if benchmark_name not in ["deepsearchqa"]: if predicted_answer == target: return "CORRECT", "exact_match", None # For gaia-validation, use gaia-validation-text-103-scorer # We found that gaia_scorer tends to label many correct answers as incorrect, so we believe # that using an LLM-as-judge approach can more accurately reflect the model’s performance. 
if benchmark_name == "gaia-validation": # result = await verify_answer_gaia(question, target, predicted_answer) # return result, "gaia_scorer", None result = await verify_answer_gaia_validation_text_103( question, target, predicted_answer ) return result, "gaia_validation_text_103_judge", None # For gaia-validation-text-103, use gaia-validation-text-103-scorer elif benchmark_name == "gaia-validation-text-103": result = await verify_answer_gaia_validation_text_103( question, target, predicted_answer ) return result, "gaia_validation_text_103_judge", None # For browsecomp (English) and browsecomp-zh (Chinese), use different judges elif benchmark_name == "browsecomp": result = await verify_answer_browsecomp(question, target, predicted_answer) return result, "browsecomp_judge", None elif benchmark_name == "browsecomp_zh": result = await verify_answer_browsecomp_zh(question, target, predicted_answer) return result, "browsecomp_zh_judge", None # For hle, hle-text-500, and hle-text-2158, use hle_judge elif "hle" in benchmark_name: result = await verify_answer_hle(question, target, predicted_answer) return result, "hle_judge", None # For webwalkerqa, frames, and seal-0, use gaia_validation_text_103_judge elif benchmark_name in ["webwalkerqa", "frames", "seal-0"]: result = await verify_answer_gaia_validation_text_103( question, target, predicted_answer ) return result, "gaia_validation_text_103_judge", None # For simpleqa, use simpleqa_judge elif benchmark_name == "simpleqa" or benchmark_name == "collect_trace": result = await verify_answer_simpleqa(question, target, predicted_answer) return result, "simpleqa_judge", None # For xbench_deepsearch, use xbench_deepsearch_judge elif benchmark_name == "xbench_deepsearch": result = await verify_answer_xbench_deepsearch( question, target, predicted_answer ) return result, "xbench_deepsearch_judge", None # For deepsearchqa, use deepsearchqa_judge (with metadata support and detailed evaluation) elif benchmark_name == "deepsearchqa": 
result, judge_type, details = await verify_answer_deepsearchqa( question, target, predicted_answer, metadata ) # Return details for DeepSearchQA-specific metrics calculation return result, judge_type, details # For other benchmarks, use gaia_validation_text_103_judge else: result = await verify_answer_gaia_validation_text_103( question, target, predicted_answer ) return result, "gaia_validation_text_103_judge", None async def verify_answer_for_datasets( benchmark_name: str, question: str, target: str, predicted_answer: str, metadata: Optional[Dict[str, Any]] = None, max_retries: int = 10, retry_interval: int = 5, ) -> tuple[str, str, Optional[Dict[str, Any]]]: """ Wrapper with retry logic for NOT_ATTEMPTED results. Args: benchmark_name: Name of the benchmark dataset question: The question being answered target: The correct/target answer predicted_answer: The model's predicted answer metadata: Optional metadata dict with additional context max_retries: Maximum number of retry attempts retry_interval: Seconds to wait between retries Returns: A tuple of (result, judge_type, details_dict). details_dict contains evaluation details (for DeepSearchQA) or None (for other benchmarks). """ for attempt in range(1, max_retries + 1): result, judge_type, details = await _verify_answer_for_datasets_core( benchmark_name, question, target, predicted_answer, metadata ) if result != "NOT_ATTEMPTED": return result, judge_type, details if attempt < max_retries: print( f"[Retry {attempt}/{max_retries}] Got NOT_ATTEMPTED, retrying in {retry_interval}s..." 
def majority_vote(
    preds: List[str], first_seen_order: Dict[str, int]
) -> Tuple[str, Dict[str, int]]:
    """
    Compute the majority-vote prediction for a list of candidate predictions.

    Tie-breaking rules (deterministic):
      1) Highest frequency wins.
      2) If there is a tie on frequency, choose the candidate with the smallest
         index in first_seen_order (candidates missing from it sort last).
      3) As a final guard, fall back to lexicographic order.

    Args:
        preds: Non-empty list of candidate predictions for one task.
        first_seen_order: Maps a prediction string to the index at which it was
            first encountered.

    Returns:
        (chosen_prediction, counts_dict)

    Raises:
        ValueError: If preds is empty.
    """
    if not preds:
        # Fail with a clear message instead of the opaque error from max().
        raise ValueError("majority_vote() requires at least one prediction")

    counter = Counter(preds)
    max_count = max(counter.values())

    # All candidates that share the max vote count
    tied = [cand for cand, cnt in counter.items() if cnt == max_count]

    # The candidate string is part of the key, so the minimum is unique.
    chosen = min(tied, key=lambda c: (first_seen_order.get(c, float("inf")), c))

    # Expose counts for optional debugging/inspection
    return chosen, dict(counter)


def discover_runs(results_dir: str) -> List[str]:
    """
    Return the subdirectories of results_dir that hold a 'benchmark_results.jsonl'.

    The subdirectory name is not required to start with 'run_'. Entries are
    visited in sorted name order so processing stays deterministic.
    """
    runs = []
    for name in sorted(os.listdir(results_dir)):
        path = os.path.join(results_dir, name)
        if os.path.isdir(path) and os.path.isfile(
            os.path.join(path, "benchmark_results.jsonl")
        ):
            runs.append(path)
    return runs


def parse_args() -> argparse.Namespace:
    """Parse the command line: a results directory and an optional output path."""
    parser = argparse.ArgumentParser(
        description="Aggregate multiple run_*/benchmark_results.jsonl files and produce a FutureX submission with majority voting."
    )
    parser.add_argument(
        "results_dir",
        help="Path to results dir containing run_*/benchmark_results.jsonl",
    )
    parser.add_argument(
        "-o",
        "--output",
        default=None,
        # main() resolves the default relative to results_dir, not the root dir.
        help="Output JSONL file path (default: <results_dir>/futurex_submission.jsonl)",
    )
    return parser.parse_args()


def main() -> None:
    """
    Aggregate predictions from every run and write one majority-voted answer
    per task_id as a JSONL submission file.

    Raises:
        FileNotFoundError: If the results dir does not exist or contains no run
            directory with a 'benchmark_results.jsonl'.
    """
    args = parse_args()

    results_dir = os.path.abspath(args.results_dir)
    if not os.path.isdir(results_dir):
        raise FileNotFoundError(f"Results dir not found: {results_dir}")

    output_file = (
        os.path.abspath(args.output)
        if args.output
        else os.path.join(results_dir, "futurex_submission.jsonl")
    )

    # Maps task_id -> list of predictions collected across runs
    preds_by_task: Dict[str, List[str]] = defaultdict(list)

    # First-seen index of each distinct prediction string across all runs and
    # tasks. This enables deterministic tie-breaking.
    first_seen_order: Dict[str, int] = {}
    next_order_idx = 0

    runs = discover_runs(results_dir)
    if not runs:
        raise FileNotFoundError(
            f"No run directories with 'benchmark_results.jsonl' found under: {results_dir}"
        )

    total_lines = 0
    used_lines = 0

    # Read and aggregate predictions
    for run_dir in runs:
        fpath = os.path.join(run_dir, "benchmark_results.jsonl")
        print(f"Reading: {fpath}")
        with open(fpath, "r", encoding="utf-8") as fin:
            for line in fin:
                total_lines += 1
                line = line.strip()
                if not line:
                    continue
                try:
                    rec = json.loads(line)
                except json.JSONDecodeError:
                    # Skip malformed JSON lines, but keep going
                    continue
                if not isinstance(rec, dict):
                    # Valid JSON that is not an object (list, null, number...)
                    # has no fields to read; treat it like a malformed line.
                    continue

                task_id = rec.get("task_id")
                pred = rec.get("model_boxed_answer")

                # Only accept non-empty strings; coerce to str for safety
                if task_id and pred is not None and str(pred).strip():
                    pred_str = str(pred).strip()
                    preds_by_task[task_id].append(pred_str)
                    if pred_str not in first_seen_order:
                        first_seen_order[pred_str] = next_order_idx
                        next_order_idx += 1
                    used_lines += 1

    # Write submission JSONL
    # We sort task_ids to keep output reproducible.
    num_tasks = 0
    with open(output_file, "w", encoding="utf-8") as out:
        for task_id in sorted(preds_by_task):
            voted_pred, _counts = majority_vote(
                preds_by_task[task_id], first_seen_order
            )
            out.write(
                json.dumps(
                    {"id": task_id, "prediction": voted_pred}, ensure_ascii=False
                )
                + "\n"
            )
            num_tasks += 1

    # Optional: small summary to stdout
    print(f"Collected from {len(runs)} run(s).")
    print(f"Read {total_lines} line(s), accepted {used_lines} record(s).")
    print(f"Aggregated {num_tasks} unique task_id(s).")
    print(f"✅ Submission saved to {output_file}")
@dataclass
class GradingResult:
    """Result of grading a single task"""

    task_id: str
    run_name: str
    file_path: str  # path of the task JSON file the result is written back to
    question: str
    ground_truth: str
    predicted_answer: str
    judge_result: str  # verdict from the judge, or "ERROR" if grading raised
    judge_type: str = "gaia_validation_text_103_scorer"
    grading_time: float = 0.0  # seconds, measured with time.time()
    error_message: str = ""  # str(exception) when judge_result == "ERROR"


class GAIAText103Grader:
    """Grader for GAIA-Text-103 tasks using LLM judgement"""

    def __init__(self, extraction_dir: str):
        """
        Initialize the grader

        Args:
            extraction_dir: Directory containing extracted GAIA-Text-103 tasks
        """
        self.extraction_dir = Path(extraction_dir)
        self.results: List[GradingResult] = []
        self.stats = {
            "total_tasks": 0,
            "graded_tasks": 0,
            "errors": 0,
            "total_grading_time": 0.0,
        }

    def find_task_files(self) -> List[Path]:
        """Find all task_*.json files under the extraction directory (sorted)."""
        task_files = []
        for root, _dirs, files in os.walk(self.extraction_dir):
            for file in files:
                if file.startswith("task_") and file.endswith(".json"):
                    task_files.append(Path(root) / file)
        return sorted(task_files)

    def extract_task_info(self, task_file: Path) -> Optional[Dict]:
        """
        Read one task file and return the fields needed for grading.

        Returns None when the file is unreadable, when it lacks a question,
        ground truth or predicted answer, or when it already carries a verdict
        from this scorer. A previous "ERROR" verdict does not count as graded,
        so tasks whose grading failed are retried on the next run.
        """
        try:
            with open(task_file, "r", encoding="utf-8") as f:
                task_data = json.load(f)

            # Skip only tasks that this scorer graded *successfully*; an earlier
            # failed attempt is stamped with the same judge_type but "ERROR".
            stamped = task_data.get("judge_type") == "gaia_validation_text_103_scorer"
            if stamped and task_data.get("final_judge_result") != "ERROR":
                print(f"Skipping already graded task: {task_file.name}")
                return None

            # Extract basic information
            task_info = {
                "task_id": task_data.get("task_id", ""),
                "run_name": task_data.get("run_name", ""),
                "file_path": str(task_file),
                "question": task_data.get("input", {}).get("task_description", ""),
                "ground_truth": task_data.get("ground_truth", ""),
                "predicted_answer": task_data.get("final_boxed_answer", ""),
            }

            # Validate required fields
            if not all(
                [
                    task_info["question"],
                    task_info["ground_truth"],
                    task_info["predicted_answer"],
                ]
            ):
                print(f"Warning: Missing required fields in {task_file}")
                print(f"  question: {task_info['question']}")
                print(f"  ground_truth: {task_info['ground_truth']}")
                print(f"  predicted_answer: {task_info['predicted_answer']}")
                return None

            return task_info

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"Error reading task file {task_file}: {e}")
            return None

    async def grade_single_task(self, task_info: Dict) -> GradingResult:
        """
        Grade a single task using GAIA-Text-103 evaluator.

        Never raises for judge failures: the exception text is stored in
        error_message, judge_result becomes "ERROR" and the error counter is
        incremented.
        """
        start_time = time.time()

        result = GradingResult(
            task_id=task_info["task_id"],
            run_name=task_info["run_name"],
            file_path=task_info["file_path"],
            question=task_info["question"],
            ground_truth=task_info["ground_truth"],
            predicted_answer=task_info["predicted_answer"],
            judge_result="",
            judge_type="gaia_validation_text_103_scorer",
        )

        try:
            # Use the GAIA-Text-103 evaluator
            judge_result = await verify_answer_gaia_validation_text_103(
                question=task_info["question"],
                target=task_info["ground_truth"],
                predicted_answer=task_info["predicted_answer"],
            )

            result.judge_result = judge_result
            result.grading_time = time.time() - start_time

            print(
                f"Task {task_info['task_id']} ({task_info['run_name']}): {judge_result}"
            )

        except Exception as e:
            result.error_message = str(e)
            result.judge_result = "ERROR"
            result.grading_time = time.time() - start_time
            self.stats["errors"] += 1
            print(f"Error grading task {task_info['task_id']}: {e}")

        return result

    async def grade_all_tasks(self, max_concurrent: int = 5) -> List[GradingResult]:
        """
        Grade all tasks with concurrent processing.

        Args:
            max_concurrent: Upper bound on judge calls in flight at once.

        Returns:
            One GradingResult per task that produced a result object, including
            those whose judge_result is "ERROR".
        """
        task_files = self.find_task_files()
        print(f"Found {len(task_files)} task files to grade")

        # Extract task information
        task_infos = []
        for task_file in task_files:
            task_info = self.extract_task_info(task_file)
            if task_info:
                task_infos.append(task_info)

        self.stats["total_tasks"] = len(task_infos)
        print(f"Extracted {len(task_infos)} valid tasks for grading")

        if not task_infos:
            print("No valid tasks found for grading")
            return []

        # Grade tasks with concurrency control
        semaphore = asyncio.Semaphore(max_concurrent)

        async def grade_with_semaphore(task_info):
            async with semaphore:
                return await self.grade_single_task(task_info)

        # Create tasks for concurrent execution
        tasks = [grade_with_semaphore(task_info) for task_info in task_infos]

        # Execute all grading tasks
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Filter out exceptions and collect valid results
        valid_results = []
        for i, result in enumerate(results):
            # BaseException, not Exception: gather() can also hand back
            # CancelledError, which has no grading_time attribute.
            if isinstance(result, BaseException):
                print(f"Exception in task {i}: {result}")
                self.stats["errors"] += 1
                continue

            valid_results.append(result)
            self.stats["total_grading_time"] += result.grading_time
            # "ERROR" results were already counted as errors by
            # grade_single_task; do not report them as successfully graded too.
            if result.judge_result != "ERROR":
                self.stats["graded_tasks"] += 1

        self.results = valid_results
        return valid_results

    def update_original_files(self):
        """Write the grading outcome of every result back into its task file."""
        updated_count = 0

        for result in self.results:
            try:
                # Read original file
                with open(result.file_path, "r", encoding="utf-8") as f:
                    task_data = json.load(f)

                # Add grading information
                task_data["final_judge_result"] = result.judge_result
                task_data["judge_type"] = result.judge_type
                task_data["grading_time"] = result.grading_time

                if result.error_message:
                    task_data["grading_error"] = result.error_message

                # Write back to file
                with open(result.file_path, "w", encoding="utf-8") as f:
                    json.dump(task_data, f, indent=2, ensure_ascii=False)

                updated_count += 1

            except Exception as e:
                # Best effort: keep updating the remaining files.
                print(f"Error updating file {result.file_path}: {e}")

        print(f"Updated {updated_count} original task files with grading results")

    def print_summary(self):
        """Print grading summary"""
        print("\n" + "=" * 60)
        print("GAIA-Text-103 Grading Summary")
        print("=" * 60)
        print(f"Total tasks found: {self.stats['total_tasks']}")
        print(f"Successfully graded: {self.stats['graded_tasks']}")
        print(f"Errors: {self.stats['errors']}")
        print("=" * 60)


async def main():
    """Parse the command line, grade every task and return a process exit code."""
    parser = argparse.ArgumentParser(
        description="Grade GAIA-Text-103 tasks using LLM judgement"
    )
    parser.add_argument(
        "extraction_dir", help="Directory containing extracted GAIA-Text-103 tasks"
    )
    parser.add_argument(
        "--max-concurrent",
        type=int,
        default=5,
        help="Maximum number of concurrent grading tasks (default: 5)",
    )

    args = parser.parse_args()

    # Validate input directory
    if not os.path.exists(args.extraction_dir):
        print(f"Error: Extraction directory not found: {args.extraction_dir}")
        return 1

    print(f"Extraction directory: {args.extraction_dir}")
    print(f"Max concurrent tasks: {args.max_concurrent}")
    print()

    # Create grader and run grading
    grader = GAIAText103Grader(args.extraction_dir)

    try:
        print("Starting grading process...")
        results = await grader.grade_all_tasks(max_concurrent=args.max_concurrent)

        if results:
            # Update original files only
            grader.update_original_files()

            # Print summary
            grader.print_summary()

            print("\n✅ Grading completed successfully!")
            print("📝 Original task files updated with grading results")
        else:
            print("❌ No tasks were graded successfully")
            return 1

    except KeyboardInterrupt:
        print("\nGrading interrupted by user")
        return 1
    except Exception as e:
        print(f"Error during grading: {e}")
        return 1

    return 0
class GAIAtoText103Copier:
    """Copy GAIA-Text-103 tasks from GAIA validation logs"""

    def __init__(self, gaia_text_103_data_path: str, output_dir: str):
        """
        Initialize the copier

        Args:
            gaia_text_103_data_path: Path to GAIA-Text-103 standardized data file
            output_dir: Directory to save copied tasks

        Raises:
            FileNotFoundError: If the data file does not exist.
        """
        self.gaia_text_103_data_path = gaia_text_103_data_path
        self.output_dir = Path(output_dir)
        self.gaia_text_103_task_ids: Set[str] = set()
        self.copied_count = 0

        # Load GAIA-Text-103 task IDs
        self._load_gaia_text_103_tasks()

    def _load_gaia_text_103_tasks(self):
        """Load task IDs from the JSONL data file (one JSON object per line)."""
        print(f"Loading GAIA-Text-103 task IDs from {self.gaia_text_103_data_path}")

        if not os.path.exists(self.gaia_text_103_data_path):
            raise FileNotFoundError(
                f"GAIA-Text-103 data file not found: {self.gaia_text_103_data_path}"
            )

        with open(self.gaia_text_103_data_path, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    task_data = json.loads(line)
                    task_id = task_data.get("task_id")
                    if task_id:
                        self.gaia_text_103_task_ids.add(task_id)

        print(f"Loaded {len(self.gaia_text_103_task_ids)} GAIA-Text-103 task IDs")

    def copy_gaia_text_103_tasks(self, gaia_logs_dir: str) -> int:
        """
        Copy GAIA-Text-103 tasks from GAIA validation logs

        Each matching file keeps its path relative to the parent of
        gaia_logs_dir, so the name of the logs directory itself is preserved
        under the output directory.

        Args:
            gaia_logs_dir: Directory containing GAIA validation logs

        Returns:
            Number of tasks that were actually copied
        """
        print(f"Copying GAIA-Text-103 tasks from {gaia_logs_dir}")

        # Find all task JSON files in the logs directory (including in run subdirectories)
        task_files = []
        for root, _dirs, files in os.walk(gaia_logs_dir):
            for file in files:
                if file.startswith("task_") and file.endswith(".json"):
                    task_files.append(os.path.join(root, file))

        print(f"Found {len(task_files)} task files to process")

        # abspath() normalises a trailing separator (and "."), which would
        # otherwise make dirname() return the logs directory itself and drop
        # the top-level folder from the copied layout.
        original_dir = os.path.dirname(os.path.abspath(gaia_logs_dir))

        copied_count = 0

        for task_file in task_files:
            try:
                filename = os.path.basename(task_file)
                # Extract task ID from filename like: task_5188369a-3bbe-43d8-8b94-11558f909a08_attempt_1_format_retry_0_2025-08-06T21-14-23-770872Z.json
                task_id = (
                    filename.split("_")[1]
                    if filename.startswith("task_") and "_" in filename
                    else ""
                )

                if task_id and task_id in self.gaia_text_103_task_ids:
                    # This is a GAIA-Text-103 task, copy it
                    # Preserve the original directory structure
                    relative_path = os.path.relpath(task_file, original_dir)

                    # Create the same directory structure in the output
                    output_file = self.output_dir / relative_path
                    output_file.parent.mkdir(parents=True, exist_ok=True)

                    # Copy the file
                    shutil.copy2(task_file, output_file)

                    # Count only after the copy succeeded so failures are not
                    # reported as copied tasks.
                    copied_count += 1

                    if copied_count % 50 == 0:
                        print(f"Copied {copied_count} tasks...")

            except Exception as e:
                # Best effort: one bad file must not stop the remaining copies.
                print(f"Error processing {task_file}: {e}")
                continue

        print(f"Successfully copied {copied_count} GAIA-Text-103 tasks")
        self.copied_count = copied_count
        return copied_count

    def print_summary(self):
        """Print copying summary to console"""
        print("\n" + "=" * 60)
        print("GAIA-Text-103 Task Copying Summary")
        print("=" * 60)
        print(f"Total Tasks Copied: {self.copied_count}")
        print(f"Output Directory: {self.output_dir}")
        print("=" * 60)


def main():
    """Parse the command line, copy the matching tasks and return an exit code."""
    parser = argparse.ArgumentParser(
        description="Copy GAIA-Text-103 tasks from GAIA validation logs"
    )
    parser.add_argument(
        "gaia_logs_dir", help="Directory containing GAIA validation logs"
    )
    parser.add_argument(
        "--gaia_text_103_data",
        default="../../data/gaia-2023-validation-text-103/standardized_data.jsonl",
        help="Path to GAIA-Text-103 standardized data file",
    )
    parser.add_argument(
        "--output-dir",
        help="Output directory for copied tasks (default: side by side with gaia-validation)",
    )

    args = parser.parse_args()

    # Default output directory: "gaia-text-103-extraction" next to the input
    # directory, whatever the input directory is called.
    if not args.output_dir:
        args.output_dir = str(
            Path(args.gaia_logs_dir).parent / "gaia-text-103-extraction"
        )

    # Validate inputs
    if not os.path.exists(args.gaia_logs_dir):
        print(f"Error: GAIA logs directory not found: {args.gaia_logs_dir}")
        return 1

    if not os.path.exists(args.gaia_text_103_data):
        print(f"Error: GAIA-Text-103 data file not found: {args.gaia_text_103_data}")
        return 1

    print(f"Input GAIA logs directory: {args.gaia_logs_dir}")
    print(f"Output directory: {args.output_dir}")
    print(f"GAIA-Text-103 data file: {args.gaia_text_103_data}")
    print()

    try:
        # Initialize copier
        copier = GAIAtoText103Copier(args.gaia_text_103_data, args.output_dir)

        # Copy tasks
        copied_count = copier.copy_gaia_text_103_tasks(args.gaia_logs_dir)

        if copied_count == 0:
            print("No GAIA-Text-103 tasks found in the logs directory")
            return 0

        # Print summary
        copier.print_summary()

        return 0

    except Exception as e:
        print(f"Error: {e}")
        return 1
tool-transcribe - tool-reasoning - tool-reader max_turns: 20 # Maximum number of turns for main agent execution sub_agents: agent-browsing: tools: - tool-google-search - tool-vqa - tool-reader - tool-python max_turns: 20 # Settings for context management keep_tool_result: -1 context_compress_limit: 0 # Enable context compression (>0 = enabled, 0 = disabled). ================================================ FILE: apps/miroflow-agent/conf/agent/demo.yaml ================================================ # conf/agent/demo.yaml # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py defaults: - default - _self_ main_agent: tools: - search_and_scrape_webpage - jina_scrape_llm_summary - tool-python tool_blacklist: - [ "search_and_scrape_webpage", "sogou_search" ] - [ "tool-python", "download_file_from_sandbox_to_local" ] max_turns: 20 # Maximum number of turns for main agent execution sub_agents: # Settings for context management keep_tool_result: -1 context_compress_limit: 0 # Enable context compression (>0 = enabled, 0 = disabled). 
================================================ FILE: apps/miroflow-agent/conf/agent/mirothinker_1.7_keep5_max200.yaml ================================================ # conf/agent/mirothinker_1.7_keep5_max200.yaml # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py defaults: - default - _self_ main_agent: tools: - search_and_scrape_webpage - jina_scrape_llm_summary - tool-python tool_blacklist: - [ "search_and_scrape_webpage", "sogou_search" ] - [ "tool-python", "download_file_from_sandbox_to_local" ] max_turns: 200 # Maximum number of turns for main agent execution sub_agents: # Settings for context management keep_tool_result: 5 context_compress_limit: 5 # Enable context compression (>0 = enabled, 0 = disabled). retry_with_summary: False # default is true ================================================ FILE: apps/miroflow-agent/conf/agent/mirothinker_1.7_keep5_max300.yaml ================================================ # conf/agent/mirothinker_1.7_keep5_max300.yaml # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py defaults: - default - _self_ main_agent: tools: - search_and_scrape_webpage - jina_scrape_llm_summary - tool-python tool_blacklist: - [ "search_and_scrape_webpage", "sogou_search" ] - [ "tool-python", "download_file_from_sandbox_to_local" ] max_turns: 300 # Maximum number of turns for main agent execution sub_agents: # Settings for context management keep_tool_result: 5 context_compress_limit: 5 # Enable context compression (>0 = enabled, 0 = disabled). 
retry_with_summary: False # default is true ================================================ FILE: apps/miroflow-agent/conf/agent/mirothinker_v1.0.yaml ================================================ # conf/agent/mirothinker_v1.0.yaml # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py defaults: - default - _self_ main_agent: tools: - search_and_scrape_webpage - jina_scrape_llm_summary - tool-python tool_blacklist: - [ "search_and_scrape_webpage", "sogou_search" ] - [ "tool-python", "download_file_from_sandbox_to_local" ] max_turns: 600 # Maximum number of turns for main agent execution sub_agents: # Settings for context management keep_tool_result: -1 context_compress_limit: 0 # Enable context compression (>0 = enabled, 0 = disabled). ================================================ FILE: apps/miroflow-agent/conf/agent/mirothinker_v1.0_keep5.yaml ================================================ # conf/agent/mirothinker_v1.0_keep5.yaml # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py defaults: - default - _self_ main_agent: tools: - search_and_scrape_webpage - jina_scrape_llm_summary - tool-python tool_blacklist: - [ "search_and_scrape_webpage", "sogou_search" ] - [ "tool-python", "download_file_from_sandbox_to_local" ] max_turns: 600 # Maximum number of turns for main agent execution sub_agents: # Settings for context management keep_tool_result: 5 context_compress_limit: 5 # Enable context compression (>0 = enabled, 0 = disabled). 
================================================ FILE: apps/miroflow-agent/conf/agent/mirothinker_v1.5.yaml ================================================ # conf/agent/mirothinker_v1.5.yaml # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py defaults: - default - _self_ main_agent: tools: - search_and_scrape_webpage - jina_scrape_llm_summary - tool-python tool_blacklist: - [ "search_and_scrape_webpage", "sogou_search" ] - [ "tool-python", "download_file_from_sandbox_to_local" ] max_turns: 600 # Maximum number of turns for main agent execution sub_agents: # Settings for context management keep_tool_result: -1 context_compress_limit: 0 # Enable context compression (>0 = enabled, 0 = disabled). ================================================ FILE: apps/miroflow-agent/conf/agent/mirothinker_v1.5_keep5_max200.yaml ================================================ # conf/agent/mirothinker_v1.5_keep5_max200.yaml # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py defaults: - default - _self_ main_agent: tools: - search_and_scrape_webpage - jina_scrape_llm_summary - tool-python tool_blacklist: - [ "search_and_scrape_webpage", "sogou_search" ] - [ "tool-python", "download_file_from_sandbox_to_local" ] max_turns: 200 # Maximum number of turns for main agent execution sub_agents: # Settings for context management keep_tool_result: 5 context_compress_limit: 5 # Enable context compression (>0 = enabled, 0 = disabled). 
================================================ FILE: apps/miroflow-agent/conf/agent/mirothinker_v1.5_keep5_max400.yaml ================================================ # conf/agent/mirothinker_v1.5_keep5_max400.yaml # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py defaults: - default - _self_ main_agent: tools: - search_and_scrape_webpage - jina_scrape_llm_summary - tool-python tool_blacklist: - [ "search_and_scrape_webpage", "sogou_search" ] - [ "tool-python", "download_file_from_sandbox_to_local" ] max_turns: 400 # Maximum number of turns for main agent execution sub_agents: # Settings for context management keep_tool_result: 5 context_compress_limit: 5 # Enable context compression (>0 = enabled, 0 = disabled). ================================================ FILE: apps/miroflow-agent/conf/agent/multi_agent.yaml ================================================ # conf/agent/multi_agent.yaml # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py defaults: - default - _self_ main_agent: tools: - tool-python - tool-vqa - tool-transcribe - tool-reasoning - tool-reader max_turns: 50 # Maximum number of turns for main agent execution sub_agents: agent-browsing: tools: - tool-google-search - tool-vqa - tool-reader - tool-python max_turns: 50 # Settings for context management keep_tool_result: -1 context_compress_limit: 0 # Enable context compression (>0 = enabled, 0 = disabled). 
================================================ FILE: apps/miroflow-agent/conf/agent/multi_agent_os.yaml ================================================ # conf/agent/multi_agent_os.yaml # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py defaults: - default - _self_ main_agent: tools: - tool-python - tool-vqa-os - tool-transcribe-os - tool-reasoning-os - tool-reader max_turns: 50 # Maximum number of turns for main agent execution sub_agents: agent-browsing: tools: - tool-google-search - tool-vqa-os - tool-reader - tool-python max_turns: 50 # Settings for context management keep_tool_result: -1 context_compress_limit: 0 # Enable context compression (>0 = enabled, 0 = disabled). ================================================ FILE: apps/miroflow-agent/conf/agent/single_agent.yaml ================================================ # conf/agent/single_agent.yaml # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py defaults: - default - _self_ main_agent: tools: - search_and_scrape_webpage - jina_scrape_llm_summary - tool-python tool_blacklist: - [ "search_and_scrape_webpage", "sogou_search" ] - [ "tool-python", "download_file_from_sandbox_to_local" ] max_turns: 600 # Maximum number of turns for main agent execution sub_agents: # Settings for context management keep_tool_result: -1 context_compress_limit: 0 # Enable context compression (>0 = enabled, 0 = disabled). 
================================================ FILE: apps/miroflow-agent/conf/agent/single_agent_keep5.yaml ================================================ # conf/agent/single_agent_keep5.yaml # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py defaults: - default - _self_ main_agent: tools: - search_and_scrape_webpage - jina_scrape_llm_summary - tool-python tool_blacklist: - [ "search_and_scrape_webpage", "sogou_search" ] - [ "tool-python", "download_file_from_sandbox_to_local" ] max_turns: 600 # Maximum number of turns for main agent execution sub_agents: # Settings for context management keep_tool_result: 5 context_compress_limit: 5 # Enable context compression (>0 = enabled, 0 = disabled). ================================================ FILE: apps/miroflow-agent/conf/benchmark/aime2025.yaml ================================================ # conf/benchmark/aime2025.yaml defaults: - default - _self_ name: "aime2025" data: data_dir: "../../data/aime2025" execution: max_tasks: null # null means no limit max_concurrent: 5 pass_at_k: 1 ================================================ FILE: apps/miroflow-agent/conf/benchmark/browsecomp.yaml ================================================ # conf/benchmark/browsecomp.yaml defaults: - default - _self_ name: "browsecomp" data: data_dir: "../../data/browsecomp" execution: max_tasks: null # null means no limit max_concurrent: 5 pass_at_k: 1 ================================================ FILE: apps/miroflow-agent/conf/benchmark/browsecomp_zh.yaml ================================================ # conf/benchmark/browsecomp_zh.yaml defaults: - default - _self_ name: "browsecomp_zh" data: data_dir: "../../data/browsecomp_zh" execution: max_tasks: null # null means no limit max_concurrent: 5 pass_at_k: 1 ================================================ FILE: 
apps/miroflow-agent/conf/benchmark/collect_trace.yaml ================================================ # conf/benchmark/collect_trace.yaml defaults: - default - _self_ name: "collect_trace" data: data_dir: "../../data/debug" execution: max_tasks: null # null means no limit max_concurrent: 5 pass_at_k: 1 ================================================ FILE: apps/miroflow-agent/conf/benchmark/debug.yaml ================================================ # conf/benchmark/debug.yaml defaults: - default - _self_ name: "debug" data: data_dir: "../../data/debug" execution: max_tasks: null # null means no limit max_concurrent: 5 pass_at_k: 1 ================================================ FILE: apps/miroflow-agent/conf/benchmark/deepsearchqa.yaml ================================================ # conf/benchmark/deepsearchqa.yaml defaults: - default - _self_ name: "deepsearchqa" data: data_dir: "../../data/deepsearchqa" execution: max_tasks: null # null means no limit max_concurrent: 5 pass_at_k: 1 ================================================ FILE: apps/miroflow-agent/conf/benchmark/default.yaml ================================================ # conf/benchmark/default.yaml - Default benchmark configuration # This is a base configuration for benchmarks. Specific benchmarks can override these defaults. 
name: "default" data: metadata_file: "standardized_data.jsonl" field_mapping: task_id_field: "task_id" task_question_field: "task_question" ground_truth_field: "ground_truth" file_name_field: "file_name" execution: max_tasks: null # null means no limit max_concurrent: 5 pass_at_k: 1 ================================================ FILE: apps/miroflow-agent/conf/benchmark/frames.yaml ================================================ # conf/benchmark/frames.yaml defaults: - default - _self_ name: "frames" data: data_dir: "../../data/frames" execution: max_tasks: null # null means no limit max_concurrent: 5 pass_at_k: 1 ================================================ FILE: apps/miroflow-agent/conf/benchmark/futurex.yaml ================================================ # conf/benchmark/futurex.yaml defaults: - default - _self_ name: "futurex" data: data_dir: "../../data/futurex" execution: max_tasks: null # null means no limit max_concurrent: 5 pass_at_k: 1 ================================================ FILE: apps/miroflow-agent/conf/benchmark/gaia-validation-text-103.yaml ================================================ # conf/benchmark/gaia-validation-text-103.yaml defaults: - default - _self_ name: "gaia-validation-text-103" data: data_dir: "../../data/gaia-2023-validation-text-103" execution: max_tasks: null # null means no limit max_concurrent: 5 pass_at_k: 1 ================================================ FILE: apps/miroflow-agent/conf/benchmark/gaia-validation.yaml ================================================ # conf/benchmark/gaia-validation.yaml defaults: - default - _self_ name: "gaia-validation" data: data_dir: "../../data/gaia-2023-validation" execution: max_tasks: null # null means no limit max_concurrent: 5 pass_at_k: 1 ================================================ FILE: apps/miroflow-agent/conf/benchmark/hle-text-2158.yaml ================================================ # conf/benchmark/hle-text-2158.yaml defaults: - default - _self_ name: 
"hle-text-2158" data: data_dir: "../../data/hle-text-2158" execution: max_tasks: null # null means no limit max_concurrent: 5 pass_at_k: 1 ================================================ FILE: apps/miroflow-agent/conf/benchmark/hle-text-500.yaml ================================================ # conf/benchmark/hle-text-500.yaml defaults: - default - _self_ name: "hle-text-500" data: data_dir: "../../data/hle-text-500" execution: max_tasks: null # null means no limit max_concurrent: 5 pass_at_k: 1 ================================================ FILE: apps/miroflow-agent/conf/benchmark/hle.yaml ================================================ # conf/benchmark/hle.yaml defaults: - default - _self_ name: "hle" data: data_dir: "../../data/hle" execution: max_tasks: null # null means no limit max_concurrent: 5 pass_at_k: 1 ================================================ FILE: apps/miroflow-agent/conf/benchmark/seal-0.yaml ================================================ # conf/benchmark/seal-0.yaml defaults: - default - _self_ name: "seal-0" data: data_dir: "../../data/seal-0" execution: max_tasks: null # null means no limit max_concurrent: 5 pass_at_k: 1 ================================================ FILE: apps/miroflow-agent/conf/benchmark/webwalkerqa.yaml ================================================ # conf/benchmark/webwalkerqa.yaml defaults: - default - _self_ name: "webwalkerqa" data: data_dir: "../../data/webwalkerqa" execution: max_tasks: null # null means no limit max_concurrent: 5 pass_at_k: 1 ================================================ FILE: apps/miroflow-agent/conf/benchmark/xbench_deepsearch.yaml ================================================ # conf/benchmark/xbench_deepsearch.yaml defaults: - default - _self_ name: "xbench_deepsearch" data: data_dir: "../../data/xbench_deepsearch" execution: max_tasks: null # null means no limit max_concurrent: 5 pass_at_k: 1 ================================================ FILE: 
apps/miroflow-agent/conf/config.yaml ================================================ # conf/config.yaml defaults: - llm: default - agent: default - benchmark: default - _self_ # Allows variables to be defined at the top of this file hydra: run: dir: ../../logs/debug # You can define some top-level or default parameters here project_name: "miroflow-agent" debug_dir: "../../logs/debug" ================================================ FILE: apps/miroflow-agent/conf/llm/claude-3-7.yaml ================================================ # conf/llm/claude-3-7.yaml defaults: - default - _self_ provider: "anthropic" model_name: "claude-3-7-sonnet-20250219" base_url: https://api.anthropic.com max_context_length: 65536 ================================================ FILE: apps/miroflow-agent/conf/llm/default.yaml ================================================ # conf/llm/default.yaml - Default LLM configuration provider: "anthropic" # openai, anthropic, qwen model_name: "claude-3-7-sonnet-20250219" async_client: false temperature: 0.3 top_p: 1.0 min_p: 0.0 top_k: -1 max_tokens: 4096 api_key: "" base_url: https://api.anthropic.com repetition_penalty: 1.0 ================================================ FILE: apps/miroflow-agent/conf/llm/gpt-5.yaml ================================================ # conf/llm/gpt-5.yaml defaults: - default - _self_ provider: "openai" model_name: "gpt-5-2025-08-07" base_url: https://api.openai.com/v1 max_context_length: 65536 ================================================ FILE: apps/miroflow-agent/conf/llm/qwen-3.yaml ================================================ # conf/llm/qwen-3.yaml defaults: - default - _self_ provider: "qwen" model_name: "qwen-3" base_url: "https://your-api.com/v1" max_context_length: 262144 max_tokens: 16384 top_p: 0.95 repetition_penalty: 1.05 temperature: 1.0 ================================================ FILE: apps/miroflow-agent/main.py ================================================ # Copyright (c) 2025 
MiroMind # This source code is licensed under the Apache 2.0 License. import asyncio import hydra from omegaconf import DictConfig, OmegaConf # Import from the new modular structure from src.core.pipeline import ( create_pipeline_components, execute_task_pipeline, ) from src.logging.task_logger import bootstrap_logger # Configure logger and get the configured instance logger = bootstrap_logger() async def amain(cfg: DictConfig) -> None: """Asynchronous main function.""" logger.info(OmegaConf.to_yaml(cfg)) # Create pipeline components using the factory function main_agent_tool_manager, sub_agent_tool_managers, output_formatter = ( create_pipeline_components(cfg) ) # Define task parameters task_id = "task_example" task_description = "What is the title of today's arxiv paper in computer science?" task_file_name = "" # Execute task using the pipeline final_summary, final_boxed_answer, log_file_path, _ = await execute_task_pipeline( cfg=cfg, task_id=task_id, task_file_name=task_file_name, task_description=task_description, main_agent_tool_manager=main_agent_tool_manager, sub_agent_tool_managers=sub_agent_tool_managers, output_formatter=output_formatter, log_dir=cfg.debug_dir, ) @hydra.main(config_path="conf", config_name="config", version_base=None) def main(cfg: DictConfig) -> None: asyncio.run(amain(cfg)) if __name__ == "__main__": main() ================================================ FILE: apps/miroflow-agent/pyproject.toml ================================================ [project] name = "miroflow-agent" version = "0.1.0" description = "An agent framework for complex task solving with LLM and MCP tools" readme = "README.md" requires-python = ">=3.12" dependencies = [ "miroflow-tools>=0.1.0", "huggingface-hub>=0.28.0", "requests>=2.32.3", "rich>=13.9.4", "jinja2>=3.1.4", "pillow>=11.0.0", "markdownify>=0.14.1", "duckduckgo-search>=6.3.7", "python-dotenv", "pdfminer-six", "python-pptx", "puremagic", "pydub", "SpeechRecognition", "youtube_transcript_api", "mcp", 
"fastmcp", "anthropic", "e2b-code-interpreter==1.2.1", "jsonlines>=4.0.0", "mammoth>=1.9.0", "numpy>=2.2.5", "ipdb>=0.13.13", "datasets>=3.5.0", "openpyxl>=3.1.5", "markitdown-mcp>=0.0.1a3", "markitdown>=0.1.1", "regex>=2024.11.6", "openai>=1.78.1", "tenacity>=9.1.2", "transformers>=4.51.3", "omegaconf>=2.3.0", "wikipedia", "mutagen", "hydra-core", "google-genai", "tiktoken>=0.9.0", "aiohttp", "colorama>=0.4.6", "json-repair>=0.49.0", "tencentcloud-sdk-python>=3.0.1451" ] [build-system] requires = ["hatchling"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] packages = ["src"] [tool.uv.sources] miroflow-tools = { path = "../../libs/miroflow-tools", editable = true } [dependency-groups] dev = [ "inline-snapshot>=0.23.2", "pyright>=1.1.403", "pytest>=8.4.1", "pytest-asyncio>=1.0.0", "pytest-cov>=6.2.1", "pytest-html>=4.1.1", "pytest-xdist>=3.7.0", "ty>=0.0.1a14", ] [tool.pytest.ini_options] # see https://docs.pytest.org/en/stable/reference/customize.html#pyproject-toml minversion = "8.3.5" testpaths = ["tests"] # make warning go away # https://github.com/pytest-dev/pytest-asyncio/issues/924#issuecomment-2321921915 asyncio_default_fixture_loop_scope = "function" addopts = [ # summary for failed AND passed tests "-rA", # only show stderr for test. 
stdlog can contain sensitive information "--show-capture=stderr", # use `pytest-xdist` to run tests in parallel "-n=auto", # use `pytest-html` to generate test report in html format "--html=report.html", "--self-contained-html", # use `pytest-testmon` to run tests on changed files only # "--testmon", # use `pytest-cov` to generate test coverage report "--cov=miroflow_agent", "--cov-report=html", ] ================================================ FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_aime2025.sh ================================================ #!/bin/bash # Parse environment variables, use defaults if not set LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} BASE_URL=${BASE_URL:-"https://your-api.com/v1"} # Configuration parameters NUM_RUNS=${NUM_RUNS:-32} BENCHMARK_NAME="aime2025" LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} AGENT_SET=${AGENT_SET:-"single_agent_keep5"} MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} MAX_CONCURRENT=${MAX_CONCURRENT:-10} PASS_AT_K=${PASS_AT_K:-1} TEMPERATURE=${TEMPERATURE:-1.0} API_KEY=${API_KEY:-"xxx"} # Set results directory RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" echo "Starting $NUM_RUNS runs of the evaluation..." 
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all runs in parallel; each run executes in its own background subshell.
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        # Override values are quoted so that names/URLs containing spaces or
        # glob characters are passed to the program as single arguments.
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/aime2025 \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # `$?` after a pipeline is the status of the LAST command (tee), which
        # is almost always 0. PIPESTATUS[0] is the benchmark's own exit code
        # and must be read immediately after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" echo "==========================================" echo "Multiple runs evaluation completed!" echo "Check results in: $RESULTS_DIR" echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" echo "==========================================" ================================================ FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_browsecomp.sh ================================================ #!/bin/bash # Parse environment variables, use defaults if not set LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} BASE_URL=${BASE_URL:-"https://your-api.com/v1"} # Configuration parameters NUM_RUNS=${NUM_RUNS:-3} BENCHMARK_NAME="browsecomp" LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} AGENT_SET=${AGENT_SET:-"single_agent_keep5"} MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} MAX_CONCURRENT=${MAX_CONCURRENT:-10} PASS_AT_K=${PASS_AT_K:-1} TEMPERATURE=${TEMPERATURE:-1.0} API_KEY=${API_KEY:-"xxx"} # Set results directory RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" echo "Starting $NUM_RUNS runs of the evaluation..." 
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all runs in parallel; each run executes in its own background subshell.
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        # Override values are quoted so that names/URLs containing spaces or
        # glob characters are passed to the program as single arguments.
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/browsecomp \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # `$?` after a pipeline is the status of the LAST command (tee), which
        # is almost always 0. PIPESTATUS[0] is the benchmark's own exit code
        # and must be read immediately after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" echo "==========================================" echo "Multiple runs evaluation completed!" echo "Check results in: $RESULTS_DIR" echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" echo "==========================================" ================================================ FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_browsecomp_zh.sh ================================================ #!/bin/bash # Parse environment variables, use defaults if not set LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} BASE_URL=${BASE_URL:-"https://your-api.com/v1"} # Configuration parameters NUM_RUNS=${NUM_RUNS:-3} BENCHMARK_NAME="browsecomp_zh" LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} AGENT_SET=${AGENT_SET:-"single_agent_keep5"} MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} MAX_CONCURRENT=${MAX_CONCURRENT:-10} PASS_AT_K=${PASS_AT_K:-1} TEMPERATURE=${TEMPERATURE:-1.0} API_KEY=${API_KEY:-"xxx"} # Set results directory RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" echo "Starting $NUM_RUNS runs of the evaluation..." 
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all runs in parallel; each run executes in its own background subshell.
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        # Override values are quoted so that names/URLs containing spaces or
        # glob characters are passed to the program as single arguments.
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/browsecomp_zh \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # `$?` after a pipeline is the status of the LAST command (tee), which
        # is almost always 0. PIPESTATUS[0] is the benchmark's own exit code
        # and must be read immediately after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" echo "==========================================" echo "Multiple runs evaluation completed!" echo "Check results in: $RESULTS_DIR" echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" echo "==========================================" ================================================ FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_debug.sh ================================================ #!/bin/bash # Parse environment variables, use defaults if not set LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} BASE_URL=${BASE_URL:-"https://your-api.com/v1"} # Configuration parameters NUM_RUNS=${NUM_RUNS:-1} BENCHMARK_NAME="debug" LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} AGENT_SET=${AGENT_SET:-"single_agent_keep5"} MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} MAX_CONCURRENT=${MAX_CONCURRENT:-1} PASS_AT_K=${PASS_AT_K:-1} TEMPERATURE=${TEMPERATURE:-1.0} API_KEY=${API_KEY:-"xxx"} # Set results directory RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" echo "Starting $NUM_RUNS runs of the evaluation..." 
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all runs in parallel; each run executes in its own background subshell.
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        # Override values are quoted so that names/URLs containing spaces or
        # glob characters are passed to the program as single arguments.
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/debug \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # `$?` after a pipeline is the status of the LAST command (tee), which
        # is almost always 0. PIPESTATUS[0] is the benchmark's own exit code
        # and must be read immediately after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" echo "==========================================" echo "Multiple runs evaluation completed!" echo "Check results in: $RESULTS_DIR" echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" echo "==========================================" ================================================ FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_deepsearchqa.sh ================================================ #!/bin/bash # Parse environment variables, use defaults if not set LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} BASE_URL=${BASE_URL:-"https://your-api.com/v1"} # Configuration parameters NUM_RUNS=${NUM_RUNS:-3} BENCHMARK_NAME="deepsearchqa" LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} AGENT_SET=${AGENT_SET:-"single_agent_keep5"} MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} MAX_CONCURRENT=${MAX_CONCURRENT:-10} PASS_AT_K=${PASS_AT_K:-1} TEMPERATURE=${TEMPERATURE:-1.0} API_KEY=${API_KEY:-"xxx"} # Set results directory RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" echo "Starting $NUM_RUNS runs of the evaluation..." 
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all runs in parallel; each run executes in its own background subshell.
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        # Override values are quoted so that names/URLs containing spaces or
        # glob characters are passed to the program as single arguments.
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/deepsearchqa \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # `$?` after a pipeline is the status of the LAST command (tee), which
        # is almost always 0. PIPESTATUS[0] is the benchmark's own exit code
        # and must be read immediately after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" echo "==========================================" echo "Multiple runs evaluation completed!" echo "Check results in: $RESULTS_DIR" echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" echo "==========================================" ================================================ FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_frames.sh ================================================ #!/bin/bash # Parse environment variables, use defaults if not set LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} BASE_URL=${BASE_URL:-"https://your-api.com/v1"} # Configuration parameters NUM_RUNS=${NUM_RUNS:-3} BENCHMARK_NAME="frames" LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} AGENT_SET=${AGENT_SET:-"single_agent_keep5"} MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} MAX_CONCURRENT=${MAX_CONCURRENT:-10} PASS_AT_K=${PASS_AT_K:-1} TEMPERATURE=${TEMPERATURE:-1.0} API_KEY=${API_KEY:-"xxx"} # Set results directory RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" echo "Starting $NUM_RUNS runs of the evaluation..." 
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all runs in parallel; each run executes in its own background subshell.
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        # Override values are quoted so that names/URLs containing spaces or
        # glob characters are passed to the program as single arguments.
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/frames \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # `$?` after a pipeline is the status of the LAST command (tee), which
        # is almost always 0. PIPESTATUS[0] is the benchmark's own exit code
        # and must be read immediately after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" echo "==========================================" echo "Multiple runs evaluation completed!" echo "Check results in: $RESULTS_DIR" echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" echo "==========================================" ================================================ FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_futurex.sh ================================================ #!/bin/bash # Parse environment variables, use defaults if not set LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} BASE_URL=${BASE_URL:-"https://your-api.com/v1"} # Configuration parameters NUM_RUNS=${NUM_RUNS:-8} BENCHMARK_NAME="futurex" LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} AGENT_SET=${AGENT_SET:-"single_agent_keep5"} MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} MAX_CONCURRENT=${MAX_CONCURRENT:-10} PASS_AT_K=${PASS_AT_K:-1} TEMPERATURE=${TEMPERATURE:-1.0} API_KEY=${API_KEY:-"xxx"} # Set results directory RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" echo "Starting $NUM_RUNS runs of the evaluation..." 
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all runs in parallel; each run executes in its own background subshell.
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        # Override values are quoted so that names/URLs containing spaces or
        # glob characters are passed to the program as single arguments.
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data_250924_250930.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/futurex \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # `$?` after a pipeline is the status of the LAST command (tee), which
        # is almost always 0. PIPESTATUS[0] is the benchmark's own exit code
        # and must be read immediately after the pipeline.
        run_status=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$run_status" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
# echo "Calculating average scores..."
# uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" echo "Extracting predictions and formatting for FutureX submission..." uv run python benchmarks/evaluators/extract_futurex_results.py "$RESULTS_DIR" # Check status and provide user-friendly message if [ $? -eq 0 ]; then echo "✅ Submission file generated: $RESULTS_DIR/futurex_submission.jsonl" echo "You can now upload this file to the FutureX test server." else echo "❌ Failed to generate submission file. Please check the logs for details." fi echo "==========================================" echo "Multiple runs evaluation completed!" echo "Check results in: $RESULTS_DIR" echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" echo "==========================================" ================================================ FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh ================================================ #!/bin/bash # Parse environment variables, use defaults if not set LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} BASE_URL=${BASE_URL:-"https://your-api.com/v1"} # Configuration parameters NUM_RUNS=${NUM_RUNS:-8} BENCHMARK_NAME="gaia-validation-text-103" LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} AGENT_SET=${AGENT_SET:-"single_agent_keep5"} MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} MAX_CONCURRENT=${MAX_CONCURRENT:-10} PASS_AT_K=${PASS_AT_K:-1} TEMPERATURE=${TEMPERATURE:-1.0} API_KEY=${API_KEY:-"xxx"} # Set results directory RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" echo "Starting $NUM_RUNS runs of the evaluation..." 
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/gaia-2023-validation-text-103 \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # After a pipeline, $? is the exit status of the LAST command (tee),
        # which is almost always 0. PIPESTATUS[0] is the benchmark's own status
        # and must be read immediately, before any other command runs.
        RUN_STATUS=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$RUN_STATUS" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" echo "==========================================" echo "Multiple runs evaluation completed!" echo "Check results in: $RESULTS_DIR" echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" echo "==========================================" ================================================ FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_gaia-validation.sh ================================================ #!/bin/bash # Parse environment variables, use defaults if not set LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} BASE_URL=${BASE_URL:-"https://your-api.com/v1"} # Configuration parameters NUM_RUNS=${NUM_RUNS:-8} BENCHMARK_NAME="gaia-validation" LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} AGENT_SET=${AGENT_SET:-"single_agent_keep5"} MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} MAX_CONCURRENT=${MAX_CONCURRENT:-10} PASS_AT_K=${PASS_AT_K:-1} TEMPERATURE=${TEMPERATURE:-1.0} API_KEY=${API_KEY:-"xxx"} # Set results directory RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" echo "Starting $NUM_RUNS runs of the evaluation..." 
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/gaia-2023-validation \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # After a pipeline, $? is the exit status of the LAST command (tee),
        # which is almost always 0. PIPESTATUS[0] is the benchmark's own status
        # and must be read immediately, before any other command runs.
        RUN_STATUS=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$RUN_STATUS" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" echo "==========================================" echo "Multiple runs evaluation completed!" echo "Check results in: $RESULTS_DIR" echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" echo "==========================================" ================================================ FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle-text-2158.sh ================================================ #!/bin/bash # Parse environment variables, use defaults if not set LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} BASE_URL=${BASE_URL:-"https://your-api.com/v1"} # Configuration parameters NUM_RUNS=${NUM_RUNS:-3} BENCHMARK_NAME="hle-text-2158" LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} AGENT_SET=${AGENT_SET:-"single_agent_keep5"} MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} MAX_CONCURRENT=${MAX_CONCURRENT:-10} PASS_AT_K=${PASS_AT_K:-1} TEMPERATURE=${TEMPERATURE:-1.0} API_KEY=${API_KEY:-"xxx"} # Set results directory RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" echo "Starting $NUM_RUNS runs of the evaluation..." 
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data_original.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/hle-text-2158 \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # After a pipeline, $? is the exit status of the LAST command (tee),
        # which is almost always 0. PIPESTATUS[0] is the benchmark's own status
        # and must be read immediately, before any other command runs.
        RUN_STATUS=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$RUN_STATUS" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" echo "==========================================" echo "Multiple runs evaluation completed!" echo "Check results in: $RESULTS_DIR" echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" echo "==========================================" ================================================ FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle-text-500.sh ================================================ #!/bin/bash # Parse environment variables, use defaults if not set LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} BASE_URL=${BASE_URL:-"https://your-api.com/v1"} # Configuration parameters NUM_RUNS=${NUM_RUNS:-3} BENCHMARK_NAME="hle-text-500" LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} AGENT_SET=${AGENT_SET:-"single_agent_keep5"} MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} MAX_CONCURRENT=${MAX_CONCURRENT:-10} PASS_AT_K=${PASS_AT_K:-1} TEMPERATURE=${TEMPERATURE:-1.0} API_KEY=${API_KEY:-"xxx"} # Set results directory RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" echo "Starting $NUM_RUNS runs of the evaluation..." 
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data_original.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/hle-text-500 \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # After a pipeline, $? is the exit status of the LAST command (tee),
        # which is almost always 0. PIPESTATUS[0] is the benchmark's own status
        # and must be read immediately, before any other command runs.
        RUN_STATUS=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$RUN_STATUS" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" echo "==========================================" echo "Multiple runs evaluation completed!" echo "Check results in: $RESULTS_DIR" echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" echo "==========================================" ================================================ FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle.sh ================================================ #!/bin/bash # Parse environment variables, use defaults if not set LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} BASE_URL=${BASE_URL:-"https://your-api.com/v1"} # Configuration parameters NUM_RUNS=${NUM_RUNS:-3} BENCHMARK_NAME="hle" LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} AGENT_SET=${AGENT_SET:-"single_agent_keep5"} MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} MAX_CONCURRENT=${MAX_CONCURRENT:-10} PASS_AT_K=${PASS_AT_K:-1} TEMPERATURE=${TEMPERATURE:-1.0} API_KEY=${API_KEY:-"xxx"} # Set results directory RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" echo "Starting $NUM_RUNS runs of the evaluation..." 
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/hle \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # After a pipeline, $? is the exit status of the LAST command (tee),
        # which is almost always 0. PIPESTATUS[0] is the benchmark's own status
        # and must be read immediately, before any other command runs.
        RUN_STATUS=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$RUN_STATUS" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" echo "==========================================" echo "Multiple runs evaluation completed!" echo "Check results in: $RESULTS_DIR" echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" echo "==========================================" ================================================ FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_seal-0.sh ================================================ #!/bin/bash # Parse environment variables, use defaults if not set LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} BASE_URL=${BASE_URL:-"https://your-api.com/v1"} # Configuration parameters NUM_RUNS=${NUM_RUNS:-8} BENCHMARK_NAME="seal-0" LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} AGENT_SET=${AGENT_SET:-"single_agent_keep5"} MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} MAX_CONCURRENT=${MAX_CONCURRENT:-10} PASS_AT_K=${PASS_AT_K:-1} TEMPERATURE=${TEMPERATURE:-1.0} API_KEY=${API_KEY:-"xxx"} # Set results directory RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" echo "Starting $NUM_RUNS runs of the evaluation..." 
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/seal-0 \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # After a pipeline, $? is the exit status of the LAST command (tee),
        # which is almost always 0. PIPESTATUS[0] is the benchmark's own status
        # and must be read immediately, before any other command runs.
        RUN_STATUS=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$RUN_STATUS" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" echo "==========================================" echo "Multiple runs evaluation completed!" echo "Check results in: $RESULTS_DIR" echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" echo "==========================================" ================================================ FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_webwalkerqa.sh ================================================ #!/bin/bash # Parse environment variables, use defaults if not set LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} BASE_URL=${BASE_URL:-"https://your-api.com/v1"} # Configuration parameters NUM_RUNS=${NUM_RUNS:-3} BENCHMARK_NAME="webwalkerqa" LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} AGENT_SET=${AGENT_SET:-"single_agent_keep5"} MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} MAX_CONCURRENT=${MAX_CONCURRENT:-10} PASS_AT_K=${PASS_AT_K:-1} TEMPERATURE=${TEMPERATURE:-1.0} API_KEY=${API_KEY:-"xxx"} # Set results directory RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" echo "Starting $NUM_RUNS runs of the evaluation..." 
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/webwalkerqa \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # After a pipeline, $? is the exit status of the LAST command (tee),
        # which is almost always 0. PIPESTATUS[0] is the benchmark's own status
        # and must be read immediately, before any other command runs.
        RUN_STATUS=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$RUN_STATUS" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" echo "==========================================" echo "Multiple runs evaluation completed!" echo "Check results in: $RESULTS_DIR" echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" echo "==========================================" ================================================ FILE: apps/miroflow-agent/scripts/run_evaluate_multiple_runs_xbench_deepsearch.sh ================================================ #!/bin/bash # Parse environment variables, use defaults if not set LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} BASE_URL=${BASE_URL:-"https://your-api.com/v1"} # Configuration parameters NUM_RUNS=${NUM_RUNS:-8} BENCHMARK_NAME="xbench_deepsearch" LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} AGENT_SET=${AGENT_SET:-"single_agent_keep5"} MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} MAX_CONCURRENT=${MAX_CONCURRENT:-10} PASS_AT_K=${PASS_AT_K:-1} TEMPERATURE=${TEMPERATURE:-1.0} API_KEY=${API_KEY:-"xxx"} # Set results directory RESULTS_DIR="../../logs/${BENCHMARK_NAME}/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" echo "Starting $NUM_RUNS runs of the evaluation..." 
echo "Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 "$NUM_RUNS"); do
    echo "=========================================="
    echo "Launching experiment $i/$NUM_RUNS"
    echo "Output log: please view $RESULTS_DIR/run_${i}_output.log"
    echo "=========================================="

    # Set specific identifier for this run
    RUN_ID="run_$i"

    # Run experiment (background execution)
    (
        uv run python benchmarks/common_benchmark.py \
            benchmark="$BENCHMARK_NAME" \
            benchmark.data.metadata_file="standardized_data.jsonl" \
            llm=qwen-3 \
            llm.provider="$LLM_PROVIDER" \
            llm.model_name="$LLM_MODEL" \
            llm.base_url="$BASE_URL" \
            llm.async_client=true \
            llm.temperature="$TEMPERATURE" \
            llm.max_context_length="$MAX_CONTEXT_LENGTH" \
            llm.api_key="$API_KEY" \
            benchmark.execution.max_tasks=null \
            benchmark.execution.max_concurrent="$MAX_CONCURRENT" \
            benchmark.execution.pass_at_k="$PASS_AT_K" \
            benchmark.data.data_dir=../../data/xbench_deepsearch \
            agent="$AGENT_SET" \
            hydra.run.dir="${RESULTS_DIR}/$RUN_ID" \
            2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
        # After a pipeline, $? is the exit status of the LAST command (tee),
        # which is almost always 0. PIPESTATUS[0] is the benchmark's own status
        # and must be read immediately, before any other command runs.
        RUN_STATUS=${PIPESTATUS[0]}

        # Check if run was successful
        if [ "$RUN_STATUS" -eq 0 ]; then
            echo "Run $i completed successfully"
            RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
            if [ -f "$RESULT_FILE" ]; then
                echo "Results saved to $RESULT_FILE"
            else
                echo "Warning: Result file not found for run $i"
            fi
        else
            echo "Run $i failed!"
        fi
    ) &

    # Small delay between launches to avoid simultaneous requests
    sleep 2
done

echo "All $NUM_RUNS runs have been launched in parallel"
echo "Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "All $NUM_RUNS runs completed!"
echo "=========================================="

# Calculate average scores
echo "Calculating average scores..."
uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" echo "==========================================" echo "Multiple runs evaluation completed!" echo "Check results in: $RESULTS_DIR" echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" echo "==========================================" ================================================ FILE: apps/miroflow-agent/src/__init__.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. """MiroFlow Agent - A modular agent framework for task execution.""" from .core.orchestrator import Orchestrator from .core.pipeline import create_pipeline_components, execute_task_pipeline from .io.output_formatter import OutputFormatter from .llm.factory import ClientFactory from .logging.task_logger import TaskLog, bootstrap_logger __all__ = [ "Orchestrator", "create_pipeline_components", "execute_task_pipeline", "OutputFormatter", "ClientFactory", "TaskLog", "bootstrap_logger", ] ================================================ FILE: apps/miroflow-agent/src/config/__init__.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. """Configuration module for MiroFlow Agent.""" from .settings import ( create_mcp_server_parameters, expose_sub_agents_as_tools, get_env_info, ) __all__ = [ "create_mcp_server_parameters", "expose_sub_agents_as_tools", "get_env_info", ] ================================================ FILE: apps/miroflow-agent/src/config/settings.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. """ Configuration settings and MCP server parameter management. 
This module handles: - Loading environment variables for API keys and service URLs - Creating MCP server configurations for different tools - Exposing sub-agents as callable tools - Collecting environment information for logging """ import os import sys from dotenv import load_dotenv from mcp import StdioServerParameters from omegaconf import DictConfig # Load environment variables from .env file load_dotenv() # API for Google Search SERPER_API_KEY = os.environ.get("SERPER_API_KEY") SERPER_BASE_URL = os.environ.get("SERPER_BASE_URL", "https://google.serper.dev") # API for Web Scraping JINA_API_KEY = os.environ.get("JINA_API_KEY") JINA_BASE_URL = os.environ.get("JINA_BASE_URL", "https://r.jina.ai") # API for Linux Sandbox E2B_API_KEY = os.environ.get("E2B_API_KEY") # API for Open-Source Audio Transcription Tool WHISPER_BASE_URL = os.environ.get("WHISPER_BASE_URL") WHISPER_API_KEY = os.environ.get("WHISPER_API_KEY") WHISPER_MODEL_NAME = os.environ.get("WHISPER_MODEL_NAME") # API for Open-Source VQA Tool VISION_API_KEY = os.environ.get("VISION_API_KEY") VISION_BASE_URL = os.environ.get("VISION_BASE_URL") VISION_MODEL_NAME = os.environ.get("VISION_MODEL_NAME") # API for Open-Source Reasoning Tool REASONING_API_KEY = os.environ.get("REASONING_API_KEY") REASONING_BASE_URL = os.environ.get("REASONING_BASE_URL") REASONING_MODEL_NAME = os.environ.get("REASONING_MODEL_NAME") # API for Claude Sonnet 3.7 as Commercial Tools ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY") ANTHROPIC_BASE_URL = os.environ.get("ANTHROPIC_BASE_URL", "https://api.anthropic.com") # API Keys for Commercial Tools OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1") # API for Sogou Search TENCENTCLOUD_SECRET_ID = os.environ.get("TENCENTCLOUD_SECRET_ID") TENCENTCLOUD_SECRET_KEY = os.environ.get("TENCENTCLOUD_SECRET_KEY") # API for Summary LLM SUMMARY_LLM_API_KEY = os.environ.get("SUMMARY_LLM_API_KEY") 
SUMMARY_LLM_BASE_URL = os.environ.get("SUMMARY_LLM_BASE_URL") SUMMARY_LLM_MODEL_NAME = os.environ.get("SUMMARY_LLM_MODEL_NAME") # MCP server configuration generation function def create_mcp_server_parameters(cfg: DictConfig, agent_cfg: DictConfig): """ Create MCP server configurations based on agent configuration. Dynamically generates StdioServerParameters for each tool specified in the agent configuration. Each tool type (search, python, vqa, etc.) has its own MCP server with appropriate environment variables. Args: cfg: Global Hydra configuration object agent_cfg: Agent-specific configuration containing 'tools' and 'tool_blacklist' Returns: Tuple of (configs, blacklist) where: - configs: List of dicts with 'name' and 'params' (StdioServerParameters) - blacklist: Set of (server_name, tool_name) tuples to exclude """ configs = [] if ( agent_cfg.get("tools", None) is not None and "tool-google-search" in agent_cfg["tools"] ): if not SERPER_API_KEY: raise ValueError( "SERPER_API_KEY not set, tool-google-search will be unavailable." 
) configs.append( { "name": "tool-google-search", "params": StdioServerParameters( command=sys.executable, args=[ "-m", "miroflow_tools.mcp_servers.searching_google_mcp_server", ], env={ "SERPER_API_KEY": SERPER_API_KEY, "SERPER_BASE_URL": SERPER_BASE_URL, "JINA_API_KEY": JINA_API_KEY, "JINA_BASE_URL": JINA_BASE_URL, }, ), } ) if ( agent_cfg.get("tools", None) is not None and "tool-sogou-search" in agent_cfg["tools"] ): configs.append( { "name": "tool-sogou-search", "params": StdioServerParameters( command=sys.executable, args=[ "-m", "miroflow_tools.mcp_servers.searching_sogou_mcp_server", ], env={ "TENCENTCLOUD_SECRET_ID": TENCENTCLOUD_SECRET_ID, "TENCENTCLOUD_SECRET_KEY": TENCENTCLOUD_SECRET_KEY, "JINA_API_KEY": JINA_API_KEY, "JINA_BASE_URL": JINA_BASE_URL, }, ), } ) if agent_cfg.get("tools", None) is not None and "tool-python" in agent_cfg["tools"]: configs.append( { "name": "tool-python", "params": StdioServerParameters( command=sys.executable, args=["-m", "miroflow_tools.mcp_servers.python_mcp_server"], env={"E2B_API_KEY": E2B_API_KEY}, ), } ) if agent_cfg.get("tools", None) is not None and "tool-vqa" in agent_cfg["tools"]: configs.append( { "name": "tool-vqa", "params": StdioServerParameters( command=sys.executable, args=["-m", "miroflow_tools.mcp_servers.vision_mcp_server"], env={ "OPENAI_API_KEY": OPENAI_API_KEY, "OPENAI_BASE_URL": OPENAI_BASE_URL, }, ), } ) if agent_cfg.get("tools", None) is not None and "tool-vqa-os" in agent_cfg["tools"]: configs.append( { "name": "tool-vqa-os", "params": StdioServerParameters( command=sys.executable, args=["-m", "miroflow_tools.mcp_servers.vision_mcp_server_os"], env={ "VISION_API_KEY": VISION_API_KEY, "VISION_BASE_URL": VISION_BASE_URL, "VISION_MODEL_NAME": VISION_MODEL_NAME, }, ), } ) if ( agent_cfg.get("tools", None) is not None and "tool-transcribe" in agent_cfg["tools"] ): configs.append( { "name": "tool-transcribe", "params": StdioServerParameters( command=sys.executable, args=["-m", 
"miroflow_tools.mcp_servers.audio_mcp_server"], env={ "OPENAI_API_KEY": OPENAI_API_KEY, "OPENAI_BASE_URL": OPENAI_BASE_URL, }, ), } ) if ( agent_cfg.get("tools", None) is not None and "tool-transcribe-os" in agent_cfg["tools"] ): configs.append( { "name": "tool-transcribe-os", "params": StdioServerParameters( command=sys.executable, args=["-m", "miroflow_tools.mcp_servers.audio_mcp_server_os"], env={ "WHISPER_BASE_URL": WHISPER_BASE_URL, "WHISPER_API_KEY": WHISPER_API_KEY, "WHISPER_MODEL_NAME": WHISPER_MODEL_NAME, }, ), } ) if ( agent_cfg.get("tools", None) is not None and "tool-reasoning" in agent_cfg["tools"] ): configs.append( { "name": "tool-reasoning", "params": StdioServerParameters( command=sys.executable, args=[ "-m", "miroflow_tools.mcp_servers.reasoning_mcp_server", ], env={ "ANTHROPIC_API_KEY": ANTHROPIC_API_KEY, "ANTHROPIC_BASE_URL": ANTHROPIC_BASE_URL, }, ), } ) if ( agent_cfg.get("tools", None) is not None and "tool-reasoning-os" in agent_cfg["tools"] ): configs.append( { "name": "tool-reasoning-os", "params": StdioServerParameters( command=sys.executable, args=[ "-m", "miroflow_tools.mcp_servers.reasoning_mcp_server_os", ], env={ "REASONING_API_KEY": REASONING_API_KEY, "REASONING_BASE_URL": REASONING_BASE_URL, "REASONING_MODEL_NAME": REASONING_MODEL_NAME, }, ), } ) # reader if agent_cfg.get("tools", None) is not None and "tool-reader" in agent_cfg["tools"]: configs.append( { "name": "tool-reader", "params": StdioServerParameters( command=sys.executable, args=["-m", "markitdown_mcp"], ), } ) if ( agent_cfg.get("tools", None) is not None and "tool-reading" in agent_cfg["tools"] ): configs.append( { "name": "tool-reading", "params": StdioServerParameters( command=sys.executable, args=["-m", "miroflow_tools.mcp_servers.reading_mcp_server"], ), } ) if ( agent_cfg.get("tools", None) is not None and "search_and_scrape_webpage" in agent_cfg["tools"] ): configs.append( { "name": "search_and_scrape_webpage", "params": StdioServerParameters( 
def expose_sub_agents_as_tools(sub_agents_cfg: DictConfig):
    """
    Convert sub-agent configurations into tool definitions for the main agent.

    This allows the main agent to invoke sub-agents (like the browsing agent)
    as if they were regular MCP tools, enabling a hierarchical agent
    architecture. Only sub-agents whose key contains ``"agent-browsing"`` are
    exposed; every other key is ignored.

    Args:
        sub_agents_cfg: Mapping of sub-agent name to its configuration. Only
            the keys are inspected. ``None`` (no sub-agents configured) is
            treated as an empty mapping.

    Returns:
        List of server parameter dicts, each with 'name' and 'tools' keys.
        Each tool includes 'name', 'description', and 'schema' for the
        sub-agent. Empty when nothing matches.
    """
    # The sub-agent section of the config may be None when no sub-agents are
    # configured; expose nothing in that case instead of raising.
    if sub_agents_cfg is None:
        return []

    sub_agents_server_params = []
    for sub_agent in sub_agents_cfg:
        if "agent-browsing" in sub_agent:
            sub_agents_server_params.append(
                dict(
                    name="agent-browsing",
                    tools=[
                        dict(
                            name="search_and_browse",
                            description="This tool is an agent that performs the subtask of searching and browsing the web for specific missing information and generating the desired answer. The subtask should be clearly defined, include relevant background, and focus on factual gaps. It does not perform vague or speculative subtasks. \nArgs: \n\tsubtask: the subtask to be performed. \nReturns: \n\tthe result of the subtask. ",
                            schema={
                                "type": "object",
                                "properties": {
                                    "subtask": {"title": "Subtask", "type": "string"}
                                },
                                "required": ["subtask"],
                                "title": "search_and_browseArguments",
                            },
                        )
                    ],
                )
            )
    return sub_agents_server_params


def get_env_info(cfg: DictConfig) -> dict:
    """
    Collect current configuration and environment information for logging.

    Gathers LLM settings, agent configuration, API key availability (masked),
    and base URLs. Used for debugging and task log enrichment.

    Args:
        cfg: Hydra configuration object. Its ``llm`` and ``agent`` sections
            are read directly, so the attributes referenced below must exist.

    Returns:
        Dictionary containing:
        - LLM configuration (provider, model, temperature, etc.)
        - Agent configuration (max turns for main/sub agents)
        - API key availability flags (boolean, not actual keys)
        - Service base URLs
    """
    return {
        # LLM Configuration
        "llm_provider": cfg.llm.provider,
        "llm_base_url": cfg.llm.base_url,
        "llm_model_name": cfg.llm.model_name,
        "llm_temperature": cfg.llm.temperature,
        "llm_top_p": cfg.llm.top_p,
        "llm_min_p": cfg.llm.min_p,
        "llm_top_k": cfg.llm.top_k,
        "llm_max_tokens": cfg.llm.max_tokens,
        "llm_repetition_penalty": cfg.llm.repetition_penalty,
        "llm_async_client": cfg.llm.async_client,
        "keep_tool_result": cfg.agent.keep_tool_result,
        # Agent Configuration
        "main_agent_max_turns": cfg.agent.main_agent.max_turns,
        # One "sub_<name>_max_turns" entry per configured sub-agent; nothing
        # is added when the sub-agent section is None.
        **(
            {
                f"sub_{sub_agent}_max_turns": cfg.agent.sub_agents[sub_agent].max_turns
                for sub_agent in cfg.agent.sub_agents
            }
            if cfg.agent.sub_agents is not None
            else {}
        ),
        # API Keys (masked for security): only whether each key is set
        "has_serper_api_key": bool(SERPER_API_KEY),
        "has_jina_api_key": bool(JINA_API_KEY),
        "has_anthropic_api_key": bool(ANTHROPIC_API_KEY),
        "has_openai_api_key": bool(OPENAI_API_KEY),
        "has_e2b_api_key": bool(E2B_API_KEY),
        "has_tencent_secret_id": bool(TENCENTCLOUD_SECRET_ID),
        "has_tencent_secret_key": bool(TENCENTCLOUD_SECRET_KEY),
        "has_summary_llm_api_key": bool(SUMMARY_LLM_API_KEY),
        # Base URLs
        "openai_base_url": OPENAI_BASE_URL,
        "anthropic_base_url": ANTHROPIC_BASE_URL,
        "jina_base_url": JINA_BASE_URL,
        "serper_base_url": SERPER_BASE_URL,
        "whisper_base_url": WHISPER_BASE_URL,
        "vision_base_url": VISION_BASE_URL,
        "reasoning_base_url": REASONING_BASE_URL,
        "summary_llm_base_url": SUMMARY_LLM_BASE_URL,
    }
class AnswerGenerator:
    """
    Generator for final answers with context management support.

    Handles the generation of final answers, failure summaries for retry,
    and various fallback strategies based on context management settings.
    """

    def __init__(
        self,
        llm_client: BaseClient,
        output_formatter: OutputFormatter,
        task_log: TaskLog,
        stream_handler: StreamHandler,
        cfg: DictConfig,
        intermediate_boxed_answers: List[str],
    ):
        """
        Initialize the answer generator.

        Args:
            llm_client: The LLM client for API calls
            output_formatter: Formatter for output processing
            task_log: Logger for task execution
            stream_handler: Handler for streaming events
            cfg: Configuration object
            intermediate_boxed_answers: List to track intermediate answers.
                The list object is stored as-is (not copied).
        """
        self.llm_client = llm_client
        self.output_formatter = output_formatter
        self.task_log = task_log
        self.stream = stream_handler
        self.cfg = cfg
        self.intermediate_boxed_answers = intermediate_boxed_answers

        # Context management settings
        self.context_compress_limit = cfg.agent.get("context_compress_limit", 0)
        # Several final-answer attempts are allowed only when
        # keep_tool_result is -1; any other value gets a single attempt.
        self.max_final_answer_retries = (
            DEFAULT_MAX_FINAL_ANSWER_RETRIES if cfg.agent.keep_tool_result == -1 else 1
        )
        self.retry_with_summary = cfg.agent.get("retry_with_summary", True)

    async def handle_llm_call(
        self,
        system_prompt: str,
        message_history: List[Dict[str, Any]],
        tool_definitions: List[Dict],
        step_id: int,
        purpose: str = "",
        agent_type: str = "main",
    ) -> Tuple[Optional[str], bool, Optional[Any], List[Dict[str, Any]]]:
        """
        Unified LLM call and logging processing.

        Args:
            system_prompt: System prompt for the LLM
            message_history: Conversation history
            tool_definitions: Available tool definitions
            step_id: Current step ID for logging
            purpose: Description of the call purpose
            agent_type: Type of agent making the call

        Returns:
            Tuple of (response_text, should_break, tool_calls_info, message_history).
            On any failure (error box, missing response, or exception) the
            tuple is ("", False, None, <history as passed in>) so the caller
            can retry.
        """
        # Keep a handle on the caller's history: failure paths return it
        # rather than whatever create_message handed back.
        original_message_history = message_history

        try:
            response, message_history = await self.llm_client.create_message(
                system_prompt=system_prompt,
                message_history=message_history,
                tool_definitions=tool_definitions,
                keep_tool_result=self.cfg.agent.keep_tool_result,
                step_id=step_id,
                task_log=self.task_log,
                agent_type=agent_type,
            )

            # An error box is surfaced to the stream and then treated as
            # "no response".
            if ErrorBox.is_error_box(response):
                await self.stream.show_error(str(response))
                response = None

            # A response box may carry a warning; show it, then unwrap.
            if ResponseBox.is_response_box(response):
                if response.has_extra_info():
                    extra_info = response.get_extra_info()
                    if extra_info.get("warning_msg"):
                        await self.stream.show_error(
                            extra_info.get("warning_msg", "Empty warning message")
                        )
                response = response.get_response()

            # Check if response is None (indicating an error occurred)
            if response is None:
                self.task_log.log_step(
                    "error",
                    f"{purpose} | LLM Call Failed",
                    f"{purpose} failed - no response received",
                )
                return "", False, None, original_message_history

            # Use client's response processing method
            assistant_response_text, should_break, message_history = (
                self.llm_client.process_llm_response(
                    response, message_history, agent_type
                )
            )

            # Use client's tool call information extraction method
            tool_calls_info = self.llm_client.extract_tool_calls_info(
                response, assistant_response_text
            )

            self.task_log.log_step(
                "info",
                f"{purpose} | LLM Call",
                "completed successfully",
            )
            return (
                assistant_response_text,
                should_break,
                tool_calls_info,
                message_history,
            )

        except Exception as e:
            # Deliberately broad: any failure here is logged and reported as
            # an empty response so the caller's retry logic can take over.
            self.task_log.log_step(
                "error",
                f"{purpose} | LLM Call ERROR",
                f"{purpose} error: {str(e)}",
            )
            # Return empty response with should_break=False, need to retry
            return "", False, None, original_message_history

    async def generate_failure_summary(
        self,
        system_prompt: str,
        message_history: List[Dict[str, Any]],
        tool_definitions: List[Dict],
        turn_count: int,
    ) -> Optional[str]:
        """
        Generate a failure experience summary for context compression.

        This is the core of the context management mechanism. When a task attempt fails
        (i.e., the task is not completed within the given turns and context window),
        we compress the entire conversation history into a structured summary containing:
        - Failure type: incomplete / blocked / misdirected / format_missed
        - What happened: the approach taken and why a final answer was not reached
        - Useful findings: facts, intermediate results, or conclusions to be reused

        Args:
            system_prompt: The system prompt used in the conversation
            message_history: The full conversation history to be compressed.
                A shallow copy is edited; the caller's list is not modified.
            tool_definitions: Available tool definitions
            turn_count: Current turn count for step ID

        Returns:
            The compressed failure experience summary, or None if generation failed
        """
        self.task_log.log_step(
            "info",
            "Main Agent | Failure Summary",
            "Generating failure experience summary for potential retry...",
        )

        # Build failure summary history; a trailing user message is dropped
        # so the summary prompt becomes the last user turn.
        failure_summary_history = message_history.copy()
        if failure_summary_history and failure_summary_history[-1]["role"] == "user":
            failure_summary_history.pop()

        # Add failure summary prompt and assistant prefix for structured output
        failure_summary_history.append(
            {"role": "user", "content": FAILURE_SUMMARY_PROMPT}
        )
        failure_summary_history.append(
            {"role": "assistant", "content": FAILURE_SUMMARY_ASSISTANT_PREFIX}
        )

        # Call LLM to generate failure summary
        (
            failure_summary_text,
            _,
            _,
            _,
        ) = await self.handle_llm_call(
            system_prompt,
            failure_summary_history,
            tool_definitions,
            turn_count + 10,  # Use a different step id
            "Main Agent | Failure Experience Summary",
            agent_type="main",
        )

        # Prepend the assistant prefix to the response for complete output
        if failure_summary_text:
            failure_summary_text = (
                FAILURE_SUMMARY_ASSISTANT_PREFIX + failure_summary_text
            )
            failure_experience_summary = extract_failure_experience_summary(
                failure_summary_text
            )
            # Truncate for logging, but only add "..." if actually truncated
            log_preview = failure_experience_summary[:500]
            if len(failure_experience_summary) > 500:
                log_preview += "..."
            self.task_log.log_step(
                "info",
                "Main Agent | Failure Summary",
                f"Generated failure experience summary:\n{log_preview}",
            )
            return failure_experience_summary
        else:
            self.task_log.log_step(
                "warning",
                "Main Agent | Failure Summary",
                "Failed to generate failure experience summary",
            )
            return None

    async def generate_final_answer_with_retries(
        self,
        system_prompt: str,
        message_history: List[Dict[str, Any]],
        tool_definitions: List[Dict],
        turn_count: int,
        task_description: str,
    ) -> Tuple[Optional[str], str, Optional[str], str, List[Dict[str, Any]]]:
        """
        Generate final answer with retry mechanism.

        The summarize prompt is appended to ``message_history`` in place
        (replacing a trailing user message if there is one). Up to
        ``self.max_final_answer_retries`` attempts are made; the loop stops at
        the first attempt whose formatted boxed answer differs from
        ``FORMAT_ERROR_MESSAGE``.

        Args:
            system_prompt: System prompt for the LLM
            message_history: Conversation history (mutated in place)
            tool_definitions: Available tool definitions
            turn_count: Current turn count
            task_description: Original task description

        Returns:
            Tuple of (final_answer_text, final_summary, final_boxed_answer, usage_log, message_history).
            ``final_boxed_answer`` is never None; it is ``FORMAT_ERROR_MESSAGE``
            when no attempt produced a boxed answer.
        """
        # Generate summary prompt
        summary_prompt = generate_agent_summarize_prompt(
            task_description,
            agent_type="main",
        )

        # Guard against an empty history before peeking at the last message.
        if message_history and message_history[-1]["role"] == "user":
            message_history.pop(-1)
        message_history.append({"role": "user", "content": summary_prompt})

        final_answer_text = None
        final_boxed_answer = None
        final_summary = ""
        usage_log = ""

        for retry_idx in range(self.max_final_answer_retries):
            (
                final_answer_text,
                should_break,
                tool_calls_info,
                message_history,
            ) = await self.handle_llm_call(
                system_prompt,
                message_history,
                tool_definitions,
                turn_count + 1 + retry_idx,
                f"Main agent | Final Summary (attempt {retry_idx + 1}/{self.max_final_answer_retries})",
                agent_type="main",
            )

            if final_answer_text:
                final_summary, final_boxed_answer, usage_log = (
                    self.output_formatter.format_final_summary_and_log(
                        final_answer_text, self.llm_client
                    )
                )

                if final_boxed_answer != FORMAT_ERROR_MESSAGE:
                    self.task_log.log_step(
                        "info",
                        "Main Agent | Final Answer",
                        f"Boxed answer found on attempt {retry_idx + 1}",
                    )
                    break

                self.task_log.log_step(
                    "warning",
                    "Main Agent | Final Answer",
                    f"No boxed answer on attempt {retry_idx + 1}, retrying...",
                )
            else:
                self.task_log.log_step(
                    "warning",
                    "Main Agent | Final Answer",
                    f"Failed to generate answer on attempt {retry_idx + 1}",
                )

            # Before another attempt, drop the unusable assistant turn so the
            # retry starts from the summarize prompt again. The last attempt
            # keeps its assistant message in the returned history.
            if (
                retry_idx < self.max_final_answer_retries - 1
                and message_history
                and message_history[-1]["role"] == "assistant"
            ):
                message_history.pop()

        # Ensure final_boxed_answer is never None
        if final_boxed_answer is None:
            final_boxed_answer = FORMAT_ERROR_MESSAGE

        return (
            final_answer_text,
            final_summary,
            final_boxed_answer,
            usage_log,
            message_history,
        )

    def _validate_final_answer(
        self,
        final_answer_text: Optional[str],
        final_summary: str,
        final_boxed_answer: Optional[str],
    ) -> Tuple[str, str, Optional[str]]:
        """
        Normalize a missing final answer and log the outcome.

        Shared first step of both fallback handlers: an empty/None answer text
        is replaced by a placeholder (also used as the summary) and the boxed
        answer is set to ``FORMAT_ERROR_MESSAGE``; otherwise the answer text is
        logged and all three values pass through unchanged.
        """
        if not final_answer_text:
            final_answer_text = "No final answer generated."
            final_summary = final_answer_text
            final_boxed_answer = FORMAT_ERROR_MESSAGE
            self.task_log.log_step(
                "error",
                "Main Agent | Final Answer",
                "Unable to generate final answer after all retries",
            )
        else:
            self.task_log.log_step(
                "info",
                "Main Agent | Final Answer",
                f"Final answer content:\n\n{final_answer_text}",
            )
        return final_answer_text, final_summary, final_boxed_answer

    def handle_no_context_management_fallback(
        self,
        final_answer_text: Optional[str],
        final_summary: str,
        final_boxed_answer: Optional[str],
    ) -> Tuple[str, str, str]:
        """
        Handle fallback when context_compress_limit == 0 (no context management).

        In this mode, the model has only one chance to answer.
        We should try to use intermediate answers as fallback to maximize accuracy.

        Args:
            final_answer_text: The generated final answer text
            final_summary: The final summary
            final_boxed_answer: The extracted boxed answer

        Returns:
            Tuple of (final_answer_text, final_summary, final_boxed_answer)
        """
        # Validate final_answer_text
        final_answer_text, final_summary, final_boxed_answer = (
            self._validate_final_answer(
                final_answer_text, final_summary, final_boxed_answer
            )
        )

        # Fallback to intermediate answer if no valid boxed answer; the most
        # recently recorded intermediate answer wins.
        if (
            final_boxed_answer == FORMAT_ERROR_MESSAGE or final_boxed_answer is None
        ) and self.intermediate_boxed_answers:
            final_boxed_answer = self.intermediate_boxed_answers[-1]
            self.task_log.log_step(
                "info",
                "Main Agent | Final Answer (No Context Management)",
                f"Using intermediate boxed answer as fallback: {final_boxed_answer}",
            )

        # Ensure final_boxed_answer is never None
        if final_boxed_answer is None:
            final_boxed_answer = FORMAT_ERROR_MESSAGE

        return final_answer_text, final_summary, final_boxed_answer

    def handle_context_management_no_fallback(
        self,
        final_answer_text: Optional[str],
        final_summary: str,
        final_boxed_answer: Optional[str],
    ) -> Tuple[str, str, str]:
        """
        Handle failure when context_compress_limit > 0 (context management enabled).

        In this mode, the model has multiple chances to retry with context management.
        We should NOT guess or use intermediate answers, because:
        - A wrong guess can reduce accuracy
        - The model will have another chance to answer with failure experience

        Args:
            final_answer_text: The generated final answer text
            final_summary: The final summary
            final_boxed_answer: The extracted boxed answer

        Returns:
            Tuple of (final_answer_text, final_summary, final_boxed_answer)
        """
        # Validate final_answer_text
        final_answer_text, final_summary, final_boxed_answer = (
            self._validate_final_answer(
                final_answer_text, final_summary, final_boxed_answer
            )
        )

        # Ensure final_boxed_answer is never None
        if final_boxed_answer is None:
            final_boxed_answer = FORMAT_ERROR_MESSAGE

        # With context management, do NOT fallback to intermediate answers
        if final_boxed_answer == FORMAT_ERROR_MESSAGE:
            self.task_log.log_step(
                "info",
                "Main Agent | Final Answer (Context Management Mode)",
                "No valid boxed answer found. Not using intermediate fallback - will generate failure summary for retry.",
            )

        return final_answer_text, final_summary, final_boxed_answer

    async def generate_and_finalize_answer(
        self,
        system_prompt: str,
        message_history: List[Dict[str, Any]],
        tool_definitions: List[Dict],
        turn_count: int,
        task_description: str,
        reached_max_turns: bool = False,
        is_final_retry: bool = False,
        save_callback=None,
    ) -> Tuple[str, str, Optional[str], str, List[Dict[str, Any]]]:
        """
        Generate final answer and handle fallback based on context management settings.

        Context Management (context_compress_limit > 0) is essentially a context compression
        mechanism that enables multi-attempt problem solving.

        Decision table based on (context_management, reached_max_turns):

        | Context Management | Reached Max Turns | Behavior                                    |
        |--------------------|-------------------|---------------------------------------------|
        | OFF (limit=0)      | No                | Generate answer → fallback to intermediate  |
        | OFF (limit=0)      | Yes               | Generate answer → fallback to intermediate  |
        | ON (limit>0)       | No                | Generate answer → no fallback, fail summary |
        | ON (limit>0)       | Yes               | SKIP generation → fail summary directly     |

        When ``is_final_retry`` is True the "OFF" rows apply regardless of the
        context management setting: an answer is always generated and the
        intermediate fallback is used, and no failure summary is produced.

        Args:
            system_prompt: System prompt for the LLM
            message_history: Conversation history
            tool_definitions: Available tool definitions
            turn_count: Current turn count
            task_description: Original task description
            reached_max_turns: Whether the main loop ended due to reaching max turns
            is_final_retry: Whether this is the last attempt (no further retry
                will follow)
            save_callback: Optional callback to save message history; called
                as ``save_callback(system_prompt, message_history)``

        Returns:
            Tuple of (final_summary, final_boxed_answer, failure_experience_summary, usage_log, message_history)
        """
        context_management_enabled = self.context_compress_limit > 0
        failure_experience_summary = None
        usage_log = ""

        # CASE: Context management ON + reached max turns + NOT final retry
        # Skip answer generation entirely - any answer would be a blind guess
        # But if this is the final retry, we still try to generate an answer (last chance)
        if context_management_enabled and reached_max_turns and not is_final_retry:
            self.task_log.log_step(
                "info",
                "Main Agent | Final Answer (Context Management Mode)",
                "Reached max turns. Skipping answer generation to avoid blind guessing.",
            )
            if save_callback:
                save_callback(system_prompt, message_history)

            if self.retry_with_summary:
                failure_experience_summary = await self.generate_failure_summary(
                    system_prompt, message_history, tool_definitions, turn_count
                )
            return (
                "Task incomplete - reached maximum turns. Will retry with failure experience.",
                FORMAT_ERROR_MESSAGE,
                failure_experience_summary,
                usage_log,
                message_history,
            )

        # ALL OTHER CASES: Generate final answer first
        # (including final retry with reached_max_turns - last chance to get an answer)
        (
            final_answer_text,
            final_summary,
            final_boxed_answer,
            usage_log,
            message_history,
        ) = await self.generate_final_answer_with_retries(
            system_prompt=system_prompt,
            message_history=message_history,
            tool_definitions=tool_definitions,
            turn_count=turn_count,
            task_description=task_description,
        )

        if save_callback:
            save_callback(system_prompt, message_history)

        # CASE: Context management OFF or final retry
        # Try to use intermediate answers as fallback to maximize accuracy
        # For final retry, there's no more retry opportunity, so we use fallback
        if not context_management_enabled or is_final_retry:
            final_answer_text, final_summary, final_boxed_answer = (
                self.handle_no_context_management_fallback(
                    final_answer_text, final_summary, final_boxed_answer
                )
            )
            if is_final_retry:
                self.task_log.log_step(
                    "info",
                    "Main Agent | Final Answer (Final Retry)",
                    "This is the final retry. Using intermediate fallback if available.",
                )
            return (
                final_summary,
                final_boxed_answer,
                None,
                usage_log,
                message_history,
            )

        # CASE: Context management ON + normal completion (not reached max turns, not final retry)
        # Don't use fallback - wrong guess would reduce accuracy
        final_answer_text, final_summary, final_boxed_answer = (
            self.handle_context_management_no_fallback(
                final_answer_text, final_summary, final_boxed_answer
            )
        )

        if final_boxed_answer == FORMAT_ERROR_MESSAGE and self.retry_with_summary:
            failure_experience_summary = await self.generate_failure_summary(
                system_prompt, message_history, tool_definitions, turn_count
            )

        return (
            final_summary,
            final_boxed_answer,
            failure_experience_summary,
            usage_log,
            message_history,
        )
def _list_tools(sub_agent_tool_managers: Dict[str, ToolManager]):
    """
    Build a memoizing async getter for sub-agent tool definitions.

    The returned coroutine function asks every tool manager for its tool
    definitions the first time it is awaited and keeps the resulting
    dictionary; later calls hand back that same dictionary without
    contacting the managers again.

    Args:
        sub_agent_tool_managers: Dictionary mapping sub-agent names to their
            ToolManager instances.

    Returns:
        An async function that returns a dictionary mapping each sub-agent
        name to its tool definitions.
    """
    cached_definitions = None

    async def wrapped():
        nonlocal cached_definitions

        # Fast path: definitions were already collected by an earlier call.
        if cached_definitions is not None:
            return cached_definitions

        definitions = {}
        for agent_name, manager in sub_agent_tool_managers.items():
            definitions[agent_name] = await manager.get_all_tool_definitions()

        # Stored only after every manager answered, so a failure part-way
        # through leaves the cache empty and the next call starts over.
        cached_definitions = definitions
        return cached_definitions

    return wrapped
def _save_message_history(
    self, system_prompt: str, message_history: List[Dict[str, Any]]
):
    """
    Save message history to task log.

    Stores the system prompt and the history on the task log as the main
    agent's message history, then calls the task log's ``save()``.
    """
    self.task_log.main_agent_message_history = {
        "system_prompt": system_prompt,
        "message_history": message_history,
    }
    self.task_log.save()


async def _handle_response_format_issues(
    self,
    assistant_response_text: str,
    message_history: List[Dict[str, Any]],
    turn_count: int,
    consecutive_rollbacks: int,
    total_attempts: int,
    max_attempts: int,
    agent_name: str,
) -> tuple:
    """
    Handle MCP tag format errors and refusal keywords.

    Both problems are handled the same way: while fewer than
    ``MAX_CONSECUTIVE_ROLLBACKS - 1`` rollbacks have happened in a row, the
    turn is rolled back (turn count decremented, rollback counter
    incremented, trailing assistant message removed) and the caller is told
    to continue. Otherwise the caller is told to end the loop. MCP tags are
    checked before refusal keywords.

    Args:
        assistant_response_text: The LLM response text
        message_history: Current message history (mutated in place on rollback)
        turn_count: Current turn count
        consecutive_rollbacks: Current consecutive rollback count
        total_attempts: Total attempts made
        max_attempts: Maximum allowed attempts
        agent_name: Name of the agent for logging

    Returns:
        Tuple of (should_continue, should_break, turn_count, consecutive_rollbacks, message_history)
    """
    # Check for MCP tags in response (format error)
    if any(mcp_tag in assistant_response_text for mcp_tag in mcp_tags):
        if consecutive_rollbacks < self.MAX_CONSECUTIVE_ROLLBACKS - 1:
            turn_count -= 1
            consecutive_rollbacks += 1
            # Guard against an empty history before peeking at the last entry.
            if message_history and message_history[-1]["role"] == "assistant":
                message_history.pop()
            self.task_log.log_step(
                "warning",
                f"{agent_name} | Turn: {turn_count} | Rollback",
                f"Tool call format incorrect - found MCP tags in response. "
                f"Consecutive rollbacks: {consecutive_rollbacks}/{self.MAX_CONSECUTIVE_ROLLBACKS}, "
                f"Total attempts: {total_attempts}/{max_attempts}",
            )
            return True, False, turn_count, consecutive_rollbacks, message_history
        else:
            self.task_log.log_step(
                "warning",
                f"{agent_name} | Turn: {turn_count} | End After Max Rollbacks",
                f"Ending agent loop after {consecutive_rollbacks} consecutive MCP format errors",
            )
            return False, True, turn_count, consecutive_rollbacks, message_history

    # Check for refusal keywords. One pass collects the matches; a non-empty
    # list means at least one keyword occurred in the response.
    matched_keywords = [
        kw for kw in refusal_keywords if kw in assistant_response_text
    ]
    if matched_keywords:
        if consecutive_rollbacks < self.MAX_CONSECUTIVE_ROLLBACKS - 1:
            turn_count -= 1
            consecutive_rollbacks += 1
            if message_history and message_history[-1]["role"] == "assistant":
                message_history.pop()
            self.task_log.log_step(
                "warning",
                f"{agent_name} | Turn: {turn_count} | Rollback",
                f"LLM refused to answer - found refusal keywords: {matched_keywords}. "
                f"Consecutive rollbacks: {consecutive_rollbacks}/{self.MAX_CONSECUTIVE_ROLLBACKS}, "
                f"Total attempts: {total_attempts}/{max_attempts}",
            )
            return True, False, turn_count, consecutive_rollbacks, message_history
        else:
            self.task_log.log_step(
                "warning",
                f"{agent_name} | Turn: {turn_count} | End After Max Rollbacks",
                f"Ending agent loop after {consecutive_rollbacks} consecutive refusals with keywords: {matched_keywords}",
            )
            return False, True, turn_count, consecutive_rollbacks, message_history

    # No format issues - normal end without tool calls
    return False, True, turn_count, consecutive_rollbacks, message_history


async def _check_duplicate_query(
    self,
    tool_name: str,
    arguments: dict,
    cache_name: str,
    consecutive_rollbacks: int,
    turn_count: int,
    total_attempts: int,
    max_attempts: int,
    message_history: List[Dict[str, Any]],
    agent_name: str,
) -> tuple:
    """
    Check for duplicate queries and handle rollback if needed.

    A query counts as a duplicate when its query string (as derived by the
    tool executor) was already recorded in the named cache. Duplicates
    trigger a rollback until ``MAX_CONSECUTIVE_ROLLBACKS - 1`` consecutive
    rollbacks have been reached; after that the duplicate is allowed through.

    Args:
        tool_name: Name of the tool being called
        arguments: Tool arguments
        cache_name: Name of the query cache to use
        consecutive_rollbacks: Current consecutive rollback count
        turn_count: Current turn count
        total_attempts: Total attempts made
        max_attempts: Maximum allowed attempts
        message_history: Current message history (last entry removed on rollback)
        agent_name: Name of the agent for logging

    Returns:
        Tuple of (is_duplicate, should_rollback, turn_count, consecutive_rollbacks, message_history)
    """
    query_str = self.tool_executor.get_query_str_from_tool_call(
        tool_name, arguments
    )
    # No query string means there is nothing to de-duplicate.
    if not query_str:
        return False, False, turn_count, consecutive_rollbacks, message_history

    self.used_queries.setdefault(cache_name, defaultdict(int))
    count = self.used_queries[cache_name][query_str]

    if count > 0:
        if consecutive_rollbacks < self.MAX_CONSECUTIVE_ROLLBACKS - 1:
            # NOTE(review): the last message is removed without checking its
            # role - presumably it is the assistant turn that issued this
            # tool call; confirm against the caller.
            message_history.pop()
            turn_count -= 1
            consecutive_rollbacks += 1
            self.task_log.log_step(
                "warning",
                f"{agent_name} | Turn: {turn_count} | Rollback",
                f"Duplicate query detected - tool: {tool_name}, query: '{query_str}', "
                f"previous count: {count}. Consecutive rollbacks: {consecutive_rollbacks}/"
                f"{self.MAX_CONSECUTIVE_ROLLBACKS}, Total attempts: {total_attempts}/{max_attempts}",
            )
            return True, True, turn_count, consecutive_rollbacks, message_history
        else:
            self.task_log.log_step(
                "warning",
                f"{agent_name} | Turn: {turn_count} | Allow Duplicate",
                f"Allowing duplicate query after {consecutive_rollbacks} rollbacks - "
                f"tool: {tool_name}, query: '{query_str}', previous count: {count}",
            )

    return False, False, turn_count, consecutive_rollbacks, message_history


async def _record_query(self, cache_name: str, tool_name: str, arguments: dict):
    """
    Record a successful query execution.

    Increments the counter for the call's query string in the named cache;
    calls for which the tool executor yields no query string are ignored.
    """
    query_str = self.tool_executor.get_query_str_from_tool_call(
        tool_name, arguments
    )
    if query_str:
        self.used_queries.setdefault(cache_name, defaultdict(int))
        self.used_queries[cache_name][query_str] += 1
async def run_sub_agent(
    self,
    sub_agent_name: str,
    task_description: str,
):
    """
    Run a sub-agent to handle a subtask.

    The sub-agent gets its own message history and system prompt, loops over
    LLM calls and tool executions (with rollback on format errors, refusals,
    duplicate queries and rollback-worthy tool results), and finally asks the
    LLM for a summary that is returned to the caller.

    Args:
        sub_agent_name: Name of the sub-agent to run
        task_description: Description of the subtask

    Returns:
        The final answer text from the sub-agent, with any thinking section
        removed. A fixed "No final answer generated ..." message is returned
        when the summary call yields nothing.
    """
    task_description += "\n\nPlease provide the answer and detailed supporting information of the subtask given to you."
    self.task_log.log_step(
        "info",
        f"{sub_agent_name} | Task Description",
        f"Subtask: {task_description}",
    )

    # Stream sub-agent start
    display_name = sub_agent_name.replace("agent-", "")
    sub_agent_id = await self.stream.start_agent(display_name)
    await self.stream.start_llm(display_name)

    # Start new sub-agent session
    self.task_log.start_sub_agent_session(sub_agent_name, task_description)

    # Initialize message history
    message_history = [{"role": "user", "content": task_description}]

    # Get sub-agent tool definitions
    if not self.sub_agent_tool_definitions:
        tool_definitions = await self._list_sub_agent_tools()
        tool_definitions = tool_definitions.get(sub_agent_name, {})
    else:
        tool_definitions = self.sub_agent_tool_definitions[sub_agent_name]

    if not tool_definitions:
        self.task_log.log_step(
            "warning",
            f"{sub_agent_name} | No Tools",
            "No tool definitions available.",
        )

    # Generate sub-agent system prompt
    system_prompt = self.llm_client.generate_agent_system_prompt(
        date=date.today(),
        mcp_servers=tool_definitions,
    ) + generate_agent_specific_system_prompt(agent_type=sub_agent_name)

    # Limit sub-agent turns
    if self.cfg.agent.sub_agents:
        max_turns = self.cfg.agent.sub_agents[sub_agent_name].max_turns
    else:
        max_turns = 0
    turn_count = 0
    total_attempts = 0
    # total_attempts is never decremented, so it bounds the loop even when
    # rollbacks keep handing turns back.
    max_attempts = max_turns + EXTRA_ATTEMPTS_BUFFER
    consecutive_rollbacks = 0

    while turn_count < max_turns and total_attempts < max_attempts:
        turn_count += 1
        total_attempts += 1
        if consecutive_rollbacks >= self.MAX_CONSECUTIVE_ROLLBACKS:
            self.task_log.log_step(
                "error",
                f"{sub_agent_name} | Too Many Rollbacks",
                f"Reached {consecutive_rollbacks} consecutive rollbacks, breaking loop.",
            )
            break

        self.task_log.save()

        # Reset 'last_call_tokens'
        self.llm_client.last_call_tokens = {
            "prompt_tokens": 0,
            "completion_tokens": 0,
        }

        # LLM call using answer generator
        (
            assistant_response_text,
            should_break,
            tool_calls,
            message_history,
        ) = await self.answer_generator.handle_llm_call(
            system_prompt,
            message_history,
            tool_definitions,
            turn_count,
            f"{sub_agent_name} | Turn: {turn_count}",
            agent_type=sub_agent_name,
        )

        if should_break:
            self.task_log.log_step(
                "info",
                f"{sub_agent_name} | Turn: {turn_count} | LLM Call",
                "should break is True, breaking the loop",
            )
            break

        if assistant_response_text:
            text_response = extract_llm_response_text(assistant_response_text)
            if text_response:
                await self.stream.tool_call("show_text", {"text": text_response})
        else:
            # Empty response: wait briefly and try again on the next iteration.
            self.task_log.log_step(
                "info",
                f"{sub_agent_name} | Turn: {turn_count} | LLM Call",
                "LLM call failed",
            )
            await asyncio.sleep(5)
            continue

        # Handle no tool calls case
        if not tool_calls:
            (
                should_continue,
                should_break_loop,
                turn_count,
                consecutive_rollbacks,
                message_history,
            ) = await self._handle_response_format_issues(
                assistant_response_text,
                message_history,
                turn_count,
                consecutive_rollbacks,
                total_attempts,
                max_attempts,
                sub_agent_name,
            )
            if should_continue:
                continue
            if should_break_loop:
                # Log here only for a clean stop (no MCP tags, no refusal keywords).
                if not any(
                    mcp_tag in assistant_response_text for mcp_tag in mcp_tags
                ) and not any(
                    keyword in assistant_response_text
                    for keyword in refusal_keywords
                ):
                    self.task_log.log_step(
                        "info",
                        f"{sub_agent_name} | Turn: {turn_count} | LLM Call",
                        f"No tool calls found in {sub_agent_name}, ending on turn {turn_count}",
                    )
                break

        # Execute tool calls
        tool_calls_data = []
        all_tool_results_content_with_id = []
        should_rollback_turn = False

        for call in tool_calls:
            server_name = call["server_name"]
            tool_name = call["tool_name"]
            arguments = call["arguments"]
            call_id = call["id"]

            # Fix common parameter name mistakes
            arguments = self.tool_executor.fix_tool_call_arguments(
                tool_name, arguments
            )

            self.task_log.log_step(
                "info",
                f"{sub_agent_name} | Turn: {turn_count} | Tool Call",
                f"Executing {tool_name} on {server_name}",
            )

            call_start_time = time.time()
            try:
                # Check for duplicate query
                cache_name = sub_agent_id + "_" + tool_name
                (
                    is_duplicate,
                    should_rollback,
                    turn_count,
                    consecutive_rollbacks,
                    message_history,
                ) = await self._check_duplicate_query(
                    tool_name,
                    arguments,
                    cache_name,
                    consecutive_rollbacks,
                    turn_count,
                    total_attempts,
                    max_attempts,
                    message_history,
                    sub_agent_name,
                )

                if should_rollback:
                    should_rollback_turn = True
                    break

                # Send stream event
                tool_call_id = await self.stream.tool_call(tool_name, arguments)

                # Execute tool call
                tool_result = await self.sub_agent_tool_managers[
                    sub_agent_name
                ].execute_tool_call(server_name, tool_name, arguments)

                # Update query count if successful
                if "error" not in tool_result:
                    await self._record_query(cache_name, tool_name, arguments)

                # Post-process result
                tool_result = self.tool_executor.post_process_tool_call_result(
                    tool_name, tool_result
                )
                result = (
                    tool_result.get("result")
                    if tool_result.get("result")
                    else tool_result.get("error")
                )

                # Check for errors that should trigger rollback
                if self.tool_executor.should_rollback_result(
                    tool_name, result, tool_result
                ):
                    if consecutive_rollbacks < self.MAX_CONSECUTIVE_ROLLBACKS - 1:
                        message_history.pop()
                        turn_count -= 1
                        consecutive_rollbacks += 1
                        should_rollback_turn = True
                        self.task_log.log_step(
                            "warning",
                            f"{sub_agent_name} | Turn: {turn_count} | Rollback",
                            f"Tool result error - tool: {tool_name}, result: '{str(result)[:200]}'",
                        )
                        break

                await self.stream.tool_call(
                    tool_name, {"result": result}, tool_call_id=tool_call_id
                )
                call_end_time = time.time()
                call_duration_ms = int((call_end_time - call_start_time) * 1000)

                self.task_log.log_step(
                    "info",
                    f"{sub_agent_name} | Turn: {turn_count} | Tool Call",
                    f"Tool {tool_name} completed in {call_duration_ms}ms",
                )

                tool_calls_data.append(
                    {
                        "server_name": server_name,
                        "tool_name": tool_name,
                        "arguments": arguments,
                        "result": tool_result,
                        "duration_ms": call_duration_ms,
                        "call_time": get_utc_plus_8_time(),
                    }
                )

            except Exception as e:
                # A failing tool must not abort the turn: report the error to
                # the LLM as the tool result instead.
                call_end_time = time.time()
                call_duration_ms = int((call_end_time - call_start_time) * 1000)

                tool_calls_data.append(
                    {
                        "server_name": server_name,
                        "tool_name": tool_name,
                        "arguments": arguments,
                        "error": str(e),
                        "duration_ms": call_duration_ms,
                        "call_time": get_utc_plus_8_time(),
                    }
                )
                tool_result = {
                    "error": f"Tool call failed: {str(e)}",
                    "server_name": server_name,
                    "tool_name": tool_name,
                }
                self.task_log.log_step(
                    "error",
                    f"{sub_agent_name} | Turn: {turn_count} | Tool Call",
                    f"Tool {tool_name} failed to execute: {str(e)}",
                )

            tool_result_for_llm = self.output_formatter.format_tool_result_for_user(
                tool_result
            )
            all_tool_results_content_with_id.append((call_id, tool_result_for_llm))

        if should_rollback_turn:
            continue

        # Reset consecutive rollbacks on successful execution
        if consecutive_rollbacks > 0:
            self.task_log.log_step(
                "info",
                f"{sub_agent_name} | Turn: {turn_count} | Recovery",
                f"Successfully recovered after {consecutive_rollbacks} consecutive rollbacks",
            )
            consecutive_rollbacks = 0

        # Update message history
        message_history = self.llm_client.update_message_history(
            message_history, all_tool_results_content_with_id
        )

        # Check context length
        temp_summary_prompt = generate_agent_summarize_prompt(
            task_description,
            agent_type=sub_agent_name,
        )

        pass_length_check, message_history = self.llm_client.ensure_summary_context(
            message_history, temp_summary_prompt
        )

        if not pass_length_check:
            # Force the "max turns" branch below so the summary is triggered.
            turn_count = max_turns
            self.task_log.log_step(
                "info",
                f"{sub_agent_name} | Turn: {turn_count} | Context Limit Reached",
                "Context limit reached, triggering summary",
            )
            break

    # Log loop end
    if turn_count >= max_turns:
        self.task_log.log_step(
            "info",
            f"{sub_agent_name} | Max Turns Reached / Context Limit Reached",
            f"Reached maximum turns ({max_turns}) or context limit reached",
        )
    else:
        self.task_log.log_step(
            "info",
            f"{sub_agent_name} | Main Loop Completed",
            f"Main loop completed after {turn_count} turns",
        )

    # Generate final summary
    self.task_log.log_step(
        "info",
        f"{sub_agent_name} | Final Summary",
        f"Generating {sub_agent_name} final summary",
    )

    summary_prompt = generate_agent_summarize_prompt(
        task_description,
        agent_type=sub_agent_name,
    )

    # Avoid two consecutive user messages before appending the summary prompt.
    if message_history[-1]["role"] == "user":
        message_history.pop()
    message_history.append({"role": "user", "content": summary_prompt})

    await self.stream.tool_call(
        "Partial Summary", {}, tool_call_id=str(uuid.uuid4())
    )

    # Generate final answer
    (
        final_answer_text,
        should_break,
        tool_calls_info,
        message_history,
    ) = await self.answer_generator.handle_llm_call(
        system_prompt,
        message_history,
        tool_definitions,
        turn_count + 1,
        f"{sub_agent_name} | Final summary",
        agent_type=sub_agent_name,
    )

    if final_answer_text:
        self.task_log.log_step(
            "info",
            f"{sub_agent_name} | Final Answer",
            "Final answer generated successfully",
        )
    else:
        final_answer_text = (
            f"No final answer generated by sub agent {sub_agent_name}."
        )
        self.task_log.log_step(
            "error",
            f"{sub_agent_name} | Final Answer",
            "Unable to generate final answer",
        )

    # Save session history
    self.task_log.sub_agent_message_history_sessions[
        self.task_log.current_sub_agent_session_id
    ] = {"system_prompt": system_prompt, "message_history": message_history}
    self.task_log.save()

    self.task_log.end_sub_agent_session(sub_agent_name)

    # Remove thinking content: keep only the text after the last thinking
    # delimiter. The separator must be non-empty, because str.split("")
    # raises ValueError("empty separator").
    # NOTE(review): the <think>/</think> tag names are assumed from the
    # "thinking content" intent - confirm against the model's output format.
    final_answer_text = final_answer_text.split("<think>")[-1].strip()
    final_answer_text = final_answer_text.split("</think>")[-1].strip()

    # Stream sub-agent end
    await self.stream.end_llm(display_name)
    await self.stream.end_agent(display_name, sub_agent_id)

    return final_answer_text
async def run_main_agent(
    self,
    task_description,
    task_file_name=None,
    task_id="default_task",
    is_final_retry=False,
):
    """
    Execute the main end-to-end task.

    Runs the main agent loop: call the LLM, execute the requested tools
    (delegating "agent-*" servers to sub-agents when sub-agents are
    configured), roll back turns on format errors, refusals, duplicate
    queries and rollback-worthy tool results, then hand over to the answer
    generator for the final summary.

    Args:
        task_description: Description of the task to execute
        task_file_name: Optional file associated with the task
        task_id: Unique identifier for the task
        is_final_retry: Forwarded unchanged to
            answer_generator.generate_and_finalize_answer

    Returns:
        Tuple of (final_summary, final_boxed_answer, failure_experience_summary)
    """
    workflow_id = await self.stream.start_workflow(task_description)

    self.task_log.log_step("info", "Main Agent", f"Start task with id: {task_id}")
    self.task_log.log_step(
        "info", "Main Agent", f"Task description: {task_description}"
    )
    if task_file_name:
        self.task_log.log_step(
            "info", "Main Agent", f"Associated file: {task_file_name}"
        )

    # Process input
    initial_user_content, processed_task_desc = process_input(
        task_description, task_file_name
    )
    message_history = [{"role": "user", "content": initial_user_content}]

    # Record initial user input
    # NOTE(review): user_input is built here but never read again in this method.
    user_input = processed_task_desc
    if task_file_name:
        user_input += f"\n[Attached file: {task_file_name}]"

    # Get tool definitions
    if not self.tool_definitions:
        tool_definitions = (
            await self.main_agent_tool_manager.get_all_tool_definitions()
        )
        if self.cfg.agent.sub_agents is not None:
            tool_definitions += expose_sub_agents_as_tools(
                self.cfg.agent.sub_agents
            )
    else:
        tool_definitions = self.tool_definitions
    if not tool_definitions:
        self.task_log.log_step(
            "warning",
            "Main Agent | Tool Definitions",
            "Warning: No tool definitions found. LLM cannot use any tools.",
        )

    # Generate system prompt
    system_prompt = self.llm_client.generate_agent_system_prompt(
        date=date.today(),
        mcp_servers=tool_definitions,
    ) + generate_agent_specific_system_prompt(agent_type="main")
    system_prompt = system_prompt.strip()

    # Main loop configuration
    max_turns = self.cfg.agent.main_agent.max_turns
    turn_count = 0
    total_attempts = 0
    # total_attempts is never decremented, so it caps the loop even when
    # rollbacks keep handing turns back.
    max_attempts = max_turns + EXTRA_ATTEMPTS_BUFFER
    consecutive_rollbacks = 0

    self.current_agent_id = await self.stream.start_agent("main")
    await self.stream.start_llm("main")

    while turn_count < max_turns and total_attempts < max_attempts:
        turn_count += 1
        total_attempts += 1

        if consecutive_rollbacks >= self.MAX_CONSECUTIVE_ROLLBACKS:
            self.task_log.log_step(
                "error",
                "Main Agent | Too Many Rollbacks",
                f"Reached {consecutive_rollbacks} consecutive rollbacks, breaking loop.",
            )
            break

        self.task_log.save()

        # LLM call
        (
            assistant_response_text,
            should_break,
            tool_calls,
            message_history,
        ) = await self.answer_generator.handle_llm_call(
            system_prompt,
            message_history,
            tool_definitions,
            turn_count,
            f"Main agent | Turn: {turn_count}",
            agent_type="main",
        )

        # Process LLM response
        if assistant_response_text:
            text_response = extract_llm_response_text(assistant_response_text)
            if text_response:
                await self.stream.tool_call("show_text", {"text": text_response})

            # Extract boxed content
            boxed_content = self.output_formatter._extract_boxed_content(
                assistant_response_text
            )
            if boxed_content:
                self.intermediate_boxed_answers.append(boxed_content)

            if should_break:
                self.task_log.log_step(
                    "info",
                    f"Main Agent | Turn: {turn_count} | LLM Call",
                    "should break is True, breaking the loop",
                )
                break
        else:
            # Empty response: give the turn back (total_attempts still counts
            # it), wait briefly and retry.
            turn_count -= 1
            self.task_log.log_step(
                "warning",
                f"Main Agent | Turn: {turn_count} | LLM Call",
                "No valid response from LLM, retrying",
            )
            await asyncio.sleep(5)
            continue

        # Handle no tool calls case
        if not tool_calls:
            (
                should_continue,
                should_break_loop,
                turn_count,
                consecutive_rollbacks,
                message_history,
            ) = await self._handle_response_format_issues(
                assistant_response_text,
                message_history,
                turn_count,
                consecutive_rollbacks,
                total_attempts,
                max_attempts,
                "Main Agent",
            )
            if should_continue:
                continue
            if should_break_loop:
                # Log here only for a clean stop (no MCP tags, no refusal keywords).
                if not any(
                    mcp_tag in assistant_response_text for mcp_tag in mcp_tags
                ) and not any(
                    keyword in assistant_response_text
                    for keyword in refusal_keywords
                ):
                    self.task_log.log_step(
                        "info",
                        f"Main Agent | Turn: {turn_count} | LLM Call",
                        "LLM did not request tool usage, ending process.",
                    )
                break

        # Execute tool calls
        tool_calls_data = []
        all_tool_results_content_with_id = []
        should_rollback_turn = False
        # Remember the main agent's token usage: run_sub_agent resets
        # llm_client.last_call_tokens, and the value is restored after the loop.
        main_agent_last_call_tokens = self.llm_client.last_call_tokens

        for call in tool_calls:
            server_name = call["server_name"]
            tool_name = call["tool_name"]
            arguments = call["arguments"]
            call_id = call["id"]

            # Fix common parameter name mistakes
            arguments = self.tool_executor.fix_tool_call_arguments(
                tool_name, arguments
            )

            call_start_time = time.time()
            try:
                if server_name.startswith("agent-") and self.cfg.agent.sub_agents:
                    # Sub-agent execution
                    cache_name = "main_" + tool_name
                    (
                        is_duplicate,
                        should_rollback,
                        turn_count,
                        consecutive_rollbacks,
                        message_history,
                    ) = await self._check_duplicate_query(
                        tool_name,
                        arguments,
                        cache_name,
                        consecutive_rollbacks,
                        turn_count,
                        total_attempts,
                        max_attempts,
                        message_history,
                        "Main Agent",
                    )

                    if should_rollback:
                        should_rollback_turn = True
                        break

                    # Stream events
                    await self.stream.end_llm("main")
                    await self.stream.end_agent("main", self.current_agent_id)

                    # Execute sub-agent
                    sub_agent_result = await self.run_sub_agent(
                        server_name,
                        arguments["subtask"],
                    )

                    # Update query count
                    await self._record_query(cache_name, tool_name, arguments)

                    tool_result = {
                        "server_name": server_name,
                        "tool_name": tool_name,
                        "result": sub_agent_result,
                    }
                    # Re-open the main agent's stream section after the sub-agent returns.
                    self.current_agent_id = await self.stream.start_agent(
                        "main", display_name="Summarizing"
                    )
                    await self.stream.start_llm("main", display_name="Summarizing")
                else:
                    # Regular tool execution
                    cache_name = "main_" + tool_name
                    (
                        is_duplicate,
                        should_rollback,
                        turn_count,
                        consecutive_rollbacks,
                        message_history,
                    ) = await self._check_duplicate_query(
                        tool_name,
                        arguments,
                        cache_name,
                        consecutive_rollbacks,
                        turn_count,
                        total_attempts,
                        max_attempts,
                        message_history,
                        "Main Agent",
                    )

                    if should_rollback:
                        should_rollback_turn = True
                        break

                    # Send stream event
                    tool_call_id = await self.stream.tool_call(tool_name, arguments)

                    # Execute tool call
                    tool_result = (
                        await self.main_agent_tool_manager.execute_tool_call(
                            server_name=server_name,
                            tool_name=tool_name,
                            arguments=arguments,
                        )
                    )

                    # Update query count if successful
                    if "error" not in tool_result:
                        await self._record_query(cache_name, tool_name, arguments)

                    # Post-process result
                    tool_result = self.tool_executor.post_process_tool_call_result(
                        tool_name, tool_result
                    )
                    result = (
                        tool_result.get("result")
                        if tool_result.get("result")
                        else tool_result.get("error")
                    )

                    # Check for errors that should trigger rollback
                    if self.tool_executor.should_rollback_result(
                        tool_name, result, tool_result
                    ):
                        if (
                            consecutive_rollbacks
                            < self.MAX_CONSECUTIVE_ROLLBACKS - 1
                        ):
                            message_history.pop()
                            turn_count -= 1
                            consecutive_rollbacks += 1
                            should_rollback_turn = True
                            self.task_log.log_step(
                                "warning",
                                f"Main Agent | Turn: {turn_count} | Rollback",
                                f"Tool result error - tool: {tool_name}, result: '{str(result)[:200]}'",
                            )
                            break

                    await self.stream.tool_call(
                        tool_name, {"result": result}, tool_call_id=tool_call_id
                    )

                call_end_time = time.time()
                call_duration_ms = int((call_end_time - call_start_time) * 1000)

                tool_calls_data.append(
                    {
                        "server_name": server_name,
                        "tool_name": tool_name,
                        "arguments": arguments,
                        "result": tool_result,
                        "duration_ms": call_duration_ms,
                        "call_time": get_utc_plus_8_time(),
                    }
                )
                self.task_log.log_step(
                    "info",
                    f"Main Agent | Turn: {turn_count} | Tool Call",
                    f"Tool {tool_name} completed in {call_duration_ms}ms",
                )

            except Exception as e:
                # A failing tool (or sub-agent) does not abort the turn; the
                # error text becomes the tool result shown to the LLM.
                call_end_time = time.time()
                call_duration_ms = int((call_end_time - call_start_time) * 1000)

                tool_calls_data.append(
                    {
                        "server_name": server_name,
                        "tool_name": tool_name,
                        "arguments": arguments,
                        "error": str(e),
                        "duration_ms": call_duration_ms,
                        "call_time": get_utc_plus_8_time(),
                    }
                )
                tool_result = {
                    "server_name": server_name,
                    "tool_name": tool_name,
                    "error": str(e),
                }
                self.task_log.log_step(
                    "error",
                    f"Main Agent | Turn: {turn_count} | Tool Call",
                    f"Tool {tool_name} failed to execute: {str(e)}",
                )

            # Format results for LLM
            tool_result_for_llm = self.output_formatter.format_tool_result_for_user(
                tool_result
            )
            all_tool_results_content_with_id.append((call_id, tool_result_for_llm))

        if should_rollback_turn:
            continue

        # Reset consecutive rollbacks on successful execution
        if consecutive_rollbacks > 0:
            self.task_log.log_step(
                "info",
                f"Main Agent | Turn: {turn_count} | Recovery",
                f"Successfully recovered after {consecutive_rollbacks} consecutive rollbacks",
            )
            consecutive_rollbacks = 0

        # Update 'last_call_tokens'
        self.llm_client.last_call_tokens = main_agent_last_call_tokens

        # Update message history
        message_history = self.llm_client.update_message_history(
            message_history, all_tool_results_content_with_id
        )

        self.task_log.main_agent_message_history = {
            "system_prompt": system_prompt,
            "message_history": message_history,
        }
        self.task_log.save()

        # Check context length
        temp_summary_prompt = generate_agent_summarize_prompt(
            task_description,
            agent_type="main",
        )

        pass_length_check, message_history = self.llm_client.ensure_summary_context(
            message_history, temp_summary_prompt
        )

        if not pass_length_check:
            # Setting turn_count to max_turns makes reached_max_turns True below.
            turn_count = max_turns
            self.task_log.log_step(
                "warning",
                f"Main Agent | Turn: {turn_count} | Context Limit Reached",
                "Context limit reached, triggering summary",
            )
            break

    await self.stream.end_llm("main")
    await self.stream.end_agent("main", self.current_agent_id)

    # Determine if max turns was reached
    reached_max_turns = turn_count >= max_turns
    if reached_max_turns:
        self.task_log.log_step(
            "warning",
            "Main Agent | Max Turns Reached / Context Limit Reached",
            f"Reached maximum turns ({max_turns}) or context limit reached",
        )
    else:
        self.task_log.log_step(
            "info",
            "Main Agent | Main Loop Completed",
            f"Main loop completed after {turn_count} turns",
        )

    # Final summary
    self.task_log.log_step(
        "info", "Main Agent | Final Summary", "Generating final summary"
    )

    self.current_agent_id = await self.stream.start_agent("Final Summary")
    await self.stream.start_llm("Final Summary")

    # Generate final answer using answer generator
    (
        final_summary,
        final_boxed_answer,
        failure_experience_summary,
        usage_log,
        message_history,
    ) = await self.answer_generator.generate_and_finalize_answer(
        system_prompt=system_prompt,
        message_history=message_history,
        tool_definitions=tool_definitions,
        turn_count=turn_count,
        task_description=task_description,
        reached_max_turns=reached_max_turns,
        is_final_retry=is_final_retry,
        save_callback=self._save_message_history,
    )

    await self.stream.tool_call("show_text", {"text": final_boxed_answer})
    await self.stream.end_llm("Final Summary")
    await self.stream.end_agent("Final Summary", self.current_agent_id)
    await self.stream.end_workflow(workflow_id)

    self.task_log.log_step(
        "info", "Main Agent | Usage Calculation", f"Usage log: {usage_log}"
    )
    self.task_log.log_step(
        "info",
        "Main Agent | Final boxed answer",
        f"Final boxed answer:\n\n{final_boxed_answer}",
    )
    self.task_log.log_step(
        "info",
        "Main Agent | Task Completed",
        f"Main agent task {task_id} completed successfully",
    )

    gc.collect()
    return final_summary, final_boxed_answer, failure_experience_summary
async def execute_task_pipeline(
    cfg: DictConfig,
    task_id: str,
    task_description: str,
    task_file_name: str,
    main_agent_tool_manager: ToolManager,
    sub_agent_tool_managers: Dict[str, ToolManager],
    output_formatter: OutputFormatter,
    ground_truth: Optional[Any] = None,
    log_dir: str = "logs",
    stream_queue: Optional[Any] = None,
    tool_definitions: Optional[List[Dict[str, Any]]] = None,
    sub_agent_tool_definitions: Optional[Dict[str, List[Dict[str, Any]]]] = None,
    is_final_retry: bool = False,
):
    """
    Executes the full pipeline for a single task.

    Args:
        cfg: The Hydra configuration object.
        task_id: A unique identifier for this task run (used for logging).
        task_description: The description of the task for the LLM.
        task_file_name: The path to an associated file (empty string if none).
        main_agent_tool_manager: An initialized main agent ToolManager instance.
        sub_agent_tool_managers: Dictionary mapping sub-agent names to their ToolManager instances.
        output_formatter: An initialized OutputFormatter instance.
        ground_truth: The ground truth for the task (optional).
        log_dir: The directory to save the task log (default: "logs").
        stream_queue: A queue for streaming the task execution (optional).
        tool_definitions: The definitions of the tools for the main agent (optional).
        sub_agent_tool_definitions: The definitions of the tools for the sub-agents (optional).
        is_final_retry: Forwarded unchanged to Orchestrator.run_main_agent.

    Returns:
        A tuple of (final_summary, final_boxed_answer, log_file_path, failure_experience_summary):
        - final_summary: A string with the final execution summary, or an error message.
        - final_boxed_answer: The extracted boxed answer from the LLM response
          (empty string when the task failed).
        - log_file_path: The path to the saved task log file.
        - failure_experience_summary: Summary of failure experience for retry
          (None when the task raised an exception).
    """
    # Create task log
    task_log = TaskLog(
        log_dir=log_dir,
        task_id=task_id,
        start_time=get_utc_plus_8_time(),
        input={"task_description": task_description, "task_file_name": task_file_name},
        env_info=get_env_info(cfg),
        ground_truth=ground_truth,
    )

    # Log task start
    task_log.log_step(
        "info", "Main | Task Start", f"--- Starting Task Execution: {task_id} ---"
    )

    # Set task_log for all ToolManager instances
    main_agent_tool_manager.set_task_log(task_log)
    if sub_agent_tool_managers:
        for sub_agent_tool_manager in sub_agent_tool_managers.values():
            sub_agent_tool_manager.set_task_log(task_log)

    # Tracked outside the try block so the finally clause can release the
    # client when the run fails or is cancelled before the normal close().
    llm_client = None
    llm_client_closed = False

    try:
        # Initialize LLM client
        random_uuid = str(uuid.uuid4())
        unique_id = f"{task_id}-{random_uuid}"
        llm_client = ClientFactory(task_id=unique_id, cfg=cfg, task_log=task_log)

        # Initialize orchestrator
        orchestrator = Orchestrator(
            main_agent_tool_manager=main_agent_tool_manager,
            sub_agent_tool_managers=sub_agent_tool_managers,
            llm_client=llm_client,
            output_formatter=output_formatter,
            cfg=cfg,
            task_log=task_log,
            stream_queue=stream_queue,
            tool_definitions=tool_definitions,
            sub_agent_tool_definitions=sub_agent_tool_definitions,
        )

        (
            final_summary,
            final_boxed_answer,
            failure_experience_summary,
        ) = await orchestrator.run_main_agent(
            task_description=task_description,
            task_file_name=task_file_name,
            task_id=task_id,
            is_final_retry=is_final_retry,
        )

        # Mark as closed before calling close() so that a failing close() is
        # reported through the except branch and not retried in finally.
        llm_client_closed = True
        llm_client.close()

        task_log.final_boxed_answer = final_boxed_answer
        task_log.status = "success"

        # Store failure experience summary in task log if available
        if failure_experience_summary:
            task_log.trace_data["failure_experience_summary"] = (
                failure_experience_summary
            )

        log_file_path = task_log.save()
        return (
            final_summary,
            final_boxed_answer,
            log_file_path,
            failure_experience_summary,
        )

    except Exception as e:
        error_details = traceback.format_exc()
        task_log.log_step(
            "warning",
            "task_error_notification",
            f"An error occurred during task {task_id}",
        )
        task_log.log_step("error", "task_error_details", error_details)

        error_message = (
            f"Error executing task {task_id}:\n"
            f"Description: {task_description}\n"
            f"File: {task_file_name}\n"
            f"Error Type: {type(e).__name__}\n"
            f"Error Details:\n{error_details}"
        )

        task_log.status = "failed"
        task_log.error = error_details

        log_file_path = task_log.save()

        return error_message, "", log_file_path, None

    finally:
        # Best-effort release of the LLM client on the failure/cancellation
        # path; a cleanup error must not mask the task outcome.
        if llm_client is not None and not llm_client_closed:
            try:
                llm_client.close()
            except Exception as close_error:
                task_log.log_step(
                    "warning",
                    "task_cleanup",
                    f"Failed to close LLM client: {close_error}",
                )

        task_log.end_time = get_utc_plus_8_time()

        # Record task summary to structured log
        task_log.log_step(
            "info",
            "task_execution_finished",
            f"Task {task_id} execution completed with status: {task_log.status}",
        )

        task_log.save()
def create_pipeline_components(cfg: DictConfig):
    """
    Build the tool managers and output formatter an agent run needs.

    Args:
        cfg: The Hydra configuration object.

    Returns:
        Tuple of (main_agent_tool_manager, sub_agent_tool_managers, output_formatter).
        sub_agent_tool_managers is an empty dict in single-agent mode
        (no sub-agents configured).
    """

    def _build_tool_manager(agent_cfg):
        # One ToolManager per agent, built from that agent's MCP server
        # parameters and tool blacklist.
        server_configs, blacklist = create_mcp_server_parameters(cfg, agent_cfg)
        return ToolManager(server_configs, tool_blacklist=blacklist)

    main_agent_tool_manager = _build_tool_manager(cfg.agent.main_agent)
    output_formatter = OutputFormatter()

    sub_agent_tool_managers = {}
    if cfg.agent.sub_agents:
        sub_agent_tool_managers = {
            agent_name: _build_tool_manager(cfg.agent.sub_agents[agent_name])
            for agent_name in cfg.agent.sub_agents
        }

    return main_agent_tool_manager, sub_agent_tool_managers, output_formatter
class StreamHandler:
    """
    Handler for streaming events in SSE protocol format.

    Manages the sending of various event types including workflow lifecycle,
    agent lifecycle, LLM interactions, and tool calls. Every event is put on
    the queue as a dict of the form ``{"event": <type>, "data": <payload>}``.
    """

    def __init__(self, stream_queue: Optional[Any] = None):
        """
        Initialize the stream handler.

        Args:
            stream_queue: Optional async queue for sending stream messages
                         (must provide an awaitable ``put``).
                         If None, streaming is disabled.
        """
        self.stream_queue = stream_queue

    async def update(self, event_type: str, data: dict):
        """
        Send a streaming update in SSE protocol format.

        Delivery is best-effort: a failing ``put`` is logged as a warning and
        otherwise ignored.

        Args:
            event_type: The type of event (e.g., 'start_of_workflow', 'tool_call')
            data: The event payload data
        """
        # Compare against None explicitly: a queue type that defines __len__
        # is falsy while empty, and a truthiness test would drop its events.
        if self.stream_queue is None:
            return
        try:
            stream_message = {
                "event": event_type,
                "data": data,
            }
            await self.stream_queue.put(stream_message)
        except Exception as e:
            logger.warning(f"Failed to send stream update: {e}")

    async def start_workflow(self, user_input: str) -> str:
        """
        Send start_of_workflow event.

        Args:
            user_input: The initial user input for the workflow

        Returns:
            The generated workflow ID
        """
        workflow_id = str(uuid.uuid4())
        await self.update(
            "start_of_workflow",
            {
                "workflow_id": workflow_id,
                "input": [
                    {
                        "role": "user",
                        "content": user_input,
                    }
                ],
            },
        )
        return workflow_id

    async def end_workflow(self, workflow_id: str):
        """
        Send end_of_workflow event.

        Args:
            workflow_id: The workflow ID to end
        """
        await self.update(
            "end_of_workflow",
            {
                "workflow_id": workflow_id,
            },
        )

    async def show_error(self, error: str):
        """
        Send show_error event and signal stream end.

        The error is sent as a ``show_error`` tool call, followed by a ``None``
        sentinel put on the queue.

        Args:
            error: The error message to display
        """
        await self.tool_call("show_error", {"error": error})
        if self.stream_queue is None:
            return
        try:
            await self.stream_queue.put(None)
        except Exception as e:
            logger.warning(f"Failed to send show_error: {e}")

    async def start_agent(
        self, agent_name: str, display_name: Optional[str] = None
    ) -> str:
        """
        Send start_of_agent event.

        Args:
            agent_name: Internal name of the agent
            display_name: Optional display name for UI

        Returns:
            The generated agent ID
        """
        agent_id = str(uuid.uuid4())
        await self.update(
            "start_of_agent",
            {
                "agent_name": agent_name,
                "display_name": display_name,
                "agent_id": agent_id,
            },
        )
        return agent_id

    async def end_agent(self, agent_name: str, agent_id: str):
        """
        Send end_of_agent event.

        Args:
            agent_name: Internal name of the agent
            agent_id: The agent ID to end
        """
        await self.update(
            "end_of_agent",
            {
                "agent_name": agent_name,
                "agent_id": agent_id,
            },
        )

    async def start_llm(self, agent_name: str, display_name: Optional[str] = None):
        """
        Send start_of_llm event.

        Args:
            agent_name: Name of the agent making the LLM call
            display_name: Optional display name for UI
        """
        await self.update(
            "start_of_llm",
            {
                "agent_name": agent_name,
                "display_name": display_name,
            },
        )

    async def end_llm(self, agent_name: str):
        """
        Send end_of_llm event.

        Args:
            agent_name: Name of the agent that finished LLM call
        """
        await self.update(
            "end_of_llm",
            {
                "agent_name": agent_name,
            },
        )

    async def message(self, message_id: str, delta_content: str):
        """
        Send message event with streaming content.

        Args:
            message_id: Unique identifier for the message
            delta_content: The content delta to send
        """
        await self.update(
            "message",
            {
                "message_id": message_id,
                "delta": {
                    "content": delta_content,
                },
            },
        )

    async def tool_call(
        self,
        tool_name: str,
        payload: dict,
        streaming: bool = False,
        tool_call_id: Optional[str] = None,
    ) -> str:
        """
        Send tool_call event.

        Args:
            tool_name: Name of the tool being called
            payload: Tool call arguments or results
            streaming: If True, send one event per payload key as
                ``delta_input``; otherwise send the payload once as
                ``tool_input``
            tool_call_id: Optional existing tool call ID

        Returns:
            The tool call ID (generated if not provided)
        """
        if not tool_call_id:
            tool_call_id = str(uuid.uuid4())

        if streaming:
            for key, value in payload.items():
                await self.update(
                    "tool_call",
                    {
                        "tool_call_id": tool_call_id,
                        "tool_name": tool_name,
                        "delta_input": {key: value},
                    },
                )
        else:
            # Send complete tool call
            await self.update(
                "tool_call",
                {
                    "tool_call_id": tool_call_id,
                    "tool_name": tool_name,
                    "tool_input": payload,
                },
            )

        return tool_call_id
""" Tool executor module for handling tool call execution. This module provides the ToolExecutor class that manages tool call execution, including argument fixing, duplicate detection, result processing, and error handling. """ import json import logging import os import time from collections import defaultdict from typing import Any, Dict, List, Optional, Tuple from miroflow_tools.manager import ToolManager from ..io.output_formatter import OutputFormatter from ..logging.task_logger import TaskLog, get_utc_plus_8_time from .stream_handler import StreamHandler logger = logging.getLogger(__name__) # Maximum length for scrape results in demo mode (to support more conversation turns) DEMO_SCRAPE_MAX_LENGTH = 20_000 class ToolExecutor: """ Executor for tool calls with support for duplicate detection and result processing. Handles the execution of tool calls, including parameter fixing, duplicate query detection, result truncation in demo mode, and error handling. """ def __init__( self, main_agent_tool_manager: ToolManager, sub_agent_tool_managers: Dict[str, ToolManager], output_formatter: OutputFormatter, task_log: TaskLog, stream_handler: StreamHandler, max_consecutive_rollbacks: int = 5, ): """ Initialize the tool executor. 
def fix_tool_call_arguments(self, tool_name: str, arguments: dict) -> dict:
    """
    Repair parameter names that the LLM commonly gets wrong.

    The input dictionary is never mutated; a shallow copy is patched and
    returned.

    Args:
        tool_name: Name of the tool being called
        arguments: Original arguments dictionary

    Returns:
        Fixed arguments dictionary
    """
    patched = arguments.copy()

    if tool_name == "scrape_and_extract_info":
        # 'description' / 'introduction' are frequent stand-ins for
        # 'info_to_extract'; only the first alias found is renamed.
        if "info_to_extract" not in patched:
            alias = next(
                (name for name in ("description", "introduction") if name in patched),
                None,
            )
            if alias is not None:
                patched["info_to_extract"] = patched.pop(alias)

    if tool_name == "run_python_code":
        # 'code' -> 'code_block'
        if "code" in patched and "code_block" not in patched:
            patched["code_block"] = patched.pop("code")
        # Missing sandbox_id gets a default (will trigger stateless fallback)
        patched.setdefault("sandbox_id", "default")

    return patched
def is_duplicate_query(self, cache_name: str, query_str: str) -> Tuple[bool, int]:
    """
    Check if a query has been executed before.

    This is a read-only lookup with respect to individual queries: checking
    a query that was never recorded does not add an entry for it.

    Args:
        cache_name: Name of the cache (e.g., "main_google_search")
        query_str: The query string to check

    Returns:
        Tuple of (is_duplicate, previous_count)
    """
    # The per-cache mapping is still created on first use so that callers
    # relying on its existence keep working.
    cache = self.used_queries.setdefault(cache_name, defaultdict(int))
    # Use .get() rather than indexing: indexing a defaultdict inserts a
    # zero-count entry, which would grow the cache on every check.
    count = cache.get(query_str, 0)
    return count > 0, count
def get_scrape_result(self, result: str) -> str:
    """
    Process scrape result and truncate if too long.

    If ``result`` parses as a JSON object, its ``text`` field is truncated
    to DEMO_SCRAPE_MAX_LENGTH and re-serialized as ``{"text": ...}`` (other
    keys are dropped). Anything else (plain text, JSON that is not an
    object, or a non-string value) is treated as raw text: strings are
    truncated to DEMO_SCRAPE_MAX_LENGTH and other values are returned
    unchanged.

    Args:
        result: Raw scrape result string (JSON or plain text)

    Returns:
        Processed result, truncated to DEMO_SCRAPE_MAX_LENGTH if necessary
    """
    try:
        parsed = json.loads(result)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError (not valid JSON);
        # TypeError covers non-string input such as None or a dict.
        parsed = None

    if isinstance(parsed, dict):
        text = parsed.get("text")
        if isinstance(text, str) and len(text) > DEMO_SCRAPE_MAX_LENGTH:
            text = text[:DEMO_SCRAPE_MAX_LENGTH]
        return json.dumps({"text": text}, ensure_ascii=False)

    # Valid JSON that is not an object (e.g. a list or number) has no
    # "text" field to extract, so fall back to raw-string truncation
    # instead of raising AttributeError.
    if isinstance(result, str) and len(result) > DEMO_SCRAPE_MAX_LENGTH:
        result = result[:DEMO_SCRAPE_MAX_LENGTH]
    return result
async def execute_single_tool_call(
    self,
    tool_manager: ToolManager,
    server_name: str,
    tool_name: str,
    arguments: dict,
    agent_name: str,
    turn_count: int,
) -> Tuple[dict, int, List[dict]]:
    """
    Run one tool call through the given tool manager and record the outcome.

    On success the post-processed result is returned. On any exception an
    error dictionary is returned instead; the exception is logged and not
    re-raised. In both cases a single record describing the call is
    produced.

    Args:
        tool_manager: The tool manager to use
        server_name: Name of the MCP server
        tool_name: Name of the tool
        arguments: Tool arguments
        agent_name: Name of the agent making the call
        turn_count: Current turn count

    Returns:
        Tuple of (tool_result, duration_ms, tool_calls_data)
    """
    started_at = time.time()
    records: List[dict] = []
    step_label = f"{agent_name} | Turn: {turn_count} | Tool Call"

    def _elapsed_ms() -> int:
        # Wall-clock duration since the call started, in whole milliseconds.
        return int((time.time() - started_at) * 1000)

    try:
        raw_result = await tool_manager.execute_tool_call(
            server_name, tool_name, arguments
        )
        outcome = self.post_process_tool_call_result(tool_name, raw_result)
        elapsed = _elapsed_ms()

        self.task_log.log_step(
            "info",
            step_label,
            f"Tool {tool_name} completed in {elapsed}ms",
        )
        records.append(
            {
                "server_name": server_name,
                "tool_name": tool_name,
                "arguments": arguments,
                "result": outcome,
                "duration_ms": elapsed,
                "call_time": get_utc_plus_8_time(),
            }
        )
        return outcome, elapsed, records

    except Exception as e:
        # Any failure above (execution, post-processing, logging) is turned
        # into an error result so the agent loop can continue.
        elapsed = _elapsed_ms()
        records.append(
            {
                "server_name": server_name,
                "tool_name": tool_name,
                "arguments": arguments,
                "error": str(e),
                "duration_ms": elapsed,
                "call_time": get_utc_plus_8_time(),
            }
        )
        failure = {
            "error": f"Tool call failed: {str(e)}",
            "server_name": server_name,
            "tool_name": tool_name,
        }
        self.task_log.log_step(
            "error",
            step_label,
            f"Tool {tool_name} failed to execute: {str(e)}",
        )
        return failure, elapsed, records
def _generate_image_caption(image_path: str) -> str:
    """
    Describe an image by sending it to the OpenAI chat completions API.

    The image is read from disk, base64-encoded, and submitted as a data
    URI to the "gpt-4o" model. This function never raises: every failure is
    reported through the returned string.

    Args:
        image_path: Path to the image file

    Returns:
        Caption string, or error message if failed
    """
    mime_by_suffix = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }

    try:
        api_key = os.environ.get("OPENAI_API_KEY")
        base_url = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
        if not api_key:
            return "[Caption unavailable: OPENAI_API_KEY not set]"

        client = OpenAI(api_key=api_key, base_url=base_url)

        # Read and base64-encode the raw image bytes
        with open(image_path, "rb") as image_file:
            encoded_image = base64.b64encode(image_file.read()).decode("utf-8")

        # Unknown suffixes are sent as JPEG
        suffix = os.path.splitext(image_path)[1].lower()
        mime_type = mime_by_suffix.get(suffix, "image/jpeg")

        user_message = {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Please provide a detailed description of this image. Include key objects, people, text, colors, and any other relevant details.",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime_type};base64,{encoded_image}"},
                },
            ],
        }
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[user_message],
            max_tokens=2048,
            temperature=0,
        )

        caption = response.choices[0].message.content
        if caption:
            return caption
        return "[Caption unavailable: Empty response]"

    except Exception as e:
        return f"[Caption generation failed: {str(e)}]"
def _generate_video_caption(video_path: str) -> str:
    """
    Describe a video by sending it to the OpenAI chat completions API.

    The whole file is read into memory, base64-encoded, and submitted as a
    data URI in an ``image_url`` content part to the "gpt-4o" model. This
    function never raises: every failure is reported through the returned
    string.

    Args:
        video_path: Path to the video file

    Returns:
        Caption string, or error message if failed
    """
    mime_by_suffix = {
        ".mp4": "video/mp4",
        ".mov": "video/quicktime",
        ".avi": "video/x-msvideo",
        ".mkv": "video/x-matroska",
        ".webm": "video/webm",
    }

    try:
        api_key = os.environ.get("OPENAI_API_KEY")
        base_url = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
        if not api_key:
            return "[Caption unavailable: OPENAI_API_KEY not set]"

        client = OpenAI(api_key=api_key, base_url=base_url)

        # Read and base64-encode the raw video bytes
        with open(video_path, "rb") as video_file:
            encoded_video = base64.b64encode(video_file.read()).decode("utf-8")

        # Unknown suffixes are sent as MP4
        suffix = os.path.splitext(video_path)[1].lower()
        mime_type = mime_by_suffix.get(suffix, "video/mp4")

        # NOTE(review): the video is passed through an "image_url" part;
        # whether the configured endpoint accepts video data URIs there is
        # not visible from this code - confirm against the backend in use.
        user_message = {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Please provide a detailed description of this video. Include key events, people, objects, actions, audio information, and any text visible in the video.",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime_type};base64,{encoded_video}"},
                },
            ],
        }
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[user_message],
            max_tokens=2048,
            temperature=0,
        )

        caption = response.choices[0].message.content
        if caption:
            return caption
        return "[Caption unavailable: Empty response]"

    except Exception as e:
        return f"[Caption generation failed: {str(e)}]"
def _extract_task_relevant_info_from_audio(
    audio_path: str, task_description: str
) -> str:
    """
    Ask an audio-capable model what in the audio file matters for the task.

    The audio is read from disk, base64-encoded, and sent together with a
    task-specific prompt to the "gpt-4o-audio-preview" model. Failures are
    printed as warnings and reported to the caller as an empty string.

    Args:
        audio_path: Path to the audio file
        task_description: The user's task description

    Returns:
        Extracted relevant information, or empty string if extraction fails
    """
    format_by_suffix = {
        ".mp3": "mp3",
        ".wav": "wav",
        ".m4a": "m4a",
    }

    try:
        api_key = os.environ.get("OPENAI_API_KEY")
        base_url = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1")
        if not api_key:
            return ""

        client = OpenAI(api_key=api_key, base_url=base_url)

        # Read and base64-encode the raw audio bytes
        with open(audio_path, "rb") as audio_file:
            encoded_audio = base64.b64encode(audio_file.read()).decode("utf-8")

        # Unknown suffixes are declared as mp3.
        # NOTE(review): confirm the target endpoint accepts "m4a" as an
        # input_audio format; this code passes it through unchanged.
        suffix = os.path.splitext(audio_path)[1].lower()
        audio_format = format_by_suffix.get(suffix, "mp3")

        # Use gpt-4o-audio-preview for direct audio question answering
        text_prompt = f"""Based on the following task, analyze this audio and extract only the information that is directly relevant to completing the task.

Task: {task_description}

Please provide a concise summary of the relevant information from the audio that would help in completing this task. Focus only on what's pertinent to the task. If nothing is particularly relevant, state "No specific task-relevant details identified in the audio."

Keep the response brief and focused."""

        system_message = {
            "role": "system",
            "content": "You are a helpful assistant specializing in audio analysis.",
        }
        user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": text_prompt},
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": encoded_audio,
                        "format": audio_format,
                    },
                },
            ],
        }
        response = client.chat.completions.create(
            model="gpt-4o-audio-preview",
            messages=[system_message, user_message],
            max_tokens=1024,
            temperature=0,
        )

        # A None content raises here and is handled below like any other
        # failure.
        return response.choices[0].message.content.strip()

    except Exception as e:
        print(f"Warning: Failed to extract task-relevant info from audio: {str(e)}")
        return ""
def _build_file_note(subject: str, task_file_name: str, extracted_as: str) -> str:
    """Return the standard note announcing an attached file and how it was extracted."""
    return (
        f"\n\nNote: {subject} '{task_file_name}' is associated with this task. "
        f"The content has been extracted as {extracted_as} below. "
        "You may use available tools to process its content if necessary. "
        "If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"
    )


def _format_media_section(
    heading: str, task_file_name: str, caption: str, relevant_info: str
) -> str:
    """Return the markdown section for a captioned media file (image/audio/video)."""
    section = f"## {heading}\nFile: {task_file_name}\n\n"
    section += f"> {caption}\n\n"
    if relevant_info:
        section += "Task-Relevant Information:\n\n"
        section += f"{relevant_info}\n\n"
    return section


def _truncate_file_content(content: Any, max_len: int = 200_000) -> Any:
    """Cap string content at max_len characters; pass anything else through unchanged."""
    if isinstance(content, str) and len(content) > max_len:
        return content[:max_len] + "\n... [File truncated]"
    return content


def process_input(task_description: str, task_file_name: str) -> Tuple[str, str]:
    """
    Process user input and associated files.

    Extracts content from the task file (if provided) and appends it to the
    task description in a format suitable for the LLM. The file type is
    chosen from the file name's extension. Media files are captioned,
    known document/text formats go through dedicated converters, and
    anything else (except extensions in SKIP_MARKITDOWN_EXTENSIONS) is
    handed to MarkItDown as a fallback. Extracted text is capped at 200,000
    characters whether or not the parsed result carries a title.

    File-processing errors never propagate: a warning line is appended to
    the returned description instead.

    Args:
        task_description: The original task description
        task_file_name: Path to an associated file, or empty string if none

    Returns:
        Tuple of (updated_task_description, updated_task_description)
        Both values are the same - the task description with the output
        format requirement and any file content appended
    """
    text_file_type_names = {
        "txt": "Text",
        "md": "Markdown",
        "sh": "Shell Script",
        "yaml": "YAML",
        "yml": "YAML",
        "toml": "TOML",
        "csv": "CSV",
    }

    updated_task_description = task_description
    file_content_section = ""  # Collect file content to append at the end

    if task_file_name:
        try:
            file_extension = task_file_name.rsplit(".", maxsplit=1)[-1].lower()
            parsing_result = None

            # In every branch the file is read/converted BEFORE its note is
            # appended, so a failed read leaves no misleading note behind.
            if file_extension in IMAGE_EXTENSIONS:
                # Unconditional caption plus task-focused extraction
                caption = _generate_image_caption(task_file_name)
                relevant_info = _extract_task_relevant_info_from_image(
                    task_file_name, task_description
                )
                file_content_section += _build_file_note(
                    "An image file", task_file_name, "a detailed caption"
                )
                file_content_section += _format_media_section(
                    "Image Content", task_file_name, caption, relevant_info
                )

            elif file_extension == "py":
                # Python files - read directly
                with open(task_file_name, "r", encoding="utf-8") as f:
                    parsing_result = DocumentConverterResult(
                        title=None, text_content=f.read()
                    )
                file_content_section += _build_file_note(
                    "A Python file", task_file_name, "text"
                )
                file_content_section += f"## Python File\nFile: {task_file_name}\n\n"

            elif file_extension in text_file_type_names:
                # Text-based files - read directly
                with open(task_file_name, "r", encoding="utf-8") as f:
                    parsing_result = DocumentConverterResult(
                        title=None, text_content=f.read()
                    )
                file_type_name = text_file_type_names[file_extension]
                file_content_section += _build_file_note(
                    f"A {file_type_name.lower()} file", task_file_name, "text"
                )
                file_content_section += (
                    f"## {file_type_name} File\nFile: {task_file_name}\n\n"
                )

            elif file_extension in ["jsonld", "json"]:
                # Re-serialize so the LLM sees consistently indented JSON
                with open(task_file_name, "r", encoding="utf-8") as f:
                    parsing_result = DocumentConverterResult(
                        title=None,
                        text_content=json.dumps(
                            json.load(f), ensure_ascii=False, indent=2
                        ),
                    )
                file_content_section += _build_file_note(
                    "A JSON file", task_file_name, "JSON format"
                )
                file_content_section += f"## JSON File\nFile: {task_file_name}\n\n"

            elif file_extension in ["xlsx", "xls"]:
                parsing_result = XlsxConverter(local_path=task_file_name)
                file_content_section += _build_file_note(
                    "An Excel file", task_file_name, "a markdown table"
                )
                file_content_section += f"## Excel File\nFile: {task_file_name}\n\n"

            elif file_extension == "pdf":
                parsing_result = DocumentConverterResult(
                    title=None,
                    text_content=pdfminer.high_level.extract_text(task_file_name),
                )
                file_content_section += _build_file_note(
                    "A PDF file", task_file_name, "text"
                )
                file_content_section += f"## PDF File\nFile: {task_file_name}\n\n"

            elif file_extension in ["docx", "doc"]:
                parsing_result = DocxConverter(local_path=task_file_name)
                file_content_section += _build_file_note(
                    "A Word document", task_file_name, "markdown"
                )
                file_content_section += f"## Word Document\nFile: {task_file_name}\n\n"

            elif file_extension in ["html", "htm"]:
                parsing_result = HtmlConverter(local_path=task_file_name)
                file_content_section += _build_file_note(
                    "An HTML file", task_file_name, "markdown"
                )
                file_content_section += f"## HTML File\nFile: {task_file_name}\n\n"

            elif file_extension in ["pptx", "ppt"]:
                parsing_result = PptxConverter(local_path=task_file_name)
                file_content_section += _build_file_note(
                    "A PowerPoint presentation", task_file_name, "markdown"
                )
                file_content_section += (
                    f"## PowerPoint Presentation\nFile: {task_file_name}\n\n"
                )

            elif file_extension in AUDIO_EXTENSIONS:
                # Unconditional transcription plus task-focused extraction
                caption = _generate_audio_caption(task_file_name)
                relevant_info = _extract_task_relevant_info_from_audio(
                    task_file_name, task_description
                )
                file_content_section += _build_file_note(
                    "An audio file", task_file_name, "a transcription"
                )
                file_content_section += _format_media_section(
                    "Audio Content", task_file_name, caption, relevant_info
                )

            elif file_extension in VIDEO_EXTENSIONS:
                # Unconditional caption plus task-focused extraction
                caption = _generate_video_caption(task_file_name)
                relevant_info = _extract_task_relevant_info_from_video(
                    task_file_name, task_description
                )
                file_content_section += _build_file_note(
                    "A video file", task_file_name, "a detailed caption"
                )
                file_content_section += _format_media_section(
                    "Video Content", task_file_name, caption, relevant_info
                )

            elif file_extension in ["zip"]:
                parsing_result = ZipConverter(local_path=task_file_name)
                file_content_section += _build_file_note(
                    "A ZIP archive", task_file_name, "file list and contents"
                )
                file_content_section += f"## ZIP Archive\nFile: {task_file_name}\n\n"

            elif file_extension == "pdb":
                # PDB files (protein database) - only add note, no extraction
                file_content_section += f"\n\nNote: A PDB file '{task_file_name}' is associated with this task. You may use available tools to read its content if necessary. If you need to further process this file in the sandbox, please upload it to the sandbox first.\n\n"

            # MarkItDown fallback - ONLY if no specialized converter produced
            # a result and the extension is not explicitly excluded.
            if parsing_result is None:
                try:
                    if file_extension not in SKIP_MARKITDOWN_EXTENSIONS:
                        md = MarkItDown(enable_plugins=True)
                        parsing_result = md.convert(task_file_name)
                        print(
                            f"Info: Used MarkItDown as fallback to process file {task_file_name}"
                        )
                        file_content_section += _build_file_note(
                            "A file", task_file_name, "markdown"
                        )
                        file_content_section += (
                            f"## File Content\nFile: {task_file_name}\n\n"
                        )
                except Exception as e:
                    # Best-effort fallback: report and continue without content
                    print(
                        f"Warning: MarkItDown failed to process {task_file_name}: {e}"
                    )

            # Collect the content and title (if any) to append later. Both
            # paths are truncated so a titled document cannot bypass the
            # size limit on what is returned to the LLM.
            title = getattr(parsing_result, "title", None)
            text_content = getattr(parsing_result, "text_content", None)
            if title:
                file_content_section += "Title:\n\n{}\n\n".format(title)
                file_content_section += "Content:\n\n```\n{}\n```\n".format(
                    _truncate_file_content(text_content)
                )
            elif text_content:
                file_content_section += "```\n{}\n```\n".format(
                    _truncate_file_content(text_content)
                )
            # else: media files have already formatted their own content

        except FileNotFoundError:
            print(f"Error: File not found {task_file_name}")
            file_content_section += (
                f"\nWarning: The specified file '{task_file_name}' was not found."
            )
        except Exception as e:
            print(f"Error: Error processing file {task_file_name}: {e}")
            traceback.print_exc()
            file_content_section += f"\nWarning: There was an error processing the file '{task_file_name}': {str(e)}"

    # output format requirement
    updated_task_description += "\nYou should follow the format instruction in the request strictly and wrap the final answer in \\boxed{}."

    # Append file content at the end
    updated_task_description += file_content_section
    updated_task_description = updated_task_description.strip()

    return updated_task_description, updated_task_description
- Ensuring URIs are properly escaped, and do not conflict with Markdown syntax """ def __init__(self, **options: Any): options["heading_style"] = options.get("heading_style", markdownify.ATX) # Explicitly cast options to the expected type if necessary super().__init__(**options) def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str: """Same as usual, but be sure to start with a new line""" if not convert_as_inline: if not re.search(r"^\n", text): return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore return super().convert_hn(n, el, text, convert_as_inline) # type: ignore def convert_a(self, el: Any, text: str, convert_as_inline: bool): """Same as usual converter, but removes Javascript links and escapes URIs.""" prefix, suffix, text = markdownify.chomp(text) # type: ignore if not text: return "" href = el.get("href") title = el.get("title") # Escape URIs and skip non-http or file schemes if href: try: parsed_url = urlparse(href) # type: ignore if parsed_url.scheme and parsed_url.scheme.lower() not in [ "http", "https", "file", ]: # type: ignore return "%s%s%s" % (prefix, text, suffix) href = urlunparse( parsed_url._replace(path=quote(unquote(parsed_url.path))) ) # type: ignore except ValueError: # It's not clear if this ever gets thrown return "%s%s%s" % (prefix, text, suffix) # For the replacement see #29: text nodes underscores are escaped if ( self.options["autolinks"] and text.replace(r"\_", "_") == href and not title and not self.options["default_title"] ): # Shortcut syntax return "<%s>" % href if self.options["default_title"] and not title: title = href title_part = ' "%s"' % title.replace('"', r"\"") if title else "" return ( "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) if href else text ) def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str: """Same as usual converter, but removes data URIs""" alt = el.attrs.get("alt", None) or "" src = el.attrs.get("src", None) or "" 
title = el.attrs.get("title", None) or "" title_part = ' "%s"' % title.replace('"', r"\"") if title else "" if ( convert_as_inline and el.parent.name not in self.options["keep_inline_images_in"] ): return alt # Remove dataURIs if src.startswith("data:"): src = src.split(",")[0] + "..." return "![%s](%s%s)" % (alt, src, title_part) def convert_soup(self, soup: Any) -> str: return super().convert_soup(soup) # type: ignore class DocumentConverterResult: """The result of converting a document to text.""" def __init__(self, title: Union[str, None] = None, text_content: str = ""): self.title: Union[str, None] = title self.text_content: str = text_content def convert_html_to_md(html_content): """ Placeholder for HTML to Markdown conversion function In the original class, this would call self._convert() """ soup = BeautifulSoup(html_content, "html.parser") for script in soup(["script", "style"]): script.extract() # Print only the main content body_elm = soup.find("body") webpage_text = "" if body_elm: webpage_text = _CustomMarkdownify().convert_soup(body_elm) else: webpage_text = _CustomMarkdownify().convert_soup(soup) assert isinstance(webpage_text, str) return DocumentConverterResult( title=None if soup.title is None else soup.title.string, text_content=webpage_text, ) def HtmlConverter(local_path: str): """ Convert an HTML file to Markdown format. Args: local_path: Path to the HTML file to convert. Returns: DocumentConverterResult containing the converted Markdown text. """ with open(local_path, "rt", encoding="utf-8") as fh: html_content = fh.read() return convert_html_to_md(html_content) def DocxConverter(local_path: str): """ Convert a DOCX file to Markdown format. Uses mammoth library to first convert DOCX to HTML, then converts the HTML to Markdown. Args: local_path: Path to the DOCX file to convert. Returns: DocumentConverterResult containing the converted Markdown text. 
""" with open(local_path, "rb") as docx_file: result = mammoth.convert_to_html(docx_file) html_content = result.value return convert_html_to_md(html_content) def XlsxConverter(local_path: str): """ Converts Excel files to Markdown using openpyxl. Preserves color formatting and other cell styling information. Args: local_path: Path to the Excel file Returns: DocumentConverterResult with the Markdown representation of the Excel file """ # Load the workbook wb = openpyxl.load_workbook(local_path, data_only=True) md_content = "" # Helper function to convert RGB color to hex def rgb_to_hex(rgb_value): if not rgb_value: return None # Convert RGB value to string for processing rgb_string = str(rgb_value) # Handle RGB format like 'RGB(255, 255, 255)' if isinstance(rgb_value, str) and rgb_string.startswith("RGB"): rgb_match = re.match(r"RGB\((\d+), (\d+), (\d+)\)", rgb_string) if rgb_match: r, g, b = map(int, rgb_match.groups()) return f"#{r:02x}{g:02x}{b:02x}" # Special handling for FFFFFFFF (white) and 00000000 (transparent/none) if rgb_string in ["FFFFFFFF", "00000000", "none", "auto"]: return None # Handle ARGB format (common in openpyxl) if len(rgb_string) == 8: # ARGB format like 'FF5733FF' return f"#{rgb_string[2:]}" # Strip alpha channel # Handle direct hex values like 'FF5733' if isinstance(rgb_value, str): return f"#{rgb_string}" if not rgb_string.startswith("#") else rgb_string return None # Return None for unrecognized formats # Helper function to detect and format cell styling def get_cell_format_info(cell): info = {} # Get background color if it exists if cell.fill and hasattr(cell.fill, "fgColor") and cell.fill.fgColor: # Get the RGB value - in openpyxl this can be stored in different attributes rgb_value = None if hasattr(cell.fill.fgColor, "rgb") and cell.fill.fgColor.rgb: rgb_value = cell.fill.fgColor.rgb elif hasattr(cell.fill.fgColor, "value") and cell.fill.fgColor.value: rgb_value = cell.fill.fgColor.value if rgb_value: bg_color = rgb_to_hex(rgb_value) 
if bg_color: # Skip transparent or white (handled in rgb_to_hex) info["bg_color"] = bg_color # Get font color if it exists if cell.font and hasattr(cell.font, "color") and cell.font.color: # Get the RGB value - in openpyxl this can be stored in different attributes rgb_value = None if hasattr(cell.font.color, "rgb") and cell.font.color.rgb: rgb_value = cell.font.color.rgb elif hasattr(cell.font.color, "value") and cell.font.color.value: rgb_value = cell.font.color.value if rgb_value: font_color = rgb_to_hex(rgb_value) if font_color: # Skip transparent (handled in rgb_to_hex) info["font_color"] = font_color # Get font weight (bold) if cell.font and cell.font.bold: info["bold"] = True # Get font style (italic) if cell.font and cell.font.italic: info["italic"] = True # Get font underline if cell.font and cell.font.underline and cell.font.underline != "none": info["underline"] = True return info # Process each sheet in the workbook for sheet_name in wb.sheetnames: try: sheet = wb[sheet_name] md_content += f"## {sheet_name}\n\n" # Get the dimensions of the used part of the sheet min_row, min_col = 1, 1 max_row = max( (cell.row for cell in sheet._cells.values() if cell.value is not None), default=0, ) max_col = max( ( cell.column for cell in sheet._cells.values() if cell.value is not None ), default=0, ) if max_row == 0 or max_col == 0: md_content += "This sheet is empty.\n\n" continue except Exception as e: error_msg = f"Error processing sheet '{sheet_name}': {str(e)}" print(error_msg) md_content += ( f"## {sheet_name}\n\nError processing this sheet: {str(e)}\n\n" ) continue try: # First, determine column widths col_widths = {} for col_idx in range(min_col, max_col + 1): max_length = 0 # col_letter = get_column_letter(col_idx) _ = get_column_letter(col_idx) for row_idx in range(min_row, max_row + 1): try: cell = sheet.cell(row=row_idx, column=col_idx) cell_value = str(cell.value) if cell.value is not None else "" max_length = max(max_length, len(cell_value)) except 
Exception as e: print( f"Warning: Error processing cell at row {row_idx}, column {col_idx}: {str(e)}" ) max_length = max(max_length, 10) # Use reasonable default col_widths[col_idx] = max(max_length + 2, 5) # Min width of 5 # Start building the table # Header row with column separators md_content += "|" for col_idx in range(min_col, max_col + 1): md_content += " " + " " * col_widths[col_idx] + " |" md_content += "\n" # Separator row md_content += "|" for col_idx in range(min_col, max_col + 1): md_content += ":" + "-" * col_widths[col_idx] + ":|" md_content += "\n" # Data rows for row_idx in range(min_row, max_row + 1): md_content += "|" for col_idx in range(min_col, max_col + 1): try: cell = sheet.cell(row=row_idx, column=col_idx) cell_value = str(cell.value) if cell.value is not None else "" # Get formatting info try: format_info = get_cell_format_info(cell) except Exception as e: print( f"Warning: Error getting formatting for cell at row {row_idx}, column {col_idx}: {str(e)}" ) format_info = {} formatted_value = cell_value # Add HTML-style formatting if needed if format_info: style_parts = [] if "bg_color" in format_info: style_parts.append( f"background-color:{format_info['bg_color']}" ) if "font_color" in format_info: style_parts.append(f"color:{format_info['font_color']}") span_attributes = [] if style_parts: span_attributes.append( f'style="{"; ".join(style_parts)}"' ) # Format with bold/italic/underline if needed inner_value = cell_value if "bold" in format_info: inner_value = f"{inner_value}" if "italic" in format_info: inner_value = f"{inner_value}" if "underline" in format_info: inner_value = f"{inner_value}" # Only add a span if we have style attributes if span_attributes: formatted_value = f"{inner_value}" else: formatted_value = inner_value # Pad to column width and add to markdown padding = col_widths[col_idx] - len(cell_value) padded_value = " " + formatted_value + " " * (padding + 1) md_content += padded_value + "|" except Exception as e: print( 
f"Error processing cell at row {row_idx}, column {col_idx}: {str(e)}" ) # Add a placeholder for the failed cell padded_value = " [Error] " + " " * (col_widths[col_idx] - 7) md_content += padded_value + " |" md_content += "\n" except Exception as e: error_msg = f"Error generating table for sheet '{sheet_name}': {str(e)}\n{traceback.format_exc()}" print(error_msg) md_content += f"Error generating table: {str(e)}\n\n" # Add formatting legend has_formatting = False for row_idx in range(min_row, max_row + 1): for col_idx in range(min_col, max_col + 1): cell = sheet.cell(row=row_idx, column=col_idx) if get_cell_format_info(cell): has_formatting = True break if has_formatting: break if has_formatting: md_content += "\n### Formatting Information\n" md_content += "The table above includes HTML formatting to represent colors and styles from the original Excel file.\n" md_content += "This formatting may not display in all Markdown viewers.\n" md_content += "\n\n" # Extra newlines between sheets return DocumentConverterResult( title=None, text_content=md_content.strip(), ) def PptxConverter(local_path: str) -> DocumentConverterResult: """ Converts PPTX files to Markdown. Supports headings, tables and images with alt text. 
Args: local_path: Path to the PPTX file Returns: DocumentConverterResult containing the converted Markdown text """ def is_picture(shape): """Check if a shape is a picture""" if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE: return True if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER: if hasattr(shape, "image"): return True return False def is_table(shape): """Check if a shape is a table""" if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE: return True return False if not local_path.endswith(".pptx"): return DocumentConverterResult( title=None, text_content=f"Error: Expected .pptx file, got: {local_path}", ) md_content = "" presentation = pptx.Presentation(local_path) slide_num = 0 for slide in presentation.slides: slide_num += 1 md_content += f"\n\n\n" title = slide.shapes.title for shape in slide.shapes: # Pictures if is_picture(shape): # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069 alt_text = "" try: alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") except Exception: pass # A placeholder name filename = re.sub(r"\W", "", shape.name) + ".jpg" md_content += ( "\n![" + (alt_text if alt_text else shape.name) + "](" + filename + ")\n" ) # Tables if is_table(shape): html_table = "" first_row = True for row in shape.table.rows: html_table += "" for cell in row.cells: if first_row: html_table += "" else: html_table += "" html_table += "" first_row = False html_table += "
" + html.escape(cell.text) + "" + html.escape(cell.text) + "
" # Note: This would require a separate HTML to Markdown converter function # In this version, I'm assuming a convert_html_to_md function exists md_content += ( "\n" + convert_html_to_md(html_table).text_content.strip() + "\n" ) # Text areas elif shape.has_text_frame: if shape == title: md_content += "# " + shape.text.lstrip() + "\n" else: md_content += shape.text + "\n" md_content = md_content.strip() if slide.has_notes_slide: md_content += "\n\n### Notes:\n" notes_frame = slide.notes_slide.notes_text_frame if notes_frame is not None: md_content += notes_frame.text md_content = md_content.strip() return DocumentConverterResult( title=None, text_content=md_content.strip(), ) def ZipConverter(local_path: str, **kwargs): """ Extracts ZIP files to a temporary directory and processes each file according to its extension. Returns a combined result of all processed files. """ import zipfile temp_dir = tempfile.mkdtemp(prefix="zip_extract_") md_content = f"# Extracted from ZIP: {os.path.basename(local_path)}\n\n" try: with zipfile.ZipFile(local_path, "r") as zip_ref: zip_ref.extractall(temp_dir) # Get all extracted files extracted_files = [] for root, dirs, files in os.walk(temp_dir): for file in files: file_path = os.path.join(root, file) rel_path = os.path.relpath(file_path, temp_dir) extracted_files.append((file_path, rel_path)) if not extracted_files: md_content += "The ZIP file is empty or contains no files.\n" else: md_content += f"Total files extracted: {len(extracted_files)}\n\n" for file_path, rel_path in extracted_files: md_content += f"## File: {rel_path}\n\n" # Process each file based on its extension file_extension = ( file_path.rsplit(".", maxsplit=1)[-1].lower() if "." 
in file_path else "" ) file_result = None try: # Use the same processing logic as process_input if file_extension == "py": with open(file_path, "r", encoding="utf-8") as f: file_result = DocumentConverterResult( title=None, text_content=f.read() ) elif file_extension in [ "txt", "md", "sh", "yaml", "yml", "toml", "csv", ]: with open(file_path, "r", encoding="utf-8") as f: file_result = DocumentConverterResult( title=None, text_content=f.read() ) elif file_extension in ["jsonld", "json"]: with open(file_path, "r", encoding="utf-8") as f: file_result = DocumentConverterResult( title=None, text_content=json.dumps( json.load(f), ensure_ascii=False, indent=2 ), ) elif file_extension in ["xlsx", "xls"]: file_result = XlsxConverter(local_path=file_path) elif file_extension == "pdf": file_result = DocumentConverterResult( title=None, text_content=pdfminer.high_level.extract_text(file_path), ) elif file_extension in ["docx", "doc"]: file_result = DocxConverter(local_path=file_path) elif file_extension in ["html", "htm"]: file_result = HtmlConverter(local_path=file_path) elif file_extension in ["pptx", "ppt"]: file_result = PptxConverter(local_path=file_path) elif file_extension in IMAGE_EXTENSIONS: # Generate image caption for files in ZIP caption = _generate_image_caption(file_path) md_content += "[Image file]\n\n" md_content += f"> {caption}\n\n" continue elif file_extension in AUDIO_EXTENSIONS: # Generate audio caption for files in ZIP caption = _generate_audio_caption(file_path) md_content += "[Audio file]\n\n" md_content += f"> {caption}\n\n" continue elif file_extension in VIDEO_EXTENSIONS: # Generate video caption for files in ZIP caption = _generate_video_caption(file_path) md_content += "[Video file]\n\n" md_content += f"> {caption}\n\n" continue elif file_extension == "pdb": md_content += "[PDB file - specialized format]\n\n" continue else: # Try MarkItDown as fallback try: md_tool = MarkItDown(enable_plugins=True) file_result = md_tool.convert(file_path) except 
class OutputFormatter:
    """Formatter for processing and formatting agent outputs."""

    # Matches the literal token "\boxed". The brace group that follows is
    # parsed by hand because the content may contain nested/escaped braces.
    _BOXED_RE = re.compile(r"\\boxed\b")

    # Boxed contents that carry no answer and are reported as "nothing found".
    _PLACEHOLDER_ANSWERS = frozenset(
        {"?", "??", "???", "……", "…", "...", "unknown"}
    )

    def _extract_boxed_content(self, text: str) -> str:
        r"""
        Extract the content of the last \boxed{...} occurrence in the given text.

        Supports:
        - Arbitrary levels of nested braces
        - Escaped braces (\{ and \})
        - Whitespace between \boxed and the opening brace
        - Empty content inside braces
        - Incomplete boxed expressions (extracts to end of string as fallback)

        The content is stripped of surrounding whitespace *before* it is
        compared with the placeholder answers, so "\boxed{ ? }" is rejected
        just like "\boxed{?}".

        Args:
            text: Input text that may contain \boxed{...} expressions

        Returns:
            The stripped boxed content, or an empty string if there is no
            boxed expression or its content is only a placeholder.
        """
        if not text:
            return ""

        last_content = None  # content of the most recent \boxed{...} seen
        n = len(text)
        pos = 0

        while True:
            match = self._BOXED_RE.search(text, pos)
            if match is None:
                break

            # Skip any whitespace between \boxed and the opening brace.
            brace = match.end()
            while brace < n and text[brace].isspace():
                brace += 1

            # Not followed by '{': not a boxed expression, keep searching.
            if brace >= n or text[brace] != "{":
                pos = brace
                continue

            # Walk the brace group by hand to honour nesting and escapes.
            depth = 0
            escaped = False
            closing = None
            for k in range(brace, n):
                ch = text[k]
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == "{":
                    depth += 1
                elif ch == "}":
                    depth -= 1
                    if depth == 0:
                        closing = k
                        break

            if closing is None:
                # Unterminated group: take everything up to the end of the
                # text. The scan consumed the rest of the string, so no later
                # \boxed can follow.
                last_content = text[brace + 1 :]
                break

            last_content = text[brace + 1 : closing]
            pos = closing + 1

        if last_content is None:
            return ""

        answer = last_content.strip()
        return "" if answer in self._PLACEHOLDER_ANSWERS else answer

    def format_tool_result_for_user(self, tool_call_execution_result: dict) -> dict:
        """
        Format tool execution results to be fed back to LLM as user messages.

        Only includes necessary information (results or errors). Results longer
        than TOOL_RESULT_MAX_LENGTH are cut to that length and marked as
        truncated to prevent context overflow.

        Args:
            tool_call_execution_result: Dict containing 'server_name',
                'tool_name', and either 'result' or 'error'. When both are
                present the error takes precedence.

        Returns:
            Dict with 'type' and 'text' keys suitable for LLM message content.

        Raises:
            KeyError: If 'server_name' or 'tool_name' is missing.
        """
        server_name = tool_call_execution_result["server_name"]
        tool_name = tool_call_execution_result["tool_name"]

        if "error" in tool_call_execution_result:
            # Provide concise error information to LLM
            content = f"Tool call to {tool_name} on {server_name} failed. Error: {tool_call_execution_result['error']}"
        elif "result" in tool_call_execution_result:
            # Provide the original output result of the tool
            content = tool_call_execution_result["result"]
            # Truncate overly long results to prevent context overflow
            if len(content) > TOOL_RESULT_MAX_LENGTH:
                content = content[:TOOL_RESULT_MAX_LENGTH] + "\n... [Result truncated]"
        else:
            content = f"Tool call to {tool_name} on {server_name} completed, but produced no specific output or result."

        return {"type": "text", "text": content}

    def format_final_summary_and_log(
        self, final_answer_text: str, client=None
    ) -> Tuple[str, str, str]:
        """
        Format final summary information, including answers and token statistics.

        Args:
            final_answer_text: The final answer text from the agent
            client: Optional LLM client; when it provides
                format_token_usage_summary(), that method supplies the token
                usage lines and the usage log string.

        Returns:
            Tuple of (summary_text, boxed_result, usage_log). boxed_result is
            FORMAT_ERROR_MESSAGE when the answer text is non-empty but has no
            usable boxed content, and an empty string when the answer text
            itself is empty.
        """
        summary_lines = [
            "\n" + "=" * 30 + " Final Answer " + "=" * 30,
            final_answer_text,
        ]

        # Only the last \boxed{...} in the answer counts as the result.
        boxed_result = self._extract_boxed_content(final_answer_text)

        # Add extracted result section
        summary_lines.append("\n" + "-" * 20 + " Extracted Result " + "-" * 20)
        if boxed_result:
            summary_lines.append(boxed_result)
        elif final_answer_text:
            summary_lines.append("No \\boxed{} content found.")
            boxed_result = FORMAT_ERROR_MESSAGE

        # Token usage statistics and cost estimation - use client method
        if client and hasattr(client, "format_token_usage_summary"):
            token_summary_lines, log_string = client.format_token_usage_summary()
            summary_lines.extend(token_summary_lines)
        else:
            # If no client or client doesn't support it, use default format
            summary_lines.append("\n" + "-" * 20 + " Token Usage & Cost " + "-" * 20)
            summary_lines.append("Token usage information not available.")
            summary_lines.append("-" * (40 + len(" Token Usage & Cost ")))
            log_string = "Token usage information not available."

        return "\n".join(summary_lines), boxed_result, log_string
There are four usage types: - input/output tokens: Standard input and output token counts - cache write/read tokens: Tokens involved in caching operations Provider-specific notes: - OpenAI: Cache write is free, cache read is cheaper - Anthropic: Cache write has a small cost, cache read is cheaper """ total_input_tokens: int total_output_tokens: int total_cache_read_input_tokens: int total_cache_write_input_tokens: int @dataclasses.dataclass class BaseClient(ABC): """ Abstract base class for LLM provider clients. This class provides the common interface and utilities for interacting with different LLM providers (OpenAI, Anthropic, etc.). Concrete implementations should override _create_client() and provider-specific methods. Attributes: task_id: Unique identifier for the current task (used for tracking) cfg: Hydra configuration containing LLM settings task_log: Optional logger for recording task execution details """ # Required arguments (no default value) task_id: str cfg: DictConfig # Optional arguments (with default value) task_log: Optional["TaskLog"] = None # Initialized in __post_init__ client: Any = dataclasses.field(init=False) token_usage: TokenUsage = dataclasses.field(init=False) last_call_tokens: Dict[str, int] = dataclasses.field(init=False) def __post_init__(self): # Initialize last_call_tokens before other operations self.last_call_tokens: Dict[str, int] = { "prompt_tokens": 0, "completion_tokens": 0, } # Explicitly assign from cfg object self.provider: str = self.cfg.llm.provider self.model_name: str = self.cfg.llm.model_name self.temperature: float = self.cfg.llm.temperature self.top_p: float = self.cfg.llm.top_p self.min_p: float = self.cfg.llm.min_p self.top_k: int = self.cfg.llm.top_k self.max_context_length: int = self.cfg.llm.max_context_length self.max_tokens: int = self.cfg.llm.max_tokens self.async_client: bool = self.cfg.llm.async_client self.keep_tool_result: int = self.cfg.agent.keep_tool_result self.api_key: Optional[str] = 
def _remove_tool_result_from_messages(
    self, messages, keep_tool_result
) -> List[Dict]:
    """Blank out older tool results so the message history stays small.

    Every message whose role is "user" or "tool" is considered. The first
    such message is treated as the initial task and always keeps its
    content; the remaining ones are treated as tool results, and only the
    last ``keep_tool_result`` of them keep their content. The others keep
    their position and role but have their content replaced by a short
    placeholder (a one-element text list when the original content was a
    list, a plain string otherwise).

    The input list and its message dicts are not modified: the method works
    on shallow copies of the messages.

    Args:
        messages: List of message dictionaries
        keep_tool_result: Number of tool results to keep. -1 means keep all;
            0 (or any other value below 1) keeps only the initial task.

    Returns:
        List of messages with tool results filtered according to keep_tool_result
    """
    placeholder = "Tool result is omitted to save tokens."

    def log_retention(message: str) -> None:
        # task_log is optional on the client; without one, skip logging
        # instead of failing the whole LLM call with AttributeError.
        if self.task_log is not None:
            self.task_log.log_step("info", "LLM | Message Retention", message)

    messages_copy = [m.copy() for m in messages]

    if keep_tool_result == -1:
        # No processing needed, keep all messages
        return messages_copy

    # Indices of all user/tool messages (initial task + tool results)
    user_indices = [
        i
        for i, msg in enumerate(messages_copy)
        if msg.get("role") in ("user", "tool")
    ]

    if not user_indices:
        log_retention("No user/tool messages found in the history.")
        return messages_copy

    if len(user_indices) == 1:
        # Only the initial task is present, there is nothing to filter
        log_retention("Only 1 user message found (initial task). Keeping it as is.")
        return messages_copy

    # The first user message is the initial task; the rest are tool results
    first_user_idx, *tool_result_indices = user_indices

    # min() also covers keep_tool_result == 0 and negative values, which
    # both end up keeping no tool result at all.
    num_tool_results_to_keep = min(keep_tool_result, len(tool_result_indices))
    tool_result_indices_to_keep = (
        tool_result_indices[-num_tool_results_to_keep:]
        if num_tool_results_to_keep > 0
        else []
    )
    indices_to_keep = [first_user_idx] + tool_result_indices_to_keep

    log_retention(
        f"Message retention summary: Total user/tool messages: {len(user_indices)}, "
        f"Initial task at index: {first_user_idx}, "
        f"Keeping last {num_tool_results_to_keep} tool results at indices: {tool_result_indices_to_keep}, "
        f"Total messages to keep: {len(indices_to_keep)}"
    )

    # Replace content of tool results that should be omitted, preserving
    # the message structure.
    kept = set(indices_to_keep)
    for i in user_indices:
        if i in kept:
            continue
        msg = messages_copy[i]
        if isinstance(msg.get("content"), list):
            # For Anthropic format
            msg["content"] = [{"type": "text", "text": placeholder}]
        else:
            # For OpenAI format
            msg["content"] = placeholder

    return messages_copy
def close(self):
    """Close the client connection as far as that is possible synchronously.

    - If the client has a plain (non-coroutine) ``close``, it is called.
    - If the client's ``close`` is a coroutine function, it cannot be run
      from here. In that case, and also when the client has no ``close`` at
      all, the inner ``_client`` object is closed instead, provided it has a
      plain ``close`` of its own.
    - An inner ``close`` that is itself a coroutine function is never
      called: doing so would only create a coroutine that is never awaited
      and would not close anything.

    Note:
        For async clients, explicit cleanup has to happen in an async
        context (e.g. by awaiting the client's own close coroutine); this
        method is only a best-effort synchronous fallback.
    """
    client = self.client
    has_close = hasattr(client, "close")

    if has_close and not asyncio.iscoroutinefunction(client.close):
        client.close()
        return

    # Fall back to the inner client object, if there is one that can be
    # closed without awaiting.
    inner_close = getattr(getattr(client, "_client", None), "close", None)
    if inner_close is None or asyncio.iscoroutinefunction(inner_close):
        return

    if not has_close:
        # Client exposes no close() of its own: close the inner client and
        # let any failure surface to the caller.
        inner_close()
        return

    # Async outer client: closing the inner client is best-effort cleanup,
    # so a failure here is deliberately ignored.
    try:
        inner_close()
    except Exception:
        pass
def ClientFactory(
    task_id: str, cfg: DictConfig, task_log: Optional[TaskLog] = None, **kwargs
) -> Union[OpenAIClient, AnthropicClient]:
    """
    Create an LLM client based on the provider specified in configuration.

    ``kwargs`` are merged on top of ``cfg`` first, and the provider is then
    read from the merged configuration. This keeps the selected client class
    consistent with the configuration the client actually receives, even when
    ``kwargs`` overrides ``llm.provider``.

    Args:
        task_id: Unique identifier for the current task (used for tracking)
        cfg: Hydra configuration object containing LLM settings
        task_log: Optional logger for recording task execution details
        **kwargs: Additional keyword arguments to merge into configuration

    Returns:
        An instance of the appropriate LLM client (OpenAIClient or AnthropicClient)

    Raises:
        ValueError: If the configured provider has no registered client class.

    Example:
        >>> client = ClientFactory(
        ...     task_id="task_001",
        ...     cfg=cfg,
        ...     task_log=task_log
        ... )
    """
    config = OmegaConf.merge(cfg, kwargs)
    # Read the provider from the merged config, not from the raw cfg, so an
    # override passed through kwargs is honoured.
    provider = config.llm.provider

    # "qwen" is served through the OpenAI-compatible client.
    client_classes = {
        "anthropic": AnthropicClient,
        "qwen": OpenAIClient,
        "openai": OpenAIClient,
    }

    client_cls = client_classes.get(provider)
    if client_cls is None:
        raise ValueError(
            f"Unsupported provider: '{provider}'. "
            f"Supported providers are: {', '.join(sorted(SUPPORTED_PROVIDERS))}"
        )

    return client_cls(task_id=task_id, task_log=task_log, cfg=config)
@dataclasses.dataclass
class AnthropicClient(BaseClient):
    """LLM client for Anthropic's Messages API.

    Sends the system prompt and the most recent user message with ephemeral
    cache control, tracks cumulative and per-call token usage (including
    cache write/read counts), and converts responses into the message-history
    format used by the agent loop.
    """

    def __post_init__(self):
        super().__post_init__()

        # Anthropic-specific token counters
        self.input_tokens: int = 0
        self.output_tokens: int = 0
        self.cache_creation_tokens: int = 0
        self.cache_read_tokens: int = 0

    def _create_client(self) -> Union[AsyncAnthropic, Anthropic]:
        """Create the async or sync Anthropic client, tagging requests with the task id."""
        http_client_args = {"headers": {"x-upstream-session-id": self.task_id}}
        if self.async_client:
            return AsyncAnthropic(
                api_key=self.api_key,
                base_url=self.base_url,
                http_client=DefaultAsyncHttpxClient(**http_client_args),
            )
        else:
            return Anthropic(
                api_key=self.api_key,
                base_url=self.base_url,
                http_client=DefaultHttpxClient(**http_client_args),
            )

    def _update_token_usage(self, usage_data: Any) -> None:
        """Update cumulative token usage and record the usage of the last call.

        Every usage field is normalized with ``or 0`` so that a field that is
        missing or present-but-None counts as zero, both for the cumulative
        counters and for ``self.last_call_tokens``.
        """
        if not usage_data:
            self.task_log.log_step(
                "warning", "LLM | Token Usage", "Warning: No valid usage_data received."
            )
            return

        # Field names follow the usage object returned by the Anthropic API.
        input_tokens = getattr(usage_data, "input_tokens", 0) or 0
        output_tokens = getattr(usage_data, "output_tokens", 0) or 0
        cache_write_tokens = getattr(usage_data, "cache_creation_input_tokens", 0) or 0
        cache_read_tokens = getattr(usage_data, "cache_read_input_tokens", 0) or 0

        self.token_usage["total_cache_write_input_tokens"] += cache_write_tokens
        self.token_usage["total_cache_read_input_tokens"] += cache_read_tokens
        self.token_usage["total_input_tokens"] += input_tokens
        self.token_usage["total_output_tokens"] += output_tokens

        self.task_log.log_step(
            "info",
            "LLM | Token Usage",
            f"Input: {input_tokens}, "
            f"Cache: {cache_write_tokens}+{cache_read_tokens}, "
            f"Output: {output_tokens}",
        )

        # The per-call input figure includes cached tokens; it is what
        # ensure_summary_context uses to estimate the current context size.
        self.last_call_tokens = {
            "input_tokens": input_tokens + cache_write_tokens + cache_read_tokens,
            "output_tokens": output_tokens,
        }

    @retry(wait=wait_fixed(10), stop=stop_after_attempt(5))
    async def _create_message(
        self,
        system_prompt: str,
        messages_history: List[Dict[str, Any]],
        tools_definitions,
        keep_tool_result: int = -1,
    ):
        """
        Send the conversation to the Anthropic API.

        The call is wrapped in a tenacity ``retry`` configured for up to
        5 attempts with a fixed 10 second wait between them.

        :param system_prompt: System prompt string.
        :param messages_history: Message history list.
        :param tools_definitions: Accepted for interface compatibility; not
            read by this method.
        :param keep_tool_result: Passed to
            ``_remove_tool_result_from_messages`` to limit the tool results
            sent to the LLM (-1 = keep all).
        :return: Tuple ``(response, messages_history)`` where
            ``messages_history`` is the original, unfiltered history.
        :raises: Any exception from the API call is logged and re-raised.
        """
        self.task_log.log_step(
            "info",
            "LLM | Call Start",
            f"Calling LLM ({'async' if self.async_client else 'sync'})",
        )

        # Create a filtered copy for sending to LLM (to save tokens)
        # But keep the original messages_history for returning (for complete log)
        messages_for_llm = self._remove_tool_result_from_messages(
            messages_history, keep_tool_result
        )

        # Apply cache control
        processed_messages = self._apply_cache_control(messages_for_llm)

        try:
            # Note: repetition_penalty is not passed to the Anthropic API.
            # top_p / top_k are only sent when they differ from the "unset"
            # sentinels (1.0 and -1 respectively).
            request_kwargs = dict(
                model=self.model_name,
                temperature=self.temperature,
                top_p=self.top_p if self.top_p != 1.0 else NOT_GIVEN,
                top_k=self.top_k if self.top_k != -1 else NOT_GIVEN,
                max_tokens=self.max_tokens,
                system=[
                    {
                        "type": "text",
                        "text": system_prompt,
                        "cache_control": {"type": "ephemeral"},
                    }
                ],
                messages=processed_messages,
                stream=False,
            )
            if self.async_client:
                response = await self.client.messages.create(**request_kwargs)
            else:
                response = self.client.messages.create(**request_kwargs)

            self._update_token_usage(getattr(response, "usage", None))
            self.task_log.log_step(
                "info",
                "LLM | Call Status",
                f"LLM call status: {getattr(response, 'stop_reason', 'N/A')}",
            )

            # Return the original messages_history (not the filtered copy)
            # This ensures that the complete conversation history is preserved in logs
            return response, messages_history
        except asyncio.CancelledError:
            self.task_log.log_step(
                "warning",
                "LLM | Call Cancelled",
                "⚠️ LLM API call was cancelled during execution",
            )
            raise  # Re-raise to allow decorator to log it
        except Exception as e:
            self.task_log.log_step(
                "error", "LLM | Call Failed", f"Anthropic LLM call failed: {str(e)}"
            )
            raise

    def process_llm_response(
        self, llm_response: Any, message_history: List[Dict], agent_type: str = "main"
    ) -> tuple[str, bool, List[Dict]]:
        """Turn an Anthropic response into text and append it to the history.

        Returns:
            ``(assistant_text, should_exit, message_history)``. ``should_exit``
            is True when the response is missing or has no content, in which
            case the history is returned unchanged.
        """
        if not llm_response:
            self.task_log.log_step(
                "error",
                "LLM | Response Processing",
                "❌ LLM call failed, skipping this response.",
            )
            return "", True, message_history

        if not hasattr(llm_response, "content") or not llm_response.content:
            self.task_log.log_step(
                "error",
                "LLM | Response Processing",
                "❌ LLM response is empty or contains no content.",
            )
            return "", True, message_history

        # Extract response content
        assistant_response_text = ""
        assistant_response_content = []

        from ...utils.parsing_utils import fix_server_name_in_text

        for block in llm_response.content:
            if block.type == "text":
                assistant_response_text += block.text + "\n"
                assistant_response_content.append({"type": "text", "text": block.text})
            elif block.type == "tool_use":
                assistant_response_content.append(
                    {
                        "type": "tool_use",
                        "id": block.id,
                        "name": block.name,
                        "input": block.input,
                    }
                )

        # Fix server_name in text content
        assistant_response_text = fix_server_name_in_text(assistant_response_text)
        for item in assistant_response_content:
            if item.get("type") == "text":
                item["text"] = fix_server_name_in_text(item["text"])

        # Add assistant response to history
        message_history.append(
            {"role": "assistant", "content": assistant_response_content}
        )

        self.task_log.log_step(
            "info", "LLM | Response", f"LLM Response: {assistant_response_text}"
        )

        return assistant_response_text, False, message_history

    def extract_tool_calls_info(
        self, llm_response: Any, assistant_response_text: str
    ) -> List[Dict]:
        """Extract tool call information by parsing the assistant text.

        ``llm_response`` is not read; only ``assistant_response_text`` is parsed.
        """
        from ...utils.parsing_utils import parse_llm_response_for_tool_calls

        return parse_llm_response_for_tool_calls(assistant_response_text)

    def update_message_history(
        self, message_history: List[Dict], all_tool_results_content_with_id: List[Tuple]
    ) -> List[Dict]:
        """Append the tool results to the history as a single user message.

        The second element of every tuple is expected to be a content dict;
        the ``text`` of all entries with ``type == "text"`` is joined with
        newlines and stored as one text block.
        """
        merged_text = "\n".join(
            [
                item[1]["text"]
                for item in all_tool_results_content_with_id
                if item[1]["type"] == "text"
            ]
        )

        message_history.append(
            {
                "role": "user",
                "content": [{"type": "text", "text": merged_text}],
            }
        )

        return message_history

    def generate_agent_system_prompt(self, date: Any, mcp_servers: List[Dict]) -> str:
        """Build the MCP system prompt and register its tool/server mapping."""
        from ...utils.parsing_utils import set_tool_server_mapping

        prompt = generate_mcp_system_prompt(date, mcp_servers)
        set_tool_server_mapping(prompt)
        return prompt

    def _estimate_tokens(self, text: str) -> int:
        """Use tiktoken to estimate the number of tokens in text"""
        if not hasattr(self, "encoding"):
            # Initialize tiktoken encoder lazily and cache it on the instance
            try:
                self.encoding = tiktoken.get_encoding("o200k_base")
            except Exception:
                # If o200k_base is not available, use cl100k_base as fallback
                self.encoding = tiktoken.get_encoding("cl100k_base")

        try:
            return len(self.encoding.encode(text))
        except Exception as e:
            # If encoding fails, use simple estimation: approximately 1 token per 4 characters
            self.task_log.log_step(
                "error",
                "LLM | Token Estimation Error",
                f"Error: {str(e)}",
            )
            return len(text) // 4

    def ensure_summary_context(
        self, message_history: list, summary_prompt: str
    ) -> tuple[bool, list]:
        """
        Check if current message_history + summary_prompt will exceed context.

        If it will exceed, remove the trailing user message and then the
        trailing assistant message (when present) and return False.
        Return True to continue, False if messages have been rolled back.
        An empty message_history is handled without raising.
        """
        # Get token usage from the last LLM call
        last_input_tokens = self.last_call_tokens.get("input_tokens", 0)
        last_output_tokens = self.last_call_tokens.get("output_tokens", 0)
        buffer_factor = 1.5

        # Calculate token count for summary prompt
        summary_tokens = int(self._estimate_tokens(str(summary_prompt)) * buffer_factor)

        # Calculate token count for the last user message in message_history
        last_user_tokens = 0
        if message_history and message_history[-1]["role"] == "user":
            content = message_history[-1]["content"]
            last_user_tokens = int(self._estimate_tokens(str(content)) * buffer_factor)

        # Calculate total token count: last input + output + last user message + summary + reserved response space
        estimated_total = (
            last_input_tokens
            + last_output_tokens
            + last_user_tokens
            + summary_tokens
            + self.max_tokens
            + 1000  # Add 1000 tokens as buffer
        )

        if estimated_total >= self.max_context_length:
            self.task_log.log_step(
                "info",
                "LLM | Context Limit Reached",
                "Context limit reached, proceeding to step back and summarize the conversation",
            )

            # Remove the last user message (tool call results)
            if message_history and message_history[-1]["role"] == "user":
                message_history.pop()

            # Remove the second-to-last assistant message (tool call request)
            if message_history and message_history[-1]["role"] == "assistant":
                message_history.pop()

            self.task_log.log_step(
                "info",
                "LLM | Context Limit Reached",
                f"Removed the last assistant-user pair, current message_history length: {len(message_history)}",
            )

            return False, message_history

        self.task_log.log_step(
            "info",
            "LLM | Context Limit Not Reached",
            f"{estimated_total}/{self.max_context_length}",
        )
        return True, message_history

    def format_token_usage_summary(self) -> tuple[List[str], str]:
        """Format token usage statistics, return summary_lines for format_final_summary and log string"""
        token_usage = self.get_token_usage()

        total_input = token_usage.get("total_input_tokens", 0)
        total_output = token_usage.get("total_output_tokens", 0)
        total_cache_creation = token_usage.get("total_cache_write_input_tokens", 0)
        total_cache_read = token_usage.get("total_cache_read_input_tokens", 0)

        summary_lines = []
        summary_lines.append("\n" + "-" * 20 + " Token Usage " + "-" * 20)
        summary_lines.append(f"Total Input Tokens (non-cache): {total_input}")
        summary_lines.append(
            f"Total Cache Creation Input Tokens: {total_cache_creation}"
        )
        summary_lines.append(f"Total Cache Read Input Tokens: {total_cache_read}")
        summary_lines.append(f"Total Output Tokens: {total_output}")
        summary_lines.append("-" * (40 + len(" Token Usage ")))
        summary_lines.append("Pricing is disabled - no cost information available")
        summary_lines.append("-" * (40 + len(" Token Usage ")))

        # Generate log string
        log_string = (
            f"[{self.model_name}] Total Input: {total_input}, "
            f"Cache Creation: {total_cache_creation}, "
            f"Cache Read: {total_cache_read}, "
            f"Output: {total_output}"
        )

        return summary_lines, log_string

    def get_token_usage(self):
        """Return a shallow copy of the cumulative token usage counters."""
        return self.token_usage.copy()

    def _apply_cache_control(self, messages: List[Dict]) -> List[Dict]:
        """Return the messages with ephemeral cache control on the last user turn.

        Only the most recent user message is rewritten: its first non-empty
        text item receives ``cache_control``; its other items are copied.
        All other messages are passed through as-is. The input message dicts
        are never modified; string content is normalized into a one-item
        list on the rewritten copy only.
        """
        cached_messages = []
        user_turns_processed = 0

        # Walk backwards so the first user turn encountered is the latest one.
        for turn in reversed(messages):
            if turn["role"] == "user" and user_turns_processed < 1:
                content = turn["content"]
                # Normalize plain-string content locally instead of mutating
                # the caller's message dict.
                if isinstance(content, str):
                    content = [{"type": "text", "text": content}]

                if isinstance(content, list):
                    # see example here
                    # https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching
                    new_content = []
                    processed_text = False
                    for item in content:
                        if (
                            item.get("type") == "text"
                            and item.get("text")  # skips empty or missing text
                            and not processed_text
                        ):
                            # Copy and add cache control
                            text_item = item.copy()
                            text_item["cache_control"] = {"type": "ephemeral"}
                            new_content.append(text_item)
                            processed_text = True
                        else:
                            # Other types of content (like image) copied directly
                            new_content.append(item.copy())
                    cached_messages.append({"role": "user", "content": new_content})
                else:
                    # Content is neither a string nor a list: pass it through
                    # without cache control.
                    self.task_log.log_step(
                        "warning",
                        "LLM | Cache Control",
                        "Warning: User message content is not in expected list format, cache control not applied.",
                    )
                    cached_messages.append(turn)
                user_turns_processed += 1
            else:
                # Add other messages directly
                cached_messages.append(turn)

        return list(reversed(cached_messages))
@dataclasses.dataclass
class OpenAIClient(BaseClient):
    """LLM client for OpenAI and OpenAI-compatible chat completion endpoints.

    Requests are retried up to 10 times with a fixed 30 second wait between
    attempts. A response truncated by the length limit is retried with a 10%
    larger token budget, and a response whose last 50 characters repeat more
    than 5 times is retried as well.
    """

    def _create_client(self) -> Union[AsyncOpenAI, OpenAI]:
        """Create the async or sync OpenAI client, tagging requests with the task id."""
        http_client_args = {"headers": {"x-upstream-session-id": self.task_id}}
        if self.async_client:
            return AsyncOpenAI(
                api_key=self.api_key,
                base_url=self.base_url,
                http_client=DefaultAsyncHttpxClient(**http_client_args),
            )
        else:
            return OpenAI(
                api_key=self.api_key,
                base_url=self.base_url,
                http_client=DefaultHttpxClient(**http_client_args),
            )

    def _update_token_usage(self, usage_data: Any) -> None:
        """Update cumulative token usage and record the usage of the last call.

        Missing or None counters are treated as zero. Nothing is recorded
        when ``usage_data`` is falsy.
        """
        if usage_data:
            input_tokens = getattr(usage_data, "prompt_tokens", 0) or 0
            output_tokens = getattr(usage_data, "completion_tokens", 0) or 0
            prompt_tokens_details = getattr(usage_data, "prompt_tokens_details", None)
            if prompt_tokens_details:
                cached_tokens = (
                    getattr(prompt_tokens_details, "cached_tokens", None) or 0
                )
            else:
                cached_tokens = 0

            # Record token usage for the most recent call
            self.last_call_tokens = {
                "prompt_tokens": input_tokens,
                "completion_tokens": output_tokens,
            }

            # Only a cache-read count is taken from the usage data here;
            # there is no cache-creation counter to update.
            self.token_usage["total_input_tokens"] += input_tokens
            self.token_usage["total_output_tokens"] += output_tokens
            self.token_usage["total_cache_read_input_tokens"] += cached_tokens

            # Note: the logged figures are the cumulative totals.
            self.task_log.log_step(
                "info",
                "LLM | Token Usage",
                f"Input: {self.token_usage['total_input_tokens']}, "
                f"Output: {self.token_usage['total_output_tokens']}",
            )

    async def _create_message(
        self,
        system_prompt: str,
        messages_history: List[Dict[str, Any]],
        tools_definitions,
        keep_tool_result: int = -1,
    ):
        """
        Send the conversation to the chat completions API with retries.

        :param system_prompt: System prompt string.
        :param messages_history: Message history list.
        :param tools_definitions: Accepted for interface compatibility; not
            read by this method.
        :param keep_tool_result: Passed to
            ``_remove_tool_result_from_messages`` to limit the tool results
            sent to the LLM (-1 = keep all).
        :return: Tuple ``(response, messages_history)`` where
            ``messages_history`` is the original, unfiltered history.
        :raises: The last error once all retries are exhausted; a
            context-length 400 error and ``asyncio.CancelledError`` are
            re-raised immediately without retrying.
        """
        # Create a copy for sending to LLM (to avoid modifying the original)
        messages_for_llm = [m.copy() for m in messages_history]

        # The request built below has no separate system-prompt field, so the
        # system prompt is sent as the first message.
        if system_prompt:
            # Check if there's already a system or developer message
            if messages_for_llm and messages_for_llm[0]["role"] in [
                "system",
                "developer",
            ]:
                messages_for_llm[0] = {
                    "role": "system",
                    "content": system_prompt,
                }
            else:
                messages_for_llm.insert(
                    0,
                    {
                        "role": "system",
                        "content": system_prompt,
                    },
                )

        # Filter tool results to save tokens (only affects messages sent to LLM)
        messages_for_llm = self._remove_tool_result_from_messages(
            messages_for_llm, keep_tool_result
        )

        # Retry loop with dynamic max_tokens adjustment
        max_retries = 10
        base_wait_time = 30
        current_max_tokens = self.max_tokens

        for attempt in range(max_retries):
            params = {
                "model": self.model_name,
                "temperature": self.temperature,
                "messages": messages_for_llm,
                "stream": False,
                "top_p": self.top_p,
                "extra_body": {},
            }
            # Check if the model is GPT-5, and adjust the parameter accordingly
            if "gpt-5" in self.model_name:
                # Use 'max_completion_tokens' for GPT-5
                params["max_completion_tokens"] = current_max_tokens
            else:
                # Use 'max_tokens' for GPT-4 and other models
                params["max_tokens"] = current_max_tokens

            # Add repetition_penalty if it's not the default value
            if self.repetition_penalty != 1.0:
                params["extra_body"]["repetition_penalty"] = self.repetition_penalty

            if "deepseek-v3-1" in self.model_name:
                params["extra_body"]["thinking"] = {"type": "enabled"}

            # auto-detect if we need to continue from the last assistant message
            if messages_for_llm and messages_for_llm[-1].get("role") == "assistant":
                params["extra_body"]["continue_final_message"] = True
                params["extra_body"]["add_generation_prompt"] = False

            try:
                if self.async_client:
                    response = await self.client.chat.completions.create(**params)
                else:
                    response = self.client.chat.completions.create(**params)
                # Update token count
                self._update_token_usage(getattr(response, "usage", None))
                self.task_log.log_step(
                    "info",
                    "LLM | Response Status",
                    f"{getattr(response.choices[0], 'finish_reason', 'N/A')}",
                )

                # Check if response was truncated due to length limit
                finish_reason = getattr(response.choices[0], "finish_reason", None)
                if finish_reason == "length":
                    # If this is not the last retry, increase max_tokens and retry
                    if attempt < max_retries - 1:
                        # Increase max_tokens by 10%
                        current_max_tokens = int(current_max_tokens * 1.1)
                        self.task_log.log_step(
                            "warning",
                            "LLM | Length Limit Reached",
                            f"Response was truncated due to length limit (attempt {attempt + 1}/{max_retries}). Increasing max_tokens to {current_max_tokens} and retrying...",
                        )
                        await asyncio.sleep(base_wait_time)
                        continue
                    else:
                        # Last retry, return the truncated response instead of raising exception
                        self.task_log.log_step(
                            "warning",
                            "LLM | Length Limit Reached - Returning Truncated Response",
                            f"Response was truncated after {max_retries} attempts. Returning truncated response to allow ReAct loop to continue.",
                        )
                        # Return the truncated response and let the orchestrator handle it
                        return response, messages_history

                # Check if the last 50 characters of the response appear more than 5 times in the response content.
                # If so, treat it as a severe repeat and trigger a retry.
                if hasattr(response.choices[0], "message") and hasattr(
                    response.choices[0].message, "content"
                ):
                    resp_content = response.choices[0].message.content or ""
                else:
                    resp_content = getattr(response.choices[0], "text", "")

                if resp_content and len(resp_content) >= 50:
                    tail_50 = resp_content[-50:]
                    repeat_count = resp_content.count(tail_50)
                    if repeat_count > 5:
                        # If this is not the last retry, retry
                        if attempt < max_retries - 1:
                            self.task_log.log_step(
                                "warning",
                                "LLM | Repeat Detected",
                                f"Severe repeat: the last 50 chars appeared over 5 times (attempt {attempt + 1}/{max_retries}), retrying...",
                            )
                            await asyncio.sleep(base_wait_time)
                            continue
                        else:
                            # Last retry, return anyway
                            self.task_log.log_step(
                                "warning",
                                "LLM | Repeat Detected - Returning Anyway",
                                f"Severe repeat detected after {max_retries} attempts. Returning response anyway.",
                            )

                # Success - return the original messages_history (not the filtered copy)
                # This ensures that the complete conversation history is preserved in logs
                return response, messages_history

            except asyncio.TimeoutError as e:
                if attempt < max_retries - 1:
                    self.task_log.log_step(
                        "warning",
                        "LLM | Timeout Error",
                        f"Timeout error (attempt {attempt + 1}/{max_retries}): {str(e)}, retrying...",
                    )
                    await asyncio.sleep(base_wait_time)
                    continue
                else:
                    self.task_log.log_step(
                        "error",
                        "LLM | Timeout Error",
                        f"Timeout error after {max_retries} attempts: {str(e)}",
                    )
                    raise
            except asyncio.CancelledError as e:
                self.task_log.log_step(
                    "error",
                    "LLM | Request Cancelled",
                    f"Request was cancelled: {str(e)}",
                )
                raise
            except Exception as e:
                # A context-length rejection cannot succeed on retry with the
                # same messages, so it is surfaced immediately.
                if "Error code: 400" in str(e) and "longer than the model" in str(e):
                    self.task_log.log_step(
                        "error",
                        "LLM | Context Length Error",
                        f"Error: {str(e)}",
                    )
                    raise
                else:
                    if attempt < max_retries - 1:
                        self.task_log.log_step(
                            "warning",
                            "LLM | API Error",
                            f"Error (attempt {attempt + 1}/{max_retries}): {str(e)}, retrying...",
                        )
                        await asyncio.sleep(base_wait_time)
                        continue
                    else:
                        self.task_log.log_step(
                            "error",
                            "LLM | API Error",
                            f"Error after {max_retries} attempts: {str(e)}",
                        )
                        raise

        # Should never reach here, but just in case
        raise Exception("Unexpected error: retry loop completed without returning")

    def process_llm_response(
        self, llm_response: Any, message_history: List[Dict], agent_type: str = "main"
    ) -> tuple[str, bool, List[Dict]]:
        """Turn a chat completion into text and append it to the history.

        Returns:
            ``(assistant_text, should_exit, message_history)``. ``should_exit``
            is True when there is no usable response, or when a truncated
            response contains "Context length exceeded".

        Raises:
            ValueError: If the finish reason is neither "stop" nor "length".
        """
        if not llm_response or not llm_response.choices:
            error_msg = "LLM did not return a valid response."
            self.task_log.log_step(
                "error", "LLM | Response Error", f"Error: {error_msg}"
            )
            return "", True, message_history  # Exit loop, return message_history

        # Extract LLM response text
        from ...utils.parsing_utils import fix_server_name_in_text

        if llm_response.choices[0].finish_reason == "stop":
            assistant_response_text = llm_response.choices[0].message.content or ""
            assistant_response_text = fix_server_name_in_text(assistant_response_text)

            message_history.append(
                {"role": "assistant", "content": assistant_response_text}
            )

        elif llm_response.choices[0].finish_reason == "length":
            assistant_response_text = llm_response.choices[0].message.content or ""
            assistant_response_text = fix_server_name_in_text(assistant_response_text)
            if assistant_response_text == "":
                assistant_response_text = "LLM response is empty."
            elif "Context length exceeded" in assistant_response_text:
                # This is the case where context length is exceeded, needs special handling
                self.task_log.log_step(
                    "warning",
                    "LLM | Context Length",
                    "Detected context length exceeded, returning error status",
                )
                message_history.append(
                    {"role": "assistant", "content": assistant_response_text}
                )
                return (
                    assistant_response_text,
                    True,
                    message_history,
                )  # Return True to indicate need to exit loop

            # Add assistant response to history
            message_history.append(
                {"role": "assistant", "content": assistant_response_text}
            )

        else:
            raise ValueError(
                f"Unsupported finish reason: {llm_response.choices[0].finish_reason}"
            )

        return assistant_response_text, False, message_history

    def extract_tool_calls_info(
        self, llm_response: Any, assistant_response_text: str
    ) -> List[Dict]:
        """Extract tool call information by parsing the assistant text.

        ``llm_response`` is not read; only ``assistant_response_text`` is parsed.
        """
        from ...utils.parsing_utils import parse_llm_response_for_tool_calls

        return parse_llm_response_for_tool_calls(assistant_response_text)

    def update_message_history(
        self, message_history: List[Dict], all_tool_results_content_with_id: List[Tuple]
    ) -> List[Dict]:
        """Append the tool results to the history as a single user message.

        The second element of every tuple is expected to be a content dict;
        the ``text`` of all entries with ``type == "text"`` is joined with
        newlines and stored as a plain string.
        """
        merged_text = "\n".join(
            [
                item[1]["text"]
                for item in all_tool_results_content_with_id
                if item[1]["type"] == "text"
            ]
        )

        message_history.append(
            {
                "role": "user",
                "content": merged_text,
            }
        )

        return message_history

    def generate_agent_system_prompt(self, date: Any, mcp_servers: List[Dict]) -> str:
        """Build the MCP system prompt and register its tool/server mapping."""
        from ...utils.parsing_utils import set_tool_server_mapping

        prompt = generate_mcp_system_prompt(date, mcp_servers)
        set_tool_server_mapping(prompt)
        return prompt

    def _estimate_tokens(self, text: str) -> int:
        """Use tiktoken to estimate the number of tokens in text"""
        if not hasattr(self, "encoding"):
            # Initialize tiktoken encoder lazily and cache it on the instance
            try:
                self.encoding = tiktoken.get_encoding("o200k_base")
            except Exception:
                # If o200k_base is not available, use cl100k_base as fallback
                self.encoding = tiktoken.get_encoding("cl100k_base")

        try:
            return len(self.encoding.encode(text))
        except Exception as e:
            # If encoding fails, use simple estimation: approximately 1 token per 4 characters
            self.task_log.log_step(
                "error",
                "LLM | Token Estimation Error",
                f"Error: {str(e)}",
            )
            return len(text) // 4

    def ensure_summary_context(
        self, message_history: list, summary_prompt: str
    ) -> tuple[bool, list]:
        """
        Check if current message_history + summary_prompt will exceed context.

        If it will exceed, remove the trailing user message and then the
        trailing assistant message (when present) and return False.
        Return True to continue, False if messages have been rolled back.
        An empty message_history is handled without raising.
        """
        # Get token usage from the last LLM call
        last_prompt_tokens = self.last_call_tokens.get("prompt_tokens", 0)
        last_completion_tokens = self.last_call_tokens.get("completion_tokens", 0)
        buffer_factor = 1.5

        # Calculate token count for summary prompt (str() matches the
        # Anthropic client and keeps non-string prompts on the tiktoken path)
        summary_tokens = int(self._estimate_tokens(str(summary_prompt)) * buffer_factor)

        # Calculate token count for the last user message in message_history
        last_user_tokens = 0
        if message_history and message_history[-1]["role"] == "user":
            content = message_history[-1]["content"]
            last_user_tokens = int(self._estimate_tokens(str(content)) * buffer_factor)

        # Calculate total token count: last prompt + completion + last user message + summary + reserved response space
        estimated_total = (
            last_prompt_tokens
            + last_completion_tokens
            + last_user_tokens
            + summary_tokens
            + self.max_tokens
            + 1000  # Add 1000 tokens as buffer
        )

        if estimated_total >= self.max_context_length:
            self.task_log.log_step(
                "info",
                "LLM | Context Limit Reached",
                "Context limit reached, proceeding to step back and summarize the conversation",
            )

            # Remove the last user message (tool call results)
            if message_history and message_history[-1]["role"] == "user":
                message_history.pop()

            # Remove the second-to-last assistant message (tool call request)
            if message_history and message_history[-1]["role"] == "assistant":
                message_history.pop()

            self.task_log.log_step(
                "info",
                "LLM | Context Limit Reached",
                f"Removed the last assistant-user pair, current message_history length: {len(message_history)}",
            )

            return False, message_history

        self.task_log.log_step(
            "info",
            "LLM | Context Limit Not Reached",
            f"{estimated_total}/{self.max_context_length}",
        )
        return True, message_history

    def format_token_usage_summary(self) -> tuple[List[str], str]:
        """Format token usage statistics, return summary_lines for format_final_summary and log string"""
        token_usage = self.get_token_usage()

        total_input = token_usage.get("total_input_tokens", 0)
        total_output = token_usage.get("total_output_tokens", 0)
        # _update_token_usage accumulates cached tokens under
        # "total_cache_read_input_tokens"; the older key is kept as a fallback
        # in case the usage dict was populated elsewhere.
        cache_input = token_usage.get(
            "total_cache_read_input_tokens",
            token_usage.get("total_cache_input_tokens", 0),
        )

        summary_lines = []
        summary_lines.append("\n" + "-" * 20 + " Token Usage " + "-" * 20)
        summary_lines.append(f"Total Input Tokens: {total_input}")
        summary_lines.append(f"Total Cache Input Tokens: {cache_input}")
        summary_lines.append(f"Total Output Tokens: {total_output}")
        summary_lines.append("-" * (40 + len(" Token Usage ")))
        summary_lines.append("Pricing is disabled - no cost information available")
        summary_lines.append("-" * (40 + len(" Token Usage ")))

        # Generate log string
        log_string = (
            f"[{self.model_name}] Total Input: {total_input}, "
            f"Cache Input: {cache_input}, "
            f"Output: {total_output}"
        )

        return summary_lines, log_string

    def get_token_usage(self):
        """Return a shallow copy of the cumulative token usage counters."""
        return self.token_usage.copy()
def with_timeout(
    timeout_s: float = 300.0,
) -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:
    """
    Build a decorator that bounds an *async* function with asyncio.wait_for().

    Each call of the decorated function creates the coroutine and awaits it
    through ``asyncio.wait_for`` with ``timeout_s`` seconds as the limit; the
    timeout error raised by ``wait_for`` is not caught here.

    Usage:
        @with_timeout(20)
        async def create_message_foo(...): ...
    """

    def decorator(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
        async def wrapper(*args, **kwargs) -> T:
            pending = func(*args, **kwargs)
            return await asyncio.wait_for(pending, timeout=timeout_s)

        # Carry over __name__, __doc__ etc. from the wrapped function.
        return functools.wraps(func)(wrapper)

    return decorator
def _get_summary_template():
    """Return a fresh, zeroed accumulator for one group of task results.

    The ``defaultdict(float)`` members let callers add timings under keys that
    have not been seen before without initialising them first.
    """
    return {
        "total_tasks": 0,
        "total_wall_time": 0.0,
        "primary_breakdown": {
            "main_agent": defaultdict(float),
            "browsing_agent": defaultdict(float),
        },
        "cross_cutting_breakdown": defaultdict(float),
        "tool_workload_breakdown": defaultdict(float),
    }


def _update_summary_data(summary_block, perf_summary, tool_workload):
    """Add one task's timings to ``summary_block`` in place.

    Args:
        summary_block: Accumulator created by ``_get_summary_template``.
        perf_summary: The task's ``performance_summary`` mapping.
        tool_workload: The task's ``tool_workload_breakdown`` mapping.
    """
    summary_block["total_tasks"] += 1
    summary_block["total_wall_time"] += perf_summary.get("total_wall_time", 0.0)

    # Only agents pre-declared in the template are tracked; any other agent
    # name found in the task data is ignored.
    primary_breakdown = perf_summary.get("primary_breakdown", {})
    for agent, data in primary_breakdown.items():
        if agent in summary_block["primary_breakdown"]:
            for key, value in data.items():
                summary_block["primary_breakdown"][agent][key] += value

    cross_cutting_breakdown = perf_summary.get("cross_cutting_breakdown", {})
    for key, value in cross_cutting_breakdown.items():
        summary_block["cross_cutting_breakdown"][key] += value

    for key, value in tool_workload.items():
        summary_block["tool_workload_breakdown"][key] += value


def _calculate_averages(summary_block):
    """Add per-task averages to ``summary_block`` in place.

    Every accumulated total ``k`` gains a sibling ``avg_k`` and the
    ``defaultdict`` accumulators are replaced by plain dicts. A block with no
    tasks is left untouched, so no division by zero can occur.
    """
    num_tasks = summary_block["total_tasks"]
    if num_tasks == 0:
        return

    summary_block["average_wall_time"] = summary_block["total_wall_time"] / num_tasks

    for agent, data in summary_block["primary_breakdown"].items():
        # Averages are computed from the totals only, then merged in.
        avg_data = {f"avg_{k}": v / num_tasks for k, v in data.items()}
        summary_block["primary_breakdown"][agent] = dict(data)
        summary_block["primary_breakdown"][agent].update(avg_data)

    for section in ("cross_cutting_breakdown", "tool_workload_breakdown"):
        totals = dict(summary_block[section])
        averages = {f"avg_{k}": v / num_tasks for k, v in totals.items()}
        totals.update(averages)
        summary_block[section] = totals


def generate_summary(log_dir: Path):
    """
    Generates a summary of benchmark results by reading log files from a
    directory, calculating total and average trace data, both overall and
    grouped by final_judge_result.

    Files that cannot be read or parsed, files whose top-level JSON value is
    not an object, and results without ``trace_data.performance_summary`` are
    skipped. The result is written to ``<log_dir>/summary_time_cost.json``.

    Args:
        log_dir: The directory where the individual result log files are and
            where the summary file will be saved.
    """
    # Resolve the logger by name at call time. The module-level name imported
    # from task_logger is a snapshot that is None until bootstrap_logger() has
    # run, which would turn a mere warning into an AttributeError.
    import logging

    log = logging.getLogger("miroflow_agent")

    summary_file = log_dir / "summary_time_cost.json"
    # Never feed a summary file (including our own output from an earlier
    # run) back in as if it were a task result.
    skipped_names = {"summary.json", summary_file.name}

    results = []
    for log_file in log_dir.glob("*.json"):
        if log_file.name in skipped_names:
            continue
        try:
            with open(log_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except json.JSONDecodeError:
            log.info(f"Warning: Could not decode JSON from {log_file}. Skipping.")
            continue
        except Exception as e:
            log.info(f"Warning: Could not read file {log_file}: {e}. Skipping.")
            continue
        if isinstance(loaded, dict):
            results.append(loaded)

    overall_summary = _get_summary_template()
    summary_by_judge = defaultdict(_get_summary_template)

    for result in results:
        trace_data = result.get("trace_data")
        if not trace_data or "performance_summary" not in trace_data:
            continue

        perf_summary = trace_data["performance_summary"]
        tool_workload = trace_data.get("tool_workload_breakdown", {})

        _update_summary_data(overall_summary, perf_summary, tool_workload)

        judge_result = result.get("final_judge_result", "unknown")
        _update_summary_data(
            summary_by_judge[judge_result], perf_summary, tool_workload
        )

    _calculate_averages(overall_summary)
    for judge_block in summary_by_judge.values():
        _calculate_averages(judge_block)

    summary_data = {
        "overall_summary": overall_summary,
        "summary_by_final_judge_result": dict(summary_by_judge),
    }

    with open(summary_file, "w", encoding="utf-8") as f:
        json.dump(summary_data, f, indent=4, ensure_ascii=False)
def get_color_for_level(level: str) -> str:
    """Return the bright colorama prefix used to print the given level name.

    ``ERROR``, ``WARNING``, ``INFO`` and ``DEBUG`` map to red, yellow, green
    and cyan respectively; any other level name falls back to white.
    """
    foreground_by_level = {
        "ERROR": Fore.RED,
        "WARNING": Fore.YELLOW,
        "INFO": Fore.GREEN,
        "DEBUG": Fore.CYAN,
    }
    foreground = foreground_by_level.get(level, Fore.WHITE)
    return f"{foreground}{Style.BRIGHT}"
@dataclass
class TaskLog:
    """Mutable record of one task run: status, answers, histories and steps.

    Steps are appended through ``log_step`` (which also echoes them to the
    console logger) and the whole record can be written to a single JSON file
    with ``save``.
    """

    status: str = "running"
    start_time: str = ""
    end_time: str = ""

    task_id: str = ""
    input: Any = None
    ground_truth: str = ""
    final_boxed_answer: str = ""
    final_judge_result: str = ""
    judge_type: str = ""
    eval_details: Optional[Dict[str, Any]] = None  # For DeepSearchQA metrics
    error: str = ""

    # Main records: main agent conversation turns
    current_main_turn_id: int = 0
    current_sub_agent_turn_id: int = 0
    sub_agent_counter: int = 0
    current_sub_agent_session_id: Optional[str] = None

    env_info: Optional[dict] = field(default_factory=dict)
    log_dir: str = "logs"

    main_agent_message_history: List[Dict[str, Any]] = field(default_factory=list)
    sub_agent_message_history_sessions: Dict[str, List[Dict[str, Any]]] = field(
        default_factory=dict
    )

    step_logs: List[StepLog] = field(default_factory=list)
    trace_data: Dict[str, Any] = field(default_factory=dict)

    def start_sub_agent_session(
        self, sub_agent_name: str, subtask_description: str
    ) -> str:
        """Start a new sub-agent session and return its id.

        The id is ``"<sub_agent_name>_<n>"`` where ``n`` is a per-task counter
        incremented on every call. The start is also recorded as a step, with
        the subtask text truncated to 100 characters in the message.
        """
        self.sub_agent_counter += 1
        session_id = f"{sub_agent_name}_{self.sub_agent_counter}"
        self.current_sub_agent_session_id = session_id

        # Record sub-agent session start
        self.log_step(
            "info",
            f"{sub_agent_name} | Session Start",
            f"Starting {session_id} for subtask: {subtask_description[:100]}{'...' if len(subtask_description) > 100 else ''}",
            metadata={"session_id": session_id, "subtask": subtask_description},
        )

        return session_id

    def end_sub_agent_session(self, sub_agent_name: str) -> Optional[str]:
        """Record the end of the current sub-agent session and clear its id.

        Always returns ``None``.
        """
        self.log_step(
            "info",
            f"{sub_agent_name} | Session End",
            f"Ending {self.current_sub_agent_session_id}",
            metadata={"session_id": self.current_sub_agent_session_id},
        )
        self.current_sub_agent_session_id = None
        return None

    def log_step(
        self,
        info_level: Literal["info", "warning", "error", "debug"],
        step_name: str,
        message: str,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """Append a step to ``step_logs`` and echo it to the console logger.

        Args:
            info_level: Severity; also selects the logger method used.
            step_name: Short label; an icon is prefixed based on its content.
            message: Human-readable description of the step.
            metadata: Optional extra data stored with the step.
        """
        # Add icons to step_name based on content. The first matching rule
        # wins, so the order of these checks matters.
        icon = ""
        if "Tool Call Start" in step_name:
            icon = "▶️ "
        elif "Tool Call Success" in step_name:
            icon = "✅ "
        elif "Tool Call Error" in step_name or (
            "error" in info_level and "tool" in step_name.lower()
        ):
            icon = "❌ "
        elif "agent-" in step_name:
            icon = "🤖 "
        elif "Main Agent" in step_name:
            icon = "👑 "
        elif "LLM" in step_name:
            icon = "🧠 "
        elif "ToolManager" in step_name or "Tool Call" in step_name:
            icon = "🔧 "
        elif "tool-python" in step_name.lower():
            icon = "🐍 "
        elif "tool-google-search" in step_name.lower():
            icon = "🔍 "
        elif "tool-browser" in step_name.lower() or "playwright" in step_name.lower():
            icon = "🌐 "

        # Add icon to step_name
        step_name_with_icon = f"{icon}{step_name}"

        step_log = StepLog(
            step_name=step_name_with_icon,
            message=message,
            timestamp=get_utc_plus_8_time(),
            info_level=info_level,
            metadata=metadata or {},
        )
        self.step_logs.append(step_log)

        # Print the structured log to console using the configured logger
        log_message = f"{step_name_with_icon}: {message}"

        # Ensure logger is configured
        global logger
        if logger is None:
            logger = bootstrap_logger()

        if info_level == "error":
            logger.error(log_message)
        elif info_level == "warning":
            logger.warning(log_message)
        elif info_level == "debug":
            logger.debug(log_message)
        else:  # info
            logger.info(log_message)

    def serialize_for_json(self, obj):
        """Convert objects to JSON-serializable format.

        ``Path`` values become strings, dicts and lists are processed
        recursively, objects exposing ``__dict__`` are replaced by their
        attribute dict, and everything else is returned unchanged.
        """
        if isinstance(obj, Path):
            return str(obj)
        elif isinstance(obj, dict):
            return {k: self.serialize_for_json(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [self.serialize_for_json(item) for item in obj]
        elif hasattr(obj, "__dict__"):
            return self.serialize_for_json(obj.__dict__)
        else:
            return obj

    def _serializable_dict(self) -> Dict[str, Any]:
        """Return this log as a plain dict that is ready for ``json.dumps``."""
        return self.serialize_for_json(asdict(self))

    def to_json(self) -> str:
        """
        Serialize the TaskLog to a JSON string.

        Converts the dataclass to a dictionary, handles non-JSON-serializable
        objects (like Path), and returns a formatted JSON string.

        Returns:
            A JSON string representation of the task log with 2-space indentation.

        Note:
            Falls back to ASCII encoding if Unicode encoding fails.
        """
        serialized_dict = self._serializable_dict()

        try:
            return json.dumps(serialized_dict, ensure_ascii=False, indent=2)
        except UnicodeEncodeError as e:
            # Fallback: try with ASCII encoding if Unicode fails
            print(f"Warning: Unicode encoding failed, falling back to ASCII: {e}")
            return json.dumps(serialized_dict, ensure_ascii=True, indent=2)

    def save(self):
        """Save as a single JSON file and return its path.

        The file is ``<log_dir>/task_<task_id>_<start_time>.json`` with ``:``,
        ``.`` and spaces in the start time replaced by ``-``. It is written as
        UTF-8; if the text holds characters UTF-8 cannot encode (for example
        lone surrogates), the file is rewritten as ASCII-escaped JSON, which
        decodes back to the same data.
        """
        os.makedirs(self.log_dir, exist_ok=True)
        timestamp = (
            self.start_time.replace(":", "-").replace(".", "-").replace(" ", "-")
        )

        filename = f"{self.log_dir}/task_{self.task_id}_{timestamp}.json"
        try:
            with open(filename, "w", encoding="utf-8") as f:
                f.write(self.to_json())
        except UnicodeEncodeError as e:
            # Writing the same text with another codec would fail the same way
            # and leave a truncated file behind. Escaping every non-ASCII
            # character makes the payload encodable; reopening with "w"
            # discards anything written by the failed attempt.
            print(
                f"Warning: UTF-8 encoding failed, writing ASCII-escaped JSON instead: {e}"
            )
            escaped = json.dumps(
                self._serializable_dict(), ensure_ascii=True, indent=2
            )
            with open(filename, "w", encoding="utf-8") as f:
                f.write(escaped)

        return filename

    @classmethod
    def from_dict(cls, d: dict) -> "TaskLog":
        """
        Create a TaskLog instance from a dictionary.

        Args:
            d: Dictionary containing TaskLog field values.

        Returns:
            A new TaskLog instance initialized with the dictionary values.

        Note:
            The dictionary keys should match the TaskLog field names. Values
            are passed to the constructor unchanged, so nested entries such as
            step logs stay in whatever form the dictionary holds them.
        """
        return cls(**d)
def parse_tool_server_mapping(system_prompt: str) -> dict:
    """
    Build a tool_name → server_name lookup from the MCP section of a prompt.

    The prompt is scanned line by line for headings of the form:
        ## Server name: tool-python
        ### Tool name: run_python_code

    A tool is attributed to the most recent server heading above it. Only the
    3 target tools that models commonly get wrong are kept:
    run_python_code, google_search, scrape_and_extract_info.

    Args:
        system_prompt: The system prompt containing MCP tool definitions

    Returns:
        Dict mapping tool_name to correct server_name, e.g.
        {"run_python_code": "tool-python", "google_search": "search_and_scrape_webpage", ...}
    """
    wanted_tools = {"run_python_code", "google_search", "scrape_and_extract_info"}
    server_heading = r"## Server name:\s*(.+)"
    tool_heading = r"### Tool name:\s*(.+)"

    tool_to_server = {}
    active_server = None

    for raw_line in system_prompt.split("\n"):
        server_hit = re.match(server_heading, raw_line)
        if server_hit is not None:
            active_server = server_hit.group(1).strip()
            continue

        tool_hit = re.match(tool_heading, raw_line)
        # A tool heading seen before any (non-empty) server heading is ignored.
        if tool_hit is None or not active_server:
            continue

        name = tool_hit.group(1).strip()
        if name in wanted_tools:
            tool_to_server[name] = active_server

    return tool_to_server
def filter_none_values(arguments: Union[Dict, Any]) -> Union[Dict, Any]:
    """
    Drop every top-level entry whose value is None.

    Args:
        arguments: A dictionary to filter, or any other value

    Returns:
        A new dictionary without the None-valued keys (nested values are not
        inspected), or the original value untouched if it is not a dict
    """
    if isinstance(arguments, dict):
        kept = {}
        for key, value in arguments.items():
            # Only None is removed; falsy values such as 0 or "" are kept.
            if value is not None:
                kept[key] = value
        return kept
    return arguments
""" fixed_str = json_str # Fix backslashes that are not part of valid escape sequences # Valid JSON escape sequences: \\, \", \/, \b, \f, \n, \r, \t, \uXXXX # Pattern: backslash not followed by a valid escape character # This regex matches \ followed by anything except valid escape chars # But we need to be careful not to match already-escaped backslashes (\\) # Strategy: Find all backslashes, but skip those that are: # 1. Already escaped (\\) # 2. Part of valid escape sequences (\", \/, \b, \f, \n, \r, \t, \u) # More conservative approach: Only fix backslashes before uppercase letters # (common in Windows paths) and other clearly problematic patterns # This avoids breaking valid JSON escape sequences # Fix backslashes before uppercase letters (Windows paths like C:\Users) fixed_str = re.sub( r"(? Dict[str, Any]: """ Safely parse a JSON string with multiple fallbacks. Parsing strategy: 1. Try standard json.loads() 2. If it fails, try json_repair to fix common issues 3. If all attempts fail, return an error object Args: arguments_str: JSON string to parse Returns: Parsed dictionary, or error dict with 'error' and 'raw' keys """ # Step 1: Try standard JSON parsing try: return json.loads(arguments_str) except json.JSONDecodeError: pass # Step 2: Try json_repair to fix common issues try: repaired = repair_json(arguments_str, ensure_ascii=False) return json.loads(repaired) except Exception: logger.warning(f"Unable to parse JSON: {arguments_str}") # Step 3: Give up and return error information return { "error": "Failed to parse arguments", "raw": arguments_str, } def extract_failure_experience_summary(text: str) -> str: """ Extract failure experience summary from LLM response text. The text may contain: - ... block (thinking content) - Main content after
and before - ... block (tool call, ignored) Examples: "\n{xxx}\n\n\n{content}\n\n..." "\n{xxx}\n\n\n{content}" "{content}" (no think block) Returns: - If content is empty after strip, return think_content - If both think_content and content are non-empty, return content - mcp_block is never used """ if not text: return "" think_content = "" content = "" # Extract think content think_match = re.search(r"([\s\S]*?)", text) if think_match: think_content = think_match.group(1).strip() # Get content after
after_think = text[think_match.end() :] else: # No think block, entire text is potential content after_think = text # Remove ... block from content mcp_match = re.search(r"[\s\S]*", after_think) if mcp_match: content = after_think[: mcp_match.start()].strip() else: content = after_think.strip() # Apply the rules: # - If content is empty, use think_content # - If both are non-empty, use content if content: return content else: return think_content def extract_llm_response_text(llm_response: Union[str, Dict]) -> str: """ Extract text from LLM response, excluding tags. Stops immediately when tag is encountered, returning only the content before it. Args: llm_response: Either a string or a dict with 'content' key Returns: Extracted text content, stripped of trailing whitespace """ # If it's a dictionary type, extract the content field if isinstance(llm_response, dict): content = llm_response.get("content", "") else: # If it's a string type, use directly content = str(llm_response) # Find the position of tag tool_start_pattern = r"" match = re.search(tool_start_pattern, content) if match: # If tag is found, only return content before the tag return content[: match.start()].strip() else: # If no tag is found, return the complete content return content.strip() def parse_llm_response_for_tool_calls( llm_response_content_text: Union[str, Dict, List], ) -> List[Dict[str, Any]]: """ Parse tool calls from LLM response content. 
Supports multiple formats: - OpenAI Response API format (dict with 'output' containing function_call items) - OpenAI Completion API format (list of tool_call objects) - MCP format ( XML tags in text) Args: llm_response_content_text: Response content in any supported format Returns: List of tool call dicts with keys: server_name, tool_name, arguments, id """ # tool_calls or MCP reponse are handled differently # for openai response api, the tool_calls are in the response text if isinstance(llm_response_content_text, dict): tool_calls = [] for item in llm_response_content_text.get("output") or []: if item.get("type") == "function_call": name = item.get("name", "") if "-" in name: server_name, tool_name = name.rsplit("-", maxsplit=1) else: server_name = "unknown" tool_name = name arguments_str = item.get("arguments") arguments = safe_json_loads(arguments_str) arguments = filter_none_values(arguments) tool_calls.append( dict( server_name=server_name, tool_name=tool_name, arguments=arguments, id=item.get("call_id"), ) ) return tool_calls # for openai completion api, the tool_calls are in the response text if isinstance(llm_response_content_text, list): tool_calls = [] for tool_call in llm_response_content_text: name = tool_call.function.name if "-" in name: server_name, tool_name = name.rsplit("-", maxsplit=1) else: server_name = "unknown" tool_name = name arguments_str = tool_call.function.arguments # Parse JSON string to dictionary try: # Try to handle possible newlines and escape characters arguments = json.loads(arguments_str) except json.JSONDecodeError: logger.info( f"Warning: Unable to parse tool arguments JSON: {arguments_str}" ) # Try more lenient parsing or log error try: # Try to replace some common error formats, such as Python dict strings arguments_str_fixed = ( arguments_str.replace("'", '"') .replace("None", "null") .replace("True", "true") .replace("False", "false") ) arguments = json.loads(arguments_str_fixed) logger.info( "Info: Successfully parsed 
arguments after attempting to fix." ) except json.JSONDecodeError: logger.info( f"Error: Still unable to parse tool arguments JSON after fixing: {arguments_str}" ) arguments = { "error": "Failed to parse arguments", "raw": arguments_str, } arguments = filter_none_values(arguments) tool_calls.append( dict( server_name=server_name, tool_name=tool_name, arguments=arguments, id=tool_call.id, ) ) return tool_calls # for other clients, such as qwen and anthropic, we use MCP instead of tool calls tool_calls = [] # Find all tags tool_call_patterns = re.findall( r"\s*(.*?)\s*(.*?)\s*\s*([\s\S]*?)\s*\s*", llm_response_content_text, re.DOTALL, ) for match in tool_call_patterns: server_name = match[0].strip() tool_name = match[1].strip() arguments_str = match[2].strip() # Parse JSON string to dictionary arguments = safe_json_loads(arguments_str) arguments = filter_none_values(arguments) tool_calls.append( { "server_name": server_name, "tool_name": tool_name, "arguments": arguments, "id": None, } ) return tool_calls ================================================ FILE: apps/miroflow-agent/src/utils/prompt_utils.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. """ Prompt templates and utilities for agent system prompts. This module provides: - System prompt generation for MCP tool usage - Agent-specific prompt generation (main agent, browsing agent) - Summary prompt templates for final answer generation - Failure experience templates for retry mechanisms """ # ============================================================================ # Format Error Messages # ============================================================================ FORMAT_ERROR_MESSAGE = "No \\boxed{} content found in the final answer." 
# ============================================================================ # Failure Experience Templates (for format error retry) # ============================================================================ # Header that appears once before all failure experiences FAILURE_EXPERIENCE_HEADER = """ === Previous Attempts Analysis === The following summarizes what was tried before and why it didn't work. Use this to guide a NEW approach. """ # Template for each individual failure experience (used multiple times) FAILURE_EXPERIENCE_ITEM = """[Attempt {attempt_number}] {failure_summary} """ # Footer that appears once after all failure experiences FAILURE_EXPERIENCE_FOOTER = """=== End of Analysis === Based on the above, you should try a different strategy this time. """ FAILURE_SUMMARY_PROMPT = """The task was not completed successfully. Do NOT call any tools. Provide a summary: Failure type: [incomplete / blocked / misdirected / format_missed] - incomplete: ran out of turns before finishing - blocked: got stuck due to tool failure or missing information - misdirected: went down the wrong path - format_missed: found the answer but forgot to use \\boxed{} What happened: [describe the approach taken and why a final answer was not reached] Useful findings: [list any facts, intermediate results, or conclusions discovered that should be reused]""" # Assistant prefix for failure summary generation (guides model to follow structured format) FAILURE_SUMMARY_THINK_CONTENT = """We need to write a structured post-mortem style summary **without calling any tools**, explaining why the task was not completed, using these required sections: * **Failure type**: pick one from **incomplete / blocked / misdirected / format_missed** * **What happened**: describe the approach taken and why it didn't reach a final answer * **Useful findings**: list any facts, intermediate results, or conclusions that can be reused""" FAILURE_SUMMARY_ASSISTANT_PREFIX = ( 
f"\n{FAILURE_SUMMARY_THINK_CONTENT}\n\n\n" ) # ============================================================================ # MCP Tags for Parsing # ============================================================================ mcp_tags = [ "", "", "", "", "", "", ] refusal_keywords = [ "time constraint", "I’m sorry, but I can’t", "I'm sorry, I cannot solve", ] def generate_mcp_system_prompt(date, mcp_servers): """ Generate the MCP (Model Context Protocol) system prompt for LLM. Creates a structured prompt that instructs the LLM on how to use available MCP tools. Includes tool definitions, XML formatting instructions, and general task-solving guidelines. Args: date: Current date object for timestamp inclusion mcp_servers: List of server definitions, each containing 'name' and 'tools' Returns: Complete system prompt string with tool definitions and usage instructions """ formatted_date = date.strftime("%Y-%m-%d") # Start building the template, now follows https://docs.anthropic.com/en/docs/build-with-claude/tool-use/overview#tool-use-system-prompt template = f"""In this environment you have access to a set of tools you can use to answer the user's question. You only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use. Today is: {formatted_date} # Tool-Use Formatting Instructions Tool-use is formatted using XML-style tags. The tool-use is enclosed in and each parameter is similarly enclosed within its own set of tags. The Model Context Protocol (MCP) connects to servers that provide additional tools and resources to extend your capabilities. You can use the server's tools via the `use_mcp_tool`. Description: Request to use a tool provided by a MCP server. Each MCP server can provide multiple tools with different capabilities. 
Tools have defined input schemas that specify required and optional parameters. Parameters: - server_name: (required) The name of the MCP server providing the tool - tool_name: (required) The name of the tool to execute - arguments: (required) A JSON object containing the tool's input parameters, following the tool's input schema, quotes within string must be properly escaped, ensure it's valid JSON Usage: server name here tool name here {{ "param1": "value1", "param2": "value2 \\"escaped string\\"" }} Important Notes: - Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags. - Always adhere to this format for the tool use to ensure proper parsing and execution. String and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. The output is not expected to be valid XML and is parsed with regular expressions. Here are the functions available in JSONSchema format: """ # Add MCP servers section if mcp_servers and len(mcp_servers) > 0: for server in mcp_servers: template += f"\n## Server name: {server['name']}\n" if "tools" in server and len(server["tools"]) > 0: for tool in server["tools"]: # Skip tools that failed to load (they only have 'error' key) if "error" in tool and "name" not in tool: continue template += f"### Tool name: {tool['name']}\n" template += f"Description: {tool['description']}\n" template += f"Input JSON schema: {tool['schema']}\n" # Add the full objective system prompt template += """ # General Objective You accomplish a given task iteratively, breaking it down into clear steps and working through them methodically. """ return template def generate_no_mcp_system_prompt(date): """ Generate a minimal system prompt without MCP tool definitions. Used when no tools are available or when running in tool-less mode. 
Args: date: Current date object for timestamp inclusion Returns: Basic system prompt string without tool definitions """ formatted_date = date.strftime("%Y-%m-%d") # Start building the template, now follows https://docs.anthropic.com/en/docs/build-with-claude/tool-use/overview#tool-use-system-prompt template = """In this environment you have access to a set of tools you can use to answer the user's question. """ template += f" Today is: {formatted_date}\n" template += """ Important Notes: - Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags. - Always adhere to this format for the tool use to ensure proper parsing and execution. String and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. The output is not expected to be valid XML and is parsed with regular expressions. """ # Add the full objective system prompt template += """ # General Objective You accomplish a given task iteratively, breaking it down into clear steps and working through them methodically. """ return template def generate_agent_specific_system_prompt(agent_type=""): """ Generate agent-specific objective prompts based on agent type. Different agent types have different objectives: - main: Task-solving agent that uses tools to answer questions - agent-browsing: Web search and browsing agent for information retrieval Args: agent_type: Type of agent ("main", "agent-browsing", or "browsing-agent") Returns: Agent-specific objective prompt string """ if agent_type == "main": system_prompt = """\n # Agent Specific Objective You are a task-solving agent that uses tools step-by-step to answer the user's question. Your goal is to provide complete, accurate and well-reasoned answers using additional tools. 
""" elif agent_type == "agent-browsing" or agent_type == "browsing-agent": system_prompt = """# Agent Specific Objective You are an agent that performs the task of searching and browsing the web for specific information and generating the desired answer. Your task is to retrieve reliable, factual, and verifiable information that fills in knowledge gaps. Do not infer, speculate, summarize broadly, or attempt to fill in missing parts yourself. Only return factual content. """ else: raise ValueError(f"Unknown agent type: {agent_type}") return system_prompt.strip() def generate_agent_summarize_prompt(task_description, agent_type=""): """ Generate the final summarization prompt for an agent. Creates prompts that instruct agents to summarize their work and provide final answers. Different agent types have different summarization formats: - main: Must wrap answer in \\boxed{} with strict formatting rules - agent-browsing: Provides structured report of findings Args: task_description: The original task/question to reference in the summary agent_type: Type of agent ("main" or "agent-browsing") Returns: Summarization prompt string with formatting instructions """ if agent_type == "main": summarize_prompt = ( "Summarize the above conversation, and output the FINAL ANSWER to the original question.\n\n" "If a clear answer has already been provided earlier in the conversation, do not rethink or recalculate it — " "simply extract that answer and reformat it to match the required format below.\n" "If a definitive answer could not be determined, make a well-informed educated guess based on the conversation.\n\n" "The original question is repeated here for reference:\n\n" f'"{task_description}"\n\n' "Wrap your final answer in \\boxed{}.\n" "Your final answer should be:\n" "- a number, OR\n" "- as few words as possible, OR\n" "- a comma-separated list of numbers and/or strings.\n\n" "ADDITIONALLY, your final answer MUST strictly follow any formatting instructions in the original 
def generate_agent_summarize_prompt(task_description, agent_type=""):
    """
    Build the final "summarize and answer" prompt for an agent.

    The prompt closes a session: it repeats the original task and tells the
    agent to stop using tools and report its result. The format depends on the
    agent type:
    - main: the answer must be wrapped in \\boxed{} and follow strict
      formatting rules (number / few words / comma-separated list)
    - agent-browsing (alias: browsing-agent): a structured, factual report of
      everything found during the session

    Args:
        task_description: The original task/question, quoted verbatim in the prompt
        agent_type: "main", "agent-browsing", or its alias "browsing-agent"

    Returns:
        The summarization prompt, stripped of leading/trailing whitespace

    Raises:
        ValueError: If agent_type is not one of the supported values
    """
    if agent_type == "main":
        summarize_prompt = (
            "Summarize the above conversation, and output the FINAL ANSWER to the original question.\n\n"
            "If a clear answer has already been provided earlier in the conversation, do not rethink or recalculate it — "
            "simply extract that answer and reformat it to match the required format below.\n"
            "If a definitive answer could not be determined, make a well-informed educated guess based on the conversation.\n\n"
            "The original question is repeated here for reference:\n\n"
            f'"{task_description}"\n\n'
            "Wrap your final answer in \\boxed{}.\n"
            "Your final answer should be:\n"
            "- a number, OR\n"
            "- as few words as possible, OR\n"
            "- a comma-separated list of numbers and/or strings.\n\n"
            "ADDITIONALLY, your final answer MUST strictly follow any formatting instructions in the original question — "
            "such as alphabetization, sequencing, units, rounding, decimal places, etc.\n"
            "If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and DO NOT INCLUDE UNITS such as $ or USD or percent signs unless specified otherwise.\n"
            "If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.\n"
            "If you are asked for a comma-separated list, apply the above rules depending on whether the elements are numbers or strings.\n"
            "Do NOT include any punctuation such as '.', '!', or '?' at the end of the answer.\n"
            "Do NOT include any invisible or non-printable characters in the answer output.\n\n"
            "You must absolutely not perform any MCP tool call, tool invocation, search, scrape, code execution, or similar actions.\n"
            "You can only answer the original question based on the information already retrieved and your own internal knowledge.\n"
            "If you attempt to call any tool, it will be considered a mistake."
        )
    # Accept the same two spellings as generate_agent_specific_system_prompt so
    # an agent that got its system prompt can also get its summary prompt.
    elif agent_type in ("agent-browsing", "browsing-agent"):
        summarize_prompt = (
            "This is a direct instruction to you (the assistant), not the result of a tool call.\n\n"
            "We are now ending this session, and your conversation history will be deleted. "
            "You must NOT initiate any further tool use. This is your final opportunity to report "
            "*all* of the information gathered during the session.\n\n"
            "The original task is repeated here for reference:\n\n"
            f'"{task_description}"\n\n'
            "Summarize the above search and browsing history. Output the FINAL RESPONSE and detailed supporting information of the task given to you.\n\n"
            "If you found any useful facts, data, quotes, or answers directly relevant to the original task, include them clearly and completely.\n"
            "If you reached a conclusion or answer, include it as part of the response.\n"
            "If the task could not be fully answered, do NOT make up any content. Instead, return all partially relevant findings, "
            "Search results, quotes, and observations that might help a downstream agent solve the problem.\n"
            "If partial, conflicting, or inconclusive information was found, clearly indicate this in your response.\n\n"
            "Your final response should be a clear, complete, and structured report.\n"
            "Organize the content into logical sections with appropriate headings.\n"
            "Do NOT include any tool call instructions, speculative filler, or vague summaries.\n"
            "Focus on factual, specific, and well-organized information."
        )
    else:
        raise ValueError(f"Unknown agent type: {agent_type}")

    return summarize_prompt.strip()
class ErrorBox:
    """
    Marker wrapper around an error message.

    Wrapping a message in an ErrorBox lets callers tell "this is an error"
    apart from an ordinary response value by type rather than by content.

    Example:
        >>> error = ErrorBox("Connection failed")
        >>> if ErrorBox.is_error_box(error):
        ...     print(f"Error: {error}")
    """

    def __init__(self, error_msg: str) -> None:
        # The message is stored as given; no validation or copying is done.
        self.error_msg = error_msg

    def __str__(self) -> str:
        message = self.error_msg
        return message

    def __repr__(self) -> str:
        return "ErrorBox({!r})".format(self.error_msg)

    @staticmethod
    def is_error_box(something: Any) -> bool:
        """Return True when ``something`` is an ErrorBox instance."""
        matches = isinstance(something, ErrorBox)
        return matches
class ResponseBox:
    """
    Wrapper pairing a response with optional side-channel metadata.

    The wrapped response is kept as-is; ``extra_info`` is an optional dict
    (for example a warning message) that travels alongside it.

    Example:
        >>> response = ResponseBox({"data": "value"}, {"warning_msg": "Rate limited"})
        >>> if response.has_extra_info():
        ...     print(response.get_extra_info())
    """

    def __init__(
        self, response: Any, extra_info: Optional[Dict[str, Any]] = None
    ) -> None:
        self.extra_info = extra_info
        self.response = response

    def __str__(self) -> str:
        # Only the wrapped response is rendered; extra_info is left out.
        rendered = str(self.response)
        return rendered

    def __repr__(self) -> str:
        return "ResponseBox({!r}, extra_info={!r})".format(
            self.response, self.extra_info
        )

    @staticmethod
    def is_response_box(something: Any) -> bool:
        """Return True when ``something`` is a ResponseBox instance."""
        matches = isinstance(something, ResponseBox)
        return matches

    def has_extra_info(self) -> bool:
        """Return True unless extra_info is None (an empty dict still counts)."""
        if self.extra_info is None:
            return False
        return True

    def get_extra_info(self) -> Optional[Dict[str, Any]]:
        """Return the attached extra information, or None if there is none."""
        info = self.extra_info
        return info

    def get_response(self) -> Any:
        """Return the wrapped response object unchanged."""
        wrapped = self.response
        return wrapped
@app.route("/")
def index():
    """Serve the dashboard page (templates/index.html)."""
    return render_template("index.html")


@app.route("/api/list_files", methods=["GET"])
def list_files():
    """
    List the ``.json`` entries of a directory.

    The directory is read from the ``directory`` query parameter. When it is
    missing or empty, the parent of the current working directory is used.
    ``~`` is expanded and the result is made absolute before it is checked.

    Responses:
        200: ``files`` (name, path, size, modified; sorted by name),
             ``directory`` and a human-readable ``message``
        404: the path does not exist
        400: the path exists but is not a directory
        403: PermissionError while listing the directory
        500: any other failure
    """

    def describe(name, full_path):
        # Size and mtime are best-effort: an entry that cannot be stat'ed is
        # still listed, with both values zeroed.
        try:
            info = os.stat(full_path)
        except Exception:
            return {"name": name, "path": full_path, "size": 0, "modified": 0}
        return {
            "name": name,
            "path": full_path,
            "size": info.st_size,
            "modified": info.st_mtime,
        }

    try:
        requested = request.args.get("directory", "")
        base = requested if requested else os.path.abspath("..")
        directory = os.path.abspath(os.path.expanduser(base))

        if not os.path.exists(directory):
            return jsonify({"error": f"Directory does not exist: {directory}"}), 404
        if not os.path.isdir(directory):
            return jsonify({"error": f"Path is not a directory: {directory}"}), 400

        try:
            json_files = sorted(
                (
                    describe(entry, os.path.join(directory, entry))
                    for entry in os.listdir(directory)
                    if entry.endswith(".json")
                ),
                key=lambda item: item["name"],
            )
            summary = f'Found {len(json_files)} JSON files in directory "{directory}"'
            return jsonify(
                {"files": json_files, "directory": directory, "message": summary}
            )
        except PermissionError:
            return jsonify(
                {"error": f"No permission to access directory: {directory}"}
            ), 403
        except Exception as exc:
            return jsonify({"error": f"Failed to read directory: {str(exc)}"}), 500

    except Exception as exc:
        return jsonify({"error": str(exc)}), 500
@app.route("/api/load_trace", methods=["POST"])
def load_trace():
    """
    Load a trace file and make it the active analyzer.

    Expects a JSON object body with a ``file_path`` string. Relative paths are
    resolved against the current working directory. On success the module-level
    ``analyzer`` is replaced with a new TraceAnalyzer for that file.

    Responses:
        200: ``message``, ``file_path`` (absolute) and ``file_name``
        400: body missing / not a JSON object, or ``file_path`` absent,
             empty, or not a string
        404: the file does not exist
        500: TraceAnalyzer raised while loading the file
    """
    global analyzer

    # silent=True makes a missing, malformed or non-JSON body come back as None
    # instead of aborting the request, so every "no usable file_path" case is
    # reported with the same 400 below rather than an unhandled AttributeError.
    data = request.get_json(silent=True)
    file_path = data.get("file_path") if isinstance(data, dict) else None

    # A non-string value would make os.path.isabs raise TypeError further down.
    if not isinstance(file_path, str) or not file_path:
        return jsonify({"error": "Please provide file path"}), 400

    # If it's a relative path, convert to absolute path
    if not os.path.isabs(file_path):
        file_path = os.path.abspath(file_path)

    if not os.path.exists(file_path):
        return jsonify({"error": f"File does not exist: {file_path}"}), 404

    try:
        analyzer = TraceAnalyzer(file_path)
    except Exception as e:
        return jsonify({"error": f"Failed to load file: {str(e)}"}), 500

    return jsonify(
        {
            "message": "File loaded successfully",
            "file_path": file_path,
            "file_name": os.path.basename(file_path),
        }
    )


@app.route("/api/basic_info")
def get_basic_info():
    """Return ``analyzer.get_basic_info()`` as JSON (400 if no trace is loaded)."""
    if not analyzer:
        return jsonify({"error": "Please load trace file first"}), 400

    try:
        return jsonify(analyzer.get_basic_info())
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/api/performance_summary")
def get_performance_summary():
    """Return ``analyzer.get_performance_summary()`` as JSON (400 if no trace is loaded)."""
    if not analyzer:
        return jsonify({"error": "Please load trace file first"}), 400

    try:
        return jsonify(analyzer.get_performance_summary())
    except Exception as e:
        return jsonify({"error": str(e)}), 500


@app.route("/api/execution_flow")
def get_execution_flow():
    """Return ``analyzer.analyze_conversation_flow()`` as JSON (400 if no trace is loaded)."""
    if not analyzer:
        return jsonify({"error": "Please load trace file first"}), 400

    try:
        return jsonify(analyzer.analyze_conversation_flow())
    except Exception as e:
        return jsonify({"error": str(e)}), 500
@app.route("/api/execution_summary")
def get_execution_summary():
    """Return the loaded analyzer's execution summary as JSON."""
    if not analyzer:
        return jsonify({"error": "Please load trace file first"}), 400

    try:
        payload = analyzer.get_execution_summary()
        return jsonify(payload)
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500


@app.route("/api/spans_summary")
def get_spans_summary():
    """Return the loaded analyzer's spans summary as JSON."""
    if not analyzer:
        return jsonify({"error": "Please load trace file first"}), 400

    try:
        payload = analyzer.get_spans_summary()
        return jsonify(payload)
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500


@app.route("/api/step_logs_summary")
def get_step_logs_summary():
    """Return the loaded analyzer's step-log summary as JSON."""
    if not analyzer:
        return jsonify({"error": "Please load trace file first"}), 400

    try:
        payload = analyzer.get_step_logs_summary()
        return jsonify(payload)
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500


@app.route("/api/debug/raw_messages")
def get_raw_messages():
    """
    Debug endpoint: dump the raw message data of the loaded trace.

    The response contains a per-message outline of the main agent's messages
    (index, role, content length, timestamp presence, 100-character preview),
    the browser session keys, the raw main-agent history, and the raw data of
    the first two browser sessions only.
    """
    if not analyzer:
        return jsonify({"error": "Please load trace file first"}), 400

    def outline(position, message):
        # Stringify the content once and reuse it for length and preview.
        text = str(message.get("content", ""))
        preview = text if len(text) <= 100 else text[:100] + "..."
        return {
            "index": position,
            "role": message.get("role"),
            "content_length": len(text),
            "has_timestamp": "timestamp" in message,
            "content_preview": preview,
        }

    try:
        main_history = analyzer.get_main_agent_history()
        browser_sessions = analyzer.get_browser_agent_sessions()
        main_messages = analyzer.get_main_agent_messages()

        message_structure = [
            outline(position, message)
            for position, message in enumerate(main_messages)
        ]
        # Only the first two sessions are returned in full.
        first_sessions = dict(list(browser_sessions.items())[:2])

        return jsonify(
            {
                "main_agent_history_structure": {
                    "total_messages": len(main_messages),
                    "messages": message_structure,
                },
                "browser_sessions": list(browser_sessions.keys()),
                "raw_main_history": main_history,
                "raw_browser_sessions": first_sessions,
            }
        )
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500
if __name__ == "__main__":
    # SECURITY: debug=True turns on the Werkzeug interactive debugger, which
    # lets anyone who can reach the port run arbitrary Python. Bind to loopback
    # by default (the address the README tells users to open) and only enable
    # the debugger while the server is not reachable from other machines.
    # Set TRACE_DASHBOARD_HOST (e.g. "0.0.0.0") to serve on another interface.
    host = os.environ.get("TRACE_DASHBOARD_HOST", "127.0.0.1")
    debug = host in ("127.0.0.1", "localhost", "::1")
    app.run(debug=debug, host=host, port=5000)
def check_dependencies():
    """
    Report whether Flask can be imported by the running interpreter.

    Prints a status line either way; when Flask is missing, also prints the
    commands that install the dependencies.

    Returns:
        True if Flask is importable, False otherwise
    """
    import importlib.util

    try:
        flask_found = importlib.util.find_spec("flask") is not None
    except ImportError:
        # A failing finder is treated the same as "not installed".
        flask_found = False

    if flask_found:
        print("✓ Flask is installed")
        return True

    print("✗ Flask is not installed")
    print("Please use the following commands to install dependencies:")
    print(" uv sync")
    print("or:")
    print(" uv pip install -r requirements.txt")
    return False


def install_dependencies():
    """
    Install the project's dependencies, preferring uv and falling back to pip.

    ``uv sync`` is tried first; if uv is not installed or the command fails,
    ``pip install -r requirements.txt`` is run with the current interpreter.
    Both commands are resolved against the directory containing this script,
    so the result does not depend on where the script was launched from.

    Returns:
        True if either installer succeeded, False if both failed
    """
    project_dir = os.path.dirname(os.path.abspath(__file__))
    requirements = os.path.join(project_dir, "requirements.txt")

    print("Installing dependencies...")

    # Try using uv first
    try:
        subprocess.check_call(["uv", "sync"], cwd=project_dir)
    except (subprocess.CalledProcessError, FileNotFoundError):
        pass  # uv missing or failed: fall through to pip
    else:
        print("✓ Dependencies installed successfully using uv")
        return True

    # Fallback to pip
    try:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "-r", requirements]
        )
    except subprocess.CalledProcessError:
        print("✗ Failed to install dependencies")
        print("Please manually run: uv sync or pip install -r requirements.txt")
        return False

    print("✓ Dependencies installed successfully using pip")
    return True
def main():
    """
    Command-line entry point: check dependencies, then start the web app.

    Options:
        -p / --port: port to listen on (default: 5000)
        --host: interface to bind (default: 127.0.0.1)

    Steps: make sure Flask is available (trying to install it if not), print
    the JSON files found next to this project as a hint, then run the Flask
    app until interrupted.
    """
    import argparse

    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Trace Analysis Web Demo")
    parser.add_argument(
        "-p",
        "--port",
        type=int,
        default=5000,
        help="Specify port number (default: 5000)",
    )
    parser.add_argument(
        "--host",
        default="127.0.0.1",
        help="Interface to bind (default: 127.0.0.1)",
    )
    args = parser.parse_args()

    print("=" * 50)
    print("Trace Analysis Web Demo")
    print("=" * 50)

    # Check dependencies
    if not check_dependencies():
        print("\nInstalling dependencies...")
        if not install_dependencies():
            print(
                "Please manually install dependencies: pip install -r requirements.txt"
            )
            return

    # Check JSON files in the directory above this script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    try:
        json_files = [
            f for f in os.listdir(os.path.join(script_dir, "..")) if f.endswith(".json")
        ]
    except OSError:
        # The scan is informational only; an unreadable directory must not
        # prevent the application from starting.
        json_files = []

    if not json_files:
        print("\nWarning: No JSON files found in parent directory")
        print("Please ensure trace JSON files are in the trace_analyze/ directory")
    else:
        print(f"\nFound {len(json_files)} JSON files:")
        for file in json_files[:5]:  # Only show first 5
            print(f" - {file}")
        if len(json_files) > 5:
            print(f" ... and {len(json_files) - 5} other files")

    # SECURITY: Flask's debug mode enables the Werkzeug interactive debugger,
    # which allows arbitrary code execution for anyone who can reach the port.
    # Only enable it while bound to loopback.
    is_loopback = args.host in ("127.0.0.1", "localhost", "::1")
    shown_host = "localhost" if is_loopback else args.host

    # Start application
    print("\nStarting web application...")
    print(f"Application will run at http://{shown_host}:{args.port}")
    print("Press Ctrl+C to stop the application")
    print("=" * 50)

    try:
        from app import app

        app.run(debug=is_loopback, host=args.host, port=args.port)
    except KeyboardInterrupt:
        print("\nApplication stopped")
    except Exception as e:
        print(f"\nFailed to start application: {e}")
width: 100%; height: 100%; background-color: rgba(0, 0, 0, 0.5); display: flex; justify-content: center; align-items: center; z-index: 9999; } /* Card styles */ .card { box-shadow: 0 2px 4px rgba(0,0,0,0.1); border: none; border-radius: 8px; } .card-header { background-color: #f8f9fa; border-bottom: 1px solid #dee2e6; font-weight: 500; } /* Top summary panel styles */ .summary-panel { background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%); border: none; box-shadow: 0 2px 10px rgba(0,0,0,0.1); } .summary-panel h6 { color: #495057; font-weight: 600; margin-bottom: 15px; padding-bottom: 8px; border-bottom: 2px solid #dee2e6; } .summary-panel .answer-box { background: #fff; border: 1px solid #dee2e6; border-radius: 6px; padding: 8px 12px; margin-bottom: 10px; display: flex; align-items: center; gap: 10px; } .summary-panel .answer-label { font-weight: 600; color: #6c757d; font-size: 12px; margin-bottom: 0; white-space: nowrap; } .summary-panel .answer-content { font-size: 14px; line-height: 1.4; flex: 1; } .summary-panel .final-answer { border-left: 4px solid #007bff; } .summary-panel .ground-truth { border-left: 4px solid #28a745; } .summary-panel .stat-item { background: #fff; border: 1px solid #dee2e6; border-radius: 6px; padding: 8px 12px; margin-bottom: 8px; display: flex; justify-content: space-between; align-items: center; } .summary-panel .stat-label { font-size: 12px; color: #6c757d; font-weight: 500; } .summary-panel .stat-value { font-size: 14px; font-weight: 600; color: #495057; } /* Navigation panel styles */ .navigation-panel { position: sticky; top: 20px; max-height: calc(100vh - 40px); overflow-y: auto; } .navigation-list { max-height: calc(100vh - 120px); overflow-y: auto; } .nav-item { padding: 8px 12px; border-bottom: 1px solid #f1f1f1; cursor: pointer; transition: all 0.2s ease; font-size: 13px; } .nav-item:hover { /* Remove background color change, can add other subtle visual feedback */ } .nav-item.active { background-color: #007bff; color: 
white; } .nav-item .step-number { font-weight: bold; color: #6c757d; } .nav-item.active .step-number { color: white; } .nav-item .step-role { font-size: 11px; padding: 2px 6px; border-radius: 3px; margin-left: 8px; } .nav-item .step-role.user { background-color: #28a745; color: white; } .nav-item .step-role.assistant { background-color: #007bff; color: white; } .nav-item .step-role.tool { background-color: #fd7e14; color: white; } .nav-item .step-role.system { background-color: #6c757d; color: white; } .nav-item .step-summary { color: #6c757d; font-size: 12px; margin-top: 4px; display: -webkit-box; -webkit-line-clamp: 2; -webkit-box-orient: vertical; overflow: hidden; } .nav-item.active .step-summary { color: #e9ecef; } /* Browser sub-step navigation styles */ .nav-item.browser-sub-step { padding-left: 24px; font-size: 12px; border-left: 2px solid #dee2e6; margin-left: 8px; } .nav-item.browser-sub-step .step-number { font-size: 11px; color: #6c757d; } .nav-item.browser-sub-step .step-role { font-size: 10px; padding: 1px 4px; } .nav-item.browser-sub-step .step-summary { font-size: 11px; -webkit-line-clamp: 1; } .nav-item.browser-sub-step.active { border-left-color: #007bff; } .nav-item .browser-toggle { margin-left: auto; cursor: pointer; font-size: 12px; color: #6c757d; padding: 2px 4px; border-radius: 2px; transition: all 0.2s ease; } .nav-item .browser-toggle:hover { background-color: #e9ecef; } .nav-item.active .browser-toggle { color: #fff; } .nav-item.active .browser-toggle:hover { background-color: rgba(255, 255, 255, 0.2); } .browser-sub-steps { display: none; } .browser-sub-steps.expanded { display: block; } /* Execution flow styles */ .execution-steps-container { display: flex; flex-direction: column; gap: 16px; } .execution-step { border: 1px solid #dee2e6; border-radius: 6px; margin-bottom: 0; /* Remove bottom margin, use gap instead */ background-color: white; transition: all 0.3s ease; position: relative; } .execution-step:hover { box-shadow: 0 4px 8px 
rgba(0,0,0,0.1); } /* Ensure main agent steps have clear visual separation */ .execution-step[data-agent*="main_agent"] { border-left: 4px solid #007bff; z-index: 2; } /* Browser session should be indented inside main agent steps */ .browser-session { position: relative; margin-left: 20px; margin-top: 12px; } .step-header { padding: 12px 16px; cursor: pointer; position: relative; border-radius: 6px 6px 0 0; } .step-header:hover { background-color: #f8f9fa; } .step-header.user-message { background-color: #e3f2fd; border-left: 4px solid #2196f3; } .step-header.assistant-message { background-color: #f3e5f5; border-left: 4px solid #9c27b0; } .step-header.user-message.browser-agent { background-color: #e8f5e8; border-left: 4px solid #4caf50; } .step-header.assistant-message.browser-agent { background-color: #fff3e0; border-left: 4px solid #ff9800; } .step-header.tool-message { background-color: #fff3e0; border-left: 4px solid #fd7e14; } .step-header.system-message { background-color: #f8f9fa; border-left: 4px solid #6c757d; } .step-content { padding: 16px; border-top: 1px solid #dee2e6; background-color: #f8f9fa; } .step-toggle { position: absolute; right: 16px; top: 50%; transform: translateY(-50%); font-size: 14px; color: #6c757d; } /* Tool call styles */ .tool-call { background-color: #fff3cd; border: 1px solid #ffeaa7; border-radius: 4px; padding: 10px; margin: 8px 0; } .tool-call-header { font-weight: 500; color: #856404; margin-bottom: 5px; } .tool-call.browser-agent { background-color: #d4edda; border-color: #c3e6cb; } .tool-call.browser-agent .tool-call-header { color: #155724; } /* Browser session styles */ .browser-session { background-color: #f8f9fa; border: 1px solid #dee2e6; border-radius: 4px; margin-top: 10px; padding: 12px; } .browser-session-header { font-weight: 500; color: #495057; margin-bottom: 10px; padding-bottom: 8px; border-bottom: 1px solid #dee2e6; } .browser-step { background-color: white; border: 1px solid #e9ecef; border-radius: 4px; 
margin-bottom: 8px; padding: 8px 12px; } .browser-step.user { background-color: #f0f8ff; } .browser-step.assistant { background-color: #fdf6e3; } .browser-step.tool { background-color: #fff3e0; border-left: 3px solid #fd7e14; } .browser-step.system { background-color: #f8f9fa; border-left: 3px solid #6c757d; } /* Statistics styles */ .stat-item { display: flex; justify-content: space-between; align-items: center; padding: 8px 0; border-bottom: 1px solid #f0f0f0; } .stat-item:last-child { border-bottom: none; } .stat-label { font-weight: 500; color: #495057; } .stat-value { font-weight: 600; color: #007bff; } /* Badge styles */ .badge-role { font-size: 11px; padding: 4px 8px; border-radius: 12px; font-weight: 500; text-transform: uppercase; } .badge-user { background-color: #007bff; color: white; } .badge-assistant { background-color: #6f42c1; color: white; } .badge-tool { background-color: #fd7e14; color: white; } .badge-system { background-color: #6c757d; color: white; } .badge-browser { background-color: #28a745; color: white; } /* Timestamp styles */ .timestamp { font-size: 11px; color: #6c757d; font-family: monospace; } /* Content preview styles */ .content-preview { background-color: white; border-radius: 4px; padding: 8px; margin: 8px 0; } .content-preview .preview-text { line-height: 1.5; } .expand-preview-btn { color: #007bff !important; font-size: 12px; text-decoration: none; } .expand-preview-btn:hover { text-decoration: underline !important; } /* Step content area style adjustments */ .step-content { padding: 16px; border-top: 1px solid #dee2e6; background-color: #f8f9fa; } .step-content h6 { color: #495057; font-weight: 600; margin-bottom: 8px; font-size: 14px; } /* Button styles */ .btn-sm { font-size: 12px; padding: 4px 12px; } /* Responsive styles */ @media (max-width: 768px) { .container-fluid { padding: 0 10px; } .col-md-3 { order: 2; } .col-md-9 { order: 1; } .step-header { padding: 10px 12px; } .step-content { padding: 12px; } } /* Animation 
effects */ .collapse { transition: height 0.3s ease; } .fade-in { animation: fadeIn 0.3s ease-in; } @keyframes fadeIn { from { opacity: 0; transform: translateY(10px); } to { opacity: 1; transform: translateY(0); } } /* Tooltip styles */ .tooltip { font-size: 12px; } /* Code styles */ .code-block { background-color: #f8f9fa; border: 1px solid #e9ecef; border-radius: 6px; padding: 12px; font-family: 'Courier New', monospace; font-size: 13px; white-space: pre-wrap; margin: 8px 0; overflow-x: auto; line-height: 1.4; } .code-block pre { margin: 0; padding: 0; background: none; border: none; font-family: inherit; font-size: inherit; white-space: pre-wrap; } .code-block code { background: none; border: none; padding: 0; font-family: inherit; font-size: inherit; color: inherit; } /* Error styles */ .error-message { color: #dc3545; font-size: 14px; margin-top: 8px; } .success-message { color: #28a745; font-size: 14px; margin-top: 8px; } /* Scrollbar styles */ ::-webkit-scrollbar { width: 8px; } ::-webkit-scrollbar-track { background: #f1f1f1; } ::-webkit-scrollbar-thumb { background: #c1c1c1; border-radius: 4px; } ::-webkit-scrollbar-thumb:hover { background: #a8a8a8; } /* MCP tool call styles */ .mcp-tool-call { background-color: #ffffff; border: 2px solid #007bff; border-radius: 8px; padding: 16px; margin: 16px 0; box-shadow: 0 2px 8px rgba(0,123,255,0.1); overflow: hidden; } .mcp-tool-call.browser-agent { border-color: #28a745; background-color: #ffffff; box-shadow: 0 2px 8px rgba(40,167,69,0.1); } .mcp-tool-header { display: flex; align-items: center; font-weight: 600; color: #007bff; margin-bottom: 12px; font-size: 14px; padding-bottom: 8px; border-bottom: 1px solid #e9ecef; } .mcp-tool-call.browser-agent .mcp-tool-header { color: #28a745; } .mcp-tool-header i { margin-right: 8px; font-size: 16px; } .mcp-tool-name { font-family: 'Courier New', monospace; background-color: rgba(0,123,255,0.1); padding: 4px 8px; border-radius: 4px; margin-left: 4px; font-size: 13px; } 
.mcp-tool-call.browser-agent .mcp-tool-name { background-color: rgba(40,167,69,0.1); } .mcp-tool-content { margin-top: 8px; } .mcp-xml-structure { font-family: 'Courier New', monospace; background-color: #f8f9fa; border: 1px solid #e9ecef; border-radius: 4px; padding: 16px; line-height: 1.6; font-size: 13px; } .xml-tag { color: #0066cc; font-weight: 500; margin: 2px 0; } .xml-content { margin-left: 20px; margin: 8px 0 8px 20px; } .xml-arguments { background-color: #ffffff; border: 1px solid #dee2e6; border-radius: 4px; padding: 12px; margin: 8px 0 8px 20px; white-space: pre-wrap; color: #2c3e50; font-family: 'Courier New', monospace; font-size: 12px; line-height: 1.5; overflow-x: auto; } .mcp-tool-args { margin-top: 8px; } .mcp-args-label { font-weight: 500; color: #495057; margin-bottom: 6px; font-size: 13px; } /* Format badge styles */ .badge-format { font-size: 10px; padding: 2px 6px; border-radius: 3px; font-weight: normal; } .badge-format { background-color: #6c757d; color: white; } /* Format badge default styles, can be extended as needed */ /* Tool ID styles */ .tool-id { margin-top: 8px; padding-top: 8px; border-top: 1px solid #e9ecef; } /* Rendered content styles - white background */ .rendered-content { background-color: white; padding: 12px; border-radius: 4px; border: 1px solid #e9ecef; margin: 8px 0; line-height: 1.6; } .rendered-content h1 { color: #2c3e50; border-bottom: 2px solid #3498db; padding-bottom: 8px; margin-bottom: 16px; font-size: 1.5em; } .rendered-content h2 { color: #34495e; border-bottom: 1px solid #bdc3c7; padding-bottom: 6px; margin-bottom: 12px; font-size: 1.3em; } .rendered-content h3 { color: #2c3e50; margin-bottom: 10px; font-size: 1.1em; } .rendered-content strong { color: #2c3e50; font-weight: 600; } .rendered-content em { color: #7f8c8d; font-style: italic; } .rendered-content ul, .rendered-content ol { margin: 10px 0; padding-left: 20px; } .rendered-content li { margin: 4px 0; } .rendered-content a { color: #3498db; 
text-decoration: none; } .rendered-content a:hover { text-decoration: underline; } .rendered-content .inline-code { background-color: #f8f9fa; color: #e83e8c; padding: 2px 4px; border-radius: 3px; font-family: 'Courier New', monospace; font-size: 0.9em; } .rendered-content .code-block { background-color: #f8f9fa; border: 1px solid #e9ecef; border-radius: 4px; margin: 8px 0; overflow-x: auto; } .rendered-content .code-block pre { margin: 0; padding: 12px; background: none; border: none; font-family: 'Courier New', monospace; font-size: 0.9em; line-height: 1.4; color: #2c3e50; } .rendered-content .code-block code { background: none; padding: 0; color: inherit; font-family: inherit; } /* Improve browser agent content styles */ .browser-agent-content { background-color: #f8fff8; border: 1px solid #d4edda; border-radius: 4px; padding: 12px; margin: 8px 0; } /* Improve content display in modal */ .modal-body .rendered-content { max-height: 400px; overflow-y: auto; } ================================================ FILE: apps/visualize-trace/static/js/script.js ================================================ // Global variables let currentFlowData = null; let currentBasicInfo = null; let currentFileList = []; let currentFileIndex = -1; // DOM elements const elements = { directoryInput: document.getElementById('directoryInput'), browseDirectoryBtn: document.getElementById('browseDirectoryBtn'), fileSelect: document.getElementById('fileSelect'), prevFileBtn: document.getElementById('prevFileBtn'), nextFileBtn: document.getElementById('nextFileBtn'), loadBtn: document.getElementById('loadBtn'), refreshBtn: document.getElementById('refreshBtn'), expandAllBtn: document.getElementById('expandAllBtn'), collapseAllBtn: document.getElementById('collapseAllBtn'), basicInfo: document.getElementById('basicInfo'), executionSummary: document.getElementById('executionSummary'), performanceSummary: document.getElementById('performanceSummary'), executionFlow: 
document.getElementById('executionFlow'), spansStats: document.getElementById('spansStats'), stepLogsStats: document.getElementById('stepLogsStats'), loadingOverlay: document.getElementById('loadingOverlay'), errorToast: document.getElementById('errorToast'), successToast: document.getElementById('successToast'), errorMessage: document.getElementById('errorMessage'), successMessage: document.getElementById('successMessage'), messageModal: document.getElementById('messageModal'), messageContent: document.getElementById('messageContent'), navigationList: document.getElementById('navigationList') }; // Initialize document.addEventListener('DOMContentLoaded', function() { initializeApp(); }); function initializeApp() { // Bind event listeners elements.browseDirectoryBtn.addEventListener('click', browseDirectory); elements.directoryInput.addEventListener('keypress', function(e) { if (e.key === 'Enter') { browseDirectory(); } }); elements.fileSelect.addEventListener('change', onFileSelect); elements.prevFileBtn.addEventListener('click', gotoPrevFile); elements.nextFileBtn.addEventListener('click', gotoNextFile); elements.loadBtn.addEventListener('click', loadTraceFile); elements.refreshBtn.addEventListener('click', refreshFileList); elements.expandAllBtn.addEventListener('click', expandAllSteps); elements.collapseAllBtn.addEventListener('click', collapseAllSteps); // Set default directory path setDefaultDirectory(); // Initialize button states updateNavigationButtons(); // Add keyboard shortcut support document.addEventListener('keydown', handleKeyboardShortcuts); } // Utility functions function showLoading() { elements.loadingOverlay.classList.remove('d-none'); } function hideLoading() { elements.loadingOverlay.classList.add('d-none'); } function showError(message) { elements.errorMessage.textContent = message; const toast = new bootstrap.Toast(elements.errorToast); toast.show(); } function showSuccess(message) { elements.successMessage.textContent = message; const 
/**
 * Format a byte count as a short human-readable size string.
 *
 * The value is scaled by powers of 1024 and rounded to at most two decimals
 * (trailing zeros are dropped), e.g. 1536 -> "1.5 KB".
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} Formatted size. Zero, negative and non-finite input
 *     (including undefined/NaN) yields "0 B".
 */
function formatFileSize(bytes) {
    // Guard against values for which a unit cannot be chosen meaningfully.
    if (!Number.isFinite(bytes) || bytes <= 0) return '0 B';

    const k = 1024;
    const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];

    // Repeated division by a power of two is exact, so the unit is never
    // off by one because of floating-point error in a logarithm ratio, and
    // the index can never leave the bounds of `sizes`.
    let value = bytes;
    let unitIndex = 0;
    while (value >= k && unitIndex < sizes.length - 1) {
        value /= k;
        unitIndex++;
    }

    return parseFloat(value.toFixed(2)) + ' ' + sizes[unitIndex];
}
${serverName.trim()}.${toolName.trim()}
<use_mcp_tool>
<server_name>${serverName.trim()}</server_name>
<tool_name>${toolName.trim()}</tool_name>
<arguments>
${formattedArgs}
</arguments>
</use_mcp_tool>
`; // Use simple placeholder ID to avoid complex JSON strings const placeholderId = `MCP_PLACEHOLDER_${placeholderCounter++}`; placeholders.set(placeholderId, mcpHtml); return `[${placeholderId}]`; }); } // Create new format tool call HTML function createNewFormatToolCallHTML(tool) { const isBeowserAgent = tool.server_name.includes('browsing') || tool.server_name.includes('agent'); const toolClass = isBeowserAgent ? 'browser-agent' : ''; const iconClass = isBeowserAgent ? 'globe' : 'cog'; // Format arguments let formattedArgs; try { if (typeof tool.arguments === 'string') { formattedArgs = tool.arguments; } else { formattedArgs = JSON.stringify(tool.arguments, null, 2); } } catch (e) { formattedArgs = String(tool.arguments); } return `
${tool.server_name}.${tool.tool_name} ${tool.format || 'new'}
Arguments:
${formattedArgs}
${tool.id ? `
ID: ${tool.id}
` : ''}
`; } // Modified markdown rendering support - preserve markdown syntax, only handle newlines and MCP tool calls function renderMarkdown(text) { if (!text || typeof text !== 'string') return ''; let html = text; let placeholders = new Map(); // First process MCP tool calls, before HTML escaping html = formatMcpToolCallWithPlaceholders(html, placeholders); // Escape HTML special characters, but protect MCP tool call placeholders html = html.replace(/&/g, '&') .replace(//g, '>') .replace(/"/g, '"') .replace(/'/g, '''); // Only handle newlines, preserve all markdown syntax html = html.replace(/\n/g, '
/**
 * Report whether a value is a string holding a JSON object or JSON array.
 *
 * Only text that, once trimmed, is wrapped in {...} or [...] and parses
 * successfully counts. JSON scalars such as "42" or '"text"' are rejected,
 * and any failure (non-string input, malformed JSON) results in false.
 *
 * @param {*} str - Candidate value.
 * @returns {boolean} True when the trimmed text is a parseable JSON object/array.
 */
function isJsonString(str) {
    try {
        const text = str.trim();
        const looksLikeObject = text.startsWith('{') && text.endsWith('}');
        const looksLikeArray = text.startsWith('[') && text.endsWith(']');
        if (!looksLikeObject && !looksLikeArray) {
            return false;
        }
        // Throws on malformed JSON; the catch below converts that to false.
        JSON.parse(text);
        return true;
    } catch (err) {
        return false;
    }
}
${formatted}
`; } catch (e) { return content; } } function renderContent(content, isBrowserAgent = false) { if (!content) return ''; // 检查是否为纯JSON字符串 if (isJsonString(content)) { return formatJsonContent(content); } // 直接渲染Markdown(已包含MCP工具调用处理) let processedContent = renderMarkdown(content); // 如果是browser agent,添加特殊样式 if (isBrowserAgent) { processedContent = `
${processedContent}
// API helper

/**
 * Perform a fetch request and return the parsed JSON response body.
 *
 * A JSON `Content-Type` header is sent by default; headers passed in
 * `options.headers` are merged on top of it (and may override it).
 *
 * @param {string} url - Request URL.
 * @param {Object} [options={}] - Extra `fetch` init options (method, body, headers, ...).
 * @returns {Promise<*>} The decoded JSON payload.
 * @throws {Error} When the response status is not OK, or when fetch/JSON
 *     decoding fails; the error is logged and re-thrown.
 */
async function apiCall(url, options = {}) {
    try {
        // Pull `headers` out before spreading: spreading `options` after the
        // merged headers object would overwrite it with the caller's raw
        // headers and silently drop the default Content-Type.
        const { headers: extraHeaders, ...fetchOptions } = options;
        const response = await fetch(url, {
            ...fetchOptions,
            headers: {
                'Content-Type': 'application/json',
                ...extraHeaders
            }
        });

        if (!response.ok) {
            throw new Error(`HTTP error! status: ${response.status}`);
        }

        return await response.json();
    } catch (error) {
        console.error('API call failed:', error);
        throw error;
    }
}
/**
 * Select the file at `index` in the file dropdown and record it as current.
 *
 * Does nothing when `index` is not a valid position in `currentFileList`.
 *
 * @param {number} index - Zero-based position in `currentFileList`.
 */
function selectFileByIndex(index) {
    const inRange = index >= 0 && index < currentFileList.length;
    if (!inRange) {
        return;
    }

    // The dropdown's first option is the placeholder entry, so file N
    // lives at option N + 1.
    const optionPosition = index + 1;
    elements.fileSelect.selectedIndex = optionPosition;
    currentFileIndex = index;
    updateNavigationButtons();
}
// Keyboard shortcut handling

/**
 * Global keydown handler for the file-navigation shortcuts.
 *
 * ArrowLeft / ArrowRight switch to the previous / next file when the matching
 * button is enabled, Enter loads the selected file, and Ctrl+R refreshes the
 * file list. Keys are ignored while a form control has focus or while the
 * message modal is shown.
 *
 * @param {KeyboardEvent} event - The keydown event.
 */
function handleKeyboardShortcuts(event) {
    // Leave typing inside form controls alone.
    const formControlTags = ['INPUT', 'TEXTAREA', 'SELECT'];
    if (formControlTags.includes(event.target.tagName)) {
        return;
    }

    // Do not react while the modal dialog is open.
    if (elements.messageModal.classList.contains('show')) {
        return;
    }

    const key = event.key;

    if (key === 'ArrowLeft') {
        event.preventDefault();
        if (!elements.prevFileBtn.disabled) {
            gotoPrevFile();
        }
    } else if (key === 'ArrowRight') {
        event.preventDefault();
        if (!elements.nextFileBtn.disabled) {
            gotoNextFile();
        }
    } else if (key === 'Enter') {
        event.preventDefault();
        if (elements.fileSelect.value) {
            loadTraceFile();
        }
    } else if (key === 'r' || key === 'R') {
        // Only Ctrl+R is a shortcut; a plain "r" keeps its default behavior.
        if (!event.ctrlKey) {
            return;
        }
        event.preventDefault();
        refreshFileList();
    }
}
updateBasicInfo(data) { currentBasicInfo = data; const finalAnswer = data.final_boxed_answer || '暂无答案'; const groundTruth = data.ground_truth || '暂无正确答案'; const html = `
任务ID: ${data.task_id || 'N/A'}
最终答案
${finalAnswer}
正确答案
${groundTruth}
判断结果: ${data.final_judge_result || 'N/A'}
判断类型: ${data.judge_type || 'N/A'}
`; elements.basicInfo.innerHTML = html; } function updateExecutionSummary(data) { const html = `
总步骤数: ${data.total_steps}
工具调用次数: ${data.total_tool_calls}
Browser会话数: ${data.browser_sessions_count}
browsing-agent.search_and_browse: ${data.tool_usage_distribution['browsing-agent.search_and_browse'] || 0}
`; elements.executionSummary.innerHTML = html; } function updatePerformanceSummary(data) { if (!data || Object.keys(data).length === 0) { elements.performanceSummary.innerHTML = '

无性能数据

'; return; } const html = `
总执行时间: ${(data.total_wall_time || 0).toFixed(2)}s
browsing_agent: ${data.primary_breakdown?.browsing_agent ? (data.primary_breakdown.browsing_agent.total || 0).toFixed(2) : 0}s
main_agent: ${data.primary_breakdown?.main_agent ? (data.primary_breakdown.main_agent.total || 0).toFixed(2) : 0}s
`; elements.performanceSummary.innerHTML = html; } function updateExecutionFlow(data) { currentFlowData = data; if (!data || data.length === 0) { elements.executionFlow.innerHTML = '

无执行流程数据

'; updateNavigationList([]); return; } // 确保每个步骤都是独立的顶级元素 const stepsContainer = document.createElement('div'); stepsContainer.className = 'execution-steps-container'; data.forEach((step, index) => { const stepElement = document.createElement('div'); stepElement.innerHTML = createStepHTML(step, index); stepsContainer.appendChild(stepElement.firstElementChild); }); elements.executionFlow.innerHTML = ''; elements.executionFlow.appendChild(stepsContainer); // 更新导航列表 updateNavigationList(data); // 绑定事件监听器 bindStepEventListeners(); } function createStepHTML(step, index) { const roleClass = step.role === 'user' ? 'user-message' : step.role === 'tool' ? 'tool-message' : step.role === 'system' ? 'system-message' : 'assistant-message'; const agentClass = step.agent.includes('browser') ? 'browser-agent' : ''; // 渲染内容 const renderedPreview = renderContent(step.content_preview); const renderedFullContent = renderContent(step.full_content); return `
${step.role} ${step.agent} ${step.tool_calls.length > 0 ? `${step.tool_calls.length} 工具调用` : ''} ${step.browser_session ? `Browser会话` : ''}
${formatTimestamp(step.timestamp)}
${renderedPreview}
完整内容:
${renderedFullContent}
${step.tool_calls.length > 0 ? `
工具调用:
${step.tool_calls.map(tool => createToolCallHTML(tool)).join('')}
` : ''} ${step.browser_flow && step.browser_flow.length > 0 ? `
Browser会话流程:
${step.browser_session} (${step.browser_flow.length} 步骤)
${step.browser_flow.map(browserStep => createBrowserStepHTML(browserStep, index)).join('')}
` : ''}
`; } function createToolCallHTML(tool) { // 如果是新格式的工具调用,使用新的渲染方式 if (tool.format === 'new') { return createNewFormatToolCallHTML(tool); } // 旧格式(MCP或其他)使用原有的渲染方式 const isBeowserAgent = tool.server_name === 'browsing-agent' || tool.server_name.includes('agent'); const toolClass = isBeowserAgent ? 'browser-agent' : ''; return `
${tool.server_name}.${tool.tool_name} ${tool.format || 'mcp'}
参数:
${JSON.stringify(tool.arguments, null, 2)}
`; } function createBrowserStepHTML(step, parentIndex) { // 为browser step创建唯一的ID const browserId = `browser-${parentIndex}-${step.step_id}`; // 判断内容是否被截断 const isContentTruncated = step.full_content && step.content_preview.length < step.full_content.length; // 渲染内容 const renderedPreview = renderContent(step.content_preview); const renderedFullContent = renderContent(step.full_content); return `
${step.role} ${step.tool_calls.length > 0 ? `${step.tool_calls.length} 工具` : ''}
${formatTimestamp(step.timestamp)}
${renderedPreview} ${isContentTruncated ? ` ... ` : ''}
${step.tool_calls.length > 0 ? `
工具调用: ${step.tool_calls.map(tool => createToolCallHTML(tool)).join('')}
` : ''}
`; } function updateSpansStats(data) { if (!data || Object.keys(data).length === 0) { elements.spansStats.innerHTML = '

无Spans数据

'; return; } const html = `
总Spans数: ${data.total_spans}
总时长: ${(data.total_duration || 0).toFixed(2)}s
Agent统计:
${Object.entries(data.agent_stats || {}).map(([agent, stats]) => `
${agent}:
数量: ${stats.count}
时长: ${(stats.total_duration || 0).toFixed(2)}s
`).join('')}
`; elements.spansStats.innerHTML = html; } function updateStepLogsStats(data) { if (!data || Object.keys(data).length === 0) { elements.stepLogsStats.innerHTML = '

无步骤日志数据

'; return; } const html = `
总日志数: ${data.total_logs}
状态分布:
${Object.entries(data.status_distribution || {}).map(([status, count]) => `
${status}: ${count}
`).join('')}
步骤类型分布:
${Object.entries(data.step_type_distribution || {}).map(([type, count]) => `
${type}: ${count}
`).join('')}
// Event handlers

/**
 * Attach click handlers to every `.step-header` so that clicking a header
 * toggles the `show` class on the element named by its `data-target`
 * selector and flips the chevron icon inside `.step-toggle`.
 *
 * Headers whose target cannot be resolved are skipped with a console warning
 * instead of throwing inside the click handler.
 */
function bindStepEventListeners() {
    document.querySelectorAll('.step-header').forEach(header => {
        header.addEventListener('click', function() {
            const targetSelector = this.getAttribute('data-target');
            const content = targetSelector ? document.querySelector(targetSelector) : null;
            if (!content) {
                console.warn('Step header has no matching collapsible content:', targetSelector);
                return;
            }

            // toggle() returns true when the class is present afterwards.
            const isShown = content.classList.toggle('show');

            // The icon is optional: a missing icon must not break the toggle.
            const icon = this.querySelector('.step-toggle i');
            if (icon) {
                icon.className = isShown ? 'fas fa-chevron-up' : 'fas fa-chevron-down';
            }
        });
    });
}
step.step_id === browserStepId); if (!browserStep) return; if (isExpanded) { // 收起 const renderedPreview = renderContent(browserStep.content_preview); previewElement.querySelector('.preview-text').innerHTML = ` ${renderedPreview} ... `; } else { // 展开 const renderedFullContent = renderContent(browserStep.full_content); previewElement.querySelector('.preview-text').innerHTML = ` ${renderedFullContent} `; } } function showFullMessage(stepId) { if (!currentFlowData) return; const step = currentFlowData.find(s => s.step_id === stepId); if (!step) return; const renderedFullContent = renderContent(step.full_content); const modal = new bootstrap.Modal(elements.messageModal); elements.messageContent.innerHTML = `
步骤信息:
Step ID: ${step.step_id}
Agent: ${step.agent}
Role: ${step.role}
时间: ${formatTimestamp(step.timestamp)}
工具调用: ${step.tool_calls.length}
完整内容:
${renderedFullContent}
${step.tool_calls.length > 0 ? `
工具调用详情:
${step.tool_calls.map(tool => `
${tool.server_name}.${tool.tool_name}
${JSON.stringify(tool.arguments, null, 2)}
`).join('')}
` : ''} ${step.browser_flow && step.browser_flow.length > 0 ? `
Browser会话详情:
${step.browser_flow.map((browserStep, index) => { const renderedBrowserContent = renderContent(browserStep.full_content); return `

${renderedBrowserContent}
${browserStep.tool_calls.length > 0 ? `
工具调用: ${browserStep.tool_calls.map(tool => `
${tool.server_name}.${tool.tool_name}
`).join('')}
` : ''}
`; }).join('')}
` : ''} `; modal.show(); } // ==================== 导航功能 ==================== function updateNavigationList(data) { if (!data || data.length === 0) { elements.navigationList.innerHTML = '

暂无步骤

'; return; } const navigationHTML = data.map((step, index) => { const summary = truncateText(step.content_preview, 50); const toolsInfo = step.tool_calls.length > 0 ? ` (${step.tool_calls.length}工具)` : ''; const browserInfo = step.browser_session ? ' [浏览器]' : ''; let html = ` `; // 添加browser子步骤 if (step.browser_flow && step.browser_flow.length > 0) { html += `
${step.browser_flow.map((browserStep, browserIndex) => { const browserSummary = truncateText(browserStep.content_preview, 40); const browserToolsInfo = browserStep.tool_calls.length > 0 ? ` (${browserStep.tool_calls.length}工具)` : ''; return ` `; }).join('')}
/**
 * Mark exactly one entry of the navigation list as active.
 *
 * @param {number} activeIndex - Index of the main step (matches `data-step-index`).
 * @param {?number} [browserStepId=null] - Id of a browser sub-step (matches
 *     `data-browser-step-id`). When null/undefined the main step entry itself
 *     is activated.
 */
function updateActiveNavItem(activeIndex, browserStepId = null) {
    // Clear any previously active entry.
    const navItems = elements.navigationList.querySelectorAll('.nav-item');
    navItems.forEach(item => item.classList.remove('active'));

    // Compare against null/undefined explicitly: browser step ids start at 0,
    // and a plain truthiness test would treat the first sub-step as "none".
    const hasBrowserStep = browserStepId !== null && browserStepId !== undefined;

    if (hasBrowserStep) {
        // Activate the browser sub-step entry.
        const browserNavItem = elements.navigationList.querySelector(`[data-step-index="${activeIndex}"][data-browser-step-id="${browserStepId}"]`);
        if (browserNavItem) {
            browserNavItem.classList.add('active');
        }
    } else {
        // Activate the main step entry (the one without a browser step id).
        const activeItem = elements.navigationList.querySelector(`[data-step-index="${activeIndex}"]:not([data-browser-step-id])`);
        if (activeItem) {
            activeItem.classList.add('active');
        }
    }
}
Basic Information

Please load a trace file first

Execution Summary

Please load a trace file first

Performance Summary

Please load a trace file first

Execution Flow

Please load a trace file first

Spans Statistics

Please load a trace file first

Step Logs Statistics

Please load a trace file first

Shortcuts: ←→ Switch files Enter Load Ctrl+R Refresh
Loading...
def _parse_new_format_tool_name(self, tool_name: str) -> tuple[str, str]:
    """
    Split a new-format tool name into its server part and tool part

    Recognized shapes, checked in this order:
        - "agent-browsing-<tool>": server is "agent-browsing"
        - "agent-<...>-<tool>": split at the last "-", provided that dash
          sits past index 6; otherwise the whole name is the server and the
          tool name is empty
        - "tool-<server>-<tool>": split on the first two "-" only, so the
          tool part may itself contain dashes
        - anything else: server is "unknown" and the name is returned as-is

    Args:
        tool_name: Tool name as found in the "function" entry of a tool call

    Returns:
        tuple: (server_name, actual_tool_name)
    """
    browsing_prefix = "agent-browsing-"
    if tool_name.startswith(browsing_prefix):
        return "agent-browsing", tool_name[len(browsing_prefix):]

    if tool_name.startswith("agent-"):
        split_at = tool_name.rfind("-")
        if split_at <= 6:
            # No usable separator after the "agent-" prefix
            return tool_name, ""
        return tool_name[:split_at], tool_name[split_at + 1:]

    if tool_name.startswith("tool-"):
        pieces = tool_name.split("-", 2)
        if len(pieces) < 3:
            return "unknown", tool_name
        return pieces[1], pieces[2]

    return "unknown", tool_name
def extract_text_content(self, content) -> str:
    """
    Extract the plain text from a message's content field

    Args:
        content: Message content. Either a list of content parts, in which
            case the "text" of every dict part whose "type" is "text" is
            concatenated (other parts are ignored), or any other value,
            which is converted with str(). None is treated as empty.

    Returns:
        The extracted text; an empty string when there is none
    """
    # A message may carry no content at all (for example one that only has
    # tool calls); without this guard it would render as the text "None".
    if content is None:
        return ""

    if isinstance(content, list):
        # "or ''" also covers parts whose "text" is missing or None, which
        # would otherwise make join() raise TypeError.
        return "".join(
            item.get("text") or ""
            for item in content
            if isinstance(item, dict) and item.get("type") == "text"
        )

    return str(content)
if len(text_content) > 200 else text_content, "full_content": text_content, "tool_calls": [], "browser_session": None, "timestamp": message.get("timestamp", ""), "browser_flow": [], } # If it's an assistant message, check for tool calls if role == "assistant": # Check for new format tool_calls if "tool_calls" in message and message["tool_calls"]: for tool_call in message["tool_calls"]: # Convert new format to unified format if "function" in tool_call: function_info = tool_call["function"] tool_name = function_info.get("name", "") arguments = function_info.get("arguments", "") # Parse arguments string as JSON (if it's a string) if isinstance(arguments, str): try: arguments = json.loads(arguments) except json.JSONDecodeError: pass # Extract server_name from tool_name (if available) server_name, actual_tool_name = ( self._parse_new_format_tool_name(tool_name) ) parsed_tool_call = { "server_name": server_name, "tool_name": actual_tool_name, "arguments": arguments, "id": tool_call.get("id", ""), "type": tool_call.get("type", "function"), "format": "new", } step["tool_calls"].append(parsed_tool_call) # Handle browser agent calls - maintain complete consistency with MCP format logic if server_name.startswith("agent-"): sub_agent_call_count += 1 session_id = f"{server_name}_{sub_agent_call_count}" step["browser_session"] = session_id # Analyze browser session conversation flow if session_id in sub_agent_sessions: browser_flow = self.analyze_browser_session_flow( session_id ) step["browser_flow"] = browser_flow elif server_name.startswith("browsing-agent"): sub_agent_call_count += 1 session_id = f"browser_agent_{sub_agent_call_count}" step["browser_session"] = session_id # Analyze browser session conversation flow if session_id in sub_agent_sessions: browser_flow = self.analyze_browser_session_flow( session_id ) step["browser_flow"] = browser_flow # Check for old format MCP tool calls (maintain compatibility) mcp_tool_call = self.parse_mcp_tool_call(text_content) if 
mcp_tool_call: mcp_tool_call["format"] = "mcp" # Mark as old format step["tool_calls"].append(mcp_tool_call) # If browsing-agent is called, associate browser session if mcp_tool_call["server_name"].startswith("agent-"): sub_agent_call_count += 1 session_id = ( f"{mcp_tool_call['server_name']}_{sub_agent_call_count}" ) step["browser_session"] = session_id # Analyze browser session conversation flow if session_id in sub_agent_sessions: browser_flow = self.analyze_browser_session_flow(session_id) step["browser_flow"] = browser_flow elif mcp_tool_call["server_name"].startswith("browsing-agent"): sub_agent_call_count += 1 session_id = f"browser_agent_{sub_agent_call_count}" step["browser_session"] = session_id # Analyze browser session conversation flow if session_id in sub_agent_sessions: browser_flow = self.analyze_browser_session_flow(session_id) step["browser_flow"] = browser_flow flow_steps.append(step) return flow_steps def analyze_browser_session_flow(self, session_id: str) -> List[Dict[str, Any]]: """Analyze browser session conversation flow""" browser_messages = self.get_browser_agent_session_messages(session_id) browser_flow = [] for i, message in enumerate(browser_messages): role = message.get("role") content = message.get("content", []) text_content = self.extract_text_content(content) step = { "step_id": i, "agent": session_id, "role": role, "content_preview": text_content[:200] + "..." 
if len(text_content) > 200 else text_content, "full_content": text_content, "tool_calls": [], "timestamp": message.get("timestamp", ""), } # If it's an assistant message, check for tool calls if role == "assistant": # Check for new format tool_calls if "tool_calls" in message and message["tool_calls"]: for tool_call in message["tool_calls"]: # Convert new format to unified format if "function" in tool_call: function_info = tool_call["function"] tool_name = function_info.get("name", "") arguments = function_info.get("arguments", "") # Parse arguments string as JSON (if it's a string) if isinstance(arguments, str): try: arguments = json.loads(arguments) except json.JSONDecodeError: pass # Extract server_name from tool_name (if available) server_name, actual_tool_name = ( self._parse_new_format_tool_name(tool_name) ) parsed_tool_call = { "server_name": server_name, "tool_name": actual_tool_name, "arguments": arguments, "id": tool_call.get("id", ""), "type": tool_call.get("type", "function"), "format": "new", } step["tool_calls"].append(parsed_tool_call) # Check for old format MCP tool calls (maintain compatibility) mcp_tool_call = self.parse_mcp_tool_call(text_content) if mcp_tool_call: mcp_tool_call["format"] = "mcp" # Mark as old format step["tool_calls"].append(mcp_tool_call) browser_flow.append(step) return browser_flow def get_execution_summary(self) -> Dict[str, Any]: """Get execution summary information""" flow_steps = self.analyze_conversation_flow() total_steps = len(flow_steps) tool_calls = [] browser_sessions = [] for step in flow_steps: if step["tool_calls"]: tool_calls.extend(step["tool_calls"]) if step.get("browser_session"): browser_sessions.append(step["browser_session"]) # Collect tool calls from browser sessions if step.get("browser_flow"): for browser_step in step["browser_flow"]: if browser_step.get("tool_calls"): tool_calls.extend(browser_step["tool_calls"]) # Tool usage statistics tool_usage = {} for tool in tool_calls: # Choose appropriate key 
name generation method based on format if tool.get("format") == "new": # New format: use server_name.tool_name, if server_name is unknown then use only tool_name if tool.get("server_name") != "unknown": key = f"{tool['server_name']}.{tool['tool_name']}" else: key = tool["tool_name"] else: # Old format (MCP): maintain original method key = f"{tool['server_name']}.{tool['tool_name']}" tool_usage[key] = tool_usage.get(key, 0) + 1 return { "total_steps": total_steps, "total_tool_calls": len(tool_calls), "browser_sessions_count": len(browser_sessions), "tool_usage_distribution": tool_usage, "browser_sessions": browser_sessions, } def get_spans_summary(self) -> Dict[str, Any]: """Get spans statistical summary""" trace_data = self.data.get("trace_data", {}) spans = trace_data.get("spans", []) agent_stats = {} for span in spans: agent = span.get("agent_context", "unknown") if agent not in agent_stats: agent_stats[agent] = { "count": 0, "total_duration": 0, "span_types": set(), } agent_stats[agent]["count"] += 1 agent_stats[agent]["total_duration"] += span.get("duration_seconds", 0) agent_stats[agent]["span_types"].add(span.get("name", "unknown")) # Convert set to list for agent in agent_stats: agent_stats[agent]["span_types"] = list(agent_stats[agent]["span_types"]) return { "total_spans": len(spans), "total_duration": sum(span.get("duration_seconds", 0) for span in spans), "agent_stats": agent_stats, } def get_step_logs_summary(self) -> Dict[str, Any]: """Get step logs summary statistics""" logs = self.data.get("step_logs", []) status_count = {} step_type_count = {} for log in logs: status = log.get("status", "unknown") step_name = log.get("step_name", "unknown") status_count[status] = status_count.get(status, 0) + 1 step_type_count[step_name] = step_type_count.get(step_name, 0) + 1 return { "total_logs": len(logs), "status_distribution": status_count, "step_type_distribution": step_type_count, } ================================================ FILE: 
assets/LOCAL-TOOL-DEPLOYMENT.md ================================================ # Local Tool Deployment Guide This guide explains how to deploy open-source tools locally for use with MiroThinker. These tools are optional enhancements that can replace commercial alternatives in your agent configuration. ## Overview MiroThinker supports several optional open-source tools that you can deploy locally: - **Audio Transcription**: Whisper-Large-v3-Turbo for transcribing audio files - **Visual Question Answering**: Qwen2.5-VL-72B-Instruct for answering questions about images - **Reasoning Engine**: Qwen3-235B-A22B-Thinking-2507 for complex reasoning tasks These tools are used when you configure your agent with `tool-transcribe-os`, `tool-vqa-os`, or `tool-reasoning-os` in your agent configuration file. ## Prerequisites - **GPU**: NVIDIA GPU with sufficient VRAM - **Python 3.10+** - **CUDA**: Compatible CUDA toolkit installed - **Model Storage**: Sufficient disk space to download model checkpoints ## Tool Deployment ### 1. Audio Transcription Tool (`tool-transcribe-os`) **Model**: [Whisper-Large-v3-Turbo](https://huggingface.co/openai/whisper-large-v3-turbo) **Description**: Transcribes audio files (MP3, WAV, M4A, AAC, OGG, FLAC, WMA) to text. Supports both local files and remote URLs. **Deployment with vLLM**: ```bash # Install vLLM with audio support pip install vllm==0.10.0 pip install vllm[audio] # Start the server vllm serve openai/whisper-large-v3-turbo \ --served-model-name whisper-large-v3-turbo \ --task transcription \ --host 0.0.0.0 \ --port 8000 ``` **Configuration in `.env`**: ```bash WHISPER_MODEL_NAME="openai/whisper-large-v3-turbo" WHISPER_API_KEY=your_api_key # Optional, if your server requires authentication WHISPER_BASE_URL="http://0.0.0.0:8000/v1" ``` ### 2. Visual Question Answering Tool (`tool-vqa-os`) **Model**: [Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct) **Description**: Answers questions about images. 
Supports local image files and URLs. Automatically encodes local images to Base64 for API requests. Compatible with JPEG, PNG, GIF formats. **Deployment with SGLang**: ```bash # Install SGLang pip install sglang[all] # Start the server python3 -m sglang.launch_server \ --model-path Qwen/Qwen2.5-VL-72B-Instruct \ --tp 8 \ --host 0.0.0.0 \ --port 8001 \ --trust-remote-code \ --enable-metrics ``` **Configuration in `.env`**: ```bash VISION_MODEL_NAME="Qwen/Qwen2.5-VL-72B-Instruct" VISION_API_KEY=your_api_key # Optional, if your server requires authentication VISION_BASE_URL="http://0.0.0.0:8001/v1/chat/completions" ``` ### 3. Reasoning Engine Tool (`tool-reasoning-os`) **Model**: [Qwen3-235B-A22B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507) **Description**: A reasoning service for solving complex analytical problems, such as advanced mathematics, puzzles, and riddles. Supports long-context reasoning tasks (up to 131K tokens). **Deployment with SGLang**: ```bash # Install SGLang pip install sglang[all] # Start the server python3 -m sglang.launch_server \ --model-path Qwen/Qwen3-235B-A22B-Thinking-2507 \ --tp 8 \ --host 0.0.0.0 \ --port 8002 \ --trust-remote-code \ --context-length 131072 \ --enable-metrics ``` **Configuration in `.env`**: ```bash REASONING_MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507" REASONING_API_KEY=your_api_key # Optional, if your server requires authentication REASONING_BASE_URL="http://0.0.0.0:8002/v1/chat/completions" ``` ## Using Deployed Tools Once you have deployed the tools, configure your agent to use them: 1. **Edit your agent configuration** (e.g., `apps/miroflow-agent/conf/agent/my_custom_config.yaml`): ```yaml main_agent: tools: - tool-python - search_and_scrape_webpage - jina_scrape_llm_summary - tool-transcribe-os # Use local Whisper deployment - tool-vqa-os # Use local Qwen2.5-VL deployment - tool-reasoning-os # Use local Qwen3-235B deployment max_turns: 400 ``` 2. 
**Configure environment variables** in `apps/miroflow-agent/.env` as shown in each tool's deployment section above. 3. **Run your agent**: ```bash cd apps/miroflow-agent uv run main.py llm=qwen-3 agent=my_custom_config llm.base_url=https://your_base_url/v1 ``` ## Commercial Alternatives If you prefer not to deploy these tools locally, you can use commercial alternatives: - **`tool-transcribe`**: Uses OpenAI's GPT-4o mini Transcribe API - **`tool-vqa`**: Uses Claude Sonnet 3.7 API - **`tool-reasoning`**: Uses Claude Sonnet 3.7 API Simply replace the `-os` tool names with their commercial counterparts (for example, `tool-vqa-os` becomes `tool-vqa`) in your agent configuration and configure the corresponding API keys (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`). ## Additional Resources - **SGLang Documentation**: [https://sglang.readthedocs.io/](https://sglang.readthedocs.io/) - **vLLM Documentation**: [https://docs.vllm.ai/](https://docs.vllm.ai/) - **Model Cards**: Check HuggingFace model pages for specific requirements and recommendations ================================================ FILE: assets/QA.md ================================================ # MiroFlow QA Documentation ## Q1: Can I extract GAIA-Text-103 results from existing GAIA-Validation evaluations? **Answer:** Yes! If you have completed GAIA-Validation evaluations, you can extract and re-grade the GAIA-Text-103 subset using our specialized tools. ### Step-by-Step Process 1. **Extract GAIA-Text-103 Tasks** ```bash # Extract text-103 tasks to a separate directory uv run benchmarks/subset_extraction/gaia-to-text-103-mover.py ../../logs/gaia-validation/0806/qwen_MiroThinker-32B-SFT_evaluation ``` This creates a new directory: `gaia-text-103-extraction/qwen_MiroThinker-32B-SFT_evaluation` 1. **Re-grade with GAIA-Text-103 Evaluator** ```bash # Apply GAIA-Text-103 specific grading uv run benchmarks/subset_extraction/gaia-text-103-grader.py ../../logs/gaia-validation/0806/gaia-text-103-extraction ``` 1.
**Verify Results** ```bash # Check accuracy and generate statistics uv run benchmarks/check_progress/check_progress_gaia-validation-text-103.py ../../logs/gaia-validation/0806/gaia-text-103-extraction ``` ## Q2: Does the choice of judgment model affect evaluation performance? **Answer:** Yes, there is a measurable difference in evaluation outcomes between the two judgment models. We have standardized on GPT-4.1-2025-04-14 as our primary judgment model for several practical reasons: - **Ease of deployment:** No need to host additional GPU-intensive models - **Consistency:** Aligns with evaluation standards used in other benchmarks (SimpleQA, BrowseComp) - **Reproducibility:** Provides a consistent baseline for cross-evaluation comparisons ## Code Quality Checks Before submitting a pull request, ensure your code meets our quality standards: ```bash # Fix linting issues automatically uv tool run ruff@0.8.0 check --fix . # Format code according to our style guidelines uv tool run ruff@0.8.0 format . ``` ## Known Issues - The context management component before the summary requires further refinement to improve accuracy and reliability. The likely cause is inaccurate length estimation.
================================================ FILE: assets/qwen3_nonthinking.jinja ================================================
{#- Qwen3 chat template, non-thinking variant: the generation prompt pre-fills an empty think block so the model answers directly. -#}
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{#- Find the last genuine user query; user turns that only wrap a tool response do not count. -#}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
    {%- set index = (messages|length - 1) - loop.index0 %}
    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
        {%- set ns.multi_step_tool = false %}
        {%- set ns.last_query_index = index %}
    {%- endif %}
{%- endfor %}
{%- for message in messages %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {%- set content = message.content %}
        {%- set reasoning_content = '' %}
        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
            {%- set reasoning_content = message.reasoning_content %}
        {%- else %}
            {%- if '</think>' in message.content %}
                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
            {%- endif %}
        {%- endif %}
        {%- if loop.index0 > ns.last_query_index %}
            {%- if loop.last or (not loop.last and reasoning_content) %}
                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
            {%- else %}
                {{- '<|im_start|>' + message.role + '\n' + content }}
            {%- endif %}
        {%- else %}
            {{- '<|im_start|>' + message.role + '\n' + content }}
        {%- endif %}
        {%- if message.tool_calls %}
            {%- for tool_call in message.tool_calls %}
                {%- if (loop.first and content) or (not loop.first) %}
                    {{- '\n' }}
                {%- endif %}
                {%- if tool_call.function %}
                    {%- set tool_call = tool_call.function %}
                {%- endif %}
                {{- '<tool_call>\n{"name": "' }}
                {{- tool_call.name }}
                {{- '", "arguments": ' }}
                {%- if tool_call.arguments is string %}
                    {{- tool_call.arguments }}
                {%- else %}
                    {{- tool_call.arguments | tojson }}
                {%- endif %}
                {{- '}\n</tool_call>' }}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- message.content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n<think>\n\n</think>\n\n' }}
{%- endif %}
================================================ FILE: justfile ================================================ default: just --list # lint monorepo [group('precommit')] lint: uv tool run ruff@0.8.0 check --fix . # sort imports [group('precommit')] sort-imports: uv tool run ruff@0.8.0 check --select I --fix . # format monorepo [group('precommit')] format: uv tool run ruff@0.8.0 format .
# check license [group('precommit')] check-license: uv run reuse lint # insert license for contributor insert-license: # https://reuse.readthedocs.io/en/stable/scripts.html#add-headers-to-staged-files-based-on-git-settings git diff --name-only --cached | xargs -I {} reuse annotate -c "$(git config --get user.name) <$(git config --get user.email)>" "{}" # format markdown files [group('precommit')] format-md: find . -name "*.md" -type f | xargs uv tool run mdformat@0.7.17 # run precommit before PR [group('precommit')] precommit: lint sort-imports format-md format ================================================ FILE: libs/miroflow-tools/README.md ================================================ # 🛠️ MiroFlow Tools > A comprehensive tool management system and MCP (Model Context Protocol) server collection for MiroFlow, providing a unified interface to various AI capabilities including code execution, vision processing, audio transcription, web searching, reasoning, and document reading. ## ✨ Features - **🔧 Unified Tool Management**: Centralized `ToolManager` for managing multiple MCP servers - **🌐 Multiple Transport Protocols**: Support for both stdio and SSE (HTTP) connections - **📦 Rich Tool Ecosystem**: Pre-built MCP servers for common AI tasks - **⚙️ Flexible Configuration**: Tool blacklisting, timeout management, and custom server configurations - **🛡️ Error Handling**: Robust retry logic and fallback mechanisms ## 📦 Installation This package is a local dependency that is automatically installed when you run `uv sync` in the `apps/miroflow-agent` directory. No separate installation is required. For standalone usage or development: ```bash cd libs/miroflow-tools uv sync ``` ## 📋 MCP Servers Overview Quick reference tables of all available MCP servers and their tools. Click on "Details" to jump to the full documentation. 
### 📊 Tools Used in MiroThinker v1.0 and v1.5 The following tools were used in the MiroThinker v1.0 and v1.5 evaluation: | Category | Server Name | Tools | Key Environment Variables | Link | |----------------------------|-----------------------------|----------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|------------------------------------------| | **Execution Environment** | `tool-python` | `create_sandbox`, `run_command`, `run_python_code` | `E2B_API_KEY`, `LOGS_DIR` | [Details](#tool-python) | | **File Management** | `tool-python` | `upload_file_from_local_to_sandbox`, `download_file_from_sandbox_to_local`, `download_file_from_internet_to_sandbox` | `E2B_API_KEY`, `LOGS_DIR` | [Details](#tool-python) | | **Information Retrieval** | `search_and_scrape_webpage` | `google_search` | `SERPER_API_KEY`, `SERPER_BASE_URL` | [Details](#search_and_scrape_webpage) | | **Information Retrieval** | `jina_scrape_llm_summary` | `scrape_and_extract_info` | `JINA_API_KEY`, `JINA_BASE_URL`, `SUMMARY_LLM_BASE_URL`, `SUMMARY_LLM_MODEL_NAME`, `SUMMARY_LLM_API_KEY` | [Details](#jina_scrape_llm_summary) | ### 🔧 Additional Available Tools The following tools are implemented but were not used in the MiroThinker v1.0/v1.5 evaluation: | Category | Server Name | Tools | Key Environment Variables | Link | |-----------------------------|----------------------|---------------------------------------------------|---------------------------------------------------------------------|--------------------------------| | **Web Searching** | `tool-google-search` | `google_search`, `scrape_website` | `SERPER_API_KEY`, `SERPER_BASE_URL`, `JINA_API_KEY`, `JINA_BASE_URL` | [Details](#tool-google-search) | | **Web Searching (Sogou)** | `tool-sogou-search` | `sogou_search`, `scrape_website` | `TENCENTCLOUD_SECRET_ID`, `TENCENTCLOUD_SECRET_KEY`, 
`JINA_API_KEY`, `JINA_BASE_URL` | [Details](#tool-sogou-search) | | **Vision Processing** | `tool-vqa` | `visual_question_answering` | `ANTHROPIC_API_KEY`, `ANTHROPIC_BASE_URL` | [Details](#tool-vqa) | | **Vision Processing** | `tool-vqa-os` | `visual_question_answering` | `VISION_API_KEY`, `VISION_BASE_URL`, `VISION_MODEL_NAME` | [Details](#tool-vqa-os) | | **Audio Processing** | `tool-transcribe` | `audio_transcription`, `audio_question_answering` | `OPENAI_API_KEY`, `OPENAI_BASE_URL` | [Details](#tool-transcribe) | | **Audio Processing** | `tool-transcribe-os` | `audio_transcription` | `WHISPER_API_KEY`, `WHISPER_BASE_URL`, `WHISPER_MODEL_NAME` | [Details](#tool-transcribe-os) | | **Document Reading** | `tool-reading` | `convert_to_markdown` | None required | [Details](#tool-reading) | | **Reasoning Engine** | `tool-reasoning` | `reasoning` | `ANTHROPIC_API_KEY`, `ANTHROPIC_BASE_URL` | [Details](#tool-reasoning) | | **Reasoning Engine** | `tool-reasoning-os` | `reasoning` | `REASONING_API_KEY`, `REASONING_BASE_URL`, `REASONING_MODEL_NAME` | [Details](#tool-reasoning-os) | ## 🚀 Quick Start
Click to expand code example ```python import asyncio from miroflow_tools import ToolManager from mcp import StdioServerParameters async def main(): # Initialize tool manager with server configurations server_configs = [ { "name": "tool-python", "params": StdioServerParameters( command="python", args=["-m", "miroflow_tools.mcp_servers.python_mcp_server"], env={"E2B_API_KEY": "your_e2b_api_key"} # Required for Python execution ) }, # Add more server configurations... ] tool_manager = ToolManager(server_configs) # Get all available tool definitions tool_definitions = await tool_manager.get_all_tool_definitions() # Create a sandbox first sandbox_result = await tool_manager.execute_tool_call( server_name="tool-python", tool_name="create_sandbox", arguments={"timeout": 600} ) # Extract sandbox_id from result sandbox_id = sandbox_result['result'].split('sandbox_id:')[-1].strip() # Execute a tool call result = await tool_manager.execute_tool_call( server_name="tool-python", tool_name="run_python_code", arguments={"code_block": "print('Hello, World!')", "sandbox_id": sandbox_id} ) print(result) if __name__ == "__main__": asyncio.run(main()) ```
## 🔧 ToolManager The `ToolManager` class is the central component for managing and executing tools across multiple MCP servers. ### Key Features - **🔌 Multi-Server Support**: Manage tools from multiple MCP servers simultaneously - **🔗 Connection Management**: Automatic connection handling for stdio and SSE transports - **🚫 Tool Blacklisting**: Filter out specific tools from specific servers - **📝 Structured Logging**: Optional task logging integration - **🔄 Error Recovery**: Automatic retry logic and fallback mechanisms ### Methods - `get_all_tool_definitions()`: Retrieve tool schemas from all configured servers - `execute_tool_call(server_name, tool_name, arguments)`: Execute a specific tool - `set_task_log(task_log)`: Enable structured logging - `get_server_params(server_name)`: Get configuration for a specific server ### Example Usage
Click to expand code example ```python import asyncio from miroflow_tools import ToolManager from mcp import StdioServerParameters async def main(): # Configure servers server_configs = [ { "name": "python-server", "params": StdioServerParameters( command="python", args=["-m", "miroflow_tools.mcp_servers.python_mcp_server"], env={"E2B_API_KEY": "your_key"} ) } ] # Initialize with optional blacklist tool_blacklist = {("python-server", "some_tool")} manager = ToolManager(server_configs, tool_blacklist=tool_blacklist) # Enable logging # manager.set_task_log(your_task_logger) # Get tools tools = await manager.get_all_tool_definitions() # Create a sandbox first (required before running code) sandbox_result = await manager.execute_tool_call( server_name="python-server", tool_name="create_sandbox", arguments={"timeout": 600} ) sandbox_id = sandbox_result['result'].split('sandbox_id:')[-1].strip() # Execute tool result = await manager.execute_tool_call( server_name="python-server", tool_name="run_python_code", arguments={"code_block": "1 + 1", "sandbox_id": sandbox_id} ) print(result) if __name__ == "__main__": asyncio.run(main()) ```
## 🔌 MCP Servers ### Server: tool-python Execute Python code in isolated E2B sandboxes with persistent sessions. **Tools**: - 🔨 `create_sandbox(timeout=600)`: Create a new Linux sandbox - 🐍 `run_python_code(code_block, sandbox_id)`: Execute Python code - 💻 `run_command(command, sandbox_id)`: Run shell commands - ⬆️ `upload_file_from_local_to_sandbox(sandbox_id, local_file_path, sandbox_file_path)`: Upload a local file to the sandbox - ⬇️ `download_file_from_internet_to_sandbox(sandbox_id, url, sandbox_file_path)`: Download a file from a URL into the sandbox - 💾 `download_file_from_sandbox_to_local(sandbox_id, sandbox_file_path, local_filename)`: Download a file from the sandbox to the local machine **Environment Variables**: - 🔑 `E2B_API_KEY`: E2B API key (required) - 📁 `LOGS_DIR`: Directory for temporary files (default: `../../logs`) **Example**:
Click to expand code example ```python import asyncio from miroflow_tools import ToolManager from mcp import StdioServerParameters async def main(): # Configure server with environment variables server_configs = [ { "name": "tool-python", "params": StdioServerParameters( command="python", args=["-m", "miroflow_tools.mcp_servers.python_mcp_server"], env={"E2B_API_KEY": "your_e2b_api_key"} ) } ] manager = ToolManager(server_configs) # Create sandbox result = await manager.execute_tool_call( server_name="tool-python", tool_name="create_sandbox", arguments={"timeout": 600} ) # Extract sandbox_id from result sandbox_id = result['result'].split('sandbox_id:')[-1].strip() # Run code result = await manager.execute_tool_call( server_name="tool-python", tool_name="run_python_code", arguments={"code_block": "import numpy as np; print(np.array([1,2,3]))", "sandbox_id": sandbox_id} ) print(result) if __name__ == "__main__": asyncio.run(main()) ```
### Server: tool-vqa Analyze images and answer questions about visual content using Anthropic Claude. **Tools**: - 👁️ `visual_question_answering(image_path_or_url, question)`: Answer questions about images **Environment Variables**: - 🔑 `ANTHROPIC_API_KEY`: Anthropic API key (required) - 🌐 `ANTHROPIC_BASE_URL`: API base URL (default: `https://api.anthropic.com`) **Example**:
Click to expand code example ```python import asyncio from miroflow_tools import ToolManager from mcp import StdioServerParameters async def main(): server_configs = [ { "name": "tool-vqa", "params": StdioServerParameters( command="python", args=["-m", "miroflow_tools.mcp_servers.vision_mcp_server"], env={ "ANTHROPIC_API_KEY": "your_anthropic_api_key", "ANTHROPIC_BASE_URL": "https://api.anthropic.com" } ) } ] manager = ToolManager(server_configs) result = await manager.execute_tool_call( server_name="tool-vqa", tool_name="visual_question_answering", arguments={ "image_path_or_url": "https://example.com/image.jpg", "question": "What is in this image?" } ) print(result) if __name__ == "__main__": asyncio.run(main()) ```
### Server: tool-vqa-os Analyze images and answer questions about visual content using open-source compatible models. **Tools**: - 👁️ `visual_question_answering(image_path_or_url, question)`: Answer questions about images **Environment Variables**: - 🔑 `VISION_API_KEY`: API key (required) - 🌐 `VISION_BASE_URL`: API endpoint URL (required) - 🤖 `VISION_MODEL_NAME`: Model name (required) **Example**:
Click to expand code example ```python import asyncio from miroflow_tools import ToolManager from mcp import StdioServerParameters async def main(): server_configs = [ { "name": "tool-vqa-os", "params": StdioServerParameters( command="python", args=["-m", "miroflow_tools.mcp_servers.vision_mcp_server_os"], env={ "VISION_API_KEY": "your_vision_api_key", "VISION_BASE_URL": "your_vision_base_url", "VISION_MODEL_NAME": "your_vision_model_name" } ) } ] manager = ToolManager(server_configs) result = await manager.execute_tool_call( server_name="tool-vqa-os", tool_name="visual_question_answering", arguments={ "image_path_or_url": "https://example.com/image.jpg", "question": "What is in this image?" } ) print(result) if __name__ == "__main__": asyncio.run(main()) ```
### Server: tool-transcribe Transcribe audio files and answer questions about audio content using OpenAI Whisper. **Tools**: - 🎤 `audio_transcription(audio_path_or_url)`: Transcribe audio to text - 🎧 `audio_question_answering(audio_path_or_url, question)`: Answer questions about audio **Environment Variables**: - 🔑 `OPENAI_API_KEY`: OpenAI API key (required) - 🌐 `OPENAI_BASE_URL`: API base URL (default: `https://api.openai.com/v1`) **Supported Formats**: 🎵 MP3, WAV, M4A, AAC, OGG, FLAC **Example**:
Click to expand code example ```python import asyncio from miroflow_tools import ToolManager from mcp import StdioServerParameters async def main(): server_configs = [ { "name": "tool-transcribe", "params": StdioServerParameters( command="python", args=["-m", "miroflow_tools.mcp_servers.audio_mcp_server"], env={ "OPENAI_API_KEY": "your_openai_api_key", "OPENAI_BASE_URL": "https://api.openai.com/v1" } ) } ] manager = ToolManager(server_configs) # Transcribe audio result = await manager.execute_tool_call( server_name="tool-transcribe", tool_name="audio_transcription", arguments={"audio_path_or_url": "/path/to/audio.mp3"} ) print(result) # Answer questions about audio result = await manager.execute_tool_call( server_name="tool-transcribe", tool_name="audio_question_answering", arguments={ "audio_path_or_url": "/path/to/audio.mp3", "question": "What is the main topic discussed?" } ) print(result) if __name__ == "__main__": asyncio.run(main()) ```
### Server: tool-transcribe-os Transcribe audio files using open-source compatible models. **Tools**: - 🎤 `audio_transcription(audio_path_or_url)`: Transcribe audio to text **Environment Variables**: - 🔑 `WHISPER_API_KEY`: API key (required) - 🌐 `WHISPER_BASE_URL`: API endpoint URL (required) - 🤖 `WHISPER_MODEL_NAME`: Model name (required) **Supported Formats**: 🎵 MP3, WAV, M4A, AAC, OGG, FLAC **Example**:
Click to expand code example ```python import asyncio from miroflow_tools import ToolManager from mcp import StdioServerParameters async def main(): server_configs = [ { "name": "tool-transcribe-os", "params": StdioServerParameters( command="python", args=["-m", "miroflow_tools.mcp_servers.audio_mcp_server_os"], env={ "WHISPER_API_KEY": "your_whisper_api_key", "WHISPER_BASE_URL": "your_whisper_base_url", "WHISPER_MODEL_NAME": "your_whisper_model_name" } ) } ] manager = ToolManager(server_configs) result = await manager.execute_tool_call( server_name="tool-transcribe-os", tool_name="audio_transcription", arguments={"audio_path_or_url": "/path/to/audio.mp3"} ) print(result) if __name__ == "__main__": asyncio.run(main()) ```
### Server: tool-reading Convert various document formats to Markdown using MarkItDown. **Tools**: - 📄 `convert_to_markdown(uri)`: Convert documents (PDF, DOC, PPT, Excel, CSV, ZIP, etc.) to Markdown. URI must start with `file:`, `data:`, `http:`, or `https:` scheme. **Supported Formats**: 📄 PDF, DOC, DOCX, PPT, PPTX, XLS, XLSX, CSV, ZIP, and more **Example**:
Click to expand code example ```python import asyncio from miroflow_tools import ToolManager from mcp import StdioServerParameters async def main(): # Configure server (no additional environment variables required) server_configs = [ { "name": "tool-reading", "params": StdioServerParameters( command="python", args=["-m", "miroflow_tools.mcp_servers.reading_mcp_server"] ) } ] manager = ToolManager(server_configs) result = await manager.execute_tool_call( server_name="tool-reading", tool_name="convert_to_markdown", arguments={"uri": "file:///path/to/document.pdf"} ) print(result) if __name__ == "__main__": asyncio.run(main()) ```
### Server: tool-reasoning Solve complex reasoning problems requiring chain-of-thought using Anthropic Claude with thinking. **Tools**: - 🧠 `reasoning(question)`: Solve hard math problems, puzzles, riddles, and IQ test questions **Environment Variables**: - 🔑 `ANTHROPIC_API_KEY`: Anthropic API key (required) - 🌐 `ANTHROPIC_BASE_URL`: API base URL (default: `https://api.anthropic.com`) **Example**:
Click to expand code example ```python import asyncio from miroflow_tools import ToolManager from mcp import StdioServerParameters async def main(): server_configs = [ { "name": "tool-reasoning", "params": StdioServerParameters( command="python", args=["-m", "miroflow_tools.mcp_servers.reasoning_mcp_server"], env={ "ANTHROPIC_API_KEY": "your_anthropic_api_key", "ANTHROPIC_BASE_URL": "https://api.anthropic.com" } ) } ] manager = ToolManager(server_configs) result = await manager.execute_tool_call( server_name="tool-reasoning", tool_name="reasoning", arguments={"question": "Solve: If a train travels 60 mph for 2 hours, then 80 mph for 1 hour, what's the average speed?"} ) print(result) if __name__ == "__main__": asyncio.run(main()) ```
### Server: tool-reasoning-os Solve complex reasoning problems requiring chain-of-thought using open-source compatible models. **Tools**: - 🧠 `reasoning(question)`: Solve hard math problems, puzzles, riddles, and IQ test questions **Environment Variables**: - 🔑 `REASONING_API_KEY`: API key (required) - 🌐 `REASONING_BASE_URL`: API endpoint URL (required) - 🤖 `REASONING_MODEL_NAME`: Model name (required) **Example**:
Click to expand code example ```python import asyncio from miroflow_tools import ToolManager from mcp import StdioServerParameters async def main(): server_configs = [ { "name": "tool-reasoning-os", "params": StdioServerParameters( command="python", args=["-m", "miroflow_tools.mcp_servers.reasoning_mcp_server_os"], env={ "REASONING_API_KEY": "your_reasoning_api_key", "REASONING_BASE_URL": "your_reasoning_base_url", "REASONING_MODEL_NAME": "your_reasoning_model_name" } ) } ] manager = ToolManager(server_configs) result = await manager.execute_tool_call( server_name="tool-reasoning-os", tool_name="reasoning", arguments={"question": "Solve: If a train travels 60 mph for 2 hours, then 80 mph for 1 hour, what's the average speed?"} ) print(result) if __name__ == "__main__": asyncio.run(main()) ```
### Server: search_and_scrape_webpage Google search via Serper API. Used in MiroThinker v1.0/v1.5 evaluation. **Tools**: - 🔍 `google_search(q, gl="us", hl="en", location=None, num=None, tbs=None, page=None, autocorrect=None)`: Perform web searches via Serper API and retrieve rich results **Environment Variables**: - 🔑 `SERPER_API_KEY`: Serper API key (required) - 🌐 `SERPER_BASE_URL`: Serper API base URL (default: `https://google.serper.dev`) **Example**:
Click to expand code example ```python import asyncio from miroflow_tools import ToolManager from mcp import StdioServerParameters async def main(): server_configs = [ { "name": "search_and_scrape_webpage", "params": StdioServerParameters( command="python", args=["-m", "miroflow_tools.dev_mcp_servers.search_and_scrape_webpage"], env={ "SERPER_API_KEY": "your_serper_api_key", "SERPER_BASE_URL": "https://google.serper.dev" } ) } ] manager = ToolManager(server_configs) result = await manager.execute_tool_call( server_name="search_and_scrape_webpage", tool_name="google_search", arguments={ "q": "Python async programming", "gl": "us", "hl": "en", "num": 10 } ) print(result) if __name__ == "__main__": asyncio.run(main()) ```
### Server: jina_scrape_llm_summary Scrape content from URLs and extract meaningful information using an LLM. Used in MiroThinker v1.0/v1.5 evaluation. **Tools**: - 🔎 `scrape_and_extract_info(url, info_to_extract, custom_headers=None)`: Scrape content from a URL (web pages, PDFs, code files, etc.) and extract meaningful information using an LLM **Environment Variables**: - 🔑 `JINA_API_KEY`: Jina.ai API key (required) - 🌐 `JINA_BASE_URL`: Jina.ai API base URL (default: `https://r.jina.ai`) - 🔗 `SUMMARY_LLM_BASE_URL`: LLM API base URL for summarization (required) - 🤖 `SUMMARY_LLM_MODEL_NAME`: LLM model name for summarization (required) - 🔑 `SUMMARY_LLM_API_KEY`: LLM API key for summarization (optional, depends on LLM provider) **Example**:
Click to expand code example ```python import asyncio from miroflow_tools import ToolManager from mcp import StdioServerParameters async def main(): server_configs = [ { "name": "jina_scrape_llm_summary", "params": StdioServerParameters( command="python", args=["-m", "miroflow_tools.dev_mcp_servers.jina_scrape_llm_summary"], env={ "JINA_API_KEY": "your_jina_api_key", "JINA_BASE_URL": "https://r.jina.ai", "SUMMARY_LLM_BASE_URL": "your_llm_base_url", "SUMMARY_LLM_MODEL_NAME": "your_llm_model_name", "SUMMARY_LLM_API_KEY": "your_llm_api_key" } ) } ] manager = ToolManager(server_configs) result = await manager.execute_tool_call( server_name="jina_scrape_llm_summary", tool_name="scrape_and_extract_info", arguments={ "url": "https://example.com/article", "info_to_extract": "What is the main topic of this article?" } ) print(result) if __name__ == "__main__": asyncio.run(main()) ```
### Server: tool-google-search Google search via Serper API with website scraping capabilities. **Tools**: - 🔍 `google_search(q, gl="us", hl="en", location=None, num=10, tbs=None, page=1)`: Google search - 🌐 `scrape_website(url)`: Scrape website content using Jina.ai **Environment Variables**: - 🔑 `SERPER_API_KEY`: Serper API key (required for Google search) - 🌐 `SERPER_BASE_URL`: Serper API base URL (default: `https://google.serper.dev`) - 🔑 `JINA_API_KEY`: Jina.ai API key (required for scraping) - 🌐 `JINA_BASE_URL`: Jina.ai API base URL (default: `https://r.jina.ai`) **Filtering Options** (via environment variables): - 🚫 `REMOVE_SNIPPETS`: Remove snippets from search results - 🚫 `REMOVE_KNOWLEDGE_GRAPH`: Remove knowledge graph from results - 🚫 `REMOVE_ANSWER_BOX`: Remove answer box from results **Example**:
Click to expand code example ```python import asyncio from miroflow_tools import ToolManager from mcp import StdioServerParameters async def main(): server_configs = [ { "name": "tool-google-search", "params": StdioServerParameters( command="python", args=["-m", "miroflow_tools.mcp_servers.searching_google_mcp_server"], env={ "SERPER_API_KEY": "your_serper_api_key", "SERPER_BASE_URL": "https://google.serper.dev", "JINA_API_KEY": "your_jina_api_key", "JINA_BASE_URL": "https://r.jina.ai" } ) } ] manager = ToolManager(server_configs) # Google search result = await manager.execute_tool_call( server_name="tool-google-search", tool_name="google_search", arguments={ "q": "Python async programming", "gl": "us", "hl": "en", "num": 10 } ) print(result) # Scrape website result = await manager.execute_tool_call( server_name="tool-google-search", tool_name="scrape_website", arguments={"url": "https://example.com/article"} ) print(result) if __name__ == "__main__": asyncio.run(main()) ```
### Server: tool-sogou-search Sogou search (optimized for Chinese) with website scraping capabilities. *Optional: Not used in the MiroThinker v1.0/v1.5 evaluation* **Tools**: - 🔍 `sogou_search(Query, Cnt=10)`: Sogou search (Chinese) - 🌐 `scrape_website(url)`: Scrape website content using Jina.ai **Environment Variables**: - 🔑 `TENCENTCLOUD_SECRET_ID`: Tencent Cloud secret ID (required) - 🔑 `TENCENTCLOUD_SECRET_KEY`: Tencent Cloud secret key (required) - 🔑 `JINA_API_KEY`: Jina.ai API key (required for scraping) - 🌐 `JINA_BASE_URL`: Jina.ai API base URL (default: `https://r.jina.ai`) **Example**:
Click to expand code example ```python import asyncio from miroflow_tools import ToolManager from mcp import StdioServerParameters async def main(): server_configs = [ { "name": "tool-sogou-search", "params": StdioServerParameters( command="python", args=["-m", "miroflow_tools.mcp_servers.searching_sogou_mcp_server"], env={ "TENCENTCLOUD_SECRET_ID": "your_tencent_secret_id", "TENCENTCLOUD_SECRET_KEY": "your_tencent_secret_key", "JINA_API_KEY": "your_jina_api_key", "JINA_BASE_URL": "https://r.jina.ai" } ) } ] manager = ToolManager(server_configs) # Sogou search result = await manager.execute_tool_call( server_name="tool-sogou-search", tool_name="sogou_search", arguments={ "Query": "Python 异步编程", "Cnt": 10 } ) print(result) # Scrape website result = await manager.execute_tool_call( server_name="tool-sogou-search", tool_name="scrape_website", arguments={"url": "https://example.com/article"} ) print(result) if __name__ == "__main__": asyncio.run(main()) ```
## 🚀 Development ### Adding a New MCP Server 1. Create a new server file in `mcp_servers/` 1. Use `FastMCP` to define tools: ```python from fastmcp import FastMCP mcp = FastMCP("server-name") @mcp.tool() async def my_tool(arg: str) -> str: """Tool description.""" return "result" if __name__ == "__main__": mcp.run(transport="stdio") ``` 1. Add server configuration to your application 1. Update this README with server documentation ================================================ FILE: libs/miroflow-tools/pyproject.toml ================================================ [project] name = "miroflow-tools" version = "0.1.0" description = "Tool management and MCP server utilities for MiroFlow" readme = "README.md" authors = [ { name = "MiroMind Team", email = "service@miromind.ai" } ] requires-python = ">=3.12" dependencies = [ "mcp>=1.0.0", "fastmcp>=0.1.0", "playwright>=1.40.0", "requests>=2.32.0", "e2b-code-interpreter==1.2.1", "wikipedia", "mutagen", "markitdown-mcp>=0.0.1a3", "google-genai", "aiohttp", "redis" ] [build-system] requires = ["hatchling"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] packages = ["src/miroflow_tools"] [dependency-groups] dev = [ "pytest>=8.4.1", "pytest-asyncio>=1.0.0", "pytest-cov>=6.2.1", "pytest-html>=4.1.1", "pytest-xdist>=3.7.0", "pytest-mock>=3.10.0", "pytest-timeout>=2.1.0", "inline-snapshot>=0.23.2", ] [tool.pytest.ini_options] minversion = "8.3.5" testpaths = ["src/test"] asyncio_default_fixture_loop_scope = "function" addopts = [ "-rA", "--show-capture=stderr", "-n=auto", "--html=report.html", "--self-contained-html", "--cov=miroflow_tools", "--cov-report=html", "--strict-markers", "-v", ] markers = [ "integration: marks tests as integration tests (may be slow)", "unit: marks tests as unit tests", "slow: marks tests as slow (deselect with '-m \"not slow\"')", "requires_api_key: marks tests that require real API credentials", ] ================================================ FILE: 
libs/miroflow-tools/src/__init__.py ================================================ ================================================ FILE: libs/miroflow-tools/src/miroflow_tools/__init__.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. from .manager import ToolManager __all__ = ["ToolManager"] ================================================ FILE: libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/jina_scrape_llm_summary.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. import asyncio import json import logging import os from typing import Any, Dict import httpx from mcp.server.fastmcp import FastMCP # Configure logging logger = logging.getLogger("miroflow") SUMMARY_LLM_BASE_URL = os.environ.get("SUMMARY_LLM_BASE_URL") SUMMARY_LLM_MODEL_NAME = os.environ.get("SUMMARY_LLM_MODEL_NAME") SUMMARY_LLM_API_KEY = os.environ.get("SUMMARY_LLM_API_KEY") JINA_API_KEY = os.environ.get("JINA_API_KEY", "") JINA_BASE_URL = os.environ.get("JINA_BASE_URL", "https://r.jina.ai") # Initialize FastMCP server mcp = FastMCP("jina_scrape_llm_summary") @mcp.tool() async def scrape_and_extract_info( url: str, info_to_extract: str, custom_headers: Dict[str, str] = None ): """ Scrape content from a URL, including web pages, PDFs, code files, and other supported resources, and extract meaningful information using an LLM. If you need to extract information from a PDF, please use this tool. Args: url (str): The URL to scrape content from. Supports various types of URLs such as web pages, PDFs, raw text/code files (e.g., GitHub, Gist), and similar sources. 
info_to_extract (str): The specific types of information to extract (usually a question) custom_headers (Dict[str, str]): Additional headers to include in the scraping request Returns: Dict[str, Any]: A dictionary containing: - success (bool): Whether the operation was successful - url (str): The original URL - extracted_info (str): The extracted information - error (str): Error message if the operation failed - scrape_stats (Dict): Statistics about the scraped content - model_used (str): The model used for summarization - tokens_used (int): Number of tokens used (if available) """ if _is_huggingface_dataset_or_space_url(url): return json.dumps( { "success": False, "url": url, "extracted_info": "", "error": "You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose.", "scrape_stats": {}, "tokens_used": 0, }, ensure_ascii=False, ) # First, scrape the content with Jina scrape_result = await scrape_url_with_jina(url, custom_headers) # If Jina fails, try direct Python scraping as fallback if not scrape_result["success"]: logger.warning( f"Jina Scrape and Extract Info: Jina scraping failed: {scrape_result['error']}, trying direct Python scraping as fallback" ) scrape_result = await scrape_url_with_python(url, custom_headers) if not scrape_result["success"]: logger.error( f"Jina Scrape and Extract Info: Both Jina and Python scraping failed: {scrape_result['error']}" ) return json.dumps( { "success": False, "url": url, "extracted_info": "", "error": f"Scraping failed (both Jina and Python): {scrape_result['error']}", "scrape_stats": {}, "tokens_used": 0, }, ensure_ascii=False, ) else: logger.info( f"Jina Scrape and Extract Info: Python fallback scraping succeeded for URL: {url}" ) # Then, summarize the content extracted_result = await extract_info_with_llm( url=url, content=scrape_result["content"], info_to_extract=info_to_extract, model=SUMMARY_LLM_MODEL_NAME, max_tokens=8192, ) # Combine results return json.dumps( { 
"success": extracted_result["success"], "url": url, "extracted_info": extracted_result["extracted_info"], "error": extracted_result["error"], "scrape_stats": { "line_count": scrape_result["line_count"], "char_count": scrape_result["char_count"], "last_char_line": scrape_result["last_char_line"], "all_content_displayed": scrape_result["all_content_displayed"], }, "model_used": extracted_result["model_used"], "tokens_used": extracted_result["tokens_used"], }, ensure_ascii=False, ) def _is_huggingface_dataset_or_space_url(url): """ Check if the URL is a HuggingFace dataset or space URL. :param url: The URL to check :return: True if it's a HuggingFace dataset or space URL, False otherwise """ if not url: return False return "huggingface.co/datasets" in url or "huggingface.co/spaces" in url async def scrape_url_with_jina( url: str, custom_headers: Dict[str, str] = None, max_chars: int = 102400 * 4 ) -> Dict[str, Any]: """ Scrape content from a URL and save to a temporary file. Need to read the content from the temporary file. 
Args: url (str): The URL to scrape content from custom_headers (Dict[str, str]): Additional headers to include in the request max_chars (int): Maximum number of characters to reserve for the scraped content Returns: Dict[str, Any]: A dictionary containing: - success (bool): Whether the operation was successful - filename (str): Absolute path to the temporary file containing the scraped content - content (str): The scraped content of the first 40k characters - error (str): Error message if the operation failed - line_count (int): Number of lines in the scraped content - char_count (int): Number of characters in the scraped content - last_char_line (int): Line number where the last displayed character is located - all_content_displayed (bool): Signal indicating if all content was displayed (True if content <= 40k chars) """ # Validate input if not url or not url.strip(): return { "success": False, "filename": "", "content": "", "error": "URL cannot be empty", "line_count": 0, "char_count": 0, "last_char_line": 0, "all_content_displayed": False, } # Get API key from environment if not JINA_API_KEY: return { "success": False, "filename": "", "content": "", "error": "JINA_API_KEY environment variable is not set", "line_count": 0, "char_count": 0, "last_char_line": 0, "all_content_displayed": False, } # Avoid duplicate Jina URL prefix if url.startswith("https://r.jina.ai/") and url.count("http") >= 2: url = url[len("https://r.jina.ai/") :] # Construct the Jina.ai API URL jina_url = f"{JINA_BASE_URL}/{url}" try: # Prepare headers headers = { "Authorization": f"Bearer {JINA_API_KEY}", } # Add custom headers if provided if custom_headers: headers.update(custom_headers) # Retry configuration retry_delays = [1, 2, 4, 8] for attempt, delay in enumerate(retry_delays, 1): try: # Make the request using httpx library async with httpx.AsyncClient() as client: response = await client.get( jina_url, headers=headers, timeout=httpx.Timeout(None, connect=20, read=60), 
follow_redirects=True, # Follow redirects (equivalent to curl -L) ) # Check if request was successful response.raise_for_status() break # Success, exit retry loop except httpx.ConnectTimeout as e: # connection timeout, retry if attempt < len(retry_delays): logger.info( f"Jina Scrape: Connection timeout, {delay}s before next attempt (attempt {attempt + 1})" ) await asyncio.sleep(delay) continue else: logger.error( f"Jina Scrape: Connection retry attempts exhausted, url: {url}" ) raise e except httpx.ConnectError as e: # connection error, retry if attempt < len(retry_delays): logger.info( f"Jina Scrape: Connection error: {e}, {delay}s before next attempt" ) await asyncio.sleep(delay) continue else: logger.error( f"Jina Scrape: Connection retry attempts exhausted, url: {url}" ) raise e except httpx.ReadTimeout as e: # read timeout, retry if attempt < len(retry_delays): logger.info( f"Jina Scrape: Read timeout, {delay}s before next attempt (attempt {attempt + 1})" ) await asyncio.sleep(delay) continue else: logger.error( f"Jina Scrape: Read timeout retry attempts exhausted, url: {url}" ) raise e except httpx.HTTPStatusError as e: status_code = e.response.status_code # Retryable: 5xx (server errors) + specific 4xx (408, 409, 425, 429) should_retry = status_code >= 500 or status_code in [408, 409, 425, 429] if should_retry and attempt < len(retry_delays): logger.info( f"Jina Scrape: HTTP {status_code} (retryable), retry in {delay}s, url: {url}" ) await asyncio.sleep(delay) continue elif should_retry: logger.error( f"Jina Scrape: HTTP {status_code} retry exhausted, url: {url}" ) raise e else: logger.error( f"Jina Scrape: HTTP {status_code} (non-retryable), url: {url}" ) raise e except httpx.RequestError as e: if attempt < len(retry_delays): logger.info( f"Jina Scrape: Unknown request exception: {e}, url: {url}, {delay}s before next attempt (attempt {attempt + 1})" ) await asyncio.sleep(delay) continue else: logger.error( f"Jina Scrape: Unknown request exception retry 
attempts exhausted, url: {url}" ) raise e except Exception as e: error_msg = f"Jina Scrape: Unexpected error occurred: {str(e)}" logger.error(error_msg) return { "success": False, "filename": "", "content": "", "error": error_msg, "line_count": 0, "char_count": 0, "last_char_line": 0, "all_content_displayed": False, } # Get the scraped content content = response.text if not content: return { "success": False, "filename": "", "content": "", "error": "No content returned from Jina.ai API", "line_count": 0, "char_count": 0, "last_char_line": 0, "all_content_displayed": False, } # handle insufficient balance error try: content_dict = json.loads(content) except json.JSONDecodeError: content_dict = None if ( isinstance(content_dict, dict) and content_dict.get("name") == "InsufficientBalanceError" ): return { "success": False, "filename": "", "content": "", "error": "Insufficient balance", "line_count": 0, "char_count": 0, "last_char_line": 0, "all_content_displayed": False, } # Get content statistics total_char_count = len(content) total_line_count = content.count("\n") + 1 if content else 0 # Extract first max_chars characters displayed_content = content[:max_chars] all_content_displayed = total_char_count <= max_chars # Calculate the line number of the last character displayed if displayed_content: # Count newlines up to the last displayed character last_char_line = displayed_content.count("\n") + 1 else: last_char_line = 0 return { "success": True, "content": displayed_content, "error": "", "line_count": total_line_count, "char_count": total_char_count, "last_char_line": last_char_line, "all_content_displayed": all_content_displayed, } async def scrape_url_with_python( url: str, custom_headers: Dict[str, str] = None, max_chars: int = 102400 * 4 ) -> Dict[str, Any]: """ Fallback scraping method using Python's httpx library directly. 
Args: url (str): The URL to scrape content from custom_headers (Dict[str, str]): Additional headers to include in the request max_chars (int): Maximum number of characters to reserve for the scraped content Returns: Dict[str, Any]: A dictionary containing: - success (bool): Whether the operation was successful - content (str): The scraped content - error (str): Error message if the operation failed - line_count (int): Number of lines in the scraped content - char_count (int): Number of characters in the scraped content - last_char_line (int): Line number where the last displayed character is located - all_content_displayed (bool): Signal indicating if all content was displayed """ # Validate input if not url or not url.strip(): return { "success": False, "content": "", "error": "URL cannot be empty", "line_count": 0, "char_count": 0, "last_char_line": 0, "all_content_displayed": False, } try: # Prepare headers headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" } # Add custom headers if provided if custom_headers: headers.update(custom_headers) # Retry configuration retry_delays = [1, 2, 4] for attempt, delay in enumerate(retry_delays, 1): try: # Make the request using httpx library async with httpx.AsyncClient() as client: response = await client.get( url, headers=headers, timeout=httpx.Timeout(None, connect=20, read=60), follow_redirects=True, ) # Check if request was successful response.raise_for_status() break # Success, exit retry loop except httpx.ConnectTimeout as e: if attempt < len(retry_delays): logger.info( f"Python Scrape: Connection timeout, {delay}s before next attempt (attempt {attempt + 1})" ) await asyncio.sleep(delay) continue else: logger.error( f"Python Scrape: Connection retry attempts exhausted, url: {url}" ) raise e except httpx.ConnectError as e: if attempt < len(retry_delays): logger.info( f"Python Scrape: Connection error: {e}, {delay}s before next 
attempt" ) await asyncio.sleep(delay) continue else: logger.error( f"Python Scrape: Connection retry attempts exhausted, url: {url}" ) raise e except httpx.ReadTimeout as e: if attempt < len(retry_delays): logger.info( f"Python Scrape: Read timeout, {delay}s before next attempt (attempt {attempt + 1})" ) await asyncio.sleep(delay) continue else: logger.error( f"Python Scrape: Read timeout retry attempts exhausted, url: {url}" ) raise e except httpx.HTTPStatusError as e: status_code = e.response.status_code # Retryable: 5xx (server errors) + specific 4xx (408, 409, 425, 429) should_retry = status_code >= 500 or status_code in [408, 409, 425, 429] if should_retry and attempt < len(retry_delays): logger.info( f"Python Scrape: HTTP {status_code} (retryable), retry in {delay}s, url: {url}" ) await asyncio.sleep(delay) continue elif should_retry: logger.error( f"Python Scrape: HTTP {status_code} retry exhausted, url: {url}" ) raise e else: logger.error( f"Python Scrape: HTTP {status_code} (non-retryable), url: {url}" ) raise e except httpx.RequestError as e: if attempt < len(retry_delays): logger.info( f"Python Scrape: Unknown request exception: {e}, url: {url}, {delay}s before next attempt (attempt {attempt + 1})" ) await asyncio.sleep(delay) continue else: logger.error( f"Python Scrape: Unknown request exception retry attempts exhausted, url: {url}" ) raise e except Exception as e: error_msg = f"Python Scrape: Unexpected error occurred: {str(e)}" logger.error(error_msg) return { "success": False, "content": "", "error": error_msg, "line_count": 0, "char_count": 0, "last_char_line": 0, "all_content_displayed": False, } # Get the scraped content content = response.text if not content: return { "success": False, "content": "", "error": "No content returned from URL", "line_count": 0, "char_count": 0, "last_char_line": 0, "all_content_displayed": False, } # Get content statistics total_char_count = len(content) total_line_count = content.count("\n") + 1 if content else 0 # 
Extract first max_chars characters displayed_content = content[:max_chars] all_content_displayed = total_char_count <= max_chars # Calculate the line number of the last character displayed if displayed_content: last_char_line = displayed_content.count("\n") + 1 else: last_char_line = 0 return { "success": True, "content": displayed_content, "error": "", "line_count": total_line_count, "char_count": total_char_count, "last_char_line": last_char_line, "all_content_displayed": all_content_displayed, } EXTRACT_INFO_PROMPT = """You are given a piece of content and the requirement of information to extract. Your task is to extract the information specifically requested. Be precise and focus exclusively on the requested information. INFORMATION TO EXTRACT: {} INSTRUCTIONS: 1. Extract the information relevant to the focus above. 2. If the exact information is not found, extract the most closely related details. 3. Be specific and include exact details when available. 4. Clearly organize the extracted information for easy understanding. 5. Do not include general summaries or unrelated content. CONTENT TO ANALYZE: {} EXTRACTED INFORMATION:""" def get_prompt_with_truncation( info_to_extract: str, content: str, truncate_last_num_chars: int = -1 ) -> str: if truncate_last_num_chars > 0: content = content[:-truncate_last_num_chars] + "[...truncated]" # Prepare the prompt prompt = EXTRACT_INFO_PROMPT.format(info_to_extract, content) return prompt async def extract_info_with_llm( url: str, content: str, info_to_extract: str, model: str = "LLM", max_tokens: int = 4096, ) -> Dict[str, Any]: """ Summarize content using an LLM API. 
Args: content (str): The content to summarize info_to_extract (str): The specific types of information to extract (usually a question) model (str): The model to use for summarization max_tokens (int): Maximum tokens for the response Returns: Dict[str, Any]: A dictionary containing: - success (bool): Whether the operation was successful - extracted_info (str): The extracted information - error (str): Error message if the operation failed - model_used (str): The model used for summarization - tokens_used (int): Number of tokens used (if available) """ # Validate input if not content or not content.strip(): return { "success": False, "extracted_info": "", "error": "Content cannot be empty", "model_used": model, "tokens_used": 0, } prompt = get_prompt_with_truncation(info_to_extract, content) # Prepare the payload if "gpt" in model: payload = { "model": model, "max_completion_tokens": max_tokens, "messages": [ {"role": "user", "content": prompt}, ], } # Add cost-saving parameters for GPT-5 models if "gpt-5" in model.lower() or "gpt5" in model.lower(): payload["service_tier"] = "flex" payload["reasoning_effort"] = "minimal" else: payload = { "model": model, "max_tokens": max_tokens, "messages": [ {"role": "user", "content": prompt}, ], "temperature": 1.0, # "top_p": 0.8, # "top_k": 20, } # Validate LLM endpoint configuration early for clearer errors if not SUMMARY_LLM_BASE_URL or not SUMMARY_LLM_BASE_URL.strip(): return { "success": False, "extracted_info": "", "error": "SUMMARY_LLM_BASE_URL environment variable is not set", "model_used": model, "tokens_used": 0, } # Prepare headers (add Authorization if API key is available) headers = {"Content-Type": "application/json"} if SUMMARY_LLM_API_KEY: headers["Authorization"] = f"Bearer {SUMMARY_LLM_API_KEY}" try: # Retry configuration connect_retry_delays = [1, 2, 4, 8] for attempt, delay in enumerate(connect_retry_delays, 1): try: # Make the API request using httpx async with httpx.AsyncClient() as client: response = await 
client.post( SUMMARY_LLM_BASE_URL, headers=headers, json=payload, timeout=httpx.Timeout(None, connect=30, read=300), ) if response.text and len(response.text) >= 50: tail_50 = response.text[-50:] repeat_count = response.text.count(tail_50) if repeat_count > 5: logger.info("Repeat detected in extract_info_with_llm") continue # Check if the request was successful if ( "Requested token count exceeds the model's maximum context length" in response.text or "longer than the model's context length" in response.text ): prompt = get_prompt_with_truncation( info_to_extract, content, truncate_last_num_chars=40960 * attempt, ) # remove 40k * num_attempts chars from the end of the content payload["messages"][0]["content"] = prompt continue # no need to raise error here, just try again response.raise_for_status() break # Success, exit retry loop except httpx.ConnectTimeout as e: # connection timeout, retry if attempt < len(connect_retry_delays): logger.info( f"Jina Scrape and Extract Info: Connection timeout, {delay}s before next attempt (attempt {attempt + 1})" ) await asyncio.sleep(delay) continue else: logger.error( "Jina Scrape and Extract Info: Connection retry attempts exhausted" ) raise e except httpx.ConnectError as e: # connection error, retry if attempt < len(connect_retry_delays): logger.info( f"Jina Scrape and Extract Info: Connection error: {e}, {delay}s before next attempt" ) await asyncio.sleep(delay) continue else: logger.error( "Jina Scrape and Extract Info: Connection retry attempts exhausted" ) raise e except httpx.ReadTimeout as e: # read timeout, LLM API is too slow, no need to retry if attempt < len(connect_retry_delays): logger.info( f"Jina Scrape and Extract Info: LLM API attempt {attempt} read timeout" ) continue else: logger.error( f"Jina Scrape and Extract Info: LLM API read timeout retry attempts exhausted, please check the request complexity, information to extract: {info_to_extract}, length of content: {len(content)}, url: {url}" ) raise e except 
httpx.HTTPStatusError as e: status_code = e.response.status_code # Special case: GPT-5 service_tier parameter compatibility issue if ( "gpt-5" in model.lower() or "gpt5" in model.lower() ) and "service_tier" in payload: logger.info( "Extract Info: GPT-5 service_tier error, removing and retrying" ) payload.pop("service_tier", None) if attempt < len(connect_retry_delays): await asyncio.sleep(delay) continue # Retryable: 5xx (server errors) + specific 4xx (408, 409, 425, 429) should_retry = status_code >= 500 or status_code in [408, 409, 425, 429] if should_retry and attempt < len(connect_retry_delays): logger.info( f"Extract Info: HTTP {status_code} (retryable), retry in {delay}s" ) await asyncio.sleep(delay) continue elif should_retry: logger.error(f"Extract Info: HTTP {status_code} retry exhausted") raise e else: logger.error(f"Extract Info: HTTP {status_code} (non-retryable)") raise httpx.HTTPStatusError( f"response.text: {response.text}", request=e.request, response=e.response, ) from e except httpx.RequestError as e: logger.error( f"Jina Scrape and Extract Info: Unknown request exception: {e}" ) raise e except Exception as e: error_msg = f"Jina Scrape and Extract Info: Unexpected error during LLM API call: {str(e)}" logger.error(error_msg) return { "success": False, "extracted_info": "", "error": error_msg, "model_used": model, "tokens_used": 0, } # Parse the response try: response_data = response.json() except json.JSONDecodeError as e: error_msg = ( f"Jina Scrape and Extract Info: Failed to parse LLM API response: {str(e)}" ) logger.error(error_msg) logger.error(f"Raw response: {response.text}") return { "success": False, "extracted_info": "", "error": error_msg, "model_used": model, "tokens_used": 0, } # Extract summary from response if "choices" in response_data and len(response_data["choices"]) > 0: try: summary = response_data["choices"][0]["message"]["content"] except Exception as e: error_msg = f"Jina Scrape and Extract Info: Failed to get summary from 
LLM API response: {str(e)}" logger.error(error_msg) return { "success": False, "extracted_info": "", "error": error_msg, "model_used": model, "tokens_used": 0, } # Extract token usage if available tokens_used = 0 if "usage" in response_data: tokens_used = response_data["usage"].get("total_tokens", 0) return { "success": True, "extracted_info": summary, "error": "", "model_used": model, "tokens_used": tokens_used, } elif "error" in response_data: error_msg = ( f"Jina Scrape and Extract Info: LLM API error: {response_data['error']}" ) logger.error(error_msg) return { "success": False, "extracted_info": "", "error": error_msg, "model_used": model, "tokens_used": 0, } else: error_msg = f"Jina Scrape and Extract Info: No valid response from LLM API, response data: {response_data}" logger.error(error_msg) return { "success": False, "extracted_info": "", "error": error_msg, "model_used": model, "tokens_used": 0, } if __name__ == "__main__": # Example usage and testing # Run the MCP server mcp.run(transport="stdio") ================================================ FILE: libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/search_and_scrape_webpage.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. 
import json
import logging
import os
from typing import Any, Dict

import httpx
from mcp.server.fastmcp import FastMCP
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)
from tencentcloud.common import credential
from tencentcloud.common.common_client import CommonClient
from tencentcloud.common.exception.tencent_cloud_sdk_exception import (
    TencentCloudSDKException,
)
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile

from ..mcp_servers.utils.url_unquote import decode_http_urls_in_dict

# Configure logging
logger = logging.getLogger("miroflow")

SERPER_BASE_URL = os.getenv("SERPER_BASE_URL", "https://google.serper.dev")
SERPER_API_KEY = os.getenv("SERPER_API_KEY", "")
TENCENTCLOUD_SECRET_ID = os.getenv("TENCENTCLOUD_SECRET_ID", "")
TENCENTCLOUD_SECRET_KEY = os.getenv("TENCENTCLOUD_SECRET_KEY", "")

# Initialize FastMCP server
mcp = FastMCP("search_and_scrape_webpage")


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type(
        (httpx.ConnectError, httpx.TimeoutException, httpx.HTTPStatusError)
    ),
)
async def make_serper_request(
    payload: Dict[str, Any], headers: Dict[str, str]
) -> httpx.Response:
    """POST ``payload`` to ``{SERPER_BASE_URL}/search`` and return the response.

    The call is retried (up to 3 attempts, exponential wait between 4s and
    10s) on connection errors, timeouts and any non-2xx status.

    Args:
        payload: JSON body of the search request.
        headers: HTTP headers, including the API key.

    Returns:
        The successful ``httpx.Response``.
    """
    async with httpx.AsyncClient() as client:
        response = await client.post(
            f"{SERPER_BASE_URL}/search",
            json=payload,
            headers=headers,
        )
        # Turn 4xx/5xx into HTTPStatusError so the retry policy can see it.
        response.raise_for_status()
        return response


def _is_banned_url(url: str) -> bool:
    """
    Check if the URL is a banned URL.

    :param url: The URL to check
    :return: True if it's a banned URL, False otherwise
    """
    banned_list = [
        "unifuncs",
        "huggingface.co/datasets",
        "huggingface.co/spaces",
    ]
    # An empty / missing link can never match a banned substring.
    if not url:
        return False
    return any(banned in url for banned in banned_list)


# NOTE: the docstring of an @mcp.tool() function is presumably what FastMCP
# exposes to the model as the tool description, so it is kept as-is.
@mcp.tool()
async def google_search(
    q: str,
    gl: str = "us",
    hl: str = "en",
    location: str = None,
    num: int = None,
    tbs: str = None,
    page: int = None,
    autocorrect: bool = None,
):
    """
    Tool to perform web searches via Serper API and retrieve rich results.

    It is able to retrieve organic search results, people also ask,
    related searches, and knowledge graph.

    Args:
        q: Search query string
        gl: Optional region code for search results in ISO 3166-1 alpha-2 format (e.g., 'us')
        hl: Optional language code for search results in ISO 639-1 format (e.g., 'en')
        location: Optional location for search results (e.g., 'SoHo, New York, United States', 'California, United States')
        num: Number of results to return (default: 10)
        tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week, 'qdr:m' for past month, 'qdr:y' for past year)
        page: Page number of results to return (default: 1)
        autocorrect: Whether to autocorrect spelling in query

    Returns:
        Dictionary containing search results and metadata.
    """
    # Check for API key
    if not SERPER_API_KEY:
        return json.dumps(
            {
                "success": False,
                "error": "SERPER_API_KEY environment variable not set",
                "results": [],
            },
            ensure_ascii=False,
        )

    # Validate required parameter
    if not q or not q.strip():
        return json.dumps(
            {
                "success": False,
                "error": "Search query 'q' is required and cannot be empty",
                "results": [],
            },
            ensure_ascii=False,
        )

    try:
        # Helper function to perform a single search
        async def perform_search(search_query: str) -> tuple[list, dict]:
            """Perform a search and return organic results and search parameters."""
            # Build payload with all supported parameters
            payload: dict[str, Any] = {
                "q": search_query.strip(),
                "gl": gl,
                "hl": hl,
            }

            # Add optional parameters if provided
            if location:
                payload["location"] = location
            if num is not None:
                payload["num"] = num
            else:
                payload["num"] = 10  # Default
            if tbs:
                payload["tbs"] = tbs
            if page is not None:
                payload["page"] = page
            if autocorrect is not None:
                payload["autocorrect"] = autocorrect

            # Set up headers
            headers = {
                "X-API-KEY": SERPER_API_KEY,
                "Content-Type": "application/json",
            }

            # Make the API request
            response = await make_serper_request(payload, headers)
            data = response.json()

            # filter out HuggingFace dataset or space urls
            organic_results = []
            if "organic" in data:
                for item in data["organic"]:
                    if _is_banned_url(item.get("link", "")):
                        continue
                    organic_results.append(item)

            return organic_results, data.get("searchParameters", {})

        # Perform initial search
        original_query = q.strip()
        organic_results, search_params = await perform_search(original_query)

        # If no results and query contains quotes, retry without quotes
        if not organic_results and '"' in original_query:
            # Remove all types of quotes
            query_without_quotes = original_query.replace('"', "").strip()
            if query_without_quotes:  # Make sure we still have a valid query
                organic_results, search_params = await perform_search(
                    query_without_quotes
                )

        # Build comprehensive response
        response_data = {
            "organic": organic_results,
            "searchParameters": search_params,
        }
        response_data = decode_http_urls_in_dict(response_data)

        return json.dumps(response_data, ensure_ascii=False)

    except Exception as e:
        # Any failure (including exhausted retries) is reported to the caller
        # as a JSON error payload instead of an exception.
        return json.dumps(
            {
                "success": False,
                "error": f"Unexpected error: {str(e)}",
                "results": [],
            },
            ensure_ascii=False,
        )


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type(TencentCloudSDKException),
    # Re-raise the last SDK exception once retries are exhausted; without this
    # tenacity raises RetryError and the caller's
    # `except TencentCloudSDKException` branch can never be reached.
    reraise=True,
)
async def make_sogou_request(query: str, cnt: int) -> Dict[str, Any]:
    """Make request to Tencent Cloud SearchPro API with retry logic.

    Args:
        query: Search query string, sent as the ``Query`` field.
        cnt: Number of results to request, sent as the ``Cnt`` field.

    Returns:
        The ``Response`` object of the SearchPro call.

    Raises:
        TencentCloudSDKException: If the SDK call still fails after 3 attempts.
    """
    cred = credential.Credential(TENCENTCLOUD_SECRET_ID, TENCENTCLOUD_SECRET_KEY)
    httpProfile = HttpProfile()
    httpProfile.endpoint = "wsa.tencentcloudapi.com"
    clientProfile = ClientProfile()
    clientProfile.httpProfile = httpProfile

    # Build the request as a dict rather than interpolating the query into a
    # JSON string: quotes or backslashes in the query would otherwise produce
    # invalid (or altered) JSON.
    params = {"Query": query, "Mode": 0, "Cnt": cnt}
    common_client = CommonClient("wsa", "2025-05-08", cred, "", profile=clientProfile)
    result = common_client.call_json("SearchPro", params)["Response"]
    return result


# NOTE: the docstring of an @mcp.tool() function is presumably what FastMCP
# exposes to the model as the tool description, so it is kept as-is.
@mcp.tool()
async def sogou_search(
    q: str,
    num: int = 10,
) -> str:
    """
    Tool to perform web searches via Tencent Cloud SearchPro API (Sogou search engine).

    Sogou search offers superior results for Chinese-language queries compared to Google.

    Args:
        q: Search query string (Required)
        num: Number of search results to return (Can only be 10/20/30/40/50, default: 10)

    Returns:
        JSON string containing search results with the following fields:
        - Query: The original search query
        - Pages: Array of search results, each containing title, url, passage, date, and site
    """
    # Check for API credentials
    if not TENCENTCLOUD_SECRET_ID or not TENCENTCLOUD_SECRET_KEY:
        return json.dumps(
            {
                "success": False,
                "error": "TENCENTCLOUD_SECRET_ID or TENCENTCLOUD_SECRET_KEY environment variable not set",
                "results": [],
            },
            ensure_ascii=False,
        )

    # Validate required parameter
    if not q or not q.strip():
        return json.dumps(
            {
                "success": False,
                "error": "Search query 'q' is required and cannot be empty",
                "results": [],
            },
            ensure_ascii=False,
        )

    # Validate num parameter
    if num not in [10, 20, 30, 40, 50]:
        return json.dumps(
            {
                "success": False,
                "error": f"Invalid num value: {num}. Must be one of 10, 20, 30, 40, 50",
                "results": [],
            },
            ensure_ascii=False,
        )

    try:
        # Make the API request
        result = await make_sogou_request(q.strip(), num)

        # Remove RequestId from response
        if "RequestId" in result:
            del result["RequestId"]

        # Process and simplify the Pages field
        pages = []
        if "Pages" in result:
            for page in result["Pages"]:
                # Each entry is itself a JSON-encoded string.
                page_json = json.loads(page)
                new_page = {
                    "title": page_json.get("title", ""),
                    "url": page_json.get("url", ""),
                    "passage": page_json.get("passage", ""),
                    "date": page_json.get("date", ""),
                    "site": page_json.get("site", ""),
                }
                pages.append(new_page)
            result["Pages"] = pages

        # Decode URLs in the response
        result = decode_http_urls_in_dict(result)

        return json.dumps(result, ensure_ascii=False)

    except TencentCloudSDKException as e:
        return json.dumps(
            {
                "success": False,
                "error": f"Tencent Cloud API error: {str(e)}",
                "results": [],
            },
            ensure_ascii=False,
        )
    except Exception as e:
        return json.dumps(
            {
                "success": False,
                "error": f"Unexpected error: {str(e)}",
                "results": [],
            },
            ensure_ascii=False,
        )


if __name__ == "__main__":
    mcp.run()
================================================ FILE: libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/stateless_python_server.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. import os from e2b_code_interpreter import Sandbox from mcp.server.fastmcp import FastMCP # Initialize FastMCP server mcp = FastMCP("stateless-python-server") # API keys E2B_API_KEY = os.environ.get("E2B_API_KEY") # DEFAULT CONFS DEFAULT_TIMEOUT = 300 # seconds @mcp.tool() async def python(code: str) -> str: """Use this tool to execute STATELESS Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files). When you send a message containing python code to python, it will be executed in a stateless docker container, and the stdout of that process will be returned to you. You have to use print statements to access the output. IMPORTANT: Your python environment is not shared between calls. You will have to pass your entire code each time. Args: code: The python code to run. Returns: A string containing the execution result including stdout and stderr. """ sandbox = Sandbox.create( timeout=DEFAULT_TIMEOUT, api_key=E2B_API_KEY, template="1av7fdjfvcparqo8efq6" ) max_attempts = 2 for attempt in range(1, max_attempts + 1): try: execution = sandbox.run_code(code) break except Exception as e: if attempt == max_attempts: raise e execution = sandbox.run_code(code) sandbox.kill() return str(execution) if __name__ == "__main__": mcp.run(transport="stdio") ================================================ FILE: libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/task_planner.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. 
import json import logging import os from datetime import datetime from pathlib import Path from typing import Any, Dict, List from uuid import uuid4 from mcp.server.fastmcp import FastMCP # Configure logging logger = logging.getLogger("miroflow") # Initialize FastMCP server mcp = FastMCP("task_planner") # Configuration TODO_DATA_DIR = os.environ.get("TODO_DATA_DIR", "../../logs/todo_lists") # TASK_ID is required for task isolation # Without TASK_ID, task planner operations will fail TASK_ID = os.environ.get("TASK_ID") if not TASK_ID: raise ValueError( "TASK_ID environment variable is required for task_planner tool. " "This tool must have a unique task identifier to prevent data conflicts in concurrent execution." ) TODO_DATA_FILE = os.path.join(TODO_DATA_DIR, f"todos_{TASK_ID}.json") # Ensure data directory exists Path(TODO_DATA_DIR).mkdir(parents=True, exist_ok=True) def load_todos() -> List[Dict[str, Any]]: """Load task plan from the JSON file.""" if not os.path.exists(TODO_DATA_FILE): return [] try: with open(TODO_DATA_FILE, "r", encoding="utf-8") as f: return json.load(f) except Exception as e: logger.error(f"Failed to load task plan: {str(e)}") return [] def save_todos(todos: List[Dict[str, Any]]) -> bool: """Save task plan to the JSON file.""" try: with open(TODO_DATA_FILE, "w", encoding="utf-8") as f: json.dump(todos, f, ensure_ascii=False, indent=2) return True except Exception as e: logger.error(f"Failed to save task plan: {str(e)}") return False def format_todos_as_markdown(todos: List[Dict[str, Any]], message: str = "") -> str: """ Format task plan as markdown checklist. 
Args: todos: List of task items message: Optional message to display at the top Returns: Markdown formatted string """ # Calculate statistics total = len(todos) completed = sum(1 for t in todos if t.get("completed", False)) pending = total - completed # Build markdown lines = [] if message: lines.append(f"{message}\n") lines.append("# Task Plan\n") lines.append(f"Total: {total} | Pending: {pending} | Completed: {completed}\n") lines.append("") if not todos: lines.append("No tasks planned yet.") else: for todo in todos: checkbox = "[x]" if todo.get("completed", False) else "[ ]" title = todo["title"] todo_id = todo["id"][:8] # Show first 8 chars of ID lines.append(f"- {checkbox} {title} ({todo_id})") return "\n".join(lines) @mcp.tool() async def add_todo(titles: List[str]) -> str: """ Create a task plan by adding one or more task items. CRITICAL: Before starting to work on ANY task, you MUST first create a complete task plan. This is the foundation of effective task execution: - Break down the main goal into clear, actionable steps - Identify all necessary subtasks upfront - Create a roadmap that guides your work - Ensure nothing is overlooked or forgotten Good task planning prevents confusion and ensures systematic progress toward your goal. Args: titles: List of task item titles. For example: - Single task: ["Complete project report"] - Multiple tasks: ["Complete project report", "Fix bug #123", "Update documentation"] - Complex project: ["Research requirements", "Design architecture", "Implement core features", "Write tests", "Document API"] Returns: Markdown formatted string showing the success message and current task plan. """ if not titles: return "❌ Error: Task titles list cannot be empty." # Filter out empty titles title_list = [t.strip() for t in titles if t and t.strip()] if not title_list: return "❌ Error: No valid task titles provided." 
todos = load_todos() added_todos = [] # Add all tasks for title in title_list: new_todo = { "id": str(uuid4()), "title": title, "completed": False, "created_at": datetime.now().isoformat(), } todos.append(new_todo) added_todos.append(title) if not save_todos(todos): return "❌ Error: Failed to save task plan." # Build success message if len(added_todos) == 1: message = f'✅ Task added: "{added_todos[0]}"' else: message = f"✅ Added {len(added_todos)} tasks:\n" + "\n".join( f" - {t}" for t in added_todos ) return format_todos_as_markdown(todos, message) @mcp.tool() async def list_todos() -> str: """ Display the complete task plan with all items and their status. Use this to review your overall progress, see what's done and what remains, and understand where you are in the execution of your plan. Returns: Markdown formatted string showing all tasks with their completion status. """ todos = load_todos() return format_todos_as_markdown(todos) @mcp.tool() async def complete_todo(todo_ids: List[str]) -> str: """ Mark one or more tasks as completed in your plan. Use this after finishing a task to track your progress and maintain an accurate view of what's done and what's remaining. Args: todo_ids: List of task IDs to mark as completed (full ID or first 8 characters). For example: ["a7f3b2c1"] or ["a7f3b2c1", "b8e4c3d2"] Returns: Markdown formatted string showing the success message and updated task plan. """ if not todo_ids: return "❌ Error: Task IDs list cannot be empty." # Filter out empty IDs id_list = [tid.strip() for tid in todo_ids if tid and tid.strip()] if not id_list: return "❌ Error: No valid task IDs provided." 
todos = load_todos() completed_todos = [] not_found_ids = [] # Complete all matching tasks for todo_id in id_list: found = False for todo in todos: if todo["id"] == todo_id or todo["id"].startswith(todo_id): if not todo.get( "completed", False ): # Only mark if not already completed todo["completed"] = True completed_todos.append(todo["title"]) found = True break if not found: not_found_ids.append(todo_id) if not completed_todos and not_found_ids: return f"❌ Error: Task IDs not found: {', '.join(not_found_ids)}" if not save_todos(todos): return "❌ Error: Failed to save changes." # Build success message if len(completed_todos) == 1: message = f'✅ Completed: "{completed_todos[0]}"' else: message = f"✅ Completed {len(completed_todos)} tasks:\n" + "\n".join( f" - {t}" for t in completed_todos ) if not_found_ids: message += f'\n⚠️ Not found: {", ".join(not_found_ids)}' return format_todos_as_markdown(todos, message) @mcp.tool() async def delete_todo(todo_ids: List[str]) -> str: """ Remove one or more tasks from your plan. Use this to adjust your plan when tasks become irrelevant, duplicated, or no longer needed. This helps keep your plan focused and accurate. Args: todo_ids: List of task IDs to remove (full ID or first 8 characters). For example: ["a7f3b2c1"] or ["a7f3b2c1", "b8e4c3d2"] Returns: Markdown formatted string showing the success message and remaining task plan. """ if not todo_ids: return "❌ Error: Task IDs list cannot be empty." # Filter out empty IDs id_list = [tid.strip() for tid in todo_ids if tid and tid.strip()] if not id_list: return "❌ Error: No valid task IDs provided." 
todos = load_todos() deleted_todos = [] not_found_ids = [] ids_to_delete = set() # Find all tasks to delete for todo_id in id_list: found = False for todo in todos: if todo["id"] == todo_id or todo["id"].startswith(todo_id): deleted_todos.append(todo["title"]) ids_to_delete.add(todo["id"]) found = True break if not found: not_found_ids.append(todo_id) if not deleted_todos and not_found_ids: return f"❌ Error: Task IDs not found: {', '.join(not_found_ids)}" # Remove the tasks todos = [t for t in todos if t["id"] not in ids_to_delete] if not save_todos(todos): return "❌ Error: Failed to save changes." # Build success message if len(deleted_todos) == 1: message = f'🗑️ Deleted: "{deleted_todos[0]}"' else: message = f"🗑️ Deleted {len(deleted_todos)} tasks:\n" + "\n".join( f" - {t}" for t in deleted_todos ) if not_found_ids: message += f'\n⚠️ Not found: {", ".join(not_found_ids)}' return format_todos_as_markdown(todos, message) if __name__ == "__main__": mcp.run(transport="stdio") ================================================ FILE: libs/miroflow-tools/src/miroflow_tools/manager.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. import asyncio import functools from typing import Any, Awaitable, Callable, Protocol, TypeVar from mcp import ClientSession, StdioServerParameters # (already imported in config.py) from mcp.client.sse import sse_client from mcp.client.stdio import stdio_client from .mcp_servers.browser_session import PlaywrightSession # logger = logging.getLogger("miroflow_agent") R = TypeVar("R") def with_timeout(timeout_s: float = 300.0): """ Decorator: wraps any *async* function in asyncio.wait_for(). Usage: @with_timeout(20) async def create_message_foo(...): ... 
""" def decorator( func: Callable[..., Awaitable[R]], ) -> Callable[..., Awaitable[R]]: @functools.wraps(func) async def wrapper(*args, **kwargs) -> R: return await asyncio.wait_for(func(*args, **kwargs), timeout=timeout_s) return wrapper return decorator class ToolManagerProtocol(Protocol): """this enables other kinds of tool manager.""" async def get_all_tool_definitions(self) -> Any: ... async def execute_tool_call( self, *, server_name: str, tool_name: str, arguments: dict[str, Any] ) -> Any: ... class ToolManager(ToolManagerProtocol): def __init__(self, server_configs, tool_blacklist=None): """ Initialize ToolManager. :param server_configs: List returned by create_server_parameters() """ self.server_configs = server_configs self.server_dict = { config["name"]: config["params"] for config in server_configs } self.browser_session = None self.tool_blacklist = tool_blacklist if tool_blacklist else set() self.task_log = None def set_task_log(self, task_log): """Set the task logger for structured logging.""" self.task_log = task_log self._log( "info", "ToolManager | Initialization", f"ToolManager initialized, loaded servers: {list(self.server_dict.keys())}", ) def _log(self, level, step_name, message, metadata=None): """Helper method to log using task_log if available, otherwise skip logging.""" if self.task_log: self.task_log.log_step(level, step_name, message, metadata) def _is_huggingface_dataset_or_space_url(self, url): """ Check if the URL is a Hugging Face dataset or space URL. :param url: The URL to check :return: True if it's a HuggingFace dataset or space URL, False otherwise """ if not url: return False return "huggingface.co/datasets" in url or "huggingface.co/spaces" in url def _should_block_hf_scraping(self, tool_name, arguments): """ Check if we should block scraping of Hugging Face datasets/spaces. 
:param tool_name: The name of the tool being called :param arguments: The arguments passed to the tool :return: True if scraping should be blocked, False otherwise """ return ( tool_name in ["scrape", "scrape_website"] and arguments.get("url") and self._is_huggingface_dataset_or_space_url(arguments["url"]) ) def get_server_params(self, server_name): """Get parameters for the specified server""" return self.server_dict.get(server_name) async def get_all_tool_definitions(self): """ Connect to all configured servers and get their tool definitions. Returns a list suitable for passing to the Prompt generator. """ all_servers_for_prompt = [] # Process remote server tools for config in self.server_configs: server_name = config["name"] server_params = config["params"] one_server_for_prompt = {"name": server_name, "tools": []} self._log( "info", "ToolManager | Get Tool Definitions", f"Getting tool definitions for server '{server_name}'...", ) try: if isinstance(server_params, StdioServerParameters): async with stdio_client(server_params) as (read, write): async with ClientSession( read, write, sampling_callback=None ) as session: await session.initialize() tools_response = await session.list_tools() # black list some tools for tool in tools_response.tools: if (server_name, tool.name) in self.tool_blacklist: self._log( "info", "ToolManager | Tool Blacklisted", f"Tool '{tool.name}' in server '{server_name}' is blacklisted, skipping.", ) continue one_server_for_prompt["tools"].append( { "name": tool.name, "description": tool.description, "schema": tool.inputSchema, } ) elif isinstance(server_params, str) and server_params.startswith( ("http://", "https://") ): # SSE endpoint async with sse_client(server_params) as (read, write): async with ClientSession( read, write, sampling_callback=None ) as session: await session.initialize() tools_response = await session.list_tools() for tool in tools_response.tools: # Can add specific tool filtering logic here (if needed) # if 
server_name == "tool-excel" and tool.name not in ["get_workbook_metadata", "read_data_from_excel"]: # continue one_server_for_prompt["tools"].append( { "name": tool.name, "description": tool.description, "schema": tool.inputSchema, } ) else: self._log( "error", "ToolManager | Unknown Parameter Type", f"Error: Unknown parameter type for server '{server_name}': {type(server_params)}", ) raise TypeError( f"Unknown server params type for {server_name}: {type(server_params)}" ) self._log( "info", "ToolManager | Tool Definitions Success", f"Successfully obtained {len(one_server_for_prompt['tools'])} tool definitions from server '{server_name}'.", ) all_servers_for_prompt.append(one_server_for_prompt) except Exception as e: self._log( "error", "ToolManager | Connection Error", f"Error: Unable to connect or get tools from server '{server_name}': {e}", ) # Still add server entry, but mark tool list as empty or include error information one_server_for_prompt["tools"] = [ {"error": f"Unable to fetch tools: {e}"} ] all_servers_for_prompt.append(one_server_for_prompt) return all_servers_for_prompt @with_timeout(1200) async def execute_tool_call(self, server_name, tool_name, arguments) -> Any: """ Execute a single tool call. 
:param server_name: Server name :param tool_name: Tool name :param arguments: Tool arguments dictionary :return: Dictionary containing result or error """ # Original remote server call logic server_params = self.get_server_params(server_name) if not server_params: self._log( "error", "ToolManager | Server Not Found", f"Error: Attempting to call server '{server_name}' not found", ) return { "server_name": server_name, "tool_name": tool_name, "error": f"Server '{server_name}' not found.", } self._log( "info", "ToolManager | Tool Call Start", f"Connecting to server '{server_name}' to call tool '{tool_name}'", metadata={"arguments": arguments}, ) if server_name == "playwright": try: if self.browser_session is None: self.browser_session = PlaywrightSession(server_params) await self.browser_session.connect() tool_result = await self.browser_session.call_tool( tool_name, arguments=arguments ) return { "server_name": server_name, "tool_name": tool_name, "result": tool_result, } except Exception as e: return { "server_name": server_name, "tool_name": tool_name, "error": f"Tool call failed: {str(e)}", } else: try: result_content = None if isinstance(server_params, StdioServerParameters): async with stdio_client(server_params) as (read, write): async with ClientSession( read, write, sampling_callback=None ) as session: await session.initialize() try: tool_result = await session.call_tool( tool_name, arguments=arguments ) result_content = ( tool_result.content[-1].text if tool_result.content else "" ) # post hoc check for browsing agent reading answers from hf datsets if self._should_block_hf_scraping(tool_name, arguments): result_content = "You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose." 
except Exception as tool_error: self._log( "error", "ToolManager | Tool Execution Error", f"Tool execution error: {tool_error}", ) return { "server_name": server_name, "tool_name": tool_name, "error": f"Tool execution failed: {str(tool_error)}", } elif isinstance(server_params, str) and server_params.startswith( ("http://", "https://") ): async with sse_client(server_params) as (read, write): async with ClientSession( read, write, sampling_callback=None ) as session: await session.initialize() try: tool_result = await session.call_tool( tool_name, arguments=arguments ) result_content = ( tool_result.content[-1].text if tool_result.content else "" ) # post hoc check for browsing agent reading answers from hf datsets if self._should_block_hf_scraping(tool_name, arguments): result_content = "You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose." except Exception as tool_error: self._log( "error", "ToolManager | Tool Execution Error", f"Tool execution error: {tool_error}", ) return { "server_name": server_name, "tool_name": tool_name, "error": f"Tool execution failed: {str(tool_error)}", } else: raise TypeError( f"Unknown server params type for {server_name}: {type(server_params)}" ) self._log( "info", "ToolManager | Tool Call Success", f"Tool '{tool_name}' (server: '{server_name}') called successfully.", ) return { "server_name": server_name, "tool_name": tool_name, "result": result_content, # Return extracted text content } except Exception as outer_e: # Rename this to outer_e to avoid shadowing self._log( "error", "ToolManager | Tool Call Failed", f"Error: Failed to call tool '{tool_name}' (server: '{server_name}'): {outer_e}", ) # Store the original error message for later use error_message = str(outer_e) if ( tool_name in ["scrape", "scrape_website"] and "unhandled errors" in error_message and "url" in arguments and arguments["url"] is not None ): try: self._log( "info", "ToolManager | Fallback Attempt", 
"Attempting fallback using MarkItDown...", ) from markitdown import MarkItDown md = MarkItDown( docintel_endpoint="" ) result = md.convert(arguments["url"]) self._log( "info", "ToolManager | Fallback Success", "MarkItDown fallback successful", ) return { "server_name": server_name, "tool_name": tool_name, "result": result.text_content, # Return extracted text content } except ( Exception ) as inner_e: # Use a different name to avoid shadowing # Log the inner exception if needed self._log( "error", "ToolManager | Fallback Failed", f"Fallback also failed: {inner_e}", ) # No need for pass here as we'll continue to the return statement # Always use the outer exception for the final error response return { "server_name": server_name, "tool_name": tool_name, "error": f"Tool call failed: {error_message}", } ================================================ FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/__init__.py ================================================ ================================================ FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/audio_mcp_server.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. import asyncio import base64 import contextlib import mimetypes import os import tempfile import wave from urllib.parse import urlparse import requests from fastmcp import FastMCP from mutagen import File as MutagenFile from openai import OpenAI OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1") # Initialize FastMCP server mcp = FastMCP("audio-mcp-server") def _get_audio_extension(url: str, content_type: str = None) -> str: """ Determine the appropriate audio file extension from URL or content type. 
Args: url: The URL of the audio file content_type: The content type from HTTP headers Returns: File extension (with dot) to use for temporary file """ # First try to get extension from URL parsed_url = urlparse(url) path = parsed_url.path.lower() # Common audio extensions audio_extensions = [".mp3", ".wav", ".m4a", ".aac", ".ogg", ".flac", ".wma"] for ext in audio_extensions: if path.endswith(ext): return ext # If no extension found in URL, try content type if content_type: content_type = content_type.lower() if "mp3" in content_type or "mpeg" in content_type: return ".mp3" elif "wav" in content_type: return ".wav" elif "m4a" in content_type: return ".m4a" elif "aac" in content_type: return ".aac" elif "ogg" in content_type: return ".ogg" elif "flac" in content_type: return ".flac" # Default fallback to mp3 return ".mp3" def _get_audio_duration(audio_path: str) -> float: """ Get audio duration in seconds. Tries to use wave (for .wav), then falls back to mutagen (for mp3, etc). Returns 0.0 if duration cannot be determined. 
""" # Try using wave for .wav files try: with contextlib.closing(wave.open(audio_path, "rb")) as f: frames = f.getnframes() rate = f.getframerate() duration = frames / float(rate) if duration > 0: return duration except Exception: pass # Not a wav file or failed # Try using mutagen for other audio formats (mp3, etc) try: audio = MutagenFile(audio_path) if ( audio is not None and hasattr(audio, "info") and hasattr(audio.info, "length") ): duration = float(audio.info.length) if duration > 0: return duration except Exception: pass # Failed to get duration # Return 0.0 if all methods failed return 0.0 def _encode_audio_file(audio_path: str) -> tuple[str, str]: """Encode audio file to base64 and determine format.""" with open(audio_path, "rb") as audio_file: audio_data = audio_file.read() encoded_string = base64.b64encode(audio_data).decode("utf-8") # Determine file format from file extension mime_type, _ = mimetypes.guess_type(audio_path) if mime_type and mime_type.startswith("audio/"): mime_format = mime_type.split("/")[-1] # Map MIME type formats to OpenAI supported formats format_mapping = { "mpeg": "mp3", # audio/mpeg -> mp3 "wav": "wav", # audio/wav -> wav "wave": "wav", # audio/wave -> wav } file_format = format_mapping.get(mime_format, "mp3") else: # Default to mp3 if we can't determine file_format = "mp3" return encoded_string, file_format @mcp.tool() async def audio_transcription(audio_path_or_url: str) -> str: """ Transcribe audio file to text and return the transcription. Args: audio_path_or_url: The path of the audio file locally or its URL. Path from sandbox is not supported. YouTube URL is not supported. Returns: The transcription of the audio file. 
""" max_retries = 3 retry = 0 transcription = None # Create client once outside the retry loop client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL) while retry < max_retries: try: if os.path.exists(audio_path_or_url): # Check if the file exists locally with open(audio_path_or_url, "rb") as audio_file: transcription = client.audio.transcriptions.create( model="gpt-4o-transcribe", file=audio_file ) elif "home/user" in audio_path_or_url: return "[ERROR]: The audio_transcription tool cannot access to sandbox file, please use the local path provided by original instruction" else: # download the audio file from the URL response = requests.get(audio_path_or_url) response.raise_for_status() # Raise an exception for bad status codes # Basic content validation - check if response has content if not response.content: return ( "[ERROR]: Audio transcription failed: Downloaded file is empty" ) # Check content type if available content_type = response.headers.get("content-type", "").lower() # Get proper extension for the temporary file file_extension = _get_audio_extension(audio_path_or_url, content_type) # Use proper temporary file handling with correct extension with tempfile.NamedTemporaryFile( delete=False, suffix=file_extension ) as temp_file: temp_file.write(response.content) temp_audio_path = temp_file.name try: with open(temp_audio_path, "rb") as audio_file: transcription = client.audio.transcriptions.create( model="gpt-4o-transcribe", file=audio_file ) finally: # Clean up the temp file if os.path.exists(temp_audio_path): os.remove(temp_audio_path) break except requests.RequestException as e: retry += 1 if retry >= max_retries: return f"[ERROR]: Audio transcription failed: Failed to download audio file - {e}.\nNote: Files from sandbox are not available. You should use local path given in the instruction. \nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. 
The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported." await asyncio.sleep(5 * (2**retry)) except Exception as e: retry += 1 if retry >= max_retries: return f"[ERROR]: Audio transcription failed: {e}\nNote: Files from sandbox are not available. You should use local path given in the instruction. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported." await asyncio.sleep(5 * (2**retry)) return transcription.text @mcp.tool() async def audio_question_answering(audio_path_or_url: str, question: str) -> str: """ Answer the question based on the given audio information. Args: audio_path_or_url: The path of the audio file locally or its URL. Path from sandbox is not supported. YouTube URL is not supported. question: The question to answer. Returns: The answer to the question, and the duration of the audio file. """ max_retries = 3 retry = 0 # Create client once outside the retry loop client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL) # Initialize variables to avoid scope issues encoded_string = None file_format = None duration = 0.0 while retry < max_retries: try: text_prompt = f"""Answer the following question based on the given \ audio information:\n\n{question}""" if os.path.exists(audio_path_or_url): # Check if the file exists locally encoded_string, file_format = _encode_audio_file(audio_path_or_url) duration = _get_audio_duration(audio_path_or_url) elif "home/user" in audio_path_or_url: return "[ERROR]: The audio_question_answering tool cannot access to sandbox file, please use the local path provided by original instruction" else: # download the audio file from the URL response = requests.get( audio_path_or_url, headers={ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" }, ) response.raise_for_status() # Raise an exception for bad status codes # Basic 
content validation - check if response has content if not response.content: return "[ERROR]: Audio question answering failed: Downloaded file is empty.\nNote: Files from sandbox are not available. You should use local path given in the instruction. \nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3.\nNote: YouTube video URL is not supported." # Check content type if available content_type = response.headers.get("content-type", "").lower() # Get proper extension for the temporary file file_extension = _get_audio_extension(audio_path_or_url, content_type) # Use proper temporary file handling with correct extension with tempfile.NamedTemporaryFile( delete=False, suffix=file_extension ) as temp_file: temp_file.write(response.content) temp_audio_path = temp_file.name try: encoded_string, file_format = _encode_audio_file(temp_audio_path) duration = _get_audio_duration(temp_audio_path) finally: # Clean up the temp file if os.path.exists(temp_audio_path): os.remove(temp_audio_path) if encoded_string is None or file_format is None: return "[ERROR]: Audio question answering failed: Failed to encode audio file.\nNote: Files from sandbox are not available. You should use local path given in the instruction. \nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3.\nNote: YouTube video URL is not supported." 
response = client.chat.completions.create( model="gpt-4o-audio-preview", messages=[ { "role": "system", "content": "You are a helpful assistant specializing in audio analysis.", }, { "role": "user", "content": [ {"type": "text", "text": text_prompt}, { "type": "input_audio", "input_audio": { "data": encoded_string, "format": file_format, }, }, ], }, ], ) # If we reach here, the API call was successful break except requests.RequestException as e: retry += 1 if retry >= max_retries: return f"[ERROR]: Audio question answering failed: Failed to download audio file - {e}.\nNote: Files from sandbox are not available. You should use local path given in the instruction. \nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported." await asyncio.sleep(5 * (2**retry)) except Exception as e: retry += 1 if retry >= max_retries: return f"[ERROR]: Audio question answering failed when calling OpenAI API: {e}\nNote: Files from sandbox are not available. You should use local path given in the instruction. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported." await asyncio.sleep(5 * (2**retry)) response_text = response.choices[0].message.content response_text += f"\n\nAudio duration: {duration} seconds" return response_text if __name__ == "__main__": mcp.run(transport="stdio") ================================================ FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/audio_mcp_server_os.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. 
import asyncio import base64 import contextlib import mimetypes import os import tempfile import wave from urllib.parse import urlparse import requests from fastmcp import FastMCP from mutagen import File as MutagenFile from openai import OpenAI WHISPER_API_KEY = os.environ.get("WHISPER_API_KEY") WHISPER_BASE_URL = os.environ.get("WHISPER_BASE_URL") WHISPER_MODEL_NAME = os.environ.get("WHISPER_MODEL_NAME") # Initialize FastMCP server mcp = FastMCP("audio-mcp-server-os") def _get_audio_extension(url: str, content_type: str = None) -> str: """ Determine the appropriate audio file extension from URL or content type. Args: url: The URL of the audio file content_type: The content type from HTTP headers Returns: File extension (with dot) to use for temporary file """ # First try to get extension from URL parsed_url = urlparse(url) path = parsed_url.path.lower() # Common audio extensions audio_extensions = [".mp3", ".wav", ".m4a", ".aac", ".ogg", ".flac", ".wma"] for ext in audio_extensions: if path.endswith(ext): return ext # If no extension found in URL, try content type if content_type: content_type = content_type.lower() if "mp3" in content_type or "mpeg" in content_type: return ".mp3" elif "wav" in content_type: return ".wav" elif "m4a" in content_type: return ".m4a" elif "aac" in content_type: return ".aac" elif "ogg" in content_type: return ".ogg" elif "flac" in content_type: return ".flac" # Default fallback to mp3 return ".mp3" def _get_audio_duration(audio_path: str) -> float: """ Get audio duration in seconds. Tries to use wave (for .wav), then falls back to mutagen (for mp3, etc). 
""" # Try using wave for .wav files try: with contextlib.closing(wave.open(audio_path, "rb")) as f: frames = f.getnframes() rate = f.getframerate() duration = frames / float(rate) if duration > 0: return duration except Exception: pass # Not a wav file or failed # Try using mutagen for other audio formats (mp3, etc) try: audio = MutagenFile(audio_path) if ( audio is not None and hasattr(audio, "info") and hasattr(audio.info, "length") ): duration = float(audio.info.length) if duration > 0: return duration except Exception as e: return f"[ERROR]: Failed to get audio duration: {e}" def _encode_audio_file(audio_path: str) -> tuple[str, str]: """Encode audio file to base64 and determine format.""" with open(audio_path, "rb") as audio_file: audio_data = audio_file.read() encoded_string = base64.b64encode(audio_data).decode("utf-8") # Determine file format from file extension mime_type, _ = mimetypes.guess_type(audio_path) if mime_type and mime_type.startswith("audio/"): mime_format = mime_type.split("/")[-1] # Map MIME type formats to OpenAI supported formats format_mapping = { "mpeg": "mp3", # audio/mpeg -> mp3 "wav": "wav", # audio/wav -> wav "wave": "wav", # audio/wave -> wav } file_format = format_mapping.get(mime_format, "mp3") else: # Default to mp3 if we can't determine file_format = "mp3" return encoded_string, file_format @mcp.tool() async def audio_transcription(audio_path_or_url: str) -> str: """ Transcribe audio file to text and return the transcription. Args: audio_path_or_url: The path of the audio file locally or its URL. Path from sandbox is not supported. YouTube URL is not supported. Returns: The transcription of the audio file. 
""" max_retries = 3 retry = 0 transcription = None while retry < max_retries: try: client = OpenAI(base_url=WHISPER_BASE_URL, api_key=WHISPER_API_KEY) if os.path.exists(audio_path_or_url): # Check if the file exists locally with open(audio_path_or_url, "rb") as audio_file: transcription = client.audio.transcriptions.create( model=WHISPER_MODEL_NAME, file=audio_file ) elif "home/user" in audio_path_or_url: return "[ERROR]: The audio_transcription tool cannot access to sandbox file, please use the local path provided by original instruction" else: # download the audio file from the URL response = requests.get(audio_path_or_url) response.raise_for_status() # Raise an exception for bad status codes # Basic content validation - check if response has content if not response.content: return ( "[ERROR]: Audio transcription failed: Downloaded file is empty" ) # Check content type if available content_type = response.headers.get("content-type", "").lower() if content_type and not any( media_type in content_type for media_type in ["audio", "video", "application/octet-stream"] ): return f"[ERROR]: Audio transcription failed: Invalid content type '{content_type}'. Expected audio file." # Get proper extension for the temporary file file_extension = _get_audio_extension(audio_path_or_url, content_type) # Use proper temporary file handling with correct extension with tempfile.NamedTemporaryFile( delete=False, suffix=file_extension ) as temp_file: temp_file.write(response.content) temp_audio_path = temp_file.name try: with open(temp_audio_path, "rb") as audio_file: transcription = client.audio.transcriptions.create( model=WHISPER_MODEL_NAME, file=audio_file ) finally: # Clean up the temp file if os.path.exists(temp_audio_path): os.remove(temp_audio_path) break except requests.RequestException as e: retry += 1 if retry >= max_retries: return f"[ERROR]: Audio transcription failed: Failed to download audio file - {e}.\nNote: Files from sandbox are not available. 
You should use local path given in the instruction. \nURLs must include the proper scheme (e.g., 'https://') and be publicly accessible. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported." await asyncio.sleep(5 * (2**retry)) except Exception as e: retry += 1 if retry >= max_retries: return f"[ERROR]: Audio transcription failed: {e}\nNote: Files from sandbox are not available. You should use local path given in the instruction. The file should be in a common audio format such as MP3, WAV, or M4A.\nNote: YouTube video URL is not supported." await asyncio.sleep(5 * (2**retry)) return transcription.text if __name__ == "__main__": mcp.run(transport="stdio") ================================================ FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/browser_session.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. import asyncio import json import logging from mcp import StdioServerParameters from mcp.client.session import ClientSession from mcp.client.sse import sse_client from mcp.client.stdio import stdio_client logger = logging.getLogger("miroflow") class PlaywrightSession: """Class to maintain a persistent Playwright MCP session.""" def __init__(self, server_params): self.server_params = server_params self.read = None self.write = None self.session = None self._client = None async def connect(self): """Connect to the MCP server and initialize the session.""" if self.session is None: if isinstance(self.server_params, StdioServerParameters): self._client = stdio_client(self.server_params) else: self._client = sse_client(self.server_params) self.read, self.write = await self._client.__aenter__() self.session = ClientSession(self.read, self.write, sampling_callback=None) await self.session.__aenter__() await self.session.initialize() logger.info("Connected to MCP server and initialized session") async def 
call_tool(self, tool_name, arguments=None): """Call a tool while maintaining the session.""" if self.session is None: await self.connect() logger.info(f"Calling tool '{tool_name}'") tool_result = await self.session.call_tool(tool_name, arguments=arguments) result_content = tool_result.content[0].text if tool_result.content else "" return result_content async def close(self): """Close the session and connection.""" if self.session: await self.session.__aexit__(None, None, None) self.session = None if self._client: await self._client.__aexit__(None, None, None) self._client = None self.read = None self.write = None logger.info("Closed MCP session") # Example usage: async def test_persistent_session(): # Create a persistent session mcp_session = PlaywrightSession("http://localhost:8931") try: # First call: Navigate to a website await mcp_session.call_tool("browser_navigate", {"url": "https://example.com"}) logger.info("Navigation complete") # Wait a moment for the page to load await asyncio.sleep(2) # Second call: Take a snapshot of the current page snapshot_result = await mcp_session.call_tool("browser_snapshot", {}) # Process and save the snapshot snapshot_json = json.loads(snapshot_result) logger.info(f"Snapshot taken of page: {snapshot_json.get('url')}") logger.info(f"Page title: {snapshot_json.get('title')}") with open("snapshot.json", "w") as f: json.dump(snapshot_json, f, indent=2, ensure_ascii=False) logger.info("Snapshot saved to snapshot.json") finally: # Close the session when done with all tool calls await mcp_session.close() if __name__ == "__main__": asyncio.run(test_persistent_session()) ================================================ FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/python_mcp_server.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. 
import asyncio import os import shlex from urllib.parse import urlparse from e2b_code_interpreter import Sandbox from fastmcp import FastMCP # Initialize FastMCP server mcp = FastMCP("e2b-python-interpreter") # API keys E2B_API_KEY = os.environ.get("E2B_API_KEY") LOGS_DIR = os.environ.get( "LOGS_DIR", "../../logs" ) # Directory where benchmark logs are stored # DEFAULT TEMPLATE ID DEFAULT_TEMPLATE_ID = "1av7fdjfvcparqo8efq6" # DEFAULT CONFS DEFAULT_TIMEOUT = 600 # seconds # Maximum number of tokens that can be returned by the Python tool MAX_RESULT_LEN = 20_000 # Maximum number of tokens allowed in an error message MAX_ERROR_LEN = 4_000 # Invalid sandbox IDs that are not allowed to be used INVALID_SANDBOX_IDS = { "default", "sandbox1", "sandbox", "some_id", "new_sandbox", "python", "create_sandbox", "sandbox123", "temp", "sandbox-0", "sandbox-1", "sandbox_0", "sandbox_1", "new", "0", "auto", "default_sandbox", "none", "sandbox_12345", "dummy", "sandbox_01", } def looks_like_dir(path: str) -> bool: """ Return True if the given path either: - exists and is a directory, OR - does not exist but looks like a directory (e.g., ends with '/', or has no file extension) """ # If it exists, trust the filesystem if os.path.isdir(path): return True # If it ends with '/' or has no extension, treat as directory if path.endswith(os.path.sep) or not os.path.splitext(path)[1]: return True return False def truncate_result(result: str) -> str: """ Truncate result to MAX_RESULT_LEN. Args: result: The full result string to potentially truncate Returns: Truncated result string """ if len(result) > MAX_RESULT_LEN: result = result[:MAX_RESULT_LEN] + " [Result truncated due to length limit]" return result @mcp.tool() async def create_sandbox(timeout: int = DEFAULT_TIMEOUT) -> str: """Create a linux sandbox. Args: timeout: Time in seconds before the sandbox is automatically shutdown. The default is 600 seconds. Returns: The sandbox_id of the newly created sandbox. 
You should use this sandbox_id to run other tools in the sandbox. """ max_retries = 5 timeout = min(timeout, DEFAULT_TIMEOUT) for attempt in range(1, max_retries + 1): sandbox = None try: sandbox = Sandbox( template=DEFAULT_TEMPLATE_ID, timeout=timeout, api_key=E2B_API_KEY, ) info = sandbox.get_info() tmpfiles_dir = os.path.join(LOGS_DIR, "tmpfiles") os.makedirs(tmpfiles_dir, exist_ok=True) return f"Sandbox created with sandbox_id: {info.sandbox_id}" except Exception as e: if attempt == max_retries: error_details = str(e)[:MAX_ERROR_LEN] return f"[ERROR]: Failed to create sandbox after {max_retries} attempts: {error_details}, please retry later." await asyncio.sleep(attempt**2) # Exponential backoff finally: # Set timeout before exit to prevent timeout after function exits try: sandbox.set_timeout(timeout) except Exception: pass # Ignore timeout setting errors @mcp.tool() async def run_command(command: str, sandbox_id: str) -> str: """Execute a lightweight shell command in the linux sandbox (no long-running, blocking, or resource-heavy processes). Args: command: The command to execute. sandbox_id: The id of the sandbox to execute the command in. To create a new sandbox, use tool `create_sandbox`. Returns: A CommandResult object containing the result of the command execution, format like CommandResult(stderr=..., stdout=..., exit_code=..., error=...) """ if sandbox_id in INVALID_SANDBOX_IDS: return f"[ERROR]: '{sandbox_id}' is not a valid sandbox_id. Please create a real sandbox first using the create_sandbox tool." try: sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY) except Exception: return f"[ERROR]: Failed to connect to sandbox {sandbox_id}. Make sure the sandbox is created and the sandbox_id is correct." 
max_retries = 3 for attempt in range(1, max_retries + 1): try: sandbox.set_timeout( DEFAULT_TIMEOUT ) # refresh the timeout for each command execution result = sandbox.commands.run(command) result_str = str(result) return truncate_result(result_str) except Exception as e: if attempt == max_retries: # Build error message error_details = str(e)[:MAX_ERROR_LEN] error_msg = f"[ERROR]: Failed to run command after {max_retries} attempts.\n\nException type: {type(e).__name__}\nDetails: {error_details}" return error_msg await asyncio.sleep(attempt**2) # Exponential backoff finally: # Set timeout before exit to prevent timeout after function exits try: sandbox.set_timeout(DEFAULT_TIMEOUT) except Exception: pass # Ignore timeout setting errors @mcp.tool() async def run_python_code(code_block: str, sandbox_id: str) -> str: """Run short, safe python code in a sandbox and return the execution result (avoid long loops or heavy tasks; must finish quickly). Args: code_block: The python code to run. sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. To create a new sandbox, use tool `create_sandbox`. Returns: A CommandResult object containing the result of the command execution, format like CommandResult(stderr=..., stdout=..., exit_code=..., error=...) """ # If sandbox_id is invalid, fallback to stateless execution if not sandbox_id or sandbox_id in INVALID_SANDBOX_IDS: try: sandbox = Sandbox( template=DEFAULT_TEMPLATE_ID, timeout=DEFAULT_TIMEOUT, api_key=E2B_API_KEY, ) try: execution = sandbox.run_code(code_block) return truncate_result(str(execution)) finally: sandbox.kill() except Exception as e: error_details = str(e)[:MAX_ERROR_LEN] return f"[ERROR]: Failed to run code in stateless mode. Exception type: {type(e).__name__}, Details: {error_details}" try: sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY) except Exception: return f"[ERROR]: Failed to connect to sandbox {sandbox_id}. 
Make sure the sandbox is created and the sandbox_id is correct." max_retries = 3 for attempt in range(1, max_retries + 1): try: sandbox.set_timeout( DEFAULT_TIMEOUT ) # refresh the timeout for each command execution execution = sandbox.run_code(code_block) result_str = str(execution) return truncate_result(result_str) except Exception as e: if attempt == max_retries: error_details = str(e)[:MAX_ERROR_LEN] error_msg = f"[ERROR]: Failed to run code in sandbox {sandbox_id} after {max_retries} attempts. Exception type: {type(e).__name__}, Details: {error_details}" return error_msg await asyncio.sleep(attempt**2) # Exponential backoff finally: # Set timeout before exit to prevent timeout after function exits try: sandbox.set_timeout(DEFAULT_TIMEOUT) except Exception: pass # Ignore timeout setting errors @mcp.tool() async def upload_file_from_local_to_sandbox( sandbox_id: str, local_file_path: str, sandbox_file_path: str = "/home/user" ) -> str: """Upload a local file to the `/home/user` dir of the remote python interpreter. Args: sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. To create a new sandbox, use tool `create_sandbox`. local_file_path: The path of the file on local machine to upload. sandbox_file_path: The path of directory to upload the file to in the sandbox. Default is `/home/user/`. Returns: The path of the uploaded file in the remote python interpreter if the upload is successful. """ if sandbox_id in INVALID_SANDBOX_IDS: return f"[ERROR]: '{sandbox_id}' is not a valid sandbox_id. Please create a real sandbox first using the create_sandbox tool." try: sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY) except Exception: return f"[ERROR]: Failed to connect to sandbox {sandbox_id}. Make sure the sandbox is created and the sandbox_id is correct." 
try: sandbox.set_timeout( DEFAULT_TIMEOUT ) # refresh the timeout for each command execution # Check if local file exists and is readable if not os.path.exists(local_file_path): return f"[ERROR]: Local file does not exist: {local_file_path}" if not os.path.isfile(local_file_path): return f"[ERROR]: Path is not a file: {local_file_path}" # Get the uploaded file path uploaded_file_path = os.path.join( sandbox_file_path, os.path.basename(local_file_path) ) # Normalize the path uploaded_file_path = os.path.normpath(uploaded_file_path) # Ensure the parent directory exists in sandbox parent_dir = os.path.dirname(uploaded_file_path) if parent_dir and parent_dir != "/": mkdir_result = sandbox.commands.run(f"mkdir -p {shlex.quote(parent_dir)}") if mkdir_result.exit_code != 0: mkdir_result_str = str(mkdir_result)[:MAX_ERROR_LEN] return f"[ERROR]: Failed to create directory {parent_dir} in sandbox {sandbox_id}: {mkdir_result_str}" # Upload the file with open(local_file_path, "rb") as f: sandbox.files.write(uploaded_file_path, f) return f"File uploaded to {uploaded_file_path}" except Exception as e: error_details = str(e)[:MAX_ERROR_LEN] return f"[ERROR]: Failed to upload file {local_file_path} to sandbox {sandbox_id}: {error_details}" finally: # Set timeout before exit to prevent timeout after function exits try: sandbox.set_timeout(DEFAULT_TIMEOUT) except Exception: pass # Ignore timeout setting errors @mcp.tool() async def download_file_from_internet_to_sandbox( sandbox_id: str, url: str, sandbox_file_path: str = "/home/user" ) -> str: """Download a file from the internet to the `/home/user` dir of the sandbox (avoid large or slow URLs). Args: sandbox_id: The id of the sandbox to run the code in. Reuse existing sandboxes whenever possible. To create a new sandbox, use tool `create_sandbox`. url: The URL of the file to download. sandbox_file_path: The path of directory to download the file to in the sandbox. Default is `/home/user/`. 
Returns: The path of the downloaded file in the sandbox if the download is successful. """ if sandbox_id in INVALID_SANDBOX_IDS: return f"[ERROR]: '{sandbox_id}' is not a valid sandbox_id. Please create a real sandbox first using the create_sandbox tool." try: sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY) except Exception: return f"[ERROR]: Failed to connect to sandbox {sandbox_id}. Make sure the sandbox is created and the sandbox_id is correct." try: sandbox.set_timeout( DEFAULT_TIMEOUT ) # refresh the timeout for each command execution # Extract basename from URL properly (handle query parameters) parsed_url = urlparse(url) basename = os.path.basename(parsed_url.path) or "downloaded_file" # Remove any query parameters or fragments from basename if "?" in basename: basename = basename.split("?")[0] if "#" in basename: basename = basename.split("#")[0] # Check whether sandbox_file_path looks like a directory if looks_like_dir(sandbox_file_path): # It's a directory — join with the filename downloaded_file_path = os.path.join(sandbox_file_path, basename) else: # It's a file path — use it directly downloaded_file_path = sandbox_file_path # Normalize the path downloaded_file_path = os.path.normpath(downloaded_file_path) # Ensure the parent directory exists in sandbox parent_dir = os.path.dirname(downloaded_file_path) if parent_dir and parent_dir != "/": mkdir_result = sandbox.commands.run(f"mkdir -p {shlex.quote(parent_dir)}") if mkdir_result.exit_code != 0: mkdir_result_str = str(mkdir_result)[:MAX_ERROR_LEN] return f"[ERROR]: Failed to create directory {parent_dir} in sandbox {sandbox_id}: {mkdir_result_str}" # Download the file with retry logic max_retries = 3 for attempt in range(1, max_retries + 1): safe_url = shlex.quote(url) safe_path = shlex.quote(downloaded_file_path) cmd = f"wget {safe_url} -O {safe_path}" try: result = sandbox.commands.run(cmd) if result.exit_code == 0: return f"File downloaded to {safe_path}" elif attempt < max_retries: await 
asyncio.sleep(4**attempt) continue # Retry else: # Extract detailed error information error_details = "" if hasattr(result, "stderr") and result.stderr: error_details = f"stderr: {result.stderr}"[:MAX_ERROR_LEN] error_msg = ( f"[ERROR]: Failed to download file from {url} to {downloaded_file_path} after {max_retries} attempts.\n\n" f"exit_code: {result.exit_code}\n\n" f"Details: {error_details}" ) return error_msg except Exception as e: if attempt == max_retries: error_details = str(e)[:MAX_ERROR_LEN] error_msg = f"[ERROR]: Failed to download file from {url} to {downloaded_file_path}. Exception: {error_details}" return error_msg await asyncio.sleep(4**attempt) except Exception as e: error_details = str(e)[:MAX_ERROR_LEN] return f"[ERROR]: Failed to download file from {url}: {error_details}" finally: # Set timeout before exit to prevent timeout after function exits try: sandbox.set_timeout(DEFAULT_TIMEOUT) except Exception: pass # Ignore timeout setting errors @mcp.tool() async def download_file_from_sandbox_to_local( sandbox_id: str, sandbox_file_path: str, local_filename: str = None ) -> str: """Download a file from the sandbox to local system. Files in sandbox cannot be processed by tools from other servers - only local files and internet URLs can be processed by them. Args: sandbox_id: The id of the sandbox to download the file from. To have a sandbox, use tool `create_sandbox`. sandbox_file_path: The path of the file to download on the sandbox. local_filename: Optional filename to save as. If not provided, uses the original filename from sandbox_file_path. Returns: The local path of the downloaded file if successful, otherwise error message. """ if sandbox_id in INVALID_SANDBOX_IDS: return f"[ERROR]: '{sandbox_id}' is not a valid sandbox_id. Please create a real sandbox first using the create_sandbox tool." try: sandbox = Sandbox.connect(sandbox_id, api_key=E2B_API_KEY) except Exception: return f"[ERROR]: Failed to connect to sandbox {sandbox_id}. 
Make sure the sandbox is created and the sandbox_id is correct." try: sandbox.set_timeout( DEFAULT_TIMEOUT ) # refresh the timeout for each command execution # Create tmpfiles directory if it doesn't exist if not LOGS_DIR: return "[ERROR]: LOGS_DIR environment variable is not set. Cannot determine where to save the file." tmpfiles_dir = os.path.join(LOGS_DIR, "tmpfiles") os.makedirs(tmpfiles_dir, exist_ok=True) # Check if the path is a directory (before attempting to read) check_result = sandbox.commands.run( f'test -d {shlex.quote(sandbox_file_path)} && echo "is_directory" || echo "not_directory"' ) if check_result.stdout and "is_directory" in check_result.stdout: return f"[ERROR]: Cannot download '{sandbox_file_path}' from sandbox {sandbox_id}: path is a directory, not a file." # Check if the file exists check_file_result = sandbox.commands.run( f'test -f {shlex.quote(sandbox_file_path)} && echo "exists" || echo "not_exists"' ) if check_file_result.stdout and "not_exists" in check_file_result.stdout: # Check if it exists at all (might be a symlink or other type) check_any_result = sandbox.commands.run( f'test -e {shlex.quote(sandbox_file_path)} && echo "exists" || echo "not_exists"' ) if check_any_result.stdout and "not_exists" in check_any_result.stdout: error_msg = f"[ERROR]: Cannot download '{sandbox_file_path}' from sandbox {sandbox_id}: file does not exist." 
return error_msg # Determine local filename if local_filename is None or local_filename.strip() == "": local_filename = os.path.basename(sandbox_file_path) # If basename is empty or just '/', use a default name if not local_filename or local_filename == "/": local_filename = "downloaded_file" local_file_path = os.path.join( tmpfiles_dir, f"sandbox_{sandbox_id}_{local_filename}" ) # Download the file try: with open(local_file_path, "wb") as f: content = sandbox.files.read(sandbox_file_path, format="bytes") f.write(content) except Exception as read_error: error_msg = str(read_error).lower() if "directory" in error_msg or "is a directory" in error_msg: return f"[ERROR]: Cannot download '{sandbox_file_path}' from sandbox {sandbox_id}: path is a directory, not a file." else: read_error_details = str(read_error)[:MAX_ERROR_LEN] return f"[ERROR]: Failed to read file '{sandbox_file_path}' from sandbox {sandbox_id}: {read_error_details}" return f"File downloaded successfully to: {local_file_path}" except Exception as e: error_details = str(e)[:MAX_ERROR_LEN] return f"[ERROR]: Failed to download file '{sandbox_file_path}' from sandbox {sandbox_id}: {error_details}" finally: # Set timeout before exit to prevent timeout after function exits try: sandbox.set_timeout(DEFAULT_TIMEOUT) except Exception: pass # Ignore timeout setting errors if __name__ == "__main__": mcp.run(transport="stdio") ================================================ FILE: libs/miroflow-tools/src/miroflow_tools/mcp_servers/reading_mcp_server.py ================================================ # Copyright (c) 2025 MiroMind # This source code is licensed under the Apache 2.0 License. 
@mcp.tool()
async def convert_to_markdown(uri: str) -> str:
    """Convert various types of resources (doc, ppt, pdf, excel, csv, zip file etc.)
    described by an file: or data: URI to markdown.

    Args:
        uri: Required. The URI of the resource to convert. Need to start with 'file:' or 'data:' schemes.

    Returns:
        str: The converted markdown content, or an error message if conversion fails.
    """
    # NOTE(review): the docstring wording is left untouched because FastMCP is
    # expected to publish it as the tool description - confirm before editing.
    # It names only 'file:'/'data:' while the check below also lets 'http:' and
    # 'https:' through; confirm which of the two is intended.

    # Guard: a missing or whitespace-only URI is rejected before any work is done.
    if not (uri and uri.strip()):
        return "Error: URI parameter is required and cannot be empty."

    # Scheme allow-list; the comparison is case-insensitive.
    valid_schemes = ["http:", "https:", "file:", "data:"]
    if not uri.lower().startswith(tuple(valid_schemes)):
        supported = ", ".join(valid_schemes)
        return f"Error: Invalid URI scheme. Supported schemes are: {supported}"

    # The conversion is delegated to a markitdown-mcp child process started with
    # the interpreter that is running this server.
    markitdown_server = StdioServerParameters(
        command=sys.executable,
        args=["-m", "markitdown_mcp"],
    )

    markdown = ""
    try:
        async with stdio_client(markitdown_server) as (reader, writer):
            async with ClientSession(
                reader, writer, sampling_callback=None
            ) as session:
                await session.initialize()
                try:
                    outcome = await session.call_tool(
                        "convert_to_markdown", arguments={"uri": uri}
                    )
                    # Only the last content item is used; no content means "".
                    if outcome.content:
                        markdown = outcome.content[-1].text
                    else:
                        markdown = ""
                except Exception as tool_error:
                    # Failure of the remote tool call itself (or of reading its result).
                    logger.info(f"Tool execution error: {tool_error}")
                    return f"Error: Tool execution failed: {str(tool_error)}"
    except Exception as session_error:
        # Failure while starting, initialising or closing the child session.
        logger.info(f"Session error: {session_error}")
        return (
            f"Error: Failed to connect to markitdown-mcp server: {str(session_error)}"
        )

    return markdown
@mcp.tool()
async def reasoning(question: str) -> str:
    """You can use this tool to solve hard math problem, puzzle, riddle and IQ test question that requires a lot of chain of thought efforts.
    DO NOT use this tool for simple and obvious question.

    Args:
        question: The hard question.

    Returns:
        The answer to the question.
    """
    # A client is built per call from the module-level key and base URL.
    anthropic_client = Anthropic(api_key=ANTHROPIC_API_KEY, base_url=ANTHROPIC_BASE_URL)

    # The question is sent as a single user turn holding one text part.
    user_turn = {
        "role": "user",
        "content": [{"type": "text", "text": question}],
    }
    # Thinking budget (19000) is kept below the overall max_tokens (21000).
    thinking_config = {"type": "enabled", "budget_tokens": 19000}

    reply = anthropic_client.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=21000,
        thinking=thinking_config,
        messages=[user_turn],
        stream=False,
    )

    # Prefer the `text` of the last content item; if reading it fails for any
    # reason, log and hand back that item's `thinking` attribute instead.
    try:
        answer = reply.content[-1].text
    except Exception:
        logger.info("Reasoning Error: only thinking content is returned")
        answer = reply.content[-1].thinking
    return answer
@mcp.tool()
async def reasoning(question: str) -> str:
    """You can use this tool to solve hard math problem, puzzle, riddle and IQ test question that requires a lot of chain of thought efforts.
    DO NOT use this tool for simple and obvious question.

    Args:
        question: The hard question.

    Returns:
        The answer to the question.
    """
    # Marker after which the final answer starts when the model inlines its
    # chain of thought in `content`.
    # NOTE(review): reconstructed as "</think>" because the literal was not
    # recoverable as written - confirm against the deployed model's output.
    think_end = "</think>"
    malformed_reply = (
        "Reasoning service returned an unexpected response. Please try again later."
    )

    payload = {
        "model": REASONING_MODEL_NAME,
        "messages": [{"role": "user", "content": question}],
        "temperature": 0.6,
        "top_p": 0.95,
    }
    headers = {
        "Authorization": f"Bearer {REASONING_API_KEY}",
        "Content-Type": "application/json",
    }

    # post_with_retry returns None once every attempt has failed.
    response = post_with_retry(REASONING_BASE_URL, json=payload, headers=headers)
    if response is None:
        return "Reasoning service unavailable. Please try again later."

    # Decode defensively: an HTTP 200 with a non-JSON or oddly shaped body should
    # become a readable tool result rather than an unhandled exception.
    try:
        message = response.json()["choices"][0]["message"]
    except (ValueError, KeyError, IndexError, TypeError) as parse_error:
        logger.warning(f"Reasoning Error: malformed response: {parse_error}")
        return malformed_reply
    if not isinstance(message, dict):
        logger.warning("Reasoning Error: malformed response: message is not an object")
        return malformed_reply

    content = message.get("content")
    if isinstance(content, str):
        # Keep only what follows the first end-of-thinking marker, if present.
        if think_end in content:
            content = content.split(think_end, 1)[1].strip()
        return content

    # No usable `content`: fall back to the separate reasoning trace.
    logger.info("Reasoning Error: only thinking content is returned")
    reasoning_trace = message.get("reasoning_content")
    if isinstance(reasoning_trace, str):
        return reasoning_trace
    return malformed_reply
@mcp.tool()
async def google_search(
    q: str,
    gl: str = "us",
    hl: str = "en",
    location: str = None,
    num: int = 10,
    tbs: str = None,
    page: int = 1,
) -> str:
    """Perform google searches via Serper API and retrieve rich results.
    It is able to retrieve organic search results, people also ask, related searches, and knowledge graph.

    Args:
        q: Search query string.
        gl: Country context for search (e.g., 'us' for United States, 'cn' for China, 'uk' for United Kingdom). Influences regional results priority. Default is 'us'.
        hl: Google interface language (e.g., 'en' for English, 'zh' for Chinese, 'es' for Spanish). Affects snippet language preference. Default is 'en'.
        location: City-level location for search results (e.g., 'SoHo, New York, United States', 'California, United States').
        num: The number of results to return (default: 10).
        tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week, 'qdr:m' for past month, 'qdr:y' for past year).
        page: The page number of results to return (default: 1).

    Returns:
        The search results.
    """
    # Without an API key the child server cannot do anything useful.
    if SERPER_API_KEY == "":
        return (
            "[ERROR]: SERPER_API_KEY is not set, google_search tool is not available."
        )

    tool_name = "google_search"
    arguments = {
        "q": q,
        "gl": gl,
        "hl": hl,
        "num": num,
        "page": page,
        "autocorrect": False,
    }
    # Optional filters are only sent when the caller supplied a non-empty value.
    if location:
        arguments["location"] = location
    if tbs:
        arguments["tbs"] = tbs

    # The query is executed by the serper MCP server, run as a child process
    # with the Serper settings passed through its environment.
    server_params = StdioServerParameters(
        command=sys.executable,
        args=["-m", "miroflow_tools.mcp_servers.serper_mcp_server"],
        env={"SERPER_API_KEY": SERPER_API_KEY, "SERPER_BASE_URL": SERPER_BASE_URL},
    )

    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            async with stdio_client(server_params) as (read, write):
                async with ClientSession(
                    read, write, sampling_callback=None
                ) as session:
                    await session.initialize()
                    tool_result = await session.call_tool(
                        tool_name, arguments=arguments
                    )
                    result_content = (
                        tool_result.content[-1].text if tool_result.content else ""
                    )
                    # Explicit check instead of `assert`: assertions are removed
                    # under `python -O`, which would let an empty result through
                    # without triggering a retry.
                    if result_content is None or result_content.strip() == "":
                        raise ValueError(
                            "Empty result from google_search tool, please try again."
                        )
                    # Apply filtering based on environment variables.
                    return filter_google_search_result(result_content)
        except Exception as error:
            if attempt >= max_retries:
                return f"[ERROR]: google_search tool execution failed after {max_retries} attempts: {str(error)}"
            # Exponential backoff (2s, 4s, ...) capped at 60s before retrying.
            await asyncio.sleep(min(2**attempt, 60))

    # Defensive fall-through; every loop iteration above returns or retries.
    return "[ERROR]: Unknown error occurred in google_search tool, please try again."
It handles disambiguation pages and provides clean, structured output. Args: entity: The entity to search for in Wikipedia. first_sentences: Number of first sentences to return from the page. Set to 0 to return full content. Defaults to 10. Returns: str: Formatted search results containing title, first sentences/full content, and URL. Returns error message if page not found or other issues occur. """ try: # Try to get the Wikipedia page directly page = wikipedia.page(title=entity, auto_suggest=False) # Prepare the result result_parts = [f"Page Title: {page.title}"] if first_sentences > 0: # Get summary with specified number of sentences try: summary = wikipedia.summary( entity, sentences=first_sentences, auto_suggest=False ) result_parts.append( f"First {first_sentences} sentences (introduction): {summary}" ) except Exception: # Fallback to page summary if direct summary fails content_sentences = page.content.split(". ")[:first_sentences] summary = ( ". ".join(content_sentences) + "." if content_sentences else page.content[:5000] + "..." ) result_parts.append( f"First {first_sentences} sentences (introduction): {summary}" ) else: # Return full content if first_sentences is 0 # TODO: Context Engineering Needed result_parts.append(f"Content: {page.content}") result_parts.append(f"URL: {page.url}") return "\n\n".join(result_parts) except wikipedia.exceptions.DisambiguationError as e: options_list = "\n".join( [f"- {option}" for option in e.options[:10]] ) # Limit to first 10 output = ( f"Disambiguation Error: Multiple pages found for '{entity}'.\n\n" f"Available options:\n{options_list}\n\n" f"Please be more specific in your search query." 
# @mcp.tool()
async def search_wiki_revision(
    entity: str, year: int, month: int, max_revisions: int = 50
) -> str:
    """Search for an entity in Wikipedia and return the revision history for a specific month.

    Out-of-range dates are clamped instead of rejected (year to 2000..current
    year, month to 1..12) and the adjustment is reported at the top of the result.

    Args:
        entity: The entity to search for in Wikipedia.
        year: The year of the revision (e.g. 2024).
        month: The month of the revision (1-12).
        max_revisions: Maximum number of revisions to return. Defaults to 50.
            Values are clamped to the range 1..500 before being sent to the API.

    Returns:
        str: Formatted revision history with timestamps, revision IDs, and URLs.
             Returns error message if page not found or other issues occur.
    """
    # Local import: only needed here, to percent-encode the title in revision URLs.
    from urllib.parse import quote

    # Auto-adjust date values and track changes
    adjustments = []
    original_year, original_month = year, month
    current_year = datetime.datetime.now().year

    # Adjust year to valid range
    if year < 2000:
        year = 2000
        adjustments.append(
            f"Year adjusted from {original_year} to 2000 (minimum supported)"
        )
    elif year > current_year:
        year = current_year
        adjustments.append(
            f"Year adjusted from {original_year} to {current_year} (current year)"
        )

    # Adjust month to valid range
    if month < 1:
        month = 1
        adjustments.append(f"Month adjusted from {original_month} to 1")
    elif month > 12:
        month = 12
        adjustments.append(f"Month adjusted from {original_month} to 12")

    # Prepare adjustment message if any changes were made
    if adjustments:
        adjustment_msg = (
            "Date auto-adjusted: "
            + "; ".join(adjustments)
            + f". Using {year}-{month:02d} instead.\n\n"
        )
    else:
        adjustment_msg = ""

    base_url = "https://en.wikipedia.org/w/api.php"

    try:
        # Construct the time range: first second to last second of the month.
        start_date = datetime.datetime(year, month, 1)
        last_day = calendar.monthrange(year, month)[1]
        end_date = datetime.datetime(year, month, last_day, 23, 59, 59)

        # Convert to ISO format (UTC time)
        start_iso = start_date.strftime("%Y-%m-%dT%H:%M:%SZ")
        end_iso = end_date.strftime("%Y-%m-%dT%H:%M:%SZ")

        # API parameters configuration
        params = {
            "action": "query",
            "format": "json",
            "titles": entity,
            "prop": "revisions",
            # Keep the limit within 1..500 so a zero/negative request cannot
            # produce an API error; 500 is the upper bound used by this code.
            "rvlimit": max(1, min(max_revisions, 500)),
            "rvstart": start_iso,
            "rvend": end_iso,
            "rvdir": "newer",
            "rvprop": "timestamp|ids",
        }

        # A timeout is required for the Timeout handler below to ever fire;
        # without one the call can block indefinitely.
        response = requests.get(base_url, params=params, timeout=60)
        response.raise_for_status()

        data = response.json()

        # Check for API errors
        if "error" in data:
            return f"[ERROR]: Wikipedia API Error: {data['error'].get('info', 'Unknown error')}"

        # Process the response
        pages = data.get("query", {}).get("pages", {})

        if not pages:
            return f"[ERROR]: No results found for entity '{entity}'"

        # Check if page exists; the code treats page id "-1" as "not found".
        page_id = list(pages.keys())[0]

        if page_id == "-1":
            return f"[ERROR]: Page Not Found: No Wikipedia page found for '{entity}'"

        page_info = pages[page_id]
        page_title = page_info.get("title", entity)

        if "revisions" not in page_info or not page_info["revisions"]:
            return (
                adjustment_msg + f"Page Title: {page_title}\n\n"
                f"No revisions found for '{entity}' in {year}-{month:02d}.\n\n"
                f"The page may not have been edited during this time period."
            )

        # Format the results
        result_parts = [
            f"Page Title: {page_title}",
            f"Revision Period: {year}-{month:02d}",
            f"Total Revisions Found: {len(page_info['revisions'])}",
        ]

        # Percent-encode once so titles containing spaces, '&', '#', '?' or
        # non-ASCII characters still yield a valid, unambiguous URL.
        encoded_title = quote(entity, safe="")

        # Add revision details
        revisions_details = []
        for i, rev in enumerate(page_info["revisions"], 1):
            revision_id = rev["revid"]
            timestamp = rev["timestamp"]

            # Format timestamp for better readability
            try:
                dt = datetime.datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
                formatted_time = dt.strftime("%Y-%m-%d %H:%M:%S UTC")
            except Exception:
                # Unparseable timestamp: show it exactly as received.
                formatted_time = timestamp

            # Construct revision URL
            rev_url = f"https://en.wikipedia.org/w/index.php?title={encoded_title}&oldid={revision_id}"

            revisions_details.append(
                f"{i}. Revision ID: {revision_id}\n"
                f"   Timestamp: {formatted_time}\n"
                f"   URL: {rev_url}"
            )

        if revisions_details:
            result_parts.append("Revisions:\n" + "\n\n".join(revisions_details))

        return (
            adjustment_msg
            + "\n\n".join(result_parts)
            + "\n\nHint: You can use the `scrape_website` tool to get the webpage content of a URL."
        )

    except requests.exceptions.Timeout:
        return f"[ERROR]: Network Error: Request timed out while fetching revision history for '{entity}'"

    except requests.exceptions.RequestException as e:
        return f"[ERROR]: Network Error: Failed to connect to Wikipedia: {str(e)}"

    except ValueError as e:
        return f"[ERROR]: Date Error: Invalid date values - {str(e)}"

    except Exception as e:
        return f"[ERROR]: Unexpected Error: An unexpected error occurred: {str(e)}"
Args: url: The URL to search for in the Wayback Machine. year: The target year (e.g., 2023). month: The target month (1-12). day: The target day (1-31). Returns: str: Formatted archive information including archived URL, timestamp, and status. Returns error message if URL not found or other issues occur. """ # Handle empty URL if not url: return f"[ERROR]: Invalid URL: '{url}'. URL cannot be empty." # Auto-add https:// if no protocol is specified protocol_hint = "" if not url.startswith(("http://", "https://")): original_url = url url = f"https://{url}" protocol_hint = f"[NOTE]: Automatically added 'https://' to URL '{original_url}' -> '{url}'\n\n" hint_message = "" if ".wikipedia.org" in url: hint_message = "Note: You are trying to search a Wikipedia page, you can also use the `search_wiki_revision` tool to get the revision content of a Wikipedia page.\n\n" # Check if specific date is requested date = "" adjustment_msg = "" if year > 0 and month > 0: # Auto-adjust date values and track changes adjustments = [] original_year, original_month, original_day = year, month, day current_year = datetime.datetime.now().year # Adjust year to valid range if year < 1995: year = 1995 adjustments.append( f"Year adjusted from {original_year} to 1995 (minimum supported)" ) elif year > current_year: year = current_year adjustments.append( f"Year adjusted from {original_year} to {current_year} (current year)" ) # Adjust month to valid range if month < 1: month = 1 adjustments.append(f"Month adjusted from {original_month} to 1") elif month > 12: month = 12 adjustments.append(f"Month adjusted from {original_month} to 12") # Adjust day to valid range for the given month/year max_day = calendar.monthrange(year, month)[1] if day < 1: day = 1 adjustments.append(f"Day adjusted from {original_day} to 1") elif day > max_day: day = max_day adjustments.append( f"Day adjusted from {original_day} to {max_day} (max for {year}-{month:02d})" ) # Update the date string with adjusted values date = 
f"{year:04d}{month:02d}{day:02d}" try: # Validate the final adjusted date datetime.datetime(year, month, day) except ValueError as e: return f"[ERROR]: Invalid date: {year}-{month:02d}-{day:02d}. {str(e)}" # Prepare adjustment message if any changes were made if adjustments: adjustment_msg = ( "Date auto-adjusted: " + "; ".join(adjustments) + f". Using {date} instead.\n\n" ) try: base_url = "https://archive.org/wayback/available" # Search with specific date if provided if date: retry_count = 0 # retry 5 times if the response is not valid while retry_count < 5: response = requests.get(f"{base_url}?url={url}×tamp={date}") response.raise_for_status() data = response.json() if ( "archived_snapshots" in data and "closest" in data["archived_snapshots"] ): break retry_count += 1 await asyncio.sleep(min(2**retry_count, 60)) if "archived_snapshots" in data and "closest" in data["archived_snapshots"]: closest = data["archived_snapshots"]["closest"] archived_url = closest["url"] archived_timestamp = closest["timestamp"] available = closest.get("available", True) if not available: return ( hint_message + adjustment_msg + ( f"Archive Status: Snapshot exists but is not available\n\n" f"Original URL: {url}\n" f"Requested Date: {year:04d}-{month:02d}-{day:02d}\n" f"Closest Snapshot: {archived_timestamp}\n\n" f"Try a different date" ) ) # Format timestamp for better readability try: dt = datetime.datetime.strptime(archived_timestamp, "%Y%m%d%H%M%S") formatted_time = dt.strftime("%Y-%m-%d %H:%M:%S UTC") except Exception: formatted_time = archived_timestamp return ( protocol_hint + hint_message + adjustment_msg + ( f"Archive Found: Archived version located\n\n" f"Original URL: {url}\n" f"Requested Date: {year:04d}-{month:02d}-{day:02d}\n" f"Archived URL: {archived_url}\n" f"Archived Timestamp: {formatted_time}\n" ) + "\n\nHint: You can also use the `scrape_website` tool to get the webpage content of a URL." 
) # Search without specific date (most recent) retry_count = 0 # retry 5 times if the response is not valid while retry_count < 5: response = requests.get(f"{base_url}?url={url}") response.raise_for_status() data = response.json() if "archived_snapshots" in data and "closest" in data["archived_snapshots"]: break retry_count += 1 await asyncio.sleep(min(2**retry_count, 60)) if "archived_snapshots" in data and "closest" in data["archived_snapshots"]: closest = data["archived_snapshots"]["closest"] archived_url = closest["url"] archived_timestamp = closest["timestamp"] available = closest.get("available", True) if not available: return ( protocol_hint + hint_message + ( f"Archive Status: Most recent snapshot exists but is not available\n\n" f"Original URL: {url}\n" f"Most Recent Snapshot: {archived_timestamp}\n\n" f"The URL may have been archived but access is restricted" ) ) # Format timestamp for better readability try: dt = datetime.datetime.strptime(archived_timestamp, "%Y%m%d%H%M%S") formatted_time = dt.strftime("%Y-%m-%d %H:%M:%S UTC") except Exception: formatted_time = archived_timestamp return ( protocol_hint + hint_message + ( f"Archive Found: Most recent archived version\n\n" f"Original URL: {url}\n" f"Archived URL: {archived_url}\n" f"Archived Timestamp: {formatted_time}\n" ) + "\n\nHint: You can also use the `scrape_website` tool to get the webpage content of a URL." 
@mcp.tool()
async def scrape_website(url: str) -> str:
    """This tool is used to scrape a website for its content. Search engines are not supported by this tool. This tool can also be used to get YouTube video non-visual information (however, it may be incomplete), such as video subtitles, titles, descriptions, key moments, etc.

    Args:
        url: The URL of the website to scrape.

    Returns:
        The scraped website content.
    """
    # Validate URL format
    if not url or not url.startswith(("http://", "https://")):
        return f"Invalid URL: '{url}'. URL must start with http:// or https://"

    # Avoid duplicate Jina URL prefix: a URL that already starts with the reader
    # prefix and embeds a second "http" is unwrapped to the inner URL.
    if url.startswith("https://r.jina.ai/") and url.count("http") >= 2:
        url = url[len("https://r.jina.ai/") :]

    # Check for restricted domains
    if "huggingface.co/datasets" in url or "huggingface.co/spaces" in url:
        return "You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose."

    if JINA_API_KEY == "":
        return "JINA_API_KEY is not set, scrape_website tool is not available."

    try:
        # Use Jina.ai reader API to convert URL to LLM-friendly text
        jina_url = f"{JINA_BASE_URL}/{url}"

        # Make request with proper headers
        headers = {"Authorization": f"Bearer {JINA_API_KEY}"}

        response = requests.get(jina_url, headers=headers, timeout=60)
        response.raise_for_status()

        # Get the content and drop markdown link syntax via the shared helper.
        content = response.text.strip()
        content = strip_markdown_links(content)

        if not content:
            return f"No content retrieved from URL: {url}"

        return content

    except requests.exceptions.Timeout:
        return f"[ERROR]: Timeout Error: Request timed out while scraping '{url}'. The website may be slow or unresponsive."

    except requests.exceptions.ConnectionError:
        return f"[ERROR]: Connection Error: Failed to connect to '{url}'. Please check if the URL is correct and accessible."

    except requests.exceptions.HTTPError as e:
        # Compare against None rather than testing truthiness: a Response is
        # falsy whenever its status is 4xx/5xx, so `if e.response` would always
        # pick "unknown" here and the specific branches below could never run.
        status_code = e.response.status_code if e.response is not None else "unknown"
        if status_code == 404:
            return f"[ERROR]: Page Not Found (404): The page at '{url}' does not exist."
        elif status_code == 403:
            return f"[ERROR]: Access Forbidden (403): Access to '{url}' is forbidden."
        elif status_code == 500:
            return f"[ERROR]: Server Error (500): The server at '{url}' encountered an internal error."
        else:
            return f"[ERROR]: HTTP Error ({status_code}): Failed to scrape '{url}'. {str(e)}"

    except requests.exceptions.RequestException as e:
        return f"[ERROR]: Request Error: Failed to scrape '{url}'. {str(e)}"

    except Exception as e:
        return f"[ERROR]: Unexpected Error: An unexpected error occurred while scraping '{url}': {str(e)}"
@mcp.tool()
async def sogou_search(Query: str, Cnt: int = 10) -> str:
    """Performs web searches using the Tencent Cloud SearchPro API to retrieve comprehensive information, with Sogou search offering superior results for Chinese-language queries.

    Args:
        Query: The core search query string. Be specific to improve result relevance (e.g., "2024 World Cup final results"). (Required, no default value)
        Cnt: Number of search results to return (Can only be 10/20/30/40/50). Optional, default: 10)

    Returns:
        The search results in JSON format, including the following core fields:
        - Query: The original search query (consistent with the input Query, for request verification)
        - Pages: Array of JSON strings, each containing details of a single search result (e.g., title, url, passage, date, site, favicon)
    """
    if TENCENTCLOUD_SECRET_ID == "" or TENCENTCLOUD_SECRET_KEY == "":
        return "[ERROR]: TENCENTCLOUD_SECRET_ID or TENCENTCLOUD_SECRET_KEY is not set, sogou_search tool is not available."

    # Build the request as a dict instead of interpolating the query into a
    # JSON string: a query containing quotes or backslashes would otherwise
    # produce invalid JSON (uncaught JSONDecodeError) or inject extra fields.
    params = {"Query": Query, "Mode": 0, "Cnt": Cnt}

    # Only these fields of each result page are passed on to the caller.
    kept_fields = ("title", "url", "passage", "date", "site")

    retry_count = 0
    max_retries = 3

    while retry_count < max_retries:
        try:
            cred = credential.Credential(
                TENCENTCLOUD_SECRET_ID, TENCENTCLOUD_SECRET_KEY
            )
            httpProfile = HttpProfile()
            httpProfile.endpoint = "wsa.tencentcloudapi.com"
            clientProfile = ClientProfile()
            clientProfile.httpProfile = httpProfile

            common_client = CommonClient(
                "wsa", "2025-05-08", cred, "", profile=clientProfile
            )
            result = common_client.call_json("SearchPro", params)["Response"]
            # Drop the request id if present; tolerate its absence.
            result.pop("RequestId", None)

            pages = []
            for page in result.get("Pages", []):
                # Each page arrives as a JSON-encoded string.
                page_json = json.loads(page)
                # .get() keeps one incomplete result from failing the search.
                pages.append({field: page_json.get(field) for field in kept_fields})
            result["Pages"] = pages
            return json.dumps(result, ensure_ascii=False)
        except TencentCloudSDKException:
            retry_count += 1
            if retry_count >= max_retries:
                return f"[ERROR]: sogou_search tool execution failed after {max_retries} attempts: Unexpected error occurred."
            # Wait before retrying (exponential backoff, capped at 60s)
            await asyncio.sleep(min(2**retry_count, 60))

    return "[ERROR]: Unknown error occurred in sogou_search tool, please try again."
@mcp.tool()
async def scrape_website(url: str) -> str:
    """This tool is used to scrape a website for its content. Search engines are not supported by this tool. This tool can also be used to get YouTube video non-visual information (however, it may be incomplete), such as video subtitles, titles, descriptions, key moments, etc.

    Args:
        url: The URL of the website to scrape.

    Returns:
        The scraped website content.
    """
    # Every outcome, including failures, is reported as a string; nothing is
    # raised to the caller.

    # Validate URL format
    if not url or not url.startswith(("http://", "https://")):
        return f"Invalid URL: '{url}'. URL must start with http:// or https://"

    # Avoid duplicate Jina URL prefix: the caller already passed a reader URL
    # wrapping another URL, so unwrap it before the prefix is added again below.
    if url.startswith("https://r.jina.ai/") and url.count("http") >= 2:
        url = url[len("https://r.jina.ai/") :]

    # Check for restricted domains
    if "huggingface.co/datasets" in url or "huggingface.co/spaces" in url:
        return "You are trying to scrape a Hugging Face dataset for answers, please do not use the scrape tool for this purpose."

    if JINA_API_KEY == "":
        return "JINA_API_KEY is not set, scrape_website tool is not available."

    try:
        # Use Jina.ai reader API to convert URL to LLM-friendly text
        jina_url = f"{JINA_BASE_URL}/{url}"
        headers = {"Authorization": f"Bearer {JINA_API_KEY}"}
        response = requests.get(jina_url, headers=headers, timeout=60)
        response.raise_for_status()

        content = strip_markdown_links(response.text.strip())
        if not content:
            return f"No content retrieved from URL: {url}"
        return content

    except requests.exceptions.Timeout:
        return f"[ERROR]: Timeout Error: Request timed out while scraping '{url}'. The website may be slow or unresponsive."
    except requests.exceptions.ConnectionError:
        return f"[ERROR]: Connection Error: Failed to connect to '{url}'. Please check if the URL is correct and accessible."
    except requests.exceptions.HTTPError as e:
        # A requests.Response is falsy for 4xx/5xx statuses, so a plain
        # truthiness test would always report "unknown" here. Compare to None.
        status_code = e.response.status_code if e.response is not None else "unknown"
        if status_code == 404:
            return f"[ERROR]: Page Not Found (404): The page at '{url}' does not exist."
        elif status_code == 403:
            return f"[ERROR]: Access Forbidden (403): Access to '{url}' is forbidden."
        elif status_code == 500:
            return f"[ERROR]: Server Error (500): The server at '{url}' encountered an internal error."
        else:
            return f"[ERROR]: HTTP Error ({status_code}): Failed to scrape '{url}'. {str(e)}"
    except requests.exceptions.RequestException as e:
        return f"[ERROR]: Request Error: Failed to scrape '{url}'. {str(e)}"
    except Exception as e:
        return f"[ERROR]: Unexpected Error: An unexpected error occurred while scraping '{url}': {str(e)}"
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type(
        (requests.ConnectionError, requests.Timeout, requests.HTTPError)
    ),
)
def make_serper_request(
    payload: Dict[str, Any], headers: Dict[str, str], timeout: float = 30.0
) -> requests.Response:
    """Make HTTP request to Serper API with retry logic.

    Args:
        payload: JSON body sent to the ``/search`` endpoint.
        headers: HTTP headers for the request.
        timeout: Seconds to wait for the server before giving up on an attempt.

    Returns:
        The successful HTTP response.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (the decorator retries these, as well as connection errors
            and timeouts, for up to three attempts).
    """
    # Without an explicit timeout requests waits indefinitely, so the
    # requests.Timeout retry clause above could never trigger.
    response = requests.post(
        f"{SERPER_BASE_URL}/search", json=payload, headers=headers, timeout=timeout
    )
    response.raise_for_status()
    return response


def _is_huggingface_dataset_or_space_url(url: str | None) -> bool:
    """
    Check if the URL is a HuggingFace dataset or space URL.

    :param url: The URL to check
    :return: True if it's a HuggingFace dataset or space URL, False otherwise
    """
    if not url:
        return False
    return "huggingface.co/datasets" in url or "huggingface.co/spaces" in url
@mcp.tool()
def google_search(
    q: str,
    gl: str = "us",
    hl: str = "en",
    location: str | None = None,
    num: int | None = None,
    tbs: str | None = None,
    page: int | None = None,
    autocorrect: bool | None = None,
):
    """
    Tool to perform web searches via Serper API and retrieve rich results.
    It is able to retrieve organic search results, people also ask, related searches, and knowledge graph.

    Args:
        q: Search query string
        gl: Optional region code for search results in ISO 3166-1 alpha-2 format (e.g., 'us')
        hl: Optional language code for search results in ISO 639-1 format (e.g., 'en')
        location: Optional location for search results (e.g., 'SoHo, New York, United States', 'California, United States')
        num: Number of results to return (default: 10)
        tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week, 'qdr:m' for past month, 'qdr:y' for past year)
        page: Page number of results to return (default: 1)
        autocorrect: Whether to autocorrect spelling in query

    Returns:
        Dictionary containing search results and metadata.
    """

    def _failure(message: str) -> str:
        # All failure paths share one JSON envelope.
        return json.dumps(
            {"success": False, "error": message, "results": []},
            ensure_ascii=False,
        )

    # Guard clauses: configuration first, then the required argument.
    if not SERPER_API_KEY:
        return _failure("SERPER_API_KEY environment variable not set")
    if not q or not q.strip():
        return _failure("Search query 'q' is required and cannot be empty")

    try:
        # Mandatory fields, then the optional ones that were actually supplied.
        payload: dict[str, Any] = {"q": q.strip(), "gl": gl, "hl": hl}
        if location:
            payload["location"] = location
        payload["num"] = 10 if num is None else num
        if tbs:
            payload["tbs"] = tbs
        if page is not None:
            payload["page"] = page
        if autocorrect is not None:
            payload["autocorrect"] = autocorrect

        headers = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}
        data = make_serper_request(payload, headers).json()

        # Drop organic hits that point at HuggingFace datasets or spaces.
        raw_organic = data["organic"] if "organic" in data else []
        kept = [
            entry
            for entry in raw_organic
            if not _is_huggingface_dataset_or_space_url(entry.get("link", ""))
        ]

        # Every other field of the API answer is passed through untouched.
        response_data = dict(data)
        response_data["organic"] = kept
        return json.dumps(decode_http_urls_in_dict(response_data), ensure_ascii=False)

    except Exception as e:
        return _failure(f"Unexpected error: {str(e)}")
# RFC 3986 reserved characters percent-encoding (decoding these would alter URL semantics/structure)
# gen-delims: : / ? # [ ] @
# sub-delims: ! $ & ' ( ) * + , ; =
RESERVED_PERCENT_ENCODINGS = frozenset(
    {
        "%2f",
        "%2F",  # / path separator
        "%3f",
        "%3F",  # ? query string start
        "%23",  # # fragment start
        "%26",  # & query parameter separator
        "%3d",
        "%3D",  # = key-value separator
        "%40",  # @
        "%3a",
        "%3A",  # :
        "%5b",
        "%5B",  # [
        "%5d",
        "%5D",  # ]
        "%21",  # !
        "%24",  # $
        "%27",  # '
        "%28",  # (
        "%29",  # )
        "%2a",
        "%2A",  # *
        "%2b",
        "%2B",  # +
        "%2c",
        "%2C",  # ,
        "%3b",
        "%3B",  # ;
        "%25",  # % percent sign itself (prevents double-encoding issues)
        "%20",  # space (keep encoded to avoid URL semantic changes)
    }
)

_HEX_DIGITS = frozenset("0123456789ABCDEFabcdef")


def _is_percent_triplet(text: str, pos: int) -> bool:
    """Return True if ``text[pos:pos + 3]`` is a well-formed ``%XX`` escape."""
    return (
        pos + 2 < len(text)
        and text[pos] == "%"
        and text[pos + 1] in _HEX_DIGITS
        and text[pos + 2] in _HEX_DIGITS
    )


def safe_unquote(url: str) -> str:
    """
    Safely decode URL-encoded strings, only decoding characters that won't alter URL semantics.

    Preserve the following encodings (because decoding would change URL structure/semantics):
    - %2F (/) - path separator, decoding would alter path hierarchy
    - %3F (?) - query string start marker
    - %23 (#) - fragment start marker (not sent to server)
    - %26 (&) - query parameter separator
    - %3D (=) - key-value separator
    - %25 (%) - percent sign itself (prevents double-encoding issues, e.g. %252F -> %2F -> /)
    - %20 ( ) - space (keep encoded to avoid URL semantic changes)
    - and other RFC 3986 reserved characters

    Only decode unreserved characters and UTF-8 encoded international characters (e.g. Chinese).
    A run of escapes that is not valid UTF-8 (truncated sequence, legacy
    encoding such as Latin-1) is left percent-encoded instead of being
    replaced by U+FFFD, so the URL still points at the same resource.

    Args:
        url: The (possibly percent-encoded) string to decode.

    Returns:
        The string with safe escapes decoded; falsy input is returned as is.
    """
    if not url:
        return url

    result = []
    i = 0
    n = len(url)

    while i < n:
        if not _is_percent_triplet(url, i):
            # Ordinary character, or a stray/malformed '%': copy through.
            result.append(url[i])
            i += 1
            continue

        percent_encoded = url[i : i + 3]
        if percent_encoded in RESERVED_PERCENT_ENCODINGS:
            # Keep the encoding, don't decode
            result.append(percent_encoded)
            i += 3
            continue

        # Collect the consecutive non-reserved escapes: together they may
        # form one or more multi-byte UTF-8 characters.
        j = i + 3
        while (
            _is_percent_triplet(url, j)
            and url[j : j + 3] not in RESERVED_PERCENT_ENCODINGS
        ):
            j += 3
        encoded_sequence = url[i:j]

        try:
            # unquote() defaults to errors="replace" and would silently turn
            # undecodable bytes into U+FFFD; strict mode lets us keep them.
            result.append(unquote(encoded_sequence, errors="strict"))
        except UnicodeDecodeError:
            # Decoding failed, keep the original encoding
            result.append(encoded_sequence)
        i = j

    return "".join(result)


def decode_http_urls_in_dict(data):
    """
    Traverse all values in the data structure:
    - If it's a string containing both "%" and "http", apply safe_unquote
    - If it's a list, recursively process each element
    - If it's a dict, recursively process each value (keys are left untouched)
    - Other types remain unchanged

    Returns a new structure for lists and dicts; the input is not mutated.
    """
    if isinstance(data, str):
        if "%" in data and "http" in data:
            return safe_unquote(data)
        return data
    if isinstance(data, list):
        return [decode_http_urls_in_dict(item) for item in data]
    if isinstance(data, dict):
        return {key: decode_http_urls_in_dict(value) for key, value in data.items()}
    return data
def strip_markdown_links(markdown: str) -> str:
    """Render Markdown to text with link wrappers and images removed.

    The input is parsed with the module-level ``md`` parser and its token
    stream is re-emitted: link text is kept without the link wrapper, image
    tokens are dropped entirely, inline code keeps its backticks, and block
    boundaries become newlines. Runs of three or more newlines collapse to
    two and the result is stripped of surrounding whitespace.
    """
    # Token types that produce no output of their own. For links the inner
    # text still arrives as separate sibling tokens; images are dropped whole.
    dropped = {"link_open", "link_close", "image"}

    # Token types that only contribute line breaks.
    breaks = {
        "softbreak": "\n",
        "hardbreak": "\n",
        "paragraph_close": "\n\n",
        "heading_close": "\n\n",
        "blockquote_close": "\n\n",
        "list_item_close": "\n",
        "bullet_list_close": "\n",
        "ordered_list_close": "\n",
        "hr": "\n\n",
    }

    def render(ts):
        pieces = []
        for tok in ts:
            kind = tok.type
            if kind in dropped:
                continue
            if kind in breaks:
                pieces.append(breaks[kind])
            elif tok.children:
                # Inline or nested tokens: render the children instead.
                pieces.append(render(tok.children))
            elif kind == "code_inline":
                # Preserve inline code style
                pieces.append(f"`{tok.content}`")
            else:
                pieces.append(tok.content or "")
        return "".join(pieces)

    rendered = render(md.parse(markdown))
    # normalize excessive blank lines (avoid more than 2 consecutive newlines)
    collapsed = re.sub(r"\n{3,}", "\n\n", rendered)
    return collapsed.strip()
# Extension -> (MIME type, media category) for the formats this server handles.
_MEDIA_TYPES_BY_EXTENSION = {
    # Image formats
    ".jpg": ("image/jpeg", "image"),
    ".jpeg": ("image/jpeg", "image"),
    ".png": ("image/png", "image"),
    ".gif": ("image/gif", "image"),
    ".webp": ("image/webp", "image"),
    ".bmp": ("image/bmp", "image"),
    ".tiff": ("image/tiff", "image"),
    ".tif": ("image/tiff", "image"),
    # Video formats
    ".mp4": ("video/mp4", "video"),
    ".mov": ("video/quicktime", "video"),
    ".avi": ("video/x-msvideo", "video"),
    ".mkv": ("video/x-matroska", "video"),
    ".webm": ("video/webm", "video"),
}


def guess_mime_media_type_from_extension(file_path: str) -> tuple[str, str]:
    """
    Guess the MIME type and media category based on the file extension.

    The extension is matched case-insensitively; unknown extensions default
    to JPEG.

    Returns:
        Tuple of (mime_type, media_category) where media_category is 'image' or 'video'
    """
    _, ext = os.path.splitext(file_path)
    return _MEDIA_TYPES_BY_EXTENSION.get(ext.lower(), ("image/jpeg", "image"))


def _validate_file_size(file_path: str, media_category: str) -> tuple[bool, str]:
    """
    Validate file size based on media category.

    Videos are checked against MAX_VIDEO_SIZE, everything else against
    MAX_IMAGE_SIZE. Empty files are rejected as well.

    Returns:
        Tuple of (is_valid, error_message)
    """
    try:
        file_size = os.path.getsize(file_path)
        max_size = MAX_VIDEO_SIZE if media_category == "video" else MAX_IMAGE_SIZE
        max_size_mb = max_size / (1024 * 1024)

        if file_size > max_size:
            return (
                False,
                f"[ERROR]: File size ({file_size / (1024 * 1024):.2f}MB) exceeds maximum allowed size ({max_size_mb}MB) for {media_category}",
            )

        if file_size == 0:
            return False, "[ERROR]: File is empty"

        return True, ""
    except Exception as e:
        return False, f"[ERROR]: Failed to check file size: {e}"


def _vqa_failure_message(error_type: str, error: Exception) -> str:
    """Format the final error returned when visual question answering fails."""
    return f"[ERROR]: Visual question answering failed during {error_type}: {error}\nNote: Files from sandbox are not available. You should use local path given in the instruction.\nSupported image formats: jpg, png, gif, webp, bmp, tiff\nSupported video formats: mp4, mov, avi, mkv, webm\nURLs must be publicly accessible and start with http:// or https://"


@mcp.tool()
async def visual_question_answering(media_path_or_url: str, question: str) -> str:
    """Ask question about an image or a video and get the answer with GPT-4o vision model.

    Args:
        media_path_or_url: The path of the image/video file locally or its URL. Supports images (jpg, png, gif, webp, bmp, tiff) and videos (mp4, mov, avi, mkv, webm).
        question: The question to ask about the image or video.

    Returns:
        The answer to the media-related question.
    """
    # Constructing the client without a key raises; report it as a tool error
    # string like every other failure of this tool.
    if OPENAI_API_KEY is None:
        return "[ERROR]: OPENAI_API_KEY is not set, visual_question_answering tool is not available."

    max_retries = 3
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

    # Build the message content once, before the retry loop, so a local file
    # is read and base64-encoded a single time however often the API call is
    # retried.
    content = [{"type": "text", "text": question}]
    try:
        if os.path.exists(media_path_or_url):  # Check if the file exists locally
            mime_type, media_category = guess_mime_media_type_from_extension(
                media_path_or_url
            )

            is_valid, error_msg = _validate_file_size(media_path_or_url, media_category)
            if not is_valid:
                return error_msg

            with open(media_path_or_url, "rb") as media_file:
                media_data = base64.b64encode(media_file.read()).decode("utf-8")

            # The payload goes in an image_url part as a data URL (the
            # original code uses this same part type for videos as well).
            content.append(
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime_type};base64,{media_data}"},
                }
            )
        elif "home/user" in media_path_or_url:
            return "[ERROR]: The visual_question_answering tool cannot access sandbox files, please use the local path provided by original instruction"
        else:
            # Otherwise, assume it's a URL
            if not media_path_or_url.startswith(("http://", "https://")):
                return "[ERROR]: Invalid URL format. URLs must start with http:// or https://"
            content.append(
                {"type": "image_url", "image_url": {"url": media_path_or_url}}
            )
    except FileNotFoundError:
        return f"[ERROR]: File not found: {media_path_or_url}"
    except PermissionError:
        return f"[ERROR]: Permission denied when reading file: {media_path_or_url}"
    except Exception as e:
        return _vqa_failure_message("file processing", e)

    # Call the API, backing off 10s then 20s between the three attempts.
    response = None
    for attempt in range(1, max_retries + 1):
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": content}],
                max_tokens=1024,
            )
            break
        except Exception as e:
            if attempt >= max_retries:
                return _vqa_failure_message("API call", e)
            await asyncio.sleep(5 * (2**attempt))

    # Extract and return response
    try:
        if response and response.choices and len(response.choices) > 0:
            return response.choices[0].message.content
        else:
            return "[ERROR]: Received empty response from API"
    except (AttributeError, IndexError) as e:
        return f"[ERROR]: Failed to parse API response: {e}"
def guess_mime_media_type_from_extension(file_path: str) -> str:
    """Guess the MIME type based on the file extension."""
    _, ext = os.path.splitext(file_path)
    known_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
    }
    # Default to JPEG if unknown
    return known_types.get(ext.lower(), "image/jpeg")


@mcp.tool()
async def visual_question_answering(image_path_or_url: str, question: str) -> str:
    """Ask question about an image or a video and get the answer with a vision language model.

    Args:
        image_path_or_url: The path of the image file locally or its URL.
        question: The question to ask about the image.

    Returns:
        The answer to the image-related question.
    """
    headers = {
        "Authorization": f"Bearer {VISION_API_KEY}",
        "Content-Type": "application/json",
    }

    try:
        if os.path.exists(image_path_or_url):  # Check if the file exists locally
            with open(image_path_or_url, "rb") as image_file:
                image_data = base64.b64encode(image_file.read()).decode("utf-8")
            mime_type = guess_mime_media_type_from_extension(image_path_or_url)
            image_url = f"data:{mime_type};base64,{image_data}"
        elif image_path_or_url.startswith(("http://", "https://")):
            # Download the image ourselves and inline it as a data URL.
            async with aiohttp.ClientSession() as session:
                async with session.get(image_path_or_url) as resp:
                    if resp.status != 200:
                        return f"Failed to fetch image from URL: {image_path_or_url}"
                    image_bytes = await resp.read()
                    mime_type = resp.headers.get(
                        "Content-Type", "image/png"
                    )  # fallback MIME type
            image_data = base64.b64encode(image_bytes).decode("utf-8")
            image_url = f"data:{mime_type};base64,{image_data}"
        else:
            # Neither an existing file nor an http(s) URL: pass it through as is.
            image_url = image_path_or_url

        messages_for_llm = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": question},
                ],
            }
        ]
        payload = {"model": VISION_MODEL_NAME, "messages": messages_for_llm}
        response = requests.post(VISION_BASE_URL, json=payload, headers=headers)
    except Exception as e:
        return f"Error: {e}"

    try:
        return response.json()["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError, AttributeError, ValueError):
        # The body is not a chat-completion result (for example an API error
        # object, or not JSON at all). Return the raw body as text so the
        # declared ``str`` return type holds and the tool does not crash.
        return f"Error: unexpected response from vision API: {response.text}"