[
  {
    "path": ".github/workflows/lint.yml",
    "content": "name: lint\non:\n  pull_request:\n    types: [opened, reopened, synchronize]\n    paths:\n      - \"gui_agents/**\"\n      - \"tests/**\"\n      - \".github/workflows/lint.yml\"\n  push:\n    branches:\n      - main\n    paths:\n      - \"gui_agents/**\"\n      - \"tests/**\"\n      - \".github/workflows/lint.yml\"\n\nenv:\n  SUPPORTED_PYTHON_VERSIONS: \"3.11\"\n\njobs:\n  build:\n    runs-on: ubuntu-latest\n    strategy:\n      fail-fast: false\n      matrix:\n        python-version: [\"3.10\", \"3.11\"]\n    steps:\n    - uses: actions/checkout@v3\n\n    - name: Set up Python ${{ matrix.python-version }}\n      uses: actions/setup-python@v4\n      with:\n        python-version: ${{ matrix.python-version }}\n\n    - name: Install dependencies\n      run: |\n        python -m pip install --upgrade pip\n        pip install -e .[dev]\n\n    - name: Run Linter\n      run: |\n        black --check gui_agents\n"
  },
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\ncover/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\n.pybuilder/\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n#   For a library or package, you might want to ignore these files since the code is\n#   intended to run in multiple environments; otherwise, check them in:\n# .python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# poetry\n#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.\n#   This is especially recommended for binary packages to ensure reproducibility, and is more\n#   commonly ignored for libraries.\n#   
https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control\n#poetry.lock\n\n# pdm\n#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.\n#pdm.lock\n#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it\n#   in version control.\n#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control\n.pdm.toml\n.pdm-python\n.pdm-build/\n\n# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# pytype static type analyzer\n.pytype/\n\n# Cython debug symbols\ncython_debug/\n\n# PyCharm\n#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can\n#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore\n#  and can be added to the global gitignore or merged into this file.  For a more nuclear\n#  option (not recommended) you can uncomment the following to ignore the entire idea folder.\n#.idea/\nlogs/\n.DS_Store"
  },
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License."
  },
  {
    "path": "README.md",
    "content": "<h1 align=\"center\">\n  <img src=\"images/agent_s.png\" alt=\"Logo\" style=\"vertical-align:middle\" width=\"60\"> Agent S:\n  <small>Use Computer Like a Human</small>\n</h1>\n\n<h2 align=\"center\">🏆 Agent S3: First to Surpass Human Performance on OSWorld (72.60%)</h2>\n\n<p align=\"center\">&nbsp;\n  🌐 <a href=\"https://www.simular.ai/articles/agent-s3\">[S3 blog]</a>&nbsp;\n  📄 <a href=\"https://arxiv.org/abs/2510.02250\">[S3 Paper]</a>&nbsp;\n  🎥 <a href=\"https://www.youtube.com/watch?v=VHr0a3UBsh4\">[S3 Video]</a>\n</p>\n\n<p align=\"center\">&nbsp;\n  🌐 <a href=\"https://www.simular.ai/articles/agent-s2-technical-review\">[S2 blog]</a>&nbsp;\n  📄 <a href=\"https://arxiv.org/abs/2504.00906\">[S2 Paper (COLM 2025)]</a>&nbsp;\n  🎥 <a href=\"https://www.youtube.com/watch?v=wUGVQl7c0eg\">[S2 Video]</a>\n</p>\n\n<p align=\"center\">&nbsp;\n  🌐 <a href=\"https://www.simular.ai/agent-s\">[S1 blog]</a>&nbsp;\n  📄 <a href=\"https://arxiv.org/abs/2410.08164\">[S1 Paper (ICLR 2025)]</a>&nbsp;\n  🎥 <a href=\"https://www.youtube.com/watch?v=OBDE3Knte0g\">[S1 Video]</a>\n</p>\n\n<p align=\"center\">&nbsp;\n<a href=\"https://trendshift.io/repositories/13151\" target=\"_blank\"><img src=\"https://trendshift.io/api/badge/repositories/13151\" alt=\"simular-ai%2FAgent-S | Trendshift\" style=\"width: 250px; height: 55px;\" width=\"250\" height=\"55\"/></a>\n</p>\n\n<p align=\"center\">\n  <img src=\"https://img.shields.io/badge/OS-Windows-blue?logo=windows&logoColor=white\" alt=\"Windows\">\n  <img src=\"https://img.shields.io/badge/OS-macOS-black?logo=apple&logoColor=white\" alt=\"macOS\">\n  <img src=\"https://img.shields.io/badge/OS-Linux-yellow?logo=linux&logoColor=black\" alt=\"Linux\">\n  <a href=\"https://discord.gg/E2XfsK9fPV\">\n    <img src=\"https://dcbadge.limes.pink/api/server/https://discord.gg/E2XfsK9fPV?style=flat\" alt=\"Discord\">\n  </a>\n  &nbsp;&nbsp;\n  <a href=\"https://pepy.tech/projects/gui-agents\">\n    <img 
src=\"https://static.pepy.tech/badge/gui-agents\" alt=\"PyPI Downloads\">\n  </a>\n</p>\n\n<div align=\"center\">\n  <!-- Keep these links. Translations will automatically update with the README. -->\n  <a href=\"https://www.readme-i18n.com/simular-ai/Agent-S?lang=de\">Deutsch</a> | \n  <a href=\"https://www.readme-i18n.com/simular-ai/Agent-S?lang=es\">Español</a> | \n  <a href=\"https://www.readme-i18n.com/simular-ai/Agent-S?lang=fr\">français</a> | \n  <a href=\"https://www.readme-i18n.com/simular-ai/Agent-S?lang=ja\">日本語</a> | \n  <a href=\"https://www.readme-i18n.com/simular-ai/Agent-S?lang=ko\">한국어</a> | \n  <a href=\"https://www.readme-i18n.com/simular-ai/Agent-S?lang=pt\">Português</a> | \n  <a href=\"https://www.readme-i18n.com/simular-ai/Agent-S?lang=ru\">Русский</a> | \n  <a href=\"https://www.readme-i18n.com/simular-ai/Agent-S?lang=zh\">中文</a>\n</div>\n\n<div align=\"center\">\n  &nbsp;&nbsp;\n<p>Skip the setup? Try Agent S in <a href=\"https://cloud.simular.ai/\">Simular Cloud</a></p>\n</div>\n\n## 🥳 Updates\n- [x] **2025/12/15**: Agent S3 is the **first** to surpass human-level performance on OSWorld with an impressive score of **72.60%**!\n- [x] **2025/10/02**: Released Agent S3 and its [technical paper](https://arxiv.org/abs/2510.02250), setting a new SOTA of **69.9%** on OSWorld (approaching 72% human performance), with strong generalizability on WindowsAgentArena and AndroidWorld! It is also simpler, faster, and more flexible.\n- [x] **2025/08/01**: Agent S2.5 is released (gui-agents v0.2.5): simpler, better, and faster! New SOTA on [OSWorld-Verified](https://os-world.github.io)!\n- [x] **2025/07/07**: The [Agent S2 paper](https://arxiv.org/abs/2504.00906) is accepted to COLM 2025! 
See you in Montreal!\n- [x] **2025/04/27**: The Agent S paper won the Best Paper Award 🏆 at ICLR 2025 Agentic AI for Science Workshop!\n- [x] **2025/04/01**: Released the [Agent S2 paper](https://arxiv.org/abs/2504.00906) with new SOTA results on OSWorld, WindowsAgentArena, and AndroidWorld!\n- [x] **2025/03/12**: Released Agent S2 along with v0.2.0 of [gui-agents](https://github.com/simular-ai/Agent-S), the new state-of-the-art for computer use agents (CUA), outperforming OpenAI's CUA/Operator and Anthropic's Claude 3.7 Sonnet Computer-Use!\n- [x] **2025/01/22**: The [Agent S paper](https://arxiv.org/abs/2410.08164) is accepted to ICLR 2025!\n- [x] **2025/01/21**: Released v0.1.2 of [gui-agents](https://github.com/simular-ai/Agent-S) library, with support for Linux and Windows!\n- [x] **2024/12/05**: Released v0.1.0 of [gui-agents](https://github.com/simular-ai/Agent-S) library, allowing you to use Agent-S for Mac, OSWorld, and WindowsAgentArena with ease!\n- [x] **2024/10/10**: Released the [Agent S paper](https://arxiv.org/abs/2410.08164) and codebase!\n\n## Table of Contents\n\n1. [💡 Introduction](#-introduction)\n2. [🎯 Current Results](#-current-results)\n3. [🛠️ Installation & Setup](#%EF%B8%8F-installation--setup) \n4. [🚀 Usage](#-usage)\n5. [🤝 Acknowledgements](#-acknowledgements)\n6. [💬 Citations](#-citations)\n\n## 💡 Introduction\n\nWelcome to **Agent S**, an open-source framework designed to enable autonomous interaction with computers through an Agent-Computer Interface. Our mission is to build intelligent GUI agents that can learn from past experiences and perform complex tasks autonomously on your computer. 
\n\nWhether you're interested in AI, automation, or contributing to cutting-edge agent-based systems, we're excited to have you here!\n\n## 🎯 Current Results\n\n<p align=\"center\">\n  <img src=\"images/s3_results_new.png\" alt=\"Agent S3 Results\" width=\"700\"/>\n</p>\n\nOn OSWorld, Agent S3 alone reaches 66% in the 100-step setting, already exceeding the previous state of the art of 63.4% (GTA1 w/ GPT-5). With the addition of Behavior Best-of-N, performance climbs even higher to 72.6%, *surpassing* human-level performance on OSWorld (~72%)!\n\nAgent S3 also demonstrates strong zero-shot generalization! On WindowsAgentArena, accuracy rises from 50.2% using only Agent S3 to 56.6% by selecting from 3 rollouts. Similarly on AndroidWorld, performance improves from 68.1% to 71.6%.\n\n## 🛠️ Installation & Setup\n\n### Prerequisites\n- **Single Monitor**: Our agent is designed for single monitor screens\n- **Security**: The agent runs Python code to control your computer - use with care\n- **Supported Platforms**: Linux, Mac, and Windows\n\n\n### Installation\nTo install Agent S3 without cloning the repository, run\n```bash\npip install gui-agents\n```\nIf you would like to test Agent S3 while making changes, clone the repository and install using\n```bash\npip install -e .\n```\n\nDon't forget to also `brew install tesseract`! Pytesseract requires this extra installation to work.\n\n### API Configuration\n\n#### Option 1: Environment Variables\nAdd to your `.bashrc` (Linux) or `.zshrc` (macOS):\n```bash\nexport OPENAI_API_KEY=<YOUR_API_KEY>\nexport ANTHROPIC_API_KEY=<YOUR_ANTHROPIC_API_KEY>\nexport HF_TOKEN=<YOUR_HF_TOKEN>\n```\n\n#### Option 2: Python Script\n```python\nimport os\nos.environ[\"OPENAI_API_KEY\"] = \"<YOUR_API_KEY>\"\n```\n\n### Supported Models\nWe support Azure OpenAI, Anthropic, Gemini, Open Router, and vLLM inference. 
See [models.md](models.md) for details.\n\n### Grounding Models (Required)\nFor optimal performance, we recommend [UI-TARS-1.5-7B](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B) hosted on Hugging Face Inference Endpoints or another provider. See [Hugging Face Inference Endpoints](https://huggingface.co/learn/cookbook/en/enterprise_dedicated_endpoints) for setup instructions.\n\n## 🚀 Usage\n\n\n> ⚡️ **Recommended Setup:**  \n> For the best configuration, we recommend using **OpenAI gpt-5-2025-08-07** as the main model, paired with **UI-TARS-1.5-7B** for grounding.  \n\n\n### CLI\n\nNote, this is running Agent S3, our improved agent, without bBoN. \n\nRun Agent S3 with the required parameters:\n\n```bash\nagent_s \\\n    --provider openai \\\n    --model gpt-5-2025-08-07 \\\n    --ground_provider huggingface \\\n    --ground_url http://localhost:8080 \\\n    --ground_model ui-tars-1.5-7b \\\n    --grounding_width 1920 \\\n    --grounding_height 1080\n```\n\n#### Local Coding Environment (Optional)\nFor tasks that require code execution (e.g., data processing, file manipulation, system automation), you can enable the local coding environment:\n\n```bash\nagent_s \\\n    --provider openai \\\n    --model gpt-5-2025-08-07 \\\n    --ground_provider huggingface \\\n    --ground_url http://localhost:8080 \\\n    --ground_model ui-tars-1.5-7b \\\n    --grounding_width 1920 \\\n    --grounding_height 1080 \\\n    --enable_local_env\n```\n\n⚠️ **WARNING**: The local coding environment executes arbitrary Python and Bash code locally on your machine. Only use this feature in trusted environments and with trusted inputs.\n\n#### Required Parameters\n- **`--provider`**: Main generation model provider (e.g., openai, anthropic, etc.) 
- Default: \"openai\"\n- **`--model`**: Main generation model name (e.g., gpt-5-2025-08-07) - Default: \"gpt-5-2025-08-07\"\n- **`--ground_provider`**: The provider for the grounding model - **Required**\n- **`--ground_url`**: The URL of the grounding model - **Required**\n- **`--ground_model`**: The model name for the grounding model - **Required**\n- **`--grounding_width`**: Width of the output coordinate resolution from the grounding model - **Required**\n- **`--grounding_height`**: Height of the output coordinate resolution from the grounding model - **Required**\n\n#### Optional Parameters\n- **`--model_temperature`**: The temperature to fix all model calls to (necessary to set to 1.0 for models like o3 but can be left blank for other models)\n\n#### Grounding Model Dimensions\nThe grounding width and height should match the output coordinate resolution of your grounding model:\n- **UI-TARS-1.5-7B**: Use `--grounding_width 1920 --grounding_height 1080`\n- **UI-TARS-72B**: Use `--grounding_width 1000 --grounding_height 1000`\n\n#### Optional Parameters\n- **`--model_url`**: Custom API URL for main generation model - Default: \"\"\n- **`--model_api_key`**: API key for main generation model - Default: \"\"\n- **`--ground_api_key`**: API key for grounding model endpoint - Default: \"\"\n- **`--max_trajectory_length`**: Maximum number of image turns to keep in trajectory - Default: 8\n- **`--enable_reflection`**: Enable reflection agent to assist the worker agent - Default: True\n- **`--enable_local_env`**: Enable local coding environment for code execution (WARNING: Executes arbitrary code locally) - Default: False\n\n#### Local Coding Environment Details\nThe local coding environment enables Agent S3 to execute Python and Bash code directly on your machine. 
This is particularly useful for:\n\n- **Data Processing**: Manipulating spreadsheets, CSV files, or databases\n- **File Operations**: Bulk file processing, content extraction, or file organization\n- **System Automation**: Configuration changes, system setup, or automation scripts\n- **Code Development**: Writing, editing, or executing code files\n- **Text Processing**: Document manipulation, content editing, or formatting\n\nWhen enabled, the agent can use the `call_code_agent` action to execute code blocks for tasks that can be completed through programming rather than GUI interaction.\n\n**Requirements:**\n- **Python**: The same Python interpreter used to run Agent S3 (automatically detected)\n- **Bash**: Available at `/bin/bash` (standard on macOS and Linux)\n- **System Permissions**: The agent runs with the same permissions as the user executing it\n\n**Security Considerations:**\n- The local environment executes arbitrary code with the same permissions as the user running the agent\n- Only enable this feature in trusted environments\n- Be cautious when the agent generates code for system-level operations\n- Consider running in a sandboxed environment for untrusted tasks\n- Bash scripts are executed with a 30-second timeout to prevent hanging processes\n\n### `gui_agents` SDK\n\nFirst, we import the necessary modules. `AgentS3` is the main agent class for Agent S3. `OSWorldACI` is our grounding agent that translates agent actions into executable python code.\n```python\nimport pyautogui\nimport io\nfrom gui_agents.s3.agents.agent_s import AgentS3\nfrom gui_agents.s3.agents.grounding import OSWorldACI\nfrom gui_agents.s3.utils.local_env import LocalEnv  # Optional: for local coding environment\n\n# Load in your API keys.\nfrom dotenv import load_dotenv\nload_dotenv()\n\ncurrent_platform = \"linux\"  # \"darwin\", \"windows\"\n```\n\nNext, we define our engine parameters. 
`engine_params` is used for the main agent, and `engine_params_for_grounding` is for grounding. For `engine_params_for_grounding`, we support custom endpoints like HuggingFace TGI, vLLM, and Open Router.\n\n```python\nengine_params = {\n  \"engine_type\": provider,\n  \"model\": model,\n  \"base_url\": model_url,           # Optional\n  \"api_key\": model_api_key,        # Optional\n  \"temperature\": model_temperature # Optional\n}\n\n# Load the grounding engine from a custom endpoint\nground_provider = \"<your_ground_provider>\"\nground_url = \"<your_ground_url>\"\nground_model = \"<your_ground_model>\"\nground_api_key = \"<your_ground_api_key>\"\n\n# Set grounding dimensions based on your model's output coordinate resolution\n# UI-TARS-1.5-7B: grounding_width=1920, grounding_height=1080\n# UI-TARS-72B: grounding_width=1000, grounding_height=1000\ngrounding_width = 1920  # Width of output coordinate resolution\ngrounding_height = 1080  # Height of output coordinate resolution\n\nengine_params_for_grounding = {\n  \"engine_type\": ground_provider,\n  \"model\": ground_model,\n  \"base_url\": ground_url,\n  \"api_key\": ground_api_key,  # Optional\n  \"grounding_width\": grounding_width,\n  \"grounding_height\": grounding_height,\n}\n```\n\nThen, we define our grounding agent and Agent S3.\n\n```python\n# Optional: Enable local coding environment\nenable_local_env = False  # Set to True to enable local code execution\nlocal_env = LocalEnv() if enable_local_env else None\n\ngrounding_agent = OSWorldACI(\n    env=local_env,  # Pass local_env for code execution capability\n    platform=current_platform,\n    engine_params_for_generation=engine_params,\n    engine_params_for_grounding=engine_params_for_grounding,\n    width=1920,  # Optional: screen width\n    height=1080  # Optional: screen height\n)\n\nagent = AgentS3(\n    engine_params,\n    grounding_agent,\n    platform=current_platform,\n    max_trajectory_length=8,  # Optional: maximum image turns to keep\n    
enable_reflection=True     # Optional: enable reflection agent\n)\n```\n\nFinally, let's query the agent!\n\n```python\n# Get screenshot.\nscreenshot = pyautogui.screenshot()\nbuffered = io.BytesIO() \nscreenshot.save(buffered, format=\"PNG\")\nscreenshot_bytes = buffered.getvalue()\n\nobs = {\n  \"screenshot\": screenshot_bytes,\n}\n\ninstruction = \"Close VS Code\"\ninfo, action = agent.predict(instruction=instruction, observation=obs)\n\nexec(action[0])\n```\n\nRefer to `gui_agents/s3/cli_app.py` for more details on how the inference loop works.\n\n### OSWorld\n\nTo deploy Agent S3 in OSWorld, follow the [OSWorld Deployment instructions](osworld_setup/s3/OSWorld.md).\n\n## 💬 Citations\n\nIf you find this codebase useful, please cite:\n\n```\n@misc{Agent-S3,\n      title={The Unreasonable Effectiveness of Scaling Agents for Computer Use}, \n      author={Gonzalo Gonzalez-Pumariega and Vincent Tu and Chih-Lun Lee and Jiachen Yang and Ang Li and Xin Eric Wang},\n      year={2025},\n      eprint={2510.02250},\n      archivePrefix={arXiv},\n      primaryClass={cs.AI},\n      url={https://arxiv.org/abs/2510.02250}, \n}\n\n@misc{Agent-S2,\n      title={Agent S2: A Compositional Generalist-Specialist Framework for Computer Use Agents}, \n      author={Saaket Agashe and Kyle Wong and Vincent Tu and Jiachen Yang and Ang Li and Xin Eric Wang},\n      year={2025},\n      eprint={2504.00906},\n      archivePrefix={arXiv},\n      primaryClass={cs.AI},\n      url={https://arxiv.org/abs/2504.00906}, \n}\n\n@inproceedings{Agent-S,\n    title={{Agent S: An Open Agentic Framework that Uses Computers Like a Human}},\n    author={Saaket Agashe and Jiuzhou Han and Shuyu Gan and Jiachen Yang and Ang Li and Xin Eric Wang},\n    booktitle={International Conference on Learning Representations (ICLR)},\n    year={2025},\n    url={https://arxiv.org/abs/2410.08164}\n}\n```\n\n## Star History\n\n[![Star History 
Chart](https://api.star-history.com/svg?repos=simular-ai/Agent-S&type=Date)](https://star-history.com/#simular-ai/Agent-S&Date)\n"
  },
  {
    "path": "WAA_setup.md",
    "content": "# Introduction\n\nThis is the WindowsAgentArena (WAA) setup with Agent S2.5 (and beyond). Why do we need a setup guide? Despite the thorough [README.md](https://github.com/microsoft/WindowsAgentArena?tab=readme-ov-file \"https://github.com/microsoft/WindowsAgentArena?tab=readme-ov-file\"), we have to include our code into their repository _and_ fix up a number of setup issues from the WAA environment. Sadly, this isn’t the most straightforward.\n\n# Initial WAA Setup\n\nThe initial WAA setup is straightforward. Follow the [README.md](https://github.com/microsoft/WindowsAgentArena?tab=readme-ov-file \"https://github.com/microsoft/WindowsAgentArena?tab=readme-ov-file\") on their repository. After you’ve finished this, try running `run-local.sh`. This will start up an experiment with their default `Navi` agent. At this point, the environment is _sufficient to run evaluation_, but it’s incomplete and thus the evaluation won’t be exactly correct due to environment issues.\n\n![](./images/waa_setup/fig1.png)\n\nFigure 1: Bash script chain of execution.\n\nWhile we’re at it, look to understand the following things:\n\n-   the entire README.md (especially the [Bring Your Own Agent guide](https://github.com/microsoft/WindowsAgentArena?tab=readme-ov-file#-byoa-bring-your-own-agent \"https://github.com/microsoft/WindowsAgentArena?tab=readme-ov-file#-byoa-bring-your-own-agent\"))\n    \n-   the _long_ chain of bash scripts that start the run (Figure 1)\n    \n-   the `run.py` to see how the agent/environment are instantiated and used together\n    \n-   the folder structure of the repository and the purpose of each folder\n    \n\n# Fixing Setup Issues\n\nBy now, your WAA environment should be set up to run locally. 
There are two major problems:\n\n-   setup issues\n    \n-   the VM persists across examples (it won’t reset after every example is completed which may make evaluation unfair)\n    \n\nLet’s tackle the first one: setup issues.\n\n### Office Apps Aren’t Installed\n\nThe first issue I ran into was that the office apps aren’t installed. Why is that? Turns out all apps installed in the VM during the initial setup stage install via the links from this [file](https://github.com/microsoft/WindowsAgentArena/blob/main/src/win-arena-container/vm/setup/tools_config.json \"https://github.com/microsoft/WindowsAgentArena/blob/main/src/win-arena-container/vm/setup/tools_config.json\") (`tools_config.json`). At the time of writing this, only the office links do not work. Try out all the links to make sure they work. If the links do not lead to a download (and some error occurs), then that app was not installed in the VM. What do we do? Two options:\n\n-   redo the entire initial setup stage (time consuming; ~**4** hours for me and even then, it would just not work a lot of the time; ideally, WAA is set up on Linux as I’ve had no issues so far with it)\n    \n-   Enter the VM and install the apps manually (easier and faster)\n    \n\nWe’ll do the second approach.\n\nYou can access the VM via `https://localhost:8006`. You can turn the VM on by running `run-local.sh`. There’s probably a better/faster way to do it, but this doesn’t take too much time anyways (~**1-2** mins). 
After the VM has started, enter the VM (the agent may be trying to take actions, but you can either just override the action in `run.py` with `import time; time.sleep(10000)` [here](https://github.com/microsoft/WindowsAgentArena/blob/6d39ed88c545a0d40a7a02e39b928e278df7332b/src/win-arena-container/client/lib_run_single.py#L58 \"https://github.com/microsoft/WindowsAgentArena/blob/6d39ed88c545a0d40a7a02e39b928e278df7332b/src/win-arena-container/client/lib_run_single.py#L58\") or fight the agent for control of the VM!).\n\nInside the VM, navigate to the LibreOffice [download page](https://www.libreoffice.org/download/download-libreoffice/ \"https://www.libreoffice.org/download/download-libreoffice/\") and download the latest LibreOffice version. After it’s downloaded, complete the setup wizard and make sure to delete the downloaded `*.msi` file in the VM. Finally, test the installation by opening up LibreOffice Writer and Calc.\n\n### Google Chrome Pop-ups\n\nIn Google Chrome, there are a couple of unexpected pop-ups.\n\n![](./images/waa_setup/fig2.png)\n\nFigure 2: Pop-ups on Chrome.\n\nClose all these pop-ups and [make Google Chrome your default web browser](https://support.google.com/chrome/answer/95417?hl=en&co=GENIE.Platform%3DDesktop#zippy=%2Cmac%2Cwindows \"https://support.google.com/chrome/answer/95417?hl=en&co=GENIE.Platform%3DDesktop#zippy=%2Cmac%2Cwindows\").\n\n### VSCode Pop-ups\n\nThis isn’t as important, but there are a couple of initial pop-ups in VSCode that you can close.\n\n### Note: `set_cell_values`\n\n_Important if you’re using_ `set_cell_values`\n\nAgent S2.5 uses a special grounding function called `set_cell_values` that takes advantage of the `soffice` CLI and `unotools` [Python library](https://pypi.org/project/unotools/ \"https://pypi.org/project/unotools/\"). TL;DR, this function lets the agent set the cell values for a given spreadsheet and sheet.\n\nFor this function to work on WAA, the setup is a bit messy…\n\n1.  Connect into the VM\n    \n2.  
Open up a terminal and run `python --version`; you should see that you’re using the GIMP Python, which is `2.x`. This won’t let you use the `soffice` CLI or `import uno` in Python code.\n    \n3.  In the `Desktop` directory within a terminal, do `pip freeze > requirements.txt` to save all the PyPI libraries from the GIMP Python to a `requirements.txt`.\n    \n4.  Configure the Python path to point to LibreOffice’s Python\n    \n    1.  In the File Explorer, locate the `python.exe` file from LibreOffice. You can do this with `where python`. Copy this path.\n        \n    2.  In the Search bar in the bottom task bar inside the VM, search for “environment variables”.\n        \n    3.  Click on “Environment Variables” and click on “Path” under “System variables”. Paste the copied path from step (a) into there and ensure this path is _above_ the GIMP Python path so it takes precedence.\n        \n    4.  Reopen a terminal and run `soffice` to ensure it is now working. Create a temporary Python file and ensure `import uno` works.\n        \n5.  LibreOffice’s Python should be `3.10` or above. However, it does not come with pip. To install pip, download this [file](https://bootstrap.pypa.io/get-pip.py \"https://bootstrap.pypa.io/get-pip.py\") and execute `python get-pip.py` to install it. Ensure the `python` here is LibreOffice’s Python. Next, run `pip install -r requirements.txt` using the `requirements.txt` from step 3. This is to ensure LibreOffice’s Python has all the dependencies needed for evaluation (pyautogui, etc).\n    \n6.  Clean up all installer files. 
Then, inside the [WAA repository code](https://github.com/microsoft/WindowsAgentArena/blob/6d39ed88c545a0d40a7a02e39b928e278df7332b/src/win-arena-container/client/desktop_env/controllers/python.py#L193 \"https://github.com/microsoft/WindowsAgentArena/blob/6d39ed88c545a0d40a7a02e39b928e278df7332b/src/win-arena-container/client/desktop_env/controllers/python.py#L193\"), change this line\n    \n\n`command_list = [\"python\", \"-c\", self.pkgs_prefix.format(command=command)]`\n\nto:\n\n`command_list = [\"absolute/path/to/libreoffice/python\", \"-c\", self.pkgs_prefix.format(command=command)]`\n\nThis ensures that the subprocess running in the flask server inside the VM will use that specific Python version.\n\n### Double Checking…\n\nDouble check all apps can be used and no unexpected pop-ups or issues are in the way. Any apps you open make sure to close them upon finishing your clean-up. Make sure any installation files you have in `Downloads` are deleted (and removed from Recycle Bin) to keep the environment clean. At the end, this is our **golden image**. You may want to save a copy of this VM somewhere safe so that you can always copy it back into the WAA repository to be reused (refer to [this](https://github.com/microsoft/WindowsAgentArena/tree/main?tab=readme-ov-file#additional-notes \"https://github.com/microsoft/WindowsAgentArena/tree/main?tab=readme-ov-file#additional-notes\")).\n\n# Set up Agent S2.5 with WAA Locally\n\nTake the time to understand the [Agent-S repository](https://github.com/simular-ai/Agent-S \"https://github.com/simular-ai/Agent-S\").\n\n1.  Instead of following the [README.md](https://github.com/simular-ai/Agent-S/blob/main/README.md \"https://github.com/simular-ai/Agent-S/blob/main/README.md\") for Agent S2.5, you need to clone the repository then `pip install -r requirements.txt`\n    \n2.  
Move the S2.5 folder to the [mm_agents](https://github.com/microsoft/WindowsAgentArena/tree/main/src/win-arena-container/client/mm_agents \"https://github.com/microsoft/WindowsAgentArena/tree/main/src/win-arena-container/client/mm_agents\") folder in WAA. Follow the [Bring Your Own Agent guide](https://github.com/microsoft/WindowsAgentArena?tab=readme-ov-file#-byoa-bring-your-own-agent \"https://github.com/microsoft/WindowsAgentArena?tab=readme-ov-file#-byoa-bring-your-own-agent\").\n    \n    1.  You will need to move the `agent_s.py` file out to the `S2.5` folder and update all the relevant import statements.\n        \n3.  Make the necessary changes in `run.py` and `lib_run_single.py` to accommodate Agent S2.5 (replace the Navi Agent with Agent S2.5).\n    \n4.  Test it by running the experiments! Don’t forget that when you run `run-local.sh`, you now need to specify Agent S2.5 instead of the Navi agent: `agent=\"agent_s\"`.\n    \n5.  You may hit some import errors; the missing libraries need to be installed inside the `winarena` container (I think). You can just add the pip install commands to the bash script where the error stems from (hacky).\n\n# Agent S2.5 with WAA on Azure\n\n1.  Ensure you have:\n    \n    1.  a **clean copy** of the golden image\n        \n    2.  the correct Azure subscription (so you’re not using your own payment method)\n        \n2.  Follow the Azure deployment instructions in the [README.md](https://github.com/microsoft/WindowsAgentArena/blob/main/README.md \"https://github.com/microsoft/WindowsAgentArena/blob/main/README.md\").\n    \n3.  Test it! If this works, then we have a resettable golden image and WAA can be run in parallel, making evaluation much _much_ faster! Good luck!"
  },
  {
    "path": "evaluation_sets/test_all.json",
    "content": "{\n  \"chrome\": [\n    \"bb5e4c0d-f964-439c-97b6-bdb9747de3f4\",\n    \"7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3\",\n    \"06fe7178-4491-4589-810f-2e2bc9502122\",\n    \"e1e75309-3ddb-4d09-92ec-de869c928143\",\n    \"35253b65-1c19-4304-8aa4-6884b8218fc0\",\n    \"2ad9387a-65d8-4e33-ad5b-7580065a27ca\",\n    \"7a5a7856-f1b6-42a4-ade9-1ca81ca0f263\",\n    \"44ee5668-ecd5-4366-a6ce-c1c9b8d4e938\",\n    \"2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3\",\n    \"480bcfea-d68f-4aaa-a0a9-2589ef319381\",\n    \"af630914-714e-4a24-a7bb-f9af687d3b91\",\n    \"3720f614-37fd-4d04-8a6b-76f54f8c222d\",\n    \"99146c54-4f37-4ab8-9327-5f3291665e1e\",\n    \"12086550-11c0-466b-b367-1d9e75b3910e\",\n    \"6766f2b8-8a72-417f-a9e5-56fcaa735837\",\n    \"93eabf48-6a27-4cb6-b963-7d5fe1e0d3a9\",\n    \"ae78f875-5b98-4907-bbb5-9c737fc68c03\",\n    \"3299584d-8f11-4457-bf4c-ce98f7600250\",\n    \"030eeff7-b492-4218-b312-701ec99ee0cc\",\n    \"9656a811-9b5b-4ddf-99c7-5117bcef0626\",\n    \"fc6d8143-9452-4171-9459-7f515143419a\",\n    \"a96b564e-dbe9-42c3-9ccf-b4498073438a\",\n    \"1704f00f-79e6-43a7-961b-cedd3724d5fd\",\n    \"f3b19d1e-2d48-44e9-b4e1-defcae1a0197\",\n    \"82bc8d6a-36eb-4d2d-8801-ef714fb1e55a\",\n    \"47543840-672a-467d-80df-8f7c3b9788c9\",\n    \"c1fa57f3-c3db-4596-8f09-020701085416\",\n    \"da46d875-6b82-4681-9284-653b0c7ae241\",\n    \"6c4c23a1-42a4-43cc-9db1-2f86ff3738cc\",\n    \"f79439ad-3ee8-4f99-a518-0eb60e5652b0\",\n    \"b7895e80-f4d1-4648-bee0-4eb45a6f1fa8\",\n    \"9f3f70fc-5afc-4958-a7b7-3bb4fcb01805\",\n    \"7f52cab9-535c-4835-ac8c-391ee64dc930\",\n    \"82279c77-8fc6-46f6-9622-3ba96f61b477\",\n    \"2888b4e6-5b47-4b57-8bf5-c73827890774\",\n    \"b4f95342-463e-4179-8c3f-193cd7241fb2\",\n    \"f5d96daf-83a8-4c86-9686-bada31fc66ab\",\n    \"121ba48f-9e17-48ce-9bc6-a4fb17a7ebba\",\n    \"368d9ba4-203c-40c1-9fa3-da2f1430ce63\",\n    \"59155008-fe71-45ec-8a8f-dc35497b6aa8\",\n    \"a728a36e-8bf1-4bb6-9a03-ef039a5233f0\",\n    
\"b070486d-e161-459b-aa2b-ef442d973b92\",\n    \"0d8b7de3-e8de-4d86-b9fd-dd2dce58a217\",\n    \"9f935cce-0a9f-435f-8007-817732bfc0a5\",\n    \"f0b971a1-6831-4b9b-a50e-22a6e47f45ba\",\n    \"cabb3bae-cccb-41bd-9f5d-0f3a9fecd825\"\n  ],\n  \"gimp\": [\n    \"7a4deb26-d57d-4ea9-9a73-630f66a7b568\",\n    \"554785e9-4523-4e7a-b8e1-8016f565f56a\",\n    \"77b8ab4d-994f-43ac-8930-8ca087d7c4b4\",\n    \"f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce\",\n    \"d52d6308-ec58-42b7-a2c9-de80e4837b2b\",\n    \"2a729ded-3296-423d-aec4-7dd55ed5fbb3\",\n    \"b148e375-fe0b-4bec-90e7-38632b0d73c2\",\n    \"a746add2-cab0-4740-ac36-c3769d9bfb46\",\n    \"7b7617bd-57cc-468e-9c91-40c4ec2bcb3d\",\n    \"d16c99dc-2a1e-46f2-b350-d97c86c85c15\",\n    \"06ca5602-62ca-47f6-ad4f-da151cde54cc\",\n    \"e2dd0213-26db-4349-abe5-d5667bfd725c\",\n    \"f723c744-e62c-4ae6-98d1-750d3cd7d79d\",\n    \"72f83cdc-bf76-4531-9a1b-eb893a13f8aa\",\n    \"7767eef2-56a3-4cea-8c9f-48c070c7d65b\",\n    \"734d6579-c07d-47a8-9ae2-13339795476b\",\n    \"e19bd559-633b-4b02-940f-d946248f088e\",\n    \"38f48d40-764e-4e77-a7cf-51dfce880291\",\n    \"fbb548ca-c2a6-4601-9204-e39a2efc507b\",\n    \"5ca86c6f-f317-49d8-b6a7-b527541caae8\",\n    \"62f7fd55-0687-4a43-b6e1-3eda16fc6252\",\n    \"8ea73f6f-9689-42ad-8c60-195bbf06a7ba\",\n    \"58d3eeeb-e9d0-499f-962e-fd0db2a744d8\",\n    \"2e6f678f-472d-4c55-99cc-8e7c5c402a71\",\n    \"045bf3ff-9077-4b86-b483-a1040a949cff\",\n    \"dbbf4b99-2253-4b10-9274-45f246af2466\"\n  ],\n  \"libreoffice_calc\": [\n    \"357ef137-7eeb-4c80-a3bb-0951f26a8aff\",\n    \"42e0a640-4f19-4b28-973d-729602b5a4a7\",\n    \"51719eea-10bc-4246-a428-ac7c433dd4b3\",\n    \"1954cced-e748-45c4-9c26-9855b97fbc5e\",\n    \"2bd59342-0664-4ccb-ba87-79379096cc08\",\n    \"3aaa4e37-dc91-482e-99af-132a612d40f3\",\n    \"1273e544-688f-496b-8d89-3e0f40aa0606\",\n    \"12382c62-0cd1-4bf2-bdc8-1d20bf9b2371\",\n    \"f9584479-3d0d-4c79-affa-9ad7afdd8850\",\n    \"535364ea-05bd-46ea-9937-9f55c68507e8\",\n    
\"7e429b8d-a3f0-4ed0-9b58-08957d00b127\",\n    \"4f07fbe9-70de-4927-a4d5-bb28bc12c52c\",\n    \"04d9aeaf-7bed-4024-bedb-e10e6f00eb7f\",\n    \"0bf05a7d-b28b-44d2-955a-50b41e24012a\",\n    \"6054afcb-5bab-4702-90a0-b259b5d3217c\",\n    \"abed40dc-063f-4598-8ba5-9fe749c0615d\",\n    \"37608790-6147-45d0-9f20-1137bb35703d\",\n    \"26a8440e-c166-4c50-aef4-bfb77314b46b\",\n    \"d681960f-7bc3-4286-9913-a8812ba3261a\",\n    \"035f41ba-6653-43ab-aa63-c86d449d62e5\",\n    \"7efeb4b1-3d19-4762-b163-63328d66303b\",\n    \"1de60575-bb6e-4c3d-9e6a-2fa699f9f197\",\n    \"aa3a8974-2e85-438b-b29e-a64df44deb4b\",\n    \"51b11269-2ca8-4b2a-9163-f21758420e78\",\n    \"1e8df695-bd1b-45b3-b557-e7d599cf7597\",\n    \"ecb0df7a-4e8d-4a03-b162-053391d3afaf\",\n    \"8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14\",\n    \"a01fbce3-2793-461f-ab86-43680ccbae25\",\n    \"0326d92d-d218-48a8-9ca1-981cd6d064c7\",\n    \"0a2e43bf-b26c-4631-a966-af9dfa12c9e5\",\n    \"4188d3a4-077d-46b7-9c86-23e1a036f6c1\",\n    \"347ef137-7eeb-4c80-a3bb-0951f26a8aff\",\n    \"eb03d19a-b88d-4de4-8a64-ca0ac66f426b\",\n    \"0cecd4f3-74de-457b-ba94-29ad6b5dafb6\",\n    \"1d17d234-e39d-4ed7-b46f-4417922a4e7c\",\n    \"4e6fcf72-daf3-439f-a232-c434ce416af6\",\n    \"01b269ae-2111-4a07-81fd-3fcd711993b0\",\n    \"21df9241-f8d7-4509-b7f1-37e501a823f7\",\n    \"a9f325aa-8c05-4e4f-8341-9e4358565f4f\",\n    \"6e99a1ad-07d2-4b66-a1ce-ece6d99c20a5\",\n    \"7a4e4bc8-922c-4c84-865c-25ba34136be1\",\n    \"4de54231-e4b5-49e3-b2ba-61a0bec721c0\",\n    \"30e3e107-1cfb-46ee-a755-2cd080d7ba6a\",\n    \"4172ea6e-6b77-4edb-a9cc-c0014bd1603b\",\n    \"1334ca3e-f9e3-4db8-9ca7-b4c653be7d17\",\n    \"3a7c8185-25c1-4941-bd7b-96e823c9f21f\",\n    \"21ab7b40-77c2-4ae6-8321-e00d3a086c73\"\n  ],\n  \"libreoffice_impress\": [\n    \"5d901039-a89c-4bfb-967b-bf66f4df075e\",\n    \"550ce7e7-747b-495f-b122-acdc4d0b8e54\",\n    \"455d3c66-7dc6-4537-a39a-36d3e9119df7\",\n    \"af23762e-2bfd-4a1d-aada-20fa8de9ce07\",\n    
\"c59742c0-4323-4b9d-8a02-723c251deaa0\",\n    \"ef9d12bd-bcee-4ba0-a40e-918400f43ddf\",\n    \"9ec204e4-f0a3-42f8-8458-b772a6797cab\",\n    \"0f84bef9-9790-432e-92b7-eece357603fb\",\n    \"ce88f674-ab7a-43da-9201-468d38539e4a\",\n    \"3b27600c-3668-4abd-8f84-7bcdebbccbdb\",\n    \"a097acff-6266-4291-9fbd-137af7ecd439\",\n    \"bf4e9888-f10f-47af-8dba-76413038b73c\",\n    \"21760ecb-8f62-40d2-8d85-0cee5725cb72\",\n    \"ac9bb6cb-1888-43ab-81e4-a98a547918cd\",\n    \"2cd43775-7085-45d8-89fa-9e35c0a915cf\",\n    \"358aa0a7-6677-453f-ae35-e440f004c31e\",\n    \"a669ef01-ded5-4099-9ea9-25e99b569840\",\n    \"73c99fb9-f828-43ce-b87a-01dc07faa224\",\n    \"15aece23-a215-4579-91b4-69eec72e18da\",\n    \"986fc832-6af2-417c-8845-9272b3a1528b\",\n    \"a434992a-89df-4577-925c-0c58b747f0f4\",\n    \"7dbc52a6-11e0-4c9a-a2cb-1e36cfda80d8\",\n    \"841b50aa-df53-47bd-a73a-22d3a9f73160\",\n    \"8979838c-54a5-4454-a2b8-3d135a1a5c8f\",\n    \"b8adbc24-cef2-4b15-99d5-ecbe7ff445eb\",\n    \"2b94c692-6abb-48ae-ab0b-b3e8a19cb340\",\n    \"9cf05d24-6bd9-4dae-8967-f67d88f5d38a\",\n    \"08aced46-45a2-48d7-993b-ed3fb5b32302\",\n    \"edb61b14-a854-4bf5-a075-c8075c11293a\",\n    \"c82632a4-56b6-4db4-9dd1-3820ee3388e4\",\n    \"39be0d19-634d-4475-8768-09c130f5425d\",\n    \"ac1b39ff-ee4d-4483-abce-c117e98942f0\",\n    \"f23acfd2-c485-4b7c-a1e7-d4303ddfe864\",\n    \"70bca0cc-c117-427e-b0be-4df7299ebeb6\",\n    \"af2d657a-e6b3-4c6a-9f67-9e3ed015974c\",\n    \"57667013-ea97-417c-9dce-2713091e6e2a\",\n    \"0a211154-fda0-48d0-9274-eaac4ce5486d\",\n    \"a53f80cd-4a90-4490-8310-097b011433f6\",\n    \"7ae48c60-f143-4119-b659-15b8f485eb9a\",\n    \"5cfb9197-e72b-454b-900e-c06b0c802b40\",\n    \"05dd4c1d-c489-4c85-8389-a7836c4f0567\",\n    \"5c1a6c3d-c1b3-47cb-9b01-8d1b7544ffa1\",\n    \"4ed5abd0-8b5d-47bd-839f-cacfa15ca37a\",\n    \"e4ef0baf-4b52-4590-a47e-d4d464cca2d7\",\n    \"ed43c15f-00cb-4054-9c95-62c880865d68\",\n    \"3161d64e-3120-47b4-aaad-6a764a92493b\",\n    
\"04578141-1d42-4146-b9cf-6fab4ce5fd74\"\n  ],\n  \"libreoffice_writer\": [\n    \"0810415c-bde4-4443-9047-d5f70165a697\",\n    \"0a0faba3-5580-44df-965d-f562a99b291c\",\n    \"0b17a146-2934-46c7-8727-73ff6b6483e8\",\n    \"0e47de2a-32e0-456c-a366-8c607ef7a9d2\",\n    \"0e763496-b6bb-4508-a427-fad0b6c3e195\",\n    \"3ef2b351-8a84-4ff2-8724-d86eae9b842e\",\n    \"4bcb1253-a636-4df4-8cb0-a35c04dfef31\",\n    \"66399b0d-8fda-4618-95c4-bfc6191617e9\",\n    \"6a33f9b9-0a56-4844-9c3f-96ec3ffb3ba2\",\n    \"6ada715d-3aae-4a32-a6a7-429b2e43fb93\",\n    \"6f81754e-285d-4ce0-b59e-af7edb02d108\",\n    \"72b810ef-4156-4d09-8f08-a0cf57e7cefe\",\n    \"8472fece-c7dd-4241-8d65-9b3cd1a0b568\",\n    \"88fe4b2d-3040-4c70-9a70-546a47764b48\",\n    \"936321ce-5236-426a-9a20-e0e3c5dc536f\",\n    \"adf5e2c3-64c7-4644-b7b6-d2f0167927e7\",\n    \"b21acd93-60fd-4127-8a43-2f5178f4a830\",\n    \"d53ff5ee-3b1a-431e-b2be-30ed2673079b\",\n    \"e246f6d8-78d7-44ac-b668-fcf47946cb50\",\n    \"e528b65e-1107-4b8c-8988-490e4fece599\",\n    \"ecc2413d-8a48-416e-a3a2-d30106ca36cb\",\n    \"f178a4a9-d090-4b56-bc4c-4b72a61a035d\",\n    \"bb8ccc78-479f-4a2f-a71e-d565e439436b\"\n  ],\n  \"multi_apps\": [\n    \"2b9493d7-49b8-493a-a71b-56cd1f4d6908\",\n    \"2c9fc0de-3ee7-45e1-a5df-c86206ad78b5\",\n    \"2fe4b718-3bd7-46ec-bdce-b184f5653624\",\n    \"3680a5ee-6870-426a-a997-eba929a0d25c\",\n    \"46407397-a7d5-4c6b-92c6-dbe038b1457b\",\n    \"4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc\",\n    \"510f64c8-9bcc-4be1-8d30-638705850618\",\n    \"51f5801c-18b3-4f25-b0c3-02f85507a078\",\n    \"58565672-7bfe-48ab-b828-db349231de6b\",\n    \"78aed49a-a710-4321-a793-b611a7c5b56b\",\n    \"897e3b53-5d4d-444b-85cb-2cdc8a97d903\",\n    \"937087b6-f668-4ba6-9110-60682ee33441\",\n    \"a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb\",\n    \"b52b40a5-ad70-4c53-b5b0-5650a8387052\",\n    \"c867c42d-a52d-4a24-8ae3-f75d256b5618\",\n    \"d9b7c649-c975-4f53-88f5-940b29c47247\",\n    \"e135df7c-7687-4ac0-a5f0-76b74438b53e\",\n    
\"ee9a3c83-f437-4879-8918-be5efbb9fac7\",\n    \"f7dfbef3-7697-431c-883a-db8583a4e4f9\",\n    \"f8cfa149-d1c1-4215-8dac-4a0932bad3c2\",\n    \"6d72aad6-187a-4392-a4c4-ed87269c51cf\",\n    \"f918266a-b3e0-4914-865d-4faa564f1aef\",\n    \"da52d699-e8d2-4dc5-9191-a2199e0b6a9b\",\n    \"bc2b57f3-686d-4ec9-87ce-edf850b7e442\",\n    \"74d5859f-ed66-4d3e-aa0e-93d7a592ce41\",\n    \"b5062e3e-641c-4e3a-907b-ac864d2e7652\",\n    \"00fa164e-2612-4439-992e-157d019a8436\",\n    \"acb0f96b-e27c-44d8-b55f-7cb76609dfcd\",\n    \"69acbb55-d945-4927-a87b-8480e1a5bb7e\",\n    \"48d05431-6cd5-4e76-82eb-12b60d823f7d\",\n    \"68a25bd4-59c7-4f4d-975e-da0c8509c848\",\n    \"eb303e01-261e-4972-8c07-c9b4e7a4922a\",\n    \"0c825995-5b70-4526-b663-113f4c999dd2\",\n    \"c7c1e4c3-9e92-4eba-a4b8-689953975ea4\",\n    \"d1acdb87-bb67-4f30-84aa-990e56a09c92\",\n    \"deec51c9-3b1e-4b9e-993c-4776f20e8bb2\",\n    \"8e116af7-7db7-4e35-a68b-b0939c066c78\",\n    \"337d318b-aa07-4f4f-b763-89d9a2dd013f\",\n    \"82e3c869-49f6-4305-a7ce-f3e64a0618e7\",\n    \"185f29bd-5da0-40a6-b69c-ba7f4e0324ef\",\n    \"869de13e-bef9-4b91-ba51-f6708c40b096\",\n    \"2c1ebcd7-9c6d-4c9a-afad-900e381ecd5e\",\n    \"3a93cae4-ad3e-403e-8c12-65303b271818\",\n    \"1f18aa87-af6f-41ef-9853-cdb8f32ebdea\",\n    \"26150609-0da3-4a7d-8868-0faf9c5f01bb\",\n    \"9219480b-3aed-47fc-8bac-d2cffc5849f7\",\n    \"881deb30-9549-4583-a841-8270c65f2a17\",\n    \"7e287123-70ca-47b9-8521-47db09b69b14\",\n    \"e2392362-125e-4f76-a2ee-524b183a3412\",\n    \"5bc63fb9-276a-4439-a7c1-9dc76401737f\",\n    \"26660ad1-6ebb-4f59-8cba-a8432dfe8d38\",\n    \"a82b78bb-7fde-4cb3-94a4-035baf10bcf0\",\n    \"36037439-2044-4b50-b9d1-875b5a332143\",\n    \"716a6079-22da-47f1-ba73-c9d58f986a38\",\n    \"873cafdd-a581-47f6-8b33-b9696ddb7b05\",\n    \"a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a\",\n    \"6f4073b8-d8ea-4ade-8a18-c5d1d5d5aa9a\",\n    \"da922383-bfa4-4cd3-bbad-6bebab3d7742\",\n    \"2373b66a-092d-44cb-bfd7-82e86e7a3b4d\",\n    
\"81c425f5-78f3-4771-afd6-3d2973825947\",\n    \"bb83cab4-e5c7-42c7-a67b-e46068032b86\",\n    \"227d2f97-562b-4ccb-ae47-a5ec9e142fbb\",\n    \"b337d106-053f-4d37-8da0-7f9c4043a66b\",\n    \"20236825-b5df-46e7-89bf-62e1d640a897\",\n    \"8df7e444-8e06-4f93-8a1a-c5c974269d82\",\n    \"aad10cd7-9337-4b62-b704-a857848cedf2\",\n    \"02ce9a50-7af2-47ed-8596-af0c230501f8\",\n    \"4c26e3f3-3a14-4d86-b44a-d3cedebbb487\",\n    \"a503b07f-9119-456b-b75d-f5146737d24f\",\n    \"09a37c51-e625-49f4-a514-20a773797a8a\",\n    \"3e3fc409-bff3-4905-bf16-c968eee3f807\",\n    \"f5c13cdd-205c-4719-a562-348ae5cd1d91\",\n    \"5990457f-2adb-467b-a4af-5c857c92d762\",\n    \"415ef462-bed3-493a-ac36-ca8c6d23bf1b\",\n    \"7ff48d5b-2df2-49da-b500-a5150ffc7f18\",\n    \"9f3bb592-209d-43bc-bb47-d77d9df56504\",\n    \"dd60633f-2c72-42ba-8547-6f2c8cb0fdb0\",\n    \"ce2b64a2-ddc1-4f91-8c7d-a88be7121aac\",\n    \"3f05f3b9-29ba-4b6b-95aa-2204697ffc06\",\n    \"e1fc0df3-c8b9-4ee7-864c-d0b590d3aa56\",\n    \"f8369178-fafe-40c2-adc4-b9b08a125456\",\n    \"778efd0a-153f-4842-9214-f05fc176b877\",\n    \"47f7c0ce-a5fb-4100-a5e6-65cd0e7429e5\",\n    \"c2751594-0cd5-4088-be1b-b5f2f9ec97c4\",\n    \"788b3701-3ec9-4b67-b679-418bfa726c22\",\n    \"48c46dc7-fe04-4505-ade7-723cba1aa6f6\",\n    \"42d25c08-fb87-4927-8b65-93631280a26f\",\n    \"e8172110-ec08-421b-a6f5-842e6451911f\",\n    \"42f4d1c7-4521-4161-b646-0a8934e36081\",\n    \"3c8f201a-009d-4bbe-8b65-a6f8b35bb57f\",\n    \"d68204bf-11c1-4b13-b48b-d303c73d4bf6\",\n    \"91190194-f406-4cd6-b3f9-c43fac942b22\",\n    \"7f35355e-02a6-45b5-b140-f0be698bcf85\",\n    \"98e8e339-5f91-4ed2-b2b2-12647cb134f4\",\n    \"0e5303d4-8820-42f6-b18d-daf7e633de21\",\n    \"df67aebb-fb3a-44fd-b75b-51b6012df509\",\n    \"5df7b33a-9f77-4101-823e-02f863e1c1ae\",\n    \"aceb0368-56b8-4073-b70e-3dc9aee184e0\",\n    \"22a4636f-8179-4357-8e87-d1743ece1f81\",\n    \"236833a3-5704-47fc-888c-4f298f09f799\",\n    \"67890eb6-6ce5-4c00-9e3d-fb4972699b06\"\n  ],\n  \"os\": [\n    
\"94d95f96-9699-4208-98ba-3c3119edf9c2\",\n    \"bedcedc4-4d72-425e-ad62-21960b11fe0d\",\n    \"ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3\",\n    \"a462a795-fdc7-4b23-b689-e8b6df786b78\",\n    \"f9be0997-4b7c-45c5-b05c-4612b44a6118\",\n    \"28cc3b7e-b194-4bc9-8353-d04c0f4d56d2\",\n    \"5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57\",\n    \"e0df059f-28a6-4169-924f-b9623e7184cc\",\n    \"b6781586-6346-41cd-935a-a6b1487918fc\",\n    \"b3d4a89c-53f2-4d6b-8b6a-541fb5d205fa\",\n    \"3ce045a0-877b-42aa-8d2c-b4a863336ab8\",\n    \"fe41f596-a71b-4c2f-9b2f-9dcd40b568c3\",\n    \"a4d98375-215b-4a4d-aee9-3d4370fccc41\",\n    \"13584542-872b-42d8-b299-866967b5c3ef\",\n    \"23393935-50c7-4a86-aeea-2b78fd089c5c\",\n    \"5812b315-e7bd-4265-b51f-863c02174c28\",\n    \"c288e301-e626-4b98-a1ab-159dcb162af5\",\n    \"4783cc41-c03c-4e1b-89b4-50658f642bd5\",\n    \"5c1075ca-bb34-46a3-a7a0-029bd7463e79\",\n    \"5ced85fc-fa1a-4217-95fd-0fb530545ce2\",\n    \"37887e8c-da15-4192-923c-08fa390a176d\",\n    \"4127319a-8b79-4410-b58a-7a151e15f3d7\",\n    \"4d117223-a354-47fb-8b45-62ab1390a95f\",\n    \"6f56bf42-85b8-4fbb-8e06-6c44960184ba\"\n  ],\n  \"thunderbird\": [\n    \"dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397\",\n    \"15c3b339-88f7-4a86-ab16-e71c58dcb01e\",\n    \"7b1e1ff9-bb85-49be-b01d-d6424be18cd0\",\n    \"9bc3cc16-074a-45ac-9bdc-b2a362e1daf3\",\n    \"3f28fe4f-5d9d-4994-a456-efd78cfae1a3\",\n    \"5203d847-2572-4150-912a-03f062254390\",\n    \"dd84e895-72fd-4023-a336-97689ded257c\",\n    \"9b7bc335-06b5-4cd3-9119-1a649c478509\",\n    \"d38192b0-17dc-4e1d-99c3-786d0117de77\",\n    \"a10b69e1-6034-4a2b-93e1-571d45194f75\",\n    \"3f49d2cc-f400-4e7d-90cc-9b18e401cc31\",\n    \"f201fbc3-44e6-46fc-bcaa-432f9815454c\",\n    \"10a730d5-d414-4b40-b479-684bed1ae522\",\n    \"a1af9f1c-50d5-4bc3-a51e-4d9b425ff638\",\n    \"08c73485-7c6d-4681-999d-919f5c32dcfa\"\n  ],\n  \"vlc\": [\n    \"59f21cfb-0120-4326-b255-a5b827b38967\",\n    \"8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89\",\n    
\"8f080098-ddb1-424c-b438-4e96e5e4786e\",\n    \"bba3381f-b5eb-4439-bd9e-80c22218d5a7\",\n    \"fba2c100-79e8-42df-ae74-b592418d54f4\",\n    \"efcf0d81-0835-4880-b2fd-d866e8bc2294\",\n    \"8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f\",\n    \"aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6\",\n    \"386dbd0e-0241-4a0a-b6a2-6704fba26b1c\",\n    \"9195653c-f4aa-453d-aa95-787f6ccfaae9\",\n    \"d06f0d4d-2cd5-4ede-8de9-598629438c6e\",\n    \"a5bbbcd5-b398-4c91-83d4-55e1e31bbb81\",\n    \"5ac2891a-eacd-4954-b339-98abba077adb\",\n    \"f3977615-2b45-4ac5-8bba-80c17dbe2a37\",\n    \"215dfd39-f493-4bc3-a027-8a97d72c61bf\",\n    \"cb130f0d-d36f-4302-9838-b3baf46139b6\",\n    \"7882ed6e-bece-4bf0-bada-c32dc1ddae72\"\n  ],\n  \"vs_code\": [\n    \"0ed39f63-6049-43d4-ba4d-5fa2fe04a951\",\n    \"53ad5833-3455-407b-bbc6-45b4c79ab8fb\",\n    \"eabc805a-bfcf-4460-b250-ac92135819f6\",\n    \"982d12a5-beab-424f-8d38-d2a48429e511\",\n    \"4e60007a-f5be-4bfc-9723-c39affa0a6d3\",\n    \"e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2\",\n    \"9439a27b-18ae-42d8-9778-5f68f891805e\",\n    \"ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae\",\n    \"930fdb3b-11a8-46fe-9bac-577332e2640e\",\n    \"276cc624-87ea-4f08-ab93-f770e3790175\",\n    \"9d425400-e9b2-4424-9a4b-d4c7abac4140\",\n    \"5e2d93d8-8ad0-4435-b150-1692aacaa994\",\n    \"6ed0a554-cbee-4b44-84ea-fd6c042f4fe1\",\n    \"ec71221e-ac43-46f9-89b8-ee7d80f7e1c5\",\n    \"70745df8-f2f5-42bd-8074-fbc10334fcc5\",\n    \"57242fad-77ca-454f-b71b-f187181a9f23\",\n    \"c6bf789c-ba3a-4209-971d-b63abf0ab733\",\n    \"0512bb38-d531-4acf-9e7e-0add90816068\",\n    \"847a96b6-df94-4927-97e6-8cc9ea66ced7\",\n    \"7aeae0e2-70ee-4705-821d-1bba5d5b2ddd\",\n    \"dcbe20e8-647f-4f1d-8696-f1c5bbb570e3\",\n    \"7c4cc09e-7a92-40dd-8338-b2286535c4ed\",\n    \"971cbb5b-3cbf-4ff7-9e24-b5c84fcebfa6\"\n  ]\n}"
  },
  {
    "path": "evaluation_sets/test_small_new.json",
    "content": "{\n    \"os\": [\n        \"5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57\",\n        \"5812b315-e7bd-4265-b51f-863c02174c28\",\n        \"c288e301-e626-4b98-a1ab-159dcb162af5\",\n        \"4783cc41-c03c-4e1b-89b4-50658f642bd5\",\n        \"5c1075ca-bb34-46a3-a7a0-029bd7463e79\",\n        \"5ced85fc-fa1a-4217-95fd-0fb530545ce2\"\n    ],\n    \"gimp\": [\n        \"a746add2-cab0-4740-ac36-c3769d9bfb46\",\n        \"7a4deb26-d57d-4ea9-9a73-630f66a7b568\",\n        \"d52d6308-ec58-42b7-a2c9-de80e4837b2b\",\n        \"2a729ded-3296-423d-aec4-7dd55ed5fbb3\",\n        \"d16c99dc-2a1e-46f2-b350-d97c86c85c15\"\n    ],\n    \"chrome\": [\n        \"bb5e4c0d-f964-439c-97b6-bdb9747de3f4\",\n        \"7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3\",\n        \"35253b65-1c19-4304-8aa4-6884b8218fc0\",\n        \"a96b564e-dbe9-42c3-9ccf-b4498073438a\",\n        \"e1e75309-3ddb-4d09-92ec-de869c928143\",\n        \"82bc8d6a-36eb-4d2d-8801-ef714fb1e55a\"\n    ],\n    \"thunderbird\": [\n        \"bb5e4c0d-f964-439c-97b6-bdb9747de3f4\",\n        \"7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3\",\n        \"2ad9387a-65d8-4e33-ad5b-7580065a27ca\",\n        \"480bcfea-d68f-4aaa-a0a9-2589ef319381\",\n        \"030eeff7-b492-4218-b312-701ec99ee0cc\"\n    ],\n    \"vs_code\": [\n        \"0ed39f63-6049-43d4-ba4d-5fa2fe04a951\",\n        \"dcbe20e8-647f-4f1d-8696-f1c5bbb570e3\",\n        \"9439a27b-18ae-42d8-9778-5f68f891805e\",\n        \"7c4cc09e-7a92-40dd-8338-b2286535c4ed\",\n        \"9d425400-e9b2-4424-9a4b-d4c7abac4140\"\n    ],\n    \"vlc\": [\n        \"59f21cfb-0120-4326-b255-a5b827b38967\",\n        \"8f080098-ddb1-424c-b438-4e96e5e4786e\",\n        \"5ac2891a-eacd-4954-b339-98abba077adb\",\n        \"f3977615-2b45-4ac5-8bba-80c17dbe2a37\",\n        \"215dfd39-f493-4bc3-a027-8a97d72c61bf\"\n    ],\n    \"libreoffice_calc\": [\n        \"357ef137-7eeb-4c80-a3bb-0951f26a8aff\",\n        \"42e0a640-4f19-4b28-973d-729602b5a4a7\",\n        \"abed40dc-063f-4598-8ba5-9fe749c0615d\",\n        
\"035f41ba-6653-43ab-aa63-c86d449d62e5\",\n        \"7efeb4b1-3d19-4762-b163-63328d66303b\"\n    ],\n    \"libreoffice_impress\": [\n        \"5d901039-a89c-4bfb-967b-bf66f4df075e\",\n        \"550ce7e7-747b-495f-b122-acdc4d0b8e54\",\n        \"ac9bb6cb-1888-43ab-81e4-a98a547918cd\",\n        \"2cd43775-7085-45d8-89fa-9e35c0a915cf\",\n        \"358aa0a7-6677-453f-ae35-e440f004c31e\",\n        \"a669ef01-ded5-4099-9ea9-25e99b569840\"\n    ],\n    \"libreoffice_writer\": [\n        \"0810415c-bde4-4443-9047-d5f70165a697\",\n        \"e246f6d8-78d7-44ac-b668-fcf47946cb50\",\n        \"d53ff5ee-3b1a-431e-b2be-30ed2673079b\",\n        \"b21acd93-60fd-4127-8a43-2f5178f4a830\",\n        \"0a0faba3-5580-44df-965d-f562a99b291c\",\n        \"adf5e2c3-64c7-4644-b7b6-d2f0167927e7\"\n    ],\n    \"multi_apps\": [\n        \"a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a\",\n        \"5990457f-2adb-467b-a4af-5c857c92d762\",\n        \"2b9493d7-49b8-493a-a71b-56cd1f4d6908\",\n        \"acb0f96b-e27c-44d8-b55f-7cb76609dfcd\",\n        \"c867c42d-a52d-4a24-8ae3-f75d256b5618\",\n        \"74d5859f-ed66-4d3e-aa0e-93d7a592ce41\",\n        \"b5062e3e-641c-4e3a-907b-ac864d2e7652\",\n        \"48d05431-6cd5-4e76-82eb-12b60d823f7d\",\n        \"eb303e01-261e-4972-8c07-c9b4e7a4922a\",\n        \"d1acdb87-bb67-4f30-84aa-990e56a09c92\",\n        \"deec51c9-3b1e-4b9e-993c-4776f20e8bb2\",\n        \"8e116af7-7db7-4e35-a68b-b0939c066c78\",\n        \"716a6079-22da-47f1-ba73-c9d58f986a38\",\n        \"46407397-a7d5-4c6b-92c6-dbe038b1457b\",\n        \"4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc\",\n        \"897e3b53-5d4d-444b-85cb-2cdc8a97d903\"\n    ]\n}\n"
  },
  {
    "path": "gui_agents/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s1/README.md",
    "content": "<h1 align=\"center\">\n  <img src=\"../../images/agent_s.png\" alt=\"Logo\" style=\"vertical-align:middle\" width=\"60\"> Agent S:\n  <small>Using Computers Like a Human</small>\n</h1>\n\n<p align=\"center\">\n  🌐 <a href=\"https://www.simular.ai/agent-s\">[Website]</a>\n  📄 <a href=\"https://arxiv.org/abs/2410.08164\">[Paper]</a>\n  🎥 <a href=\"https://www.youtube.com/watch?v=OBDE3Knte0g\">[Video]</a>\n  🗨️ <a href=\"https://discord.gg/E2XfsK9fPV\">[Discord]</a>\n</p>\n\n## 🥳 Updates\n- [x] **2025/01/22**: The [Agent S paper](https://arxiv.org/abs/2410.08164) is accepted to ICLR 2025!\n- [x] **2025/01/21**: Released v0.1.2 of [gui-agents](https://github.com/simular-ai/Agent-S) library, with support for Linux and Windows!\n- [x] **2024/12/05**: Released v0.1.0 of [gui-agents](https://github.com/simular-ai/Agent-S) library, allowing you to use Agent-S for Mac, OSWorld, and WindowsAgentArena with ease!\n- [x] **2024/10/10**: Released [Agent S paper](https://arxiv.org/abs/2410.08164) and codebase!\n\n## Table of Contents\n\n1. [💡 Introduction](#-introduction)\n2. [🎯 Current Results](#-current-results)\n3. [🛠️ Installation](#%EF%B8%8F-installation) \n4. [🚀 Usage](#-usage)\n5. [🙌 Contributors](#-contributors)\n6. [💬 Citation](#-citation)\n\n## 💡 Introduction\n\n<p align=\"center\">\n    <img src=\"../../images/teaser.png\" width=\"800\">\n</p>\n\nWelcome to **Agent S**, an open-source framework designed to enable autonomous interaction with computers through Agent-Computer Interface. Our mission is to build intelligent GUI agents that can learn from past experiences and perform complex tasks autonomously on your computer. 
\n\nWhether you're interested in AI, automation, or contributing to cutting-edge agent-based systems, we're excited to have you here!\n\n## 🎯 Current Results\n\n<p align=\"center\">\n    <img src=\"../../images/results.png\" width=\"600\">\n    <br>\n    Results of Success Rate (%) on the OSWorld full test set of all 369 test examples using Image + Accessibility Tree input.\n</p>\n\n\n## 🛠️ Installation & Setup\n\n> ❗**Warning**❗: If you are on a Linux machine, creating a `conda` environment will interfere with `pyatspi`. As of now, there's no clean solution for this issue. Proceed through the installation without using `conda` or any virtual environment.\n\nClone the repository:\n```\ngit clone https://github.com/simular-ai/Agent-S.git\n```\n\nInstall the gui-agents package:\n```\npip install gui-agents\n```\n\nSet your LLM API Keys and other environment variables. You can do this by adding the following line to your .bashrc (Linux), or .zshrc (MacOS) file. \n\n```\nexport OPENAI_API_KEY=<YOUR_API_KEY>\n```\n\nAlternatively, you can set the environment variable in your Python script:\n\n```\nimport os\nos.environ[\"OPENAI_API_KEY\"] = \"<YOUR_API_KEY>\"\n```\n\nWe also support Azure OpenAI, Anthropic, and vLLM inference. For more information refer to [models.md](../../models.md).\n\n### Setup Retrieval from Web using Perplexica\nAgent S works best with web-knowledge retrieval. To enable this feature, you need to set up Perplexica: \n\n1. Ensure Docker Desktop is installed and running on your system.\n\n2. Navigate to the directory containing the project files.\n\n   ```bash\n    cd Perplexica\n    git submodule update --init\n   ```\n\n3. Rename the `sample.config.toml` file to `config.toml`. For Docker setups, you need only fill in the following fields:\n\n   - `OPENAI`: Your OpenAI API key. **You only need to fill this if you wish to use OpenAI's models**.\n   - `OLLAMA`: Your Ollama API URL. You should enter it as `http://host.docker.internal:PORT_NUMBER`. 
If you installed Ollama on port 11434, use `http://host.docker.internal:11434`. For other ports, adjust accordingly. **You need to fill this if you wish to use Ollama's models instead of OpenAI's**.\n   - `GROQ`: Your Groq API key. **You only need to fill this if you wish to use Groq's hosted models**.\n   - `ANTHROPIC`: Your Anthropic API key. **You only need to fill this if you wish to use Anthropic models**.\n\n     **Note**: You can change these after starting Perplexica from the settings dialog.\n\n   - `SIMILARITY_MEASURE`: The similarity measure to use (This is filled by default; you can leave it as is if you are unsure about it.)\n\n4. Ensure you are in the directory containing the `docker-compose.yaml` file and execute:\n\n   ```bash\n   docker compose up -d\n   ```\n\n5. Next, export your Perplexica URL. This URL is used to interact with the Perplexica API backend. The port is given by the `config.toml` in your Perplexica directory.\n\n   ```bash\n   export PERPLEXICA_URL=http://localhost:{port}/api/search\n   ```\n\n6. Our implementation of Agent S incorporates the Perplexica API to integrate a search engine capability, which allows for a more convenient and responsive user experience. If you want to tailor the API to your settings and specific requirements, you may modify the URL and the message of request parameters in  `agent_s/query_perplexica.py`. For a comprehensive guide on configuring the Perplexica API, please refer to [Perplexica Search API Documentation](https://github.com/ItzCrazyKns/Perplexica/blob/master/docs/API/SEARCH.md)\n\nFor a more detailed setup and usage guide, please refer to the [Perplexica Repository](https://github.com/ItzCrazyKns/Perplexica.git).\n\n### Setup Paddle-OCR Server\n\nSwitch to a new terminal where you will run Agent S. Set the OCR_SERVER_ADDRESS environment variable as shown below. 
For a better experience, add the following line directly to your .bashrc (Linux), or .zshrc (MacOS) file.\n\n```\nexport OCR_SERVER_ADDRESS=http://localhost:8000/ocr/\n```\n\nRun the `ocr_server.py` script to use OCR-based bounding boxes.\n\n```\ncd Agent-S\npython gui_agents/s1/utils/ocr_server.py\n```\n\nYou can change the server address by editing the address in the [gui_agents/s1/utils/ocr_server.py](utils/ocr_server.py) file.\n\n\n> ❗**Warning**❗: The agent will directly run python code to control your computer. Please use with care.\n\n## 🚀 Usage\n\n### CLI\n\nRun agent_s on your computer using:  \n```\nagent_s1 --model gpt-4o\n```\nThis will show a user query prompt where you can enter your query and interact with Agent S. You can use any model from the list of supported models in [models.md](../../models.md).\n\n### `gui_agents` SDK\n\nTo deploy Agent S on MacOS, Windows, or Linux:\n\n```\nimport pyautogui\nimport io\nfrom gui_agents.core.AgentS import GraphSearchAgent\nimport platform\n\nif platform.system() == \"Darwin\":\n  from gui_agents.aci.MacOSACI import MacOSACI, UIElement\n  grounding_agent = MacOSACI()\nelif platform.system() == \"Windows\":\n  from gui_agents.aci.WindowsOSACI import WindowsACI, UIElement\n  grounding_agent = WindowsACI()\nelif platform.system() == \"Linux\":\n  from gui_agents.aci.LinuxOSACI import LinuxACI, UIElement\n  grounding_agent = LinuxACI()\nelse:\n  raise ValueError(\"Unsupported platform\")\n\nengine_params = {\n    \"engine_type\": \"openai\",\n    \"model\": \"gpt-4o\",\n}\n\nagent = GraphSearchAgent(\n  engine_params,\n  grounding_agent,\n  platform=\"ubuntu\",  # \"macos\", \"windows\"\n  action_space=\"pyautogui\",\n  observation_type=\"mixed\",\n  search_engine=\"Perplexica\"\n)\n\n# Get screenshot.\nscreenshot = pyautogui.screenshot()\nbuffered = io.BytesIO() \nscreenshot.save(buffered, format=\"PNG\")\nscreenshot_bytes = buffered.getvalue()\n\n# Get accessibility tree.\nacc_tree = UIElement.systemWideElement()\n\nobs = {\n  
\"screenshot\": screenshot_bytes,\n  \"accessibility_tree\": acc_tree,\n}\n\ninstruction = \"Close VS Code\"\ninfo, action = agent.predict(instruction=instruction, observation=obs)\n\nexec(action[0])\n```\n\nRefer to `cli_app.py` for more details on how the inference loop works.\n\n#### Downloading the Knowledge Base\n\nAgent S2 uses a knowledge base that continually updates with new knowledge during inference. The knowledge base is initially downloaded when initializing `GraphSearchAgent`. The knowledge base is stored as assets under our [GitHub Releases](https://github.com/simular-ai/Agent-S/releases). The `GraphSearchAgent` initialization will only download the knowledge base for your specified platform and agent version (e.g., s1, s2). If you'd like to download the knowledge base programmatically, you can use the following code:\n\n```\ndownload_kb_data(\n    version=\"s2\",\n    release_tag=\"v0.2.2\",\n    download_dir=\"kb_data\",\n    platform=\"linux\"  # \"darwin\", \"windows\"\n)\n```\n\nThis will download Agent S2's knowledge base for Linux from release tag `v0.2.2` to the `kb_data` directory. Refer to our [GitHub Releases](https://github.com/simular-ai/Agent-S/releases) for release tags that include the knowledge bases.\n\n### OSWorld\n\nTo deploy Agent S in OSWorld, follow the [OSWorld Deployment instructions](OSWorld.md).\n\n### WindowsAgentArena\n\nTo deploy Agent S in WindowsAgentArena, follow the [WindowsAgentArena Deployment instructions](WindowsAgentArena.md).\n\n## 🙌 Contributors\n\nWe’re grateful to all the [amazing people](https://github.com/simular-ai/Agent-S/graphs/contributors) who have contributed to this project. Thank you! 
🙏  \n\n## 💬 Citation\n```\n@misc{agashe2024agentsopenagentic,\n      title={Agent S: An Open Agentic Framework that Uses Computers Like a Human}, \n      author={Saaket Agashe and Jiuzhou Han and Shuyu Gan and Jiachen Yang and Ang Li and Xin Eric Wang},\n      year={2024},\n      eprint={2410.08164},\n      archivePrefix={arXiv},\n      primaryClass={cs.AI},\n      url={https://arxiv.org/abs/2410.08164}, \n}\n```\n\n"
  },
  {
    "path": "gui_agents/s1/WindowsAgentArena.md",
    "content": "## Deploying Agent-S in WindowsAgentArena\n> ⚠️ **Warning**: The refactored code has not been fully tested on WindowsAgentArena. To reproduce the results on WindowsAgentArena, please use commit 496a9fa of this repository.\n\n1. To use Agent S with WindowsAgentArena, follow the setup instructions at: https://github.com/microsoft/WindowsAgentArena.git. **Please use the development mode while preparing the image and running the client as instructed in https://github.com/microsoft/WindowsAgentArena/blob/main/docs/Development-Tips.md.** \n\n2. To deploy our agent in the WindowsAgentArena, copy the agent_s folder in this repository to  `WindowsAgentArena/src/win-arena-container/client/mm_agents`. \n\n3. Change the name of the GraphSearchAgent.py file to agent.py to conform to the WindowsAgentArena Setup. \n\n4. Copy the ocr_server.py file to the `WindowsAgentArena/src/win-arena-container/client` folder.\n\n```\ncd WindowsAgentArena/src/win-arena-container/client\ncp mm_agents/agent_s/ocr_server.py .\n```\n\n5. Update the `start_client.sh` file in `WindowsAgentArena/src/win-arena-container` by adding the following line before Running the agent on line 75. \n\n```\npython ocr_server.py &\n```\n\n6. In the `src/win-arena-container/client/run.py` file, import Agent S:\n```\nfrom mm_agents.agent_s.agent import GraphSearchAgent\n```\n\n7. 
In the `src/win-arena-container/client/run.py` file, instantiate Agent S by adding the following lines after line 187 where the if condition for NAVI agent ends \n\n```python\nelif cfg_args[\"agent_name\"] == \"agent_s\":\n  if cfg_args[\"som_origin\"] in [\"a11y\"]:\n    som_config = None\n  elif cfg_args[\"som_origin\"] in [\"oss\", \"mixed-oss\"]:\n    som_config = {\n      \"pipeline\": [\"webparse\", \"groundingdino\", \"ocr\"],\n      \"groundingdino\": {\n        \"prompts\": [\"icon\", \"image\"]\n      },\n      \"ocr\": {\n        \"class_name\": \"TesseractOCR\"\n      },\n      \"webparse\": {\n        \"cdp_url\": f\"http://{args.emulator_ip}:9222\"\n      }\n    }\n  if args.model.startswith(\"claude\"):\n    engine_type = \"anthropic\"\n  elif args.model.startswith(\"gpt\"):\n    engine_type = \"openai\"\n  else:\n    engine_type = \"vllm\"\n\n  engine_params = {\n    \"engine_type\": engine_type,\n    \"model\": args.model,\n  }\n  agent = GraphSearchAgent(\n    engine_params=engine_params,\n    experiment_type='windowsAgentArena',\n    temperature=args.temperature\n  )\n```\n\n8. Run Agent S on WindowsAgentArena by changing the following parameters in the `scripts/run-local.sh` file\n\n```\nagent=\"agent_s\"\nmodel=\"gpt-4o\"\n```"
  },
  {
    "path": "gui_agents/s1/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s1/aci/ACI.py",
    "content": "import logging\nfrom typing import Any, Dict, List\n\nlogger = logging.getLogger(\"desktopenv.agent\")\n\n\ndef agent_action(func):\n    func.is_agent_action = True\n    return func\n\n\nclass ACI:\n    def __init__(self, top_app_only: bool = True, ocr: bool = False):\n        self.top_app_only = top_app_only\n        self.ocr = ocr\n        self.index_out_of_range_flag = False\n        self.notes: List[str] = []\n        self.clipboard = \"\"\n        self.nodes: List[Any] = []\n\n    def get_active_apps(self, obs: Dict) -> List[str]:\n        pass\n\n    def get_top_app(self):\n        pass\n\n    def preserve_nodes(self, tree: Any, exclude_roles: set = None) -> List[Dict]:\n        pass\n\n    def linearize_and_annotate_tree(\n        self, obs: Dict, show_all_elements: bool = False\n    ) -> str:\n        pass\n\n    def find_element(self, element_id: int) -> Dict:\n        pass\n"
  },
  {
    "path": "gui_agents/s1/aci/LinuxOSACI.py",
    "content": "import base64\nimport logging\nimport os\nimport time\nimport xml.etree.ElementTree as ET\nfrom typing import Dict, List, Optional, Tuple, Any, Sequence\nimport numpy as np\nimport requests\n\nfrom gui_agents.s1.aci.ACI import ACI\nfrom gui_agents.s1.utils.common_utils import box_iou\n\nimport platform\n\nif platform.system() == \"Linux\":\n    import pyatspi\n    from pyatspi import Accessible, StateType, STATE_SHOWING\n    from pyatspi import Action as ATAction\n    from pyatspi import Component  # , Document\n    from pyatspi import Text as ATText\n    from pyatspi import Value as ATValue\n\n    from pyatspi import Accessible, StateType\n    from lxml.etree import _Element\n    from typing import Optional, Dict, Any, List\n\n    import lxml.etree\n    import concurrent.futures\n\n_accessibility_ns_map_ubuntu = {\n    \"st\": \"https://accessibility.ubuntu.example.org/ns/state\",\n    \"attr\": \"https://accessibility.ubuntu.example.org/ns/attributes\",\n    \"cp\": \"https://accessibility.ubuntu.example.org/ns/component\",\n    \"doc\": \"https://accessibility.ubuntu.example.org/ns/document\",\n    \"docattr\": \"https://accessibility.ubuntu.example.org/ns/document/attributes\",\n    \"txt\": \"https://accessibility.ubuntu.example.org/ns/text\",\n    \"val\": \"https://accessibility.ubuntu.example.org/ns/value\",\n    \"act\": \"https://accessibility.ubuntu.example.org/ns/action\",\n}\n\nMAX_DEPTH = 50\nMAX_WIDTH = 1024\n\nlogger = logging.getLogger(\"desktopenv.agent\")\n\n\n# Agent action decorator\ndef agent_action(func):\n    func.is_agent_action = True\n    return func\n\n\nclass LinuxACI(ACI):\n    def __init__(self, top_app=None, vm_version=\"new\", top_app_only=True, ocr=True):\n        self.active_apps = set()\n        self.top_app = top_app\n        self.top_app_only = (\n            top_app_only  # Only include top app in the accessibility tree\n        )\n        self.ocr = ocr\n        self.index_out_of_range_flag = False\n        
self.app_setup_code = f\"\"\"import subprocess;\nimport difflib;\nimport pyautogui;\npyautogui.press('escape');\ntime.sleep(0.5);\noutput = subprocess.check_output(['wmctrl', '-lx']);\noutput = output.decode('utf-8').splitlines();\nwindow_titles = [line.split(None, 4)[2] for line in output];\nclosest_matches = difflib.get_close_matches('APP_NAME', window_titles, n=1, cutoff=0.1);\nif closest_matches:\n    closest_match = closest_matches[0];\n    for line in output:\n        if closest_match in line:\n            window_id = line.split()[0]\n            break;\nsubprocess.run(['wmctrl', '-ia', window_id])\nsubprocess.run(['wmctrl', '-ir', window_id, '-b', 'add,maximized_vert,maximized_horz'])\n\"\"\"\n\n        self.top_active_app = None\n        self.notes = []\n        self.clipboard = \"\"\n\n        # TODO: this is terrible, fix this\n        global state_ns, component_ns, attributes_ns, value_ns\n        if vm_version == \"old\":\n\n            state_ns = \"uri:deskat:state.at-spi.gnome.org\"\n            component_ns = \"uri:deskat:component.at-spi.gnome.org\"\n        else:\n            attributes_ns = \"https://accessibility.windows.example.org/ns/attributes\"\n            state_ns = \"https://accessibility.ubuntu.example.org/ns/state\"\n            component_ns = \"https://accessibility.ubuntu.example.org/ns/component\"\n            value_ns = \"https://accessibility.ubuntu.example.org/ns/value\"\n\n    def get_active_apps(self, obs: Dict) -> List[str]:\n        tree = ET.ElementTree(ET.fromstring(obs[\"accessibility_tree\"]))\n        apps = []\n        exclude_list = [\"gjs\", \"gnome-shell\"]\n        for node in tree.iter():\n            # Keep applications and only those which have children\n            if (\n                node.tag.endswith(\"application\")\n                and list(node)\n                and node.attrib.get(\"name\", \"\") not in exclude_list\n            ):\n                apps.append(node.attrib.get(\"name\", 
\"\").replace(\"\\\\\", \"\"))\n        return apps\n\n    def check_new_apps(self, old_apps, new_apps):\n        return new_apps - old_apps\n\n    def get_top_app(self, obs):\n        return self.top_app\n\n    def find_active_applications(self, tree):\n        # names of applications to keep TODO: soffice is a single application with all the isntances like impress, calc etc. being frames this will need to be dealt with separately\n        to_keep = [\"gnome-shell\"]\n        apps_with_active_tag = []\n        for application in list(tree.getroot()):\n            app_name = application.attrib.get(\"name\")\n            for frame in application:\n                is_active = frame.attrib.get(\"{{{:}}}active\".format(state_ns), \"false\")\n                if is_active == \"true\":\n                    apps_with_active_tag.append(app_name)\n        if apps_with_active_tag:\n            to_keep.append(apps_with_active_tag[-1])\n        return to_keep\n\n    def filter_active_app(self, tree):\n        for application in list(tree.getroot()):\n            app_name = application.attrib.get(\"name\")\n            for frame in application:\n                is_active = frame.attrib.get(\"{{{:}}}active\".format(state_ns), \"false\")\n                if is_active == \"true\":\n                    return app_name\n        return None\n\n    def filter_nodes(self, tree, show_all=False):\n        # created and populate a preserved nodes list which filters out unnecessary elements and keeps only those elements which are currently showing on the screen\n        # TODO: include offscreen elements and then scroll to them before clicking\n        preserved_nodes = []\n        exclude_tags = [\"panel\", \"window\", \"filler\", \"frame\", \"separator\", \"scroll-bar\"]\n\n        for node in tree.iter():\n            if node.tag not in exclude_tags:\n                if show_all:\n                    if node.attrib.get(f\"{{{state_ns}}}visible\") == \"true\":\n                        
coords: Tuple[int, int] = eval(\n                            node.get(\n                                \"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\"\n                            )\n                        )\n                        if coords[0] >= 0 and coords[1] >= 0:\n                            preserved_nodes.append(node)\n                # if show_all is false, only show elements that are currently showing on screen\n                else:\n                    if node.attrib.get(f\"{{{state_ns}}}showing\") == \"true\":\n                        coords: Tuple[int, int] = eval(\n                            node.get(\n                                \"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\"\n                            )\n                        )\n\n                        if coords[0] >= 0 and coords[1] >= 0:\n                            preserved_nodes.append(node)\n\n        return preserved_nodes\n\n    def linearize_tree(self, preserved_nodes):\n        # TODO: Run an ablation to check if class and desc\n        # linearized_accessibility_tree = [\"id\\ttag\\tname\\ttext\\tclass\\tdescription\"]\n        linearized_accessibility_tree = [\"id\\ttag\\tname\\ttext\"]\n        for idx, node in enumerate(preserved_nodes):\n            if node.text:\n                text = (\n                    node.text\n                    if '\"' not in node.text\n                    else '\"{:}\"'.format(node.text.replace('\"', '\"\"'))\n                )\n            else:\n                text = '\"\"'\n\n            linearized_accessibility_tree.append(\n                \"{:}\\t{:}\\t{:}\\t{:}\".format(\n                    idx,\n                    node.tag,\n                    node.get(\"name\", \"\"),\n                    text,\n                    # node.get(\"{{{:}}}class\".format(attributes_ns), \"\"),\n                    # node.get(\"{{{:}}}description\".format(attributes_ns), \"\"),\n                )\n            )\n\n        # 
returning list of linearized elements\n        return linearized_accessibility_tree\n\n    def extract_elements_from_screenshot(self, screenshot) -> Dict:\n        \"\"\"Uses paddle-ocr to extract elements with text from the screenshot. The elements will be added to the linearized accessibility tree downstream\"\"\"\n\n        # Convert screenshot to PIL image\n        def send_image_to_ocr(screenshot) -> Dict:\n\n            url = os.environ.get(\"OCR_SERVER_ADDRESS\", \"\")\n            if url == \"\":\n                raise Exception(\"OCR SERVER ADDRESS NOT SET\")\n            encoded_screenshot = base64.b64encode(screenshot).decode(\"utf-8\")\n            data = {\"img_bytes\": encoded_screenshot}\n            print(\"Getting OCR response\")\n            ocr_start = time.time()\n            response = requests.post(url, json=data)\n            print(\"Got OCR response in\", time.time() - ocr_start)\n\n            if response.status_code == 200:\n                return response.json()\n            else:\n                return {\n                    \"error\": f\"Request failed with status code {response.status_code}\",\n                    \"results\": [],\n                }\n\n        return send_image_to_ocr(screenshot)[\"results\"]\n\n    def add_ocr_elements(\n        self, screenshot, linearized_accessibility_tree, preserved_nodes\n    ):\n        # Get the bounding boxes of the elements in the linearized accessibility tree\n        tree_bboxes = []\n        for node in preserved_nodes:\n            coordinates: Tuple[int, int] = eval(\n                node.get(\"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\")\n            )\n            sizes: Tuple[int, int] = eval(\n                node.get(\"{{{:}}}size\".format(component_ns), \"(-1, -1)\")\n            )\n            tree_bboxes.append(\n                [\n                    coordinates[0],\n                    coordinates[1],\n                    coordinates[0] + sizes[0],\n                
    coordinates[1] + sizes[1],\n                ]\n            )\n\n        # Use OCR to found boxes that might be missing from the accessibility tree\n        try:\n            ocr_bboxes = self.extract_elements_from_screenshot(screenshot)\n        except Exception as e:\n            print(f\"Error: {e}\")\n            ocr_bboxes = []\n        else:\n            # Check for intersection over union between the existing atree bounding boxes and the ocr bounding boxes, if ocr bounding boxes are new add them to the linearized accesibility tree\n            if (\n                len(ocr_bboxes) > 0\n            ):  # Only check IOUs and add if there are any bounding boxes returned by the ocr module\n                preserved_nodes_index = len(preserved_nodes)\n                for ind, (i, content, box) in enumerate(ocr_bboxes):\n                    # x1, y1, x2, y2 = int(box.get('left', 0)), int(box['top']), int(), int(box['bottom'])\n                    (\n                        x1,\n                        y1,\n                        x2,\n                        y2,\n                    ) = (\n                        int(box.get(\"left\", 0)),\n                        int(box.get(\"top\", 0)),\n                        int(box.get(\"right\", 0)),\n                        int(box.get(\"bottom\", 0)),\n                    )\n                    iou = box_iou(\n                        np.array(tree_bboxes, dtype=np.float32),\n                        np.array([[x1, y1, x2, y2]], dtype=np.float32),\n                    ).flatten()\n\n                    if max(iou) < 0.1:\n                        # Add the element to the linearized accessibility tree\n                        # TODO: ocr detected elements should be classified for their tag, currently set to push button for the agent to think they are interactable\n                        linearized_accessibility_tree.append(\n                            f\"{preserved_nodes_index}\\tpush-button\\t\\t{content}\\t\\t\"\n     
                   )\n\n                        # add to preserved node with the component_ns prefix node.get(\"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\"\n                        node = ET.Element(\n                            \"ocr_node\",\n                            attrib={\n                                \"text\": content,\n                                \"{{{}}}screencoord\".format(\n                                    component_ns\n                                ): \"({},{})\".format(x1, y1),\n                                \"{{{}}}size\".format(component_ns): \"({},{})\".format(\n                                    x2 - x1, y2 - y1\n                                ),\n                            },\n                        )\n                        preserved_nodes.append(node)\n                        preserved_nodes_index += 1\n\n        return linearized_accessibility_tree, preserved_nodes\n\n    def linearize_and_annotate_tree(self, obs, show_all=False):\n        accessibility_tree = obs[\"accessibility_tree\"]\n        screenshot = obs[\"screenshot\"]\n\n        # convert the accessibility tree from a string representation to an xml tree\n        tree = ET.ElementTree(ET.fromstring(accessibility_tree))\n\n        # Get the applications to keep based on the active applications\n        to_keep = self.find_active_applications(tree)\n        self.top_app = to_keep[-1]\n\n        # Remove applications which are not included in the to_keep list\n        if not show_all:\n            for application in list(tree.getroot()):\n                if application.attrib.get(\"name\", \"\") not in to_keep:\n                    tree.getroot().remove(application)\n\n        # Save tree for debugging\n        with open(\"tree_raw.xml\", \"wb\") as file:\n            tree.write(file, encoding=\"utf-8\", xml_declaration=True)\n\n        # Filter out filler elements and overlapping elements\n        preserved_nodes = self.filter_nodes(tree, show_all)\n\n  
      assert len(preserved_nodes) > 0\n\n        # Linearize the tree as tsv\n        linearized_accessibility_tree = self.linearize_tree(preserved_nodes)\n\n        # Add OCR elements to the linearized accessibility tree to account for elements that are not in the accessibility tree\n        if self.ocr:\n            linearized_accessibility_tree, preserved_nodes = self.add_ocr_elements(\n                screenshot, linearized_accessibility_tree, preserved_nodes\n            )\n\n        # Convert accessibility tree to a string\n        linearized_accessibility_tree = \"\\n\".join(linearized_accessibility_tree)\n\n        # TODO: side-effect, set in separate functions\n        self.nodes = preserved_nodes\n\n        return linearized_accessibility_tree\n\n    def find_element(self, element_id):\n        try:\n            selected_element = self.nodes[int(element_id)]\n        except:\n            print(\"The index of the selected element was out of range.\")\n            selected_element = self.nodes[0]\n            self.index_out_of_range_flag = True\n        return selected_element\n\n    @agent_action\n    def click(\n        self,\n        element_id: int,\n        num_clicks: int = 1,\n        button_type: str = \"left\",\n        hold_keys: List = [],\n    ):\n        \"\"\"Click on the element\n        Args:\n            element_id:int, ID of the element to click on\n            num_clicks:int, number of times to click the element\n            button_type:str, which mouse button to press can be \"left\", \"middle\", or \"right\"\n            hold_keys:List, list of keys to hold while clicking\n        \"\"\"\n        node = self.find_element(element_id)\n        coordinates: Tuple[int, int] = eval(\n            node.get(\"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\")\n        )\n        sizes: Tuple[int, int] = eval(\n            node.get(\"{{{:}}}size\".format(component_ns), \"(-1, -1)\")\n        )\n\n        # Calculate the center of the 
element\n        x = coordinates[0] + sizes[0] // 2\n        y = coordinates[1] + sizes[1] // 2\n\n        command = \"import pyautogui; \"\n\n        # TODO: specified duration?\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"\"\"import pyautogui; pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); \"\"\"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n        # Return pyautoguicode to click on the element\n        return command\n\n    @agent_action\n    def switch_applications(self, app_code):\n        \"\"\"Switch to a different application that is already open\n        Args:\n            app_code:str the code name of the application to switch to from the provided list of open applications\n        \"\"\"\n        return self.app_setup_code.replace(\"APP_NAME\", app_code)\n\n    @agent_action\n    def type(\n        self,\n        element_id: int = None,\n        text: str = \"\",\n        overwrite: bool = False,\n        enter: bool = False,\n    ):\n        \"\"\"Type text into the element\n        Args:\n            element_id:int ID of the element to type into. If not provided, typing will start at the current cursor location.\n            text:str the text to type\n            overwrite:bool Assign it to True if the text should overwrite the existing text, otherwise assign it to False. 
Using this argument clears all text in an element.\n            enter:bool Assign it to True if the enter key should be pressed after typing the text, otherwise assign it to False.\n        \"\"\"\n        try:\n            # Use the provided element_id or default to None\n            node = self.find_element(element_id) if element_id is not None else None\n        except:\n            node = None\n\n        if node is not None:\n            # If a node is found, retrieve its coordinates and size\n            coordinates = eval(\n                node.get(\"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\")\n            )\n            sizes = eval(node.get(\"{{{:}}}size\".format(component_ns), \"(-1, -1)\"))\n\n            # Calculate the center of the element\n            x = coordinates[0] + sizes[0] // 2\n            y = coordinates[1] + sizes[1] // 2\n\n            # Start typing at the center of the element\n            command = \"import pyautogui; \"\n            command += f\"pyautogui.click({x}, {y}); \"\n\n            if overwrite:\n                command += (\n                    f\"pyautogui.hotkey('ctrl', 'a'); pyautogui.press('backspace'); \"\n                )\n\n            command += f\"pyautogui.write({repr(text)}); \"\n\n            if enter:\n                command += \"pyautogui.press('enter'); \"\n        else:\n            # If no element is found, start typing at the current cursor location\n            command = \"import pyautogui; \"\n\n            if overwrite:\n                command += (\n                    f\"pyautogui.hotkey('ctrl', 'a'); pyautogui.press('backspace'); \"\n                )\n\n            command += f\"pyautogui.write({repr(text)}); \"\n\n            if enter:\n                command += \"pyautogui.press('enter'); \"\n\n        return command\n\n    @agent_action\n    def save_to_knowledge(self, text: List[str]):\n        \"\"\"Save facts, elements, texts, etc. 
to a long-term knowledge bank for reuse during this task. Can be used for copy-pasting text, saving elements, etc.\n        Args:\n            text:List[str] the text to save to the knowledge\n        \"\"\"\n        self.notes.extend(text)\n        return \"\"\"WAIT\"\"\"\n\n    @agent_action\n    def drag_and_drop(self, drag_from_id: int, drop_on_id: int, hold_keys: List = []):\n        \"\"\"Drag element1 and drop it on element2.\n        Args:\n            drag_from_id:int ID of element to drag\n            drop_on_id:int ID of element to drop on\n            hold_keys:List list of keys to hold while dragging\n        \"\"\"\n        node1 = self.find_element(drag_from_id)\n        node2 = self.find_element(drop_on_id)\n        coordinates1 = eval(\n            node1.get(\"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\")\n        )\n        sizes1 = eval(node1.get(\"{{{:}}}size\".format(component_ns), \"(-1, -1)\"))\n\n        coordinates2 = eval(\n            node2.get(\"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\")\n        )\n        sizes2 = eval(node2.get(\"{{{:}}}size\".format(component_ns), \"(-1, -1)\"))\n\n        # Calculate the center of the element\n        x1 = coordinates1[0] + sizes1[0] // 2\n        y1 = coordinates1[1] + sizes1[1] // 2\n\n        x2 = coordinates2[0] + sizes2[0] // 2\n        y2 = coordinates2[1] + sizes2[1] // 2\n\n        command = \"import pyautogui; \"\n\n        command += f\"pyautogui.moveTo({x1}, {y1}); \"\n        # TODO: specified duration?\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"pyautogui.dragTo({x2}, {y2}, duration=1.); pyautogui.mouseUp(); \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n\n        # Return pyautoguicode to drag and drop the elements\n\n        return command\n\n    @agent_action\n    def scroll(self, element_id: int, clicks: int):\n        \"\"\"Scroll the element in 
the specified direction\n        Args:\n            element_id:int ID of the element to scroll in\n            clicks:int the number of clicks to scroll can be positive (up) or negative (down).\n        \"\"\"\n        try:\n            node = self.find_element(element_id)\n        except:\n            node = self.find_element(0)\n        # print(node.attrib)\n        coordinates = eval(\n            node.get(\"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\")\n        )\n        sizes = eval(node.get(\"{{{:}}}size\".format(component_ns), \"(-1, -1)\"))\n\n        # Calculate the center of the element\n        x = coordinates[0] + sizes[0] // 2\n        y = coordinates[1] + sizes[1] // 2\n        return (\n            f\"import pyautogui; pyautogui.moveTo({x}, {y}); pyautogui.scroll({clicks})\"\n        )\n\n    @agent_action\n    def hotkey(self, keys: List):\n        \"\"\"Press a hotkey combination\n        Args:\n            keys:List the keys to press in combination in a list format (e.g. 
['ctrl', 'c'])\n        \"\"\"\n        # add quotes around the keys\n        keys = [f\"'{key}'\" for key in keys]\n        return f\"import pyautogui; pyautogui.hotkey({', '.join(keys)})\"\n\n    @agent_action\n    def hold_and_press(self, hold_keys: List, press_keys: List):\n        \"\"\"Hold a list of keys and press a list of keys\n        Args:\n            hold_keys:List, list of keys to hold\n            press_keys:List, list of keys to press in a sequence\n        \"\"\"\n\n        press_keys_str = \"[\" + \", \".join([f\"'{key}'\" for key in press_keys]) + \"]\"\n        command = \"import pyautogui; \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"pyautogui.press({press_keys_str}); \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n\n        return command\n\n    @agent_action\n    def wait(self, time: float):\n        \"\"\"Wait for a specified amount of time\n        Args:\n            time:float the amount of time to wait in seconds\n        \"\"\"\n        return f\"\"\"import time; time.sleep({time})\"\"\"\n\n    @agent_action\n    def done(self):\n        \"\"\"End the current task with a success\"\"\"\n        return \"\"\"DONE\"\"\"\n\n    @agent_action\n    def fail(self):\n        \"\"\"End the current task with a failure\"\"\"\n        return \"\"\"FAIL\"\"\"\n\n\ndef _create_atspi_node(\n    node: Accessible, depth: int = 0, flag: Optional[str] = None\n) -> _Element:\n    node_name = node.name\n    attribute_dict: Dict[str, Any] = {\"name\": node_name}\n\n    #  States\n    states: List[StateType] = node.getState().get_states()\n    for st in states:\n        state_name: str = StateType._enum_lookup[st]\n        state_name: str = state_name.split(\"_\", maxsplit=1)[1].lower()\n        if len(state_name) == 0:\n            continue\n        attribute_dict[\n            \"{{{:}}}{:}\".format(_accessibility_ns_map_ubuntu[\"st\"], 
state_name)\n        ] = \"true\"\n\n    #  Attributes\n    attributes: Dict[str, str] = node.get_attributes()\n    for attribute_name, attribute_value in attributes.items():\n        if len(attribute_name) == 0:\n            continue\n        attribute_dict[\n            \"{{{:}}}{:}\".format(_accessibility_ns_map_ubuntu[\"attr\"], attribute_name)\n        ] = attribute_value\n\n    #  Component\n    if (\n        attribute_dict.get(\n            \"{{{:}}}visible\".format(_accessibility_ns_map_ubuntu[\"st\"]), \"false\"\n        )\n        == \"true\"\n        and attribute_dict.get(\n            \"{{{:}}}showing\".format(_accessibility_ns_map_ubuntu[\"st\"]), \"false\"\n        )\n        == \"true\"\n    ):\n        try:\n            component: Component = node.queryComponent()\n        except NotImplementedError:\n            pass\n        else:\n            bbox: Sequence[int] = component.getExtents(pyatspi.XY_SCREEN)\n            attribute_dict[\n                \"{{{:}}}screencoord\".format(_accessibility_ns_map_ubuntu[\"cp\"])\n            ] = str(tuple(bbox[0:2]))\n            attribute_dict[\"{{{:}}}size\".format(_accessibility_ns_map_ubuntu[\"cp\"])] = (\n                str(tuple(bbox[2:]))\n            )\n\n    text = \"\"\n    #  Text\n    try:\n        text_obj: ATText = node.queryText()\n        # only text shown on current screen is available\n        # attribute_dict[\"txt:text\"] = text_obj.getText(0, text_obj.characterCount)\n        text: str = text_obj.getText(0, text_obj.characterCount)\n        # if flag==\"thunderbird\":\n        # appeared in thunderbird (uFFFC) (not only in thunderbird), \"Object\n        # Replacement Character\" in Unicode, \"used as placeholder in text for\n        # an otherwise unspecified object; uFFFD is another \"Replacement\n        # Character\", just in case\n        text = text.replace(\"\\ufffc\", \"\").replace(\"\\ufffd\", \"\")\n    except NotImplementedError:\n        pass\n\n    #  Image, Selection, 
Value, Action\n    try:\n        node.queryImage()\n        attribute_dict[\"image\"] = \"true\"\n    except NotImplementedError:\n        pass\n\n    try:\n        node.querySelection()\n        attribute_dict[\"selection\"] = \"true\"\n    except NotImplementedError:\n        pass\n\n    try:\n        value: ATValue = node.queryValue()\n        value_key = f\"{{{_accessibility_ns_map_ubuntu['val']}}}\"\n\n        for attr_name, attr_func in [\n            (\"value\", lambda: value.currentValue),\n            (\"min\", lambda: value.minimumValue),\n            (\"max\", lambda: value.maximumValue),\n            (\"step\", lambda: value.minimumIncrement),\n        ]:\n            try:\n                attribute_dict[f\"{value_key}{attr_name}\"] = str(attr_func())\n            except:\n                pass\n    except NotImplementedError:\n        pass\n\n    try:\n        action: ATAction = node.queryAction()\n        for i in range(action.nActions):\n            action_name: str = action.getName(i).replace(\" \", \"-\")\n            attribute_dict[\n                \"{{{:}}}{:}_desc\".format(\n                    _accessibility_ns_map_ubuntu[\"act\"], action_name\n                )\n            ] = action.getDescription(i)\n            attribute_dict[\n                \"{{{:}}}{:}_kb\".format(_accessibility_ns_map_ubuntu[\"act\"], action_name)\n            ] = action.getKeyBinding(i)\n    except NotImplementedError:\n        pass\n\n    # Add from here if we need more attributes in the future...\n\n    raw_role_name: str = node.getRoleName().strip()\n    node_role_name = (raw_role_name or \"unknown\").replace(\" \", \"-\")\n\n    if not flag:\n        if raw_role_name == \"document spreadsheet\":\n            flag = \"calc\"\n        if raw_role_name == \"application\" and node.name == \"Thunderbird\":\n            flag = \"thunderbird\"\n\n    xml_node = lxml.etree.Element(\n        node_role_name, attrib=attribute_dict, nsmap=_accessibility_ns_map_ubuntu\n    
)\n\n    if len(text) > 0:\n        xml_node.text = text\n\n    if depth == MAX_DEPTH:\n        logger.warning(\"Max depth reached\")\n        return xml_node\n\n    if flag == \"calc\" and node_role_name == \"table\":\n        # Maximum column: 1024 if ver<=7.3 else 16384\n        # Maximum row: 1048576\n        # Maximum sheet: 10000\n\n        global libreoffice_version_tuple\n        MAXIMUN_COLUMN = 1024 if libreoffice_version_tuple < (7, 4) else 16384\n        MAX_ROW = 104_8576\n\n        index_base = 0\n        first_showing = False\n        column_base = None\n        for r in range(MAX_ROW):\n            for clm in range(column_base or 0, MAXIMUN_COLUMN):\n                child_node: Accessible = node[index_base + clm]\n                showing: bool = child_node.getState().contains(STATE_SHOWING)\n                if showing:\n                    child_node: _Element = _create_atspi_node(\n                        child_node, depth + 1, flag\n                    )\n                    if not first_showing:\n                        column_base = clm\n                        first_showing = True\n                    xml_node.append(child_node)\n                elif first_showing and column_base is not None or clm >= 500:\n                    break\n            if first_showing and clm == column_base or not first_showing and r >= 500:\n                break\n            index_base += MAXIMUN_COLUMN\n        return xml_node\n    else:\n        try:\n            for i, ch in enumerate(node):\n                if i == MAX_WIDTH:\n                    logger.warning(\"Max width reached\")\n                    break\n                xml_node.append(_create_atspi_node(ch, depth + 1, flag))\n        except:\n            logger.warning(\n                \"Error occurred during children traversing. Has Ignored. 
Node: %s\",\n                lxml.etree.tostring(xml_node, encoding=\"unicode\"),\n            )\n        return xml_node\n\n\nclass UIElement(object):\n    def __init__(self, node):\n        self.node = node\n\n    def getAttributeNames(self):\n        attributes = self.node.getAttributes()\n\n    @staticmethod\n    def systemWideElement():\n        # desktop = pyatspi.Registry.getDesktop(0)\n        # for app in desktop:\n        #     for window in app:\n        #         if window.getState().contains(pyatspi.STATE_ACTIVE):\n        #             active_node = app\n        # return UIElement(active_node)\n        desktop: Accessible = pyatspi.Registry.getDesktop(0)\n        xml_node = lxml.etree.Element(\n            \"desktop-frame\", nsmap=_accessibility_ns_map_ubuntu\n        )\n        with concurrent.futures.ThreadPoolExecutor() as executor:\n            futures = [\n                executor.submit(_create_atspi_node, app_node, 1) for app_node in desktop\n            ]\n            for future in concurrent.futures.as_completed(futures):\n                xml_tree = future.result()\n                xml_node.append(xml_tree)\n        return lxml.etree.tostring(xml_node, encoding=\"unicode\")\n\n    @property\n    def states(self):\n        state_names = []\n        states: List[StateType] = self.node.getState().get_states()\n        for st in states:\n            state_name: str = StateType._enum_lookup[st]\n            state_names.append(state_name)\n        return state_names\n\n    @property\n    def attributes(self):\n        try:\n            attributes: List[str] = self.node.getAttributes()\n            attribute_dict = {}\n            for attrbt in attributes:\n                attribute_name: str\n                attribute_value: str\n                attribute_name, attribute_value = attrbt.split(\":\", maxsplit=1)\n                attribute_dict[attribute_name] = attribute_value\n            return attribute_dict\n        except NotImplementedError:\n  
          return None\n\n    @property\n    def component(self):\n        try:\n            component: Component = self.node.queryComponent()\n            return component\n        except NotImplementedError:\n            return None\n\n    @property\n    def value(self):\n        try:\n            value: ATValue = self.node.queryValue()\n            return value\n        except NotImplementedError:\n            return None\n\n    @property\n    def text(self):\n        try:\n            text_obj: ATText = self.node.queryText()\n        except NotImplementedError:\n            return \"\"\n        else:\n            text: str = text_obj.getText(0, text_obj.characterCount)\n            text = text.replace(\"\\ufffc\", \"\").replace(\"\\ufffd\", \"\")\n            return text\n\n    @property\n    def role(self):\n        return self.node.getRoleName()\n\n    def children(self):\n        \"\"\"Return list of children of the current node\"\"\"\n        return list(self.node)\n\n    def __repr__(self):\n        return \"UIElement%s\" % (self.node)\n"
  },
  {
    "path": "gui_agents/s1/aci/MacOSACI.py",
    "content": "import base64\nimport os\nfrom typing import Any, Dict, List, Tuple\n\nimport numpy as np\nimport requests\nimport platform\nfrom gui_agents.s1.utils.common_utils import box_iou\n\nif platform.system() == \"Darwin\":\n    from AppKit import *\n    from ApplicationServices import (\n        AXUIElementCopyAttributeNames,\n        AXUIElementCopyAttributeValue,\n        AXUIElementCreateSystemWide,\n    )\n\nfrom gui_agents.s1.aci.ACI import ACI, agent_action\n\n\ndef _normalize_key(key: str) -> str:\n    \"\"\"Convert 'cmd' to 'command' for pyautogui compatibility\"\"\"\n    return \"command\" if key == \"cmd\" else key\n\n\ndef list_apps_in_directories(directories):\n    apps = []\n    for directory in directories:\n        if os.path.exists(directory):\n            directory_apps = [\n                app for app in os.listdir(directory) if app.endswith(\".app\")\n            ]\n            apps.extend(directory_apps)\n    return apps\n\n\nclass MacOSACI(ACI):\n    def __init__(self, top_app_only: bool = True, ocr: bool = False):\n        super().__init__(top_app_only=top_app_only, ocr=ocr)\n        # Directories to search for applications in MacOS\n        directories_to_search = [\"/System/Applications\", \"/Applications\"]\n        self.all_apps = list_apps_in_directories(directories_to_search)\n\n    def get_active_apps(self, obs: Dict) -> List[str]:\n        return UIElement.get_current_applications(obs)\n\n    def get_top_app(self, obs: Dict) -> str:\n        return UIElement.get_top_app(obs)\n\n    def preserve_nodes(self, tree, exclude_roles=None):\n        if exclude_roles is None:\n            exclude_roles = set()\n\n        preserved_nodes = []\n\n        # Inner function to recursively traverse the accessibility tree\n        def traverse_and_preserve(element):\n            role = element.attribute(\"AXRole\")\n\n            if role not in exclude_roles:\n                # TODO: get coordinate values directly from interface\n            
    position = element.attribute(\"AXPosition\")\n                size = element.attribute(\"AXSize\")\n                if position and size:\n                    pos_parts = position.__repr__().split().copy()\n                    # Find the parts containing 'x:' and 'y:'\n                    x_part = next(part for part in pos_parts if part.startswith(\"x:\"))\n                    y_part = next(part for part in pos_parts if part.startswith(\"y:\"))\n\n                    # Extract the numerical values after 'x:' and 'y:'\n                    x = float(x_part.split(\":\")[1])\n                    y = float(y_part.split(\":\")[1])\n\n                    size_parts = size.__repr__().split().copy()\n                    # Find the parts containing 'w:' and 'h:'\n                    width_part = next(\n                        part for part in size_parts if part.startswith(\"w:\")\n                    )\n                    height_part = next(\n                        part for part in size_parts if part.startswith(\"h:\")\n                    )\n\n                    # Extract the numerical values after 'w:' and 'h:'\n                    w = float(width_part.split(\":\")[1])\n                    h = float(height_part.split(\":\")[1])\n\n                    if x >= 0 and y >= 0 and w > 0 and h > 0:\n                        preserved_nodes.append(\n                            {\n                                \"position\": (x, y),\n                                \"size\": (w, h),\n                                \"title\": str(element.attribute(\"AXTitle\")),\n                                \"text\": str(element.attribute(\"AXDescription\"))\n                                or str(element.attribute(\"AXValue\")),\n                                \"role\": str(element.attribute(\"AXRole\")),\n                            }\n                        )\n\n            children = element.children()\n            if children:\n                for child_ref in 
children:\n                    child_element = UIElement(child_ref)\n                    traverse_and_preserve(child_element)\n\n        # Start traversing from the given element\n        traverse_and_preserve(tree)\n\n        return preserved_nodes\n\n    def extract_elements_from_screenshot(self, screenshot: bytes) -> Dict[str, Any]:\n        url = os.environ.get(\"OCR_SERVER_ADDRESS\")\n        if not url:\n            raise EnvironmentError(\"OCR SERVER ADDRESS NOT SET\")\n\n        encoded_screenshot = base64.b64encode(screenshot).decode(\"utf-8\")\n        response = requests.post(url, json={\"img_bytes\": encoded_screenshot})\n\n        if response.status_code != 200:\n            return {\n                \"error\": f\"Request failed with status code {response.status_code}\",\n                \"results\": [],\n            }\n        return response.json()\n\n    def add_ocr_elements(\n        self,\n        screenshot,\n        linearized_accessibility_tree: List[str],\n        preserved_nodes: List[Dict],\n    ) -> Tuple[List[str], List[Dict]]:\n        \"\"\"\n        Add OCR-detected elements to the accessibility tree if they don't overlap with existing elements\n        Uses optimized NumPy implementation\n        \"\"\"\n        # Convert preserved nodes to numpy array of bounding boxes\n        if preserved_nodes:\n            tree_bboxes = np.array(\n                [\n                    [\n                        node[\"position\"][0],\n                        node[\"position\"][1],\n                        node[\"position\"][0] + node[\"size\"][0],\n                        node[\"position\"][1] + node[\"size\"][1],\n                    ]\n                    for node in preserved_nodes\n                ],\n                dtype=np.float32,\n            )\n        else:\n            tree_bboxes = np.empty((0, 4), dtype=np.float32)\n\n        try:\n            ocr_bboxes = self.extract_elements_from_screenshot(screenshot)\n        except Exception 
as e:\n            print(f\"Error: {e}\")\n            ocr_bboxes = []\n        else:\n            if ocr_bboxes:\n                preserved_nodes_index = len(preserved_nodes)\n\n                # Convert OCR boxes to numpy array\n                ocr_boxes_array = np.array(\n                    [\n                        [\n                            int(box.get(\"left\", 0)),\n                            int(box.get(\"top\", 0)),\n                            int(box.get(\"right\", 0)),\n                            int(box.get(\"bottom\", 0)),\n                        ]\n                        for _, _, box in ocr_bboxes\n                    ],\n                    dtype=np.float32,\n                )\n\n                # Calculate max IOUs efficiently\n                if len(tree_bboxes) > 0:\n                    max_ious = box_iou(tree_bboxes, ocr_boxes_array).max(axis=0)\n                else:\n                    max_ious = np.zeros(len(ocr_boxes_array))\n\n                # Process boxes with low IOU\n                for idx, ((_, content, box), max_iou) in enumerate(\n                    zip(ocr_bboxes, max_ious)\n                ):\n                    if max_iou < 0.1:\n                        x1 = int(box.get(\"left\", 0))\n                        y1 = int(box.get(\"top\", 0))\n                        x2 = int(box.get(\"right\", 0))\n                        y2 = int(box.get(\"bottom\", 0))\n\n                        linearized_accessibility_tree.append(\n                            f\"{preserved_nodes_index}\\tAXButton\\t\\t{content}\\t\\t\"\n                        )\n\n                        node = {\n                            \"position\": (x1, y1),\n                            \"size\": (x2 - x1, y2 - y1),\n                            \"title\": \"\",\n                            \"text\": content,\n                            \"role\": \"AXButton\",\n                        }\n                        preserved_nodes.append(node)\n                 
       preserved_nodes_index += 1\n\n        return linearized_accessibility_tree, preserved_nodes\n\n    def linearize_and_annotate_tree(\n        self, obs: Dict, show_all_elements: bool = False\n    ) -> str:\n        accessibility_tree = obs[\"accessibility_tree\"]\n        screenshot = obs[\"screenshot\"]\n        self.top_app = (\n            NSWorkspace.sharedWorkspace().frontmostApplication().localizedName()\n        )\n        tree = UIElement(accessibility_tree.attribute(\"AXFocusedApplication\"))\n        exclude_roles = [\"AXGroup\", \"AXLayoutArea\", \"AXLayoutItem\", \"AXUnknown\"]\n        preserved_nodes = self.preserve_nodes(tree, exclude_roles).copy()\n        tree_elements = [\"id\\trole\\ttitle\\ttext\"]\n        for idx, node in enumerate(preserved_nodes):\n            tree_elements.append(\n                f\"{idx}\\t{node['role']}\\t{node['title']}\\t{node['text']}\"\n            )\n\n        if self.ocr:\n            tree_elements, preserved_nodes = self.add_ocr_elements(\n                screenshot, tree_elements, preserved_nodes, \"AXButton\"\n            )\n\n        self.nodes = preserved_nodes\n        return \"\\n\".join(tree_elements)\n\n    def find_element(self, element_id: int) -> Dict:\n        try:\n            return self.nodes[element_id]\n        except IndexError:\n            print(\"The index of the selected element was out of range.\")\n            self.index_out_of_range_flag = True\n            return self.nodes[0]\n\n    @agent_action\n    def open(self, app_or_file_name: str):\n        \"\"\"Open an application or file\n        Args:\n            app_or_file_name:str, the name of the application or file to open\n        \"\"\"\n        return f\"import pyautogui; import time; pyautogui.hotkey('command', 'space', interval=0.5); pyautogui.typewrite({repr(app_or_file_name)}); pyautogui.press('enter'); time.sleep(1.0)\"\n\n    @agent_action\n    def switch_applications(self, app_or_file_name):\n        \"\"\"Switch to a 
different application. Utility function to use instead of command+tab\n        Args:\n            app_or_file_name:str, the name of the application or file to switch to\n        \"\"\"\n        return f\"import pyautogui; import time; pyautogui.hotkey('command', 'space', interval=0.5); pyautogui.typewrite({repr(app_or_file_name)}); pyautogui.press('enter'); time.sleep(1.0)\"\n\n    @agent_action\n    def click(\n        self,\n        element_id: int,\n        num_clicks: int = 1,\n        button_type: str = \"left\",\n        hold_keys: List = [],\n    ):\n        \"\"\"Click on the element\n        Args:\n            element_id:int, ID of the element to click on\n            num_clicks:int, number of times to click the element\n            button_type:str, which mouse button to press can be \"left\", \"middle\", or \"right\"\n            hold_keys:List, list of keys to hold while clicking\n        \"\"\"\n        node = self.find_element(element_id)\n        coordinates: Tuple[int, int] = node[\"position\"]\n        sizes: Tuple[int, int] = node[\"size\"]\n\n        # Calculate the center of the element\n        x = coordinates[0] + sizes[0] // 2\n        y = coordinates[1] + sizes[1] // 2\n\n        command = \"import pyautogui; \"\n\n        # Normalize any 'cmd' to 'command'\n        hold_keys = [_normalize_key(k) for k in hold_keys]\n\n        # TODO: specified duration?\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"\"\"import pyautogui; pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); \"\"\"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n        # Return pyautogui code to click on the element\n        return command\n\n    @agent_action\n    def type(\n        self,\n        element_id: int = None,\n        text: str = \"\",\n        overwrite: bool = False,\n        enter: bool = False,\n    ):\n        \"\"\"Type text into 
the element\n        Args:\n            element_id:int ID of the element to type into. If not provided, typing will start at the current cursor location.\n            text:str the text to type\n            overwrite:bool Assign it to True if the text should overwrite the existing text, otherwise assign it to False. Using this argument clears all text in an element.\n            enter:bool Assign it to True if the enter (return) key should be pressed after typing the text, otherwise assign it to False.\n        \"\"\"\n        try:\n            # Use the provided element_id or default to None\n            node = self.find_element(element_id) if element_id is not None else None\n        except:\n            node = None\n\n        if node is not None:\n            # If a node is found, retrieve its coordinates and size\n            coordinates = node[\"position\"]\n            sizes = node[\"size\"]\n\n            # Calculate the center of the element\n            x = coordinates[0] + sizes[0] // 2\n            y = coordinates[1] + sizes[1] // 2\n\n            # Start typing at the center of the element\n            command = \"import pyautogui; \"\n            command += f\"pyautogui.click({x}, {y}); \"\n\n            if overwrite:\n                # Use 'command' instead of 'cmd'\n                command += f\"pyautogui.hotkey('command', 'a', interval=1); pyautogui.press('backspace'); \"\n\n            command += f\"pyautogui.write({repr(text)}); \"\n\n            if enter:\n                command += \"pyautogui.press('enter'); \"\n        else:\n            # If no element is found, start typing at the current cursor location\n            command = \"import pyautogui; \"\n\n            if overwrite:\n                # Use 'command' instead of 'cmd'\n                command += f\"pyautogui.hotkey('command', 'a', interval=1); pyautogui.press('backspace'); \"\n\n            command += f\"pyautogui.write({repr(text)}); \"\n\n            if enter:\n                
command += \"pyautogui.press('enter'); \"\n\n        return command\n\n    @agent_action\n    def save_to_knowledge(self, text: List[str]):\n        \"\"\"Save facts, elements, texts, etc. to a long-term knowledge for reuse during this task. Can be used for copy-pasting text, saving elements, etc. Use this instead of ctrl+c, ctrl+v.\n        Args:\n            text:List[str] the text to save to the knowledge\n        \"\"\"\n        self.notes.extend(text)\n        return \"\"\"WAIT\"\"\"\n\n    @agent_action\n    def drag_and_drop(self, drag_from_id: int, drop_on_id: int, hold_keys: List = []):\n        \"\"\"Drag element1 and drop it on element2.\n        Args:\n            drag_from_id:int ID of element to drag\n            drop_on_id:int ID of element to drop on\n            hold_keys:List list of keys to hold while dragging\n        \"\"\"\n        node1 = self.find_element(drag_from_id)\n        node2 = self.find_element(drop_on_id)\n        coordinates1 = node1[\"position\"]\n        sizes1 = node1[\"size\"]\n\n        coordinates2 = node2[\"position\"]\n        sizes2 = node2[\"size\"]\n\n        # Calculate the center of the element\n        x1 = coordinates1[0] + sizes1[0] // 2\n        y1 = coordinates1[1] + sizes1[1] // 2\n\n        x2 = coordinates2[0] + sizes2[0] // 2\n        y2 = coordinates2[1] + sizes2[1] // 2\n\n        command = \"import pyautogui; \"\n\n        command += f\"pyautogui.moveTo({x1}, {y1}); \"\n        # TODO: specified duration?\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"pyautogui.dragTo({x2}, {y2}, duration=1.); pyautogui.mouseUp(); \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n\n        # Return pyautoguicode to drag and drop the elements\n\n        return command\n\n    @agent_action\n    def scroll(self, element_id: int, clicks: int):\n        \"\"\"Scroll in the specified direction inside the specified element\n  
      Args:\n            element_id:int ID of the element to scroll in\n            clicks:int the number of clicks to scroll can be positive (up) or negative (down).\n        \"\"\"\n        try:\n            node = self.find_element(element_id)\n        except:\n            node = self.find_element(0)\n        # print(node.attrib)\n        coordinates = node[\"position\"]\n        sizes = node[\"size\"]\n\n        # Calculate the center of the element\n        x = coordinates[0] + sizes[0] // 2\n        y = coordinates[1] + sizes[1] // 2\n        return (\n            f\"import pyautogui; pyautogui.moveTo({x}, {y}); pyautogui.scroll({clicks})\"\n        )\n\n    @agent_action\n    def hotkey(self, keys: List):\n        \"\"\"Press a hotkey combination\n        Args:\n            keys:List the keys to press in combination in a list format (e.g. ['shift', 'c'])\n        \"\"\"\n        # Normalize any 'cmd' to 'command'\n        keys = [_normalize_key(k) for k in keys]\n        # add quotes around the keys\n        keys = [f\"'{key}'\" for key in keys]\n        return f\"import pyautogui; pyautogui.hotkey({', '.join(keys)}, interval=1)\"\n\n    @agent_action\n    def hold_and_press(self, hold_keys: List, press_keys: List):\n        \"\"\"Hold a list of keys and press a list of keys\n        Args:\n            hold_keys:List, list of keys to hold\n            press_keys:List, list of keys to press in a sequence\n        \"\"\"\n        # Normalize any 'cmd' to 'command' in both lists\n        hold_keys = [_normalize_key(k) for k in hold_keys]\n        press_keys = [_normalize_key(k) for k in press_keys]\n\n        press_keys_str = \"[\" + \", \".join([f\"'{key}'\" for key in press_keys]) + \"]\"\n        command = \"import pyautogui; \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"pyautogui.press({press_keys_str}); \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); 
\"\n\n        return command\n\n    @agent_action\n    def wait(self, time: float):\n        \"\"\"Wait for a specified amount of time\n        Args:\n            time:float the amount of time to wait in seconds\n        \"\"\"\n        return f\"\"\"import time; time.sleep({time})\"\"\"\n\n    @agent_action\n    def done(self):\n        \"\"\"End the current task with a success\"\"\"\n        return \"\"\"DONE\"\"\"\n\n    @agent_action\n    def fail(self):\n        \"\"\"End the current task with a failure\"\"\"\n        return \"\"\"FAIL\"\"\"\n\n\nclass UIElement(object):\n\n    def __init__(self, ref=None):\n        self.ref = ref\n\n    def getAttributeNames(self):\n        error_code, attributeNames = AXUIElementCopyAttributeNames(self.ref, None)\n        return list(attributeNames)\n\n    def attribute(self, key: str):\n        error, value = AXUIElementCopyAttributeValue(self.ref, key, None)\n        return value\n\n    def children(self):\n        return self.attribute(\"AXChildren\")\n\n    def systemWideElement():\n        ref = AXUIElementCreateSystemWide()\n        return UIElement(ref)\n\n    def role(self):\n        return self.attribute(\"AXRole\")\n\n    def position(self):\n        pos = self.attribute(\"AXPosition\")\n        if pos is None:\n            return None\n        pos_parts = pos.__repr__().split().copy()\n        # Find the parts containing 'x:' and 'y:'\n        x_part = next(part for part in pos_parts if part.startswith(\"x:\"))\n        y_part = next(part for part in pos_parts if part.startswith(\"y:\"))\n\n        # Extract the numerical values after 'x:' and 'y:'\n        x = float(x_part.split(\":\")[1])\n        y = float(y_part.split(\":\")[1])\n\n        return (x, y)\n\n    def size(self):\n        size = self.attribute(\"AXSize\")\n        if size is None:\n            return None\n        size_parts = size.__repr__().split().copy()\n        # Find the parts containing 'Width:' and 'Height:'\n        width_part = next(part 
for part in size_parts if part.startswith(\"w:\"))\n        height_part = next(part for part in size_parts if part.startswith(\"h:\"))\n\n        # Extract the numerical values after 'Width:' and 'Height:'\n        w = float(width_part.split(\":\")[1])\n        h = float(height_part.split(\":\")[1])\n        return (w, h)\n\n    def isValid(self):\n        if self.position() is not None and self.size() is not None:\n            return True\n\n    def parse(self, element):\n        position = element.position(element)\n        size = element.size(element)\n        return {\n            \"position\": position,\n            \"size\": size,\n            \"title\": str(element.attribute(\"AXTitle\")),\n            \"text\": str(element.attribute(\"AXDescription\"))\n            or str(element.attribute(\"AXValue\")),\n            \"role\": str(element.attribute(\"AXRole\")),\n        }\n\n    @staticmethod\n    def get_current_applications(obs: Dict):\n        # Get the shared workspace instance\n        workspace = NSWorkspace.sharedWorkspace()\n\n        # Get a list of running applications\n        running_apps = workspace.runningApplications()\n\n        # Iterate through the list and print each application's name\n        current_apps = []\n        for app in running_apps:\n            if app.activationPolicy() == 0:\n                app_name = app.localizedName()\n                current_apps.append(app_name)\n\n        return current_apps\n\n    @staticmethod\n    def list_apps_in_directories():\n        directories_to_search = [\"/System/Applications\", \"/Applications\"]\n        apps = []\n        for directory in directories_to_search:\n            if os.path.exists(directory):\n                directory_apps = [\n                    app for app in os.listdir(directory) if app.endswith(\".app\")\n                ]\n                apps.extend(directory_apps)\n        return apps\n\n    @staticmethod\n    def get_top_app(obs: Dict):\n        return 
NSWorkspace.sharedWorkspace().frontmostApplication().localizedName()\n\n    def __repr__(self):\n        return \"UIElement%s\" % (self.ref)\n"
  },
  {
    "path": "gui_agents/s1/aci/WindowsOSACI.py",
    "content": "import base64\nimport os\nimport platform\nfrom typing import Any, Dict, List, Tuple\n\nimport numpy as np\nimport psutil\nimport requests\nfrom gui_agents.s1.utils.common_utils import box_iou\n\nif platform.system() == \"Windows\":\n    import pywinauto\n    from pywinauto import Desktop\n    import win32gui\n    import win32process\n\nfrom gui_agents.s1.aci.ACI import ACI, agent_action\n\n\n# Helper functions\ndef _normalize_key(key: str) -> str:\n    \"\"\"Convert 'ctrl' to 'control' for pyautogui compatibility\"\"\"\n    return \"ctrl\" if key == \"control\" else key\n\n\ndef list_apps_in_directories():\n    directories_to_search = [\n        os.environ.get(\"PROGRAMFILES\", \"C:\\\\Program Files\"),\n        os.environ.get(\"PROGRAMFILES(X86)\", \"C:\\\\Program Files (x86)\"),\n    ]\n    apps = []\n    for directory in directories_to_search:\n        if os.path.exists(directory):\n            for root, dirs, files in os.walk(directory):\n                for file in files:\n                    if file.endswith(\".exe\"):\n                        apps.append(file)\n    return apps\n\n\n# WindowsACI Class\nclass WindowsACI(ACI):\n    def __init__(self, top_app_only: bool = True, ocr: bool = False):\n        super().__init__(top_app_only=top_app_only, ocr=ocr)\n        self.nodes = []\n        self.all_apps = list_apps_in_directories()\n\n    def get_active_apps(self, obs: Dict) -> List[str]:\n        return UIElement.get_current_applications(obs)\n\n    def get_top_app(self, obs: Dict) -> str:\n        return UIElement.get_top_app(obs)\n\n    def preserve_nodes(self, tree, exclude_roles=None):\n        if exclude_roles is None:\n            exclude_roles = set()\n\n        preserved_nodes = []\n\n        def traverse_and_preserve(element):\n            role = element.role()\n\n            if role not in exclude_roles:\n                position = element.position()\n                size = element.size()\n                if position and size:\n     
               x, y = position\n                    w, h = size\n\n                    if x >= 0 and y >= 0 and w > 0 and h > 0:\n                        preserved_nodes.append(\n                            {\n                                \"position\": (x, y),\n                                \"size\": (w, h),\n                                \"title\": element.title(),\n                                \"text\": element.text(),\n                                \"role\": role,\n                            }\n                        )\n\n            children = element.children()\n            if children:\n                for child_element in children:\n                    traverse_and_preserve(child_element)\n\n        traverse_and_preserve(tree)\n        return preserved_nodes\n\n    def extract_elements_from_screenshot(self, screenshot: bytes) -> Dict[str, Any]:\n        url = os.environ.get(\"OCR_SERVER_ADDRESS\")\n        if not url:\n            raise EnvironmentError(\"OCR SERVER ADDRESS NOT SET\")\n\n        encoded_screenshot = base64.b64encode(screenshot).decode(\"utf-8\")\n        response = requests.post(url, json={\"img_bytes\": encoded_screenshot})\n\n        if response.status_code != 200:\n            return {\n                \"error\": f\"Request failed with status code {response.status_code}\",\n                \"results\": [],\n            }\n        return response.json()\n\n    def add_ocr_elements(\n        self,\n        screenshot,\n        linearized_accessibility_tree: List[str],\n        preserved_nodes: List[Dict],\n    ) -> Tuple[List[str], List[Dict]]:\n        \"\"\"\n        Add OCR-detected elements to the accessibility tree if they don't overlap with existing elements\n        Uses optimized NumPy implementation\n        \"\"\"\n        # Convert preserved nodes to numpy array of bounding boxes\n        if preserved_nodes:\n            tree_bboxes = np.array(\n                [\n                    [\n                        
node[\"position\"][0],\n                        node[\"position\"][1],\n                        node[\"position\"][0] + node[\"size\"][0],\n                        node[\"position\"][1] + node[\"size\"][1],\n                    ]\n                    for node in preserved_nodes\n                ],\n                dtype=np.float32,\n            )\n        else:\n            tree_bboxes = np.empty((0, 4), dtype=np.float32)\n\n        try:\n            ocr_bboxes = self.extract_elements_from_screenshot(screenshot)\n        except Exception as e:\n            print(f\"Error: {e}\")\n            ocr_bboxes = []\n        else:\n            if ocr_bboxes:\n                preserved_nodes_index = len(preserved_nodes)\n\n                # Convert OCR boxes to numpy array\n                ocr_boxes_array = np.array(\n                    [\n                        [\n                            int(box.get(\"left\", 0)),\n                            int(box.get(\"top\", 0)),\n                            int(box.get(\"right\", 0)),\n                            int(box.get(\"bottom\", 0)),\n                        ]\n                        for _, _, box in ocr_bboxes[\"results\"]\n                    ],\n                    dtype=np.float32,\n                )\n\n                # Calculate max IOUs efficiently\n                if len(tree_bboxes) > 0:\n                    max_ious = box_iou(tree_bboxes, ocr_boxes_array).max(axis=0)\n                else:\n                    max_ious = np.zeros(len(ocr_boxes_array))\n\n                # Process boxes with low IOU\n                for idx, ((_, content, box), max_iou) in enumerate(\n                    zip(ocr_bboxes[\"results\"], max_ious)\n                ):\n                    if max_iou < 0.1:\n                        x1 = int(box.get(\"left\", 0))\n                        y1 = int(box.get(\"top\", 0))\n                        x2 = int(box.get(\"right\", 0))\n                        y2 = int(box.get(\"bottom\", 0))\n\n   
                     linearized_accessibility_tree.append(\n                            f\"{preserved_nodes_index}\\tButton\\t\\t{content}\\t\\t\"\n                        )\n\n                        node = {\n                            \"position\": (x1, y1),\n                            \"size\": (x2 - x1, y2 - y1),\n                            \"title\": \"\",\n                            \"text\": content,\n                            \"role\": \"Button\",\n                        }\n                        preserved_nodes.append(node)\n                        preserved_nodes_index += 1\n\n        return linearized_accessibility_tree, preserved_nodes\n\n    def linearize_and_annotate_tree(\n        self, obs: Dict, show_all_elements: bool = False\n    ) -> str:\n        desktop = Desktop(backend=\"uia\")\n        try:\n            tree = desktop.window(\n                handle=win32gui.GetForegroundWindow()\n            ).wrapper_object()\n        except Exception as e:\n            print(f\"Error accessing foreground window: {e}\")\n            self.nodes = []\n            return \"\"\n\n        exclude_roles = [\"Pane\", \"Group\", \"Unknown\"]\n        preserved_nodes = self.preserve_nodes(UIElement(tree), exclude_roles).copy()\n\n        if not preserved_nodes and show_all_elements:\n            preserved_nodes = self.preserve_nodes(\n                UIElement(tree), exclude_roles=[]\n            ).copy()\n\n        tree_elements = [\"id\\trole\\ttitle\\ttext\"]\n        for idx, node in enumerate(preserved_nodes):\n            tree_elements.append(\n                f\"{idx}\\t{node['role']}\\t{node['title']}\\t{node['text']}\"\n            )\n\n        if self.ocr:\n            screenshot = obs.get(\"screenshot\", None)\n            if screenshot is not None:\n                # return tree_elements, preserved_nodes\n                tree_elements, preserved_nodes = self.add_ocr_elements(\n                    screenshot, tree_elements, preserved_nodes\n    
            )\n\n        self.nodes = preserved_nodes\n        return \"\\n\".join(tree_elements)\n\n    def find_element(self, element_id: int) -> Dict:\n        if not self.nodes:\n            print(\"No elements found in the accessibility tree.\")\n            raise IndexError(\"No elements to select.\")\n        try:\n            return self.nodes[element_id]\n        except IndexError:\n            print(\"The index of the selected element was out of range.\")\n            self.index_out_of_range_flag = True\n            return self.nodes[0]\n\n    @agent_action\n    def open(self, app_or_file_name: str):\n        \"\"\"Open an application or file\n        Args:\n            app_or_file_name:str, the name of the application or file to open\n        \"\"\"\n        command = f\"import pyautogui; import time; pyautogui.hotkey('win', 'r', interval=0.5); pyautogui.typewrite({repr(app_or_file_name)}); pyautogui.press('enter'); time.sleep(1.0)\"\n        return command\n\n    @agent_action\n    def switch_applications(self, app_or_file_name):\n        \"\"\"Switch to a different application. 
Utility function to use instead of alt+tab\n        Args:\n            app_or_file_name:str, the name of the application or file to switch to\n        \"\"\"\n        command = f\"import pyautogui; import time; pyautogui.hotkey('win', 'd', interval=0.5); pyautogui.typewrite({repr(app_or_file_name)}); pyautogui.press('enter'); time.sleep(1.0)\"\n        return command\n\n    @agent_action\n    def click(\n        self,\n        element_id: int,\n        num_clicks: int = 1,\n        button_type: str = \"left\",\n        hold_keys: List = [],\n    ):\n        \"\"\"Click on the element\n        Args:\n            element_id:int, ID of the element to click on\n            num_clicks:int, number of times to click the element\n            button_type:str, which mouse button to press can be \"left\", \"middle\", or \"right\"\n            hold_keys:List, list of keys to hold while clicking\n        \"\"\"\n        node = self.find_element(element_id)\n        coordinates: Tuple[int, int] = node[\"position\"]\n        sizes: Tuple[int, int] = node[\"size\"]\n\n        # Calculate the center of the element\n        x = int(coordinates[0] + sizes[0] // 2)\n        y = int(coordinates[1] + sizes[1] // 2)\n\n        command = \"import pyautogui; \"\n\n        # Normalize any 'ctrl' to 'control'\n        hold_keys = [_normalize_key(k) for k in hold_keys]\n\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"\"\"pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); \"\"\"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n        return command\n\n    @agent_action\n    def type(\n        self,\n        element_id: int = None,\n        text: str = \"\",\n        overwrite: bool = False,\n        enter: bool = False,\n    ):\n        \"\"\"Type text into the element\n        Args:\n            element_id:int ID of the element to type into. 
If not provided, typing will start at the current cursor location.\n            text:str the text to type\n            overwrite:bool Assign it to True if the text should overwrite the existing text, otherwise assign it to False. Using this argument clears all text in an element.\n            enter:bool Assign it to True if the enter key should be pressed after typing the text, otherwise assign it to False.\n        \"\"\"\n        try:\n            node = self.find_element(element_id) if element_id is not None else None\n        except:\n            node = None\n\n        if node is not None:\n            coordinates = node[\"position\"]\n            sizes = node[\"size\"]\n\n            x = int(coordinates[0] + sizes[0] // 2)\n            y = int(coordinates[1] + sizes[1] // 2)\n\n            command = \"import pyautogui; \"\n            command += f\"pyautogui.click({x}, {y}); \"\n\n            if overwrite:\n                command += f\"pyautogui.hotkey('ctrl', 'a', interval=0.5); pyautogui.press('backspace'); \"\n\n            command += f\"pyautogui.write({repr(text)}); \"\n\n            if enter:\n                command += \"pyautogui.press('enter'); \"\n        else:\n            command = \"import pyautogui; \"\n\n            if overwrite:\n                command += f\"pyautogui.hotkey('ctrl', 'a', interval=0.5); pyautogui.press('backspace'); \"\n\n            command += f\"pyautogui.write({repr(text)}); \"\n\n            if enter:\n                command += \"pyautogui.press('enter'); \"\n\n        return command\n\n    @agent_action\n    def save_to_knowledge(self, text: List[str]):\n        \"\"\"Save facts, elements, texts, etc. to a long-term knowledge for reuse during this task. Can be used for copy-pasting text, saving elements, etc. 
Use this instead of ctrl+c, ctrl+v.\n        Args:\n            text:List[str] the text to save to the knowledge\n        \"\"\"\n        self.notes.extend(text)\n        return \"\"\"WAIT\"\"\"\n\n    @agent_action\n    def drag_and_drop(self, drag_from_id: int, drop_on_id: int, hold_keys: List = []):\n        \"\"\"Drag element1 and drop it on element2.\n        Args:\n            drag_from_id:int ID of element to drag\n            drop_on_id:int ID of element to drop on\n            hold_keys:List list of keys to hold while dragging\n        \"\"\"\n        node1 = self.find_element(drag_from_id)\n        node2 = self.find_element(drop_on_id)\n        coordinates1 = node1[\"position\"]\n        sizes1 = node1[\"size\"]\n\n        coordinates2 = node2[\"position\"]\n        sizes2 = node2[\"size\"]\n\n        x1 = int(coordinates1[0] + sizes1[0] // 2)\n        y1 = int(coordinates1[1] + sizes1[1] // 2)\n\n        x2 = int(coordinates2[0] + sizes2[0] // 2)\n        y2 = int(coordinates2[1] + sizes2[1] // 2)\n\n        command = \"import pyautogui; \"\n\n        command += f\"pyautogui.moveTo({x1}, {y1}); \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"pyautogui.dragTo({x2}, {y2}, duration=1.0); pyautogui.mouseUp(); \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n\n        return command\n\n    @agent_action\n    def scroll(self, element_id: int, clicks: int):\n        \"\"\"Scroll in the specified direction inside the specified element\n        Args:\n            element_id:int ID of the element to scroll in\n            clicks:int the number of clicks to scroll can be positive (up) or negative (down).\n        \"\"\"\n        try:\n            node = self.find_element(element_id)\n        except:\n            node = self.find_element(0)\n\n        coordinates = node[\"position\"]\n        sizes = node[\"size\"]\n\n        x = int(coordinates[0] + sizes[0] 
// 2)\n        y = int(coordinates[1] + sizes[1] // 2)\n        command = (\n            f\"import pyautogui; pyautogui.moveTo({x}, {y}); pyautogui.scroll({clicks})\"\n        )\n        return command\n\n    @agent_action\n    def hotkey(self, keys: List[str]):\n        \"\"\"Press a hotkey combination\n        Args:\n            keys:List[str] the keys to press in combination in a list format (e.g. ['shift', 'c'])\n        \"\"\"\n        keys = [_normalize_key(k) for k in keys]\n        keys = [f\"'{key}'\" for key in keys]\n        command = f\"import pyautogui; pyautogui.hotkey({', '.join(keys)}, interval=0.5)\"\n        return command\n\n    @agent_action\n    def hold_and_press(self, hold_keys: List[str], press_keys: List[str]):\n        \"\"\"Hold a list of keys and press a list of keys\n        Args:\n            hold_keys:List[str], list of keys to hold\n            press_keys:List[str], list of keys to press in a sequence\n        \"\"\"\n        hold_keys = [_normalize_key(k) for k in hold_keys]\n        press_keys = [_normalize_key(k) for k in press_keys]\n\n        press_keys_str = \"[\" + \", \".join([f\"'{key}'\" for key in press_keys]) + \"]\"\n        command = \"import pyautogui; \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"pyautogui.press({press_keys_str}); \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n\n        return command\n\n    @agent_action\n    def wait(self, time: float):\n        \"\"\"Wait for a specified amount of time\n        Args:\n            time:float the amount of time to wait in seconds\n        \"\"\"\n        command = f\"import time; time.sleep({time})\"\n        return command\n\n    @agent_action\n    def done(self):\n        \"\"\"End the current task with a success\"\"\"\n        return \"\"\"DONE\"\"\"\n\n    @agent_action\n    def fail(self):\n        \"\"\"End the current task with a failure\"\"\"\n     
   return \"\"\"FAIL\"\"\"\n\n\n# UIElement Class\nclass UIElement:\n    def __init__(self, element=None):\n        if isinstance(element, pywinauto.application.WindowSpecification):\n            self.element = element.wrapper_object()\n        else:\n            self.element = element  # This should be a control wrapper\n\n    def get_attribute_names(self):\n        return list(self.element.element_info.get_properties().keys())\n\n    def attribute(self, key: str):\n        props = self.element.element_info.get_properties()\n        return props.get(key, None)\n\n    def children(self):\n        try:\n            return [UIElement(child) for child in self.element.children()]\n        except Exception as e:\n            print(f\"Error accessing children: {e}\")\n            return []\n\n    def role(self):\n        return self.element.element_info.control_type\n\n    def position(self):\n        rect = self.element.rectangle()\n        return (rect.left, rect.top)\n\n    def size(self):\n        rect = self.element.rectangle()\n        return (rect.width(), rect.height())\n\n    def title(self):\n        return self.element.element_info.name\n\n    def text(self):\n        return self.element.window_text()\n\n    def isValid(self):\n        return self.position() is not None and self.size() is not None\n\n    def parse(self):\n        position = self.position()\n        size = self.size()\n        return {\n            \"position\": position,\n            \"size\": size,\n            \"title\": self.title(),\n            \"text\": self.text(),\n            \"role\": self.role(),\n        }\n\n    @staticmethod\n    def get_current_applications(obs: Dict):\n        apps = []\n        for proc in psutil.process_iter([\"pid\", \"name\"]):\n            apps.append(proc.info[\"name\"])\n        return apps\n\n    @staticmethod\n    def get_top_app(obs: Dict):\n        hwnd = win32gui.GetForegroundWindow()\n        _, pid = win32process.GetWindowThreadProcessId(hwnd)\n   
     for proc in psutil.process_iter([\"pid\", \"name\"]):\n            if proc.info[\"pid\"] == pid:\n                return proc.info[\"name\"]\n        return None\n\n    @staticmethod\n    def list_apps_in_directories():\n        return list_apps_in_directories()\n\n    @staticmethod\n    def systemWideElement():\n        desktop = Desktop(backend=\"uia\")\n        return UIElement(desktop)\n\n    def __repr__(self):\n        return f\"UIElement({self.element})\"\n"
  },
  {
    "path": "gui_agents/s1/aci/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s1/aci/windowsagentarena/GroundingAgent.py",
    "content": "import base64\nimport logging\nimport os\nimport time\nimport xml.etree.ElementTree as ET\nfrom typing import Dict, List, Tuple\nimport numpy as np\nimport requests\nfrom gui_agents.s1.utils.common_utils import box_iou\n\nlogger = logging.getLogger(\"desktopenv.agent\")\n\n\nstate_ns = \"uri:deskat:state.at-spi.gnome.org\"\ncomponent_ns = \"uri:deskat:component.at-spi.gnome.org\"\n\n\n# Agent action decorator\ndef agent_action(func):\n    func.is_agent_action = True\n    return func\n\n\nclass GroundingAgent:\n    def __init__(self, vm_version: str, top_app=None, top_app_only=True, ocr=True):\n        self.active_apps = set()\n        self.top_app = top_app\n        self.top_app_only = (\n            top_app_only  # Only include top app in the accessibility tree\n        )\n        self.ocr = ocr\n        self.index_out_of_range_flag = False\n        self.app_setup_code = f\"\"\"import subprocess;\nimport difflib;\nimport pyautogui;\npyautogui.press('escape');\ntime.sleep(0.5);\noutput = subprocess.check_output(['wmctrl', '-lx']);\noutput = output.decode('utf-8').splitlines();\nwindow_titles = [line.split(None, 4)[2] for line in output];\nclosest_matches = difflib.get_close_matches('APP_NAME', window_titles, n=1, cutoff=0.1);\nif closest_matches:\n    closest_match = closest_matches[0];\n    for line in output:\n        if closest_match in line:\n            window_id = line.split()[0]\n            break;\nsubprocess.run(['wmctrl', '-ia', window_id])\nsubprocess.run(['wmctrl', '-ir', window_id, '-b', 'add,maximized_vert,maximized_horz'])\n\"\"\"\n\n        self.top_active_app = None\n        self.notes = []\n        self.clipboard = \"\"\n\n        # TODO: this is terrible, fix this\n        # global state_ns, component_ns, attributes_ns, value_ns\n        # if vm_version == \"old\":\n        #     state_ns = \"uri:deskat:state.at-spi.gnome.org\"\n        #     component_ns = \"uri:deskat:component.at-spi.gnome.org\"\n        # elif vm_version == 
'win':\n        #     state_ns = \"uri:deskat:state.at-spi.gnome.org\"\n        #     component_ns = \"uri:deskat:component.at-spi.gnome.org\"\n        # else:\n        #     attributes_ns = \"https://accessibility.windows.example.org/ns/attributes\"\n        #     state_ns = \"https://accessibility.ubuntu.example.org/ns/state\"\n        #     component_ns = \"https://accessibility.ubuntu.example.org/ns/component\"\n        #     value_ns = \"https://accessibility.ubuntu.example.org/ns/value\"\n\n    def get_current_applications(self, obs):\n        tree = ET.ElementTree(ET.fromstring(obs[\"accessibility_tree\"]))\n        apps = []\n        root = tree.getroot()\n        for item in root:\n            apps.append(item.get(\"name\", \"\").replace(\"\\\\\", \"\"))\n        return apps\n\n    def check_new_apps(self, old_apps, new_apps):\n        return new_apps - old_apps\n\n    def find_active_applications(self, tree):\n        # names of applications to keep TODO: soffice is a single application with all the isntances like impress, calc etc. 
being frames this will need to be dealt with separately\n        to_keep = [\"Program Manager\"]\n        apps_with_active_tag = []\n        for application in list(tree.getroot()):\n            app_name = application.get(\"name\")\n            for frame in application:\n                is_active = frame.attrib.get(\"{{{:}}}active\".format(state_ns), \"false\")\n                if is_active == \"true\":\n                    apps_with_active_tag.append(app_name)\n        print(apps_with_active_tag)\n        if apps_with_active_tag:\n            to_keep.append(apps_with_active_tag[-1])\n        return to_keep\n\n    def filter_active_app(self, tree):\n        for application in list(tree.getroot()):\n            app_name = application.attrib.get(\"name\")\n            for frame in application:\n                is_active = frame.attrib.get(\"{{{:}}}active\".format(state_ns), \"false\")\n                if is_active == \"true\":\n                    return app_name\n        return None\n\n    def filter_nodes(self, tree, show_all=False):\n        # created and populate a preserved nodes list which filters out unnecessary elements and keeps only those elements which are currently showing on the screen\n        # TODO: include offscreen elements and then scroll to them before clicking\n        preserved_nodes = []\n        exclude_tags = [\"panel\", \"window\", \"filler\", \"frame\", \"separator\", \"scroll-bar\"]\n\n        for node in tree.iter():\n            if node.tag not in exclude_tags:\n                if show_all:\n                    if node.attrib.get(f\"{{{state_ns}}}enabled\") == \"true\":\n                        coords: Tuple[int, int] = eval(\n                            node.get(\n                                \"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\"\n                            )\n                        )\n                        if coords[0] >= 0 and coords[1] >= 0:\n                            preserved_nodes.append(node)\n        
        # if show_all is false, only show elements that are currently showing on screen\n                else:\n                    if node.attrib.get(f\"{{{state_ns}}}visible\") == \"true\":\n                        coords: Tuple[int, int] = eval(\n                            node.get(\n                                \"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\"\n                            )\n                        )\n\n                        if coords[0] >= 0 and coords[1] >= 0:\n                            preserved_nodes.append(node)\n        return preserved_nodes\n\n    def linearize_tree(self, preserved_nodes):\n        # TODO: Run an ablation to check if class and desc\n        # linearized_accessibility_tree = [\"id\\ttag\\tname\\ttext\\tclass\\tdescription\"]\n        linearized_accessibility_tree = [\"id\\ttag\\tname\\ttext\"]\n        for idx, node in enumerate(preserved_nodes):\n            if node.text:\n                text = (\n                    node.text\n                    if '\"' not in node.text\n                    else '\"{:}\"'.format(node.text.replace('\"', '\"\"'))\n                )\n            else:\n                text = '\"\"'\n\n            linearized_accessibility_tree.append(\n                \"{:}\\t{:}\\t{:}\\t{:}\".format(\n                    idx,\n                    node.tag,\n                    node.get(\"name\", \"\"),\n                    text,\n                    # node.get(\"{{{:}}}class\".format(attributes_ns), \"\"),\n                    # node.get(\"{{{:}}}description\".format(attributes_ns), \"\"),\n                )\n            )\n\n        # returning list of linearized elements\n        return linearized_accessibility_tree\n\n    def extract_elements_from_screenshot(self, screenshot) -> Dict:\n        \"\"\"Uses paddle-ocr to extract elements with text from the screenshot. 
The elements will be added to the linearized accessibility tree downstream\"\"\"\n\n        # Convert screenshot to PIL image\n        def send_image_to_ocr(screenshot) -> Dict:\n\n            # url = os.environ.get(\"OCR_SERVER_ADDRESS\", \"\")\n            url = \"http://127.0.0.1:8083/ocr/\"\n            if url == \"\":\n                raise Exception(\"OCR SERVER ADDRESS NOT SET\")\n            encoded_screenshot = base64.b64encode(screenshot).decode(\"utf-8\")\n            data = {\"img_bytes\": encoded_screenshot}\n            response = requests.post(url, json=data)\n\n            if response.status_code == 200:\n                return response.json()\n            else:\n                return {\n                    \"error\": f\"Request failed with status code {response.status_code}\",\n                    \"results\": [],\n                }\n\n        return send_image_to_ocr(screenshot)[\"results\"]\n\n    def add_ocr_elements(\n        self, screenshot, linearized_accessibility_tree, preserved_nodes\n    ):\n        # Get the bounding boxes of the elements in the linearized accessibility tree\n        tree_bboxes = []\n        for node in preserved_nodes:\n            coordinates: Tuple[int, int] = eval(\n                node.get(\"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\")\n            )\n            sizes: Tuple[int, int] = eval(\n                node.get(\"{{{:}}}size\".format(component_ns), \"(-1, -1)\")\n            )\n            tree_bboxes.append(\n                [\n                    coordinates[0],\n                    coordinates[1],\n                    coordinates[0] + sizes[0],\n                    coordinates[1] + sizes[1],\n                ]\n            )\n\n        # Use OCR to found boxes that might be missing from the accessibility tree\n        try:\n            ocr_bboxes = self.extract_elements_from_screenshot(screenshot)\n        except Exception as e:\n            print(f\"Error: {e}\")\n            ocr_bboxes = 
[]\n        else:\n            # Check for intersection over union between the existing atree bounding boxes and the ocr bounding boxes, if ocr bounding boxes are new add them to the linearized accesibility tree\n            if (\n                len(ocr_bboxes) > 0\n            ):  # Only check IOUs and add if there are any bounding boxes returned by the ocr module\n                preserved_nodes_index = len(preserved_nodes)\n                for ind, (i, content, box) in enumerate(ocr_bboxes):\n                    # x1, y1, x2, y2 = int(box.get('left', 0)), int(box['top']), int(), int(box['bottom'])\n                    (\n                        x1,\n                        y1,\n                        x2,\n                        y2,\n                    ) = (\n                        int(box.get(\"left\", 0)),\n                        int(box.get(\"top\", 0)),\n                        int(box.get(\"right\", 0)),\n                        int(box.get(\"bottom\", 0)),\n                    )\n                    iou = box_iou(\n                        np.array(tree_bboxes, dtype=np.float32),\n                        np.array([[x1, y1, x2, y2]], dtype=np.float32),\n                    ).flatten()\n\n                    if max(iou) < 0.1:\n                        # Add the element to the linearized accessibility tree\n                        # TODO: ocr detected elements should be classified for their tag, currently set to push button for the agent to think they are interactable\n                        linearized_accessibility_tree.append(\n                            f\"{preserved_nodes_index}\\tpush-button\\t\\t{content}\\t\\t\"\n                        )\n\n                        # add to preserved node with the component_ns prefix node.get(\"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\"\n                        node = ET.Element(\n                            \"ocr_node\",\n                            attrib={\n                                
\"text\": content,\n                                \"{{{}}}screencoord\".format(\n                                    component_ns\n                                ): \"({},{})\".format(x1, y1),\n                                \"{{{}}}size\".format(component_ns): \"({},{})\".format(\n                                    x2 - x1, y2 - y1\n                                ),\n                            },\n                        )\n                        preserved_nodes.append(node)\n                        preserved_nodes_index += 1\n\n        return linearized_accessibility_tree, preserved_nodes\n\n    def linearize_and_annotate_tree(self, obs, show_all=False):\n        accessibility_tree = obs[\"accessibility_tree\"]\n        screenshot = obs[\"screenshot\"]\n\n        # convert the accessibility tree from a string representation to an xml tree\n        tree = ET.ElementTree(ET.fromstring(accessibility_tree))\n\n        # Get the applications to keep based on the active applications\n        to_keep = self.find_active_applications(tree)\n        self.top_app = to_keep[-1]\n\n        # Remove applications which are not included in the to_keep list\n        if not show_all:\n            for application in list(tree.getroot()):\n                if application.attrib.get(\"name\", \"\") not in to_keep:\n                    tree.getroot().remove(application)\n\n        # Save tree for debugging\n        # from datetime import datetime\n        # with open(f\"tree_raw_{datetime.now()}.xml\", \"wb\") as file:\n        #     tree.write(file, encoding=\"utf-8\", xml_declaration=True)\n\n        # Filter out filler elements and overlapping elements\n        preserved_nodes = self.filter_nodes(tree, show_all)\n\n        assert len(preserved_nodes) > 0\n\n        # Linearize the tree as tsv\n        linearized_accessibility_tree = self.linearize_tree(preserved_nodes)\n\n        # Add OCR elements to the linearized accessibility tree to account for elements that are not in 
the accessibility tree\n        if self.ocr:\n            linearized_accessibility_tree, preserved_nodes = self.add_ocr_elements(\n                screenshot, linearized_accessibility_tree, preserved_nodes\n            )\n\n        # Convert accessibility tree to a string\n        linearized_accessibility_tree = \"\\n\".join(linearized_accessibility_tree)\n\n        # TODO: side-effect, set in separate functions\n        self.nodes = preserved_nodes\n\n        return linearized_accessibility_tree\n\n    def find_element(self, element_id):\n        try:\n            selected_element = self.nodes[int(element_id)]\n        except:\n            print(\"The index of the selected element was out of range.\")\n            selected_element = self.nodes[0]\n            self.index_out_of_range_flag = True\n        return selected_element\n\n    @agent_action\n    def click(\n        self,\n        element_id: int,\n        num_clicks: int = 1,\n        button_type: str = \"left\",\n        hold_keys: List = [],\n    ):\n        \"\"\"Click on the element\n        Args:\n            element_id:int, ID of the element to click on\n            num_clicks:int, number of times to click the element\n            button_type:str, which mouse button to press can be \"left\", \"middle\", or \"right\"\n            hold_keys:List, list of keys to hold while clicking\n        \"\"\"\n        node = self.find_element(element_id)\n        coordinates: Tuple[int, int] = eval(\n            node.get(\"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\")\n        )\n        sizes: Tuple[int, int] = eval(\n            node.get(\"{{{:}}}size\".format(component_ns), \"(-1, -1)\")\n        )\n\n        # Calculate the center of the element\n        x = coordinates[0] + sizes[0] // 2\n        y = coordinates[1] + sizes[1] // 2\n\n        command = \"import pyautogui; \"\n\n        # TODO: specified duration?\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); 
\"\n        command += f\"\"\"import pyautogui; pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); \"\"\"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n        # Return pyautoguicode to click on the element\n        return command\n\n    @agent_action\n    def switch_window(self):\n        \"\"\"Switch to a different application that is already open\"\"\"\n        # return self.app_setup_code.replace(\"APP_NAME\", app_code)\n        return f\"import pyautogui; pyautogui.hotkey('alt', 'tab');\"\n\n    @agent_action\n    def type(\n        self,\n        text: str,\n        element_id: int = None,\n        overwrite: bool = False,\n        enter: bool = False,\n    ):\n        \"\"\"Type text into the element\n        Args:\n            text:str the text to type\n            element_id:int ID of the element to type into. If not provided, typing will start at the current cursor location.\n            overwrite:bool Assign it to True if the text should overwrite the existing text, otherwise assign it to False. 
Using this argument clears all text in an element.\n            enter:bool Assign it to True if the enter key should be pressed after typing the text, otherwise assign it to False.\n        \"\"\"\n        try:\n            # Use the provided element_id or default to None\n            node = self.find_element(element_id) if element_id is not None else None\n        except:\n            node = None\n\n        if node is not None:\n            # If a node is found, retrieve its coordinates and size\n            coordinates = eval(\n                node.get(\"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\")\n            )\n            sizes = eval(node.get(\"{{{:}}}size\".format(component_ns), \"(-1, -1)\"))\n\n            # Calculate the center of the element\n            x = coordinates[0] + sizes[0] // 2\n            y = coordinates[1] + sizes[1] // 2\n\n            # Start typing at the center of the element\n            command = \"import pyautogui; \"\n            command += f\"pyautogui.click({x}, {y}); \"\n\n            if overwrite:\n                command += (\n                    f\"pyautogui.hotkey('ctrl', 'a'); pyautogui.press('backspace'); \"\n                )\n\n            command += f\"pyautogui.write({repr(text)}); \"\n\n            if enter:\n                command += \"pyautogui.press('enter'); \"\n        else:\n            # If no element is found, start typing at the current cursor location\n            command = \"import pyautogui; \"\n\n            if overwrite:\n                command += (\n                    f\"pyautogui.hotkey('ctrl', 'a'); pyautogui.press('backspace'); \"\n                )\n\n            command += f\"pyautogui.write({repr(text)}); \"\n\n            if enter:\n                command += \"pyautogui.press('enter'); \"\n\n        return command\n\n        # if overwrite:\n        #     return f\"\"\"import pyautogui; pyautogui.click({x}, {y}); pyautogui.hotkey(\"ctrl\", \"a\"); pyautogui.press(\"backspace\"); 
pyautogui.typewrite({repr(text)})\"\"\"\n        # else:\n        #     return f\"\"\"import pyautogui; pyautogui.click({x}, {y}); pyautogui.hotkey(\"ctrl\", \"a\"); pyautogui.press(\"backspace\"); pyautogui.typewrite(\"{text}\")\"\"\"\n\n    # @agent_action\n    # def type_and_enter(self, element_id:int, text:str, overwrite: bool = True):\n    #     '''Type text into the element and press enter\n    #     Args:\n    #         element_id:int ID of the element to type into\n    #         text:str the text to type into the element\n    #     '''\n    #     try:\n    #         node = self.find_element(element_id)\n    #     except:\n    #         node = self.find_element(0)\n    #     # print(node.attrib)\n    #     coordinates = eval(\n    #         node.get(\"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\"))\n    #     sizes = eval(node.get(\"{{{:}}}size\".format(component_ns), \"(-1, -1)\"))\n\n    #     # Calculate the center of the element\n    #     x = coordinates[0] + sizes[0] // 2\n    #     y = coordinates[1] + sizes[1] // 2\n\n    #     # Return pyautoguicode to type into the element\n    #     if overwrite:\n    #         return f\"\"\"import pyautogui; pyautogui.click({x}, {y}); pyautogui.hotkey(\"ctrl\", \"a\"); pyautogui.press(\"backspace\"); pyautogui.typewrite({repr(text)}); pyautogui.press(\"enter\")\"\"\"\n    #     else:\n    #         return f\"\"\"import pyautogui; pyautogui.click({x}, {y}); pyautogui.typewrite({repr(text)}); pyautogui.press(\"enter\")\"\"\"\n\n    # @agent_action\n    # def copy_text(self, element_id:int):\n    #     '''Copy the selected text, use instead of ctrl+c\n    #     Args:\n    #         element_id:int ID of the element to copy text from\n    #     '''\n    #     try:\n    #         node = self.find_element(element_id)\n    #     except:\n    #         node = self.find_element(0)\n\n    #     self.clipboard = node.text\n\n    # @agent_action\n    # def paste_text(self, element_id:int, overwrite: bool = True):\n  
  #     '''Paste text from the clipboard into the element, use instead of ctrl+v\n    #     Args:\n    #         element_id:int ID of the element to copy text from\n    #         overwrite:bool a boolean value to determine if the text should be pasted over the existing text or appended to it\n    #     '''\n    #     try:\n    #         node = self.find_element(element_id)\n    #     except:\n    #         node = self.find_element(0)\n\n    #     coordinates = eval(\n    #         node.get(\"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\"))\n    #     sizes = eval(node.get(\"{{{:}}}size\".format(component_ns), \"(-1, -1)\"))\n\n    #     # Calculate the center of the element\n    #     x = coordinates[0] + sizes[0] // 2\n    #     y = coordinates[1] + sizes[1] // 2\n\n    #     # Return pyautoguicode to paste into the element\n    #     if overwrite:\n    #         return f\"\"\"import pyautogui; pyautogui.click({x}, {y}); pyautogui.typewrite(\"{self.clipboard}\");\"\"\"\n    #     else:\n    #         return f\"\"\"import pyautogui; pyautogui.click({x}, {y}); pyautogui.hotkey(\"ctrl\", \"a\"); pyautogui.press(\"backspace\"); pyautogui.typewrite(\"{self.clipboard}\");\"\"\"\n\n    @agent_action\n    def save_to_knowledge(self, text: List[str]):\n        \"\"\"Save facts, elements, texts, etc. to a long-term knowledge bank for reuse during this task. 
Can be used for copy-pasting text, saving elements, etc.\n        Args:\n            text:List[str] the text to save to the knowledge\n        \"\"\"\n        self.notes.extend(text)\n        return \"\"\"WAIT\"\"\"\n\n    @agent_action\n    def drag_and_drop(self, drag_from_id: int, drop_on_id: int, hold_keys: List = []):\n        \"\"\"Drag element1 and drop it on element2.\n        Args:\n            drag_from_id:int ID of element to drag\n            drop_on_id:int ID of element to drop on\n            hold_keys:List list of keys to hold while dragging\n        \"\"\"\n        node1 = self.find_element(drag_from_id)\n        node2 = self.find_element(drop_on_id)\n        coordinates1 = eval(\n            node1.get(\"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\")\n        )\n        sizes1 = eval(node1.get(\"{{{:}}}size\".format(component_ns), \"(-1, -1)\"))\n\n        coordinates2 = eval(\n            node2.get(\"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\")\n        )\n        sizes2 = eval(node2.get(\"{{{:}}}size\".format(component_ns), \"(-1, -1)\"))\n\n        # Calculate the center of the element\n        x1 = coordinates1[0] + sizes1[0] // 2\n        y1 = coordinates1[1] + sizes1[1] // 2\n\n        x2 = coordinates2[0] + sizes2[0] // 2\n        y2 = coordinates2[1] + sizes2[1] // 2\n\n        command = \"import pyautogui; \"\n\n        command += f\"pyautogui.moveTo({x1}, {y1}); \"\n        # TODO: specified duration?\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"pyautogui.dragTo({x2}, {y2}, duration=1.); pyautogui.mouseUp(); \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n\n        # Return pyautoguicode to drag and drop the elements\n\n        return command\n\n    @agent_action\n    def scroll(self, element_id: int, clicks: int):\n        \"\"\"Scroll the element in the specified direction\n        Args:\n            
element_id:int ID of the element to scroll in\n            clicks:int the number of clicks to scroll can be positive (up) or negative (down).\n        \"\"\"\n        try:\n            node = self.find_element(element_id)\n        except:\n            node = self.find_element(0)\n        # print(node.attrib)\n        coordinates = eval(\n            node.get(\"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\")\n        )\n        sizes = eval(node.get(\"{{{:}}}size\".format(component_ns), \"(-1, -1)\"))\n\n        # Calculate the center of the element\n        x = coordinates[0] + sizes[0] // 2\n        y = coordinates[1] + sizes[1] // 2\n        return (\n            f\"import pyautogui; pyautogui.moveTo({x}, {y}); pyautogui.scroll({clicks})\"\n        )\n\n    @agent_action\n    def hotkey(self, keys: List):\n        \"\"\"Press a hotkey combination\n        Args:\n            keys:List the keys to press in combination in a list format (e.g. ['ctrl', 'c'])\n        \"\"\"\n        # add quotes around the keys\n        keys = [f\"'{key}'\" for key in keys]\n        return f\"import pyautogui; pyautogui.hotkey({', '.join(keys)})\"\n\n    @agent_action\n    def hold_and_press(self, hold_keys: List, press_keys: List):\n        \"\"\"Hold a list of keys and press a list of keys\n        Args:\n            hold_keys:List, list of keys to hold\n            press_keys:List, list of keys to press in a sequence\n        \"\"\"\n\n        press_keys_str = \"[\" + \", \".join([f\"'{key}'\" for key in press_keys]) + \"]\"\n        command = \"import pyautogui; \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"pyautogui.press({press_keys_str}); \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n\n        return command\n\n    @agent_action\n    def wait(self, time: float):\n        \"\"\"Wait for a specified amount of time\n        Args:\n            time:float the 
amount of time to wait in seconds\n        \"\"\"\n        return f\"\"\"import time; time.sleep({time})\"\"\"\n\n    @agent_action\n    def done(self):\n        \"\"\"End the current task with a success\"\"\"\n        return \"\"\"DONE\"\"\"\n\n    @agent_action\n    def fail(self):\n        \"\"\"End the current task with a failure\"\"\"\n        return \"\"\"FAIL\"\"\"\n"
  },
  {
    "path": "gui_agents/s1/cli_app.py",
    "content": "import argparse\nimport datetime\nimport io\nimport logging\nimport os\nimport platform\nimport signal\nimport sys\nimport time\n\nimport pyautogui\n\nfrom gui_agents.s1.core.AgentS import GraphSearchAgent, UIAgent\n\ncurrent_platform = platform.system().lower()\n\n# Global flag to track pause state for debugging\npaused = False\n\n\ndef get_char():\n    \"\"\"Get a single character from stdin without pressing Enter\"\"\"\n    try:\n        # Import termios and tty on Unix-like systems\n        if platform.system() in [\"Darwin\", \"Linux\"]:\n            import termios\n            import tty\n\n            fd = sys.stdin.fileno()\n            old_settings = termios.tcgetattr(fd)\n            try:\n                tty.setraw(sys.stdin.fileno())\n                ch = sys.stdin.read(1)\n            finally:\n                termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)\n            return ch\n        else:\n            # Windows fallback\n            import msvcrt\n\n            return msvcrt.getch().decode(\"utf-8\", errors=\"ignore\")\n    except:\n        return input()  # Fallback for non-terminal environments\n\n\ndef signal_handler(signum, frame):\n    \"\"\"Handle Ctrl+C signal for debugging during agent execution\"\"\"\n    global paused\n\n    if not paused:\n        print(\"\\n\\n🔸 Agent-S Workflow Paused 🔸\")\n        print(\"=\" * 50)\n        print(\"Options:\")\n        print(\"  • Press Ctrl+C again to quit\")\n        print(\"  • Press Esc to resume workflow\")\n        print(\"=\" * 50)\n\n        paused = True\n\n        while paused:\n            try:\n                print(\"\\n[PAUSED] Waiting for input... 
\", end=\"\", flush=True)\n                char = get_char()\n\n                if ord(char) == 3:  # Ctrl+C\n                    print(\"\\n\\n🛑 Exiting Agent-S...\")\n                    sys.exit(0)\n                elif ord(char) == 27:  # Esc\n                    print(\"\\n\\n▶️  Resuming Agent-S workflow...\")\n                    paused = False\n                    break\n                else:\n                    print(f\"\\n   Unknown command: '{char}' (ord: {ord(char)})\")\n\n            except KeyboardInterrupt:\n                print(\"\\n\\n🛑 Exiting Agent-S...\")\n                sys.exit(0)\n    else:\n        # Already paused, second Ctrl+C means quit\n        print(\"\\n\\n🛑 Exiting Agent-S...\")\n        sys.exit(0)\n\n\n# Set up signal handler for Ctrl+C\nsignal.signal(signal.SIGINT, signal_handler)\n\nif current_platform == \"darwin\":\n    from gui_agents.s1.aci.MacOSACI import MacOSACI, UIElement\nelif current_platform == \"linux\":\n    from gui_agents.s1.aci.LinuxOSACI import LinuxACI, UIElement\nelif current_platform == \"windows\":\n    from gui_agents.s1.aci.WindowsOSACI import WindowsACI, UIElement\nelse:\n    raise ValueError(f\"Unsupported platform: {current_platform}\")\n\nlogger = logging.getLogger()\nlogger.setLevel(logging.DEBUG)\n\ndatetime_str: str = datetime.datetime.now().strftime(\"%Y%m%d@%H%M%S\")\n\nlog_dir = \"logs\"\nos.makedirs(log_dir, exist_ok=True)\n\nfile_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"normal-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\ndebug_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"debug-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\nstdout_handler = logging.StreamHandler(sys.stdout)\nsdebug_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"sdebug-{:}.log\".format(datetime_str)), 
encoding=\"utf-8\"\n)\n\nfile_handler.setLevel(logging.INFO)\ndebug_handler.setLevel(logging.DEBUG)\nstdout_handler.setLevel(logging.INFO)\nsdebug_handler.setLevel(logging.DEBUG)\n\nformatter = logging.Formatter(\n    fmt=\"\\x1b[1;33m[%(asctime)s \\x1b[31m%(levelname)s \\x1b[32m%(module)s/%(lineno)d-%(processName)s\\x1b[1;33m] \\x1b[0m%(message)s\"\n)\nfile_handler.setFormatter(formatter)\ndebug_handler.setFormatter(formatter)\nstdout_handler.setFormatter(formatter)\nsdebug_handler.setFormatter(formatter)\n\nstdout_handler.addFilter(logging.Filter(\"desktopenv\"))\nsdebug_handler.addFilter(logging.Filter(\"desktopenv\"))\n\nlogger.addHandler(file_handler)\nlogger.addHandler(debug_handler)\nlogger.addHandler(stdout_handler)\nlogger.addHandler(sdebug_handler)\n\nplatform_os = platform.system()\n\n\ndef show_permission_dialog(code: str, action_description: str):\n    \"\"\"Show a platform-specific permission dialog and return True if approved.\"\"\"\n    if platform.system() == \"Darwin\":\n        result = os.system(\n            f'osascript -e \\'display dialog \"Do you want to execute this action?\\n\\n{code} which will try to {action_description}\" with title \"Action Permission\" buttons {{\"Cancel\", \"OK\"}} default button \"OK\" cancel button \"Cancel\"\\''\n        )\n        return result == 0\n    elif platform.system() == \"Linux\":\n        result = os.system(\n            f'zenity --question --title=\"Action Permission\" --text=\"Do you want to execute this action?\\n\\n{code}\" --width=400 --height=200'\n        )\n        return result == 0\n    return False\n\n\ndef run_agent(agent: UIAgent, instruction: str):\n    global paused\n    obs = {}\n    traj = \"Task:\\n\" + instruction\n    subtask_traj = \"\"\n    for step in range(15):\n        # Check if we're in paused state and wait\n        while paused:\n            time.sleep(0.1)\n        obs[\"accessibility_tree\"] = UIElement.systemWideElement()\n\n        # Get screen shot using pyautogui.\n   
     # Take a screenshot\n        screenshot = pyautogui.screenshot()\n\n        # Save the screenshot to a BytesIO object\n        buffered = io.BytesIO()\n        screenshot.save(buffered, format=\"PNG\")\n\n        # Get the byte value of the screenshot\n        screenshot_bytes = buffered.getvalue()\n        # Store the raw PNG bytes (not base64-encoded) as the screenshot observation.\n        obs[\"screenshot\"] = screenshot_bytes\n\n        # Check again for pause state before prediction\n        while paused:\n            time.sleep(0.1)\n\n        print(f\"\\n🔄 Step {step + 1}/15: Getting next action from agent...\")\n\n        # Get next action code from the agent\n        info, code = agent.predict(instruction=instruction, observation=obs)\n\n        if \"done\" in code[0].lower() or \"fail\" in code[0].lower():\n            if platform.system() == \"Darwin\":\n                os.system(\n                    f'osascript -e \\'display dialog \"Task Completed\" with title \"OpenACI Agent\" buttons \"OK\" default button \"OK\"\\''\n                )\n            elif platform.system() == \"Linux\":\n                os.system(\n                    f'zenity --info --title=\"OpenACI Agent\" --text=\"Task Completed\" --width=200 --height=100'\n                )\n\n            agent.update_narrative_memory(traj)\n            break\n\n        if \"next\" in code[0].lower():\n            continue\n\n        if \"wait\" in code[0].lower():\n            print(\"⏳ Agent requested wait...\")\n            time.sleep(5)\n            continue\n\n        else:\n            time.sleep(1.0)\n            print(\"EXECUTING CODE:\", code[0])\n\n            # Check for pause state before execution\n            while paused:\n                time.sleep(0.1)\n\n            # NOTE: the generated code is executed directly; no permission prompt is shown here\n            exec(code[0])\n            time.sleep(1.0)\n\n            # Update task and subtask trajectories and optionally the episodic memory\n            traj += (\n                \"\\n\\nReflection:\\n\"\n             
   + str(info[\"reflection\"])\n                + \"\\n\\n----------------------\\n\\nPlan:\\n\"\n                + info[\"executor_plan\"]\n            )\n            subtask_traj = agent.update_episodic_memory(info, subtask_traj)\n\n\ndef main():\n    parser = argparse.ArgumentParser(\n        description=\"Run GraphSearchAgent with specified model.\"\n    )\n    parser.add_argument(\n        \"--model\",\n        type=str,\n        default=\"gpt-4o-mini\",\n        help=\"Specify the model to use (e.g., gpt-4o)\",\n    )\n    args = parser.parse_args()\n\n    if current_platform == \"Darwin\":\n        grounding_agent = MacOSACI()\n    elif current_platform == \"Windows\":\n        grounding_agent = WindowsACI()\n    elif current_platform == \"Linux\":\n        grounding_agent = LinuxACI()\n    else:\n        raise ValueError(\"Unsupported platform\")\n\n    while True:\n        query = input(\"Query: \")\n        if \"gpt\" in args.model:\n            engine_type = \"openai\"\n        elif \"claude\" in args.model:\n            engine_type = \"anthropic\"\n        engine_params = {\n            \"engine_type\": engine_type,\n            \"model\": args.model,\n        }\n\n        agent = GraphSearchAgent(\n            engine_params,\n            grounding_agent,\n            platform=current_platform,\n            action_space=\"pyautogui\",\n            observation_type=\"mixed\",\n        )\n\n        agent.reset()\n\n        # Run the agent on your own device\n        run_agent(agent, query)\n\n        response = input(\"Would you like to provide another query? (y/n): \")\n        if response.lower() != \"y\":\n            break\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "gui_agents/s1/core/AgentS.py",
    "content": "import json\nimport logging\nimport os\nfrom typing import Dict, List, Optional, Tuple\nimport platform\n\nfrom gui_agents.s1.aci.ACI import ACI\nfrom gui_agents.s1.core.Manager import Manager\nfrom gui_agents.s1.core.Worker import Worker\nfrom gui_agents.s1.utils.common_utils import Node\nfrom gui_agents.utils import download_kb_data\n\nlogger = logging.getLogger(\"desktopenv.agent\")\n\n\nclass UIAgent:\n    \"\"\"Base class for UI automation agents\"\"\"\n\n    def __init__(\n        self,\n        engine_params: Dict,\n        grounding_agent: ACI,\n        platform: str = platform.system().lower(),\n        action_space: str = \"pyautogui\",\n        observation_type: str = \"a11y_tree\",\n        search_engine: str = \"perplexica\",\n    ):\n        \"\"\"Initialize UIAgent\n\n        Args:\n            engine_params: Configuration parameters for the LLM engine\n            grounding_agent: Instance of ACI class for UI interaction\n            platform: Operating system platform (macos, linux, windows)\n            action_space: Type of action space to use (pyautogui, aci)\n            observation_type: Type of observations to use (a11y_tree, mixed)\n            engine: Search engine to use (perplexica, LLM)\n        \"\"\"\n        self.engine_params = engine_params\n        self.grounding_agent = grounding_agent\n        self.platform = platform\n        self.action_space = action_space\n        self.observation_type = observation_type\n        self.engine = search_engine\n\n    def reset(self) -> None:\n        \"\"\"Reset agent state\"\"\"\n        pass\n\n    def predict(self, instruction: str, observation: Dict) -> Tuple[Dict, List[str]]:\n        \"\"\"Generate next action prediction\n\n        Args:\n            instruction: Natural language instruction\n            observation: Current UI state observation\n\n        Returns:\n            Tuple containing agent info dictionary and list of actions\n        \"\"\"\n        pass\n\n    
def update_narrative_memory(self, trajectory: str) -> None:\n        \"\"\"Update narrative memory with task trajectory\n\n        Args:\n            trajectory: String containing task execution trajectory\n        \"\"\"\n        pass\n\n    def update_episodic_memory(self, meta_data: Dict, subtask_trajectory: str) -> str:\n        \"\"\"Update episodic memory with subtask trajectory\n\n        Args:\n            meta_data: Metadata about current subtask execution\n            subtask_trajectory: String containing subtask execution trajectory\n\n        Returns:\n            Updated subtask trajectory\n        \"\"\"\n        pass\n\n\nclass GraphSearchAgent(UIAgent):\n    \"\"\"Agent that uses hierarchical planning and directed acyclic graph modeling for UI automation\"\"\"\n\n    def __init__(\n        self,\n        engine_params: Dict,\n        grounding_agent: ACI,\n        platform: str = platform.system().lower(),\n        action_space: str = \"pyatuogui\",\n        observation_type: str = \"mixed\",\n        search_engine: Optional[str] = None,\n        memory_root_path: str = os.getcwd(),\n        memory_folder_name: str = \"kb_s1\",\n        kb_release_tag: str = \"v0.2.2\",\n    ):\n        \"\"\"Initialize GraphSearchAgent\n\n        Args:\n            engine_params: Configuration parameters for the LLM engine\n            grounding_agent: Instance of ACI class for UI interaction\n            platform: Operating system platform (macos, ubuntu)\n            action_space: Type of action space to use (pyautogui, other)\n            observation_type: Type of observations to use (a11y_tree, screenshot, mixed)\n            search_engine: Search engine to use (LLM, perplexica)\n            memory_root_path: Path to memory directory. Defaults to current working directory.\n            memory_folder_name: Name of memory folder. Defaults to \"kb_s1\".\n            kb_release_tag: Release tag for knowledge base. 
Defaults to \"v0.2.2\".\n        \"\"\"\n        super().__init__(\n            engine_params,\n            grounding_agent,\n            platform,\n            action_space,\n            observation_type,\n            search_engine,\n        )\n\n        self.memory_root_path = memory_root_path\n        self.memory_folder_name = memory_folder_name\n        self.kb_release_tag = kb_release_tag\n\n        # Initialize agent's knowledge base on user's current working directory.\n        print(\"Downloading knowledge base initial Agent-S knowledge...\")\n        self.local_kb_path = os.path.join(\n            self.memory_root_path, self.memory_folder_name\n        )\n\n        if not os.path.exists(self.local_kb_path):\n            download_kb_data(\n                version=\"s1\",\n                release_tag=kb_release_tag,\n                download_dir=self.local_kb_path,\n                platform=self.platform,\n            )\n            print(\n                f\"Successfully completed download of knowledge base for version s1, tag {self.kb_release_tag}, platform {self.platform}.\"\n            )\n        else:\n            print(\n                f\"Path local_kb_path {self.local_kb_path} already exists. Skipping download.\"\n            )\n            print(\n                f\"If you'd like to re-download the initial knowledge base, please delete the existing knowledge base at {self.local_kb_path}.\"\n            )\n            print(\n                \"Note, the knowledge is continually updated during inference. 
Deleting the knowledge base will wipe out all experience gained since the last knowledge base download.\"\n            )\n\n        self.reset()\n\n    def reset(self) -> None:\n        \"\"\"Reset agent state and initialize components\"\"\"\n        # Initialize core components\n        self.planner = Manager(\n            self.engine_params,\n            self.grounding_agent,\n            platform=self.platform,\n            search_engine=self.engine,\n            local_kb_path=self.local_kb_path,\n        )\n        self.executor = Worker(\n            self.engine_params,\n            self.grounding_agent,\n            platform=self.platform,\n            local_kb_path=self.local_kb_path,\n        )\n\n        # Reset state variables\n        self.requires_replan: bool = True\n        self.needs_next_subtask: bool = True\n        self.step_count: int = 0\n        self.turn_count: int = 0\n        self.failure_feedback: str = \"\"\n        self.should_send_action: bool = False\n        self.completed_tasks: List[Node] = []\n        self.current_subtask: Optional[Node] = None\n        self.subtasks: List[Node] = []\n        self.search_query: str = \"\"\n        self.subtask_status: str = \"Start\"\n\n    def reset_executor_state(self) -> None:\n        \"\"\"Reset executor and step counter\"\"\"\n        self.executor.reset()\n        self.step_count = 0\n\n    def predict(self, instruction: str, observation: Dict) -> Tuple[Dict, List[str]]:\n        \"\"\"Predict next UI action sequence\n\n        Args:\n            instruction: Natural language instruction\n            observation: Current UI state observation Dictionary {\"accessibility_tree\": str, \"screenshot\": bytes}\n            info: Dictionary containing additional information.\n\n        Returns:\n            Tuple of (agent info dict, list of actions)\n        \"\"\"\n        # Initialize the three info dictionaries\n        planner_info = {}\n        executor_info = {}\n        evaluator_info = {\n  
          \"obs_evaluator_response\": \"\",\n            \"num_input_tokens_evaluator\": 0,\n            \"num_output_tokens_evaluator\": 0,\n            \"evaluator_cost\": 0.0,\n        }\n        actions = []\n\n        # If the DONE response by the executor is for a subtask, then the agent should continue with the next subtask without sending the action to the environment\n        while not self.should_send_action:\n            self.subtask_status = \"In\"\n            # if replan is true, generate a new plan. True at start, then true again after a failed plan\n            if self.requires_replan:\n                logger.info(\"(RE)PLANNING...\")\n                # failure feedback is the reason for the failure of the previous plan\n                planner_info, self.subtasks = self.planner.get_action_queue(\n                    instruction=instruction,\n                    observation=observation,\n                    failure_feedback=self.failure_feedback,\n                )\n\n                self.requires_replan = False\n                if \"search_query\" in planner_info:\n                    self.search_query = planner_info[\"search_query\"]\n                else:\n                    self.search_query = \"\"\n\n            # use the exectuor to complete the topmost subtask\n            if self.needs_next_subtask:\n                logger.info(\"GETTING NEXT SUBTASK...\")\n                self.current_subtask = self.subtasks.pop(0)\n                logger.info(f\"NEXT SUBTASK: {self.current_subtask}\")\n                self.needs_next_subtask = False\n                self.subtask_status = \"Start\"\n\n            # get the next action from the executor\n            executor_info, actions = self.executor.generate_next_action(\n                instruction=instruction,\n                search_query=self.search_query,\n                subtask=self.current_subtask.name,\n                subtask_info=self.current_subtask.info,\n                
future_tasks=self.subtasks,\n                done_task=self.completed_tasks,\n                obs=observation,\n            )\n\n            self.step_count += 1\n\n            # set the should_send_action flag to True if the executor returns an action\n            self.should_send_action = True\n            if \"FAIL\" in actions:\n                self.requires_replan = True\n                # set the failure feedback to the evaluator feedback\n                self.failure_feedback = f\"Completed subtasks: {self.completed_tasks}. The subtask {self.current_subtask} cannot be completed. Please try another approach. {executor_info['plan_code']}. Please replan.\"\n                self.needs_next_subtask = True\n\n                # reset the step count, executor, and evaluator\n                self.reset_executor_state()\n\n                # if more subtasks are remaining, we don't want to send DONE to the environment but move on to the next subtask\n                if self.subtasks:\n                    self.should_send_action = False\n\n            elif \"DONE\" in actions:\n                self.requires_replan = False\n                self.completed_tasks.append(self.current_subtask)\n                self.needs_next_subtask = True\n                if self.subtasks:\n                    self.should_send_action = False\n                self.subtask_status = \"Done\"\n\n                self.reset_executor_state()\n\n            self.turn_count += 1\n        # reset the should_send_action flag for next iteration\n        self.should_send_action = False\n\n        # concatenate the three info dictionaries\n        info = {\n            **{\n                k: v\n                for d in [planner_info or {}, executor_info or {}, evaluator_info or {}]\n                for k, v in d.items()\n            }\n        }\n        info.update(\n            {\n                \"subtask\": self.current_subtask.name,\n                \"subtask_info\": self.current_subtask.info,\n    
            \"subtask_status\": self.subtask_status,\n            }\n        )\n\n        return info, actions\n\n    def update_narrative_memory(self, trajectory: str) -> None:\n        \"\"\"Update narrative memory from task trajectory\n\n        Args:\n            trajectory: String containing task execution trajectory\n        \"\"\"\n        try:\n            reflection_path = os.path.join(\n                self.local_kb_path, self.platform, \"narrative_memory.json\"\n            )\n            try:\n                reflections = json.load(open(reflection_path))\n            except:\n                reflections = {}\n\n            if self.search_query not in reflections:\n                reflection = self.planner.summarize_narrative(trajectory)\n                reflections[self.search_query] = reflection\n\n            with open(reflection_path, \"w\") as f:\n                json.dump(reflections, f, indent=2)\n\n        except Exception as e:\n            logger.error(f\"Failed to update narrative memory: {e}\")\n\n    def update_episodic_memory(self, meta_data: Dict, subtask_trajectory: str) -> str:\n        \"\"\"Update episodic memory from subtask trajectory\n\n        Args:\n            meta_data: Metadata about current subtask execution\n            subtask_trajectory: String containing subtask execution trajectory\n\n        Returns:\n            Updated subtask trajectory\n        \"\"\"\n        subtask = meta_data[\"subtask\"]\n        subtask_info = meta_data[\"subtask_info\"]\n        subtask_status = meta_data[\"subtask_status\"]\n        # Handle subtask trajectory\n        if subtask_status == \"Start\" or subtask_status == \"Done\":\n            # If it's a new subtask start, finalize the previous subtask trajectory if it exists\n            if subtask_trajectory:\n                subtask_trajectory += \"\\nSubtask Completed.\\n\"\n                subtask_key = subtask_trajectory.split(\n                    
\"\\n----------------------\\n\\nPlan:\\n\"\n                )[0]\n                try:\n                    subtask_path = os.path.join(\n                        self.local_kb_path, self.platform, \"episodic_memory.json\"\n                    )\n                    kb = json.load(open(subtask_path))\n                except:\n                    kb = {}\n                if subtask_key not in kb.keys():\n                    subtask_summarization = self.planner.summarize_episode(\n                        subtask_trajectory\n                    )\n                    kb[subtask_key] = subtask_summarization\n                else:\n                    subtask_summarization = kb[subtask_key]\n                logger.info(\"subtask_key: %s\", subtask_key)\n                logger.info(\"subtask_summarization: %s\", subtask_summarization)\n                with open(subtask_path, \"w\") as fout:\n                    json.dump(kb, fout, indent=2)\n                # Reset for the next subtask\n                subtask_trajectory = \"\"\n            # Start a new subtask trajectory\n            subtask_trajectory = (\n                \"Task:\\n\"\n                + self.search_query\n                + \"\\n\\nSubtask: \"\n                + subtask\n                + \"\\nSubtask Instruction: \"\n                + subtask_info\n                + \"\\n----------------------\\n\\nPlan:\\n\"\n                + meta_data[\"executor_plan\"]\n                + \"\\n\"\n            )\n        elif subtask_status == \"In\":\n            # Continue appending to the current subtask trajectory if it's still ongoing\n            subtask_trajectory += (\n                \"\\n----------------------\\n\\nPlan:\\n\"\n                + meta_data[\"executor_plan\"]\n                + \"\\n\"\n            )\n\n        return subtask_trajectory\n"
  },
  {
    "path": "gui_agents/s1/core/BaseModule.py",
    "content": "from typing import Dict, Optional\n\nfrom gui_agents.s1.mllm.MultimodalAgent import LMMAgent\n\n\nclass BaseModule:\n    def __init__(self, engine_params: Dict, platform: str):\n        self.engine_params = engine_params\n        self.platform = platform\n\n    def _create_agent(\n        self, system_prompt: str = None, engine_params: Optional[Dict] = None\n    ) -> LMMAgent:\n        \"\"\"Create a new LMMAgent instance\"\"\"\n        agent = LMMAgent(engine_params or self.engine_params)\n        if system_prompt:\n            agent.add_system_prompt(system_prompt)\n        return agent\n"
  },
  {
    "path": "gui_agents/s1/core/Knowledge.py",
    "content": "import json\nimport os\nfrom typing import Dict, Tuple\n\nimport numpy as np\nfrom sklearn.metrics.pairwise import cosine_similarity\n\nfrom gui_agents.s1.core.BaseModule import BaseModule\nfrom gui_agents.s1.core.ProceduralMemory import PROCEDURAL_MEMORY\nfrom gui_agents.s1.mllm.MultimodalEngine import OpenAIEmbeddingEngine\nfrom gui_agents.s1.utils.common_utils import (\n    load_embeddings,\n    load_knowledge_base,\n    save_embeddings,\n)\nfrom gui_agents.s1.utils.query_perplexica import query_to_perplexica\n\n\nclass KnowledgeBase(BaseModule):\n    def __init__(\n        self,\n        local_kb_path: str,\n        platform: str,\n        engine_params: Dict,\n        use_image_for_search: bool = False,\n    ):\n        super().__init__(engine_params, platform)\n\n        self.local_kb_path = local_kb_path\n\n        # initialize embedding engine\n        # TODO: Support other embedding engines\n        self.embedding_engine = OpenAIEmbeddingEngine(\n            api_key=(\n                engine_params[\"api_key\"]\n                if \"api_key\" in engine_params\n                else os.getenv(\"OPENAI_API_KEY\")\n            )\n        )\n\n        # Initialize paths for different memory types\n        self.episodic_memory_path = os.path.join(\n            self.local_kb_path, self.platform, \"episodic_memory.json\"\n        )\n        self.narrative_memory_path = os.path.join(\n            self.local_kb_path, self.platform, \"narrative_memory.json\"\n        )\n        self.embeddings_path = os.path.join(\n            self.local_kb_path, self.platform, \"embeddings.pkl\"\n        )\n\n        self.rag_module_system_prompt = PROCEDURAL_MEMORY.RAG_AGENT.replace(\n            \"CURRENT_OS\", self.platform\n        )\n\n        # All three agent share a generic RAG prompt that ask agent to provide information for UI automation in CURRENT_OS\n        self.query_formulator = self._create_agent(self.rag_module_system_prompt)\n        
self.llm_search_agent = self._create_agent(self.rag_module_system_prompt)\n        self.knowledge_fusion_agent = self._create_agent(self.rag_module_system_prompt)\n\n        self.use_image_for_search = use_image_for_search\n\n    def retrieve_knowledge(\n        self, instruction: str, search_query: str, search_engine: str = \"llm\"\n    ) -> Tuple[str, str]:\n        \"\"\"Retrieve knowledge using search engine\n        Args:\n            instruction (str): task instruction\n            search_query (str): query to send to the search engine\n            search_engine (str): search engine to use\"\"\"\n\n        # Use search engine to retrieve knowledge based on the formulated query\n        search_results = self._search(instruction, search_query, search_engine)\n\n        return search_query, search_results\n\n    def formulate_query(self, instruction: str, observation: Dict) -> str:\n        \"\"\"Formulate search query based on instruction and current state\"\"\"\n        query_path = os.path.join(\n            self.local_kb_path, self.platform, \"formulate_query.json\"\n        )\n        try:\n            with open(query_path, \"r\") as f:\n                formulate_query = json.load(f)\n        except:\n            formulate_query = {}\n\n        if instruction in formulate_query:\n            return formulate_query[instruction]\n\n        self.query_formulator.add_message(\n            f\"The task is: {instruction}\\n\"\n            f\"Accessibility tree of the current desktop UI state: {observation['linearized_accessibility_tree']}\\n\"\n            \"To use google search to get some useful information, first carefully analyze \"\n            \"the accessibility tree of the current desktop UI state, then given the task \"\n            \"instruction, formulate a question that can be used to search on the Internet \"\n            \"for information in helping with the task execution.\\n\"\n            \"The question should not be too general or too specific. 
Please ONLY provide \"\n            \"the question.\\nQuestion:\",\n            image_content=(\n                observation[\"screenshot\"]\n                if self.use_image_for_search and \"screenshot\" in observation\n                else None\n            ),\n        )\n\n        search_query = self.query_formulator.get_response().strip().replace('\"', \"\")\n        print(\"search query: \", search_query)\n        formulate_query[instruction] = search_query\n        with open(query_path, \"w\") as f:\n            json.dump(formulate_query, f, indent=2)\n\n        return search_query\n\n    def _search(self, instruction: str, search_query: str, search_engine: str) -> str:\n        \"\"\"Execute search using specified engine\"\"\"\n\n        # Check the cached RAG knowledge file for the selected search engine to see if this instruction already has results\n        file = os.path.join(\n            self.local_kb_path, self.platform, f\"{search_engine}_rag_knowledge.json\"\n        )\n\n        try:\n            with open(file, \"r\") as f:\n                exist_search_results = json.load(f)\n        except:\n            exist_search_results = {}\n\n        if instruction in exist_search_results:\n            return exist_search_results[instruction]\n        if search_engine.lower() == \"llm\":\n            # Use LLM's internal knowledge like a search engine\n            self.llm_search_agent.add_message(search_query)\n            search_results = self.llm_search_agent.get_response()\n        elif search_engine.lower() == \"perplexica\":\n            # Use perplexica to search for the query\n            search_results = query_to_perplexica(search_query)\n        else:\n            raise ValueError(f\"Unsupported search engine: {search_engine}\")\n\n        exist_search_results[instruction] = search_results.strip()\n        with open(\n            os.path.join(\n                self.local_kb_path,\n                self.platform,\n                f\"{search_engine}_rag_knowledge.json\",\n            ),\n           
 \"w\",\n        ) as f:\n            json.dump(exist_search_results, f, indent=2)\n\n        return search_results\n\n    def retrieve_narrative_experience(self, instruction: str) -> Tuple[str, str]:\n        \"\"\"Retrieve narrative experience using embeddings\"\"\"\n        knowledge_base = load_knowledge_base(self.narrative_memory_path)\n        if not knowledge_base:\n            return \"None\", \"None\"\n\n        embeddings = load_embeddings(self.embeddings_path)\n\n        # Get or create instruction embedding\n        instruction_embedding = embeddings.get(instruction)\n\n        if instruction_embedding is None:\n            instruction_embedding = self.embedding_engine.get_embeddings(instruction)\n            embeddings[instruction] = instruction_embedding\n\n        # Get or create embeddings for knowledge base entries\n        candidate_embeddings = []\n        for key in knowledge_base:\n            candidate_embedding = embeddings.get(key)\n            if candidate_embedding is None:\n                candidate_embedding = self.embedding_engine.get_embeddings(key)\n                embeddings[key] = candidate_embedding\n\n            candidate_embeddings.append(candidate_embedding)\n\n        save_embeddings(self.embeddings_path, embeddings)\n\n        similarities = cosine_similarity(\n            instruction_embedding, np.vstack(candidate_embeddings)\n        )[0]\n        sorted_indices = np.argsort(similarities)[::-1]\n\n        keys = list(knowledge_base.keys())\n        idx = 1 if keys[sorted_indices[0]] == instruction else 0\n        return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]]\n\n    def retrieve_episodic_experience(self, instruction: str) -> Tuple[str, str]:\n        \"\"\"Retrieve similar task experience using embeddings\"\"\"\n        knowledge_base = load_knowledge_base(self.episodic_memory_path)\n        if not knowledge_base:\n            return \"None\", \"None\"\n\n        embeddings = 
load_embeddings(self.embeddings_path)\n\n        # Get or create instruction embedding\n        instruction_embedding = embeddings.get(instruction)\n\n        if instruction_embedding is None:\n            instruction_embedding = self.embedding_engine.get_embeddings(instruction)\n            embeddings[instruction] = instruction_embedding\n\n        # Get or create embeddings for knowledge base entries\n        candidate_embeddings = []\n        for key in knowledge_base:\n            candidate_embedding = embeddings.get(key)\n            if candidate_embedding is None:\n                candidate_embedding = self.embedding_engine.get_embeddings(key)\n                embeddings[key] = candidate_embedding\n\n            candidate_embeddings.append(candidate_embedding)\n\n        save_embeddings(self.embeddings_path, embeddings)\n\n        similarities = cosine_similarity(\n            instruction_embedding, np.vstack(candidate_embeddings)\n        )[0]\n        sorted_indices = np.argsort(similarities)[::-1]\n\n        keys = list(knowledge_base.keys())\n        idx = 1 if keys[sorted_indices[0]] == instruction else 0\n        return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]]\n\n    def knowledge_fusion(\n        self,\n        observation: Dict,\n        instruction: str,\n        web_knowledge: str,\n        similar_task: str,\n        experience: str,\n    ) -> str:\n        \"\"\"Combine web knowledge with similar task experience\"\"\"\n        self.knowledge_fusion_agent.add_message(\n            f\"Task: {instruction}\\n\"\n            f\"Accessibility tree of the current desktop UI state: {observation['linearized_accessibility_tree']}\\n\"\n            f\"**Web search result**:\\n{web_knowledge}\\n\\n\"\n            f\"**Retrieved similar task experience**:\\n\"\n            f\"Similar task:{similar_task}\\n{experience}\\n\\n\"\n            f\"Based on the web search result and the retrieved similar task experience, \"\n            
f\"if you think the similar task experience is indeed useful to the main task, \"\n            f\"integrate it with the web search result. Provide the final knowledge in a numbered list.\",\n            image_content=(\n                observation[\"screenshot\"]\n                if self.use_image_for_search and \"screenshot\" in observation\n                else None\n            ),\n        )\n        return self.knowledge_fusion_agent.get_response()\n"
  },
  {
    "path": "gui_agents/s1/core/Manager.py",
    "content": "import logging\nfrom collections import defaultdict\nfrom typing import Dict, List, Optional, Tuple\nimport platform\n\nfrom gui_agents.s1.aci.ACI import ACI\nfrom gui_agents.s1.core.BaseModule import BaseModule\nfrom gui_agents.s1.core.Knowledge import KnowledgeBase\nfrom gui_agents.s1.core.ProceduralMemory import PROCEDURAL_MEMORY\nfrom gui_agents.s1.utils.common_utils import (\n    Dag,\n    Node,\n    calculate_tokens,\n    call_llm_safe,\n    parse_dag,\n)\n\nlogger = logging.getLogger(\"desktopenv.agent\")\n\nNUM_IMAGE_TOKEN = 1105  # Value set of screen of size 1920x1080 for openai vision\n\n\nclass Manager(BaseModule):\n    def __init__(\n        self,\n        engine_params: Dict,\n        grounding_agent: ACI,\n        local_kb_path: str,\n        search_engine: Optional[str] = None,\n        multi_round: bool = False,\n        platform: str = platform.system().lower(),\n    ):\n        # TODO: move the prompt to Procedural Memory\n        super().__init__(engine_params, platform)\n\n        # Initialize the ACI\n        self.grounding_agent = grounding_agent\n\n        # Initialize the submodules of the Manager\n        self.generator_agent = self._create_agent(PROCEDURAL_MEMORY.MANAGER_PROMPT)\n        self.dag_translator_agent = self._create_agent(\n            PROCEDURAL_MEMORY.DAG_TRANSLATOR_PROMPT\n        )\n        self.narrative_summarization_agent = self._create_agent(\n            PROCEDURAL_MEMORY.TASK_SUMMARIZATION_PROMPT\n        )\n        self.episode_summarization_agent = self._create_agent(\n            PROCEDURAL_MEMORY.SUBTASK_SUMMARIZATION_PROMPT\n        )\n\n        self.local_kb_path = local_kb_path\n\n        self.knowledge_base = KnowledgeBase(self.local_kb_path, platform, engine_params)\n\n        self.planner_history = []\n\n        self.turn_count = 0\n        self.search_engine = search_engine\n        self.multi_round = multi_round\n        self.platform = platform\n\n    def summarize_episode(self, 
trajectory):\n        \"\"\"Summarize the episode experience for lifelong learning reflection\n        Args:\n            trajectory: str: The episode experience to be summarized\n        \"\"\"\n\n        # Create Reflection on whole trajectories for next round trial, keep earlier messages as exemplars\n        self.episode_summarization_agent.add_message(trajectory)\n        subtask_summarization = call_llm_safe(self.episode_summarization_agent)\n        self.episode_summarization_agent.add_message(subtask_summarization)\n\n        return subtask_summarization\n\n    def summarize_narrative(self, trajectory):\n        \"\"\"Summarize the narrative experience for lifelong learning reflection\n        Args:\n            trajectory: str: The narrative experience to be summarized\n        \"\"\"\n        # Create Reflection on whole trajectories for next round trial\n        self.narrative_summarization_agent.add_message(trajectory)\n        lifelong_learning_reflection = call_llm_safe(self.narrative_summarization_agent)\n\n        return lifelong_learning_reflection\n\n    def _generate_step_by_step_plan(\n        self, observation: Dict, instruction: str, failure_feedback: str = \"\"\n    ) -> Tuple[Dict, str]:\n        agent = self.grounding_agent\n\n        self.active_apps = agent.get_active_apps(observation)\n\n        tree_input = agent.linearize_and_annotate_tree(observation)\n        observation[\"linearized_accessibility_tree\"] = tree_input\n\n        # Perform Retrieval only at the first planning step\n        if self.turn_count == 0:\n\n            self.search_query = self.knowledge_base.formulate_query(\n                instruction, observation\n            )\n\n            retrieved_experience = \"\"\n            integrated_knowledge = \"\"\n            # Retrieve most similar narrative (task) experience\n            most_similar_task, retrieved_experience = (\n                self.knowledge_base.retrieve_narrative_experience(instruction)\n            
)\n            logger.info(\n                \"SIMILAR TASK EXPERIENCE: %s\",\n                most_similar_task + \"\\n\" + retrieved_experience.strip(),\n            )\n\n            # Retrieve knowledge from the web if search_engine is provided\n            if self.search_engine is not None:\n                retrieved_knowledge = self.knowledge_base.retrieve_knowledge(\n                    instruction=instruction,\n                    search_query=self.search_query,\n                    search_engine=self.search_engine,\n                )\n                logger.info(\"RETRIEVED KNOWLEDGE: %s\", retrieved_knowledge)\n\n                if retrieved_knowledge is not None:\n                    # Fuse the retrieved knowledge and experience\n                    integrated_knowledge = self.knowledge_base.knowledge_fusion(\n                        observation=observation,\n                        instruction=instruction,\n                        web_knowledge=retrieved_knowledge,\n                        similar_task=most_similar_task,\n                        experience=retrieved_experience,\n                    )\n                    logger.info(\"INTEGRATED KNOWLEDGE: %s\", integrated_knowledge)\n\n            integrated_knowledge = integrated_knowledge or retrieved_experience\n\n            # Add the integrated knowledge to the task instruction in the system prompt\n            if integrated_knowledge:\n                instruction += f\"\\nYou may refer to some retrieved knowledge if you think they are useful.{integrated_knowledge}\"\n\n            self.generator_agent.add_system_prompt(\n                self.generator_agent.system_prompt.replace(\n                    \"TASK_DESCRIPTION\", instruction\n                )\n            )\n\n        generator_message = (\n            f\"Accessibility Tree: {tree_input}\\n\"\n            f\"The clipboard contains: {agent.clipboard}.\"\n            f\"The current open applications are 
{agent.get_active_apps(observation)}\"\n            + (\n                f\" Previous plan failed at step: {failure_feedback}\"\n                if failure_feedback\n                else \"\"\n            )\n        )\n\n        self.generator_agent.add_message(\n            generator_message, image_content=observation.get(\"screenshot\", None)\n        )\n\n        logger.info(\"GENERATING HIGH LEVEL PLAN\")\n\n        plan = call_llm_safe(self.generator_agent)\n\n        if plan == \"\":\n            raise Exception(\"Plan Generation Failed - Fix the Prompt\")\n\n        logger.info(\"HIGH LEVEL STEP BY STEP PLAN: %s\", plan)\n\n        self.generator_agent.add_message(plan)\n\n        self.planner_history.append(plan)\n\n        self.turn_count += 1\n\n        input_tokens, output_tokens = calculate_tokens(self.generator_agent.messages)\n\n        # Set Cost based on GPT-4o\n        cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 1000)\n\n        planner_info = {\n            \"search_query\": self.search_query,\n            \"goal_plan\": plan,\n            \"num_input_tokens_plan\": input_tokens,\n            \"num_output_tokens_plan\": output_tokens,\n            \"goal_plan_cost\": cost,\n        }\n\n        assert type(plan) == str\n\n        return planner_info, plan\n\n    def _generate_dag(self, instruction: str, plan: str) -> Tuple[Dict, Dag]:\n        # Add initial instruction and plan to the agent's message history\n        self.dag_translator_agent.add_message(\n            f\"Instruction: {instruction}\\nPlan: {plan}\"\n        )\n\n        logger.info(\"GENERATING DAG\")\n\n        # Generate DAG\n        dag_raw = call_llm_safe(self.dag_translator_agent)\n\n        dag = parse_dag(dag_raw)\n\n        logger.info(\"Generated DAG: %s\", dag_raw)\n\n        self.dag_translator_agent.add_message(dag_raw)\n\n        input_tokens, output_tokens = calculate_tokens(\n            self.dag_translator_agent.messages\n        )\n\n        # 
Set Cost based on GPT-4o\n        cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 1000)\n\n        dag_info = {\n            \"dag\": dag_raw,\n            \"num_input_tokens_dag\": input_tokens,\n            \"num_output_tokens_dag\": output_tokens,\n            \"dag_cost\": cost,\n        }\n\n        assert type(dag) == Dag\n\n        return dag_info, dag\n\n    def _topological_sort(self, dag: Dag) -> List[Node]:\n        \"\"\"Topological sort of the DAG using DFS\n        dag: Dag: Object representation of the DAG with nodes and edges\n        \"\"\"\n\n        def dfs(node_name, visited, stack):\n            visited[node_name] = True\n            for neighbor in adj_list[node_name]:\n                if not visited[neighbor]:\n                    dfs(neighbor, visited, stack)\n            stack.append(node_name)\n\n        # Convert edges to adjacency list\n        adj_list = defaultdict(list)\n        for u, v in dag.edges:\n            adj_list[u.name].append(v.name)\n\n        visited = {node.name: False for node in dag.nodes}\n        stack = []\n\n        for node in dag.nodes:\n            if not visited[node.name]:\n                dfs(node.name, visited, stack)\n\n        # Return the nodes in topologically sorted order\n        sorted_nodes = [\n            next(n for n in dag.nodes if n.name == name) for name in stack[::-1]\n        ]\n        return sorted_nodes\n\n    def get_action_queue(\n        self,\n        instruction: str,\n        observation: Dict,\n        failure_feedback: str = None,\n    ):\n        \"\"\"Generate the action list based on the instruction\n        instruction:str: Instruction for the task\n        \"\"\"\n        # Generate the high level plan\n        planner_info, plan = self._generate_step_by_step_plan(\n            observation, instruction, failure_feedback\n        )\n\n        # Generate the DAG\n        dag_info, dag = self._generate_dag(instruction, plan)\n\n        # Topological sort of the 
DAG\n        action_queue = self._topological_sort(dag)\n\n        planner_info.update(dag_info)\n\n        return planner_info, action_queue\n"
  },
  {
    "path": "gui_agents/s1/core/ProceduralMemory.py",
    "content": "import inspect\nimport textwrap\n\n\nclass PROCEDURAL_MEMORY:\n    @staticmethod\n    def construct_worker_procedural_memory(agent_class):\n        procedural_memory = textwrap.dedent(\n            f\"\"\"\\\n        You are an expert in graphical user interfaces and Python code. You are responsible for executing the current subtask: `SUBTASK_DESCRIPTION` of the larger goal: `TASK_DESCRIPTION`.\n        IMPORTANT: ** The subtasks: ['DONE_TASKS'] have already been done. The future subtasks ['FUTURE_TASKS'] will be done in the future by me. You must only perform the current subtask: `SUBTASK_DESCRIPTION`. Do not try to do future subtasks. **\n        You are working in CURRENT_OS. You must only complete the subtask provided and not the larger goal.\n        You are provided with:\n        1. A simplified accessibility tree of the UI at the current time step.\n        2. A screenshot of the current time step.\n        3. The history of your previous interactions with the UI.\n        4. Access to the following class and methods to interact with the UI:\n        class Agent:\n        \"\"\"\n        )\n\n        for attr_name in dir(agent_class):\n            attr = getattr(agent_class, attr_name)\n            if callable(attr) and hasattr(attr, \"is_agent_action\"):\n                # Use inspect to get the full function signature\n                signature = inspect.signature(attr)\n                procedural_memory += f\"\"\"\n    def {attr_name}{signature}:\n    '''{attr.__doc__}'''\n        \"\"\"\n\n        procedural_memory += textwrap.dedent(\n            \"\"\"\n        Your response should be formatted like this:\n        (Previous action verification)\n        Carefully analyze based on the screenshot and the accessibility tree if the previous action was successful. 
If the previous action was not successful, provide a reason for the failure.\n\n        (Screenshot Analysis)\n        Closely examine and describe the current state of the desktop along with the currently open applications.\n\n        (Next Action)\n        Based on the current screenshot, the accessibility tree and the history of your previous interaction with the UI, decide on the next action in natural language to accomplish the given task.\n\n        (Grounded Action)\n        Translate the next action into code using the provided API methods. Format the code like this:\n        ```python\n        agent.click(123, 1, \"left\")\n        ```\n        Note for the code:\n        1. Only perform one action at a time.\n        2. Do not put anything other than python code in the block. You can only use one function call at a time. Do not put more than one function call in the block.\n        3. You must use only the available methods provided above to interact with the UI, do not invent new methods.\n        4. Only return one code block every time. There must be a single line of code in the code block.\n        5. Please only use the available methods provided above to interact with the UI.\n        6. If you think the task is already completed, you can return `agent.done()` in the code block.\n        7. If you think the task cannot be completed, you can return `agent.fail()` in the code block.\n        8. Do not do anything other than the exact specified task. Return with `agent.done()` immediately after the task is completed or `agent.fail()` if it cannot be completed.\n        9. Whenever possible use hot-keys or typing rather than mouse clicks.\n        10. My computer's password is 'password', feel free to use it when you need sudo rights\n        \"\"\"\n        )\n        return procedural_memory.strip()\n\n    # MANAGER_PROMPT = \"\"\"You are a planning agent for solving GUI navigation tasks. 
You will be provided the initial configuration of a system including accessibility, screenshot and other information. You need to solve the following task: TASK_DESCRIPTION. You will describe in as much detail as possible the steps required to complete the task by a GUI agent. Please do not include any verification steps in your plan that is not your responsibility. IMPORTANT: Your plan should be as concise as possible and should not include any unnecessary steps. Do not fine-tune, or embellish anything or cause any side effects. Generate the plan that can be accomplished in the shortest time. Please take the current state into account when generating the plan. Please provide the plan in a step-by-step format and make sure you do not include anything that's already done in the GUI in your plan.\"\"\"\n\n    # TODO: exploring this prompt\n    MANAGER_PROMPT = \"\"\"You are a planning agent for solving GUI navigation tasks. You will be provided the initial configuration of a system including accessibility, screenshot and other information. You need to solve the following task: TASK_DESCRIPTION. You will describe in as much detail as possible the steps required to complete the task by a GUI agent. Please do not include any verification steps in your plan that is not your responsibility. IMPORTANT: Your plan should be as concise as possible and should not include any unnecessary steps. Do not fine-tune, or embellish anything or cause any side effects. Generate the plan that can be accomplished in the shortest time. Please take the current state into account when generating the plan. Please provide the plan in a step-by-step format and make sure you do not include anything that's already done in the GUI in your plan. You don't need to arrange the steps in order just list out everything that needs to be done. You may follow a dependency structure. 
Note that the execution agent that will complete your plan can't actually see everything thats visible to you.\"\"\"\n\n    # NOTE: below prompt results in suboptimal initial plans\n    # MANAGER_PROMPT = \"\"\"You are an expert planning agent for GUI tasks. You will be provided with an initial state of the system including accessibility, screenshot and other information and the final state represented by the task: TASK_DESCRIPTION. Tell me everything that needs to be done in order to reach the goal state. You don't need to arrange the steps in order just list out everything that needs to be done. You may follow a dependency structure.\"\"\"\n\n    # USED IN OSWORLD EXPERIMENTS\n    RAG_AGENT_OSWORLD = \"\"\"\n    Given a desktop computer task instruction, you are an agent which should provide useful information as requested, to help another agent follow the instruction and perform the task.\n    The domain of the desktop computer task is from [CURRENT_OS, VLC, LibreOffice, Chrome, Thunderbird, VS Code, GIMP].\n    The task is: TASK_DESCRIPTION\n    The simplified accessibility tree of the current computer UI is: ACCESSIBLITY_TREE\n    \"\"\"\n\n    RAG_AGENT = \"\"\"\n    Given a desktop computer task instruction, you are an agent which should provide useful information as requested, to help another agent follow the instruction and perform the task in CURRENT_OS.\n    \"\"\"\n\n    # TODO: confirm this prompt\n    REFLECTION_ON_TRAJECTORY = \"\"\"\n    You are a reflection agent designed to assist in task execution by analyzing a trajectory of task execution until this time step and providing feedback for the next step prediction.\n    You have access to the Task Description and Current Trajectory, and the image for each step. 
The most recent image is what happened after the latest action in the trajectory.\n    You should ONLY provide informative reflection feedback (potential mitigation alternatives) based on your expertise for the planning agent when you observe the abnormal trajectory (e.g., contain consecutive failures).\n    Otherwise, let the agent continue to proceed as planned.\n    Make sure to avoid providing any information about specific planning or actions and avoid generating repeated reflection feedbacks.\n    Assume the grounded action is correct, do not judge about it.\n    \"\"\"\n\n    TASK_SUMMARIZATION_PROMPT = \"\"\"\n    You are a summarization agent designed to analyze a trajectory of desktop task execution.\n    You have access to the Task Description and Whole Trajectory including plan, verification and reflection at each step.\n    Your summarized information will be referred to by another agent when performing the tasks.\n    You should follow the below instructions:\n    1. If the task is successfully executed, you should summarize the successful plan based on the whole trajectory to finish the task.\n    2. Otherwise, provide the reasons why the task is failed and potential suggestions that may avoid this failure.\n\n    **ATTENTION**\n    1. Only extract the correct plan and do not provide redundant steps.\n    2. Do not contain grounded actions in the plan.\n    3. If there are the successfully used hot-keys, make sure to include them in the plan.\n    4. The suggestions are for another agent not human, so they must be doable through the agent's action.\n    5. Don't generate high-level suggestions (e.g., Implement Error Handling).\n    \"\"\"\n\n    # DAG_TRANSLATOR_PROMPT = \"\"\"You are a plan to Dependency Graph conversion agent. You will be provided a plan and you will generate a directed acyclic graph in the specified format for the plan. Each node in your graph should contain two fields name and subinfo. 
name is a one line description of each subtask. subinfo is all available information about executing that subtask available in the step by step plan. Please do not remove or edit any information out of the subinfo. The graph must be a directed acyclic graph. The graph must be connected. Do not include any repeated or optional steps in the graph, any extra info must go in the subinfo.\n    # \"\"\"\n\n    DAG_TRANSLATOR_PROMPT = \"\"\"You are a plan to Dependency Graph conversion agent. Your task is to analyze a given plan and generate a structured JSON output representing the plan and its corresponding directed acyclic graph (DAG).\n\nThe output should be a valid JSON object wrapped in <json></json> tags, with the following structure:\n\n<json>\n{\n  \"dag\": {\n    \"nodes\": [\n      {\n        \"name\": \"Short name or brief description of the step\",\n        \"info\": \"Detailed information about executing this step\"\n      }\n    ],\n    \"edges\": [\n      [\n        {\"name\": \"Name of the source node\", \"info\": \"Info of the source node\"},\n        {\"name\": \"Name of the target node\", \"info\": \"Info of the target node\"}\n      ]\n    ]\n  }\n}\n</json>\n\nGuidelines:\n1. The \"plan\" field should contain the entire original plan as a string.\n2. In the \"dag\" object:\n   a. Each node in the \"nodes\" array should contain 'name' and 'info' fields.\n   b. 'name' should be a concise, one-line description of the subtask.\n   c. 'info' should contain all available information about executing that subtask from the original plan. Do not remove or edit any information from the 'info' field.\n3. The \"edges\" array should represent the connections between nodes, showing the order and dependencies of the steps.\n4. The graph must be a directed acyclic graph (DAG) and must be connected.\n5. Do not include repeated or optional steps in the graph. 
Any extra information should be incorporated into the 'info' field of the relevant node.\n\nAnalyze the given plan and provide the output in this JSON format within the <json></json> tags. Ensure the JSON is valid and properly escaped.\n\"\"\"\n\n    SUBTASK_SUMMARIZATION_PROMPT = \"\"\"\n    You are a summarization agent designed to analyze a trajectory of desktop task execution.\n    You will summarize the correct plan and grounded actions based on the whole trajectory of a subtask, ensuring the summarized plan contains only correct and necessary steps.\n\n    **ATTENTION**\n\t1.\tSummarize the correct plan and its corresponding grounded actions. Carefully filter out any repeated or incorrect steps based on the verification output in the trajectory. Only include the necessary steps for successfully completing the subtask.\n\t2.\tID Replacement in Grounded Actions:\n    When summarizing grounded actions, replace all actual IDs with placeholders element1_id, element2_id, etc., while maintaining the total number of parameters.\n    Ensure the placeholders (element1_id, element2_id, …) follow the order of appearance in the grounded actions.\n\t3.\tOnly generate grounded actions that are explicitly present in the trajectory. Do not introduce any grounded actions that do not exist in the trajectory.\n\t4.\tFor each step in the plan, provide a corresponding grounded action. 
Use the exact format:\n    \tAction: [Description of the correct action]\n    \tGrounded Action: [Grounded actions with element_id replacement]\n\t5.\tExclude any other details that are not necessary for completing the task.\n    \"\"\"\n\n    STATE_EVALUATOR_SYSTEM_PROMPT = \"\"\"\n    You are an impartial evaluator to evaluate the completeness of the given desktop computer task, you are also an expert of accessibility tree, os environment and python programming.\n    The task is: TASK_DESCRIPTION, it is executed by a digital agent who can perform the task without knowing whether the task requirements are met.\n    As an evaluator, your task is to judge whether the task is finished and meets the task requirement.\n    You have access to the:\n    1. Task instruction.\n    2. The whole actions performed by the digital agent.\n    3. The accessibility tree at the first step and the last step.\n    4. The screenshot at the first step and the last step.\n\n    You are able to proceed your judgment process in the following ways based on the task instruction:\n    1. By comparing the difference in the accessibility trees of the UI, you should judge whether the task is complete given the task instruction.\n    2. If you cannot judge based on the observations, you can evaluate it by writing and running a python script to do a further examination. For example, you can use the 'subprocess' module to run the external command in a terminal to check whether an application has been installed.\n    You can also call the file system API to do the file check, etc. You can also try to interact with the environment via other methods or interface you are familiar with.\n\n    **IMPORTANT**\n    1. If no python script is needed, you should provide your analysis and put the judgment at the end of the response in this format: Judgment: Yes/No\n    2. 
Otherwise, you should format your response into two parts as shown below:\n        ```python\n        # your code script here\n        ```\n\n    **ATTENTION**\n    1. You should only use scripts when you have to.\n    2. When you generate code script, only return one code block every time, the code block should contain the whole script you want to run. You must guarantee that the script is comprehensive and executable, make sure to print out the scripts' results for subsequent judgement.\n    Additionally, the comment of the code is **PROHIBITED**\n    3. You should strictly follow the response format mentioned above.\n\n    **SUBSEQUENCE**\n    If you have generated the python script, I will execute it and return the corresponding result to you (Started with \"The output after executing the script is:...\"). Then you should judge whether the task has been completed or not comprehensively based on the script and its result,\n    the task information, and the comparison of accessibility trees and screenshots. 
Provide your analysis and put the judgment at the end of the response in this format: Judgment: Yes/No\n    \"\"\"\n\n    OBS_EVALUATOR_SYSTEM_PROMPT = \"\"\"\n    You are an impartial evaluator to evaluate the completeness of the given desktop computer task.\n    The task is: TASK_DESCRIPTION, it is executed by a digital agent who can perform the task without knowing whether the task requirements are met.\n    As an evaluator, your task is to judge whether the task is finished and meets the task requirement.\n    You have access to the task instruction, the whole actions performed by the digital agent, the accessibility tree of the UI and screenshot at the first time step and the last time step.\n    By comparing the difference in the accessibility trees of the UI, you should judge whether the task is complete given the task instruction.\n    Provide your analysis and put the judgment at the end of the response in this format:\n    Judgment: Yes/No\n    Only say Yes or No in the Judgment section. Do not provide any other information in the Judgment section.\n    \"\"\"\n"
  },
  {
    "path": "gui_agents/s1/core/Worker.py",
    "content": "import logging\nimport os\nimport re\nfrom typing import Dict, List, Tuple\nimport platform\n\nfrom gui_agents.s1.aci.ACI import ACI\nfrom gui_agents.s1.core.BaseModule import BaseModule\nfrom gui_agents.s1.core.Knowledge import KnowledgeBase\nfrom gui_agents.s1.core.ProceduralMemory import PROCEDURAL_MEMORY\nfrom gui_agents.s1.utils import common_utils\nfrom gui_agents.s1.utils.common_utils import Node, calculate_tokens, call_llm_safe\n\nlogger = logging.getLogger(\"desktopenv.agent\")\n\n\nclass Worker(BaseModule):\n    def __init__(\n        self,\n        engine_params: Dict,\n        grounding_agent: ACI,\n        local_kb_path: str,\n        platform: str = platform.system().lower(),\n        search_engine: str = \"perplexica\",\n        enable_reflection: bool = True,\n        use_subtask_experience: bool = True,\n    ):\n        \"\"\"\n        Worker receives a subtask list and active subtask and generates the next action for the to execute.\n        Args:\n            engine_params: Dict\n                Parameters for the multimodal engine\n            grounding_agent: Agent\n                The grounding agent to use\n            local_kb_path: str\n                Path to knowledge base\n            search_engine: str\n                The search engine to use\n            enable_reflection: bool\n                Whether to enable reflection\n            use_subtask_experience: bool\n                Whether to use subtask experience\n        \"\"\"\n        super().__init__(engine_params, platform)\n\n        self.grounding_agent = grounding_agent\n        self.local_kb_path = local_kb_path\n        self.enable_reflection = enable_reflection\n        self.search_engine = search_engine\n        self.use_subtask_experience = use_subtask_experience\n        self.reset()\n\n    def flush_messages(self, n):\n        # After every max_trajectory_length trajectories, remove messages from the start except the system prompt\n        for agent in 
[self.generator_agent]:\n            if len(agent.messages) > 2 * n + 1:\n                # Remove the user message and assistant message, both are 1 because the elements will move back after 1 pop\n                agent.remove_message_at(1)\n                agent.remove_message_at(1)\n\n    def reset(self):\n        self.generator_agent = self._create_agent(\n            PROCEDURAL_MEMORY.construct_worker_procedural_memory(\n                type(self.grounding_agent)\n            ).replace(\"CURRENT_OS\", self.platform)\n        )\n        self.reflection_agent = self._create_agent(\n            PROCEDURAL_MEMORY.REFLECTION_ON_TRAJECTORY\n        )\n\n        self.knowledge_base = KnowledgeBase(\n            local_kb_path=self.local_kb_path,\n            platform=self.platform,\n            engine_params=self.engine_params,\n        )\n\n        self.turn_count = 0\n        self.planner_history = []\n        self.reflections = []\n        self.cost_this_turn = 0\n        self.tree_inputs = []\n        self.screenshot_inputs = []\n\n    # TODO: Experimental\n    def remove_ids_from_history(self):\n        for message in self.generator_agent.messages:\n            if message[\"role\"] == \"user\":\n                for content in message[\"content\"]:\n                    if content[\"type\"] == \"text\":\n                        # Regex pattern to match lines that start with a number followed by spaces and remove the number\n                        pattern = r\"^\\d+\\s+\"\n\n                        # Apply the regex substitution on each line\n                        processed_lines = [\n                            re.sub(pattern, \"\", line)\n                            for line in content[\"text\"].splitlines()\n                        ]\n\n                        # Join the processed lines back into a single string\n                        result = \"\\n\".join(processed_lines)\n\n                        result = result.replace(\"id\\t\", \"\")\n\n                
        # replace message content\n                        content[\"text\"] = result\n\n    def generate_next_action(\n        self,\n        instruction: str,\n        search_query: str,\n        subtask: str,\n        subtask_info: str,\n        future_tasks: List[Node],\n        done_task: List[Node],\n        obs: Dict,\n    ) -> Tuple[Dict, List]:\n        \"\"\"\n        Predict the next action(s) based on the current observation.\n        \"\"\"\n        # Provide the top_app to the Grounding Agent to remove all other applications from the tree. At t=0, top_app is None\n        agent = self.grounding_agent\n\n        self.active_apps = agent.get_active_apps(obs)\n\n        # Get RAG knowledge, only update system message at t=0\n        if self.turn_count == 0:\n            # TODO: uncomment and fix for subtask level RAG\n            if self.use_subtask_experience:\n                subtask_query_key = (\n                    \"Task:\\n\"\n                    + search_query\n                    + \"\\n\\nSubtask: \"\n                    + subtask\n                    + \"\\nSubtask Instruction: \"\n                    + subtask_info\n                )\n                retrieved_similar_subtask, retrieved_subtask_experience = (\n                    self.knowledge_base.retrieve_episodic_experience(subtask_query_key)\n                )\n                logger.info(\n                    \"SIMILAR SUBTASK EXPERIENCE: %s\",\n                    retrieved_similar_subtask\n                    + \"\\n\"\n                    + retrieved_subtask_experience.strip(),\n                )\n                instruction += \"\\nYou may refer to some similar subtask experience if you think they are useful. 
{}\".format(\n                    retrieved_similar_subtask + \"\\n\" + retrieved_subtask_experience\n                )\n\n            self.generator_agent.add_system_prompt(\n                self.generator_agent.system_prompt.replace(\n                    \"SUBTASK_DESCRIPTION\", subtask\n                )\n                .replace(\"TASK_DESCRIPTION\", instruction)\n                .replace(\"FUTURE_TASKS\", \", \".join([f.name for f in future_tasks]))\n                .replace(\"DONE_TASKS\", \",\".join(d.name for d in done_task))\n            )\n\n        # Clear older messages - we keep full context. if you want to keep only the last n messages, you can use the flush_messages function\n        # self.flush_messages(3) # flushes generator messages\n\n        # Reflection generation\n        reflection = None\n        if self.enable_reflection and self.turn_count > 0:\n            # TODO: reuse planner history\n            self.reflection_agent.add_message(\n                \"Task Description: \"\n                + subtask\n                + \" Instruction: \"\n                + subtask_info\n                + \"\\n\"\n                + \"Current Trajectory: \"\n                + \"\\n\\n\".join(self.planner_history)\n                + \"\\n\"\n            )\n            reflection = call_llm_safe(self.reflection_agent)\n            self.reflections.append(reflection)\n            self.reflection_agent.add_message(reflection)\n\n            logger.info(\"REFLECTION: %s\", reflection)\n\n        # Plan Generation\n        tree_input = agent.linearize_and_annotate_tree(obs)\n\n        self.remove_ids_from_history()\n\n        # Bash terminal message.\n        generator_message = (\n            (\n                f\"\\nYou may use the reflection on the previous trajectory: {reflection}\\n\"\n                if reflection\n                else \"\"\n            )\n            + f\"Accessibility Tree: {tree_input}\\n\"\n            f\"Text Buffer = 
[{','.join(agent.notes)}]. \"\n            f\"The current open applications are {agent.get_active_apps(obs)} and the active app is {agent.get_top_app(obs)}.\\n\"\n        )\n\n        print(\"ACTIVE APP IS: \", agent.get_top_app(obs))\n        # Only provide subinfo in the very first message to avoid over influence and redundancy\n        if self.turn_count == 0:\n            generator_message += f\"Remeber only complete the subtask: {subtask}\\n\"\n            generator_message += f\"You can use this extra information for completing the current subtask: {subtask_info}.\\n\"\n\n        logger.info(\"GENERATOR MESSAGE: %s\", generator_message)\n\n        self.generator_agent.add_message(\n            generator_message, image_content=obs[\"screenshot\"]\n        )\n\n        plan = call_llm_safe(self.generator_agent)\n        self.planner_history.append(plan)\n        logger.info(\"PLAN: %s\", plan)\n\n        self.generator_agent.add_message(plan)\n\n        # Calculate input and output tokens\n        input_tokens, output_tokens = calculate_tokens(self.generator_agent.messages)\n\n        # Set Cost based on GPT-4o\n        cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 1000)\n        self.cost_this_turn += cost\n        logger.info(\"EXECTUOR COST: %s\", self.cost_this_turn)\n\n        # Extract code block from the plan\n        plan_code = common_utils.parse_single_code_from_string(\n            plan.split(\"Grounded Action\")[-1]\n        )\n        plan_code = common_utils.sanitize_code(plan_code)\n        plan_code = common_utils.extract_first_agent_function(plan_code)\n        exec_code = eval(plan_code)\n\n        # If agent selects an element that was out of range, it should not be executed just send a WAIT command.\n        # TODO: should provide this as code feedback to the agent?\n        if agent.index_out_of_range_flag:\n            plan_code = \"agent.wait(1.0)\"\n            exec_code = eval(plan_code)\n            
agent.index_out_of_range_flag = False\n\n        executor_info = {\n            \"current_subtask\": subtask,\n            \"current_subtask_info\": subtask_info,\n            \"executor_plan\": plan,\n            \"linearized_accessibility_tree\": tree_input,\n            \"plan_code\": plan_code,\n            \"reflection\": reflection,\n            \"num_input_tokens_executor\": input_tokens,\n            \"num_output_tokens_executor\": output_tokens,\n            \"executor_cost\": cost,\n        }\n        self.turn_count += 1\n\n        self.tree_inputs.append(tree_input)\n        self.screenshot_inputs.append(obs[\"screenshot\"])\n\n        return executor_info, [exec_code]\n"
  },
  {
    "path": "gui_agents/s1/core/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s1/mllm/MultimodalAgent.py",
    "content": "# Author: Saaket Agashe\n# Date: 2021-09-15\n# License: MIT\n\nimport base64\nimport re\n\nfrom gui_agents.s1.mllm.MultimodalEngine import (\n    LMMEngineAnthropic,\n    LMMEngineAzureOpenAI,\n    LMMEngineOpenAI,\n    LMMEnginevLLM,\n)\n\ndata_type_map = {\n    \"openai\": {\"image_url\": \"image_url\"},\n    \"anthropic\": {\"image_url\": \"image\"},\n}\n\n\nclass LMMAgent:\n    def __init__(self, engine_params=None, system_prompt=None, engine=None):\n        if engine is None:\n            if engine_params is not None:\n                engine_type = engine_params.get(\"engine_type\")\n                if engine_type == \"openai\":\n                    self.engine = LMMEngineOpenAI(**engine_params)\n                elif engine_type == \"anthropic\":\n                    self.engine = LMMEngineAnthropic(**engine_params)\n                elif engine_type == \"azure\":\n                    self.engine = LMMEngineAzureOpenAI(**engine_params)\n                elif engine_type == \"vllm\":\n                    self.engine = LMMEnginevLLM(**engine_params)\n                else:\n                    raise ValueError(\"engine_type must be either 'openai' or 'azure'\")\n            else:\n                raise ValueError(\"engine_params must be provided\")\n        else:\n            self.engine = engine\n\n        self.messages = []  # Empty messages\n\n        if system_prompt:\n            self.add_system_prompt(system_prompt)\n        else:\n            self.add_system_prompt(\"You are a helpful assistant.\")\n\n    def encode_image(self, image_content):\n        # if image_content is a path to an image file, check type of the image_content to verify\n        if isinstance(image_content, str):\n            with open(image_content, \"rb\") as image_file:\n                return base64.b64encode(image_file.read()).decode(\"utf-8\")\n        else:\n            return base64.b64encode(image_content).decode(\"utf-8\")\n\n    def reset(\n        self,\n    
):\n\n        self.messages = [\n            {\n                \"role\": \"system\",\n                \"content\": [{\"type\": \"text\", \"text\": self.system_prompt}],\n            }\n        ]\n\n    def add_system_prompt(self, system_prompt):\n        self.system_prompt = system_prompt\n        if len(self.messages) > 0:\n            self.messages[0] = {\n                \"role\": \"system\",\n                \"content\": [{\"type\": \"text\", \"text\": self.system_prompt}],\n            }\n        else:\n            self.messages.append(\n                {\n                    \"role\": \"system\",\n                    \"content\": [{\"type\": \"text\", \"text\": self.system_prompt}],\n                }\n            )\n\n    def remove_message_at(self, index):\n        \"\"\"Remove a message at a given index\"\"\"\n        if index < len(self.messages):\n            self.messages.pop(index)\n\n    def replace_message_at(\n        self, index, text_content, image_content=None, image_detail=\"high\"\n    ):\n        \"\"\"Replace a message at a given index\"\"\"\n        if index < len(self.messages):\n            self.messages[index] = {\n                \"role\": self.messages[index][\"role\"],\n                \"content\": [{\"type\": \"text\", \"text\": text_content}],\n            }\n            if image_content:\n                base64_image = self.encode_image(image_content)\n                self.messages[index][\"content\"].append(\n                    {\n                        \"type\": \"image_url\",\n                        \"image_url\": {\n                            \"url\": f\"data:image/png;base64,{base64_image}\",\n                            \"detail\": image_detail,\n                        },\n                    }\n                )\n\n    def add_message(\n        self, text_content, image_content=None, role=None, image_detail=\"high\"\n    ):\n        \"\"\"Add a new message to the list of messages\"\"\"\n\n        # API-style inference 
from OpenAI and AzureOpenAI\n        if isinstance(self.engine, (LMMEngineOpenAI, LMMEngineAzureOpenAI)):\n            # infer role from previous message\n            if role != \"user\":\n                if self.messages[-1][\"role\"] == \"system\":\n                    role = \"user\"\n                elif self.messages[-1][\"role\"] == \"user\":\n                    role = \"assistant\"\n                elif self.messages[-1][\"role\"] == \"assistant\":\n                    role = \"user\"\n\n            message = {\n                \"role\": role,\n                \"content\": [{\"type\": \"text\", \"text\": text_content}],\n            }\n\n            if image_content:\n                # Check if image_content is a list or a single image\n                if isinstance(image_content, list):\n                    # If image_content is a list of images, loop through each image\n                    for image in image_content:\n                        base64_image = self.encode_image(image)\n                        message[\"content\"].append(\n                            {\n                                \"type\": \"image_url\",\n                                \"image_url\": {\n                                    \"url\": f\"data:image/png;base64,{base64_image}\",\n                                    \"detail\": image_detail,\n                                },\n                            }\n                        )\n                else:\n                    # If image_content is a single image, handle it directly\n                    base64_image = self.encode_image(image_content)\n                    message[\"content\"].append(\n                        {\n                            \"type\": \"image_url\",\n                            \"image_url\": {\n                                \"url\": f\"data:image/png;base64,{base64_image}\",\n                                \"detail\": image_detail,\n                            },\n                        }\n    
                )\n            self.messages.append(message)\n\n        # For API-style inference from Anthropic\n        elif isinstance(self.engine, LMMEngineAnthropic):\n            # infer role from previous message\n            if role != \"user\":\n                if self.messages[-1][\"role\"] == \"system\":\n                    role = \"user\"\n                elif self.messages[-1][\"role\"] == \"user\":\n                    role = \"assistant\"\n                elif self.messages[-1][\"role\"] == \"assistant\":\n                    role = \"user\"\n\n            message = {\n                \"role\": role,\n                \"content\": [{\"type\": \"text\", \"text\": text_content}],\n            }\n\n            if image_content:\n                # Check if image_content is a list or a single image\n                if isinstance(image_content, list):\n                    # If image_content is a list of images, loop through each image\n                    for image in image_content:\n                        base64_image = self.encode_image(image)\n                        message[\"content\"].append(\n                            {\n                                \"type\": \"image\",\n                                \"source\": {\n                                    \"type\": \"base64\",\n                                    \"media_type\": \"image/png\",\n                                    \"data\": base64_image,\n                                },\n                            }\n                        )\n                else:\n                    # If image_content is a single image, handle it directly\n                    base64_image = self.encode_image(image_content)\n                    message[\"content\"].append(\n                        {\n                            \"type\": \"image\",\n                            \"source\": {\n                                \"type\": \"base64\",\n                                \"media_type\": 
\"image/png\",\n                                \"data\": base64_image,\n                            },\n                        }\n                    )\n            self.messages.append(message)\n\n        # Locally hosted vLLM model inference\n        elif isinstance(self.engine, LMMEnginevLLM):\n            # infer role from previous message\n            if role != \"user\":\n                if self.messages[-1][\"role\"] == \"system\":\n                    role = \"user\"\n                elif self.messages[-1][\"role\"] == \"user\":\n                    role = \"assistant\"\n                elif self.messages[-1][\"role\"] == \"assistant\":\n                    role = \"user\"\n\n            message = {\n                \"role\": role,\n                \"content\": [{\"type\": \"text\", \"text\": text_content}],\n            }\n\n            if image_content:\n                # Check if image_content is a list or a single image\n                if isinstance(image_content, list):\n                    # If image_content is a list of images, loop through each image\n                    for image in image_content:\n                        base64_image = self.encode_image(image)\n                        message[\"content\"].append(\n                            {\n                                \"type\": \"image\",\n                                \"image\": f\"data:image;base64,{base64_image}\",\n                            }\n                        )\n                else:\n                    # If image_content is a single image, handle it directly\n                    base64_image = self.encode_image(image_content)\n                    message[\"content\"].append(\n                        {\"type\": \"image\", \"image\": f\"data:image;base64,{base64_image}\"}\n                    )\n            self.messages.append(message)\n\n    def get_response(\n        self,\n        user_message=None,\n        image=None,\n        messages=None,\n        
temperature=0.0,\n        max_new_tokens=None,\n        **kwargs,\n    ):\n        \"\"\"Generate the next response based on previous messages\"\"\"\n        if messages is None:\n            messages = self.messages\n        if user_message:\n            messages.append(\n                {\"role\": \"user\", \"content\": [{\"type\": \"text\", \"text\": user_message}]}\n            )\n\n        return self.engine.generate(\n            messages,\n            temperature=temperature,\n            max_new_tokens=max_new_tokens,\n            **kwargs,\n        )\n"
  },
  {
    "path": "gui_agents/s1/mllm/MultimodalEngine.py",
    "content": "# Author: Saaket Agashe\n# Date: 2021-09-15\n# License: MIT\n\nimport os\nimport re\nfrom io import BytesIO\n\nimport backoff\nimport numpy as np\nimport openai\nimport requests\nfrom anthropic import Anthropic\nfrom openai import APIConnectionError, APIError, AzureOpenAI, OpenAI, RateLimitError\nfrom PIL import Image\n\n# TODO: Import only if module exists, else ignore\n# from llava.model.builder import load_pretrained_model\n# from llava.mm_utils import (\n#     process_images,\n#     tokenizer_image_token,\n#     get_model_name_from_path,\n#     KeywordsStoppingCriteria,\n# )\n# from llava.constants import (\n#     IMAGE_TOKEN_INDEX,\n#     DEFAULT_IMAGE_TOKEN,\n#     DEFAULT_IM_START_TOKEN,\n#     DEFAULT_IM_END_TOKEN,\n#     IMAGE_PLACEHOLDER,\n# )\n# from llava.conversation import conv_templates, SeparatorStyle\n\n\n# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n\n\ndef image_parser(args):\n    out = args.image_file.split(args.sep)\n    return out\n\n\ndef load_image(image_file):\n    if image_file.startswith(\"http\") or image_file.startswith(\"https\"):\n        response = requests.get(image_file)\n        image = Image.open(BytesIO(response.content)).convert(\"RGB\")\n    else:\n        image = Image.open(image_file).convert(\"RGB\")\n    return image\n\n\ndef load_images(image_files):\n    out = []\n    for image_file in image_files:\n        image = load_image(image_file)\n        out.append(image)\n    return out\n\n\nclass LMMEngine:\n    pass\n\n\nclass LMMEngineOpenAI(LMMEngine):\n    def __init__(self, api_key=None, model=None, rate_limit=-1, **kwargs):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n\n        api_key = api_key or os.getenv(\"OPENAI_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named OPENAI_API_KEY\"\n  
          )\n\n        self.api_key = api_key\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n\n        self.llm_client = OpenAI(api_key=self.api_key)\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        \"\"\"Generate the next message based on previous messages\"\"\"\n        return (\n            self.llm_client.chat.completions.create(\n                model=self.model,\n                messages=messages,\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temperature,\n                **kwargs,\n            )\n            .choices[0]\n            .message.content\n        )\n\n\nclass LMMEngineAnthropic(LMMEngine):\n    def __init__(self, api_key=None, model=None, **kwargs):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n\n        api_key = api_key or os.getenv(\"ANTHROPIC_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named ANTHROPIC_API_KEY\"\n            )\n\n        self.api_key = api_key\n\n        self.llm_client = Anthropic(api_key=self.api_key)\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        \"\"\"Generate the next message based on previous messages\"\"\"\n        return (\n            self.llm_client.messages.create(\n                system=messages[0][\"content\"][0][\"text\"],\n                model=self.model,\n                messages=messages[1:],\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temperature,\n               
 **kwargs,\n            )\n            .content[0]\n            .text\n        )\n\n\nclass OpenAIEmbeddingEngine(LMMEngine):\n    def __init__(\n        self,\n        api_key=None,\n        rate_limit: int = -1,\n        display_cost: bool = True,\n    ):\n        \"\"\"Init an OpenAI Embedding engine\n\n        Args:\n            api_key (_type_, optional): Auth key from OpenAI. Defaults to None.\n            rate_limit (int, optional): Max number of requests per minute. Defaults to -1.\n            display_cost (bool, optional): Display cost of API call. Defaults to True.\n        \"\"\"\n        self.model = \"text-embedding-3-small\"\n        self.cost_per_thousand_tokens = 0.00002\n\n        api_key = api_key or os.getenv(\"OPENAI_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named OPENAI_API_KEY\"\n            )\n        self.api_key = api_key\n        self.display_cost = display_cost\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n\n    @backoff.on_exception(\n        backoff.expo,\n        (\n            APIError,\n            RateLimitError,\n            APIConnectionError,\n        ),\n    )\n    def get_embeddings(self, text: str) -> np.ndarray:\n        client = OpenAI(api_key=self.api_key)\n        response = client.embeddings.create(model=self.model, input=text)\n        if self.display_cost:\n            total_tokens = response.usage.total_tokens\n            cost = self.cost_per_thousand_tokens * total_tokens / 1000\n            # print(f\"Total cost for this embedding API call: {cost}\")\n        return np.array([data.embedding for data in response.data])\n\n\nclass LMMEngineAzureOpenAI(LMMEngine):\n    def __init__(\n        self,\n        api_key=None,\n        azure_endpoint=None,\n        model=None,\n        api_version=None,\n        rate_limit=-1,\n        **kwargs\n    
):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n\n        assert api_version is not None, \"api_version must be provided\"\n        self.api_version = api_version\n\n        api_key = api_key or os.getenv(\"AZURE_OPENAI_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named AZURE_OPENAI_API_KEY\"\n            )\n\n        self.api_key = api_key\n\n        azure_endpoint = azure_endpoint or os.getenv(\"AZURE_OPENAI_API_BASE\")\n        if azure_endpoint is None:\n            raise ValueError(\n                \"An Azure API endpoint needs to be provided in either the azure_endpoint parameter or as an environment variable named AZURE_OPENAI_API_BASE\"\n            )\n\n        self.azure_endpoint = azure_endpoint\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n\n        self.llm_client = AzureOpenAI(\n            azure_endpoint=self.azure_endpoint,\n            api_key=self.api_key,\n            api_version=self.api_version,\n        )\n        self.cost = 0.0\n\n    # @backoff.on_exception(backoff.expo, (APIConnectionError, APIError, RateLimitError), max_tries=10)\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        \"\"\"Generate the next message based on previous messages\"\"\"\n        completion = self.llm_client.chat.completions.create(\n            model=self.model,\n            messages=messages,\n            max_tokens=max_new_tokens if max_new_tokens else 4096,\n            temperature=temperature,\n            **kwargs,\n        )\n        total_tokens = completion.usage.total_tokens\n        self.cost += 0.02 * ((total_tokens + 500) / 1000)\n        return completion.choices[0].message.content\n\n\nclass LMMEnginevLLM(LMMEngine):\n    def __init__(\n        self, base_url=None, api_key=None, model=None, 
rate_limit=-1, **kwargs\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.api_key = api_key\n\n        self.base_url = base_url or os.getenv(\"vLLM_ENDPOINT_URL\")\n        if self.base_url is None:\n            raise ValueError(\n                \"An endpoint URL needs to be provided in either the endpoint_url parameter or as an environment variable named vLLM_ENDPOINT_URL\"\n            )\n\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n\n        self.llm_client = OpenAI(base_url=self.base_url, api_key=self.api_key)\n\n    # @backoff.on_exception(backoff.expo, (APIConnectionError, APIError, RateLimitError), max_tries=10)\n    # TODO: Default params chosen for the Qwen model\n    def generate(\n        self,\n        messages,\n        temperature=0.0,\n        top_p=0.8,\n        repetition_penalty=1.05,\n        max_new_tokens=512,\n        **kwargs\n    ):\n        \"\"\"Generate the next message based on previous messages\"\"\"\n        completion = self.llm_client.chat.completions.create(\n            model=self.model,\n            messages=messages,\n            max_tokens=max_new_tokens if max_new_tokens else 4096,\n            temperature=temperature,\n            top_p=top_p,\n            extra_body={\"repetition_penalty\": repetition_penalty},\n        )\n        return completion.choices[0].message.content\n"
  },
  {
    "path": "gui_agents/s1/mllm/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s1/utils/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s1/utils/common_utils.py",
    "content": "import base64\nimport io\nimport json\nimport os\nimport pickle\nimport re\nimport tempfile\nimport time\nimport xml.etree.ElementTree as ET\nfrom io import BytesIO\nfrom typing import Dict, List, Tuple, Union\nfrom xml.etree.ElementTree import Element\n\nimport numpy as np\nimport tiktoken\nfrom PIL import Image, ImageDraw, ImageFont\nfrom pydantic import BaseModel, ValidationError\n\n\ndef find_leaf_nodes(xlm_file_str):\n    if not xlm_file_str:\n        return []\n\n    root = ET.fromstring(xlm_file_str)\n\n    # Recursive function to traverse the XML tree and collect leaf nodes\n    def collect_leaf_nodes(node, leaf_nodes):\n        # If the node has no children, it is a leaf node, add it to the list\n        if not list(node):\n            leaf_nodes.append(node)\n        # If the node has children, recurse on each child\n        for child in node:\n            collect_leaf_nodes(child, leaf_nodes)\n\n    # List to hold all leaf nodes\n    leaf_nodes = []\n    collect_leaf_nodes(root, leaf_nodes)\n    return leaf_nodes\n\n\nstate_ns = \"uri:deskat:state.at-spi.gnome.org\"\ncomponent_ns = \"uri:deskat:component.at-spi.gnome.org\"\n\n\nclass Node(BaseModel):\n    name: str\n    info: str\n\n\nclass Dag(BaseModel):\n    nodes: List[Node]\n    edges: List[List[Node]]\n\n\nNUM_IMAGE_TOKEN = 1105  # Value set of screen of size 1920x1080 for openai vision\n\n\ndef call_llm_safe(agent) -> Union[str, Dag]:\n    # Retry if fails\n    max_retries = 3  # Set the maximum number of retries\n    attempt = 0\n    response = \"\"\n    while attempt < max_retries:\n        try:\n            response = agent.get_response()\n            break  # If successful, break out of the loop\n        except Exception as e:\n            attempt += 1\n            print(f\"Attempt {attempt} failed: {e}\")\n            if attempt == max_retries:\n                print(\"Max retries reached. 
Handling failure.\")\n        time.sleep(1.0)\n    return response\n\n\ndef calculate_tokens(messages, num_image_token=NUM_IMAGE_TOKEN) -> Tuple[int, int]:\n\n    num_input_images = 0\n    output_message = messages[-1]\n\n    input_message = messages[:-1]\n\n    input_string = \"\"\"\"\"\"\n    for message in input_message:\n        input_string += message[\"content\"][0][\"text\"] + \"\\n\"\n        if len(message[\"content\"]) > 1:\n            num_input_images += 1\n\n    input_text_tokens = get_input_token_length(input_string)\n\n    input_image_tokens = num_image_token * num_input_images\n\n    output_tokens = get_input_token_length(output_message[\"content\"][0][\"text\"])\n\n    return (input_text_tokens + input_image_tokens), output_tokens\n\n\ndef judge_node(node: Element, platform=\"ubuntu\", check_image=False) -> bool:\n    keeps: bool = (\n        node.tag.startswith(\"document\")\n        or node.tag.endswith(\"item\")\n        or node.tag.endswith(\"button\")\n        or node.tag.endswith(\"heading\")\n        or node.tag.endswith(\"label\")\n        or node.tag.endswith(\"scrollbar\")\n        or node.tag.endswith(\"searchbox\")\n        or node.tag.endswith(\"textbox\")\n        or node.tag.endswith(\"link\")\n        or node.tag.endswith(\"tabelement\")\n        or node.tag.endswith(\"textfield\")\n        or node.tag.endswith(\"textarea\")\n        or node.tag.endswith(\"menu\")\n        or node.tag.endswith(\"menu-item\")\n        or node.tag\n        in {\n            \"alert\",\n            \"canvas\",\n            \"check-box\",\n            \"combo-box\",\n            \"entry\",\n            \"icon\",\n            \"image\",\n            \"paragraph\",\n            \"scroll-bar\",\n            \"section\",\n            \"slider\",\n            \"static\",\n            \"table-cell\",\n            \"terminal\",\n            \"text\",\n            \"netuiribbontab\",\n            \"start\",\n            \"trayclockwclass\",\n            
\"traydummysearchcontrol\",\n            \"uiimage\",\n            \"uiproperty\",\n            \"uiribboncommandbar\",\n        }\n    )\n\n    keeps = (\n        keeps\n        and (\n            platform == \"ubuntu\"\n            and node.get(\"{{{:}}}showing\".format(state_ns), \"false\") == \"true\"\n            and node.get(\"{{{:}}}visible\".format(state_ns), \"false\") == \"true\"\n            or platform == \"windows\"\n            and node.get(\"{{{:}}}visible\".format(state_ns), \"false\") == \"true\"\n        )\n        and (\n            node.get(\"name\", \"\") != \"\"\n            or node.text is not None\n            and len(node.text) > 0\n            or check_image\n            and node.get(\"image\", \"false\") == \"true\"\n        )\n    )\n    # and (node.get(\"{{{:}}}enabled\".format(state_ns), \"false\") == \"true\" \\\n    #      or node.get(\"{{{:}}}editable\".format(state_ns), \"false\") == \"true\" \\\n    #      or node.get(\"{{{:}}}expandable\".format(state_ns), \"false\") == \"true\" \\\n    #      or node.get(\"{{{:}}}checkable\".format(state_ns), \"false\") == \"true\"\n    #      ) \\\n\n    coordinates: Tuple[int, int] = eval(\n        node.get(\"{{{:}}}screencoord\".format(component_ns), \"(-1, -1)\")\n    )\n    sizes: Tuple[int, int] = eval(\n        node.get(\"{{{:}}}size\".format(component_ns), \"(-1, -1)\")\n    )\n    keeps = (\n        keeps\n        and coordinates[0] >= 0\n        and coordinates[1] >= 0\n        and sizes[0] > 0\n        and sizes[1] > 0\n    )\n    return keeps\n\n\ndef filter_nodes(root: Element, platform=\"ubuntu\", check_image=False):\n    filtered_nodes = []\n    all_nodes = []\n    for node in root.iter():\n        all_nodes.append(node)\n\n    for node in root.iter():\n        if judge_node(node, platform, check_image):\n            filtered_nodes.append(node)\n\n    return filtered_nodes\n\n\ndef draw_bounding_boxes(nodes, image_file_content, down_sampling_ratio=1.0):\n    # Load the screenshot 
image\n    image_stream = io.BytesIO(image_file_content)\n    image = Image.open(image_stream)\n    if float(down_sampling_ratio) != 1.0:\n        image = image.resize(\n            (\n                int(image.size[0] * down_sampling_ratio),\n                int(image.size[1] * down_sampling_ratio),\n            )\n        )\n    draw = ImageDraw.Draw(image)\n    marks = []\n    drew_nodes = []\n    text_informations: List[str] = [\"index\\ttag\\tname\\ttext\"]\n\n    try:\n        # Adjust the path to the font file you have or use a default one\n        font = ImageFont.truetype(\"arial.ttf\", 15)\n    except IOError:\n        # Fallback to a basic font if the specified font can't be loaded\n        font = ImageFont.load_default()\n\n    index = 1\n\n    # Loop over all the visible nodes and draw their bounding boxes\n    for _node in nodes:\n        coords_str = _node.attrib.get(\n            \"{uri:deskat:component.at-spi.gnome.org}screencoord\"\n        )\n        size_str = _node.attrib.get(\"{uri:deskat:component.at-spi.gnome.org}size\")\n\n        if coords_str and size_str:\n            try:\n                # Parse the coordinates and size from the strings\n                coords = tuple(map(int, coords_str.strip(\"()\").split(\", \")))\n                size = tuple(map(int, size_str.strip(\"()\").split(\", \")))\n\n                import copy\n\n                original_coords = copy.deepcopy(coords)\n                original_size = copy.deepcopy(size)\n\n                if float(down_sampling_ratio) != 1.0:\n                    # Downsample the coordinates and size\n                    coords = tuple(int(coord * down_sampling_ratio) for coord in coords)\n                    size = tuple(int(s * down_sampling_ratio) for s in size)\n\n                # Check for negative sizes\n                if size[0] <= 0 or size[1] <= 0:\n                    raise ValueError(f\"Size must be positive, got: {size}\")\n\n                # Calculate the bottom-right 
corner of the bounding box\n                bottom_right = (coords[0] + size[0], coords[1] + size[1])\n\n                # Check that bottom_right > coords (x1 >= x0, y1 >= y0)\n                if bottom_right[0] < coords[0] or bottom_right[1] < coords[1]:\n                    raise ValueError(\n                        f\"Invalid coordinates or size, coords: {coords}, size: {size}\"\n                    )\n\n                # Check if the area only contains one color\n                cropped_image = image.crop((*coords, *bottom_right))\n                if len(set(list(cropped_image.getdata()))) == 1:\n                    continue\n\n                # Draw rectangle on image\n                draw.rectangle([coords, bottom_right], outline=\"red\", width=1)\n\n                # Draw index number at the bottom left of the bounding box with black background\n                text_position = (\n                    coords[0],\n                    bottom_right[1],\n                )  # Adjust Y to be above the bottom right\n                text_bbox: Tuple[int, int, int, int] = draw.textbbox(\n                    text_position, str(index), font=font, anchor=\"lb\"\n                )\n                # offset: int = bottom_right[1]-text_bbox[3]\n                # text_bbox = (text_bbox[0], text_bbox[1]+offset, text_bbox[2], text_bbox[3]+offset)\n\n                # draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black')\n                draw.rectangle(text_bbox, fill=\"black\")\n                draw.text(\n                    text_position, str(index), font=font, anchor=\"lb\", fill=\"white\"\n                )\n\n                # each mark is an x, y, w, h tuple\n                marks.append(\n                    [\n                        original_coords[0],\n                        original_coords[1],\n                        original_size[0],\n                        original_size[1],\n                    ]\n                )\n      
          drew_nodes.append(_node)\n\n                if _node.text:\n                    node_text = (\n                        _node.text\n                        if '\"' not in _node.text\n                        else '\"{:}\"'.format(_node.text.replace('\"', '\"\"'))\n                    )\n                elif _node.get(\n                    \"{uri:deskat:uia.windows.microsoft.org}class\", \"\"\n                ).endswith(\"EditWrapper\") and _node.get(\n                    \"{uri:deskat:value.at-spi.gnome.org}value\"\n                ):\n                    node_text: str = _node.get(\n                        \"{uri:deskat:value.at-spi.gnome.org}value\"\n                    )\n                    node_text = (\n                        node_text\n                        if '\"' not in node_text\n                        else '\"{:}\"'.format(node_text.replace('\"', '\"\"'))\n                    )\n                else:\n                    node_text = '\"\"'\n                text_information: str = \"{:d}\\t{:}\\t{:}\\t{:}\".format(\n                    index, _node.tag, _node.get(\"name\", \"\"), node_text\n                )\n                text_informations.append(text_information)\n\n                index += 1\n\n            except ValueError:\n                pass\n\n    output_image_stream = io.BytesIO()\n    image.save(output_image_stream, format=\"PNG\")\n    image_content = output_image_stream.getvalue()\n\n    return marks, drew_nodes, \"\\n\".join(text_informations), image_content\n\n\ndef print_nodes_with_indent(nodes, indent=0):\n    for node in nodes:\n        print(\" \" * indent, node.tag, node.attrib)\n        print_nodes_with_indent(node, indent + 2)\n\n\n# Code based on https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/agent.py\n\n\ndef encode_image(image_content):\n    return base64.b64encode(image_content).decode(\"utf-8\")\n\n\ndef encoded_img_to_pil_img(data_str):\n    base64_str = data_str.replace(\"data:image/png;base64,\", 
\"\")\n    image_data = base64.b64decode(base64_str)\n    image = Image.open(BytesIO(image_data))\n\n    return image\n\n\ndef save_to_tmp_img_file(data_str):\n    base64_str = data_str.replace(\"data:image/png;base64,\", \"\")\n    image_data = base64.b64decode(base64_str)\n    image = Image.open(BytesIO(image_data))\n\n    tmp_img_path = os.path.join(tempfile.mkdtemp(), \"tmp_img.png\")\n    image.save(tmp_img_path)\n\n    return tmp_img_path\n\n\ndef linearize_accessibility_tree(accessibility_tree, platform=\"ubuntu\", tag=False):\n    # leaf_nodes = find_leaf_nodes(accessibility_tree)\n    filtered_nodes = filter_nodes(ET.fromstring(accessibility_tree), platform)\n    linearized_accessibility_tree = [\n        \"tag\\tname\\ttext\\tposition (top-left x&y)\\tsize (w&h)\"\n    ]\n    # Linearize the accessibility tree nodes into a table format\n\n    for node in filtered_nodes:\n        # linearized_accessibility_tree += node.tag + \"\\t\"\n        # linearized_accessibility_tree += node.attrib.get('name') + \"\\t\"\n        if node.text:\n            text = (\n                node.text\n                if '\"' not in node.text\n                else '\"{:}\"'.format(node.text.replace('\"', '\"\"'))\n            )\n        elif node.get(\"{uri:deskat:uia.windows.microsoft.org}class\", \"\").endswith(\n            \"EditWrapper\"\n        ) and node.get(\"{uri:deskat:value.at-spi.gnome.org}value\"):\n            text: str = node.get(\"{uri:deskat:value.at-spi.gnome.org}value\")\n            text = text if '\"' not in text else '\"{:}\"'.format(text.replace('\"', '\"\"'))\n        else:\n            text = '\"\"'\n        # linearized_accessibility_tree += node.attrib.get(\n        # , \"\") + \"\\t\"\n        # linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size', \"\") + \"\\n\"\n        linearized_accessibility_tree.append(\n            \"{:}\\t{:}\\t{:}\\t{:}\\t{:}\".format(\n                node.tag,\n                
node.get(\"name\", \"\"),\n                text,\n                node.get(\"{uri:deskat:component.at-spi.gnome.org}screencoord\", \"\"),\n                node.get(\"{uri:deskat:component.at-spi.gnome.org}size\", \"\"),\n            )\n        )\n    if tag:\n        linearized_accessibility_tree = tag_accessibility_tree(\n            linearized_accessibility_tree\n        )\n\n    return \"\\n\".join(linearized_accessibility_tree)\n\n\ndef tag_accessibility_tree(linear_accessibility_tree):\n    # Add 'id' to the first line\n    linear_accessibility_tree[0] = \"id\\t\" + linear_accessibility_tree[0]\n\n    # Start idx from 1 to correctly index into the list\n    for idx in range(1, len(linear_accessibility_tree)):\n        line = linear_accessibility_tree[idx]\n        linear_accessibility_tree[idx] = f\"[{str(idx)}]\\t\" + line\n\n    return linear_accessibility_tree\n\n\ndef tag_screenshot(screenshot, accessibility_tree, platform=\"ubuntu\"):\n    nodes = filter_nodes(\n        ET.fromstring(accessibility_tree), platform=platform, check_image=True\n    )\n    # Make tag screenshot\n    marks, drew_nodes, element_list, tagged_screenshot = draw_bounding_boxes(\n        nodes, screenshot\n    )\n\n    return marks, drew_nodes, tagged_screenshot, element_list\n\n\ndef parse_dag(text):\n    pattern = r\"<json>(.*?)</json>\"\n    match = re.search(pattern, text, re.DOTALL)\n    if match:\n        json_str = match.group(1)\n        try:\n            json_data = json.loads(json_str)\n            return Dag(**json_data[\"dag\"])\n        except json.JSONDecodeError:\n            print(\"Error: Invalid JSON\")\n            return None\n        except KeyError:\n            print(\"Error: 'dag' key not found in JSON\")\n            return None\n        except ValidationError as e:\n            print(f\"Error: Invalid data structure - {e}\")\n            return None\n    else:\n        print(\"Error: JSON not found\")\n        return None\n\n\ndef 
parse_subinfo(subinfo_string):\n    matches = re.findall(r\"```json\\s+(.*?)\\s+```\", subinfo_string, re.DOTALL)\n    if matches:\n        # Assuming there's only one match, parse the JSON string into a dictionary\n        try:\n            subinfo_dict = json.loads(matches[0])\n            return subinfo_dict\n        except json.JSONDecodeError as e:\n            print(f\"Failed to parse JSON: {e}\")\n            return {\"error\": e}\n    else:\n        return {\n            \"error\": \"Subinfo generated in incorrect format. Please use the correct format.\"\n        }\n\n\ndef parse_actions_from_string(input_string):\n    if input_string.strip() in [\"WAIT\", \"DONE\", \"FAIL\"]:\n        return [input_string.strip()]\n    # Search for a JSON string within the input string\n    actions = []\n    matches = re.findall(r\"```json\\s+(.*?)\\s+```\", input_string, re.DOTALL)\n    if matches:\n        # Assuming there's only one match, parse the JSON string into a dictionary\n        try:\n            for match in matches:\n                action_dict = json.loads(match)\n                actions.append(action_dict)\n            return actions\n        except json.JSONDecodeError as e:\n            return f\"Failed to parse JSON: {e}\"\n    else:\n        matches = re.findall(r\"```\\s+(.*?)\\s+```\", input_string, re.DOTALL)\n        if matches:\n            # Assuming there's only one match, parse the JSON string into a dictionary\n            try:\n                for match in matches:\n                    action_dict = json.loads(match)\n                    actions.append(action_dict)\n                return actions\n            except json.JSONDecodeError as e:\n                return f\"Failed to parse JSON: {e}\"\n        else:\n            try:\n                action_dict = json.loads(input_string)\n                return [action_dict]\n            except json.JSONDecodeError:\n                raise ValueError(\"Invalid response format: \" + 
input_string)\n\n\ndef parse_fixed_action_from_string(input_string):\n    pattern = r\"```(?:\\w+\\s+)?(.*?)```\"\n    matches = re.findall(pattern, input_string)\n    if matches:\n        # Assuming there's only one match, parse the JSON string into a dictionary\n        try:\n            for match in matches:\n                action = match\n            return action\n        except json.JSONDecodeError as e:\n            return f\"Failed to parse JSON: {e}\"\n\n    return \"agent.wait()\"\n\n\ndef parse_code_from_string(input_string):\n    input_string = \"\\n\".join(\n        [line.strip() for line in input_string.split(\";\") if line.strip()]\n    )\n    if input_string.strip() in [\"WAIT\", \"DONE\", \"FAIL\"]:\n        return [input_string.strip()]\n\n    # This regular expression will match both ```code``` and ```python code```\n    # and capture the `code` part. It uses a non-greedy match for the content inside.\n    pattern = r\"```(?:\\w+\\s+)?(.*?)```\"\n    # Find all non-overlapping matches in the string\n    matches = re.findall(pattern, input_string, re.DOTALL)\n\n    # The regex above captures the content inside the triple backticks.\n    # The `re.DOTALL` flag allows the dot `.` to match newline characters as well,\n    # so the code inside backticks can span multiple lines.\n\n    # matches now contains all the captured code snippets\n\n    codes = []\n\n    for match in matches:\n        match = match.strip()\n        commands = [\n            \"WAIT\",\n            \"DONE\",\n            \"FAIL\",\n        ]  # fixme: updates this part when we have more commands\n\n        if match in commands:\n            codes.append(match.strip())\n        elif match.split(\"\\n\")[-1] in commands:\n            if len(match.split(\"\\n\")) > 1:\n                codes.append(\"\\n\".join(match.split(\"\\n\")[:-1]))\n            codes.append(match.split(\"\\n\")[-1])\n        else:\n            codes.append(match)\n\n    return codes\n\n\ndef 
parse_single_code_from_string(input_string):\n    input_string = input_string.strip()\n    if input_string.strip() in [\"WAIT\", \"DONE\", \"FAIL\"]:\n        return input_string.strip()\n\n    # This regular expression will match both ```code``` and ```python code```\n    # and capture the `code` part. It uses a non-greedy match for the content inside.\n    pattern = r\"```(?:\\w+\\s+)?(.*?)```\"\n    # Find all non-overlapping matches in the string\n    matches = re.findall(pattern, input_string, re.DOTALL)\n\n    # The regex above captures the content inside the triple backticks.\n    # The `re.DOTALL` flag allows the dot `.` to match newline characters as well,\n    # so the code inside backticks can span multiple lines.\n\n    # matches now contains all the captured code snippets\n\n    codes = []\n\n    for match in matches:\n        match = match.strip()\n        commands = [\n            \"WAIT\",\n            \"DONE\",\n            \"FAIL\",\n        ]  # fixme: updates this part when we have more commands\n\n        if match in commands:\n            codes.append(match.strip())\n        elif match.split(\"\\n\")[-1] in commands:\n            if len(match.split(\"\\n\")) > 1:\n                codes.append(\"\\n\".join(match.split(\"\\n\")[:-1]))\n            codes.append(match.split(\"\\n\")[-1])\n        else:\n            codes.append(match)\n\n    return codes[0]\n\n\ndef parse_action_from_fixed_code(action_string, linearized_accessibility_tree):\n\n    import re\n\n    def parse_action_from_agent_code(action_str):\n        # First, extract the code block within triple backticks\n        code_block_pattern = r\"```(.*?)```\"\n        code_block_match = re.search(code_block_pattern, action_str, re.DOTALL)\n\n        if not code_block_match:\n            raise ValueError(\"No code block found\")\n\n        code_block = code_block_match.group(1).strip()\n\n        # Define a regex pattern to extract the action type and parameters\n        action_pattern = 
r\"agent\\.(\\w+)\\((.*?)\\)\"\n        match = re.match(action_pattern, code_block, re.IGNORECASE)\n\n        if match:\n            action_type = match.group(1)\n            params_str = match.group(2)\n\n            # Split the parameters by comma and strip any surrounding whitespace or quotes\n            params = [\n                param.strip().strip('\"').strip(\"'\") for param in params_str.split(\",\")\n            ]\n\n            # Convert numeric parameters to integers\n            for i in range(len(params)):\n                try:\n                    params[i] = int(params[i])\n                except ValueError:\n                    pass\n\n            return action_type, params\n        else:\n            raise ValueError(\"Invalid action string format\")\n\n    parsed_action = parse_action_from_agent_code(action_string)\n    action_type, params = parsed_action\n    code = \"\"\n\n    def get_position_from_tree(element_id):\n        element = linearized_accessibility_tree[element_id]\n        position_str, size_str = element.split(\"\\t\")[-2].replace(\"(\", \"\").replace(\n            \")\", \"\"\n        ), element.split(\"\\t\")[-1].replace(\"(\", \"\").replace(\")\", \"\")\n        top_x_str, top_y_str = position_str.split(\",\")\n        top_x, top_y = int(top_x_str.strip()), int(top_y_str.strip())\n        size_x_str, size_y_str = size_str.split(\",\")\n        size_x, size_y = int(size_x_str.strip()), int(size_y_str.strip())\n        centroid_x, centroid_y = top_x + size_x // 2, top_y + size_y // 2\n        return centroid_x, centroid_y\n\n    if action_type == \"left_click_element_by_id\":\n        element_id = int(params[0])\n        centroid_x, centroid_y = get_position_from_tree(element_id)\n        code = f\"\"\"position = ({centroid_x}, {centroid_y}); pyautogui.click(position)\n        \"\"\"\n\n    elif action_type == \"right_click_element_by_id\":\n        element_id = int(params[0])\n        centroid_x, centroid_y = 
get_position_from_tree(element_id)\n        code = f\"\"\"\n        position = ({centroid_x}, {centroid_y}); pyautogui.click(position, button='right')\n        \"\"\"\n\n    elif action_type == \"hover_over_element_by_id\":\n        element_id = int(params[0])\n        centroid_x, centroid_y = get_position_from_tree(element_id)\n        code = (\n            f\"\"\"position = ({centroid_x}, {centroid_y}); pyautogui.moveTo(position)\"\"\"\n        )\n\n    elif action_type == \"type_write_element_by_id\":\n        element_id = int(params[0])\n        text = params[1]\n        centroid_x, centroid_y = get_position_from_tree(element_id)\n        code = f\"\"\"\n        position = ({centroid_x}, {centroid_y}); pyautogui.click(position); time.sleep(0.75); pyautogui.typewrite(\"{text}\")\"\"\"\n\n    elif action_type == \"press_key_combinations\":\n        keys = params\n        keys_str = '\", \"'.join(keys)\n        code = f\"\"\"\n        pyautogui.hotkey(\"{keys_str}\")\n        \"\"\"\n\n    elif action_type == \"wait\":\n        code = \"\"\"WAIT\"\"\"\n\n    elif action_type == \"done\":\n        code = \"\"\"DONE\"\"\"\n\n    elif action_type == \"fail\":\n        code = \"FAIL\"\n\n    return [code.strip()]\n\n\ndef parse_code_from_som_string(input_string, masks):\n    # parse the output string by masks\n    tag_vars = \"\"\n    for i, mask in enumerate(masks):\n        x, y, w, h = mask\n        tag_vars += (\n            \"tag_\"\n            + str(i + 1)\n            + \"=\"\n            + \"({}, {})\".format(int(x + w // 2), int(y + h // 2))\n        )\n        tag_vars += \"\\n\"\n\n    actions = parse_code_from_string(input_string)\n\n    for i, action in enumerate(actions):\n        if action.strip() in [\"WAIT\", \"DONE\", \"FAIL\"]:\n            pass\n        else:\n            action = tag_vars + action\n            actions[i] = action\n\n    return actions\n\n\ndef box_iou(boxes1: np.ndarray, boxes2: np.ndarray) -> np.ndarray:\n    \"\"\"\n    Fast 
vectorized IOU implementation using only NumPy\n    boxes1: [N, 4] array of boxes\n    boxes2: [M, 4] array of boxes\n    Returns: [N, M] array of IOU values\n    \"\"\"\n    # Calculate areas of boxes1\n    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])\n\n    # Calculate areas of boxes2\n    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])\n\n    # Get intersections using broadcasting\n    lt = np.maximum(boxes1[:, None, :2], boxes2[None, :, :2])  # [N,M,2]\n    rb = np.minimum(boxes1[:, None, 2:], boxes2[None, :, 2:])  # [N,M,2]\n\n    # Calculate intersection areas\n    wh = np.clip(rb - lt, 0, None)  # [N,M,2]\n    intersection = wh[:, :, 0] * wh[:, :, 1]  # [N,M]\n\n    # Calculate union areas\n    union = area1[:, None] + area2[None, :] - intersection\n\n    # Calculate IOU\n    iou = np.where(union > 0, intersection / union, 0)\n    return iou\n\n\ndef calculate_iou(rect1, rect2):\n    \"\"\"\n    Calculate the Intersection over Union (IoU) of two rectangles using numpy.\n\n    Parameters:\n    rect1, rect2: Tuples containing the coordinates of the rectangles in the form (x_min, y_min, x_max, y_max)\n\n    Returns:\n    IoU: Intersection over Union value\n    \"\"\"\n    # Convert the coordinates to tensors\n    box1 = np.array([rect1], dtype=np.float32)\n    box2 = np.array([rect2], dtype=np.float32)\n\n    # Calculate IoU using numpy\n    iou = box_iou(box1, box2)\n\n    return iou\n\n\ndef text_cvt_orc_format_paddle(paddle_result):\n    texts = []\n    print(\"paddle_result: \", paddle_result)\n    for i, line in enumerate(paddle_result[0]):\n        points = np.array(line[0])\n        print(\"points: \", points)\n        location = {\n            \"left\": int(min(points[:, 0])),\n            \"top\": int(min(points[:, 1])),\n            \"right\": int(max(points[:, 0])),\n            \"bottom\": int(max(points[:, 1])),\n        }\n        print(\"location: \", location)\n        content = line[1][0]\n        
texts.append((i, content, location))\n    return texts\n\n\ndef trim_accessibility_tree(linearized_accessibility_tree, max_tokens):\n    enc = tiktoken.encoding_for_model(\"gpt-4\")\n    tokens = enc.encode(linearized_accessibility_tree)\n    if len(tokens) > max_tokens:\n        print(\"MAX TOKEN LENGTH OF ACCESSIBILITY TREE EXCEEDED\")\n        linearized_accessibility_tree = enc.decode(tokens[:max_tokens])\n        linearized_accessibility_tree += \"[...]\\n\"\n    return linearized_accessibility_tree\n\n\ndef get_input_token_length(input_string):\n    enc = tiktoken.encoding_for_model(\"gpt-4\")\n    tokens = enc.encode(input_string)\n    return len(tokens)\n\n\ndef load_osworld_example(base_path: str, domain: str, id: int):\n    example_path = f\"{base_path}/{domain}\"\n    example_path = (\n        f\"/Users/saaketagashe/Documents/OSWorld/evaluation_examples/examples/{domain}\"\n    )\n    examples = os.listdir(example_path)\n\n    with open(example_path + \"/\" + examples[id], \"r\") as f:\n        example = json.load(f)\n\n    return example\n\n\ndef sanitize_code(code):\n    # This pattern captures the outermost double-quoted text\n    if \"\\n\" in code:\n        pattern = r'(\".*?\")'\n        # Find all matches in the text\n        matches = re.findall(pattern, code, flags=re.DOTALL)\n        if matches:\n            # Replace the first occurrence only\n            first_match = matches[0]\n            code = code.replace(first_match, f'\"\"\"{first_match[1:-1]}\"\"\"', 1)\n    return code\n\n\ndef extract_first_agent_function(code_string):\n    # Regular expression pattern to match 'agent' functions with any arguments, including nested parentheses\n    pattern = r'agent\\.[a-zA-Z_]+\\((?:[^()\\'\"]|\\'[^\\']*\\'|\"[^\"]*\")*\\)'\n\n    # Find all matches in the string\n    matches = re.findall(pattern, code_string)\n\n    # Return the first match if found, otherwise return None\n    return matches[0] if matches else None\n\n\ndef 
load_knowledge_base(kb_path: str) -> Dict:\n    try:\n        with open(kb_path, \"r\") as f:\n            return json.load(f)\n    except Exception as e:\n        print(f\"Error loading knowledge base: {e}\")\n        return {}\n\n\ndef load_embeddings(embeddings_path: str) -> Dict:\n    try:\n        with open(embeddings_path, \"rb\") as f:\n            return pickle.load(f)\n    except Exception as e:\n        print(f\"Error loading embeddings: {e}\")\n        return {}\n\n\ndef save_embeddings(embeddings_path: str, embeddings: Dict):\n    try:\n        with open(embeddings_path, \"wb\") as f:\n            pickle.dump(embeddings, f)\n    except Exception as e:\n        print(f\"Error saving embeddings: {e}\")\n"
  },
  {
    "path": "gui_agents/s1/utils/ocr_server.py",
    "content": "import base64\nimport gc\nimport io\n\nimport numpy as np\nfrom fastapi import FastAPI\nfrom paddleocr import PaddleOCR\nfrom PIL import Image\nfrom pydantic import BaseModel\n\napp = FastAPI()\nocr_module = PaddleOCR(use_angle_cls=True, lang=\"en\")\n\n\nclass ImageData(BaseModel):\n    img_bytes: bytes\n\n\ndef text_cvt_orc_format_paddle(paddle_result):\n    texts = []\n    print(\"paddle_result: \", paddle_result)\n    for i, line in enumerate(paddle_result[0]):\n        points = np.array(line[0])\n        print(\"points: \", points)\n        location = {\n            \"left\": int(min(points[:, 0])),\n            \"top\": int(min(points[:, 1])),\n            \"right\": int(max(points[:, 0])),\n            \"bottom\": int(max(points[:, 1])),\n        }\n        print(\"location: \", location)\n        content = line[1][0]\n        texts.append((i, content, location))\n    return texts\n\n\ndef ocr_results(screenshot):\n    screenshot_img = Image.open(io.BytesIO(screenshot))\n    result = ocr_module.ocr(np.array(screenshot_img), cls=True)\n    return text_cvt_orc_format_paddle(result)\n\n\n@app.post(\"/ocr/\")\nasync def read_image(image_data: ImageData):\n    image_bytes = base64.b64decode(image_data.img_bytes)\n    results = ocr_results(image_bytes)\n\n    # Explicitly delete unused variables and run garbage collector\n    del image_bytes\n    gc.collect()\n\n    return {\"results\": results}\n\n\nif __name__ == \"__main__\":\n    import uvicorn\n\n    uvicorn.run(app, host=\"127.0.0.1\", port=8000)\n"
  },
  {
    "path": "gui_agents/s1/utils/query_perplexica.py",
    "content": "import requests\nimport toml\nimport os\n\n\ndef query_to_perplexica(query):\n    # Retrieve the URL from an environment variable\n    url = os.getenv(\"PERPLEXICA_URL\")\n    if not url:\n        raise ValueError(\n            \"PERPLEXICA_URL environment variable not set. It may take the form: 'http://localhost:{port}/api/search'. The port number is set in the config.toml in the Perplexica directory.\"\n        )\n\n    # Request Message\n    message = {\"focusMode\": \"webSearch\", \"query\": query, \"history\": [[\"human\", query]]}\n\n    response = requests.post(url, json=message)\n\n    if response.status_code == 200:\n        return response.json()[\"message\"]\n    elif response.status_code == 400:\n        raise ValueError(\n            \"The request is malformed or missing required fields, such as FocusModel or query\"\n        )\n    else:\n        raise ValueError(\"Internal Server Error\")\n\n\n# Test Code\nif __name__ == \"__main__\":\n    query = \"What is Agent S?\"\n    response = query_to_perplexica(query)\n    print(response)\n"
  },
  {
    "path": "gui_agents/s2/WAA_setup.md",
    "content": "# Introduction\n\nThis is the WindowsAgentArena (WAA) setup with Agent S2 (and beyond). Why do we need a setup guide? Despite the thorough [README.md](https://github.com/microsoft/WindowsAgentArena?tab=readme-ov-file \"https://github.com/microsoft/WindowsAgentArena?tab=readme-ov-file\"), we have to include our code into their repository _and_ fix up a number of setup issues from the WAA environment. Sadly, this isn’t the most straightforward.\n\n# Initial WAA Setup\n\nThe initial WAA setup is straightforward. Follow the [README.md](https://github.com/microsoft/WindowsAgentArena?tab=readme-ov-file \"https://github.com/microsoft/WindowsAgentArena?tab=readme-ov-file\") on their repository. After you’ve finished this, try running `run-local.sh`. This will start up an experiment with their default `Navi` agent. At this point, the environment is _sufficient to run evaluation_, but it’s incomplete and thus the evaluation won’t be exactly correct due to environment issues.\n\n![](./images/waa_setup/fig1.png)\n\nFigure 1: Bash script chain of execution.\n\nWhile we’re at it, look to understand the following things:\n\n-   the entire README.md (especially the [Bring Your Own Agent guide](https://github.com/microsoft/WindowsAgentArena?tab=readme-ov-file#-byoa-bring-your-own-agent \"https://github.com/microsoft/WindowsAgentArena?tab=readme-ov-file#-byoa-bring-your-own-agent\"))\n    \n-   the _long_ chain of bash scripts that start the run (Figure 1)\n    \n-   the `run.py` to see how the agent/environment are instantiated and used together\n    \n-   the folder structure of the repository and the purpose of each folder\n    \n\n# Fixing Setup Issues\n\nBy now, your WAA environment should be set up to run locally. 
There are two major problems:\n\n-   setup issues\n    \n-   the VM persists across examples (it won’t reset after every example is completed which may make evaluation unfair)\n    \n\nLet’s tackle the first one: setup issues.\n\n### Office Apps Aren’t Installed\n\nThe first issue I ran into was that the office apps aren’t installed. Why is that? Turns out all apps installed in the VM during the initial setup stage install via the links from this [file](https://github.com/microsoft/WindowsAgentArena/blob/main/src/win-arena-container/vm/setup/tools_config.json \"https://github.com/microsoft/WindowsAgentArena/blob/main/src/win-arena-container/vm/setup/tools_config.json\") (`tools_config.json`). At the time of writing this, only the office links do not work. Try out all the links to make sure they work. If the links do not lead to a download (and some error occurs), then that app was not installed in the VM. What do we do? Two options:\n\n-   redo the entire initial setup stage (time consuming; ~**4** hours for me and even then, it would just not work a lot of the time; ideally, WAA is set up on Linux as I’ve had no issues so far with it)\n    \n-   Enter the VM and install the apps manually (easier and faster)\n    \n\nWe’ll do the second approach.\n\nYou can access the VM via `https://localhost:8006`. You can turn the VM on by running `run-local.sh`. There’s probably a better/faster way to do it, but this doesn’t take too much time anyway (~**1-2** mins). 
After the VM has started, enter the VM (the agent may be trying to take actions, but you can either just override the action in `run.py` with `import time; time.sleep(10000)` [here](https://github.com/microsoft/WindowsAgentArena/blob/6d39ed88c545a0d40a7a02e39b928e278df7332b/src/win-arena-container/client/lib_run_single.py#L58 \"https://github.com/microsoft/WindowsAgentArena/blob/6d39ed88c545a0d40a7a02e39b928e278df7332b/src/win-arena-container/client/lib_run_single.py#L58\") or fight the agent for control of the VM!).\n\nInside the VM, navigate to their [download page](https://www.libreoffice.org/download/download-libreoffice/ \"https://www.libreoffice.org/download/download-libreoffice/\") and download the latest LibreOffice version. After it’s downloaded, complete the setup wizard and make sure to delete the downloaded `*.msi` file in the VM. Finally, test the download by opening up LibreOffice Writer and Calc.\n\n### Google Chrome Pop-ups\n\nIn Google Chrome, there a couple unexpected pop-ups.\n\n![](./images/waa_setup/fig2.png)\n\nFigure 2: Pop-ups on Chrome.\n\nClose all these pop-ups and [make Google Chrome your default web browser](https://support.google.com/chrome/answer/95417?hl=en&co=GENIE.Platform%3DDesktop#zippy=%2Cmac%2Cwindows \"https://support.google.com/chrome/answer/95417?hl=en&co=GENIE.Platform%3DDesktop#zippy=%2Cmac%2Cwindows\").\n\n### VSCode Pop-ups\n\nThis isn’t as important, but there are a couple initial pop-ups in VSCode that you can close.\n\n### Note: `set_cell_values`\n\n_Important if you’re using_ `set_cell_values`\n\nAgent S2 uses a special grounding function called `set_cell_values` that takes advantage of the `soffice` CLI and `unotools` [Python library](https://pypi.org/project/unotools/ \"https://pypi.org/project/unotools/\"). TL; DR, this function lets the agent set the cell values for a given spreadsheet and sheet.\n\nFor this function to work on WAA, the set up is a bit messy…\n\n1.  Connect into the VM\n    \n2.  
Open up a terminal and run `python --version`, you should see you’re using the GIMP Python which is `2.x`. This won’t let you use the `soffice` CLI or `import uno` in Python code.\n    \n3.  In the `Desktop` directory within a terminal, do `pip freeze > requirements.txt` to save all the PYPI libraries from the GIMP Python to a `requirements.txt`.\n    \n4.  Configuring Python path to LibreOffice’s Python\n    \n    1.  In the File Explorer, locate the `python.exe` file from LibreOffice. You can do this with `where python`. Copy this path.\n        \n    2.  In the Search bar in the bottom task bar inside the VM, search for “environment variables”.\n        \n    3.  Click on “Environment Variables” and click on “Path” under “System variables”. Paste the copied path from step (a) into there and ensure this path is _above_ the GIMP Python path so it takes precedence.\n        \n    4.  Reopen a terminal and run `soffice` to ensure it is now working. Create a temporary python file and ensure `import uno` works.\n        \n5.  LibreOffice’s Python should be `3.10` or above. However, it does not come with pip. To install pip, download this [file](https://bootstrap.pypa.io/get-pip.py \"https://bootstrap.pypa.io/get-pip.py\") and execute `python get-pip.py` to install it. Ensure the `python` here is LibreOffice’s Python. Next, install `pip install -r requirements.txt` using the `requirements.txt` from step 3. This is to ensure LibreOffice’s Python has all the dependencies needed for evaluation (pyautogui, etc).\n    \n6.  Clean up all installer files. 
Then, inside the [WAA repository code](https://github.com/microsoft/WindowsAgentArena/blob/6d39ed88c545a0d40a7a02e39b928e278df7332b/src/win-arena-container/client/desktop_env/controllers/python.py#L193 \"https://github.com/microsoft/WindowsAgentArena/blob/6d39ed88c545a0d40a7a02e39b928e278df7332b/src/win-arena-container/client/desktop_env/controllers/python.py#L193\"), change this line\n    \n\n`command_list = [\"python\", \"-c\", self.pkgs_prefix.format(command=command)]`\n\nto:\n\n`command_list = [\"absolute/path/to/libreoffice/python\", \"-c\", self.pkgs_prefix.format(command=command)]`\n\nThis ensures that the subprocess running in the flask server inside the VM will use that specific Python version.\n\n### Double Checking…\n\nDouble check all apps can be used and no unexpected pop-ups or issues are in the way. Any apps you open make sure to close them upon finishing your clean-up. Make sure any installation files you have in `Downloads` are deleted (and removed from Recycle Bin) to keep the environment clean. At the end, this is our **golden image**. You may want to save a copy of this VM somewhere safe so that you can always copy it back into the WAA repository to be reused (refer to [this](https://github.com/microsoft/WindowsAgentArena/tree/main?tab=readme-ov-file#additional-notes \"https://github.com/microsoft/WindowsAgentArena/tree/main?tab=readme-ov-file#additional-notes\")).\n\n# Set up Agent S2 with WAA Locally\n\nTake the time to understand the [Agent-S repository](https://github.com/simular-ai/Agent-S \"https://github.com/simular-ai/Agent-S\").\n\n1.  Instead of following the [README.md](https://github.com/simular-ai/Agent-S/blob/main/README.md \"https://github.com/simular-ai/Agent-S/blob/main/README.md\") for Agent S2, you need to clone the repository then `pip install -r requirements.txt`\n    \n2.  
Move the s2 folder to the [mm_agents](https://github.com/microsoft/WindowsAgentArena/tree/main/src/win-arena-container/client/mm_agents \"https://github.com/microsoft/WindowsAgentArena/tree/main/src/win-arena-container/client/mm_agents\") folder in WAA. Follow the [Bring Your Own Agent guide](https://github.com/microsoft/WindowsAgentArena?tab=readme-ov-file#-byoa-bring-your-own-agent \"https://github.com/microsoft/WindowsAgentArena?tab=readme-ov-file#-byoa-bring-your-own-agent\").\n    \n    1.  You will need to move the `agent_s.py` file out to the `s2` folder and update all the relevant import statements\n        \n3.  Make the necessary changes in `run.py` and `lib_run_single.py` to accommodate Agent S2 (replace the Navi Agent with Agent S2).\n    \n4.  Test it by running the experiments! Don’t forget when you do `run-local.sh`, now you need to specify Agent S2 instead of the navi agent `agent=\"agent_s\"`.\n    \n5.  You may have some import errors and these libraries need to be installed inside the `winarena` container (I think). You can just add the pip install commands to the bash script where the error stems from (hacky).\n    \n\n#### Perplexica\n\nThere may be a Perplexica issue. The Perplexica URL must be configured so that the agent in the `winarena` Docker container can communicate with `localhost:3001` which is the forwarded port from the Perplexica container. On Mac/Windows this can be fixed by changing the `PERPLEXICA_URL` to `http://host.docker.internal:3001/api/search` . On Linux, I just disabled it… I haven’t tried, but you can add `--add-host=host.docker.internal:host-gateway` as a flag to the docker command [here](https://github.com/microsoft/WindowsAgentArena/blob/6d39ed88c545a0d40a7a02e39b928e278df7332b/scripts/run.sh#L223 \"https://github.com/microsoft/WindowsAgentArena/blob/6d39ed88c545a0d40a7a02e39b928e278df7332b/scripts/run.sh#L223\") (run.sh). 
This may let you use `http://host.docker.internal:3001/api/search` as the `PERPLEXICA_URL`.\n\n# Agent S2 with WAA on Azure\n\n1.  Ensure you have:\n    \n    1.  a **clean copy** of the golden image\n        \n    2.  the correct Azure subscription (so you’re not using your own payment method)\n        \n2.  Follow the Azure deployment instructions in the [README.md](https://github.com/microsoft/WindowsAgentArena/blob/main/README.md \"https://github.com/microsoft/WindowsAgentArena/blob/main/README.md\").\n    \n3.  Test it! If this works, then we have a resettable golden image and WAA can be run in parallel, making evaluation much _much_ faster! Good luck!"
  },
  {
    "path": "gui_agents/s2/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s2/agents/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s2/agents/agent_s.py",
    "content": "import json\nimport logging\nimport os\nimport platform\nfrom typing import Dict, List, Optional, Tuple\n\nfrom gui_agents.s2.agents.grounding import ACI\nfrom gui_agents.s2.agents.worker import Worker\nfrom gui_agents.s2.agents.manager import Manager\nfrom gui_agents.s2.utils.common_utils import Node\nfrom gui_agents.utils import download_kb_data\nfrom gui_agents.s2.core.engine import (\n    OpenAIEmbeddingEngine,\n    GeminiEmbeddingEngine,\n    AzureOpenAIEmbeddingEngine,\n)\n\nlogger = logging.getLogger(\"desktopenv.agent\")\n\n\nclass UIAgent:\n    \"\"\"Base class for UI automation agents\"\"\"\n\n    def __init__(\n        self,\n        engine_params: Dict,\n        grounding_agent: ACI,\n        platform: str = platform.system().lower(),\n        action_space: str = \"pyautogui\",\n        observation_type: str = \"a11y_tree\",\n        search_engine: str = \"perplexica\",\n    ):\n        \"\"\"Initialize UIAgent\n\n        Args:\n            engine_params: Configuration parameters for the LLM engine\n            grounding_agent: Instance of ACI class for UI interaction\n            platform: Operating system platform (macos, linux, windows)\n            action_space: Type of action space to use (pyautogui, aci)\n            observation_type: Type of observations to use (a11y_tree, mixed)\n            engine: Search engine to use (perplexica, LLM)\n        \"\"\"\n        self.engine_params = engine_params\n        self.grounding_agent = grounding_agent\n        self.platform = platform\n        self.action_space = action_space\n        self.observation_type = observation_type\n        self.engine = search_engine\n\n    def reset(self) -> None:\n        \"\"\"Reset agent state\"\"\"\n        pass\n\n    def predict(self, instruction: str, observation: Dict) -> Tuple[Dict, List[str]]:\n        \"\"\"Generate next action prediction\n\n        Args:\n            instruction: Natural language instruction\n            observation: Current UI 
state observation\n\n        Returns:\n            Tuple containing agent info dictionary and list of actions\n        \"\"\"\n        pass\n\n    def update_narrative_memory(self, trajectory: str) -> None:\n        \"\"\"Update narrative memory with task trajectory\n\n        Args:\n            trajectory: String containing task execution trajectory\n        \"\"\"\n        pass\n\n    def update_episodic_memory(self, meta_data: Dict, subtask_trajectory: str) -> str:\n        \"\"\"Update episodic memory with subtask trajectory\n\n        Args:\n            meta_data: Metadata about current subtask execution\n            subtask_trajectory: String containing subtask execution trajectory\n\n        Returns:\n            Updated subtask trajectory\n        \"\"\"\n        pass\n\n\nclass AgentS2(UIAgent):\n    \"\"\"Agent that uses hierarchical planning and directed acyclic graph modeling for UI automation\"\"\"\n\n    def __init__(\n        self,\n        engine_params: Dict,\n        grounding_agent: ACI,\n        platform: str = platform.system().lower(),\n        action_space: str = \"pyautogui\",\n        observation_type: str = \"mixed\",\n        search_engine: Optional[str] = None,\n        memory_root_path: str = os.getcwd(),\n        use_default_kb: bool = False,\n        memory_folder_name: str = \"kb_s2\",\n        kb_release_tag: str = \"v0.2.2\",\n        embedding_engine_type: str = \"openai\",\n        embedding_engine_params: Dict = {},\n    ):\n        \"\"\"Initialize AgentS2\n\n        Args:\n            engine_params: Configuration parameters for the LLM engine\n            grounding_agent: Instance of ACI class for UI interaction\n            platform: Operating system platform (darwin, linux, windows)\n            action_space: Type of action space to use (pyautogui, other)\n            observation_type: Type of observations to use (a11y_tree, screenshot, mixed)\n            search_engine: Search engine to use (LLM, perplexica)\n            
use_default_kb: True to use the default OpenAI kb.\n            memory_root_path: Path to memory directory. Defaults to current working directory.\n            memory_folder_name: Name of memory folder. Defaults to \"kb_s2\".\n            kb_release_tag: Release tag for knowledge base. Defaults to \"v0.2.2\".\n            embedding_engine_type: Embedding engine to use for knowledge base. Defaults to \"openai\". Supports \"openai\" and \"gemini\".\n            embedding_engine_params: Parameters for embedding engine. Defaults to {}.\n        \"\"\"\n        super().__init__(\n            engine_params,\n            grounding_agent,\n            platform,\n            action_space,\n            observation_type,\n            search_engine,\n        )\n\n        self.memory_root_path = memory_root_path\n        self.memory_folder_name = memory_folder_name\n        self.kb_release_tag = kb_release_tag\n\n        # Initialize agent's knowledge base on user's current working directory.\n        self.local_kb_path = os.path.join(\n            self.memory_root_path, self.memory_folder_name\n        )\n\n        if use_default_kb:\n            if not os.path.exists(os.path.join(self.local_kb_path, self.platform)):\n                print(\"Downloading Agent S2's default knowledge base...\")\n                download_kb_data(\n                    version=\"s2\",\n                    release_tag=kb_release_tag,\n                    download_dir=self.local_kb_path,\n                    platform=self.platform,\n                )\n                print(\n                    f\"Successfully completed download of knowledge base for version s2, tag {self.kb_release_tag}, platform {self.platform}.\"\n                )\n            else:\n                print(\n                    f\"Path local_kb_path {self.local_kb_path} already exists. 
Skipping download.\"\n                )\n                print(\n                    f\"If you'd like to re-download the initial knowledge base, please delete the existing knowledge base at {self.local_kb_path}.\"\n                )\n                print(\n                    \"Note, the knowledge is continually updated during inference. Deleting the knowledge base will wipe out all experience gained since the last knowledge base download.\"\n                )\n\n        if embedding_engine_type == \"openai\":\n            self.embedding_engine = OpenAIEmbeddingEngine(**embedding_engine_params)\n        elif embedding_engine_type == \"gemini\":\n            self.embedding_engine = GeminiEmbeddingEngine(**embedding_engine_params)\n        elif embedding_engine_type == \"azure\":\n            self.embedding_engine = AzureOpenAIEmbeddingEngine(\n                **embedding_engine_params\n            )\n\n        self.reset()\n\n    def reset(self) -> None:\n        \"\"\"Reset agent state and initialize components\"\"\"\n        # Initialize core components\n        self.planner = Manager(\n            engine_params=self.engine_params,\n            grounding_agent=self.grounding_agent,\n            local_kb_path=self.local_kb_path,\n            embedding_engine=self.embedding_engine,\n            search_engine=self.engine,\n            platform=self.platform,\n        )\n        self.executor = Worker(\n            engine_params=self.engine_params,\n            grounding_agent=self.grounding_agent,\n            local_kb_path=self.local_kb_path,\n            embedding_engine=self.embedding_engine,\n            platform=self.platform,\n        )\n\n        # Reset state variables\n        self.requires_replan: bool = True\n        self.needs_next_subtask: bool = True\n        self.step_count: int = 0\n        self.turn_count: int = 0\n        self.failure_subtask: Optional[Node] = None\n        self.should_send_action: bool = False\n        self.completed_tasks: 
List[Node] = []\n        self.current_subtask: Optional[Node] = None\n        self.subtasks: List[Node] = []\n        self.search_query: str = \"\"\n        self.subtask_status: str = \"Start\"\n\n    def reset_executor_state(self) -> None:\n        \"\"\"Reset executor and step counter\"\"\"\n        self.executor.reset()\n        self.step_count = 0\n\n    def predict(self, instruction: str, observation: Dict) -> Tuple[Dict, List[str]]:\n        # Initialize the three info dictionaries\n        planner_info = {}\n        executor_info = {}\n        evaluator_info = {\n            \"obs_evaluator_response\": \"\",\n            \"num_input_tokens_evaluator\": 0,\n            \"num_output_tokens_evaluator\": 0,\n            \"evaluator_cost\": 0.0,\n        }\n        actions = []\n\n        # If the DONE response by the executor is for a subtask, then the agent should continue with the next subtask without sending the action to the environment\n        while not self.should_send_action:\n            self.subtask_status = \"In\"\n            # If replan is true, generate a new plan. 
True at start, after a failed plan, or after subtask completion\n            if self.requires_replan:\n                logger.info(\"(RE)PLANNING...\")\n                planner_info, self.subtasks = self.planner.get_action_queue(\n                    instruction=instruction,\n                    observation=observation,\n                    failed_subtask=self.failure_subtask,\n                    completed_subtasks_list=self.completed_tasks,\n                    remaining_subtasks_list=self.subtasks,\n                )\n\n                self.requires_replan = False\n                if \"search_query\" in planner_info:\n                    self.search_query = planner_info[\"search_query\"]\n                else:\n                    self.search_query = \"\"\n\n            # use the exectuor to complete the topmost subtask\n            if self.needs_next_subtask:\n                logger.info(\"GETTING NEXT SUBTASK...\")\n\n                # this can be empty if the DAG planner deems that all subtasks are completed\n                if len(self.subtasks) <= 0:\n                    self.requires_replan = True\n                    self.needs_next_subtask = True\n                    self.failure_subtask = None\n                    self.completed_tasks.append(self.current_subtask)\n\n                    # reset executor state\n                    self.reset_executor_state()\n                    self.should_send_action = True\n                    self.subtask_status = \"Done\"\n                    executor_info = {\n                        \"executor_plan\": \"agent.done()\",\n                        \"plan_code\": \"agent.done()\",\n                        \"reflection\": \"agent.done()\",\n                    }\n                    actions = [\"DONE\"]\n                    break\n\n                self.current_subtask = self.subtasks.pop(0)\n                logger.info(f\"NEXT SUBTASK: {self.current_subtask}\")\n                self.needs_next_subtask = False\n          
      self.subtask_status = \"Start\"\n\n            # get the next action from the executor\n            executor_info, actions = self.executor.generate_next_action(\n                instruction=instruction,\n                search_query=self.search_query,\n                subtask=self.current_subtask.name,\n                subtask_info=self.current_subtask.info,\n                future_tasks=self.subtasks,\n                done_task=self.completed_tasks,\n                obs=observation,\n            )\n\n            self.step_count += 1\n\n            # set the should_send_action flag to True if the executor returns an action\n            self.should_send_action = True\n\n            # replan on failure\n            if \"FAIL\" in actions:\n                self.requires_replan = True\n                self.needs_next_subtask = True\n\n                # assign the failed subtask\n                self.failure_subtask = self.current_subtask\n\n                # reset the step count, executor, and evaluator\n                self.reset_executor_state()\n\n                # if more subtasks are remaining, we don't want to send DONE to the environment but move on to the next subtask\n                if self.subtasks:\n                    self.should_send_action = False\n\n            # replan on subtask completion\n            elif \"DONE\" in actions:\n                self.requires_replan = True\n                self.needs_next_subtask = True\n                self.failure_subtask = None\n                self.completed_tasks.append(self.current_subtask)\n\n                # reset the step count, executor, and evaluator\n                self.reset_executor_state()\n\n                # if more subtasks are remaining, we don't want to send DONE to the environment but move on to the next subtask\n                if self.subtasks:\n                    self.should_send_action = False\n                self.subtask_status = \"Done\"\n\n            self.turn_count += 1\n\n       
 # reset the should_send_action flag for next iteration\n        self.should_send_action = False\n\n        # concatenate the three info dictionaries\n        info = {\n            **{\n                k: v\n                for d in [planner_info or {}, executor_info or {}, evaluator_info or {}]\n                for k, v in d.items()\n            }\n        }\n        info.update(\n            {\n                \"subtask\": self.current_subtask.name,\n                \"subtask_info\": self.current_subtask.info,\n                \"subtask_status\": self.subtask_status,\n            }\n        )\n\n        return info, actions\n\n    def update_narrative_memory(self, trajectory: str) -> None:\n        \"\"\"Update narrative memory from task trajectory\n\n        Args:\n            trajectory: String containing task execution trajectory\n        \"\"\"\n        try:\n            reflection_path = os.path.join(\n                self.local_kb_path, self.platform, \"narrative_memory.json\"\n            )\n            try:\n                reflections = json.load(open(reflection_path))\n            except:\n                reflections = {}\n\n            if self.search_query not in reflections:\n                reflection = self.planner.summarize_narrative(trajectory)\n                reflections[self.search_query] = reflection\n\n            with open(reflection_path, \"w\") as f:\n                json.dump(reflections, f, indent=2)\n\n        except Exception as e:\n            logger.error(f\"Failed to update narrative memory: {e}\")\n\n    def update_episodic_memory(self, meta_data: Dict, subtask_trajectory: str) -> str:\n        \"\"\"Update episodic memory from subtask trajectory\n\n        Args:\n            meta_data: Metadata about current subtask execution\n            subtask_trajectory: String containing subtask execution trajectory\n\n        Returns:\n            Updated subtask trajectory\n        \"\"\"\n        subtask = meta_data[\"subtask\"]\n        
subtask_info = meta_data[\"subtask_info\"]\n        subtask_status = meta_data[\"subtask_status\"]\n        # Handle subtask trajectory\n        if subtask_status == \"Start\" or subtask_status == \"Done\":\n            # If it's a new subtask start, finalize the previous subtask trajectory if it exists\n            if subtask_trajectory:\n                subtask_trajectory += \"\\nSubtask Completed.\\n\"\n                subtask_key = subtask_trajectory.split(\n                    \"\\n----------------------\\n\\nPlan:\\n\"\n                )[0]\n                try:\n                    subtask_path = os.path.join(\n                        self.local_kb_path, self.platform, \"episodic_memory.json\"\n                    )\n                    kb = json.load(open(subtask_path))\n                except:\n                    kb = {}\n                if subtask_key not in kb.keys():\n                    subtask_summarization = self.planner.summarize_episode(\n                        subtask_trajectory\n                    )\n                    kb[subtask_key] = subtask_summarization\n                else:\n                    subtask_summarization = kb[subtask_key]\n                logger.info(\"subtask_key: %s\", subtask_key)\n                logger.info(\"subtask_summarization: %s\", subtask_summarization)\n                with open(subtask_path, \"w\") as fout:\n                    json.dump(kb, fout, indent=2)\n                # Reset for the next subtask\n                subtask_trajectory = \"\"\n            # Start a new subtask trajectory\n            subtask_trajectory = (\n                \"Task:\\n\"\n                + self.search_query\n                + \"\\n\\nSubtask: \"\n                + subtask\n                + \"\\nSubtask Instruction: \"\n                + subtask_info\n                + \"\\n----------------------\\n\\nPlan:\\n\"\n                + meta_data[\"executor_plan\"]\n                + \"\\n\"\n            )\n        elif 
subtask_status == \"In\":\n            # Continue appending to the current subtask trajectory if it's still ongoing\n            subtask_trajectory += (\n                \"\\n----------------------\\n\\nPlan:\\n\"\n                + meta_data[\"executor_plan\"]\n                + \"\\n\"\n            )\n\n        return subtask_trajectory\n"
  },
  {
    "path": "gui_agents/s2/agents/grounding.py",
    "content": "import ast\nimport re\nfrom collections import defaultdict\nfrom io import BytesIO\nfrom typing import Any, Dict, List, Optional, Tuple, Union\n\nimport pytesseract\nfrom PIL import Image\nfrom pytesseract import Output\n\nfrom gui_agents.s2.memory.procedural_memory import PROCEDURAL_MEMORY\nfrom gui_agents.s2.core.mllm import LMMAgent\nfrom gui_agents.s2.utils.common_utils import (\n    call_llm_safe,\n    parse_single_code_from_string,\n)\n\n\nclass ACI:\n    def __init__(self):\n        self.notes: List[str] = []\n\n\n# Agent action decorator\ndef agent_action(func):\n    func.is_agent_action = True\n    return func\n\n\nUBUNTU_APP_SETUP = f\"\"\"import subprocess;\nimport difflib;\nimport pyautogui;\npyautogui.press('escape');\ntime.sleep(0.5);\noutput = subprocess.check_output(['wmctrl', '-lx']);\noutput = output.decode('utf-8').splitlines();\nwindow_titles = [line.split(None, 4)[2] for line in output];\nclosest_matches = difflib.get_close_matches('APP_NAME', window_titles, n=1, cutoff=0.1);\nif closest_matches:\n    closest_match = closest_matches[0];\n    for line in output:\n        if closest_match in line:\n            window_id = line.split()[0]\n            break;\nsubprocess.run(['wmctrl', '-ia', window_id])\nsubprocess.run(['wmctrl', '-ir', window_id, '-b', 'add,maximized_vert,maximized_horz'])\n\"\"\"\n\n\nSET_CELL_VALUES_CMD = \"\"\"import uno\nimport subprocess\n\ndef identify_document_type(component):\n    if component.supportsService(\"com.sun.star.sheet.SpreadsheetDocument\"):\n        return \"Calc\"\n\n    if component.supportsService(\"com.sun.star.text.TextDocument\"):\n        return \"Writer\"\n\n    if component.supportsService(\"com.sun.star.sheet.PresentationDocument\"):\n        return \"Impress\"\n\n    return None\n\ndef cell_ref_to_indices(cell_ref):\n    column_letters = ''.join(filter(str.isalpha, cell_ref))\n    row_number = ''.join(filter(str.isdigit, cell_ref))\n\n    col = sum((ord(char.upper()) - ord('A') + 1) 
* (26**idx) for idx, char in enumerate(reversed(column_letters))) - 1\n    row = int(row_number) - 1\n    return col, row\n\ndef set_cell_values(new_cell_values: dict[str, str], app_name: str = \"Untitled 1\", sheet_name: str = \"Sheet1\"):\n    new_cell_values_idx = {{}}\n    for k, v in new_cell_values.items():\n        try:\n            col, row = cell_ref_to_indices(k)\n        except:\n            col = row = None\n\n        if col is not None and row is not None:\n            new_cell_values_idx[(col, row)] = v\n\n    # Clean up previous TCP connections.\n    subprocess.run(\n        'echo \\\"password\\\" | sudo -S ss --kill --tcp state TIME-WAIT sport = :2002',\n        shell=True,\n        check=True,\n        text=True,\n        capture_output=True\n    )\n\n    # Dynamically allow soffice to listen on port 2002.\n    subprocess.run(\n        [\n            \"soffice\",\n            \"--accept=socket,host=localhost,port=2002;urp;StarOffice.Service\"\n        ]\n    )\n\n    local_context = uno.getComponentContext()\n    resolver = local_context.ServiceManager.createInstanceWithContext(\n        \"com.sun.star.bridge.UnoUrlResolver\", local_context\n    )\n    context = resolver.resolve(\n        f\"uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext\"\n    )\n    desktop = context.ServiceManager.createInstanceWithContext(\n        \"com.sun.star.frame.Desktop\", context\n    )\n\n    # Collect all LibreOffice-related opened windows.\n    documents = []\n    for i, component in enumerate(desktop.Components):\n        title = component.Title\n        doc_type = identify_document_type(component)\n        documents.append((i, component, title, doc_type))\n\n    # Find the LibreOffice Calc app and the sheet of interest.\n    spreadsheet = [doc for doc in documents if doc[3] == \"Calc\"]\n    selected_spreadsheet = [doc for doc in spreadsheet if doc[2] == app_name]\n    if spreadsheet:\n        try:\n            if selected_spreadsheet:\n        
        spreadsheet = selected_spreadsheet[0][1]\n            else:\n                spreadsheet = spreadsheet[0][1]\n\n            sheet = spreadsheet.Sheets.getByName(sheet_name)\n        except:\n            raise ValueError(f\"Could not find sheet {{sheet_name}} in {{app_name}}.\")\n\n        for (col, row), value in new_cell_values_idx.items():\n            cell = sheet.getCellByPosition(col, row)\n\n            # Set the cell value.\n            if isinstance(value, (int, float)):\n                cell.Value = value\n            elif isinstance(value, str):\n                if value.startswith(\"=\"):\n                    cell.Formula = value\n                else:\n                    cell.String = value\n            elif isinstance(value, bool):\n                cell.Value = 1 if value else 0\n            elif value is None:\n                cell.clearContents(0)\n            else:\n                raise ValueError(f\"Unsupported cell value type: {{type(value)}}\")\n\n    else:\n        raise ValueError(f\"Could not find LibreOffice Calc app corresponding to {{app_name}}.\")\n\nset_cell_values(new_cell_values={cell_values}, app_name=\"{app_name}\", sheet_name=\"{sheet_name}\")        \n\"\"\"\n\n\n# ACI primitives are parameterized by description, and coordinate generation uses a pretrained grounding model\nclass OSWorldACI(ACI):\n    def __init__(\n        self,\n        platform: str,\n        engine_params_for_generation: Dict,\n        engine_params_for_grounding: Dict,\n        width: int = 1920,\n        height: int = 1080,\n    ):\n        self.platform = (\n            platform  # Dictates how the switch_applications agent action works.\n        )\n\n        # Configure scaling\n        self.width = width\n        self.height = height\n\n        # Maintain state for save_to_knowledge\n        self.notes = []\n\n        # Coordinates used during ACI execution\n        self.coords1 = None\n        self.coords2 = None\n\n        # Configure the visual 
grounding model responsible for coordinate generation\n        self.grounding_model = LMMAgent(engine_params_for_grounding)\n        self.engine_params_for_grounding = engine_params_for_grounding\n\n        # Configure text grounding agent\n        self.text_span_agent = LMMAgent(\n            engine_params=engine_params_for_generation,\n            system_prompt=PROCEDURAL_MEMORY.PHRASE_TO_WORD_COORDS_PROMPT,\n        )\n\n    # Given the state and worker's referring expression, use the grounding model to generate (x,y)\n    def generate_coords(self, ref_expr: str, obs: Dict) -> List[int]:\n\n        # Reset the grounding model state\n        self.grounding_model.reset()\n\n        # Configure the context, UI-TARS demo does not use system prompt\n        prompt = f\"Query:{ref_expr}\\nOutput only the coordinate of one point in your response.\\n\"\n        self.grounding_model.add_message(\n            text_content=prompt, image_content=obs[\"screenshot\"], put_text_last=True\n        )\n\n        # Generate and parse coordinates\n        response = call_llm_safe(self.grounding_model)\n        print(\"RAW GROUNDING MODEL RESPONSE:\", response)\n        numericals = re.findall(r\"\\d+\", response)\n        assert len(numericals) >= 2\n        return [int(numericals[0]), int(numericals[1])]\n\n    # Calls pytesseract to generate word level bounding boxes for text grounding\n    def get_ocr_elements(self, b64_image_data: str) -> Tuple[str, List]:\n        image = Image.open(BytesIO(b64_image_data))\n        image_data = pytesseract.image_to_data(image, output_type=Output.DICT)\n\n        # Clean text by removing leading and trailing spaces and non-alphabetical characters, but keeping punctuation\n        for i, word in enumerate(image_data[\"text\"]):\n            image_data[\"text\"][i] = re.sub(\n                r\"^[^a-zA-Z\\s.,!?;:\\-\\+]+|[^a-zA-Z\\s.,!?;:\\-\\+]+$\", \"\", word\n            )\n\n        ocr_elements = []\n        ocr_table = \"Text Table:\\nWord 
id\\tText\\n\"\n        # Obtain the <id, text, group number, word number> for each valid element\n        grouping_map = defaultdict(list)\n        ocr_id = 0\n        for i in range(len(image_data[\"text\"])):\n            block_num = image_data[\"block_num\"][i]\n            if image_data[\"text\"][i]:\n                grouping_map[block_num].append(image_data[\"text\"][i])\n                ocr_table += f\"{ocr_id}\\t{image_data['text'][i]}\\n\"\n                ocr_elements.append(\n                    {\n                        \"id\": ocr_id,\n                        \"text\": image_data[\"text\"][i],\n                        \"group_num\": block_num,\n                        \"word_num\": len(grouping_map[block_num]),\n                        \"left\": image_data[\"left\"][i],\n                        \"top\": image_data[\"top\"][i],\n                        \"width\": image_data[\"width\"][i],\n                        \"height\": image_data[\"height\"][i],\n                    }\n                )\n                ocr_id += 1\n\n        return ocr_table, ocr_elements\n\n    # Given the state and worker's text phrase, generate the coords of the first/last word in the phrase\n    def generate_text_coords(\n        self, phrase: str, obs: Dict, alignment: str = \"\"\n    ) -> List[int]:\n\n        ocr_table, ocr_elements = self.get_ocr_elements(obs[\"screenshot\"])\n\n        alignment_prompt = \"\"\n        if alignment == \"start\":\n            alignment_prompt = \"**Important**: Output the word id of the FIRST word in the provided phrase.\\n\"\n        elif alignment == \"end\":\n            alignment_prompt = \"**Important**: Output the word id of the LAST word in the provided phrase.\\n\"\n\n        # Load LLM prompt\n        self.text_span_agent.reset()\n        self.text_span_agent.add_message(\n            alignment_prompt + \"Phrase: \" + phrase + \"\\n\" + ocr_table, role=\"user\"\n        )\n        self.text_span_agent.add_message(\n            
\"Screenshot:\\n\", image_content=obs[\"screenshot\"], role=\"user\"\n        )\n\n        # Obtain the target element\n        response = call_llm_safe(self.text_span_agent)\n        print(\"TEXT SPAN AGENT RESPONSE:\", response)\n        numericals = re.findall(r\"\\d+\", response)\n        if len(numericals) > 0:\n            text_id = int(numericals[-1])\n        else:\n            text_id = 0\n        elem = ocr_elements[text_id]\n\n        # Compute the element coordinates\n        if alignment == \"start\":\n            coords = [elem[\"left\"], elem[\"top\"] + (elem[\"height\"] // 2)]\n        elif alignment == \"end\":\n            coords = [elem[\"left\"] + elem[\"width\"], elem[\"top\"] + (elem[\"height\"] // 2)]\n        else:\n            coords = [\n                elem[\"left\"] + (elem[\"width\"] // 2),\n                elem[\"top\"] + (elem[\"height\"] // 2),\n            ]\n        return coords\n\n    # Takes a description based action and assigns the coordinates for any coordinate based action\n    # Raises an error if function can't be parsed\n    def assign_coordinates(self, plan: str, obs: Dict):\n\n        # Reset coords from previous action generation\n        self.coords1, self.coords2 = None, None\n\n        try:\n            # Extract the function name and args\n            action = parse_single_code_from_string(plan.split(\"Grounded Action\")[-1])\n            function_name = re.match(r\"(\\w+\\.\\w+)\\(\", action).group(1)\n            args = self.parse_function_args(action)\n        except Exception as e:\n            raise RuntimeError(f\"Error in parsing grounded action: {e}\") from e\n\n        # arg0 is a description\n        if (\n            function_name in [\"agent.click\", \"agent.type\", \"agent.scroll\"]\n            and len(args) >= 1\n            and args[0] != None\n        ):\n            self.coords1 = self.generate_coords(args[0], obs)\n        # arg0 and arg1 are descriptions\n        elif function_name == 
\"agent.drag_and_drop\" and len(args) >= 2:\n            self.coords1 = self.generate_coords(args[0], obs)\n            self.coords2 = self.generate_coords(args[1], obs)\n        # arg0 and arg1 are text phrases\n        elif function_name == \"agent.highlight_text_span\" and len(args) >= 2:\n            self.coords1 = self.generate_text_coords(args[0], obs, alignment=\"start\")\n            self.coords2 = self.generate_text_coords(args[1], obs, alignment=\"end\")\n\n    # Resize from grounding model dim into OSWorld dim (1920 * 1080)\n    def resize_coordinates(self, coordinates: List[int]) -> List[int]:\n        # User explicitly passes the grounding model dimensions\n        if {\"grounding_width\", \"grounding_height\"}.issubset(\n            self.engine_params_for_grounding\n        ):\n            grounding_width = self.engine_params_for_grounding[\"grounding_width\"]\n            grounding_height = self.engine_params_for_grounding[\"grounding_height\"]\n        # Default to (1000, 1000), which is UI-TARS resizing\n        else:\n            grounding_width = 1000\n            grounding_height = 1000\n\n        return [\n            round(coordinates[0] * self.width / grounding_width),\n            round(coordinates[1] * self.height / grounding_height),\n        ]\n\n    # Given a generated ACI function, returns a list of argument values, where descriptions are at the front of the list\n    def parse_function_args(self, function: str) -> List[str]:\n        tree = ast.parse(function)\n        call_node = tree.body[0].value\n\n        def safe_eval(node):\n            if isinstance(\n                node, ast.Constant\n            ):  # Handles literals like numbers, strings, etc.\n                return node.value\n            else:\n                return ast.unparse(node)  # Return as a string if not a literal\n\n        positional_args = [safe_eval(arg) for arg in call_node.args]\n        keyword_args = {kw.arg: safe_eval(kw.value) for kw in 
call_node.keywords}\n\n        res = []\n\n        for key, val in keyword_args.items():\n            if \"description\" in key:\n                res.append(val)\n\n        for arg in positional_args:\n            res.append(arg)\n\n        return res\n\n    @agent_action\n    def click(\n        self,\n        element_description: str,\n        num_clicks: int = 1,\n        button_type: str = \"left\",\n        hold_keys: List = [],\n    ):\n        \"\"\"Click on the element\n        Args:\n            element_description:str, a detailed descriptions of which element to click on. This description should be at least a full sentence.\n            num_clicks:int, number of times to click the element\n            button_type:str, which mouse button to press can be \"left\", \"middle\", or \"right\"\n            hold_keys:List, list of keys to hold while clicking\n        \"\"\"\n        x, y = self.resize_coordinates(self.coords1)\n        command = \"import pyautogui; \"\n\n        # TODO: specified duration?\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"\"\"import pyautogui; pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); \"\"\"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n        # Return pyautoguicode to click on the element\n        return command\n\n    @agent_action\n    def switch_applications(self, app_code):\n        \"\"\"Switch to a different application that is already open\n        Args:\n            app_code:str the code name of the application to switch to from the provided list of open applications\n        \"\"\"\n        if self.platform == \"darwin\":\n            return f\"import pyautogui; import time; pyautogui.hotkey('command', 'space', interval=0.5); pyautogui.typewrite({repr(app_code)}); pyautogui.press('enter'); time.sleep(1.0)\"\n        elif self.platform == \"linux\":\n            return 
UBUNTU_APP_SETUP.replace(\"APP_NAME\", app_code)\n        elif self.platform == \"windows\":\n            return f\"import pyautogui; import time; pyautogui.hotkey('win', 'd', interval=0.5); pyautogui.typewrite({repr(app_code)}); pyautogui.press('enter'); time.sleep(1.0)\"\n\n    @agent_action\n    def open(self, app_or_filename: str):\n        \"\"\"Open any application or file with name app_or_filename. Use this action to open applications or files on the desktop, do not open manually.\n        Args:\n            app_or_filename:str, the name of the application or filename to open\n        \"\"\"\n        return f\"import pyautogui; pyautogui.hotkey('win'); time.sleep(0.5); pyautogui.write({repr(app_or_filename)}); time.sleep(1.0); pyautogui.hotkey('enter'); time.sleep(0.5)\"\n\n    @agent_action\n    def type(\n        self,\n        element_description: Optional[str] = None,\n        text: str = \"\",\n        overwrite: bool = False,\n        enter: bool = False,\n    ):\n        \"\"\"Type text into a specific element\n        Args:\n            element_description:str, a detailed description of which element to enter text in. This description should be at least a full sentence.\n            text:str, the text to type\n            overwrite:bool, Assign it to True if the text should overwrite the existing text, otherwise assign it to False. 
Using this argument clears all text in an element.\n            enter:bool, Assign it to True if the enter key should be pressed after typing the text, otherwise assign it to False.\n        \"\"\"\n\n        if self.coords1 is not None:\n            # If a node is found, retrieve its coordinates and size\n            # Start typing at the center of the element\n\n            x, y = self.resize_coordinates(self.coords1)\n\n            command = \"import pyautogui; \"\n            command += f\"pyautogui.click({x}, {y}); \"\n\n            if overwrite:\n                command += (\n                    f\"pyautogui.hotkey('ctrl', 'a'); pyautogui.press('backspace'); \"\n                )\n\n            command += f\"pyautogui.write({repr(text)}); \"\n\n            if enter:\n                command += \"pyautogui.press('enter'); \"\n        else:\n            # If no element is found, start typing at the current cursor location\n            command = \"import pyautogui; \"\n\n            if overwrite:\n                command += (\n                    f\"pyautogui.hotkey('ctrl', 'a'); pyautogui.press('backspace'); \"\n                )\n\n            command += f\"pyautogui.write({repr(text)}); \"\n\n            if enter:\n                command += \"pyautogui.press('enter'); \"\n\n        return command\n\n    @agent_action\n    def save_to_knowledge(self, text: List[str]):\n        \"\"\"Save facts, elements, texts, etc. to a long-term knowledge bank for reuse during this task. 
Can be used for copy-pasting text, saving elements, etc.\n        Args:\n            text:List[str] the text to save to the knowledge\n        \"\"\"\n        self.notes.extend(text)\n        return \"\"\"WAIT\"\"\"\n\n    @agent_action\n    def drag_and_drop(\n        self, starting_description: str, ending_description: str, hold_keys: List = []\n    ):\n        \"\"\"Drag from the starting description to the ending description\n        Args:\n            starting_description:str, a very detailed description of where to start the drag action. This description should be at least a full sentence.\n            ending_description:str, a very detailed description of where to end the drag action. This description should be at least a full sentence.\n            hold_keys:List list of keys to hold while dragging\n        \"\"\"\n        x1, y1 = self.resize_coordinates(self.coords1)\n        x2, y2 = self.resize_coordinates(self.coords2)\n\n        command = \"import pyautogui; \"\n\n        command += f\"pyautogui.moveTo({x1}, {y1}); \"\n        # TODO: specified duration?\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"pyautogui.dragTo({x2}, {y2}, duration=1.); pyautogui.mouseUp(); \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n\n        # Return pyautoguicode to drag and drop the elements\n\n        return command\n\n    @agent_action\n    def highlight_text_span(self, starting_phrase: str, ending_phrase: str):\n        \"\"\"Highlight a text span between a provided starting phrase and ending phrase. Use this to highlight words, lines, and paragraphs.\n        Args:\n            starting_phrase:str, the phrase that denotes the start of the text span you want to highlight. If you only want to highlight one word, just pass in that single word.\n            ending_phrase:str, the phrase that denotes the end of the text span you want to highlight. 
If you only want to highlight one word, just pass in that single word.\n        \"\"\"\n\n        x1, y1 = self.coords1\n        x2, y2 = self.coords2\n\n        command = \"import pyautogui; \"\n        command += f\"pyautogui.moveTo({x1}, {y1}); \"\n        command += f\"pyautogui.dragTo({x2}, {y2}, duration=1.); pyautogui.mouseUp(); \"\n\n        # Return pyautoguicode to drag and drop the elements\n        return command\n\n    @agent_action\n    def set_cell_values(\n        self, cell_values: Dict[str, Any], app_name: str, sheet_name: str\n    ):\n        \"\"\"Use this to set individual cell values in a spreadsheet. For example, setting A2 to \"hello\" would be done by passing {\"A2\": \"hello\"} as cell_values. The sheet must be opened before this command can be used.\n        Args:\n            cell_values: Dict[str, Any], A dictionary of cell values to set in the spreadsheet. The keys are the cell coordinates in the format \"A1\", \"B2\", etc.\n                Supported value types include: float, int, string, bool, formulas.\n            app_name: str, The name of the spreadsheet application. For example, \"Some_sheet.xlsx\".\n            sheet_name: str, The name of the sheet in the spreadsheet. For example, \"Sheet1\".\n        \"\"\"\n        return SET_CELL_VALUES_CMD.format(\n            cell_values=cell_values, app_name=app_name, sheet_name=sheet_name\n        )\n\n    @agent_action\n    def scroll(self, element_description: str, clicks: int, shift: bool = False):\n        \"\"\"Scroll the element in the specified direction\n        Args:\n            element_description:str, a very detailed description of which element to enter scroll in. 
This description should be at least a full sentence.\n            clicks:int, the number of clicks to scroll can be positive (up) or negative (down).\n            shift:bool, whether to use shift+scroll for horizontal scrolling\n        \"\"\"\n\n        x, y = self.resize_coordinates(self.coords1)\n\n        if shift:\n            return f\"import pyautogui; import time; pyautogui.moveTo({x}, {y}); time.sleep(0.5); pyautogui.hscroll({clicks})\"\n        else:\n            return f\"import pyautogui; import time; pyautogui.moveTo({x}, {y}); time.sleep(0.5); pyautogui.vscroll({clicks})\"\n\n    @agent_action\n    def hotkey(self, keys: List):\n        \"\"\"Press a hotkey combination\n        Args:\n            keys:List the keys to press in combination in a list format (e.g. ['ctrl', 'c'])\n        \"\"\"\n        # add quotes around the keys\n        keys = [f\"'{key}'\" for key in keys]\n        return f\"import pyautogui; pyautogui.hotkey({', '.join(keys)})\"\n\n    @agent_action\n    def hold_and_press(self, hold_keys: List, press_keys: List):\n        \"\"\"Hold a list of keys and press a list of keys\n        Args:\n            hold_keys:List, list of keys to hold\n            press_keys:List, list of keys to press in a sequence\n        \"\"\"\n\n        press_keys_str = \"[\" + \", \".join([f\"'{key}'\" for key in press_keys]) + \"]\"\n        command = \"import pyautogui; \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"pyautogui.press({press_keys_str}); \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n\n        return command\n\n    @agent_action\n    def wait(self, time: float):\n        \"\"\"Wait for a specified amount of time\n        Args:\n            time:float the amount of time to wait in seconds\n        \"\"\"\n        return f\"\"\"import time; time.sleep({time})\"\"\"\n\n    @agent_action\n    def done(\n        self,\n        
return_value: Optional[Union[Dict, str, List, Tuple, int, float, bool]] = None,\n    ):\n        \"\"\"End the current task with a success and the required return value\"\"\"\n        self.returned_info = return_value\n        return \"\"\"DONE\"\"\"\n\n    @agent_action\n    def fail(self):\n        \"\"\"End the current task with a failure, and replan the whole task.\"\"\"\n        return \"\"\"FAIL\"\"\"\n"
  },
  {
    "path": "gui_agents/s2/agents/manager.py",
    "content": "import logging\nimport re\nfrom collections import defaultdict\nfrom typing import Dict, List, Optional, Tuple\nimport platform\n\nfrom gui_agents.s2.agents.grounding import ACI\nfrom gui_agents.s2.core.module import BaseModule\nfrom gui_agents.s2.core.knowledge import KnowledgeBase\nfrom gui_agents.s2.memory.procedural_memory import PROCEDURAL_MEMORY\nfrom gui_agents.s2.core.engine import OpenAIEmbeddingEngine\nfrom gui_agents.s2.utils.common_utils import (\n    Dag,\n    Node,\n    calculate_tokens,\n    call_llm_safe,\n    parse_dag,\n)\n\nlogger = logging.getLogger(\"desktopenv.agent\")\n\nNUM_IMAGE_TOKEN = 1105  # Value set of screen of size 1920x1080 for openai vision\n\n\nclass Manager(BaseModule):\n    def __init__(\n        self,\n        engine_params: Dict,\n        grounding_agent: ACI,\n        local_kb_path: str,\n        embedding_engine,\n        search_engine: Optional[str] = None,\n        multi_round: bool = False,\n        platform: str = platform.system().lower(),\n    ):\n        # TODO: move the prompt to Procedural Memory\n        super().__init__(engine_params, platform)\n\n        # Initialize the ACI\n        self.grounding_agent = grounding_agent\n\n        # Initialize the planner\n        sys_prompt = PROCEDURAL_MEMORY.COMBINED_MANAGER_PROMPT\n\n        self.generator_agent = self._create_agent(sys_prompt)\n\n        # Initialize the remaining modules\n        self.dag_translator_agent = self._create_agent(\n            PROCEDURAL_MEMORY.DAG_TRANSLATOR_PROMPT\n        )\n        self.narrative_summarization_agent = self._create_agent(\n            PROCEDURAL_MEMORY.TASK_SUMMARIZATION_PROMPT\n        )\n        self.episode_summarization_agent = self._create_agent(\n            PROCEDURAL_MEMORY.SUBTASK_SUMMARIZATION_PROMPT\n        )\n\n        self.local_kb_path = local_kb_path\n\n        self.embedding_engine = embedding_engine\n        self.knowledge_base = KnowledgeBase(\n            
embedding_engine=self.embedding_engine,\n            local_kb_path=self.local_kb_path,\n            platform=platform,\n            engine_params=engine_params,\n        )\n\n        self.planner_history = []\n\n        self.turn_count = 0\n        self.search_engine = search_engine\n        self.multi_round = multi_round\n\n    def summarize_episode(self, trajectory):\n        \"\"\"Summarize the episode experience for lifelong learning reflection\n        Args:\n            trajectory: str: The episode experience to be summarized\n        \"\"\"\n\n        # Create Reflection on whole trajectories for next round trial, keep earlier messages as exemplars\n        self.episode_summarization_agent.add_message(trajectory, role=\"user\")\n        subtask_summarization = call_llm_safe(self.episode_summarization_agent)\n        self.episode_summarization_agent.add_message(\n            subtask_summarization, role=\"assistant\"\n        )\n\n        return subtask_summarization\n\n    def summarize_narrative(self, trajectory):\n        \"\"\"Summarize the narrative experience for lifelong learning reflection\n        Args:\n            trajectory: str: The narrative experience to be summarized\n        \"\"\"\n        # Create Reflection on whole trajectories for next round trial\n        self.narrative_summarization_agent.add_message(trajectory, role=\"user\")\n        lifelong_learning_reflection = call_llm_safe(self.narrative_summarization_agent)\n\n        return lifelong_learning_reflection\n\n    def _generate_step_by_step_plan(\n        self,\n        observation: Dict,\n        instruction: str,\n        failed_subtask: Optional[Node] = None,\n        completed_subtasks_list: List[Node] = [],\n        remaining_subtasks_list: List[Node] = [],\n    ) -> Tuple[Dict, str]:\n        agent = self.grounding_agent\n\n        # Converts a list of DAG Nodes into a natural langauge list\n        def format_subtask_list(subtasks: List[Node]) -> str:\n            res = 
\"\"\n            for idx, node in enumerate(subtasks):\n                res += f\"{idx+1}. **{node.name}**:\\n\"\n                bullets = re.split(r\"(?<=[.!?;]) +\", node.info)\n                for bullet in bullets:\n                    res += f\"   - {bullet}\\n\"\n                res += \"\\n\"\n            return res\n\n        # Perform Retrieval only at the first planning step\n        if self.turn_count == 0:\n\n            self.search_query = self.knowledge_base.formulate_query(\n                instruction, observation\n            )\n\n            most_similar_task = \"\"\n            retrieved_experience = \"\"\n            integrated_knowledge = \"\"\n            # Retrieve most similar narrative (task) experience\n            most_similar_task, retrieved_experience = (\n                self.knowledge_base.retrieve_narrative_experience(instruction)\n            )\n            logger.info(\n                \"SIMILAR TASK EXPERIENCE: %s\",\n                most_similar_task + \"\\n\" + retrieved_experience.strip(),\n            )\n\n            # Retrieve knowledge from the web if search_engine is provided\n            if self.search_engine is not None:\n                retrieved_knowledge = self.knowledge_base.retrieve_knowledge(\n                    instruction=instruction,\n                    search_query=self.search_query,\n                    search_engine=self.search_engine,\n                )\n                logger.info(\"RETRIEVED KNOWLEDGE: %s\", retrieved_knowledge)\n\n                if retrieved_knowledge is not None:\n                    # Fuse the retrieved knowledge and experience\n                    integrated_knowledge = self.knowledge_base.knowledge_fusion(\n                        observation=observation,\n                        instruction=instruction,\n                        web_knowledge=retrieved_knowledge,\n                        similar_task=most_similar_task,\n                        experience=retrieved_experience,\n   
                 )\n                    logger.info(\"INTEGRATED KNOWLEDGE: %s\", integrated_knowledge)\n\n            integrated_knowledge = integrated_knowledge or retrieved_experience\n\n            # Add the integrated knowledge to the task instruction in the system prompt\n            if integrated_knowledge:\n                instruction += f\"\\nYou may refer to some retrieved knowledge if you think they are useful.{integrated_knowledge}\"\n\n            self.generator_agent.add_system_prompt(\n                self.generator_agent.system_prompt.replace(\n                    \"TASK_DESCRIPTION\", instruction\n                )\n            )\n\n        # Re-plan on failure case\n        if failed_subtask:\n            generator_message = (\n                f\"The subtask {failed_subtask} cannot be completed. Please generate a new plan for the remainder of the trajectory.\\n\\n\"\n                f\"Successfully Completed Subtasks:\\n{format_subtask_list(completed_subtasks_list)}\\n\"\n            )\n        # Re-plan on subtask completion case\n        elif len(completed_subtasks_list) + len(remaining_subtasks_list) > 0:\n            generator_message = (\n                \"The current trajectory and desktop state is provided. 
Please revise the plan for the following trajectory.\\n\\n\"\n                f\"Successfully Completed Subtasks:\\n{format_subtask_list(completed_subtasks_list)}\\n\"\n                f\"Future Remaining Subtasks:\\n{format_subtask_list(remaining_subtasks_list)}\\n\"\n            )\n        # Initial plan case\n        else:\n            generator_message = \"Please generate the initial plan for the task.\\n\"\n\n        logger.info(\"GENERATOR MESSAGE: %s\", generator_message)\n\n        self.generator_agent.add_message(\n            generator_message,\n            image_content=observation.get(\"screenshot\", None),\n            role=\"user\",\n        )\n\n        logger.info(\"GENERATING HIGH LEVEL PLAN\")\n\n        plan = call_llm_safe(self.generator_agent)\n        if plan == \"\":\n            raise Exception(\"Plan Generation Failed - Fix the Prompt\")\n\n        logger.info(\"HIGH LEVEL STEP BY STEP PLAN: %s\", plan)\n\n        self.generator_agent.add_message(plan, role=\"assistant\")\n        self.planner_history.append(plan)\n        self.turn_count += 1\n\n        # Set Cost based on GPT-4o\n        input_tokens, output_tokens = calculate_tokens(self.generator_agent.messages)\n        cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 1000)\n\n        planner_info = {\n            \"search_query\": self.search_query,\n            \"goal_plan\": plan,\n            \"num_input_tokens_plan\": input_tokens,\n            \"num_output_tokens_plan\": output_tokens,\n            \"goal_plan_cost\": cost,\n        }\n\n        assert type(plan) == str\n\n        return planner_info, plan\n\n    def _generate_dag(self, instruction: str, plan: str) -> Tuple[Dict, Dag]:\n        # For the re-planning case, remove the prior input since this should only translate the new plan\n        self.dag_translator_agent.reset()\n\n        # Add initial instruction and plan to the agent's message history\n        self.dag_translator_agent.add_message(\n        
    f\"Instruction: {instruction}\\nPlan: {plan}\", role=\"user\"\n        )\n\n        logger.info(\"GENERATING DAG\")\n\n        # Generate DAG\n        dag_raw = call_llm_safe(self.dag_translator_agent)\n\n        dag = parse_dag(dag_raw)\n\n        logger.info(\"Generated DAG: %s\", dag_raw)\n\n        self.dag_translator_agent.add_message(dag_raw, role=\"assistant\")\n\n        input_tokens, output_tokens = calculate_tokens(\n            self.dag_translator_agent.messages\n        )\n\n        # Set Cost based on GPT-4o\n        cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 1000)\n\n        dag_info = {\n            \"dag\": dag_raw,\n            \"num_input_tokens_dag\": input_tokens,\n            \"num_output_tokens_dag\": output_tokens,\n            \"dag_cost\": cost,\n        }\n\n        assert type(dag) == Dag\n\n        return dag_info, dag\n\n    def _topological_sort(self, dag: Dag) -> List[Node]:\n        \"\"\"Topological sort of the DAG using DFS\n        dag: Dag: Object representation of the DAG with nodes and edges\n        \"\"\"\n\n        def dfs(node_name, visited, stack):\n            visited[node_name] = True\n            for neighbor in adj_list[node_name]:\n                if not visited[neighbor]:\n                    dfs(neighbor, visited, stack)\n            stack.append(node_name)\n\n        # Convert edges to adjacency list\n        adj_list = defaultdict(list)\n        for u, v in dag.edges:\n            adj_list[u.name].append(v.name)\n\n        visited = {node.name: False for node in dag.nodes}\n        stack = []\n\n        for node in dag.nodes:\n            if not visited[node.name]:\n                dfs(node.name, visited, stack)\n\n        # Return the nodes in topologically sorted order\n        sorted_nodes = [\n            next(n for n in dag.nodes if n.name == name) for name in stack[::-1]\n        ]\n        return sorted_nodes\n\n    def get_action_queue(\n        self,\n        instruction: str,\n 
       observation: Dict,\n        failed_subtask: Optional[Node] = None,\n        completed_subtasks_list: List[Node] = [],\n        remaining_subtasks_list: List[Node] = [],\n    ):\n        \"\"\"Generate the action list based on the instruction\n        instruction:str: Instruction for the task\n        \"\"\"\n\n        planner_info, plan = self._generate_step_by_step_plan(\n            observation,\n            instruction,\n            failed_subtask,\n            completed_subtasks_list,\n            remaining_subtasks_list,\n        )\n\n        # Generate the DAG\n        dag_info, dag = self._generate_dag(instruction, plan)\n\n        # Topological sort of the DAG\n        action_queue = self._topological_sort(dag)\n\n        planner_info.update(dag_info)\n\n        return planner_info, action_queue\n"
  },
  {
    "path": "gui_agents/s2/agents/worker.py",
    "content": "import logging\nimport re\nimport textwrap\nfrom typing import Dict, List, Tuple\nimport platform\n\nfrom gui_agents.s2.agents.grounding import ACI\nfrom gui_agents.s2.core.module import BaseModule\nfrom gui_agents.s2.core.knowledge import KnowledgeBase\nfrom gui_agents.s2.memory.procedural_memory import PROCEDURAL_MEMORY\nfrom gui_agents.s2.utils.common_utils import (\n    Node,\n    calculate_tokens,\n    call_llm_safe,\n    parse_single_code_from_string,\n    sanitize_code,\n    extract_first_agent_function,\n)\n\nlogger = logging.getLogger(\"desktopenv.agent\")\n\n\nclass Worker(BaseModule):\n    def __init__(\n        self,\n        engine_params: Dict,\n        grounding_agent: ACI,\n        local_kb_path: str,\n        embedding_engine,\n        platform: str = platform.system().lower(),\n        enable_reflection: bool = True,\n        use_subtask_experience: bool = True,\n    ):\n        \"\"\"\n        Worker receives a subtask list and active subtask and generates the next action for the to execute.\n        Args:\n            engine_params: Dict\n                Parameters for the multimodal engine\n            grounding_agent: Agent\n                The grounding agent to use\n            local_kb_path: str\n                Path to knowledge base\n            platform: str\n                OS platform the agent runs on (darwin, linux, windows)\n            enable_reflection: bool\n                Whether to enable reflection\n            use_subtask_experience: bool\n                Whether to use subtask experience\n        \"\"\"\n        super().__init__(engine_params, platform)\n\n        self.grounding_agent = grounding_agent\n        self.local_kb_path = local_kb_path\n        self.embedding_engine = embedding_engine\n        self.enable_reflection = enable_reflection\n        self.use_subtask_experience = use_subtask_experience\n        self.reset()\n\n    def reset(self):\n        if self.platform != \"linux\":\n            
skipped_actions = [\"set_cell_values\"]\n        else:\n            skipped_actions = []\n\n        sys_prompt = PROCEDURAL_MEMORY.construct_worker_procedural_memory(\n            type(self.grounding_agent), skipped_actions=skipped_actions\n        ).replace(\"CURRENT_OS\", self.platform)\n\n        self.generator_agent = self._create_agent(sys_prompt)\n        self.reflection_agent = self._create_agent(\n            PROCEDURAL_MEMORY.REFLECTION_ON_TRAJECTORY\n        )\n\n        self.knowledge_base = KnowledgeBase(\n            embedding_engine=self.embedding_engine,\n            local_kb_path=self.local_kb_path,\n            platform=self.platform,\n            engine_params=self.engine_params,\n        )\n\n        self.turn_count = 0\n        self.worker_history = []\n        self.reflections = []\n        self.cost_this_turn = 0\n        self.screenshot_inputs = []\n        self.planner_history = []\n        self.max_trajector_length = 8\n\n    def flush_messages(self):\n        # generator msgs are alternating [user, assistant], so 2 per round\n        if len(self.generator_agent.messages) > 2 * self.max_trajector_length + 1:\n            self.generator_agent.remove_message_at(1)\n            self.generator_agent.remove_message_at(1)\n        # reflector msgs are all [(user text, user image)], so 1 per round\n        if len(self.reflection_agent.messages) > self.max_trajector_length + 1:\n            self.reflection_agent.remove_message_at(1)\n\n    def generate_next_action(\n        self,\n        instruction: str,\n        search_query: str,\n        subtask: str,\n        subtask_info: Dict,\n        future_tasks: List[Node],\n        done_task: List[Node],\n        obs: Dict,\n    ) -> Tuple[Dict, List]:\n        \"\"\"\n        Predict the next action(s) based on the current observation.\n        \"\"\"\n        # Provide the top_app to the Grounding Agent to remove all other applications from the tree. 
At t=0, top_app is None\n        agent = self.grounding_agent\n\n        # Get RAG knowledge, only update system message at t=0\n        if self.turn_count == 0:\n            if self.use_subtask_experience:\n                subtask_query_key = (\n                    \"Task:\\n\"\n                    + search_query\n                    + \"\\n\\nSubtask: \"\n                    + subtask\n                    + \"\\nSubtask Instruction: \"\n                    + subtask_info\n                )\n                retrieved_similar_subtask, retrieved_subtask_experience = (\n                    self.knowledge_base.retrieve_episodic_experience(subtask_query_key)\n                )\n\n                # Dirty fix to replace id with element description during subtask retrieval\n                pattern = r\"\\(\\d+\"\n                retrieved_subtask_experience = re.sub(\n                    pattern, \"(element_description\", retrieved_subtask_experience\n                )\n                retrieved_subtask_experience = retrieved_subtask_experience.replace(\n                    \"_id\", \"_description\"\n                )\n\n                logger.info(\n                    \"SIMILAR SUBTASK EXPERIENCE: %s\",\n                    retrieved_similar_subtask\n                    + \"\\n\"\n                    + retrieved_subtask_experience.strip(),\n                )\n                instruction += \"\\nYou may refer to some similar subtask experience if you think they are useful. 
{}\".format(\n                    retrieved_similar_subtask + \"\\n\" + retrieved_subtask_experience\n                )\n\n            self.generator_agent.add_system_prompt(\n                self.generator_agent.system_prompt.replace(\n                    \"SUBTASK_DESCRIPTION\", subtask\n                )\n                .replace(\"TASK_DESCRIPTION\", instruction)\n                .replace(\"FUTURE_TASKS\", \", \".join([f.name for f in future_tasks]))\n                .replace(\"DONE_TASKS\", \",\".join(d.name for d in done_task))\n            )\n\n        # Reflection generation does not add its own response, it only gets the trajectory\n        reflection = None\n        if self.enable_reflection:\n            # Load the initial subtask info\n            if self.turn_count == 0:\n                text_content = textwrap.dedent(\n                    f\"\"\"\n                    Subtask Description: {subtask}\n                    Subtask Information: {subtask_info}\n                    Current Trajectory below:\n                    \"\"\"\n                )\n                updated_sys_prompt = (\n                    self.reflection_agent.system_prompt + \"\\n\" + text_content\n                )\n                self.reflection_agent.add_system_prompt(updated_sys_prompt)\n                self.reflection_agent.add_message(\n                    text_content=\"The initial screen is provided. 
No action has been taken yet.\",\n                    image_content=obs[\"screenshot\"],\n                    role=\"user\",\n                )\n            # Load the latest action\n            else:\n                text_content = self.clean_worker_generation_for_reflection(\n                    self.planner_history[-1]\n                )\n                self.reflection_agent.add_message(\n                    text_content=text_content,\n                    image_content=obs[\"screenshot\"],\n                    role=\"user\",\n                )\n                reflection = call_llm_safe(self.reflection_agent)\n                self.reflections.append(reflection)\n                logger.info(\"REFLECTION: %s\", reflection)\n\n        generator_message = (\n            f\"\\nYou may use this reflection on the previous action and overall trajectory: {reflection}\\n\"\n            if reflection and self.turn_count > 0\n            else \"\"\n        ) + f\"Text Buffer = [{','.join(agent.notes)}].\"\n\n        # Only provide subinfo in the very first message to avoid over influence and redundancy\n        if self.turn_count == 0:\n            generator_message += f\"Remember only complete the subtask: {subtask}\\n\"\n            generator_message += f\"You can use this extra information for completing the current subtask: {subtask_info}.\\n\"\n\n        # logger.info(\"GENERATOR MESSAGE: %s\", generator_message)\n\n        self.generator_agent.add_message(\n            generator_message, image_content=obs[\"screenshot\"], role=\"user\"\n        )\n\n        plan = call_llm_safe(self.generator_agent)\n        self.planner_history.append(plan)\n        logger.info(\"PLAN: %s\", plan)\n        self.generator_agent.add_message(plan, role=\"assistant\")\n\n        # Calculate input/output tokens and gpt-4o cost\n        input_tokens, output_tokens = calculate_tokens(self.generator_agent.messages)\n        cost = input_tokens * (0.0050 / 1000) + output_tokens * (0.0150 / 
1000)\n        self.cost_this_turn += cost\n        logger.info(\"EXECTUOR COST: %s\", self.cost_this_turn)\n\n        # Use the DescriptionBasedACI to convert agent_action(\"desc\") into agent_action([x, y])\n        try:\n            agent.assign_coordinates(plan, obs)\n            plan_code = parse_single_code_from_string(plan.split(\"Grounded Action\")[-1])\n            plan_code = sanitize_code(plan_code)\n            plan_code = extract_first_agent_function(plan_code)\n            exec_code = eval(plan_code)\n        except Exception as e:\n            logger.error(\"Error in parsing plan code: %s\", e)\n            plan_code = \"agent.wait(1.0)\"\n            exec_code = eval(plan_code)\n\n        executor_info = {\n            \"current_subtask\": subtask,\n            \"current_subtask_info\": subtask_info,\n            \"executor_plan\": plan,\n            \"plan_code\": plan_code,\n            \"reflection\": reflection,\n            \"num_input_tokens_executor\": input_tokens,\n            \"num_output_tokens_executor\": output_tokens,\n        }\n        self.turn_count += 1\n\n        self.screenshot_inputs.append(obs[\"screenshot\"])\n        self.flush_messages()\n\n        return executor_info, [exec_code]\n\n    # Removes the previous action verification, and removes any extraneous grounded actions\n    def clean_worker_generation_for_reflection(self, worker_generation: str) -> str:\n        # Remove the previous action verification\n        res = worker_generation[worker_generation.find(\"(Screenshot Analysis)\") :]\n        action = extract_first_agent_function(worker_generation)\n        # Cut off extra grounded actions\n        res = res[: res.find(\"(Grounded Action)\")]\n        res += f\"(Grounded Action)\\n```python\\n{action}\\n```\\n\"\n        return res\n"
  },
  {
    "path": "gui_agents/s2/cli_app.py",
    "content": "import argparse\nimport datetime\nimport io\nimport logging\nimport os\nimport platform\nimport pyautogui\nimport signal\nimport sys\nimport time\n\nfrom PIL import Image\n\nfrom gui_agents.s2.agents.grounding import OSWorldACI\nfrom gui_agents.s2.agents.agent_s import AgentS2\n\ncurrent_platform = platform.system().lower()\n\n# Global flag to track pause state for debugging\npaused = False\n\n\ndef get_char():\n    \"\"\"Get a single character from stdin without pressing Enter\"\"\"\n    try:\n        # Import termios and tty on Unix-like systems\n        if platform.system() in [\"Darwin\", \"Linux\"]:\n            import termios\n            import tty\n\n            fd = sys.stdin.fileno()\n            old_settings = termios.tcgetattr(fd)\n            try:\n                tty.setraw(sys.stdin.fileno())\n                ch = sys.stdin.read(1)\n            finally:\n                termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)\n            return ch\n        else:\n            # Windows fallback\n            import msvcrt\n\n            return msvcrt.getch().decode(\"utf-8\", errors=\"ignore\")\n    except:\n        return input()  # Fallback for non-terminal environments\n\n\ndef signal_handler(signum, frame):\n    \"\"\"Handle Ctrl+C signal for debugging during agent execution\"\"\"\n    global paused\n\n    if not paused:\n        print(\"\\n\\n🔸 Agent-S Workflow Paused 🔸\")\n        print(\"=\" * 50)\n        print(\"Options:\")\n        print(\"  • Press Ctrl+C again to quit\")\n        print(\"  • Press Esc to resume workflow\")\n        print(\"=\" * 50)\n\n        paused = True\n\n        while paused:\n            try:\n                print(\"\\n[PAUSED] Waiting for input... 
\", end=\"\", flush=True)\n                char = get_char()\n\n                if ord(char) == 3:  # Ctrl+C\n                    print(\"\\n\\n🛑 Exiting Agent-S...\")\n                    sys.exit(0)\n                elif ord(char) == 27:  # Esc\n                    print(\"\\n\\n▶️  Resuming Agent-S workflow...\")\n                    paused = False\n                    break\n                else:\n                    print(f\"\\n   Unknown command: '{char}' (ord: {ord(char)})\")\n\n            except KeyboardInterrupt:\n                print(\"\\n\\n🛑 Exiting Agent-S...\")\n                sys.exit(0)\n    else:\n        # Already paused, second Ctrl+C means quit\n        print(\"\\n\\n🛑 Exiting Agent-S...\")\n        sys.exit(0)\n\n\n# Set up signal handler for Ctrl+C\nsignal.signal(signal.SIGINT, signal_handler)\n\nlogger = logging.getLogger()\nlogger.setLevel(logging.DEBUG)\n\ndatetime_str: str = datetime.datetime.now().strftime(\"%Y%m%d@%H%M%S\")\n\nlog_dir = \"logs\"\nos.makedirs(log_dir, exist_ok=True)\n\nfile_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"normal-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\ndebug_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"debug-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\nstdout_handler = logging.StreamHandler(sys.stdout)\nsdebug_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"sdebug-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\n\nfile_handler.setLevel(logging.INFO)\ndebug_handler.setLevel(logging.DEBUG)\nstdout_handler.setLevel(logging.INFO)\nsdebug_handler.setLevel(logging.DEBUG)\n\nformatter = logging.Formatter(\n    fmt=\"\\x1b[1;33m[%(asctime)s \\x1b[31m%(levelname)s \\x1b[32m%(module)s/%(lineno)d-%(processName)s\\x1b[1;33m] 
\\x1b[0m%(message)s\"\n)\nfile_handler.setFormatter(formatter)\ndebug_handler.setFormatter(formatter)\nstdout_handler.setFormatter(formatter)\nsdebug_handler.setFormatter(formatter)\n\nstdout_handler.addFilter(logging.Filter(\"desktopenv\"))\nsdebug_handler.addFilter(logging.Filter(\"desktopenv\"))\n\nlogger.addHandler(file_handler)\nlogger.addHandler(debug_handler)\nlogger.addHandler(stdout_handler)\nlogger.addHandler(sdebug_handler)\n\nplatform_os = platform.system()\n\n\ndef show_permission_dialog(code: str, action_description: str):\n    \"\"\"Show a platform-specific permission dialog and return True if approved.\"\"\"\n    if platform.system() == \"Darwin\":\n        result = os.system(\n            f'osascript -e \\'display dialog \"Do you want to execute this action?\\n\\n{code} which will try to {action_description}\" with title \"Action Permission\" buttons {{\"Cancel\", \"OK\"}} default button \"OK\" cancel button \"Cancel\"\\''\n        )\n        return result == 0\n    elif platform.system() == \"Linux\":\n        result = os.system(\n            f'zenity --question --title=\"Action Permission\" --text=\"Do you want to execute this action?\\n\\n{code}\" --width=400 --height=200'\n        )\n        return result == 0\n    return False\n\n\ndef scale_screen_dimensions(width: int, height: int, max_dim_size: int):\n    scale_factor = min(max_dim_size / width, max_dim_size / height, 1)\n    safe_width = int(width * scale_factor)\n    safe_height = int(height * scale_factor)\n    return safe_width, safe_height\n\n\ndef run_agent(agent, instruction: str, scaled_width: int, scaled_height: int):\n    global paused\n    obs = {}\n    traj = \"Task:\\n\" + instruction\n    subtask_traj = \"\"\n    for step in range(15):\n        # Check if we're in paused state and wait\n        while paused:\n            time.sleep(0.1)\n\n        # Get screen shot using pyautogui\n        screenshot = pyautogui.screenshot()\n        screenshot = 
screenshot.resize((scaled_width, scaled_height), Image.LANCZOS)\n\n        # Save the screenshot to a BytesIO object\n        buffered = io.BytesIO()\n        screenshot.save(buffered, format=\"PNG\")\n\n        # Get the byte value of the screenshot\n        screenshot_bytes = buffered.getvalue()\n        # Convert to base64 string.\n        obs[\"screenshot\"] = screenshot_bytes\n\n        # Check again for pause state before prediction\n        while paused:\n            time.sleep(0.1)\n\n        print(f\"\\n🔄 Step {step + 1}/15: Getting next action from agent...\")\n\n        # Get next action code from the agent\n        info, code = agent.predict(instruction=instruction, observation=obs)\n\n        if \"done\" in code[0].lower() or \"fail\" in code[0].lower():\n            if platform.system() == \"Darwin\":\n                os.system(\n                    f'osascript -e \\'display dialog \"Task Completed\" with title \"OpenACI Agent\" buttons \"OK\" default button \"OK\"\\''\n                )\n            elif platform.system() == \"Linux\":\n                os.system(\n                    f'zenity --info --title=\"OpenACI Agent\" --text=\"Task Completed\" --width=200 --height=100'\n                )\n\n            agent.update_narrative_memory(traj)\n            break\n\n        if \"next\" in code[0].lower():\n            continue\n\n        if \"wait\" in code[0].lower():\n            print(\"⏳ Agent requested wait...\")\n            time.sleep(5)\n            continue\n\n        else:\n            time.sleep(1.0)\n            print(\"EXECUTING CODE:\", code[0])\n\n            # Check for pause state before execution\n            while paused:\n                time.sleep(0.1)\n\n            # Ask for permission before executing\n            exec(code[0])\n            time.sleep(1.0)\n\n            # Update task and subtask trajectories and optionally the episodic memory\n            traj += (\n                \"\\n\\nReflection:\\n\"\n                + 
str(info[\"reflection\"])\n                + \"\\n\\n----------------------\\n\\nPlan:\\n\"\n                + info[\"executor_plan\"]\n            )\n            subtask_traj = agent.update_episodic_memory(info, subtask_traj)\n\n\ndef main():\n    parser = argparse.ArgumentParser(description=\"Run AgentS2 with specified model.\")\n    parser.add_argument(\n        \"--provider\",\n        type=str,\n        default=\"anthropic\",\n        help=\"Specify the provider to use (e.g., openai, anthropic, etc.)\",\n    )\n    parser.add_argument(\n        \"--model\",\n        type=str,\n        default=\"claude-3-7-sonnet-20250219\",\n        help=\"Specify the model to use (e.g., gpt-4o)\",\n    )\n    parser.add_argument(\n        \"--model_url\",\n        type=str,\n        default=\"\",\n        help=\"The URL of the main generation model API.\",\n    )\n    parser.add_argument(\n        \"--model_api_key\",\n        type=str,\n        default=\"\",\n        help=\"The API key of the main generation model.\",\n    )\n\n    # Grounding model config option 1: API based\n    parser.add_argument(\n        \"--grounding_model_provider\",\n        type=str,\n        default=\"anthropic\",\n        help=\"Specify the provider to use for the grounding model (e.g., openai, anthropic, etc.)\",\n    )\n    parser.add_argument(\n        \"--grounding_model\",\n        type=str,\n        default=\"claude-3-7-sonnet-20250219\",\n        help=\"Specify the grounding model to use (e.g., claude-3-5-sonnet-20241022)\",\n    )\n    parser.add_argument(\n        \"--grounding_model_resize_width\",\n        type=int,\n        default=1366,\n        help=\"Width of screenshot image after processor rescaling\",\n    )\n    parser.add_argument(\n        \"--grounding_model_resize_height\",\n        type=int,\n        default=None,\n        help=\"Height of screenshot image after processor rescaling\",\n    )\n\n    # Grounding model config option 2: Self-hosted endpoint based\n    
parser.add_argument(\n        \"--endpoint_provider\",\n        type=str,\n        default=\"\",\n        help=\"Specify the endpoint provider for your grounding model, only HuggingFace TGI support for now\",\n    )\n    parser.add_argument(\n        \"--endpoint_url\",\n        type=str,\n        default=\"\",\n        help=\"Specify the endpoint URL for your grounding model\",\n    )\n    parser.add_argument(\n        \"--endpoint_api_key\",\n        type=str,\n        default=\"\",\n        help=\"The API key of the grounding model.\",\n    )\n\n    parser.add_argument(\n        \"--embedding_engine_type\",\n        type=str,\n        default=\"openai\",\n        help=\"Specify the embedding engine type (supports openai, gemini)\",\n    )\n\n    args = parser.parse_args()\n    assert (\n        args.grounding_model_provider and args.grounding_model\n    ) or args.endpoint_url, \"Error: No grounding model was provided. Either provide an API based model, or a self-hosted HuggingFace endpoint\"\n\n    # Re-scales screenshot size to ensure it fits in UI-TARS context limit\n    screen_width, screen_height = pyautogui.size()\n    scaled_width, scaled_height = scale_screen_dimensions(\n        screen_width, screen_height, max_dim_size=2400\n    )\n\n    # Load the general engine params\n    engine_params = {\n        \"engine_type\": args.provider,\n        \"model\": args.model,\n        \"base_url\": args.model_url,\n        \"api_key\": args.model_api_key,\n    }\n\n    # Load the grounding engine from a HuggingFace TGI endpoint\n    if args.endpoint_url:\n        engine_params_for_grounding = {\n            \"engine_type\": args.endpoint_provider,\n            \"base_url\": args.endpoint_url,\n            \"api_key\": args.endpoint_api_key,\n        }\n    else:\n        grounding_height = args.grounding_model_resize_height\n        # If not provided, use the aspect ratio of the screen to compute the height\n        if grounding_height is None:\n            
grounding_height = (\n                screen_height * args.grounding_model_resize_width / screen_width\n            )\n\n        engine_params_for_grounding = {\n            \"engine_type\": args.grounding_model_provider,\n            \"model\": args.grounding_model,\n            \"grounding_width\": args.grounding_model_resize_width,\n            \"grounding_height\": grounding_height,\n        }\n\n    grounding_agent = OSWorldACI(\n        platform=current_platform,\n        engine_params_for_generation=engine_params,\n        engine_params_for_grounding=engine_params_for_grounding,\n        width=screen_width,\n        height=screen_height,\n    )\n\n    agent = AgentS2(\n        engine_params,\n        grounding_agent,\n        platform=current_platform,\n        action_space=\"pyautogui\",\n        observation_type=\"mixed\",\n        search_engine=None,\n        embedding_engine_type=args.embedding_engine_type,\n    )\n\n    while True:\n        query = input(\"Query: \")\n\n        agent.reset()\n\n        # Run the agent on your own device\n        run_agent(agent, query, scaled_width, scaled_height)\n\n        response = input(\"Would you like to provide another query? (y/n): \")\n        if response.lower() != \"y\":\n            break\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "gui_agents/s2/core/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s2/core/engine.py",
    "content": "import os\n\nimport backoff\nimport numpy as np\nfrom anthropic import Anthropic\nfrom openai import (\n    AzureOpenAI,\n    APIConnectionError,\n    APIError,\n    AzureOpenAI,\n    OpenAI,\n    RateLimitError,\n)\nfrom google import genai\nfrom google.genai import types\n\n\nclass LMMEngine:\n    pass\n\n\nclass OpenAIEmbeddingEngine(LMMEngine):\n    def __init__(\n        self,\n        embedding_model: str = \"text-embedding-3-small\",\n        api_key=None,\n    ):\n        \"\"\"Init an OpenAI Embedding engine\n\n        Args:\n            embedding_model (str, optional): Model name. Defaults to \"text-embedding-3-small\".\n            api_key (_type_, optional): Auth key from OpenAI. Defaults to None.\n        \"\"\"\n        self.model = embedding_model\n        self.api_key = api_key\n\n    @backoff.on_exception(\n        backoff.expo,\n        (\n            APIError,\n            RateLimitError,\n            APIConnectionError,\n        ),\n    )\n    def get_embeddings(self, text: str) -> np.ndarray:\n        api_key = self.api_key or os.getenv(\"OPENAI_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named OPENAI_API_KEY\"\n            )\n        client = OpenAI(api_key=api_key)\n        response = client.embeddings.create(model=self.model, input=text)\n        return np.array([data.embedding for data in response.data])\n\n\nclass GeminiEmbeddingEngine(LMMEngine):\n    def __init__(\n        self,\n        embedding_model: str = \"text-embedding-004\",\n        api_key=None,\n    ):\n        \"\"\"Init an Gemini Embedding engine\n\n        Args:\n            embedding_model (str, optional): Model name. Defaults to \"text-embedding-004\".\n            api_key (_type_, optional): Auth key from Gemini. 
Defaults to None.\n        \"\"\"\n        self.model = embedding_model\n        self.api_key = api_key\n\n    @backoff.on_exception(\n        backoff.expo,\n        (\n            APIError,\n            RateLimitError,\n            APIConnectionError,\n        ),\n    )\n    def get_embeddings(self, text: str) -> np.ndarray:\n        api_key = self.api_key or os.getenv(\"GEMINI_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named GEMINI_API_KEY\"\n            )\n        client = genai.Client(api_key=api_key)\n\n        result = client.models.embed_content(\n            model=self.model,\n            contents=text,\n            config=types.EmbedContentConfig(task_type=\"SEMANTIC_SIMILARITY\"),\n        )\n\n        return np.array([i.values for i in result.embeddings])\n\n\nclass AzureOpenAIEmbeddingEngine(LMMEngine):\n    def __init__(\n        self,\n        embedding_model: str = \"text-embedding-3-small\",\n        api_key=None,\n        api_version=None,\n        endpoint_url=None,\n    ):\n        \"\"\"Init an Azure OpenAI Embedding engine\n\n        Args:\n            embedding_model (str, optional): Model name. Defaults to \"text-embedding-3-small\".\n            api_key (_type_, optional): Auth key from Azure OpenAI. Defaults to None.\n            api_version (_type_, optional): API version. Defaults to None.\n            endpoint_url (_type_, optional): Endpoint URL. 
Defaults to None.\n        \"\"\"\n        self.model = embedding_model\n        self.api_key = api_key\n        self.api_version = api_version\n        self.endpoint_url = endpoint_url\n\n    @backoff.on_exception(\n        backoff.expo,\n        (\n            APIError,\n            RateLimitError,\n            APIConnectionError,\n        ),\n    )\n    def get_embeddings(self, text: str) -> np.ndarray:\n        api_key = self.api_key or os.getenv(\"AZURE_OPENAI_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named AZURE_OPENAI_API_KEY\"\n            )\n        api_version = self.api_version or os.getenv(\"OPENAI_API_VERSION\")\n        if api_version is None:\n            raise ValueError(\n                \"An API Version needs to be provided in either the api_version parameter or as an environment variable named OPENAI_API_VERSION\"\n            )\n        endpoint_url = self.endpoint_url or os.getenv(\"AZURE_OPENAI_ENDPOINT\")\n        if endpoint_url is None:\n            raise ValueError(\n                \"An Endpoint URL needs to be provided in either the endpoint_url parameter or as an environment variable named AZURE_OPENAI_ENDPOINT\"\n            )\n        client = AzureOpenAI(\n            api_key=api_key,\n            api_version=api_version,\n            azure_endpoint=endpoint_url,\n        )\n        response = client.embeddings.create(input=text, model=self.model)\n        return np.array([data.embedding for data in response.data])\n\n\nclass LMMEngineOpenAI(LMMEngine):\n    def __init__(\n        self, base_url=None, api_key=None, model=None, rate_limit=-1, **kwargs\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.base_url = base_url\n        self.api_key = api_key\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n      
  self.llm_client = None\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"OPENAI_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named OPENAI_API_KEY\"\n            )\n        if not self.llm_client:\n            if not self.base_url:\n                self.llm_client = OpenAI(api_key=api_key)\n            else:\n                self.llm_client = OpenAI(base_url=self.base_url, api_key=api_key)\n        return (\n            self.llm_client.chat.completions.create(\n                model=self.model,\n                messages=messages,\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temperature,\n                **kwargs,\n            )\n            .choices[0]\n            .message.content\n        )\n\n\nclass LMMEngineAnthropic(LMMEngine):\n    def __init__(\n        self, base_url=None, api_key=None, model=None, thinking=False, **kwargs\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.thinking = thinking\n        self.api_key = api_key\n        self.llm_client = None\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"ANTHROPIC_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named ANTHROPIC_API_KEY\"\n            )\n        if not self.llm_client:\n            self.llm_client = 
Anthropic(api_key=api_key)\n        if self.thinking:\n            full_response = self.llm_client.messages.create(\n                system=messages[0][\"content\"][0][\"text\"],\n                model=self.model,\n                messages=messages[1:],\n                max_tokens=8192,\n                thinking={\"type\": \"enabled\", \"budget_tokens\": 4096},\n                **kwargs,\n            )\n            thoughts = full_response.content[0].thinking\n            print(\"CLAUDE 3.7 THOUGHTS:\", thoughts)\n            return full_response.content[1].text\n        return (\n            self.llm_client.messages.create(\n                system=messages[0][\"content\"][0][\"text\"],\n                model=self.model,\n                messages=messages[1:],\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temperature,\n                **kwargs,\n            )\n            .content[0]\n            .text\n        )\n\n\nclass LMMEngineGemini(LMMEngine):\n    def __init__(\n        self, base_url=None, api_key=None, model=None, rate_limit=-1, **kwargs\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.base_url = base_url\n        self.api_key = api_key\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"GEMINI_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named GEMINI_API_KEY\"\n            )\n        base_url = self.base_url or os.getenv(\"GEMINI_ENDPOINT_URL\")\n        if base_url is None:\n            raise 
ValueError(\n                \"An endpoint URL needs to be provided in either the endpoint_url parameter or as an environment variable named GEMINI_ENDPOINT_URL\"\n            )\n        if not self.llm_client:\n            self.llm_client = OpenAI(base_url=base_url, api_key=api_key)\n        return (\n            self.llm_client.chat.completions.create(\n                model=self.model,\n                messages=messages,\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temperature,\n                **kwargs,\n            )\n            .choices[0]\n            .message.content\n        )\n\n\nclass LMMEngineOpenRouter(LMMEngine):\n    def __init__(\n        self, base_url=None, api_key=None, model=None, rate_limit=-1, **kwargs\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.base_url = base_url\n        self.api_key = api_key\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"OPENROUTER_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named OPENROUTER_API_KEY\"\n            )\n        base_url = self.base_url or os.getenv(\"OPEN_ROUTER_ENDPOINT_URL\")\n        if base_url is None:\n            raise ValueError(\n                \"An endpoint URL needs to be provided in either the endpoint_url parameter or as an environment variable named OPEN_ROUTER_ENDPOINT_URL\"\n            )\n        if not self.llm_client:\n            self.llm_client = OpenAI(base_url=base_url, api_key=api_key)\n        return (\n            
self.llm_client.chat.completions.create(\n                model=self.model,\n                messages=messages,\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temperature,\n                **kwargs,\n            )\n            .choices[0]\n            .message.content\n        )\n\n\nclass LMMEngineAzureOpenAI(LMMEngine):\n    def __init__(\n        self,\n        base_url=None,\n        api_key=None,\n        azure_endpoint=None,\n        model=None,\n        api_version=None,\n        rate_limit=-1,\n        **kwargs\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.api_version = api_version\n        self.api_key = api_key\n        self.azure_endpoint = azure_endpoint\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n        self.cost = 0.0\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"AZURE_OPENAI_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named AZURE_OPENAI_API_KEY\"\n            )\n        api_version = self.api_version or os.getenv(\"OPENAI_API_VERSION\")\n        if api_version is None:\n            raise ValueError(\n                \"api_version must be provided either as a parameter or as an environment variable named OPENAI_API_VERSION\"\n            )\n        azure_endpoint = self.azure_endpoint or os.getenv(\"AZURE_OPENAI_ENDPOINT\")\n        if azure_endpoint is None:\n            raise ValueError(\n                \"An Azure API endpoint needs to be provided in either the azure_endpoint parameter or as an environment variable named 
AZURE_OPENAI_ENDPOINT\"\n            )\n        if not self.llm_client:\n            self.llm_client = AzureOpenAI(\n                azure_endpoint=azure_endpoint,\n                api_key=api_key,\n                api_version=api_version,\n            )\n        completion = self.llm_client.chat.completions.create(\n            model=self.model,\n            messages=messages,\n            max_tokens=max_new_tokens if max_new_tokens else 4096,\n            temperature=temperature,\n            **kwargs,\n        )\n        total_tokens = completion.usage.total_tokens\n        self.cost += 0.02 * ((total_tokens + 500) / 1000)\n        return completion.choices[0].message.content\n\n\nclass LMMEnginevLLM(LMMEngine):\n    def __init__(\n        self, base_url=None, api_key=None, model=None, rate_limit=-1, **kwargs\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.api_key = api_key\n        self.base_url = base_url\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(\n        self,\n        messages,\n        temperature=0.0,\n        top_p=0.8,\n        repetition_penalty=1.05,\n        max_new_tokens=512,\n        **kwargs\n    ):\n        api_key = self.api_key or os.getenv(\"vLLM_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"A vLLM API key needs to be provided in either the api_key parameter or as an environment variable named vLLM_API_KEY\"\n            )\n        base_url = self.base_url or os.getenv(\"vLLM_ENDPOINT_URL\")\n        if base_url is None:\n            raise ValueError(\n                \"An endpoint URL needs to be provided in either the endpoint_url parameter or as an environment variable named vLLM_ENDPOINT_URL\"\n            )\n        if not 
self.llm_client:\n            self.llm_client = OpenAI(base_url=base_url, api_key=api_key)\n        completion = self.llm_client.chat.completions.create(\n            model=self.model,\n            messages=messages,\n            max_tokens=max_new_tokens if max_new_tokens else 4096,\n            temperature=temperature,\n            top_p=top_p,\n            extra_body={\"repetition_penalty\": repetition_penalty},\n        )\n        return completion.choices[0].message.content\n\n\nclass LMMEngineHuggingFace(LMMEngine):\n    def __init__(self, base_url=None, api_key=None, rate_limit=-1, **kwargs):\n        self.base_url = base_url\n        self.api_key = api_key\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"HF_TOKEN\")\n        if api_key is None:\n            raise ValueError(\n                \"A HuggingFace token needs to be provided in either the api_key parameter or as an environment variable named HF_TOKEN\"\n            )\n        base_url = self.base_url\n        if base_url is None:\n            raise ValueError(\n                \"HuggingFace endpoint must be provided as base_url parameter.\"\n            )\n        if not self.llm_client:\n            self.llm_client = OpenAI(base_url=base_url, api_key=api_key)\n        return (\n            self.llm_client.chat.completions.create(\n                model=\"tgi\",\n                messages=messages,\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temperature,\n                **kwargs,\n            )\n            .choices[0]\n            .message.content\n        )\n\n\nclass LMMEngineParasail(LMMEngine):\n    def __init__(self, api_key=None, 
model=None, rate_limit=-1, **kwargs):\n        assert model is not None, \"Parasail model id must be provided\"\n        self.model = model\n        self.api_key = api_key\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"PARASAIL_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"A Parasail API key needs to be provided in either the api_key parameter or as an environment variable named PARASAIL_API_KEY\"\n            )\n        if not self.llm_client:\n            self.llm_client = OpenAI(\n                base_url=\"https://api.parasail.io/v1\", api_key=api_key\n            )\n        return (\n            self.llm_client.chat.completions.create(\n                model=self.model,\n                messages=messages,\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temperature,\n                **kwargs,\n            )\n            .choices[0]\n            .message.content\n        )\n"
  },
  {
    "path": "gui_agents/s2/core/knowledge.py",
    "content": "import json\nimport os\nfrom typing import Dict, Tuple\n\nimport numpy as np\nfrom sklearn.metrics.pairwise import cosine_similarity\n\nfrom gui_agents.s2.core.module import BaseModule\nfrom gui_agents.s2.memory.procedural_memory import PROCEDURAL_MEMORY\nfrom gui_agents.s2.utils.common_utils import (\n    call_llm_safe,\n    load_embeddings,\n    load_knowledge_base,\n    save_embeddings,\n)\nfrom gui_agents.s2.utils.query_perplexica import query_to_perplexica\n\n\nclass KnowledgeBase(BaseModule):\n    def __init__(\n        self,\n        embedding_engine,\n        local_kb_path: str,\n        platform: str,\n        engine_params: Dict,\n        save_knowledge: bool = True,\n    ):\n        super().__init__(engine_params, platform)\n\n        self.local_kb_path = local_kb_path\n\n        # initialize embedding engine\n        self.embedding_engine = embedding_engine\n\n        # Initialize paths for different memory types\n        self.episodic_memory_path = os.path.join(\n            self.local_kb_path, self.platform, \"episodic_memory.json\"\n        )\n        self.narrative_memory_path = os.path.join(\n            self.local_kb_path, self.platform, \"narrative_memory.json\"\n        )\n        self.embeddings_path = os.path.join(\n            self.local_kb_path, self.platform, \"embeddings.pkl\"\n        )\n\n        # Initialize trajectory tracking\n        self.task_trajectory = \"\"\n        self.current_subtask_trajectory = \"\"\n        self.current_search_query = \"\"\n\n        self.rag_module_system_prompt = PROCEDURAL_MEMORY.RAG_AGENT.replace(\n            \"CURRENT_OS\", self.platform\n        )\n\n        # All three agents share a generic RAG prompt that asks the agent to provide information for UI automation in CURRENT_OS\n        self.query_formulator = self._create_agent(self.rag_module_system_prompt)\n        self.llm_search_agent = self._create_agent(self.rag_module_system_prompt)\n        self.knowledge_fusion_agent = 
self._create_agent(self.rag_module_system_prompt)\n\n        self.narrative_summarization_agent = self._create_agent(\n            PROCEDURAL_MEMORY.TASK_SUMMARIZATION_PROMPT\n        )\n        self.episode_summarization_agent = self._create_agent(\n            PROCEDURAL_MEMORY.SUBTASK_SUMMARIZATION_PROMPT\n        )\n\n        self.save_knowledge = save_knowledge\n\n    def retrieve_knowledge(\n        self, instruction: str, search_query: str, search_engine: str = \"llm\"\n    ) -> Tuple[str, str]:\n        \"\"\"Retrieve knowledge using search engine\n        Args:\n            instruction (str): task instruction\n            observation (Dict): current observation\n            search_engine (str): search engine to use\"\"\"\n\n        # Use search engine to retrieve knowledge based on the formulated query\n        search_results = self._search(instruction, search_query, search_engine)\n\n        return search_query, search_results\n\n    def formulate_query(self, instruction: str, observation: Dict) -> str:\n        \"\"\"Formulate search query based on instruction and current state\"\"\"\n        query_path = os.path.join(\n            self.local_kb_path, self.platform, \"formulate_query.json\"\n        )\n        try:\n            with open(query_path, \"r\") as f:\n                formulate_query = json.load(f)\n        except:\n            formulate_query = {}\n\n        if instruction in formulate_query:\n            return formulate_query[instruction]\n\n        self.query_formulator.reset()\n\n        self.query_formulator.add_message(\n            f\"The task is: {instruction}\\n\"\n            \"To use google search to get some useful information, first carefully analyze \"\n            \"the screenshot of the current desktop UI state, then given the task \"\n            \"instruction, formulate a question that can be used to search on the Internet \"\n            \"for information in helping with the task execution.\\n\"\n            \"The question 
should not be too general or too specific. Please ONLY provide \"\n            \"the question.\\nQuestion:\",\n            image_content=(\n                observation[\"screenshot\"] if \"screenshot\" in observation else None\n            ),\n            role=\"user\",\n        )\n\n        search_query = self.query_formulator.get_response().strip().replace('\"', \"\")\n        print(\"search query: \", search_query)\n        formulate_query[instruction] = search_query\n        with open(query_path, \"w\") as f:\n            json.dump(formulate_query, f, indent=2)\n\n        return search_query\n\n    def _search(self, instruction: str, search_query: str, search_engine: str) -> str:\n        \"\"\"Execute search using specified engine\"\"\"\n\n        # Default to perplexica rag knowledge to see if the query exists\n        file = os.path.join(\n            self.local_kb_path, self.platform, f\"{search_engine}_rag_knowledge.json\"\n        )\n\n        try:\n            with open(file, \"r\") as f:\n                exist_search_results = json.load(f)\n        except:\n            exist_search_results = {}\n\n        if instruction in exist_search_results:\n            return exist_search_results[instruction]\n        if search_engine.lower() == \"llm\":\n            self.llm_search_agent.reset()\n            # Use LLM's internal knowledge like a search engine\n            self.llm_search_agent.add_message(search_query, role=\"user\")\n            search_results = self.llm_search_agent.get_response()\n        elif search_engine.lower() == \"perplexica\":\n            # Use perplexica to search for the query\n            search_results = query_to_perplexica(search_query)\n        else:\n            raise ValueError(f\"Unsupported search engine: {search_engine}\")\n\n        exist_search_results[instruction] = search_results.strip()\n        with open(\n            os.path.join(\n                self.local_kb_path,\n                self.platform,\n                
f\"{search_engine}_rag_knowledge.json\",\n            ),\n            \"w\",\n        ) as f:\n            json.dump(exist_search_results, f, indent=2)\n\n        return search_results\n\n    def retrieve_narrative_experience(self, instruction: str) -> Tuple[str, str]:\n        \"\"\"Retrieve narrative experience using embeddings\"\"\"\n\n        knowledge_base = load_knowledge_base(self.narrative_memory_path)\n        if not knowledge_base:\n            return \"None\", \"None\"\n\n        embeddings = load_embeddings(self.embeddings_path)\n\n        # Get or create instruction embedding\n        instruction_embedding = embeddings.get(instruction)\n\n        if instruction_embedding is None:\n            instruction_embedding = self.embedding_engine.get_embeddings(instruction)\n            embeddings[instruction] = instruction_embedding\n\n        # Get or create embeddings for knowledge base entries\n        candidate_embeddings = []\n        for key in knowledge_base:\n            candidate_embedding = embeddings.get(key)\n            if candidate_embedding is None:\n                candidate_embedding = self.embedding_engine.get_embeddings(key)\n                embeddings[key] = candidate_embedding\n\n            candidate_embeddings.append(candidate_embedding)\n\n        save_embeddings(self.embeddings_path, embeddings)\n\n        similarities = cosine_similarity(\n            instruction_embedding, np.vstack(candidate_embeddings)\n        )[0]\n        sorted_indices = np.argsort(similarities)[::-1]\n\n        keys = list(knowledge_base.keys())\n        idx = 1 if keys[sorted_indices[0]] == instruction else 0\n        return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]]\n\n    def retrieve_episodic_experience(self, instruction: str) -> Tuple[str, str]:\n        \"\"\"Retrieve similar task experience using embeddings\"\"\"\n\n        knowledge_base = load_knowledge_base(self.episodic_memory_path)\n        if not knowledge_base:\n         
   return \"None\", \"None\"\n\n        embeddings = load_embeddings(self.embeddings_path)\n\n        # Get or create instruction embedding\n        instruction_embedding = embeddings.get(instruction)\n\n        if instruction_embedding is None:\n            instruction_embedding = self.embedding_engine.get_embeddings(instruction)\n            embeddings[instruction] = instruction_embedding\n\n        # Get or create embeddings for knowledge base entries\n        candidate_embeddings = []\n        for key in knowledge_base:\n            candidate_embedding = embeddings.get(key)\n            if candidate_embedding is None:\n                candidate_embedding = self.embedding_engine.get_embeddings(key)\n                embeddings[key] = candidate_embedding\n\n            candidate_embeddings.append(candidate_embedding)\n\n        save_embeddings(self.embeddings_path, embeddings)\n\n        similarities = cosine_similarity(\n            instruction_embedding, np.vstack(candidate_embeddings)\n        )[0]\n        sorted_indices = np.argsort(similarities)[::-1]\n\n        keys = list(knowledge_base.keys())\n        idx = 1 if keys[sorted_indices[0]] == instruction else 0\n        return keys[sorted_indices[idx]], knowledge_base[keys[sorted_indices[idx]]]\n\n    def knowledge_fusion(\n        self,\n        observation: Dict,\n        instruction: str,\n        web_knowledge: str,\n        similar_task: str,\n        experience: str,\n    ) -> str:\n        \"\"\"Combine web knowledge with similar task experience\"\"\"\n\n        self.knowledge_fusion_agent.reset()\n\n        self.knowledge_fusion_agent.add_message(\n            f\"Task: {instruction}\\n\"\n            f\"**Web search result**:\\n{web_knowledge}\\n\\n\"\n            f\"**Retrieved similar task experience**:\\n\"\n            f\"Similar task:{similar_task}\\n{experience}\\n\\n\"\n            f\"Based on the web search result and the retrieved similar task experience, \"\n            f\"if you think the 
similar task experience is indeed useful to the main task, \"\n            f\"integrate it with the web search result. Provide the final knowledge in a numbered list.\",\n            image_content=(\n                observation[\"screenshot\"] if \"screenshot\" in observation else None\n            ),\n            role=\"user\",\n        )\n        return self.knowledge_fusion_agent.get_response()\n\n    def save_episodic_memory(self, subtask_key: str, subtask_traj: str) -> None:\n        \"\"\"Save episodic memory (subtask level knowledge).\n\n        Args:\n            subtask_key (str): Key identifying the subtask\n            subtask_traj (str): Trajectory/experience of the subtask\n        \"\"\"\n        if not self.save_knowledge:\n            return\n\n        try:\n            kb = load_knowledge_base(self.episodic_memory_path)\n        except:\n            kb = {}\n\n        if subtask_key not in kb:\n            subtask_summarization = self.summarize_episode(subtask_traj)\n            kb[subtask_key] = subtask_summarization\n\n            os.makedirs(os.path.dirname(self.episodic_memory_path), exist_ok=True)\n            with open(self.episodic_memory_path, \"w\") as fout:\n                json.dump(kb, fout, indent=2)\n\n        return kb.get(subtask_key)\n\n    def save_narrative_memory(self, task_key: str, task_traj: str) -> None:\n        \"\"\"Save narrative memory (task level knowledge).\n\n        Args:\n            task_key (str): Key identifying the task\n            task_traj (str): Full trajectory/experience of the task\n        \"\"\"\n        if not self.save_knowledge:\n            return\n\n        try:\n            kb = load_knowledge_base(self.narrative_memory_path)\n        except:\n            kb = {}\n\n        if task_key not in kb:\n            task_summarization = self.summarize_narrative(task_traj)\n            kb[task_key] = task_summarization\n\n            os.makedirs(os.path.dirname(self.narrative_memory_path), 
exist_ok=True)\n            with open(self.narrative_memory_path, \"w\") as fout:\n                json.dump(kb, fout, indent=2)\n\n        return kb.get(task_key)\n\n    def initialize_task_trajectory(self, instruction: str) -> None:\n        \"\"\"Initialize a new task trajectory.\n\n        Args:\n            instruction (str): The task instruction\n        \"\"\"\n        self.task_trajectory = f\"Task:\\n{instruction}\"\n        self.current_search_query = \"\"\n        self.current_subtask_trajectory = \"\"\n\n    def update_task_trajectory(self, meta_data: Dict) -> None:\n        \"\"\"Update the task trajectory with new metadata.\n\n        Args:\n            meta_data (Dict): Metadata from the agent's prediction\n        \"\"\"\n        if not self.current_search_query and \"search_query\" in meta_data:\n            self.current_search_query = meta_data[\"search_query\"]\n\n        self.task_trajectory += (\n            \"\\n\\nReflection:\\n\"\n            + str(meta_data[\"reflection\"])\n            + \"\\n\\n----------------------\\n\\nPlan:\\n\"\n            + meta_data[\"executor_plan\"]\n        )\n\n    def handle_subtask_trajectory(self, meta_data: Dict) -> None:\n        \"\"\"Handle subtask trajectory updates based on subtask status.\n\n        Args:\n            meta_data (Dict): Metadata containing subtask information\n\n        Returns:\n            bool: Whether the subtask was completed\n        \"\"\"\n        subtask_status = meta_data[\"subtask_status\"]\n        subtask = meta_data[\"subtask\"]\n        subtask_info = meta_data[\"subtask_info\"]\n\n        if subtask_status in [\"Start\", \"Done\"]:\n            # If there's an existing subtask trajectory, finalize it\n            if self.current_subtask_trajectory:\n                self.current_subtask_trajectory += \"\\nSubtask Completed.\\n\"\n                subtask_key = self.current_subtask_trajectory.split(\n                    \"\\n----------------------\\n\\nPlan:\\n\"\n        
        )[0]\n                self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)\n                self.current_subtask_trajectory = \"\"\n                return True\n\n            # Start new subtask trajectory\n            self.current_subtask_trajectory = (\n                f\"Task:\\n{self.current_search_query}\\n\\n\"\n                f\"Subtask: {subtask}\\n\"\n                f\"Subtask Instruction: {subtask_info}\\n\"\n                f\"----------------------\\n\\n\"\n                f'Plan:\\n{meta_data[\"executor_plan\"]}\\n'\n            )\n            return False\n\n        elif subtask_status == \"In\":\n            # Continue current subtask trajectory\n            self.current_subtask_trajectory += (\n                f'\\n----------------------\\n\\nPlan:\\n{meta_data[\"executor_plan\"]}\\n'\n            )\n            return False\n\n    def finalize_task(self) -> None:\n        \"\"\"Finalize the task by saving any remaining trajectories.\"\"\"\n        # Save any remaining subtask trajectory\n        if self.current_subtask_trajectory:\n            self.current_subtask_trajectory += \"\\nSubtask Completed.\\n\"\n            subtask_key = self.current_subtask_trajectory.split(\n                \"\\n----------------------\\n\\nPlan:\\n\"\n            )[0]\n            self.save_episodic_memory(subtask_key, self.current_subtask_trajectory)\n\n        # Save the complete task trajectory\n        if self.task_trajectory and self.current_search_query:\n            self.save_narrative_memory(self.current_search_query, self.task_trajectory)\n\n        # Reset trajectories\n        self.task_trajectory = \"\"\n        self.current_subtask_trajectory = \"\"\n        self.current_search_query = \"\"\n\n    def summarize_episode(self, trajectory):\n        \"\"\"Summarize the episode experience for lifelong learning reflection\n        Args:\n            trajectory: str: The episode experience to be summarized\n        \"\"\"\n\n        
# Create Reflection on whole trajectories for next round trial, keep earlier messages as exemplars\n        self.episode_summarization_agent.add_message(trajectory)\n        subtask_summarization = call_llm_safe(self.episode_summarization_agent)\n        self.episode_summarization_agent.add_message(subtask_summarization)\n\n        return subtask_summarization\n\n    def summarize_narrative(self, trajectory):\n        \"\"\"Summarize the narrative experience for lifelong learning reflection\n        Args:\n            trajectory: str: The narrative experience to be summarized\n        \"\"\"\n        # Create Reflection on whole trajectories for next round trial\n        self.narrative_summarization_agent.add_message(trajectory)\n        task_summarization = call_llm_safe(self.narrative_summarization_agent)\n\n        return task_summarization\n"
  },
  {
    "path": "gui_agents/s2/core/mllm.py",
    "content": "import base64\n\nimport numpy as np\n\nfrom gui_agents.s2.core.engine import (\n    LMMEngineAnthropic,\n    LMMEngineAzureOpenAI,\n    LMMEngineHuggingFace,\n    LMMEngineOpenAI,\n    LMMEngineOpenRouter,\n    LMMEngineParasail,\n    LMMEnginevLLM,\n    LMMEngineGemini,\n)\n\n\nclass LMMAgent:\n    def __init__(self, engine_params=None, system_prompt=None, engine=None):\n        if engine is None:\n            if engine_params is not None:\n                engine_type = engine_params.get(\"engine_type\")\n                if engine_type == \"openai\":\n                    self.engine = LMMEngineOpenAI(**engine_params)\n                elif engine_type == \"anthropic\":\n                    self.engine = LMMEngineAnthropic(**engine_params)\n                elif engine_type == \"azure\":\n                    self.engine = LMMEngineAzureOpenAI(**engine_params)\n                elif engine_type == \"vllm\":\n                    self.engine = LMMEnginevLLM(**engine_params)\n                elif engine_type == \"huggingface\":\n                    self.engine = LMMEngineHuggingFace(**engine_params)\n                elif engine_type == \"gemini\":\n                    self.engine = LMMEngineGemini(**engine_params)\n                elif engine_type == \"open_router\":\n                    self.engine = LMMEngineOpenRouter(**engine_params)\n                elif engine_type == \"parasail\":\n                    self.engine = LMMEngineParasail(**engine_params)\n                else:\n                    raise ValueError(\"engine_type is not supported\")\n            else:\n                raise ValueError(\"engine_params must be provided\")\n        else:\n            self.engine = engine\n\n        self.messages = []  # Empty messages\n\n        if system_prompt:\n            self.add_system_prompt(system_prompt)\n        else:\n            self.add_system_prompt(\"You are a helpful assistant.\")\n\n    def encode_image(self, image_content):\n        # if 
image_content is a path to an image file, check type of the image_content to verify\n        if isinstance(image_content, str):\n            with open(image_content, \"rb\") as image_file:\n                return base64.b64encode(image_file.read()).decode(\"utf-8\")\n        else:\n            return base64.b64encode(image_content).decode(\"utf-8\")\n\n    def reset(\n        self,\n    ):\n\n        self.messages = [\n            {\n                \"role\": \"system\",\n                \"content\": [{\"type\": \"text\", \"text\": self.system_prompt}],\n            }\n        ]\n\n    def add_system_prompt(self, system_prompt):\n        self.system_prompt = system_prompt\n        if len(self.messages) > 0:\n            self.messages[0] = {\n                \"role\": \"system\",\n                \"content\": [{\"type\": \"text\", \"text\": self.system_prompt}],\n            }\n        else:\n            self.messages.append(\n                {\n                    \"role\": \"system\",\n                    \"content\": [{\"type\": \"text\", \"text\": self.system_prompt}],\n                }\n            )\n\n    def remove_message_at(self, index):\n        \"\"\"Remove a message at a given index\"\"\"\n        if index < len(self.messages):\n            self.messages.pop(index)\n\n    def replace_message_at(\n        self, index, text_content, image_content=None, image_detail=\"high\"\n    ):\n        \"\"\"Replace a message at a given index\"\"\"\n        if index < len(self.messages):\n            self.messages[index] = {\n                \"role\": self.messages[index][\"role\"],\n                \"content\": [{\"type\": \"text\", \"text\": text_content}],\n            }\n            if image_content:\n                base64_image = self.encode_image(image_content)\n                self.messages[index][\"content\"].append(\n                    {\n                        \"type\": \"image_url\",\n                        \"image_url\": {\n                           
 \"url\": f\"data:image/png;base64,{base64_image}\",\n                            \"detail\": image_detail,\n                        },\n                    }\n                )\n\n    def add_message(\n        self,\n        text_content,\n        image_content=None,\n        role=None,\n        image_detail=\"high\",\n        put_text_last=False,\n    ):\n        \"\"\"Add a new message to the list of messages\"\"\"\n\n        # API-style inference from OpenAI and AzureOpenAI\n        if isinstance(\n            self.engine,\n            (\n                LMMEngineOpenAI,\n                LMMEngineAzureOpenAI,\n                LMMEngineHuggingFace,\n                LMMEngineGemini,\n                LMMEngineOpenRouter,\n                LMMEngineParasail,\n            ),\n        ):\n            # infer role from previous message\n            if role != \"user\":\n                if self.messages[-1][\"role\"] == \"system\":\n                    role = \"user\"\n                elif self.messages[-1][\"role\"] == \"user\":\n                    role = \"assistant\"\n                elif self.messages[-1][\"role\"] == \"assistant\":\n                    role = \"user\"\n\n            message = {\n                \"role\": role,\n                \"content\": [{\"type\": \"text\", \"text\": text_content}],\n            }\n\n            if isinstance(image_content, np.ndarray) or image_content:\n                # Check if image_content is a list or a single image\n                if isinstance(image_content, list):\n                    # If image_content is a list of images, loop through each image\n                    for image in image_content:\n                        base64_image = self.encode_image(image)\n                        message[\"content\"].append(\n                            {\n                                \"type\": \"image_url\",\n                                \"image_url\": {\n                                    \"url\": 
f\"data:image/png;base64,{base64_image}\",\n                                    \"detail\": image_detail,\n                                },\n                            }\n                        )\n                else:\n                    # If image_content is a single image, handle it directly\n                    base64_image = self.encode_image(image_content)\n                    message[\"content\"].append(\n                        {\n                            \"type\": \"image_url\",\n                            \"image_url\": {\n                                \"url\": f\"data:image/png;base64,{base64_image}\",\n                                \"detail\": image_detail,\n                            },\n                        }\n                    )\n\n            # Rotate text to be the last message if desired\n            if put_text_last:\n                text_content = message[\"content\"].pop(0)\n                message[\"content\"].append(text_content)\n\n            self.messages.append(message)\n\n        # For API-style inference from Anthropic\n        elif isinstance(self.engine, LMMEngineAnthropic):\n            # infer role from previous message\n            if role != \"user\":\n                if self.messages[-1][\"role\"] == \"system\":\n                    role = \"user\"\n                elif self.messages[-1][\"role\"] == \"user\":\n                    role = \"assistant\"\n                elif self.messages[-1][\"role\"] == \"assistant\":\n                    role = \"user\"\n\n            message = {\n                \"role\": role,\n                \"content\": [{\"type\": \"text\", \"text\": text_content}],\n            }\n\n            if image_content:\n                # Check if image_content is a list or a single image\n                if isinstance(image_content, list):\n                    # If image_content is a list of images, loop through each image\n                    for image in image_content:\n                      
  base64_image = self.encode_image(image)\n                        message[\"content\"].append(\n                            {\n                                \"type\": \"image\",\n                                \"source\": {\n                                    \"type\": \"base64\",\n                                    \"media_type\": \"image/png\",\n                                    \"data\": base64_image,\n                                },\n                            }\n                        )\n                else:\n                    # If image_content is a single image, handle it directly\n                    base64_image = self.encode_image(image_content)\n                    message[\"content\"].append(\n                        {\n                            \"type\": \"image\",\n                            \"source\": {\n                                \"type\": \"base64\",\n                                \"media_type\": \"image/png\",\n                                \"data\": base64_image,\n                            },\n                        }\n                    )\n            self.messages.append(message)\n\n        # Locally hosted vLLM model inference\n        elif isinstance(self.engine, LMMEnginevLLM):\n            # infer role from previous message\n            if role != \"user\":\n                if self.messages[-1][\"role\"] == \"system\":\n                    role = \"user\"\n                elif self.messages[-1][\"role\"] == \"user\":\n                    role = \"assistant\"\n                elif self.messages[-1][\"role\"] == \"assistant\":\n                    role = \"user\"\n\n            message = {\n                \"role\": role,\n                \"content\": [{\"type\": \"text\", \"text\": text_content}],\n            }\n\n            if image_content:\n                # Check if image_content is a list or a single image\n                if isinstance(image_content, list):\n                    # If image_content is a 
list of images, loop through each image\n                    for image in image_content:\n                        base64_image = self.encode_image(image)\n                        message[\"content\"].append(\n                            {\n                                \"type\": \"image_url\",\n                                \"image_url\": {\n                                    \"url\": f\"data:image;base64,{base64_image}\"\n                                },\n                            }\n                        )\n                else:\n                    # If image_content is a single image, handle it directly\n                    base64_image = self.encode_image(image_content)\n                    message[\"content\"].append(\n                        {\n                            \"type\": \"image_url\",\n                            \"image_url\": {\"url\": f\"data:image;base64,{base64_image}\"},\n                        }\n                    )\n\n            self.messages.append(message)\n        else:\n            raise ValueError(\"engine_type is not supported\")\n\n    def get_response(\n        self,\n        user_message=None,\n        messages=None,\n        temperature=0.0,\n        max_new_tokens=None,\n        **kwargs,\n    ):\n        \"\"\"Generate the next response based on previous messages\"\"\"\n        if messages is None:\n            messages = self.messages\n        if user_message:\n            messages.append(\n                {\"role\": \"user\", \"content\": [{\"type\": \"text\", \"text\": user_message}]}\n            )\n\n        return self.engine.generate(\n            messages,\n            temperature=temperature,\n            max_new_tokens=max_new_tokens,\n            **kwargs,\n        )\n"
  },
  {
    "path": "gui_agents/s2/core/module.py",
    "content": "from typing import Dict, Optional\nfrom gui_agents.s2.core.mllm import LMMAgent\n\n\nclass BaseModule:\n    def __init__(self, engine_params: Dict, platform: str):\n        self.engine_params = engine_params\n        self.platform = platform\n\n    def _create_agent(\n        self, system_prompt: str = None, engine_params: Optional[Dict] = None\n    ) -> LMMAgent:\n        \"\"\"Create a new LMMAgent instance\"\"\"\n        agent = LMMAgent(engine_params or self.engine_params)\n        if system_prompt:\n            agent.add_system_prompt(system_prompt)\n        return agent\n"
  },
  {
    "path": "gui_agents/s2/memory/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s2/memory/procedural_memory.py",
    "content": "import inspect\nimport textwrap\n\n\nclass PROCEDURAL_MEMORY:\n\n    @staticmethod\n    def construct_worker_procedural_memory(agent_class, skipped_actions):\n        procedural_memory = textwrap.dedent(\n            f\"\"\"\\\n        You are an expert in graphical user interfaces and Python code. You are responsible for executing the current subtask: `SUBTASK_DESCRIPTION` of the larger goal: `TASK_DESCRIPTION`.\n        IMPORTANT: ** The subtasks: ['DONE_TASKS'] have already been done. The future subtasks ['FUTURE_TASKS'] will be done in the future by me. You must only perform the current subtask: `SUBTASK_DESCRIPTION`. Do not try to do future subtasks. **\n        You are working in CURRENT_OS. You must only complete the subtask provided and not the larger goal.\n        You are provided with:\n        1. A screenshot of the current time step.\n        2. The history of your previous interactions with the UI.\n        3. Access to the following class and methods to interact with the UI:\n        class Agent:\n        \"\"\"\n        )\n\n        for attr_name in dir(agent_class):\n            if attr_name in skipped_actions:\n                continue\n\n            attr = getattr(agent_class, attr_name)\n            if callable(attr) and hasattr(attr, \"is_agent_action\"):\n                # Use inspect to get the full function signature\n                signature = inspect.signature(attr)\n                procedural_memory += f\"\"\"\n    def {attr_name}{signature}:\n    '''{attr.__doc__}'''\n        \"\"\"\n\n        procedural_memory += textwrap.dedent(\n            \"\"\"\n        Your response should be formatted like this:\n        (Previous action verification)\n        Carefully analyze based on the screenshot if the previous action was successful. 
If the previous action was not successful, provide a reason for the failure.\n\n        (Screenshot Analysis)\n        Closely examine and describe the current state of the desktop along with the currently open applications.\n\n        (Next Action)\n        Based on the current screenshot and the history of your previous interaction with the UI, decide on the next action in natural language to accomplish the given task.\n\n        (Grounded Action)\n        Translate the next action into code using the provided API methods. Format the code like this:\n        ```python\n        agent.click(\"The menu button at the top right of the window\", 1, \"left\")\n        ```\n        Note for the code:\n        1. Only perform one action at a time.\n        2. Do not put anything other than python code in the block. You can only use one function call at a time. Do not put more than one function call in the block.\n        3. You must use only the available methods provided above to interact with the UI, do not invent new methods.\n        4. Only return one code block every time. There must be a single line of code in the code block.\n        5. If you think the task is already completed, return `agent.done()` in the code block.\n        6. If you think the task cannot be completed, return `agent.fail()` in the code block.\n        7. Do not do anything other than the exact specified task. Return with `agent.done()` immediately after the task is completed or `agent.fail()` if it cannot be completed.\n        8. Whenever possible, your grounded action should use hot-keys with the agent.hotkey() action instead of clicking or dragging.\n        9. My computer's password is 'password', feel free to use it when you need sudo rights.\n        10. 
Do not use the \"command\" + \"tab\" hotkey on MacOS.\n        \"\"\"\n        )\n\n        return procedural_memory.strip()\n\n    # Manager prompt that generalizes to initial planning, re-planning after subtask completion, and re-planning after failure\n    COMBINED_MANAGER_PROMPT = textwrap.dedent(\n        \"\"\"\n    You are an expert planning agent for solving GUI navigation tasks. You need to generate a plan for solving the following task: TASK_DESCRIPTION.\n\n    You are provided with:\n    1. The state of the computer screen through a desktop screenshot and other related information\n    2. (If available) A list of successfully completed subtasks\n    3. (If available) A list of future remaining subtasks\n\n    Your responsibilities:\n    1. Generate a new plan or revise the pre-existing plan to complete the task\n    2. Ensure the plan is concise and contains only necessary steps\n    3. Carefully observe and understand the current state of the computer before generating your plan\n    4. Avoid including steps in your plan that the task does not ask for\n\n    Below are important considerations when generating your plan:\n    1. Provide the plan in a step-by-step format with detailed descriptions for each subtask.\n    2. Do not repeat subtasks that have already been successfully completed. Only plan for the remainder of the main task.\n    3. Do not include verification steps in your planning. Steps that confirm or validate other subtasks should not be included.\n    4. Do not include optional steps in your planning. Your plan must be as concise as possible.\n    5. Do not include unnecessary steps in your planning. If you are unsure if a step is necessary, do not include it in your plan.\n    6. 
When revising an existing plan:\n      - If you feel the trajectory and future subtasks seem correct based on the current state of the desktop, you may re-use future subtasks.\n      - If you feel some future subtasks are not detailed enough, use your observations from the desktop screenshot to update these subtasks to be more detailed.\n      - If you feel some future subtasks are incorrect or unnecessary, feel free to modify or even remove them.\n    \"\"\"\n    )\n\n    # USED IN OSWORLD EXPERIMENTS\n    RAG_AGENT_OSWORLD = \"\"\"\n    Given a desktop computer task instruction, you are an agent which should provide useful information as requested, to help another agent follow the instruction and perform the task.\n    The domain of the desktop computer task is from [CURRENT_OS, VLC, LibreOffice, Chrome, Thunderbird, VS Code, GIMP].\n    The task is: TASK_DESCRIPTION\n    The simplified accessibility tree of the current computer UI is: ACCESSIBLITY_TREE\n    \"\"\"\n\n    RAG_AGENT = \"\"\"\n    Given a desktop computer task instruction, you are an agent which should provide useful information as requested, to help another agent follow the instruction and perform the task in CURRENT_OS.\n    \"\"\"\n\n    # For reflection agent, post-action verification mainly for cycle detection\n    REFLECTION_ON_TRAJECTORY = textwrap.dedent(\n        \"\"\"\n    You are a reflection agent designed to assist in subtask execution by reflecting on the trajectory of a subtask and providing feedback for what the next step should be.\n    You have access to the Subtask Description and the Current Trajectory of another computer agent. The Current Trajectory is a sequence of a desktop image, chain-of-thought reasoning, and a desktop action for each time step. The last image is the screen's display after the last action.\n    Your task is to generate a reflection. Your generated reflection must fall under one of the two cases listed below:\n\n    Case 1. 
The trajectory is not going according to plan. This is often due to the latest action not being executed correctly, or a cycle of actions being continually repeated with no progress being made. In this case, explicitly highlight why the current trajectory is incorrect, and encourage the computer agent to try a new action. However, DO NOT encourage a specific action in particular.\n    Case 2. The trajectory is going according to plan. In this case, simply tell the agent to continue proceeding as planned. DO NOT encourage a specific action in particular.\n    \n    To be successful, you must follow the rules below:\n    - DO NOT suggest any specific future plans or actions. Your only goal is to provide a reflection, not an actual plan or action.\n    - Any response that falls under Case 1 should explain why the trajectory is not going according to plan. You should especially lookout for cycles of actions that are continually repeated with no progress.\n    - Any response that falls under Case 2 should be concise, since you just need to affirm the agent to continue with the current trajectory.\n    \"\"\"\n    )\n\n    TASK_SUMMARIZATION_PROMPT = \"\"\"\n    You are a summarization agent designed to analyze a trajectory of desktop task execution.\n    You have access to the Task Description and Whole Trajectory including plan, verification and reflection at each step.\n    Your summarized information will be referred to by another agent when performing the tasks.\n    You should follow the below instructions:\n    1. If the task is successfully executed, you should summarize the successful plan based on the whole trajectory to finish the task.\n    2. Otherwise, provide the reasons why the task is failed and potential suggestions that may avoid this failure.\n\n    **ATTENTION**\n    1. Only extract the correct plan and do not provide redundant steps.\n    2. Do not contain grounded actions in the plan.\n    3. 
If there are the successfully used hot-keys, make sure to include them in the plan.\n    4. The suggestions are for another agent not human, so they must be doable through the agent's action.\n    5. Don't generate high-level suggestions (e.g., Implement Error Handling).\n    \"\"\"\n\n    DAG_TRANSLATOR_PROMPT = \"\"\"You are a plan to Dependency Graph conversion agent. Your task is to analyze a given plan and generate a structured JSON output representing the plan and its corresponding directed acyclic graph (DAG).\n\nThe output should be a valid JSON object wrapped in <json></json> tags, with the following structure:\n\n<json>\n{\n  \"dag\": {\n    \"nodes\": [\n      {\n        \"name\": \"Short name or brief description of the step\",\n        \"info\": \"Detailed information about executing this step\"\n      }\n    ],\n    \"edges\": [\n      [\n        {\"name\": \"Name of the source node\", \"info\": \"Info of the source node\"},\n        {\"name\": \"Name of the target node\", \"info\": \"Info of the target node\"}\n      ]\n    ]\n  }\n}\n</json>\n\nImportant guidelines you must follow:\n1. The \"plan\" field should contain the entire original plan as a string.\n2. In the \"dag\" object:\n   a. Each node in the \"nodes\" array should contain 'name' and 'info' fields.\n   b. 'name' should be a concise, one-line description of the subtask.\n   c. 'info' should contain all available information about executing that subtask from the original plan. Do not remove or edit any information from the 'info' field.\n3. The \"edges\" array should represent the connections between nodes, showing the order and dependencies of the steps.\n4. If the plan only has one subtask, you MUST construct a graph with a SINGLE node. The \"nodes\" array should have that single subtask as a node, and the \"edges\" array should be empty.\n5. The graph must be a directed acyclic graph (DAG) and must be connected.\n6. Do not include completed subtasks in the graph. 
A completed subtask must not be included in a node or an edge.\n7. Do not include repeated or optional steps in the graph. Any extra information should be incorporated into the 'info' field of the relevant node.\n8. It is okay for the graph to have a single node and no edges, if the provided plan only has one subtask.\n\nAnalyze the given plan and provide the output in this JSON format within the <json></json> tags. Ensure the JSON is valid and properly escaped.\n\"\"\"\n\n    SUBTASK_SUMMARIZATION_PROMPT = textwrap.dedent(\n        \"\"\"\n    You are a summarization agent designed to analyze a trajectory of desktop task execution.\n    You will summarize the correct plan and grounded actions based on the whole trajectory of a subtask, ensuring the summarized plan contains only correct and necessary steps.\n\n    **ATTENTION**\n\t  1.\tSummarize the correct plan and its corresponding grounded actions. Carefully filter out any repeated or incorrect steps based on the verification output in the trajectory. Only include the necessary steps for successfully completing the subtask.\n    2.\tDescription Replacement in Grounded Actions:\n        When summarizing grounded actions, the agent.click() and agent.drag_and_drop() grounded actions take a description string as an argument.\n        Replace these description strings with placeholders like \\\"element1_description\\\", \\\"element2_description\\\", etc., while maintaining the total number of parameters.\n        For example, agent.click(\\\"The menu button in the top row\\\", 1) should be converted into agent.click(\\\"element1_description\\\", 1)\n        Ensure the placeholders (\\\"element1_description\\\", \\\"element2_description\\\", ...) follow the order of appearance in the grounded actions.\n\t  3.\tOnly generate grounded actions that are explicitly present in the trajectory. 
Do not introduce any grounded actions that do not exist in the trajectory.\n\t  4.\tFor each step in the plan, provide a corresponding grounded action. Use the exact format:\n    \t  Action: [Description of the correct action]\n    \t  Grounded Action: [Grounded actions with the \\\"element1_description\\\" replacement when needed]\n\t  5.\tExclude any other details that are not necessary for completing the task.\n    \"\"\"\n    )\n\n    STATE_EVALUATOR_SYSTEM_PROMPT = \"\"\"\n    You are an impartial evaluator to evaluate the completeness of the given desktop computer task, you are also an expert of accessibility tree, os environment and python programming.\n    The task is: TASK_DESCRIPTION, it is executed by a digital agent who can perform the task without knowing whether the task requirements are met.\n    As an evaluator, your task is to judge whether the task is finished and meets the task requirement.\n    You have access to the:\n    1. Task instruction.\n    2. The whole actions performed by the digital agent.\n    3. The accessibility tree at the first step and the last step.\n    4. The screenshot at the first step and the last step.\n\n    You are able to proceed your judgment process in the following ways based on the task instruction:\n    1. By comparing the difference in the accessibility trees of the UI, you should judge whether the task is complete given the task instruction.\n    2. If you cannot judge based on the observations, you can evalaute it by writing and running a python script to do a further examination. For example, you can use the 'subprocess' module to run the external command in a terminal to check whether an application has been installed.\n    You can also call the file system API to do the file check, etc. You can also try to interactive with the environment via other methods or interface you are familiared with.\n\n    **IMPORTANT**\n    1. 
If no python script is needed, you should provide your analysis and put the judgment at the end of the response in this format: Judgment: Yes/No\n    2. Otherwise, you should format your response into two parts as shown below:\n        ```python\n        # your code script here\n        ```\n\n    **ATTENTION**\n    1. You should only use scripts when you have to.\n    2. When you generate code script, only return one code block every time, the code block should contain the whole script you want to run. You must guarantee that the script is comprehensive and executable, make sure to print out the scripts' results for subsequent judgement.\n    Additionally, the comment of the code is **PROHIBITED**\n    3. You should strictly follow the response format mentioned above.\n\n    **SUBSEQUENCE**\n    If you have generated the python script, I will execute it and return the corresponding result to you (Started with \"The output after executing the script is:...\"). Then you should judge whether the task has been completed or not comprehensively based on the script and its result,\n    the task information, and the comparison of accessibility trees and screenshots. 
Provide your analysis and put the judgment at the end of the response in this format: Judgment: Yes/No\n    \"\"\"\n\n    OBS_EVALUATOR_SYSTEM_PROMPT = \"\"\"\n    You are an impartial evaluator to evaluate the completeness of the given desktop computer task.\n    The task is: TASK_DESCRIPTION, it is executed by a digital agent who can perform the task without knowing whether the task requirements are met.\n    As an evaluator, your task is to judge whether the task is finished and meets the task requirement.\n    You have access to the task instruction, the whole actions performed by the digital agent, the accessibility tree of the UI and screenshot at the first time step and the last time step.\n    By comparing the difference in the accessibility trees of the UI, you should judge whether the task is complete given the task instruction.\n    Provide your analysis and put the judgment at the end of the response in this format:\n    Judgment: Yes/No\n    Only say Yes or No in the Judgment section. Do not provide any other information in the Judgment section.\n    \"\"\"\n\n    PHRASE_TO_WORD_COORDS_PROMPT = textwrap.dedent(\n        \"\"\"\n    You are an expert in graphical user interfaces. Your task is to process a phrase of text, and identify the most relevant word on the computer screen.\n    You are provided with a phrase, a table with all the text on the screen, and a screenshot of the computer screen. You will identify the single word id that is best associated with the provided phrase.\n    This single word must be displayed on the computer screenshot, and its location on the screen should align with the provided phrase.\n    Each row in the text table provides 2 pieces of data in the following order. 1st is the unique word id. 2nd is the corresponding word.\n\n    To be successful, it is very important to follow all these rules:\n    1. First, think step by step and generate your reasoning about which word id to click on.\n    2. 
Then, output the unique word id. Remember, the word id is the 1st number in each row of the text table.\n    3. If there are multiple occurrences of the same word, use the surrounding context in the phrase to choose the correct one. Pay very close attention to punctuation and capitalization.\n\n    \"\"\"\n    )\n"
  },
  {
    "path": "gui_agents/s2/utils/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s2/utils/common_utils.py",
    "content": "import json\nimport re\nfrom typing import List\nimport time\nimport tiktoken\n\nfrom typing import Tuple, List, Union, Dict\n\nfrom pydantic import BaseModel, ValidationError\n\nimport pickle\n\n\nclass Node(BaseModel):\n    name: str\n    info: str\n\n\nclass Dag(BaseModel):\n    nodes: List[Node]\n    edges: List[List[Node]]\n\n\nNUM_IMAGE_TOKEN = 1105  # Value set of screen of size 1920x1080 for openai vision\n\n\ndef call_llm_safe(agent) -> Union[str, Dag]:\n    # Retry if fails\n    max_retries = 3  # Set the maximum number of retries\n    attempt = 0\n    response = \"\"\n    while attempt < max_retries:\n        try:\n            response = agent.get_response()\n            break  # If successful, break out of the loop\n        except Exception as e:\n            attempt += 1\n            print(f\"Attempt {attempt} failed: {e}\")\n            if attempt == max_retries:\n                print(\"Max retries reached. Handling failure.\")\n        time.sleep(1.0)\n    return response\n\n\ndef calculate_tokens(messages, num_image_token=NUM_IMAGE_TOKEN) -> Tuple[int, int]:\n\n    num_input_images = 0\n    output_message = messages[-1]\n\n    input_message = messages[:-1]\n\n    input_string = \"\"\"\"\"\"\n    for message in input_message:\n        input_string += message[\"content\"][0][\"text\"] + \"\\n\"\n        if len(message[\"content\"]) > 1:\n            num_input_images += 1\n\n    input_text_tokens = get_input_token_length(input_string)\n\n    input_image_tokens = num_image_token * num_input_images\n\n    output_tokens = get_input_token_length(output_message[\"content\"][0][\"text\"])\n\n    return (input_text_tokens + input_image_tokens), output_tokens\n\n\n# Code based on https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/agent.py\n\n\ndef parse_dag(text):\n    pattern = r\"<json>(.*?)</json>\"\n    match = re.search(pattern, text, re.DOTALL)\n    if match:\n        json_str = match.group(1)\n        try:\n            json_data = 
json.loads(json_str)\n            return Dag(**json_data[\"dag\"])\n        except json.JSONDecodeError:\n            print(\"Error: Invalid JSON\")\n            return None\n        except KeyError:\n            print(\"Error: 'dag' key not found in JSON\")\n            return None\n        except ValidationError as e:\n            print(f\"Error: Invalid data structure - {e}\")\n            return None\n    else:\n        print(\"Error: JSON not found\")\n        return None\n\n\ndef parse_dag(text):\n    \"\"\"\n    Try extracting JSON from <json>…</json> tags first;\n    if not found, try ```json … ``` Markdown fences.\n    \"\"\"\n\n    def _extract(pattern):\n        m = re.search(pattern, text, re.DOTALL)\n        return m.group(1).strip() if m else None\n\n    # 1) look for <json>…</json>\n    json_str = _extract(r\"<json>(.*?)</json>\")\n    # 2) fallback to ```json … ```\n    if json_str is None:\n        json_str = _extract(r\"```json\\s*(.*?)\\s*```\")\n\n    if json_str is None:\n        print(\"Error: JSON not found in either <json> tags or ```json``` fence\")\n        return None\n\n    try:\n        payload = json.loads(json_str)\n    except json.JSONDecodeError as e:\n        print(f\"Error: Invalid JSON ({e})\")\n        return None\n\n    if \"dag\" not in payload:\n        print(\"Error: 'dag' key not found in JSON\")\n        return None\n\n    try:\n        return Dag(**payload[\"dag\"])\n    except ValidationError as e:\n        print(f\"Error: Invalid data structure - {e}\")\n        return None\n\n\ndef parse_single_code_from_string(input_string):\n    input_string = input_string.strip()\n    if input_string.strip() in [\"WAIT\", \"DONE\", \"FAIL\"]:\n        return input_string.strip()\n\n    # This regular expression will match both ```code``` and ```python code```\n    # and capture the `code` part. 
It uses a non-greedy match for the content inside.\n    pattern = r\"```(?:\\w+\\s+)?(.*?)```\"\n    # Find all non-overlapping matches in the string\n    matches = re.findall(pattern, input_string, re.DOTALL)\n\n    # The regex above captures the content inside the triple backticks.\n    # The `re.DOTALL` flag allows the dot `.` to match newline characters as well,\n    # so the code inside backticks can span multiple lines.\n\n    # matches now contains all the captured code snippets\n\n    codes = []\n\n    for match in matches:\n        match = match.strip()\n        commands = [\n            \"WAIT\",\n            \"DONE\",\n            \"FAIL\",\n        ]  # fixme: updates this part when we have more commands\n\n        if match in commands:\n            codes.append(match.strip())\n        elif match.split(\"\\n\")[-1] in commands:\n            if len(match.split(\"\\n\")) > 1:\n                codes.append(\"\\n\".join(match.split(\"\\n\")[:-1]))\n            codes.append(match.split(\"\\n\")[-1])\n        else:\n            codes.append(match)\n\n    if len(codes) <= 0:\n        return \"fail\"\n    return codes[0]\n\n\ndef get_input_token_length(input_string):\n    enc = tiktoken.encoding_for_model(\"gpt-4\")\n    tokens = enc.encode(input_string)\n    return len(tokens)\n\n\ndef sanitize_code(code):\n    # This pattern captures the outermost double-quoted text\n    if \"\\n\" in code:\n        pattern = r'(\".*?\")'\n        # Find all matches in the text\n        matches = re.findall(pattern, code, flags=re.DOTALL)\n        if matches:\n            # Replace the first occurrence only\n            first_match = matches[0]\n            code = code.replace(first_match, f'\"\"\"{first_match[1:-1]}\"\"\"', 1)\n    return code\n\n\ndef extract_first_agent_function(code_string):\n    # Regular expression pattern to match 'agent' functions with any arguments, including nested parentheses\n    pattern = 
r'agent\\.[a-zA-Z_]+\\((?:[^()\\'\"]|\\'[^\\']*\\'|\"[^\"]*\")*\\)'\n\n    # Find all matches in the string\n    matches = re.findall(pattern, code_string)\n\n    # Return the first match if found, otherwise return None\n    return matches[0] if matches else None\n\n\ndef load_knowledge_base(kb_path: str) -> Dict:\n    try:\n        with open(kb_path, \"r\") as f:\n            return json.load(f)\n    except Exception as e:\n        print(f\"Error loading knowledge base: {e}\")\n        return {}\n\n\ndef load_embeddings(embeddings_path: str) -> Dict:\n    try:\n        with open(embeddings_path, \"rb\") as f:\n            return pickle.load(f)\n    except Exception as e:\n        print(f\"Error loading embeddings: {e}\")\n        return {}\n\n\ndef save_embeddings(embeddings_path: str, embeddings: Dict):\n    try:\n        with open(embeddings_path, \"wb\") as f:\n            pickle.dump(embeddings, f)\n    except Exception as e:\n        print(f\"Error saving embeddings: {e}\")\n"
  },
  {
    "path": "gui_agents/s2/utils/query_perplexica.py",
    "content": "import requests\nimport os\n\n\ndef query_to_perplexica(query):\n    # Retrieve the URL from an environment variable\n    url = os.getenv(\"PERPLEXICA_URL\")\n    if not url:\n        raise ValueError(\n            \"PERPLEXICA_URL environment variable not set. It may take the form: 'http://localhost:{port}/api/search'. The port number is set in the config.toml in the Perplexica directory.\"\n        )\n\n    # Request Message\n    message = {\"focusMode\": \"webSearch\", \"query\": query, \"history\": [[\"human\", query]]}\n\n    response = requests.post(url, json=message)\n\n    if response.status_code == 200:\n        return response.json()[\"message\"]\n    elif response.status_code == 400:\n        raise ValueError(\n            \"The request is malformed or missing required fields, such as FocusModel or query\"\n        )\n    else:\n        raise ValueError(\"Internal Server Error\")\n\n\n# Test Code\nif __name__ == \"__main__\":\n    query = \"What is Agent S?\"\n    response = query_to_perplexica(query)\n    print(response)\n"
  },
  {
    "path": "gui_agents/s2_5/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s2_5/agents/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s2_5/agents/agent_s.py",
    "content": "import logging\nimport platform\nfrom typing import Dict, List, Tuple\n\nfrom gui_agents.s2_5.agents.grounding import ACI\nfrom gui_agents.s2_5.agents.worker import Worker\n\nlogger = logging.getLogger(\"desktopenv.agent\")\n\n\nclass UIAgent:\n    \"\"\"Base class for UI automation agents\"\"\"\n\n    def __init__(\n        self,\n        engine_params: Dict,\n        grounding_agent: ACI,\n        platform: str = platform.system().lower(),\n    ):\n        \"\"\"Initialize UIAgent\n\n        Args:\n            engine_params: Configuration parameters for the LLM engine\n            grounding_agent: Instance of ACI class for UI interaction\n            platform: Operating system platform (macos, linux, windows)\n        \"\"\"\n        self.engine_params = engine_params\n        self.grounding_agent = grounding_agent\n        self.platform = platform\n\n    def reset(self) -> None:\n        \"\"\"Reset agent state\"\"\"\n        pass\n\n    def predict(self, instruction: str, observation: Dict) -> Tuple[Dict, List[str]]:\n        \"\"\"Generate next action prediction\n\n        Args:\n            instruction: Natural language instruction\n            observation: Current UI state observation\n\n        Returns:\n            Tuple containing agent info dictionary and list of actions\n        \"\"\"\n        pass\n\n\nclass AgentS2_5(UIAgent):\n    \"\"\"Agent that uses no hierarchy for less inference time\"\"\"\n\n    def __init__(\n        self,\n        engine_params: Dict,\n        grounding_agent: ACI,\n        platform: str = platform.system().lower(),\n        max_trajectory_length: int = 8,\n        enable_reflection: bool = True,\n    ):\n        \"\"\"Initialize a minimalist AgentS2 without hierarchy\n\n        Args:\n            engine_params: Configuration parameters for the LLM engine\n            grounding_agent: Instance of ACI class for UI interaction\n            platform: Operating system platform (darwin, linux, windows)\n           
 max_trajectory_length: Maximum number of image turns to keep\n            enable_reflection: Creates a reflection agent to assist the worker agent\n        \"\"\"\n\n        super().__init__(engine_params, grounding_agent, platform)\n        self.max_trajectory_length = max_trajectory_length\n        self.enable_reflection = enable_reflection\n        self.reset()\n\n    def reset(self) -> None:\n        \"\"\"Reset agent state and initialize components\"\"\"\n        self.executor = Worker(\n            engine_params=self.engine_params,\n            grounding_agent=self.grounding_agent,\n            platform=self.platform,\n            max_trajectory_length=self.max_trajectory_length,\n            enable_reflection=self.enable_reflection,\n        )\n\n    def predict(self, instruction: str, observation: Dict) -> Tuple[Dict, List[str]]:\n        # Initialize the three info dictionaries\n        executor_info, actions = self.executor.generate_next_action(\n            instruction=instruction, obs=observation\n        )\n\n        # concatenate the three info dictionaries\n        info = {**{k: v for d in [executor_info or {}] for k, v in d.items()}}\n\n        return info, actions\n"
  },
  {
    "path": "gui_agents/s2_5/agents/grounding.py",
    "content": "import ast\nimport re\nfrom collections import defaultdict\nfrom io import BytesIO\nfrom typing import Any, Dict, List, Optional, Tuple, Union\n\nimport pytesseract\nfrom PIL import Image\nfrom pytesseract import Output\n\nfrom gui_agents.s2_5.memory.procedural_memory import PROCEDURAL_MEMORY\nfrom gui_agents.s2_5.core.mllm import LMMAgent\nfrom gui_agents.s2_5.utils.common_utils import (\n    call_llm_safe,\n    parse_single_code_from_string,\n)\n\n\nclass ACI:\n    def __init__(self):\n        self.notes: List[str] = []\n\n\n# Agent action decorator\ndef agent_action(func):\n    func.is_agent_action = True\n    return func\n\n\nUBUNTU_APP_SETUP = f\"\"\"import subprocess;\nimport difflib;\nimport pyautogui;\npyautogui.press('escape');\ntime.sleep(0.5);\noutput = subprocess.check_output(['wmctrl', '-lx']);\noutput = output.decode('utf-8').splitlines();\nwindow_titles = [line.split(None, 4)[2] for line in output];\nclosest_matches = difflib.get_close_matches('APP_NAME', window_titles, n=1, cutoff=0.1);\nif closest_matches:\n    closest_match = closest_matches[0];\n    for line in output:\n        if closest_match in line:\n            window_id = line.split()[0]\n            break;\nsubprocess.run(['wmctrl', '-ia', window_id])\nsubprocess.run(['wmctrl', '-ir', window_id, '-b', 'add,maximized_vert,maximized_horz'])\n\"\"\"\n\n\nSET_CELL_VALUES_CMD = \"\"\"import uno\nimport subprocess\n\ndef identify_document_type(component):\n    if component.supportsService(\"com.sun.star.sheet.SpreadsheetDocument\"):\n        return \"Calc\"\n\n    if component.supportsService(\"com.sun.star.text.TextDocument\"):\n        return \"Writer\"\n\n    if component.supportsService(\"com.sun.star.sheet.PresentationDocument\"):\n        return \"Impress\"\n\n    return None\n\ndef cell_ref_to_indices(cell_ref):\n    column_letters = ''.join(filter(str.isalpha, cell_ref))\n    row_number = ''.join(filter(str.isdigit, cell_ref))\n\n    col = sum((ord(char.upper()) - 
ord('A') + 1) * (26**idx) for idx, char in enumerate(reversed(column_letters))) - 1\n    row = int(row_number) - 1\n    return col, row\n\ndef set_cell_values(new_cell_values: dict[str, str], app_name: str = \"Untitled 1\", sheet_name: str = \"Sheet1\"):\n    new_cell_values_idx = {{}}\n    for k, v in new_cell_values.items():\n        try:\n            col, row = cell_ref_to_indices(k)\n        except:\n            col = row = None\n\n        if col is not None and row is not None:\n            new_cell_values_idx[(col, row)] = v\n\n    # Clean up previous TCP connections.\n    subprocess.run(\n        'echo \\\"osworld-public-evaluation\\\" | sudo -S ss --kill --tcp state TIME-WAIT sport = :2002',\n        shell=True,\n        check=True,\n        text=True,\n        capture_output=True\n    )\n\n    # Dynamically allow soffice to listen on port 2002.\n    subprocess.run(\n        [\n            \"soffice\",\n            \"--accept=socket,host=localhost,port=2002;urp;StarOffice.Service\"\n        ]\n    )\n\n    local_context = uno.getComponentContext()\n    resolver = local_context.ServiceManager.createInstanceWithContext(\n        \"com.sun.star.bridge.UnoUrlResolver\", local_context\n    )\n    context = resolver.resolve(\n        f\"uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext\"\n    )\n    desktop = context.ServiceManager.createInstanceWithContext(\n        \"com.sun.star.frame.Desktop\", context\n    )\n\n    # Collect all LibreOffice-related opened windows.\n    documents = []\n    for i, component in enumerate(desktop.Components):\n        title = component.Title\n        doc_type = identify_document_type(component)\n        documents.append((i, component, title, doc_type))\n\n    # Find the LibreOffice Calc app and the sheet of interest.\n    spreadsheet = [doc for doc in documents if doc[3] == \"Calc\"]\n    selected_spreadsheet = [doc for doc in spreadsheet if doc[2] == app_name]\n    if spreadsheet:\n        try:\n            if 
selected_spreadsheet:\n                spreadsheet = selected_spreadsheet[0][1]\n            else:\n                spreadsheet = spreadsheet[0][1]\n\n            sheet = spreadsheet.Sheets.getByName(sheet_name)\n        except:\n            raise ValueError(f\"Could not find sheet {{sheet_name}} in {{app_name}}.\")\n\n        for (col, row), value in new_cell_values_idx.items():\n            cell = sheet.getCellByPosition(col, row)\n\n            # Set the cell value.\n            if isinstance(value, (int, float)):\n                cell.Value = value\n            elif isinstance(value, str):\n                if value.startswith(\"=\"):\n                    cell.Formula = value\n                else:\n                    cell.String = value\n            elif isinstance(value, bool):\n                cell.Value = 1 if value else 0\n            elif value is None:\n                cell.clearContents(0)\n            else:\n                raise ValueError(f\"Unsupported cell value type: {{type(value)}}\")\n\n    else:\n        raise ValueError(f\"Could not find LibreOffice Calc app corresponding to {{app_name}}.\")\n\nset_cell_values(new_cell_values={cell_values}, app_name=\"{app_name}\", sheet_name=\"{sheet_name}\")        \n\"\"\"\n\n\n# ACI primitives are parameterized by description, and coordinate generation uses a pretrained grounding model\nclass OSWorldACI(ACI):\n    def __init__(\n        self,\n        platform: str,\n        engine_params_for_generation: Dict,\n        engine_params_for_grounding: Dict,\n        width: int = 1920,\n        height: int = 1080,\n    ):\n        self.platform = (\n            platform  # Dictates how the switch_applications agent action works.\n        )\n\n        # Configure scaling\n        self.width = width\n        self.height = height\n\n        # Maintain state for save_to_knowledge\n        self.notes = []\n\n        # Coordinates used during ACI execution\n        self.coords1 = None\n        self.coords2 = None\n\n 
       # Configure the visual grounding model responsible for coordinate generation\n        self.grounding_model = LMMAgent(engine_params_for_grounding)\n        self.engine_params_for_grounding = engine_params_for_grounding\n\n        # Configure text grounding agent\n        self.text_span_agent = LMMAgent(\n            engine_params=engine_params_for_generation,\n            system_prompt=PROCEDURAL_MEMORY.PHRASE_TO_WORD_COORDS_PROMPT,\n        )\n\n    # Given the state and worker's referring expression, use the grounding model to generate (x,y)\n    def generate_coords(self, ref_expr: str, obs: Dict) -> List[int]:\n\n        # Reset the grounding model state\n        self.grounding_model.reset()\n\n        # Configure the context, UI-TARS demo does not use system prompt\n        prompt = f\"Query:{ref_expr}\\nOutput only the coordinate of one point in your response.\\n\"\n        self.grounding_model.add_message(\n            text_content=prompt, image_content=obs[\"screenshot\"], put_text_last=True\n        )\n\n        # Generate and parse coordinates\n        response = call_llm_safe(self.grounding_model)\n        print(\"RAW GROUNDING MODEL RESPONSE:\", response)\n        numericals = re.findall(r\"\\d+\", response)\n        assert len(numericals) >= 2\n        return [int(numericals[0]), int(numericals[1])]\n\n    # Calls pytesseract to generate word level bounding boxes for text grounding\n    def get_ocr_elements(self, b64_image_data: str) -> Tuple[str, List]:\n        image = Image.open(BytesIO(b64_image_data))\n        image_data = pytesseract.image_to_data(image, output_type=Output.DICT)\n\n        # Clean text by removing leading and trailing spaces and non-alphabetical characters, but keeping punctuation\n        for i, word in enumerate(image_data[\"text\"]):\n            image_data[\"text\"][i] = re.sub(\n                r\"^[^a-zA-Z\\s.,!?;:\\-\\+]+|[^a-zA-Z\\s.,!?;:\\-\\+]+$\", \"\", word\n            )\n\n        ocr_elements = []\n        
ocr_table = \"Text Table:\\nWord id\\tText\\n\"\n        # Obtain the <id, text, group number, word number> for each valid element\n        grouping_map = defaultdict(list)\n        ocr_id = 0\n        for i in range(len(image_data[\"text\"])):\n            block_num = image_data[\"block_num\"][i]\n            if image_data[\"text\"][i]:\n                grouping_map[block_num].append(image_data[\"text\"][i])\n                ocr_table += f\"{ocr_id}\\t{image_data['text'][i]}\\n\"\n                ocr_elements.append(\n                    {\n                        \"id\": ocr_id,\n                        \"text\": image_data[\"text\"][i],\n                        \"group_num\": block_num,\n                        \"word_num\": len(grouping_map[block_num]),\n                        \"left\": image_data[\"left\"][i],\n                        \"top\": image_data[\"top\"][i],\n                        \"width\": image_data[\"width\"][i],\n                        \"height\": image_data[\"height\"][i],\n                    }\n                )\n                ocr_id += 1\n\n        return ocr_table, ocr_elements\n\n    # Given the state and worker's text phrase, generate the coords of the first/last word in the phrase\n    def generate_text_coords(\n        self, phrase: str, obs: Dict, alignment: str = \"\"\n    ) -> List[int]:\n\n        ocr_table, ocr_elements = self.get_ocr_elements(obs[\"screenshot\"])\n\n        alignment_prompt = \"\"\n        if alignment == \"start\":\n            alignment_prompt = \"**Important**: Output the word id of the FIRST word in the provided phrase.\\n\"\n        elif alignment == \"end\":\n            alignment_prompt = \"**Important**: Output the word id of the LAST word in the provided phrase.\\n\"\n\n        # Load LLM prompt\n        self.text_span_agent.reset()\n        self.text_span_agent.add_message(\n            alignment_prompt + \"Phrase: \" + phrase + \"\\n\" + ocr_table, role=\"user\"\n        )\n        
self.text_span_agent.add_message(\n            \"Screenshot:\\n\", image_content=obs[\"screenshot\"], role=\"user\"\n        )\n\n        # Obtain the target element\n        response = call_llm_safe(self.text_span_agent)\n        print(\"TEXT SPAN AGENT RESPONSE:\", response)\n        numericals = re.findall(r\"\\d+\", response)\n        if len(numericals) > 0:\n            text_id = int(numericals[-1])\n        else:\n            text_id = 0\n        elem = ocr_elements[text_id]\n\n        # Compute the element coordinates\n        if alignment == \"start\":\n            coords = [elem[\"left\"], elem[\"top\"] + (elem[\"height\"] // 2)]\n        elif alignment == \"end\":\n            coords = [elem[\"left\"] + elem[\"width\"], elem[\"top\"] + (elem[\"height\"] // 2)]\n        else:\n            coords = [\n                elem[\"left\"] + (elem[\"width\"] // 2),\n                elem[\"top\"] + (elem[\"height\"] // 2),\n            ]\n        return coords\n\n    # Takes a description based action and assigns the coordinates for any coordinate based action\n    # Raises an error if function can't be parsed\n    def assign_coordinates(self, plan: str, obs: Dict):\n\n        # Reset coords from previous action generation\n        self.coords1, self.coords2 = None, None\n\n        try:\n            # Extract the function name and args\n            action = parse_single_code_from_string(plan.split(\"Grounded Action\")[-1])\n            function_name = re.match(r\"(\\w+\\.\\w+)\\(\", action).group(1)\n            args = self.parse_function_args(action)\n        except Exception as e:\n            raise RuntimeError(f\"Error in parsing grounded action: {e}\") from e\n\n        # arg0 is a description\n        if (\n            function_name in [\"agent.click\", \"agent.type\", \"agent.scroll\"]\n            and len(args) >= 1\n            and args[0] != None\n        ):\n            self.coords1 = self.generate_coords(args[0], obs)\n        # arg0 and arg1 are 
descriptions\n        elif function_name == \"agent.drag_and_drop\" and len(args) >= 2:\n            self.coords1 = self.generate_coords(args[0], obs)\n            self.coords2 = self.generate_coords(args[1], obs)\n        # arg0 and arg1 are text phrases\n        elif function_name == \"agent.highlight_text_span\" and len(args) >= 2:\n            self.coords1 = self.generate_text_coords(args[0], obs, alignment=\"start\")\n            self.coords2 = self.generate_text_coords(args[1], obs, alignment=\"end\")\n\n    # Resize from grounding model dim into OSWorld dim (1920 * 1080)\n    def resize_coordinates(self, coordinates: List[int]) -> List[int]:\n        grounding_width = self.engine_params_for_grounding[\"grounding_width\"]\n        grounding_height = self.engine_params_for_grounding[\"grounding_height\"]\n\n        return [\n            round(coordinates[0] * self.width / grounding_width),\n            round(coordinates[1] * self.height / grounding_height),\n        ]\n\n    # Given a generated ACI function, returns a list of argument values, where descriptions are at the front of the list\n    def parse_function_args(self, function: str) -> List[str]:\n        tree = ast.parse(function)\n        call_node = tree.body[0].value\n\n        def safe_eval(node):\n            if isinstance(\n                node, ast.Constant\n            ):  # Handles literals like numbers, strings, etc.\n                return node.value\n            else:\n                return ast.unparse(node)  # Return as a string if not a literal\n\n        positional_args = [safe_eval(arg) for arg in call_node.args]\n        keyword_args = {kw.arg: safe_eval(kw.value) for kw in call_node.keywords}\n\n        res = []\n\n        for key, val in keyword_args.items():\n            if \"description\" in key:\n                res.append(val)\n\n        for arg in positional_args:\n            res.append(arg)\n\n        return res\n\n    @agent_action\n    def click(\n        self,\n        
element_description: str,\n        num_clicks: int = 1,\n        button_type: str = \"left\",\n        hold_keys: List = [],\n    ):\n        \"\"\"Click on the element\n        Args:\n            element_description:str, a detailed descriptions of which element to click on. This description should be at least a full sentence.\n            num_clicks:int, number of times to click the element\n            button_type:str, which mouse button to press can be \"left\", \"middle\", or \"right\"\n            hold_keys:List, list of keys to hold while clicking\n        \"\"\"\n        x, y = self.resize_coordinates(self.coords1)\n        command = \"import pyautogui; \"\n\n        # TODO: specified duration?\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"\"\"import pyautogui; pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); \"\"\"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n        # Return pyautoguicode to click on the element\n        return command\n\n    @agent_action\n    def switch_applications(self, app_code):\n        \"\"\"Switch to a different application that is already open\n        Args:\n            app_code:str the code name of the application to switch to from the provided list of open applications\n        \"\"\"\n        if self.platform == \"darwin\":\n            return f\"import pyautogui; import time; pyautogui.hotkey('command', 'space', interval=0.5); pyautogui.typewrite({repr(app_code)}); pyautogui.press('enter'); time.sleep(1.0)\"\n        elif self.platform == \"linux\":\n            return UBUNTU_APP_SETUP.replace(\"APP_NAME\", app_code)\n        elif self.platform == \"windows\":\n            return f\"import pyautogui; import time; pyautogui.hotkey('win', 'd', interval=0.5); pyautogui.typewrite({repr(app_code)}); pyautogui.press('enter'); time.sleep(1.0)\"\n\n    @agent_action\n    def open(self, 
app_or_filename: str):\n        \"\"\"Open any application or file with name app_or_filename. This action should be used on Linux/Darwin systems instead of opening the file manually. Do not use on Windows.\n        Args:\n            app_or_filename:str, the name of the application or filename to open\n        \"\"\"\n        if self.platform == \"linux\":\n            return f\"import pyautogui; pyautogui.hotkey('win'); time.sleep(0.5); pyautogui.write({repr(app_or_filename)}); time.sleep(1.0); pyautogui.hotkey('enter'); time.sleep(0.5)\"\n        elif self.platform == \"darwin\":\n            return f\"import pyautogui; import time; pyautogui.hotkey('command', 'space', interval=0.5); pyautogui.typewrite({repr(app_or_filename)}); pyautogui.press('enter'); time.sleep(1.0)\"\n\n    @agent_action\n    def type(\n        self,\n        element_description: Optional[str] = None,\n        text: str = \"\",\n        overwrite: bool = False,\n        enter: bool = False,\n    ):\n        \"\"\"Type text into a specific element\n        Args:\n            element_description:str, a detailed description of which element to enter text in. This description should be at least a full sentence.\n            text:str, the text to type\n            overwrite:bool, Assign it to True if the text should overwrite the existing text, otherwise assign it to False. 
Using this argument clears all text in an element.\n            enter:bool, Assign it to True if the enter key should be pressed after typing the text, otherwise assign it to False.\n        \"\"\"\n\n        select_mod = \"command\" if self.platform == \"darwin\" else \"ctrl\"\n\n        if self.coords1 is not None:\n            # If a node is found, retrieve its coordinates and size\n            # Start typing at the center of the element\n\n            x, y = self.resize_coordinates(self.coords1)\n\n            command = \"import pyautogui; \"\n            command += f\"pyautogui.click({x}, {y}); \"\n\n            if overwrite:\n                command += (\n                    f\"pyautogui.hotkey({repr(select_mod)}, 'a'); \"\n                    \"pyautogui.press('backspace'); \"\n                )\n\n            command += f\"pyautogui.write({repr(text)}); \"\n\n            if enter:\n                command += \"pyautogui.press('enter'); \"\n        else:\n            # If no element is found, start typing at the current cursor location\n            command = \"import pyautogui; \"\n\n            if overwrite:\n                command += (\n                    f\"pyautogui.hotkey({repr(select_mod)}, 'a'); \"\n                    \"pyautogui.press('backspace'); \"\n                )\n\n            command += f\"pyautogui.write({repr(text)}); \"\n\n            if enter:\n                command += \"pyautogui.press('enter'); \"\n\n        return command\n\n    @agent_action\n    def save_to_knowledge(self, text: List[str]):\n        \"\"\"Save facts, elements, texts, etc. to a long-term knowledge bank for reuse during this task. 
Can be used for copy-pasting text, saving elements, etc.\n        Args:\n            text:List[str] the text to save to the knowledge\n        \"\"\"\n        self.notes.extend(text)\n        return \"\"\"WAIT\"\"\"\n\n    @agent_action\n    def drag_and_drop(\n        self, starting_description: str, ending_description: str, hold_keys: List = []\n    ):\n        \"\"\"Drag from the starting description to the ending description\n        Args:\n            starting_description:str, a very detailed description of where to start the drag action. This description should be at least a full sentence.\n            ending_description:str, a very detailed description of where to end the drag action. This description should be at least a full sentence.\n            hold_keys:List list of keys to hold while dragging\n        \"\"\"\n        x1, y1 = self.resize_coordinates(self.coords1)\n        x2, y2 = self.resize_coordinates(self.coords2)\n\n        command = \"import pyautogui; \"\n\n        command += f\"pyautogui.moveTo({x1}, {y1}); \"\n        # TODO: specified duration?\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"pyautogui.dragTo({x2}, {y2}, duration=1., button='left'); pyautogui.mouseUp(); \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n\n        # Return pyautoguicode to drag and drop the elements\n\n        return command\n\n    @agent_action\n    def highlight_text_span(\n        self, starting_phrase: str, ending_phrase: str, button: str = \"left\"\n    ):\n        \"\"\"Highlight a text span between a provided starting phrase and ending phrase. Use this to highlight words, lines, and paragraphs.\n        Args:\n            starting_phrase:str, the phrase that denotes the start of the text span you want to highlight. 
If you only want to highlight one word, just pass in that single word.\n            ending_phrase:str, the phrase that denotes the end of the text span you want to highlight. If you only want to highlight one word, just pass in that single word.\n            button:str, the button to use to highlight the text span. Defaults to \"left\". Can be \"left\", \"right\", or \"middle\".\n        \"\"\"\n\n        x1, y1 = self.coords1\n        x2, y2 = self.coords2\n\n        command = \"import pyautogui; \"\n        command += f\"pyautogui.moveTo({x1}, {y1}); \"\n        command += f\"pyautogui.dragTo({x2}, {y2}, duration=1., button='{button}'); pyautogui.mouseUp(); \"\n\n        # Return pyautoguicode to drag and drop the elements\n        return command\n\n    @agent_action\n    def set_cell_values(\n        self, cell_values: Dict[str, Any], app_name: str, sheet_name: str\n    ):\n        \"\"\"Use this to set individual cell values in a spreadsheet. For example, setting A2 to \"hello\" would be done by passing {\"A2\": \"hello\"} as cell_values. The sheet must be opened before this command can be used.\n        Args:\n            cell_values: Dict[str, Any], A dictionary of cell values to set in the spreadsheet. The keys are the cell coordinates in the format \"A1\", \"B2\", etc.\n                Supported value types include: float, int, string, bool, formulas.\n            app_name: str, The name of the spreadsheet application. For example, \"Some_sheet.xlsx\".\n            sheet_name: str, The name of the sheet in the spreadsheet. 
For example, \"Sheet1\".\n        \"\"\"\n        return SET_CELL_VALUES_CMD.format(\n            cell_values=cell_values, app_name=app_name, sheet_name=sheet_name\n        )\n\n    @agent_action\n    def scroll(self, element_description: str, clicks: int, shift: bool = False):\n        \"\"\"Scroll the element in the specified direction\n        Args:\n            element_description:str, a very detailed description of which element to enter scroll in. This description should be at least a full sentence.\n            clicks:int, the number of clicks to scroll can be positive (up) or negative (down).\n            shift:bool, whether to use shift+scroll for horizontal scrolling\n        \"\"\"\n\n        x, y = self.resize_coordinates(self.coords1)\n\n        if shift:\n            return f\"import pyautogui; import time; pyautogui.moveTo({x}, {y}); time.sleep(0.5); pyautogui.hscroll({clicks})\"\n        else:\n            return f\"import pyautogui; import time; pyautogui.moveTo({x}, {y}); time.sleep(0.5); pyautogui.vscroll({clicks})\"\n\n    @agent_action\n    def hotkey(self, keys: List):\n        \"\"\"Press a hotkey combination\n        Args:\n            keys:List the keys to press in combination in a list format (e.g. 
['ctrl', 'c'])\n        \"\"\"\n        # add quotes around the keys\n        keys = [f\"'{key}'\" for key in keys]\n        return f\"import pyautogui; pyautogui.hotkey({', '.join(keys)})\"\n\n    @agent_action\n    def hold_and_press(self, hold_keys: List, press_keys: List):\n        \"\"\"Hold a list of keys and press a list of keys\n        Args:\n            hold_keys:List, list of keys to hold\n            press_keys:List, list of keys to press in a sequence\n        \"\"\"\n\n        press_keys_str = \"[\" + \", \".join([f\"'{key}'\" for key in press_keys]) + \"]\"\n        command = \"import pyautogui; \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"pyautogui.press({press_keys_str}); \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n\n        return command\n\n    @agent_action\n    def wait(self, time: float):\n        \"\"\"Wait for a specified amount of time\n        Args:\n            time:float the amount of time to wait in seconds\n        \"\"\"\n        return f\"\"\"import time; time.sleep({time})\"\"\"\n\n    @agent_action\n    def done(\n        self,\n        return_value: Optional[Union[Dict, str, List, Tuple, int, float, bool]] = None,\n    ):\n        \"\"\"End the current task with a success and the required return value\"\"\"\n        self.returned_info = return_value\n        return \"\"\"DONE\"\"\"\n\n    @agent_action\n    def fail(self):\n        \"\"\"End the current task with a failure, and replan the whole task.\"\"\"\n        return \"\"\"FAIL\"\"\"\n\n\n# ACI that supports the worker-only mode: done() and fail() become task scoped instead\nclass OSWorldWorkerOnlyACI(OSWorldACI):\n    @agent_action\n    def done(\n        self,\n    ):\n        \"\"\"End the current task with a success. 
Use this when you believe the entire task has been fully completed.\"\"\"\n        return \"\"\"DONE\"\"\"\n\n    @agent_action\n    def fail(self):\n        \"\"\"End the current task with a failure. Use this when you believe the entire task is impossible to complete.\"\"\"\n        return \"\"\"FAIL\"\"\"\n"
  },
  {
    "path": "gui_agents/s2_5/agents/worker.py",
    "content": "import logging\nimport textwrap\nfrom typing import Dict, List, Tuple\n\nfrom gui_agents.s2_5.agents.grounding import ACI\nfrom gui_agents.s2_5.core.module import BaseModule\nfrom gui_agents.s2_5.memory.procedural_memory import PROCEDURAL_MEMORY\nfrom gui_agents.s2_5.utils.common_utils import (\n    call_llm_safe,\n    extract_first_agent_function,\n    parse_single_code_from_string,\n    sanitize_code,\n    split_thinking_response,\n)\n\nlogger = logging.getLogger(\"desktopenv.agent\")\n\n\nclass Worker(BaseModule):\n    def __init__(\n        self,\n        engine_params: Dict,\n        grounding_agent: ACI,\n        platform: str = \"ubuntu\",\n        max_trajectory_length: int = 8,\n        enable_reflection: bool = True,\n    ):\n        \"\"\"\n        Worker receives the main task and generates actions, without the need of hierarchical planning\n        Args:\n            engine_params: Dict\n                Parameters for the multimodal engine\n            grounding_agent: Agent\n                The grounding agent to use\n            platform: str\n                OS platform the agent runs on (darwin, linux, windows)\n            max_trajectory_length: int\n                The amount of images turns to keep\n            enable_reflection: bool\n                Whether to enable reflection\n        \"\"\"\n        super().__init__(engine_params, platform)\n\n        self.grounding_agent = grounding_agent\n        self.max_trajectory_length = max_trajectory_length\n        self.enable_reflection = enable_reflection\n        self.temperature = engine_params.get(\"temperature\", 0.0)\n        self.use_thinking = engine_params.get(\"model\", \"\") in [\n            \"claude-3-7-sonnet-20250219\"\n        ]\n        self.reset()\n\n    def reset(self):\n        if self.platform != \"linux\":\n            skipped_actions = [\"set_cell_values\"]\n        else:\n            skipped_actions = []\n\n        sys_prompt = 
PROCEDURAL_MEMORY.construct_simple_worker_procedural_memory(\n            type(self.grounding_agent), skipped_actions=skipped_actions\n        ).replace(\"CURRENT_OS\", self.platform)\n\n        self.generator_agent = self._create_agent(sys_prompt)\n        self.reflection_agent = self._create_agent(\n            PROCEDURAL_MEMORY.REFLECTION_ON_TRAJECTORY\n        )\n\n        self.turn_count = 0\n        self.worker_history = []\n        self.reflections = []\n        self.cost_this_turn = 0\n        self.screenshot_inputs = []\n\n    # Flushing strategy dependent on model context limits\n    def flush_messages(self):\n        engine_type = self.engine_params.get(\"engine_type\", \"\")\n\n        # Flush strategy for long-context models: keep all text, only keep latest images\n        if engine_type in [\"anthropic\", \"openai\", \"gemini\"]:\n            max_images = self.max_trajectory_length\n            for agent in [self.generator_agent, self.reflection_agent]:\n                # keep latest k images\n                img_count = 0\n                for i in range(len(agent.messages) - 1, -1, -1):\n                    for j in range(len(agent.messages[i][\"content\"])):\n                        if \"image\" in agent.messages[i][\"content\"][j].get(\"type\", \"\"):\n                            img_count += 1\n                            if img_count > max_images:\n                                del agent.messages[i][\"content\"][j]\n\n        # Flush strategy for non-long-context models: drop full turns\n        else:\n            # generator msgs are alternating [user, assistant], so 2 per round\n            if len(self.generator_agent.messages) > 2 * self.max_trajectory_length + 1:\n                self.generator_agent.messages.pop(1)\n                self.generator_agent.messages.pop(1)\n            # reflector msgs are all [(user text, user image)], so 1 per round\n            if len(self.reflection_agent.messages) > self.max_trajectory_length + 1:\n        
        self.reflection_agent.messages.pop(1)\n\n    def generate_next_action(\n        self,\n        instruction: str,\n        obs: Dict,\n    ) -> Tuple[Dict, List]:\n        \"\"\"\n        Predict the next action(s) based on the current observation.\n        \"\"\"\n        agent = self.grounding_agent\n        generator_message = (\n            \"\"\n            if self.turn_count > 0\n            else \"The initial screen is provided. No action has been taken yet.\"\n        )\n\n        # Load the task into the system prompt\n        if self.turn_count == 0:\n            self.generator_agent.add_system_prompt(\n                self.generator_agent.system_prompt.replace(\n                    \"TASK_DESCRIPTION\", instruction\n                )\n            )\n\n        # Get the per-step reflection\n        reflection = None\n        reflection_thoughts = None\n        if self.enable_reflection:\n            # Load the initial message\n            if self.turn_count == 0:\n                text_content = textwrap.dedent(\n                    f\"\"\"\n                    Task Description: {instruction}\n                    Current Trajectory below:\n                    \"\"\"\n                )\n                updated_sys_prompt = (\n                    self.reflection_agent.system_prompt + \"\\n\" + text_content\n                )\n                self.reflection_agent.add_system_prompt(updated_sys_prompt)\n                self.reflection_agent.add_message(\n                    text_content=\"The initial screen is provided. 
No action has been taken yet.\",\n                    image_content=obs[\"screenshot\"],\n                    role=\"user\",\n                )\n            # Load the latest action\n            else:\n                self.reflection_agent.add_message(\n                    text_content=self.worker_history[-1],\n                    image_content=obs[\"screenshot\"],\n                    role=\"user\",\n                )\n                full_reflection = call_llm_safe(\n                    self.reflection_agent,\n                    temperature=self.temperature,\n                    use_thinking=self.use_thinking,\n                )\n                reflection, reflection_thoughts = split_thinking_response(\n                    full_reflection\n                )\n                self.reflections.append(reflection)\n                generator_message += f\"REFLECTION: You may use this reflection on the previous action and overall trajectory:\\n{reflection}\\n\"\n                logger.info(\"REFLECTION: %s\", reflection)\n\n        # Add finalized message to conversation\n        generator_message += f\"\\nCurrent Text Buffer = [{','.join(agent.notes)}]\\n\"\n        self.generator_agent.add_message(\n            generator_message, image_content=obs[\"screenshot\"], role=\"user\"\n        )\n\n        full_plan = call_llm_safe(\n            self.generator_agent,\n            temperature=self.temperature,\n            use_thinking=self.use_thinking,\n        )\n        plan, plan_thoughts = split_thinking_response(full_plan)\n        # NOTE: currently dropping thinking tokens from context\n        self.worker_history.append(plan)\n        logger.info(\"FULL PLAN:\\n %s\", full_plan)\n        self.generator_agent.add_message(plan, role=\"assistant\")\n\n        # Use the grounding agent to convert agent_action(\"desc\") into agent_action([x, y])\n        try:\n            agent.assign_coordinates(plan, obs)\n            plan_code = 
parse_single_code_from_string(plan.split(\"Grounded Action\")[-1])\n            plan_code = sanitize_code(plan_code)\n            plan_code = extract_first_agent_function(plan_code)\n            exec_code = eval(plan_code)\n        except Exception as e:\n            logger.error(\"Error in parsing plan code: %s\", e)\n            plan_code = \"agent.wait(1.0)\"\n            exec_code = eval(plan_code)\n\n        executor_info = {\n            \"full_plan\": full_plan,\n            \"executor_plan\": plan,\n            \"plan_thoughts\": plan_thoughts,\n            \"plan_code\": plan_code,\n            \"reflection\": reflection,\n            \"reflection_thoughts\": reflection_thoughts,\n        }\n        self.turn_count += 1\n\n        self.screenshot_inputs.append(obs[\"screenshot\"])\n        self.flush_messages()\n\n        return executor_info, [exec_code]\n"
  },
  {
    "path": "gui_agents/s2_5/cli_app.py",
    "content": "import argparse\nimport datetime\nimport io\nimport logging\nimport os\nimport platform\nimport pyautogui\nimport signal\nimport sys\nimport time\n\nfrom PIL import Image\n\nfrom gui_agents.s2_5.agents.grounding import OSWorldACI\nfrom gui_agents.s2_5.agents.agent_s import AgentS2_5\n\ncurrent_platform = platform.system().lower()\n\n# Global flag to track pause state for debugging\npaused = False\n\n\ndef get_char():\n    \"\"\"Get a single character from stdin without pressing Enter\"\"\"\n    try:\n        # Import termios and tty on Unix-like systems\n        if platform.system() in [\"Darwin\", \"Linux\"]:\n            import termios\n            import tty\n\n            fd = sys.stdin.fileno()\n            old_settings = termios.tcgetattr(fd)\n            try:\n                tty.setraw(sys.stdin.fileno())\n                ch = sys.stdin.read(1)\n            finally:\n                termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)\n            return ch\n        else:\n            # Windows fallback\n            import msvcrt\n\n            return msvcrt.getch().decode(\"utf-8\", errors=\"ignore\")\n    except:\n        return input()  # Fallback for non-terminal environments\n\n\ndef signal_handler(signum, frame):\n    \"\"\"Handle Ctrl+C signal for debugging during agent execution\"\"\"\n    global paused\n\n    if not paused:\n        print(\"\\n\\n🔸 Agent-S Workflow Paused 🔸\")\n        print(\"=\" * 50)\n        print(\"Options:\")\n        print(\"  • Press Ctrl+C again to quit\")\n        print(\"  • Press Esc to resume workflow\")\n        print(\"=\" * 50)\n\n        paused = True\n\n        while paused:\n            try:\n                print(\"\\n[PAUSED] Waiting for input... 
\", end=\"\", flush=True)\n                char = get_char()\n\n                if ord(char) == 3:  # Ctrl+C\n                    print(\"\\n\\n🛑 Exiting Agent-S...\")\n                    sys.exit(0)\n                elif ord(char) == 27:  # Esc\n                    print(\"\\n\\n▶️  Resuming Agent-S workflow...\")\n                    paused = False\n                    break\n                else:\n                    print(f\"\\n   Unknown command: '{char}' (ord: {ord(char)})\")\n\n            except KeyboardInterrupt:\n                print(\"\\n\\n🛑 Exiting Agent-S...\")\n                sys.exit(0)\n    else:\n        # Already paused, second Ctrl+C means quit\n        print(\"\\n\\n🛑 Exiting Agent-S...\")\n        sys.exit(0)\n\n\n# Set up signal handler for Ctrl+C\nsignal.signal(signal.SIGINT, signal_handler)\n\nlogger = logging.getLogger()\nlogger.setLevel(logging.DEBUG)\n\ndatetime_str: str = datetime.datetime.now().strftime(\"%Y%m%d@%H%M%S\")\n\nlog_dir = \"logs\"\nos.makedirs(log_dir, exist_ok=True)\n\nfile_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"normal-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\ndebug_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"debug-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\nstdout_handler = logging.StreamHandler(sys.stdout)\nsdebug_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"sdebug-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\n\nfile_handler.setLevel(logging.INFO)\ndebug_handler.setLevel(logging.DEBUG)\nstdout_handler.setLevel(logging.INFO)\nsdebug_handler.setLevel(logging.DEBUG)\n\nformatter = logging.Formatter(\n    fmt=\"\\x1b[1;33m[%(asctime)s \\x1b[31m%(levelname)s \\x1b[32m%(module)s/%(lineno)d-%(processName)s\\x1b[1;33m] 
\\x1b[0m%(message)s\"\n)\nfile_handler.setFormatter(formatter)\ndebug_handler.setFormatter(formatter)\nstdout_handler.setFormatter(formatter)\nsdebug_handler.setFormatter(formatter)\n\nstdout_handler.addFilter(logging.Filter(\"desktopenv\"))\nsdebug_handler.addFilter(logging.Filter(\"desktopenv\"))\n\nlogger.addHandler(file_handler)\nlogger.addHandler(debug_handler)\nlogger.addHandler(stdout_handler)\nlogger.addHandler(sdebug_handler)\n\nplatform_os = platform.system()\n\n\ndef show_permission_dialog(code: str, action_description: str):\n    \"\"\"Show a platform-specific permission dialog and return True if approved.\"\"\"\n    if platform.system() == \"Darwin\":\n        result = os.system(\n            f'osascript -e \\'display dialog \"Do you want to execute this action?\\n\\n{code} which will try to {action_description}\" with title \"Action Permission\" buttons {{\"Cancel\", \"OK\"}} default button \"OK\" cancel button \"Cancel\"\\''\n        )\n        return result == 0\n    elif platform.system() == \"Linux\":\n        result = os.system(\n            f'zenity --question --title=\"Action Permission\" --text=\"Do you want to execute this action?\\n\\n{code}\" --width=400 --height=200'\n        )\n        return result == 0\n    return False\n\n\ndef scale_screen_dimensions(width: int, height: int, max_dim_size: int):\n    scale_factor = min(max_dim_size / width, max_dim_size / height, 1)\n    safe_width = int(width * scale_factor)\n    safe_height = int(height * scale_factor)\n    return safe_width, safe_height\n\n\ndef run_agent(agent, instruction: str, scaled_width: int, scaled_height: int):\n    global paused\n    obs = {}\n    traj = \"Task:\\n\" + instruction\n    subtask_traj = \"\"\n    for step in range(15):\n        # Check if we're in paused state and wait\n        while paused:\n            time.sleep(0.1)\n        # Get screen shot using pyautogui\n        screenshot = pyautogui.screenshot()\n        screenshot = screenshot.resize((scaled_width, 
scaled_height), Image.LANCZOS)\n\n        # Save the screenshot to a BytesIO object\n        buffered = io.BytesIO()\n        screenshot.save(buffered, format=\"PNG\")\n\n        # Get the byte value of the screenshot\n        screenshot_bytes = buffered.getvalue()\n        # Store the raw PNG bytes (not base64-encoded) as the screenshot observation\n        obs[\"screenshot\"] = screenshot_bytes\n\n        # Check again for pause state before prediction\n        while paused:\n            time.sleep(0.1)\n\n        print(f\"\\n🔄 Step {step + 1}/15: Getting next action from agent...\")\n\n        # Get next action code from the agent\n        info, code = agent.predict(instruction=instruction, observation=obs)\n\n        if \"done\" in code[0].lower() or \"fail\" in code[0].lower():\n            if platform.system() == \"Darwin\":\n                os.system(\n                    f'osascript -e \\'display dialog \"Task Completed\" with title \"OpenACI Agent\" buttons \"OK\" default button \"OK\"\\''\n                )\n            elif platform.system() == \"Linux\":\n                os.system(\n                    f'zenity --info --title=\"OpenACI Agent\" --text=\"Task Completed\" --width=200 --height=100'\n                )\n\n            break\n\n        if \"next\" in code[0].lower():\n            continue\n\n        if \"wait\" in code[0].lower():\n            print(\"⏳ Agent requested wait...\")\n            time.sleep(5)\n            continue\n\n        else:\n            time.sleep(1.0)\n            print(\"EXECUTING CODE:\", code[0])\n\n            # Check for pause state before execution\n            while paused:\n                time.sleep(0.1)\n\n            # NOTE: the code is executed directly; no permission prompt is shown here\n            exec(code[0])\n            time.sleep(1.0)\n\n            # Update task and subtask trajectories\n            if \"reflection\" in info and \"executor_plan\" in info:\n                traj += (\n                    \"\\n\\nReflection:\\n\"\n                    + str(info[\"reflection\"])\n         
           + \"\\n\\n----------------------\\n\\nPlan:\\n\"\n                    + info[\"executor_plan\"]\n                )\n\n\ndef main():\n    parser = argparse.ArgumentParser(description=\"Run AgentS2_5 with specified model.\")\n    parser.add_argument(\n        \"--provider\",\n        type=str,\n        default=\"openai\",\n        help=\"Specify the provider to use (e.g., openai, anthropic, etc.)\",\n    )\n    parser.add_argument(\n        \"--model\",\n        type=str,\n        default=\"gpt-5-2025-08-07\",\n        help=\"Specify the model to use (e.g., gpt-5-2025-08-07)\",\n    )\n    parser.add_argument(\n        \"--model_url\",\n        type=str,\n        default=\"\",\n        help=\"The URL of the main generation model API.\",\n    )\n    parser.add_argument(\n        \"--model_api_key\",\n        type=str,\n        default=\"\",\n        help=\"The API key of the main generation model.\",\n    )\n    parser.add_argument(\n        \"--model_temperature\",\n        type=float,\n        default=None,\n        help=\"Temperature to fix the generation model at (e.g. 
o3 can only be run with 1.0)\",\n    )\n\n    # Grounding model config: Self-hosted endpoint based (required)\n    parser.add_argument(\n        \"--ground_provider\",\n        type=str,\n        required=True,\n        help=\"The provider for the grounding model\",\n    )\n    parser.add_argument(\n        \"--ground_url\",\n        type=str,\n        required=True,\n        help=\"The URL of the grounding model\",\n    )\n    parser.add_argument(\n        \"--ground_api_key\",\n        type=str,\n        default=\"\",\n        help=\"The API key of the grounding model.\",\n    )\n    parser.add_argument(\n        \"--ground_model\",\n        type=str,\n        required=True,\n        help=\"The model name for the grounding model\",\n    )\n    parser.add_argument(\n        \"--grounding_width\",\n        type=int,\n        required=True,\n        help=\"Width of screenshot image after processor rescaling\",\n    )\n    parser.add_argument(\n        \"--grounding_height\",\n        type=int,\n        required=True,\n        help=\"Height of screenshot image after processor rescaling\",\n    )\n\n    # AgentS2_5 specific arguments\n    parser.add_argument(\n        \"--max_trajectory_length\",\n        type=int,\n        default=8,\n        help=\"Maximum number of image turns to keep in trajectory\",\n    )\n    parser.add_argument(\n        \"--enable_reflection\",\n        action=\"store_true\",\n        default=True,\n        help=\"Enable reflection agent to assist the worker agent\",\n    )\n\n    args = parser.parse_args()\n\n    # Re-scales screenshot size to ensure it fits in UI-TARS context limit\n    screen_width, screen_height = pyautogui.size()\n    scaled_width, scaled_height = scale_screen_dimensions(\n        screen_width, screen_height, max_dim_size=2400\n    )\n\n    # Load the general engine params\n    engine_params = {\n        \"engine_type\": args.provider,\n        \"model\": args.model,\n        \"base_url\": args.model_url,\n        
\"api_key\": args.model_api_key,\n        \"temperature\": getattr(args, \"model_temperature\", None),\n    }\n\n    # Load the grounding engine from a custom endpoint\n    engine_params_for_grounding = {\n        \"engine_type\": args.ground_provider,\n        \"model\": args.ground_model,\n        \"base_url\": args.ground_url,\n        \"api_key\": args.ground_api_key,\n        \"grounding_width\": args.grounding_width,\n        \"grounding_height\": args.grounding_height,\n    }\n\n    grounding_agent = OSWorldACI(\n        platform=current_platform,\n        engine_params_for_generation=engine_params,\n        engine_params_for_grounding=engine_params_for_grounding,\n        width=screen_width,\n        height=screen_height,\n    )\n\n    agent = AgentS2_5(\n        engine_params,\n        grounding_agent,\n        platform=current_platform,\n        max_trajectory_length=args.max_trajectory_length,\n        enable_reflection=args.enable_reflection,\n    )\n\n    while True:\n        query = input(\"Query: \")\n\n        agent.reset()\n\n        # Run the agent on your own device\n        run_agent(agent, query, scaled_width, scaled_height)\n\n        response = input(\"Would you like to provide another query? (y/n): \")\n        if response.lower() != \"y\":\n            break\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "gui_agents/s2_5/core/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s2_5/core/engine.py",
    "content": "import os\n\nimport backoff\nfrom anthropic import Anthropic\nfrom openai import (\n    AzureOpenAI,\n    APIConnectionError,\n    APIError,\n    AzureOpenAI,\n    OpenAI,\n    RateLimitError,\n)\n\n\nclass LMMEngine:\n    pass\n\n\nclass LMMEngineOpenAI(LMMEngine):\n    def __init__(\n        self,\n        base_url=None,\n        api_key=None,\n        model=None,\n        rate_limit=-1,\n        temperature=None,\n        organization=None,\n        **kwargs,\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.base_url = base_url\n        self.api_key = api_key\n        self.organization = organization\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n        self.temperature = temperature  # Can force temperature to be the same (in the case of o3 requiring temperature to be 1)\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"OPENAI_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named OPENAI_API_KEY\"\n            )\n        organization = self.organization or os.getenv(\"OPENAI_ORG_ID\")\n        if not self.llm_client:\n            if not self.base_url:\n                self.llm_client = OpenAI(api_key=api_key, organization=organization)\n            else:\n                self.llm_client = OpenAI(\n                    base_url=self.base_url, api_key=api_key, organization=organization\n                )\n        return (\n            self.llm_client.chat.completions.create(\n                model=self.model,\n                messages=messages,\n                
max_completion_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=(\n                    temperature if self.temperature is None else self.temperature\n                ),\n                **kwargs,\n            )\n            .choices[0]\n            .message.content\n        )\n\n\nclass LMMEngineAnthropic(LMMEngine):\n    def __init__(\n        self,\n        base_url=None,\n        api_key=None,\n        model=None,\n        thinking=False,\n        temperature=None,\n        **kwargs,\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.thinking = thinking\n        self.api_key = api_key\n        self.llm_client = None\n        self.temperature = temperature\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"ANTHROPIC_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named ANTHROPIC_API_KEY\"\n            )\n        if not self.llm_client:\n            self.llm_client = Anthropic(api_key=api_key)\n        # Use the instance temperature if not specified in the call\n        temp = self.temperature if temperature is None else temperature\n        if self.thinking:\n            full_response = self.llm_client.messages.create(\n                system=messages[0][\"content\"][0][\"text\"],\n                model=self.model,\n                messages=messages[1:],\n                max_tokens=8192,\n                thinking={\"type\": \"enabled\", \"budget_tokens\": 4096},\n                **kwargs,\n            )\n            thoughts = full_response.content[0].thinking\n            return full_response.content[1].text\n        return (\n            
self.llm_client.messages.create(\n                system=messages[0][\"content\"][0][\"text\"],\n                model=self.model,\n                messages=messages[1:],\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temp,\n                **kwargs,\n            )\n            .content[0]\n            .text\n        )\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    # Compatible with Claude-3.7 Sonnet thinking mode\n    def generate_with_thinking(\n        self, messages, temperature=0.0, max_new_tokens=None, **kwargs\n    ):\n        \"\"\"Generate the next message based on previous messages, and keeps the thinking tokens\"\"\"\n\n        full_response = self.llm_client.messages.create(\n            system=messages[0][\"content\"][0][\"text\"],\n            model=self.model,\n            messages=messages[1:],\n            max_tokens=8192,\n            thinking={\"type\": \"enabled\", \"budget_tokens\": 4096},\n            **kwargs,\n        )\n\n        thoughts = full_response.content[0].thinking\n        answer = full_response.content[1].text\n        full_response = (\n            f\"<thoughts>\\n{thoughts}\\n</thoughts>\\n\\n<answer>\\n{answer}\\n</answer>\\n\"\n        )\n        return full_response\n\n\nclass LMMEngineGemini(LMMEngine):\n    def __init__(\n        self,\n        base_url=None,\n        api_key=None,\n        model=None,\n        rate_limit=-1,\n        temperature=None,\n        **kwargs,\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.base_url = base_url\n        self.api_key = api_key\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n        self.temperature = temperature\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n   
 )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"GEMINI_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named GEMINI_API_KEY\"\n            )\n        base_url = self.base_url or os.getenv(\"GEMINI_ENDPOINT_URL\")\n        if base_url is None:\n            raise ValueError(\n                \"An endpoint URL needs to be provided in either the endpoint_url parameter or as an environment variable named GEMINI_ENDPOINT_URL\"\n            )\n        if not self.llm_client:\n            self.llm_client = OpenAI(base_url=base_url, api_key=api_key)\n        # Use the temperature passed to generate, otherwise use the instance's temperature, otherwise default to 0.0\n        temp = self.temperature if temperature is None else temperature\n        return (\n            self.llm_client.chat.completions.create(\n                model=self.model,\n                messages=messages,\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temp,\n                **kwargs,\n            )\n            .choices[0]\n            .message.content\n        )\n\n\nclass LMMEngineOpenRouter(LMMEngine):\n    def __init__(\n        self,\n        base_url=None,\n        api_key=None,\n        model=None,\n        rate_limit=-1,\n        temperature=None,\n        **kwargs,\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.base_url = base_url\n        self.api_key = api_key\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n        self.temperature = temperature\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, 
temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"OPENROUTER_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named OPENROUTER_API_KEY\"\n            )\n        base_url = self.base_url or os.getenv(\"OPEN_ROUTER_ENDPOINT_URL\")\n        if base_url is None:\n            raise ValueError(\n                \"An endpoint URL needs to be provided in either the endpoint_url parameter or as an environment variable named OPEN_ROUTER_ENDPOINT_URL\"\n            )\n        if not self.llm_client:\n            self.llm_client = OpenAI(base_url=base_url, api_key=api_key)\n        # Use self.temperature if set, otherwise use the temperature argument\n        temp = self.temperature if self.temperature is not None else temperature\n        return (\n            self.llm_client.chat.completions.create(\n                model=self.model,\n                messages=messages,\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temp,\n                **kwargs,\n            )\n            .choices[0]\n            .message.content\n        )\n\n\nclass LMMEngineAzureOpenAI(LMMEngine):\n    def __init__(\n        self,\n        base_url=None,\n        api_key=None,\n        azure_endpoint=None,\n        model=None,\n        api_version=None,\n        rate_limit=-1,\n        temperature=None,\n        **kwargs,\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.api_version = api_version\n        self.api_key = api_key\n        self.azure_endpoint = azure_endpoint\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n        self.cost = 0.0\n        self.temperature = temperature\n\n    @backoff.on_exception(\n        backoff.expo, 
(APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"AZURE_OPENAI_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named AZURE_OPENAI_API_KEY\"\n            )\n        api_version = self.api_version or os.getenv(\"OPENAI_API_VERSION\")\n        if api_version is None:\n            raise ValueError(\n                \"api_version must be provided either as a parameter or as an environment variable named OPENAI_API_VERSION\"\n            )\n        azure_endpoint = self.azure_endpoint or os.getenv(\"AZURE_OPENAI_ENDPOINT\")\n        if azure_endpoint is None:\n            raise ValueError(\n                \"An Azure API endpoint needs to be provided in either the azure_endpoint parameter or as an environment variable named AZURE_OPENAI_ENDPOINT\"\n            )\n        if not self.llm_client:\n            self.llm_client = AzureOpenAI(\n                azure_endpoint=azure_endpoint,\n                api_key=api_key,\n                api_version=api_version,\n            )\n        # Use self.temperature if set, otherwise use the temperature argument\n        temp = self.temperature if self.temperature is not None else temperature\n        completion = self.llm_client.chat.completions.create(\n            model=self.model,\n            messages=messages,\n            max_tokens=max_new_tokens if max_new_tokens else 4096,\n            temperature=temp,\n            **kwargs,\n        )\n        total_tokens = completion.usage.total_tokens\n        self.cost += 0.02 * ((total_tokens + 500) / 1000)\n        return completion.choices[0].message.content\n\n\nclass LMMEnginevLLM(LMMEngine):\n    def __init__(\n        self,\n        base_url=None,\n        api_key=None,\n        model=None,\n       
 rate_limit=-1,\n        temperature=None,\n        **kwargs,\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.api_key = api_key\n        self.base_url = base_url\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n        self.temperature = temperature\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(\n        self,\n        messages,\n        temperature=0.0,\n        top_p=0.8,\n        repetition_penalty=1.05,\n        max_new_tokens=512,\n        **kwargs,\n    ):\n        api_key = self.api_key or os.getenv(\"vLLM_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"A vLLM API key needs to be provided in either the api_key parameter or as an environment variable named vLLM_API_KEY\"\n            )\n        base_url = self.base_url or os.getenv(\"vLLM_ENDPOINT_URL\")\n        if base_url is None:\n            raise ValueError(\n                \"An endpoint URL needs to be provided in either the endpoint_url parameter or as an environment variable named vLLM_ENDPOINT_URL\"\n            )\n        if not self.llm_client:\n            self.llm_client = OpenAI(base_url=base_url, api_key=api_key)\n        # Use self.temperature if set, otherwise use the temperature argument\n        temp = self.temperature if self.temperature is not None else temperature\n        completion = self.llm_client.chat.completions.create(\n            model=self.model,\n            messages=messages,\n            max_tokens=max_new_tokens if max_new_tokens else 4096,\n            temperature=temp,\n            top_p=top_p,\n            extra_body={\"repetition_penalty\": repetition_penalty},\n        )\n        return completion.choices[0].message.content\n\n\nclass LMMEngineHuggingFace(LMMEngine):\n    def __init__(self, base_url=None, 
api_key=None, rate_limit=-1, **kwargs):\n        self.base_url = base_url\n        self.api_key = api_key\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"HF_TOKEN\")\n        if api_key is None:\n            raise ValueError(\n                \"A HuggingFace token needs to be provided in either the api_key parameter or as an environment variable named HF_TOKEN\"\n            )\n        base_url = self.base_url or os.getenv(\"HF_ENDPOINT_URL\")\n        if base_url is None:\n            raise ValueError(\n                \"HuggingFace endpoint must be provided as base_url parameter or as an environment variable named HF_ENDPOINT_URL.\"\n            )\n        if not self.llm_client:\n            self.llm_client = OpenAI(base_url=base_url, api_key=api_key)\n        return (\n            self.llm_client.chat.completions.create(\n                model=\"tgi\",\n                messages=messages,\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temperature,\n                **kwargs,\n            )\n            .choices[0]\n            .message.content\n        )\n\n\nclass LMMEngineParasail(LMMEngine):\n    def __init__(\n        self, base_url=None, api_key=None, model=None, rate_limit=-1, **kwargs\n    ):\n        assert model is not None, \"Parasail model id must be provided\"\n        self.base_url = base_url\n        self.model = model\n        self.api_key = api_key\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def 
generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"PARASAIL_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"A Parasail API key needs to be provided in either the api_key parameter or as an environment variable named PARASAIL_API_KEY\"\n            )\n        base_url = self.base_url\n        if base_url is None:\n            raise ValueError(\n                \"Parasail endpoint must be provided as base_url parameter or as an environment variable named PARASAIL_ENDPOINT_URL\"\n            )\n        if not self.llm_client:\n            self.llm_client = OpenAI(\n                base_url=base_url if base_url else \"https://api.parasail.io/v1\",\n                api_key=api_key,\n            )\n        return (\n            self.llm_client.chat.completions.create(\n                model=self.model,\n                messages=messages,\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temperature,\n                **kwargs,\n            )\n            .choices[0]\n            .message.content\n        )\n"
  },
  {
    "path": "gui_agents/s2_5/core/mllm.py",
    "content": "import base64\n\nimport numpy as np\n\nfrom gui_agents.s2_5.core.engine import (\n    LMMEngineAnthropic,\n    LMMEngineAzureOpenAI,\n    LMMEngineHuggingFace,\n    LMMEngineOpenAI,\n    LMMEngineOpenRouter,\n    LMMEngineParasail,\n    LMMEnginevLLM,\n    LMMEngineGemini,\n)\n\n\nclass LMMAgent:\n    def __init__(self, engine_params=None, system_prompt=None, engine=None):\n        if engine is None:\n            if engine_params is not None:\n                engine_type = engine_params.get(\"engine_type\")\n                if engine_type == \"openai\":\n                    self.engine = LMMEngineOpenAI(**engine_params)\n                elif engine_type == \"anthropic\":\n                    self.engine = LMMEngineAnthropic(**engine_params)\n                elif engine_type == \"azure\":\n                    self.engine = LMMEngineAzureOpenAI(**engine_params)\n                elif engine_type == \"vllm\":\n                    self.engine = LMMEnginevLLM(**engine_params)\n                elif engine_type == \"huggingface\":\n                    self.engine = LMMEngineHuggingFace(**engine_params)\n                elif engine_type == \"gemini\":\n                    self.engine = LMMEngineGemini(**engine_params)\n                elif engine_type == \"open_router\":\n                    self.engine = LMMEngineOpenRouter(**engine_params)\n                elif engine_type == \"parasail\":\n                    self.engine = LMMEngineParasail(**engine_params)\n                else:\n                    raise ValueError(\"engine_type is not supported\")\n            else:\n                raise ValueError(\"engine_params must be provided\")\n        else:\n            self.engine = engine\n\n        self.messages = []  # Empty messages\n\n        if system_prompt:\n            self.add_system_prompt(system_prompt)\n        else:\n            self.add_system_prompt(\"You are a helpful assistant.\")\n\n    def encode_image(self, image_content):\n        # if 
image_content is a path to an image file, check type of the image_content to verify\n        if isinstance(image_content, str):\n            with open(image_content, \"rb\") as image_file:\n                return base64.b64encode(image_file.read()).decode(\"utf-8\")\n        else:\n            return base64.b64encode(image_content).decode(\"utf-8\")\n\n    def reset(\n        self,\n    ):\n\n        self.messages = [\n            {\n                \"role\": \"system\",\n                \"content\": [{\"type\": \"text\", \"text\": self.system_prompt}],\n            }\n        ]\n\n    def add_system_prompt(self, system_prompt):\n        self.system_prompt = system_prompt\n        if len(self.messages) > 0:\n            self.messages[0] = {\n                \"role\": \"system\",\n                \"content\": [{\"type\": \"text\", \"text\": self.system_prompt}],\n            }\n        else:\n            self.messages.append(\n                {\n                    \"role\": \"system\",\n                    \"content\": [{\"type\": \"text\", \"text\": self.system_prompt}],\n                }\n            )\n\n    def remove_message_at(self, index):\n        \"\"\"Remove a message at a given index\"\"\"\n        if index < len(self.messages):\n            self.messages.pop(index)\n\n    def replace_message_at(\n        self, index, text_content, image_content=None, image_detail=\"high\"\n    ):\n        \"\"\"Replace a message at a given index\"\"\"\n        if index < len(self.messages):\n            self.messages[index] = {\n                \"role\": self.messages[index][\"role\"],\n                \"content\": [{\"type\": \"text\", \"text\": text_content}],\n            }\n            if image_content:\n                base64_image = self.encode_image(image_content)\n                self.messages[index][\"content\"].append(\n                    {\n                        \"type\": \"image_url\",\n                        \"image_url\": {\n                           
 \"url\": f\"data:image/png;base64,{base64_image}\",\n                            \"detail\": image_detail,\n                        },\n                    }\n                )\n\n    def add_message(\n        self,\n        text_content,\n        image_content=None,\n        role=None,\n        image_detail=\"high\",\n        put_text_last=False,\n    ):\n        \"\"\"Add a new message to the list of messages\"\"\"\n\n        # API-style inference from OpenAI and AzureOpenAI\n        if isinstance(\n            self.engine,\n            (\n                LMMEngineOpenAI,\n                LMMEngineAzureOpenAI,\n                LMMEngineHuggingFace,\n                LMMEngineGemini,\n                LMMEngineOpenRouter,\n                LMMEngineParasail,\n            ),\n        ):\n            # infer role from previous message\n            if role != \"user\":\n                if self.messages[-1][\"role\"] == \"system\":\n                    role = \"user\"\n                elif self.messages[-1][\"role\"] == \"user\":\n                    role = \"assistant\"\n                elif self.messages[-1][\"role\"] == \"assistant\":\n                    role = \"user\"\n\n            message = {\n                \"role\": role,\n                \"content\": [{\"type\": \"text\", \"text\": text_content}],\n            }\n\n            if isinstance(image_content, np.ndarray) or image_content:\n                # Check if image_content is a list or a single image\n                if isinstance(image_content, list):\n                    # If image_content is a list of images, loop through each image\n                    for image in image_content:\n                        base64_image = self.encode_image(image)\n                        message[\"content\"].append(\n                            {\n                                \"type\": \"image_url\",\n                                \"image_url\": {\n                                    \"url\": 
f\"data:image/png;base64,{base64_image}\",\n                                    \"detail\": image_detail,\n                                },\n                            }\n                        )\n                else:\n                    # If image_content is a single image, handle it directly\n                    base64_image = self.encode_image(image_content)\n                    message[\"content\"].append(\n                        {\n                            \"type\": \"image_url\",\n                            \"image_url\": {\n                                \"url\": f\"data:image/png;base64,{base64_image}\",\n                                \"detail\": image_detail,\n                            },\n                        }\n                    )\n\n            # Rotate text to be the last message if desired\n            if put_text_last:\n                text_content = message[\"content\"].pop(0)\n                message[\"content\"].append(text_content)\n\n            self.messages.append(message)\n\n        # For API-style inference from Anthropic\n        elif isinstance(self.engine, LMMEngineAnthropic):\n            # infer role from previous message\n            if role != \"user\":\n                if self.messages[-1][\"role\"] == \"system\":\n                    role = \"user\"\n                elif self.messages[-1][\"role\"] == \"user\":\n                    role = \"assistant\"\n                elif self.messages[-1][\"role\"] == \"assistant\":\n                    role = \"user\"\n\n            message = {\n                \"role\": role,\n                \"content\": [{\"type\": \"text\", \"text\": text_content}],\n            }\n\n            if image_content:\n                # Check if image_content is a list or a single image\n                if isinstance(image_content, list):\n                    # If image_content is a list of images, loop through each image\n                    for image in image_content:\n                      
  base64_image = self.encode_image(image)\n                        message[\"content\"].append(\n                            {\n                                \"type\": \"image\",\n                                \"source\": {\n                                    \"type\": \"base64\",\n                                    \"media_type\": \"image/png\",\n                                    \"data\": base64_image,\n                                },\n                            }\n                        )\n                else:\n                    # If image_content is a single image, handle it directly\n                    base64_image = self.encode_image(image_content)\n                    message[\"content\"].append(\n                        {\n                            \"type\": \"image\",\n                            \"source\": {\n                                \"type\": \"base64\",\n                                \"media_type\": \"image/png\",\n                                \"data\": base64_image,\n                            },\n                        }\n                    )\n            self.messages.append(message)\n\n        # Locally hosted vLLM model inference\n        elif isinstance(self.engine, LMMEnginevLLM):\n            # infer role from previous message\n            if role != \"user\":\n                if self.messages[-1][\"role\"] == \"system\":\n                    role = \"user\"\n                elif self.messages[-1][\"role\"] == \"user\":\n                    role = \"assistant\"\n                elif self.messages[-1][\"role\"] == \"assistant\":\n                    role = \"user\"\n\n            message = {\n                \"role\": role,\n                \"content\": [{\"type\": \"text\", \"text\": text_content}],\n            }\n\n            if image_content:\n                # Check if image_content is a list or a single image\n                if isinstance(image_content, list):\n                    # If image_content is a 
list of images, loop through each image\n                    for image in image_content:\n                        base64_image = self.encode_image(image)\n                        message[\"content\"].append(\n                            {\n                                \"type\": \"image_url\",\n                                \"image_url\": {\n                                    \"url\": f\"data:image;base64,{base64_image}\"\n                                },\n                            }\n                        )\n                else:\n                    # If image_content is a single image, handle it directly\n                    base64_image = self.encode_image(image_content)\n                    message[\"content\"].append(\n                        {\n                            \"type\": \"image_url\",\n                            \"image_url\": {\"url\": f\"data:image;base64,{base64_image}\"},\n                        }\n                    )\n\n            self.messages.append(message)\n        else:\n            raise ValueError(\"engine_type is not supported\")\n\n    def get_response(\n        self,\n        user_message=None,\n        messages=None,\n        temperature=0.0,\n        max_new_tokens=None,\n        use_thinking=False,\n        **kwargs,\n    ):\n        \"\"\"Generate the next response based on previous messages\"\"\"\n        if messages is None:\n            messages = self.messages\n        if user_message:\n            messages.append(\n                {\"role\": \"user\", \"content\": [{\"type\": \"text\", \"text\": user_message}]}\n            )\n\n        # Thinking enabled for Claude Sonnet 3.7 and Gemini 2.5 Pro\n        if use_thinking:\n            return self.engine.generate_with_thinking(\n                messages,\n                temperature=temperature,\n                max_new_tokens=max_new_tokens,\n                **kwargs,\n            )\n\n        # Regular generation\n        return self.engine.generate(\n      
      messages,\n            temperature=temperature,\n            max_new_tokens=max_new_tokens,\n            **kwargs,\n        )\n"
  },
  {
    "path": "gui_agents/s2_5/core/module.py",
    "content": "from typing import Dict, Optional\nfrom gui_agents.s2_5.core.mllm import LMMAgent\n\n\nclass BaseModule:\n    def __init__(self, engine_params: Dict, platform: str):\n        self.engine_params = engine_params\n        self.platform = platform\n\n    def _create_agent(\n        self, system_prompt: str = None, engine_params: Optional[Dict] = None\n    ) -> LMMAgent:\n        \"\"\"Create a new LMMAgent instance\"\"\"\n        agent = LMMAgent(engine_params or self.engine_params)\n        if system_prompt:\n            agent.add_system_prompt(system_prompt)\n        return agent\n"
  },
  {
    "path": "gui_agents/s2_5/memory/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s2_5/memory/procedural_memory.py",
    "content": "import inspect\nimport textwrap\n\n\nclass PROCEDURAL_MEMORY:\n    @staticmethod\n    def construct_simple_worker_procedural_memory(agent_class, skipped_actions):\n        procedural_memory = textwrap.dedent(\n            f\"\"\"\\\n        You are an expert in graphical user interfaces and Python code. You are responsible for executing the task: `TASK_DESCRIPTION`.\n        You are working in CURRENT_OS.\n        You are provided with:\n        1. A screenshot of the current time step.\n        2. The history of your previous interactions with the UI.\n        3. Access to the following class and methods to interact with the UI:\n        class Agent:\n        \"\"\"\n        )\n\n        for attr_name in dir(agent_class):\n            if attr_name in skipped_actions:\n                continue\n\n            attr = getattr(agent_class, attr_name)\n            if callable(attr) and hasattr(attr, \"is_agent_action\"):\n                # Use inspect to get the full function signature\n                signature = inspect.signature(attr)\n                procedural_memory += f\"\"\"\n    def {attr_name}{signature}:\n    '''{attr.__doc__}'''\n        \"\"\"\n\n        procedural_memory += textwrap.dedent(\n            \"\"\"\n        Your response should be formatted like this:\n        (Previous action verification)\n        Carefully analyze based on the screenshot if the previous action was successful. If the previous action was not successful, provide a reason for the failure.\n\n        (Screenshot Analysis)\n        Closely examine and describe the current state of the desktop along with the currently open applications.\n\n        (Next Action)\n        Based on the current screenshot and the history of your previous interaction with the UI, decide on the next action in natural language to accomplish the given task.\n\n        (Grounded Action)\n        Translate the next action into code using the provided API methods. 
Format the code like this:\n        ```python\n        agent.click(\"The menu button at the top right of the window\", 1, \"left\")\n        ```\n        Note for the code:\n        1. Only perform one action at a time.\n        2. Do not put anything other than python code in the block. You can only use one function call at a time. Do not put more than one function call in the block.\n        3. You must use only the available methods provided above to interact with the UI, do not invent new methods.\n        4. Only return one code block every time. There must be a single line of code in the code block.\n        5. Do not do anything other than the exact specified task. Return with `agent.done()` immediately after the subtask is completed or `agent.fail()` if it cannot be completed.\n        6. Whenever possible, your grounded action should use hot-keys with the agent.hotkey() action instead of clicking or dragging.\n        7. My computer's password is 'osworld-public-evaluation', feel free to use it when you need sudo rights.\n        8. Generate agent.fail() as your grounded action if you get exhaustively stuck on the task and believe it is impossible.\n        9. Generate agent.done() as your grounded action when your believe the task is fully complete.\n        10. Do not use the \"command\" + \"tab\" hotkey on MacOS.\n        \"\"\"\n        )\n\n        return procedural_memory.strip()\n\n    # For reflection agent, post-action verification mainly for cycle detection\n    REFLECTION_ON_TRAJECTORY = textwrap.dedent(\n        \"\"\"\n    You are an expert computer use agent designed to reflect on the trajectory of a task and provide feedback on what has happened so far.\n    You have access to the Task Description and the Current Trajectory of another computer agent. The Current Trajectory is a sequence of a desktop image, chain-of-thought reasoning, and a desktop action for each time step. 
The last image is the screen's display after the last action.\n    Your task is to generate a reflection. Your generated reflection must fall under one of the cases listed below:\n\n    Case 1. The trajectory is not going according to plan. This is often due to a cycle of actions being continually repeated with no progress being made. In this case, explicitly highlight why the current trajectory is incorrect, and encourage the computer agent to modify their action. However, DO NOT encourage a specific action in particular.\n    Case 2. The trajectory is going according to plan. In this case, simply tell the agent to continue proceeding as planned. DO NOT encourage a specific action in particular.\n    Case 3. You believe the current task has been completed. In this case, tell the agent that the task has been successfully completed.\n    \n    To be successful, you must follow the rules below:\n    - **Your output MUST be based on one of the case options above**.\n    - DO NOT suggest any specific future plans or actions. Your only goal is to provide a reflection, not an actual plan or action.\n    - Any response that falls under Case 1 should explain why the trajectory is not going according to plan. You should especially lookout for cycles of actions that are continually repeated with no progress.\n    - Any response that falls under Case 2 should be concise, since you just need to affirm the agent to continue with the current trajectory.\n    \"\"\"\n    )\n\n    PHRASE_TO_WORD_COORDS_PROMPT = textwrap.dedent(\n        \"\"\"\n    You are an expert in graphical user interfaces. Your task is to process a phrase of text, and identify the most relevant word on the computer screen.\n    You are provided with a phrase, a table with all the text on the screen, and a screenshot of the computer screen. 
You will identify the single word id that is best associated with the provided phrase.\n    This single word must be displayed on the computer screenshot, and its location on the screen should align with the provided phrase.\n    Each row in the text table provides 2 pieces of data in the following order. 1st is the unique word id. 2nd is the corresponding word.\n\n    To be successful, it is very important to follow all these rules:\n    1. First, think step by step and generate your reasoning about which word id to click on.\n    2. Then, output the unique word id. Remember, the word id is the 1st number in each row of the text table.\n    3. If there are multiple occurrences of the same word, use the surrounding context in the phrase to choose the correct one. Pay very close attention to punctuation and capitalization.\n\n    \"\"\"\n    )\n"
  },
  {
    "path": "gui_agents/s2_5/utils/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s2_5/utils/common_utils.py",
    "content": "import re\nimport time\n\nfrom typing import Tuple\n\n\ndef call_llm_safe(agent, temperature: float = 0.0, use_thinking: bool = False) -> str:\n    # Retry if fails\n    max_retries = 3  # Set the maximum number of retries\n    attempt = 0\n    response = \"\"\n    while attempt < max_retries:\n        try:\n            response = agent.get_response(\n                temperature=temperature, use_thinking=use_thinking\n            )\n            assert response is not None, \"Response from agent should not be None\"\n            print(\"Response success!\")\n            break  # If successful, break out of the loop\n        except Exception as e:\n            attempt += 1\n            print(f\"Attempt {attempt} failed: {e}\")\n            if attempt == max_retries:\n                print(\"Max retries reached. Handling failure.\")\n        time.sleep(1.0)\n    return response if response is not None else \"\"\n\n\ndef split_thinking_response(full_response: str) -> Tuple[str, str]:\n    try:\n        # Extract thoughts section\n        thoughts_match = re.search(\n            r\"<thoughts>(.*?)</thoughts>\", full_response, re.DOTALL\n        )\n        thoughts = thoughts_match.group(1).strip()\n        # Extract answer section\n        answer_match = re.search(r\"<answer>(.*?)</answer>\", full_response, re.DOTALL)\n        answer = answer_match.group(1).strip()\n        return answer, thoughts\n    except Exception as e:\n        return full_response, \"\"\n\n\ndef parse_single_code_from_string(input_string):\n    input_string = input_string.strip()\n    if input_string.strip() in [\"WAIT\", \"DONE\", \"FAIL\"]:\n        return input_string.strip()\n\n    # This regular expression will match both ```code``` and ```python code```\n    # and capture the `code` part. 
It uses a non-greedy match for the content inside.\n    pattern = r\"```(?:\\w+\\s+)?(.*?)```\"\n    # Find all non-overlapping matches in the string\n    matches = re.findall(pattern, input_string, re.DOTALL)\n\n    # The regex above captures the content inside the triple backticks.\n    # The `re.DOTALL` flag allows the dot `.` to match newline characters as well,\n    # so the code inside backticks can span multiple lines.\n\n    # matches now contains all the captured code snippets\n\n    codes = []\n\n    for match in matches:\n        match = match.strip()\n        commands = [\n            \"WAIT\",\n            \"DONE\",\n            \"FAIL\",\n        ]  # fixme: updates this part when we have more commands\n\n        if match in commands:\n            codes.append(match.strip())\n        elif match.split(\"\\n\")[-1] in commands:\n            if len(match.split(\"\\n\")) > 1:\n                codes.append(\"\\n\".join(match.split(\"\\n\")[:-1]))\n            codes.append(match.split(\"\\n\")[-1])\n        else:\n            codes.append(match)\n\n    if len(codes) <= 0:\n        return \"fail\"\n    return codes[0]\n\n\ndef sanitize_code(code):\n    # This pattern captures the outermost double-quoted text\n    if \"\\n\" in code:\n        pattern = r'(\".*?\")'\n        # Find all matches in the text\n        matches = re.findall(pattern, code, flags=re.DOTALL)\n        if matches:\n            # Replace the first occurrence only\n            first_match = matches[0]\n            code = code.replace(first_match, f'\"\"\"{first_match[1:-1]}\"\"\"', 1)\n    return code\n\n\ndef extract_first_agent_function(code_string):\n    # Regular expression pattern to match 'agent' functions with any arguments, including nested parentheses\n    pattern = r'agent\\.[a-zA-Z_]+\\((?:[^()\\'\"]|\\'[^\\']*\\'|\"[^\"]*\")*\\)'\n\n    # Find all matches in the string\n    matches = re.findall(pattern, code_string)\n\n    # Return the first match if found, otherwise return 
None\n    return matches[0] if matches else None\n"
  },
  {
    "path": "gui_agents/s3/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s3/agents/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s3/agents/agent_s.py",
    "content": "import logging\nimport platform\nfrom typing import Dict, List, Tuple\n\nfrom gui_agents.s3.agents.grounding import ACI\nfrom gui_agents.s3.agents.worker import Worker\n\nlogger = logging.getLogger(\"desktopenv.agent\")\n\n\nclass UIAgent:\n    \"\"\"Base class for UI automation agents\"\"\"\n\n    def __init__(\n        self,\n        worker_engine_params: Dict,\n        grounding_agent: ACI,\n        platform: str = platform.system().lower(),\n    ):\n        \"\"\"Initialize UIAgent\n\n        Args:\n            worker_engine_params: Configuration parameters for the worker LLM agent\n            grounding_agent: Instance of ACI class for UI interaction\n            platform: Operating system platform (macos, linux, windows)\n        \"\"\"\n        self.worker_engine_params = worker_engine_params\n        self.grounding_agent = grounding_agent\n        self.platform = platform\n\n    def reset(self) -> None:\n        \"\"\"Reset agent state\"\"\"\n        pass\n\n    def predict(self, instruction: str, observation: Dict) -> Tuple[Dict, List[str]]:\n        \"\"\"Generate next action prediction\n\n        Args:\n            instruction: Natural language instruction\n            observation: Current UI state observation\n\n        Returns:\n            Tuple containing agent info dictionary and list of actions\n        \"\"\"\n        pass\n\n\nclass AgentS3(UIAgent):\n    \"\"\"Agent that uses no hierarchy for less inference time\"\"\"\n\n    def __init__(\n        self,\n        worker_engine_params: Dict,\n        grounding_agent: ACI,\n        platform: str = platform.system().lower(),\n        max_trajectory_length: int = 8,\n        enable_reflection: bool = True,\n    ):\n        \"\"\"Initialize a minimalist AgentS2 without hierarchy\n\n        Args:\n            worker_engine_params: Configuration parameters for the worker agent.\n            grounding_agent: Instance of ACI class for UI interaction\n            platform: Operating system 
platform (darwin, linux, windows)\n            max_trajectory_length: Maximum number of image turns to keep\n            enable_reflection: Creates a reflection agent to assist the worker agent\n        \"\"\"\n\n        super().__init__(worker_engine_params, grounding_agent, platform)\n        self.max_trajectory_length = max_trajectory_length\n        self.enable_reflection = enable_reflection\n\n        self.reset()\n\n    def reset(self) -> None:\n        \"\"\"Reset agent state and initialize components\"\"\"\n        self.executor = Worker(\n            worker_engine_params=self.worker_engine_params,\n            grounding_agent=self.grounding_agent,\n            platform=self.platform,\n            max_trajectory_length=self.max_trajectory_length,\n            enable_reflection=self.enable_reflection,\n        )\n\n    def predict(self, instruction: str, observation: Dict) -> Tuple[Dict, List[str]]:\n        # Ask the worker for its info dictionary and the next actions\n        executor_info, actions = self.executor.generate_next_action(\n            instruction=instruction, obs=observation\n        )\n\n        # Copy the worker info into a fresh dictionary (empty when the worker returned no info)\n        info = {**{k: v for d in [executor_info or {}] for k, v in d.items()}}\n\n        return info, actions\n"
  },
  {
    "path": "gui_agents/s3/agents/code_agent.py",
    "content": "import logging\nfrom typing import Dict, List, Tuple, Optional\n\nfrom gui_agents.s3.memory.procedural_memory import PROCEDURAL_MEMORY\nfrom gui_agents.s3.utils.common_utils import call_llm_safe, split_thinking_response\nfrom gui_agents.s3.core.mllm import LMMAgent\n\nlogger = logging.getLogger(\"desktopenv.agent\")\n\n\ndef extract_code_block(action: str) -> Tuple[Optional[str], Optional[str]]:\n    \"\"\"Extract code and determine type from action string.\"\"\"\n    if \"```python\" in action:\n        code_type = \"python\"\n        code = action.split(\"```python\")[1].split(\"```\")[0].strip()\n    elif \"```bash\" in action:\n        code_type = \"bash\"\n        code = action.split(\"```bash\")[1].split(\"```\")[0].strip()\n    elif \"```\" in action:\n        code_type = None\n        code = action.split(\"```\")[1].split(\"```\")[0].strip()\n    else:\n        code_type = None\n        code = None\n\n    logger.debug(\n        f\"Extracted code block: type={code_type}, length={len(code) if code else 0}\"\n    )\n    return code_type, code\n\n\ndef execute_code(code_type: str, code: str, env_controller) -> Dict:\n    \"\"\"Execute code based on its type.\"\"\"\n    # Log the full code being executed (untruncated)\n    logger.info(f\"CODING_AGENT_CODE_EXECUTION - Type: {code_type}\\nCode:\\n{code}\")\n\n    try:\n        if code_type == \"bash\":\n            result = env_controller.run_bash_script(code, timeout=30)\n        elif code_type == \"python\":\n            result = env_controller.run_python_script(code)\n        else:\n            result = {\"status\": \"error\", \"error\": f\"Unknown code type: {code_type}\"}\n\n        return result\n\n    except Exception as e:\n        logger.error(f\"Error executing {code_type} code: {e}\")\n        return {\"status\": \"error\", \"error\": str(e)}\n\n\ndef format_result(result: Dict, step_count: int) -> str:\n    \"\"\"Format execution result into context string.\"\"\"\n    if not result:\n   
     logger.warning(f\"Step {step_count + 1}: No result returned from execution\")\n        return f\"\"\"\nStep {step_count + 1} Error:\nError: No result returned from execution\n\"\"\"\n\n    status = result.get(\"status\", \"unknown\")\n    return_code = result.get(\"returncode\", result.get(\"return_code\", -1))\n\n    # Handle different response structures for bash vs python\n    if \"returncode\" in result:\n        # Bash script response\n        output = result.get(\"output\", \"\")  # Contains both stdout and stderr merged\n        error = result.get(\"error\", \"\")  # Always empty for bash\n    else:\n        # Python script response\n        output = result.get(\"output\", \"\")  # stdout only\n        error = result.get(\"error\", \"\")  # stderr only\n\n    logger.debug(f\"Step {step_count + 1}: Status={status}, Return Code={return_code}\")\n\n    # Format with better structure for multi-line outputs\n    result_text = f\"Step {step_count + 1} Result:\\n\"\n    result_text += f\"Status: {status}\\n\"\n    result_text += f\"Return Code: {return_code}\\n\"\n\n    if output:\n        result_text += f\"Output:\\n{output}\\n\"\n\n    if error:\n        result_text += f\"Error:\\n{error}\\n\"\n\n    return result_text\n\n\nclass CodeAgent:\n    \"\"\"A dedicated agent for executing code with a budget of steps.\"\"\"\n\n    def __init__(self, engine_params: Dict, budget: int = 20):\n        \"\"\"Initialize the CodeAgent.\"\"\"\n        if not engine_params:\n            raise ValueError(\"engine_params cannot be None or empty\")\n\n        self.engine_params = engine_params\n        self.budget = budget\n        self.agent = None\n\n        logger.info(f\"CodeAgent initialized with budget={budget}\")\n        self.reset()\n\n    def reset(self):\n        \"\"\"Reset the code agent state.\"\"\"\n        logger.debug(\"Resetting CodeAgent state\")\n        self.agent = LMMAgent(\n            engine_params=self.engine_params,\n            
system_prompt=PROCEDURAL_MEMORY.CODE_AGENT_PROMPT,\n        )\n\n    def execute(self, task_instruction: str, screenshot: str, env_controller) -> Dict:\n        \"\"\"Execute code for the given task with a budget of steps.\"\"\"\n        if env_controller is None:\n            raise ValueError(\"env_controller is required for code execution\")\n\n        print(f\"\\n🚀 STARTING CODE EXECUTION\")\n        print(\"=\" * 60)\n        print(f\"Task: {task_instruction}\")\n        print(f\"Budget: {self.budget} steps\")\n        print(\"=\" * 60)\n\n        logger.info(f\"Starting code execution for task: {task_instruction}\")\n        logger.info(f\"Budget: {self.budget} steps\")\n\n        self.reset()\n\n        # Add initial task instruction and screenshot context as user message\n        context = (\n            f\"Task: {task_instruction}\\n\\nCurrent screenshot is provided for context.\"\n        )\n        self.agent.add_message(context, image_content=screenshot, role=\"user\")\n\n        step_count = 0\n        execution_history = []\n\n        while step_count < self.budget:\n            logger.info(f\"Step {step_count + 1}/{self.budget}\")\n\n            # Get assistant response (thoughts and code)\n            response = call_llm_safe(self.agent, temperature=1)\n\n            # Print to terminal for immediate visibility\n            print(f\"\\n🤖 CODING AGENT RESPONSE - Step {step_count + 1}/{self.budget}\")\n            print(\"=\" * 60)\n            print(response)\n            print(\"=\" * 60)\n\n            # Log the latest message from the coding agent (untruncated)\n            logger.info(\n                f\"CODING_AGENT_LATEST_MESSAGE - Step {step_count + 1}:\\n{response}\"\n            )\n\n            # Check if response is None or empty\n            if not response or response.strip() == \"\":\n                error_msg = f\"Step {step_count + 1}: LLM returned empty response\"\n                logger.error(error_msg)\n                raise 
RuntimeError(error_msg)\n\n            # Parse the response to extract action\n            action, thoughts = split_thinking_response(response)\n\n            execution_history.append(\n                {\"step\": step_count + 1, \"action\": action, \"thoughts\": thoughts}\n            )\n\n            # Check for completion signals\n            action_upper = action.upper().strip()\n            if action_upper == \"DONE\":\n                print(f\"\\n✅ TASK COMPLETED - Step {step_count + 1}\")\n                print(\"=\" * 60)\n                print(\"Agent signaled task completion\")\n                print(\"=\" * 60)\n                logger.info(f\"Step {step_count + 1}: Task completed successfully\")\n                completion_reason = \"DONE\"\n                break\n            elif action_upper == \"FAIL\":\n                print(f\"\\n❌ TASK FAILED - Step {step_count + 1}\")\n                print(\"=\" * 60)\n                print(\"Agent signaled task failure\")\n                print(\"=\" * 60)\n                logger.info(f\"Step {step_count + 1}: Task failed by agent request\")\n                completion_reason = \"FAIL\"\n                break\n\n            # Extract and execute code\n            code_type, code = extract_code_block(action)\n\n            if code:\n                result = execute_code(code_type, code, env_controller)\n                # Prepare formatted output and error for logging\n                output = result.get(\"output\", \"\")\n                error = result.get(\"error\", \"\")\n                message = result.get(\"message\", \"\")\n                status = result.get(\"status\", \"\")\n\n                # Print execution result to terminal for immediate visibility\n                print(f\"\\n⚡ CODE EXECUTION RESULT - Step {step_count + 1}\")\n                print(\"-\" * 50)\n                print(f\"Status: {status}\")\n                if output:\n                    print(f\"Output:\\n{output}\")\n               
 if error:\n                    print(f\"Error:\\n{error}\")\n                if message and not output and not error:\n                    print(f\"Message:\\n{message}\")\n                print(\"-\" * 50)\n\n                log_lines = [\n                    f\"CODING_AGENT_EXECUTION_RESULT - Step {step_count + 1}:\",\n                    f\"Status: {status}\" if status else None,\n                ]\n\n                if output:\n                    log_lines.append(\n                        \"Output:\\n\" + (\"-\" * 40) + f\"\\n{output}\\n\" + (\"-\" * 40)\n                    )\n                if error:\n                    log_lines.append(\n                        \"Error:\\n\" + (\"!\" * 40) + f\"\\n{error}\\n\" + (\"!\" * 40)\n                    )\n                if message and not output and not error:\n                    log_lines.append(\n                        \"Message:\\n\" + (\"-\" * 40) + f\"\\n{message}\\n\" + (\"-\" * 40)\n                    )\n\n                # Remove None entries and join\n                formatted_log = \"\\n\".join([line for line in log_lines if line])\n                logger.info(formatted_log)\n            else:\n                print(f\"\\n⚠️  NO CODE BLOCK FOUND - Step {step_count + 1}\")\n                print(\"-\" * 50)\n                print(\"Action did not contain executable code\")\n                print(\"-\" * 50)\n\n                logger.warning(f\"Step {step_count + 1}: No code block found in action\")\n                result = {\"status\": \"skipped\", \"message\": \"No code block found\"}\n                logger.info(\n                    f\"CODING_AGENT_EXECUTION_RESULT - Step {step_count + 1}:\\n\"\n                    f\"Status: skipped\\n\"\n                    f\"Message:\\n{'-' * 40}\\n{result['message']}\\n{'-' * 40}\"\n                )\n            # Add assistant's thoughts and code to message history\n            self.agent.add_message(response, role=\"assistant\")\n\n            # Process 
result and add formatted environment results as user message\n            result_context = format_result(result, step_count)\n            self.agent.add_message(result_context, role=\"user\")\n\n            step_count += 1\n\n        # Handle budget exhaustion\n        if \"completion_reason\" not in locals():\n            print(f\"\\n⏰ BUDGET EXHAUSTED - {step_count} steps completed\")\n            print(\"=\" * 60)\n            print(f\"Maximum budget of {self.budget} steps reached\")\n            print(\"=\" * 60)\n            logger.info(f\"Budget exhausted after {step_count} steps\")\n            completion_reason = f\"BUDGET_EXHAUSTED_AFTER_{step_count}_STEPS\"\n\n        # Generate final summary\n        logger.info(\"Generating execution summary\")\n        summary = self._generate_summary(execution_history, task_instruction)\n\n        result = {\n            \"task_instruction\": task_instruction,\n            \"completion_reason\": completion_reason,\n            \"summary\": summary,\n            \"execution_history\": execution_history,\n            \"steps_executed\": step_count,\n            \"budget\": self.budget,\n        }\n\n        logger.info(f\"Code execution completed: steps={step_count}\")\n        return result\n\n    def _generate_summary(\n        self, execution_history: List[Dict], task_instruction: str\n    ) -> str:\n        \"\"\"Generate summary of code execution session.\"\"\"\n        if not execution_history:\n            logger.info(\"No execution history to summarize\")\n            return \"No actions were executed.\"\n\n        logger.info(f\"Generated summary for {len(execution_history)} steps\")\n\n        # Build detailed execution context for summary agent\n        execution_context = f\"Task: {task_instruction}\\n\\nExecution Steps:\\n\"\n\n        for step in execution_history:\n            step_num = step[\"step\"]\n            thoughts = step.get(\"thoughts\", \"\")\n            action = step.get(\"action\", 
\"\")\n\n            execution_context += f\"\\nStep {step_num}:\\n\"\n            if thoughts:\n                execution_context += f\"Thoughts: {thoughts}\\n\"\n            execution_context += f\"Code: {action}\\n\"\n\n        # Create summary prompt with same context as coding agent\n        summary_prompt = f\"\"\"\n{execution_context}\n\nPlease provide a concise summary of the code execution session. Focus on:\n\n1. The code logic implemented at each step\n2. The outputs and results produced by each code execution\n3. The progression of the solution approach\n\nDo not make judgments about success or failure. Simply describe what was attempted and what resulted.\n\nKeep the summary under 150 words and use clear, factual language.\n\"\"\"\n\n        # Generate summary using LLM with dedicated summary system prompt\n        try:\n            summary_agent = LMMAgent(\n                engine_params=self.engine_params,\n                system_prompt=PROCEDURAL_MEMORY.CODE_SUMMARY_AGENT_PROMPT,\n            )\n            summary_agent.add_message(summary_prompt, role=\"user\")\n            summary = call_llm_safe(summary_agent, temperature=1)\n\n            if not summary or summary.strip() == \"\":\n                summary = \"Summary generation failed - no response from LLM\"\n                logger.warning(\"Summary generation failed - empty response from LLM\")\n\n        except Exception as e:\n            summary = f\"Summary generation failed: {str(e)}\"\n            logger.error(f\"Error generating summary: {e}\")\n\n        return summary\n"
  },
  {
    "path": "gui_agents/s3/agents/grounding.py",
    "content": "import re\nfrom collections import defaultdict\nfrom io import BytesIO\nfrom typing import Any, Dict, List, Optional, Tuple\n\nimport pytesseract\nfrom PIL import Image\nfrom pytesseract import Output\n\nfrom gui_agents.s3.memory.procedural_memory import PROCEDURAL_MEMORY\nfrom gui_agents.s3.core.mllm import LMMAgent\nfrom gui_agents.s3.utils.common_utils import call_llm_safe\nfrom gui_agents.s3.agents.code_agent import CodeAgent\nimport logging\n\nlogger = logging.getLogger(\"desktopenv.agent\")\n\n\nclass ACI:\n    def __init__(self):\n        self.notes: List[str] = []\n\n\n# Agent action decorator\ndef agent_action(func):\n    func.is_agent_action = True\n    return func\n\n\nUBUNTU_APP_SETUP = f\"\"\"import subprocess;\nimport difflib;\nimport pyautogui;\npyautogui.press('escape');\ntime.sleep(0.5);\noutput = subprocess.check_output(['wmctrl', '-lx']);\noutput = output.decode('utf-8').splitlines();\nwindow_titles = [line.split(None, 4)[2] for line in output];\nclosest_matches = difflib.get_close_matches('APP_NAME', window_titles, n=1, cutoff=0.1);\nif closest_matches:\n    closest_match = closest_matches[0];\n    for line in output:\n        if closest_match in line:\n            window_id = line.split()[0]\n            break;\nsubprocess.run(['wmctrl', '-ia', window_id])\nsubprocess.run(['wmctrl', '-ir', window_id, '-b', 'add,maximized_vert,maximized_horz'])\n\"\"\"\n\n\nSET_CELL_VALUES_CMD = \"\"\"import uno\nimport subprocess\nimport unicodedata, json\n\ndef identify_document_type(component):\n    if component.supportsService(\"com.sun.star.sheet.SpreadsheetDocument\"):\n        return \"Calc\"\n\n    if component.supportsService(\"com.sun.star.text.TextDocument\"):\n        return \"Writer\"\n\n    if component.supportsService(\"com.sun.star.sheet.PresentationDocument\"):\n        return \"Impress\"\n\n    return None\n\ndef _norm_name(s: str | None) -> str | None:\n    if s is None:\n        return None\n    if \"\\\\\\\\u\" in s or 
\"\\\\\\\\U\" in s or \"\\\\\\\\x\" in s:\n        try:\n            # json.loads handles all the escape forms safely\n            s = json.loads(f\"{{s}}\")\n        except Exception:\n            # fallback: best-effort\n            try:\n                s = s.encode(\"utf-8\").decode(\"unicode_escape\")\n            except Exception:\n                pass\n    # Normalize (NFC works well across platforms)\n    return unicodedata.normalize(\"NFC\", s)\n\ndef cell_ref_to_indices(cell_ref):\n    column_letters = ''.join(filter(str.isalpha, cell_ref))\n    row_number = ''.join(filter(str.isdigit, cell_ref))\n\n    col = sum((ord(char.upper()) - ord('A') + 1) * (26**idx) for idx, char in enumerate(reversed(column_letters))) - 1\n    row = int(row_number) - 1\n    return col, row\n\ndef set_cell_values(new_cell_values: dict[str, str], app_name: str = \"Untitled 1\", sheet_name: str = \"Sheet1\"):\n    app_name  = _norm_name(app_name)\n    sheet_name = _norm_name(sheet_name)\n\n    new_cell_values_idx = {{}}\n    for k, v in new_cell_values.items():\n        try:\n            col, row = cell_ref_to_indices(k)\n        except:\n            col = row = None\n\n        if col is not None and row is not None:\n            new_cell_values_idx[(col, row)] = v\n\n    # Clean up previous TCP connections.\n    subprocess.run(\n        'echo \\\"osworld-public-evaluation\\\" | sudo -S ss --kill --tcp state TIME-WAIT sport = :2002',\n        shell=True,\n        check=True,\n        text=True,\n        capture_output=True\n    )\n\n    # Dynamically allow soffice to listen on port 2002.\n    subprocess.run(\n        [\n            \"soffice\",\n            \"--accept=socket,host=localhost,port=2002;urp;StarOffice.Service\"\n        ]\n    )\n\n    local_context = uno.getComponentContext()\n    resolver = local_context.ServiceManager.createInstanceWithContext(\n        \"com.sun.star.bridge.UnoUrlResolver\", local_context\n    )\n    context = resolver.resolve(\n        
f\"uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext\"\n    )\n    desktop = context.ServiceManager.createInstanceWithContext(\n        \"com.sun.star.frame.Desktop\", context\n    )\n\n    # Collect all LibreOffice-related opened windows.\n    documents = []\n    for i, component in enumerate(desktop.Components):\n        title = component.Title\n        doc_type = identify_document_type(component)\n        documents.append((i, component, title, doc_type))\n\n    # Find the LibreOffice Calc app and the sheet of interest.\n    spreadsheet = [doc for doc in documents if doc[3] == \"Calc\"]\n    selected_spreadsheet = [doc for doc in spreadsheet if doc[2] == app_name]\n    if spreadsheet:\n        try:\n            if selected_spreadsheet:\n                spreadsheet = selected_spreadsheet[0][1]\n            else:\n                spreadsheet = spreadsheet[0][1]\n\n            sheet = spreadsheet.Sheets.getByName(sheet_name)\n        except:\n            raise ValueError(f\"Could not find sheet {{sheet_name}} in {{app_name}}.\")\n\n        for (col, row), value in new_cell_values_idx.items():\n            cell = sheet.getCellByPosition(col, row)\n\n            # Set the cell value.\n            if isinstance(value, (int, float)):\n                cell.Value = value\n            elif isinstance(value, str):\n                if value.startswith(\"=\"):\n                    cell.Formula = value\n                else:\n                    cell.String = value\n            elif isinstance(value, bool):\n                cell.Value = 1 if value else 0\n            elif value is None:\n                cell.clearContents(0)\n            else:\n                raise ValueError(f\"Unsupported cell value type: {{type(value)}}\")\n\n    else:\n        raise ValueError(f\"Could not find LibreOffice Calc app corresponding to {{app_name}}.\")\n\nset_cell_values(new_cell_values={cell_values}, app_name=\"{app_name}\", sheet_name=\"{sheet_name}\")        
\n\"\"\"\n\n\n# ACI primitives are parameterized by description, and coordinate generation uses a pretrained grounding model\nclass OSWorldACI(ACI):\n    def __init__(\n        self,\n        env,\n        platform: str,\n        engine_params_for_generation: Dict,\n        engine_params_for_grounding: Dict,\n        width: int = 1920,\n        height: int = 1080,\n        code_agent_budget: int = 20,\n        code_agent_engine_params: Dict = None,\n    ):\n        super().__init__()\n\n        self.env = env\n        self.platform = (\n            platform  # Dictates how the switch_applications agent action works.\n        )\n\n        # Configure scaling\n        self.width = width\n        self.height = height\n\n        # Maintain state for save_to_knowledge\n        self.notes = []\n\n        # Screenshot used during ACI execution\n        self.obs = None\n\n        # Configure the visual grounding model responsible for coordinate generation\n        self.grounding_model = LMMAgent(engine_params_for_grounding)\n        self.engine_params_for_grounding = engine_params_for_grounding\n\n        # Configure text grounding agent\n        self.text_span_agent = LMMAgent(\n            engine_params=engine_params_for_generation,\n            system_prompt=PROCEDURAL_MEMORY.PHRASE_TO_WORD_COORDS_PROMPT,\n        )\n\n        # Configure code agent\n        code_agent_engine_params = (\n            code_agent_engine_params or engine_params_for_generation\n        )\n        self.code_agent = CodeAgent(code_agent_engine_params, code_agent_budget)\n\n        # Store task instruction for code agent\n        self.current_task_instruction = None\n        self.last_code_agent_result = None\n\n    # Given the state and worker's referring expression, use the grounding model to generate (x,y)\n    def generate_coords(self, ref_expr: str, obs: Dict) -> List[int]:\n\n        # Reset the grounding model state\n        self.grounding_model.reset()\n\n        # Configure the 
context, UI-TARS demo does not use system prompt\n        prompt = f\"Query:{ref_expr}\\nOutput only the coordinate of one point in your response.\\n\"\n        self.grounding_model.add_message(\n            text_content=prompt, image_content=obs[\"screenshot\"], put_text_last=True\n        )\n\n        # Generate and parse coordinates\n        response = call_llm_safe(self.grounding_model)\n        print(\"RAW GROUNDING MODEL RESPONSE:\", response)\n        numericals = re.findall(r\"\\d+\", response)\n        assert len(numericals) >= 2\n        return [int(numericals[0]), int(numericals[1])]\n\n    # Calls pytesseract to generate word level bounding boxes for text grounding\n    def get_ocr_elements(self, b64_image_data: str) -> Tuple[str, List]:\n        image = Image.open(BytesIO(b64_image_data))\n        image_data = pytesseract.image_to_data(image, output_type=Output.DICT)\n\n        # Clean text by removing leading and trailing spaces and non-alphabetical characters, but keeping punctuation\n        for i, word in enumerate(image_data[\"text\"]):\n            image_data[\"text\"][i] = re.sub(\n                r\"^[^a-zA-Z\\s.,!?;:\\-\\+]+|[^a-zA-Z\\s.,!?;:\\-\\+]+$\", \"\", word\n            )\n\n        ocr_elements = []\n        ocr_table = \"Text Table:\\nWord id\\tText\\n\"\n        # Obtain the <id, text, group number, word number> for each valid element\n        grouping_map = defaultdict(list)\n        ocr_id = 0\n        for i in range(len(image_data[\"text\"])):\n            block_num = image_data[\"block_num\"][i]\n            if image_data[\"text\"][i]:\n                grouping_map[block_num].append(image_data[\"text\"][i])\n                ocr_table += f\"{ocr_id}\\t{image_data['text'][i]}\\n\"\n                ocr_elements.append(\n                    {\n                        \"id\": ocr_id,\n                        \"text\": image_data[\"text\"][i],\n                        \"group_num\": block_num,\n                        \"word_num\": 
len(grouping_map[block_num]),\n                        \"left\": image_data[\"left\"][i],\n                        \"top\": image_data[\"top\"][i],\n                        \"width\": image_data[\"width\"][i],\n                        \"height\": image_data[\"height\"][i],\n                    }\n                )\n                ocr_id += 1\n\n        return ocr_table, ocr_elements\n\n    # Given the state and worker's text phrase, generate the coords of the first/last word in the phrase\n    def generate_text_coords(\n        self, phrase: str, obs: Dict, alignment: str = \"\"\n    ) -> List[int]:\n\n        ocr_table, ocr_elements = self.get_ocr_elements(obs[\"screenshot\"])\n\n        alignment_prompt = \"\"\n        if alignment == \"start\":\n            alignment_prompt = \"**Important**: Output the word id of the FIRST word in the provided phrase.\\n\"\n        elif alignment == \"end\":\n            alignment_prompt = \"**Important**: Output the word id of the LAST word in the provided phrase.\\n\"\n\n        # Load LLM prompt\n        self.text_span_agent.reset()\n        self.text_span_agent.add_message(\n            alignment_prompt + \"Phrase: \" + phrase + \"\\n\" + ocr_table, role=\"user\"\n        )\n        self.text_span_agent.add_message(\n            \"Screenshot:\\n\", image_content=obs[\"screenshot\"], role=\"user\"\n        )\n\n        # Obtain the target element\n        response = call_llm_safe(self.text_span_agent)\n        print(\"TEXT SPAN AGENT RESPONSE:\", response)\n        numericals = re.findall(r\"\\d+\", response)\n        if len(numericals) > 0:\n            text_id = int(numericals[-1])\n        else:\n            text_id = 0\n        elem = ocr_elements[text_id]\n\n        # Compute the element coordinates\n        if alignment == \"start\":\n            coords = [elem[\"left\"], elem[\"top\"] + (elem[\"height\"] // 2)]\n        elif alignment == \"end\":\n            coords = [elem[\"left\"] + elem[\"width\"], elem[\"top\"] 
+ (elem[\"height\"] // 2)]\n        else:\n            coords = [\n                elem[\"left\"] + (elem[\"width\"] // 2),\n                elem[\"top\"] + (elem[\"height\"] // 2),\n            ]\n        return coords\n\n    def assign_screenshot(self, obs: Dict):\n        self.obs = obs\n\n    def set_task_instruction(self, task_instruction: str):\n        \"\"\"Set the current task instruction for the code agent.\"\"\"\n        self.current_task_instruction = task_instruction\n\n    # Resize from grounding model dim into OSWorld dim (1920 * 1080)\n    def resize_coordinates(self, coordinates: List[int]) -> List[int]:\n        grounding_width = self.engine_params_for_grounding[\"grounding_width\"]\n        grounding_height = self.engine_params_for_grounding[\"grounding_height\"]\n\n        return [\n            round(coordinates[0] * self.width / grounding_width),\n            round(coordinates[1] * self.height / grounding_height),\n        ]\n\n    @agent_action\n    def click(\n        self,\n        element_description: str,\n        num_clicks: int = 1,\n        button_type: str = \"left\",\n        hold_keys: List = [],\n    ):\n        \"\"\"Click on the element\n        Args:\n            element_description:str, a detailed descriptions of which element to click on. 
This description should be at least a full sentence.\n            num_clicks:int, number of times to click the element\n            button_type:str, which mouse button to press can be \"left\", \"middle\", or \"right\"\n            hold_keys:List, list of keys to hold while clicking\n        \"\"\"\n        coords1 = self.generate_coords(element_description, self.obs)\n        x, y = self.resize_coordinates(coords1)\n        command = \"import pyautogui; \"\n\n        # TODO: specified duration?\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"\"\"import pyautogui; pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); \"\"\"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n        # Return pyautoguicode to click on the element\n        return command\n\n    @agent_action\n    def switch_applications(self, app_code):\n        \"\"\"Switch to a different application that is already open\n        Args:\n            app_code:str the code name of the application to switch to from the provided list of open applications\n        \"\"\"\n        if self.platform == \"darwin\":\n            return f\"import pyautogui; import time; pyautogui.hotkey('command', 'space', interval=0.5); pyautogui.typewrite({repr(app_code)}); pyautogui.press('enter'); time.sleep(1.0)\"\n        elif self.platform == \"linux\":\n            return UBUNTU_APP_SETUP.replace(\"APP_NAME\", app_code)\n        elif self.platform == \"windows\":\n            return f\"import pyautogui; import time; pyautogui.hotkey('win', 'd', interval=0.5); pyautogui.typewrite({repr(app_code)}); pyautogui.press('enter'); time.sleep(1.0)\"\n        else:\n            assert (\n                False\n            ), f\"Unsupported platform: {self.platform}. 
Supported platforms are: darwin, linux, windows.\"\n\n    @agent_action\n    def open(self, app_or_filename: str):\n        \"\"\"Open any application or file with name app_or_filename. Use this action to open applications or files on the desktop, do not open manually.\n        Args:\n            app_or_filename:str, the name of the application or filename to open\n        \"\"\"\n        if self.platform == \"linux\":\n            return f\"import pyautogui; pyautogui.hotkey('win'); time.sleep(0.5); pyautogui.write({repr(app_or_filename)}); time.sleep(1.0); pyautogui.hotkey('enter'); time.sleep(0.5)\"\n        elif self.platform == \"darwin\":\n            return f\"import pyautogui; import time; pyautogui.hotkey('command', 'space', interval=0.5); pyautogui.typewrite({repr(app_or_filename)}); pyautogui.press('enter'); time.sleep(1.0)\"\n        elif self.platform == \"windows\":\n            return (\n                \"import pyautogui; import time; \"\n                \"pyautogui.hotkey('win'); time.sleep(0.5); \"\n                f\"pyautogui.write({repr(app_or_filename)}); time.sleep(1.0); \"\n                \"pyautogui.press('enter'); time.sleep(0.5)\"\n            )\n        else:\n            assert (\n                False\n            ), f\"Unsupported platform: {self.platform}. Supported platforms are: darwin, linux, windows.\"\n\n    @agent_action\n    def type(\n        self,\n        element_description: Optional[str] = None,\n        text: str = \"\",\n        overwrite: bool = False,\n        enter: bool = False,\n    ):\n        \"\"\"Type text/unicode into a specific element\n        Args:\n            element_description:str, a detailed description of which element to enter text in. This description should be at least a full sentence.\n            text:str, the text to type\n            overwrite:bool, Assign it to True if the text should overwrite the existing text, otherwise assign it to False. 
Using this argument clears all text in an element.\n            enter:bool, Assign it to True if the enter key should be pressed after typing the text, otherwise assign it to False.\n        \"\"\"\n        command = \"import pyautogui; \"\n        command += (\n            \"\\ntry:\\n\"\n            \"    import pyperclip\\n\"\n            \"except ImportError:\\n\"\n            \"    import subprocess\\n\"\n            \"    subprocess.run('echo \\\"osworld-public-evaluation\\\" | sudo -S apt-get install -y xclip xsel', shell=True, check=True)\\n\"\n            \"    subprocess.check_call([subprocess.sys.executable, '-m', 'pip', 'install', 'pyperclip'])\\n\"\n            \"    import pyperclip\\n\\n\"\n        )\n\n        if element_description is not None:\n            coords1 = self.generate_coords(element_description, self.obs)\n            x, y = self.resize_coordinates(coords1)\n            command += f\"pyautogui.click({x}, {y}); \"\n\n        if overwrite:\n            command += (\n                f\"pyautogui.hotkey({repr('command' if self.platform == 'darwin' else 'ctrl')}, 'a'); \"\n                \"pyautogui.press('backspace'); \"\n            )\n\n        # Check if text contains Unicode characters that pyautogui.write() can't handle\n        has_unicode = any(ord(char) > 127 for char in text)\n\n        if has_unicode:\n            # Use clipboard method for Unicode characters\n            command += f\"pyperclip.copy({repr(text)}); \"\n            command += f\"pyautogui.hotkey({repr('command' if self.platform == 'darwin' else 'ctrl')}, 'v'); \"\n        else:\n            # Use regular pyautogui.write() for ASCII text\n            command += f\"pyautogui.write({repr(text)}); \"\n\n        if enter:\n            command += \"pyautogui.press('enter'); \"\n        return command\n\n    @agent_action\n    def save_to_knowledge(self, text: List[str]):\n        \"\"\"Save facts, elements, texts, etc. 
to a long-term knowledge bank for reuse during this task. Can be used for copy-pasting text, saving elements, etc.\n        Args:\n            text:List[str] the text to save to the knowledge\n        \"\"\"\n        self.notes.extend(text)\n        return \"\"\"WAIT\"\"\"\n\n    @agent_action\n    def drag_and_drop(\n        self, starting_description: str, ending_description: str, hold_keys: List = []\n    ):\n        \"\"\"Drag from the starting description to the ending description\n        Args:\n            starting_description:str, a very detailed description of where to start the drag action. This description should be at least a full sentence.\n            ending_description:str, a very detailed description of where to end the drag action. This description should be at least a full sentence.\n            hold_keys:List list of keys to hold while dragging\n        \"\"\"\n        coords1 = self.generate_coords(starting_description, self.obs)\n        coords2 = self.generate_coords(ending_description, self.obs)\n        x1, y1 = self.resize_coordinates(coords1)\n        x2, y2 = self.resize_coordinates(coords2)\n\n        command = \"import pyautogui; \"\n\n        command += f\"pyautogui.moveTo({x1}, {y1}); \"\n        # TODO: specified duration?\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"pyautogui.dragTo({x2}, {y2}, duration=1., button='left'); pyautogui.mouseUp(); \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n\n        # Return pyautoguicode to drag and drop the elements\n\n        return command\n\n    @agent_action\n    def highlight_text_span(\n        self, starting_phrase: str, ending_phrase: str, button: str = \"left\"\n    ):\n        \"\"\"Highlight a text span between a provided starting phrase and ending phrase. 
Use this to highlight words, lines, and paragraphs.\n        Args:\n            starting_phrase:str, the phrase that denotes the start of the text span you want to highlight. If you only want to highlight one word, just pass in that single word.\n            ending_phrase:str, the phrase that denotes the end of the text span you want to highlight. If you only want to highlight one word, just pass in that single word.\n            button:str, the button to use to highlight the text span. Defaults to \"left\". Can be \"left\", \"right\", or \"middle\".\n        \"\"\"\n        coords1 = self.generate_text_coords(\n            starting_phrase, self.obs, alignment=\"start\"\n        )\n        coords2 = self.generate_text_coords(ending_phrase, self.obs, alignment=\"end\")\n        x1, y1 = coords1\n        x2, y2 = coords2\n\n        command = \"import pyautogui; \"\n        command += f\"pyautogui.moveTo({x1}, {y1}); \"\n        command += f\"pyautogui.dragTo({x2}, {y2}, duration=1., button='{button}'); pyautogui.mouseUp(); \"\n\n        # Return pyautoguicode to drag and drop the elements\n        return command\n\n    @agent_action\n    def set_cell_values(\n        self, cell_values: Dict[str, Any], app_name: str, sheet_name: str\n    ):\n        \"\"\"Use this to set individual cell values in a spreadsheet. For example, setting A2 to \"hello\" would be done by passing {\"A2\": \"hello\"} as cell_values. The sheet must be opened before this command can be used.\n        Args:\n            cell_values: Dict[str, Any], A dictionary of cell values to set in the spreadsheet. The keys are the cell coordinates in the format \"A1\", \"B2\", etc.\n                Supported value types include: float, int, string, bool, formulas.\n            app_name: str, The name of the spreadsheet application. For example, \"Some_sheet.xlsx\".\n            sheet_name: str, The name of the sheet in the spreadsheet. 
For example, \"Sheet1\".\n        \"\"\"\n        return SET_CELL_VALUES_CMD.format(\n            cell_values=cell_values, app_name=app_name, sheet_name=sheet_name\n        )\n\n    @agent_action\n    def call_code_agent(self, task: str = None):\n        \"\"\"Call the code agent to execute code for tasks or subtasks that can be completed solely with coding.\n\n        Args:\n            task: str, the task or subtask to execute. If None, uses the current full task instruction.\n\n        **🚨 CRITICAL GUIDELINES:**\n        - **ONLY pass a task parameter for SPECIFIC subtasks** (e.g., \"Calculate sum of column B\", \"Filter data by date\")\n        - **NEVER pass a task parameter for full tasks** - let it default to the original task instruction\n        - **NEVER rephrase or modify the original task** - this prevents hallucination corruption\n        - **If unsure, omit the task parameter entirely** to use the original task instruction\n\n        Use this for tasks that can be fully accomplished through code execution, particularly for:\n        - Spreadsheet applications (LibreOffice Calc, Excel): data processing, filtering, sorting, calculations, formulas, data analysis\n        - Document editors (LibreOffice Writer, Word): text processing, content editing, formatting, document manipulation\n        - Code editors (VS Code, text editors): code editing, file processing, text manipulation, configuration\n        - Data analysis tools: statistical analysis, data transformation, reporting\n        - File management: bulk operations, file processing, content extraction\n        - System utilities: configuration, setup, automation\n        \"\"\"\n        logger.info(\"=\" * 50)\n        logger.info(\"GROUNDING AGENT: Calling Code Agent\")\n        logger.info(\"=\" * 50)\n\n        # **CRITICAL**: Only use provided task for specific subtasks, otherwise use original task instruction\n        if task is not None:\n            # This is a subtask - use the provided 
task\n            task_to_execute = task\n            logger.info(f\"Executing SUBTASK: {task_to_execute}\")\n        else:\n            # This is a full task - use the original task instruction to prevent hallucination\n            task_to_execute = self.current_task_instruction\n            logger.info(f\"Executing FULL TASK: {task_to_execute}\")\n\n        if task_to_execute:\n            print(\"obs keys: \", self.obs.keys())\n            screenshot = self.obs.get(\"screenshot\", \"\") if self.obs else \"\"\n            logger.info(f\"Screenshot available: {'Yes' if screenshot else 'No'}\")\n\n            logger.info(\"Executing code agent...\")\n            result = self.code_agent.execute(\n                task_to_execute, screenshot, self.env.controller\n            )\n\n            # Store the result for the worker to access\n            self.last_code_agent_result = result\n\n            logger.info(\"Code agent execution completed\")\n            logger.info(f\"Result - Completion reason: {result['completion_reason']}\")\n            logger.info(f\"Steps executed: {result['steps_executed']}\")\n            logger.info(f\"Summary: {result['summary']}\")\n\n            logger.info(\"=\" * 50)\n            logger.info(\"GROUNDING AGENT: Code Agent Call Finished\")\n            logger.info(\"=\" * 50)\n\n            # Return code to be executed in the environment\n            return \"import time; time.sleep(2.222)\"\n        else:\n            logger.warning(\"No task instruction available for code agent call\")\n            return \"import time; time.sleep(1.111)\"\n\n    @agent_action\n    def scroll(self, element_description: str, clicks: int, shift: bool = False):\n        \"\"\"Scroll the element in the specified direction\n        Args:\n            element_description:str, a very detailed description of which element to enter scroll in. 
This description should be at least a full sentence.\n            clicks:int, the number of clicks to scroll can be positive (up) or negative (down).\n            shift:bool, whether to use shift+scroll for horizontal scrolling\n        \"\"\"\n        coords1 = self.generate_coords(element_description, self.obs)\n        x, y = self.resize_coordinates(coords1)\n\n        if shift:\n            return f\"import pyautogui; import time; pyautogui.moveTo({x}, {y}); time.sleep(0.5); pyautogui.hscroll({clicks})\"\n        else:\n            return f\"import pyautogui; import time; pyautogui.moveTo({x}, {y}); time.sleep(0.5); pyautogui.vscroll({clicks})\"\n\n    @agent_action\n    def hotkey(self, keys: List):\n        \"\"\"Press a hotkey combination\n        Args:\n            keys:List the keys to press in combination in a list format (e.g. ['ctrl', 'c'])\n        \"\"\"\n        # add quotes around the keys\n        keys = [f\"'{key}'\" for key in keys]\n        return f\"import pyautogui; pyautogui.hotkey({', '.join(keys)})\"\n\n    @agent_action\n    def hold_and_press(self, hold_keys: List, press_keys: List):\n        \"\"\"Hold a list of keys and press a list of keys\n        Args:\n            hold_keys:List, list of keys to hold\n            press_keys:List, list of keys to press in a sequence\n        \"\"\"\n\n        press_keys_str = \"[\" + \", \".join([f\"'{key}'\" for key in press_keys]) + \"]\"\n        command = \"import pyautogui; \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyDown({repr(k)}); \"\n        command += f\"pyautogui.press({press_keys_str}); \"\n        for k in hold_keys:\n            command += f\"pyautogui.keyUp({repr(k)}); \"\n\n        return command\n\n    @agent_action\n    def wait(self, time: float):\n        \"\"\"Wait for a specified amount of time\n        Args:\n            time:float the amount of time to wait in seconds\n        \"\"\"\n        return f\"\"\"import time; time.sleep({time})\"\"\"\n\n   
 @agent_action\n    def done(\n        self,\n    ):\n        \"\"\"End the current task with a success. Use this when you believe the entire task has been fully completed.\"\"\"\n        return \"\"\"DONE\"\"\"\n\n    @agent_action\n    def fail(self):\n        \"\"\"End the current task with a failure. Use this when you believe the entire task is impossible to complete.\"\"\"\n        return \"\"\"FAIL\"\"\"\n"
  },
  {
    "path": "gui_agents/s3/agents/worker.py",
    "content": "from functools import partial\nimport logging\nimport textwrap\nfrom typing import Dict, List, Tuple\n\nfrom gui_agents.s3.agents.grounding import ACI\nfrom gui_agents.s3.core.module import BaseModule\nfrom gui_agents.s3.memory.procedural_memory import PROCEDURAL_MEMORY\nfrom gui_agents.s3.utils.common_utils import (\n    call_llm_safe,\n    call_llm_formatted,\n    parse_code_from_string,\n    split_thinking_response,\n    create_pyautogui_code,\n)\nfrom gui_agents.s3.utils.formatters import (\n    SINGLE_ACTION_FORMATTER,\n    CODE_VALID_FORMATTER,\n)\n\nlogger = logging.getLogger(\"desktopenv.agent\")\n\n\nclass Worker(BaseModule):\n    def __init__(\n        self,\n        worker_engine_params: Dict,\n        grounding_agent: ACI,\n        platform: str = \"ubuntu\",\n        max_trajectory_length: int = 8,\n        enable_reflection: bool = True,\n    ):\n        \"\"\"\n        Worker receives the main task and generates actions, without the need of hierarchical planning\n        Args:\n            worker_engine_params: Dict\n                Parameters for the worker agent\n            grounding_agent: Agent\n                The grounding agent to use\n            platform: str\n                OS platform the agent runs on (darwin, linux, windows)\n            max_trajectory_length: int\n                The amount of images turns to keep\n            enable_reflection: bool\n                Whether to enable reflection\n        \"\"\"\n        super().__init__(worker_engine_params, platform)\n\n        self.temperature = worker_engine_params.get(\"temperature\", 0.0)\n        self.use_thinking = worker_engine_params.get(\"model\", \"\") in [\n            \"claude-opus-4-20250514\",\n            \"claude-sonnet-4-20250514\",\n            \"claude-3-7-sonnet-20250219\",\n            \"claude-sonnet-4-5-20250929\",\n            \"claude-opus-4-5-20251101\",\n        ]\n        self.grounding_agent = grounding_agent\n        
self.max_trajectory_length = max_trajectory_length\n        self.enable_reflection = enable_reflection\n\n        self.reset()\n\n    def reset(self):\n        if self.platform != \"linux\":\n            skipped_actions = [\"set_cell_values\"]\n        else:\n            skipped_actions = []\n\n        # Hide code agent action entirely if no env/controller is available\n        if not getattr(self.grounding_agent, \"env\", None) or not getattr(\n            getattr(self.grounding_agent, \"env\", None), \"controller\", None\n        ):\n            skipped_actions.append(\"call_code_agent\")\n\n        sys_prompt = PROCEDURAL_MEMORY.construct_simple_worker_procedural_memory(\n            type(self.grounding_agent), skipped_actions=skipped_actions\n        ).replace(\"CURRENT_OS\", self.platform)\n\n        self.generator_agent = self._create_agent(sys_prompt)\n        self.reflection_agent = self._create_agent(\n            PROCEDURAL_MEMORY.REFLECTION_ON_TRAJECTORY\n        )\n\n        self.turn_count = 0\n        self.worker_history = []\n        self.reflections = []\n        self.cost_this_turn = 0\n        self.screenshot_inputs = []\n\n    def flush_messages(self):\n        \"\"\"Flush messages based on the model's context limits.\n\n        This method ensures that the agent's message history does not exceed the maximum trajectory length.\n\n        Side Effects:\n            - Modifies the messages of generator, reflection, and bon_judge agents to fit within the context limits.\n        \"\"\"\n        engine_type = self.engine_params.get(\"engine_type\", \"\")\n\n        # Flush strategy for long-context models: keep all text, only keep latest images\n        if engine_type in [\"anthropic\", \"openai\", \"gemini\"]:\n            max_images = self.max_trajectory_length\n            for agent in [self.generator_agent, self.reflection_agent]:\n                if agent is None:\n                    continue\n                # keep latest k images\n            
    img_count = 0\n                for i in range(len(agent.messages) - 1, -1, -1):\n                    for j in range(len(agent.messages[i][\"content\"])):\n                        if \"image\" in agent.messages[i][\"content\"][j].get(\"type\", \"\"):\n                            img_count += 1\n                            if img_count > max_images:\n                                del agent.messages[i][\"content\"][j]\n\n        # Flush strategy for non-long-context models: drop full turns\n        else:\n            # generator msgs are alternating [user, assistant], so 2 per round\n            if len(self.generator_agent.messages) > 2 * self.max_trajectory_length + 1:\n                self.generator_agent.messages.pop(1)\n                self.generator_agent.messages.pop(1)\n            # reflector msgs are all [(user text, user image)], so 1 per round\n            if len(self.reflection_agent.messages) > self.max_trajectory_length + 1:\n                self.reflection_agent.messages.pop(1)\n\n    def _generate_reflection(self, instruction: str, obs: Dict) -> Tuple[str, str]:\n        \"\"\"\n        Generate a reflection based on the current observation and instruction.\n\n        Args:\n            instruction (str): The task instruction.\n            obs (Dict): The current observation containing the screenshot.\n\n        Returns:\n            Optional[str, str]: The generated reflection text and thoughts, if any (turn_count > 0).\n\n        Side Effects:\n            - Updates reflection agent's history\n            - Generates reflection response with API call\n        \"\"\"\n        reflection = None\n        reflection_thoughts = None\n        if self.enable_reflection:\n            # Load the initial message\n            if self.turn_count == 0:\n                text_content = textwrap.dedent(\n                    f\"\"\"\n                    Task Description: {instruction}\n                    Current Trajectory below:\n                    \"\"\"\n  
              )\n                updated_sys_prompt = (\n                    self.reflection_agent.system_prompt + \"\\n\" + text_content\n                )\n                self.reflection_agent.add_system_prompt(updated_sys_prompt)\n                self.reflection_agent.add_message(\n                    text_content=\"The initial screen is provided. No action has been taken yet.\",\n                    image_content=obs[\"screenshot\"],\n                    role=\"user\",\n                )\n            # Load the latest action\n            else:\n                self.reflection_agent.add_message(\n                    text_content=self.worker_history[-1],\n                    image_content=obs[\"screenshot\"],\n                    role=\"user\",\n                )\n                full_reflection = call_llm_safe(\n                    self.reflection_agent,\n                    temperature=self.temperature,\n                    use_thinking=self.use_thinking,\n                )\n                reflection, reflection_thoughts = split_thinking_response(\n                    full_reflection\n                )\n                self.reflections.append(reflection)\n                logger.info(\"REFLECTION THOUGHTS: %s\", reflection_thoughts)\n                logger.info(\"REFLECTION: %s\", reflection)\n        return reflection, reflection_thoughts\n\n    def generate_next_action(self, instruction: str, obs: Dict) -> Tuple[Dict, List]:\n        \"\"\"\n        Predict the next action(s) based on the current observation.\n        \"\"\"\n\n        self.grounding_agent.assign_screenshot(obs)\n        self.grounding_agent.set_task_instruction(instruction)\n\n        generator_message = (\n            \"\"\n            if self.turn_count > 0\n            else \"The initial screen is provided. 
No action has been taken yet.\"\n        )\n\n        # Load the task into the system prompt\n        if self.turn_count == 0:\n            prompt_with_instructions = self.generator_agent.system_prompt.replace(\n                \"TASK_DESCRIPTION\", instruction\n            )\n            self.generator_agent.add_system_prompt(prompt_with_instructions)\n\n        # Get the per-step reflection\n        reflection, reflection_thoughts = self._generate_reflection(instruction, obs)\n        if reflection:\n            generator_message += f\"REFLECTION: You may use this reflection on the previous action and overall trajectory:\\n{reflection}\\n\"\n\n        # Get the grounding agent's knowledge base buffer\n        generator_message += (\n            f\"\\nCurrent Text Buffer = [{','.join(self.grounding_agent.notes)}]\\n\"\n        )\n\n        # Add code agent result from previous step if available (from full task or subtask execution)\n        if (\n            hasattr(self.grounding_agent, \"last_code_agent_result\")\n            and self.grounding_agent.last_code_agent_result is not None\n        ):\n            code_result = self.grounding_agent.last_code_agent_result\n            generator_message += f\"\\nCODE AGENT RESULT:\\n\"\n            generator_message += (\n                f\"Task/Subtask Instruction: {code_result['task_instruction']}\\n\"\n            )\n            generator_message += f\"Steps Completed: {code_result['steps_executed']}\\n\"\n            generator_message += f\"Max Steps: {code_result['budget']}\\n\"\n            generator_message += (\n                f\"Completion Reason: {code_result['completion_reason']}\\n\"\n            )\n            generator_message += f\"Summary: {code_result['summary']}\\n\"\n            if code_result[\"execution_history\"]:\n                generator_message += f\"Execution History:\\n\"\n                for i, step in enumerate(code_result[\"execution_history\"]):\n                    action = 
step[\"action\"]\n                    # Format code snippets with proper backticks\n                    if \"```python\" in action:\n                        # Extract Python code and format it\n                        code_start = action.find(\"```python\") + 9\n                        code_end = action.find(\"```\", code_start)\n                        if code_end != -1:\n                            python_code = action[code_start:code_end].strip()\n                            generator_message += (\n                                f\"Step {i+1}: \\n```python\\n{python_code}\\n```\\n\"\n                            )\n                        else:\n                            generator_message += f\"Step {i+1}: \\n{action}\\n\"\n                    elif \"```bash\" in action:\n                        # Extract Bash code and format it\n                        code_start = action.find(\"```bash\") + 7\n                        code_end = action.find(\"```\", code_start)\n                        if code_end != -1:\n                            bash_code = action[code_start:code_end].strip()\n                            generator_message += (\n                                f\"Step {i+1}: \\n```bash\\n{bash_code}\\n```\\n\"\n                            )\n                        else:\n                            generator_message += f\"Step {i+1}: \\n{action}\\n\"\n                    else:\n                        generator_message += f\"Step {i+1}: \\n{action}\\n\"\n            generator_message += \"\\n\"\n\n            # Log the code agent result section for debugging (truncated execution history)\n            log_message = f\"\\nCODE AGENT RESULT:\\n\"\n            log_message += (\n                f\"Task/Subtask Instruction: {code_result['task_instruction']}\\n\"\n            )\n            log_message += f\"Steps Completed: {code_result['steps_executed']}\\n\"\n            log_message += f\"Max Steps: {code_result['budget']}\\n\"\n            log_message += 
f\"Completion Reason: {code_result['completion_reason']}\\n\"\n            log_message += f\"Summary: {code_result['summary']}\\n\"\n            if code_result[\"execution_history\"]:\n                log_message += f\"Execution History (truncated):\\n\"\n                # Only log first 3 steps and last 2 steps to keep logs manageable\n                total_steps = len(code_result[\"execution_history\"])\n                for i, step in enumerate(code_result[\"execution_history\"]):\n                    if i < 3 or i >= total_steps - 2:  # First 3 and last 2 steps\n                        action = step[\"action\"]\n                        if \"```python\" in action:\n                            code_start = action.find(\"```python\") + 9\n                            code_end = action.find(\"```\", code_start)\n                            if code_end != -1:\n                                python_code = action[code_start:code_end].strip()\n                                log_message += (\n                                    f\"Step {i+1}: ```python\\n{python_code}\\n```\\n\"\n                                )\n                            else:\n                                log_message += f\"Step {i+1}: {action}\\n\"\n                        elif \"```bash\" in action:\n                            code_start = action.find(\"```bash\") + 7\n                            code_end = action.find(\"```\", code_start)\n                            if code_end != -1:\n                                bash_code = action[code_start:code_end].strip()\n                                log_message += (\n                                    f\"Step {i+1}: ```bash\\n{bash_code}\\n```\\n\"\n                                )\n                            else:\n                                log_message += f\"Step {i+1}: {action}\\n\"\n                        else:\n                            log_message += f\"Step {i+1}: {action}\\n\"\n                    elif i == 3 and total_steps 
> 5:\n                        log_message += f\"... (truncated {total_steps - 5} steps) ...\\n\"\n\n            logger.info(\n                f\"WORKER_CODE_AGENT_RESULT_SECTION - Step {self.turn_count + 1}: Code agent result added to generator message:\\n{log_message}\"\n            )\n\n            # Reset the code agent result after adding it to context\n            self.grounding_agent.last_code_agent_result = None\n\n        # Finalize the generator message\n        self.generator_agent.add_message(\n            generator_message, image_content=obs[\"screenshot\"], role=\"user\"\n        )\n\n        # Generate the plan and next action\n        format_checkers = [\n            SINGLE_ACTION_FORMATTER,\n            partial(CODE_VALID_FORMATTER, self.grounding_agent, obs),\n        ]\n        plan = call_llm_formatted(\n            self.generator_agent,\n            format_checkers,\n            temperature=self.temperature,\n            use_thinking=self.use_thinking,\n        )\n        self.worker_history.append(plan)\n        self.generator_agent.add_message(plan, role=\"assistant\")\n        logger.info(\"PLAN:\\n %s\", plan)\n\n        # Extract the next action from the plan\n        plan_code = parse_code_from_string(plan)\n        try:\n            assert plan_code, \"Plan code should not be empty\"\n            exec_code = create_pyautogui_code(self.grounding_agent, plan_code, obs)\n        except Exception as e:\n            logger.error(\n                f\"Could not evaluate the following plan code:\\n{plan_code}\\nError: {e}\"\n            )\n            exec_code = self.grounding_agent.wait(\n                1.333\n            )  # Skip a turn if the code cannot be evaluated\n\n        executor_info = {\n            \"plan\": plan,\n            \"plan_code\": plan_code,\n            \"exec_code\": exec_code,\n            \"reflection\": reflection,\n            \"reflection_thoughts\": reflection_thoughts,\n            \"code_agent_output\": (\n    
            self.grounding_agent.last_code_agent_result\n                if hasattr(self.grounding_agent, \"last_code_agent_result\")\n                and self.grounding_agent.last_code_agent_result is not None\n                else None\n            ),\n        }\n        self.turn_count += 1\n        self.screenshot_inputs.append(obs[\"screenshot\"])\n        self.flush_messages()\n        return executor_info, [exec_code]\n"
  },
  {
    "path": "gui_agents/s3/bbon/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s3/bbon/behavior_narrator.py",
    "content": "from gui_agents.s3.core.mllm import LMMAgent\nfrom gui_agents.s3.memory.procedural_memory import PROCEDURAL_MEMORY\nfrom gui_agents.s3.utils.common_utils import (\n    call_llm_formatted,\n    split_thinking_response,\n    compress_image,\n)\nfrom gui_agents.s3.utils.formatters import (\n    THOUGHTS_ANSWER_TAG_FORMATTER,\n)\nfrom PIL import Image, ImageDraw, ImageFont\nfrom io import BytesIO\nfrom typing import Dict\nimport base64\nimport cv2\nimport numpy as np\n\n\nclass BehaviorNarrator:\n    def __init__(self, engine_params):\n        self.judge_agent = LMMAgent(engine_params=engine_params)\n\n    @staticmethod\n    def extract_mouse_action(action: str) -> list[str]:\n        mouse_actions = []\n        for sub_action in action.split(\";\"):\n            sub_action = sub_action.strip()\n            if (\n                sub_action.startswith(\"pyautogui.click\")\n                or sub_action.startswith(\"pyautogui.moveTo\")\n                or sub_action.startswith(\"pyautogui.dragTo\")\n            ):\n                mouse_actions.append(sub_action)\n        return mouse_actions\n\n    @staticmethod\n    def mark_action(mouse_actions: list[str], img: Image):\n        draw = ImageDraw.Draw(img)\n        font = ImageFont.load_default(25)\n\n        drag_start_width, drag_start_height = None, None\n\n        for mouse_action in mouse_actions:\n            width, height = mouse_action.split(\"(\")[1].strip(\")\").split(\", \")[:2]\n            width, height = int(width), int(height)\n\n            # Clamp coordinates within bounds\n            width = max(0, min(img.width - 1, width))\n            height = max(0, min(img.height - 1, height))\n\n            def place_text(label, color):\n                bbox = draw.textbbox((0, 0), label, font=font)\n                text_w, text_h = (\n                    bbox[2] - bbox[0],\n                    bbox[3] - bbox[1],\n                )  # Measure text size\n                offset_x, offset_y = -5, 5  
# Default offset\n                if width + offset_x + text_w > img.width:  # Out of bounds on right\n                    offset_x = -text_w - 5\n                if height + offset_y + text_h > img.height:  # Out of bounds on bottom\n                    offset_y = -text_h - 5\n                if width + offset_x < 0:  # Out of bounds on left\n                    offset_x = 5\n                if height + offset_y < 0:  # Out of bounds on top\n                    offset_y = 5\n                draw.text(\n                    (width + offset_x, height + offset_y), label, fill=color, font=font\n                )\n\n            if mouse_action.startswith(\"pyautogui.click\"):\n                draw.circle((width, height), radius=3, fill=(255, 0, 0))\n                place_text(\"Click\", (255, 0, 0))\n            if mouse_action.startswith(\"pyautogui.moveTo\"):\n                draw.circle((width, height), radius=3, fill=(0, 0, 255))\n                place_text(\"MoveTo\", (0, 0, 255))\n                drag_start_height, drag_start_width = height, width\n            if mouse_action.startswith(\"pyautogui.dragTo\"):\n                draw.line(\n                    [(drag_start_width, drag_start_height), (width, height)],\n                    fill=(0, 255, 0),\n                    width=2,\n                )\n                draw.circle((width, height), radius=3, fill=(0, 255, 0))\n                place_text(\"DragTo\", (0, 255, 0))\n\n    @staticmethod\n    def get_mouse_action_representation(mouse_actions: list[str]) -> str:\n        \"\"\"\n        Returns a string representation of the mouse action for the given action.\n        \"\"\"\n        assert (\n            len(mouse_actions) <= 2\n        ), f\"Multiple mouse action types found: {mouse_actions}\"\n        if len(mouse_actions) == 1:\n            action = mouse_actions[0]\n            if action.startswith(\"pyautogui.click\"):\n                return \"The red circle labeled 'Click' marks the position where 
the mouse was clicked.\"\n            elif action.startswith(\"pyautogui.moveTo\"):\n                return \"The blue circle labeled 'MoveTo' marks the position where the mouse was moved to.\"\n            else:\n                raise ValueError(f\"Unknown single action type: {action}\")\n        else:\n            assert mouse_actions[0].startswith(\"pyautogui.moveTo\") and mouse_actions[\n                1\n            ].startswith(\"pyautogui.dragTo\")\n            return \"The blue circle labeled 'MoveTo' marks the starting position of the mouse.\\nThe green circle labeled 'DragTo' marks the ending position.\\nThe green line illustrates the mouse's drag path.\"\n\n    @staticmethod\n    def get_zoomed_image(\n        image_bytes: bytes,\n        x: int,\n        y: int,\n        width: int = 300,\n        height: int = 300,\n        upscaling: bool = False,\n        scale: int = 4,\n        add_bounding_box: bool = False,\n    ) -> bytes:\n        \"\"\"Returns a zoomed image centered around (x, y) coordinates.\n\n        Args:\n            image_bytes (bytes): The original image in bytes.\n            x (int): The x-coordinate of the center point.\n            y (int): The y-coordinate of the center point.\n            width (int): The width of the zoomed area.\n            height (int): The height of the zoomed area.\n            padding (int): Extra padding around the zoomed area.\n            upscaling (bool): Whether to upscale and enhance the zoomed image.\n            scale (int): The upscaling factor if upscaling is True.\n            add_bounding_box (bool): Whether to add a bounding box around the zoomed area in the original image.\n\n        Returns:\n            bytes: The zoomed image in bytes.\n            bytes: The original image with bounding box in bytes (if add_bounding_box is True). 
Otherwise, returns original bytes.\n        \"\"\"\n        # Find zoom dimensions\n        img = Image.open(BytesIO(image_bytes)).convert(\"RGB\")\n        cx, cy = x - width // 2, y - height // 2  # Center coordinates\n        W, H = img.size\n        left = min(max(cx, 0), W - width)\n        top = min(max(cy, 0), H - height)\n        right = left + width\n        bottom = top + height\n        zoomed_img = img.crop((left, top, right, bottom))\n        # Add noticeable bounding box to original image\n        if add_bounding_box:\n            draw_img = img.copy()\n            draw = ImageDraw.Draw(draw_img)\n            draw.rectangle([left, top, right, bottom], outline=\"red\", width=3)\n            original_with_box_bytes = compress_image(\n                image=draw_img\n            )  # Compress to reduce size\n        else:\n            original_with_box_bytes = image_bytes\n        if upscaling:\n            # Upscale and enhance zoomed image\n            zoomed_img = cv2.cvtColor(\n                np.array(zoomed_img), cv2.COLOR_RGB2BGR\n            )  # PIL -> OpenCV\n            zoomed_img = cv2.resize(\n                zoomed_img, None, fx=scale, fy=scale, interpolation=cv2.INTER_LANCZOS4\n            )\n            zoomed_img = cv2.fastNlMeansDenoisingColored(\n                zoomed_img, None, 5, 5, 7, 21\n            )  # light denoise (helps with JPEG speckle)\n            zoomed_img = Image.fromarray(\n                cv2.cvtColor(zoomed_img, cv2.COLOR_BGR2RGB)\n            )  # OpenCV -> PIL\n        zoomed_img_bytes = compress_image(image=zoomed_img)  # Compress to reduce size\n        return zoomed_img_bytes, original_with_box_bytes\n\n    def judge(\n        self,\n        screenshot_num: int,\n        before_img_bytes: bytes,\n        after_img_bytes: bytes,\n        pyautogui_action: str,\n    ) -> Dict[str, str]:\n        if pyautogui_action == \"DONE\":\n            return {\n                \"fact_thoughts\": \"The agent has indicated 
that it is done with the task.\",\n                \"fact_answer\": \"The agent has indicated that it is done with the task.\",\n            }\n        elif pyautogui_action == \"FAIL\":\n            return {\n                \"fact_thoughts\": \"The agent has indicated that it is impossible to proceed further with the task.\",\n                \"fact_answer\": \"The agent has indicated that it is impossible to proceed further with the task.\",\n            }\n        # Prepare ANNOTATED BEFORE image\n        mouse_actions = BehaviorNarrator.extract_mouse_action(pyautogui_action)\n        before_img = Image.open(BytesIO(before_img_bytes))\n        BehaviorNarrator.mark_action(mouse_actions, before_img)\n        out_buffer = BytesIO()\n        before_img.save(out_buffer, format=\"PNG\")\n        marked_before_img_bytes = out_buffer.getvalue()\n        marked_before_img_message = {\n            \"type\": \"image_url\",\n            \"image_url\": {\n                \"url\": f\"data:image/png;base64,{base64.b64encode(marked_before_img_bytes).decode('utf-8')}\",\n                \"detail\": \"high\",\n            },\n        }\n        if mouse_actions:\n            coords = mouse_actions[-1].split(\"(\")[1].strip(\")\").split(\", \")\n            x, y = int(coords[0]), int(coords[1])\n            zoomed_after_img_bytes, marked_after_img_bytes = (\n                BehaviorNarrator.get_zoomed_image(\n                    image_bytes=after_img_bytes,\n                    x=x,\n                    y=y,\n                    width=300,\n                    height=300,\n                    scale=4,\n                    upscaling=True,\n                    add_bounding_box=True,\n                )\n            )\n            after_img_message = {\n                \"type\": \"image_url\",\n                \"image_url\": {\n                    \"url\": f\"data:image/png;base64,{base64.b64encode(marked_after_img_bytes).decode('utf-8')}\",\n                    \"detail\": 
\"high\",\n                },\n            }\n            zoomed_after_img_message = {\n                \"type\": \"image_url\",\n                \"image_url\": {\n                    \"url\": f\"data:image/png;base64,{base64.b64encode(zoomed_after_img_bytes).decode('utf-8')}\",\n                    \"detail\": \"high\",\n                },\n            }\n        else:\n            after_img_message = {\n                \"type\": \"image_url\",\n                \"image_url\": {\n                    \"url\": f\"data:image/png;base64,{base64.b64encode(after_img_bytes).decode('utf-8')}\",\n                    \"detail\": \"high\",\n                },\n            }\n            zoomed_after_img_message = None\n\n        fact_message = [\n            {\n                \"role\": \"system\",\n                \"content\": PROCEDURAL_MEMORY.BEHAVIOR_NARRATOR_SYSTEM_PROMPT,\n            }\n        ]\n        fact_message_content = [\n            {\"type\": \"text\", \"text\": \"BEFORE:\"},\n            marked_before_img_message,\n            {\"type\": \"text\", \"text\": f\"Agent Action: {pyautogui_action}\"},\n            {\"type\": \"text\", \"text\": \"AFTER:\"},\n            after_img_message,\n        ]\n        if zoomed_after_img_message:\n            fact_message_content += [\n                {\"type\": \"text\", \"text\": \"ZOOMED AFTER:\"},\n                zoomed_after_img_message,\n            ]\n        fact_message += [{\"role\": \"user\", \"content\": fact_message_content}]\n        fact_response = call_llm_formatted(\n            self.judge_agent,\n            [THOUGHTS_ANSWER_TAG_FORMATTER],\n            messages=fact_message,\n            temperature=0.0,\n        )\n        fact_answer, fact_thoughts = split_thinking_response(fact_response)\n\n        result = {\n            \"fact_thoughts\": fact_thoughts,\n            \"fact_answer\": f\"Fact Caption from Screenshot {screenshot_num}: {fact_answer}\",\n        }\n        return result\n"
  },
  {
    "path": "gui_agents/s3/bbon/comparative_judge.py",
    "content": "import os\nimport base64\nfrom typing import List, Tuple, Optional, List\n\nfrom gui_agents.s3.core.mllm import LMMAgent\nfrom gui_agents.s3.memory.procedural_memory import PROCEDURAL_MEMORY\nfrom gui_agents.s3.utils.common_utils import call_llm_formatted, split_thinking_response\n\n\ndef get_final_screenshot_file(task_dir: str) -> str:\n    \"\"\"Get the final screenshot file name from a task directory.\"\"\"\n    screenshot_files = []\n    for filename in os.listdir(task_dir):\n        if filename.startswith(\"step_\") and filename.endswith(\".png\"):\n            screenshot_files.append(filename)\n\n    if not screenshot_files:\n        return \"step_0.png\"  # fallback\n\n    # Sort by step number and get the last one\n    def extract_step_num(filename):\n        try:\n            return int(filename.split(\"_\")[1].split(\".\")[0])\n        except:\n            return 0\n\n    screenshot_files.sort(key=extract_step_num)\n    return screenshot_files[-1]\n\n\ndef image_to_openai_message_format(\n    image_path: str, caption: str = \"\"\n) -> Optional[dict]:\n    \"\"\"Convert an image file to OpenAI message format.\"\"\"\n    if not os.path.exists(image_path):\n        return None\n\n    try:\n        with open(image_path, \"rb\") as image_file:\n            image_data = base64.b64encode(image_file.read()).decode(\"utf-8\")\n\n        content = []\n        if caption:\n            content.append({\"type\": \"text\", \"text\": caption})\n\n        content.append(\n            {\n                \"type\": \"image_url\",\n                \"image_url\": {\n                    \"url\": f\"data:image/png;base64,{image_data}\",\n                    \"detail\": \"high\",\n                },\n            }\n        )\n\n        return {\"role\": \"user\", \"content\": content}\n    except Exception as e:\n        print(f\"Error loading image {image_path}: {e}\")\n        return None\n\n\nclass ComparativeJudge:\n    def __init__(self, engine_params):\n    
    self.judge_agent = LMMAgent(engine_params=engine_params)\n\n    def judge(\n        self,\n        task_description: str,\n        task: str,\n        result_dirs: List[str],\n        all_fact_captions: List[List[str]],\n    ) -> Tuple[str, str, Optional[str]]:\n        \"\"\"\n        Fact captions + initial/final screenshots judging.\n        Pipeline: use provided fact captions → include initial/final screenshots → judge.\n        \"\"\"\n        num_trajectories = len(result_dirs)\n        system_prompt = PROCEDURAL_MEMORY.VLM_EVALUATOR_PROMPT_COMPARATIVE_BASELINE\n        system_prompt = system_prompt.replace(\n            \"<TASK_DESCRIPTION_INPUT>\", task_description\n        )\n        system_prompt = system_prompt.replace(\n            \"<NUMBER OF TRAJECTORIES>\", str(num_trajectories)\n        )\n\n        messages = [{\"role\": \"system\", \"content\": system_prompt}]\n\n        for i, (result_dir, fact_captions) in enumerate(\n            zip(result_dirs, all_fact_captions)\n        ):\n            task_dir = os.path.join(result_dir, task.split(\"/\")[0], task.split(\"/\")[1])\n            result_initial_screenshot = os.path.join(task_dir, \"step_0.png\")\n            result_final_screenshot = os.path.join(\n                task_dir, get_final_screenshot_file(task_dir)\n            )\n            initial_screenshot_message = image_to_openai_message_format(\n                result_initial_screenshot, caption=f\"Initial screenshot of result{i+1}\"\n            )\n            final_screenshot_message = image_to_openai_message_format(\n                result_final_screenshot, caption=f\"Final screenshot of result{i+1}\"\n            )\n            if (\n                initial_screenshot_message is not None\n                and final_screenshot_message is not None\n            ):\n                messages.append(initial_screenshot_message)\n                messages.append(final_screenshot_message)\n            if fact_captions:\n                
messages.append(\n                    {\n                        \"role\": \"user\",\n                        \"content\": [\n                            {\n                                \"type\": \"text\",\n                                \"text\": f\"Fact captions for Trajectory {i+1}:\",\n                            }\n                        ]\n                        + [\n                            {\"type\": \"text\", \"text\": caption}\n                            for caption in fact_captions\n                        ],\n                    }\n                )\n\n        messages.append(\n            {\n                \"role\": \"user\",\n                \"content\": [\n                    {\n                        \"type\": \"text\",\n                        \"text\": f\"Please evaluate the {num_trajectories} trajectories based on the criteria provided in the system prompt.\",\n                    }\n                ],\n            }\n        )\n\n        response = call_llm_formatted(self.judge_agent, [], messages=messages)\n        answer, thoughts = split_thinking_response(response)\n\n        try:\n            judge_choice = int(answer)\n            if 1 <= judge_choice <= num_trajectories:\n                selected_trajectory = result_dirs[judge_choice - 1]\n            else:\n                selected_trajectory = None\n        except ValueError:\n            selected_trajectory = None\n\n        return answer, thoughts, selected_trajectory\n"
  },
  {
    "path": "gui_agents/s3/cli_app.py",
    "content": "import argparse\nimport datetime\nimport io\nimport logging\nimport os\nimport platform\nimport pyautogui\nimport signal\nimport sys\nimport time\n\nfrom PIL import Image\n\nfrom gui_agents.s3.agents.grounding import OSWorldACI\nfrom gui_agents.s3.agents.agent_s import AgentS3\nfrom gui_agents.s3.utils.local_env import LocalEnv\n\ncurrent_platform = platform.system().lower()\n\n# Global flag to track pause state for debugging\npaused = False\n\n\ndef get_char():\n    \"\"\"Get a single character from stdin without pressing Enter\"\"\"\n    try:\n        # Import termios and tty on Unix-like systems\n        if platform.system() in [\"Darwin\", \"Linux\"]:\n            import termios\n            import tty\n\n            fd = sys.stdin.fileno()\n            old_settings = termios.tcgetattr(fd)\n            try:\n                tty.setraw(sys.stdin.fileno())\n                ch = sys.stdin.read(1)\n            finally:\n                termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)\n            return ch\n        else:\n            # Windows fallback\n            import msvcrt\n\n            return msvcrt.getch().decode(\"utf-8\", errors=\"ignore\")\n    except:\n        return input()  # Fallback for non-terminal environments\n\n\ndef signal_handler(signum, frame):\n    \"\"\"Handle Ctrl+C signal for debugging during agent execution\"\"\"\n    global paused\n\n    if not paused:\n        print(\"\\n\\n🔸 Agent-S Workflow Paused 🔸\")\n        print(\"=\" * 50)\n        print(\"Options:\")\n        print(\"  • Press Ctrl+C again to quit\")\n        print(\"  • Press Esc to resume workflow\")\n        print(\"=\" * 50)\n\n        paused = True\n\n        while paused:\n            try:\n                print(\"\\n[PAUSED] Waiting for input... 
\", end=\"\", flush=True)\n                char = get_char()\n\n                if ord(char) == 3:  # Ctrl+C\n                    print(\"\\n\\n🛑 Exiting Agent-S...\")\n                    sys.exit(0)\n                elif ord(char) == 27:  # Esc\n                    print(\"\\n\\n▶️  Resuming Agent-S workflow...\")\n                    paused = False\n                    break\n                else:\n                    print(f\"\\n   Unknown command: '{char}' (ord: {ord(char)})\")\n\n            except KeyboardInterrupt:\n                print(\"\\n\\n🛑 Exiting Agent-S...\")\n                sys.exit(0)\n    else:\n        # Already paused, second Ctrl+C means quit\n        print(\"\\n\\n🛑 Exiting Agent-S...\")\n        sys.exit(0)\n\n\n# Set up signal handler for Ctrl+C\nsignal.signal(signal.SIGINT, signal_handler)\n\nlogger = logging.getLogger()\nlogger.setLevel(logging.DEBUG)\n\ndatetime_str: str = datetime.datetime.now().strftime(\"%Y%m%d@%H%M%S\")\n\nlog_dir = \"logs\"\nos.makedirs(log_dir, exist_ok=True)\n\nfile_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"normal-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\ndebug_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"debug-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\nstdout_handler = logging.StreamHandler(sys.stdout)\nsdebug_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"sdebug-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\n\nfile_handler.setLevel(logging.INFO)\ndebug_handler.setLevel(logging.DEBUG)\nstdout_handler.setLevel(logging.INFO)\nsdebug_handler.setLevel(logging.DEBUG)\n\nformatter = logging.Formatter(\n    fmt=\"\\x1b[1;33m[%(asctime)s \\x1b[31m%(levelname)s \\x1b[32m%(module)s/%(lineno)d-%(processName)s\\x1b[1;33m] 
\\x1b[0m%(message)s\"\n)\nfile_handler.setFormatter(formatter)\ndebug_handler.setFormatter(formatter)\nstdout_handler.setFormatter(formatter)\nsdebug_handler.setFormatter(formatter)\n\nstdout_handler.addFilter(logging.Filter(\"desktopenv\"))\nsdebug_handler.addFilter(logging.Filter(\"desktopenv\"))\n\nlogger.addHandler(file_handler)\nlogger.addHandler(debug_handler)\nlogger.addHandler(stdout_handler)\nlogger.addHandler(sdebug_handler)\n\nplatform_os = platform.system()\n\n\ndef show_permission_dialog(code: str, action_description: str):\n    \"\"\"Show a platform-specific permission dialog and return True if approved.\"\"\"\n    if platform.system() == \"Darwin\":\n        result = os.system(\n            f'osascript -e \\'display dialog \"Do you want to execute this action?\\n\\n{code} which will try to {action_description}\" with title \"Action Permission\" buttons {{\"Cancel\", \"OK\"}} default button \"OK\" cancel button \"Cancel\"\\''\n        )\n        return result == 0\n    elif platform.system() == \"Linux\":\n        result = os.system(\n            f'zenity --question --title=\"Action Permission\" --text=\"Do you want to execute this action?\\n\\n{code}\" --width=400 --height=200'\n        )\n        return result == 0\n    return False\n\n\ndef scale_screen_dimensions(width: int, height: int, max_dim_size: int):\n    scale_factor = min(max_dim_size / width, max_dim_size / height, 1)\n    safe_width = int(width * scale_factor)\n    safe_height = int(height * scale_factor)\n    return safe_width, safe_height\n\n\ndef run_agent(agent, instruction: str, scaled_width: int, scaled_height: int):\n    global paused\n    obs = {}\n    traj = \"Task:\\n\" + instruction\n    subtask_traj = \"\"\n    for step in range(15):\n        # Check if we're in paused state and wait\n        while paused:\n            time.sleep(0.1)\n        # Get screen shot using pyautogui\n        screenshot = pyautogui.screenshot()\n        screenshot = screenshot.resize((scaled_width, 
scaled_height), Image.LANCZOS)\n\n        # Save the screenshot to a BytesIO object\n        buffered = io.BytesIO()\n        screenshot.save(buffered, format=\"PNG\")\n\n        # Get the byte value of the screenshot\n        screenshot_bytes = buffered.getvalue()\n        # Store the raw PNG bytes in the observation (no base64 encoding is applied here)\n        obs[\"screenshot\"] = screenshot_bytes\n\n        # Check again for pause state before prediction\n        while paused:\n            time.sleep(0.1)\n\n        print(f\"\\n🔄 Step {step + 1}/15: Getting next action from agent...\")\n\n        # Get next action code from the agent\n        info, code = agent.predict(instruction=instruction, observation=obs)\n\n        if \"done\" in code[0].lower() or \"fail\" in code[0].lower():\n            if platform.system() == \"Darwin\":\n                os.system(\n                    f'osascript -e \\'display dialog \"Task Completed\" with title \"OpenACI Agent\" buttons \"OK\" default button \"OK\"\\''\n                )\n            elif platform.system() == \"Linux\":\n                os.system(\n                    f'zenity --info --title=\"OpenACI Agent\" --text=\"Task Completed\" --width=200 --height=100'\n                )\n\n            break\n\n        if \"next\" in code[0].lower():\n            continue\n\n        if \"wait\" in code[0].lower():\n            print(\"⏳ Agent requested wait...\")\n            time.sleep(5)\n            continue\n\n        else:\n            time.sleep(1.0)\n            print(\"EXECUTING CODE:\", code[0])\n\n            # Check for pause state before execution\n            while paused:\n                time.sleep(0.1)\n\n            # Execute the agent-generated code directly (no permission prompt is shown at this point)\n            exec(code[0])\n            time.sleep(1.0)\n\n            # Update task and subtask trajectories\n            if \"reflection\" in info and \"executor_plan\" in info:\n                traj += (\n                    \"\\n\\nReflection:\\n\"\n                    + str(info[\"reflection\"])\n         
           + \"\\n\\n----------------------\\n\\nPlan:\\n\"\n                    + info[\"executor_plan\"]\n                )\n\n\ndef main():\n    parser = argparse.ArgumentParser(description=\"Run AgentS3 with specified model.\")\n    parser.add_argument(\n        \"--provider\",\n        type=str,\n        default=\"openai\",\n        help=\"Specify the provider to use (e.g., openai, anthropic, etc.)\",\n    )\n    parser.add_argument(\n        \"--model\",\n        type=str,\n        default=\"gpt-5-2025-08-07\",\n        help=\"Specify the model to use (e.g., gpt-5-2025-08-07)\",\n    )\n    parser.add_argument(\n        \"--model_url\",\n        type=str,\n        default=\"\",\n        help=\"The URL of the main generation model API.\",\n    )\n    parser.add_argument(\n        \"--model_api_key\",\n        type=str,\n        default=\"\",\n        help=\"The API key of the main generation model.\",\n    )\n    parser.add_argument(\n        \"--model_temperature\",\n        type=float,\n        default=None,\n        help=\"Temperature to fix the generation model at (e.g. 
o3 can only be run with 1.0)\",\n    )\n\n    # Grounding model config: Self-hosted endpoint based (required)\n    parser.add_argument(\n        \"--ground_provider\",\n        type=str,\n        required=True,\n        help=\"The provider for the grounding model\",\n    )\n    parser.add_argument(\n        \"--ground_url\",\n        type=str,\n        required=True,\n        help=\"The URL of the grounding model\",\n    )\n    parser.add_argument(\n        \"--ground_api_key\",\n        type=str,\n        default=\"\",\n        help=\"The API key of the grounding model.\",\n    )\n    parser.add_argument(\n        \"--ground_model\",\n        type=str,\n        required=True,\n        help=\"The model name for the grounding model\",\n    )\n    parser.add_argument(\n        \"--grounding_width\",\n        type=int,\n        required=True,\n        help=\"Width of screenshot image after processor rescaling\",\n    )\n    parser.add_argument(\n        \"--grounding_height\",\n        type=int,\n        required=True,\n        help=\"Height of screenshot image after processor rescaling\",\n    )\n\n    # AgentS3 specific arguments\n    parser.add_argument(\n        \"--max_trajectory_length\",\n        type=int,\n        default=8,\n        help=\"Maximum number of image turns to keep in trajectory\",\n    )\n    parser.add_argument(\n        \"--enable_reflection\",\n        action=\"store_true\",\n        default=True,\n        help=\"Enable reflection agent to assist the worker agent\",\n    )\n    parser.add_argument(\n        \"--enable_local_env\",\n        action=\"store_true\",\n        default=False,\n        help=\"Enable local coding environment for code execution (WARNING: Executes arbitrary code locally)\",\n    )\n    parser.add_argument(\n        \"--task\",\n        type=str,\n        help=\"The task instruction for Agent-S3 to perform.\",\n    )\n\n    args = parser.parse_args()\n\n    # Re-scales screenshot size to ensure it fits in UI-TARS context 
limit\n    screen_width, screen_height = pyautogui.size()\n    scaled_width, scaled_height = scale_screen_dimensions(\n        screen_width, screen_height, max_dim_size=2400\n    )\n\n    # Load the general engine params\n    engine_params = {\n        \"engine_type\": args.provider,\n        \"model\": args.model,\n        \"base_url\": args.model_url,\n        \"api_key\": args.model_api_key,\n        \"temperature\": getattr(args, \"model_temperature\", None),\n    }\n\n    # Load the grounding engine from a custom endpoint\n    engine_params_for_grounding = {\n        \"engine_type\": args.ground_provider,\n        \"model\": args.ground_model,\n        \"base_url\": args.ground_url,\n        \"api_key\": args.ground_api_key,\n        \"grounding_width\": args.grounding_width,\n        \"grounding_height\": args.grounding_height,\n    }\n\n    # Initialize environment based on user preference\n    local_env = None\n    if args.enable_local_env:\n        print(\n            \"⚠️  WARNING: Local coding environment enabled. 
This will execute arbitrary code locally!\"\n        )\n        local_env = LocalEnv()\n\n    grounding_agent = OSWorldACI(\n        env=local_env,\n        platform=current_platform,\n        engine_params_for_generation=engine_params,\n        engine_params_for_grounding=engine_params_for_grounding,\n        width=screen_width,\n        height=screen_height,\n    )\n\n    agent = AgentS3(\n        engine_params,\n        grounding_agent,\n        platform=current_platform,\n        max_trajectory_length=args.max_trajectory_length,\n        enable_reflection=args.enable_reflection,\n    )\n\n    task = args.task\n\n    # handle query from command line\n    if isinstance(task, str) and task.strip():\n        agent.reset()\n        run_agent(agent, task, scaled_width, scaled_height)\n        return\n\n    while True:\n        query = input(\"Query: \")\n\n        agent.reset()\n\n        # Run the agent on your own device\n        run_agent(agent, query, scaled_width, scaled_height)\n\n        response = input(\"Would you like to provide another query? (y/n): \")\n        if response.lower() != \"y\":\n            break\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "gui_agents/s3/core/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s3/core/engine.py",
    "content": "import os\n\nimport backoff\nfrom anthropic import Anthropic\nfrom openai import (\n    AzureOpenAI,\n    APIConnectionError,\n    APIError,\n    AzureOpenAI,\n    OpenAI,\n    RateLimitError,\n)\n\n\nclass LMMEngine:\n    pass\n\n\nclass LMMEngineOpenAI(LMMEngine):\n    def __init__(\n        self,\n        base_url=None,\n        api_key=None,\n        model=None,\n        rate_limit=-1,\n        temperature=None,\n        organization=None,\n        **kwargs,\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.base_url = base_url\n        self.api_key = api_key\n        self.organization = organization\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n        self.temperature = temperature  # Can force temperature to be the same (in the case of o3 requiring temperature to be 1)\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"OPENAI_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named OPENAI_API_KEY\"\n            )\n        organization = self.organization or os.getenv(\"OPENAI_ORG_ID\")\n        if not self.llm_client:\n            if not self.base_url:\n                self.llm_client = OpenAI(api_key=api_key, organization=organization)\n            else:\n                self.llm_client = OpenAI(\n                    base_url=self.base_url, api_key=api_key, organization=organization\n                )\n        return (\n            self.llm_client.chat.completions.create(\n                model=self.model,\n                messages=messages,\n                # 
max_completion_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=(\n                    temperature if self.temperature is None else self.temperature\n                ),\n                **kwargs,\n            )\n            .choices[0]\n            .message.content\n        )\n\n\nclass LMMEngineAnthropic(LMMEngine):\n    def __init__(\n        self,\n        base_url=None,\n        api_key=None,\n        model=None,\n        thinking=False,\n        temperature=None,\n        **kwargs,\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.thinking = thinking\n        self.api_key = api_key\n        self.llm_client = None\n        self.temperature = temperature\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"ANTHROPIC_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named ANTHROPIC_API_KEY\"\n            )\n        self.llm_client = Anthropic(api_key=api_key)\n        # Use the instance temperature if not specified in the call\n        temp = self.temperature if temperature is None else temperature\n        if self.thinking:\n            full_response = self.llm_client.messages.create(\n                system=messages[0][\"content\"][0][\"text\"],\n                model=self.model,\n                messages=messages[1:],\n                max_tokens=8192,\n                thinking={\"type\": \"enabled\", \"budget_tokens\": 4096},\n                **kwargs,\n            )\n            thoughts = full_response.content[0].thinking\n            return full_response.content[1].text\n        return (\n            self.llm_client.messages.create(\n        
        system=messages[0][\"content\"][0][\"text\"],\n                model=self.model,\n                messages=messages[1:],\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temp,\n                **kwargs,\n            )\n            .content[0]\n            .text\n        )\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    # Compatible with Claude-3.7 Sonnet thinking mode\n    def generate_with_thinking(\n        self, messages, temperature=0.0, max_new_tokens=None, **kwargs\n    ):\n        \"\"\"Generate the next message based on previous messages, and keeps the thinking tokens\"\"\"\n        api_key = self.api_key or os.getenv(\"ANTHROPIC_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named ANTHROPIC_API_KEY\"\n            )\n        self.llm_client = Anthropic(api_key=api_key)\n        full_response = self.llm_client.messages.create(\n            system=messages[0][\"content\"][0][\"text\"],\n            model=self.model,\n            messages=messages[1:],\n            max_tokens=8192,\n            thinking={\"type\": \"enabled\", \"budget_tokens\": 4096},\n            **kwargs,\n        )\n\n        thoughts = full_response.content[0].thinking\n        answer = full_response.content[1].text\n        full_response = (\n            f\"<thoughts>\\n{thoughts}\\n</thoughts>\\n\\n<answer>\\n{answer}\\n</answer>\\n\"\n        )\n        return full_response\n\n\nclass LMMEngineGemini(LMMEngine):\n    def __init__(\n        self,\n        base_url=None,\n        api_key=None,\n        model=None,\n        rate_limit=-1,\n        temperature=None,\n        **kwargs,\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.base_url = base_url\n      
  self.api_key = api_key\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n        self.temperature = temperature\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"GEMINI_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named GEMINI_API_KEY\"\n            )\n        base_url = self.base_url or os.getenv(\"GEMINI_ENDPOINT_URL\")\n        if base_url is None:\n            raise ValueError(\n                \"An endpoint URL needs to be provided in either the endpoint_url parameter or as an environment variable named GEMINI_ENDPOINT_URL\"\n            )\n        if not self.llm_client:\n            self.llm_client = OpenAI(base_url=base_url, api_key=api_key)\n        # Use the temperature passed to generate, otherwise use the instance's temperature, otherwise default to 0.0\n        temp = self.temperature if temperature is None else temperature\n        return (\n            self.llm_client.chat.completions.create(\n                model=self.model,\n                messages=messages,\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temp,\n                **kwargs,\n            )\n            .choices[0]\n            .message.content\n        )\n\n\nclass LMMEngineOpenRouter(LMMEngine):\n    def __init__(\n        self,\n        base_url=None,\n        api_key=None,\n        model=None,\n        rate_limit=-1,\n        temperature=None,\n        **kwargs,\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.base_url = base_url\n        self.api_key = api_key\n        
self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n        self.temperature = temperature\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"OPENROUTER_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named OPENROUTER_API_KEY\"\n            )\n        base_url = self.base_url or os.getenv(\"OPEN_ROUTER_ENDPOINT_URL\")\n        if base_url is None:\n            raise ValueError(\n                \"An endpoint URL needs to be provided in either the endpoint_url parameter or as an environment variable named OPEN_ROUTER_ENDPOINT_URL\"\n            )\n        if not self.llm_client:\n            self.llm_client = OpenAI(base_url=base_url, api_key=api_key)\n        # Use self.temperature if set, otherwise use the temperature argument\n        temp = self.temperature if self.temperature is not None else temperature\n        return (\n            self.llm_client.chat.completions.create(\n                model=self.model,\n                messages=messages,\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temp,\n                **kwargs,\n            )\n            .choices[0]\n            .message.content\n        )\n\n\nclass LMMEngineAzureOpenAI(LMMEngine):\n    def __init__(\n        self,\n        base_url=None,\n        api_key=None,\n        azure_endpoint=None,\n        model=None,\n        api_version=None,\n        rate_limit=-1,\n        temperature=None,\n        **kwargs,\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.api_version = api_version\n        self.api_key = 
api_key\n        self.azure_endpoint = azure_endpoint\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n        self.cost = 0.0\n        self.temperature = temperature\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"AZURE_OPENAI_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"An API Key needs to be provided in either the api_key parameter or as an environment variable named AZURE_OPENAI_API_KEY\"\n            )\n        api_version = self.api_version or os.getenv(\"OPENAI_API_VERSION\")\n        if api_version is None:\n            raise ValueError(\n                \"api_version must be provided either as a parameter or as an environment variable named OPENAI_API_VERSION\"\n            )\n        azure_endpoint = self.azure_endpoint or os.getenv(\"AZURE_OPENAI_ENDPOINT\")\n        if azure_endpoint is None:\n            raise ValueError(\n                \"An Azure API endpoint needs to be provided in either the azure_endpoint parameter or as an environment variable named AZURE_OPENAI_ENDPOINT\"\n            )\n        if not self.llm_client:\n            self.llm_client = AzureOpenAI(\n                azure_endpoint=azure_endpoint,\n                api_key=api_key,\n                api_version=api_version,\n            )\n        # Use self.temperature if set, otherwise use the temperature argument\n        temp = self.temperature if self.temperature is not None else temperature\n        completion = self.llm_client.chat.completions.create(\n            model=self.model,\n            messages=messages,\n            max_tokens=max_new_tokens if max_new_tokens else 4096,\n            temperature=temp,\n            **kwargs,\n        )\n        total_tokens = 
completion.usage.total_tokens\n        self.cost += 0.02 * ((total_tokens + 500) / 1000)\n        return completion.choices[0].message.content\n\n\nclass LMMEnginevLLM(LMMEngine):\n    def __init__(\n        self,\n        base_url=None,\n        api_key=None,\n        model=None,\n        rate_limit=-1,\n        temperature=None,\n        **kwargs,\n    ):\n        assert model is not None, \"model must be provided\"\n        self.model = model\n        self.api_key = api_key\n        self.base_url = base_url\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n        self.temperature = temperature\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(\n        self,\n        messages,\n        temperature=0.0,\n        top_p=0.8,\n        repetition_penalty=1.05,\n        max_new_tokens=512,\n        **kwargs,\n    ):\n        api_key = self.api_key or os.getenv(\"vLLM_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"A vLLM API key needs to be provided in either the api_key parameter or as an environment variable named vLLM_API_KEY\"\n            )\n        base_url = self.base_url or os.getenv(\"vLLM_ENDPOINT_URL\")\n        if base_url is None:\n            raise ValueError(\n                \"An endpoint URL needs to be provided in either the endpoint_url parameter or as an environment variable named vLLM_ENDPOINT_URL\"\n            )\n        if not self.llm_client:\n            self.llm_client = OpenAI(base_url=base_url, api_key=api_key)\n        # Use self.temperature if set, otherwise use the temperature argument\n        temp = self.temperature if self.temperature is not None else temperature\n        completion = self.llm_client.chat.completions.create(\n            model=self.model,\n            messages=messages,\n            max_tokens=max_new_tokens if max_new_tokens 
else 4096,\n            temperature=temp,\n            top_p=top_p,\n            extra_body={\"repetition_penalty\": repetition_penalty},\n        )\n        return completion.choices[0].message.content\n\n\nclass LMMEngineHuggingFace(LMMEngine):\n    def __init__(self, base_url=None, api_key=None, rate_limit=-1, **kwargs):\n        self.base_url = base_url\n        self.api_key = api_key\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"HF_TOKEN\")\n        if api_key is None:\n            raise ValueError(\n                \"A HuggingFace token needs to be provided in either the api_key parameter or as an environment variable named HF_TOKEN\"\n            )\n        base_url = self.base_url or os.getenv(\"HF_ENDPOINT_URL\")\n        if base_url is None:\n            raise ValueError(\n                \"HuggingFace endpoint must be provided as base_url parameter or as an environment variable named HF_ENDPOINT_URL.\"\n            )\n        if not self.llm_client:\n            self.llm_client = OpenAI(base_url=base_url, api_key=api_key)\n        return (\n            self.llm_client.chat.completions.create(\n                model=\"tgi\",\n                messages=messages,\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temperature,\n                **kwargs,\n            )\n            .choices[0]\n            .message.content\n        )\n\n\nclass LMMEngineParasail(LMMEngine):\n    def __init__(\n        self, base_url=None, api_key=None, model=None, rate_limit=-1, **kwargs\n    ):\n        assert model is not None, \"Parasail model id must be provided\"\n        self.base_url = base_url\n        
self.model = model\n        self.api_key = api_key\n        self.request_interval = 0 if rate_limit == -1 else 60.0 / rate_limit\n        self.llm_client = None\n\n    @backoff.on_exception(\n        backoff.expo, (APIConnectionError, APIError, RateLimitError), max_time=60\n    )\n    def generate(self, messages, temperature=0.0, max_new_tokens=None, **kwargs):\n        api_key = self.api_key or os.getenv(\"PARASAIL_API_KEY\")\n        if api_key is None:\n            raise ValueError(\n                \"A Parasail API key needs to be provided in either the api_key parameter or as an environment variable named PARASAIL_API_KEY\"\n            )\n        base_url = self.base_url\n        if base_url is None:\n            raise ValueError(\n                \"Parasail endpoint must be provided as base_url parameter or as an environment variable named PARASAIL_ENDPOINT_URL\"\n            )\n        if not self.llm_client:\n            self.llm_client = OpenAI(\n                base_url=base_url if base_url else \"https://api.parasail.io/v1\",\n                api_key=api_key,\n            )\n        return (\n            self.llm_client.chat.completions.create(\n                model=self.model,\n                messages=messages,\n                max_tokens=max_new_tokens if max_new_tokens else 4096,\n                temperature=temperature,\n                **kwargs,\n            )\n            .choices[0]\n            .message.content\n        )\n"
  },
  {
    "path": "gui_agents/s3/core/mllm.py",
    "content": "import base64\n\nimport numpy as np\n\nfrom gui_agents.s3.core.engine import (\n    LMMEngineAnthropic,\n    LMMEngineAzureOpenAI,\n    LMMEngineHuggingFace,\n    LMMEngineOpenAI,\n    LMMEngineOpenRouter,\n    LMMEngineParasail,\n    LMMEnginevLLM,\n    LMMEngineGemini,\n)\n\n\nclass LMMAgent:\n    def __init__(self, engine_params=None, system_prompt=None, engine=None):\n        if engine is None:\n            if engine_params is not None:\n                engine_type = engine_params.get(\"engine_type\")\n                if engine_type == \"openai\":\n                    self.engine = LMMEngineOpenAI(**engine_params)\n                elif engine_type == \"anthropic\":\n                    self.engine = LMMEngineAnthropic(**engine_params)\n                elif engine_type == \"azure\":\n                    self.engine = LMMEngineAzureOpenAI(**engine_params)\n                elif engine_type == \"vllm\":\n                    self.engine = LMMEnginevLLM(**engine_params)\n                elif engine_type == \"huggingface\":\n                    self.engine = LMMEngineHuggingFace(**engine_params)\n                elif engine_type == \"gemini\":\n                    self.engine = LMMEngineGemini(**engine_params)\n                elif engine_type == \"open_router\":\n                    self.engine = LMMEngineOpenRouter(**engine_params)\n                elif engine_type == \"parasail\":\n                    self.engine = LMMEngineParasail(**engine_params)\n                else:\n                    raise ValueError(f\"engine_type '{engine_type}' is not supported\")\n            else:\n                raise ValueError(\"engine_params must be provided\")\n        else:\n            self.engine = engine\n\n        self.messages = []  # Empty messages\n\n        if system_prompt:\n            self.add_system_prompt(system_prompt)\n        else:\n            self.add_system_prompt(\"You are a helpful assistant.\")\n\n    def encode_image(self, 
image_content):\n        # if image_content is a path to an image file, check type of the image_content to verify\n        if isinstance(image_content, str):\n            with open(image_content, \"rb\") as image_file:\n                return base64.b64encode(image_file.read()).decode(\"utf-8\")\n        else:\n            return base64.b64encode(image_content).decode(\"utf-8\")\n\n    def reset(\n        self,\n    ):\n\n        self.messages = [\n            {\n                \"role\": \"system\",\n                \"content\": [{\"type\": \"text\", \"text\": self.system_prompt}],\n            }\n        ]\n\n    def add_system_prompt(self, system_prompt):\n        self.system_prompt = system_prompt\n        if len(self.messages) > 0:\n            self.messages[0] = {\n                \"role\": \"system\",\n                \"content\": [{\"type\": \"text\", \"text\": self.system_prompt}],\n            }\n        else:\n            self.messages.append(\n                {\n                    \"role\": \"system\",\n                    \"content\": [{\"type\": \"text\", \"text\": self.system_prompt}],\n                }\n            )\n\n    def remove_message_at(self, index):\n        \"\"\"Remove a message at a given index\"\"\"\n        if index < len(self.messages):\n            self.messages.pop(index)\n\n    def replace_message_at(\n        self, index, text_content, image_content=None, image_detail=\"high\"\n    ):\n        \"\"\"Replace a message at a given index\"\"\"\n        if index < len(self.messages):\n            self.messages[index] = {\n                \"role\": self.messages[index][\"role\"],\n                \"content\": [{\"type\": \"text\", \"text\": text_content}],\n            }\n            if image_content:\n                base64_image = self.encode_image(image_content)\n                self.messages[index][\"content\"].append(\n                    {\n                        \"type\": \"image_url\",\n                        \"image_url\": 
{\n                            \"url\": f\"data:image/png;base64,{base64_image}\",\n                            \"detail\": image_detail,\n                        },\n                    }\n                )\n\n    def add_message(\n        self,\n        text_content,\n        image_content=None,\n        role=None,\n        image_detail=\"high\",\n        put_text_last=False,\n    ):\n        \"\"\"Add a new message to the list of messages\"\"\"\n\n        # API-style inference from OpenAI and AzureOpenAI\n        if isinstance(\n            self.engine,\n            (\n                LMMEngineOpenAI,\n                LMMEngineAzureOpenAI,\n                LMMEngineHuggingFace,\n                LMMEngineGemini,\n                LMMEngineOpenRouter,\n                LMMEngineParasail,\n            ),\n        ):\n            # infer role from previous message\n            if role != \"user\":\n                if self.messages[-1][\"role\"] == \"system\":\n                    role = \"user\"\n                elif self.messages[-1][\"role\"] == \"user\":\n                    role = \"assistant\"\n                elif self.messages[-1][\"role\"] == \"assistant\":\n                    role = \"user\"\n\n            message = {\n                \"role\": role,\n                \"content\": [{\"type\": \"text\", \"text\": text_content}],\n            }\n\n            if isinstance(image_content, np.ndarray) or image_content:\n                # Check if image_content is a list or a single image\n                if isinstance(image_content, list):\n                    # If image_content is a list of images, loop through each image\n                    for image in image_content:\n                        base64_image = self.encode_image(image)\n                        message[\"content\"].append(\n                            {\n                                \"type\": \"image_url\",\n                                \"image_url\": {\n                                    
\"url\": f\"data:image/png;base64,{base64_image}\",\n                                    \"detail\": image_detail,\n                                },\n                            }\n                        )\n                else:\n                    # If image_content is a single image, handle it directly\n                    base64_image = self.encode_image(image_content)\n                    message[\"content\"].append(\n                        {\n                            \"type\": \"image_url\",\n                            \"image_url\": {\n                                \"url\": f\"data:image/png;base64,{base64_image}\",\n                                \"detail\": image_detail,\n                            },\n                        }\n                    )\n\n            # Rotate text to be the last message if desired\n            if put_text_last:\n                text_content = message[\"content\"].pop(0)\n                message[\"content\"].append(text_content)\n\n            self.messages.append(message)\n\n        # For API-style inference from Anthropic\n        elif isinstance(self.engine, LMMEngineAnthropic):\n            # infer role from previous message\n            if role != \"user\":\n                if self.messages[-1][\"role\"] == \"system\":\n                    role = \"user\"\n                elif self.messages[-1][\"role\"] == \"user\":\n                    role = \"assistant\"\n                elif self.messages[-1][\"role\"] == \"assistant\":\n                    role = \"user\"\n\n            message = {\n                \"role\": role,\n                \"content\": [{\"type\": \"text\", \"text\": text_content}],\n            }\n\n            if image_content:\n                # Check if image_content is a list or a single image\n                if isinstance(image_content, list):\n                    # If image_content is a list of images, loop through each image\n                    for image in image_content:\n             
           base64_image = self.encode_image(image)\n                        message[\"content\"].append(\n                            {\n                                \"type\": \"image\",\n                                \"source\": {\n                                    \"type\": \"base64\",\n                                    \"media_type\": \"image/png\",\n                                    \"data\": base64_image,\n                                },\n                            }\n                        )\n                else:\n                    # If image_content is a single image, handle it directly\n                    base64_image = self.encode_image(image_content)\n                    message[\"content\"].append(\n                        {\n                            \"type\": \"image\",\n                            \"source\": {\n                                \"type\": \"base64\",\n                                \"media_type\": \"image/png\",\n                                \"data\": base64_image,\n                            },\n                        }\n                    )\n            self.messages.append(message)\n\n        # Locally hosted vLLM model inference\n        elif isinstance(self.engine, LMMEnginevLLM):\n            # infer role from previous message\n            if role != \"user\":\n                if self.messages[-1][\"role\"] == \"system\":\n                    role = \"user\"\n                elif self.messages[-1][\"role\"] == \"user\":\n                    role = \"assistant\"\n                elif self.messages[-1][\"role\"] == \"assistant\":\n                    role = \"user\"\n\n            message = {\n                \"role\": role,\n                \"content\": [{\"type\": \"text\", \"text\": text_content}],\n            }\n\n            if image_content:\n                # Check if image_content is a list or a single image\n                if isinstance(image_content, list):\n                    # If 
image_content is a list of images, loop through each image\n                    for image in image_content:\n                        base64_image = self.encode_image(image)\n                        message[\"content\"].append(\n                            {\n                                \"type\": \"image_url\",\n                                \"image_url\": {\n                                    \"url\": f\"data:image;base64,{base64_image}\"\n                                },\n                            }\n                        )\n                else:\n                    # If image_content is a single image, handle it directly\n                    base64_image = self.encode_image(image_content)\n                    message[\"content\"].append(\n                        {\n                            \"type\": \"image_url\",\n                            \"image_url\": {\"url\": f\"data:image;base64,{base64_image}\"},\n                        }\n                    )\n\n            self.messages.append(message)\n        else:\n            raise ValueError(\"engine_type is not supported\")\n\n    def get_response(\n        self,\n        user_message=None,\n        messages=None,\n        temperature=0.0,\n        max_new_tokens=None,\n        use_thinking=False,\n        **kwargs,\n    ):\n        \"\"\"Generate the next response based on previous messages\"\"\"\n        if messages is None:\n            messages = self.messages\n        if user_message:\n            messages.append(\n                {\"role\": \"user\", \"content\": [{\"type\": \"text\", \"text\": user_message}]}\n            )\n\n        # Regular generation\n        if use_thinking:\n            return self.engine.generate_with_thinking(\n                messages,\n                temperature=temperature,\n                max_new_tokens=max_new_tokens,\n                **kwargs,\n            )\n\n        return self.engine.generate(\n            messages,\n            
temperature=temperature,\n            max_new_tokens=max_new_tokens,\n            **kwargs,\n        )\n"
  },
  {
    "path": "gui_agents/s3/core/module.py",
    "content": "from typing import Dict, Optional\nfrom gui_agents.s3.core.mllm import LMMAgent\n\n\nclass BaseModule:\n    def __init__(self, engine_params: Dict, platform: str):\n        self.engine_params = engine_params\n        self.platform = platform\n\n    def _create_agent(\n        self, system_prompt: str = None, engine_params: Optional[Dict] = None\n    ) -> LMMAgent:\n        \"\"\"Create a new LMMAgent instance\"\"\"\n        agent = LMMAgent(engine_params or self.engine_params)\n        if system_prompt:\n            agent.add_system_prompt(system_prompt)\n        return agent\n"
  },
  {
    "path": "gui_agents/s3/memory/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s3/memory/procedural_memory.py",
    "content": "import inspect\nimport textwrap\n\n\nclass PROCEDURAL_MEMORY:\n\n    FORMATTING_FEEDBACK_PROMPT = textwrap.dedent(\n        \"\"\"\n    Your previous response was not formatted correctly. You must respond again to replace your previous response. Do not make reference to this message while fixing the response. Please address the following issues below to improve the previous response:\n    FORMATTING_FEEDBACK\n    \"\"\"\n    )\n\n    @staticmethod\n    def construct_simple_worker_procedural_memory(agent_class, skipped_actions):\n        procedural_memory = textwrap.dedent(\n            f\"\"\"\\\n        You are an expert in graphical user interfaces and Python code. You are responsible for executing the task: `TASK_DESCRIPTION`.\n        You are working in CURRENT_OS.\n\n        # GUIDELINES\n\n        ## Agent Usage Guidelines\n        You have access to both GUI and code agents. Choose the appropriate agent based on the task requirements:\n\n        ### GUI Agent\n        - **Use for**: clicking, typing, navigation, file operations, tasks requiring specific application features, visual elements, interactive features, application UI, complex formatting, print/export settings, multi-step workflows, pivot tables, charts\n\n        ### Code Agent\n        You have access to a code agent that can execute Python/Bash code for complex tasks.\n\n        Use code agent for:\n        - **ALL spreadsheet calculations**: sums, totals, averages, formulas, data filling, missing value calculations\n        - **ALL data manipulation tasks**: including calculations, data processing (filtering, sorting, replacing, cleanup), bulk operations (filling or transforming ranges), formatting changes (number/date/currency formats, styles), and large-scale data entry or editing\n\n        **Usage Strategy**:\n        - **Full Task**: Use `agent.call_code_agent()` when the task involves ANY data manipulation, calculations, or bulk operations\n        - **Subtask**: Use 
`agent.call_code_agent(\"specific subtask\")` for focused data tasks\n        - **CRITICAL**: If calling the code agent for the full task, pass the original task instruction without rewording or modification\n\n        ### Code Agent Result Interpretation\n        - The code agent runs Python/Bash code in the background (up to 20 steps), independently performing tasks like file modification, package installation, or system operations.\n        - After execution, you receive a report with:\n            * Steps completed (actual steps run)\n            * Max steps (step budget)\n            * Completion reason: DONE (success), FAIL (gave up), or BUDGET_EXHAUSTED (used all steps)\n            * Summary of work done\n            * Full execution history\n        - Interpretation:\n            * DONE: The code agent finished before using all steps, believing the task was completed through code.\n            * FAIL: The code agent determined the task could not be completed by code and failed after trying.\n            * BUDGET_EXHAUSTED: The task required more steps than allowed by the step budget.\n\n        ### Code Agent Verification\n        - After the code agent modifies files, your job is to find and verify these files via GUI actions (e.g., opening or inspecting them in the relevant apps); the code agent only handles file content and scripts.\n        - ALWAYS verify code agent results with GUI actions before using agent.done(); NEVER trust code agent output alone. If verification or the code agent fails, use GUI actions to finish the task and only use agent.done() if results match expectations.\n        - **CRITICAL**: Files modified by code agent may not show changes in currently open applications - you MUST close and reopen the entire application. 
Reloading the page/file is insufficient.\n\n        # General Task Guidelines\n        - For formatting tasks, always use the code agent for proper formatting.\n        - **Never use the code agent for charts, graphs, pivot tables, or visual elements—always use the GUI for those.**\n        - If creating a new sheet with no name specified, use default sheet names (e.g., \"Sheet1\", \"Sheet2\", etc.).\n        - After opening or reopening applications, wait at least 3 seconds for full loading.\n        - Don't provide specific row/column numbers to the coding agent; let it infer the spreadsheet structure itself.\n\n        Never assume a task is done based on appearances-always ensure the specific requested action has been performed and verify the modification. If you haven't executed any actions, the task is not complete.\n\n        ### END OF GUIDELINES\n\n        You are provided with:\n        1. A screenshot of the current time step.\n        2. The history of your previous interactions with the UI.\n        3. Access to the following class and methods to interact with the UI:\n        class Agent:\n        \"\"\"\n        )\n\n        for attr_name in dir(agent_class):\n            if attr_name in skipped_actions:\n                continue\n\n            attr = getattr(agent_class, attr_name)\n            if callable(attr) and hasattr(attr, \"is_agent_action\"):\n                # Use inspect to get the full function signature\n                signature = inspect.signature(attr)\n                procedural_memory += f\"\"\"\n    def {attr_name}{signature}:\n    '''{attr.__doc__}'''\n        \"\"\"\n\n        procedural_memory += textwrap.dedent(\n            \"\"\"\n        Your response should be formatted like this:\n        (Previous action verification)\n        Carefully analyze based on the screenshot if the previous action was successful. 
If the previous action was not successful, provide a reason for the failure.\n\n        (Screenshot Analysis)\n        Closely examine and describe the current state of the desktop along with the currently open applications.\n\n        (Next Action)\n        Based on the current screenshot and the history of your previous interaction with the UI, decide on the next action in natural language to accomplish the given task.\n\n        (Grounded Action)\n        Translate the next action into code using the provided API methods. Format the code like this:\n        ```python\n        agent.click(\"The menu button at the top right of the window\", 1, \"left\")\n        ```\n        Note for the grounded action:\n        1. Only perform one action at a time.\n        2. Do not put anything other than python code in the block. You can only use one function call at a time. Do not put more than one function call in the block.\n        3. You must use only the available methods provided above to interact with the UI, do not invent new methods.\n        4. Only return one code block every time. There must be a single line of code in the code block.\n        5. Do not do anything other than the exact specified task. Return with `agent.done()` immediately after the subtask is completed or `agent.fail()` if it cannot be completed.\n        6. Whenever possible, your grounded action should use hot-keys with the agent.hotkey() action instead of clicking or dragging.\n        7. My computer's password is 'osworld-public-evaluation', feel free to use it when you need sudo rights.\n        8. Generate agent.fail() as your grounded action if you get exhaustively stuck on the task and believe it is impossible.\n        9. Generate agent.done() as your grounded action when you believe the task is fully complete.\n        10. Do not use the \"command\" + \"tab\" hotkey on MacOS.\n        11. Prefer hotkeys and application features over clicking on text elements when possible. 
Highlighting text is fine.\n        \"\"\"\n        )\n\n        return procedural_memory.strip()\n\n    # For reflection agent, post-action verification mainly for cycle detection\n    REFLECTION_ON_TRAJECTORY = textwrap.dedent(\n        \"\"\"\n    You are an expert computer use agent designed to reflect on the trajectory of a task and provide feedback on what has happened so far.\n    You have access to the Task Description and the Current Trajectory of another computer agent. The Current Trajectory is a sequence of a desktop image, chain-of-thought reasoning, and a desktop action for each time step. The last image is the screen's display after the last action.\n    \n    IMPORTANT: The system includes a code agent that can modify files and applications programmatically. When you see:\n    - Files with different content than expected\n    - Applications being closed and reopened\n    - Documents with fewer lines or modified content\n    These may be LEGITIMATE results of code agent execution, not errors or corruption.\n    \n    Your task is to generate a reflection. Your generated reflection must fall under one of the cases listed below:\n\n    Case 1. The trajectory is not going according to plan. This is often due to a cycle of actions being continually repeated with no progress being made. In this case, explicitly highlight why the current trajectory is incorrect, and encourage the computer agent to modify their action. However, DO NOT encourage a specific action in particular.\n    Case 2. The trajectory is going according to plan. In this case, simply tell the agent to continue proceeding as planned. DO NOT encourage a specific action in particular.\n    Case 3. You believe the current task has been completed. 
In this case, tell the agent that the task has been successfully completed.\n    \n    To be successful, you must follow the rules below:\n    - **Your output MUST be based on one of the case options above**.\n    - DO NOT suggest any specific future plans or actions. Your only goal is to provide a reflection, not an actual plan or action.\n    - Any response that falls under Case 1 should explain why the trajectory is not going according to plan. You should especially look out for cycles of actions that are continually repeated with no progress.\n    - Any response that falls under Case 2 should be concise, since you just need to affirm the agent to continue with the current trajectory.\n    - IMPORTANT: Do not assume file modifications or application restarts are errors - they may be legitimate code agent actions\n    - Consider whether observed changes align with the task requirements before determining if the trajectory is off-track\n    \"\"\"\n    )\n\n    PHRASE_TO_WORD_COORDS_PROMPT = textwrap.dedent(\n        \"\"\"\n    You are an expert in graphical user interfaces. Your task is to process a phrase of text, and identify the most relevant word on the computer screen.\n    You are provided with a phrase, a table with all the text on the screen, and a screenshot of the computer screen. You will identify the single word id that is best associated with the provided phrase.\n    This single word must be displayed on the computer screenshot, and its location on the screen should align with the provided phrase.\n    Each row in the text table provides 2 pieces of data in the following order. 1st is the unique word id. 2nd is the corresponding word.\n\n    To be successful, it is very important to follow all these rules:\n    1. First, think step by step and generate your reasoning about which word id to click on.\n    2. Then, output the unique word id. Remember, the word id is the 1st number in each row of the text table.\n    3. 
If there are multiple occurrences of the same word, use the surrounding context in the phrase to choose the correct one. Pay very close attention to punctuation and capitalization.\n\n    \"\"\"\n    )\n\n    CODE_AGENT_PROMPT = textwrap.dedent(\n        \"\"\"\\\n    You are a code execution agent with a limited step budget to complete tasks.\n\n    # Core Guidelines:\n    - Execute Python/Bash code step-by-step to progress toward the goal\n    - Use sudo with: \"echo osworld-public-evaluation | sudo -S [COMMANDS]\"\n    - Username: \"user\"\n    - Print results and handle errors appropriately\n    - Code execution may not show immediately on screen\n\n    # CRITICAL: Incremental Step-by-Step Approach\n    - Break down complex tasks into small, self-contained steps\n    - Each step should contain a single, focused code snippet that advances toward the goal\n    - Code from each step does NOT persist to the next step - write complete, standalone snippets\n    - Example workflow:\n        * Step 1: Write code to locate/find the target file\n        * Step 2: Write code to **THOROUGHLY** inspect/read the file contents\n        * Step 3: Write code to modify the file based on findings\n        * Step 4: Write code to verify the changes\n        - If verification fails (the modification did not work as intended), return to Step 3 and rewrite the modification code. 
Repeat until verification succeeds.\n    - Do NOT write entire scripts in one step - focus on one small task per step\n\n    # CRITICAL: Data Format Guidelines\n    - Store dates as proper date objects, not text strings\n    - Store numbers as numeric values, not formatted text with symbols\n    - Preserve data types for calculations and evaluations\n    - When applying data validation to spreadsheet columns, limit the range to only the rows containing actual data, not entire columns\n    - When creating cross-sheet references, use cell references (e.g., =Sheet1!A1) instead of manually typing values\n    - When asked to create a new sheet and no specific name is provided, default to the default sheet name (e.g., \"Sheet1\", \"Sheet2\", etc.)\n\n    # CRITICAL: File Modification Strategy\n    - ALWAYS prioritize modifying existing open files IN PLACE rather than creating new files\n    - The screenshot context shows which file is currently open and should be modified\n    - For open documents (LibreOffice .docx/.xlsx, text editors, etc.), modify the existing file directly\n    - Use appropriate libraries (python-docx, openpyxl, etc.) 
to modify files in place\n    - CRITICAL: When modifying files, perform COMPLETE OVERWRITES, not appends\n    - For documents: replace all paragraphs/sheets with new content\n    - For text files: write the complete new content, overwriting the old\n    - Only create new files when explicitly required by the task\n    - Verify your reasoning aligns with the user's intent for the open file\n\n    # CRITICAL: Thorough File Inspection Guidelines\n    - **ALWAYS inspect file contents AND data types before and after modifications**\n    - Check cell values, formats, data types, number formats, decimal separators, and formatting properties\n    - For spreadsheets: inspect cell values, number formats, date formats, currency formats, and cell properties\n    - For documents: inspect text content, formatting, styles, and structural elements\n    - Verify that modifications actually changed the intended properties (not just values)\n    - Compare before/after states to ensure changes were applied correctly\n\n    # CRITICAL: Code-Based Task Solving\n    - You are responsible for writing EXECUTABLE CODE to solve the task programmatically\n    - Write Python/Bash scripts that process, filter, transform, or manipulate the data as required\n\n    # CRITICAL: Preserve Document Structure and Formatting\n    - When modifying documents/spreadsheets, PRESERVE the original structure, headers, and formatting\n    - NEVER modify column headers, row headers, document titles, or sheet names unless explicitly requested\n    - Maintain fonts, colors, borders, cell formatting, paragraph styles, etc.\n    - Only change the content/data, not the structure or visual presentation\n    - Use libraries that support formatting preservation (python-docx, openpyxl, etc.)\n    - The goal is to keep the document looking exactly the same, just with different content\n    - **For column reordering**: Preserve table position - reorder columns within the table without shifting the table itself\n\n    # 
CRITICAL: Final Step Requirement\n    - At the final step before completing the task (the step before you return DONE), you MUST print out the contents of any files you modified\n    - Use appropriate commands to display the final state of modified files:\n        * For text files: `cat filename` or `head -n 50 filename` for large files\n        * For Python files: `cat filename.py`\n        * For configuration files: `cat filename.conf`\n        * For any other file type: use appropriate viewing commands\n    - This ensures the user can see exactly what changes were made to the files\n\n    # CRITICAL: Verification Instructions\n    - When you complete a task that modifies files, you MUST provide clear verification instructions\n    - Include specific details about what the GUI agent should check:\n        * Which files were modified and their expected final state\n        * What the content should look like (number of lines, key data points, etc.)\n        * How to verify the changes are correct\n        * Whether the task is complete or if additional GUI actions are needed\n    - This helps the GUI agent understand what to expect and how to verify your work correctly\n\n    # Response Format:\n    You MUST respond using exactly this format:\n\n    <thoughts>\n    Your step-by-step reasoning about what needs to be done and how to approach the current step.\n    </thoughts>\n\n    <answer>\n    Return EXACTLY ONE of the following options:\n\n    For Python code:\n    ```python\n    your_python_code_here\n    ```\n\n    For Bash commands:\n    ```bash\n    your_bash_commands_here\n    ```\n\n    For task completion:\n    DONE\n\n    For task failure:\n    FAIL\n    </answer>\n\n    # Technical Notes:\n    - Wrap code in ONE block, identify language (python/bash)\n    - Python code runs line-by-line in interactive terminal (no __main__)\n    - Install missing packages as needed\n    - Ignore \"sudo: /etc/sudoers.d is world writable\" error\n    - After in-place 
modifications, close/reopen files via GUI to show changes\n\n    Focus on progress within your step budget.\n    \"\"\"\n    )\n\n    CODE_SUMMARY_AGENT_PROMPT = textwrap.dedent(\n        \"\"\"\\\n    You are a code execution summarizer. Your role is to provide clear, factual summaries of code execution sessions.\n\n    Key responsibilities:\n    - Summarize the code logic and approach used at each step\n    - Describe the outputs and results produced by code execution\n    - Explain the progression of the solution approach\n    - Use neutral, objective language without making judgments about success or failure\n    - Focus on what was attempted and what resulted\n    - Keep summaries concise and well-structured\n\n    CRITICAL: Include verification instructions for the GUI agent\n    - If files were modified, provide specific verification guidance:\n      * What files were changed and their expected final state\n      * What the GUI agent should look for when verifying\n      * How to verify the changes are correct\n      * Whether the task appears complete or if additional GUI actions are needed\n    - This helps the GUI agent understand what to expect and verify your work properly\n\n    Always maintain a factual, non-judgmental tone.\n    \"\"\"\n    )\n\n    BEHAVIOR_NARRATOR_SYSTEM_PROMPT = textwrap.dedent(\n        \"\"\"\\\n    You are an expert in computer usage responsible for analyzing what happened after a computer action is taken. \n\n    **Reasoning Guidelines:**\n    You will analyze the before and after screenshots given an action and provide a clear summary of the changes observed. 
Some things to note:\n    - Pay attention to any circular visual markers that may suggest where clicks, mouse movements, or drags occurred.\n      - Clicks will be marked with a red circle and labeled Click\n      - Moving the mouse without clicking will be marked with a blue circle and labeled MoveTo\n      - Drag and drops will have an initial blue circle labeled MoveTo, a green circle labeled DragTo, and a green line connecting the two circles.\n    - If any mouse action occurred, the after screenshot will be accompanied with a zoomed-in view of the area around the action to help you see changes more clearly.\n      - This is intended to help with small details that are unclear in the full screenshot so make sure to refer to it.\n      - The after screenshot will have a bounding box around the zoomed-in area to help you locate it in the full screenshot.\n      - The zoomed-in view will be centered around the location of the mouse action (for drags, it will be centered around the DragTo location).\n    - Focus on the changes that were induced by the action, rather than irrelevant details (e.g. the time change in the system clock).\n      - The action will be represented as Pyautogui code which may include more than one interaction so be sure to account for all changes (since the after screenshot may not show all intermediate states).\n      - Note that even if the action is expected to cause a change, it may have not. 
Never assume that the action was successful without clear evidence in the screenshots.\n      - Do not rely on the coordinates of the action to determine what changed; always refer to the visual marker as the true location of the action.\n    - Your response will be used to caption the differences between before and after screenshots so they must be extremely precise.\n    - Make sure to include the <thoughts>...</thoughts> and <answer>...</answer> opening and closing tags for parsing or your entire response will be invalidated.\n    \n    Please format your response as follows below.\n    <thoughts>\n    [Your detailed reasoning about the before screenshot and any visual markers, the action being taken, and the changes in the after screenshot and zoomed-in view (if present).]\n    </thoughts>\n    <answer>\n    [An unordered list of the relevant changes induced by the action]\n    </answer>\n    \"\"\"\n    )\n\n    VLM_EVALUATOR_PROMPT_COMPARATIVE_BASELINE = textwrap.dedent(\n        \"\"\"\\\n    You are a meticulous and impartial evaluator, tasked with judging <NUMBER OF TRAJECTORIES> sequences of OS desktop actions to determine which one better completes the user's request. Your evaluation must be strict, detailed, and adhere to the provided criteria.\n\n    **User Request:** \n    <TASK_DESCRIPTION_INPUT>\n\n    **Judge Guidelines:**\n    These guidelines are to help you evaluate both sequences of actions. 
These are strict guidelines and should not be deviated from.\n    While judging:\n    Be thorough when aligning the agent's actions with the key constraints and following expected agent behaviors (if relevant).\n    The agent is always expected to complete the task; key constraints take precedence over these guidelines which act as tie breakers.\n    Always double-check the agent's calculations for accuracy.\n    Explicitly state which rows and columns must be selected.\n    Always verify that exact values match the user's request.\n    Pay particular attention that spreadsheet modifications do not deviate from the original user's formatting, layout, and ordering unless absolutely necessary.\n    \n    Expected agent behaviors:\n    The agent must map the user's request to the software's built-in features, not hacky methods.\n    The agent must return control with a clean desktop, closing any popups, tabs, toolbars, search bars, or other elements it opened that weren't originally there even if they are unobtrusive.\n    The agent must maintain the original format of the user's spreadsheet as closely as possible.\n    The agent must preserve the spreadsheet's layout, formatting, and row/column order, making changes only within existing cells without creating gaps or adding new columns unless required for essential changes.\n    The agent must close the settings tab on Chrome for changes to take effect.\n    The agent must prioritize the safest options whenever the user expresses safety concerns.\n    The agent must fully complete user requests, following flows to the end to save the user time.\n    The agent must fulfill the user's request on the website where the request originates, using other sites only if absolutely necessary.                                      \n    The agent must apply all relevant filters to fully satisfy the user's request. 
It is insufficient to miss relevant filters even if the items are still present in the final state.\n\n    **Reasoning Structure:**\n    1. **Evaluate both sequences of actions against relevant judge guidelines.** Explicitly list EACH AND EVERY judge guidelines, whether they apply, and, if so, verify that they were met, partially met, or not met at all for both sequences.\n    2. **Reason about the differences between the two sequences.** Consider which sequence better meets the judge guidelines. If they both meet the guidelines equally, consider which sequence is more efficient, effective, or cleaner.\n    3. **Provide a brief justification for your decision, highlighting which judge guidelines were met and which were missed.**\n\n    **Reasoning Guidelines:**\n    - You will be provided <NUMBER OF TRAJECTORIES> results, each result is in the form of initial_screenshot, final_screenshot.\n    - You **must** refer to final_screenshot to understand what has changed from initial_screenshot to final_screenshot. These facts are accurate; **Do not assume what has changed or likely changed.**\n    - You can cite facts during reasoning, e.g., Fact 2, Facts 1-2, but **must** refer to fact captions for accurate changes.\n    - You **must** explicitly write out all justifications\n    - You **must** enclose all reasoning in <thoughts> tags and the final answer in <answer> tags\n\n    - The user prefers that the agent communicates when it is impossible to proceed rather than attempting to complete the task incorrectly.\n    - If at least one trajectory is deemed impossible to proceed, it should be chosen if the other trajectory doesn't satisfy the request either.\n    - You **must** explicitly state when either trajectory was deemed impossible to proceed.\n    - You **must** explicitly write out all reasoning and justifications\n\n    Which sequence of actions better completes the user request OR correctly notes the request is impossible? 
Please provide your evaluation in the following format:\n    <thoughts>\n    [Your reasoning doing a comprehensive comparison of the two sequences, strictly following the structure in Reasoning Structure, adhering to the Reasoning Guidelines, and using the Reasoning Format.]\n    </thoughts>\n    <answer>\n    [The index of the better sequence, a single integer from 1 to <NUMBER OF TRAJECTORIES>]\n    </answer>\n    \"\"\"\n    )\n"
  },
  {
    "path": "gui_agents/s3/utils/__init__.py",
    "content": ""
  },
  {
    "path": "gui_agents/s3/utils/common_utils.py",
    "content": "import re\nimport time\nfrom io import BytesIO\nfrom PIL import Image\n\nfrom typing import Tuple, Dict\n\nfrom gui_agents.s3.memory.procedural_memory import PROCEDURAL_MEMORY\n\nimport logging\n\nlogger = logging.getLogger(\"desktopenv.agent\")\n\n\ndef create_pyautogui_code(agent, code: str, obs: Dict) -> str:\n    \"\"\"\n    Attempts to evaluate the code into a pyautogui code snippet with grounded actions using the observation screenshot.\n\n    Args:\n        agent (ACI): The grounding agent to use for evaluation.\n        code (str): The code string to evaluate.\n        obs (Dict): The current observation containing the screenshot.\n\n    Returns:\n        exec_code (str): The pyautogui code to execute the grounded action.\n\n    Raises:\n        Exception: If there is an error in evaluating the code.\n    \"\"\"\n    agent.assign_screenshot(obs)  # Necessary for grounding\n    exec_code = eval(code)\n    return exec_code\n\n\ndef call_llm_safe(\n    agent, temperature: float = 0.0, use_thinking: bool = False, **kwargs\n) -> str:\n    # Retry if fails\n    max_retries = 3  # Set the maximum number of retries\n    attempt = 0\n    response = \"\"\n    while attempt < max_retries:\n        try:\n            response = agent.get_response(\n                temperature=temperature, use_thinking=use_thinking, **kwargs\n            )\n            assert response is not None, \"Response from agent should not be None\"\n            print(\"Response success!\")\n            break  # If successful, break out of the loop\n        except Exception as e:\n            attempt += 1\n            print(f\"Attempt {attempt} failed: {e}\")\n            if attempt == max_retries:\n                print(\"Max retries reached. 
Handling failure.\")\n        time.sleep(1.0)\n    return response if response is not None else \"\"\n\n\ndef call_llm_formatted(generator, format_checkers, **kwargs):\n    \"\"\"\n    Calls the generator agent's LLM and ensures correct formatting.\n\n    Args:\n        generator (ACI): The generator agent to call.\n        obs (Dict): The current observation containing the screenshot.\n        format_checkers (Callable): Functions that take the response and return a tuple of (success, feedback).\n        **kwargs: Additional keyword arguments for the LLM call.\n\n    Returns:\n        response (str): The formatted response from the generator agent.\n    \"\"\"\n    max_retries = 3  # Set the maximum number of retries\n    attempt = 0\n    response = \"\"\n    if kwargs.get(\"messages\") is None:\n        messages = (\n            generator.messages.copy()\n        )  # Copy messages to avoid modifying the original\n    else:\n        messages = kwargs[\"messages\"]\n        del kwargs[\"messages\"]  # Remove messages from kwargs to avoid passing it twice\n    while attempt < max_retries:\n        response = call_llm_safe(generator, messages=messages, **kwargs)\n\n        # Prepare feedback messages for incorrect formatting\n        feedback_msgs = []\n        for format_checker in format_checkers:\n            success, feedback = format_checker(response)\n            if not success:\n                feedback_msgs.append(feedback)\n        if not feedback_msgs:\n            # logger.info(f\"Response formatted correctly on attempt {attempt} for {generator.engine.model}\")\n            break\n        logger.error(\n            f\"Response formatting error on attempt {attempt} for {generator.engine.model}. 
Response: {response} {', '.join(feedback_msgs)}\"\n        )\n        messages.append(\n            {\n                \"role\": \"assistant\",\n                \"content\": [{\"type\": \"text\", \"text\": response}],\n            }\n        )\n        logger.info(f\"Bad response: {response}\")\n        delimiter = \"\\n- \"\n        formatting_feedback = f\"- {delimiter.join(feedback_msgs)}\"\n        messages.append(\n            {\n                \"role\": \"user\",\n                \"content\": [\n                    {\n                        \"type\": \"text\",\n                        \"text\": PROCEDURAL_MEMORY.FORMATTING_FEEDBACK_PROMPT.replace(\n                            \"FORMATTING_FEEDBACK\", formatting_feedback\n                        ),\n                    }\n                ],\n            }\n        )\n        logger.info(\"Feedback:\\n%s\", formatting_feedback)\n\n        attempt += 1\n        if attempt == max_retries:\n            logger.error(\n                \"Max retries reached when formatting response. 
Handling failure.\"\n            )\n        time.sleep(1.0)\n    return response\n\n\ndef split_thinking_response(full_response: str) -> Tuple[str, str]:\n    try:\n        # Extract thoughts section\n        thoughts = full_response.split(\"<thoughts>\")[-1].split(\"</thoughts>\")[0].strip()\n\n        # Extract answer section\n        answer = full_response.split(\"<answer>\")[-1].split(\"</answer>\")[0].strip()\n\n        return answer, thoughts\n    except Exception as e:\n        return full_response, \"\"\n\n\ndef parse_code_from_string(input_string):\n    \"\"\"Parses a string to extract each line of code enclosed in triple backticks (```)\n\n    Args:\n        input_string (str): The input string containing code snippets.\n\n    Returns:\n        str: The last code snippet found in the input string, or an empty string if no code is found.\n    \"\"\"\n    input_string = input_string.strip()\n\n    # This regular expression will match both ```code``` and ```python code```\n    # and capture the `code` part. 
It uses a non-greedy match for the content inside.\n    pattern = r\"```(?:\\w+\\s+)?(.*?)```\"\n\n    # Find all non-overlapping matches in the string\n    matches = re.findall(pattern, input_string, re.DOTALL)\n    if len(matches) == 0:\n        # return []\n        return \"\"\n    relevant_code = matches[\n        -1\n    ]  # We only care about the last match given it is the grounded action\n    return relevant_code\n\n\ndef extract_agent_functions(code):\n    \"\"\"Extracts all agent function calls from the given code.\n\n    Args:\n        code (str): The code string to search for agent function calls.\n\n    Returns:\n        list: A list of all agent function calls found in the code.\n    \"\"\"\n    pattern = r\"(agent\\.\\w+\\(\\s*.*\\))\"  # Matches\n    return re.findall(pattern, code)\n\n\ndef compress_image(image_bytes: bytes = None, image: Image = None) -> bytes:\n    \"\"\"Compresses an image represented as bytes.\n\n    Compression involves resizing image into half its original size and saving to webp format.\n\n    Args:\n        image_bytes (bytes): The image data to compress.\n\n    Returns:\n        bytes: The compressed image data.\n    \"\"\"\n    if not image:\n        image = Image.open(BytesIO(image_bytes))\n    output = BytesIO()\n    image.save(output, format=\"WEBP\")\n    compressed_image_bytes = output.getvalue()\n    return compressed_image_bytes\n"
  },
  {
    "path": "gui_agents/s3/utils/formatters.py",
    "content": "\"\"\"This file contains various formatting checks used to reprompt an agent for correctly formatted responses.\"\"\"\n\nfrom gui_agents.s3.utils.common_utils import (\n    extract_agent_functions,\n    parse_code_from_string,\n    create_pyautogui_code,\n    split_thinking_response,\n)\n\nsingle_action_check = (\n    lambda response: len(extract_agent_functions(parse_code_from_string(response))) == 1\n)\nsingle_action_error_msg = (\n    \"Incorrect code: There must be a single agent action in the code response.\"\n)\nSINGLE_ACTION_FORMATTER = lambda response: (\n    single_action_check(response),\n    single_action_error_msg,\n)\n\n\ndef _attempt_code_creation(agent, code, obs):\n    \"\"\"Attempts to create a pyautogui code snippet from the response code\"\"\"\n    try:\n        return create_pyautogui_code(agent, code, obs)\n    except Exception as e:\n        return None\n\n\ncode_valid_check = (\n    lambda agent, obs, response: _attempt_code_creation(\n        agent, parse_code_from_string(response), obs\n    )\n    is not None\n)\ncode_valid_error_msg = \"Incorrect code: The agent action must be a valid function and use valid parameters from the docstring list.\"\nCODE_VALID_FORMATTER = lambda agent, obs, response: (\n    code_valid_check(agent, obs, response),\n    code_valid_error_msg,\n)\n\nthoughts_answer_tag_check = lambda response: split_thinking_response(response)[1] != \"\"\nthoughts_answer_tag_error_msg = \"Incorrect response: The response must contain both <thoughts>...</thoughts> and <answer>...</answer> tags.\"\nTHOUGHTS_ANSWER_TAG_FORMATTER = lambda response: (\n    thoughts_answer_tag_check(response),\n    thoughts_answer_tag_error_msg,\n)\n\ninteger_answer_check = (\n    lambda response: split_thinking_response(response)[0].strip().isdigit()\n)\ninteger_answer_error_msg = (\n    \"Incorrect response: The <answer>...</answer> tag must contain a single integer.\"\n)\nINTEGER_ANSWER_FORMATTER = lambda response: (\n    
integer_answer_check(response),\n    integer_answer_error_msg,\n)\n"
  },
  {
    "path": "gui_agents/s3/utils/local_env.py",
    "content": "import subprocess\nimport sys\nfrom typing import Dict\n\n\nclass LocalController:\n    \"\"\"Minimal controller to execute bash and python code locally.\n\n    WARNING: Executing arbitrary code is dangerous. Only enable/use this in trusted\n    environments and with trusted inputs.\n    \"\"\"\n\n    def run_bash_script(self, code: str, timeout: int = 30) -> Dict:\n        try:\n            proc = subprocess.run(\n                [\"/bin/bash\", \"-lc\", code],\n                capture_output=True,\n                text=True,\n                timeout=timeout,\n            )\n            output = (proc.stdout or \"\") + (proc.stderr or \"\")\n\n            print(\"BASH OUTPUT =======================================\")\n            print(output)\n            print(\"BASH OUTPUT =======================================\")\n\n            return {\n                \"status\": \"ok\" if proc.returncode == 0 else \"error\",\n                \"returncode\": proc.returncode,\n                \"output\": output,\n                \"error\": \"\",\n            }\n        except subprocess.TimeoutExpired as e:\n            return {\n                \"status\": \"error\",\n                \"returncode\": -1,\n                \"output\": e.stdout or \"\",\n                \"error\": f\"TimeoutExpired: {str(e)}\",\n            }\n        except Exception as e:\n            return {\n                \"status\": \"error\",\n                \"returncode\": -1,\n                \"output\": \"\",\n                \"error\": str(e),\n            }\n\n    def run_python_script(self, code: str) -> Dict:\n        try:\n            proc = subprocess.run(\n                [sys.executable, \"-c\", code],\n                capture_output=True,\n                text=True,\n            )\n            print(\"PYTHON OUTPUT =======================================\")\n            print(proc.stdout or \"\")\n            print(\"PYTHON OUTPUT 
=======================================\")\n            return {\n                \"status\": \"ok\" if proc.returncode == 0 else \"error\",\n                \"return_code\": proc.returncode,\n                \"output\": proc.stdout or \"\",\n                \"error\": proc.stderr or \"\",\n            }\n        except Exception as e:\n            return {\n                \"status\": \"error\",\n                \"return_code\": -1,\n                \"output\": \"\",\n                \"error\": str(e),\n            }\n\n\nclass LocalEnv:\n    \"\"\"Simple environment that provides a controller compatible with CodeAgent.\"\"\"\n\n    def __init__(self):\n        self.controller = LocalController()\n"
  },
  {
    "path": "gui_agents/utils.py",
    "content": "\"\"\"General utility.\"\"\"\n\nimport platform\nimport requests\nimport zipfile\nimport io\nimport os\n\n\ndef download_kb_data(\n    version=\"s2\",\n    release_tag=\"v0.2.2\",\n    download_dir=\"kb_data\",\n    platform=platform.system().lower(),\n):\n    \"\"\"Download and extract the appropriate KB ZIP file for the current OS.\n\n    Args:\n        version (str): Prefix in the asset name (e.g., \"s1\" or \"s2\")\n        release_tag (str): Tag of the release that has the assets (e.g., \"v0.2.2\")\n        download_dir (str): Where to extract the downloaded files\n        platform (str): OS (e.g., \"windows\", \"darwin\", \"linux\")\n    \"\"\"\n    # Detect OS\n    if platform not in [\"windows\", \"darwin\", \"linux\"]:\n        raise RuntimeError(f\"Unsupported OS: {platform}\")\n\n    # Build asset filename, e.g. \"s1_windows.zip\" or \"s1_darwin.zip\"\n    asset_name = f\"{version}_{platform}.zip\"\n\n    download_url = f\"https://github.com/simular-ai/Agent-S/releases/download/{release_tag}/{asset_name}\"\n\n    # Make sure our output directory exists\n    os.makedirs(download_dir, exist_ok=True)\n\n    print(f\"Downloading {asset_name} from {download_url} ...\")\n    response = requests.get(download_url)\n    if response.status_code != 200:\n        raise RuntimeError(\n            f\"Failed to download {asset_name}. \"\n            f\"HTTP status: {response.status_code} - {response.reason}\"\n        )\n\n    # Extract the ZIP in-memory\n    zip_data = io.BytesIO(response.content)\n    with zipfile.ZipFile(zip_data, \"r\") as zip_ref:\n        zip_ref.extractall(download_dir)\n\n    print(f\"Extracted {asset_name} to ./{download_dir}\")\n"
  },
  {
    "path": "integrations/openclaw/README.md",
    "content": "# Agent-S OpenClaw Integration\n\nThis integration enables [OpenClaw](https://github.com/openclaw/openclaw) to use [Agent-S](https://github.com/simular-ai/Agent-S) for autonomous GUI automation tasks.\n\n## Overview\n\nAgent-S is a powerful autonomous agent that can control your computer's graphical interface to complete complex tasks. This integration provides a simple wrapper that allows OpenClaw agents to invoke Agent-S for GUI automation.\n\n## Prerequisites\n\n### Required Software\n\n1. **Agent-S**: Install the gui-agents package\n   ```bash\n   pip install gui-agents\n   ```\n\n2. **Tesseract**: Required for OCR functionality\n   ```bash\n   brew install tesseract  # macOS\n   # or\n   sudo apt install tesseract-ocr  # Linux\n   ```\n\n3. **OpenClaw**: This integration is designed to work with OpenClaw\n\n### Required Environment Variables\n\nYou need at least one API key for your chosen provider:\n\n- **`ANTHROPIC_API_KEY`**: For Claude models (Anthropic provider)\n  ```bash\n  export ANTHROPIC_API_KEY=\"your-api-key-here\"\n  ```\n\n- **`OPENAI_API_KEY`**: For GPT models (OpenAI provider)\n  ```bash\n  export OPENAI_API_KEY=\"your-api-key-here\"\n  ```\n\n- **`GEMINI_API_KEY`**: For Gemini models (Google provider)\n  ```bash\n  export GEMINI_API_KEY=\"your-api-key-here\"\n  ```\n\nBy default, the wrapper uses Anthropic's Claude Sonnet 4.5. You can modify `agent_s_wrapper.py` to use a different provider and model.\n\n### Grounding Model Configuration (Required)\n\nAgent-S requires a grounding model for visual element detection. 
We recommend [UI-TARS-1.5-7B](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B):\n\n- **`AGENT_S_GROUND_URL`** (Required): Grounding model endpoint URL\n- **`AGENT_S_GROUND_MODEL`** (Required): Model name (default: \"ui-tars-1.5-7b\")\n- **`AGENT_S_GROUNDING_WIDTH`** (Required): Output coordinate width (default: \"1920\")\n- **`AGENT_S_GROUNDING_HEIGHT`** (Required): Output coordinate height (default: \"1080\")\n- **`AGENT_S_GROUND_API_KEY`** (Optional): API key for grounding endpoint\n\nExample configuration:\n```bash\nexport AGENT_S_GROUND_URL=\"http://localhost:8080\"\nexport AGENT_S_GROUND_API_KEY=\"your-grounding-api-key\"\nexport AGENT_S_GROUND_MODEL=\"ui-tars-1.5-7b\"\nexport AGENT_S_GROUNDING_WIDTH=\"1920\"\nexport AGENT_S_GROUNDING_HEIGHT=\"1080\"\n```\n\nSee the [Agent-S documentation](https://github.com/simular-ai/Agent-S#grounding-models-required) for details on setting up grounding models.\n\n## Installation\n\n1. **Clone or copy this directory** to your OpenClaw skills folder:\n   ```bash\n   cp -r integrations/openclaw ~/.openclaw/workspace/skills/agent-s\n   ```\n\n2. **Make scripts executable**:\n   ```bash\n   chmod +x ~/.openclaw/workspace/skills/agent-s/agent_s_task\n   chmod +x ~/.openclaw/workspace/skills/agent-s/agent_s_wrapper.py\n   ```\n\n3. 
**Verify installation**:\n   ```bash\n   which agent_s\n   # Should show the path to agent_s executable\n   ```\n\n## Usage\n\n### From OpenClaw Agent\n\nThe OpenClaw agent can invoke Agent-S by reading the SKILL.md file and using the bash tool:\n\n```bash\n~/.openclaw/workspace/skills/agent-s/agent_s_task \"Open Safari and go to google.com\"\n```\n\n### From Command Line\n\nYou can test the integration directly:\n\n```bash\n# Basic usage\n./agent_s_task \"Open System Preferences\"\n\n# Using the Python wrapper with options\n./agent_s_wrapper.py \"Open TextEdit and type Hello World\" --max-steps 10 --json\n```\n\n### Advanced Options\n\n```bash\n# Custom max steps\n./agent_s_wrapper.py \"complex task\" --max-steps 30\n\n# Disable reflection (faster but less accurate)\n./agent_s_wrapper.py \"simple task\" --no-reflection\n\n# Enable local code environment (WARNING: executes arbitrary code)\n./agent_s_wrapper.py \"task requiring code execution\" --enable-local-env\n\n# JSON output (for programmatic use)\n./agent_s_wrapper.py \"task\" --json\n```\n\n## Testing\n\n### Quick Test\n\nVerify the integration works:\n\n```bash\n# Test 1: Check help\n./agent_s_wrapper.py --help\n\n# Test 2: Simple task (will actually execute)\n./agent_s_task \"Open Calculator\"\n```\n\n### Testing with OpenClaw Agent\n\n1. **Start OpenClaw**:\n   ```bash\n   openclaw\n   ```\n\n2. **Ask your agent** to use Agent-S:\n   - \"Can you use Agent-S to open the Calculator app?\"\n   - \"I need you to use the Agent-S skill to open Safari and navigate to github.com\"\n   - \"Read the Agent-S skill documentation and then use it to open System Preferences\"\n\n3. 
**Expected behavior**:\n   - Agent reads `SKILL.md` in the skills directory\n   - Agent executes `agent_s_task` command via bash tool\n   - Agent-S launches and completes the GUI task\n   - Results are returned to OpenClaw agent\n\n### Verification Checklist\n\n- [ ] `agent_s` executable is in PATH\n- [ ] `ANTHROPIC_API_KEY` is set\n- [ ] `AGENT_S_GROUND_URL` is set (grounding model endpoint)\n- [ ] Scripts are executable\n- [ ] OpenClaw agent can read skill files\n- [ ] Test task executes successfully\n\n## Configuration\n\nAll configuration is done via environment variables (see Prerequisites section above).\n\n### Customizing the Provider and Model\n\nBy default, the wrapper uses Anthropic's Claude Sonnet 4.5. To use a different provider or model, modify the `agent_s_wrapper.py` file:\n\n```python\n# For OpenAI\ncmd = [\n    agent_s_path,\n    \"--provider\", \"openai\",\n    \"--model\", \"gpt-5-2025-08-07\",  # or other OpenAI models\n    ...\n]\n\n# For Gemini\ncmd = [\n    agent_s_path,\n    \"--provider\", \"gemini\",\n    \"--model\", \"gemini-2.0-flash-exp\",  # or other Gemini models\n    ...\n]\n```\n\nSee the [Agent-S models documentation](https://github.com/simular-ai/Agent-S/blob/main/models.md) for all supported providers and models.\n\n### Logs\n\nAgent-S logs are stored in: `~/workspace/Agent-S/logs/`\n\nCheck these logs if something goes wrong:\n```bash\nls -lt ~/workspace/Agent-S/logs/ | head -5\ntail -f ~/workspace/Agent-S/logs/debug-*.log\n```\n\n## Safety\n\n- Agent-S has full GUI control access\n- Only use for trusted automation tasks\n- All actions are logged\n- Can be paused with Ctrl+C and resumed with Esc\n- Timeout: 10 minutes per task by default\n\n## Troubleshooting\n\n### Agent-S not found\n\nCheck that agent_s is in your PATH:\n```bash\nwhich agent_s\n```\n\nIf not found, install gui-agents:\n```bash\npip install gui-agents\n```\n\n### Permission errors\n\nEnsure scripts are executable:\n```bash\nchmod +x ./agent_s_task\nchmod +x 
./agent_s_wrapper.py\n```\n\n### API errors\n\nCheck that your API key is set for your chosen provider:\n```bash\n# For Anthropic (default)\necho $ANTHROPIC_API_KEY\n\n# For OpenAI\necho $OPENAI_API_KEY\n\n# For Gemini\necho $GEMINI_API_KEY\n```\n\nIf empty, add it to your shell profile (`~/.zshrc` or `~/.bashrc`):\n```bash\nexport ANTHROPIC_API_KEY=\"your-key-here\"\n# or\nexport OPENAI_API_KEY=\"your-key-here\"\n# or\nexport GEMINI_API_KEY=\"your-key-here\"\n\nsource ~/.zshrc  # or ~/.bashrc\n```\n\n### Task failures\n\n1. Check the logs in `~/workspace/Agent-S/logs/` for detailed error messages\n2. Verify grounding configuration if using custom endpoint\n3. Ensure task description is clear and specific\n4. Try with `--no-reflection` for simpler tasks\n\n### Grounding model issues\n\nIf you see errors about grounding:\n- Verify `AGENT_S_GROUND_URL` is accessible\n- Check `AGENT_S_GROUND_API_KEY` is correct\n- Ensure grounding dimensions match your model's output resolution\n\n## Files\n\n- **`README.md`** - This file\n- **`SKILL.md`** - Skill documentation for the OpenClaw agent\n- **`agent_s_wrapper.py`** - Python wrapper for invoking Agent-S\n- **`agent_s_task`** - Simple bash entry point for task execution\n\n## Support\n\n- **Agent-S**: https://github.com/simular-ai/Agent-S\n- **OpenClaw**: https://github.com/openclaw/openclaw\n- **Report issues**: Use the Agent-S repository issue tracker for integration-specific issues\n\n## License\n\nThis integration follows the same license as Agent-S. See the main repository for details.\n"
  },
  {
    "path": "integrations/openclaw/SKILL.md",
    "content": "# Agent-S - Autonomous GUI Agent\n\nAgent-S is a powerful autonomous agent that can control your computer's graphical interface to complete complex tasks. It combines vision and action understanding to interact with any GUI element.\n\n## What It Does\n\nAgent-S can:\n- Navigate and interact with desktop applications\n- Fill forms, click buttons, and manipulate GUI elements\n- Complete multi-step workflows across different applications\n- Take screenshots and understand visual interfaces\n- Execute complex GUI automation tasks autonomously\n\n## When to Use\n\nUse Agent-S when you need to:\n- Automate GUI-based tasks that don't have CLI alternatives\n- Interact with desktop applications programmatically\n- Complete workflows that require visual understanding\n- Perform actions across multiple applications\n- Test GUI interfaces\n\n## How to Invoke\n\nCall the Agent-S wrapper via bash from the OpenClaw skills directory:\n\n```bash\n./agent_s_task \"task description\"\n```\n\nOr if installed in the default OpenClaw skills location:\n\n```bash\n~/.openclaw/workspace/skills/agent-s/agent_s_task \"task description\"\n```\n\n**Note**: Agent-S tasks can take 2-5 minutes to complete (up to 15 steps by default). 
The wrapper will wait for completion.\n\n## Parameters\n\n- `task` (required): Natural language description of the GUI task to complete\n- `max_steps` (optional): Maximum steps the agent can take (default: 15)\n- `enable_reflection` (optional): Enable self-reflection for better performance (default: true)\n\n## Examples\n\n```python\n# Basic navigation\nagent_s_task(task=\"Open Finder and create a new folder called 'Reports'\")\n\n# Form filling\nagent_s_task(task=\"Open TextEdit, create a new document, and type 'Hello World'\")\n\n# Multi-step workflows\nagent_s_task(task=\"Open Chrome, search for 'Python tutorials', and bookmark the first result\")\n\n# Application interaction\nagent_s_task(task=\"Open System Preferences and check the current display resolution\")\n```\n\n## Technical Details\n\nAgent-S uses:\n- **Main Model**: Claude Sonnet 4.5 for reasoning and planning\n- **Grounding Model**: UI-TARS-1.5-7B for visual grounding and coordinate extraction\n- **Screen Resolution**: Automatically scaled to 2400px max dimension\n- **Platform Support**: macOS, Linux, Windows\n\n## Safety\n\n- Agent-S has full GUI control - only use for trusted tasks\n- The agent will pause on Ctrl+C and can be resumed with Esc\n- Each action is logged to `~/workspace/Agent-S/logs/`\n- Tasks timeout after 15 steps by default\n\n## Configuration\n\nAgent-S requires configuration via environment variables:\n\n**Required:**\n- `ANTHROPIC_API_KEY`: API key for Claude model\n- `AGENT_S_GROUND_URL`: Grounding model endpoint URL\n- `AGENT_S_GROUND_MODEL`: Grounding model name (default: ui-tars-1.5-7b)\n- `AGENT_S_GROUNDING_WIDTH`: Output width (default: 1920)\n- `AGENT_S_GROUNDING_HEIGHT`: Output height (default: 1080)\n\n**Optional:**\n- `AGENT_S_GROUND_API_KEY`: API key for grounding endpoint\n\nSee the README.md in this directory for detailed setup instructions.\n\n## Limitations\n\n- Cannot interact with system-level dialogs requiring admin approval\n- Performance depends on screen 
resolution and GUI complexity\n- Some applications may have accessibility restrictions\n- Voice/audio commands are not supported\n\n## Source\n\nAgent-S GitHub: https://github.com/simular-ai/Agent-S\nInstallation: `pip install gui-agents`\n"
  },
  {
    "path": "integrations/openclaw/agent_s_task",
    "content": "#!/bin/bash\n# Agent-S Task Executor for OpenClaw\n# Usage: agent_s_task \"task description\"\n\nTASK=\"$1\"\n\nif [ -z \"$TASK\" ]; then\n    echo \"Error: Task description required\"\n    echo \"Usage: agent_s_task \\\"task description\\\"\"\n    exit 1\nfi\n\n# Execute the Python wrapper using relative path\nSCRIPT_DIR=\"$(cd \"$(dirname \"$0\")\" && pwd)\"\nexec \"$SCRIPT_DIR/agent_s_wrapper.py\" \"$TASK\"\n"
  },
  {
    "path": "integrations/openclaw/agent_s_wrapper.py",
    "content": "#!/usr/bin/env python3\n\"\"\"\nAgent-S Wrapper for OpenClaw Integration\n\nThis script provides a simple interface for OpenClaw to invoke Agent-S\nfor GUI automation tasks.\n\"\"\"\n\nimport argparse\nimport json\nimport subprocess\nimport sys\nimport os\nimport shutil\n\n\ndef run_agent_s(task, max_steps=15, enable_reflection=True, enable_local_env=False):\n    \"\"\"\n    Execute an Agent-S task and return the result.\n\n    Args:\n        task: Natural language task description\n        max_steps: Maximum number of steps (default: 15)\n        enable_reflection: Enable reflection agent (default: True)\n        enable_local_env: Enable local code execution (default: False, WARNING: executes arbitrary code)\n\n    Returns:\n        Dictionary with status and message\n    \"\"\"\n\n    # Path to agent_s executable - auto-detect or use environment variable\n    agent_s_path = os.environ.get(\"AGENT_S_PATH\") or shutil.which(\"agent_s\")\n    if not agent_s_path:\n        return {\n            \"status\": \"error\",\n            \"message\": \"agent_s not found in PATH. 
Install with: pip install gui-agents\",\n            \"error\": \"agent_s executable not found\"\n        }\n\n    # Build base command\n    cmd = [\n        agent_s_path,\n        \"--provider\", \"anthropic\",\n        \"--model\", \"claude-sonnet-4-5\",\n        \"--model_temperature\", \"1.0\",\n        \"--max_trajectory_length\", str(max_steps),\n        \"--task\", task,\n    ]\n    \n    # Add optional grounding configuration from environment variables\n    ground_url = os.environ.get(\"AGENT_S_GROUND_URL\")\n    ground_api_key = os.environ.get(\"AGENT_S_GROUND_API_KEY\")\n    ground_model = os.environ.get(\"AGENT_S_GROUND_MODEL\", \"ui-tars-1.5-7b\")\n    grounding_width = os.environ.get(\"AGENT_S_GROUNDING_WIDTH\", \"1920\")\n    grounding_height = os.environ.get(\"AGENT_S_GROUNDING_HEIGHT\", \"1080\")\n    \n    if ground_url:\n        cmd.extend([\"--ground_provider\", \"huggingface\"])\n        cmd.extend([\"--ground_url\", ground_url])\n        cmd.extend([\"--ground_model\", ground_model])\n        cmd.extend([\"--grounding_width\", grounding_width])\n        cmd.extend([\"--grounding_height\", grounding_height])\n        if ground_api_key:\n            cmd.extend([\"--ground_api_key\", ground_api_key])\n\n    if enable_reflection:\n        cmd.append(\"--enable_reflection\")\n\n    if enable_local_env:\n        cmd.append(\"--enable_local_env\")\n\n    try:\n        # Run Agent-S\n        print(f\"Starting Agent-S with task: {task}\", file=sys.stderr)\n        print(f\"Command: {' '.join(cmd)}\", file=sys.stderr)\n\n        # Agent-S can take 2-5 minutes for complex tasks (15 steps max)\n        # Don't capture output - let it stream to allow real-time GUI interaction\n        result = subprocess.run(\n            cmd,\n            capture_output=False,  # Changed: let output stream\n            text=True,\n            timeout=600  # 10 minute timeout\n        )\n\n        if result.returncode == 0:\n            return {\n                \"status\": 
\"success\",\n                \"message\": f\"Agent-S completed the task: {task}\",\n                \"logs_directory\": os.path.expanduser(\"~/workspace/Agent-S/logs/\"),\n                \"note\": \"Output was streamed to terminal. Check logs for details.\"\n            }\n        else:\n            return {\n                \"status\": \"error\",\n                \"message\": f\"Agent-S failed with return code {result.returncode}\",\n                \"logs_directory\": os.path.expanduser(\"~/workspace/Agent-S/logs/\"),\n                \"note\": \"Check logs for error details.\"\n            }\n\n    except subprocess.TimeoutExpired:\n        return {\n            \"status\": \"error\",\n            \"message\": f\"Agent-S timed out after 10 minutes for task: {task}\",\n            \"error\": \"Timeout expired\"\n        }\n\n    except Exception as e:\n        return {\n            \"status\": \"error\",\n            \"message\": f\"Failed to execute Agent-S: {str(e)}\",\n            \"error\": str(e)\n        }\n\n\ndef main():\n    parser = argparse.ArgumentParser(\n        description=\"OpenClaw wrapper for Agent-S GUI automation\"\n    )\n    parser.add_argument(\n        \"task\",\n        type=str,\n        help=\"Natural language description of the GUI task to perform\"\n    )\n    parser.add_argument(\n        \"--max-steps\",\n        type=int,\n        default=15,\n        help=\"Maximum number of agent steps (default: 15)\"\n    )\n    parser.add_argument(\n        \"--enable-reflection\",\n        action=\"store_true\",\n        default=True,\n        help=\"Enable reflection agent for better performance\"\n    )\n    parser.add_argument(\n        \"--no-reflection\",\n        action=\"store_false\",\n        dest=\"enable_reflection\",\n        help=\"Disable reflection agent\"\n    )\n    parser.add_argument(\n        \"--enable-local-env\",\n        action=\"store_true\",\n        default=False,\n        help=\"Enable local code execution 
(WARNING: executes arbitrary code)\"\n    )\n    parser.add_argument(\n        \"--json\",\n        action=\"store_true\",\n        help=\"Output result as JSON\"\n    )\n\n    args = parser.parse_args()\n\n    # Execute Agent-S task\n    result = run_agent_s(\n        task=args.task,\n        max_steps=args.max_steps,\n        enable_reflection=args.enable_reflection,\n        enable_local_env=args.enable_local_env\n    )\n\n    # Output result\n    if args.json:\n        print(json.dumps(result, indent=2))\n    else:\n        if result[\"status\"] == \"success\":\n            print(f\"✓ {result['message']}\")\n            if result.get(\"output\"):\n                print(f\"\\nOutput:\\n{result['output']}\")\n        else:\n            print(f\"✗ {result['message']}\")\n            if result.get(\"error\"):\n                print(f\"\\nError:\\n{result['error']}\", file=sys.stderr)\n            sys.exit(1)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "models.md",
    "content": "We support the following APIs for MLLM inference: OpenAI, Anthropic, Gemini, Azure OpenAI, vLLM for local models, and Open Router. To use these APIs, you need to set the corresponding environment variables:\n\n1. OpenAI\n\n```\nexport OPENAI_API_KEY=<YOUR_API_KEY>\n```\n\n2. Anthropic\n\n```\nexport ANTHROPIC_API_KEY=<YOUR_API_KEY>\n```\n\n3. Gemini\n\n```\nexport GEMINI_API_KEY=<YOUR_API_KEY>\nexport GEMINI_ENDPOINT_URL=\"https://generativelanguage.googleapis.com/v1beta/openai/\"\n```\n\n4. OpenAI on Azure\n\n```\nexport AZURE_OPENAI_API_BASE=<DEPLOYMENT_NAME>\nexport AZURE_OPENAI_API_KEY=<YOUR_API_KEY>\n```\n\n5. vLLM for Local Models\n\n```\nexport vLLM_ENDPOINT_URL=<YOUR_DEPLOYMENT_URL>\n```\n\nAlternatively, you can directly pass the API keys into the engine_params argument while instantiating the agent.\n\n6. Open Router\n\n```\nexport OPENROUTER_API_KEY=<YOUR_API_KEY>\nexport OPEN_ROUTER_ENDPOINT_URL=\"https://openrouter.ai/api/v1\"\n```\n\n```python\nfrom gui_agents.s2_5.agents.agent_s import AgentS2_5\n\nengine_params = {\n    \"engine_type\": 'openai', # Allowed Values: 'openai', 'anthropic', 'gemini', 'azure_openai', 'vllm', 'open_router'\n    \"model\": 'gpt-5-2025-08-07', # Allowed Values: Any Vision and Language Model from the supported APIs\n}\nagent = AgentS2_5(\n    engine_params,\n    grounding_agent,\n    platform=current_platform,\n)\n```\n\nTo use the underlying Multimodal Agent (LMMAgent) which wraps LLMs with message handling functionality, you can use the following code snippet:\n\n```python\nfrom gui_agents.s2_5.core.mllm import LMMAgent\n\nengine_params = {\n    \"engine_type\": 'openai', # Allowed Values: 'openai', 'anthropic', 'gemini', 'azure_openai', 'vllm', 'open_router'\n    \"model\": 'gpt-5-2025-08-07', # Allowed Values: Any Vision and Language Model from the supported APIs\n    }\nagent = LMMAgent(\n    engine_params=engine_params,\n)\n```\n\nThe `AgentS2_5` also utilizes this `LMMAgent` internally."
  },
  {
    "path": "osworld_setup/s1/OSWorld.md",
    "content": "# Deplying Agent-S in OSWorld\n\n# Step 1: Set up Agent S\n\nFollow the [README.md](https://github.com/simular-ai/Agent-S/blob/main/gui_agents/s1/README.md) to set up Agent S.\n\n# Step 2: Copying Over Run Files\n\nIf you haven't already, please follow the [OSWorld environment setup](https://github.com/xlang-ai/OSWorld/blob/main/README.md). We've provided the relevant OSWorld run files for evaluation in this `osworld_setup` folder. Please copy this over to your OSWorld folder.\n\nWe have set the latest Agent S to use the latest Ubuntu VM image from OSWorld. However, our experiments are based on the older version of the VM. To reproduce the results, set the vm_version argument to 'old' while instantiating the agent.\n\n\n# Step 3: Best Practices\n\nAt this point, you will have set up the Agent-S and OSWorld environments and the VMWare Workstation Pro application. Below, we'll list some best practices, and common problems and their fixes.\n\n---\n\n```\nfrom desktop_env.desktop_env import DesktopEnv\n\nexample = {\n    \"id\": \"94d95f96-9699-4208-98ba-3c3119edf9c2\",\n    \"instruction\": \"I want to install Spotify on my current system. 
Could you please help me?\",\n    \"config\": [\n        {\n            \"type\": \"execute\",\n            \"parameters\": {\n                \"command\": [\n                    \"python\",\n                    \"-c\",\n                    \"import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);\"\n                ]\n            }\n        }\n    ],\n    \"evaluator\": {\n        \"func\": \"check_include_exclude\",\n        \"result\": {\n            \"type\": \"vm_command_line\",\n            \"command\": \"which spotify\"\n        },\n        \"expected\": {\n            \"type\": \"rule\",\n            \"rules\": {\n                \"include\": [\"spotify\"],\n                \"exclude\": [\"not found\"]\n            }\n        }\n    }\n}\n\nenv = DesktopEnv(action_space=\"pyautogui\")\n\nobs = env.reset(task_config=example)\nobs, reward, done, info = env.step(\"pyautogui.rightClick()\")\n```\n\nThe code above will boot up a VM and restart it. If, for whatever reason, running the starter code below leads to an infinitely long run time, cancel out of the VM.\nYou should then see:\n\n```\nparent/\n  Agent-S/\n  OSWorld/\n    vmware_vm_data/\n      Ubuntu0/\n        *.lck\n        *.vmem\n        ...\n      ...\n      UbuntuX/\n```\n\nIf you happen to have any `*.lck` folder in your VM's folder, be sure to delete them. Every time you are powering on the VM from creating a new `DesktopEnv` instance, you need to \ndelete the `*.lck` folders first. If your VM is already powered on, and your session (in a Jupyter Notebook, for example) crashes, you can keep the `*.lck` files and just re-instantiate the `DesktopEnv` instance. I'd also suggest using just a single VM (as a VM takes up a lot of space!). \n\n---\n\nIf even after rerunning the code and deleting the `*.lck` files don't work, then you should try passing in the `path_to_vm` explicitly to the `DesktopEnv` class. 
\n\n```\nenv = DesktopEnv(action_space=\"pyautogui\", headless=False, require_terminal=True, path_to_vm=<absolute_path>)\n```\n\nPass the absolute path to your VM's (Ubuntu0) `.vmx` file. This file is located here:\n\n\n```\nparent/\n  Agent-S/\n  OSWorld/\n    vmware_vm_data/\n      Ubuntu0/\n        *.lck\n        *.vmem\n        ...\n        *.vmx\n      ...\n      UbuntuX/\n```\n\n📌 **Note**: If you are testing on the `os` domain, there is an [issue](https://github.com/asweigart/pyautogui/issues/198#issuecomment-1465268536) with `pyautogui`. A *hacky* way to solve this is to, inside the VM, locate where the `pyautogui` module is installed and open the `__init__.py` located under the `pyautogui` folder and remove the \"<\" in the `set(...)` within the following function: \n\n```\ndef isShiftCharacter(character):\n    \"\"\"\n    Returns True if the ``character`` is a keyboard key that would require the shift key to be held down, such as\n    uppercase letters or the symbols on the keyboard's number row.\n    \"\"\"\n    # NOTE TODO - This will be different for non-qwerty keyboards.\n    return character.isupper() or character in set('~!@#$%^&*()_+{}|:\"<>?')\n```\n\n📌 **Note**: If your VM encounters an issue with \"The root file system on <path> requires a manual fsck\", reset the VM to the previous snapshot. \n\nWith these changes, you should be able to get up and running with VMWare, DesktopEnv, and OSWorld! 😊"
  },
  {
    "path": "osworld_setup/s1/lib_run_single.py",
    "content": "import datetime\nimport json\nimport logging\nimport os\nimport time\nfrom wrapt_timeout_decorator import *\n\nlogger = logging.getLogger(\"desktopenv.experiment\")\n\n\ndef run_single_example(\n    agent, env, example, max_steps, instruction, args, example_result_dir, scores\n):\n    runtime_logger = setup_logger(example, example_result_dir)\n    agent.reset()\n    env.reset(task_config=example)\n    time.sleep(60)  # Wait for the environment to be ready\n    obs = env._get_obs()  # Get the initial observation\n    done = False\n    step_idx = 0\n    env.controller.start_recording()\n    while not done and step_idx < max_steps:\n        response, actions = agent.predict(instruction, obs)\n        for action in actions:\n            # Capture the timestamp before executing the action\n            action_timestamp = datetime.datetime.now().strftime(\"%Y%m%d@%H%M%S\")\n            logger.info(\"Step %d: %s\", step_idx + 1, action)\n            obs, reward, done, info = env.step(action, args.sleep_after_execution)\n\n            logger.info(\"Reward: %.2f\", reward)\n            logger.info(\"Done: %s\", done)\n            # Save screenshot and trajectory information\n            with open(\n                os.path.join(\n                    example_result_dir, f\"step_{step_idx + 1}_{action_timestamp}.png\"\n                ),\n                \"wb\",\n            ) as _f:\n                _f.write(obs[\"screenshot\"])\n            with open(os.path.join(example_result_dir, \"traj.jsonl\"), \"a\") as f:\n                f.write(\n                    json.dumps(\n                        {\n                            \"step_num\": step_idx + 1,\n                            \"action_timestamp\": action_timestamp,\n                            \"action\": action,\n                            \"reward\": reward,\n                            \"done\": done,\n                            \"info\": info,\n                            \"screenshot_file\": 
f\"step_{step_idx + 1}_{action_timestamp}.png\",\n                        }\n                    )\n                )\n                f.write(\"\\n\")\n            if done:\n                logger.info(\"The episode is done.\")\n                break\n        step_idx += 1\n    result = env.evaluate()\n    logger.info(\"Result: %.2f\", result)\n    scores.append(result)\n    with open(\n        os.path.join(example_result_dir, \"result.txt\"), \"w\", encoding=\"utf-8\"\n    ) as f:\n        f.write(f\"{result}\\n\")\n    env.controller.end_recording(os.path.join(example_result_dir, \"recording.mp4\"))\n\n\ndef setup_logger(example, example_result_dir):\n    runtime_logger = logging.getLogger(f\"desktopenv.example.{example['id']}\")\n    runtime_logger.setLevel(logging.DEBUG)\n    runtime_logger.addHandler(\n        logging.FileHandler(os.path.join(example_result_dir, \"runtime.log\"))\n    )\n    return runtime_logger\n"
  },
  {
    "path": "osworld_setup/s1/run.py",
    "content": "\"\"\"OSWorld's run.py with AgentS.\"\"\"\n\n\"\"\"Script to run end-to-end evaluation on the benchmark.\nUtils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py.\n\"\"\"\n\nimport argparse\nimport datetime\nimport json\nimport logging\nimport os\nimport sys\n\nfrom gui_agents.s1.core.AgentS import GraphSearchAgent\nfrom gui_agents.s1.aci.LinuxOSACI import LinuxACI\nfrom tqdm import tqdm\n\nimport lib_run_single\nfrom desktop_env.desktop_env import DesktopEnv\n\n# import wandb\n\n\n#  Logger Configs {{{ #\nlogger = logging.getLogger()\nlogger.setLevel(logging.DEBUG)\n\ndatetime_str: str = datetime.datetime.now().strftime(\"%Y%m%d@%H%M%S\")\n\nfile_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"normal-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\ndebug_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"debug-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\nstdout_handler = logging.StreamHandler(sys.stdout)\nsdebug_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"sdebug-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\n\nfile_handler.setLevel(logging.INFO)\ndebug_handler.setLevel(logging.DEBUG)\nstdout_handler.setLevel(logging.INFO)\nsdebug_handler.setLevel(logging.DEBUG)\n\nformatter = logging.Formatter(\n    fmt=\"\\x1b[1;33m[%(asctime)s \\x1b[31m%(levelname)s \\x1b[32m%(module)s/%(lineno)d-%(processName)s\\x1b[1;33m] \\x1b[0m%(message)s\"\n)\nfile_handler.setFormatter(formatter)\ndebug_handler.setFormatter(formatter)\nstdout_handler.setFormatter(formatter)\nsdebug_handler.setFormatter(formatter)\n\nstdout_handler.addFilter(logging.Filter(\"desktopenv\"))\nsdebug_handler.addFilter(logging.Filter(\"desktopenv\"))\n\nlogger.addHandler(file_handler)\nlogger.addHandler(debug_handler)\nlogger.addHandler(stdout_handler)\nlogger.addHandler(sdebug_handler)\n#  }}} Logger Configs #\n\nlogger = logging.getLogger(\"desktopenv.experiment\")\n\n\ndef config() -> 
argparse.Namespace:\n    parser = argparse.ArgumentParser(\n        description=\"Run end-to-end evaluation on the benchmark\"\n    )\n\n    # environment config\n    parser.add_argument(\"--path_to_vm\", type=str, default=None)\n    parser.add_argument(\n        \"--headless\", action=\"store_true\", help=\"Run in headless machine\"\n    )\n    parser.add_argument(\n        \"--action_space\", type=str, default=\"pyautogui\", help=\"Action type\"\n    )\n    parser.add_argument(\n        \"--observation_type\",\n        choices=[\"screenshot\", \"a11y_tree\", \"screenshot_a11y_tree\", \"som\"],\n        default=\"a11y_tree\",\n        help=\"Observation type\",\n    )\n    parser.add_argument(\"--screen_width\", type=int, default=1920)\n    parser.add_argument(\"--screen_height\", type=int, default=1080)\n    parser.add_argument(\"--sleep_after_execution\", type=float, default=0.0)\n    parser.add_argument(\"--max_steps\", type=int, default=15)\n\n    # agent config\n    parser.add_argument(\"--max_trajectory_length\", type=int, default=3)\n    parser.add_argument(\n        \"--test_config_base_dir\", type=str, default=\"evaluation_examples\"\n    )\n\n    # lm config\n    parser.add_argument(\"--model\", type=str, default=\"gpt-4o\")\n    parser.add_argument(\"--temperature\", type=float, default=1.0)\n    parser.add_argument(\"--top_p\", type=float, default=0.9)\n    parser.add_argument(\"--max_tokens\", type=int, default=1500)\n    parser.add_argument(\"--stop_token\", type=str, default=None)\n\n    # example config\n    parser.add_argument(\"--domain\", type=str, default=\"all\")\n    parser.add_argument(\n        \"--test_all_meta_path\", type=str, default=\"evaluation_examples/test_all.json\"\n    )\n\n    # logging related\n    parser.add_argument(\"--result_dir\", type=str, default=\"./results\")\n\n    # NEW!\n    parser.add_argument(\"--huggingface_endpoint_url\", type=str, required=True)\n    parser.add_argument(\"--kb_name\", default=\"kb_s2\", 
type=str)\n\n    args = parser.parse_args()\n\n    return args\n\n\ndef test(args: argparse.Namespace, test_all_meta: dict) -> None:\n    scores = []\n    max_steps = args.max_steps\n\n    # log args\n    logger.info(\"Args: %s\", args)\n    # set wandb project\n    cfg_args = {\n        \"path_to_vm\": args.path_to_vm,\n        \"headless\": args.headless,\n        \"action_space\": args.action_space,\n        \"observation_type\": args.observation_type,\n        \"screen_width\": args.screen_width,\n        \"screen_height\": args.screen_height,\n        \"sleep_after_execution\": args.sleep_after_execution,\n        \"max_steps\": args.max_steps,\n        \"max_trajectory_length\": args.max_trajectory_length,\n        \"model\": args.model,\n        \"temperature\": args.temperature,\n        \"top_p\": args.top_p,\n        \"max_tokens\": args.max_tokens,\n        \"stop_token\": args.stop_token,\n        \"result_dir\": args.result_dir,\n    }\n\n    # NEW!\n    if args.model.startswith(\"claude\"):\n        engine_type = \"anthropic\"\n    elif args.model.startswith(\"gpt\"):\n        engine_type = \"openai\"\n    else:\n        engine_type = \"vllm\"\n\n    engine_params = {\"engine_type\": engine_type, \"model\": args.model}\n\n    # NEW!\n    grounding_agent = LinuxACI()\n\n    # NEW!\n    agent = GraphSearchAgent(\n        engine_params,\n        grounding_agent,\n        platform=\"linux\",\n        action_space=\"pyautogui\",\n        observation_type=\"mixed\",\n        search_engine=\"Perplexica\",\n        memory_root_path=os.getcwd(),\n        memory_folder_name=args.kb_name,\n        kb_release_tag=\"v0.2.2\",\n    )\n\n    env = DesktopEnv(\n        path_to_vm=args.path_to_vm,\n        action_space=agent.action_space,\n        screen_size=(args.screen_width, args.screen_height),\n        headless=args.headless,\n        os_type=\"Ubuntu\",\n        require_a11y_tree=args.observation_type\n        in [\"a11y_tree\", \"screenshot_a11y_tree\", 
\"som\"],\n    )\n\n    for domain in tqdm(test_all_meta, desc=\"Domain\"):\n        for example_id in tqdm(test_all_meta[domain], desc=\"Example\", leave=False):\n            config_file = os.path.join(\n                args.test_config_base_dir, f\"examples/{domain}/{example_id}.json\"\n            )\n            with open(config_file, \"r\", encoding=\"utf-8\") as f:\n                example = json.load(f)\n\n            logger.info(f\"[Domain]: {domain}\")\n            logger.info(f\"[Example ID]: {example_id}\")\n\n            instruction = example[\"instruction\"]\n\n            logger.info(f\"[Instruction]: {instruction}\")\n            # wandb each example config settings\n            cfg_args[\"instruction\"] = instruction\n            cfg_args[\"start_time\"] = datetime.datetime.now().strftime(\n                \"%Y:%m:%d-%H:%M:%S\"\n            )\n            # run.config.update(cfg_args)\n\n            example_result_dir = os.path.join(\n                args.result_dir,\n                args.action_space,\n                args.observation_type,\n                args.model,\n                domain,\n                example_id,\n            )\n            os.makedirs(example_result_dir, exist_ok=True)\n            # example start running\n            try:\n                lib_run_single.run_single_example(\n                    agent,\n                    env,\n                    example,\n                    max_steps,\n                    instruction,\n                    args,\n                    example_result_dir,\n                    scores,\n                )\n            except Exception as e:\n                logger.error(f\"Exception in {domain}/{example_id}: {e}\")\n                env.controller.end_recording(\n                    os.path.join(example_result_dir, \"recording.mp4\")\n                )\n                with open(os.path.join(example_result_dir, \"traj.jsonl\"), \"a\") as f:\n                    f.write(\n                        
json.dumps(\n                            {\"Error\": f\"Time limit exceeded in {domain}/{example_id}\"}\n                        )\n                    )\n                    f.write(\"\\n\")\n\n    env.close()\n    logger.info(f\"Average score: {sum(scores) / len(scores)}\")\n\n\ndef get_unfinished(\n    action_space, use_model, observation_type, result_dir, total_file_json\n):\n    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)\n\n    if not os.path.exists(target_dir):\n        return total_file_json\n\n    finished = {}\n    for domain in os.listdir(target_dir):\n        finished[domain] = []\n        domain_path = os.path.join(target_dir, domain)\n        if os.path.isdir(domain_path):\n            for example_id in os.listdir(domain_path):\n                if example_id == \"onboard\":\n                    continue\n                example_path = os.path.join(domain_path, example_id)\n                if os.path.isdir(example_path):\n                    if \"result.txt\" not in os.listdir(example_path):\n                        # empty all files under example_id\n                        for file in os.listdir(example_path):\n                            os.remove(os.path.join(example_path, file))\n                    else:\n                        finished[domain].append(example_id)\n\n    if not finished:\n        return total_file_json\n\n    for domain, examples in finished.items():\n        if domain in total_file_json:\n            total_file_json[domain] = [\n                x for x in total_file_json[domain] if x not in examples\n            ]\n\n    return total_file_json\n\n\ndef get_result(action_space, use_model, observation_type, result_dir, total_file_json):\n    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)\n    if not os.path.exists(target_dir):\n        print(\"New experiment, no result yet.\")\n        return None\n\n    all_result = []\n\n    for domain in 
os.listdir(target_dir):\n        domain_path = os.path.join(target_dir, domain)\n        if os.path.isdir(domain_path):\n            for example_id in os.listdir(domain_path):\n                example_path = os.path.join(domain_path, example_id)\n                if os.path.isdir(example_path):\n                    if \"result.txt\" in os.listdir(example_path):\n                        # empty all files under example_id\n                        try:\n                            all_result.append(\n                                float(\n                                    open(\n                                        os.path.join(example_path, \"result.txt\"), \"r\"\n                                    ).read()\n                                )\n                            )\n                        except:\n                            all_result.append(0.0)\n\n    if not all_result:\n        print(\"New experiment, no result yet.\")\n        return None\n    else:\n        print(\"Current Success Rate:\", sum(all_result) / len(all_result) * 100, \"%\")\n        return all_result\n\n\nif __name__ == \"__main__\":\n    ####### The complete version of the list of examples #######\n    os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n    args = config()\n\n    with open(args.test_all_meta_path, \"r\", encoding=\"utf-8\") as f:\n        test_all_meta = json.load(f)\n\n    if args.domain != \"all\":\n        test_all_meta = {args.domain: test_all_meta[args.domain]}\n\n    test_file_list = get_unfinished(\n        args.action_space,\n        args.model,\n        args.observation_type,\n        args.result_dir,\n        test_all_meta,\n    )\n    left_info = \"\"\n    for domain in test_file_list:\n        left_info += f\"{domain}: {len(test_file_list[domain])}\\n\"\n    logger.info(f\"Left tasks:\\n{left_info}\")\n\n    get_result(\n        args.action_space,\n        args.model,\n        args.observation_type,\n        args.result_dir,\n        test_all_meta,\n    
)\n    test(args, test_file_list)\n"
  },
  {
    "path": "osworld_setup/s2/OSWorld.md",
    "content": "# Deploying Agent S2 in OSWorld\n\n# Step 1: Set up Agent S2\n\nFollow the [README.md](https://github.com/simular-ai/Agent-S/blob/main/README.md) to set up Agent S2.\n\n# Step 2: Copying Over Run Files\n\nIf you haven't already, please follow the [OSWorld environment setup](https://github.com/xlang-ai/OSWorld/blob/main/README.md). We've provided the relevant OSWorld run files for evaluation in this `osworld_setup` folder. Please copy this over to your OSWorld folder.\n\n# Best Practices\n\nAt this point, you will have set up Agent S2, the OSWorld environment, and the VMWare Workstation Pro application. Below, we'll list some best practices, and common problems and their fixes.\n\n---\n\n```\nfrom desktop_env.desktop_env import DesktopEnv\n\nexample = {\n    \"id\": \"94d95f96-9699-4208-98ba-3c3119edf9c2\",\n    \"instruction\": \"I want to install Spotify on my current system. Could you please help me?\",\n    \"config\": [\n        {\n            \"type\": \"execute\",\n            \"parameters\": {\n                \"command\": [\n                    \"python\",\n                    \"-c\",\n                    \"import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);\"\n                ]\n            }\n        }\n    ],\n    \"evaluator\": {\n        \"func\": \"check_include_exclude\",\n        \"result\": {\n            \"type\": \"vm_command_line\",\n            \"command\": \"which spotify\"\n        },\n        \"expected\": {\n            \"type\": \"rule\",\n            \"rules\": {\n                \"include\": [\"spotify\"],\n                \"exclude\": [\"not found\"]\n            }\n        }\n    }\n}\n\nenv = DesktopEnv(action_space=\"pyautogui\")\n\nobs = env.reset(task_config=example)\nobs, reward, done, info = env.step(\"pyautogui.rightClick()\")\n```\n\nNote, this code is just for demonstrating how the OSWorld `DesktopEnv` is instantiated. 
If you're running OSWorld, this process is already part of their code base. The code above will boot up a VM and restart it. If, for whatever reason, running the starter code (or running OSWorld experiments) leads to an infinitely long run time, cancel out of the VM.\nYou should then see:\n\n```\nparent/\n  OSWorld/\n    vmware_vm_data/\n      Ubuntu0/\n        *.lck\n        *.vmem\n        ...\n      ...\n      UbuntuX/\n```\n\nIf you happen to have any `*.lck` folders in your VM's folder, be sure to delete them. Every time you power on the VM by creating a new `DesktopEnv` instance, you need to \ndelete the `*.lck` folders first. If your VM is already powered on, and your session (in a Jupyter Notebook, for example) crashes, you can keep the `*.lck` files and just re-instantiate the `DesktopEnv` instance. I'd also suggest using just a single VM (as a VM takes up a lot of space!). Also, be sure to shut down the VM when you've finished using it. Deleting the `*.lck` files should be done every time you power off the VM (though it seems to not be an issue from testing).\n\n---\n\nIf rerunning the code and deleting the `*.lck` files still doesn't work, try passing in the `path_to_vm` explicitly to the `DesktopEnv` class. \n\n```\nenv = DesktopEnv(action_space=\"pyautogui\", headless=False, require_terminal=True, path_to_vm=<absolute_path>)\n```\n\nPass the absolute path to your VM's (Ubuntu0) `.vmx` file. This file is located here:\n\n\n```\nparent/\n  OSWorld/\n    vmware_vm_data/\n      Ubuntu0/\n        *.lck\n        *.vmem\n        ...\n        *.vmx\n      ...\n      UbuntuX/\n```\n\n📌 **Note**: If you are testing on the `os` domain, there is an [issue](https://github.com/asweigart/pyautogui/issues/198#issuecomment-1465268536) with `pyautogui`. 
A *hacky* way to solve this is to, inside the VM, locate where the `pyautogui` module is installed and open the `__init__.py` located under the `pyautogui` folder and remove the \"<\" in the `set(...)` within the following function: \n\n```\ndef isShiftCharacter(character):\n    \"\"\"\n    Returns True if the ``character`` is a keyboard key that would require the shift key to be held down, such as\n    uppercase letters or the symbols on the keyboard's number row.\n    \"\"\"\n    # NOTE TODO - This will be different for non-qwerty keyboards.\n    return character.isupper() or character in set('~!@#$%^&*()_+{}|:\"<>?')\n```\n\n📌 **Note**: If your VM encounters an issue with \"The root file system on <path> requires a manual fsck\", reset the VM to the previous snapshot. \n\n📌 **Note**: OSWorld scripts will create the `DesktopEnv` instance which will create a VM for you with a specific snapshot (`snapshot_name` parameter in `DesktopEnv`). If you wish to create a new snapshot of the VM and use that for your experiments, be sure to specify the name of this snapshot where `DesktopEnv` is instantiated.\n\nWith these changes, you should be able to get up and running with VMWare, DesktopEnv, and OSWorld! 😊"
  },
  {
    "path": "osworld_setup/s2/lib_run_single.py",
    "content": "import datetime\nimport json\nimport logging\nimport os\nimport time\nfrom wrapt_timeout_decorator import *\n\nlogger = logging.getLogger(\"desktopenv.experiment\")\n\n\ndef run_single_example(\n    agent, env, example, max_steps, instruction, args, example_result_dir, scores\n):\n    runtime_logger = setup_logger(example, example_result_dir)\n    agent.reset()\n    env.reset(task_config=example)\n    time.sleep(60)  # Wait for the environment to be ready\n    obs = env._get_obs()  # Get the initial observation\n    done = False\n    step_idx = 0\n    env.controller.start_recording()\n    while not done and step_idx < max_steps:\n        response, actions = agent.predict(instruction, obs)\n        for action in actions:\n            # Capture the timestamp before executing the action\n            action_timestamp = datetime.datetime.now().strftime(\"%Y%m%d@%H%M%S\")\n            logger.info(\"Step %d: %s\", step_idx + 1, action)\n            obs, reward, done, info = env.step(action, args.sleep_after_execution)\n\n            logger.info(\"Reward: %.2f\", reward)\n            logger.info(\"Done: %s\", done)\n            # Save screenshot and trajectory information\n            with open(\n                os.path.join(\n                    example_result_dir, f\"step_{step_idx + 1}_{action_timestamp}.png\"\n                ),\n                \"wb\",\n            ) as _f:\n                _f.write(obs[\"screenshot\"])\n            with open(os.path.join(example_result_dir, \"traj.jsonl\"), \"a\") as f:\n                f.write(\n                    json.dumps(\n                        {\n                            \"step_num\": step_idx + 1,\n                            \"action_timestamp\": action_timestamp,\n                            \"action\": action,\n                            \"reward\": reward,\n                            \"done\": done,\n                            \"info\": info,\n                            \"screenshot_file\": 
f\"step_{step_idx + 1}_{action_timestamp}.png\",\n                        }\n                    )\n                )\n                f.write(\"\\n\")\n            if done:\n                logger.info(\"The episode is done.\")\n                break\n        step_idx += 1\n    result = env.evaluate()\n    logger.info(\"Result: %.2f\", result)\n    scores.append(result)\n    with open(\n        os.path.join(example_result_dir, \"result.txt\"), \"w\", encoding=\"utf-8\"\n    ) as f:\n        f.write(f\"{result}\\n\")\n    env.controller.end_recording(os.path.join(example_result_dir, \"recording.mp4\"))\n\n\ndef setup_logger(example, example_result_dir):\n    runtime_logger = logging.getLogger(f\"desktopenv.example.{example['id']}\")\n    runtime_logger.setLevel(logging.DEBUG)\n    runtime_logger.addHandler(\n        logging.FileHandler(os.path.join(example_result_dir, \"runtime.log\"))\n    )\n    return runtime_logger\n"
  },
  {
    "path": "osworld_setup/s2/run.py",
    "content": "\"\"\"OSWorld's run.py with AgentS2.\"\"\"\n\n\"\"\"Script to run end-to-end evaluation on the benchmark.\nUtils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py.\n\"\"\"\n\nimport argparse\nimport datetime\nimport json\nimport logging\nimport os\nimport sys\n\nfrom gui_agents.s2.agents.agent_s import AgentS2\nfrom gui_agents.s2.agents.grounding import OSWorldACI\nfrom tqdm import tqdm\n\nimport lib_run_single\nfrom desktop_env.desktop_env import DesktopEnv\n\n\n#  Logger Configs {{{ #\nlogger = logging.getLogger()\nlogger.setLevel(logging.DEBUG)\n\ndatetime_str: str = datetime.datetime.now().strftime(\"%Y%m%d@%H%M%S\")\n\nfile_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"normal-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\ndebug_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"debug-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\nstdout_handler = logging.StreamHandler(sys.stdout)\nsdebug_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"sdebug-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\n\nfile_handler.setLevel(logging.INFO)\ndebug_handler.setLevel(logging.DEBUG)\nstdout_handler.setLevel(logging.INFO)\nsdebug_handler.setLevel(logging.DEBUG)\n\nformatter = logging.Formatter(\n    fmt=\"\\x1b[1;33m[%(asctime)s \\x1b[31m%(levelname)s \\x1b[32m%(module)s/%(lineno)d-%(processName)s\\x1b[1;33m] \\x1b[0m%(message)s\"\n)\nfile_handler.setFormatter(formatter)\ndebug_handler.setFormatter(formatter)\nstdout_handler.setFormatter(formatter)\nsdebug_handler.setFormatter(formatter)\n\nstdout_handler.addFilter(logging.Filter(\"desktopenv\"))\nsdebug_handler.addFilter(logging.Filter(\"desktopenv\"))\n\nlogger.addHandler(file_handler)\nlogger.addHandler(debug_handler)\nlogger.addHandler(stdout_handler)\nlogger.addHandler(sdebug_handler)\n#  }}} Logger Configs #\n\nlogger = logging.getLogger(\"desktopenv.experiment\")\n\n\ndef config() -> argparse.Namespace:\n 
   parser = argparse.ArgumentParser(\n        description=\"Run end-to-end evaluation on the benchmark\"\n    )\n\n    # environment config\n    parser.add_argument(\"--path_to_vm\", type=str, default=None)\n    parser.add_argument(\n        \"--headless\", action=\"store_true\", help=\"Run in headless machine\"\n    )\n    parser.add_argument(\n        \"--action_space\", type=str, default=\"pyautogui\", help=\"Action type\"\n    )\n    parser.add_argument(\n        \"--observation_type\",\n        choices=[\"screenshot\", \"a11y_tree\", \"screenshot_a11y_tree\", \"som\"],\n        default=\"screenshot\",\n        help=\"Observation type\",\n    )\n    parser.add_argument(\"--screen_width\", type=int, default=1920)\n    parser.add_argument(\"--screen_height\", type=int, default=1080)\n    parser.add_argument(\"--sleep_after_execution\", type=float, default=0.0)\n    parser.add_argument(\"--max_steps\", type=int, default=15)\n\n    # agent config\n    parser.add_argument(\"--max_trajectory_length\", type=int, default=3)\n    parser.add_argument(\n        \"--test_config_base_dir\", type=str, default=\"evaluation_examples\"\n    )\n\n    # lm config\n    parser.add_argument(\"--model_provider\", type=str, default=\"openai\")\n    parser.add_argument(\"--model\", type=str, default=\"gpt-4o\")\n    parser.add_argument(\n        \"--model_url\",\n        type=str,\n        default=\"\",\n        help=\"The URL of the main generation model API.\",\n    )\n    parser.add_argument(\n        \"--model_api_key\",\n        type=str,\n        default=\"\",\n        help=\"The API key of the main generation model.\",\n    )\n    parser.add_argument(\"--temperature\", type=float, default=1.0)\n    parser.add_argument(\"--top_p\", type=float, default=0.9)\n    parser.add_argument(\"--max_tokens\", type=int, default=1500)\n    parser.add_argument(\"--stop_token\", type=str, default=None)\n\n    # example config\n    parser.add_argument(\"--domain\", type=str, default=\"all\")\n   
 parser.add_argument(\n        \"--test_all_meta_path\", type=str, default=\"evaluation_examples/test_all.json\"\n    )\n\n    # logging related\n    parser.add_argument(\"--result_dir\", type=str, default=\"./results\")\n\n    # NEW!\n\n    # Configuration 1\n    parser.add_argument(\"--grounding_model_provider\", type=str, default=\"anthropic\")\n    parser.add_argument(\n        \"--grounding_model\", type=str, default=\"claude-3-7-sonnet-20250219\"\n    )\n    parser.add_argument(\n        \"--grounding_model_resize_width\",\n        type=int,\n        default=1366,\n        help=\"Width of screenshot image after processor rescaling\",\n    )\n    parser.add_argument(\n        \"--grounding_model_resize_height\",\n        type=int,\n        default=None,\n        help=\"Height of screenshot image after processor rescaling\",\n    )\n\n    # Configuration 2\n    parser.add_argument(\"--endpoint_provider\", type=str, default=\"\")\n    parser.add_argument(\"--endpoint_url\", type=str, default=\"\")\n    parser.add_argument(\n        \"--endpoint_api_key\",\n        type=str,\n        default=\"\",\n        help=\"The API key of the grounding model.\",\n    )\n\n    parser.add_argument(\"--kb_name\", default=\"kb_s2\", type=str)\n\n    args = parser.parse_args()\n\n    return args\n\n\ndef test(args: argparse.Namespace, test_all_meta: dict) -> None:\n    scores = []\n    max_steps = args.max_steps\n\n    # log args\n    logger.info(\"Args: %s\", args)\n    cfg_args = {\n        \"path_to_vm\": args.path_to_vm,\n        \"headless\": args.headless,\n        \"action_space\": args.action_space,\n        \"observation_type\": args.observation_type,\n        \"screen_width\": args.screen_width,\n        \"screen_height\": args.screen_height,\n        \"sleep_after_execution\": args.sleep_after_execution,\n        \"max_steps\": args.max_steps,\n        \"max_trajectory_length\": args.max_trajectory_length,\n        \"model\": args.model,\n        \"temperature\": 
args.temperature,\n        \"top_p\": args.top_p,\n        \"max_tokens\": args.max_tokens,\n        \"stop_token\": args.stop_token,\n        \"result_dir\": args.result_dir,\n    }\n\n    # NEW!\n    engine_params = {\n        \"engine_type\": args.model_provider,\n        \"model\": args.model,\n        \"base_url\": args.model_url,\n        \"api_key\": args.model_api_key,\n    }\n\n    if args.endpoint_url:\n        engine_params_for_grounding = {\n            \"engine_type\": args.endpoint_provider,\n            \"base_url\": args.endpoint_url,\n            \"api_key\": args.endpoint_api_key,\n        }\n    else:\n        grounding_height = args.grounding_model_resize_height\n        # If not provided, use the aspect ratio of the screen to compute the height\n        if grounding_height is None:\n            grounding_height = (\n                args.screen_height\n                * args.grounding_model_resize_width\n                / args.screen_width\n            )\n\n        engine_params_for_grounding = {\n            \"engine_type\": args.grounding_model_provider,\n            \"model\": args.grounding_model,\n            \"grounding_width\": args.grounding_model_resize_width,\n            \"grounding_height\": grounding_height,\n        }\n\n    # NEW!\n    grounding_agent = OSWorldACI(\n        platform=\"linux\",\n        engine_params_for_generation=engine_params,\n        engine_params_for_grounding=engine_params_for_grounding,\n        width=args.screen_width,\n        height=args.screen_height,\n    )\n\n    # NEW!\n    agent = AgentS2(\n        engine_params,\n        grounding_agent,\n        platform=\"linux\",\n        action_space=\"pyautogui\",\n        observation_type=\"mixed\",\n        search_engine=\"Perplexica\",\n        memory_root_path=os.getcwd(),\n        memory_folder_name=args.kb_name,\n        kb_release_tag=\"v0.2.2\",\n        embedding_engine_type=\"openai\",\n    )\n\n    env = DesktopEnv(\n        
path_to_vm=args.path_to_vm,\n        action_space=agent.action_space,\n        screen_size=(args.screen_width, args.screen_height),\n        headless=args.headless,\n        require_a11y_tree=args.observation_type\n        in [\"a11y_tree\", \"screenshot_a11y_tree\", \"som\"],\n    )\n\n    for domain in tqdm(test_all_meta, desc=\"Domain\"):\n        for example_id in tqdm(test_all_meta[domain], desc=\"Example\", leave=False):\n            config_file = os.path.join(\n                args.test_config_base_dir, f\"examples/{domain}/{example_id}.json\"\n            )\n            with open(config_file, \"r\", encoding=\"utf-8\") as f:\n                example = json.load(f)\n\n            logger.info(f\"[Domain]: {domain}\")\n            logger.info(f\"[Example ID]: {example_id}\")\n\n            instruction = example[\"instruction\"]\n\n            logger.info(f\"[Instruction]: {instruction}\")\n            # wandb each example config settings\n            cfg_args[\"instruction\"] = instruction\n            cfg_args[\"start_time\"] = datetime.datetime.now().strftime(\n                \"%Y:%m:%d-%H:%M:%S\"\n            )\n\n            example_result_dir = os.path.join(\n                args.result_dir,\n                args.action_space,\n                args.observation_type,\n                args.model,\n                domain,\n                example_id,\n            )\n            os.makedirs(example_result_dir, exist_ok=True)\n            # example start running\n            try:\n                lib_run_single.run_single_example(\n                    agent,\n                    env,\n                    example,\n                    max_steps,\n                    instruction,\n                    args,\n                    example_result_dir,\n                    scores,\n                )\n            except Exception as e:\n                logger.error(f\"Exception in {domain}/{example_id}: {e}\")\n                env.controller.end_recording(\n           
         os.path.join(example_result_dir, \"recording.mp4\")\n                )\n                with open(os.path.join(example_result_dir, \"traj.jsonl\"), \"a\") as f:\n                    f.write(\n                        json.dumps(\n                            {\"Error\": f\"Time limit exceeded in {domain}/{example_id}\"}\n                        )\n                    )\n                    f.write(\"\\n\")\n\n    env.close()\n    logger.info(f\"Average score: {sum(scores) / len(scores)}\")\n\n\ndef get_unfinished(\n    action_space, use_model, observation_type, result_dir, total_file_json\n):\n    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)\n\n    if not os.path.exists(target_dir):\n        return total_file_json\n\n    finished = {}\n    for domain in os.listdir(target_dir):\n        finished[domain] = []\n        domain_path = os.path.join(target_dir, domain)\n        if os.path.isdir(domain_path):\n            for example_id in os.listdir(domain_path):\n                if example_id == \"onboard\":\n                    continue\n                example_path = os.path.join(domain_path, example_id)\n                if os.path.isdir(example_path):\n                    if \"result.txt\" not in os.listdir(example_path):\n                        # empty all files under example_id\n                        for file in os.listdir(example_path):\n                            os.remove(os.path.join(example_path, file))\n                    else:\n                        finished[domain].append(example_id)\n\n    if not finished:\n        return total_file_json\n\n    for domain, examples in finished.items():\n        if domain in total_file_json:\n            total_file_json[domain] = [\n                x for x in total_file_json[domain] if x not in examples\n            ]\n\n    return total_file_json\n\n\ndef get_result(action_space, use_model, observation_type, result_dir, total_file_json):\n    target_dir = 
os.path.join(result_dir, action_space, observation_type, use_model)\n    if not os.path.exists(target_dir):\n        print(\"New experiment, no result yet.\")\n        return None\n\n    all_result = []\n\n    for domain in os.listdir(target_dir):\n        domain_path = os.path.join(target_dir, domain)\n        if os.path.isdir(domain_path):\n            for example_id in os.listdir(domain_path):\n                example_path = os.path.join(domain_path, example_id)\n                if os.path.isdir(example_path):\n                    if \"result.txt\" in os.listdir(example_path):\n                        # empty all files under example_id\n                        try:\n                            all_result.append(\n                                float(\n                                    open(\n                                        os.path.join(example_path, \"result.txt\"), \"r\"\n                                    ).read()\n                                )\n                            )\n                        except:\n                            all_result.append(0.0)\n\n    if not all_result:\n        print(\"New experiment, no result yet.\")\n        return None\n    else:\n        print(\"Current Success Rate:\", sum(all_result) / len(all_result) * 100, \"%\")\n        return all_result\n\n\nif __name__ == \"__main__\":\n    ####### The complete version of the list of examples #######\n    os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n    args = config()\n\n    with open(args.test_all_meta_path, \"r\", encoding=\"utf-8\") as f:\n        test_all_meta = json.load(f)\n\n    if args.domain != \"all\":\n        test_all_meta = {args.domain: test_all_meta[args.domain]}\n\n    test_file_list = get_unfinished(\n        args.action_space,\n        args.model,\n        args.observation_type,\n        args.result_dir,\n        test_all_meta,\n    )\n    left_info = \"\"\n    for domain in test_file_list:\n        left_info += f\"{domain}: 
{len(test_file_list[domain])}\\n\"\n    logger.info(f\"Left tasks:\\n{left_info}\")\n\n    get_result(\n        args.action_space,\n        args.model,\n        args.observation_type,\n        args.result_dir,\n        test_all_meta,\n    )\n    test(args, test_file_list)\n"
  },
  {
    "path": "osworld_setup/s2_5/OSWorld.md",
    "content": "# Deploying Agent S2.5 in OSWorld\n\n# Step 1: Set up Agent S2.5\n\nFollow the [README.md](https://github.com/simular-ai/Agent-S/blob/main/README.md) to set up Agent S2.5.\n\n# Step 2: Copying Over Run Files\n\nIf you haven't already, please follow the [OSWorld environment setup](https://github.com/xlang-ai/OSWorld/blob/main/README.md). We've provided the relevant OSWorld run files for evaluation in this `osworld_setup` folder. Please copy these files over to your OSWorld folder. Use `run_local.py` and `lib_run_single_local.py` to run locally on VMware, and `run.py` and `lib_run_single.py` to run on AWS.\n\n"
  },
  {
    "path": "osworld_setup/s2_5/lib_run_single.py",
    "content": "import datetime\nimport json\nimport logging\nimport os\nimport time\nfrom typing import *\nfrom wrapt_timeout_decorator import *\n\nlogger = logging.getLogger(\"desktopenv.experiment\")\n\n\ndef run_single_example(\n    agent, env, example, max_steps, instruction, args, example_result_dir, scores\n):\n    runtime_logger = setup_logger(example, example_result_dir)\n    try:\n        agent.reset(runtime_logger)\n    except Exception as e:\n        agent.reset()\n\n    env.reset(task_config=example)\n    time.sleep(60)  # Wait for the environment to be ready\n    obs = env._get_obs()  # Get the initial observation\n\n    with open(os.path.join(example_result_dir, f\"step_0.png\"), \"wb\") as _f:\n        _f.write(obs[\"screenshot\"])\n\n    with open(\n        os.path.join(example_result_dir, \"instruction.txt\"), \"w\", encoding=\"utf-8\"\n    ) as f:\n        f.write(instruction)\n\n    done = False\n    step_idx = 0\n    env.controller.start_recording()\n    while not done and step_idx < max_steps:\n        response, actions = agent.predict(instruction, obs)\n        for action in actions:\n            action_timestamp = datetime.datetime.now().strftime(\"%Y%m%d@%H%M%S\")\n            logger.info(\"Step %d: %s\", step_idx + 1, action)\n            obs, reward, done, info = env.step(action, args.sleep_after_execution)\n\n            logger.info(\"Reward: %.2f\", reward)\n            logger.info(\"Done: %s\", done)\n            # Save screenshot and trajectory information\n            with open(\n                os.path.join(\n                    example_result_dir, f\"step_{step_idx + 1}_{action_timestamp}.png\"\n                ),\n                \"wb\",\n            ) as _f:\n                _f.write(obs[\"screenshot\"])\n\n            response.update(\n                {\n                    \"step_num\": step_idx + 1,\n                    \"action_timestamp\": action_timestamp,\n                    \"action\": action,\n                    
\"reward\": reward,\n                    \"done\": done,\n                    \"info\": info,\n                    \"screenshot_file\": f\"step_{step_idx + 1}_{action_timestamp}.png\",\n                }\n            )\n            with open(os.path.join(example_result_dir, \"traj.jsonl\"), \"a\") as f:\n                f.write(json.dumps(response))\n                f.write(\"\\n\")\n            if done:\n                logger.info(\"The episode is done.\")\n                break\n        step_idx += 1\n    result = env.evaluate()\n    logger.info(\"Result: %.2f\", result)\n    scores.append(result)\n    with open(\n        os.path.join(example_result_dir, \"result.txt\"), \"w\", encoding=\"utf-8\"\n    ) as f:\n        f.write(f\"{result}\\n\")\n    env.controller.end_recording(os.path.join(example_result_dir, \"recording.mp4\"))\n\n\ndef setup_logger(example, example_result_dir):\n    runtime_logger = logging.getLogger(f\"desktopenv.example.{example['id']}\")\n    runtime_logger.setLevel(logging.DEBUG)\n    runtime_logger.addHandler(\n        logging.FileHandler(os.path.join(example_result_dir, \"runtime.log\"))\n    )\n    return runtime_logger\n"
  },
  {
    "path": "osworld_setup/s2_5/lib_run_single_local.py",
    "content": "import datetime\nimport json\nimport logging\nimport os\nimport time\nfrom typing import *\nfrom wrapt_timeout_decorator import *\n\nlogger = logging.getLogger(\"desktopenv.experiment\")\n\n\ndef run_single_example(\n    agent, env, example, max_steps, instruction, args, example_result_dir, scores\n):\n    runtime_logger = setup_logger(example, example_result_dir)\n    try:\n        agent.reset(runtime_logger)\n    except Exception as e:\n        agent.reset()\n\n    env.reset(task_config=example)\n    time.sleep(60)  # Wait for the environment to be ready\n    obs = env._get_obs()  # Get the initial observation\n\n    with open(os.path.join(example_result_dir, f\"step_0.png\"), \"wb\") as _f:\n        _f.write(obs[\"screenshot\"])\n\n    with open(\n        os.path.join(example_result_dir, \"instruction.txt\"), \"w\", encoding=\"utf-8\"\n    ) as f:\n        f.write(instruction)\n\n    done = False\n    step_idx = 0\n    env.controller.start_recording()\n    while not done and step_idx < max_steps:\n        time.sleep(0.5)\n        response, actions = agent.predict(instruction, obs)\n        for action in actions:\n            action_timestamp = datetime.datetime.now().strftime(\"%Y%m%d@%H%M%S\")\n            logger.info(\"Step %d: %s\", step_idx + 1, action)\n            obs, reward, done, info = env.step(action, args.sleep_after_execution)\n\n            logger.info(\"Reward: %.2f\", reward)\n            logger.info(\"Done: %s\", done)\n            # Save screenshot and trajectory information\n            with open(\n                os.path.join(\n                    example_result_dir, f\"step_{step_idx + 1}_{action_timestamp}.png\"\n                ),\n                \"wb\",\n            ) as _f:\n                _f.write(obs[\"screenshot\"])\n\n            response.update(\n                {\n                    \"step_num\": step_idx + 1,\n                    \"action_timestamp\": action_timestamp,\n                    \"action\": action,\n  
                  \"reward\": reward,\n                    \"done\": done,\n                    \"info\": info,\n                    \"screenshot_file\": f\"step_{step_idx + 1}_{action_timestamp}.png\",\n                }\n            )\n            with open(os.path.join(example_result_dir, \"traj.jsonl\"), \"a\") as f:\n                f.write(json.dumps(response))\n                f.write(\"\\n\")\n            if done:\n                logger.info(\"The episode is done.\")\n                break\n        step_idx += 1\n    result = env.evaluate()\n    logger.info(\"Result: %.2f\", result)\n    scores.append(result)\n    with open(\n        os.path.join(example_result_dir, \"result.txt\"), \"w\", encoding=\"utf-8\"\n    ) as f:\n        f.write(f\"{result}\\n\")\n    env.controller.end_recording(os.path.join(example_result_dir, \"recording.mp4\"))\n\n\ndef setup_logger(example, example_result_dir):\n    runtime_logger = logging.getLogger(f\"desktopenv.example.{example['id']}\")\n    runtime_logger.setLevel(logging.DEBUG)\n    runtime_logger.addHandler(\n        logging.FileHandler(os.path.join(example_result_dir, \"runtime.log\"))\n    )\n    return runtime_logger\n"
  },
  {
    "path": "osworld_setup/s2_5/run.py",
    "content": "\"\"\"OSWorld's run.py with AgentS2_5.\"\"\"\n\nimport argparse\nimport datetime\nimport json\nimport logging\nimport os\nimport sys\nimport signal\nimport time\nfrom multiprocessing import Process, Manager, current_process, Queue\n\n\nimport lib_run_single\nfrom desktop_env.desktop_env import DesktopEnv\n\nfrom dotenv import load_dotenv\n\nload_dotenv()\n\n\n#  Logger Configs {{{ #\nlogger = logging.getLogger()\nlogger.setLevel(logging.DEBUG)\n\ndatetime_str: str = datetime.datetime.now().strftime(\"%Y%m%d@%H%M%S\")\n\nstdout_handler = logging.StreamHandler(sys.stdout)\n\nstdout_handler.setLevel(logging.INFO)\n\nformatter = logging.Formatter(\n    fmt=\"\\x1b[1;33m[%(asctime)s \\x1b[31m%(levelname)s \\x1b[32m%(module)s/%(lineno)d-%(processName)s\\x1b[1;33m] \\x1b[0m%(message)s\"\n)\n\nstdout_handler.setFormatter(formatter)\n\nstdout_handler.addFilter(logging.Filter(\"desktopenv\"))\n\nlogger.addHandler(stdout_handler)\n#  }}} Logger Configs #\n\nlogger = logging.getLogger(\"desktopenv.experiment\")\n\n\n# Global variables for signal handling\nactive_environments = []\nprocesses = []\nis_terminating = False\n\n\ndef distribute_tasks(test_all_meta: dict) -> list:\n    all_tasks = []\n    for domain, examples in test_all_meta.items():\n        for example_id in examples:\n            all_tasks.append((domain, example_id))\n    return all_tasks\n\n\ndef process_signal_handler(signum, frame, env_idx):\n    logger.info(f\"Process {env_idx + 1} received signal {signum}. 
Shutting down...\")\n    local_vars = frame.f_locals\n    active_environments = local_vars.get(\"active_environments\", [])\n    for env in active_environments:\n        if env is not None:\n            try:\n                logger.info(f\"Process {env_idx + 1} closing environment...\")\n                env.close()\n                logger.info(f\"Process {env_idx + 1} environment closed successfully\")\n            except Exception as e:\n                logger.error(f\"Process {env_idx + 1} error closing environment: {e}\")\n    logger.info(f\"Process {env_idx + 1} shutdown complete. Exiting.\")\n    sys.exit(0)\n\n\ndef run_env_tasks(\n    task_queue: Queue,\n    args: argparse.Namespace,\n    shared_scores: list,\n    engine_params,\n    engine_params_for_grounding,\n):\n    active_environments = []\n    env = None\n    try:\n        # Use IMAGE_ID_MAP for AWS provider to get snapshot_name\n        snapshot_name = None\n        region = getattr(args, \"region\", None)\n        if args.provider_name == \"aws\" and region is not None:\n            try:\n                from desktop_env.providers.aws.manager import IMAGE_ID_MAP\n\n                screen_size = (args.screen_width, args.screen_height)\n                snapshot_name = IMAGE_ID_MAP[region].get(\n                    screen_size, IMAGE_ID_MAP[region][(1920, 1080)]\n                )\n            except Exception as e:\n                logger.error(f\"Failed to get snapshot_name from IMAGE_ID_MAP: {e}\")\n                snapshot_name = None\n        from gui_agents.s2_5.agents.agent_s import AgentS2_5\n        from gui_agents.s2_5.agents.grounding import OSWorldACI\n\n        grounding_agent = OSWorldACI(\n            platform=\"linux\",\n            engine_params_for_generation=engine_params,\n            engine_params_for_grounding=engine_params_for_grounding,\n            width=args.screen_width,\n            height=args.screen_height,\n        )\n        agent = AgentS2_5(\n            
engine_params,\n            grounding_agent,\n            platform=\"linux\",\n        )\n        env = DesktopEnv(\n            path_to_vm=args.path_to_vm,\n            action_space=args.action_space,\n            provider_name=args.provider_name,\n            region=region,\n            snapshot_name=snapshot_name,\n            screen_size=(args.screen_width, args.screen_height),\n            headless=args.headless,\n            os_type=\"Ubuntu\",\n            require_a11y_tree=args.observation_type\n            in [\"a11y_tree\", \"screenshot_a11y_tree\", \"som\"],\n            enable_proxy=True,\n            client_password=getattr(args, \"client_password\", \"\"),\n        )\n        active_environments.append(env)\n        logger.info(f\"Process {current_process().name} started.\")\n        while True:\n            try:\n                item = task_queue.get(timeout=5)\n            except Exception:\n                break\n            domain, example_id = item\n            try:\n                config_file = os.path.join(\n                    args.test_config_base_dir, f\"examples/{domain}/{example_id}.json\"\n                )\n                with open(config_file, \"r\", encoding=\"utf-8\") as f:\n                    example = json.load(f)\n                instruction = example[\"instruction\"]\n                example_result_dir = os.path.join(\n                    args.result_dir,\n                    args.action_space,\n                    args.observation_type,\n                    args.model,\n                    domain,\n                    example_id,\n                )\n                os.makedirs(example_result_dir, exist_ok=True)\n                logger.info(f\"[{current_process().name}][Domain]: {domain}\")\n                logger.info(f\"[{current_process().name}][Example ID]: {example_id}\")\n                logger.info(f\"[{current_process().name}][Instruction]: {instruction}\")\n                try:\n                    
lib_run_single.run_single_example(\n                        agent,\n                        env,\n                        example,\n                        args.max_steps,\n                        instruction,\n                        args,\n                        example_result_dir,\n                        shared_scores,\n                    )\n                except Exception as e:\n                    import traceback\n\n                    logger.error(\n                        f\"Exception in {current_process().name} {domain}/{example_id}: {e}\"\n                    )\n                    logger.error(traceback.format_exc())\n                    try:\n                        env.controller.end_recording(\n                            os.path.join(example_result_dir, \"recording.mp4\")\n                        )\n                    except Exception as rec_e:\n                        logger.error(f\"Failed to end recording: {rec_e}\")\n                    with open(os.path.join(example_result_dir, \"traj.jsonl\"), \"a\") as f:\n                        f.write(json.dumps({\"Error\": f\"{domain}/{example_id} - {e}\"}))\n                        f.write(\"\\n\")\n            except Exception as e:\n                logger.error(f\"Task-level error in {current_process().name}: {e}\")\n                import traceback\n\n                logger.error(traceback.format_exc())\n    except Exception as e:\n        logger.error(f\"Process-level error in {current_process().name}: {e}\")\n        import traceback\n\n        logger.error(traceback.format_exc())\n    finally:\n        logger.info(f\"{current_process().name} cleaning up environment...\")\n        try:\n            if env:\n                env.close()\n                logger.info(f\"{current_process().name} environment closed successfully\")\n        except Exception as e:\n            logger.error(\n                f\"{current_process().name} error during environment cleanup: {e}\"\n            )\n\n\ndef 
signal_handler(signum, frame):\n    global is_terminating, active_environments, processes\n    if is_terminating:\n        return\n    is_terminating = True\n    logger.info(f\"Received signal {signum}. Gracefully shutting down...\")\n    for env in active_environments:\n        try:\n            logger.info(f\"Closing environment...\")\n            env.close()\n            logger.info(f\"Environment closed successfully\")\n        except Exception as e:\n            logger.error(f\"Error closing environment: {e}\")\n    for p in processes:\n        if p.is_alive():\n            try:\n                logger.info(f\"Sending termination signal to process {p.name}...\")\n                p.terminate()\n            except Exception as e:\n                logger.error(f\"Error sending termination signal to process: {e}\")\n    time.sleep(1)\n    for p in processes:\n        if p.is_alive():\n            try:\n                logger.info(f\"Forcefully terminating process {p.name}...\")\n                import signal as sig\n\n                os.kill(p.pid, sig.SIGKILL)\n            except Exception as e:\n                logger.error(f\"Error forcefully terminating process: {e}\")\n    logger.info(\"Shutdown complete. 
Exiting.\")\n    sys.exit(0)\n\n\ndef config() -> argparse.Namespace:\n    parser = argparse.ArgumentParser(\n        description=\"Run end-to-end evaluation on the benchmark\"\n    )\n\n    # environment config\n    parser.add_argument(\"--path_to_vm\", type=str, default=None)\n    parser.add_argument(\n        \"--provider_name\",\n        type=str,\n        default=\"vmware\",\n        help=\"Virtualization provider (vmware, docker, aws, azure, gcp, virtualbox)\",\n    )\n    parser.add_argument(\n        \"--headless\", action=\"store_true\", help=\"Run in headless machine\"\n    )\n    parser.add_argument(\n        \"--action_space\", type=str, default=\"pyautogui\", help=\"Action type\"\n    )\n    parser.add_argument(\n        \"--observation_type\",\n        choices=[\"screenshot\", \"a11y_tree\", \"screenshot_a11y_tree\", \"som\"],\n        default=\"screenshot\",\n        help=\"Observation type\",\n    )\n    parser.add_argument(\n        \"--num_envs\",\n        type=int,\n        default=1,\n        help=\"Number of environments to run in parallel\",\n    )\n    parser.add_argument(\"--screen_width\", type=int, default=1920)\n    parser.add_argument(\"--screen_height\", type=int, default=1080)\n    parser.add_argument(\"--sleep_after_execution\", type=float, default=1.0)\n    parser.add_argument(\"--max_steps\", type=int, default=15)\n\n    parser.add_argument(\"--domain\", type=str, default=\"all\")\n    parser.add_argument(\n        \"--test_all_meta_path\", type=str, default=\"evaluation_examples/test_all.json\"\n    )\n    parser.add_argument(\n        \"--test_config_base_dir\", type=str, default=\"evaluation_examples\"\n    )\n    parser.add_argument(\"--result_dir\", type=str, default=\"./results\")\n\n    parser.add_argument(\n        \"--region\", type=str, default=\"us-east-1\", help=\"AWS region for the VM\"\n    )\n    parser.add_argument(\n        \"--client_password\", type=str, default=\"\", help=\"Client password\"\n    )\n\n    # agent 
config\n    parser.add_argument(\"--max_trajectory_length\", type=int, default=8)\n\n    # lm config\n    parser.add_argument(\"--model_provider\", type=str, default=\"openai\")\n    parser.add_argument(\"--model\", type=str, default=\"gpt-4o\")\n    parser.add_argument(\n        \"--model_url\",\n        type=str,\n        default=\"\",\n        help=\"The URL of the main generation model API.\",\n    )\n    parser.add_argument(\n        \"--model_api_key\",\n        type=str,\n        default=\"\",\n        help=\"The API key of the main generation model.\",\n    )\n    parser.add_argument(\n        \"--model_temperature\",\n        type=float,\n        default=None,\n        help=\"Temperature to fix the generation model at (e.g. o3 can only be run with 1.0)\",\n    )\n\n    # grounding model config\n    parser.add_argument(\n        \"--ground_provider\",\n        type=str,\n        required=True,\n        help=\"The provider for the grounding model\",\n    )\n    parser.add_argument(\n        \"--ground_url\", type=str, required=True, help=\"The URL of the grounding model\"\n    )\n    parser.add_argument(\n        \"--ground_api_key\",\n        type=str,\n        default=\"\",\n        help=\"The API key of the grounding model.\",\n    )\n    parser.add_argument(\n        \"--ground_model\",\n        type=str,\n        required=True,\n        help=\"The model name for the grounding model\",\n    )\n    parser.add_argument(\n        \"--grounding_width\",\n        type=int,\n        required=True,\n        help=\"Width of screenshot image after processor rescaling\",\n    )\n    parser.add_argument(\n        \"--grounding_height\",\n        type=int,\n        required=True,\n        help=\"Height of screenshot image after processor rescaling\",\n    )\n\n    args = parser.parse_args()\n\n    return args\n\n\ndef test(args: argparse.Namespace, test_all_meta: dict) -> None:\n    global processes\n    logger.info(\"Args: %s\", args)\n    all_tasks = 
distribute_tasks(test_all_meta)\n    logger.info(f\"Total tasks: {len(all_tasks)}\")\n\n    engine_params = {\n        \"engine_type\": args.model_provider,\n        \"model\": args.model,\n        \"base_url\": getattr(args, \"model_url\", \"\"),\n        \"api_key\": getattr(args, \"model_api_key\", \"\"),\n        \"temperature\": getattr(args, \"model_temperature\", None),\n    }\n    engine_params_for_grounding = {\n        \"engine_type\": args.ground_provider,\n        \"model\": args.ground_model,\n        \"base_url\": getattr(args, \"ground_url\", \"\"),\n        \"api_key\": getattr(args, \"ground_api_key\", \"\"),\n        \"grounding_width\": args.grounding_width,\n        \"grounding_height\": args.grounding_height,\n    }\n\n    with Manager() as manager:\n        shared_scores = manager.list()\n        task_queue = manager.Queue()\n        for item in all_tasks:\n            task_queue.put(item)\n        num_envs = args.num_envs\n        processes = []\n        for i in range(num_envs):\n            p = Process(\n                target=run_env_tasks,\n                args=(\n                    task_queue,\n                    args,\n                    shared_scores,\n                    engine_params,\n                    engine_params_for_grounding,\n                ),\n                name=f\"EnvProcess-{i+1}\",\n            )\n            p.daemon = True\n            p.start()\n            processes.append(p)\n            logger.info(f\"Started process {p.name} with PID {p.pid}\")\n        try:\n            while True:\n                alive_count = 0\n                for idx, p in enumerate(processes):\n                    if not p.is_alive():\n                        logger.warning(f\"Process {p.name} died, restarting...\")\n                        new_p = Process(\n                            target=run_env_tasks,\n                            args=(\n                                task_queue,\n                                args,\n         
                       shared_scores,\n                                engine_params,\n                                engine_params_for_grounding,\n                            ),\n                            name=f\"EnvProcess-Restart-{idx+1}\",\n                        )\n                        new_p.daemon = True\n                        new_p.start()\n                        processes[idx] = new_p\n                        logger.info(\n                            f\"Restarted process {new_p.name} with PID {new_p.pid}\"\n                        )\n                    else:\n                        alive_count += 1\n                if task_queue.empty():\n                    logger.info(\"All tasks finished.\")\n                    break\n                if alive_count == 0:\n                    logger.error(\"All processes died, exiting.\")\n                    break\n                time.sleep(5)\n            for p in processes:\n                p.join()\n        except KeyboardInterrupt:\n            logger.info(\n                \"Main process received KeyboardInterrupt. 
Initiating graceful shutdown...\"\n            )\n            raise\n        except Exception as e:\n            logger.error(\n                f\"Unexpected error while waiting for processes: {e}\", exc_info=True\n            )\n            for p in processes:\n                if p.is_alive():\n                    try:\n                        logger.info(f\"Terminating process {p.name} due to error...\")\n                        p.terminate()\n                    except Exception as term_e:\n                        logger.error(f\"Error terminating process {p.name}: {term_e}\")\n            raise\n        scores = list(shared_scores)\n    logger.info(f\"Average score: {sum(scores) / len(scores) if scores else 0}\")\n\n\ndef get_unfinished(\n    action_space, use_model, observation_type, result_dir, total_file_json\n):\n    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)\n\n    if not os.path.exists(target_dir):\n        return total_file_json\n\n    finished = {}\n    for domain in os.listdir(target_dir):\n        finished[domain] = []\n        domain_path = os.path.join(target_dir, domain)\n        if os.path.isdir(domain_path):\n            for example_id in os.listdir(domain_path):\n                if example_id == \"onboard\":\n                    continue\n                example_path = os.path.join(domain_path, example_id)\n                if os.path.isdir(example_path):\n                    if \"result.txt\" not in os.listdir(example_path):\n                        # empty all files under example_id\n                        for file in os.listdir(example_path):\n                            os.remove(os.path.join(example_path, file))\n                    else:\n                        finished[domain].append(example_id)\n\n    if not finished:\n        return total_file_json\n\n    for domain, examples in finished.items():\n        if domain in total_file_json:\n            total_file_json[domain] = [\n                x for 
x in total_file_json[domain] if x not in examples\n            ]\n\n    return total_file_json\n\n\ndef get_result(action_space, use_model, observation_type, result_dir, total_file_json):\n    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)\n    if not os.path.exists(target_dir):\n        print(\"New experiment, no result yet.\")\n        return None\n\n    all_result = []\n\n    for domain in os.listdir(target_dir):\n        domain_path = os.path.join(target_dir, domain)\n        if os.path.isdir(domain_path):\n            for example_id in os.listdir(domain_path):\n                example_path = os.path.join(domain_path, example_id)\n                if os.path.isdir(example_path):\n                    if \"result.txt\" in os.listdir(example_path):\n                        # empty all files under example_id\n                        try:\n                            all_result.append(\n                                float(\n                                    open(\n                                        os.path.join(example_path, \"result.txt\"), \"r\"\n                                    ).read()\n                                )\n                            )\n                        except:\n                            all_result.append(0.0)\n\n    if not all_result:\n        print(\"New experiment, no result yet.\")\n        return None\n    else:\n        print(\"Current Success Rate:\", sum(all_result) / len(all_result) * 100, \"%\")\n        return all_result\n\n\nif __name__ == \"__main__\":\n    signal.signal(signal.SIGINT, signal_handler)\n    signal.signal(signal.SIGTERM, signal_handler)\n    ####### The complete version of the list of examples #######\n    os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n    args = config()\n\n    # save args to json in result_dir/action_space/observation_type/model/args.json\n    path_to_args = os.path.join(\n        args.result_dir,\n        args.action_space,\n        
args.observation_type,\n        args.model,\n        \"args.json\",\n    )\n    os.makedirs(os.path.dirname(path_to_args), exist_ok=True)\n    with open(path_to_args, \"w\", encoding=\"utf-8\") as f:\n        json.dump(vars(args), f, indent=4)\n\n    with open(args.test_all_meta_path, \"r\", encoding=\"utf-8\") as f:\n        test_all_meta = json.load(f)\n\n    if args.domain != \"all\":\n        test_all_meta = {args.domain: test_all_meta[args.domain]}\n\n    test_file_list = get_unfinished(\n        args.action_space,\n        args.model,\n        args.observation_type,\n        args.result_dir,\n        test_all_meta,\n    )\n    left_info = \"\"\n    for domain in test_file_list:\n        left_info += f\"{domain}: {len(test_file_list[domain])}\\n\"\n    logger.info(f\"Left tasks:\\n{left_info}\")\n\n    get_result(\n        args.action_space,\n        args.model,\n        args.observation_type,\n        args.result_dir,\n        test_all_meta,\n    )\n    test(args, test_file_list)\n"
  },
  {
    "path": "osworld_setup/s2_5/run_local.py",
    "content": "\"\"\"Script to run end-to-end evaluation on the benchmark.\nUtils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py.\n\"\"\"\n\nimport argparse\nimport datetime\nimport json\nimport logging\nimport os\nimport sys\n\nfrom tqdm import tqdm\n\nimport lib_run_single_local\nfrom desktop_env.desktop_env import DesktopEnv\nfrom gui_agents.s2_5.agents.agent_s import AgentS2_5\nfrom gui_agents.s2_5.agents.grounding import OSWorldACI\n\nfrom dotenv import load_dotenv\n\nload_dotenv()\n\n# Almost deprecated since it's not multi-env, use run_multienv_*.py instead\n\n#  Logger Configs {{{ #\nlogger = logging.getLogger()\nlogger.setLevel(logging.DEBUG)\n\ndatetime_str: str = datetime.datetime.now().strftime(\"%Y%m%d@%H%M%S\")\n\nfile_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"normal-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\ndebug_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"debug-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\nstdout_handler = logging.StreamHandler(sys.stdout)\nsdebug_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"sdebug-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\n\nfile_handler.setLevel(logging.INFO)\ndebug_handler.setLevel(logging.DEBUG)\nstdout_handler.setLevel(logging.INFO)\nsdebug_handler.setLevel(logging.DEBUG)\n\nformatter = logging.Formatter(\n    fmt=\"\\x1b[1;33m[%(asctime)s \\x1b[31m%(levelname)s \\x1b[32m%(module)s/%(lineno)d-%(processName)s\\x1b[1;33m] \\x1b[0m%(message)s\"\n)\nfile_handler.setFormatter(formatter)\ndebug_handler.setFormatter(formatter)\nstdout_handler.setFormatter(formatter)\nsdebug_handler.setFormatter(formatter)\n\nstdout_handler.addFilter(logging.Filter(\"desktopenv\"))\nsdebug_handler.addFilter(logging.Filter(\"desktopenv\"))\n\nlogger.addHandler(file_handler)\nlogger.addHandler(debug_handler)\nlogger.addHandler(stdout_handler)\nlogger.addHandler(sdebug_handler)\n#  }}} Logger Configs 
#\n\nlogger = logging.getLogger(\"desktopenv.experiment\")\n\n\ndef config() -> argparse.Namespace:\n    parser = argparse.ArgumentParser(\n        description=\"Run end-to-end evaluation on the benchmark\"\n    )\n\n    # environment config\n    parser.add_argument(\"--path_to_vm\", type=str, default=None)\n    parser.add_argument(\n        \"--provider_name\",\n        type=str,\n        default=\"vmware\",\n        help=\"Virtualization provider (vmware, docker, aws, azure, gcp, virtualbox)\",\n    )\n    parser.add_argument(\n        \"--headless\", action=\"store_true\", help=\"Run in headless machine\"\n    )\n    parser.add_argument(\n        \"--action_space\", type=str, default=\"pyautogui\", help=\"Action type\"\n    )\n    parser.add_argument(\n        \"--observation_type\",\n        choices=[\"screenshot\", \"a11y_tree\", \"screenshot_a11y_tree\", \"som\"],\n        default=\"screenshot\",\n        help=\"Observation type\",\n    )\n    parser.add_argument(\"--screen_width\", type=int, default=1920)\n    parser.add_argument(\"--screen_height\", type=int, default=1080)\n    parser.add_argument(\"--sleep_after_execution\", type=float, default=3.0)\n    parser.add_argument(\"--max_steps\", type=int, default=15)\n\n    # agent config\n    parser.add_argument(\"--max_trajectory_length\", type=int, default=3)\n    parser.add_argument(\n        \"--test_config_base_dir\", type=str, default=\"evaluation_examples\"\n    )\n\n    # lm config\n    parser.add_argument(\"--model\", type=str, default=\"gpt-4o\")\n    parser.add_argument(\"--temperature\", type=float, default=1.0)\n\n    # AgentS2 specific config\n    parser.add_argument(\"--model_provider\", type=str, default=\"openai\")\n    parser.add_argument(\n        \"--model_url\",\n        type=str,\n        default=\"\",\n        help=\"The URL of the main generation model API.\",\n    )\n    parser.add_argument(\n        \"--model_api_key\",\n        type=str,\n        default=\"\",\n        help=\"The API 
key of the main generation model.\",\n    )\n    parser.add_argument(\n        \"--model_temperature\",\n        type=float,\n        default=None,\n        help=\"Temperature to fix the generation model at (e.g. o3 can only be run with 1.0)\",\n    )\n\n    # grounding model config\n    parser.add_argument(\n        \"--ground_provider\",\n        type=str,\n        required=True,\n        help=\"The provider for the grounding model\",\n    )\n    parser.add_argument(\n        \"--ground_url\", type=str, required=True, help=\"The URL of the grounding model\"\n    )\n    parser.add_argument(\n        \"--ground_api_key\",\n        type=str,\n        default=\"\",\n        help=\"The API key of the grounding model.\",\n    )\n    parser.add_argument(\n        \"--ground_model\",\n        type=str,\n        required=True,\n        help=\"The model name for the grounding model\",\n    )\n    parser.add_argument(\n        \"--grounding_width\",\n        type=int,\n        required=True,\n        help=\"Width of screenshot image after processor rescaling\",\n    )\n    parser.add_argument(\n        \"--grounding_height\",\n        type=int,\n        required=True,\n        help=\"Height of screenshot image after processor rescaling\",\n    )\n\n    # example config\n    parser.add_argument(\"--domain\", type=str, default=\"all\")\n    parser.add_argument(\n        \"--test_all_meta_path\", type=str, default=\"evaluation_examples/test_all.json\"\n    )\n\n    # logging related\n    parser.add_argument(\"--result_dir\", type=str, default=\"./results\")\n    args = parser.parse_args()\n\n    return args\n\n\ndef test(args: argparse.Namespace, test_all_meta: dict) -> None:\n    scores = []\n    max_steps = args.max_steps\n\n    # log args\n    logger.info(\"Args: %s\", args)\n    # set wandb project\n    cfg_args = {\n        \"path_to_vm\": args.path_to_vm,\n        \"provider_name\": args.provider_name,\n        \"headless\": args.headless,\n        \"action_space\": 
args.action_space,\n        \"observation_type\": args.observation_type,\n        \"screen_width\": args.screen_width,\n        \"screen_height\": args.screen_height,\n        \"sleep_after_execution\": args.sleep_after_execution,\n        \"max_steps\": args.max_steps,\n        \"max_trajectory_length\": args.max_trajectory_length,\n        \"model\": args.model,\n        \"temperature\": args.temperature,\n        \"result_dir\": args.result_dir,\n    }\n\n    # AgentS2 configuration\n    engine_params = {\n        \"engine_type\": args.model_provider,\n        \"model\": args.model,\n        \"base_url\": getattr(args, \"model_url\", \"\"),\n        \"api_key\": getattr(args, \"model_api_key\", \"\"),\n        \"temperature\": getattr(args, \"model_temperature\", None),\n    }\n    engine_params_for_grounding = {\n        \"engine_type\": args.ground_provider,\n        \"model\": args.ground_model,\n        \"base_url\": getattr(args, \"ground_url\", \"\"),\n        \"api_key\": getattr(args, \"ground_api_key\", \"\"),\n        \"grounding_width\": args.grounding_width,\n        \"grounding_height\": args.grounding_height,\n    }\n\n    # Create grounding agent\n    grounding_agent = OSWorldACI(\n        platform=\"linux\",\n        engine_params_for_generation=engine_params,\n        engine_params_for_grounding=engine_params_for_grounding,\n        width=args.screen_width,\n        height=args.screen_height,\n    )\n\n    # Create AgentS2 worker\n    agent = AgentS2_5(\n        engine_params,\n        grounding_agent,\n        platform=\"linux\",\n    )\n\n    env = DesktopEnv(\n        provider_name=args.provider_name,\n        path_to_vm=args.path_to_vm,\n        action_space=args.action_space,\n        screen_size=(args.screen_width, args.screen_height),\n        headless=args.headless,\n        os_type=\"Ubuntu\",\n        require_a11y_tree=args.observation_type\n        in [\"a11y_tree\", \"screenshot_a11y_tree\", \"som\"],\n        enable_proxy=True,\n    
    snapshot_name=\"signed_in_state_1\",\n    )\n\n    for domain in tqdm(test_all_meta, desc=\"Domain\"):\n        for example_id in tqdm(test_all_meta[domain], desc=\"Example\", leave=False):\n            config_file = os.path.join(\n                args.test_config_base_dir, f\"examples/{domain}/{example_id}.json\"\n            )\n            with open(config_file, \"r\", encoding=\"utf-8\") as f:\n                example = json.load(f)\n\n            logger.info(f\"[Domain]: {domain}\")\n            logger.info(f\"[Example ID]: {example_id}\")\n\n            instruction = example[\"instruction\"]\n\n            logger.info(f\"[Instruction]: {instruction}\")\n            # wandb each example config settings\n            cfg_args[\"instruction\"] = instruction\n            cfg_args[\"start_time\"] = datetime.datetime.now().strftime(\n                \"%Y:%m:%d-%H:%M:%S\"\n            )\n            # run.config.update(cfg_args)\n\n            example_result_dir = os.path.join(\n                args.result_dir,\n                args.action_space,\n                args.observation_type,\n                args.model,\n                domain,\n                example_id,\n            )\n            os.makedirs(example_result_dir, exist_ok=True)\n            # example start running\n            try:\n                lib_run_single_local.run_single_example(\n                    agent,\n                    env,\n                    example,\n                    max_steps,\n                    instruction,\n                    args,\n                    example_result_dir,\n                    scores,\n                )\n            except Exception as e:\n                logger.error(f\"Exception in {domain}/{example_id}: {e}\")\n                # Only attempt to end recording if controller exists (not Docker provider)\n                if hasattr(env, \"controller\") and env.controller is not None:\n                    env.controller.end_recording(\n                      
  os.path.join(example_result_dir, \"recording.mp4\")\n                    )\n                with open(os.path.join(example_result_dir, \"traj.jsonl\"), \"a\") as f:\n                    f.write(\n                        json.dumps(\n                            {\"Error\": f\"Time limit exceeded in {domain}/{example_id}\"}\n                        )\n                    )\n                    f.write(\"\\n\")\n\n    env.close()\n    logger.info(f\"Average score: {sum(scores) / len(scores)}\")\n\n\ndef get_unfinished(\n    action_space, use_model, observation_type, result_dir, total_file_json\n):\n    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)\n\n    if not os.path.exists(target_dir):\n        return total_file_json\n\n    finished = {}\n    for domain in os.listdir(target_dir):\n        finished[domain] = []\n        domain_path = os.path.join(target_dir, domain)\n        if os.path.isdir(domain_path):\n            for example_id in os.listdir(domain_path):\n                if example_id == \"onboard\":\n                    continue\n                example_path = os.path.join(domain_path, example_id)\n                if os.path.isdir(example_path):\n                    if \"result.txt\" not in os.listdir(example_path):\n                        # empty all files under example_id\n                        for file in os.listdir(example_path):\n                            os.remove(os.path.join(example_path, file))\n                    else:\n                        finished[domain].append(example_id)\n\n    if not finished:\n        return total_file_json\n\n    for domain, examples in finished.items():\n        if domain in total_file_json:\n            total_file_json[domain] = [\n                x for x in total_file_json[domain] if x not in examples\n            ]\n\n    return total_file_json\n\n\ndef get_result(action_space, use_model, observation_type, result_dir, total_file_json):\n    target_dir = 
os.path.join(result_dir, action_space, observation_type, use_model)\n    if not os.path.exists(target_dir):\n        print(\"New experiment, no result yet.\")\n        return None\n\n    all_result = []\n\n    for domain in os.listdir(target_dir):\n        domain_path = os.path.join(target_dir, domain)\n        if os.path.isdir(domain_path):\n            for example_id in os.listdir(domain_path):\n                example_path = os.path.join(domain_path, example_id)\n                if os.path.isdir(example_path):\n                    if \"result.txt\" in os.listdir(example_path):\n                        # empty all files under example_id\n                        try:\n                            all_result.append(\n                                float(\n                                    open(\n                                        os.path.join(example_path, \"result.txt\"), \"r\"\n                                    ).read()\n                                )\n                            )\n                        except:\n                            all_result.append(0.0)\n\n    if not all_result:\n        print(\"New experiment, no result yet.\")\n        return None\n    else:\n        print(\"Current Success Rate:\", sum(all_result) / len(all_result) * 100, \"%\")\n        return all_result\n\n\nif __name__ == \"__main__\":\n    ####### The complete version of the list of examples #######\n    os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n    args = config()\n\n    # save args to json in result_dir/action_space/observation_type/model/args.json\n    path_to_args = os.path.join(\n        args.result_dir,\n        args.action_space,\n        args.observation_type,\n        args.model,\n        \"args.json\",\n    )\n    os.makedirs(os.path.dirname(path_to_args), exist_ok=True)\n    with open(path_to_args, \"w\", encoding=\"utf-8\") as f:\n        json.dump(vars(args), f, indent=4)\n\n    with open(args.test_all_meta_path, \"r\", encoding=\"utf-8\") as 
f:\n        test_all_meta = json.load(f)\n\n    if args.domain != \"all\":\n        test_all_meta = {args.domain: test_all_meta[args.domain]}\n\n    test_file_list = get_unfinished(\n        args.action_space,\n        args.model,\n        args.observation_type,\n        args.result_dir,\n        test_all_meta,\n    )\n    left_info = \"\"\n    for domain in test_file_list:\n        left_info += f\"{domain}: {len(test_file_list[domain])}\\n\"\n    logger.info(f\"Left tasks:\\n{left_info}\")\n\n    get_result(\n        args.action_space,\n        args.model,\n        args.observation_type,\n        args.result_dir,\n        test_all_meta,\n    )\n    test(args, test_file_list)\n"
  },
  {
    "path": "osworld_setup/s3/OSWorld.md",
    "content": "# Deploying Agent S3 in OSWorld\n\n# Step 1: Set up Agent S3\n\nFollow the [README.md](https://github.com/simular-ai/Agent-S/blob/main/README.md) to set up Agent S3.\n\n# Step 2: Copying Over Run Files\n\nIf you haven't already, please follow the [OSWorld environment setup](https://github.com/xlang-ai/OSWorld/blob/main/README.md). We've provided the relevant OSWorld run files for evaluation in this `osworld_setup` folder. Please copy this over to your OSWorld folder. Use `run_local.py` if you want to run locally on VMware; use `run.py` and `lib_run_single.py` if you want to run on AWS. All run commands in order are provided in `run.sh`. Copy over the files in `osworld_setup/s3/bbon` as well. \n\n# Step 3: Switch the AMI \n\nSwitch the image AMI for the AWS provider in `desktop_env/providers/aws/manager.py` to `\"ami-0b505e9d0d99ba88c\"`.\n\n# Step 4: Generating Facts\n\nAfter completing your OSWorld runs and having result directories, run `generate_facts.py` to generate fact captions for screenshot pairs:\n\n```bash\npython osworld_setup/s3/bbon/generate_facts.py \\\n  --results-dirs \\\n    results1/pyautogui/screenshot/gpt-5-2025-08-07 \\\n    results2/pyautogui/screenshot/gpt-5-2025-08-07 \\\n  --model \"gpt-5-2025-08-07\" \\\n  --engine-type \"openai\" \\\n  --temperature 1.0\n```\n\nThis will populate your result directories with `fact_captions.jsonl` files containing behavioral descriptions of screenshot differences.\n\n# Step 5: Run the Judge\n\nFinally, run `run_judge.py` to evaluate the trajectories using the generated fact captions:\n\n```bash\npython osworld_setup/s3/bbon/run_judge.py \\\n  --results-dirs \\\n    results1/pyautogui/screenshot/gpt-5-2025-08-07 \\\n    results2/pyautogui/screenshot/gpt-5-2025-08-07 \\\n  --output-dir \"judge_results\" \\\n  --examples-path \"evaluation_examples/examples\" \\\n  --model \"gpt-5-2025-08-07\" \\\n  --engine-type \"openai\" \\\n  --temperature 1.0\n```\n\nThis will:\n- Compare trajectories across different result directories\n- Use the facts to judge which trajectory performs better\n- Generate evaluation results\n- Save results to the specified output directory\n\nThe judge will create files like `BoN2.json`, `BoN3.json`, etc., showing the performance comparison as you add more trajectories.\n\n"
  },
  {
    "path": "osworld_setup/s3/bbon/generate_facts.py",
    "content": "import os\nimport json\nimport asyncio\nimport argparse\nfrom typing import List, Optional\nfrom dotenv import load_dotenv\n\nfrom gui_agents.s3.bbon.behavior_narrator import BehaviorNarrator\nfrom utils import get_new_tasks_classification\n\nload_dotenv()\n\n\nasync def generate_single_fact_caption(\n    task_dir: str,\n    screenshot_files: List[str],\n    i: int,\n    judge: BehaviorNarrator,\n    trajectory_lines: List[str],\n):\n    \"\"\"Generate a single fact caption for a screenshot pair.\"\"\"\n    before_file = os.path.join(task_dir, screenshot_files[i])\n    after_file = os.path.join(task_dir, screenshot_files[i + 1])\n\n    # Load action from trajectory data if available\n    pyautogui_action = None\n    if i < len(trajectory_lines):\n        try:\n            data = json.loads(trajectory_lines[i])\n            pyautogui_action = data.get(\"exec_code\")\n        except:\n            pass\n\n    if pyautogui_action is None:\n        raise ValueError(f\"No pyautogui action found for step {i+1}\")\n\n    # Read image bytes\n    try:\n        with open(before_file, \"rb\") as f:\n            before_bytes = f.read()\n        with open(after_file, \"rb\") as f:\n            after_bytes = f.read()\n    except Exception as e:\n        raise Exception(f\"Error reading images: {e}\")\n\n    # Generate fact caption using behavior narrator\n    result = await asyncio.to_thread(\n        judge.judge,\n        screenshot_num=i + 1,\n        before_img_bytes=before_bytes,\n        after_img_bytes=after_bytes,\n        pyautogui_action=pyautogui_action,\n    )\n    result[\"screenshot_num\"] = i + 1\n\n    return result\n\n\nasync def generate_fact_captions_parallel(\n    task_dir: str,\n    judge: BehaviorNarrator,\n    step_semaphore: Optional[asyncio.Semaphore] = None,\n):\n    \"\"\"Generate fact captions for a task directory when they don't exist (parallelized version).\"\"\"\n    print(f\"Generating fact captions for {task_dir}...\")\n\n    # Find 
all screenshot files\n    screenshot_files = []\n    for filename in os.listdir(task_dir):\n        if filename.startswith(\"step_\") and filename.endswith(\".png\"):\n            screenshot_files.append(filename)\n\n    # Sort by step number\n    def extract_step_num(filename):\n        try:\n            return int(filename.split(\"_\")[1].split(\".\")[0])\n        except:\n            return 0\n\n    screenshot_files.sort(key=extract_step_num)\n\n    if len(screenshot_files) < 2:\n        print(f\"Not enough screenshots to generate fact captions in {task_dir}\")\n        return []\n\n    # Load trajectory data once\n    trajectory_lines = []\n    trajectory_file = os.path.join(task_dir, \"traj.jsonl\")\n    if os.path.exists(trajectory_file):\n        try:\n            with open(trajectory_file, \"r\") as f:\n                trajectory_lines = f.readlines()\n        except:\n            pass\n\n    # Use shared semaphore to limit concurrent judge calls\n    if step_semaphore is None:\n        step_semaphore = asyncio.Semaphore(5)  # Default limit\n\n    async def bounded_task(task_func, *args, **kwargs):\n        async with step_semaphore:\n            return await task_func(*args, **kwargs)\n\n    try:\n        # Create bounded tasks for parallel execution\n        bounded_tasks = [\n            bounded_task(\n                generate_single_fact_caption,\n                task_dir,\n                screenshot_files,\n                i,\n                judge,\n                trajectory_lines,\n            )\n            for i in range(len(screenshot_files) - 1)\n        ]\n        results = await asyncio.gather(*bounded_tasks, return_exceptions=True)\n    except Exception as e:\n        print(f\"Error in parallel execution: {e}\")\n        return []\n\n    # Process results and save to file\n    fact_captions = []\n    successful_results = []\n    fact_captions_file = os.path.join(task_dir, \"fact_captions.jsonl\")\n\n    for i, result in enumerate(results):\n  
      if isinstance(result, Exception):\n            print(f\"Error generating fact caption for step {i+1}: {result}\")\n            continue\n        successful_results.append(result)\n        fact_caption = f\"Fact Caption from Screenshot {result['screenshot_num']}: {result['fact_answer']}\"\n        fact_captions.append(fact_caption)\n\n    # Save all results to file at once\n    if successful_results:\n        with open(fact_captions_file, \"w\") as f:\n            for result in successful_results:\n                f.write(json.dumps(result) + \"\\n\")\n\n    print(f\"Generated {len(fact_captions)} fact captions for {task_dir}\")\n    return fact_captions\n\n\nasync def main(engine_params: dict, results_dirs: List[str]):\n    \"\"\"Main function to generate fact captions for multiple task directories.\n\n    Args:\n        engine_params: Engine parameters for BehaviorNarrator\n        results_dirs: List of results directories to analyze for task classification\n    \"\"\"\n    # Get task IDs automatically using get_new_tasks_classification\n    tasks_classification = get_new_tasks_classification(results_dirs)\n    task_ids = tasks_classification[\"variance\"]\n\n    print(f\"Found {len(task_ids)} variance tasks to process\")\n    judge = BehaviorNarrator(engine_params=engine_params)\n\n    # Get concurrency settings from environment\n    per_step = int(os.getenv(\"DIFFCAP_PER_STEP_CONCURRENCY\", \"100\"))\n    per_taskdir = int(os.getenv(\"DIFFCAP_PER_TASKDIR_CONCURRENCY\", \"4\"))\n\n    # Build list of task directories to process\n    task_dirs = []\n    for task_id in task_ids:\n        domain, example_id = task_id.split(\"/\")\n\n        # Check each results directory for this task\n        for results_dir in results_dirs:\n            task_dir = os.path.join(results_dir, domain, example_id)\n\n            try:\n                if \"fact_captions.jsonl\" in os.listdir(task_dir):\n                    print(f\"Fact captions already exist for {task_dir}\")\n   
                 continue\n            except FileNotFoundError:\n                continue\n\n            task_dirs.append(task_dir)\n\n    if not task_dirs:\n        print(\"No new task directories to process.\")\n        return\n\n    print(f\"Scheduling {len(task_dirs)} task directories...\")\n\n    # Set up semaphores for concurrency control\n    shared_step_semaphore = asyncio.Semaphore(per_step)\n    taskdir_semaphore = asyncio.Semaphore(per_taskdir)\n\n    async def run_one(task_dir):\n        async with taskdir_semaphore:\n            print(f\"Processing {task_dir}\")\n            return await generate_fact_captions_parallel(\n                task_dir, judge, step_semaphore=shared_step_semaphore\n            )\n\n    # Execute all tasks in parallel\n    results = await asyncio.gather(\n        *[run_one(d) for d in task_dirs], return_exceptions=True\n    )\n\n    # Report results\n    failures = sum(1 for r in results if isinstance(r, Exception))\n    if failures:\n        print(\n            f\"Completed with {failures} failures out of {len(task_dirs)} task directories.\"\n        )\n    else:\n        print(\"Completed all task directories successfully.\")\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser(\n        description=\"Generate fact captions for OSWorld task directories\"\n    )\n    parser.add_argument(\n        \"--results-dirs\",\n        nargs=\"+\",\n        required=True,\n        help=\"List of results directories to analyze for task classification\",\n    )\n    parser.add_argument(\n        \"--model\", default=\"gpt-5-2025-08-07\", help=\"Model to use for generation\"\n    )\n    parser.add_argument(\"--engine-type\", default=\"openai\", help=\"Engine type\")\n    parser.add_argument(\n        \"--temperature\", type=float, default=1.0, help=\"Temperature for generation\"\n    )\n\n    args = parser.parse_args()\n\n    # Engine parameters\n    engine_params = {\n        \"model\": args.model,\n        
\"engine_type\": args.engine_type,\n        \"temperature\": args.temperature,\n    }\n\n    print(f\"Results directories: {args.results_dirs}\")\n    asyncio.run(main(engine_params, args.results_dirs))\n"
  },
  {
    "path": "osworld_setup/s3/bbon/run_judge.py",
    "content": "import json\nimport os\nimport asyncio\nimport argparse\nimport concurrent.futures\nfrom typing import List, Tuple, Optional\nfrom dotenv import load_dotenv\nfrom tqdm.asyncio import tqdm_asyncio\n\nload_dotenv()\n\nfrom utils import (\n    get_new_tasks_classification,\n    evaluate_comparative_results,\n    load_task_instruction,\n    load_facts,\n)\nfrom gui_agents.s3.bbon.comparative_judge import ComparativeJudge\n\n\ndef run_judge(\n    task: str, task_instruction: str, result_dirs: List[str], judge: ComparativeJudge\n) -> Tuple[str, str, Optional[str]]:\n    \"\"\"\n    Fact captions + initial/final screenshots judging.\n    Pipeline: load trajectories → load existing fact captions → include initial/final screenshots → judge.\n    \"\"\"\n    # 1. Use provided task instruction\n    # task_instruction is now a direct input parameter\n\n    # 2. Load fact captions for all trajectories\n    all_fact_captions = []\n    for result_dir in result_dirs:\n        task_dir = os.path.join(result_dir, task.split(\"/\")[0], task.split(\"/\")[1])\n        fact_captions = load_facts(task_dir)\n        all_fact_captions.append(fact_captions)\n\n    # 3. 
Use the new Judge class method\n    return judge.judge(task_instruction, task, result_dirs, all_fact_captions)\n\n\ndef evaluate_trajectories(\n    task: str, task_instruction: str, result_dirs: List[str], judge: ComparativeJudge\n) -> Tuple[str, str, dict]:\n    \"\"\"Wrapper that runs fact-only MCQ judge and returns results.\"\"\"\n    answer, thoughts, selected_trajectory = run_judge(\n        task, task_instruction, result_dirs, judge\n    )\n\n    record = {\n        \"selected_trajectory\": selected_trajectory,\n        \"answer\": answer,\n        \"thoughts\": thoughts,\n    }\n\n    print(f\"✅ Added task {task} (MCQ fact-only)\")\n    return answer, thoughts, record\n\n\nasyncio.get_event_loop().set_default_executor(\n    concurrent.futures.ThreadPoolExecutor(max_workers=100)\n)\n\n\nasync def run_async(\n    task: str, task_instruction: str, result_dirs: List[str], judge: ComparativeJudge\n):\n    \"\"\"Async wrapper for fact-only MCQ evaluation.\"\"\"\n    return await asyncio.to_thread(\n        evaluate_trajectories,\n        task=task,\n        task_instruction=task_instruction,\n        result_dirs=result_dirs,\n        judge=judge,\n    )\n\n\nasync def evaluate_and_save(\n    result_dirs: List[str],\n    output_file_path: str,\n    examples_path: str,\n    engine_params: dict,\n):\n    \"\"\"Main evaluation function that processes tasks and saves results.\"\"\"\n    res = get_new_tasks_classification(results_dirs=result_dirs)\n    for key in res:\n        print(f\"{key}: {res[key]}\")\n    optimal, minimum, expected_value = (\n        res[\"optimal\"],\n        res[\"minimum\"],\n        res[\"expected_value\"],\n    )\n    print(f\"optimal score: {optimal}, minimum score: {minimum}\")\n\n    variance = res[\"variance\"]\n\n    judge = ComparativeJudge(engine_params=engine_params)\n\n    # Load existing results\n    if os.path.exists(output_file_path):\n        with open(output_file_path, \"r\", encoding=\"utf-8\") as f:\n            try:\n         
       data = json.load(f)\n                if not isinstance(data, dict):\n                    data = {}\n            except json.JSONDecodeError:\n                data = {}\n    else:\n        data = {}\n\n    # Prepare async tasks only for tasks not yet in data\n    tasks = []\n    task_names = []\n    for task in variance:\n        if str(task) in data:\n            print(f\"⚠️ Task {task} already exists in results — skipping.\")\n            continue\n\n        # Load task instruction from examples path\n        task_instruction = load_task_instruction(task, examples_path)\n        if task_instruction is None:\n            print(f\"⚠️ No task instruction found for {task}, skipping...\")\n            continue\n\n        tasks.append(run_async(task, task_instruction, result_dirs, judge))\n        task_names.append(task)\n\n    # Run only new tasks\n    results = await tqdm_asyncio.gather(*tasks)\n    # Merge into existing results\n    for task, (ans, thoughts, record) in zip(task_names, results):\n        data[str(task)] = record\n\n    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)\n    with open(output_file_path, \"w\") as f:\n        json.dump(data, f, indent=2)\n\n    res = evaluate_comparative_results(result_dirs, json_path=output_file_path)\n    gain, maximum_gain = res\n    data[\"score\"] = {\n        \"optimal\": optimal,\n        \"minimum\": minimum,\n        \"expected_value\": expected_value,\n        \"res\": res,\n        \"actual score\": minimum + gain,\n    }\n    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)\n    with open(output_file_path, \"w\") as f:\n        json.dump(data, f, indent=2)\n\n    return results\n\n\nasync def run_experiment(\n    shuffled_runs: List[str],\n    output_dir: str,\n    examples_path: str,\n    engine_params: dict,\n    start_round: int = 2,\n    max_rounds: int = None,\n):\n    \"\"\"\n    Run fact-only experiments progressively: start_round vs start_round+1, etc.\n    \"\"\"\n    
if max_rounds is None:\n        max_rounds = len(shuffled_runs)\n\n    os.makedirs(output_dir, exist_ok=True)\n\n    for i in range(start_round, max_rounds + 1):  # start at start_round (default 2)\n        test_dirs = shuffled_runs[:i]\n        output_file_path = os.path.join(output_dir, f\"BoN{i}.json\")\n\n        print(f\"Running fact-only experiment with {i} dirs → {output_file_path}\")\n        await evaluate_and_save(\n            test_dirs, output_file_path, examples_path, engine_params\n        )\n\n\nasync def main(\n    shuffled_runs: List[str] = None,\n    output_dir: str = None,\n    examples_path: str = None,\n    engine_params: dict = None,\n    start_round: int = 2,\n    max_rounds: int = None,\n):\n    \"\"\"Main function to run fact-only judge experiments.\n\n    Args:\n        shuffled_runs: List of result directory paths to compare\n        output_dir: Directory to save results\n        examples_path: Path to examples directory containing task instructions\n        engine_params: Engine parameters for the judge\n        start_round: Starting round number (default: 2)\n        max_rounds: Maximum number of rounds to run (default: len(shuffled_runs))\n    \"\"\"\n    if shuffled_runs is None:\n        print(\"Error: shuffled_runs must be provided\")\n        return\n\n    if output_dir is None:\n        print(\"Error: output_dir must be provided\")\n        return\n\n    if examples_path is None:\n        print(\"Error: examples_path must be provided\")\n        return\n\n    if engine_params is None:\n        print(\"Error: engine_params must be provided\")\n        return\n\n    await run_experiment(\n        shuffled_runs, output_dir, examples_path, engine_params, start_round, max_rounds\n    )\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser(\n        description=\"Run fact-only judge experiments on OSWorld task directories\"\n    )\n    parser.add_argument(\n        \"--results-dirs\",\n        nargs=\"+\",\n        
required=True,\n        help=\"List of results directories to analyze\",\n    )\n    parser.add_argument(\"--output-dir\", required=True, help=\"Directory to save results\")\n    parser.add_argument(\n        \"--examples-path\",\n        required=True,\n        help=\"Path to examples directory containing task instructions\",\n    )\n    parser.add_argument(\n        \"--start-round\", type=int, default=2, help=\"Starting round number (default: 2)\"\n    )\n    parser.add_argument(\n        \"--max-rounds\",\n        type=int,\n        default=None,\n        help=\"Maximum number of rounds to run (default: len(results_dirs))\",\n    )\n    parser.add_argument(\n        \"--model\", default=\"gpt-5-2025-08-07\", help=\"Model to use for judging\"\n    )\n    parser.add_argument(\"--engine-type\", default=\"openai\", help=\"Engine type\")\n    parser.add_argument(\n        \"--temperature\", type=float, default=1.0, help=\"Temperature for generation\"\n    )\n\n    args = parser.parse_args()\n\n    # Engine parameters\n    engine_params = {\n        \"model\": args.model,\n        \"engine_type\": args.engine_type,\n        \"temperature\": args.temperature,\n    }\n\n    print(f\"Results directories: {args.results_dirs}\")\n    print(f\"Output directory: {args.output_dir}\")\n    print(f\"Examples path: {args.examples_path}\")\n    print(f\"Start round: {args.start_round}\")\n    print(f\"Max rounds: {args.max_rounds}\")\n    print(f\"Engine params: {engine_params}\")\n\n    # Run fact-only evaluation\n    asyncio.run(\n        main(\n            shuffled_runs=args.results_dirs,\n            output_dir=args.output_dir,\n            examples_path=args.examples_path,\n            engine_params=engine_params,\n            start_round=args.start_round,\n            max_rounds=args.max_rounds,\n        )\n    )\n"
  },
  {
    "path": "osworld_setup/s3/bbon/utils.py",
    "content": "import logging\nimport os\nimport re\nimport json\nfrom PIL import Image\nfrom typing import Optional, List\nimport base64\n\n\ndef image_to_openai_message_format(\n    image_path: str, caption: str = None\n) -> Optional[dict]:\n    \"\"\"Convert an image file to OpenAI message format.\"\"\"\n    if not os.path.exists(image_path):\n        print(f\"Image file not found: {image_path}\")\n        return None\n\n    try:\n        with open(image_path, \"rb\") as f:\n            image_bytes = f.read()\n\n        if not image_bytes:\n            print(f\"Empty image file: {image_path}\")\n            return None\n\n        base64_image = base64.b64encode(image_bytes).decode(\"utf-8\")\n\n        if not base64_image:\n            print(f\"Failed to encode image to base64: {image_path}\")\n            return None\n\n        content = []\n        if caption:\n            content.append({\"type\": \"text\", \"text\": caption})\n\n        content.append(\n            {\n                \"type\": \"image_url\",\n                \"image_url\": {\"url\": f\"data:image/png;base64,{base64_image}\"},\n            }\n        )\n\n        return {\"role\": \"user\", \"content\": content}\n\n    except Exception as e:\n        print(f\"Error processing image {image_path}: {e}\")\n        return None\n\n\ndef load_facts(task_dir: str) -> List[str]:\n    \"\"\"Load existing facts from facts.jsonl file.\"\"\"\n    fact_captions_file = os.path.join(task_dir, \"fact_captions.jsonl\")\n\n    if not os.path.exists(fact_captions_file):\n        print(f\"fact_captions.jsonl not found at {fact_captions_file}\")\n        return []\n\n    fact_captions = []\n    with open(fact_captions_file, \"r\") as f:\n        for line in f:\n            if line.strip():\n                data = json.loads(line)\n                if \"fact_answer\" in data:\n                    fact_captions.append(data[\"fact_answer\"])\n\n    return fact_captions\n\n\ndef load_task_instruction(task: str, 
examples_path: str) -> Optional[str]:\n    \"\"\"\n    Load task instruction from examples path.\n\n    Args:\n        task: Task ID in format \"domain/example_id\"\n        examples_path: Path to the examples directory (e.g., \"/home/ubuntu/Simular/OSWorld/evaluation_examples/examples\")\n\n    Returns:\n        Task instruction string or None if not found\n    \"\"\"\n    domain, example_id = task.split(\"/\", 1)\n\n    # Construct path to the JSON file\n    json_file_path = os.path.join(examples_path, domain, f\"{example_id}.json\")\n\n    if not os.path.exists(json_file_path):\n        logging.warning(f\"Example file not found: {json_file_path}\")\n        return None\n\n    try:\n        with open(json_file_path, \"r\", encoding=\"utf-8\") as f:\n            data = json.load(f)\n\n        # Extract instruction from the JSON\n        if \"instruction\" in data:\n            instruction = data[\"instruction\"]\n            if instruction and instruction.strip():\n                return instruction.strip()\n\n        logging.warning(f\"No 'instruction' key found in {json_file_path}\")\n        return None\n\n    except Exception as e:\n        logging.warning(f\"Error reading example file {json_file_path}: {e}\")\n        return None\n\n\ndef get_final_screenshot_file(result_dir: str) -> str:\n    \"\"\"\n    Finds the screenshot file with the largest valid step index in the given directory.\n    Works with filenames like step_0.png, step_1_20250.png, step-2.png, etc.\n    Only considers .png files (case-insensitive).\n    If the highest index file is invalid/corrupted, it tries the next lower index.\n    Returns None if no valid matching files are found.\n    \"\"\"\n    # First, collect all valid step files with their indices\n    step_files = {}\n    pattern = re.compile(r\"step[_\\-]?(\\d+)\", re.IGNORECASE)\n\n    for fname in os.listdir(result_dir):\n        if not fname.lower().endswith(\".png\"):\n            continue\n        match = 
pattern.match(fname)\n        if match:\n            idx = int(match.group(1))\n            step_files[idx] = fname\n    if not step_files:\n        return None\n    # Sort indices in descending order (highest first)\n    sorted_indices = sorted(step_files.keys(), reverse=True)\n    # Try each file from highest to lowest index\n    for idx in sorted_indices:\n        fname = step_files[idx]\n        file_path = os.path.join(result_dir, fname)\n        # Check if file exists and is valid\n        if os.path.exists(file_path) and is_valid_image(file_path):\n            return fname\n        else:\n            print(\n                f\"Invalid or corrupted image at step {idx}: {fname}, trying previous step...\"\n            )\n    return None\n\n\ndef is_valid_image(file_path: str) -> bool:\n    \"\"\"\n    Check if an image file is valid by trying to open it with PIL.\n    Also checks if file is not empty.\n    \"\"\"\n    try:\n        # Check file size first (quick check)\n        if os.path.getsize(file_path) == 0:\n            return False\n\n        # Try to open and verify the image\n        with Image.open(file_path) as img:\n            img.verify()  # This will raise an exception if image is corrupted\n            return True\n    except Exception as e:\n        print(f\"Image validation failed for {file_path}: {e}\")\n        return False\n\n\ndef get_new_tasks_classification(results_dirs: [str]):\n    # Step 1: collect domain/task_ids for each trajectory\n    tasks_per_dir = []\n    for results_dir in results_dirs:\n        domain_tasks = set()\n        for domain in os.listdir(results_dir):\n            domain_dir = os.path.join(results_dir, domain)\n            if not os.path.isdir(domain_dir):\n                continue\n            for task_id in os.listdir(domain_dir):\n                task_dir = os.path.join(domain_dir, task_id)\n                if os.path.isdir(task_dir):\n                    domain_tasks.add(f\"{domain}/{task_id}\")\n        
tasks_per_dir.append(domain_tasks)\n\n    # Step 2: find tasks common to all trajectories\n    common_tasks = set.intersection(*tasks_per_dir)\n\n    constant_tasks = []\n    variance_tasks = []\n    constant_tasks_scores = []\n    optimal_sum = 0.0\n    expected_value = 0.0\n\n    # Step 3: evaluate each common task\n    for domain_task in sorted(common_tasks):\n        domain, task_id = domain_task.split(\"/\", 1)\n        results = []\n        for results_dir in results_dirs:\n            task_dir = os.path.join(results_dir, domain, task_id)\n            result_file = os.path.join(task_dir, \"result.txt\")\n            if os.path.isfile(result_file):\n                with open(result_file, \"r\") as f:\n                    try:\n                        val = float(f.read().strip())\n                        results.append(val)\n                    except ValueError:\n                        continue\n\n        if not results:  # skip if no valid results\n            logging.warning(f\"No valid results for {domain_task}\")\n            continue\n\n        # classification\n        if all(r == results[0] for r in results):\n            constant_tasks.append(domain_task)\n            constant_tasks_scores.append(results[0])\n        else:\n            variance_tasks.append(domain_task)\n\n        # accumulate min/optimal\n        # minimum_sum += min(results) #We incorrectly also counted the minimum sum of variance tasks, we should not do this\n        optimal_sum += max(results)\n        expected_value += sum(results) / len(results)\n\n    return {\n        \"constant\": constant_tasks,  # We dont evaluate constant tasks\n        \"variance\": variance_tasks,  # We evaluate variance tasks\n        \"minimum\": sum(\n            constant_tasks_scores\n        ),  # sum of constant tasks scores (easy + hard)\n        \"optimal\": optimal_sum,  # If we get the best score, we get the optimal score\n        \"expected_value\": expected_value,  # If we get the average 
score across all tasks for all trajectories, we get the expected value\n    }\n\n\ndef check_selected_trajectory(results_dirs: [str], selected_trajectory: str, task: str):\n    \"\"\"\n    results_dirs: list of directories in format results_dir/<domain>/<task_id>\n    selected_trajectory: the path of the selected trajectory\n    task: string in format \"<domain>/<task_id>\"\n\n    Returns (selected_val, optimal_val)\n    \"\"\"\n    domain, task_id = task.split(\"/\")\n    all_results = []\n\n    if not any(\n        os.path.commonpath([os.path.abspath(selected_trajectory), os.path.abspath(rd)])\n        == os.path.abspath(rd)\n        for rd in results_dirs\n    ):\n        return None, None\n\n    for rd in results_dirs:\n        result_file = os.path.join(rd, domain, task_id, \"result.txt\")\n        if os.path.isfile(result_file):\n            try:\n                all_results.append(float(open(result_file).read().strip()))\n            except ValueError:\n                pass\n\n    selected_file = os.path.join(selected_trajectory, domain, task_id, \"result.txt\")\n    if not os.path.isfile(selected_file):\n        return None, max(all_results) if all_results else None\n\n    try:\n        selected_val = float(open(selected_file).read().strip())\n    except ValueError:\n        return None, max(all_results) if all_results else None\n\n    optimal_val = max(all_results) if all_results else selected_val\n    return selected_val, optimal_val\n\n\ndef evaluate_comparative_results(results_dirs: [str], json_path: str = None):\n    \"\"\"\n    Opens comparative_judge_results.json (default) or a given path,\n    evaluates each task, and returns results.\n\n    Args:\n        results_dirs: list of result directories\n        json_path: optional path to comparative_judge_results.json\n\n    Returns:\n        dict mapping task -> {\"selected_val\": float or None, \"optimal_val\": float or None}\n    \"\"\"\n    judge_score = 0\n    optimal_score = 0\n    if json_path is 
None:\n        json_path = \"comparative_judge_results.json\"\n\n    with open(json_path, \"r\") as f:\n        data = json.load(f)\n\n    results = {}\n    for task, info in data.items():\n        selected_trajectory = info.get(\"selected_trajectory\")\n        if selected_trajectory:\n            selected_val, optimal_val = check_selected_trajectory(\n                results_dirs, selected_trajectory, task\n            )\n            if selected_val is not None and optimal_val is not None:\n                print(\n                    f\"task: {task}, selected_val: {selected_val}, optimal_val: {optimal_val}\"\n                )\n                judge_score += selected_val\n                optimal_score += optimal_val\n    return judge_score, optimal_score\n"
  },
  {
    "path": "osworld_setup/s3/lib_run_single.py",
    "content": "import datetime\nimport json\nimport logging\nimport os\nimport time\nfrom typing import *\nfrom wrapt_timeout_decorator import *\n\nlogger = logging.getLogger(\"desktopenv.experiment\")\n\n\ndef run_single_example(\n    agent, env, example, max_steps, instruction, args, example_result_dir, scores\n):\n    runtime_logger = setup_logger(example, example_result_dir)\n    try:\n        agent.reset(runtime_logger)\n    except Exception as e:\n        agent.reset()\n\n    env.reset(task_config=example)\n    time.sleep(60)  # Wait for the environment to be ready\n    obs = env._get_obs()  # Get the initial observation\n\n    with open(os.path.join(example_result_dir, f\"step_0.png\"), \"wb\") as _f:\n        _f.write(obs[\"screenshot\"])\n\n    with open(\n        os.path.join(example_result_dir, \"instruction.txt\"), \"w\", encoding=\"utf-8\"\n    ) as f:\n        f.write(instruction)\n\n    done = False\n    step_idx = 0\n    # env.controller.start_recording()\n    while not done and step_idx < max_steps:\n        response, actions = agent.predict(instruction, obs)\n        for action in actions:\n            action_timestamp = datetime.datetime.now().strftime(\"%Y%m%d@%H%M%S\")\n            logger.info(\"Step %d: %s\", step_idx + 1, action)\n            obs, reward, done, info = env.step(action, args.sleep_after_execution)\n\n            logger.info(\"Reward: %.2f\", reward)\n            logger.info(\"Done: %s\", done)\n            # Save screenshot and trajectory information\n            with open(\n                os.path.join(\n                    example_result_dir, f\"step_{step_idx + 1}_{action_timestamp}.png\"\n                ),\n                \"wb\",\n            ) as _f:\n                _f.write(obs[\"screenshot\"])\n\n            response.update(\n                {\n                    \"step_num\": step_idx + 1,\n                    \"action_timestamp\": action_timestamp,\n                    \"action\": action,\n                    
\"reward\": reward,\n                    \"done\": done,\n                    \"info\": info,\n                    \"screenshot_file\": f\"step_{step_idx + 1}_{action_timestamp}.png\",\n                }\n            )\n            with open(\n                os.path.join(example_result_dir, \"traj.jsonl\"), \"a\", encoding=\"utf-8\"\n            ) as f:\n                f.write(json.dumps(response, ensure_ascii=False))\n                f.write(\"\\n\")\n            if done:\n                logger.info(\"The episode is done.\")\n                break\n        step_idx += 1\n    result = env.evaluate()\n    logger.info(\"Result: %.2f\", result)\n    scores.append(result)\n    with open(\n        os.path.join(example_result_dir, \"result.txt\"), \"w\", encoding=\"utf-8\"\n    ) as f:\n        f.write(f\"{result}\\n\")\n    # env.controller.end_recording(os.path.join(example_result_dir, \"recording.mp4\"))\n\n\ndef setup_logger(example, example_result_dir):\n    runtime_logger = logging.getLogger(f\"desktopenv.example.{example['id']}\")\n    runtime_logger.setLevel(logging.DEBUG)\n    runtime_logger.addHandler(\n        logging.FileHandler(os.path.join(example_result_dir, \"runtime.log\"))\n    )\n    return runtime_logger\n"
  },
  {
    "path": "osworld_setup/s3/run.py",
    "content": "\"\"\"OSWorld's run.py with AgentS2.\"\"\"\n\n\"\"\"Script to run end-to-end evaluation on the benchmark.\nUtils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py.\n\"\"\"\n\nimport argparse\nimport datetime\nimport json\nimport logging\nimport os\nimport sys\nimport signal\nimport time\nfrom multiprocessing import Process, Manager, current_process, Queue\n\n\nimport lib_run_single\nfrom desktop_env.desktop_env import DesktopEnv\n\nfrom dotenv import load_dotenv\n\nload_dotenv()\n\n\n#  Logger Configs {{{ #\nlogger = logging.getLogger()\nlogger.setLevel(logging.DEBUG)\n\ndatetime_str: str = datetime.datetime.now().strftime(\"%Y%m%d@%H%M%S\")\n\nstdout_handler = logging.StreamHandler(sys.stdout)\n\nstdout_handler.setLevel(logging.INFO)\n\nformatter = logging.Formatter(\n    fmt=\"\\x1b[1;33m[%(asctime)s \\x1b[31m%(levelname)s \\x1b[32m%(module)s/%(lineno)d-%(processName)s\\x1b[1;33m] \\x1b[0m%(message)s\"\n)\n\nstdout_handler.setFormatter(formatter)\n\nstdout_handler.addFilter(logging.Filter(\"desktopenv\"))\n\nlogger.addHandler(stdout_handler)\n#  }}} Logger Configs #\n\nlogger = logging.getLogger(\"desktopenv.experiment\")\n\n\n# Global variables for signal handling\nactive_environments = []\nprocesses = []\nis_terminating = False\n\n\ndef distribute_tasks(test_all_meta: dict) -> list:\n    all_tasks = []\n    for domain, examples in test_all_meta.items():\n        for example_id in examples:\n            all_tasks.append((domain, example_id))\n    return all_tasks\n\n\ndef process_signal_handler(signum, frame, env_idx):\n    logger.info(f\"Process {env_idx + 1} received signal {signum}. 
Shutting down...\")\n    local_vars = frame.f_locals\n    active_environments = local_vars.get(\"active_environments\", [])\n    for env in active_environments:\n        if env is not None:\n            try:\n                logger.info(f\"Process {env_idx + 1} closing environment...\")\n                env.close()\n                logger.info(f\"Process {env_idx + 1} environment closed successfully\")\n            except Exception as e:\n                logger.error(f\"Process {env_idx + 1} error closing environment: {e}\")\n    logger.info(f\"Process {env_idx + 1} shutdown complete. Exiting.\")\n    sys.exit(0)\n\n\ndef run_env_tasks(\n    task_queue: Queue,\n    args: argparse.Namespace,\n    shared_scores: list,\n    engine_params,\n    engine_params_for_grounding,\n):\n    active_environments = []\n    env = None\n    try:\n        # Use IMAGE_ID_MAP for AWS provider to get snapshot_name\n        snapshot_name = None\n        region = getattr(args, \"region\", None)\n        if args.provider_name == \"aws\" and region is not None:\n            try:\n                from desktop_env.providers.aws.manager import IMAGE_ID_MAP\n\n                screen_size = (args.screen_width, args.screen_height)\n                snapshot_name = IMAGE_ID_MAP[region].get(\n                    screen_size, IMAGE_ID_MAP[region][(1920, 1080)]\n                )\n            except Exception as e:\n                logger.error(f\"Failed to get snapshot_name from IMAGE_ID_MAP: {e}\")\n                snapshot_name = None\n        from gui_agents.s3.agents.agent_s import AgentS3\n        from gui_agents.s3.agents.grounding import OSWorldACI\n\n        env = DesktopEnv(\n            path_to_vm=args.path_to_vm,\n            action_space=args.action_space,\n            provider_name=args.provider_name,\n            region=region,\n            snapshot_name=snapshot_name,\n            screen_size=(args.screen_width, args.screen_height),\n            headless=args.headless,\n            
os_type=\"Ubuntu\",\n            require_a11y_tree=args.observation_type\n            in [\"a11y_tree\", \"screenshot_a11y_tree\", \"som\"],\n            enable_proxy=True,\n            client_password=getattr(args, \"client_password\", \"\"),\n        )\n        grounding_agent = OSWorldACI(\n            env=env,\n            platform=\"linux\",\n            engine_params_for_generation=engine_params,\n            engine_params_for_grounding=engine_params_for_grounding,\n            width=args.screen_width,\n            height=args.screen_height,\n        )\n        agent = AgentS3(\n            engine_params,\n            grounding_agent,\n            platform=\"linux\",\n        )\n\n        active_environments.append(env)\n        logger.info(f\"Process {current_process().name} started.\")\n        while True:\n            try:\n                item = task_queue.get(timeout=5)\n            except Exception:\n                break\n            domain, example_id = item\n            try:\n                config_file = os.path.join(\n                    args.test_config_base_dir, f\"examples/{domain}/{example_id}.json\"\n                )\n                with open(config_file, \"r\", encoding=\"utf-8\") as f:\n                    example = json.load(f)\n                instruction = example[\"instruction\"]\n                example_result_dir = os.path.join(\n                    args.result_dir,\n                    args.action_space,\n                    args.observation_type,\n                    args.model,\n                    domain,\n                    example_id,\n                )\n                os.makedirs(example_result_dir, exist_ok=True)\n                logger.info(f\"[{current_process().name}][Domain]: {domain}\")\n                logger.info(f\"[{current_process().name}][Example ID]: {example_id}\")\n                logger.info(f\"[{current_process().name}][Instruction]: {instruction}\")\n                try:\n                    
lib_run_single.run_single_example(\n                        agent,\n                        env,\n                        example,\n                        args.max_steps,\n                        instruction,\n                        args,\n                        example_result_dir,\n                        shared_scores,\n                    )\n                except Exception as e:\n                    import traceback\n\n                    logger.error(\n                        f\"Exception in {current_process().name} {domain}/{example_id}: {e}\"\n                    )\n                    logger.error(traceback.format_exc())\n                    try:\n                        env.controller.end_recording(\n                            os.path.join(example_result_dir, \"recording.mp4\")\n                        )\n                    except Exception as rec_e:\n                        logger.error(f\"Failed to end recording: {rec_e}\")\n                    with open(os.path.join(example_result_dir, \"traj.jsonl\"), \"a\") as f:\n                        f.write(json.dumps({\"Error\": f\"{domain}/{example_id} - {e}\"}))\n                        f.write(\"\\n\")\n            except Exception as e:\n                logger.error(f\"Task-level error in {current_process().name}: {e}\")\n                import traceback\n\n                logger.error(traceback.format_exc())\n    except Exception as e:\n        logger.error(f\"Process-level error in {current_process().name}: {e}\")\n        import traceback\n\n        logger.error(traceback.format_exc())\n    finally:\n        logger.info(f\"{current_process().name} cleaning up environment...\")\n        try:\n            if env:\n                env.close()\n                logger.info(f\"{current_process().name} environment closed successfully\")\n        except Exception as e:\n            logger.error(\n                f\"{current_process().name} error during environment cleanup: {e}\"\n            )\n\n\ndef 
signal_handler(signum, frame):\n    global is_terminating, active_environments, processes\n    if is_terminating:\n        return\n    is_terminating = True\n    logger.info(f\"Received signal {signum}. Gracefully shutting down...\")\n    for env in active_environments:\n        try:\n            logger.info(f\"Closing environment...\")\n            env.close()\n            logger.info(f\"Environment closed successfully\")\n        except Exception as e:\n            logger.error(f\"Error closing environment: {e}\")\n    for p in processes:\n        if p.is_alive():\n            try:\n                logger.info(f\"Sending termination signal to process {p.name}...\")\n                p.terminate()\n            except Exception as e:\n                logger.error(f\"Error sending termination signal to process: {e}\")\n    time.sleep(1)\n    for p in processes:\n        if p.is_alive():\n            try:\n                logger.info(f\"Forcefully terminating process {p.name}...\")\n                import signal as sig\n\n                os.kill(p.pid, sig.SIGKILL)\n            except Exception as e:\n                logger.error(f\"Error forcefully terminating process: {e}\")\n    logger.info(\"Shutdown complete. 
Exiting.\")\n    sys.exit(0)\n\n\ndef config() -> argparse.Namespace:\n    parser = argparse.ArgumentParser(\n        description=\"Run end-to-end evaluation on the benchmark\"\n    )\n\n    # environment config\n    parser.add_argument(\"--path_to_vm\", type=str, default=None)\n    parser.add_argument(\n        \"--provider_name\",\n        type=str,\n        default=\"vmware\",\n        help=\"Virtualization provider (vmware, docker, aws, azure, gcp, virtualbox)\",\n    )\n    parser.add_argument(\n        \"--headless\", action=\"store_true\", help=\"Run in headless machine\"\n    )\n    parser.add_argument(\n        \"--action_space\", type=str, default=\"pyautogui\", help=\"Action type\"\n    )\n    parser.add_argument(\n        \"--observation_type\",\n        choices=[\"screenshot\", \"a11y_tree\", \"screenshot_a11y_tree\", \"som\"],\n        default=\"screenshot\",\n        help=\"Observation type\",\n    )\n    parser.add_argument(\n        \"--num_envs\",\n        type=int,\n        default=1,\n        help=\"Number of environments to run in parallel\",\n    )\n    parser.add_argument(\"--screen_width\", type=int, default=1920)\n    parser.add_argument(\"--screen_height\", type=int, default=1080)\n    parser.add_argument(\"--sleep_after_execution\", type=float, default=1.0)\n    parser.add_argument(\"--max_steps\", type=int, default=15)\n\n    parser.add_argument(\"--domain\", type=str, default=\"all\")\n    parser.add_argument(\n        \"--test_all_meta_path\", type=str, default=\"evaluation_examples/test_all.json\"\n    )\n    parser.add_argument(\n        \"--test_config_base_dir\", type=str, default=\"evaluation_examples\"\n    )\n    parser.add_argument(\"--result_dir\", type=str, default=\"./results\")\n\n    parser.add_argument(\n        \"--region\", type=str, default=\"us-east-1\", help=\"AWS region for the VM\"\n    )\n    parser.add_argument(\n        \"--client_password\", type=str, default=\"\", help=\"Client password\"\n    )\n\n    # agent 
config\n    parser.add_argument(\"--max_trajectory_length\", type=int, default=8)\n\n    # lm config\n    parser.add_argument(\"--model_provider\", type=str, default=\"openai\")\n    parser.add_argument(\"--model\", type=str, default=\"gpt-4o\")\n    parser.add_argument(\n        \"--model_url\",\n        type=str,\n        default=\"\",\n        help=\"The URL of the main generation model API.\",\n    )\n    parser.add_argument(\n        \"--model_api_key\",\n        type=str,\n        default=\"\",\n        help=\"The API key of the main generation model.\",\n    )\n    parser.add_argument(\n        \"--model_temperature\",\n        type=float,\n        default=None,\n        help=\"Temperature to fix the generation model at (e.g. o3 can only be run with 1.0)\",\n    )\n\n    # grounding model config\n    parser.add_argument(\n        \"--ground_provider\",\n        type=str,\n        required=True,\n        help=\"The provider for the grounding model\",\n    )\n    parser.add_argument(\n        \"--ground_url\", type=str, required=True, help=\"The URL of the grounding model\"\n    )\n    parser.add_argument(\n        \"--ground_api_key\",\n        type=str,\n        default=\"\",\n        help=\"The API key of the grounding model.\",\n    )\n    parser.add_argument(\n        \"--ground_model\",\n        type=str,\n        required=True,\n        help=\"The model name for the grounding model\",\n    )\n    parser.add_argument(\n        \"--grounding_width\",\n        type=int,\n        required=True,\n        help=\"Width of screenshot image after processor rescaling\",\n    )\n    parser.add_argument(\n        \"--grounding_height\",\n        type=int,\n        required=True,\n        help=\"Height of screenshot image after processor rescaling\",\n    )\n\n    args = parser.parse_args()\n\n    return args\n\n\ndef test(args: argparse.Namespace, test_all_meta: dict) -> None:\n    global processes\n    logger.info(\"Args: %s\", args)\n    all_tasks = 
distribute_tasks(test_all_meta)\n    logger.info(f\"Total tasks: {len(all_tasks)}\")\n\n    engine_params = {\n        \"engine_type\": args.model_provider,\n        \"model\": args.model,\n        \"base_url\": getattr(args, \"model_url\", \"\"),\n        \"api_key\": getattr(args, \"model_api_key\", \"\"),\n        \"temperature\": getattr(args, \"model_temperature\", None),\n    }\n    engine_params_for_grounding = {\n        \"engine_type\": args.ground_provider,\n        \"model\": args.ground_model,\n        \"base_url\": getattr(args, \"ground_url\", \"\"),\n        \"api_key\": getattr(args, \"ground_api_key\", \"\"),\n        \"grounding_width\": args.grounding_width,\n        \"grounding_height\": args.grounding_height,\n    }\n\n    with Manager() as manager:\n        shared_scores = manager.list()\n        task_queue = manager.Queue()\n        for item in all_tasks:\n            task_queue.put(item)\n        num_envs = args.num_envs\n        processes = []\n        for i in range(num_envs):\n            p = Process(\n                target=run_env_tasks,\n                args=(\n                    task_queue,\n                    args,\n                    shared_scores,\n                    engine_params,\n                    engine_params_for_grounding,\n                ),\n                name=f\"EnvProcess-{i+1}\",\n            )\n            p.daemon = True\n            p.start()\n            processes.append(p)\n            logger.info(f\"Started process {p.name} with PID {p.pid}\")\n        try:\n            while True:\n                alive_count = 0\n                for idx, p in enumerate(processes):\n                    if not p.is_alive():\n                        logger.warning(f\"Process {p.name} died, restarting...\")\n                        new_p = Process(\n                            target=run_env_tasks,\n                            args=(\n                                task_queue,\n                                args,\n         
                       shared_scores,\n                                engine_params,\n                                engine_params_for_grounding,\n                            ),\n                            name=f\"EnvProcess-Restart-{idx+1}\",\n                        )\n                        new_p.daemon = True\n                        new_p.start()\n                        processes[idx] = new_p\n                        logger.info(\n                            f\"Restarted process {new_p.name} with PID {new_p.pid}\"\n                        )\n                    else:\n                        alive_count += 1\n                if task_queue.empty():\n                    logger.info(\"All tasks finished.\")\n                    break\n                if alive_count == 0:\n                    logger.error(\"All processes died, exiting.\")\n                    break\n                time.sleep(5)\n            for p in processes:\n                p.join()\n        except KeyboardInterrupt:\n            logger.info(\n                \"Main process received KeyboardInterrupt. 
Initiating graceful shutdown...\"\n            )\n            raise\n        except Exception as e:\n            logger.error(\n                f\"Unexpected error while waiting for processes: {e}\", exc_info=True\n            )\n            for p in processes:\n                if p.is_alive():\n                    try:\n                        logger.info(f\"Terminating process {p.name} due to error...\")\n                        p.terminate()\n                    except Exception as term_e:\n                        logger.error(f\"Error terminating process {p.name}: {term_e}\")\n            raise\n        scores = list(shared_scores)\n    logger.info(f\"Average score: {sum(scores) / len(scores) if scores else 0}\")\n\n\ndef get_unfinished(\n    action_space, use_model, observation_type, result_dir, total_file_json\n):\n    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)\n\n    if not os.path.exists(target_dir):\n        return total_file_json\n\n    finished = {}\n    for domain in os.listdir(target_dir):\n        finished[domain] = []\n        domain_path = os.path.join(target_dir, domain)\n        if os.path.isdir(domain_path):\n            for example_id in os.listdir(domain_path):\n                if example_id == \"onboard\":\n                    continue\n                example_path = os.path.join(domain_path, example_id)\n                if os.path.isdir(example_path):\n                    if \"result.txt\" not in os.listdir(example_path):\n                        # empty all files under example_id\n                        for file in os.listdir(example_path):\n                            os.remove(os.path.join(example_path, file))\n                    else:\n                        finished[domain].append(example_id)\n\n    if not finished:\n        return total_file_json\n\n    for domain, examples in finished.items():\n        if domain in total_file_json:\n            total_file_json[domain] = [\n                x for 
x in total_file_json[domain] if x not in examples\n            ]\n\n    return total_file_json\n\n\ndef get_result(action_space, use_model, observation_type, result_dir, total_file_json):\n    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)\n    if not os.path.exists(target_dir):\n        print(\"New experiment, no result yet.\")\n        return None\n\n    all_result = []\n\n    for domain in os.listdir(target_dir):\n        domain_path = os.path.join(target_dir, domain)\n        if os.path.isdir(domain_path):\n            for example_id in os.listdir(domain_path):\n                example_path = os.path.join(domain_path, example_id)\n                if os.path.isdir(example_path):\n                    if \"result.txt\" in os.listdir(example_path):\n                        # empty all files under example_id\n                        try:\n                            all_result.append(\n                                float(\n                                    open(\n                                        os.path.join(example_path, \"result.txt\"), \"r\"\n                                    ).read()\n                                )\n                            )\n                        except:\n                            all_result.append(0.0)\n\n    if not all_result:\n        print(\"New experiment, no result yet.\")\n        return None\n    else:\n        print(\"Current Success Rate:\", sum(all_result) / len(all_result) * 100, \"%\")\n        return all_result\n\n\nif __name__ == \"__main__\":\n    signal.signal(signal.SIGINT, signal_handler)\n    signal.signal(signal.SIGTERM, signal_handler)\n    ####### The complete version of the list of examples #######\n    os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n    args = config()\n\n    # save args to json in result_dir/action_space/observation_type/model/args.json\n    path_to_args = os.path.join(\n        args.result_dir,\n        args.action_space,\n        
args.observation_type,\n        args.model,\n        \"args.json\",\n    )\n    os.makedirs(os.path.dirname(path_to_args), exist_ok=True)\n    with open(path_to_args, \"w\", encoding=\"utf-8\") as f:\n        json.dump(vars(args), f, indent=4)\n\n    with open(args.test_all_meta_path, \"r\", encoding=\"utf-8\") as f:\n        test_all_meta = json.load(f)\n\n    if args.domain != \"all\":\n        test_all_meta = {args.domain: test_all_meta[args.domain]}\n\n    test_file_list = get_unfinished(\n        args.action_space,\n        args.model,\n        args.observation_type,\n        args.result_dir,\n        test_all_meta,\n    )\n    left_info = \"\"\n    for domain in test_file_list:\n        left_info += f\"{domain}: {len(test_file_list[domain])}\\n\"\n    logger.info(f\"Left tasks:\\n{left_info}\")\n\n    get_result(\n        args.action_space,\n        args.model,\n        args.observation_type,\n        args.result_dir,\n        test_all_meta,\n    )\n    test(args, test_file_list)\n"
  },
  {
    "path": "osworld_setup/s3/run.sh",
    "content": "# Step 1: Complete 2 or more rollouts on either AWS or locally.\n# Use a separate --result_dir for each rollout (e.g. results1, results2); Steps 2 and 3 read one directory per rollout.\npython run.py \\\n  --provider_name \"aws\" \\\n  --headless \\\n  --num_envs 10 \\\n  --max_steps 100 \\\n  --domain \"all\" \\\n  --test_all_meta_path evaluation_examples/test_nogdrive.json \\\n  --result_dir \"results\" \\\n  --region \"us-east-1\" \\\n  --model_provider \"openai\" \\\n  --model \"gpt-5-2025-08-07\" \\\n  --model_temperature 1.0 \\\n  --ground_provider \"huggingface\" \\\n  --ground_url \"<YOUR_HUGGINGFACE_ENDPOINT_URL>/v1\" \\\n  --grounding_width 1920 \\\n  --grounding_height 1080 \\\n  --sleep_after_execution 3\n\npython run_local.py \\\n  --path_to_vm \"/Users/user/OSWorld/vmware_vm_data/Ubuntu0/Ubuntu0.vmx\" \\\n  --provider_name \"vmware\" \\\n  --headless \\\n  --max_steps 100 \\\n  --domain \"all\" \\\n  --test_all_meta_path evaluation_examples/test_nogdrive.json \\\n  --result_dir \"results\" \\\n  --model_provider \"openai\" \\\n  --model \"gpt-5-2025-08-07\" \\\n  --model_temperature 1.0 \\\n  --ground_provider \"huggingface\" \\\n  --ground_url \"<YOUR_HUGGINGFACE_ENDPOINT_URL>/v1\" \\\n  --ground_model \"<YOUR_GROUNDING_MODEL_NAME>\" \\\n  --grounding_width 1920 \\\n  --grounding_height 1080\n\n# Step 2: Generate Facts\npython generate_facts.py \\\n  --results-dirs \\\n    results1/pyautogui/screenshot/gpt-5-2025-08-07 \\\n    results2/pyautogui/screenshot/gpt-5-2025-08-07 \\\n  --model \"gpt-5-2025-08-07\" \\\n  --engine-type \"openai\" \\\n  --temperature 1.0\n\n# Step 3: Run the Judge. Make sure the order of the results-dirs is the same as the order above.\npython run_judge.py \\\n  --results-dirs \\\n    results1/pyautogui/screenshot/gpt-5-2025-08-07 \\\n    results2/pyautogui/screenshot/gpt-5-2025-08-07 \\\n  --output-dir \"judge_results\" \\\n  --examples-path \"evaluation_examples/examples\" \\\n  --model \"gpt-5-2025-08-07\" \\\n  --engine-type \"openai\" \\\n  --temperature 1.0"
  },
  {
    "path": "osworld_setup/s3/run_local.py",
    "content": "\"\"\"Script to run end-to-end evaluation on the benchmark.\nUtils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py.\n\"\"\"\n\nimport argparse\nimport datetime\nimport json\nimport logging\nimport os\nimport sys\n\nfrom tqdm import tqdm\n\nimport lib_run_single\nfrom desktop_env.desktop_env import DesktopEnv\nfrom gui_agents.s3.agents.agent_s import AgentS3\nfrom gui_agents.s3.agents.grounding import OSWorldACI\n\nfrom dotenv import load_dotenv\n\nload_dotenv()\n\n# Almost deprecated since it's not multi-env, use run_multienv_*.py instead\n\n#  Logger Configs {{{ #\nlogger = logging.getLogger()\nlogger.setLevel(logging.DEBUG)\n\ndatetime_str: str = datetime.datetime.now().strftime(\"%Y%m%d@%H%M%S\")\n\nfile_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"normal-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\ndebug_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"debug-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\nstdout_handler = logging.StreamHandler(sys.stdout)\nsdebug_handler = logging.FileHandler(\n    os.path.join(\"logs\", \"sdebug-{:}.log\".format(datetime_str)), encoding=\"utf-8\"\n)\n\nfile_handler.setLevel(logging.INFO)\ndebug_handler.setLevel(logging.DEBUG)\nstdout_handler.setLevel(logging.INFO)\nsdebug_handler.setLevel(logging.DEBUG)\n\nformatter = logging.Formatter(\n    fmt=\"\\x1b[1;33m[%(asctime)s \\x1b[31m%(levelname)s \\x1b[32m%(module)s/%(lineno)d-%(processName)s\\x1b[1;33m] \\x1b[0m%(message)s\"\n)\nfile_handler.setFormatter(formatter)\ndebug_handler.setFormatter(formatter)\nstdout_handler.setFormatter(formatter)\nsdebug_handler.setFormatter(formatter)\n\nstdout_handler.addFilter(logging.Filter(\"desktopenv\"))\nsdebug_handler.addFilter(logging.Filter(\"desktopenv\"))\n\nlogger.addHandler(file_handler)\nlogger.addHandler(debug_handler)\nlogger.addHandler(stdout_handler)\nlogger.addHandler(sdebug_handler)\n#  }}} Logger Configs #\n\nlogger = 
logging.getLogger(\"desktopenv.experiment\")\n\n\ndef config() -> argparse.Namespace:\n    parser = argparse.ArgumentParser(\n        description=\"Run end-to-end evaluation on the benchmark\"\n    )\n\n    # environment config\n    parser.add_argument(\"--path_to_vm\", type=str, default=None)\n    parser.add_argument(\n        \"--provider_name\",\n        type=str,\n        default=\"vmware\",\n        help=\"Virtualization provider (vmware, docker, aws, azure, gcp, virtualbox)\",\n    )\n    parser.add_argument(\n        \"--headless\", action=\"store_true\", help=\"Run in headless machine\"\n    )\n    parser.add_argument(\n        \"--action_space\", type=str, default=\"pyautogui\", help=\"Action type\"\n    )\n    parser.add_argument(\n        \"--observation_type\",\n        choices=[\"screenshot\", \"a11y_tree\", \"screenshot_a11y_tree\", \"som\"],\n        default=\"screenshot\",\n        help=\"Observation type\",\n    )\n    parser.add_argument(\"--screen_width\", type=int, default=1920)\n    parser.add_argument(\"--screen_height\", type=int, default=1080)\n    parser.add_argument(\"--sleep_after_execution\", type=float, default=3.0)\n    parser.add_argument(\"--max_steps\", type=int, default=15)\n\n    # agent config\n    parser.add_argument(\"--max_trajectory_length\", type=int, default=3)\n    parser.add_argument(\n        \"--test_config_base_dir\", type=str, default=\"evaluation_examples\"\n    )\n\n    # lm config\n    parser.add_argument(\"--model\", type=str, default=\"gpt-4o\")\n    parser.add_argument(\"--temperature\", type=float, default=1.0)\n\n    # AgentS2 specific config\n    parser.add_argument(\"--model_provider\", type=str, default=\"openai\")\n    parser.add_argument(\n        \"--model_url\",\n        type=str,\n        default=\"\",\n        help=\"The URL of the main generation model API.\",\n    )\n    parser.add_argument(\n        \"--model_api_key\",\n        type=str,\n        default=\"\",\n        help=\"The API key of the 
main generation model.\",\n    )\n    parser.add_argument(\n        \"--model_temperature\",\n        type=float,\n        default=None,\n        help=\"Temperature to fix the generation model at (e.g. o3 can only be run with 1.0)\",\n    )\n\n    # grounding model config\n    parser.add_argument(\n        \"--ground_provider\",\n        type=str,\n        required=True,\n        help=\"The provider for the grounding model\",\n    )\n    parser.add_argument(\n        \"--ground_url\", type=str, required=True, help=\"The URL of the grounding model\"\n    )\n    parser.add_argument(\n        \"--ground_api_key\",\n        type=str,\n        default=\"\",\n        help=\"The API key of the grounding model.\",\n    )\n    parser.add_argument(\n        \"--ground_model\",\n        type=str,\n        required=True,\n        help=\"The model name for the grounding model\",\n    )\n    parser.add_argument(\n        \"--grounding_width\",\n        type=int,\n        required=True,\n        help=\"Width of screenshot image after processor rescaling\",\n    )\n    parser.add_argument(\n        \"--grounding_height\",\n        type=int,\n        required=True,\n        help=\"Height of screenshot image after processor rescaling\",\n    )\n\n    # example config\n    parser.add_argument(\"--domain\", type=str, default=\"all\")\n    parser.add_argument(\n        \"--test_all_meta_path\", type=str, default=\"evaluation_examples/test_all.json\"\n    )\n\n    # logging related\n    parser.add_argument(\"--result_dir\", type=str, default=\"./results\")\n    args = parser.parse_args()\n\n    return args\n\n\ndef test(args: argparse.Namespace, test_all_meta: dict) -> None:\n    scores = []\n    max_steps = args.max_steps\n\n    # log args\n    logger.info(\"Args: %s\", args)\n    # set wandb project\n    cfg_args = {\n        \"path_to_vm\": args.path_to_vm,\n        \"provider_name\": args.provider_name,\n        \"headless\": args.headless,\n        \"action_space\": 
args.action_space,\n        \"observation_type\": args.observation_type,\n        \"screen_width\": args.screen_width,\n        \"screen_height\": args.screen_height,\n        \"sleep_after_execution\": args.sleep_after_execution,\n        \"max_steps\": args.max_steps,\n        \"max_trajectory_length\": args.max_trajectory_length,\n        \"model\": args.model,\n        \"temperature\": args.temperature,\n        \"result_dir\": args.result_dir,\n    }\n\n    # AgentS2 configuration\n    engine_params = {\n        \"engine_type\": args.model_provider,\n        \"model\": args.model,\n        \"base_url\": getattr(args, \"model_url\", \"\"),\n        \"api_key\": getattr(args, \"model_api_key\", \"\"),\n        \"temperature\": getattr(args, \"model_temperature\", None),\n    }\n    engine_params_for_grounding = {\n        \"engine_type\": args.ground_provider,\n        \"model\": args.ground_model,\n        \"base_url\": getattr(args, \"ground_url\", \"\"),\n        \"api_key\": getattr(args, \"ground_api_key\", \"\"),\n        \"grounding_width\": args.grounding_width,\n        \"grounding_height\": args.grounding_height,\n    }\n\n    env = DesktopEnv(\n        provider_name=args.provider_name,\n        path_to_vm=args.path_to_vm,\n        action_space=args.action_space,\n        screen_size=(args.screen_width, args.screen_height),\n        headless=args.headless,\n        os_type=\"Ubuntu\",\n        require_a11y_tree=args.observation_type\n        in [\"a11y_tree\", \"screenshot_a11y_tree\", \"som\"],\n        enable_proxy=True,\n    )\n\n    grounding_agent = OSWorldACI(\n        env=env,\n        platform=\"linux\",\n        engine_params_for_generation=engine_params,\n        engine_params_for_grounding=engine_params_for_grounding,\n        width=args.screen_width,\n        height=args.screen_height,\n    )\n    agent = AgentS3(\n        engine_params,\n        grounding_agent,\n        platform=\"linux\",\n    )\n\n    for domain in tqdm(test_all_meta, 
desc=\"Domain\"):\n        for example_id in tqdm(test_all_meta[domain], desc=\"Example\", leave=False):\n            config_file = os.path.join(\n                args.test_config_base_dir, f\"examples/{domain}/{example_id}.json\"\n            )\n            with open(config_file, \"r\", encoding=\"utf-8\") as f:\n                example = json.load(f)\n\n            logger.info(f\"[Domain]: {domain}\")\n            logger.info(f\"[Example ID]: {example_id}\")\n\n            instruction = example[\"instruction\"]\n\n            logger.info(f\"[Instruction]: {instruction}\")\n            # wandb each example config settings\n            cfg_args[\"instruction\"] = instruction\n            cfg_args[\"start_time\"] = datetime.datetime.now().strftime(\n                \"%Y:%m:%d-%H:%M:%S\"\n            )\n            # run.config.update(cfg_args)\n\n            example_result_dir = os.path.join(\n                args.result_dir,\n                args.action_space,\n                args.observation_type,\n                args.model,\n                domain,\n                example_id,\n            )\n            os.makedirs(example_result_dir, exist_ok=True)\n            # example start running\n            try:\n                lib_run_single.run_single_example(\n                    agent,\n                    env,\n                    example,\n                    max_steps,\n                    instruction,\n                    args,\n                    example_result_dir,\n                    scores,\n                )\n            except Exception as e:\n                logger.error(f\"Exception in {domain}/{example_id}: {e}\")\n                # Only attempt to end recording if controller exists (not Docker provider)\n                if hasattr(env, \"controller\") and env.controller is not None:\n                    env.controller.end_recording(\n                        os.path.join(example_result_dir, \"recording.mp4\")\n                    )\n                
with open(os.path.join(example_result_dir, \"traj.jsonl\"), \"a\") as f:\n                    f.write(\n                        json.dumps(\n                            {\"Error\": f\"Time limit exceeded in {domain}/{example_id}\"}\n                        )\n                    )\n                    f.write(\"\\n\")\n\n    env.close()\n    logger.info(f\"Average score: {sum(scores) / len(scores)}\")\n\n\ndef get_unfinished(\n    action_space, use_model, observation_type, result_dir, total_file_json\n):\n    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)\n\n    if not os.path.exists(target_dir):\n        return total_file_json\n\n    finished = {}\n    for domain in os.listdir(target_dir):\n        finished[domain] = []\n        domain_path = os.path.join(target_dir, domain)\n        if os.path.isdir(domain_path):\n            for example_id in os.listdir(domain_path):\n                if example_id == \"onboard\":\n                    continue\n                example_path = os.path.join(domain_path, example_id)\n                if os.path.isdir(example_path):\n                    if \"result.txt\" not in os.listdir(example_path):\n                        # empty all files under example_id\n                        for file in os.listdir(example_path):\n                            os.remove(os.path.join(example_path, file))\n                    else:\n                        finished[domain].append(example_id)\n\n    if not finished:\n        return total_file_json\n\n    for domain, examples in finished.items():\n        if domain in total_file_json:\n            total_file_json[domain] = [\n                x for x in total_file_json[domain] if x not in examples\n            ]\n\n    return total_file_json\n\n\ndef get_result(action_space, use_model, observation_type, result_dir, total_file_json):\n    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)\n    if not os.path.exists(target_dir):\n        
print(\"New experiment, no result yet.\")\n        return None\n\n    all_result = []\n\n    for domain in os.listdir(target_dir):\n        domain_path = os.path.join(target_dir, domain)\n        if os.path.isdir(domain_path):\n            for example_id in os.listdir(domain_path):\n                example_path = os.path.join(domain_path, example_id)\n                if os.path.isdir(example_path):\n                    if \"result.txt\" in os.listdir(example_path):\n                        # empty all files under example_id\n                        try:\n                            all_result.append(\n                                float(\n                                    open(\n                                        os.path.join(example_path, \"result.txt\"), \"r\"\n                                    ).read()\n                                )\n                            )\n                        except:\n                            all_result.append(0.0)\n\n    if not all_result:\n        print(\"New experiment, no result yet.\")\n        return None\n    else:\n        print(\"Current Success Rate:\", sum(all_result) / len(all_result) * 100, \"%\")\n        return all_result\n\n\nif __name__ == \"__main__\":\n    ####### The complete version of the list of examples #######\n    os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n    args = config()\n\n    # save args to json in result_dir/action_space/observation_type/model/args.json\n    path_to_args = os.path.join(\n        args.result_dir,\n        args.action_space,\n        args.observation_type,\n        args.model,\n        \"args.json\",\n    )\n    os.makedirs(os.path.dirname(path_to_args), exist_ok=True)\n    with open(path_to_args, \"w\", encoding=\"utf-8\") as f:\n        json.dump(vars(args), f, indent=4)\n\n    with open(args.test_all_meta_path, \"r\", encoding=\"utf-8\") as f:\n        test_all_meta = json.load(f)\n\n    if args.domain != \"all\":\n        test_all_meta = {args.domain: 
test_all_meta[args.domain]}\n\n    test_file_list = get_unfinished(\n        args.action_space,\n        args.model,\n        args.observation_type,\n        args.result_dir,\n        test_all_meta,\n    )\n    left_info = \"\"\n    for domain in test_file_list:\n        left_info += f\"{domain}: {len(test_file_list[domain])}\\n\"\n    logger.info(f\"Left tasks:\\n{left_info}\")\n\n    get_result(\n        args.action_space,\n        args.model,\n        args.observation_type,\n        args.result_dir,\n        test_all_meta,\n    )\n    test(args, test_file_list)\n"
  },
  {
    "path": "requirements.txt",
    "content": "numpy\nbackoff\npandas\nopenai\nanthropic\nfastapi\nuvicorn\npaddleocr\npaddlepaddle\ntogether\nscikit-learn\nwebsockets\ntiktoken\npyautogui\ntoml\nblack\npytesseract\ngoogle-genai\n\n# Platform-specific dependencies\npyobjc; platform_system == \"Darwin\"\npywinauto; platform_system == \"Windows\"\npywin32; platform_system == \"Windows\" "
  },
  {
    "path": "setup.py",
    "content": "from setuptools import find_packages, setup\n\nsetup(\n    name=\"gui-agents\",\n    version=\"0.3.2\",\n    description=\"A library for creating general purpose GUI agents using multimodal LLMs.\",\n    long_description=open(\"README.md\", encoding=\"utf-8\").read(),\n    long_description_content_type=\"text/markdown\",\n    author=\"Simular AI\",\n    author_email=\"eric@simular.ai\",\n    packages=find_packages(),\n    install_requires=[\n        \"numpy\",\n        \"backoff\",\n        \"pandas\",\n        \"openai\",\n        \"anthropic\",\n        \"fastapi\",\n        \"uvicorn\",\n        \"paddleocr\",\n        \"paddlepaddle\",\n        \"together\",\n        \"scikit-learn\",\n        \"websockets\",\n        \"tiktoken\",\n        \"selenium\",\n        'pyobjc; platform_system == \"Darwin\"',\n        \"pyautogui\",\n        \"toml\",\n        \"pytesseract\",\n        \"google-genai\",\n        'pywinauto; platform_system == \"Windows\"',  # Only for Windows\n        'pywin32; platform_system == \"Windows\"',  # Only for Windows\n    ],\n    extras_require={\"dev\": [\"black\"]},  # Code formatter for linting\n    entry_points={\n        \"console_scripts\": [\n            \"agent_s=gui_agents.s3.cli_app:main\",\n        ],\n    },\n    classifiers=[\n        \"Programming Language :: Python :: 3\",\n        \"Programming Language :: Python :: 3.9\",\n        \"License :: OSI Approved :: Apache Software License\",\n        \"Operating System :: Microsoft :: Windows\",\n        \"Operating System :: POSIX :: Linux\",\n        \"Operating System :: MacOS :: MacOS X\",\n        \"Topic :: Scientific/Engineering :: Artificial Intelligence\",\n    ],\n    keywords=\"ai, llm, gui, agent, multimodal\",\n    project_urls={\n        \"Source\": \"https://github.com/simular-ai/Agent-S\",\n        \"Bug Reports\": \"https://github.com/simular-ai/Agent-S/issues\",\n    },\n    python_requires=\">=3.9, <3.13\",\n)\n"
  }
]