Repository: abi/screenshot-to-code Branch: main Commit: aaaa838548c3 Files: 228 Total size: 793.7 KB Directory structure: gitextract_l7qpd28l/ ├── .claude/ │ └── launch.json ├── .gitattributes ├── .github/ │ ├── FUNDING.yml │ └── ISSUE_TEMPLATE/ │ ├── bug_report.md │ ├── custom.md │ └── feature_request.md ├── .gitignore ├── .vscode/ │ └── settings.json ├── AGENTS.md ├── CLAUDE.md ├── Evaluation.md ├── LICENSE ├── README.md ├── TESTING.md ├── Troubleshooting.md ├── backend/ │ ├── .gitignore │ ├── .pre-commit-config.yaml │ ├── Dockerfile │ ├── README.md │ ├── agent/ │ │ ├── engine.py │ │ ├── providers/ │ │ │ ├── __init__.py │ │ │ ├── anthropic/ │ │ │ │ ├── __init__.py │ │ │ │ ├── image.py │ │ │ │ └── provider.py │ │ │ ├── base.py │ │ │ ├── factory.py │ │ │ ├── gemini.py │ │ │ ├── openai.py │ │ │ ├── pricing.py │ │ │ ├── token_usage.py │ │ │ └── types.py │ │ ├── runner.py │ │ ├── state.py │ │ └── tools/ │ │ ├── __init__.py │ │ ├── definitions.py │ │ ├── parsing.py │ │ ├── runtime.py │ │ ├── summaries.py │ │ └── types.py │ ├── codegen/ │ │ ├── __init__.py │ │ ├── test_utils.py │ │ └── utils.py │ ├── config.py │ ├── custom_types.py │ ├── debug/ │ │ ├── DebugFileWriter.py │ │ └── __init__.py │ ├── evals/ │ │ ├── __init__.py │ │ ├── config.py │ │ ├── core.py │ │ ├── runner.py │ │ └── utils.py │ ├── fs_logging/ │ │ ├── __init__.py │ │ ├── openai_input_compare.py │ │ ├── openai_input_formatting.py │ │ └── openai_turn_inputs.py │ ├── image_generation/ │ │ ├── __init__.py │ │ ├── core.py │ │ ├── generation.py │ │ └── replicate.py │ ├── llm.py │ ├── main.py │ ├── prompts/ │ │ ├── __init__.py │ │ ├── create/ │ │ │ ├── __init__.py │ │ │ ├── image.py │ │ │ ├── text.py │ │ │ └── video.py │ │ ├── message_builder.py │ │ ├── pipeline.py │ │ ├── plan.py │ │ ├── policies.py │ │ ├── prompt_types.py │ │ ├── request_parsing.py │ │ ├── system_prompt.py │ │ └── update/ │ │ ├── __init__.py │ │ ├── from_file_snapshot.py │ │ └── from_history.py │ ├── pyproject.toml │ ├── pyrightconfig.json │ 
├── pytest.ini │ ├── routes/ │ │ ├── evals.py │ │ ├── generate_code.py │ │ ├── home.py │ │ ├── model_choice_sets.py │ │ └── screenshot.py │ ├── run_evals.py │ ├── run_image_generation_evals.py │ ├── start.py │ ├── tests/ │ │ ├── __init__.py │ │ ├── test_agent_tool_runtime.py │ │ ├── test_agent_tools.py │ │ ├── test_batching.py │ │ ├── test_codegen_utils.py │ │ ├── test_evals_openai_input_compare.py │ │ ├── test_image_generation_replicate.py │ │ ├── test_model_selection.py │ │ ├── test_openai_input_compare.py │ │ ├── test_openai_provider_session.py │ │ ├── test_openai_reasoning_parser.py │ │ ├── test_openai_turn_input_logging.py │ │ ├── test_parameter_extraction_stage.py │ │ ├── test_prompt_summary.py │ │ ├── test_prompts.py │ │ ├── test_request_parsing.py │ │ ├── test_screenshot.py │ │ ├── test_status_broadcast.py │ │ └── test_token_usage.py │ ├── utils.py │ ├── video/ │ │ ├── __init__.py │ │ ├── cost_estimation.py │ │ └── utils.py │ └── ws/ │ ├── __init__.py │ └── constants.py ├── blog/ │ └── evaluating-claude.md ├── design-docs/ │ ├── agent-tool-calling-flow.md │ ├── agentic-runner-refactor.md │ ├── commits-and-variants.md │ ├── general.md │ ├── images-in-update-history.md │ ├── prompt-history-refactor.md │ └── variant-system.md ├── docker-compose.yml ├── frontend/ │ ├── .eslintrc.cjs │ ├── .gitignore │ ├── Dockerfile │ ├── components.json │ ├── index.html │ ├── jest.config.js │ ├── package.json │ ├── postcss.config.js │ ├── src/ │ │ ├── App.tsx │ │ ├── components/ │ │ │ ├── ImageLightbox.tsx │ │ │ ├── ImageUpload.tsx │ │ │ ├── ImportCodeSection.tsx │ │ │ ├── TermsOfServiceDialog.tsx │ │ │ ├── UpdateImageUpload.tsx │ │ │ ├── agent/ │ │ │ │ └── AgentActivity.tsx │ │ │ ├── commits/ │ │ │ │ ├── types.ts │ │ │ │ └── utils.ts │ │ │ ├── core/ │ │ │ │ ├── KeyboardShortcutBadge.tsx │ │ │ │ ├── Spinner.tsx │ │ │ │ ├── StackLabel.tsx │ │ │ │ └── WorkingPulse.tsx │ │ │ ├── evals/ │ │ │ │ ├── AllEvalsPage.tsx │ │ │ │ ├── BestOfNEvalsPage.tsx │ │ │ │ ├── EvalNavigation.tsx │ 
│ │ │ ├── EvalsPage.tsx │ │ │ │ ├── InputFileSelector.tsx │ │ │ │ ├── OpenAIInputComparePage.tsx │ │ │ │ ├── PairwiseEvalsPage.tsx │ │ │ │ ├── RatingPicker.tsx │ │ │ │ └── RunEvalsPage.tsx │ │ │ ├── generate-from-text/ │ │ │ │ └── GenerateFromText.tsx │ │ │ ├── history/ │ │ │ │ ├── HistoryDisplay.tsx │ │ │ │ ├── utils.test.ts │ │ │ │ └── utils.ts │ │ │ ├── messages/ │ │ │ │ ├── OnboardingNote.tsx │ │ │ │ ├── PicoBadge.tsx │ │ │ │ └── TipLink.tsx │ │ │ ├── preview/ │ │ │ │ ├── CodeMirror.tsx │ │ │ │ ├── CodePreview.tsx │ │ │ │ ├── CodeTab.tsx │ │ │ │ ├── PreviewComponent.tsx │ │ │ │ ├── PreviewPane.tsx │ │ │ │ ├── download.ts │ │ │ │ ├── extractHtml.ts │ │ │ │ └── simpleHash.ts │ │ │ ├── recording/ │ │ │ │ ├── ScreenRecorder.tsx │ │ │ │ └── utils.ts │ │ │ ├── select-and-edit/ │ │ │ │ └── utils.ts │ │ │ ├── settings/ │ │ │ │ ├── GenerationSettings.tsx │ │ │ │ ├── OutputSettingsSection.tsx │ │ │ │ └── SettingsTab.tsx │ │ │ ├── sidebar/ │ │ │ │ ├── IconStrip.tsx │ │ │ │ └── Sidebar.tsx │ │ │ ├── start-pane/ │ │ │ │ └── StartPane.tsx │ │ │ ├── thinking/ │ │ │ │ └── ThinkingIndicator.tsx │ │ │ ├── ui/ │ │ │ │ ├── accordion.tsx │ │ │ │ ├── alert-dialog.tsx │ │ │ │ ├── badge.tsx │ │ │ │ ├── button.tsx │ │ │ │ ├── checkbox.tsx │ │ │ │ ├── collapsible.tsx │ │ │ │ ├── dialog.tsx │ │ │ │ ├── hover-card.tsx │ │ │ │ ├── input.tsx │ │ │ │ ├── label.tsx │ │ │ │ ├── popover.tsx │ │ │ │ ├── progress.tsx │ │ │ │ ├── scroll-area.tsx │ │ │ │ ├── select.tsx │ │ │ │ ├── separator.tsx │ │ │ │ ├── switch.tsx │ │ │ │ ├── tabs.tsx │ │ │ │ └── textarea.tsx │ │ │ ├── unified-input/ │ │ │ │ ├── UnifiedInputPane.tsx │ │ │ │ └── tabs/ │ │ │ │ ├── ImportTab.tsx │ │ │ │ ├── TextTab.tsx │ │ │ │ ├── UploadTab.tsx │ │ │ │ └── UrlTab.tsx │ │ │ └── variants/ │ │ │ └── Variants.tsx │ │ ├── config.ts │ │ ├── constants.ts │ │ ├── generateCode.ts │ │ ├── hooks/ │ │ │ ├── useBrowserTabIndicator.ts │ │ │ ├── usePersistedState.ts │ │ │ └── useThrottle.ts │ │ ├── index.css │ │ ├── lib/ │ │ │ ├── models.ts │ │ │ 
├── prompt-history.test.ts │ │ │ ├── prompt-history.ts │ │ │ ├── stacks.ts │ │ │ ├── takeScreenshot.ts │ │ │ └── utils.ts │ │ ├── main.tsx │ │ ├── setupTests.ts │ │ ├── store/ │ │ │ ├── app-store.ts │ │ │ └── project-store.ts │ │ ├── tests/ │ │ │ ├── fixtures/ │ │ │ │ └── simple_page.html │ │ │ └── qa.test.ts │ │ ├── types.ts │ │ ├── urls.ts │ │ └── vite-env.d.ts │ ├── tailwind.config.js │ ├── tsconfig.json │ ├── tsconfig.node.json │ └── vite.config.ts ├── package.json └── plan.md ================================================ FILE CONTENTS ================================================ ================================================ FILE: .claude/launch.json ================================================ { "version": "0.0.1", "configurations": [ { "name": "frontend", "runtimeExecutable": "yarn", "runtimeArgs": ["dev"], "port": 5173, "cwd": "frontend" } ] } ================================================ FILE: .gitattributes ================================================ # Auto detect text files and perform LF normalization * text=auto ================================================ FILE: .github/FUNDING.yml ================================================ github: [abi] ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: Bug report about: Create a report to help us improve title: '' labels: '' assignees: '' --- **Describe the bug** A clear and concise description of what the bug is. **To Reproduce** Steps to reproduce the behavior: 1. Go to '...' 2. Click on '....' 3. Scroll down to '....' 4. See error **Screenshots of backend AND frontend terminal logs** If applicable, add screenshots to help explain your problem. ================================================ FILE: .github/ISSUE_TEMPLATE/custom.md ================================================ --- name: Custom issue template about: Describe this issue template's purpose here. 
title: '' labels: '' assignees: '' --- ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.md ================================================ --- name: Feature request about: Suggest an idea for this project title: '' labels: '' assignees: '' --- **Is your feature request related to a problem? Please describe.** A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] **Describe the solution you'd like** A clear and concise description of what you want to happen. **Describe alternatives you've considered** A clear and concise description of any alternative solutions or features you've considered. **Additional context** Add any other context or screenshots about the feature request here. ================================================ FILE: .gitignore ================================================ .aider* node_modules # Project-related files # Run logs backend/run_logs/* # Weird Docker setup related files backend/backend/* # Env vars frontend/.env.local .env # Mac files .DS_Store #Rodney .rodney ================================================ FILE: .vscode/settings.json ================================================ { "python.analysis.typeCheckingMode": "strict", "python.analysis.extraPaths": ["./backend"], "python.autoComplete.extraPaths": ["./backend"] } ================================================ FILE: AGENTS.md ================================================ # Project Agent Instructions Python environment: - Always use the backend Poetry virtualenv (`backend-py3.10`) for Python commands. - Preferred invocation: `cd backend && poetry run `. - If you need to activate directly, use Poetry to discover it in the current environment: - `cd backend && poetry env activate` (then run the `source .../bin/activate` command it prints) Testing policy: - Always run backend tests after every code change: `cd backend && poetry run pytest`. 
- Always run type checking after every code change: `cd backend && poetry run pyright`. - Type checking policy: no new warnings in changed files (`pyright`). ## Frontend - Frontend: `cd frontend && yarn lint` If changes touch both, run both sets. ## Prompt formatting - Prefer triple-quoted strings (`"""..."""`) for multi-line prompt text. - For interpolated multi-line prompts, prefer a single triple-quoted f-string over concatenated string fragments. # Hosted The hosted version is on the `hosted` branch. The `hosted` branch connects to a saas backend, which is a separate codebase at ../screenshot-to-code-saas ================================================ FILE: CLAUDE.md ================================================ # Project Agent Instructions Python environment: - Always use the backend Poetry virtualenv (`backend-py3.10`) for Python commands. - Preferred invocation: `cd backend && poetry run `. - If you need to activate directly, use Poetry to discover it in the current environment: - `cd backend && poetry env activate` (then run the `source .../bin/activate` command it prints) Testing policy: - Always run backend tests after every code change: `cd backend && poetry run pytest`. - Always run type checking after every code change: `cd backend && poetry run pyright`. - Type checking policy: no new warnings in changed files (`pyright`). ## Frontend - Frontend: `cd frontend && yarn lint` If changes touch both, run both sets. ## Prompt formatting - Prefer triple-quoted strings (`"""..."""`) for multi-line prompt text. - For interpolated multi-line prompts, prefer a single triple-quoted f-string over concatenated string fragments. # Hosted The hosted version is on the `hosted` branch.
The `hosted` branch connects to a saas backend, which is a separate codebase at ../screenshot-to-code-saas ================================================ FILE: Evaluation.md ================================================ ## Evaluating models and prompts Evaluation dataset consists of 16 screenshots. A Python script for running screenshot-to-code on the dataset and a UI for rating outputs is included. With this set up, we can compare and evaluate various models and prompts. ### Running evals - Input screenshots should be located at `backend/evals_data/inputs` and the outputs will be `backend/evals_data/outputs`. If you want to modify this, modify `EVALS_DIR` in `backend/evals/config.py`. You can download the input screenshot dataset here: TODO. - Set a stack and model (`STACK` var, `MODEL` var) in `backend/run_evals.py` - Run `OPENAI_API_KEY=sk-... python run_evals.py` - this runs the screenshot-to-code on the input dataset in parallel but it will still take a few minutes to complete. - Once the script is done, you can find the outputs in `backend/evals_data/outputs`. ### Rating evals In order to view and rate the outputs, visit your front-end at `/evals`. - Rate each output on a scale of 1-4 - You can also print the page as PDF to share your results with others. Generally, I run three tests for each model/prompt + stack combo and take the average score out of those tests to evaluate.
================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2023 Abi Raja Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # screenshot-to-code A simple tool to convert screenshots, mockups and Figma designs into clean, functional code using AI. Now supporting Gemini 3 and Claude Opus 4.5! https://github.com/user-attachments/assets/85b911c0-efea-4957-badb-daa97ec402ad Supported stacks: - HTML + Tailwind - HTML + CSS - React + Tailwind - Vue + Tailwind - Bootstrap - Ionic + Tailwind - SVG Supported AI models: - Gemini 3 Flash and Pro - Best models! (Google) - Claude Opus 4.5 - Best model! (Anthropic) - GPT-5.3, GPT-5.2, GPT-4.1 (OpenAI) - Other models are available as well but we recommend using the above models. 
- DALL-E 3 or Flux Schnell (using Replicate) for image generation See the [Examples](#-examples) section below for more demos. We have experimental support for taking a video/screen recording of a website in action and turning that into a functional prototype. ![google in app quick 3](https://github.com/abi/screenshot-to-code/assets/23818/8758ffa4-9483-4b9b-bb66-abd6d1594c33) [Learn more about video here](https://github.com/abi/screenshot-to-code/wiki/Screen-Recording-to-Code). [Follow me on Twitter for updates](https://twitter.com/_abi_). ## 🌍 Hosted Version [Try it live on the hosted version (paid)](https://screenshottocode.com). ## 🛠 Getting Started The app has a React/Vite frontend and a FastAPI backend. Keys needed: - [OpenAI API key](https://github.com/abi/screenshot-to-code/blob/main/Troubleshooting.md), Anthropic key, or Google Gemini key - Multiple keys are recommended so you can compare results from different models If you'd like to run the app with Ollama open source models (not recommended due to poor quality results), [follow this comment](https://github.com/abi/screenshot-to-code/issues/354#issuecomment-2435479853). Run the backend (I use Poetry for package management - `pip install --upgrade poetry` if you don't have it): ```bash cd backend echo "OPENAI_API_KEY=sk-your-key" > .env echo "ANTHROPIC_API_KEY=your-key" >> .env echo "GEMINI_API_KEY=your-key" >> .env poetry install poetry env activate # run the printed command, e.g. source /path/to/venv/bin/activate poetry run uvicorn main:app --reload --port 7001 ``` You can also set up the keys using the settings dialog on the front-end (click the gear icon after loading the frontend). Run the frontend: ```bash cd frontend yarn yarn dev ``` Open http://localhost:5173 to use the app. 
If you prefer to run the backend on a different port, update VITE_WS_BACKEND_URL in `frontend/.env.local` ## Docker If you have Docker installed on your system, in the root directory, run: ```bash echo "OPENAI_API_KEY=sk-your-key" > .env docker-compose up -d --build ``` The app will be up and running at http://localhost:5173. Note that you can't develop the application with this setup as the file changes won't trigger a rebuild. ## 🙋‍♂️ FAQs - **I'm running into an error when setting up the backend. How can I fix it?** [Try this](https://github.com/abi/screenshot-to-code/issues/3#issuecomment-1814777959). If that still doesn't work, open an issue. - **How do I get an OpenAI API key?** See https://github.com/abi/screenshot-to-code/blob/main/Troubleshooting.md - **How can I configure an OpenAI proxy?** - If you're not able to access the OpenAI API directly (due to e.g. country restrictions), you can try a VPN or you can configure the OpenAI base URL to use a proxy: Set OPENAI_BASE_URL in the `backend/.env` or directly in the UI in the settings dialog. Make sure the URL has "v1" in the path so it should look like this: `https://xxx.xxxxx.xxx/v1` - **How can I update the backend host that my front-end connects to?** - Configure VITE_HTTP_BACKEND_URL and VITE_WS_BACKEND_URL in front/.env.local For example, set VITE_HTTP_BACKEND_URL=http://124.10.20.1:7001 - **Seeing UTF-8 errors when running the backend?** - On windows, open the .env file with notepad++, then go to Encoding and select UTF-8. - **How can I provide feedback?** For feedback, feature requests and bug reports, open an issue or ping me on [Twitter](https://twitter.com/_abi_). 
## 📚 Examples **NYTimes** | Original | Replica | | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | | Screenshot 2023-11-20 at 12 54 03 PM | Screenshot 2023-11-20 at 12 59 56 PM | **Instagram** https://github.com/user-attachments/assets/a335a105-f9cc-40e6-ac6b-64e5390bfc21 **Hacker News** https://github.com/user-attachments/assets/205cb5c7-9c3c-438d-acd4-26dfe6e077e5 ================================================ FILE: TESTING.md ================================================ # Testing Guide This guide explains how to run tests for the Screenshot to Code project. ## Backend Tests The backend uses pytest for testing. All tests are located in the `backend/tests` directory. ### Prerequisites Make sure you have Poetry installed and have installed all dependencies: ```bash cd backend poetry install ``` ### Running Tests #### Run all tests ```bash cd backend poetry run pytest ``` #### Run tests with verbose output ```bash poetry run pytest -vv ``` #### Run a specific test file ```bash poetry run pytest tests/test_screenshot.py ``` #### Run a specific test class ```bash poetry run pytest tests/test_screenshot.py::TestNormalizeUrl ``` #### Run a specific test method ```bash poetry run pytest tests/test_screenshot.py::TestNormalizeUrl::test_url_without_protocol ``` #### Run tests with coverage report ```bash poetry run pytest --cov=routes ``` #### Run tests in parallel (requires pytest-xdist) ```bash poetry install --with dev pytest-xdist # Install if not already installed poetry run pytest -n auto ``` ### Test Configuration The pytest configuration is defined in `backend/pytest.ini`: - Tests are discovered in the `tests` directory - Test files must match the pattern `test_*.py` - Test classes must 
start with `Test` - Test functions must start with `test_` - Verbose output and short traceback format are enabled by default ### Writing New Tests 1. Create a new test file in `backend/tests/` following the naming convention `test_.py` 2. Import the functions/classes you want to test 3. Write test functions or classes following pytest conventions Example: ```python import pytest from routes.screenshot import normalize_url def test_url_normalization(): assert normalize_url("example.com") == "https://example.com" ``` ================================================ FILE: Troubleshooting.md ================================================ ### Getting an OpenAI API key with GPT-4 model access You don't need a ChatGPT Pro account. Screenshot to code uses API keys from your OpenAI developer account. In order to get access to the GPT4 Vision model, log into your OpenAI account and then, follow these instructions: 1. Open [OpenAI Dashboard](https://platform.openai.com/) 1. Go to Settings > Billing 1. Click at the Add payment details 285636868-c80deb92-ab47-45cd-988f-deee67fbd44d 4. You have to buy some credits. The minimum is $5. 5. Go to Settings > Limits and check at the bottom of the page, your current tier has to be "Tier 1" to have GPT4 access 285636973-da38bd4d-8a78-4904-8027-ca67d729b933 6. Navigate to OpenAI [api keys](https://platform.openai.com/api-keys) page and create and copy a new secret key. 7. Go to Screenshot to code and paste it in the Settings dialog under OpenAI key (gear icon). Your key is only stored in your browser. Never stored on our servers. ## Still not working? - Some users have also reported that it can take upto 30 minutes after your credit purchase for the GPT4 vision model to be activated. - You need to add credits to your account AND set it to renew when credits run out in order to be upgraded to Tier 1. Make sure your "Settings > Limits" page shows that you are at Tier 1. 
If you've followed these steps, and it still doesn't work, feel free to open a Github issue. We only provide support for the open source version since we don't have debugging logs on the hosted version. If you're looking to use the hosted version, we recommend getting a paid subscription on screenshottocode.com ================================================ FILE: backend/.gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. 
#Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
#.idea/ # Temporary eval output evals_data # Temporary video evals (Remove before merge) video_evals ================================================ FILE: backend/.pre-commit-config.yaml ================================================ # See https://pre-commit.com for more information # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v3.2.0 hooks: # - id: end-of-file-fixer - id: check-yaml - id: check-added-large-files # - repo: local # hooks: # - id: poetry-pytest # name: Run pytest with Poetry # entry: poetry run --directory backend pytest # language: system # pass_filenames: false # always_run: true # files: ^backend/ # # - id: poetry-pyright # # name: Run pyright with Poetry # # entry: poetry run --directory backend pyright # # language: system # # pass_filenames: false # # always_run: true # # files: ^backend/ ================================================ FILE: backend/Dockerfile ================================================ FROM python:3.12.3-slim-bullseye ENV POETRY_VERSION 1.8.0 # Install system dependencies RUN pip install "poetry==$POETRY_VERSION" # Set work directory WORKDIR /app # Copy only requirements to cache them in docker layer COPY poetry.lock pyproject.toml /app/ # Disable the creation of virtual environments RUN poetry config virtualenvs.create false # Install dependencies RUN poetry install # Copy the current directory contents into the container at /app COPY ./ /app/ ================================================ FILE: backend/README.md ================================================ # Run the type checker poetry run pyright # Run tests poetry run pytest ## Prompt Summary Use `print_prompt_summary` from `utils.py` to quickly visualize prompts: ```python from utils import print_prompt_summary print_prompt_summary(prompt_messages) ``` ================================================ FILE: backend/agent/engine.py ================================================ 
import asyncio import uuid from typing import Any, Awaitable, Callable, Dict, List, Optional from openai.types.chat import ChatCompletionMessageParam from codegen.utils import extract_html_content from llm import Llm from agent.providers.base import ExecutedToolCall, ProviderSession, StreamEvent from agent.providers.factory import create_provider_session from agent.state import AgentFileState, seed_file_state_from_messages from agent.tools import ( AgentToolRuntime, extract_content_from_args, extract_path_from_args, summarize_text, summarize_tool_input, ) class AgentEngine: def __init__( self, send_message: Callable[ [str, Optional[str], int, Optional[Dict[str, Any]], Optional[str]], Awaitable[None], ], variant_index: int, openai_api_key: Optional[str], openai_base_url: Optional[str], anthropic_api_key: Optional[str], gemini_api_key: Optional[str], should_generate_images: bool, initial_file_state: Optional[Dict[str, str]] = None, option_codes: Optional[List[str]] = None, ): self.send_message = send_message self.variant_index = variant_index self.openai_api_key = openai_api_key self.openai_base_url = openai_base_url self.anthropic_api_key = anthropic_api_key self.gemini_api_key = gemini_api_key self.should_generate_images = should_generate_images self.file_state = AgentFileState() if initial_file_state and initial_file_state.get("content"): self.file_state.path = initial_file_state.get("path") or "index.html" self.file_state.content = initial_file_state["content"] self.tool_runtime = AgentToolRuntime( file_state=self.file_state, should_generate_images=should_generate_images, openai_api_key=openai_api_key, openai_base_url=openai_base_url, option_codes=option_codes, ) self._tool_preview_lengths: Dict[str, int] = {} def _next_event_id(self, prefix: str) -> str: return f"{prefix}-{self.variant_index}-{uuid.uuid4().hex[:8]}" async def _send( self, msg_type: str, value: Optional[str] = None, data: Optional[Dict[str, Any]] = None, event_id: Optional[str] = None, ) -> None: 
await self.send_message(msg_type, value, self.variant_index, data, event_id) def _mark_preview_length(self, tool_event_id: Optional[str], length: int) -> None: if not tool_event_id: return current = self._tool_preview_lengths.get(tool_event_id, 0) if length > current: self._tool_preview_lengths[tool_event_id] = length async def _stream_code_preview(self, tool_event_id: Optional[str], content: str) -> None: if not tool_event_id or not content: return already_sent = self._tool_preview_lengths.get(tool_event_id, 0) total_len = len(content) if already_sent >= total_len: return max_chunks = 18 min_step = 200 step = max(min_step, total_len // max_chunks) start = already_sent if already_sent > 0 else 0 for end in range(start + step, total_len, step): await self._send("setCode", content[:end]) self._mark_preview_length(tool_event_id, end) await asyncio.sleep(0.01) await self._send("setCode", content) self._mark_preview_length(tool_event_id, total_len) async def _handle_streamed_tool_delta( self, event: StreamEvent, started_tool_ids: set[str], streamed_lengths: Dict[str, int], ) -> None: if event.type != "tool_call_delta": return if event.tool_name != "create_file": return if not event.tool_call_id: return content = extract_content_from_args(event.tool_arguments) if content is None: return tool_event_id = event.tool_call_id if tool_event_id not in started_tool_ids: path = ( extract_path_from_args(event.tool_arguments) or self.file_state.path or "index.html" ) await self._send( "toolStart", data={ "name": "create_file", "input": { "path": path, "contentLength": len(content), "preview": summarize_text(content, 200), }, }, event_id=tool_event_id, ) started_tool_ids.add(tool_event_id) last_len = streamed_lengths.get(tool_event_id, 0) if last_len == 0 and content: streamed_lengths[tool_event_id] = len(content) await self._send("setCode", content) self._mark_preview_length(tool_event_id, len(content)) elif len(content) - last_len >= 40: streamed_lengths[tool_event_id] = 
len(content) await self._send("setCode", content) self._mark_preview_length(tool_event_id, len(content)) async def _run_with_session(self, session: ProviderSession) -> str: max_steps = 20 for _ in range(max_steps): assistant_event_id = self._next_event_id("assistant") thinking_event_id = self._next_event_id("thinking") started_tool_ids: set[str] = set() streamed_lengths: Dict[str, int] = {} async def on_event(event: StreamEvent) -> None: if event.type == "assistant_delta": if event.text: await self._send( "assistant", event.text, event_id=assistant_event_id, ) return if event.type == "thinking_delta": if event.text: await self._send( "thinking", event.text, event_id=thinking_event_id, ) return if event.type == "tool_call_delta": await self._handle_streamed_tool_delta( event, started_tool_ids, streamed_lengths, ) turn = await session.stream_turn(on_event) if not turn.tool_calls: return await self._finalize_response(turn.assistant_text) executed_tool_calls: List[ExecutedToolCall] = [] for tool_call in turn.tool_calls: tool_event_id = tool_call.id or self._next_event_id("tool") if tool_event_id not in started_tool_ids: await self._send( "toolStart", data={ "name": tool_call.name, "input": summarize_tool_input(tool_call, self.file_state), }, event_id=tool_event_id, ) if tool_call.name == "create_file": content = extract_content_from_args(tool_call.arguments) if content: await self._stream_code_preview(tool_event_id, content) tool_result = await self.tool_runtime.execute(tool_call) if tool_result.updated_content: await self._send("setCode", tool_result.updated_content) await self._send( "toolResult", data={ "name": tool_call.name, "output": tool_result.summary, "ok": tool_result.ok, }, event_id=tool_event_id, ) executed_tool_calls.append( ExecutedToolCall(tool_call=tool_call, result=tool_result) ) session.append_tool_results(turn, executed_tool_calls) raise Exception("Agent exceeded max tool turns") async def run(self, model: Llm, prompt_messages: 
List[ChatCompletionMessageParam]) -> str: seed_file_state_from_messages(self.file_state, prompt_messages) session = create_provider_session( model=model, prompt_messages=prompt_messages, should_generate_images=self.should_generate_images, openai_api_key=self.openai_api_key, openai_base_url=self.openai_base_url, anthropic_api_key=self.anthropic_api_key, gemini_api_key=self.gemini_api_key, ) try: return await self._run_with_session(session) finally: await session.close() async def _finalize_response(self, assistant_text: str) -> str: if self.file_state.content: return self.file_state.content html = extract_html_content(assistant_text) if html: self.file_state.content = html await self._send("setCode", html) return self.file_state.content ================================================ FILE: backend/agent/providers/__init__.py ================================================ from agent.providers.anthropic import AnthropicProviderSession, serialize_anthropic_tools from agent.providers.base import ( EventSink, ExecutedToolCall, ProviderSession, ProviderTurn, StreamEvent, ) from agent.providers.factory import create_provider_session from agent.providers.gemini import GeminiProviderSession, serialize_gemini_tools from agent.providers.openai import OpenAIProviderSession, parse_event, serialize_openai_tools __all__ = [ "AnthropicProviderSession", "EventSink", "ExecutedToolCall", "GeminiProviderSession", "OpenAIProviderSession", "ProviderSession", "ProviderTurn", "StreamEvent", "create_provider_session", "parse_event", "serialize_anthropic_tools", "serialize_gemini_tools", "serialize_openai_tools", ] ================================================ FILE: backend/agent/providers/anthropic/__init__.py ================================================ from agent.providers.anthropic.provider import ( AnthropicProviderSession, serialize_anthropic_tools, _extract_anthropic_usage, ) __all__ = [ "AnthropicProviderSession", "serialize_anthropic_tools", "_extract_anthropic_usage", ] 
================================================ FILE: backend/agent/providers/anthropic/image.py ================================================ # pyright: reportUnknownVariableType=false """Claude-specific image processing. Handles resizing and compressing images to comply with Claude's vision API limits before sending them as base64-encoded payloads. Comparison with official Anthropic docs (https://docs.anthropic.com/en/docs/build-with-claude/vision): Aligned: - 5 MB per-image size limit matches the documented API maximum. - Output uses the correct base64 source format (type, media_type, data). Divergences: - Max dimension is set to 7990 px as a safety margin; the API rejects at 8000 px. This is intentionally conservative. - The docs note that when >20 images are sent in a single request the per-image limit drops to 2000x2000 px. We do not enforce that stricter limit here (the app typically sends far fewer images). - JPEG conversion drops alpha channels, which is acceptable for website screenshots but would degrade transparent PNGs. Recommendation: The docs recommend resizing to 1568 px on the long edge (~1.15 megapixels) for optimal time-to-first-token. Images above that threshold are resized server-side anyway, so sending larger images only adds latency and bandwidth cost with no quality benefit. Consider lowering CLAUDE_MAX_IMAGE_DIMENSION to 1568. """ import base64 import io import time from PIL import Image # Hard API limit: 5 MB per image (base64-encoded). CLAUDE_IMAGE_MAX_SIZE = 5 * 1024 * 1024 # API rejects images wider or taller than 8000 px. We use 7990 as a safety # margin. Note: the docs recommend 1568 px for best latency (see module # docstring). CLAUDE_MAX_IMAGE_DIMENSION = 7990 def process_image(image_data_url: str) -> tuple[str, str]: """Resize / compress a data-URL image to fit Claude's vision limits. Returns (media_type, base64_data) suitable for an ``image`` content block. 
""" media_type = image_data_url.split(";")[0].split(":")[1] base64_data = image_data_url.split(",")[1] image_bytes = base64.b64decode(base64_data) img = Image.open(io.BytesIO(image_bytes)) is_under_dimension_limit = ( img.width < CLAUDE_MAX_IMAGE_DIMENSION and img.height < CLAUDE_MAX_IMAGE_DIMENSION ) is_under_size_limit = len(base64_data) <= CLAUDE_IMAGE_MAX_SIZE if is_under_dimension_limit and is_under_size_limit: return (media_type, base64_data) start_time = time.time() if not is_under_dimension_limit: if img.width > img.height: new_width = CLAUDE_MAX_IMAGE_DIMENSION new_height = int((CLAUDE_MAX_IMAGE_DIMENSION / img.width) * img.height) else: new_height = CLAUDE_MAX_IMAGE_DIMENSION new_width = int((CLAUDE_MAX_IMAGE_DIMENSION / img.height) * img.width) img = img.resize((new_width, new_height), Image.DEFAULT_STRATEGY) quality = 95 output = io.BytesIO() img = img.convert("RGB") img.save(output, format="JPEG", quality=quality) while ( len(base64.b64encode(output.getvalue())) > CLAUDE_IMAGE_MAX_SIZE and quality > 10 ): output = io.BytesIO() img.save(output, format="JPEG", quality=quality) quality -= 5 end_time = time.time() processing_time = end_time - start_time print(f"[CLAUDE IMAGE PROCESSING] processing time: {processing_time:.2f} seconds") return ("image/jpeg", base64.b64encode(output.getvalue()).decode("utf-8")) ================================================ FILE: backend/agent/providers/anthropic/provider.py ================================================ # pyright: reportUnknownVariableType=false import copy import json import uuid from dataclasses import dataclass, field from typing import Any, Dict, List, cast from anthropic import AsyncAnthropic from openai.types.chat import ChatCompletionMessageParam from agent.providers.base import ( EventSink, ExecutedToolCall, ProviderSession, ProviderTurn, StreamEvent, ) from agent.providers.anthropic.image import process_image from agent.providers.pricing import MODEL_PRICING from agent.providers.token_usage 
import TokenUsage from agent.tools import CanonicalToolDefinition, ToolCall, parse_json_arguments from llm import Llm THINKING_MODELS = { Llm.CLAUDE_4_5_SONNET_2025_09_29.value, Llm.CLAUDE_4_5_OPUS_2025_11_01.value, } ADAPTIVE_THINKING_MODELS = { Llm.CLAUDE_OPUS_4_6.value, Llm.CLAUDE_SONNET_4_6.value, } def _convert_openai_messages_to_claude( messages: List[ChatCompletionMessageParam], ) -> tuple[str, List[Dict[str, Any]]]: cloned_messages = copy.deepcopy(messages) system_prompt = cast(str, cloned_messages[0].get("content")) claude_messages = [dict(message) for message in cloned_messages[1:]] for message in claude_messages: if not isinstance(message["content"], list): continue for content in message["content"]: # type: ignore if content["type"] != "image_url": continue content["type"] = "image" image_data_url = cast(str, content["image_url"]["url"]) media_type, base64_data = process_image(image_data_url) del content["image_url"] content["source"] = { "type": "base64", "media_type": media_type, "data": base64_data, } return system_prompt, claude_messages def serialize_anthropic_tools( tools: List[CanonicalToolDefinition], ) -> List[Dict[str, Any]]: return [ { "name": tool.name, "description": tool.description, "eager_input_streaming": True, "input_schema": copy.deepcopy(tool.parameters), } for tool in tools ] @dataclass class AnthropicParseState: assistant_text: str = "" tool_blocks: Dict[int, Dict[str, Any]] = field(default_factory=dict) tool_json_buffers: Dict[int, str] = field(default_factory=dict) async def _parse_stream_event( event: Any, state: AnthropicParseState, on_event: EventSink, ) -> None: if event.type == "content_block_start": block = event.content_block if getattr(block, "type", None) != "tool_use": return tool_id = getattr(block, "id", None) or f"tool-{uuid.uuid4().hex[:6]}" tool_name = getattr(block, "name", None) or "unknown_tool" args = getattr(block, "input", None) state.tool_blocks[event.index] = { "id": tool_id, "name": tool_name, } 
state.tool_json_buffers[event.index] = "" if args: await on_event( StreamEvent( type="tool_call_delta", tool_call_id=tool_id, tool_name=tool_name, tool_arguments=args, ) ) return if event.type != "content_block_delta": return if event.delta.type == "thinking_delta": await on_event(StreamEvent(type="thinking_delta", text=event.delta.thinking)) return if event.delta.type == "text_delta": state.assistant_text += event.delta.text await on_event(StreamEvent(type="assistant_delta", text=event.delta.text)) return if event.delta.type != "input_json_delta": return partial_json = getattr(event.delta, "partial_json", None) or "" if not partial_json: return buffer = state.tool_json_buffers.get(event.index, "") + partial_json state.tool_json_buffers[event.index] = buffer meta = state.tool_blocks.get(event.index) if not meta: return await on_event( StreamEvent( type="tool_call_delta", tool_call_id=meta.get("id"), tool_name=meta.get("name"), tool_arguments=buffer, ) ) def _extract_tool_calls(final_message: Any) -> List[ToolCall]: tool_calls: List[ToolCall] = [] if final_message and final_message.content: for block in final_message.content: if block.type != "tool_use": continue raw_input = getattr(block, "input", {}) args: Dict[str, Any] if isinstance(raw_input, dict): args = cast(Dict[str, Any], raw_input) else: parsed, error = parse_json_arguments(raw_input) if error: args = {"INVALID_JSON": str(raw_input)} else: args = parsed tool_calls.append( ToolCall( id=block.id, name=block.name, arguments=args, ) ) return tool_calls def _extract_anthropic_usage(final_message: Any) -> TokenUsage: """Extract unified token usage from an Anthropic final message. Anthropic includes thinking tokens in ``output_tokens`` so no extra addition is needed. ``total`` is computed since the API doesn't provide it. 
""" usage = getattr(final_message, "usage", None) if usage is None: return TokenUsage() input_tokens = getattr(usage, "input_tokens", 0) or 0 output_tokens = getattr(usage, "output_tokens", 0) or 0 cache_read = getattr(usage, "cache_read_input_tokens", 0) or 0 cache_write = getattr(usage, "cache_creation_input_tokens", 0) or 0 return TokenUsage( input=input_tokens, output=output_tokens, cache_read=cache_read, cache_write=cache_write, total=input_tokens + output_tokens + cache_read + cache_write, ) class AnthropicProviderSession(ProviderSession): def __init__( self, client: AsyncAnthropic, model: Llm, prompt_messages: List[ChatCompletionMessageParam], tools: List[Dict[str, Any]], ): self._client = client self._model = model self._tools = tools self._total_usage = TokenUsage() system_prompt, claude_messages = _convert_openai_messages_to_claude(prompt_messages) self._system_prompt = system_prompt self._messages = claude_messages async def stream_turn(self, on_event: EventSink) -> ProviderTurn: stream_kwargs: Dict[str, Any] = { "model": self._model.value, "max_tokens": 50000, "system": self._system_prompt, "messages": self._messages, "tools": self._tools, "cache_control": {"type": "ephemeral"}, } if self._model.value in ADAPTIVE_THINKING_MODELS: stream_kwargs["thinking"] = { "type": "adaptive", } effort = ( "high" if self._model.value == Llm.CLAUDE_SONNET_4_6.value else "max" ) stream_kwargs["output_config"] = {"effort": effort} elif self._model.value in THINKING_MODELS: stream_kwargs["thinking"] = { "type": "enabled", "budget_tokens": 10000, } else: stream_kwargs["temperature"] = 0.0 state = AnthropicParseState() async with self._client.messages.stream(**stream_kwargs) as stream: async for event in stream: await _parse_stream_event(event, state, on_event) final_message = await stream.get_final_message() self._total_usage.accumulate(_extract_anthropic_usage(final_message)) tool_calls = _extract_tool_calls(final_message) return ProviderTurn( 
assistant_text=state.assistant_text, tool_calls=tool_calls, assistant_turn=final_message, ) def append_tool_results( self, turn: ProviderTurn, executed_tool_calls: list[ExecutedToolCall], ) -> None: assistant_blocks: List[Dict[str, Any]] = [] if turn.assistant_text: assistant_blocks.append({"type": "text", "text": turn.assistant_text}) for call in turn.tool_calls: assistant_blocks.append( { "type": "tool_use", "id": call.id, "name": call.name, "input": call.arguments, } ) self._messages.append({"role": "assistant", "content": assistant_blocks}) tool_result_blocks: List[Dict[str, Any]] = [] for executed in executed_tool_calls: tool_result_blocks.append( { "type": "tool_result", "tool_use_id": executed.tool_call.id, "content": json.dumps(executed.result.result), "is_error": not executed.result.ok, } ) self._messages.append({"role": "user", "content": tool_result_blocks}) async def close(self) -> None: u = self._total_usage model_name = self._model.value pricing = MODEL_PRICING.get(model_name) cost_str = f" cost=${u.cost(pricing):.4f}" if pricing else "" cache_hit_rate_str = f" cache_hit_rate={u.cache_hit_rate_percent():.2f}%" print( f"[TOKEN USAGE] provider=anthropic model={model_name} | " f"input={u.input} output={u.output} " f"cache_read={u.cache_read} cache_write={u.cache_write} " f"total={u.total}{cache_hit_rate_str}{cost_str}" ) await self._client.close() ================================================ FILE: backend/agent/providers/base.py ================================================ from dataclasses import dataclass from typing import Any, Awaitable, Callable, Literal, Optional, Protocol from agent.tools import ToolCall, ToolExecutionResult StreamEventType = Literal[ "assistant_delta", "thinking_delta", "tool_call_delta", ] @dataclass class StreamEvent: type: StreamEventType text: str = "" tool_call_id: Optional[str] = None tool_name: Optional[str] = None tool_arguments: Any = None @dataclass class ProviderTurn: assistant_text: str tool_calls: 
list[ToolCall] # Provider-native assistant turn object required to continue the conversation. assistant_turn: Any = None @dataclass class ExecutedToolCall: tool_call: ToolCall result: ToolExecutionResult EventSink = Callable[[StreamEvent], Awaitable[None]] class ProviderSession(Protocol): async def stream_turn(self, on_event: EventSink) -> ProviderTurn: ... def append_tool_results( self, turn: ProviderTurn, executed_tool_calls: list[ExecutedToolCall], ) -> None: ... async def close(self) -> None: ... ================================================ FILE: backend/agent/providers/factory.py ================================================ from typing import Optional from anthropic import AsyncAnthropic from google import genai from openai import AsyncOpenAI from openai.types.chat import ChatCompletionMessageParam from agent.providers.anthropic import AnthropicProviderSession, serialize_anthropic_tools from agent.providers.base import ProviderSession from agent.providers.gemini import GeminiProviderSession, serialize_gemini_tools from agent.providers.openai import OpenAIProviderSession, serialize_openai_tools from agent.tools import canonical_tool_definitions from llm import ANTHROPIC_MODELS, GEMINI_MODELS, OPENAI_MODELS, Llm def create_provider_session( model: Llm, prompt_messages: list[ChatCompletionMessageParam], should_generate_images: bool, openai_api_key: Optional[str], openai_base_url: Optional[str], anthropic_api_key: Optional[str], gemini_api_key: Optional[str], ) -> ProviderSession: canonical_tools = canonical_tool_definitions( image_generation_enabled=should_generate_images ) if model in OPENAI_MODELS: if not openai_api_key: raise Exception("OpenAI API key is missing.") client = AsyncOpenAI(api_key=openai_api_key, base_url=openai_base_url) return OpenAIProviderSession( client=client, model=model, prompt_messages=prompt_messages, tools=serialize_openai_tools(canonical_tools), ) if model in ANTHROPIC_MODELS: if not anthropic_api_key: raise 
Exception("Anthropic API key is missing.") client = AsyncAnthropic(api_key=anthropic_api_key) return AnthropicProviderSession( client=client, model=model, prompt_messages=prompt_messages, tools=serialize_anthropic_tools(canonical_tools), ) if model in GEMINI_MODELS: if not gemini_api_key: raise Exception("Gemini API key is missing.") client = genai.Client(api_key=gemini_api_key) return GeminiProviderSession( client=client, model=model, prompt_messages=prompt_messages, tools=serialize_gemini_tools(canonical_tools), ) raise ValueError(f"Unsupported model: {model.value}") ================================================ FILE: backend/agent/providers/gemini.py ================================================ # pyright: reportUnknownVariableType=false import base64 import copy import uuid from dataclasses import dataclass, field from typing import Any, Dict, List, cast from google import genai from google.genai import types from openai.types.chat import ChatCompletionMessageParam from agent.providers.base import ( EventSink, ExecutedToolCall, ProviderSession, ProviderTurn, StreamEvent, ) from agent.providers.pricing import MODEL_PRICING from agent.providers.token_usage import TokenUsage from agent.tools import CanonicalToolDefinition, ToolCall from llm import Llm DEFAULT_VIDEO_FPS = 10 def serialize_gemini_tools(tools: List[CanonicalToolDefinition]) -> List[types.Tool]: declarations = [ types.FunctionDeclaration( name=tool.name, description=tool.description, parameters_json_schema=copy.deepcopy(tool.parameters), ) for tool in tools ] return [types.Tool(function_declarations=declarations)] def _get_gemini_api_model_name(model: Llm) -> str: if model in [Llm.GEMINI_3_FLASH_PREVIEW_HIGH, Llm.GEMINI_3_FLASH_PREVIEW_MINIMAL]: return "gemini-3-flash-preview" if model in [ Llm.GEMINI_3_1_PRO_PREVIEW_HIGH, Llm.GEMINI_3_1_PRO_PREVIEW_MEDIUM, Llm.GEMINI_3_1_PRO_PREVIEW_LOW, ]: return "gemini-3.1-pro-preview" return model.value def _get_thinking_level_for_model(model: Llm) -> str: 
if model in [ Llm.GEMINI_3_FLASH_PREVIEW_HIGH, Llm.GEMINI_3_1_PRO_PREVIEW_HIGH, ]: return "high" if model == Llm.GEMINI_3_1_PRO_PREVIEW_LOW: return "low" if model == Llm.GEMINI_3_1_PRO_PREVIEW_MEDIUM: return "medium" if model == Llm.GEMINI_3_FLASH_PREVIEW_MINIMAL: return "minimal" return "high" def _extract_text_from_content(content: str | List[Dict[str, Any]]) -> str: if isinstance(content, str): return content for content_part in content: if content_part.get("type") == "text": return content_part.get("text", "") return "" def _detect_mime_type_from_base64(base64_data: str) -> str | None: try: decoded = base64.b64decode(base64_data[:32]) if decoded[:8] == b"\x89PNG\r\n\x1a\n": return "image/png" if decoded[:2] == b"\xff\xd8": return "image/jpeg" if decoded[:6] in (b"GIF87a", b"GIF89a"): return "image/gif" if decoded[:4] == b"RIFF" and decoded[8:12] == b"WEBP": return "image/webp" if decoded[4:8] == b"ftyp": return "video/mp4" if decoded[:4] == b"\x1aE\xdf\xa3": return "video/webm" except Exception: pass return None def _extract_images_from_content(content: str | List[Dict[str, Any]]) -> List[Dict[str, str]]: if isinstance(content, str): return [] images: List[Dict[str, str]] = [] for content_part in content: if content_part.get("type") != "image_url": continue image_url = content_part["image_url"]["url"] if image_url.startswith("data:"): mime_type = image_url.split(";")[0].split(":")[1] base64_data = image_url.split(",")[1] if mime_type == "application/octet-stream": detected_mime = _detect_mime_type_from_base64(base64_data) if detected_mime: mime_type = detected_mime else: print("Warning: Could not detect MIME type for data URL, skipping") continue images.append({"mime_type": mime_type, "data": base64_data}) continue images.append({"uri": image_url}) return images def _convert_message_to_gemini_content( message: ChatCompletionMessageParam, ) -> types.Content: role = message.get("role", "user") content = message.get("content", "") gemini_role = "model" if role == 
"assistant" else "user" parts: List[types.Part | Dict[str, str]] = [] text = _extract_text_from_content(content) # type: ignore image_data_list = _extract_images_from_content(content) # type: ignore if text: parts.append({"text": text}) for image_data in image_data_list: if "data" in image_data: mime_type = image_data["mime_type"] media_bytes = base64.b64decode(image_data["data"]) if mime_type.startswith("video/"): parts.append( types.Part( inline_data=types.Blob(data=media_bytes, mime_type=mime_type), video_metadata=types.VideoMetadata(fps=DEFAULT_VIDEO_FPS), media_resolution=types.PartMediaResolutionLevel.MEDIA_RESOLUTION_HIGH, ) ) continue parts.append( types.Part.from_bytes( data=media_bytes, mime_type=mime_type, media_resolution=types.PartMediaResolutionLevel.MEDIA_RESOLUTION_ULTRA_HIGH, ) ) continue if "uri" in image_data: parts.append({"file_uri": image_data["uri"]}) return types.Content(role=gemini_role, parts=parts) # type: ignore @dataclass class GeminiParseState: assistant_text: str = "" tool_calls: List[ToolCall] = field(default_factory=list) model_parts: List[types.Part] = field(default_factory=list) model_role: str = "model" def _extract_usage(chunk: types.GenerateContentResponse) -> TokenUsage | None: """Extract unified token usage from a Gemini streaming chunk. Gemini reports thinking tokens separately; they are folded into ``output`` to match the unified schema used by the other providers. ``prompt_token_count`` *includes* ``cached_content_token_count``, so we subtract cached tokens to get the non-cached input count (same approach as the OpenAI provider). 
""" meta = chunk.usage_metadata if meta is None: return None candidates = meta.candidates_token_count or 0 thoughts = meta.thoughts_token_count or 0 prompt_tokens = meta.prompt_token_count or 0 cached_tokens = meta.cached_content_token_count or 0 return TokenUsage( input=prompt_tokens - cached_tokens, output=candidates + thoughts, cache_read=cached_tokens, cache_write=0, total=meta.total_token_count or 0, ) async def _parse_chunk( chunk: types.GenerateContentResponse, state: GeminiParseState, on_event: EventSink, ) -> None: if not chunk.candidates: return candidate_content = chunk.candidates[0].content if not candidate_content or not candidate_content.parts: return if candidate_content.role: state.model_role = candidate_content.role for part in candidate_content.parts: # Preserve each model part as streamed so thought signatures remain attached. state.model_parts.append(part) if getattr(part, "thought", False) and part.text: await on_event(StreamEvent(type="thinking_delta", text=part.text)) continue if part.function_call: args = part.function_call.args or {} tool_id = part.function_call.id or f"tool-{uuid.uuid4().hex[:6]}" tool_name = part.function_call.name or "unknown_tool" await on_event( StreamEvent( type="tool_call_delta", tool_call_id=tool_id, tool_name=tool_name, tool_arguments=args, ) ) state.tool_calls.append( ToolCall( id=tool_id, name=tool_name, arguments=args, ) ) continue if part.text: state.assistant_text += part.text await on_event(StreamEvent(type="assistant_delta", text=part.text)) class GeminiProviderSession(ProviderSession): def __init__( self, client: genai.Client, model: Llm, prompt_messages: List[ChatCompletionMessageParam], tools: List[types.Tool], ): self._client = client self._model = model self._tools = tools self._total_usage = TokenUsage() self._system_prompt = str(prompt_messages[0].get("content", "")) self._contents: List[types.Content] = [ _convert_message_to_gemini_content(msg) for msg in prompt_messages[1:] ] async def 
stream_turn(self, on_event: EventSink) -> ProviderTurn: thinking_level = _get_thinking_level_for_model(self._model) config = types.GenerateContentConfig( temperature=1.0, max_output_tokens=50000, system_instruction=self._system_prompt, thinking_config=types.ThinkingConfig( thinking_level=cast(Any, thinking_level), include_thoughts=True, ), tools=self._tools, ) stream = await self._client.aio.models.generate_content_stream( model=_get_gemini_api_model_name(self._model), contents=cast(Any, self._contents), config=config, ) state = GeminiParseState() turn_usage: TokenUsage | None = None async for chunk in stream: await _parse_chunk(chunk, state, on_event) chunk_usage = _extract_usage(chunk) if chunk_usage is not None: turn_usage = chunk_usage if turn_usage is not None: self._total_usage.accumulate(turn_usage) assistant_turn = ( types.Content(role=state.model_role, parts=state.model_parts) if state.model_parts else None ) return ProviderTurn( assistant_text=state.assistant_text, tool_calls=state.tool_calls, assistant_turn=assistant_turn, ) def append_tool_results( self, turn: ProviderTurn, executed_tool_calls: list[ExecutedToolCall], ) -> None: model_content = turn.assistant_turn if not isinstance(model_content, types.Content) or not model_content.parts: raise ValueError( "Gemini step is missing model content. Cannot append tool results without the original model turn." 
) self._contents.append(model_content) tool_result_parts: List[types.Part] = [] for executed in executed_tool_calls: tool_result_parts.append( types.Part.from_function_response( name=executed.tool_call.name, response=executed.result.result, ) ) self._contents.append(types.Content(role="tool", parts=tool_result_parts)) async def close(self) -> None: u = self._total_usage model_name = _get_gemini_api_model_name(self._model) pricing = MODEL_PRICING.get(model_name) cost_str = f" cost=${u.cost(pricing):.4f}" if pricing else "" cache_hit_rate_str = f" cache_hit_rate={u.cache_hit_rate_percent():.2f}%" print( f"[TOKEN USAGE] provider=gemini model={model_name} | " f"input={u.input} output={u.output} " f"cache_read={u.cache_read} cache_write={u.cache_write} " f"total={u.total}{cache_hit_rate_str}{cost_str}" ) ================================================ FILE: backend/agent/providers/openai.py ================================================ # pyright: reportUnknownVariableType=false import copy import json import uuid from dataclasses import dataclass, field from typing import Any, Dict, List from openai import AsyncOpenAI from openai.types.chat import ChatCompletionMessageParam from agent.providers.base import ( EventSink, ExecutedToolCall, ProviderSession, ProviderTurn, StreamEvent, ) from agent.providers.pricing import MODEL_PRICING from agent.providers.token_usage import TokenUsage from agent.state import ensure_str from agent.tools import CanonicalToolDefinition, ToolCall, parse_json_arguments from config import IS_DEBUG_ENABLED from fs_logging.openai_turn_inputs import OpenAITurnInputLogger from llm import Llm, get_openai_api_name, get_openai_reasoning_effort def _convert_message_to_responses_input( message: ChatCompletionMessageParam, ) -> Dict[str, Any]: role = message.get("role", "user") content = message.get("content", "") if isinstance(content, str): return {"role": role, "content": content} parts: List[Dict[str, Any]] = [] if isinstance(content, list): for 
part in content:
            # Skip malformed parts; only dict-shaped content parts are convertible.
            if not isinstance(part, dict):
                continue
            if part.get("type") == "text":
                # Chat Completions "text" part -> Responses "input_text" part.
                parts.append({"type": "input_text", "text": part.get("text", "")})
            elif part.get("type") == "image_url":
                # Chat Completions nests the URL under "image_url"; Responses
                # takes the URL and detail at the top level of the part.
                image_url = part.get("image_url", {})
                parts.append(
                    {
                        "type": "input_image",
                        "image_url": image_url.get("url", ""),
                        "detail": image_url.get("detail", "high"),
                    }
                )
    return {"role": role, "content": parts}


def _get_event_attr(event: Any, key: str, default: Any = None) -> Any:
    """Read ``key`` from an event that may be an SDK object or a plain dict.

    Attribute access is tried first (typed SDK event objects), then dict
    lookup; ``default`` is returned when neither applies.
    """
    if hasattr(event, key):
        return getattr(event, key)
    if isinstance(event, dict):
        return event.get(key, default)
    return default


def _copy_schema(schema: Dict[str, Any]) -> Dict[str, Any]:
    # Deep copy so strict-mode rewriting never mutates the caller's schema.
    return copy.deepcopy(schema)


def _nullable_type(type_value: Any) -> Any:
    """Widen a JSON-schema ``type`` to also allow ``null``.

    Handles both the list form (``["string"]`` -> ``["string", "null"]``,
    idempotent if "null" is already present) and the plain-string form
    (``"string"`` -> ``["string", "null"]``). Anything else is returned
    unchanged.
    """
    if isinstance(type_value, list):
        if "null" not in type_value:
            return [*type_value, "null"]
        return type_value
    if isinstance(type_value, str):
        return [type_value, "null"]
    return type_value


def _make_responses_schema_strict(schema: Dict[str, Any]) -> Dict[str, Any]:
    """Rewrite a tool parameter schema for OpenAI strict function calling.

    Strict mode (see the ``"strict": True`` flag in serialize_openai_tools)
    requires every object to set ``additionalProperties: false`` and to list
    *all* of its properties as required. Formerly-optional properties are
    therefore made nullable instead, so the model can still express "absent"
    by emitting null. Returns a transformed deep copy; the input is untouched.
    """
    schema_copy: Dict[str, Any] = _copy_schema(schema)

    def transform(node: Dict[str, Any], in_object_property: bool = False) -> None:
        node_type = node.get("type")
        if node_type == "object":
            node["additionalProperties"] = False
            properties = node.get("properties") or {}
            if isinstance(properties, dict):
                # Strict mode: every declared property must be required.
                node["required"] = list(properties.keys())
                for prop in properties.values():
                    if isinstance(prop, dict):
                        # Direct children of an object are the spots where
                        # optionality is converted into nullability.
                        transform(prop, in_object_property=True)
            return
        if node_type == "array":
            if in_object_property:
                node["type"] = _nullable_type(node_type)
            items = node.get("items")
            if isinstance(items, dict):
                # Array items themselves are not object properties, so they
                # are not made nullable.
                transform(items, in_object_property=False)
            return
        # Scalar leaf that sits directly under an object: make it nullable.
        if in_object_property and node_type is not None:
            node["type"] = _nullable_type(node_type)

    transform(schema_copy, in_object_property=False)
    return schema_copy


def serialize_openai_tools(
    tools: List[CanonicalToolDefinition],
) -> List[Dict[str, Any]]:
    """Convert canonical tool definitions into OpenAI Responses tool payloads,
    applying the strict-mode schema rewrite to each tool's parameters."""
    serialized: List[Dict[str, Any]] = []
    for tool in tools:
        schema = _make_responses_schema_strict(tool.parameters)
serialized.append(
            {
                "type": "function",
                "name": tool.name,
                "description": tool.description,
                "parameters": schema,
                # Strict mode: the model must emit arguments that exactly
                # match the (rewritten) schema above.
                "strict": True,
            }
        )
    return serialized


@dataclass
class OpenAIResponsesParseState:
    """Mutable accumulator for one streamed OpenAI Responses turn."""

    # Concatenation of all "response.output_text.delta" chunks.
    assistant_text: str = ""
    # Tool calls being assembled, keyed by call_id:
    # {"id": ..., "name": ..., "arguments": <partial JSON string>}.
    tool_calls: Dict[str, Dict[str, Any]] = field(default_factory=dict)
    # Maps the output item's id to its call_id (argument-delta events may
    # carry only one of the two).
    item_to_call_id: Dict[str, str] = field(default_factory=dict)
    # Finalized output items by output_index; used to rebuild the assistant
    # turn in the provider's native item order.
    output_items_by_index: Dict[int, Dict[str, Any]] = field(default_factory=dict)
    # Once summary *text* deltas arrive, whole-part reasoning summaries are
    # suppressed to avoid emitting the same text twice.
    saw_reasoning_summary_text_delta: bool = False
    last_emitted_reasoning_summary_part: str = ""
    # Filled from the final "response.completed" event, if one is seen.
    turn_usage: TokenUsage | None = None


def _extract_openai_usage(response: Any) -> TokenUsage:
    """Extract unified token usage from an OpenAI Responses ``response.completed``
    event.

    OpenAI includes cached tokens inside ``input_tokens``, so they are
    subtracted to get the non-cached input count; ``cache_write`` is always 0
    for OpenAI (no explicit cache-write accounting in this API).
    """
    usage = _get_event_attr(response, "usage")
    if usage is None:
        return TokenUsage()
    # "or 0" guards against explicit None values in the payload.
    input_tokens = _get_event_attr(usage, "input_tokens", 0) or 0
    output_tokens = _get_event_attr(usage, "output_tokens", 0) or 0
    total_tokens = _get_event_attr(usage, "total_tokens", 0) or 0
    details = _get_event_attr(usage, "input_tokens_details") or {}
    cached_tokens = _get_event_attr(details, "cached_tokens", 0) or 0
    return TokenUsage(
        input=input_tokens - cached_tokens,
        output=output_tokens,
        cache_read=cached_tokens,
        cache_write=0,
        total=total_tokens,
    )


async def parse_event(
    event: Any,
    state: OpenAIResponsesParseState,
    on_event: EventSink,
) -> None:
    """Fold one streamed Responses event into ``state`` and forward user-visible
    deltas to ``on_event``. Unrecognized event types are ignored."""
    event_type = _get_event_attr(event, "type")
    # Lifecycle events: capture usage / finalized items, emit nothing.
    if event_type in (
        "response.created",
        "response.completed",
        "response.done",
        "response.output_item.done",
    ):
        if event_type == "response.completed":
            response = _get_event_attr(event, "response")
            if response:
                state.turn_usage = _extract_openai_usage(response)
        if event_type == "response.output_item.done":
            # The "done" item is authoritative — overwrite any provisional
            # copy stored when the item was first added.
            output_index = _get_event_attr(event, "output_index")
            item = _get_event_attr(event, "item")
            if isinstance(output_index, int) and item:
                state.output_items_by_index[output_index] = item
        return
if event_type == "response.output_text.delta": delta = _get_event_attr(event, "delta", "") if delta: state.assistant_text += delta await on_event(StreamEvent(type="assistant_delta", text=delta)) return if event_type in ( "response.reasoning_text.delta", "response.reasoning_summary_text.delta", ): delta = _get_event_attr(event, "delta", "") if delta: if event_type == "response.reasoning_summary_text.delta": state.saw_reasoning_summary_text_delta = True await on_event(StreamEvent(type="thinking_delta", text=delta)) return if event_type in ( "response.reasoning_summary_part.added", "response.reasoning_summary_part.done", ): if state.saw_reasoning_summary_text_delta: return part = _get_event_attr(event, "part") or {} text = _get_event_attr(part, "text", "") if text and text != state.last_emitted_reasoning_summary_part: state.last_emitted_reasoning_summary_part = text await on_event(StreamEvent(type="thinking_delta", text=text)) return if event_type == "response.output_item.added": item = _get_event_attr(event, "item") item_type = _get_event_attr(item, "type") if item else None output_index = _get_event_attr(event, "output_index") if isinstance(output_index, int) and item: state.output_items_by_index.setdefault(output_index, item) if item and item_type in ("function_call", "custom_tool_call"): item_id = _get_event_attr(item, "id") call_id = _get_event_attr(item, "call_id") or item_id if item_id and call_id: state.item_to_call_id[item_id] = call_id if call_id: if item_id and item_id in state.tool_calls and item_id != call_id: existing = state.tool_calls.pop(item_id) state.tool_calls[call_id] = { **existing, "id": call_id, } args_value = _get_event_attr(item, "arguments") if args_value is None and item_type == "custom_tool_call": args_value = _get_event_attr(item, "input") state.tool_calls.setdefault( call_id, { "id": call_id, "name": _get_event_attr(item, "name"), "arguments": args_value or "", }, ) if args_value: await on_event( StreamEvent( type="tool_call_delta", 
tool_call_id=call_id, tool_name=_get_event_attr(item, "name"), tool_arguments=args_value, ) ) return if event_type in ( "response.function_call_arguments.delta", "response.mcp_call_arguments.delta", "response.custom_tool_call_input.delta", ): item_id = _get_event_attr(event, "item_id") call_id = _get_event_attr(event, "call_id") if call_id and item_id: state.item_to_call_id[item_id] = call_id if not call_id: call_id = state.item_to_call_id.get(item_id) if item_id else None if not call_id and item_id: call_id = item_id if not call_id: return entry = state.tool_calls.setdefault( call_id, { "id": call_id, "name": _get_event_attr(event, "name"), "arguments": "", }, ) delta_value = _get_event_attr(event, "delta") if delta_value is None: delta_value = _get_event_attr(event, "input") entry["arguments"] += ensure_str(delta_value) await on_event( StreamEvent( type="tool_call_delta", tool_call_id=call_id, tool_name=entry.get("name"), tool_arguments=entry.get("arguments"), ) ) return if event_type not in ( "response.function_call_arguments.done", "response.mcp_call_arguments.done", "response.custom_tool_call_input.done", ): return item_id = _get_event_attr(event, "item_id") call_id = _get_event_attr(event, "call_id") if call_id and item_id: state.item_to_call_id[item_id] = call_id if not call_id: call_id = state.item_to_call_id.get(item_id) if item_id else None if not call_id and item_id: call_id = item_id if not call_id: return entry = state.tool_calls.setdefault( call_id, { "id": call_id, "name": _get_event_attr(event, "name"), "arguments": "", }, ) final_value = _get_event_attr(event, "arguments") if final_value is None: final_value = _get_event_attr(event, "input") if final_value is None: final_value = entry["arguments"] entry["arguments"] = final_value if _get_event_attr(event, "name"): entry["name"] = _get_event_attr(event, "name") await on_event( StreamEvent( type="tool_call_delta", tool_call_id=call_id, tool_name=entry.get("name"), 
tool_arguments=entry.get("arguments"), ) ) output_index = _get_event_attr(event, "output_index") if ( item_id and isinstance(output_index, int) and isinstance(state.output_items_by_index.get(output_index), dict) ): state.output_items_by_index[output_index] = { **state.output_items_by_index[output_index], "arguments": entry["arguments"], "call_id": call_id, "name": entry.get("name"), } def _build_provider_turn(state: OpenAIResponsesParseState) -> ProviderTurn: output_items = [ state.output_items_by_index[idx] for idx in sorted(state.output_items_by_index.keys()) if state.output_items_by_index.get(idx) ] tool_items = [ item for item in output_items if isinstance(item, dict) and item.get("type") in ("function_call", "custom_tool_call") ] tool_calls: List[ToolCall] = [] if tool_items: for item in tool_items: raw_args = item.get("arguments") if raw_args is None and item.get("type") == "custom_tool_call": raw_args = item.get("input") args, error = parse_json_arguments(raw_args) if error: args = {"INVALID_JSON": ensure_str(raw_args)} call_id = item.get("call_id") or item.get("id") tool_calls.append( ToolCall( id=call_id or f"call-{uuid.uuid4().hex[:6]}", name=item.get("name") or "unknown_tool", arguments=args, ) ) else: for entry in state.tool_calls.values(): args, error = parse_json_arguments(entry.get("arguments")) if error: args = {"INVALID_JSON": ensure_str(entry.get("arguments"))} call_id = entry.get("id") or entry.get("call_id") tool_calls.append( ToolCall( id=call_id or f"call-{uuid.uuid4().hex[:6]}", name=entry.get("name") or "unknown_tool", arguments=args, ) ) assistant_turn: List[Dict[str, Any]] = output_items if tool_calls else [] return ProviderTurn( assistant_text=state.assistant_text, tool_calls=tool_calls, assistant_turn=assistant_turn, ) class OpenAIProviderSession(ProviderSession): def __init__( self, client: AsyncOpenAI, model: Llm, prompt_messages: List[ChatCompletionMessageParam], tools: List[Dict[str, Any]], ): self._client = client self._model = 
model self._tools = tools self._total_usage = TokenUsage() self._turn_input_logger = OpenAITurnInputLogger( model, enabled=IS_DEBUG_ENABLED, ) self._input_items: List[Dict[str, Any]] = [ _convert_message_to_responses_input(message) for message in prompt_messages ] async def stream_turn(self, on_event: EventSink) -> ProviderTurn: model_name = get_openai_api_name(self._model) params: Dict[str, Any] = { "model": model_name, "input": self._input_items, "tools": self._tools, "tool_choice": "auto", "stream": True, "max_output_tokens": 50000, } if model_name == "gpt-5.4-2026-03-05": params["prompt_cache_retention"] = "24h" reasoning_effort = get_openai_reasoning_effort(self._model) if reasoning_effort: params["reasoning"] = {"effort": reasoning_effort, "summary": "auto"} self._turn_input_logger.record_turn_input( self._input_items, request_payload=params, ) state = OpenAIResponsesParseState() stream = await self._client.responses.create(**params) # type: ignore async for event in stream: # type: ignore await parse_event(event, state, on_event) if state.turn_usage is not None: self._turn_input_logger.record_turn_usage(state.turn_usage) self._total_usage.accumulate(state.turn_usage) return _build_provider_turn(state) def append_tool_results( self, turn: ProviderTurn, executed_tool_calls: list[ExecutedToolCall], ) -> None: assistant_output_items = turn.assistant_turn or [] if assistant_output_items: self._input_items.extend(assistant_output_items) tool_output_items: List[Dict[str, Any]] = [] for executed in executed_tool_calls: tool_output_items.append( { "type": "function_call_output", "call_id": executed.tool_call.id, "output": json.dumps(executed.result.result), } ) self._input_items.extend(tool_output_items) async def close(self) -> None: u = self._total_usage model_name = get_openai_api_name(self._model) pricing = MODEL_PRICING.get(model_name) cost_str = f" cost=${u.cost(pricing):.4f}" if pricing else "" cache_hit_rate_str = f" 
cache_hit_rate={u.cache_hit_rate_percent():.2f}%" print( f"[TOKEN USAGE] provider=openai model={model_name} | " f"input={u.input} output={u.output} " f"cache_read={u.cache_read} cache_write={u.cache_write} " f"total={u.total}{cache_hit_rate_str}{cost_str}" ) report_path = self._turn_input_logger.write_html_report() if report_path: print(f"[OPENAI TURN INPUT] HTML report: {report_path}") await self._client.close() ================================================ FILE: backend/agent/providers/pricing.py ================================================ from dataclasses import dataclass from typing import Dict @dataclass class ModelPricing: """Per-million-token pricing in USD.""" input: float = 0.0 output: float = 0.0 cache_read: float = 0.0 cache_write: float = 0.0 # Pricing keyed by the API model name string sent to the provider. MODEL_PRICING: Dict[str, ModelPricing] = { # --- OpenAI --- "gpt-4.1-2025-04-14": ModelPricing( input=2.00, output=8.00, cache_read=0.50 ), "gpt-5.2-codex": ModelPricing( input=1.75, output=14.00, cache_read=0.4375 ), "gpt-5.3-codex": ModelPricing( input=1.75, output=14.00, cache_read=0.4375 ), "gpt-5.4-2026-03-05": ModelPricing( input=2.50, output=15.00, cache_read=0.25 ), # --- Anthropic --- "claude-sonnet-4-6": ModelPricing( input=3.00, output=15.00, cache_read=0.30, cache_write=3.75 ), "claude-sonnet-4-5-20250929": ModelPricing( input=3.00, output=15.00, cache_read=0.30, cache_write=3.75 ), "claude-opus-4-5-20251101": ModelPricing( input=5.00, output=25.00, cache_read=0.50, cache_write=6.25 ), "claude-opus-4-6": ModelPricing( input=5.00, output=25.00, cache_read=0.50, cache_write=6.25 ), # --- Gemini --- "gemini-3-flash-preview": ModelPricing( input=0.50, output=3.00, cache_read=0.05 ), "gemini-3-pro-preview": ModelPricing( input=2.00, output=12.00, cache_read=0.20 ), "gemini-3.1-pro-preview": ModelPricing( input=2.00, output=12.00, cache_read=0.20 ), } ================================================ FILE: 
backend/agent/providers/token_usage.py
================================================
from __future__ import annotations

from dataclasses import dataclass

from agent.providers.pricing import ModelPricing


@dataclass
class TokenUsage:
    """Unified token usage across all providers.

    Log line example:
    [TOKEN USAGE] provider=gemini model=... |
        input=1000 output=500 cache_read=200 cache_write=0 total=1700 cost=$0.0020

    Fields:
        input: Non-cached input tokens (billed at full input rate). For
            providers whose API includes cached tokens in the prompt count
            (OpenAI, Gemini), cached tokens are subtracted so this is always
            *exclusive* of cache_read.
        output: Output tokens including thinking/reasoning (billed at output
            rate).
        cache_read: Input tokens served from cache (billed at reduced rate).
        cache_write: Input tokens written to cache (Anthropic only).
        total: All tokens as reported by the provider API. Equals
            input + cache_read + output (+ thinking for Gemini).

    Total input sent to the model = input + cache_read + cache_write.
    Cost = (input * input_rate + output * output_rate
            + cache_read * cache_read_rate
            + cache_write * cache_write_rate) / 1_000_000
    """

    input: int = 0        # non-cached input tokens
    output: int = 0       # output tokens, incl. thinking/reasoning
    cache_read: int = 0   # input tokens served from cache
    cache_write: int = 0  # input tokens written to cache (Anthropic only)
    total: int = 0        # provider-reported grand total

    def accumulate(self, other: TokenUsage) -> None:
        """Add another usage sample into this one, field by field (in place)."""
        self.input += other.input
        self.output += other.output
        self.cache_read += other.cache_read
        self.cache_write += other.cache_write
        self.total += other.total

    def cost(self, pricing: ModelPricing) -> float:
        """Compute cost in USD using per-million-token rates."""
        return (
            self.input * pricing.input
            + self.output * pricing.output
            + self.cache_read * pricing.cache_read
            + self.cache_write * pricing.cache_write
        ) / 1_000_000

    def total_input_tokens(self) -> int:
        """All input tokens, including non-cached, cache-read, and cache-write."""
        return self.input + self.cache_read + self.cache_write

    def cache_hit_rate_percent(self) -> float:
        """Percent of total input tokens served from cache (0.0 when no input)."""
        total_input = self.total_input_tokens()
        if total_input == 0:
            return 0.0
        return (self.cache_read / total_input) * 100.0


================================================
FILE: backend/agent/providers/types.py
================================================
from agent.providers.base import (
    EventSink,
    ExecutedToolCall,
    ProviderTurn,
    StreamEvent,
)

# Backwards-compatible alias for older imports.
StepResult = ProviderTurn

__all__ = [
    "EventSink",
    "ExecutedToolCall",
    "ProviderTurn",
    "StepResult",
    "StreamEvent",
]


================================================
FILE: backend/agent/runner.py
================================================
from agent.engine import AgentEngine


# Thin public alias: the implementation lives entirely in AgentEngine.
class Agent(AgentEngine):
    pass


================================================
FILE: backend/agent/state.py
================================================
from dataclasses import dataclass
from typing import Any, List

from openai.types.chat import ChatCompletionMessageParam

from codegen.utils import extract_html_content


@dataclass
class AgentFileState:
    """The single file the agent edits: its path and full text content."""

    path: str = "index.html"
    content: str = ""


def ensure_str(value: Any) -> str:
    """Coerce any value to str, mapping None to the empty string."""
    if value is None:
        return ""
    return str(value)


def extract_text_content(message: ChatCompletionMessageParam) -> str:
    """Return the first text payload of a chat message.

    String content is returned as-is; for list-style content the text of the
    *first* "text" part is returned. Empty string when no text is found.
    """
    content = message.get("content", "")
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        for part in content:
            if isinstance(part, dict) and part.get("type") == "text":
                return ensure_str(part.get("text"))
    return ""


def seed_file_state_from_messages(
    file_state: AgentFileState,
    prompt_messages: List[ChatCompletionMessageParam],
) -> None:
    """Initialize ``file_state.content`` from prior conversation history.

    No-op if the file already has content. Otherwise tries, in order:
    1. The *most recent* assistant message (latest generated code wins);
    2. Code embedded in the system prompt after a known marker.
    In both cases the HTML is extracted from surrounding prose when possible,
    falling back to the raw text.
    """
    if file_state.content:
        return
    # Prefer the newest assistant output, hence the reversed scan.
    for message in reversed(prompt_messages):
        if message.get("role") != "assistant":
            continue
        raw_text = extract_text_content(message)
        if not raw_text:
            continue
        extracted = extract_html_content(raw_text)
        file_state.content = extracted or raw_text
        if not file_state.path:
            file_state.path = "index.html"
        return
    if not prompt_messages:
        return
    # Fallback: some flows embed the current app code in the system prompt.
    system_message = prompt_messages[0]
    if system_message.get("role") != "system":
        return
    system_text = extract_text_content(system_message)
    markers = [
        "Here is the code of the app:",
    ]
    for marker in markers:
        if marker not in system_text:
            continue
        raw_text = system_text.split(marker, 1)[1].strip()
        extracted = extract_html_content(raw_text)
        file_state.content = extracted or raw_text
        if not file_state.path:
            file_state.path = "index.html"
return ================================================ FILE: backend/agent/tools/__init__.py ================================================ from agent.tools.definitions import canonical_tool_definitions from agent.tools.parsing import ( extract_content_from_args, extract_path_from_args, parse_json_arguments, ) from agent.tools.runtime import AgentToolRuntime, AgentToolbox from agent.tools.summaries import summarize_text, summarize_tool_input from agent.tools.types import ( CanonicalToolDefinition, ToolCall, ToolExecutionResult, ) __all__ = [ "AgentToolRuntime", "AgentToolbox", "CanonicalToolDefinition", "ToolCall", "ToolExecutionResult", "canonical_tool_definitions", "extract_content_from_args", "extract_path_from_args", "parse_json_arguments", "summarize_text", "summarize_tool_input", ] ================================================ FILE: backend/agent/tools/definitions.py ================================================ from typing import Any, Dict, List from agent.tools.types import CanonicalToolDefinition def _create_schema() -> Dict[str, Any]: return { "type": "object", "properties": { "path": { "type": "string", "description": "Path for the main HTML file. Use index.html if unsure.", }, "content": { "type": "string", "description": "Full HTML for the single-file app.", }, }, "required": ["content"], } def _edit_schema() -> Dict[str, Any]: return { "type": "object", "properties": { "path": { "type": "string", "description": "Path for the main HTML file.", }, "old_text": { "type": "string", "description": "Exact text to replace. Must match the file contents.", }, "new_text": { "type": "string", "description": "Replacement text.", }, "count": { "type": "integer", "description": "How many occurrences to replace. 
Use -1 for all.", }, "edits": { "type": "array", "items": { "type": "object", "properties": { "old_text": {"type": "string"}, "new_text": {"type": "string"}, "count": {"type": "integer"}, }, "required": ["old_text", "new_text"], }, }, }, } def _image_schema() -> Dict[str, Any]: return { "type": "object", "properties": { "prompts": { "type": "array", "items": { "type": "string", "description": "Prompt describing a single image to generate.", }, } }, "required": ["prompts"], } def _remove_background_schema() -> Dict[str, Any]: return { "type": "object", "properties": { "image_urls": { "type": "array", "items": { "type": "string", "description": "URL of an image to remove the background from.", }, }, }, "required": ["image_urls"], } def _retrieve_option_schema() -> Dict[str, Any]: return { "type": "object", "properties": { "option_number": { "type": "integer", "description": "1-based option number to retrieve (Option 1, Option 2, etc.).", } }, "required": ["option_number"], } def canonical_tool_definitions( image_generation_enabled: bool = True, ) -> List[CanonicalToolDefinition]: tools: List[CanonicalToolDefinition] = [ CanonicalToolDefinition( name="create_file", description=( "Create the main HTML file for the app. Use exactly once to write the " "full HTML. Returns a success message and file metadata." ), parameters=_create_schema(), ), CanonicalToolDefinition( name="edit_file", description=( "Edit the main HTML file using exact string replacements. Do not " "regenerate the entire file. Returns a success message plus edit " "details, including a unified diff and first changed line." ), parameters=_edit_schema(), ), ] if image_generation_enabled: tools.append( CanonicalToolDefinition( name="generate_images", description=( "Generate image URLs from prompts. Use to replace placeholder images. " "You can pass multiple prompts at once." 
),
                parameters=_image_schema(),
            )
        )
    # Background removal and variant retrieval are always available.
    tools.extend(
        [
            CanonicalToolDefinition(
                name="remove_background",
                description=(
                    "Remove the background from one or more images. You can pass multiple "
                    "image URLs at once. Returns URLs to the processed images with "
                    "transparent backgrounds."
                ),
                parameters=_remove_background_schema(),
            ),
            CanonicalToolDefinition(
                name="retrieve_option",
                description=(
                    "Retrieve the full HTML for a specific option (variant) so you can "
                    "reference it."
                ),
                parameters=_retrieve_option_schema(),
            ),
        ]
    )
    return tools


================================================
FILE: backend/agent/tools/parsing.py
================================================
# pyright: reportUnknownVariableType=false

import json
from typing import Any, Dict, Optional, Tuple

from agent.state import ensure_str


def parse_json_arguments(raw_args: Any) -> Tuple[Dict[str, Any], Optional[str]]:
    """Parse tool-call arguments into a dict.

    Returns ``(args, None)`` on success and ``({}, error_message)`` when the
    text is not valid JSON. Dicts pass through unchanged; None/empty input
    parses to an empty dict.
    """
    if isinstance(raw_args, dict):
        return raw_args, None
    if raw_args is None:
        return {}, None
    raw_text = ensure_str(raw_args).strip()
    if not raw_text:
        return {}, None
    try:
        return json.loads(raw_text), None
    except json.JSONDecodeError as exc:
        return {}, f"Invalid JSON arguments: {exc}"


def _strip_incomplete_escape(value: str) -> str:
    """Drop a trailing lone backslash so a truncated escape can't corrupt
    decoding. An odd run of trailing backslashes means the final escape
    sequence was cut mid-stream."""
    if not value:
        return value
    trailing = 0
    for ch in reversed(value):
        if ch == "\\":
            trailing += 1
        else:
            break
    if trailing % 2 == 1:
        return value[:-1]
    return value


def _extract_partial_json_string(raw_text: str, key: str) -> Optional[str]:
    """Best-effort extraction of the string value of ``key`` from possibly
    incomplete (mid-stream) JSON text.

    Scans for ``"key": "`` and then takes everything up to the *last*
    unescaped quote (or to end-of-text when the closing quote has not
    arrived yet). Returns None when the key or its opening quote is absent.

    NOTE(review): using the last unescaped quote assumes ``key`` is the final
    string in the payload; if another string field follows, the extracted
    value would swallow it — confirm against how callers stream arguments.
    """
    if not raw_text:
        return None
    token = f'"{key}"'
    idx = raw_text.find(token)
    if idx == -1:
        return None
    colon = raw_text.find(":", idx + len(token))
    if colon == -1:
        return None
    cursor = colon + 1
    while cursor < len(raw_text) and raw_text[cursor].isspace():
        cursor += 1
    # The value must start with a quote to be a (partial) JSON string.
    if cursor >= len(raw_text) or raw_text[cursor] != '"':
        return None
    start = cursor + 1
    last_quote: Optional[int] = None
    cursor = start
    # Track the last quote preceded by an even number of backslashes
    # (i.e. a real, unescaped string terminator).
    while cursor < len(raw_text):
        if raw_text[cursor] == '"':
            backslashes = 0
            back = cursor - 1
            while back >= start and raw_text[back] == "\\":
                backslashes += 1
                back -= 1
            if backslashes % 2 == 0:
                last_quote = cursor
        cursor += 1
    partial = raw_text[start:] if last_quote is None else raw_text[start:last_quote]
    partial = _strip_incomplete_escape(partial)
    if not partial:
        return ""
    # Prefer a real JSON decode of the escapes; fall back to manually
    # undoing the common escape sequences when the fragment won't decode.
    try:
        return json.loads(f'"{partial}"')
    except Exception:
        return (
            partial.replace("\\n", "\n")
            .replace("\\t", "\t")
            .replace("\\r", "\r")
            .replace('\\"', '"')
            .replace("\\\\", "\\")
        )


def extract_content_from_args(raw_args: Any) -> Optional[str]:
    """Pull the "content" field out of complete or streaming tool arguments."""
    if isinstance(raw_args, dict):
        content = raw_args.get("content")
        if content is None:
            return None
        return ensure_str(content)
    raw_text = ensure_str(raw_args)
    return _extract_partial_json_string(raw_text, "content")


def extract_path_from_args(raw_args: Any) -> Optional[str]:
    """Pull the "path" field out of complete or streaming tool arguments."""
    if isinstance(raw_args, dict):
        path = raw_args.get("path")
        return ensure_str(path) if path is not None else None
    raw_text = ensure_str(raw_args)
    return _extract_partial_json_string(raw_text, "path")


================================================
FILE: backend/agent/tools/runtime.py
================================================
# pyright: reportUnknownVariableType=false

import asyncio
import difflib
from typing import Any, Dict, List, Optional, Tuple, Union

from codegen.utils import extract_html_content
from config import REPLICATE_API_KEY
from image_generation.generation import process_tasks
from image_generation.replicate import remove_background
from agent.state import AgentFileState, ensure_str
from agent.tools.types import ToolCall, ToolExecutionResult
from agent.tools.summaries import summarize_text


class AgentToolRuntime:
    """Executes agent tool calls against a single mutable file state."""

    def __init__(
        self,
        file_state: AgentFileState,
        should_generate_images: bool,
        openai_api_key: Optional[str],
        openai_base_url: Optional[str],
        option_codes: Optional[List[str]] = None,
    ):
        self.file_state = file_state
        self.should_generate_images = should_generate_images
        self.openai_api_key = openai_api_key
        self.openai_base_url = openai_base_url
        # Full HTML of sibling variants, indexed for retrieve_option.
        self.option_codes = option_codes or []

    async def
execute(self, tool_call: ToolCall) -> ToolExecutionResult:
        """Dispatch one tool call to its handler; never raises for unknown or
        malformed calls — failures come back as ok=False results."""
        # Sentinel inserted upstream when argument JSON failed to parse.
        if "INVALID_JSON" in tool_call.arguments:
            invalid_json = ensure_str(tool_call.arguments.get("INVALID_JSON"))
            return ToolExecutionResult(
                ok=False,
                result={
                    "error": "Tool arguments were invalid JSON.",
                    "INVALID_JSON": invalid_json,
                },
                summary={"error": "Invalid JSON tool arguments"},
            )
        if tool_call.name == "create_file":
            return self._create_file(tool_call.arguments)
        if tool_call.name == "edit_file":
            return self._edit_file(tool_call.arguments)
        if tool_call.name == "generate_images":
            return await self._generate_images(tool_call.arguments)
        if tool_call.name == "remove_background":
            return await self._remove_background(tool_call.arguments)
        if tool_call.name == "retrieve_option":
            return self._retrieve_option(tool_call.arguments)
        return ToolExecutionResult(
            ok=False,
            result={"error": f"Unknown tool: {tool_call.name}"},
            summary={"error": f"Unknown tool: {tool_call.name}"},
        )

    def _create_file(self, args: Dict[str, Any]) -> ToolExecutionResult:
        """Write the full HTML file; overwrites any existing content."""
        # Path precedence: explicit arg, then current state, then default.
        path = ensure_str(args.get("path") or self.file_state.path or "index.html")
        content = ensure_str(args.get("content"))
        if not content:
            return ToolExecutionResult(
                ok=False,
                result={"error": "create_file requires non-empty content"},
                summary={"error": "Missing content"},
            )
        # Strip any prose/markdown wrapping around the HTML when possible.
        extracted = extract_html_content(content)
        self.file_state.path = path
        self.file_state.content = extracted or content
        summary = {
            "path": self.file_state.path,
            "contentLength": len(self.file_state.content),
            "preview": summarize_text(self.file_state.content, 320),
        }
        result = {
            "content": f"Successfully created file at {self.file_state.path}.",
            "details": {
                "path": self.file_state.path,
                "contentLength": len(self.file_state.content),
            },
        }
        return ToolExecutionResult(
            ok=True,
            result=result,
            summary=summary,
            updated_content=self.file_state.content,
        )

    @staticmethod
    def _generate_diff(old_content: str, new_content: str, path: str) -> Dict[str, Any]:
        """Generate a unified diff between old and new content.

        Also parses the first hunk header to report the first changed line
        (new-file numbering); None when no hunk header is found.
        """
        old_lines = old_content.splitlines(keepends=True)
        new_lines = new_content.splitlines(keepends=True)
        diff_lines = list(
            difflib.unified_diff(old_lines, new_lines, fromfile=path, tofile=path)
        )
        diff_str = "".join(diff_lines)
        first_changed_line: Optional[int] = None
        for line in diff_lines:
            if not line.startswith("@@"):
                continue
            try:
                # Hunk header looks like "@@ -a,b +c,d @@": take "c".
                plus_part = line.split("+")[1].split("@@")[0].strip()
                first_changed_line = int(plus_part.split(",")[0])
            except (IndexError, ValueError):
                pass
            # Only the first hunk matters; stop either way.
            break
        return {
            "diff": diff_str,
            "firstChangedLine": first_changed_line,
        }

    def _apply_single_edit(
        self,
        content: str,
        old_text: str,
        new_text: str,
        count: Optional[int],
    ) -> Tuple[str, int]:
        """Replace ``old_text`` with ``new_text`` and report how many
        occurrences were replaced.

        count semantics: None -> exactly one; negative -> all occurrences;
        non-negative -> at most that many. Returns (content, 0) unchanged
        when old_text is absent.
        """
        if old_text not in content:
            return content, 0
        if count is None:
            replace_count = 1
        elif count < 0:
            replace_count = content.count(old_text)
        else:
            replace_count = count
        updated = content.replace(old_text, new_text, replace_count)
        # Clamp the reported count to the occurrences actually available
        # (counted on the pre-edit content).
        return updated, min(replace_count, content.count(old_text))

    def _edit_file(self, args: Dict[str, Any]) -> ToolExecutionResult:
        if not self.file_state.content:
            return ToolExecutionResult(
                ok=False,
                result={"error": "No file exists yet.
Call create_file first."}, summary={"error": "No file to edit"}, ) edits = args.get("edits") if not edits: old_text = ensure_str(args.get("old_text")) new_text = ensure_str(args.get("new_text")) count = args.get("count") edits = [{"old_text": old_text, "new_text": new_text, "count": count}] if not isinstance(edits, list): return ToolExecutionResult( ok=False, result={"error": "edits must be a list"}, summary={"error": "Invalid edits payload"}, ) content = self.file_state.content original_content = content summary_edits: List[Dict[str, Any]] = [] for edit in edits: old_text = ensure_str(edit.get("old_text")) new_text = ensure_str(edit.get("new_text")) count = edit.get("count") if not old_text: return ToolExecutionResult( ok=False, result={"error": "edit_file requires old_text"}, summary={"error": "Missing old_text"}, ) content, replaced = self._apply_single_edit(content, old_text, new_text, count) if replaced == 0: return ToolExecutionResult( ok=False, result={"error": "old_text not found", "old_text": old_text}, summary={ "error": "old_text not found", "old_text": summarize_text(old_text, 160), }, ) summary_edits.append( { "old_text": summarize_text(old_text, 140), "new_text": summarize_text(new_text, 140), "replaced": replaced, } ) self.file_state.content = content path = self.file_state.path or "index.html" diff_info = self._generate_diff(original_content, content, path) summary = { "path": path, "edits": summary_edits, "contentLength": len(self.file_state.content), "diff": diff_info["diff"], "firstChangedLine": diff_info["firstChangedLine"], } result = { "content": f"Successfully edited file at {path}.", "details": { "diff": diff_info["diff"], "firstChangedLine": diff_info["firstChangedLine"], }, } return ToolExecutionResult( ok=True, result=result, summary=summary, updated_content=self.file_state.content, ) async def _generate_images(self, args: Dict[str, Any]) -> ToolExecutionResult: if not self.should_generate_images: return ToolExecutionResult( ok=False, 
result={"error": "Image generation is disabled."}, summary={"error": "Image generation disabled"}, ) prompts = args.get("prompts") or [] if not isinstance(prompts, list) or not prompts: return ToolExecutionResult( ok=False, result={"error": "generate_images requires a non-empty prompts list"}, summary={"error": "Missing prompts"}, ) cleaned = [prompt.strip() for prompt in prompts if isinstance(prompt, str)] unique_prompts = list(dict.fromkeys([p for p in cleaned if p])) if not unique_prompts: return ToolExecutionResult( ok=False, result={"error": "No valid prompts provided"}, summary={"error": "No valid prompts"}, ) if REPLICATE_API_KEY: model = "flux" api_key = REPLICATE_API_KEY base_url = None else: if not self.openai_api_key: return ToolExecutionResult( ok=False, result={"error": "No API key available for image generation."}, summary={"error": "Missing image generation API key"}, ) model = "dalle3" api_key = self.openai_api_key base_url = self.openai_base_url generated = await process_tasks(unique_prompts, api_key, base_url, model) # type: ignore merged_results = { prompt: url for prompt, url in zip(unique_prompts, generated) } summary_items = [ { "prompt": prompt, "url": url, "status": "ok" if url else "error", } for prompt, url in merged_results.items() ] result = {"images": merged_results} summary = {"images": summary_items} return ToolExecutionResult(ok=True, result=result, summary=summary) async def _remove_background(self, args: Dict[str, Any]) -> ToolExecutionResult: if not REPLICATE_API_KEY: return ToolExecutionResult( ok=False, result={"error": "Background removal requires REPLICATE_API_KEY."}, summary={"error": "Missing Replicate API key"}, ) image_urls = args.get("image_urls") or [] if not isinstance(image_urls, list) or not image_urls: return ToolExecutionResult( ok=False, result={ "error": "remove_background requires a non-empty image_urls list" }, summary={"error": "Missing image_urls"}, ) cleaned = [url.strip() for url in image_urls if 
isinstance(url, str)] unique_urls = list(dict.fromkeys([u for u in cleaned if u])) if not unique_urls: return ToolExecutionResult( ok=False, result={"error": "No valid image URLs provided"}, summary={"error": "No valid image_urls"}, ) batch_size = 20 raw_results: list[str | BaseException] = [] for i in range(0, len(unique_urls), batch_size): batch = unique_urls[i : i + batch_size] tasks = [remove_background(url, REPLICATE_API_KEY) for url in batch] raw_results.extend(await asyncio.gather(*tasks, return_exceptions=True)) results: List[Dict[str, Any]] = [] for url, raw in zip(unique_urls, raw_results): if isinstance(raw, BaseException): print(f"Background removal failed for {url}: {raw}") results.append( {"image_url": url, "result_url": None, "status": "error"} ) else: results.append( {"image_url": url, "result_url": raw, "status": "ok"} ) summary_items = [ { "image_url": summarize_text(r["image_url"], 100), "result_url": r["result_url"], "status": r["status"], } for r in results ] return ToolExecutionResult( ok=True, result={"images": results}, summary={"images": summary_items}, ) def _retrieve_option(self, args: Dict[str, Any]) -> ToolExecutionResult: raw_option_number = args.get("option_number") raw_index = args.get("index") def coerce_int(value: Any) -> Optional[int]: if value is None: return None try: return int(value) except (TypeError, ValueError): return None option_number = coerce_int(raw_option_number) index = coerce_int(raw_index) if option_number is None and index is None: return ToolExecutionResult( ok=False, result={"error": "retrieve_option requires option_number"}, summary={"error": "Missing option_number"}, ) resolved_index = index if option_number is None else option_number - 1 if resolved_index is None: return ToolExecutionResult( ok=False, result={"error": "Invalid option_number"}, summary={"error": "Invalid option_number"}, ) if resolved_index < 0 or resolved_index >= len(self.option_codes): return ToolExecutionResult( ok=False, result={ "error": 
"Option index out of range", "option_number": resolved_index + 1, "available": len(self.option_codes), }, summary={ "error": "Option index out of range", "available": len(self.option_codes), }, ) code = ensure_str(self.option_codes[resolved_index]) if not code.strip(): return ToolExecutionResult( ok=False, result={ "error": "Option code is empty or unavailable", "option_number": resolved_index + 1, }, summary={"error": "Option code unavailable"}, ) summary = { "option_number": resolved_index + 1, "contentLength": len(code), "preview": summarize_text(code, 200), } result = {"option_number": resolved_index + 1, "code": code} return ToolExecutionResult(ok=True, result=result, summary=summary) # Backwards-compatible alias for older imports. AgentToolbox = AgentToolRuntime ================================================ FILE: backend/agent/tools/summaries.py ================================================ # pyright: reportUnknownVariableType=false from typing import Any, Dict from agent.state import AgentFileState, ensure_str from agent.tools.types import ToolCall def summarize_text(value: str, limit: int = 240) -> str: if len(value) <= limit: return value return value[:limit] + "..." 
def summarize_tool_input(tool_call: ToolCall, file_state: AgentFileState) -> Dict[str, Any]: args = tool_call.arguments or {} if tool_call.name == "create_file": content = ensure_str(args.get("content")) return { "path": args.get("path") or file_state.path, "contentLength": len(content), "preview": summarize_text(content, 200), } if tool_call.name == "edit_file": edits = args.get("edits") if not edits: edits = [ { "old_text": args.get("old_text"), "new_text": args.get("new_text"), "count": args.get("count"), } ] summary_edits = [] for edit in edits if isinstance(edits, list) else []: summary_edits.append( { "old_text": summarize_text(ensure_str(edit.get("old_text")), 160), "new_text": summarize_text(ensure_str(edit.get("new_text")), 160), "count": edit.get("count"), } ) return { "path": args.get("path") or file_state.path, "edits": summary_edits, } if tool_call.name == "generate_images": prompts = args.get("prompts") or [] if isinstance(prompts, list): return { "count": len(prompts), "prompts": [ensure_str(p) for p in prompts], } if tool_call.name == "remove_background": image_urls = args.get("image_urls") or [] if isinstance(image_urls, list): return { "count": len(image_urls), "image_urls": [ensure_str(u) for u in image_urls], } return {"image_urls": []} if tool_call.name == "retrieve_option": return { "option_number": args.get("option_number"), "index": args.get("index"), } return args ================================================ FILE: backend/agent/tools/types.py ================================================ from dataclasses import dataclass from typing import Any, Dict, Optional @dataclass(frozen=True) class ToolCall: id: str name: str arguments: Dict[str, Any] @dataclass class ToolExecutionResult: ok: bool result: Dict[str, Any] summary: Dict[str, Any] updated_content: Optional[str] = None @dataclass(frozen=True) class CanonicalToolDefinition: name: str description: str parameters: Dict[str, Any] ================================================ FILE: 
backend/codegen/__init__.py ================================================ ================================================ FILE: backend/codegen/test_utils.py ================================================ import unittest from codegen.utils import extract_html_content class TestUtils(unittest.TestCase): def test_extract_html_content_with_html_tags(self): text = "

Hello, World!

" expected = "

Hello, World!

" result = extract_html_content(text) self.assertEqual(result, expected) def test_extract_html_content_without_html_tags(self): text = "No HTML content here." expected = "No HTML content here." result = extract_html_content(text) self.assertEqual(result, expected) def test_extract_html_content_with_partial_html_tags(self): text = "

Hello, World!

" expected = "

Hello, World!

" result = extract_html_content(text) self.assertEqual(result, expected) def test_extract_html_content_with_multiple_html_tags(self): text = "

First

Some text

Second

" expected = "

First

" result = extract_html_content(text) self.assertEqual(result, expected) ## The following are tests based on actual LLM outputs def test_extract_html_content_some_explanation_before(self): text = """Got it! You want the song list to be displayed horizontally. I'll update the code to ensure that the song list is displayed in a horizontal layout. Here's the updated code: """ expected = '' result = extract_html_content(text) self.assertEqual(result, expected) def test_markdown_tags(self): text = "```html```" expected = "```html```" result = extract_html_content(text) self.assertEqual(result, expected) def test_doctype_text(self): text = '' expected = '' result = extract_html_content(text) self.assertEqual(result, expected) if __name__ == "__main__": unittest.main() ================================================ FILE: backend/codegen/utils.py ================================================ import re def extract_html_content(text: str) -> str: file_match = re.search( r"\s*(.*?)\s*", text, re.DOTALL | re.IGNORECASE, ) if file_match: return extract_html_content(file_match.group(1).strip()) # First, strip markdown code fences if present text = re.sub(r'^```html?\s*\n?', '', text, flags=re.MULTILINE) text = re.sub(r'\n?```\s*$', '', text, flags=re.MULTILINE) # Try to find DOCTYPE + html tags together match_with_doctype = re.search( r"(]*>.*?.*?)", text, re.DOTALL | re.IGNORECASE ) if match_with_doctype: return match_with_doctype.group(1) # Fall back to just tags match = re.search(r"(.*?)", text, re.DOTALL) if match: return match.group(1) else: # Otherwise, we just send the previous HTML over print( "[HTML Extraction] No tags found in the generated content" ) return text ================================================ FILE: backend/config.py ================================================ import os NUM_VARIANTS = 4 NUM_VARIANTS_VIDEO = 2 # LLM-related OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", None) ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", None) 
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", None)
OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", None)

# Image generation (optional)
REPLICATE_API_KEY = os.environ.get("REPLICATE_API_KEY", None)

# Debugging-related
# NOTE(review): bool() on a non-empty string is always True, so
# IS_DEBUG_ENABLED=false or IS_DEBUG_ENABLED=0 in the environment still
# enables debugging — set/unset the variable rather than using "false".
IS_DEBUG_ENABLED = bool(os.environ.get("IS_DEBUG_ENABLED", False))
DEBUG_DIR = os.environ.get("DEBUG_DIR", "")

# Set to True when running in production (on the hosted version)
# Used as a feature flag to enable or disable certain features
# NOTE(review): this holds the raw env string (or False), not a bool;
# callers must rely on truthiness only.
IS_PROD = os.environ.get("IS_PROD", False)


================================================
FILE: backend/custom_types.py
================================================
from typing import Literal

# The kind of input the user provided for code generation.
InputMode = Literal[
    "image",
    "video",
    "text",
]


================================================
FILE: backend/debug/DebugFileWriter.py
================================================
import os
import logging
import uuid

from config import DEBUG_DIR, IS_DEBUG_ENABLED


class DebugFileWriter:
    """Best-effort writer of debug artifacts into a per-run UUID directory.

    All failures are logged and swallowed so debugging never breaks the
    main request path.
    """

    def __init__(self):
        # No-op when debugging is disabled; note this leaves
        # debug_artifacts_path unset (write_to_file then logs an error).
        if not IS_DEBUG_ENABLED:
            return

        try:
            self.debug_artifacts_path = os.path.expanduser(
                f"{DEBUG_DIR}/{str(uuid.uuid4())}"
            )
            os.makedirs(self.debug_artifacts_path, exist_ok=True)
            print(f"Debugging artifacts will be stored in: {self.debug_artifacts_path}")
        # NOTE(review): bare except also catches SystemExit/KeyboardInterrupt;
        # "except Exception" would be the safer best-effort form.
        except:
            logging.error("Failed to create debug directory")

    def write_to_file(self, filename: str, content: str) -> None:
        """Write content under the artifacts directory; log (don't raise) on failure."""
        try:
            with open(os.path.join(self.debug_artifacts_path, filename), "w") as file:
                file.write(content)
        except Exception as e:
            logging.error(f"Failed to write to file: {e}")

    def extract_html_content(self, text: str) -> str:
        # NOTE(review): the split/rsplit marker strings below appear to have
        # had their angle-bracket tags stripped by the text extraction
        # (presumably the opening/closing html tags) — confirm against the
        # upstream repository before relying on this method.
        return str(text.split("")[-1].rsplit("", 1)[0] + "")


================================================
FILE: backend/debug/__init__.py
================================================


================================================
FILE: backend/evals/__init__.py
================================================


================================================
FILE: backend/evals/config.py
================================================ EVALS_DIR = "./evals_data" ================================================ FILE: backend/evals/core.py ================================================ from config import ( ANTHROPIC_API_KEY, GEMINI_API_KEY, OPENAI_API_KEY, OPENAI_BASE_URL, ) from llm import Llm, OPENAI_MODELS, ANTHROPIC_MODELS, GEMINI_MODELS from agent.runner import Agent from prompts.create.image import build_image_prompt_messages from prompts.prompt_types import Stack from openai.types.chat import ChatCompletionMessageParam from typing import Any async def generate_code_for_image(image_url: str, stack: Stack, model: Llm) -> str: prompt_messages = build_image_prompt_messages( image_data_urls=[image_url], stack=stack, text_prompt="", image_generation_enabled=True, ) async def send_message( _: str, __: str | None, ___: int, ____: dict[str, Any] | None = None, _____: str | None = None, ) -> None: # Evals do not stream tool/assistant messages to a frontend. return None if model in ANTHROPIC_MODELS and not ANTHROPIC_API_KEY: raise Exception("Anthropic API key not found") if model in GEMINI_MODELS and not GEMINI_API_KEY: raise Exception("Gemini API key not found") if model in OPENAI_MODELS and not OPENAI_API_KEY: raise Exception("OpenAI API key not found") print(f"[EVALS] Using agent runner for model: {model.value}") runner = Agent( send_message=send_message, variant_index=0, openai_api_key=OPENAI_API_KEY, openai_base_url=OPENAI_BASE_URL, anthropic_api_key=ANTHROPIC_API_KEY, gemini_api_key=GEMINI_API_KEY, should_generate_images=True, initial_file_state=None, option_codes=None, ) return await runner.run(model, prompt_messages) ================================================ FILE: backend/evals/runner.py ================================================ from typing import Any, Awaitable, Callable, Coroutine, List, Optional, Tuple import asyncio import os from datetime import datetime import time import inspect from llm import Llm from prompts.prompt_types 
import Stack from .core import generate_code_for_image from .utils import image_to_data_url from .config import EVALS_DIR MAX_EVAL_RETRIES = 2 def _resolve_eval_filenames(input_files: Optional[List[str]]) -> List[str]: input_dir = EVALS_DIR + "/inputs" if input_files and len(input_files) > 0: return [os.path.basename(f) for f in input_files if f.endswith(".png")] return [f for f in os.listdir(input_dir) if f.endswith(".png")] def _output_html_filename(original_filename: str, attempt_idx: int) -> str: return f"{os.path.splitext(original_filename)[0]}_{attempt_idx}.html" def get_eval_output_subfolder(stack: Stack, model: str) -> str: today = datetime.now().strftime("%b_%d_%Y") output_dir = EVALS_DIR + "/outputs" return os.path.join(output_dir, f"{today}_{model}_{stack}") def count_pending_eval_tasks( stack: Stack, model: str, input_files: Optional[List[str]] = None, n: int = 1, diff_mode: bool = False, ) -> Tuple[int, int]: evals = _resolve_eval_filenames(input_files) if not diff_mode: return len(evals) * n, 0 output_subfolder = get_eval_output_subfolder(stack=stack, model=model) pending_tasks = 0 skipped_existing_tasks = 0 for original_filename in evals: for n_idx in range(n): output_filename = _output_html_filename(original_filename, n_idx) output_path = os.path.join(output_subfolder, output_filename) if os.path.exists(output_path): skipped_existing_tasks += 1 else: pending_tasks += 1 return pending_tasks, skipped_existing_tasks async def generate_code_and_time( image_url: str, stack: Stack, model: Llm, original_input_filename: str, attempt_idx: int, ) -> Tuple[str, int, Optional[str], Optional[float], Optional[Exception], int]: """ Generates code for an image, measures the time taken, and returns identifiers along with success/failure status. Returns a tuple: (original_input_filename, attempt_idx, content, duration, error_object, retries_used) content and duration are None if an error occurs during generation. 
""" retries_used = 0 while True: start_time = time.perf_counter() try: content = await generate_code_for_image( image_url=image_url, stack=stack, model=model ) end_time = time.perf_counter() duration = end_time - start_time return ( original_input_filename, attempt_idx, content, duration, None, retries_used, ) except Exception as e: if retries_used >= MAX_EVAL_RETRIES: print( f"Error during code generation for {original_input_filename} " f"(attempt {attempt_idx}, retries exhausted): {e}" ) return ( original_input_filename, attempt_idx, None, None, e, retries_used, ) retries_used += 1 print( f"Retrying {original_input_filename} (attempt {attempt_idx}) " f"{retries_used}/{MAX_EVAL_RETRIES} after error: {e}" ) async def run_image_evals( stack: Optional[Stack] = None, model: Optional[str] = None, n: int = 1, input_files: Optional[List[str]] = None, diff_mode: bool = False, progress_callback: Optional[Callable[[dict[str, Any]], Any | Awaitable[Any]]] = None, ) -> List[str]: INPUT_DIR = EVALS_DIR + "/inputs" evals = _resolve_eval_filenames(input_files) if not stack: raise ValueError("No stack was provided") if not model: raise ValueError("No model was provided") print("User selected stack:", stack) print("User selected model:", model) selected_model = Llm(model) print(f"Running evals for {selected_model.value} model") if input_files and len(input_files) > 0: print(f"Running on {len(evals)} selected files") else: print(f"Running on all {len(evals)} files in {INPUT_DIR}") output_subfolder = get_eval_output_subfolder( stack=stack, model=selected_model.value, ) os.makedirs(output_subfolder, exist_ok=True) task_coroutines: List[ Coroutine[ Any, Any, Tuple[str, int, Optional[str], Optional[float], Optional[Exception], int], ] ] = [] skipped_existing_tasks = 0 for original_filename in evals: # Handle both full paths and relative filenames if os.path.isabs(original_filename): filepath = original_filename original_filename = os.path.basename(original_filename) else: filepath = 
os.path.join(INPUT_DIR, original_filename) data_url: Optional[str] = None for n_idx in range(n): output_filename = _output_html_filename(original_filename, n_idx) output_path = os.path.join(output_subfolder, output_filename) if diff_mode and os.path.exists(output_path): skipped_existing_tasks += 1 continue if data_url is None: data_url = await image_to_data_url(filepath) current_model_for_task = ( selected_model if n_idx == 0 else Llm.GPT_4_1_2025_04_14 ) coro = generate_code_and_time( image_url=data_url, stack=stack, model=current_model_for_task, original_input_filename=original_filename, attempt_idx=n_idx, ) task_coroutines.append(coro) if diff_mode and skipped_existing_tasks > 0: print( f"Diff mode: skipping {skipped_existing_tasks} existing outputs for " f"{selected_model.value}" ) print(f"Processing {len(task_coroutines)} tasks...") total_tasks = len(task_coroutines) completed_tasks = 0 output_files: List[str] = [] timing_data: List[str] = [] failed_tasks_log: List[str] = [] async def emit_progress(event: dict[str, Any]) -> None: if progress_callback is None: return maybe_awaitable = progress_callback(event) if inspect.isawaitable(maybe_awaitable): await maybe_awaitable for future in asyncio.as_completed(task_coroutines): try: ( task_orig_fn, task_attempt_idx, generated_content, time_taken, error_obj, retries_used, ) = await future completed_tasks += 1 output_html_filename_base = os.path.splitext(task_orig_fn)[0] final_output_html_filename = ( f"{output_html_filename_base}_{task_attempt_idx}.html" ) output_html_filepath = os.path.join( output_subfolder, final_output_html_filename ) if error_obj is not None: failed_tasks_log.append( f"Input: {task_orig_fn}, Attempt: {task_attempt_idx}, OutputFile: " f"{final_output_html_filename}, Retries: {retries_used}, " f"Error: Generation failed - {str(error_obj)}" ) await emit_progress( { "type": "task_complete", "completed_tasks": completed_tasks, "total_tasks": total_tasks, "input_file": task_orig_fn, "attempt_idx": 
task_attempt_idx, "success": False, "error": str(error_obj), "retries_used": retries_used, } ) elif generated_content is not None and time_taken is not None: try: with open(output_html_filepath, "w") as file: file.write(generated_content) timing_data.append( f"{final_output_html_filename}: {time_taken:.2f} seconds" ) output_files.append(final_output_html_filename) print( f"Successfully processed and wrote {final_output_html_filename}" ) await emit_progress( { "type": "task_complete", "completed_tasks": completed_tasks, "total_tasks": total_tasks, "input_file": task_orig_fn, "attempt_idx": task_attempt_idx, "success": True, "output_file": final_output_html_filename, "duration_seconds": time_taken, "retries_used": retries_used, } ) except Exception as e_write: failed_tasks_log.append( f"Input: {task_orig_fn}, Attempt: {task_attempt_idx}, OutputFile: {final_output_html_filename}, Error: Writing to file failed - {str(e_write)}" ) await emit_progress( { "type": "task_complete", "completed_tasks": completed_tasks, "total_tasks": total_tasks, "input_file": task_orig_fn, "attempt_idx": task_attempt_idx, "success": False, "error": str(e_write), } ) else: failed_tasks_log.append( f"Input: {task_orig_fn}, Attempt: {task_attempt_idx}, OutputFile: {final_output_html_filename}, Error: Unknown issue - content or time_taken is None without explicit error." 
) await emit_progress( { "type": "task_complete", "completed_tasks": completed_tasks, "total_tasks": total_tasks, "input_file": task_orig_fn, "attempt_idx": task_attempt_idx, "success": False, "error": "Unknown issue during task processing.", } ) except Exception as e_as_completed: print(f"A task in as_completed failed unexpectedly: {e_as_completed}") failed_tasks_log.append( f"Critical Error: A task processing failed - {str(e_as_completed)}" ) completed_tasks += 1 await emit_progress( { "type": "task_complete", "completed_tasks": completed_tasks, "total_tasks": total_tasks, "input_file": "unknown", "attempt_idx": -1, "success": False, "error": str(e_as_completed), } ) # Write timing data for successful tasks if timing_data: timing_file_path = os.path.join(output_subfolder, "generation_times.txt") try: is_new_or_empty_file = ( not os.path.exists(timing_file_path) or os.path.getsize(timing_file_path) == 0 ) with open(timing_file_path, "a") as file: if is_new_or_empty_file: file.write(f"Model: {selected_model.value}\n") elif timing_data: file.write("\n") file.write("\n".join(timing_data)) print(f"Timing data saved to {timing_file_path}") except Exception as e: print(f"Error writing timing file {timing_file_path}: {e}") # Write log for failed tasks if failed_tasks_log: failed_log_path = os.path.join(output_subfolder, "failed_tasks.txt") try: with open(failed_log_path, "w") as file: file.write("\n".join(failed_tasks_log)) print(f"Failed tasks log saved to {failed_log_path}") except Exception as e: print(f"Error writing failed tasks log {failed_log_path}: {e}") return output_files ================================================ FILE: backend/evals/utils.py ================================================ import base64 async def image_to_data_url(filepath: str): with open(filepath, "rb") as image_file: encoded_string = base64.b64encode(image_file.read()).decode() return f"data:image/png;base64,{encoded_string}" ================================================ FILE: 
backend/fs_logging/__init__.py
================================================


================================================
FILE: backend/fs_logging/openai_input_compare.py
================================================
import json
from dataclasses import dataclass
from typing import Any, TypeAlias, cast

from fs_logging.openai_input_formatting import (
    summarize_responses_input_item,
    to_serializable,
)

# Recursive JSON value model used for structural diffing.
JSONScalar: TypeAlias = None | bool | int | float | str
JSONValue: TypeAlias = JSONScalar | list["JSONValue"] | dict[str, "JSONValue"]


@dataclass(frozen=True)
class OpenAIInputDifference:
    """The first point at which two request inputs diverge."""

    # Index into the "input" array where the divergence occurs.
    item_index: int
    # Dotted/bracketed path to the differing value, e.g. input[2].content[0].text
    path: str
    left_summary: str
    right_summary: str
    left_value: Any
    right_value: Any


@dataclass(frozen=True)
class OpenAIInputComparison:
    """Result of comparing two OpenAI Responses-API input payloads."""

    # Number of leading input items that compare equal.
    common_prefix_items: int
    left_item_count: int
    right_item_count: int
    # None when the two payloads are identical.
    difference: OpenAIInputDifference | None


def _extract_input_items(payload: Any) -> list[JSONValue]:
    """Accept either a raw input array or a full request dict with an 'input' key."""
    serialized = cast(JSONValue, to_serializable(payload))
    if isinstance(serialized, list):
        return serialized
    if isinstance(serialized, dict):
        serialized_dict = cast(dict[str, JSONValue], serialized)
        input_items = serialized_dict.get("input")
        if isinstance(input_items, list):
            return cast(list[JSONValue], input_items)
    raise ValueError("Expected a raw input array or a request payload with an 'input' list")


def _as_json_dict(value: JSONValue) -> dict[str, JSONValue]:
    # Type-narrowing helper for the checker; no runtime validation.
    return cast(dict[str, JSONValue], value)


def _as_json_list(value: JSONValue) -> list[JSONValue]:
    # Type-narrowing helper for the checker; no runtime validation.
    return cast(list[JSONValue], value)


def _append_dict_path(path: str, key: str) -> str:
    """Extend a dotted path with a dict key (no leading dot at the root)."""
    if not path:
        return key
    return f"{path}.{key}"


def _append_list_path(path: str, index: int) -> str:
    """Extend a path with a list index, e.g. 'content' -> 'content[0]'."""
    return f"{path}[{index}]"


def _find_first_value_difference(
    left: JSONValue,
    right: JSONValue,
    path: str = "",
) -> tuple[str, JSONValue, JSONValue] | None:
    """Depth-first search for the first differing value between two JSON trees.

    Returns (path, left_value, right_value) at the divergence point, or None
    when the trees are equal. Dict key ORDER is significant: a key-order
    mismatch is reported at the first out-of-order key.
    """
    if type(left) is not type(right):
        return path, left, right
    if isinstance(left, dict):
        left_dict = _as_json_dict(left)
        right_dict = _as_json_dict(right)
        left_keys = list(left_dict.keys())
        right_keys = list(right_dict.keys())
        # Pass 1: compare key sequences positionally (order-sensitive).
        for index in range(min(len(left_keys), len(right_keys))):
            left_key = left_keys[index]
            right_key = right_keys[index]
            if left_key != right_key:
                key_path = _append_dict_path(path, left_key)
                # Report the whole dicts: the divergence is structural here.
                return key_path, left, right
        if len(left_keys) != len(right_keys):
            # Same prefix but one side has extra keys; report the first extra.
            extra_key = (
                left_keys[len(right_keys)]
                if len(left_keys) > len(right_keys)
                else right_keys[len(left_keys)]
            )
            key_path = _append_dict_path(path, extra_key)
            left_value = left_dict.get(extra_key)
            right_value = right_dict.get(extra_key)
            return key_path, cast(JSONValue, left_value), cast(JSONValue, right_value)
        # Pass 2: keys match in order and count — recurse into the values.
        for key in left_keys:
            nested = _find_first_value_difference(
                left_dict[key],
                right_dict[key],
                _append_dict_path(path, key),
            )
            if nested is not None:
                return nested
        return None
    if isinstance(left, list):
        left_list = _as_json_list(left)
        right_list = _as_json_list(right)
        # Compare the shared prefix element-wise.
        for index in range(min(len(left_list), len(right_list))):
            nested = _find_first_value_difference(
                left_list[index],
                right_list[index],
                _append_list_path(path, index),
            )
            if nested is not None:
                return nested
        if len(left_list) != len(right_list):
            # First index past the shorter list; missing side reported as None.
            index = min(len(left_list), len(right_list))
            item_path = _append_list_path(path, index)
            left_value = left_list[index] if index < len(left_list) else None
            right_value = right_list[index] if index < len(right_list) else None
            return item_path, left_value, right_value
        return None
    # Scalars (same type): plain equality.
    if left != right:
        return path, left, right
    return None


def compare_openai_inputs(
    left_payload: Any,
    right_payload: Any,
) -> OpenAIInputComparison:
    """Compare two OpenAI request inputs and locate the first divergence.

    Walks the input arrays item by item, counting the equal prefix; the first
    unequal item is drilled into with _find_first_value_difference to produce
    a precise path. A pure length mismatch after an equal prefix is reported
    at the first index past the shorter array.
    """
    left_items = _extract_input_items(left_payload)
    right_items = _extract_input_items(right_payload)
    common_prefix_items = 0
    for index in range(min(len(left_items), len(right_items))):
        left_item = left_items[index]
        right_item = right_items[index]
        if left_item == right_item:
            common_prefix_items += 1
            continue
        nested_difference = _find_first_value_difference(left_item, right_item)
        nested_path = "" if nested_difference is None else nested_difference[0]
        path = f"input[{index}]"
        if nested_path:
            # "[...]" paths attach directly; dotted paths need a separator.
            if nested_path.startswith("["):
                path = f"{path}{nested_path}"
            else:
                path = f"{path}.{nested_path}"
        left_value = left_item if nested_difference is None else nested_difference[1]
        right_value = right_item if nested_difference is None else nested_difference[2]
        return OpenAIInputComparison(
            common_prefix_items=common_prefix_items,
            left_item_count=len(left_items),
            right_item_count=len(right_items),
            difference=OpenAIInputDifference(
                item_index=index,
                path=path,
                left_summary=summarize_responses_input_item(index, left_item),
                right_summary=summarize_responses_input_item(index, right_item),
                left_value=left_value,
                right_value=right_value,
            ),
        )
    if len(left_items) != len(right_items):
        index = min(len(left_items), len(right_items))
        left_item = left_items[index] if index < len(left_items) else None
        right_item = right_items[index] if index < len(right_items) else None
        return OpenAIInputComparison(
            common_prefix_items=common_prefix_items,
            left_item_count=len(left_items),
            right_item_count=len(right_items),
            difference=OpenAIInputDifference(
                item_index=index,
                path=f"input[{index}]",
                # NOTE(review): the placeholder after the index below appears
                # to have had its angle-bracket marker (e.g. an "absent" tag)
                # stripped by the text extraction — confirm upstream.
                left_summary=(
                    summarize_responses_input_item(index, left_item)
                    if left_item is not None
                    else f"{index:02d} "
                ),
                right_summary=(
                    summarize_responses_input_item(index, right_item)
                    if right_item is not None
                    else f"{index:02d} "
                ),
                left_value=left_item,
                right_value=right_item,
            ),
        )
    # Arrays are fully equal.
    return OpenAIInputComparison(
        common_prefix_items=common_prefix_items,
        left_item_count=len(left_items),
        right_item_count=len(right_items),
        difference=None,
    )


def format_openai_input_comparison(comparison: OpenAIInputComparison) -> str:
    """Render a comparison as a multi-line, log-friendly report."""
    lines = [
        "OpenAI input comparison",
        f"common_prefix_items={comparison.common_prefix_items}",
        f"left_item_count={comparison.left_item_count}",
        f"right_item_count={comparison.right_item_count}",
    ]
    difference = comparison.difference
    if difference is None:
        lines.append("difference=none")
        return "\n".join(lines)
    lines.extend(
        [
            f"first_different_item_index={difference.item_index}",
            f"first_different_path={difference.path}",
            f"left_summary={difference.left_summary}",
            f"right_summary={difference.right_summary}",
            "left_value="
            + json.dumps(difference.left_value, indent=2, ensure_ascii=False),
            "right_value="
            + json.dumps(
                difference.right_value,
                indent=2,
                ensure_ascii=False,
            ),
        ]
    )
    return "\n".join(lines)


def compare_openai_input_json_strings(
    left_json: str,
    right_json: str,
) -> OpenAIInputComparison:
    """Convenience wrapper: parse two JSON strings then compare them."""
    left_payload = json.loads(left_json)
    right_payload = json.loads(right_json)
    return compare_openai_inputs(left_payload, right_payload)


================================================
FILE: backend/fs_logging/openai_input_formatting.py
================================================
# pyright: reportUnknownVariableType=false
import json
from typing import Any

from agent.state import ensure_str


def truncate_for_log(value: Any, max_len: int = 120) -> str:
    """Coerce value to a single-line string, truncated to max_len with an ellipsis."""
    # Escape newlines so the result stays on one log line.
    text = ensure_str(value).replace("\n", "\\n")
    if len(text) <= max_len:
        return text
    return f"{text[:max_len]}..."
def as_dict(value: Any) -> dict[str, Any] | None:
    """Best-effort conversion of an arbitrary object to a plain dict.

    Tries, in order: the value itself if it is already a dict, then the
    duck-typed serializers `model_dump()` (Pydantic-style — TODO confirm),
    `to_dict()`, and `dict()`, and finally a filtered `__dict__` with
    private (underscore-prefixed) attributes removed. Returns None when no
    strategy yields a dict.
    """
    if isinstance(value, dict):
        return value
    model_dump = getattr(value, "model_dump", None)
    if callable(model_dump):
        dumped = model_dump()
        if isinstance(dumped, dict):
            return dumped
    to_dict = getattr(value, "to_dict", None)
    if callable(to_dict):
        dumped = to_dict()
        if isinstance(dumped, dict):
            return dumped
    dict_method = getattr(value, "dict", None)
    if callable(dict_method):
        dumped = dict_method()
        if isinstance(dumped, dict):
            return dumped
    raw_dict = getattr(value, "__dict__", None)
    if isinstance(raw_dict, dict):
        # Drop private attributes; only fall back to __dict__ if something
        # public remains.
        normalized = {k: v for k, v in raw_dict.items() if not k.startswith("_")}
        if normalized:
            return normalized
    return None


def to_serializable(value: Any) -> Any:
    """Recursively convert `value` into JSON-serializable primitives.

    Scalars pass through unchanged; dicts and lists/tuples are converted
    element-wise (dict keys are stringified via ensure_str, tuples become
    lists); other objects go through as_dict() and recurse, or fall back to
    their string form when as_dict() fails.
    """
    if value is None or isinstance(value, (bool, int, float, str)):
        return value
    if isinstance(value, dict):
        return {ensure_str(k): to_serializable(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [to_serializable(v) for v in value]
    value_as_dict = as_dict(value)
    if value_as_dict is not None:
        return to_serializable(value_as_dict)
    # Last resort: represent the object by its string form.
    return ensure_str(value)


def summarize_content_part(part: Any) -> str:
    """Render a one-line log summary of a single message content part.

    Text-like parts report character count plus a truncated preview;
    image-like parts report the detail setting and a truncated URL (the
    `image_url` field may be either a raw string or a dict with
    `url`/`detail` keys). Unknown part types fall back to listing their
    keys; non-dict-convertible parts fall back to the type name.
    """
    part_dict = as_dict(part)
    if part_dict is None:
        return f"{type(part).__name__}"
    part_type = part_dict.get("type", "unknown")
    if part_type in ("input_text", "text", "output_text", "summary_text"):
        text = ensure_str(part_dict.get("text", ""))
        return (
            f"{part_type}(chars={len(text)} "
            f"preview='{truncate_for_log(text, max_len=80)}')"
        )
    if part_type in ("input_image", "image_url"):
        image_url_value: Any = part_dict.get("image_url", "")
        detail: str | None = None
        if isinstance(image_url_value, dict):
            # Chat-completions shape: {"image_url": {"url": ..., "detail": ...}}
            detail = ensure_str(image_url_value.get("detail", ""))
            image_url_value = image_url_value.get("url", "")
        else:
            # Responses-API shape: detail lives next to the url string.
            detail = ensure_str(part_dict.get("detail", ""))
        url_text = ensure_str(image_url_value)
        detail_text = detail or "-"
        return (
            f"{part_type}(detail={detail_text} "
            f"url='{truncate_for_log(url_text, max_len=80)}')"
        )
    return f"{part_type}(keys={sorted(part_dict.keys())})"


def summarize_function_call_output_payload(output_text: str) -> str:
    """Summarize a tool-call output string for logging.

    The output is expected to be JSON; if parsing fails, or the parsed
    value is not a dict, a raw preview with size information is returned.
    For dict payloads: an `error` key short-circuits to an error summary;
    otherwise selected fields (`content`, and within `details`: `path`,
    `edits`, `contentLength`, `firstChangedLine`, `diff` stats) are folded
    into a space-separated summary. When the path is missing it is
    recovered from the first `--- ` header line of the diff, if any.
    """
    try:
        parsed = json.loads(output_text)
    except json.JSONDecodeError:
        return (
            f"output_chars={len(output_text)} "
            f"preview='{truncate_for_log(output_text)}'"
        )
    if not isinstance(parsed, dict):
        return (
            f"output_type={type(parsed).__name__} "
            f"preview='{truncate_for_log(parsed)}'"
        )
    if "error" in parsed:
        error_text = ensure_str(parsed.get("error"))
        return f"error='{truncate_for_log(error_text)}'"
    summary_parts: list[str] = []
    content_text = ensure_str(parsed.get("content"))
    if content_text:
        summary_parts.append(f"content='{truncate_for_log(content_text, max_len=80)}'")
    details = parsed.get("details")
    if isinstance(details, dict):
        path = ensure_str(details.get("path"))
        diff_text = details.get("diff")
        if (not path) and isinstance(diff_text, str) and diff_text:
            # Recover the file path from the unified-diff "--- " header.
            for line in diff_text.splitlines():
                if line.startswith("--- "):
                    path = line.removeprefix("--- ").strip()
                    break
        if path:
            summary_parts.append(f"path={path}")
        edits = details.get("edits")
        if isinstance(edits, list):
            summary_parts.append(f"edits={len(edits)}")
        content_length = details.get("contentLength")
        if isinstance(content_length, int):
            summary_parts.append(f"content_length={content_length}")
        first_changed_line = details.get("firstChangedLine")
        if isinstance(first_changed_line, int):
            summary_parts.append(f"first_changed_line={first_changed_line}")
        if isinstance(diff_text, str) and diff_text:
            diff_lines = diff_text.count("\n")
            summary_parts.append(f"diff_chars={len(diff_text)}")
            summary_parts.append(f"diff_lines={diff_lines}")
    if not summary_parts:
        # Nothing recognizable; at least show what keys were present.
        summary_parts.append(f"keys={sorted(parsed.keys())}")
    return " ".join(summary_parts)


def summarize_responses_input_item(index: int, item: Any) -> str:
    """Produce a compact one-line summary of one Responses-API input item.

    `index` is zero-padded into the prefix of every summary line. Items
    are dispatched on shape: role-bearing messages (string or list
    content), `function_call`/`custom_tool_call` (arguments live under
    `arguments` resp. `input`), `function_call_output` (delegates to
    summarize_function_call_output_payload), `message`, and `reasoning`
    (summary parts). Anything else reports its type and keys; items that
    cannot be converted to a dict report only their Python type name.
    """
    item_dict = as_dict(item)
    if item_dict is None:
        return f"{index:02d} item_type={type(item).__name__}"
    if "role" in item_dict:
        role = ensure_str(item_dict.get("role", "unknown"))
        content = item_dict.get("content", "")
        if isinstance(content, str):
            return (
                f"{index:02d} role={role} content=str chars={len(content)} "
                f"preview='{truncate_for_log(content)}'"
            )
        if isinstance(content, list):
            part_summaries = [summarize_content_part(part) for part in content]
            return (
                f"{index:02d} role={role} content_parts={len(content)} "
                f"[{'; '.join(part_summaries)}]"
            )
        return f"{index:02d} role={role} content_type={type(content).__name__}"
    item_type = ensure_str(item_dict.get("type", "unknown"))
    if item_type in ("function_call", "custom_tool_call"):
        # custom_tool_call stores its arguments under "input"; function_call
        # uses "arguments".
        raw_args = (
            item_dict.get("input")
            if item_type == "custom_tool_call"
            else item_dict.get("arguments")
        )
        args_text = ensure_str(raw_args or "")
        call_id = item_dict.get("call_id") or item_dict.get("id")
        return (
            f"{index:02d} type={item_type} name={item_dict.get('name')} "
            f"call_id={call_id} args_chars={len(args_text)} "
            f"preview='{truncate_for_log(args_text)}'"
        )
    if item_type == "function_call_output":
        output_text = ensure_str(item_dict.get("output", ""))
        return (
            f"{index:02d} type=function_call_output call_id={item_dict.get('call_id')} "
            f"{summarize_function_call_output_payload(output_text)}"
        )
    if item_type == "message":
        role = ensure_str(item_dict.get("role", "unknown"))
        content = item_dict.get("content", [])
        if isinstance(content, list):
            part_summaries = [summarize_content_part(part) for part in content]
            return (
                f"{index:02d} type=message role={role} parts={len(content)} "
                f"[{'; '.join(part_summaries)}]"
            )
        return (
            f"{index:02d} type=message role={role} "
            f"content_type={type(content).__name__}"
        )
    if item_type == "reasoning":
        summary = item_dict.get("summary")
        if isinstance(summary, list):
            summary_parts = [summarize_content_part(part) for part in summary]
            return (
                f"{index:02d} type=reasoning summary_parts={len(summary)} "
                f"[{'; '.join(summary_parts)}]"
            )
        return f"{index:02d} type=reasoning summary_type={type(summary).__name__}"
    return f"{index:02d} type={item_type} keys={sorted(item_dict.keys())}"
================================================ FILE: backend/fs_logging/openai_turn_inputs.py ================================================ # pyright: reportUnknownVariableType=false import json import os import uuid from dataclasses import dataclass, field from datetime import datetime from html import escape from typing import Any, Sequence from agent.providers.pricing import MODEL_PRICING from agent.providers.token_usage import TokenUsage from agent.state import ensure_str from fs_logging.openai_input_formatting import ( summarize_responses_input_item, to_serializable, ) from llm import Llm, get_openai_api_name def _render_json_scalar(value: Any) -> str: if value is None: return "null" if isinstance(value, bool): return f"{str(value).lower()}" if isinstance(value, (int, float)): return f"{escape(ensure_str(value))}" text = ensure_str(value) if "\n" not in text and len(text) <= 160: return f"{escape(text)}" return ( "
" f"string ({len(text)} chars)" f"
{escape(text)}
" "
" ) def _render_json_node(value: Any, label: str | None = None) -> str: label_html = "" if label is not None: label_html = f"{escape(label)}: " if isinstance(value, dict): parts = [ "
", ( f"{label_html}" f"object ({len(value)} keys)" ), "
", ] for child_key, child_value in value.items(): parts.append(_render_json_node(child_value, ensure_str(child_key))) parts.append("
") parts.append("
") return "".join(parts) if isinstance(value, list): parts = [ "
", ( f"{label_html}" f"array ({len(value)} items)" ), "
", ] for index, child_value in enumerate(value): parts.append(_render_json_node(child_value, f"[{index}]")) parts.append("
") parts.append("
") return "".join(parts) return ( "
" f"{label_html}{_render_json_scalar(value)}" "
" ) def _render_copy_controls(copy_target_id: str, button_label: str) -> str: return ( "
" f"" "" "
" ) def _log_openai_turn_input(model: Llm, turn_index: int, input_items: Sequence[Any]) -> None: model_name = get_openai_api_name(model) print( f"[OPENAI TURN INPUT] model={model_name} " f"turn={turn_index} items={len(input_items)}" ) for index, item in enumerate(input_items): print( f"[OPENAI TURN INPUT] " f"{summarize_responses_input_item(index, item)}" ) def _is_openai_turn_input_console_enabled() -> bool: value = os.environ.get("OPENAI_TURN_INPUT_CONSOLE", "") return value.strip().lower() in {"1", "true", "yes", "on"} @dataclass class OpenAITurnInputItem: index: int summary: str payload: Any @dataclass class OpenAITurnUsageSummary: input_tokens: int output_tokens: int cache_read: int cache_write: int total_tokens: int cache_hit_rate_percent: float cost_usd: float | None @dataclass class OpenAITurnInputReport: turn_index: int items: list[OpenAITurnInputItem] request_payload: Any | None = None usage: OpenAITurnUsageSummary | None = None @dataclass class OpenAITurnInputLogger: model: Llm enabled: bool = False report_id: str = field(default_factory=lambda: uuid.uuid4().hex) _turn_index: int = 0 _turns: list[OpenAITurnInputReport] = field(default_factory=list) def record_turn_input( self, input_items: Sequence[Any], request_payload: Any | None = None, ) -> None: if not self.enabled: return self._turn_index += 1 if _is_openai_turn_input_console_enabled(): _log_openai_turn_input(self.model, self._turn_index, input_items) turn_items = [ OpenAITurnInputItem( index=index, summary=summarize_responses_input_item(index, item), payload=to_serializable(item), ) for index, item in enumerate(input_items) ] self._turns.append( OpenAITurnInputReport( turn_index=self._turn_index, items=turn_items, request_payload=to_serializable(request_payload), ) ) def record_turn_usage(self, usage: TokenUsage) -> None: if not self.enabled or not self._turns: return pricing = MODEL_PRICING.get(get_openai_api_name(self.model)) cost_usd = usage.cost(pricing) if pricing else None 
self._turns[-1].usage = OpenAITurnUsageSummary( input_tokens=usage.input, output_tokens=usage.output, cache_read=usage.cache_read, cache_write=usage.cache_write, total_tokens=usage.total, cache_hit_rate_percent=usage.cache_hit_rate_percent(), cost_usd=cost_usd, ) def write_html_report(self) -> str | None: if not self.enabled: return None try: logs_path = os.environ.get("LOGS_PATH", os.getcwd()) logs_directory = os.path.join(logs_path, "run_logs") os.makedirs(logs_directory, exist_ok=True) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") model_name = get_openai_api_name(self.model).replace("/", "_") filename = ( f"openai_turn_inputs_{model_name}_{timestamp}_{self.report_id[:8]}.html" ) filepath = os.path.join(logs_directory, filename) with open(filepath, "w", encoding="utf-8") as f: f.write(self._render_html_report()) return filepath except Exception as e: print(f"[OPENAI TURN INPUT] Failed to write HTML report: {e}") return None def _render_html_report(self) -> str: model_name = get_openai_api_name(self.model) html_parts = [ "", "", "", " ", " ", " OpenAI Turn Input Report", " ", "", "", "

OpenAI Turn Input Report

", ( "
" f"report_id={escape(self.report_id)} | " f"model={escape(model_name)} | turns={len(self._turns)}" "
" ), ] for turn in self._turns: html_parts.append("
") html_parts.append( f"

Turn {turn.turn_index} (items={len(turn.items)})

" ) if turn.request_payload is not None: request_payload_json = json.dumps( turn.request_payload, indent=2, ensure_ascii=False, ) request_input_json: str | None = None if isinstance(turn.request_payload, dict) and "input" in turn.request_payload: request_input_json = json.dumps( turn.request_payload["input"], indent=2, ensure_ascii=False, ) html_parts.append("
") html_parts.append(" Request payload") if request_input_json is not None: request_input_id = f"request-input-turn-{turn.turn_index}" html_parts.append( " " + _render_copy_controls(request_input_id, "Copy input JSON") ) html_parts.append( f"
"
                        f"{escape(request_input_json)}
" ) html_parts.append("
") html_parts.append(_render_json_node(turn.request_payload, "root")) html_parts.append("
") html_parts.append("
") html_parts.append(" Raw JSON payload") html_parts.append( f"
{escape(request_payload_json)}
" ) html_parts.append("
") html_parts.append("
") if turn.usage is not None: cost_text = "n/a" if isinstance(turn.usage.cost_usd, (float, int)): cost_text = f"${turn.usage.cost_usd:.4f}" html_parts.append(" ") html_parts.append( " " ) html_parts.append(" ") html_parts.append( " " f"" ) html_parts.append( " " f"" ) html_parts.append( " " f"" ) html_parts.append( " " f"" ) html_parts.append( " " f"" ) html_parts.append( " " f"" ) html_parts.append( f" " ) html_parts.append(" ") html_parts.append("
MetricValue
Input tokens{turn.usage.input_tokens}
Output tokens{turn.usage.output_tokens}
Cache read{turn.usage.cache_read}
Cache write{turn.usage.cache_write}
Total tokens{turn.usage.total_tokens}
Cache hit rate{turn.usage.cache_hit_rate_percent:.2f}%
Cost{escape(cost_text)}
") else: html_parts.append( "
Usage unavailable for this turn.
" ) html_parts.append(" ") html_parts.append( " " ) html_parts.append(" ") for item in turn.items: html_parts.append( " " f"" ) html_parts.append(" ") html_parts.append("
IndexSummary
{item.index:02d}{escape(item.summary)}
") for item in turn.items: payload_json = json.dumps( item.payload, indent=2, ensure_ascii=False, ) html_parts.append("
") html_parts.append( f" Item {item.index:02d} payload" ) html_parts.append("
") html_parts.append(_render_json_node(item.payload, "root")) html_parts.append("
") html_parts.append("
") html_parts.append(" Raw JSON payload") html_parts.append(f"
{escape(payload_json)}
") html_parts.append("
") html_parts.append("
") html_parts.append("
") html_parts.extend( [ " ", "", "", ] ) return "\n".join(html_parts) ================================================ FILE: backend/image_generation/__init__.py ================================================ ================================================ FILE: backend/image_generation/core.py ================================================ from image_generation.generation import ( generate_image_dalle, generate_image_replicate, process_tasks, ) __all__ = [ "process_tasks", "generate_image_dalle", "generate_image_replicate", ] ================================================ FILE: backend/image_generation/generation.py ================================================ import asyncio import time from typing import List, Literal, Union from openai import AsyncOpenAI from image_generation.replicate import call_replicate REPLICATE_BATCH_SIZE = 20 async def process_tasks( prompts: List[str], api_key: str, base_url: str | None, model: Literal["dalle3", "flux"], ) -> List[Union[str, None]]: start_time = time.time() results: list[str | BaseException | None] if model == "dalle3": tasks = [generate_image_dalle(prompt, api_key, base_url) for prompt in prompts] results = await asyncio.gather(*tasks, return_exceptions=True) else: results = [] for i in range(0, len(prompts), REPLICATE_BATCH_SIZE): batch = prompts[i : i + REPLICATE_BATCH_SIZE] tasks = [generate_image_replicate(p, api_key) for p in batch] results.extend(await asyncio.gather(*tasks, return_exceptions=True)) end_time = time.time() generation_time = end_time - start_time print(f"Image generation time: {generation_time:.2f} seconds") processed_results: List[Union[str, None]] = [] for result in results: if isinstance(result, BaseException): print(f"An exception occurred: {result}") processed_results.append(None) else: processed_results.append(result) return processed_results async def generate_image_dalle( prompt: str, api_key: str, base_url: str | None ) -> Union[str, None]: client = AsyncOpenAI(api_key=api_key, 
base_url=base_url) res = await client.images.generate( model="dall-e-3", quality="standard", style="natural", n=1, size="1024x1024", prompt=prompt, ) await client.close() if not res.data: return None return res.data[0].url async def generate_image_replicate(prompt: str, api_key: str) -> str: # We use Flux 2 Klein return await call_replicate( { "prompt": prompt, "aspect_ratio": "1:1", "output_format": "png", }, api_key, ) ================================================ FILE: backend/image_generation/replicate.py ================================================ import asyncio import httpx from typing import Any, Mapping, cast REPLICATE_API_BASE_URL = "https://api.replicate.com/v1" FLUX_MODEL_PATH = "black-forest-labs/flux-2-klein-4b" REMOVE_BACKGROUND_VERSION = ( "a029dff38972b5fda4ec5d75d7d1cd25aeff621d2cf4946a41055d7db66b80bc" ) POLL_INTERVAL_SECONDS = 0.1 MAX_POLLS = 100 def _build_headers(api_token: str) -> dict[str, str]: return { "Authorization": f"Bearer {api_token}", "Content-Type": "application/json", } def _extract_prediction_id(response_json: Mapping[str, Any]) -> str: prediction_id = response_json.get("id") if not isinstance(prediction_id, str) or not prediction_id: raise ValueError("Prediction ID not found in initial response.") return prediction_id async def _poll_prediction( client: httpx.AsyncClient, prediction_id: str, headers: dict[str, str] ) -> dict[str, Any]: status_check_url = f"{REPLICATE_API_BASE_URL}/predictions/{prediction_id}" for _ in range(MAX_POLLS): await asyncio.sleep(POLL_INTERVAL_SECONDS) status_response = await client.get(status_check_url, headers=headers) status_response.raise_for_status() status_response_raw: Any = status_response.json() if not isinstance(status_response_raw, dict): raise ValueError("Invalid prediction status response.") status_response_json = cast(dict[str, Any], status_response_raw) status = status_response_json.get("status") if status == "succeeded": return cast(dict[str, Any], status_response_json) if status 
== "error": error_message = str(status_response_json.get("error", "Unknown error")) raise ValueError(f"Inference errored out: {error_message}") if status == "failed": raise ValueError("Inference failed") raise TimeoutError("Inference timed out") async def _run_prediction( endpoint_url: str, payload: dict[str, Any], api_token: str ) -> Any: headers = _build_headers(api_token) async with httpx.AsyncClient() as client: try: response = await client.post(endpoint_url, headers=headers, json=payload) response.raise_for_status() response_json = response.json() if not isinstance(response_json, dict): raise ValueError("Invalid prediction creation response.") prediction_id = _extract_prediction_id(response_json) final_response = await _poll_prediction(client, prediction_id, headers) return final_response.get("output") except httpx.HTTPStatusError as exc: raise ValueError(f"HTTP error occurred: {exc}") from exc except httpx.RequestError as exc: raise ValueError(f"An error occurred while requesting: {exc}") from exc except asyncio.TimeoutError as exc: raise TimeoutError("Request timed out") from exc except (TimeoutError, ValueError): raise except Exception as exc: raise ValueError(f"An unexpected error occurred: {exc}") from exc def _extract_output_url(result: Any, context: str) -> str: if isinstance(result, str): return result if isinstance(result, dict): url = cast(Any, result.get("url")) if isinstance(url, str) and url: return url if isinstance(result, list) and len(result) > 0: first: Any = result[0] if isinstance(first, str) and first: return first if isinstance(first, Mapping): url = cast(Any, first.get("url")) if isinstance(url, str) and url: return url raise ValueError(f"Unexpected response from {context}: {result}") async def call_replicate_model( model_path: str, input: dict[str, Any], api_token: str ) -> Any: return await _run_prediction( f"{REPLICATE_API_BASE_URL}/models/{model_path}/predictions", {"input": input}, api_token, ) async def call_replicate_version( 
version: str, input: dict[str, Any], api_token: str ) -> Any: return await _run_prediction( f"{REPLICATE_API_BASE_URL}/predictions", {"version": version, "input": input}, api_token, ) async def remove_background(image_url: str, api_token: str) -> str: result = await call_replicate_version( REMOVE_BACKGROUND_VERSION, { "image": image_url, "format": "png", "reverse": False, "threshold": 0, "background_type": "rgba", }, api_token, ) return _extract_output_url(result, "background remover") async def call_replicate(input: dict[str, str | int], api_token: str) -> str: result = await call_replicate_model(FLUX_MODEL_PATH, input, api_token) return _extract_output_url(result, "Flux prediction") ================================================ FILE: backend/llm.py ================================================ from enum import Enum from typing import TypedDict # Actual model versions that are passed to the LLMs and stored in our logs class Llm(Enum): # GPT GPT_4_1_2025_04_14 = "gpt-4.1-2025-04-14" GPT_5_2_CODEX_LOW = "gpt-5.2-codex (low thinking)" GPT_5_2_CODEX_MEDIUM = "gpt-5.2-codex (medium thinking)" GPT_5_2_CODEX_HIGH = "gpt-5.2-codex (high thinking)" GPT_5_2_CODEX_XHIGH = "gpt-5.2-codex (xhigh thinking)" GPT_5_3_CODEX_LOW = "gpt-5.3-codex (low thinking)" GPT_5_3_CODEX_MEDIUM = "gpt-5.3-codex (medium thinking)" GPT_5_3_CODEX_HIGH = "gpt-5.3-codex (high thinking)" GPT_5_3_CODEX_XHIGH = "gpt-5.3-codex (xhigh thinking)" GPT_5_4_2026_03_05_NONE = "gpt-5.4-2026-03-05 (no thinking)" GPT_5_4_2026_03_05_LOW = "gpt-5.4-2026-03-05 (low thinking)" GPT_5_4_2026_03_05_MEDIUM = "gpt-5.4-2026-03-05 (medium thinking)" GPT_5_4_2026_03_05_HIGH = "gpt-5.4-2026-03-05 (high thinking)" GPT_5_4_2026_03_05_XHIGH = "gpt-5.4-2026-03-05 (xhigh thinking)" # Claude CLAUDE_SONNET_4_6 = "claude-sonnet-4-6" CLAUDE_4_5_SONNET_2025_09_29 = "claude-sonnet-4-5-20250929" CLAUDE_4_5_OPUS_2025_11_01 = "claude-opus-4-5-20251101" CLAUDE_OPUS_4_6 = "claude-opus-4-6" # Gemini GEMINI_3_FLASH_PREVIEW_HIGH = 
"gemini-3-flash-preview (high thinking)" GEMINI_3_FLASH_PREVIEW_MINIMAL = "gemini-3-flash-preview (minimal thinking)" GEMINI_3_1_PRO_PREVIEW_HIGH = "gemini-3.1-pro-preview (high thinking)" GEMINI_3_1_PRO_PREVIEW_MEDIUM = "gemini-3.1-pro-preview (medium thinking)" GEMINI_3_1_PRO_PREVIEW_LOW = "gemini-3.1-pro-preview (low thinking)" class Completion(TypedDict): duration: float code: str # Explicitly map each model to the provider backing it. This keeps provider # groupings authoritative and avoids relying on name conventions when checking # models elsewhere in the codebase. MODEL_PROVIDER: dict[Llm, str] = { # OpenAI models Llm.GPT_4_1_2025_04_14: "openai", Llm.GPT_5_2_CODEX_LOW: "openai", Llm.GPT_5_2_CODEX_MEDIUM: "openai", Llm.GPT_5_2_CODEX_HIGH: "openai", Llm.GPT_5_2_CODEX_XHIGH: "openai", Llm.GPT_5_3_CODEX_LOW: "openai", Llm.GPT_5_3_CODEX_MEDIUM: "openai", Llm.GPT_5_3_CODEX_HIGH: "openai", Llm.GPT_5_3_CODEX_XHIGH: "openai", Llm.GPT_5_4_2026_03_05_NONE: "openai", Llm.GPT_5_4_2026_03_05_LOW: "openai", Llm.GPT_5_4_2026_03_05_MEDIUM: "openai", Llm.GPT_5_4_2026_03_05_HIGH: "openai", Llm.GPT_5_4_2026_03_05_XHIGH: "openai", # Anthropic models Llm.CLAUDE_SONNET_4_6: "anthropic", Llm.CLAUDE_4_5_SONNET_2025_09_29: "anthropic", Llm.CLAUDE_4_5_OPUS_2025_11_01: "anthropic", Llm.CLAUDE_OPUS_4_6: "anthropic", # Gemini models Llm.GEMINI_3_FLASH_PREVIEW_HIGH: "gemini", Llm.GEMINI_3_FLASH_PREVIEW_MINIMAL: "gemini", Llm.GEMINI_3_1_PRO_PREVIEW_HIGH: "gemini", Llm.GEMINI_3_1_PRO_PREVIEW_MEDIUM: "gemini", Llm.GEMINI_3_1_PRO_PREVIEW_LOW: "gemini", } # Convenience sets for membership checks OPENAI_MODELS = {m for m, p in MODEL_PROVIDER.items() if p == "openai"} ANTHROPIC_MODELS = {m for m, p in MODEL_PROVIDER.items() if p == "anthropic"} GEMINI_MODELS = {m for m, p in MODEL_PROVIDER.items() if p == "gemini"} OPENAI_MODEL_CONFIG: dict[Llm, dict[str, str]] = { Llm.GPT_4_1_2025_04_14: {"api_name": "gpt-4.1-2025-04-14"}, Llm.GPT_5_2_CODEX_LOW: {"api_name": "gpt-5.2-codex", 
"reasoning_effort": "low"}, Llm.GPT_5_2_CODEX_MEDIUM: {"api_name": "gpt-5.2-codex", "reasoning_effort": "medium"}, Llm.GPT_5_2_CODEX_HIGH: {"api_name": "gpt-5.2-codex", "reasoning_effort": "high"}, Llm.GPT_5_2_CODEX_XHIGH: {"api_name": "gpt-5.2-codex", "reasoning_effort": "xhigh"}, Llm.GPT_5_3_CODEX_LOW: {"api_name": "gpt-5.3-codex", "reasoning_effort": "low"}, Llm.GPT_5_3_CODEX_MEDIUM: {"api_name": "gpt-5.3-codex", "reasoning_effort": "medium"}, Llm.GPT_5_3_CODEX_HIGH: {"api_name": "gpt-5.3-codex", "reasoning_effort": "high"}, Llm.GPT_5_3_CODEX_XHIGH: {"api_name": "gpt-5.3-codex", "reasoning_effort": "xhigh"}, Llm.GPT_5_4_2026_03_05_NONE: { "api_name": "gpt-5.4-2026-03-05", "reasoning_effort": "none", }, Llm.GPT_5_4_2026_03_05_LOW: { "api_name": "gpt-5.4-2026-03-05", "reasoning_effort": "low", }, Llm.GPT_5_4_2026_03_05_MEDIUM: { "api_name": "gpt-5.4-2026-03-05", "reasoning_effort": "medium", }, Llm.GPT_5_4_2026_03_05_HIGH: { "api_name": "gpt-5.4-2026-03-05", "reasoning_effort": "high", }, Llm.GPT_5_4_2026_03_05_XHIGH: { "api_name": "gpt-5.4-2026-03-05", "reasoning_effort": "xhigh", }, } def get_openai_api_name(model: Llm) -> str: return OPENAI_MODEL_CONFIG[model]["api_name"] def get_openai_reasoning_effort(model: Llm) -> str | None: return OPENAI_MODEL_CONFIG.get(model, {}).get("reasoning_effort") ================================================ FILE: backend/main.py ================================================ # Load environment variables first from dotenv import load_dotenv load_dotenv() from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from config import IS_DEBUG_ENABLED from routes import screenshot, generate_code, home, evals app = FastAPI(openapi_url=None, docs_url=None, redoc_url=None) @app.on_event("startup") async def log_debug_mode() -> None: debug_status = "ENABLED" if IS_DEBUG_ENABLED else "DISABLED" print(f"Backend startup complete. 
Debug mode is {debug_status}.") # Configure CORS settings app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) # Add routes app.include_router(generate_code.router) app.include_router(screenshot.router) app.include_router(home.router) app.include_router(evals.router) ================================================ FILE: backend/prompts/__init__.py ================================================ from prompts.system_prompt import SYSTEM_PROMPT __all__ = [ "SYSTEM_PROMPT", ] ================================================ FILE: backend/prompts/create/__init__.py ================================================ from custom_types import InputMode from prompts.create.image import build_image_prompt_messages from prompts.create.text import build_text_prompt_messages from prompts.create.video import build_video_prompt_messages from prompts.prompt_types import Stack, UserTurnInput from prompts.message_builder import Prompt def build_create_prompt_from_input( input_mode: InputMode, stack: Stack, prompt: UserTurnInput, image_generation_enabled: bool, ) -> Prompt: if input_mode == "image": image_urls = prompt.get("images", []) text_prompt = prompt.get("text", "") return build_image_prompt_messages( image_data_urls=image_urls, stack=stack, text_prompt=text_prompt, image_generation_enabled=image_generation_enabled, ) if input_mode == "text": return build_text_prompt_messages( text_prompt=prompt["text"], stack=stack, image_generation_enabled=image_generation_enabled, ) if input_mode == "video": video_urls = prompt.get("videos", []) if not video_urls: raise ValueError("Video mode requires a video to be provided") video_url = video_urls[0] return build_video_prompt_messages( video_data_url=video_url, stack=stack, text_prompt=prompt.get("text", ""), image_generation_enabled=image_generation_enabled, ) raise ValueError(f"Unsupported input mode: {input_mode}") __all__ = ["build_create_prompt_from_input"] 
================================================ FILE: backend/prompts/create/image.py ================================================ from openai.types.chat import ChatCompletionContentPartParam, ChatCompletionMessageParam from prompts.prompt_types import Stack from prompts import system_prompt from prompts.policies import build_selected_stack_policy, build_user_image_policy def build_image_prompt_messages( image_data_urls: list[str], stack: Stack, text_prompt: str, image_generation_enabled: bool, ) -> list[ChatCompletionMessageParam]: image_policy = build_user_image_policy(image_generation_enabled) selected_stack = build_selected_stack_policy(stack) user_prompt = f""" Generate code for a web page that looks exactly like the provided screenshot(s). {selected_stack} ## Replication instructions - Make sure the app looks exactly like the screenshot. - Use the exact text from the screenshot. - {image_policy} ## Multiple screenshots If multiple screenshots are provided, organize them meaningfully: - If they appear to be different pages in a website, make them distinct pages and link them. - If they look like different tabs or views in an app, connect them with appropriate navigation. - If they appear unrelated, create a scaffold that separates them into "Screenshot 1", "Screenshot 2", "Screenshot 3", etc. so it is easy to navigate. - For mobile screenshots, do not include the device frame or browser chrome; focus only on the actual UI mockups. 
""" # Add additional instructions provided by the user if text_prompt.strip(): user_prompt = f"{user_prompt}\n\nAdditional instructions: {text_prompt}" user_content: list[ChatCompletionContentPartParam] = [] for image_data_url in image_data_urls: user_content.append( { "type": "image_url", "image_url": {"url": image_data_url, "detail": "high"}, } ) user_content.append( { "type": "text", "text": user_prompt, } ) return [ { "role": "system", "content": system_prompt.SYSTEM_PROMPT, }, { "role": "user", "content": user_content, }, ] ================================================ FILE: backend/prompts/create/text.py ================================================ from openai.types.chat import ChatCompletionMessageParam from prompts.prompt_types import Stack from prompts import system_prompt from prompts.policies import build_selected_stack_policy, build_user_image_policy def build_text_prompt_messages( text_prompt: str, stack: Stack, image_generation_enabled: bool, ) -> list[ChatCompletionMessageParam]: image_policy = build_user_image_policy(image_generation_enabled) selected_stack = build_selected_stack_policy(stack) USER_PROMPT = f""" Generate UI for {text_prompt}. {selected_stack} # Instructions - Make sure to make it look modern and sleek. - Use modern, professional fonts and colors. - Follow UX best practices. 
- {image_policy}""" return [ { "role": "system", "content": system_prompt.SYSTEM_PROMPT, }, { "role": "user", "content": USER_PROMPT, }, ] ================================================ FILE: backend/prompts/create/video.py ================================================ from openai.types.chat import ChatCompletionContentPartParam, ChatCompletionMessageParam from prompts.prompt_types import Stack from prompts import system_prompt from prompts.policies import build_selected_stack_policy, build_user_image_policy def build_video_prompt_messages( video_data_url: str, stack: Stack, text_prompt: str, image_generation_enabled: bool, ) -> list[ChatCompletionMessageParam]: image_policy = build_user_image_policy(image_generation_enabled) selected_stack = build_selected_stack_policy(stack) user_text = f""" You have been given a video of a user interacting with a web app. You need to re-create the same app exactly such that the same user interactions will produce the same results in the app you build. - Watch the entire video carefully and understand all the user interactions and UI state changes. - Make sure the app looks exactly like what you see in the video. - Pay close attention to background color, text color, font size, font family, padding, margin, border, etc. Match the colors and sizes exactly. - {image_policy} - If some functionality requires a backend call, just mock the data instead. - MAKE THE APP FUNCTIONAL using JavaScript. Allow the user to interact with the app and get the same behavior as shown in the video. - Use SVGs and interactive 3D elements if needed to match the functionality shown in the video. Analyze this video and generate the code. 
{selected_stack} """ if text_prompt.strip(): user_text = user_text + "\n\nAdditional instructions: " + text_prompt user_content: list[ChatCompletionContentPartParam] = [ { "type": "image_url", "image_url": {"url": video_data_url, "detail": "high"}, }, { "type": "text", "text": user_text, }, ] return [ { "role": "system", "content": system_prompt.SYSTEM_PROMPT, }, { "role": "user", "content": user_content, }, ] ================================================ FILE: backend/prompts/message_builder.py ================================================ from typing import cast from openai.types.chat import ChatCompletionContentPartParam, ChatCompletionMessageParam from prompts.prompt_types import PromptHistoryMessage Prompt = list[ChatCompletionMessageParam] def _wrap_assistant_file_content(content: str, path: str = "index.html") -> str: stripped = content.strip() if stripped.startswith(""): return stripped return f'\n{stripped}\n' def build_history_message(item: PromptHistoryMessage) -> ChatCompletionMessageParam: role = item["role"] image_urls = item.get("images", []) video_urls = item.get("videos", []) media_urls = [*image_urls, *video_urls] if role == "user" and len(media_urls) > 0: user_content: list[ChatCompletionContentPartParam] = [] for media_url in media_urls: user_content.append( { "type": "image_url", "image_url": {"url": media_url, "detail": "high"}, } ) user_content.append( { "type": "text", "text": item.get("text", ""), } ) return cast( ChatCompletionMessageParam, { "role": role, "content": user_content, }, ) return cast( ChatCompletionMessageParam, { "role": role, "content": ( _wrap_assistant_file_content(item.get("text", "")) if role == "assistant" else item.get("text", "") ), }, ) ================================================ FILE: backend/prompts/pipeline.py ================================================ from custom_types import InputMode from prompts.create import build_create_prompt_from_input from prompts.plan import 
derive_prompt_construction_plan
from prompts.prompt_types import PromptHistoryMessage, Stack, UserTurnInput
from prompts.message_builder import Prompt
from prompts.update import (
    build_update_prompt_from_file_snapshot,
    build_update_prompt_from_history,
)


async def build_prompt_messages(
    stack: Stack,
    input_mode: InputMode,
    generation_type: str,
    prompt: UserTurnInput,
    history: list[PromptHistoryMessage],
    file_state: dict[str, str] | None = None,
    image_generation_enabled: bool = True,
) -> Prompt:
    """Build the chat messages for a generation request.

    Derives a single construction strategy up front so each branch below maps
    1:1 onto one prompt-builder module.
    """
    plan = derive_prompt_construction_plan(
        stack=stack,
        input_mode=input_mode,
        generation_type=generation_type,
        history=history,
        file_state=file_state,
    )
    strategy = plan["construction_strategy"]
    if strategy == "update_from_history":
        return build_update_prompt_from_history(
            stack=stack,
            history=history,
            image_generation_enabled=image_generation_enabled,
        )
    if strategy == "update_from_file_snapshot":
        # The planner only selects this strategy when file_state has content.
        assert file_state is not None
        return build_update_prompt_from_file_snapshot(
            stack=stack,
            prompt=prompt,
            file_state=file_state,
            image_generation_enabled=image_generation_enabled,
        )
    return build_create_prompt_from_input(
        input_mode,
        stack,
        prompt,
        image_generation_enabled,
    )


================================================
FILE: backend/prompts/plan.py
================================================
from custom_types import InputMode
from prompts.prompt_types import (
    PromptConstructionPlan,
    PromptHistoryMessage,
    Stack,
)


def derive_prompt_construction_plan(
    stack: Stack,
    input_mode: InputMode,
    generation_type: str,
    history: list[PromptHistoryMessage],
    file_state: dict[str, str] | None,
) -> PromptConstructionPlan:
    """Choose exactly one prompt-construction strategy for this request.

    Updates prefer replaying history; otherwise they fall back to the file
    snapshot. An update with neither is a caller error.
    """
    if generation_type == "update":
        if len(history) > 0:
            strategy = "update_from_history"
        elif file_state and file_state.get("content", "").strip():
            strategy = "update_from_file_snapshot"
        else:
            raise ValueError("Update requests require history or fileState.content")
        return {
            "generation_type": "update",
            "input_mode": input_mode,
            "stack": stack,
            "construction_strategy": strategy,
        }
    return {
        "generation_type": "create",
        "input_mode": input_mode,
        "stack": stack,
        "construction_strategy": "create_from_input",
    }


================================================
FILE: backend/prompts/policies.py
================================================
from prompts.prompt_types import Stack


def build_selected_stack_policy(stack: Stack) -> str:
    # One-line policy sentence injected into the first user turn.
    return f"Selected stack: {stack}."


def build_user_image_policy(image_generation_enabled: bool) -> str:
    # Tells the model whether it may call the generate_images tool.
    if image_generation_enabled:
        return (
            "Image generation is enabled for this request. Use generate_images for "
            "missing assets when needed."
        )
    return (
        "Image generation is disabled for this request. Do not call generate_images. "
        "Use provided media, CSS effects, or placeholder URLs (https://placehold.co)."
    )


================================================
FILE: backend/prompts/prompt_types.py
================================================
from typing import List, Literal, TypedDict


class UserTurnInput(TypedDict):
    """Normalized current user turn payload from the request."""

    text: str
    images: List[str]
    videos: List[str]


class PromptHistoryMessage(TypedDict):
    """Explicit role-based message structure for edit history."""

    role: Literal["user", "assistant"]
    text: str
    images: List[str]
    videos: List[str]


# The single construction path chosen by the planner.
PromptConstructionStrategy = Literal[
    "create_from_input",
    "update_from_history",
    "update_from_file_snapshot",
]

Stack = Literal[
    "html_css",
    "html_tailwind",
    "react_tailwind",
    "bootstrap",
    "ionic_tailwind",
    "vue_tailwind",
]


class PromptConstructionPlan(TypedDict):
    """Derived plan used by prompt builders to choose a single construction path."""

    generation_type: Literal["create", "update"]
    input_mode: Literal["image", "video", "text"]
    stack: Stack
    construction_strategy: PromptConstructionStrategy


================================================
FILE: backend/prompts/request_parsing.py
================================================
from typing import List, cast

from prompts.prompt_types import
PromptHistoryMessage, UserTurnInput


def _to_string_list(value: object) -> List[str]:
    # Defensively keep only string entries; any non-list input becomes [].
    if not isinstance(value, list):
        return []
    raw_list = cast(List[object], value)
    return [item for item in raw_list if isinstance(item, str)]


def parse_prompt_content(raw_prompt: object) -> UserTurnInput:
    """Normalize an untrusted request payload into a UserTurnInput.

    Anything malformed degrades to empty fields rather than raising.
    """
    if not isinstance(raw_prompt, dict):
        return {"text": "", "images": [], "videos": []}
    prompt_dict = cast(dict[str, object], raw_prompt)
    text = prompt_dict.get("text")
    return {
        "text": text if isinstance(text, str) else "",
        "images": _to_string_list(prompt_dict.get("images")),
        "videos": _to_string_list(prompt_dict.get("videos")),
    }


def parse_prompt_history(raw_history: object) -> List[PromptHistoryMessage]:
    """Normalize untrusted history items, silently dropping malformed entries.

    Entries must be dicts with a role of "user" or "assistant"; other fields
    degrade to empty values.
    """
    if not isinstance(raw_history, list):
        return []
    history: List[PromptHistoryMessage] = []
    raw_items = cast(List[object], raw_history)
    for item in raw_items:
        if not isinstance(item, dict):
            continue
        item_dict = cast(dict[str, object], item)
        role_value = item_dict.get("role")
        if not isinstance(role_value, str) or role_value not in ("user", "assistant"):
            continue
        text = item_dict.get("text")
        history.append(
            {
                "role": role_value,
                "text": text if isinstance(text, str) else "",
                "images": _to_string_list(item_dict.get("images")),
                "videos": _to_string_list(item_dict.get("videos")),
            }
        )
    return history


================================================
FILE: backend/prompts/system_prompt.py
================================================
SYSTEM_PROMPT = """
You are a coding agent that's an expert at building front-ends.

# Tone and style
- Be extremely concise in your chat responses.
- Do not include code snippets in your messages. Use the file creation and editing tools for all code.
- At the end of the task, respond with a one or two sentence summary of what was built.
- Always respond to the user in the language that they used. Our system prompts and tooling instructions are in English, but the user may choose to speak in another language and you should respond in that language.
But if you're unsure, always pick English. # Tooling instructions - You have access to tools for file creation, file editing, image handling, and option retrieval. - The main file is a single HTML file. Use path "index.html" unless told otherwise. - For a brand new app, call create_file exactly once with the full HTML. - For updates, call edit_file using exact string replacements. Do NOT regenerate the entire file. - Do not output raw HTML in chat. Any code changes must go through tools. - When available, use generate_images to create image URLs from prompts (you may pass multiple prompts). The image generation AI is not capable of generating images with a transparent background. - Use remove_background to remove backgrounds from provided image URLs when needed (you may pass multiple image URLs). - Use retrieve_option to fetch the full HTML for a specific option (1-based option_number) when a user references another option. # Stack-specific instructions ## Tailwind - Use this script to include Tailwind: ## html_css - Only use HTML, CSS and JS. - Do not use Tailwind ## Bootstrap - Use this script to include Bootstrap: ## React - Use these script to include React so that it can run on a standalone page: - For babel, make sure to use https://unpkg.com/@babel/standalone/babel.min.js. DO NOT USE https://cdn.babeljs.io/babel.min.js as it is not the correct version and will cause errors. - Use this script to include Tailwind: ## Ionic - Use these script to include Ionic so that it can run on a standalone page: - Use this script to include Tailwind: - ionicons for icons, add the following ## Vue - Use these script to include Vue so that it can run on a standalone page: - Use this script to include Tailwind: - Use Vue using the global build like so:
{{ message }}
## General instructions for all stacks
- You can use Google Fonts or other publicly accessible fonts.
- Except for Ionic, Font Awesome for icons:
"""


================================================
FILE: backend/prompts/update/__init__.py
================================================
from prompts.update.from_file_snapshot import build_update_prompt_from_file_snapshot
from prompts.update.from_history import build_update_prompt_from_history

__all__ = [
    "build_update_prompt_from_file_snapshot",
    "build_update_prompt_from_history",
]


================================================
FILE: backend/prompts/update/from_file_snapshot.py
================================================
from typing import cast

from openai.types.chat import ChatCompletionMessageParam

from prompts import system_prompt
from prompts.policies import build_selected_stack_policy, build_user_image_policy
from prompts.prompt_types import Stack, UserTurnInput
from prompts.message_builder import Prompt, build_history_message


def build_update_prompt_from_file_snapshot(
    stack: Stack,
    prompt: UserTurnInput,
    file_state: dict[str, str],
    image_generation_enabled: bool,
) -> Prompt:
    """Build an update prompt seeded from the current file snapshot.

    Used when no replayable history exists; the whole file content plus the
    user's request is packed into a single bootstrap user message.
    """
    # NOTE(review): `path` appears unused below — markup referencing it was
    # probably stripped during extraction; confirm against the original file.
    path = file_state.get("path", "index.html")
    request_text = prompt.get("text", "").strip() or "Apply the requested update."
    selected_stack = build_selected_stack_policy(stack)
    image_policy = build_user_image_policy(image_generation_enabled)
    bootstrap_text = f"""{selected_stack}
{image_policy}
You are editing an existing file.
{file_state["content"]}
{request_text}
"""
    return [
        cast(
            ChatCompletionMessageParam,
            {
                "role": "system",
                "content": system_prompt.SYSTEM_PROMPT,
            },
        ),
        build_history_message(
            {
                "role": "user",
                "text": bootstrap_text,
                "images": prompt.get("images", []),
                "videos": prompt.get("videos", []),
            }
        ),
    ]


================================================
FILE: backend/prompts/update/from_history.py
================================================
from typing import cast

from openai.types.chat import ChatCompletionMessageParam

from prompts import system_prompt
from prompts.policies import build_selected_stack_policy, build_user_image_policy
from prompts.prompt_types import PromptHistoryMessage, Stack
from prompts.message_builder import Prompt, build_history_message


def build_update_prompt_from_history(
    stack: Stack,
    history: list[PromptHistoryMessage],
    image_generation_enabled: bool,
) -> Prompt:
    """Replay edit history as chat messages.

    Stack and image policies are prefixed onto the FIRST user message only;
    all other history entries pass through build_history_message unchanged.
    Raises ValueError when the history contains no user message.
    """
    first_user_index = next(
        (index for index, item in enumerate(history) if item["role"] == "user"),
        -1,
    )
    if first_user_index == -1:
        raise ValueError("Update history must include at least one user message")
    prompt_messages: Prompt = [
        cast(
            ChatCompletionMessageParam,
            {
                "role": "system",
                "content": system_prompt.SYSTEM_PROMPT,
            },
        )
    ]
    selected_stack = build_selected_stack_policy(stack)
    image_policy = build_user_image_policy(image_generation_enabled)
    for index, item in enumerate(history):
        if index == first_user_index:
            stack_prefix = f"""{selected_stack}
{image_policy}"""
            user_text = item.get("text", "")
            # Avoid a dangling blank prefix when the user sent media only.
            prefixed_text = (
                f"{stack_prefix}\n\n{user_text}" if user_text.strip() else stack_prefix
            )
            prompt_messages.append(
                build_history_message(
                    {
                        "role": "user",
                        "text": prefixed_text,
                        "images": item.get("images", []),
                        "videos": item.get("videos", []),
                    }
                )
            )
            continue
        prompt_messages.append(build_history_message(item))
    return prompt_messages


================================================
FILE: backend/pyproject.toml
================================================
[tool.poetry]
name = "backend"
version = "0.1.0" description = "" authors = ["Abi Raja "] license = "MIT" package-mode = false [tool.poetry.dependencies] python = "^3.10" fastapi = "^0.115.6" uvicorn = "^0.25.0" websockets = "^14.1" openai = "2.16.0" python-dotenv = "^1.0.0" beautifulsoup4 = "^4.12.2" httpx = "^0.28.1" pre-commit = "^3.6.2" anthropic = "^0.84.0" moviepy = "^1.0.3" pillow = "^10.3.0" types-pillow = "^10.2.0.20240520" aiohttp = "^3.9.5" pydantic = "^2.10" google-genai = "^1.16.1" langfuse = "^3.0.2" [tool.poetry.group.dev.dependencies] pytest = "^7.4.3" pyright = "^1.1.352" pytest-asyncio = "^0.21" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" ================================================ FILE: backend/pyrightconfig.json ================================================ { "exclude": ["image_generation.py"], "typeCheckingMode": "basic", "reportMissingTypeStubs": "none", "reportUnknownVariableType": "warning" } ================================================ FILE: backend/pytest.ini ================================================ [pytest] testpaths = tests python_files = test_*.py python_classes = Test* python_functions = test_* addopts = -v --tb=short asyncio_mode = auto ================================================ FILE: backend/routes/evals.py ================================================ import os import asyncio import json from fastapi import APIRouter, Query, Request, HTTPException from fastapi.responses import StreamingResponse from pydantic import BaseModel from evals.utils import image_to_data_url from evals.config import EVALS_DIR from typing import Set from evals.runner import run_image_evals, count_pending_eval_tasks from typing import List, Dict from llm import Llm from prompts.prompt_types import Stack from pathlib import Path from fs_logging.openai_input_compare import ( compare_openai_inputs, format_openai_input_comparison, ) router = APIRouter() # Update this if the number of outputs generated per input changes N = 1 
class Eval(BaseModel):
    # One eval case: input screenshot (data URL) plus N generated outputs.
    input: str
    outputs: list[str]


class InputFile(BaseModel):
    name: str
    path: str


@router.get("/eval_input_files", response_model=List[InputFile])
async def get_eval_input_files():
    """Get a list of all input files available for evaluations"""
    input_dir = os.path.join(EVALS_DIR, "inputs")
    try:
        files: list[InputFile] = []
        for filename in os.listdir(input_dir):
            if filename.endswith(".png"):
                file_path = os.path.join(input_dir, filename)
                files.append(InputFile(name=filename, path=file_path))
        return sorted(files, key=lambda x: x.name)
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error reading input files: {str(e)}"
        )


@router.get("/evals", response_model=list[Eval])
async def get_evals(folder: str):
    """Serve (input image, output HTML) pairs for a single results folder.

    Output files are matched to inputs by base name (filename with its trailing
    "_<suffix>" or ".html" removed); inputs without a matching PNG are skipped.
    """
    if not folder:
        raise HTTPException(status_code=400, detail="Folder path is required")
    folder_path = Path(folder)
    if not folder_path.exists():
        raise HTTPException(status_code=404, detail=f"Folder not found: {folder}")
    try:
        evals: list[Eval] = []
        # Get all HTML files from folder
        files = {
            f: os.path.join(folder, f)
            for f in os.listdir(folder)
            if f.endswith(".html")
        }
        # Extract base names
        base_names: Set[str] = set()
        for filename in files.keys():
            base_name = (
                filename.rsplit("_", 1)[0]
                if "_" in filename
                else filename.replace(".html", "")
            )
            base_names.add(base_name)
        for base_name in base_names:
            input_path = os.path.join(EVALS_DIR, "inputs", f"{base_name}.png")
            if not os.path.exists(input_path):
                continue
            # Find matching output file (first prefix match wins)
            output_file = None
            for filename, filepath in files.items():
                if filename.startswith(base_name):
                    output_file = filepath
                    break
            if output_file:
                input_data = await image_to_data_url(input_path)
                with open(output_file, "r", encoding="utf-8") as f:
                    output_html = f.read()
                evals.append(Eval(input=input_data, outputs=[output_html]))
        return evals
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing evals: {str(e)}")


class PairwiseEvalResponse(BaseModel):
    evals: list[Eval]
    folder1_name:
str folder2_name: str @router.get("/pairwise-evals", response_model=PairwiseEvalResponse) async def get_pairwise_evals( folder1: str = Query( "...", description="Absolute path to first folder", ), folder2: str = Query( "..", description="Absolute path to second folder", ), ): if not os.path.exists(folder1) or not os.path.exists(folder2): return {"error": "One or both folders do not exist"} evals: list[Eval] = [] # Get all HTML files from first folder files1 = { f: os.path.join(folder1, f) for f in os.listdir(folder1) if f.endswith(".html") } files2 = { f: os.path.join(folder2, f) for f in os.listdir(folder2) if f.endswith(".html") } # Find common base names (ignoring any suffixes) common_names: Set[str] = set() for f1 in files1.keys(): base_name: str = f1.rsplit("_", 1)[0] if "_" in f1 else f1.replace(".html", "") for f2 in files2.keys(): if f2.startswith(base_name): common_names.add(base_name) # For each matching pair, create an eval for base_name in common_names: # Find the corresponding input image input_image = None input_path = os.path.join(EVALS_DIR, "inputs", f"{base_name}.png") if os.path.exists(input_path): input_image = await image_to_data_url(input_path) else: input_image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=" # 1x1 transparent PNG # Get the HTML contents output1 = None output2 = None # Find matching files in folder1 for f1 in files1.keys(): if f1.startswith(base_name): with open(files1[f1], "r") as f: output1 = f.read() break # Find matching files in folder2 for f2 in files2.keys(): if f2.startswith(base_name): with open(files2[f2], "r") as f: output2 = f.read() break if output1 and output2: evals.append(Eval(input=input_image, outputs=[output1, output2])) # Extract folder names for the UI folder1_name = os.path.basename(folder1) folder2_name = os.path.basename(folder2) return PairwiseEvalResponse( evals=evals, folder1_name=folder1_name, folder2_name=folder2_name ) class 
RunEvalsRequest(BaseModel):
    models: List[str]
    stack: Stack
    files: List[str] = []  # Optional list of specific file paths to run evals on
    diff_mode: bool = False


class OpenAIInputCompareRequest(BaseModel):
    left_json: str
    right_json: str


class OpenAIInputCompareDifferenceResponse(BaseModel):
    item_index: int
    path: str
    left_summary: str
    right_summary: str
    left_value: object | None
    right_value: object | None


class OpenAIInputCompareResponse(BaseModel):
    common_prefix_items: int
    left_item_count: int
    right_item_count: int
    difference: OpenAIInputCompareDifferenceResponse | None
    formatted: str


def _load_openai_input_compare_payload(raw_json: str, side: str) -> object:
    # Parse one side's JSON, then validate its shape by running it through the
    # comparator against itself; both failure modes surface as HTTP 400.
    try:
        payload = json.loads(raw_json)
    except json.JSONDecodeError as error:
        raise HTTPException(
            status_code=400,
            detail=(
                f"Invalid {side} JSON: {error.msg} "
                f"(line {error.lineno}, column {error.colno})"
            ),
        )
    try:
        compare_openai_inputs(payload, payload)
    except ValueError as error:
        raise HTTPException(status_code=400, detail=f"Invalid {side} payload: {error}")
    return payload


@router.post("/openai-input-compare", response_model=OpenAIInputCompareResponse)
async def compare_openai_inputs_for_evals(
    request: OpenAIInputCompareRequest,
) -> OpenAIInputCompareResponse:
    """Compare two logged OpenAI input payloads and report the first divergence."""
    left_payload = _load_openai_input_compare_payload(request.left_json, "left")
    right_payload = _load_openai_input_compare_payload(request.right_json, "right")
    comparison = compare_openai_inputs(left_payload, right_payload)
    difference = None
    if comparison.difference is not None:
        difference = OpenAIInputCompareDifferenceResponse(
            item_index=comparison.difference.item_index,
            path=comparison.difference.path,
            left_summary=comparison.difference.left_summary,
            right_summary=comparison.difference.right_summary,
            left_value=comparison.difference.left_value,
            right_value=comparison.difference.right_value,
        )
    return OpenAIInputCompareResponse(
        common_prefix_items=comparison.common_prefix_items,
        left_item_count=comparison.left_item_count,
        right_item_count=comparison.right_item_count,
        difference=difference,
        formatted=format_openai_input_comparison(comparison),
    )


@router.post("/run_evals", response_model=List[str])
async def run_evals(request: RunEvalsRequest) -> List[str]:
    """Run evaluations on selected images in the inputs directory for multiple models"""
    all_output_files: List[str] = []
    for model in request.models:
        output_files = await run_image_evals(
            model=model,
            stack=request.stack,
            input_files=request.files,
            diff_mode=request.diff_mode,
        )
        all_output_files.extend(output_files)
    return all_output_files


def _count_eval_files(selected_files: List[str]) -> int:
    # Count .png inputs: the explicit selection when given, else the inputs dir.
    if selected_files:
        return len([f for f in selected_files if f.endswith(".png")])
    input_dir = os.path.join(EVALS_DIR, "inputs")
    return len([f for f in os.listdir(input_dir) if f.endswith(".png")])


@router.post("/run_evals_stream")
async def run_evals_stream(request: RunEvalsRequest):
    """Run evaluations and stream progress events as newline-delimited JSON."""
    if not request.models:
        raise HTTPException(status_code=400, detail="At least one model is required")
    per_model_task_counts: Dict[str, int] = {}
    per_model_skipped_existing: Dict[str, int] = {}
    if request.diff_mode:
        # In diff mode only pending (not-yet-generated) tasks count as work.
        for model in request.models:
            pending_tasks, skipped_tasks = count_pending_eval_tasks(
                stack=request.stack,
                model=model,
                input_files=request.files,
                n=N,
                diff_mode=True,
            )
            per_model_task_counts[model] = pending_tasks
            per_model_skipped_existing[model] = skipped_tasks
    else:
        per_model_task_count = _count_eval_files(request.files)
        for model in request.models:
            per_model_task_counts[model] = per_model_task_count
            per_model_skipped_existing[model] = 0
    total_tasks = sum(per_model_task_counts.values())
    total_skipped_existing = sum(per_model_skipped_existing.values())

    async def event_generator():
        # Producer/consumer: run_all_models pushes progress events onto the
        # queue while this generator drains it into the HTTP response body.
        queue: asyncio.Queue[dict] = asyncio.Queue()

        async def emit(event: dict) -> None:
            await queue.put(event)

        async def run_all_models() -> None:
            all_output_files: List[str] = []
            completed_offset = 0
            try:
                await emit(
                    {
                        "type": "start",
                        "total_models": len(request.models),
                        "tasks_per_model": per_model_task_counts,
                        "total_tasks": total_tasks,
                        "completed_tasks": 0,
                        "diff_mode": request.diff_mode,
                        "total_skipped_existing": total_skipped_existing,
                    }
                )
                for model_index, model in enumerate(request.models, start=1):
                    model_task_count = per_model_task_counts.get(model, 0)
                    model_skipped_existing = per_model_skipped_existing.get(model, 0)
                    await emit(
                        {
                            "type": "model_start",
                            "model": model,
                            "model_index": model_index,
                            "total_models": len(request.models),
                            "model_tasks": model_task_count,
                            "model_skipped_existing": model_skipped_existing,
                        }
                    )

                    async def on_progress(event: dict) -> None:
                        # Re-scope per-model progress into global progress.
                        await emit(
                            {
                                **event,
                                "model": model,
                                "model_index": model_index,
                                "total_models": len(request.models),
                                "global_completed_tasks": completed_offset
                                + event.get("completed_tasks", 0),
                                "global_total_tasks": total_tasks,
                            }
                        )

                    output_files = await run_image_evals(
                        model=model,
                        stack=request.stack,
                        input_files=request.files,
                        diff_mode=request.diff_mode,
                        progress_callback=on_progress,
                    )
                    all_output_files.extend(output_files)
                    completed_offset += model_task_count
                await emit(
                    {
                        "type": "complete",
                        "completed_tasks": total_tasks,
                        "total_tasks": total_tasks,
                        "output_files": all_output_files,
                    }
                )
            except Exception as e:
                await emit({"type": "error", "message": str(e)})
            finally:
                # Always signal end-of-stream so the consumer loop terminates.
                await emit({"type": "done"})

        producer = asyncio.create_task(run_all_models())
        while True:
            event = await queue.get()
            if event.get("type") == "done":
                break
            yield json.dumps(event) + "\n"
        await producer

    return StreamingResponse(event_generator(), media_type="application/x-ndjson")


@router.get("/models", response_model=Dict[str, List[str]])
async def get_models():
    """List available model ids and supported stacks."""
    current_models = [model.value for model in Llm]
    # Import Stack type from prompts.prompt_types and get all literal values
    available_stacks = list(Stack.__args__)
    return {"models": current_models, "stacks": available_stacks}


class BestOfNEvalsResponse(BaseModel):
    evals: list[Eval]
folder_names: list[str] @router.get("/best-of-n-evals", response_model=BestOfNEvalsResponse) async def get_best_of_n_evals(request: Request): # Get all query parameters query_params = dict(request.query_params) # Extract all folder paths (folder1, folder2, folder3, etc.) folders: list[str] = [] i = 1 while f"folder{i}" in query_params: folders.append(query_params[f"folder{i}"]) i += 1 if not folders: return {"error": "No folders provided"} # Validate folders exist for folder in folders: if not os.path.exists(folder): return {"error": f"Folder does not exist: {folder}"} evals: list[Eval] = [] folder_names = [os.path.basename(folder) for folder in folders] # Get HTML files from all folders files_by_folder = [] for folder in folders: files = { f: os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".html") } files_by_folder.append(files) # Find common base names across all folders common_names: Set[str] = set() base_names_first_folder = { f.rsplit("_", 1)[0] if "_" in f else f.replace(".html", "") for f in files_by_folder[0].keys() } for base_name in base_names_first_folder: found_in_all = True for folder_files in files_by_folder[1:]: if not any(f.startswith(base_name) for f in folder_files.keys()): found_in_all = False break if found_in_all: common_names.add(base_name) # For each matching set, create an eval for base_name in common_names: # Find the corresponding input image input_image = None input_path = os.path.join(EVALS_DIR, "inputs", f"{base_name}.png") if os.path.exists(input_path): input_image = await image_to_data_url(input_path) else: input_image = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=" # Get HTML contents from all folders outputs: list[str] = [] for folder_files in files_by_folder: output_content: str | None = None for filename in folder_files.keys(): if filename.startswith(base_name): with open(folder_files[filename], "r") as f: output_content = f.read() break if 
output_content: outputs.append(output_content) else: outputs.append("Output not found") if len(outputs) == len(folders): # Only add if we have outputs from all folders evals.append(Eval(input=input_image, outputs=outputs)) return BestOfNEvalsResponse(evals=evals, folder_names=folder_names) class OutputFolder(BaseModel): name: str path: str modified_time: float @router.get("/output_folders", response_model=List[OutputFolder]) async def get_output_folders(): """Get a list of all output folders available for evaluations, sorted by recently modified""" output_dir = os.path.join(EVALS_DIR, "results") try: folders: list[OutputFolder] = [] for folder_name in os.listdir(output_dir): folder_path = os.path.join(output_dir, folder_name) if os.path.isdir(folder_path) and not folder_name.startswith("."): # Get modification time modified_time = os.path.getmtime(folder_path) folders.append( OutputFolder( name=folder_name, path=folder_path, modified_time=modified_time ) ) # Sort by modified time, most recent first return sorted(folders, key=lambda x: x.modified_time, reverse=True) except Exception as e: raise HTTPException( status_code=500, detail=f"Error reading output folders: {str(e)}" ) ================================================ FILE: backend/routes/generate_code.py ================================================ import asyncio from dataclasses import dataclass, field from abc import ABC, abstractmethod import traceback from typing import Callable, Awaitable from fastapi import APIRouter, WebSocket import openai from websockets.exceptions import ConnectionClosedOK, ConnectionClosedError from config import ( ANTHROPIC_API_KEY, GEMINI_API_KEY, IS_DEBUG_ENABLED, IS_PROD, NUM_VARIANTS, NUM_VARIANTS_VIDEO, OPENAI_API_KEY, OPENAI_BASE_URL, REPLICATE_API_KEY, ) from custom_types import InputMode from llm import ( Llm, ) from typing import ( Any, Callable, Coroutine, Dict, List, Literal, cast, get_args, ) from openai.types.chat import ChatCompletionMessageParam from utils 
import print_prompt_preview # WebSocket message types MessageType = Literal[ "chunk", "status", "setCode", "error", "variantComplete", "variantError", "variantCount", "variantModels", "thinking", "assistant", "toolStart", "toolResult", ] from prompts.pipeline import build_prompt_messages from prompts.request_parsing import parse_prompt_content, parse_prompt_history from prompts.prompt_types import PromptHistoryMessage, Stack, UserTurnInput from agent.runner import Agent from routes.model_choice_sets import ( ALL_KEYS_MODELS_DEFAULT, ALL_KEYS_MODELS_TEXT_CREATE, ALL_KEYS_MODELS_UPDATE, ANTHROPIC_ONLY_MODELS, GEMINI_ANTHROPIC_MODELS, GEMINI_OPENAI_MODELS, GEMINI_ONLY_MODELS, OPENAI_ANTHROPIC_MODELS, OPENAI_ONLY_MODELS, VIDEO_VARIANT_MODELS, ) # from utils import pprint_prompt from ws.constants import APP_ERROR_WEB_SOCKET_CODE # type: ignore router = APIRouter() @dataclass class PipelineContext: """Context object that carries state through the pipeline""" websocket: WebSocket ws_comm: "WebSocketCommunicator | None" = None params: Dict[str, Any] = field(default_factory=dict) extracted_params: "ExtractedParams | None" = None prompt_messages: List[ChatCompletionMessageParam] = field(default_factory=list) variant_models: List[Llm] = field(default_factory=list) completions: List[str] = field(default_factory=list) variant_completions: Dict[int, str] = field(default_factory=dict) metadata: Dict[str, Any] = field(default_factory=dict) @property def send_message(self): assert self.ws_comm is not None return self.ws_comm.send_message @property def throw_error(self): assert self.ws_comm is not None return self.ws_comm.throw_error class Middleware(ABC): """Base class for all pipeline middleware""" @abstractmethod async def process( self, context: PipelineContext, next_func: Callable[[], Awaitable[None]] ) -> None: """Process the context and call the next middleware""" pass class Pipeline: """Pipeline for processing WebSocket code generation requests""" def __init__(self): 
self.middlewares: List[Middleware] = [] def use(self, middleware: Middleware) -> "Pipeline": """Add a middleware to the pipeline""" self.middlewares.append(middleware) return self async def execute(self, websocket: WebSocket) -> None: """Execute the pipeline with the given WebSocket""" context = PipelineContext(websocket=websocket) # Build the middleware chain async def start(ctx: PipelineContext): pass # End of pipeline chain = start for middleware in reversed(self.middlewares): chain = self._wrap_middleware(middleware, chain) await chain(context) def _wrap_middleware( self, middleware: Middleware, next_func: Callable[[PipelineContext], Awaitable[None]], ) -> Callable[[PipelineContext], Awaitable[None]]: """Wrap a middleware with its next function""" async def wrapped(context: PipelineContext) -> None: await middleware.process(context, lambda: next_func(context)) return wrapped class WebSocketCommunicator: """Handles WebSocket communication with consistent error handling""" def __init__(self, websocket: WebSocket): self.websocket = websocket self.is_closed = False async def accept(self) -> None: """Accept the WebSocket connection""" await self.websocket.accept() print("Incoming websocket connection...") async def send_message( self, type: MessageType, value: str | None, variantIndex: int, data: Dict[str, Any] | None = None, eventId: str | None = None, ) -> None: """Send a message to the client with debug logging""" if self.is_closed: return # Print for debugging on the backend if type == "error": print(f"Error (variant {variantIndex + 1}): {value}") elif type == "status": print(f"Status (variant {variantIndex + 1}): {value}") elif type == "variantComplete": print(f"Variant {variantIndex + 1} complete") elif type == "variantError": print(f"Variant {variantIndex + 1} error: {value}") try: payload: Dict[str, Any] = {"type": type, "variantIndex": variantIndex} if value is not None: payload["value"] = value if data is not None: payload["data"] = data if eventId is not 
None: payload["eventId"] = eventId await self.websocket.send_json(payload) except (ConnectionClosedOK, ConnectionClosedError): print(f"WebSocket closed by client, skipping message: {type}") self.is_closed = True async def throw_error(self, message: str) -> None: """Send an error message and close the connection""" print(message) if not self.is_closed: try: await self.websocket.send_json({"type": "error", "value": message}) await self.websocket.close(APP_ERROR_WEB_SOCKET_CODE) except (ConnectionClosedOK, ConnectionClosedError): print("WebSocket already closed by client") self.is_closed = True async def receive_params(self) -> Dict[str, Any]: """Receive parameters from the client""" params: Dict[str, Any] = await self.websocket.receive_json() print("Received params") return params async def close(self) -> None: """Close the WebSocket connection""" if not self.is_closed: try: await self.websocket.close() except (ConnectionClosedOK, ConnectionClosedError): pass # Already closed by client self.is_closed = True @dataclass class ExtractedParams: stack: Stack input_mode: InputMode should_generate_images: bool openai_api_key: str | None anthropic_api_key: str | None gemini_api_key: str | None openai_base_url: str | None generation_type: Literal["create", "update"] prompt: UserTurnInput history: List[PromptHistoryMessage] file_state: Dict[str, str] | None option_codes: List[str] class ParameterExtractionStage: """Handles parameter extraction and validation from WebSocket requests""" def __init__(self, throw_error: Callable[[str], Coroutine[Any, Any, None]]): self.throw_error = throw_error async def extract_and_validate(self, params: Dict[str, Any]) -> ExtractedParams: """Extract and validate all parameters from the request""" # Read the code config settings (stack) from the request. 
generated_code_config = params.get("generatedCodeConfig", "") if generated_code_config not in get_args(Stack): await self.throw_error( f"Invalid generated code config: {generated_code_config}" ) raise ValueError(f"Invalid generated code config: {generated_code_config}") validated_stack = cast(Stack, generated_code_config) # Validate the input mode input_mode = params.get("inputMode") if input_mode not in get_args(InputMode): await self.throw_error(f"Invalid input mode: {input_mode}") raise ValueError(f"Invalid input mode: {input_mode}") validated_input_mode = cast(InputMode, input_mode) openai_api_key = self._get_from_settings_dialog_or_env( params, "openAiApiKey", OPENAI_API_KEY ) # If neither is provided, we throw an error later only if Claude is used. anthropic_api_key = self._get_from_settings_dialog_or_env( params, "anthropicApiKey", ANTHROPIC_API_KEY ) gemini_api_key = self._get_from_settings_dialog_or_env( params, "geminiApiKey", GEMINI_API_KEY ) # Base URL for OpenAI API openai_base_url: str | None = None # Disable user-specified OpenAI Base URL in prod if not IS_PROD: openai_base_url = self._get_from_settings_dialog_or_env( params, "openAiBaseURL", OPENAI_BASE_URL ) if not openai_base_url: print("Using official OpenAI URL") # Get the image generation flag from the request. Fall back to True if not provided. 
should_generate_images = bool(params.get("isImageGenerationEnabled", True)) # Extract and validate generation type generation_type = params.get("generationType", "create") if generation_type not in ["create", "update"]: await self.throw_error(f"Invalid generation type: {generation_type}") raise ValueError(f"Invalid generation type: {generation_type}") generation_type = cast(Literal["create", "update"], generation_type) # Extract prompt content prompt: UserTurnInput = parse_prompt_content(params.get("prompt")) # Extract history (default to empty list) history: List[PromptHistoryMessage] = parse_prompt_history( params.get("history") ) # Extract file state for agent edits raw_file_state = params.get("fileState") file_state: Dict[str, str] | None = None if isinstance(raw_file_state, dict): content = raw_file_state.get("content") if isinstance(content, str) and content.strip(): path = raw_file_state.get("path") or "index.html" file_state = {"path": path, "content": content} raw_option_codes = params.get("optionCodes") option_codes: List[str] = [] if isinstance(raw_option_codes, list): for entry in raw_option_codes: if isinstance(entry, str): option_codes.append(entry) elif entry is None: option_codes.append("") else: option_codes.append(str(entry)) return ExtractedParams( stack=validated_stack, input_mode=validated_input_mode, should_generate_images=should_generate_images, openai_api_key=openai_api_key, anthropic_api_key=anthropic_api_key, gemini_api_key=gemini_api_key, openai_base_url=openai_base_url, generation_type=generation_type, prompt=prompt, history=history, file_state=file_state, option_codes=option_codes, ) def _get_from_settings_dialog_or_env( self, params: dict[str, Any], key: str, env_var: str | None ) -> str | None: """Get value from client settings or environment variable""" value = params.get(key) if value: print(f"Using {key} from client-side settings dialog") return value if env_var: print(f"Using {key} from environment variable") return env_var return 
None class ModelSelectionStage: """Handles selection of variant models based on available API keys and generation type""" def __init__(self, throw_error: Callable[[str], Coroutine[Any, Any, None]]): self.throw_error = throw_error async def select_models( self, generation_type: Literal["create", "update"], input_mode: InputMode, openai_api_key: str | None, anthropic_api_key: str | None, gemini_api_key: str | None = None, ) -> List[Llm]: """Select appropriate models based on available API keys""" try: num_variants = 2 if generation_type == "update" else NUM_VARIANTS variant_models = self._get_variant_models( generation_type, input_mode, num_variants, openai_api_key, anthropic_api_key, gemini_api_key, ) # Print the variant models (one per line) print("Variant models:") for index, model in enumerate(variant_models): print(f"Variant {index + 1}: {model.value}") return variant_models except Exception: await self.throw_error( "No OpenAI, Anthropic, or Gemini API key found. Please add the environment variable " "OPENAI_API_KEY, ANTHROPIC_API_KEY, or GEMINI_API_KEY to backend/.env or in the settings dialog. " "If you add it to .env, make sure to restart the backend server." ) raise Exception("No API key") def _get_variant_models( self, generation_type: Literal["create", "update"], input_mode: InputMode, num_variants: int, openai_api_key: str | None, anthropic_api_key: str | None, gemini_api_key: str | None, ) -> List[Llm]: """Simple model cycling that scales with num_variants""" # Video mode requires Gemini - 2 variants for comparison if input_mode == "video": if not gemini_api_key: raise Exception( "Video mode requires a Gemini API key. 
" "Please add GEMINI_API_KEY to backend/.env or in the settings dialog" ) return list(VIDEO_VARIANT_MODELS) # Define models based on available API keys if gemini_api_key and anthropic_api_key and openai_api_key: if input_mode == "text" and generation_type == "create": models = list(ALL_KEYS_MODELS_TEXT_CREATE) elif generation_type == "update": models = list(ALL_KEYS_MODELS_UPDATE) else: models = list(ALL_KEYS_MODELS_DEFAULT) elif gemini_api_key and anthropic_api_key: models = list(GEMINI_ANTHROPIC_MODELS) elif gemini_api_key and openai_api_key: models = list(GEMINI_OPENAI_MODELS) elif openai_api_key and anthropic_api_key: models = list(OPENAI_ANTHROPIC_MODELS) elif gemini_api_key: models = list(GEMINI_ONLY_MODELS) elif anthropic_api_key: models = list(ANTHROPIC_ONLY_MODELS) elif openai_api_key: models = list(OPENAI_ONLY_MODELS) else: raise Exception("No OpenAI or Anthropic key") # Cycle through models: [A, B] with num=5 becomes [A, B, A, B, A] selected_models: List[Llm] = [] for i in range(num_variants): selected_models.append(models[i % len(models)]) return selected_models class PromptCreationStage: """Handles prompt assembly for code generation""" def __init__(self, throw_error: Callable[[str], Coroutine[Any, Any, None]]): self.throw_error = throw_error async def build_prompt_messages( self, extracted_params: ExtractedParams, ) -> List[ChatCompletionMessageParam]: """Create prompt messages""" try: prompt_messages = await build_prompt_messages( stack=extracted_params.stack, input_mode=extracted_params.input_mode, generation_type=extracted_params.generation_type, prompt=extracted_params.prompt, history=extracted_params.history, file_state=extracted_params.file_state, image_generation_enabled=extracted_params.should_generate_images, ) print_prompt_preview(prompt_messages) return prompt_messages except Exception: await self.throw_error( "Error assembling prompt. 
Contact support at support@picoapps.xyz" ) raise class PostProcessingStage: """Handles post-processing after code generation completes""" def __init__(self): pass async def process_completions( self, completions: List[str], websocket: WebSocket, ) -> None: """Process completions and perform cleanup.""" return None class AgenticGenerationStage: """Handles agent tool-calling generation for each variant.""" def __init__( self, send_message: Callable[[MessageType, str | None, int, Dict[str, Any] | None, str | None], Coroutine[Any, Any, None]], openai_api_key: str | None, openai_base_url: str | None, anthropic_api_key: str | None, gemini_api_key: str | None, should_generate_images: bool, file_state: Dict[str, str] | None, option_codes: List[str] | None, ): self.send_message = send_message self.openai_api_key = openai_api_key self.openai_base_url = openai_base_url self.anthropic_api_key = anthropic_api_key self.gemini_api_key = gemini_api_key self.should_generate_images = should_generate_images self.file_state = file_state self.option_codes = option_codes or [] async def process_variants( self, variant_models: List[Llm], prompt_messages: List[ChatCompletionMessageParam], ) -> Dict[int, str]: tasks: List[asyncio.Task[str]] = [] for index, model in enumerate(variant_models): tasks.append( asyncio.create_task( self._run_variant(index, model, prompt_messages) ) ) results = await asyncio.gather(*tasks, return_exceptions=True) variant_completions: Dict[int, str] = {} for index, result in enumerate(results): if isinstance(result, BaseException): print(f"Variant {index + 1} failed: {result}") continue if result: variant_completions[index] = result return variant_completions async def _run_variant( self, index: int, model: Llm, prompt_messages: List[ChatCompletionMessageParam], ) -> str: try: async def send_runner_message( type: str, value: str | None, variant_index: int, data: Dict[str, Any] | None, event_id: str | None, ) -> None: await self.send_message( cast(MessageType, 
type), value, variant_index, data, event_id, ) runner = Agent( send_message=send_runner_message, variant_index=index, openai_api_key=self.openai_api_key, openai_base_url=self.openai_base_url, anthropic_api_key=self.anthropic_api_key, gemini_api_key=self.gemini_api_key, should_generate_images=self.should_generate_images, initial_file_state=self.file_state, option_codes=self.option_codes, ) completion = await runner.run(model, prompt_messages) if completion: await self.send_message("setCode", completion, index, None, None) await self.send_message( "variantComplete", "Variant generation complete", index, None, None, ) return completion except openai.AuthenticationError as e: print(f"[VARIANT {index + 1}] OpenAI Authentication failed", e) error_message = ( "Incorrect OpenAI key. Please make sure your OpenAI API key is correct, " "or create a new OpenAI API key on your OpenAI dashboard." + ( " Alternatively, you can purchase code generation credits directly on this website." if IS_PROD else "" ) ) await self.send_message("variantError", error_message, index, None, None) return "" except openai.NotFoundError as e: print(f"[VARIANT {index + 1}] OpenAI Model not found", e) error_message = ( e.message + ". Please make sure you have followed the instructions correctly to obtain " "an OpenAI key with GPT vision access: " "https://github.com/abi/screenshot-to-code/blob/main/Troubleshooting.md" + ( " Alternatively, you can purchase code generation credits directly on this website." if IS_PROD else "" ) ) await self.send_message("variantError", error_message, index, None, None) return "" except openai.RateLimitError as e: print(f"[VARIANT {index + 1}] OpenAI Rate limit exceeded", e) error_message = ( "OpenAI error - 'You exceeded your current quota, please check your plan and billing details.'" + ( " Alternatively, you can purchase code generation credits directly on this website." 
if IS_PROD else "" ) ) await self.send_message("variantError", error_message, index, None, None) return "" except Exception as e: print(f"Error in variant {index + 1}: {e}") traceback.print_exception(type(e), e, e.__traceback__) await self.send_message("variantError", str(e), index, None, None) return "" # Pipeline Middleware Implementations class WebSocketSetupMiddleware(Middleware): """Handles WebSocket setup and teardown""" async def process( self, context: PipelineContext, next_func: Callable[[], Awaitable[None]] ) -> None: # Create and setup WebSocket communicator context.ws_comm = WebSocketCommunicator(context.websocket) await context.ws_comm.accept() try: await next_func() finally: # Always close the WebSocket await context.ws_comm.close() class ParameterExtractionMiddleware(Middleware): """Handles parameter extraction and validation""" async def process( self, context: PipelineContext, next_func: Callable[[], Awaitable[None]] ) -> None: # Receive parameters assert context.ws_comm is not None context.params = await context.ws_comm.receive_params() # Extract and validate param_extractor = ParameterExtractionStage(context.throw_error) context.extracted_params = await param_extractor.extract_and_validate( context.params ) # Log what we're generating print( f"Generating {context.extracted_params.stack} code in {context.extracted_params.input_mode} mode" ) await next_func() class StatusBroadcastMiddleware(Middleware): """Sends initial status messages to all variants""" async def process( self, context: PipelineContext, next_func: Callable[[], Awaitable[None]] ) -> None: # Determine variant count based on input mode and generation type. # Edit/update flows use two variants to keep latency and cost down. 
assert context.extracted_params is not None is_video_mode = context.extracted_params.input_mode == "video" is_update = context.extracted_params.generation_type == "update" num_variants = ( NUM_VARIANTS_VIDEO if is_video_mode else 2 if is_update else NUM_VARIANTS ) # Tell frontend how many variants we're using await context.send_message("variantCount", str(num_variants), 0) for i in range(num_variants): await context.send_message("status", "Generating code...", i) await next_func() class PromptCreationMiddleware(Middleware): """Handles prompt creation""" async def process( self, context: PipelineContext, next_func: Callable[[], Awaitable[None]] ) -> None: prompt_creator = PromptCreationStage(context.throw_error) assert context.extracted_params is not None context.prompt_messages = await prompt_creator.build_prompt_messages( context.extracted_params, ) await next_func() class CodeGenerationMiddleware(Middleware): """Handles the main code generation logic""" async def process( self, context: PipelineContext, next_func: Callable[[], Awaitable[None]] ) -> None: try: assert context.extracted_params is not None # Select models (handles video mode internally) model_selector = ModelSelectionStage(context.throw_error) context.variant_models = await model_selector.select_models( generation_type=context.extracted_params.generation_type, input_mode=context.extracted_params.input_mode, openai_api_key=context.extracted_params.openai_api_key, anthropic_api_key=context.extracted_params.anthropic_api_key, gemini_api_key=context.extracted_params.gemini_api_key, ) if IS_DEBUG_ENABLED: await context.send_message( "variantModels", None, 0, {"models": [model.value for model in context.variant_models]}, None, ) generation_stage = AgenticGenerationStage( send_message=context.send_message, openai_api_key=context.extracted_params.openai_api_key, openai_base_url=context.extracted_params.openai_base_url, anthropic_api_key=context.extracted_params.anthropic_api_key, 
gemini_api_key=context.extracted_params.gemini_api_key, should_generate_images=context.extracted_params.should_generate_images, file_state=context.extracted_params.file_state, option_codes=context.extracted_params.option_codes, ) context.variant_completions = await generation_stage.process_variants( variant_models=context.variant_models, prompt_messages=context.prompt_messages, ) # Check if all variants failed if len(context.variant_completions) == 0: await context.throw_error( "Error generating code. Please contact support." ) return # Don't continue the pipeline # Convert to list format context.completions = [] for i in range(len(context.variant_models)): if i in context.variant_completions: context.completions.append(context.variant_completions[i]) else: context.completions.append("") except Exception as e: print(f"[GENERATE_CODE] Unexpected error: {e}") await context.throw_error(f"An unexpected error occurred: {str(e)}") return # Don't continue the pipeline await next_func() class PostProcessingMiddleware(Middleware): """Handles post-processing and logging""" async def process( self, context: PipelineContext, next_func: Callable[[], Awaitable[None]] ) -> None: post_processor = PostProcessingStage() await post_processor.process_completions( context.completions, context.websocket ) await next_func() @router.websocket("/generate-code") async def stream_code(websocket: WebSocket): """Handle WebSocket code generation requests using a pipeline pattern""" pipeline = Pipeline() # Configure the pipeline pipeline.use(WebSocketSetupMiddleware()) pipeline.use(ParameterExtractionMiddleware()) pipeline.use(StatusBroadcastMiddleware()) pipeline.use(PromptCreationMiddleware()) pipeline.use(CodeGenerationMiddleware()) pipeline.use(PostProcessingMiddleware()) # Execute the pipeline await pipeline.execute(websocket) ================================================ FILE: backend/routes/home.py ================================================ from fastapi import APIRouter from 
fastapi.responses import HTMLResponse router = APIRouter() @router.get("/") async def get_status(): return HTMLResponse( content="

Your backend is running correctly. Please open the front-end URL (default is http://localhost:5173) to use screenshot-to-code.

"
    )


================================================
FILE: backend/routes/model_choice_sets.py
================================================
from llm import Llm

# Model line-ups consumed by ModelSelectionStage in routes/generate_code.py.
# When more variants are requested than a tuple contains, the tuple is
# cycled ([A, B] with 5 variants becomes [A, B, A, B, A]).

# Video variants always use Gemini.
VIDEO_VARIANT_MODELS = (
    Llm.GEMINI_3_FLASH_PREVIEW_MINIMAL,
    Llm.GEMINI_3_1_PRO_PREVIEW_HIGH,
)

# All API keys available.
# Image (Create)
ALL_KEYS_MODELS_DEFAULT = (
    Llm.GEMINI_3_FLASH_PREVIEW_MINIMAL,
    Llm.GPT_5_2_CODEX_HIGH,
    Llm.GEMINI_3_FLASH_PREVIEW_HIGH,
    Llm.GEMINI_3_1_PRO_PREVIEW_HIGH,
)

# Text (Create)
ALL_KEYS_MODELS_TEXT_CREATE = (
    Llm.GEMINI_3_FLASH_PREVIEW_MINIMAL,
    Llm.GPT_5_2_CODEX_HIGH,
    Llm.CLAUDE_OPUS_4_6,
    Llm.GEMINI_3_1_PRO_PREVIEW_LOW,
)

# Image + Text (Update) — only two models: update flows use two variants.
ALL_KEYS_MODELS_UPDATE = (
    Llm.GEMINI_3_FLASH_PREVIEW_MINIMAL,
    Llm.GPT_5_4_2026_03_05_LOW,
)

# Key subset fallbacks.
GEMINI_ANTHROPIC_MODELS = (
    Llm.GEMINI_3_FLASH_PREVIEW_MINIMAL,
    Llm.GEMINI_3_1_PRO_PREVIEW_LOW,
    Llm.CLAUDE_OPUS_4_6,
    Llm.GEMINI_3_FLASH_PREVIEW_HIGH,
    Llm.GEMINI_3_1_PRO_PREVIEW_HIGH,
)

GEMINI_OPENAI_MODELS = (
    Llm.GEMINI_3_FLASH_PREVIEW_MINIMAL,
    Llm.GEMINI_3_1_PRO_PREVIEW_LOW,
    Llm.GPT_5_2_CODEX_HIGH,
    Llm.GPT_5_2_CODEX_MEDIUM,
)

OPENAI_ANTHROPIC_MODELS = (
    Llm.CLAUDE_OPUS_4_6,
    Llm.GPT_5_2_CODEX_HIGH,
    Llm.GPT_5_2_CODEX_MEDIUM,
)

# Single-provider fallbacks.
GEMINI_ONLY_MODELS = (
    Llm.GEMINI_3_FLASH_PREVIEW_MINIMAL,
    Llm.GEMINI_3_1_PRO_PREVIEW_LOW,
    Llm.GEMINI_3_FLASH_PREVIEW_HIGH,
    Llm.GEMINI_3_1_PRO_PREVIEW_HIGH,
)

ANTHROPIC_ONLY_MODELS = (
    Llm.CLAUDE_OPUS_4_6,
    Llm.CLAUDE_SONNET_4_6,
)

OPENAI_ONLY_MODELS = (
    Llm.GPT_5_2_CODEX_HIGH,
    Llm.GPT_5_2_CODEX_MEDIUM,
)


================================================
FILE: backend/routes/screenshot.py
================================================
import base64
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
import httpx
from urllib.parse import urlparse

router = APIRouter()


def normalize_url(url: str) -> str:
    """
    Normalize URL to ensure it has a proper protocol.
If no protocol is specified, default to https:// """ url = url.strip() # Parse the URL parsed = urlparse(url) # Check if we have a scheme if not parsed.scheme: # No scheme, add https:// url = f"https://{url}" elif parsed.scheme in ['http', 'https']: # Valid scheme, keep as is pass else: # Check if this might be a domain with port (like example.com:8080) # urlparse treats this as scheme:netloc, but we want to handle it as domain:port if ':' in url and not url.startswith(('http://', 'https://', 'ftp://', 'file://')): # Likely a domain:port without protocol url = f"https://{url}" else: # Invalid protocol raise ValueError(f"Unsupported protocol: {parsed.scheme}") return url def bytes_to_data_url(image_bytes: bytes, mime_type: str) -> str: base64_image = base64.b64encode(image_bytes).decode("utf-8") return f"data:{mime_type};base64,{base64_image}" async def capture_screenshot( target_url: str, api_key: str, device: str = "desktop" ) -> bytes: api_base_url = "https://api.screenshotone.com/take" params = { "access_key": api_key, "url": target_url, "full_page": "true", "device_scale_factor": "1", "format": "png", "block_ads": "true", "block_cookie_banners": "true", "block_trackers": "true", "cache": "false", "viewport_width": "342", "viewport_height": "684", } if device == "desktop": params["viewport_width"] = "1280" params["viewport_height"] = "832" async with httpx.AsyncClient(timeout=60) as client: response = await client.get(api_base_url, params=params) if response.status_code == 200 and response.content: return response.content else: raise Exception("Error taking screenshot") class ScreenshotRequest(BaseModel): url: str apiKey: str class ScreenshotResponse(BaseModel): url: str @router.post("/api/screenshot") async def app_screenshot(request: ScreenshotRequest): # Extract the URL from the request body url = request.url api_key = request.apiKey try: # Normalize the URL normalized_url = normalize_url(url) # Capture screenshot with normalized URL image_bytes = await 
capture_screenshot(normalized_url, api_key=api_key) # Convert the image bytes to a data url data_url = bytes_to_data_url(image_bytes, "image/png") return ScreenshotResponse(url=data_url) except ValueError as e: # Handle URL normalization errors raise HTTPException(status_code=500, detail=str(e)) except Exception as e: # Handle other errors raise HTTPException(status_code=500, detail=f"Error capturing screenshot: {str(e)}") ================================================ FILE: backend/run_evals.py ================================================ # Load environment variables first from dotenv import load_dotenv load_dotenv() import asyncio from evals.runner import run_image_evals async def main(): await run_image_evals() # async def text_main(): # OUTPUT_DIR = EVALS_DIR + "/outputs" # GENERAL_TEXT_V1 = [ # "Login form", # "Simple notification", # "button", # "saas dashboard", # "landing page for barber shop", # ] # tasks: list[Coroutine[Any, Any, str]] = [] # for prompt in GENERAL_TEXT_V1: # for n in range(N): # Generate N tasks for each input # if n == 0: # task = generate_code_for_text( # text=prompt, # stack=STACK, # model=Llm.CLAUDE_4_5_SONNET_2025_09_29, # ) # else: # task = generate_code_for_text( # text=prompt, stack=STACK, model=Llm.GPT_4_1_2025_04_14 # ) # tasks.append(task) # print(f"Generating {len(tasks)} codes") # results = await asyncio.gather(*tasks) # os.makedirs(OUTPUT_DIR, exist_ok=True) # for i, content in enumerate(results): # # Calculate index for filename and output number # eval_index = i // N # output_number = i % N # filename = GENERAL_TEXT_V1[eval_index] # # File name is derived from the original filename in evals with an added output number # output_filename = f"{os.path.splitext(filename)[0]}_{output_number}.html" # output_filepath = os.path.join(OUTPUT_DIR, output_filename) # with open(output_filepath, "w") as file: # file.write(content) if __name__ == "__main__": asyncio.run(main()) ================================================ FILE: 
backend/run_image_generation_evals.py ================================================ import asyncio import os from typing import List, Optional, Literal from dotenv import load_dotenv import aiohttp from image_generation.generation import process_tasks EVALS = [ "Romantic Background", "Company logo: A stylized green sprout emerging from a circle", "Placeholder image of a PDF cover with abstract design", "A complex bubble diagram showing various interconnected features and aspects of FestivalPro, with a large central bubble surrounded by smaller bubbles of different colors representing different categories and functionalities", "A vibrant, abstract visualization of the RhythmRise experience ecosystem, featuring interconnected neon elements representing music, technology, and human connection", "Banner with text 'LiblibAI学院 课程入口'", "Profile picture of Pierre-Louis Labonne", "Two hands holding iPhone 14 models with colorful displays", "Portrait of a woman with long dark hair smiling at the camera", "Threadless logo on a gradient background from light pink to coral", "Jordan Schlansky Shows Conan His Favorite Nose Hair Trimmer", "Team Coco", "Intro to Large Language Models", "Andrej Karpathy", "He built a $200 million toy company", "CNBC International", "What will happen in year three of the war?", "Channel", "This is it", "How ASML Dominates Chip Machines", ] # Load environment variables load_dotenv() # Get API keys from environment variables OPENAI_API_KEY: Optional[str] = os.getenv("OPENAI_API_KEY") REPLICATE_API_TOKEN: Optional[str] = os.getenv("REPLICATE_API_TOKEN") # Directory to save generated images OUTPUT_DIR: str = "generated_images" async def generate_and_save_images( prompts: List[str], model: Literal["dalle3", "flux"], api_key: Optional[str], ) -> None: # Ensure the output directory exists os.makedirs(OUTPUT_DIR, exist_ok=True) if api_key is None: raise ValueError(f"API key for {model} is not set in the environment variables") # Generate images results: 
List[Optional[str]] = await process_tasks( prompts, api_key, None, model=model ) # Save images to disk async with aiohttp.ClientSession() as session: for i, image_url in enumerate(results): if image_url: # Get the image data async with session.get(image_url) as response: image_data: bytes = await response.read() # Save the image with a filename based on the input eval prefix = "replicate_" if model == "flux" else "dalle3_" filename: str = ( f"{prefix}{prompts[i][:50].replace(' ', '_').replace(':', '')}.png" ) filepath: str = os.path.join(OUTPUT_DIR, filename) with open(filepath, "wb") as f: f.write(image_data) print(f"Saved {model} image: {filepath}") else: print(f"Failed to generate {model} image for prompt: {prompts[i]}") async def main() -> None: # await generate_and_save_images(EVALS, "dalle3", OPENAI_API_KEY) await generate_and_save_images(EVALS, "flux", REPLICATE_API_TOKEN) if __name__ == "__main__": asyncio.run(main()) ================================================ FILE: backend/start.py ================================================ import argparse import uvicorn if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--port", type=int, default=7001) args = parser.parse_args() uvicorn.run("main:app", port=args.port, reload=True) ================================================ FILE: backend/tests/__init__.py ================================================ ================================================ FILE: backend/tests/test_agent_tool_runtime.py ================================================ import pytest from agent.state import AgentFileState from agent.tools.runtime import AgentToolRuntime from agent.tools.types import ToolCall def test_edit_file_returns_structured_result_with_diff() -> None: runtime = AgentToolRuntime( file_state=AgentFileState( path="index.html", content="
before
\n

keep

\n",
        ),
        should_generate_images=False,
        openai_api_key=None,
        openai_base_url=None,
    )
    result = runtime._edit_file(
        {
            "old_text": "
before
",
            "new_text": "
after
",
        }
    )
    assert result.ok is True
    assert result.updated_content == "
after
\n

keep

\n"
    assert result.result["content"] == "Successfully edited file at index.html."
    assert set(result.result["details"].keys()) == {"diff", "firstChangedLine"}
    assert result.result["details"]["firstChangedLine"] == 1
    assert "--- index.html" in result.result["details"]["diff"]
    assert "+++ index.html" in result.result["details"]["diff"]
    assert "-
before
" in result.result["details"]["diff"]
    assert "+
after
" in result.result["details"]["diff"]
    assert result.summary["firstChangedLine"] == 1
    assert result.summary["diff"] == result.result["details"]["diff"]


@pytest.mark.asyncio
async def test_execute_edit_file_uses_updated_result_shape() -> None:
    # NOTE(review): the file-content literal here also looks extraction-mangled
    # (tags likely stripped) — verify against the original file.
    runtime = AgentToolRuntime(
        file_state=AgentFileState(path="index.html", content="
old
"),
        should_generate_images=False,
        openai_api_key=None,
        openai_base_url=None,
    )
    result = await runtime.execute(
        ToolCall(
            id="call-1",
            name="edit_file",
            arguments={"old_text": "old", "new_text": "new"},
        )
    )
    # execute() is sync for edit_file and should preserve the structured payload.
    assert result.ok is True
    assert result.result["content"] == "Successfully edited file at index.html."
    assert set(result.result["details"].keys()) == {"diff", "firstChangedLine"}
    assert "--- index.html" in result.result["details"]["diff"]


================================================
FILE: backend/tests/test_agent_tools.py
================================================
# Tests for canonical_tool_definitions: generate_images availability flag and
# the edit_file tool's description.
from agent.tools import canonical_tool_definitions


def test_canonical_tool_definitions_include_generate_images_when_enabled() -> None:
    tool_names = [tool.name for tool in canonical_tool_definitions(True)]
    assert "generate_images" in tool_names


def test_canonical_tool_definitions_exclude_generate_images_when_disabled() -> None:
    tool_names = [tool.name for tool in canonical_tool_definitions(False)]
    assert "generate_images" not in tool_names


def test_edit_file_tool_description_matches_runtime_output_shape() -> None:
    edit_tool = next(
        tool for tool in canonical_tool_definitions(True) if tool.name == "edit_file"
    )
    assert "success message" in edit_tool.description
    assert "unified diff" in edit_tool.description


================================================
FILE: backend/tests/test_batching.py
================================================
# Tests that image generation and background removal run with bounded concurrency.
import asyncio

import pytest

from image_generation import generation
from agent.tools.runtime import AgentToolRuntime
from agent.tools.types import ToolCall
from agent.state import AgentFileState


@pytest.mark.asyncio
async def test_process_tasks_batches_replicate_calls(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    monkeypatch.setattr(generation, "REPLICATE_BATCH_SIZE", 3)
    concurrent = 0
    max_concurrent = 0

    # Stub generator that records the peak number of concurrent invocations.
    async def tracking_generate(prompt: str, api_key: str) -> str:
        nonlocal
concurrent, max_concurrent
        concurrent += 1
        max_concurrent = max(max_concurrent, concurrent)
        await asyncio.sleep(0.01)
        concurrent -= 1
        return f"url-for-{prompt}"

    monkeypatch.setattr(generation, "generate_image_replicate", tracking_generate)

    prompts = [f"prompt-{i}" for i in range(7)]
    results = await generation.process_tasks(prompts, "key", None, "flux")

    assert len(results) == 7
    assert results == [f"url-for-prompt-{i}" for i in range(7)]
    # Concurrency must never exceed the patched batch size of 3.
    assert max_concurrent <= 3


@pytest.mark.asyncio
async def test_remove_background_batches_calls(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    monkeypatch.setattr("agent.tools.runtime.REPLICATE_API_KEY", "fake-key")
    concurrent = 0
    max_concurrent = 0

    # Stub background remover that records peak concurrency.
    async def tracking_remove_bg(image_url: str, api_token: str) -> str:
        nonlocal concurrent, max_concurrent
        concurrent += 1
        max_concurrent = max(max_concurrent, concurrent)
        await asyncio.sleep(0.01)
        concurrent -= 1
        return f"nobg-{image_url}"

    monkeypatch.setattr("agent.tools.runtime.remove_background", tracking_remove_bg)

    runtime = AgentToolRuntime(
        file_state=AgentFileState(),
        should_generate_images=True,
        openai_api_key=None,
        openai_base_url=None,
    )
    urls = [f"https://example.com/img-{i}.png" for i in range(25)]
    result = await runtime.execute(
        ToolCall(id="test", name="remove_background", arguments={"image_urls": urls})
    )

    assert result.ok
    assert len(result.result["images"]) == 25
    assert all(r["status"] == "ok" for r in result.result["images"])
    # 25 URLs submitted, but at most 20 in flight at once.
    assert max_concurrent <= 20


================================================
FILE: backend/tests/test_codegen_utils.py
================================================
from codegen.utils import extract_html_content


def test_extract_html_content_from_wrapped_file_tag() -> None:
    # NOTE(review): the markup literals below appear to have had their HTML
    # tags stripped by the text extraction — verify against the original file.
    text = '\n

Hello

\n
'
    result = extract_html_content(text)
    assert result == "

Hello

"


================================================
FILE: backend/tests/test_evals_openai_input_compare.py
================================================
# Tests for the evals route's OpenAI-input comparison endpoint.
import pytest
from fastapi import HTTPException

from routes.evals import OpenAIInputCompareRequest, compare_openai_inputs_for_evals


@pytest.mark.asyncio
async def test_compare_openai_inputs_for_evals_returns_first_difference() -> None:
    response = await compare_openai_inputs_for_evals(
        OpenAIInputCompareRequest(
            left_json=(
                '{"input":[{"role":"system","content":"A"},'
                '{"role":"user","content":"Build dashboard"}]}'
            ),
            right_json=(
                '{"input":[{"role":"system","content":"A"},'
                '{"role":"user","content":"Build landing page"}]}'
            ),
        )
    )

    assert response.common_prefix_items == 1
    assert response.left_item_count == 2
    assert response.right_item_count == 2
    assert response.difference is not None
    assert response.difference.path == "input[1].content"
    assert response.difference.left_value == "Build dashboard"
    assert response.difference.right_value == "Build landing page"


@pytest.mark.asyncio
async def test_compare_openai_inputs_for_evals_rejects_invalid_json() -> None:
    with pytest.raises(HTTPException) as error_info:
        await compare_openai_inputs_for_evals(
            OpenAIInputCompareRequest(
                left_json='{"input": [',
                right_json='{"input": []}',
            )
        )

    assert error_info.value.status_code == 400
    assert "Invalid left JSON" in error_info.value.detail


================================================
FILE: backend/tests/test_image_generation_replicate.py
================================================
# Tests for the Replicate helpers: _extract_output_url normalization,
# call_replicate model routing, and remove_background version pinning.
import pytest

from image_generation import replicate


def test_extract_output_url_from_string() -> None:
    assert (
        replicate._extract_output_url("https://example.com/image.png", "test")
        == "https://example.com/image.png"
    )


def test_extract_output_url_from_dict() -> None:
    assert (
        replicate._extract_output_url({"url": "https://example.com/image.png"}, "test")
        == "https://example.com/image.png"
    )


def test_extract_output_url_from_list() -> None:
    assert (
        replicate._extract_output_url(["https://example.com/image.png"], "test")
        == "https://example.com/image.png"
    )


def test_extract_output_url_from_list_item_dict() -> None:
    assert (
        replicate._extract_output_url(
            [{"url": "https://example.com/image.png"}], "test"
        )
        == "https://example.com/image.png"
    )


def test_extract_output_url_invalid_raises() -> None:
    with pytest.raises(ValueError):
        replicate._extract_output_url([], "test")


@pytest.mark.asyncio
async def test_call_replicate_uses_flux_model(monkeypatch: pytest.MonkeyPatch) -> None:
    captured: dict[str, object] = {}

    # Capture the arguments call_replicate forwards to call_replicate_model.
    async def fake_call_replicate_model(
        model_path: str, input: dict[str, object], api_token: str
    ) -> list[str]:
        captured["model_path"] = model_path
        captured["input"] = input
        captured["api_token"] = api_token
        return ["https://example.com/flux.png"]

    monkeypatch.setattr(replicate, "call_replicate_model", fake_call_replicate_model)

    result = await replicate.call_replicate({"prompt": "test", "seed": 1}, "token-123")

    assert result == "https://example.com/flux.png"
    assert captured["model_path"] == replicate.FLUX_MODEL_PATH
    assert captured["api_token"] == "token-123"


@pytest.mark.asyncio
async def test_remove_background_uses_version_and_normalizes_output(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    captured: dict[str, object] = {}

    # Capture the arguments remove_background forwards to call_replicate_version.
    async def fake_call_replicate_version(
        version: str, input: dict[str, object], api_token: str
    ) -> dict[str, str]:
        captured["version"] = version
        captured["input"] = input
        captured["api_token"] = api_token
        return {"url": "https://example.com/no-bg.png"}

    monkeypatch.setattr(replicate, "call_replicate_version", fake_call_replicate_version)

    result = await replicate.remove_background("https://example.com/input.png", "token")

    assert result == "https://example.com/no-bg.png"
    assert captured["version"] == replicate.REMOVE_BACKGROUND_VERSION
    assert captured["api_token"] == "token"


================================================
FILE: backend/tests/test_model_selection.py
================================================
# Tests for ModelSelectionStage.select_models across API-key combinations
# and input modes (text / image / video, create / update).
import pytest
from unittest.mock import AsyncMock

from routes.generate_code import ModelSelectionStage
from llm import Llm


class TestModelSelectionAllKeys:
    """Test model selection when Gemini, Anthropic, and OpenAI API keys are present."""

    def setup_method(self):
        """Set up test fixtures."""
        mock_throw_error = AsyncMock()
        self.model_selector = ModelSelectionStage(mock_throw_error)

    @pytest.mark.asyncio
    async def test_gemini_anthropic_create(self):
        """All keys: fixed order for four variants."""
        models = await self.model_selector.select_models(
            generation_type="create",
            input_mode="text",
            openai_api_key="key",
            anthropic_api_key="key",
            gemini_api_key="key",
        )
        expected = [
            Llm.GEMINI_3_FLASH_PREVIEW_MINIMAL,
            Llm.GPT_5_2_CODEX_HIGH,
            Llm.CLAUDE_OPUS_4_6,
            Llm.GEMINI_3_1_PRO_PREVIEW_LOW,
        ]
        assert models == expected

    @pytest.mark.asyncio
    async def test_gemini_anthropic_update_text(self):
        """All keys text update: uses two fast edit variants."""
        models = await self.model_selector.select_models(
            generation_type="update",
            input_mode="text",
            openai_api_key="key",
            anthropic_api_key="key",
            gemini_api_key="key",
        )
        expected = [
            Llm.GEMINI_3_FLASH_PREVIEW_MINIMAL,
            Llm.GPT_5_4_2026_03_05_LOW,
        ]
        assert models == expected

    @pytest.mark.asyncio
    async def test_gemini_anthropic_update(self):
        """All keys image update: uses two fast edit variants."""
        models = await self.model_selector.select_models(
            generation_type="update",
            input_mode="image",
            openai_api_key="key",
            anthropic_api_key="key",
            gemini_api_key="key",
        )
        expected = [
            Llm.GEMINI_3_FLASH_PREVIEW_MINIMAL,
            Llm.GPT_5_4_2026_03_05_LOW,
        ]
        assert models == expected

    @pytest.mark.asyncio
    async def test_video_create_prefers_gemini_minimal_then_3_1_high(self):
        """Video create always uses two Gemini variants in fixed order."""
        models = await self.model_selector.select_models(
            generation_type="create",
            input_mode="video",
            openai_api_key="key",
            anthropic_api_key="key",
            gemini_api_key="key",
        )
        expected = [
            Llm.GEMINI_3_FLASH_PREVIEW_MINIMAL,
            Llm.GEMINI_3_1_PRO_PREVIEW_HIGH,
        ]
        assert models == expected

    @pytest.mark.asyncio
    async def test_video_update_prefers_gemini_minimal_then_3_1_high(self):
        """Video update always uses the same two Gemini variants as video create."""
        models = await self.model_selector.select_models(
            generation_type="update",
            input_mode="video",
            openai_api_key="key",
            anthropic_api_key="key",
            gemini_api_key="key",
        )
        expected = [
            Llm.GEMINI_3_FLASH_PREVIEW_MINIMAL,
            Llm.GEMINI_3_1_PRO_PREVIEW_HIGH,
        ]
        assert models == expected


class TestModelSelectionOpenAIAnthropic:
    """Test model selection when only OpenAI and Anthropic keys are present."""

    def setup_method(self):
        """Set up test fixtures."""
        mock_throw_error = AsyncMock()
        self.model_selector = ModelSelectionStage(mock_throw_error)

    @pytest.mark.asyncio
    async def test_openai_anthropic(self):
        """OpenAI + Anthropic: Claude Opus 4.6, GPT 5.2 Codex (high/medium), cycling"""
        models = await self.model_selector.select_models(
            generation_type="create",
            input_mode="text",
            openai_api_key="key",
            anthropic_api_key="key",
            gemini_api_key=None,
        )
        expected = [
            Llm.CLAUDE_OPUS_4_6,
            Llm.GPT_5_2_CODEX_HIGH,
            Llm.GPT_5_2_CODEX_MEDIUM,
            Llm.CLAUDE_OPUS_4_6,
        ]
        assert models == expected


class TestModelSelectionAnthropicOnly:
    """Test model selection when only Anthropic key is present."""

    def setup_method(self):
        """Set up test fixtures."""
        mock_throw_error = AsyncMock()
        self.model_selector = ModelSelectionStage(mock_throw_error)

    @pytest.mark.asyncio
    async def test_anthropic_only(self):
        """Anthropic only: Claude Opus 4.6 and Claude Sonnet 4.6 cycling"""
        models = await self.model_selector.select_models(
            generation_type="create",
            input_mode="text",
            openai_api_key=None,
            anthropic_api_key="key",
            gemini_api_key=None,
        )
        expected = [
            Llm.CLAUDE_OPUS_4_6,
            Llm.CLAUDE_SONNET_4_6,
            Llm.CLAUDE_OPUS_4_6,
            Llm.CLAUDE_SONNET_4_6,
        ]
        assert models == expected


class TestModelSelectionOpenAIOnly:
    """Test model selection when only OpenAI key is present."""

    def setup_method(self):
        """Set up test fixtures."""
        mock_throw_error = AsyncMock()
        self.model_selector = ModelSelectionStage(mock_throw_error)

    @pytest.mark.asyncio
    async def test_openai_only(self):
        """OpenAI only: GPT 5.2 Codex (high/medium) only"""
        models = await self.model_selector.select_models(
            generation_type="create",
            input_mode="text",
            openai_api_key="key",
            anthropic_api_key=None,
            gemini_api_key=None,
        )
        expected = [
            Llm.GPT_5_2_CODEX_HIGH,
            Llm.GPT_5_2_CODEX_MEDIUM,
            Llm.GPT_5_2_CODEX_HIGH,
            Llm.GPT_5_2_CODEX_MEDIUM,
        ]
        assert models == expected


class TestModelSelectionNoKeys:
    """Test model selection when no API keys are present."""

    def setup_method(self):
        """Set up test fixtures."""
        mock_throw_error = AsyncMock()
        self.model_selector = ModelSelectionStage(mock_throw_error)

    @pytest.mark.asyncio
    async def test_no_keys_raises_error(self):
        """No keys: Should raise an exception"""
        with pytest.raises(Exception, match="No API key"):
            await self.model_selector.select_models(
                generation_type="create",
                input_mode="text",
                openai_api_key=None,
                anthropic_api_key=None,
                gemini_api_key=None,
            )


================================================
FILE: backend/tests/test_openai_input_compare.py
================================================
# Tests for compare_openai_inputs / format_openai_input_comparison.
from typing import Any

from fs_logging.openai_input_compare import (
    compare_openai_inputs,
    format_openai_input_comparison,
)


def test_compare_openai_inputs_returns_none_for_identical_inputs() -> None:
    payload: dict[str, Any] = {
        "input": [
            {"role": "system", "content": "You are a coding agent."},
            {"role": "user", "content": "Build a dashboard."},
        ]
    }

    comparison = compare_openai_inputs(payload, payload)

    assert comparison.common_prefix_items == 2
    assert comparison.left_item_count == 2
    assert comparison.right_item_count == 2
    assert comparison.difference is None
    assert "difference=none" in format_openai_input_comparison(comparison)


def test_compare_openai_inputs_finds_first_different_block_and_field() -> None:
    left_payload: dict[str,
Any] = {
        "input": [
            {"role": "system", "content": "You are a coding agent."},
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": "Build a dashboard."},
                    {
                        "type": "input_image",
                        "image_url": "data:image/png;base64,left",
                        "detail": "original",
                    },
                ],
            },
        ]
    }
    right_payload: dict[str, Any] = {
        "input": [
            {"role": "system", "content": "You are a coding agent."},
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": "Build a dashboard."},
                    {
                        "type": "input_image",
                        "image_url": "data:image/png;base64,right",
                        "detail": "original",
                    },
                ],
            },
        ]
    }

    comparison = compare_openai_inputs(left_payload, right_payload)

    assert comparison.common_prefix_items == 1
    assert comparison.difference is not None
    assert comparison.difference.item_index == 1
    assert comparison.difference.path == "input[1].content[1].image_url"
    assert comparison.difference.left_value == "data:image/png;base64,left"
    assert comparison.difference.right_value == "data:image/png;base64,right"


def test_compare_openai_inputs_accepts_raw_input_arrays() -> None:
    left_input: list[dict[str, Any]] = [
        {"role": "system", "content": "You are a coding agent."},
        {"role": "user", "content": "Build a dashboard."},
    ]
    right_input: list[dict[str, Any]] = [
        {"role": "system", "content": "You are a coding agent."},
        {"role": "user", "content": "Build a landing page."},
    ]

    comparison = compare_openai_inputs(left_input, right_input)
    formatted = format_openai_input_comparison(comparison)

    assert comparison.difference is not None
    assert comparison.difference.path == "input[1].content"
    assert "first_different_item_index=1" in formatted
    assert "first_different_path=input[1].content" in formatted


================================================
FILE: backend/tests/test_openai_provider_session.py
================================================
# Tests for OpenAIProviderSession request construction against a fake client:
# prompt-cache parameter handling, reasoning effort per model, and multi-turn
# input growth after tool results are appended.
import copy
from typing import Any

import pytest

from agent.providers.base import ExecutedToolCall, ProviderTurn
from agent.providers.openai import OpenAIProviderSession
from agent.tools import
 ToolCall, ToolExecutionResult
from llm import Llm


class _EmptyAsyncStream:
    # Async iterator that yields no items.
    def __aiter__(self) -> "_EmptyAsyncStream":
        return self

    async def __anext__(self) -> object:
        raise StopAsyncIteration


class _FakeResponses:
    # Records a deep copy of every create() call's kwargs in self.calls.
    def __init__(self) -> None:
        self.calls: list[dict[str, Any]] = []

    async def create(self, **kwargs: Any) -> _EmptyAsyncStream:
        self.calls.append(copy.deepcopy(kwargs))
        return _EmptyAsyncStream()


class _FakeOpenAIClient:
    # Minimal stand-in for the OpenAI async client.
    def __init__(self) -> None:
        self.responses = _FakeResponses()

    async def close(self) -> None:
        return None


async def _noop_event_sink(_: Any) -> None:
    return None


def _test_tools() -> list[dict[str, Any]]:
    # Single strict function tool used by all session tests.
    return [
        {
            "type": "function",
            "name": "edit_file",
            "description": "Apply an edit.",
            "parameters": {
                "type": "object",
                "properties": {
                    "path": {"type": "string"},
                },
                "required": ["path"],
            },
            "strict": True,
        }
    ]


@pytest.mark.asyncio
async def test_openai_provider_session_omits_prompt_cache_key_across_turns() -> None:
    client = _FakeOpenAIClient()
    session = OpenAIProviderSession(
        client=client,  # type: ignore[arg-type]
        model=Llm.GPT_5_2_CODEX_HIGH,
        prompt_messages=[{"role": "user", "content": "Build a landing page."}],
        tools=_test_tools(),
    )

    first_turn = await session.stream_turn(_noop_event_sink)
    session.append_tool_results(
        ProviderTurn(
            assistant_text=first_turn.assistant_text,
            tool_calls=[],
            assistant_turn=[
                {
                    "type": "function_call",
                    "call_id": "call-1",
                    "name": "edit_file",
                    "arguments": '{"path":"index.html"}',
                }
            ],
        ),
        [
            ExecutedToolCall(
                tool_call=ToolCall(
                    id="call-1",
                    name="edit_file",
                    arguments={"path": "index.html"},
                ),
                result=ToolExecutionResult(
                    ok=True,
                    result={
                        "content": "Successfully edited file at index.html.",
                        "details": {
                            "diff": "--- index.html\n+++ index.html\n@@ -1 +1 @@\n-a\n+b\n",
                            "firstChangedLine": 1,
                        },
                    },
                    summary={"content": "Successfully edited file at index.html."},
                ),
            )
        ],
    )
    await session.stream_turn(_noop_event_sink)

    first_call = client.responses.calls[0]
    second_call = client.responses.calls[1]
    first_input =
 first_call["input"]
    second_input = second_call["input"]

    assert "prompt_cache_key" not in first_call
    assert "prompt_cache_key" not in second_call
    assert "prompt_cache_retention" not in first_call
    assert "prompt_cache_retention" not in second_call
    assert isinstance(first_input, list)
    assert isinstance(second_input, list)
    # Appending tool results must grow the input for the second turn.
    assert len(second_input) > len(first_input)


@pytest.mark.asyncio
async def test_openai_provider_session_omits_prompt_cache_key_for_all_prompts() -> None:
    first_client = _FakeOpenAIClient()
    second_client = _FakeOpenAIClient()
    different_prompt_client = _FakeOpenAIClient()

    first_session = OpenAIProviderSession(
        client=first_client,  # type: ignore[arg-type]
        model=Llm.GPT_5_2_CODEX_HIGH,
        prompt_messages=[{"role": "user", "content": "Build a landing page."}],
        tools=_test_tools(),
    )
    second_session = OpenAIProviderSession(
        client=second_client,  # type: ignore[arg-type]
        model=Llm.GPT_5_2_CODEX_HIGH,
        prompt_messages=[{"role": "user", "content": "Build a landing page."}],
        tools=_test_tools(),
    )
    different_prompt_session = OpenAIProviderSession(
        client=different_prompt_client,  # type: ignore[arg-type]
        model=Llm.GPT_5_2_CODEX_HIGH,
        prompt_messages=[{"role": "user", "content": "Build a dashboard."}],
        tools=_test_tools(),
    )

    await first_session.stream_turn(_noop_event_sink)
    await second_session.stream_turn(_noop_event_sink)
    await different_prompt_session.stream_turn(_noop_event_sink)

    assert "prompt_cache_key" not in first_client.responses.calls[0]
    assert "prompt_cache_key" not in second_client.responses.calls[0]
    assert "prompt_cache_key" not in different_prompt_client.responses.calls[0]


@pytest.mark.asyncio
async def test_openai_provider_session_uses_gpt_5_4_none_reasoning_effort() -> None:
    client = _FakeOpenAIClient()
    session = OpenAIProviderSession(
        client=client,  # type: ignore[arg-type]
        model=Llm.GPT_5_4_2026_03_05_NONE,
        prompt_messages=[{"role": "user", "content": "Build a dashboard."}],
        tools=_test_tools(),
    )

    await session.stream_turn(_noop_event_sink)

    first_call = client.responses.calls[0]
    assert first_call["model"] == "gpt-5.4-2026-03-05"
    assert first_call["prompt_cache_retention"] == "24h"
    assert first_call["reasoning"] == {"effort": "none", "summary": "auto"}


@pytest.mark.asyncio
async def test_openai_provider_session_uses_gpt_5_4_high_reasoning_effort() -> None:
    client = _FakeOpenAIClient()
    session = OpenAIProviderSession(
        client=client,  # type: ignore[arg-type]
        model=Llm.GPT_5_4_2026_03_05_HIGH,
        prompt_messages=[{"role": "user", "content": "Build a dashboard."}],
        tools=_test_tools(),
    )

    await session.stream_turn(_noop_event_sink)

    first_call = client.responses.calls[0]
    assert first_call["model"] == "gpt-5.4-2026-03-05"
    assert first_call["prompt_cache_retention"] == "24h"
    assert first_call["reasoning"] == {"effort": "high", "summary": "auto"}


================================================
FILE: backend/tests/test_openai_reasoning_parser.py
================================================
# Tests for the OpenAI Responses stream parser (reasoning-summary event
# handling) and message -> Responses-input conversion.
import pytest

from agent.providers.openai import (
    OpenAIResponsesParseState,
    _convert_message_to_responses_input,
    parse_event,
)
from agent.providers.types import StreamEvent


@pytest.mark.asyncio
async def test_reasoning_summary_part_skipped_after_summary_delta() -> None:
    state = OpenAIResponsesParseState()
    events: list[StreamEvent] = []

    async def on_event(event: StreamEvent) -> None:
        events.append(event)

    await parse_event(
        {"type": "response.reasoning_summary_text.delta", "delta": "Planning step."},
        state,
        on_event,
    )
    await parse_event(
        {
            "type": "response.reasoning_summary_part.done",
            "part": {"text": "Planning step."},
        },
        state,
        on_event,
    )

    # The part.done event must not re-emit text already streamed via the delta.
    thinking_text = [event.text for event in events if event.type == "thinking_delta"]
    assert thinking_text == ["Planning step."]


@pytest.mark.asyncio
async def test_reasoning_summary_part_added_and_done_emits_once() -> None:
    state = OpenAIResponsesParseState()
    events: list[StreamEvent] = []

    async def on_event(event: StreamEvent) -> None:
        events.append(event)

    await parse_event(
        {
            "type":
"response.reasoning_summary_part.added", "part": {"text": "Refining layout and assets."}, }, state, on_event, ) await parse_event( { "type": "response.reasoning_summary_part.done", "part": {"text": "Refining layout and assets."}, }, state, on_event, ) thinking_text = [event.text for event in events if event.type == "thinking_delta"] assert thinking_text == ["Refining layout and assets."] def test_convert_image_url_defaults_to_high_detail() -> None: message = { "role": "user", "content": [ {"type": "image_url", "image_url": {"url": "data:image/png;base64,abc"}}, ], } result = _convert_message_to_responses_input(message) # type: ignore image_part = result["content"][0] assert image_part["detail"] == "high" def test_convert_image_url_preserves_explicit_detail() -> None: message = { "role": "user", "content": [ { "type": "image_url", "image_url": {"url": "data:image/png;base64,abc", "detail": "low"}, }, ], } result = _convert_message_to_responses_input(message) # type: ignore image_part = result["content"][0] assert image_part["detail"] == "low" ================================================ FILE: backend/tests/test_openai_turn_input_logging.py ================================================ from pathlib import Path from agent.providers.token_usage import TokenUsage from fs_logging.openai_turn_inputs import OpenAITurnInputLogger from llm import Llm def test_openai_turn_input_logger_writes_html_report(tmp_path, monkeypatch) -> None: monkeypatch.setenv("LOGS_PATH", str(tmp_path)) logger = OpenAITurnInputLogger(model=Llm.GPT_5_2_CODEX_LOW, enabled=True) logger.record_turn_input( [ { "role": "user", "content": "Build this page", }, { "type": "function_call", "name": "read_file", "call_id": "call-1", "arguments": '{"path":"/tmp/example.txt"}', }, ] ) logger.record_turn_usage( TokenUsage( input=1200, output=300, cache_read=600, total=2100, ) ) report_path = logger.write_html_report() assert report_path is not None report = Path(report_path) assert report.exists() assert 
report.parent == tmp_path / "run_logs" html = report.read_text(encoding="utf-8") assert "OpenAI Turn Input Report" in html assert "Turn 1 (items=2)" in html assert "Build this page" in html assert "read_file" in html assert "Input tokens" in html assert "1200" in html assert "Cache hit rate" in html assert "33.33%" in html assert "Cost" in html assert "$" in html def test_openai_turn_input_logger_preserves_full_large_payloads( tmp_path, monkeypatch ) -> None: monkeypatch.setenv("LOGS_PATH", str(tmp_path)) logger = OpenAITurnInputLogger(model=Llm.GPT_5_3_CODEX_LOW, enabled=True) logger.record_turn_input( [ { "role": "user", "content": [{"type": "input_text", "text": "BEGIN-" + ("x" * 450) + "-END"}], } ] ) report_path = logger.write_html_report() assert report_path is not None html = Path(report_path).read_text(encoding="utf-8") assert "Usage unavailable for this turn." in html assert "Raw JSON payload" in html assert "string (460 chars)" in html assert "BEGIN-" in html assert "-END" in html assert "truncated 50 chars" not in html def test_openai_turn_input_logger_includes_request_payload( tmp_path, monkeypatch ) -> None: monkeypatch.setenv("LOGS_PATH", str(tmp_path)) logger = OpenAITurnInputLogger(model=Llm.GPT_5_2_CODEX_HIGH, enabled=True) logger.record_turn_input( [ { "role": "user", "content": "Build this page", } ], request_payload={ "model": "gpt-5.2-codex", "input": [{"role": "user", "content": "Build this page"}], }, ) report_path = logger.write_html_report() assert report_path is not None html = Path(report_path).read_text(encoding="utf-8") assert "Request payload" in html assert "Copy input JSON" in html assert "request-input-turn-1" in html def test_openai_turn_input_logger_disabled_writes_nothing(tmp_path, monkeypatch) -> None: monkeypatch.setenv("LOGS_PATH", str(tmp_path)) logger = OpenAITurnInputLogger(model=Llm.GPT_5_2_CODEX_LOW) logger.record_turn_input([{"role": "user", "content": "Build this page"}]) logger.record_turn_usage(TokenUsage(input=100, 
output=50, total=150)) report_path = logger.write_html_report() assert report_path is None assert not (tmp_path / "run_logs").exists() def test_openai_turn_input_logger_summarizes_function_call_output( tmp_path, monkeypatch ) -> None: monkeypatch.setenv("LOGS_PATH", str(tmp_path)) logger = OpenAITurnInputLogger(model=Llm.GPT_5_2_CODEX_LOW, enabled=True) logger.record_turn_input( [ { "type": "function_call_output", "call_id": "call-1", "output": ( '{"content":"Successfully edited file at index.html.",' '"details":{"diff":"--- index.html\\n+++ index.html\\n@@ -1 +1 @@\\n-a\\n+b\\n",' '"firstChangedLine":1}}' ), } ] ) report_path = logger.write_html_report() assert report_path is not None html = Path(report_path).read_text(encoding="utf-8") assert "type=function_call_output call_id=call-1" in html assert "path=index.html" in html assert "first_changed_line=1" in html assert "diff_chars=" in html assert 'preview='{"content":' not in html ================================================ FILE: backend/tests/test_parameter_extraction_stage.py ================================================ from unittest.mock import AsyncMock import pytest from routes.generate_code import ParameterExtractionStage @pytest.mark.asyncio async def test_extracts_gemini_api_key_from_settings_dialog() -> None: stage = ParameterExtractionStage(AsyncMock()) extracted = await stage.extract_and_validate( { "generatedCodeConfig": "html_tailwind", "inputMode": "text", "openAiApiKey": "", "anthropicApiKey": "", "geminiApiKey": "gemini-from-ui", "prompt": {"text": "hello"}, } ) assert extracted.gemini_api_key == "gemini-from-ui" @pytest.mark.asyncio async def test_extracts_gemini_api_key_from_env_when_not_in_request(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setattr("routes.generate_code.GEMINI_API_KEY", "gemini-from-env") stage = ParameterExtractionStage(AsyncMock()) extracted = await stage.extract_and_validate( { "generatedCodeConfig": "html_tailwind", "inputMode": "text", "prompt": 
{"text": "hello"}, } ) assert extracted.gemini_api_key == "gemini-from-env" ================================================ FILE: backend/tests/test_prompt_summary.py ================================================ import io import sys from typing import cast from openai.types.chat import ChatCompletionMessageParam from utils import ( format_prompt_preview, format_prompt_summary, print_prompt_preview, print_prompt_summary, ) def test_format_prompt_summary(): messages = [ {"role": "system", "content": "lorem ipsum dolor sit amet"}, { "role": "user", "content": [ {"type": "text", "text": "hello world"}, { "type": "image_url", "image_url": {"url": "data:image/png;base64,AAA"}, }, { "type": "image_url", "image_url": {"url": "data:image/png;base64,BBB"}, }, ], }, ] summary = format_prompt_summary(messages) assert "SYSTEM: lorem ipsum" in summary assert "[2 images]" in summary def test_print_prompt_summary(): messages = [ {"role": "system", "content": "short message"}, {"role": "user", "content": "hello"}, ] # Capture stdout captured_output = io.StringIO() sys.stdout = captured_output print_prompt_summary(cast(list[ChatCompletionMessageParam], messages)) # Reset stdout sys.stdout = sys.__stdout__ output = captured_output.getvalue() # Check that output contains box characters and content assert "┌─" in output assert "└─" in output assert "PROMPT SUMMARY" in output assert "SYSTEM: short message" in output assert "USER: hello" in output def test_print_prompt_summary_long_content(): messages = [ {"role": "system", "content": "This is a very long system message that should be wrapped properly within the box boundaries"}, {"role": "user", "content": "short"}, ] # Capture stdout captured_output = io.StringIO() sys.stdout = captured_output print_prompt_summary(cast(list[ChatCompletionMessageParam], messages)) # Reset stdout sys.stdout = sys.__stdout__ output = captured_output.getvalue() lines = output.strip().split('\n') # Check that all lines have consistent box formatting 
for line in lines: if line.startswith('│') and line.endswith('│'): # All content lines should have same length assert len(line) == len(lines[0]) if lines[0].startswith('┌') else True # Check content is present assert "PROMPT SUMMARY" in output assert "SYSTEM:" in output assert "USER: short" in output def test_format_prompt_summary_no_truncate(): messages = [ {"role": "system", "content": "This is a very long message that would normally be truncated at 40 characters but should be shown in full"}, ] # Test with truncation (default) summary_truncated = format_prompt_summary( cast(list[ChatCompletionMessageParam], messages) ) assert "..." in summary_truncated assert len(summary_truncated.split(": ", 1)[1]) <= 50 # Role + truncated content # Test without truncation summary_full = format_prompt_summary( cast(list[ChatCompletionMessageParam], messages), truncate=False ) assert "..." not in summary_full assert "shown in full" in summary_full def test_print_prompt_summary_no_truncate(): messages = [ {"role": "system", "content": "This is a very long message that would normally be truncated but should be shown in full when truncate=False"}, ] # Capture stdout captured_output = io.StringIO() sys.stdout = captured_output print_prompt_summary( cast(list[ChatCompletionMessageParam], messages), truncate=False ) # Reset stdout sys.stdout = sys.__stdout__ output = captured_output.getvalue() # Check that full content is shown assert "shown in full when truncate=False" in output assert "..." not in output def test_format_prompt_preview_collapses_long_content(): long_code = "\n" + ("x" * 800) + "\n" messages = [ {"role": "system", "content": "short"}, {"role": "assistant", "content": long_code}, ] preview = format_prompt_preview( cast(list[ChatCompletionMessageParam], messages), max_chars_per_message=120 ) assert "1. SYSTEM" in preview assert "2. 
ASSISTANT" in preview assert "[collapsed " in preview def test_print_prompt_preview(): messages = [ {"role": "system", "content": "System message"}, { "role": "user", "content": [ {"type": "text", "text": "User request"}, {"type": "image_url", "image_url": {"url": "data:image/png;base64,AAA"}}, ], }, ] captured_output = io.StringIO() sys.stdout = captured_output print_prompt_preview(cast(list[ChatCompletionMessageParam], messages)) sys.stdout = sys.__stdout__ output = captured_output.getvalue() assert "PROMPT PREVIEW" in output assert "1. SYSTEM" in output assert "2. USER [1 media]" in output ================================================ FILE: backend/tests/test_prompts.py ================================================ import pytest from unittest.mock import patch, MagicMock import sys from typing import Any, Dict, List, TypedDict, cast from openai.types.chat import ChatCompletionMessageParam # Mock moviepy before importing prompts sys.modules["moviepy"] = MagicMock() sys.modules["moviepy.editor"] = MagicMock() from prompts.pipeline import build_prompt_messages from prompts.plan import derive_prompt_construction_plan from prompts.prompt_types import Stack # Type definitions for test structures class ExpectedResult(TypedDict): messages: List[ChatCompletionMessageParam] def assert_structure_match(actual: object, expected: object, path: str = "") -> None: """ Compare actual and expected structures with special markers: - : Matches any value - : Checks if the actual value contains 'text' Args: actual: The actual value to check expected: The expected value or pattern path: Current path in the structure (for error messages) """ if ( isinstance(expected, str) and expected.startswith("<") and expected.endswith(">") ): # Handle special markers if expected == "": # Match any value return elif expected.startswith(""): # Extract the text to search for search_text = expected[10:-1] # Remove "" assert isinstance( actual, str ), f"At {path}: expected string, got 
{type(actual).__name__}" assert ( search_text in actual ), f"At {path}: '{search_text}' not found in '{actual}'" return # Handle different types if isinstance(expected, dict): assert isinstance( actual, dict ), f"At {path}: expected dict, got {type(actual).__name__}" expected_dict: Dict[str, object] = expected actual_dict: Dict[str, object] = actual for key, value in expected_dict.items(): assert key in actual_dict, f"At {path}: key '{key}' not found in actual" assert_structure_match(actual_dict[key], value, f"{path}.{key}" if path else key) elif isinstance(expected, list): assert isinstance( actual, list ), f"At {path}: expected list, got {type(actual).__name__}" expected_list: List[object] = expected actual_list: List[object] = actual assert len(actual_list) == len( expected_list ), f"At {path}: list length mismatch (expected {len(expected_list)}, got {len(actual_list)})" for i, (a, e) in enumerate(zip(actual_list, expected_list)): assert_structure_match(a, e, f"{path}[{i}]") else: # Direct comparison for other types assert actual == expected, f"At {path}: expected {expected}, got {actual}" class TestCreatePrompt: """Test cases for create_prompt function.""" # Test data constants TEST_IMAGE_URL: str = "data:image/png;base64,test_image_data" RESULT_IMAGE_URL: str = "data:image/png;base64,result_image_data" MOCK_SYSTEM_PROMPT: str = "Mock HTML Tailwind system prompt" TEST_STACK: Stack = "html_tailwind" ENABLED_IMAGE_POLICY: str = ( "Image generation is enabled for this request. Use generate_images for " "missing assets when needed." 
) @staticmethod def wrapped_file(content: str) -> str: return f'\n{content}\n' def test_plan_create_uses_create_from_input(self) -> None: plan = derive_prompt_construction_plan( stack=self.TEST_STACK, input_mode="image", generation_type="create", history=[], file_state=None, ) assert plan["construction_strategy"] == "create_from_input" def test_plan_update_with_history_uses_history_strategy(self) -> None: plan = derive_prompt_construction_plan( stack=self.TEST_STACK, input_mode="image", generation_type="update", history=[{"role": "user", "text": "change", "images": [], "videos": []}], file_state=None, ) assert plan["construction_strategy"] == "update_from_history" def test_plan_update_without_history_uses_file_snapshot_strategy(self) -> None: plan = derive_prompt_construction_plan( stack=self.TEST_STACK, input_mode="image", generation_type="update", history=[], file_state={"path": "index.html", "content": ""}, ) assert plan["construction_strategy"] == "update_from_file_snapshot" @pytest.mark.asyncio async def test_image_mode_create_single_image(self) -> None: """Test create generation with single image in image mode.""" # Setup test data params: Dict[str, Any] = { "prompt": {"text": "", "images": [self.TEST_IMAGE_URL]}, "generationType": "create", } with patch( "prompts.system_prompt.SYSTEM_PROMPT", new=self.MOCK_SYSTEM_PROMPT, ): # Call the function messages = await build_prompt_messages( stack=self.TEST_STACK, input_mode="image", generation_type=params["generationType"], prompt=params["prompt"], history=params.get("history", []), ) # Define expected structure expected: ExpectedResult = { "messages": [ {"role": "system", "content": self.MOCK_SYSTEM_PROMPT}, { "role": "user", "content": [ { "type": "image_url", "image_url": { "url": self.TEST_IMAGE_URL, "detail": "high", }, }, { "type": "text", "text": "", }, ], }, ], } # Assert the structure matches actual: ExpectedResult = {"messages": messages} assert_structure_match(actual, expected) @pytest.mark.asyncio async 
def test_image_mode_create_with_image_generation_disabled(self) -> None: params: Dict[str, Any] = { "prompt": {"text": "", "images": [self.TEST_IMAGE_URL]}, "generationType": "create", } with patch("prompts.system_prompt.SYSTEM_PROMPT", new=self.MOCK_SYSTEM_PROMPT): messages = await build_prompt_messages( stack=self.TEST_STACK, input_mode="image", generation_type=params["generationType"], prompt=params["prompt"], history=[], image_generation_enabled=False, ) system_content = messages[0].get("content") assert isinstance(system_content, str) assert system_content == self.MOCK_SYSTEM_PROMPT user_content = messages[1].get("content") assert isinstance(user_content, list) text_part = next( ( part for part in user_content if isinstance(part, dict) and part.get("type") == "text" ), None, ) assert isinstance(text_part, dict) user_text = text_part.get("text") assert isinstance(user_text, str) assert "Image generation is disabled for this request. Do not call generate_images." in user_text @pytest.mark.asyncio async def test_image_mode_update_with_history(self) -> None: """Test update generation with conversation history in image mode.""" # Setup test data params: Dict[str, Any] = { "prompt": {"text": "", "images": [self.TEST_IMAGE_URL]}, "generationType": "update", "history": [ {"role": "assistant", "text": "Initial code", "images": [], "videos": []}, {"role": "user", "text": "Make the background blue", "images": [], "videos": []}, {"role": "assistant", "text": "Updated code", "images": [], "videos": []}, {"role": "user", "text": "Add a header", "images": [], "videos": []}, ], } with patch( "prompts.system_prompt.SYSTEM_PROMPT", new=self.MOCK_SYSTEM_PROMPT, ): # Call the function messages = await build_prompt_messages( stack=self.TEST_STACK, input_mode="image", generation_type=params["generationType"], prompt=params["prompt"], history=params.get("history", []), ) # Define expected structure expected: ExpectedResult = { "messages": [ { "role": "system", "content": 
self.MOCK_SYSTEM_PROMPT, }, { "role": "assistant", "content": self.wrapped_file("Initial code"), }, { "role": "user", "content": ( f"Selected stack: {self.TEST_STACK}.\n\n" f"{self.ENABLED_IMAGE_POLICY}\n\n" "Make the background blue" ), }, { "role": "assistant", "content": self.wrapped_file("Updated code"), }, {"role": "user", "content": "Add a header"}, ], } # Assert the structure matches actual: ExpectedResult = {"messages": messages} assert_structure_match(actual, expected) @pytest.mark.asyncio async def test_update_history_with_image_generation_disabled(self) -> None: with patch("prompts.system_prompt.SYSTEM_PROMPT", new=self.MOCK_SYSTEM_PROMPT): messages = await build_prompt_messages( stack=self.TEST_STACK, input_mode="image", generation_type="update", prompt={"text": "", "images": [self.TEST_IMAGE_URL], "videos": []}, history=[ {"role": "assistant", "text": "Initial code", "images": [], "videos": []}, {"role": "user", "text": "Make the background blue", "images": [], "videos": []}, {"role": "assistant", "text": "Updated code", "images": [], "videos": []}, ], image_generation_enabled=False, ) system_content = messages[0].get("content") assert isinstance(system_content, str) assert system_content == self.MOCK_SYSTEM_PROMPT first_user_content = messages[2].get("content") assert isinstance(first_user_content, str) assert "Selected stack: html_tailwind." in first_user_content assert "Image generation is disabled for this request. Do not call generate_images." 
in first_user_content assert "Make the background blue" in first_user_content @pytest.mark.asyncio async def test_text_mode_create_generation(self) -> None: """Test create generation from text description in text mode.""" # Setup test data text_description: str = "a modern landing page with hero section" params: Dict[str, Any] = { "prompt": { "text": text_description, "images": [] }, "generationType": "create" } with patch( "prompts.system_prompt.SYSTEM_PROMPT", new=self.MOCK_SYSTEM_PROMPT, ): # Call the function messages = await build_prompt_messages( stack=self.TEST_STACK, input_mode="text", generation_type=params["generationType"], prompt=params["prompt"], history=params.get("history", []), ) # Define expected structure expected: ExpectedResult = { "messages": [ { "role": "system", "content": self.MOCK_SYSTEM_PROMPT }, { "role": "user", "content": f"" } ], } # Assert the structure matches actual: ExpectedResult = {"messages": messages} assert_structure_match(actual, expected) @pytest.mark.asyncio async def test_text_mode_update_with_history(self) -> None: """Test update generation with conversation history in text mode.""" # Setup test data text_description: str = "a dashboard with charts" params: Dict[str, Any] = { "prompt": { "text": text_description, "images": [] }, "generationType": "update", "history": [ {"role": "assistant", "text": "Initial dashboard", "images": [], "videos": []}, {"role": "user", "text": "Add a sidebar", "images": [], "videos": []}, {"role": "assistant", "text": "Dashboard with sidebar", "images": [], "videos": []}, {"role": "user", "text": "Now add a navigation menu", "images": [], "videos": []}, ] } with patch( "prompts.system_prompt.SYSTEM_PROMPT", new=self.MOCK_SYSTEM_PROMPT, ): # Call the function messages = await build_prompt_messages( stack=self.TEST_STACK, input_mode="text", generation_type=params["generationType"], prompt=params["prompt"], history=params.get("history", []), ) # Define expected structure expected: ExpectedResult 
= { "messages": [ { "role": "system", "content": self.MOCK_SYSTEM_PROMPT, }, { "role": "assistant", "content": self.wrapped_file("Initial dashboard") }, { "role": "user", "content": ( f"Selected stack: {self.TEST_STACK}.\n\n" f"{self.ENABLED_IMAGE_POLICY}\n\n" "Add a sidebar" ), }, { "role": "assistant", "content": self.wrapped_file( "Dashboard with sidebar" ) }, { "role": "user", "content": "Now add a navigation menu" } ], } # Assert the structure matches actual: ExpectedResult = {"messages": messages} assert_structure_match(actual, expected) @pytest.mark.asyncio async def test_video_mode_basic_prompt_creation(self) -> None: """Test basic video prompt creation in video mode. For video mode with generation_type="create", we now assemble a regular system+user prompt so video generation can run through the agent runner path. """ # Setup test data video_data_url: str = "data:video/mp4;base64,test_video_data" params: Dict[str, Any] = { "prompt": { "text": "", "images": [], "videos": [video_data_url], }, "generationType": "create" } # Call the function messages = await build_prompt_messages( stack=self.TEST_STACK, input_mode="video", generation_type=params["generationType"], prompt=params["prompt"], history=params.get("history", []), ) expected: ExpectedResult = { "messages": [ { "role": "system", "content": "", }, { "role": "user", "content": [ { "type": "image_url", "image_url": {"url": video_data_url, "detail": "high"}, }, { "type": "text", "text": "", }, ], }, ], } # Assert the structure matches actual: ExpectedResult = {"messages": messages} assert_structure_match(actual, expected) @pytest.mark.asyncio async def test_create_raises_on_unsupported_input_mode(self) -> None: params: Dict[str, Any] = { "prompt": {"text": "", "images": [self.TEST_IMAGE_URL], "videos": []}, "generationType": "create", } with pytest.raises(ValueError, match="Unsupported input mode: audio"): await build_prompt_messages( stack=self.TEST_STACK, input_mode=cast(Any, "audio"), 
generation_type=params["generationType"], prompt=params["prompt"], history=[], ) @pytest.mark.asyncio async def test_image_mode_update_with_single_image_in_history(self) -> None: """Test update with user message containing a single image.""" # Setup test data reference_image_url: str = "data:image/png;base64,reference_image" params: Dict[str, Any] = { "prompt": {"text": "", "images": [self.TEST_IMAGE_URL]}, "generationType": "update", "history": [ {"role": "assistant", "text": "Initial code", "images": [], "videos": []}, {"role": "user", "text": "Add a button", "images": [reference_image_url], "videos": []}, {"role": "assistant", "text": "Code with button", "images": [], "videos": []}, ] } with patch( "prompts.system_prompt.SYSTEM_PROMPT", new=self.MOCK_SYSTEM_PROMPT, ): # Call the function messages = await build_prompt_messages( stack=self.TEST_STACK, input_mode="image", generation_type=params["generationType"], prompt=params["prompt"], history=params.get("history", []), ) # Define expected structure expected: ExpectedResult = { "messages": [ { "role": "system", "content": self.MOCK_SYSTEM_PROMPT, }, { "role": "assistant", "content": self.wrapped_file("Initial code"), }, { "role": "user", "content": [ { "type": "image_url", "image_url": { "url": reference_image_url, "detail": "high", }, }, { "type": "text", "text": ( f"Selected stack: {self.TEST_STACK}.\n\n" f"{self.ENABLED_IMAGE_POLICY}\n\n" "Add a button" ), }, ], }, { "role": "assistant", "content": self.wrapped_file("Code with button"), }, ], } # Assert the structure matches actual: ExpectedResult = {"messages": messages} assert_structure_match(actual, expected) @pytest.mark.asyncio async def test_image_mode_update_with_multiple_images_in_history(self) -> None: """Test update with user message containing multiple images.""" # Setup test data example1_url: str = "data:image/png;base64,example1" example2_url: str = "data:image/png;base64,example2" params: Dict[str, Any] = { "prompt": {"text": "", "images": 
[self.TEST_IMAGE_URL]}, "generationType": "update", "history": [ {"role": "assistant", "text": "Initial code", "images": [], "videos": []}, {"role": "user", "text": "Style like these examples", "images": [example1_url, example2_url], "videos": []}, {"role": "assistant", "text": "Styled code", "images": [], "videos": []}, ] } with patch( "prompts.system_prompt.SYSTEM_PROMPT", new=self.MOCK_SYSTEM_PROMPT, ): # Call the function messages = await build_prompt_messages( stack=self.TEST_STACK, input_mode="image", generation_type=params["generationType"], prompt=params["prompt"], history=params.get("history", []), ) # Define expected structure expected: ExpectedResult = { "messages": [ { "role": "system", "content": self.MOCK_SYSTEM_PROMPT, }, { "role": "assistant", "content": self.wrapped_file("Initial code"), }, { "role": "user", "content": [ { "type": "image_url", "image_url": { "url": example1_url, "detail": "high", }, }, { "type": "image_url", "image_url": { "url": example2_url, "detail": "high", }, }, { "type": "text", "text": ( f"Selected stack: {self.TEST_STACK}.\n\n" f"{self.ENABLED_IMAGE_POLICY}\n\n" "Style like these examples" ), }, ], }, { "role": "assistant", "content": self.wrapped_file("Styled code"), }, ], } # Assert the structure matches actual: ExpectedResult = {"messages": messages} assert_structure_match(actual, expected) @pytest.mark.asyncio async def test_update_with_empty_images_arrays(self) -> None: """Test that empty images arrays don't break existing functionality.""" # Setup test data with explicit empty images arrays params: Dict[str, Any] = { "prompt": {"text": "", "images": [self.TEST_IMAGE_URL]}, "generationType": "update", "history": [ {"role": "assistant", "text": "Initial code", "images": [], "videos": []}, {"role": "user", "text": "Make it blue", "images": [], "videos": []}, {"role": "assistant", "text": "Blue code", "images": [], "videos": []}, ] } with patch( "prompts.system_prompt.SYSTEM_PROMPT", new=self.MOCK_SYSTEM_PROMPT, ): # Call 
the function messages = await build_prompt_messages( stack=self.TEST_STACK, input_mode="image", generation_type=params["generationType"], prompt=params["prompt"], history=params.get("history", []), ) # Define expected structure - should be text-only messages expected: ExpectedResult = { "messages": [ { "role": "system", "content": self.MOCK_SYSTEM_PROMPT, }, { "role": "assistant", "content": self.wrapped_file("Initial code"), }, { "role": "user", "content": ( f"Selected stack: {self.TEST_STACK}.\n\n" f"{self.ENABLED_IMAGE_POLICY}\n\n" "Make it blue" ), }, # Text-only message { "role": "assistant", "content": self.wrapped_file("Blue code"), }, ], } # Assert the structure matches actual: ExpectedResult = {"messages": messages} assert_structure_match(actual, expected) @pytest.mark.asyncio async def test_update_bootstraps_from_file_state_when_history_is_empty(self) -> None: """Update should synthesize a user message from fileState + prompt when history is empty.""" ref_image_url: str = "data:image/png;base64,ref_image" params: Dict[str, Any] = { "generationType": "update", "prompt": {"text": "Make the header blue", "images": [ref_image_url], "videos": []}, "history": [], "fileState": { "path": "index.html", "content": "Original imported code", }, } with patch( "prompts.system_prompt.SYSTEM_PROMPT", new=self.MOCK_SYSTEM_PROMPT, ): messages = await build_prompt_messages( stack=self.TEST_STACK, input_mode="image", generation_type=params["generationType"], prompt=params["prompt"], history=params["history"], file_state=params["fileState"], ) expected: ExpectedResult = { "messages": [ { "role": "system", "content": self.MOCK_SYSTEM_PROMPT, }, { "role": "user", "content": [ { "type": "image_url", "image_url": { "url": ref_image_url, "detail": "high", }, }, { "type": "text", "text": ">", }, ], }, ], } actual: ExpectedResult = {"messages": messages} assert_structure_match(actual, expected) user_content = messages[1].get("content") assert isinstance(user_content, list) text_part 
= next( (part for part in user_content if isinstance(part, dict) and part.get("type") == "text"), None, ) assert isinstance(text_part, dict) synthesized_text = text_part.get("text", "") assert isinstance(synthesized_text, str) assert f"Selected stack: {self.TEST_STACK}." in synthesized_text assert "Original imported code" in synthesized_text assert "" in synthesized_text assert "Make the header blue" in synthesized_text @pytest.mark.asyncio async def test_update_requires_history_or_file_state(self) -> None: with pytest.raises(ValueError): await build_prompt_messages( stack=self.TEST_STACK, input_mode="image", generation_type="update", prompt={"text": "Change title", "images": [], "videos": []}, history=[], ) @pytest.mark.asyncio async def test_update_history_requires_user_message(self) -> None: with pytest.raises( ValueError, match="Update history must include at least one user message" ): await build_prompt_messages( stack=self.TEST_STACK, input_mode="image", generation_type="update", prompt={"text": "Change title", "images": [], "videos": []}, history=[ { "role": "assistant", "text": "Code only", "images": [], "videos": [], } ], ) ================================================ FILE: backend/tests/test_request_parsing.py ================================================ from prompts.request_parsing import parse_prompt_content, parse_prompt_history def test_parse_prompt_content_with_valid_data() -> None: result = parse_prompt_content( { "text": "Build this page", "images": ["img1", "img2"], "videos": ["vid1"], } ) assert result == { "text": "Build this page", "images": ["img1", "img2"], "videos": ["vid1"], } def test_parse_prompt_content_filters_invalid_media_types() -> None: result = parse_prompt_content( { "text": "Prompt", "images": ["img1", 123, None, "img2"], "videos": ["vid1", {"x": 1}, "vid2"], } ) assert result == { "text": "Prompt", "images": ["img1", "img2"], "videos": ["vid1", "vid2"], } def test_parse_prompt_content_defaults_for_invalid_payload() -> 
None: assert parse_prompt_content(None) == {"text": "", "images": [], "videos": []} assert parse_prompt_content("bad") == {"text": "", "images": [], "videos": []} assert parse_prompt_content({"text": 1}) == {"text": "", "images": [], "videos": []} def test_parse_prompt_history_with_valid_entries() -> None: result = parse_prompt_history( [ { "role": "assistant", "text": "", "images": [], "videos": [], }, { "role": "user", "text": "Please update", "images": ["img1"], "videos": ["vid1"], }, ] ) assert result == [ { "role": "assistant", "text": "", "images": [], "videos": [], }, { "role": "user", "text": "Please update", "images": ["img1"], "videos": ["vid1"], }, ] def test_parse_prompt_history_filters_invalid_items() -> None: result = parse_prompt_history( [ "bad", {"role": "tool", "text": "skip me"}, {"role": "user", "text": "keep me", "images": ["img1", 3], "videos": [None, "vid1"]}, {"role": "assistant", "text": 123, "images": "bad", "videos": "bad"}, ] ) assert result == [ { "role": "user", "text": "keep me", "images": ["img1"], "videos": ["vid1"], }, { "role": "assistant", "text": "", "images": [], "videos": [], }, ] def test_parse_prompt_history_defaults_for_invalid_payload() -> None: assert parse_prompt_history(None) == [] assert parse_prompt_history({"role": "user"}) == [] ================================================ FILE: backend/tests/test_screenshot.py ================================================ import pytest from routes.screenshot import normalize_url class TestNormalizeUrl: """Test cases for URL normalization functionality.""" def test_url_without_protocol(self): """Test that URLs without protocol get https:// added.""" assert normalize_url("example.com") == "https://example.com" assert normalize_url("www.example.com") == "https://www.example.com" assert normalize_url("subdomain.example.com") == "https://subdomain.example.com" def test_url_with_http_protocol(self): """Test that existing http protocol is preserved.""" assert 
normalize_url("http://example.com") == "http://example.com" assert normalize_url("http://www.example.com") == "http://www.example.com" def test_url_with_https_protocol(self): """Test that existing https protocol is preserved.""" assert normalize_url("https://example.com") == "https://example.com" assert normalize_url("https://www.example.com") == "https://www.example.com" def test_url_with_path_and_params(self): """Test URLs with paths and query parameters.""" assert normalize_url("example.com/path") == "https://example.com/path" assert normalize_url("example.com/path?param=value") == "https://example.com/path?param=value" assert normalize_url("example.com:8080/path") == "https://example.com:8080/path" def test_url_with_whitespace(self): """Test that whitespace is stripped.""" assert normalize_url(" example.com ") == "https://example.com" assert normalize_url("\texample.com\n") == "https://example.com" def test_invalid_protocols(self): """Test that unsupported protocols raise ValueError.""" with pytest.raises(ValueError, match="Unsupported protocol: ftp"): normalize_url("ftp://example.com") with pytest.raises(ValueError, match="Unsupported protocol: file"): normalize_url("file:///path/to/file") def test_localhost_urls(self): """Test localhost URLs.""" assert normalize_url("localhost") == "https://localhost" assert normalize_url("localhost:3000") == "https://localhost:3000" assert normalize_url("http://localhost:8080") == "http://localhost:8080" def test_ip_address_urls(self): """Test IP address URLs.""" assert normalize_url("192.168.1.1") == "https://192.168.1.1" assert normalize_url("192.168.1.1:8080") == "https://192.168.1.1:8080" assert normalize_url("http://192.168.1.1") == "http://192.168.1.1" def test_complex_urls(self): """Test more complex URL scenarios.""" assert normalize_url("example.com/path/to/page.html#section") == "https://example.com/path/to/page.html#section" assert normalize_url("user:pass@example.com") == "https://user:pass@example.com" assert 
normalize_url("example.com?q=search&lang=en") == "https://example.com?q=search&lang=en" ================================================ FILE: backend/tests/test_status_broadcast.py ================================================ from types import SimpleNamespace from unittest.mock import AsyncMock, MagicMock from typing import Any, cast import pytest from routes.generate_code import ( ExtractedParams, PipelineContext, StatusBroadcastMiddleware, ) @pytest.mark.asyncio async def test_video_update_broadcasts_two_variants() -> None: sent_messages: list[tuple[str, str | None, int]] = [] async def send_message( msg_type: str, value: str | None, variant_index: int, data=None, eventId=None, ) -> None: sent_messages.append((msg_type, value, variant_index)) context = PipelineContext(websocket=MagicMock()) context.ws_comm = cast( Any, SimpleNamespace( send_message=send_message, throw_error=AsyncMock(), ), ) context.extracted_params = ExtractedParams( stack="html_tailwind", input_mode="video", should_generate_images=True, openai_api_key=None, anthropic_api_key=None, gemini_api_key="key", openai_base_url=None, generation_type="update", prompt={"text": "Edit this video output", "images": [], "videos": []}, history=[], file_state=None, option_codes=[], ) middleware = StatusBroadcastMiddleware() next_called = False async def next_func() -> None: nonlocal next_called next_called = True await middleware.process(context, next_func) assert sent_messages[0] == ("variantCount", "2", 0) status_messages = [m for m in sent_messages if m[0] == "status"] assert len(status_messages) == 2 assert [m[2] for m in status_messages] == [0, 1] assert next_called is True @pytest.mark.asyncio async def test_image_update_broadcasts_two_variants() -> None: sent_messages: list[tuple[str, str | None, int]] = [] async def send_message( msg_type: str, value: str | None, variant_index: int, data=None, eventId=None, ) -> None: sent_messages.append((msg_type, value, variant_index)) context = 
PipelineContext(websocket=MagicMock()) context.ws_comm = cast( Any, SimpleNamespace( send_message=send_message, throw_error=AsyncMock(), ), ) context.extracted_params = ExtractedParams( stack="html_tailwind", input_mode="image", should_generate_images=True, openai_api_key="key", anthropic_api_key="key", gemini_api_key=None, openai_base_url=None, generation_type="update", prompt={"text": "Edit this screenshot", "images": ["data:image/png;base64,abc"], "videos": []}, history=[], file_state={"path": "index.html", "content": ""}, option_codes=[], ) middleware = StatusBroadcastMiddleware() next_called = False async def next_func() -> None: nonlocal next_called next_called = True await middleware.process(context, next_func) assert sent_messages[0] == ("variantCount", "2", 0) status_messages = [m for m in sent_messages if m[0] == "status"] assert len(status_messages) == 2 assert [m[2] for m in status_messages] == [0, 1] assert next_called is True ================================================ FILE: backend/tests/test_token_usage.py ================================================ """Tests for unified token usage tracking and cost computation.""" from types import SimpleNamespace from agent.providers.pricing import MODEL_PRICING, ModelPricing from agent.providers.token_usage import TokenUsage from agent.providers.anthropic import _extract_anthropic_usage from agent.providers.gemini import _extract_usage as _extract_gemini_usage from agent.providers.openai import _extract_openai_usage # --------------------------------------------------------------------------- # TokenUsage.accumulate # --------------------------------------------------------------------------- class TestAccumulate: def test_sums_all_fields(self) -> None: a = TokenUsage(input=100, output=50, cache_read=20, cache_write=10, total=180) b = TokenUsage(input=200, output=80, cache_read=30, cache_write=5, total=315) a.accumulate(b) assert a == TokenUsage( input=300, output=130, cache_read=50, cache_write=15, 
total=495 ) def test_accumulate_zero_is_noop(self) -> None: a = TokenUsage(input=100, output=50, cache_read=20, total=170) a.accumulate(TokenUsage()) assert a == TokenUsage(input=100, output=50, cache_read=20, total=170) def test_multiple_accumulations(self) -> None: total = TokenUsage() for i in range(1, 4): total.accumulate(TokenUsage(input=i * 10, output=i * 5, total=i * 15)) # input: 10+20+30=60, output: 5+10+15=30, total: 15+30+45=90 assert total.input == 60 assert total.output == 30 assert total.total == 90 # --------------------------------------------------------------------------- # TokenUsage.cost # --------------------------------------------------------------------------- class TestCost: def test_basic_cost(self) -> None: usage = TokenUsage(input=1_000_000, output=1_000_000, total=2_000_000) pricing = ModelPricing(input=2.00, output=8.00) # 1M * $2 + 1M * $8 = $10 assert usage.cost(pricing) == 10.0 def test_zero_tokens_zero_cost(self) -> None: usage = TokenUsage() pricing = ModelPricing(input=5.00, output=25.00, cache_read=0.50) assert usage.cost(pricing) == 0.0 def test_cache_heavy_scenario(self) -> None: # 100k non-cached input, 900k cached, 500k output usage = TokenUsage( input=100_000, output=500_000, cache_read=900_000, total=1_500_000 ) pricing = ModelPricing(input=2.00, output=8.00, cache_read=0.50) # 100k * $2/M + 500k * $8/M + 900k * $0.50/M # = $0.20 + $4.00 + $0.45 = $4.65 expected = (100_000 * 2.00 + 500_000 * 8.00 + 900_000 * 0.50) / 1_000_000 assert abs(usage.cost(pricing) - expected) < 1e-9 def test_anthropic_with_cache_write(self) -> None: usage = TokenUsage( input=500_000, output=200_000, cache_read=300_000, cache_write=100_000, total=1_100_000, ) pricing = ModelPricing( input=3.00, output=15.00, cache_read=0.30, cache_write=3.75 ) expected = ( 500_000 * 3.00 + 200_000 * 15.00 + 300_000 * 0.30 + 100_000 * 3.75 ) / 1_000_000 assert abs(usage.cost(pricing) - expected) < 1e-9 class TestCacheHitRate: def 
test_zero_total_input_is_zero_percent(self) -> None: usage = TokenUsage() assert usage.total_input_tokens() == 0 assert usage.cache_hit_rate_percent() == 0.0 def test_cache_hit_rate_without_cache_write(self) -> None: usage = TokenUsage(input=300, cache_read=100) assert usage.total_input_tokens() == 400 assert abs(usage.cache_hit_rate_percent() - 25.0) < 1e-9 def test_cache_hit_rate_includes_cache_write_in_denominator(self) -> None: usage = TokenUsage(input=300, cache_read=100, cache_write=100) assert usage.total_input_tokens() == 500 assert abs(usage.cache_hit_rate_percent() - 20.0) < 1e-9 # --------------------------------------------------------------------------- # Gemini: _extract_usage # --------------------------------------------------------------------------- def _gemini_chunk( prompt: int = 0, candidates: int = 0, thoughts: int = 0, cached: int = 0, total: int = 0, ) -> SimpleNamespace: """Build a fake Gemini GenerateContentResponse with usage_metadata.""" return SimpleNamespace( usage_metadata=SimpleNamespace( prompt_token_count=prompt, candidates_token_count=candidates, thoughts_token_count=thoughts, cached_content_token_count=cached, total_token_count=total, ) ) class TestGeminiExtract: def test_normal_response(self) -> None: chunk = _gemini_chunk( prompt=1000, candidates=400, thoughts=100, cached=200, total=1500 ) usage = _extract_gemini_usage(chunk) # type: ignore[arg-type] assert usage is not None assert usage.input == 800 # 1000 - 200 assert usage.output == 500 # 400 + 100 assert usage.cache_read == 200 assert usage.cache_write == 0 assert usage.total == 1500 def test_no_cache(self) -> None: chunk = _gemini_chunk(prompt=500, candidates=200, thoughts=50, total=750) usage = _extract_gemini_usage(chunk) # type: ignore[arg-type] assert usage is not None assert usage.input == 500 assert usage.cache_read == 0 def test_no_usage_metadata_returns_none(self) -> None: chunk = SimpleNamespace(usage_metadata=None) assert _extract_gemini_usage(chunk) is None # 
type: ignore[arg-type] def test_none_subfields_default_to_zero(self) -> None: chunk = SimpleNamespace( usage_metadata=SimpleNamespace( prompt_token_count=None, candidates_token_count=None, thoughts_token_count=None, cached_content_token_count=None, total_token_count=None, ) ) usage = _extract_gemini_usage(chunk) # type: ignore[arg-type] assert usage == TokenUsage() # --------------------------------------------------------------------------- # OpenAI: _extract_openai_usage # --------------------------------------------------------------------------- def _openai_response( input_tokens: int = 0, output_tokens: int = 0, total_tokens: int = 0, cached_tokens: int = 0, ) -> SimpleNamespace: """Build a fake OpenAI response.completed event payload.""" return SimpleNamespace( usage=SimpleNamespace( input_tokens=input_tokens, output_tokens=output_tokens, total_tokens=total_tokens, input_tokens_details=SimpleNamespace(cached_tokens=cached_tokens), ) ) class TestOpenAIExtract: def test_normal_response(self) -> None: resp = _openai_response( input_tokens=1000, output_tokens=500, total_tokens=1500, cached_tokens=300 ) usage = _extract_openai_usage(resp) assert usage.input == 700 # 1000 - 300 assert usage.output == 500 assert usage.cache_read == 300 assert usage.cache_write == 0 assert usage.total == 1500 def test_no_cache(self) -> None: resp = _openai_response( input_tokens=800, output_tokens=200, total_tokens=1000 ) usage = _extract_openai_usage(resp) assert usage.input == 800 assert usage.cache_read == 0 def test_no_usage_returns_empty(self) -> None: resp = SimpleNamespace() # no .usage attribute usage = _extract_openai_usage(resp) assert usage == TokenUsage() def test_no_input_tokens_details(self) -> None: resp = SimpleNamespace( usage=SimpleNamespace( input_tokens=500, output_tokens=200, total_tokens=700, ) ) usage = _extract_openai_usage(resp) assert usage.input == 500 assert usage.cache_read == 0 # --------------------------------------------------------------------------- 
# Anthropic: _extract_anthropic_usage # --------------------------------------------------------------------------- def _anthropic_message( input_tokens: int = 0, output_tokens: int = 0, cache_read: int = 0, cache_write: int = 0, ) -> SimpleNamespace: """Build a fake Anthropic final message with usage.""" return SimpleNamespace( usage=SimpleNamespace( input_tokens=input_tokens, output_tokens=output_tokens, cache_read_input_tokens=cache_read, cache_creation_input_tokens=cache_write, ) ) class TestAnthropicExtract: def test_normal_response(self) -> None: msg = _anthropic_message( input_tokens=1000, output_tokens=500, cache_read=200, cache_write=50 ) usage = _extract_anthropic_usage(msg) assert usage.input == 1000 assert usage.output == 500 assert usage.cache_read == 200 assert usage.cache_write == 50 assert usage.total == 1750 # sum of all fields def test_no_cache(self) -> None: msg = _anthropic_message(input_tokens=600, output_tokens=300) usage = _extract_anthropic_usage(msg) assert usage.input == 600 assert usage.cache_read == 0 assert usage.cache_write == 0 assert usage.total == 900 def test_no_usage_returns_empty(self) -> None: msg = SimpleNamespace() # no .usage attribute usage = _extract_anthropic_usage(msg) assert usage == TokenUsage() # --------------------------------------------------------------------------- # MODEL_PRICING lookup # --------------------------------------------------------------------------- class TestModelPricing: def test_known_models_have_pricing(self) -> None: for name in [ "gpt-4.1-2025-04-14", "gpt-5.2-codex", "claude-opus-4-6", "claude-sonnet-4-6", "gemini-3-flash-preview", "gemini-3-pro-preview", "gpt-5.4-2026-03-05", ]: assert name in MODEL_PRICING, f"missing pricing for {name}" def test_unknown_model_returns_none(self) -> None: assert MODEL_PRICING.get("nonexistent-model") is None def test_anthropic_has_cache_write_rate(self) -> None: for name in ["claude-opus-4-6", "claude-sonnet-4-6"]: assert MODEL_PRICING[name].cache_write > 0 
def pprint_prompt(prompt_messages: List[ChatCompletionMessageParam]):
    # Debug helper: dump the prompt as indented JSON. Long string values
    # (e.g. base64 image data URLs) are shortened by truncate_data_strings()
    # before printing so the console output stays readable.
    print(json.dumps(truncate_data_strings(prompt_messages), indent=4))
def print_prompt_summary(prompt_messages: List[ChatCompletionMessageParam], truncate: bool = True):
    """Print a boxed, one-line-per-message summary of the prompt to stdout.

    The summary text comes from format_prompt_summary(); this function only
    handles layout: sizing the box, centering the title, and word-wrapping
    lines that exceed the box width.
    """
    summary = format_prompt_summary(prompt_messages, truncate)
    lines = summary.split('\n')
    # Find the maximum line length, with a minimum of 20
    # If truncating, max is 80, otherwise allow up to 120 for full content
    max_allowed = 80 if truncate else 120
    max_length = max(len(line) for line in lines) if lines else 20
    max_length = max(20, min(max_allowed, max_length))
    # Ensure title fits
    title = "PROMPT SUMMARY"
    max_length = max(max_length, len(title) + 4)
    # Top border, then the centered title row, then a separator rule.
    print("┌─" + "─" * max_length + "─┐")
    title_padding = (max_length - len(title)) // 2
    print(f"│ {' ' * title_padding}{title}{' ' * (max_length - len(title) - title_padding)} │")
    print("├─" + "─" * max_length + "─┤")
    for line in lines:
        if len(line) <= max_length:
            print(f"│ {line:<{max_length}} │")
        else:
            # Wrap long lines
            # (manual word wrap: greedily pack whole words up to the box width)
            words = line.split()
            current_line = ""
            for word in words:
                if len(current_line + " " + word) <= max_length:
                    current_line += (" " + word) if current_line else word
                else:
                    if current_line:
                        print(f"│ {current_line:<{max_length}} │")
                    current_line = word
            if current_line:
                print(f"│ {current_line:<{max_length}} │")
    print("└─" + "─" * max_length + "─┘")
    print()
def print_prompt_preview(prompt_messages: List[ChatCompletionMessageParam]) -> None:
    """Print a boxed, multi-line preview of the prompt to stdout.

    Preview text comes from format_prompt_preview(). Unlike
    print_prompt_summary(), over-wide lines are wrapped with textwrap
    (preserving whole words) instead of a manual word-packing loop.
    """
    preview = format_prompt_preview(prompt_messages)
    lines = preview.split("\n")
    # Box width: at least 20, at most 120, otherwise the longest line.
    max_length = max(len(line) for line in lines) if lines else 20
    max_length = max(20, min(120, max_length))
    # Make sure the title always fits inside the box.
    title = "PROMPT PREVIEW"
    max_length = max(max_length, len(title) + 4)
    print("┌─" + "─" * max_length + "─┐")
    title_padding = (max_length - len(title)) // 2
    print(
        f"│ {' ' * title_padding}{title}{' ' * (max_length - len(title) - title_padding)} │"
    )
    print("├─" + "─" * max_length + "─┤")
    for line in lines:
        if len(line) <= max_length:
            print(f"│ {line:<{max_length}} │")
        else:
            # Wrap over-wide lines without splitting words.
            wrapped = textwrap.wrap(
                line, width=max_length, break_long_words=False, break_on_hyphens=False
            )
            for wrapped_line in wrapped:
                print(f"│ {wrapped_line:<{max_length}} │")
    print("└─" + "─" * max_length + "─┘")
    print()
class MediaResolution(Enum):
    # Gemini media-resolution setting; selects the per-frame token rate
    # via VIDEO_TOKENS_PER_FRAME (LOW/MEDIUM: 70, HIGH: 280 tokens/frame).
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"


@dataclass
class TokenEstimate:
    # Estimated token counts for a single video-generation request.
    input_tokens: int
    estimated_output_tokens: int
    total_tokens: int


@dataclass
class CostEstimate:
    # Estimated USD cost breakdown for a request. Token counts are kept
    # alongside the dollar figures so callers can display both together
    # (see format_cost_estimate()).
    input_cost: float
    output_cost: float
    total_cost: float
    input_tokens: int
    output_tokens: int
def estimate_output_tokens(
    max_output_tokens: int = 50000,
    thinking_level: str = "high",
) -> int:
    """Estimate how many output tokens a generation will actually consume.

    Args:
        max_output_tokens: Upper bound configured for the request.
        thinking_level: Accepted for interface compatibility; it currently
            does NOT change the estimate (see note below).

    Returns:
        Estimated output-token usage: 60% of ``max_output_tokens``.
    """
    # NOTE(review): the previous implementation built a per-thinking-level
    # multiplier table ({"high": 0.7, "low": 0.5, "minimal": 0.3}) but never
    # applied it, so the result was always 60% of the cap regardless of
    # thinking level. The dead computation is removed here; behavior is
    # unchanged. If the multiplier was *meant* to factor into the estimate,
    # that would be a behavioral change and should be made deliberately.
    return int(max_output_tokens * 0.6)
def format_cost_estimate(cost: CostEstimate) -> str:
    # Render a CostEstimate as a short multi-line report: token counts with
    # their dollar cost in parentheses, then the combined total.
    return (
        f"Estimated Cost:\n"
        f" Input tokens: {cost.input_tokens:,} (${cost.input_cost:.4f})\n"
        f" Output tokens: {cost.output_tokens:,} (${cost.output_cost:.4f})\n"
        f" Total estimated cost: ${cost.total_cost:.4f}"
    )
def extract_tag_content(tag: str, text: str) -> str:
    """Extract a tag's full span (opening through closing tag) from text.

    :param tag: Tag name to search for (without angle brackets).
    :param text: The text to search within.
    :return: The substring from ``<tag>`` through ``</tag>`` inclusive of
        both markers, or ``""`` if either marker is missing.
    """
    tag_start = f"<{tag}>"
    # BUG FIX: the closing marker was an empty string (f""), so
    # text.find(tag_end, start_idx) matched immediately at start_idx and
    # the function always returned "". Restore the closing-tag marker.
    tag_end = f"</{tag}>"
    start_idx = text.find(tag_start)
    # Search for the closing tag only after the opening tag's position.
    end_idx = text.find(tag_end, start_idx)
    if start_idx != -1 and end_idx != -1:
        return text[start_idx : end_idx + len(tag_end)]
    return ""
variety of tasks. I maintain a very popular open source project called “screenshot-to-code” (this one!) that uses GPT-4 vision to convert screenshots/designs into clean code. Naturally, I was excited to see how good Claude 3 was at this task. **TLDR:** Claude 3 is on par with GPT-4 vision for screenshot to code, better in some ways but worse in others. ## Evaluation Setup I don’t know of a public benchmark for “screenshot to code” so I created a simple evaluation setup for the purposes of testing: - **Evaluation Dataset**: 16 screenshots with a mix of UI elements, landing pages, dashboards and popular websites. Screenshot 2024-03-05 at 3 05 52 PM - **Evaluation Metric**: Replication accuracy, as in “How close does the generated code look to the screenshot?” While there are other metrics that are important like code quality, speed and so on, this is by far the #1 thing most users of the repo care about. - **Evaluation Mechanism**: Each output is subjectively rated by a human on a rating scale from 0 to 4. 4 = very close to an exact replica while 0 = nothing like the screenshot. With 16 screenshots, the maximum any model can score is 64. To make the evaluation process easy, I created [a Python script](https://github.com/abi/screenshot-to-code/blob/main/backend/run_evals.py) that runs code for all the inputs in parallel. I also made a simple UI to do a side-by-side comparison of the input and output. ![Google Chrome](https://github.com/abi/screenshot-to-code/assets/23818/38126f8f-205d-4ed1-b8cf-039e81dcc3d0) ## Results Quick note about what kind of code we’ll be generating: currently, screenshot-to-code supports generating code in HTML + Tailwind, React, Vue, and several other frameworks. Stacks can impact the replication accuracy quite a bit. For example, because Bootstrap uses a relatively restrictive set of UI elements, generations using Bootstrap tend to have a distinct "Bootstrap" style.
I only ran the evals on HTML/Tailwind here which is the stack where GPT-4 vision tends to perform the best. Here are the results (average of 3 runs for each model): - GPT-4 Vision obtains a score of **65.10%** - this is what we’re trying to beat - Claude 3 Sonnet receives a score of **70.31%**, which is a bit better. - Surprisingly, Claude 3 Opus which is supposed to be the smarter and slower model scores worse than both GPT-4 vision and Claude 3 Sonnet, comes in at **61.46%**. Overall, a very strong showing for Claude 3. Obviously, there's a lot of subjectivity involved in this evaluation but Claude 3 is definitely on par with GPT-4 Vision, if not better. You can see the [side-by-side comparison for a run of Claude 3 Sonnet here](https://github.com/abi/screenshot-to-code-files/blob/main/sonnet%20results.png). And for [a run of GPT-4 Vision here](https://github.com/abi/screenshot-to-code-files/blob/main/gpt%204%20vision%20results.png). Other notes: - The prompts used are optimized for GPT-4 vision. Adjusting the prompts a bit for Claude did yield a small improvement. But nothing game-changing and potentially not worth the trade-off of maintaining two sets of prompts. - All the models excel at code quality - the quality is usually comparable to a human or better. - Claude 3 is much less lazy than GPT-4 Vision. When asked to recreate Hacker News, GPT-4 Vision will only create two items in the list and leave comments in this code like `` and ``. Screenshot 2024-03-05 at 9 25 04 PM While Claude 3 Sonnet can sometimes be lazy too, most of the time, it does what you ask it to do. Screenshot 2024-03-05 at 9 30 23 PM - For some reason, all the models struggle with side-by-side "flex" layouts Screenshot 2024-03-05 at 9 20 58 PM - Claude 3 Sonnet is a lot faster - Claude 3 gets background and text colors wrong quite often! 
(like in the Hacker News image above) - My suspicion is that Claude 3 Opus results can be improved to be on par with the other models through better prompting Overall, I'm very impressed with Claude 3 Sonnet for this use case. I've added it as an alternative to GPT-4 Vision in the open source repo (hosted version update coming soon). If you’d like to contribute to this effort, I have some documentation on [running these evals yourself here](https://github.com/abi/screenshot-to-code/blob/main/Evaluation.md). I'm also working on a better evaluation mechanism with Elo ratings and would love some help on that. ================================================ FILE: design-docs/agent-tool-calling-flow.md ================================================ # Agent Tool-Calling Flow (Backend) This document explains exactly what happens after prompt messages are built and a variant starts running in the agent. ## Entry Point Per variant, `Agent(...).run(model, prompt_messages)` is called from: - `backend/routes/generate_code.py` (`AgenticGenerationStage._run_variant`) `Agent` is a thin wrapper over `AgentEngine`: - `backend/agent/runner.py` - `backend/agent/engine.py` ## Core Tool-Calling Loop The main loop lives in: - `backend/agent/engine.py` -> `AgentEngine._run_with_session(...)` Loop behavior: 1. Start turn-local streaming state. - Create event IDs for assistant/thinking streams. - Initialize: - `started_tool_ids` - `streamed_lengths` 2. Stream one provider turn. - Call `turn = await session.stream_turn(on_event)` - `on_event` handles streamed deltas: - `assistant_delta` -> websocket `assistant` - `thinking_delta` -> websocket `thinking` - `tool_call_delta` -> `_handle_streamed_tool_delta(...)` 3. Branch by tool calls. - If `turn.tool_calls` is empty: finalize and return. - Otherwise execute each tool call, emit tool lifecycle messages, and collect results. 4. Continue conversation with tool results. 
- Call `session.append_tool_results(turn, executed_tool_calls)` - Next loop iteration sends another model turn with updated history. 5. Guardrail. - Maximum 20 tool turns; raises if exceeded. ## Tool Execution Tool runtime: - `backend/agent/tools/runtime.py` -> `AgentToolRuntime.execute(...)` Tool definitions: - `backend/agent/tools/definitions.py` -> `canonical_tool_definitions(...)` Supported tools: - `create_file` - `edit_file` - `generate_images` - `remove_background` - `retrieve_option` Execution lifecycle per tool call: 1. Emit `toolStart` (unless already emitted from streamed args). 2. If `create_file`, stream preview code chunks while args are still arriving. 3. Execute tool in runtime. 4. If tool returns `updated_content`, emit `setCode`. 5. Emit `toolResult` with `{ name, output, ok }`. ### Live streamed `create_file` preview The engine parses partial tool arguments from provider deltas using: - `backend/agent/tools/parsing.py`: - `extract_content_from_args(...)` - `extract_path_from_args(...)` Then `_handle_streamed_tool_delta(...)` in `engine.py`: - Emits early `toolStart` for `create_file` - Emits incremental `setCode` updates as `content` grows This allows frontend preview before actual tool execution completes. ## Provider-Specific Continuation Provider contract: - `backend/agent/providers/base.py` - `ProviderSession` - `ProviderTurn` Each provider returns a `ProviderTurn` with: - `assistant_text` - `tool_calls` - `assistant_turn` (provider-native turn object needed for continuation) After tool execution, each provider appends tool results differently. ### OpenAI continuation - `backend/agent/providers/openai.py` -> `OpenAIProviderSession.append_tool_results(...)` Behavior: 1. Append prior assistant output items (`turn.assistant_turn`) to request history. 2. Append one `function_call_output` per tool result: - `{"type":"function_call_output","call_id":...,"output": json_string}` Next `responses.create(...)` turn uses this updated item list. 
### Anthropic continuation - `backend/agent/providers/anthropic.py` -> `AnthropicProviderSession.append_tool_results(...)` Behavior: 1. Append assistant message blocks: - optional text block - tool_use blocks (`id`, `name`, `input`) 2. Append user message with tool_result blocks: - `tool_use_id`, serialized result content, `is_error` Next `messages.stream(...)` turn continues from these blocks. ### Gemini continuation - `backend/agent/providers/gemini.py` -> `GeminiProviderSession.append_tool_results(...)` Behavior: 1. Append exact original model content (`turn.assistant_turn`). 2. Append `role="tool"` content with `Part.from_function_response(...)` per tool. This preserves the model part structure required for reliable continuation (including thought-signature-sensitive flows). ## Response Streaming to Frontend Frontend websocket message types emitted during generation: - `assistant` - `thinking` - `toolStart` - `toolResult` - `setCode` Where they come from: 1. Provider parser emits `StreamEvent` deltas during `stream_turn(...)`. 2. Engine forwards deltas immediately via `send_message(...)`. 3. Tool execution adds explicit lifecycle events and code updates. Typical per-turn stream sequence: 1. thinking/assistant deltas 2. tool call deltas (optional) 3. `toolStart` 4. `setCode` previews (for `create_file`, optional) 5. `toolResult` 6. next model turn starts, repeat Finalization: - If no more tool calls, engine returns final code from in-memory file state. - If file state is empty, engine tries HTML extraction from final assistant text. 
## Module Map - Engine orchestration: `backend/agent/engine.py` - Agent entrypoint: `backend/agent/runner.py` - Provider factory: `backend/agent/providers/factory.py` - Provider contract: `backend/agent/providers/base.py` - Provider implementations: - `backend/agent/providers/openai.py` - `backend/agent/providers/anthropic.py` - `backend/agent/providers/gemini.py` - Tool system: - `backend/agent/tools/definitions.py` - `backend/agent/tools/runtime.py` - `backend/agent/tools/parsing.py` - `backend/agent/tools/summaries.py` ================================================ FILE: design-docs/agentic-runner-refactor.md ================================================ # Agentic Runner Refactor Spec ## Goals - Reduce duplicated streaming logic across OpenAI/Anthropic/Gemini runners. - Centralize tool schemas and telemetry formatting. - Make the agent pipeline easier to test, extend, and reason about. ## Decision: Unified Stream Loop + Provider Adapters - Introduce a provider-agnostic stream loop that consumes normalized events (assistant_delta, thinking_delta, tool_call_delta, tool_call_complete, done). - Add per-provider adapters that translate native streams into normalized events. - Centralize tool execution, `toolStart`/`toolResult` emission, and `setCode` preview streaming inside the unified loop. - Keep per-provider adapters small and focused on parsing provider-specific payloads. ## Decision: Canonical Tool Definitions + Serializer Layer - Define tool schemas once in a canonical representation. - Add serializer helpers to produce OpenAI Responses, Anthropic, and Gemini tool schemas from the canonical form. - Centralize tool input/output summaries to keep UI telemetry consistent across providers and reduce duplication. ## Planned Removals ### Image Cache - Remove prompt-to-URL image caching in the agent tool layer. - Rationale: simplify state, reduce hidden cross-variant coupling. 
- Follow-up: ensure image generation remains deterministic per request when needed (e.g., pass explicit seeds or expose caching at a higher layer if required). ### OpenAI ChatCompletion Path - Remove the legacy ChatCompletion streaming path. - Route all OpenAI models through the Responses API implementation. - Update model lists and runtime checks to eliminate the ChatCompletion branch. ### Non-Agentic Generation Paths (e.g., Video) Keep video generation, but route it through the agent runner: - Replace video-specific streaming helpers with agent runner support for video inputs. - Remove conditional branches that bypass the agent path for video create/update. - Preserve video-specific prompt and media handling, but integrate it into the agent tool/stream pipeline. - Update tests and docs to reflect a single agent generation path that supports video inputs. ## File/Module Split - `agent/runner.py`: orchestration + shared stream loop. - `agent/providers/`: provider adapters (openai, responses, anthropic, gemini). - `agent/tools.py`: tool definitions, serialization, and execution. - `agent/state.py`: file state + seeding utilities. ## Non-Goals - No functional UX changes beyond the removal items above. - No redesign of the frontend agent activity UI; it should continue to consume the same tool/assistant/thinking events. ================================================ FILE: design-docs/commits-and-variants.md ================================================ # Commits and Non-Blocking Variants This document explains how the commit system and non-blocking variant generation work in screenshot-to-code. ## Commit System ### What are Commits? Commits represent discrete versions in the application's history. 
Each commit contains: - **Hash**: Unique identifier (generated using `nanoid()`) - **Parent Hash**: Links to previous commit for history tracking - **Variants**: Multiple code generation options (typically 2) - **Selected Variant**: Which variant the user is currently viewing - **Status**: Whether the commit is still being edited (`isCommitted: false`) or finalized (`isCommitted: true`) ### Commit Types ```typescript type CommitType = "ai_create" | "ai_edit" | "code_create"; ``` - **ai_create**: Initial generation from screenshot/video - **ai_edit**: Updates based on user instructions - **code_create**: Import from existing code ### Data Structure ```typescript type Commit = { hash: CommitHash; parentHash: CommitHash | null; dateCreated: Date; isCommitted: boolean; variants: Variant[]; selectedVariantIndex: number; type: CommitType; inputs: any; // Type-specific inputs } type Variant = { code: string; status: VariantStatus; } type VariantStatus = "generating" | "complete" | "cancelled"; ``` ### Storage and Management Commits are stored in the project store as a flat record: ```typescript commits: Record head: CommitHash | null // Current active commit ``` The `head` pointer tracks which commit is currently active. History is reconstructed by following `parentHash` links. ## Non-Blocking Variants ### Traditional Variant Generation (Before) ``` Start Generation → Wait for ALL variants → Show results User Experience: [Loading...........................] → Ready ``` Problems: - Users wait for the slowest variant - No interaction until everything completes - Poor perceived performance ### Non-Blocking Variant Generation (After) ``` Start Generation → Show results as each variant completes User Experience: [Loading.....] → Ready (Option 1) [Loading..........] 
→ Ready (Option 2) ``` Benefits: - Immediate interaction when first variant completes - Can switch between completed variants while others generate - Significantly improved perceived performance ### Implementation Overview #### Frontend Changes **App.tsx**: Enhanced event handling ```typescript // New WebSocket events onVariantComplete: (variantIndex) => { updateVariantStatus(commit.hash, variantIndex, 'complete'); } onVariantError: (variantIndex, error) => { updateVariantStatus(commit.hash, variantIndex, 'cancelled'); } ``` **Sidebar.tsx**: Dual-condition UI ```typescript // Show update UI when either condition is true {(appState === AppState.CODE_READY || isSelectedVariantComplete) && ( )} ``` **Variants.tsx**: Real-time status indicators - Green dot: Complete variants - Red dot: Cancelled variants - Spinner: Currently generating variants #### Backend Changes **generate_code.py**: Independent variant processing ```python # Process each variant independently async def process_variant_completion(index: int, task: asyncio.Task): completion = await task # Wait for THIS variant only # Process images immediately processed_html = await perform_image_generation(...) 
# Send to frontend immediately await send_message("setCode", processed_html, index) await send_message("variantComplete", "Variant generation complete", index) ``` ### State Management #### App State vs Variant Status The system uses a **hybrid state approach**: - **AppState**: Global generation status (`INITIAL` → `CODING` → `CODE_READY`) - **Variant Status**: Individual variant status (`generating` → `complete`/`cancelled`) #### UI Logic ```typescript // UI shows update interface when either: const canUpdate = appState === AppState.CODE_READY || // All variants done isSelectedVariantComplete; // Selected variant done // User can interact immediately when their selected variant completes ``` ### WebSocket Protocol #### Events from Backend ```typescript type WebSocketResponse = { type: "chunk" | "status" | "setCode" | "variantComplete" | "variantError"; value: string; variantIndex: number; } ``` - **chunk**: Streaming code content during generation - **status**: Status updates (e.g., "Generating images...") - **setCode**: Final code for a variant - **variantComplete**: Variant finished successfully - **variantError**: Variant failed with error #### Event Flow ``` Backend: Generate Variant 1 → "setCode" → "variantComplete" Frontend: Update UI → Allow interaction Backend: Generate Variant 2 → "setCode" → "variantComplete" Frontend: Update UI → User can switch to this variant Backend: Generate Variant 3 → "variantError" Frontend: Show error → Mark as cancelled ``` ### User Experience Flow 1. **User starts generation** - All variants marked as `status: "generating"` - UI shows loading state with spinners 2. **First variant completes** - Receives `variantComplete` event - Status updated to `"complete"` - If this is the selected variant → UI immediately allows updates - User can start editing while other variants generate 3. 
**User switches variants** - Can switch to any completed variant immediately - Can switch to generating variants (will show loading until complete) 4. **User starts update** - Automatically cancels all other generating variants - Prevents wasted computation ### Benefits 1. **Perceived Performance**: Users see results 2-3x faster 2. **Parallel Processing**: Multiple models generate simultaneously 3. **Flexible Interaction**: Switch between ready options while others work 4. **Resource Efficiency**: Cancel unused variants when user makes changes 5. **Graceful Degradation**: System works even if some variants fail ### Technical Considerations #### Variant Cancellation When users start updates, other generating variants are cancelled: ```typescript // Cancel generating variants when user updates currentCommit.variants.forEach((variant, index) => { if (index !== selectedVariantIndex && variant.status === 'generating') { wsRef.current.send(JSON.stringify({ type: "cancel_variant", variantIndex: index })); } }); ``` #### Error Handling Each variant handles errors independently: - Failed variants don't block successful ones - Users see specific error messages per variant - System remains functional if some variants fail #### WebSocket Lifecycle - New generations replace previous WebSocket connections - Previous connections are closed to prevent resource leaks - Backend handles connection state checking before sending messages This architecture enables a responsive, non-blocking user experience while maintaining system reliability and resource efficiency. ================================================ FILE: design-docs/general.md ================================================ ## Input mode - Input mode is used for model selection (but it shouldn’t be really? 
I don’t know) - Model selection - Prompt selection ## Models Current Gemini client only uses messages[0] - doesn't support conversation history like OpenAI/Claude clients ================================================ FILE: design-docs/images-in-update-history.md ================================================ # Images in Update History ## Status: ✅ IMPLEMENTED Multiple images in update history are fully supported in the backend. ## Implementation ### Core Function - `create_message_from_history_item()` in `prompts/__init__.py` handles image processing - User messages with `images` array create multipart content (images + text) - Assistant messages remain text-only (code) - Empty `images` arrays gracefully fallback to text-only ### Supported Flows - ✅ Regular updates with images - ✅ Imported code updates with images - ✅ Multiple images per message - ✅ Backward compatibility (no images) ### Tests - ✅ Single image in history (`test_prompts.py`) - ✅ Multiple images in history (`test_prompts.py`, `test_prompts_additional.py`) - ✅ Imported code with images (`test_prompts.py`, `test_prompts_additional.py`) - ✅ Empty images arrays (`test_prompts_additional.py`) ## Usage Frontend can send update history items with: ```typescript { text: "Update instructions", images: ["data:image/png;base64,img1", "data:image/png;base64,img2"] } ``` Backend automatically creates proper multipart messages for AI models. ================================================ FILE: design-docs/prompt-history-refactor.md ================================================ # Prompt History Refactor (Frontend -> Backend) ## Goal Simplify edit prompt history so we no longer reconstruct conversation state from commit ancestry and index parity. The new model is: - Frontend stores explicit conversation history per variant. - Frontend sends explicit role-based history to backend. - Backend trusts structured history and only assembles messages. 
## Key decisions (final) - No `sourceVersionNumber` field is required. - The selected variant is the source of truth for "what history to extend next". - No backend persistence/migration work is needed for this refactor. - Prompt history state is frontend-side session/project state. - No legacy reconstruction parity path. - Old reconstruction/index-parity behavior is removed rather than preserved. ## Previous behavior (removed) Before this refactor: - Frontend rebuilt history via `extractHistory(...)` by walking commit parent links. - Backend inferred message role from array index parity (`assistant` for even, `user` for odd). - History state was implicit and brittle, especially with branching and variant selection. ## New frontend storage model ### 1) Per-variant explicit history Each variant now carries its own history: - `Variant.history: VariantHistoryMessage[]` - `VariantHistoryMessage`: - `role: "user" | "assistant"` - `text: string` - `imageAssetIds: string[]` - `videoAssetIds: string[]` This history is authoritative for that variant. ### 2) Shared media asset store Frontend keeps media in one shared map: - `assetsById: Record<string, PromptAsset>` - `PromptAsset`: - `id` - `type: "image" | "video"` - `dataUrl` Variant history references media by ID, not by embedding large base64 strings directly. ### 3) Utilities extracted from `App.tsx` Prompt-history/media helper logic moved to: - `frontend/src/lib/prompt-history.ts` Key helpers: - `cloneVariantHistory(...)` - `registerAssetIds(...)` - `toRequestHistory(...)` - `buildUserHistoryMessage(...)` - `buildAssistantHistoryMessage(...)` ## How requests are built now ### Create flow - Create seeds `variantHistory` with a single `user` message. - Images/videos are registered in `assetsById` and referenced by IDs in the variant history. - Request payload includes: - `prompt` (create input) - `variantHistory` (for local commit state) ### Update flow - Uses the currently selected variant as source of truth. 
- Base history = selected variant history (or fallback assistant snapshot of current code if empty). - Appends new `user` update message (+ optional media IDs). - Converts variant history into request history (`role`, `text`, `images`, `videos`) via `toRequestHistory(...)`. ### Completion behavior On variant completion: - Final generated code is appended as an `assistant` history message for that specific variant. ## Branching behavior We still keep flat version labels (v1, v2, ...), but edits can branch from any selected version/variant. Important detail: - The active selected variant's explicit history is what gets extended for the next edit. - This naturally supports branching without reconstructing from global commit ancestry. ## Request payload shape (simplified) When sending an edit, frontend sends explicit role history like: - `history[i].role`: `"user"` or `"assistant"` - `history[i].text`: textual instruction or generated code - `history[i].images`: data URLs for image inputs for that message - `history[i].videos`: data URLs for video inputs for that message Backend does not infer roles by index anymore; it uses the provided role directly. ## Backend parsing and prompt assembly ### Request parsing extracted Raw request normalization moved into: - `backend/prompts/request_parsing.py` Functions: - `parse_prompt_content(raw_prompt)` - `parse_prompt_history(raw_history)` `generate_code.py` now calls these helpers, so the route file is smaller and parsing is centralized. ### Prompt assembly changes `backend/prompts/builders.py` now: - Consumes explicit `PromptHistoryMessage` entries. - Uses provided `role` directly (no index parity inference). - For update generation, builds: - `system` message - followed by all provided explicit history messages ### Imported-code path Imported-code update logic now works with explicit role history and chooses the latest relevant `user` instruction cleanly. 
## Logging/observability updates - Removed runtime `PROMPT SUMMARY` logging from generation path. - Added compact `PROMPT PREVIEW` logging before model execution. - Large text/code is collapsed for readability. ## Tests updated/added ### Backend - Updated prompt assembly expectations to explicit role history: - `backend/tests/test_prompts.py` - Merged and removed duplicate file: - deleted `backend/tests/test_prompts_additional.py` - Added request parser tests: - `backend/tests/test_request_parsing.py` - Added prompt preview tests: - `backend/tests/test_prompt_summary.py` ### Frontend - Removed legacy `extractHistory` tests and updated fixtures: - `frontend/src/components/history/utils.test.ts` - Added helper tests for extracted prompt-history utilities: - `frontend/src/lib/prompt-history.test.ts` ## Net result The prompt-history pipeline is now explicit, variant-local, and much easier to reason about: - No implicit role inference. - No tree reconstruction for edit prompts. - Cleaner branch handling via selected variant history. - Smaller route parsing surface via dedicated parser module. ================================================ FILE: design-docs/variant-system.md ================================================ # Variant System ## Overview The variant system generates multiple code options in parallel, allowing users to compare different AI-generated implementations. The system defaults to 3 variants and scales automatically by changing `NUM_VARIANTS` in config. ## Configuration **Key Setting:** `NUM_VARIANTS = 3` in `backend/config.py` Changing this value automatically scales the entire system to support any number of variants. 
## Model Selection Models cycle based on available API keys: ```python # Both API keys present models = [claude_model, Llm.GPT_4_1_NANO_2025_04_14] # Claude only models = [claude_model, Llm.CLAUDE_4_5_SONNET_2025_09_29] # OpenAI only models = [Llm.GPT_4O_2024_11_20] ``` **Cycling:** If models = [A, B] and NUM_VARIANTS = 5, result is [A, B, A, B, A] **Generation Type:** - **Create**: Primary model is Claude 3.7 Sonnet - **Update**: Primary model is Claude Sonnet 4.5 ## Frontend ### Grid Layouts - **2 variants**: 2-column - **3 variants**: 2-column (third wraps below - prevents squishing) - **4 variants**: 2x2 grid - **5-6 variants**: 3-column - **7+ variants**: 4-column ### Keyboard Shortcuts - **Option/Alt + 1, 2, 3...**: Switch variants - Works globally, even in text fields - Uses `event.code` for cross-platform compatibility - Visual indicators show ⌥1, ⌥2, ⌥3 ## Architecture ### Backend - `StatusBroadcastMiddleware` sends `variantCount` to frontend - `ModelSelectionStage` cycles through available models - Pipeline generates variants in parallel via WebSocket ### Frontend - Learns variant count from backend dynamically - `resizeVariants()` adapts UI to backend count - Error handling per variant with status display ## WebSocket Messages ```typescript "variantCount" | "chunk" | "status" | "setCode" | "variantComplete" | "variantError" ``` ## Implementation Notes ✅ **Scalable**: Change `NUM_VARIANTS` and everything adapts ✅ **Cross-platform**: Keyboard shortcuts work Mac/Windows ✅ **Responsive**: Grid layouts adapt to count ✅ **Simple**: Model cycling handles any variant count ## Key Files - `backend/config.py` - NUM_VARIANTS setting - `backend/routes/generate_code.py` - Model selection pipeline - `frontend/src/components/variants/Variants.tsx` - UI and shortcuts - `frontend/src/store/project-store.ts` - State management ================================================ FILE: docker-compose.yml ================================================ version: '3.9' services: 
backend: build: context: ./backend dockerfile: Dockerfile env_file: - .env # or # environment: #- BACKEND_PORT=7001 # if you change the port, make sure to also change the VITE_WS_BACKEND_URL at frontend/.env.local # - OPENAI_API_KEY=your_openai_api_key ports: - "${BACKEND_PORT:-7001}:${BACKEND_PORT:-7001}" command: poetry run uvicorn main:app --host 0.0.0.0 --port ${BACKEND_PORT:-7001} frontend: build: context: ./frontend dockerfile: Dockerfile ports: - "5173:5173" ================================================ FILE: frontend/.eslintrc.cjs ================================================ module.exports = { root: true, env: { browser: true, es2020: true }, extends: [ 'eslint:recommended', 'plugin:@typescript-eslint/recommended', 'plugin:react-hooks/recommended', ], ignorePatterns: ['dist', '.eslintrc.cjs'], parser: '@typescript-eslint/parser', plugins: ['react-refresh'], rules: { 'react-refresh/only-export-components': [ 'warn', { allowConstantExport: true }, ], }, } ================================================ FILE: frontend/.gitignore ================================================ # Logs logs *.log npm-debug.log* yarn-debug.log* yarn-error.log* pnpm-debug.log* lerna-debug.log* node_modules dist dist-ssr *.local # Editor directories and files .vscode/* !.vscode/extensions.json .idea .DS_Store *.suo *.ntvs* *.njsproj *.sln *.sw? 
# Env files .env* # Test files src/tests/results/ ================================================ FILE: frontend/Dockerfile ================================================ FROM node:22-bullseye-slim # Set the working directory in the container WORKDIR /app # Copy package.json and yarn.lock COPY package.json yarn.lock /app/ # Set the environment variable to skip Puppeteer download ENV PUPPETEER_SKIP_DOWNLOAD=true # Install dependencies RUN yarn install # Copy the current directory contents into the container at /app COPY ./ /app/ # Expose port 5173 to access the server EXPOSE 5173 # Command to run the application CMD ["yarn", "dev", "--host", "0.0.0.0"] ================================================ FILE: frontend/components.json ================================================ { "$schema": "https://ui.shadcn.com/schema.json", "style": "new-york", "rsc": false, "tsx": true, "tailwind": { "config": "tailwind.config.js", "css": "src/index.css", "baseColor": "slate", "cssVariables": true }, "aliases": { "components": "@/components", "utils": "@/lib/utils" } } ================================================ FILE: frontend/index.html ================================================ <%- injectHead %> Screenshot to Code
================================================ FILE: frontend/jest.config.js ================================================ export default { preset: "ts-jest", testEnvironment: "node", setupFiles: ["/src/setupTests.ts"], transform: { "^.+\\.tsx?$": "ts-jest", }, testTimeout: 30000, }; ================================================ FILE: frontend/package.json ================================================ { "name": "screenshot-to-code", "private": true, "version": "0.0.0", "type": "module", "scripts": { "dev": "vite", "dev-hosted": "vite --mode prod", "build": "tsc && vite build", "build-hosted": "tsc && vite build --mode prod", "lint": "eslint . --ext ts,tsx --report-unused-disable-directives --max-warnings 0", "preview": "vite preview", "test": "jest", "test:qa": "RUN_E2E=true TEST_ROOT_PATH=src/tests yarn test src/tests/qa.test.ts" }, "dependencies": { "@codemirror/lang-html": "^6.4.6", "@radix-ui/react-accordion": "^1.1.2", "@radix-ui/react-alert-dialog": "^1.0.5", "@radix-ui/react-checkbox": "^1.0.4", "@radix-ui/react-collapsible": "^1.0.3", "@radix-ui/react-dialog": "^1.0.5", "@radix-ui/react-hover-card": "^1.0.7", "@radix-ui/react-icons": "^1.3.0", "@radix-ui/react-label": "^2.0.2", "@radix-ui/react-popover": "^1.0.7", "@radix-ui/react-progress": "^1.0.3", "@radix-ui/react-scroll-area": "^1.0.5", "@radix-ui/react-select": "^2.0.0", "@radix-ui/react-separator": "^1.0.3", "@radix-ui/react-slot": "^1.0.2", "@radix-ui/react-switch": "^1.0.3", "@radix-ui/react-tabs": "^1.0.4", "@types/react-syntax-highlighter": "^15.5.13", "class-variance-authority": "^0.7.0", "classnames": "^2.3.2", "clsx": "^2.0.0", "codemirror": "^6.0.1", "copy-to-clipboard": "^3.3.3", "html2canvas": "^1.4.1", "nanoid": "^5.0.7", "react": "^18.2.0", "react-dom": "^18.2.0", "react-dropzone": "^14.2.3", "react-hot-toast": "^2.4.1", "react-icons": "^4.12.0", "react-markdown": "^10.1.0", "react-router-dom": "^6.20.1", "react-syntax-highlighter": "^16.1.0", "tailwind-merge": "^2.0.0", 
"tailwindcss-animate": "^1.0.7", "thememirror": "^2.0.1", "vite-plugin-checker": "^0.9.3", "webm-duration-fix": "^1.0.4", "zustand": "^4.5.2" }, "devDependencies": { "@types/jest": "^29.5.12", "@types/node": "^20.9.0", "@types/puppeteer": "^7.0.4", "@types/react": "^18.2.15", "@types/react-dom": "^18.2.7", "@typescript-eslint/eslint-plugin": "^6.0.0", "@typescript-eslint/parser": "^6.0.0", "@vitejs/plugin-react": "^4.0.3", "autoprefixer": "^10.4.16", "dotenv": "^16.4.5", "eslint": "^8.45.0", "eslint-plugin-react-hooks": "^4.6.0", "eslint-plugin-react-refresh": "^0.4.3", "jest": "^29.7.0", "postcss": "^8.4.31", "puppeteer": "^22.6.4", "tailwindcss": "^3.3.5", "ts-jest": "^29.1.2", "typescript": "^5.0.2", "vite": "^4.4.5", "vite-plugin-html": "^3.2.0", "vitest": "^1.0.1" }, "engines": { "node": ">=14.18.0" } } ================================================ FILE: frontend/postcss.config.js ================================================ export default { plugins: { tailwindcss: {}, autoprefixer: {}, }, } ================================================ FILE: frontend/src/App.tsx ================================================ import { useEffect, useRef, useState } from "react"; import { generateCode } from "./generateCode"; import { AppState, AppTheme, EditorTheme, Settings } from "./types"; import { IS_RUNNING_ON_CLOUD } from "./config"; import { PicoBadge } from "./components/messages/PicoBadge"; import { OnboardingNote } from "./components/messages/OnboardingNote"; import { usePersistedState } from "./hooks/usePersistedState"; import TermsOfServiceDialog from "./components/TermsOfServiceDialog"; import { USER_CLOSE_WEB_SOCKET_CODE } from "./constants"; import toast from "react-hot-toast"; import { nanoid } from "nanoid"; import { Stack } from "./lib/stacks"; import { CodeGenerationModel } from "./lib/models"; import useBrowserTabIndicator from "./hooks/useBrowserTabIndicator"; import { LuChevronLeft } from "react-icons/lu"; import { buildAssistantHistoryMessage, 
buildUserHistoryMessage, cloneVariantHistory, GenerationRequest, registerAssetIds, toRequestHistory, } from "./lib/prompt-history"; // import TipLink from "./components/messages/TipLink"; import { useAppStore } from "./store/app-store"; import { useProjectStore } from "./store/project-store"; import { removeHighlight } from "./components/select-and-edit/utils"; import Sidebar from "./components/sidebar/Sidebar"; import IconStrip from "./components/sidebar/IconStrip"; import HistoryDisplay from "./components/history/HistoryDisplay"; import PreviewPane from "./components/preview/PreviewPane"; import StartPane from "./components/start-pane/StartPane"; import SettingsTab from "./components/settings/SettingsTab"; import { Commit } from "./components/commits/types"; import { createCommit } from "./components/commits/utils"; function App() { const { // Inputs inputMode, setInputMode, referenceImages, setReferenceImages, initialPrompt, setInitialPrompt, upsertPromptAssets, resetPromptAssets, head, commits, addCommit, removeCommit, setHead, appendCommitCode, setCommitCode, resetCommits, resetHead, updateVariantStatus, resizeVariants, setVariantModels, appendVariantHistoryMessage, startAgentEvent, appendAgentEventContent, finishAgentEvent, // Outputs appendExecutionConsole, resetExecutionConsoles, } = useProjectStore(); const { disableInSelectAndEditMode, setUpdateInstruction, updateImages, setUpdateImages, appState, setAppState, selectedElement, setSelectedElement, } = useAppStore(); // Settings const [settings, setSettings] = usePersistedState( { openAiApiKey: null, openAiBaseURL: null, anthropicApiKey: null, geminiApiKey: null, screenshotOneApiKey: null, isImageGenerationEnabled: true, editorTheme: EditorTheme.COBALT, generatedCodeConfig: Stack.HTML_TAILWIND, codeGenerationModel: CodeGenerationModel.CLAUDE_4_5_OPUS_2025_11_01, // Only relevant for hosted version isTermOfServiceAccepted: false, }, "setting" ); const [appTheme, setAppTheme] = usePersistedState( 
AppTheme.SYSTEM, "app-theme" ); const wsRef = useRef(null); const lastThinkingEventIdRef = useRef>({}); const lastAssistantEventIdRef = useRef>({}); const lastToolEventIdRef = useRef>({}); const [isHistoryOpen, setIsHistoryOpen] = useState(false); const [isSettingsOpen, setIsSettingsOpen] = useState(false); const [mobilePane, setMobilePane] = useState<"preview" | "chat">("preview"); const showSelectAndEditFeature = settings.generatedCodeConfig === Stack.HTML_TAILWIND || settings.generatedCodeConfig === Stack.HTML_CSS; // Indicate coding state using the browser tab's favicon and title useBrowserTabIndicator(appState === AppState.CODING); // When the user already has the settings in local storage, newly added keys // do not get added to the settings so if it's falsy, we populate it with the default // value useEffect(() => { if (!settings.generatedCodeConfig) { setSettings((prev) => ({ ...prev, generatedCodeConfig: Stack.HTML_TAILWIND, })); } }, [settings.generatedCodeConfig, setSettings]); useEffect(() => { const mediaQuery = window.matchMedia("(prefers-color-scheme: dark)"); const applyTheme = () => { const isDark = appTheme === AppTheme.DARK || (appTheme === AppTheme.SYSTEM && mediaQuery.matches); document.documentElement.classList.toggle("dark", isDark); document.body.classList.toggle("dark", isDark); }; applyTheme(); if (appTheme !== AppTheme.SYSTEM) { return; } const onChange = () => applyTheme(); mediaQuery.addEventListener("change", onChange); return () => { mediaQuery.removeEventListener("change", onChange); }; }, [appTheme]); const getAssetsById = () => useProjectStore.getState().assetsById; // Functions const reset = () => { setAppState(AppState.INITIAL); setUpdateInstruction(""); setUpdateImages([]); disableInSelectAndEditMode(); resetExecutionConsoles(); resetCommits(); resetHead(); resetPromptAssets(); // Inputs setInputMode("image"); setReferenceImages([]); }; const regenerate = () => { if (head === null) { toast.error( "No current version set. 
Please contact support via chat or Github." ); throw new Error("Regenerate called with no head"); } // Retrieve the previous command const currentCommit = commits[head]; if (currentCommit.type !== "ai_create") { toast.error("Only the first version can be regenerated."); return; } // Re-run the create if (inputMode === "image" || inputMode === "video") { doCreate(referenceImages, inputMode); } else { // TODO: Fix this doCreateFromText(initialPrompt); } }; // Used when the user cancels the code generation const cancelCodeGeneration = () => { wsRef.current?.close?.(USER_CLOSE_WEB_SOCKET_CODE); }; // Used for user-initiated cancellation and failed edit rollbacks const cancelCodeGenerationAndReset = (commit: Commit) => { // When the current commit is the first version, reset the entire app state if (commit.type === "ai_create") { reset(); } else { // Otherwise, remove current commit from commits removeCommit(commit.hash); // Revert to parent commit const parentCommitHash = commit.parentHash; if (parentCommitHash) { setHead(parentCommitHash); } else { throw new Error("Parent commit not found"); } setAppState(AppState.CODE_READY); } }; function doGenerateCode(params: GenerationRequest) { // Reset the execution console resetExecutionConsoles(); // Set the app state to coding during generation setAppState(AppState.CODING); const { variantHistory, ...requestParams } = params; // Merge settings with params const updatedParams = { ...requestParams, ...settings }; // Use 4 variants for create, 2 for edits to match backend counts // and avoid a flash when the backend sends the actual variant count const initialVariantCount = requestParams.generationType === "create" ? 4 : 2; const baseCommitObject = { variants: Array(initialVariantCount) .fill(null) .map(() => ({ code: "", history: cloneVariantHistory(variantHistory), })), }; const commitInputObject = requestParams.generationType === "create" ? 
{ ...baseCommitObject, type: "ai_create" as const, parentHash: null, inputs: requestParams.prompt, } : { ...baseCommitObject, type: "ai_edit" as const, parentHash: head, inputs: requestParams.prompt, }; // Create a new commit and set it as the head const commit = createCommit(commitInputObject); addCommit(commit); setHead(commit.hash); lastThinkingEventIdRef.current = {}; lastAssistantEventIdRef.current = {}; lastToolEventIdRef.current = {}; const finishThinkingEvent = (variantIndex: number, status: "complete" | "error") => { const eventId = lastThinkingEventIdRef.current[variantIndex]; if (!eventId) return; finishAgentEvent(commit.hash, variantIndex, eventId, { status, endedAt: Date.now(), }); delete lastThinkingEventIdRef.current[variantIndex]; }; const finishAssistantEvent = (variantIndex: number, status: "complete" | "error") => { const eventId = lastAssistantEventIdRef.current[variantIndex]; if (!eventId) return; finishAgentEvent(commit.hash, variantIndex, eventId, { status, endedAt: Date.now(), }); delete lastAssistantEventIdRef.current[variantIndex]; }; const finishToolEvent = (variantIndex: number, status: "complete" | "error") => { const eventId = lastToolEventIdRef.current[variantIndex]; if (!eventId) return; finishAgentEvent(commit.hash, variantIndex, eventId, { status, endedAt: Date.now(), }); delete lastToolEventIdRef.current[variantIndex]; }; const finishInFlightEvents = (status: "complete" | "error") => { Object.keys(lastThinkingEventIdRef.current).forEach((key) => { finishThinkingEvent(Number(key), status); }); Object.keys(lastAssistantEventIdRef.current).forEach((key) => { finishAssistantEvent(Number(key), status); }); Object.keys(lastToolEventIdRef.current).forEach((key) => { finishToolEvent(Number(key), status); }); }; generateCode(wsRef, updatedParams, { onChange: (token, variantIndex) => { appendCommitCode(commit.hash, variantIndex, token); }, onSetCode: (code, variantIndex) => { setCommitCode(commit.hash, variantIndex, code); }, 
onStatusUpdate: (line, variantIndex) => appendExecutionConsole(variantIndex, line), onVariantComplete: (variantIndex) => { console.log(`Variant ${variantIndex} complete event received`); updateVariantStatus(commit.hash, variantIndex, "complete"); const currentCode = useProjectStore.getState().commits[commit.hash]?.variants[variantIndex] ?.code || ""; if (currentCode.trim().length > 0) { appendVariantHistoryMessage( commit.hash, variantIndex, buildAssistantHistoryMessage(currentCode) ); } finishThinkingEvent(variantIndex, "complete"); finishAssistantEvent(variantIndex, "complete"); finishToolEvent(variantIndex, "complete"); if (commit.type === "ai_edit") { const { updateInstruction: currentInstruction, updateImages: currentImages, } = useAppStore.getState(); const instructionUnchanged = currentInstruction === commit.inputs.text; const imagesUnchanged = currentImages.length === commit.inputs.images.length && currentImages.every( (image, index) => image === commit.inputs.images[index] ); // This conditional clear handles three UX scenarios: // 1) All variants fail: no completion event, so keep prompt/images for retry. // 2) A variant completes and user has typed/changed images: do not clear. // 3) A variant completes and user has not changed draft: clear for next edit. 
if (instructionUnchanged && imagesUnchanged) {
  setUpdateInstruction("");
  setUpdateImages([]);
}
}
},
// A single variant failed: mark it errored and close any of its still-open
// thinking/assistant/tool events so the activity UI doesn't show "running".
onVariantError: (variantIndex, error) => {
  console.error(`Error in variant ${variantIndex}:`, error);
  updateVariantStatus(commit.hash, variantIndex, "error", error);
  finishThinkingEvent(variantIndex, "error");
  finishAssistantEvent(variantIndex, "error");
  finishToolEvent(variantIndex, "error");
},
// Backend announces how many variants it will generate; size local state to match.
onVariantCount: (count) => {
  console.log(`Backend is using ${count} variants`);
  resizeVariants(commit.hash, count);
},
onVariantModels: (models) => {
  setVariantModels(commit.hash, models);
},
// Streamed "thinking" tokens: open (or continue) the thinking event for this
// variant and append the new content. startAgentEvent is keyed by eventId,
// so repeated calls with the same id continue the same event.
onThinking: (content, variantIndex, eventId) => {
  if (!eventId) return;
  lastThinkingEventIdRef.current[variantIndex] = eventId;
  startAgentEvent(commit.hash, variantIndex, {
    id: eventId,
    type: "thinking",
    status: "running",
    startedAt: Date.now(),
  });
  appendAgentEventContent(commit.hash, variantIndex, eventId, content);
},
// Streamed assistant text: same pattern as onThinking.
onAssistant: (content, variantIndex, eventId) => {
  if (!eventId) return;
  lastAssistantEventIdRef.current[variantIndex] = eventId;
  startAgentEvent(commit.hash, variantIndex, {
    id: eventId,
    type: "assistant",
    status: "running",
    startedAt: Date.now(),
  });
  appendAgentEventContent(commit.hash, variantIndex, eventId, content);
},
// A tool call starts: first close any open thinking/assistant events that
// belong to a *different* event id, then open the tool event.
onToolStart: (data, variantIndex, eventId) => {
  if (!eventId) return;
  const lastThinking = lastThinkingEventIdRef.current[variantIndex];
  if (lastThinking && lastThinking !== eventId) {
    finishThinkingEvent(variantIndex, "complete");
  }
  const lastAssistant = lastAssistantEventIdRef.current[variantIndex];
  if (lastAssistant && lastAssistant !== eventId) {
    finishAssistantEvent(variantIndex, "complete");
  }
  startAgentEvent(commit.hash, variantIndex, {
    id: eventId,
    type: "tool",
    status: "running",
    toolName: data?.name,
    input: data?.input,
    startedAt: Date.now(),
  });
  lastToolEventIdRef.current[variantIndex] = eventId;
},
// Tool finished: record its output; status is "error" only when ok === false
// (a missing/undefined ok counts as success).
onToolResult: (data, variantIndex, eventId) => {
  if (!eventId) return;
  finishAgentEvent(commit.hash, variantIndex, eventId, {
    status: data?.ok === false ? "error" : "complete",
    output: data?.output,
    endedAt: Date.now(),
  });
  if (lastToolEventIdRef.current[variantIndex] === eventId) {
    delete lastToolEventIdRef.current[variantIndex];
  }
},
onCancel: (reason, errorMessage) => {
  // Close any running agent events when the socket ends without per-event
  // terminal messages, otherwise they remain stuck in "running" state.
  finishInFlightEvents(reason === "request_failed" ? "error" : "complete");
  if (reason === "request_failed" && commit.type === "ai_create") {
    // Re-read the commit from the store so variant statuses are current.
    const latestCreateCommit =
      useProjectStore.getState().commits[commit.hash];
    latestCreateCommit?.variants.forEach((variant, variantIndex) => {
      if (variant.status === "generating") {
        updateVariantStatus(
          commit.hash,
          variantIndex,
          "error",
          errorMessage || "Generation failed. Please retry."
        );
      }
    });
    setAppState(AppState.CODE_READY);
    return;
  }
  cancelCodeGenerationAndReset(commit);
},
onComplete: () => {
  finishInFlightEvents("complete");
  setAppState(AppState.CODE_READY);
},
});
}

// Initial version creation
// Starts a fresh generation from uploaded screenshots (or a single video)
// plus an optional free-text prompt. Resets all project state first.
function doCreate(
  referenceImages: string[],
  inputMode: "image" | "video",
  textPrompt: string = ""
) {
  // Reset any existing state
  reset();
  // Set the input states
  setReferenceImages(referenceImages);
  setInputMode(inputMode);
  // Kick off the code generation
  if (referenceImages.length > 0) {
    // Video mode only ever sends the first item.
    const media =
      inputMode === "video" ? [referenceImages[0]] : referenceImages;
    const imageAssetIds =
      inputMode === "image"
        ? registerAssetIds(
            "image",
            media,
            getAssetsById,
            upsertPromptAssets,
            nanoid
          )
        : [];
    const videoAssetIds =
      inputMode === "video"
        ? registerAssetIds(
            "video",
            media,
            getAssetsById,
            upsertPromptAssets,
            nanoid
          )
        : [];
    const variantHistory = [
      buildUserHistoryMessage(textPrompt, imageAssetIds, videoAssetIds),
    ];
    doGenerateCode({
      generationType: "create",
      inputMode,
      prompt: {
        text: textPrompt,
        images: inputMode === "image" ? media : [],
        videos: inputMode === "video" ? media : [],
      },
      variantHistory,
    });
  }
}

// Starts a fresh generation from a text-only prompt (no media).
function doCreateFromText(text: string) {
  // Reset any existing state
  reset();
  setInputMode("text");
  setInitialPrompt(text);
  doGenerateCode({
    generationType: "create",
    inputMode: "text",
    prompt: { text, images: [], videos: [] },
    variantHistory: [buildUserHistoryMessage(text)],
  });
}

// Subsequent updates
// Sends an update instruction (plus attached images and, if present, the
// currently selected DOM element's HTML) against the selected variant of the
// head commit. Throws if there is no head commit.
async function doUpdate(updateInstruction: string) {
  if (updateInstruction.trim() === "") {
    toast.error("Please include some instructions for AI on what to update.");
    return;
  }
  if (head === null) {
    toast.error(
      "No current version set. Contact support or open a Github issue."
    );
    throw new Error("Update called with no head");
  }
  const currentCommit = commits[head];
  const currentCode =
    currentCommit?.variants[currentCommit.selectedVariantIndex]?.code || "";
  const optionCodes = currentCommit?.variants.map(
    (variant) => variant.code || ""
  );
  let modifiedUpdateInstruction = updateInstruction;
  let selectedElementHtml: string | undefined;
  // Send in a reference to the selected element if it exists
  if (selectedElement) {
    const elementHtml = removeHighlight(selectedElement).outerHTML;
    selectedElementHtml = elementHtml;
    modifiedUpdateInstruction =
      updateInstruction +
      " referring to this element specifically: " +
      elementHtml;
    setSelectedElement(null);
  }
  const selectedVariant =
    currentCommit.variants[currentCommit.selectedVariantIndex];
  const baseVariantHistory = selectedVariant.history;
  const updateImageAssetIds = registerAssetIds(
    "image",
    updateImages,
    getAssetsById,
    upsertPromptAssets,
    nanoid
  );
  // Clone before appending so the stored history is never mutated in place.
  const updatedVariantHistory = [
    ...cloneVariantHistory(baseVariantHistory),
    buildUserHistoryMessage(modifiedUpdateInstruction, updateImageAssetIds),
  ];
  // Commits with code but an empty history (e.g. created via importFromCode,
  // which stores history: []) send an empty history and rely on fileState below.
  const shouldBootstrapFromFileState =
    baseVariantHistory.length === 0 && currentCode.trim().length > 0;
  const updatedHistory = shouldBootstrapFromFileState
    ? []
    : toRequestHistory(updatedVariantHistory, getAssetsById);
  doGenerateCode({
    generationType: "update",
    inputMode,
    prompt: {
      text: updateInstruction,
      images: updateImages,
      videos: [],
      selectedElementHtml,
    },
    history: updatedHistory,
    optionCodes,
    variantHistory: updatedVariantHistory,
    fileState: currentCode
      ? {
          path: "index.html",
          content: currentCode,
        }
      : undefined,
  });
}

// ToS dialog: the accepted flag is simply the inverse of the dialog being open.
const handleTermDialogOpenChange = (open: boolean) => {
  setSettings((s) => ({
    ...s,
    isTermOfServiceAccepted: !open,
  }));
};

function setStack(stack: Stack) {
  setSettings((prev) => ({
    ...prev,
    generatedCodeConfig: stack,
  }));
}

// Imports pasted code as a fresh "code_create" commit (empty history) and
// jumps straight to the ready state.
function importFromCode(code: string, stack: Stack) {
  // Reset any existing state
  reset();
  // Set up this project
  setStack(stack);
  // Create a new commit and set it as the head
  const commit = createCommit({
    type: "code_create",
    parentHash: null,
    variants: [{ code, history: [] }],
    inputs: null,
  });
  addCommit(commit);
  setHead(commit.hash);
  // Set the app state
  setAppState(AppState.CODE_READY);
}

// Derived layout flags for the render below.
const showContentPanel =
  appState === AppState.CODING ||
  appState === AppState.CODE_READY ||
  isHistoryOpen;
const isCodingOrReady =
  appState === AppState.CODING || appState === AppState.CODE_READY;
const showMobileChatPane = showContentPanel && mobilePane === "chat";

return (
{IS_RUNNING_ON_CLOUD && } {IS_RUNNING_ON_CLOUD && ( )} {/* Icon strip - always visible */}
{ setIsHistoryOpen((prev) => !prev); setIsSettingsOpen(false); setMobilePane("chat"); }} onToggleEditor={() => { setIsHistoryOpen(false); setIsSettingsOpen(false); setMobilePane("preview"); }} onLogoClick={() => { setIsHistoryOpen(false); setIsSettingsOpen(false); setMobilePane("preview"); }} onNewProject={() => { reset(); setIsHistoryOpen(false); setIsSettingsOpen(false); setMobilePane("preview"); }} onOpenSettings={() => { setIsSettingsOpen(true); setIsHistoryOpen(false); }} />
{isCodingOrReady && !isSettingsOpen && (
)} {/* Content panel - shows sidebar, history, or editor */} {showContentPanel && !isSettingsOpen && (
{isHistoryOpen ? (

Versions

) : ( <> {IS_RUNNING_ON_CLOUD && !settings.openAiApiKey && (
)} {(appState === AppState.CODING || appState === AppState.CODE_READY) && ( { setIsHistoryOpen(true); setMobilePane("chat"); }} /> )} )}
)}
{isSettingsOpen ? ( ) : ( <> {appState === AppState.INITIAL && ( )} {isCodingOrReady && ( { setIsHistoryOpen(true); setMobilePane("chat"); }} /> )} )}
);
}

export default App;

================================================ FILE: frontend/src/components/ImageLightbox.tsx ================================================

import { useEffect, useRef, useState, useCallback } from "react";
import { LuMinus, LuPlus, LuX } from "react-icons/lu";
import { Dialog, DialogPortal, DialogOverlay } from "./ui/dialog";

// Zoom is relative to the "fit" scale: zoom === 1 shows the image fit to the viewport.
const MIN_ZOOM = 0.5;
const MAX_ZOOM = 10;
// Preferred on-screen image width (px) used for the initial zoom level.
const DEFAULT_DISPLAY_WIDTH = 1000;

interface ImageLightboxProps {
  // URL of the image to display; null means the lightbox is closed.
  image: string | null;
  onClose: () => void;
}

/**
 * Fullscreen lightbox for a single image with zoom controls and drag-to-pan.
 * A plain click (no drag) on the viewport closes it.
 */
function ImageLightbox({ image, onClose }: ImageLightboxProps) {
  // NOTE(review): some generic type arguments (e.g. on this useRef) and the
  // JSX element tags in the return below appear to have been stripped by the
  // source extraction — confirm against the original file before editing.
  const viewportRef = useRef(null);
  // Zoom factor relative to fitScale.
  const [zoom, setZoom] = useState(1);
  // Intrinsic pixel size of the loaded image; null until onLoad fires.
  const [naturalSize, setNaturalSize] = useState<{ width: number; height: number; } | null>(null);
  // Scale that fits the image inside the padded viewport (capped at 1).
  const [fitScale, setFitScale] = useState(1);
  // Ensures the initial zoom is computed only once per image.
  const initialZoomSet = useRef(false);
  // Mutable drag-to-pan bookkeeping; didDrag distinguishes a drag from a plain click.
  const dragRef = useRef({ isDragging: false, startX: 0, startY: 0, scrollLeft: 0, scrollTop: 0, didDrag: false, });

  // Reset state when image changes
  useEffect(() => {
    setZoom(1);
    setNaturalSize(null);
    setFitScale(1);
    initialZoomSet.current = false;
  }, [image]);

  // Recompute the fit-to-viewport scale; on the first run per image also pick
  // an initial zoom that renders the image near DEFAULT_DISPLAY_WIDTH wide.
  const recomputeFitScale = useCallback(() => {
    if (!viewportRef.current || !naturalSize) return;
    // Subtract p-8 padding (32px each side)
    const viewportWidth = viewportRef.current.clientWidth - 64;
    const viewportHeight = viewportRef.current.clientHeight - 64;
    if (viewportWidth <= 0 || viewportHeight <= 0) return;
    const scale = Math.min(
      viewportWidth / naturalSize.width,
      viewportHeight / naturalSize.height,
      1
    );
    setFitScale(scale);
    // Set initial zoom to target DEFAULT_DISPLAY_WIDTH (only clamp to viewport width)
    if (!initialZoomSet.current) {
      initialZoomSet.current = true;
      const targetScale = DEFAULT_DISPLAY_WIDTH / naturalSize.width;
      const maxWidthScale = viewportWidth / naturalSize.width;
      const clampedScale = Math.min(targetScale, maxWidthScale);
      setZoom(Math.max(MIN_ZOOM, Math.min(MAX_ZOOM, clampedScale / scale)));
    }
  }, [naturalSize]);

  // Keep the fit scale in sync with window resizes while an image is shown.
  useEffect(() => {
    if (!image) return;
    recomputeFitScale();
    const handleResize = () => recomputeFitScale();
    window.addEventListener("resize", handleResize);
    return () => window.removeEventListener("resize", handleResize);
  }, [image, recomputeFitScale]);

  // Also re-run when naturalSize changes (recomputeFitScale identity changes with it).
  useEffect(() => {
    recomputeFitScale();
  }, [recomputeFitScale]);

  // Step zoom by 0.5, rounding to 2 decimals and clamping to the limits.
  const zoomIn = () => {
    setZoom((z) => Math.min(MAX_ZOOM, Math.round((z + 0.5) * 100) / 100));
  };
  const zoomOut = () => {
    setZoom((z) => Math.max(MIN_ZOOM, Math.round((z - 0.5) * 100) / 100));
  };
  // zoom === 1 means "fit to viewport" by construction.
  const zoomToFit = () => setZoom(1);
  // Jump to the DEFAULT_DISPLAY_WIDTH-based zoom (same math as the initial zoom).
  const zoomToDefault = () => {
    if (!naturalSize || fitScale <= 0 || !viewportRef.current) return;
    const viewportWidth = viewportRef.current.clientWidth - 64;
    const targetScale = DEFAULT_DISPLAY_WIDTH / naturalSize.width;
    const maxWidthScale = viewportWidth / naturalSize.width;
    const clampedScale = Math.min(targetScale, maxWidthScale);
    setZoom(Math.max(MIN_ZOOM, Math.min(MAX_ZOOM, clampedScale / fitScale)));
  };

  // Begin drag-to-pan on the left button; remember the scroll origin.
  const handleMouseDown = useCallback((e: React.MouseEvent) => {
    if (!viewportRef.current || e.button !== 0) return;
    dragRef.current = {
      isDragging: true,
      startX: e.clientX,
      startY: e.clientY,
      scrollLeft: viewportRef.current.scrollLeft,
      scrollTop: viewportRef.current.scrollTop,
      didDrag: false,
    };
    viewportRef.current.style.cursor = "grabbing";
    e.preventDefault();
  }, []);

  // Pan by scrolling the viewport; movement beyond 3px marks a real drag so
  // the subsequent click won't close the lightbox.
  const handleMouseMove = useCallback((e: React.MouseEvent) => {
    const drag = dragRef.current;
    if (!drag.isDragging || !viewportRef.current) return;
    const dx = e.clientX - drag.startX;
    const dy = e.clientY - drag.startY;
    if (Math.abs(dx) > 3 || Math.abs(dy) > 3) {
      drag.didDrag = true;
    }
    viewportRef.current.scrollLeft = drag.scrollLeft - dx;
    viewportRef.current.scrollTop = drag.scrollTop - dy;
  }, []);

  const handleMouseUp = useCallback(() => {
    dragRef.current.isDragging = false;
    if (viewportRef.current) {
      viewportRef.current.style.cursor = "";
    }
  }, []);

  // Forward wheel deltas to the scroll container manually.
  const handleWheel = useCallback((e: React.WheelEvent) => {
    if (!viewportRef.current) return;
    viewportRef.current.scrollTop += e.deltaY;
    viewportRef.current.scrollLeft += e.deltaX;
  }, []);

  // Close on click — unless this click was the tail end of a drag.
  const handleViewportClick = useCallback(() => {
    if (dragRef.current.didDrag) {
      dragRef.current.didDrag = false;
      return;
    }
    onClose();
  }, [onClose]);

  const effectiveScale = fitScale * zoom;
  // Rendered CSS pixel size; undefined until the natural size is known
  // (the image stays hidden until then — see the style fallback below).
  const displayWidth = naturalSize
    ? Math.max(1, Math.round(naturalSize.width * effectiveScale))
    : undefined;
  const displayHeight = naturalSize
    ? Math.max(1, Math.round(naturalSize.height * effectiveScale))
    : undefined;

  return ( !open && onClose()}>
{/* Scrollable viewport - drag to scroll, click to close */}
{image && ( Reference image e.stopPropagation()} style={ displayWidth && displayHeight ? { width: `${displayWidth}px`, height: `${displayHeight}px`, maxWidth: "none", maxHeight: "none", } : { visibility: "hidden" as const } } onLoad={(event) => { setNaturalSize({ width: event.currentTarget.naturalWidth, height: event.currentTarget.naturalHeight, }); }} /> )}
{/* Zoom controls - bottom center pill */}
e.stopPropagation()} onMouseDown={(e) => e.stopPropagation()} >
);
}

export default ImageLightbox;

================================================ FILE: frontend/src/components/ImageUpload.tsx ================================================

import { useState, useEffect, useMemo, useRef, useCallback } from "react";
import { useDropzone } from "react-dropzone";
import { toast } from "react-hot-toast";
import ScreenRecorder from "./recording/ScreenRecorder";
import { ScreenRecorderState } from "../types";
import { Stack } from "../lib/stacks";

// Dropzone styling: a neutral dashed box, recolored by focus / accept / reject state.
const baseStyle = {
  flex: 1,
  width: "80%",
  margin: "0 auto",
  minHeight: "400px",
  display: "flex",
  flexDirection: "column",
  alignItems: "center",
  justifyContent: "center",
  padding: "20px",
  borderWidth: 2,
  borderRadius: 2,
  borderColor: "#eeeeee",
  borderStyle: "dashed",
  backgroundColor: "#fafafa",
  color: "#bdbdbd",
  outline: "none",
  transition: "border .24s ease-in-out",
};

const focusedStyle = {
  borderColor: "#2196f3",
};

const acceptStyle = {
  borderColor: "#00e676",
};

const rejectStyle = {
  borderColor: "#ff1744",
};

// TODO: Move to a separate file
// Reads a File into a data URL via FileReader. Some browsers report video
// files as application/octet-stream; when the File object carries a real
// MIME type we rewrite the data-URL prefix so later "data:video" checks work.
// NOTE(review): the return type reads bare "Promise" here — its type argument
// looks stripped by the source extraction; confirm against the original file.
function fileToDataURL(file: File): Promise {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => {
      const result = reader.result as string;
      // Check if the MIME type is correctly set in the data URL
      // Some browsers return application/octet-stream for video files
      if (result.startsWith("data:application/octet-stream") && file.type) {
        // Replace with the correct MIME type from the file
        const correctedResult = result.replace(
          "data:application/octet-stream",
          `data:${file.type}`
        );
        resolve(correctedResult);
      } else {
        resolve(result);
      }
    };
    reader.onerror = (error) => reject(error);
    reader.readAsDataURL(file);
  });
}

// A dropped File augmented with an object URL used for the preview thumbnail.
type FileWithPreview = {
  preview: string;
} & File;

interface Props {
  // Hands the uploaded media (as data URLs) to the parent to start generation.
  setReferenceImages: (
    referenceImages: string[],
    inputMode: "image" | "video",
    textPrompt?: string
  ) => void;
  // Optional notification when the user adds or clears an upload.
  onUploadStateChange?: (hasUpload: boolean) => void;
  stack: Stack;
  setStack: (stack: Stack) => void;
}

function
ImageUpload({ setReferenceImages, onUploadStateChange, stack, setStack }: Props) { const [files, setFiles] = useState([]); const [uploadedDataUrls, setUploadedDataUrls] = useState([]); const [uploadedInputMode, setUploadedInputMode] = useState< "image" | "video" >("image"); const [textPrompt, setTextPrompt] = useState(""); const [showTextPrompt, setShowTextPrompt] = useState(false); const textInputRef = useRef(null); // TODO: Switch to Zustand const [screenRecorderState, setScreenRecorderState] = useState(ScreenRecorderState.INITIAL); const hasUploadedFile = uploadedDataUrls.length > 0; // Notify parent of upload state changes useEffect(() => { onUploadStateChange?.(hasUploadedFile); }, [hasUploadedFile, onUploadStateChange]); const handleGenerate = useCallback(() => { if (uploadedDataUrls.length > 0) { setReferenceImages(uploadedDataUrls, uploadedInputMode, textPrompt); } }, [uploadedDataUrls, uploadedInputMode, textPrompt, setReferenceImages]); // Global Enter key listener for generating when image is uploaded useEffect(() => { if (!hasUploadedFile) return; const handleGlobalKeyDown = (e: KeyboardEvent) => { if (e.key === "Enter" && !e.shiftKey) { // Don't fire if textarea is focused (it has its own handler) if (document.activeElement === textInputRef.current) return; e.preventDefault(); handleGenerate(); } }; document.addEventListener("keydown", handleGlobalKeyDown); return () => document.removeEventListener("keydown", handleGlobalKeyDown); }, [hasUploadedFile, handleGenerate]); const handleClear = () => { setUploadedDataUrls([]); setFiles([]); setTextPrompt(""); setShowTextPrompt(false); }; const handleKeyDown = (e: React.KeyboardEvent) => { if (e.key === "Enter" && !e.shiftKey) { e.preventDefault(); handleGenerate(); } }; const { getRootProps, getInputProps, isFocused, isDragAccept, isDragReject } = useDropzone({ maxFiles: 1, maxSize: 1024 * 1024 * 20, // 20 MB accept: { // Image formats "image/png": [".png"], "image/jpeg": [".jpeg"], "image/jpg": [".jpg"], // 
Video formats "video/quicktime": [".mov"], "video/mp4": [".mp4"], "video/webm": [".webm"], }, onDrop: (acceptedFiles) => { // Set up the preview thumbnail images setFiles( acceptedFiles.map((file: File) => Object.assign(file, { preview: URL.createObjectURL(file), }) ) as FileWithPreview[] ); // Determine input mode from file type (more reliable than checking data URL) const firstFile = acceptedFiles[0]; const isVideo = firstFile?.type?.startsWith("video/") || [".mp4", ".mov", ".webm"].some(ext => firstFile?.name?.toLowerCase().endsWith(ext)); // Convert images to data URLs and store them (don't trigger generation yet) Promise.all(acceptedFiles.map((file) => fileToDataURL(file))) .then((dataUrls) => { if (dataUrls.length > 0) { // Use file type detection as primary, fall back to data URL check const inputMode = isVideo || (dataUrls[0] as string).startsWith("data:video") ? "video" : "image"; setUploadedDataUrls(dataUrls as string[]); setUploadedInputMode(inputMode); // Focus the text input after upload setTimeout(() => textInputRef.current?.focus(), 100); } }) .catch((error) => { toast.error("Error reading files" + error); console.error("Error reading files:", error); }); }, onDropRejected: (rejectedFiles) => { toast.error(rejectedFiles[0].errors[0].message); }, }); useEffect(() => { return () => files.forEach((file) => URL.revokeObjectURL(file.preview)); }, [files]); const style = useMemo( () => ({ ...baseStyle, ...(isFocused ? focusedStyle : {}), ...(isDragAccept ? acceptStyle : {}), ...(isDragReject ? rejectStyle : {}), }), [isFocused, isDragAccept, isDragReject] ); // Screen recorder callback - wrap to include empty text prompt const handleScreenRecorderGenerate = ( images: string[], inputMode: "image" | "video" ) => { setReferenceImages(images, inputMode, ""); }; return (
{screenRecorderState === ScreenRecorderState.INITIAL && !hasUploadedFile && ( /* eslint-disable-next-line @typescript-eslint/no-explicit-any */

Drag & drop a screenshot here,
or click to upload

)} {hasUploadedFile && (
{/* Image/Video Preview */}
{uploadedInputMode === "video" ? (
{/* Text Prompt Toggle/Input */} {!showTextPrompt ? ( ) : (